Re: Alternate parsing approach for MediaWiki::DumpFile::SQL

* Bjoern Hoehrmann wrote:
>I am wondering whether it would make sense to adopt a different approach
>to parsing the INSERT statement, namely create a regular expression from
>the schema and use that to match. I've been toying around with this and
>came up with the following script. It parses the page table, uses named
>captures for convenient key access and a tied hash to do the unescaping.

I've now put revised code into the form of a module

  package MediaWiki::DumpFile::SQL::Callback::TiedPlus;
  
  use strict;
  use warnings;
  
  our %ESCAPE_MAP = (
    '\\' => '\\',
    "'"  => "'",
    '"'  => '"',
    'n'  => "\n",
    't'  => "\t",
  );
  
  BEGIN {
    no strict 'refs';
  
    # Install forwarding methods
    for my $meth (qw/NEXTKEY FIRSTKEY STORE DELETE
      CLEAR EXISTS SCALAR/) {
      *{$meth} = sub { shift()->{tiedplus}->$meth(@_) }
    }
  }
  
  sub TIEHASH {
    my ($class, $plus, $unescape) = @_;
  
    my $self = bless {
      tiedplus => $plus,
      unescape => $unescape
    }, $class;
  
    $self;
  }
  
  sub FETCH {
    my ($self, $key) = @_;
  
    my $value = $self->{tiedplus}->FETCH($key);
  
    return $value unless $self->{unescape}{$key};
  
    $value = "$value";
    $value =~ s/\\(.)/$ESCAPE_MAP{$1}/g;
    $value;
  }
  
  package MediaWiki::DumpFile::SQL::Callback;
  use strict;
  use warnings;
  use MediaWiki::DumpFile::SQL;
  use IO::File;
  
  use constant Type2ReTmpl => {
    int       => q/(?<%s>\d+)/,
    bigint    => q/(?<%s>\d+)/,
    tinyint   => q/(?<%s>\d+)/,
    timestamp => q/(?<%s>\d+)/,
    double    => q/(?<%s>[^,]+)/, # This could use a better pattern
    varbinary => q/'(?<%s>.*?)'/, # There is some remote chance that 
    tinyblob  => q/'(?<%s>.*?)'/, # this will mismatch if there is
    blob      => q/'(?<%s>.*?)'/, # specially crafted code in one of
    char      => q/'(?<%s>.*?)'/, # the quoted columns that resembles 
    varchar   => q/'(?<%s>.*?)'/, # the dump format, in that case the
    enum      => q/'(?<%s>.*?)'/, # parse should ultimately fail.
  };
  
  use constant NeedsUnescaping => {
    varbinary => 1,
    tinyblob  => 1,
    blob      => 1,
    char      => 1,
    varchar   => 1,
    enum      => 1,
  };
  
  sub new {
    my ($class, $args) = @_;
    my $self = bless { %$args }, $class;
    return $self;
  }
  
  sub parse_file {
    my ($self, $path) = @_;
  
    my $sql = MediaWiki::DumpFile::SQL->new($path);
  
    my @schema = $sql->schema;
    my $table  = $sql->table_name;
  
    if ($self->{start_document}) {
      $self->{start_document}->($table, \@schema);
    }
  
    undef $sql;
  
    my $io = IO::File->new($path, '<:utf8');
  
    my $re = "";
    foreach my $col (@schema) {
      my ($name, $type) = @$col;
  
      # The names of named captures cannot be escaped and allow for only
      # a rather limited set of characters, so we do not escape them
      # here.
      my $inner = sprintf Type2ReTmpl->{$type}, $name;
  
      $re .= ',' if $re;
      $re .= "(?:NULL|$inner)";
    }
  
    # Keys where the value needs unescaping
    my %unkeys = map { $_->[0] => 1 }
                 grep { NeedsUnescaping->{ $_->[1] } }
                 @schema;
  
    tie my %row,
      "MediaWiki::DumpFile::SQL::Callback::TiedPlus", tied %+, \%unkeys;
  
    my $seen_insert = 0;
    my $buffer = "";
  
    # Read and parse until we have all parsable entries
    while (!$io->eof) {
  
      # Read in 1MB blocks
      $io->read(my $tmp, 1024 * 1024);
  
      die unless defined $tmp;
  
      $buffer .= $tmp;
  
      # Skip to the INSERT statemement
      if (!$seen_insert && $buffer =~ s/.*?INSERT INTO .*? VALUES //s) {
        $seen_insert = 1;
      }
  
      next unless $seen_insert;
  
      my $cb = $self->{row};
      # Use /c to memorize the position after match failure
      while ($buffer =~ /\($re\)[,;]/gc) {
        $cb->(\%row)
      }
  
      # Keep only the remainder
      substr $buffer, 0, pos($buffer), '';
    }
  
    if ($self->{end_document}) {
      $self->{end_document}->()
    }
  
  }
  
  1;
  
Usage would go like this:

  #!perl -w
  use MediaWiki::DumpFile::SQL::Callback;
  use strict;
  use warnings;
  use YAML;
  
  my $wp = MediaWiki::DumpFile::SQL::Callback->new({
    "row" => sub {
      print YAML::Dump($_[0]);
    }
  });
  
  $wp->parse_file('dewiki-20091208-page.sql');

Which would print

  ---
  page_counter: 156
  page_id: 1
  page_is_new: 0
  page_is_redirect: 0
  page_latest: 67447568
  page_len: 4260
  page_namespace: 0
  page_no_title_convert: 0
  page_random: 0.0864337124735431
  page_restrictions: ''
  page_title: Alan_Smithee
  page_touched: 20091130142149
  ---
  page_counter: 271
  page_id: 3
  page_is_new: 0
  page_is_redirect: 0
  page_latest: 67126559
  page_len: 11137
  page_namespace: 0
  page_no_title_convert: 0
  page_random: 0.445624627587967
  page_restrictions: ''
  page_title: Actinium
  page_touched: 20091205140714
  ...

(Tyler Riddle confirmed that my code is very much faster than his and
will look into improving performance of the original module, so I'll not
push a module like this onto CPAN for the moment).
-- 
Björn Höhrmann · mailto:bjoern@hoehrmann.de · http://bjoern.hoehrmann.de
Am Badedeich 7 · Telefon: +49(0)160/4415681 · http://www.bjoernsworld.de
25899 Dagebüll · PGP Pub. KeyID: 0xA4357E78 · http://www.websitedev.de/ 

Received on Monday, 4 January 2010 15:58:59 UTC