- From: Bjoern Hoehrmann <derhoermi@gmx.net>
- Date: Mon, 04 Jan 2010 16:58:29 +0100
- To: www-archive@w3.org
* Bjoern Hoehrmann wrote: >I am wondering whether it would make sense to adopt a different approach >to parsing the INSERT statement, namely create a regular expression from >the schema and use that to match. I've been toying around with this and >came up with the following script. It parses the page table, uses named >captures for convenient key access and a tied hash to do the unescaping. I've now put revised code into the form of a module package MediaWiki::DumpFile::SQL::Callback::TiedPlus; use strict; use warnings; our %ESCAPE_MAP = ( '\\' => '\\', "'" => "'", '"' => '"', 'n' => "\n", 't' => "\t", ); BEGIN { no strict 'refs'; # Install forwarding methods for my $meth (qw/NEXTKEY FIRSTKEY STORE DELETE CLEAR EXISTS SCALAR/) { *{$meth} = sub { shift()->{tiedplus}->$meth(@_) } } } sub TIEHASH { my ($class, $plus, $unescape) = @_; my $self = bless { tiedplus => $plus, unescape => $unescape }, $class; $self; } sub FETCH { my ($self, $key) = @_; my $value = $self->{tiedplus}->FETCH($key); return $value unless $self->{unescape}{$key}; $value = "$value"; $value =~ s/\\(.)/$ESCAPE_MAP{$1}/g; $value; } package MediaWiki::DumpFile::SQL::Callback; use strict; use warnings; use MediaWiki::DumpFile::SQL; use IO::File; use constant Type2ReTmpl => { int => q/(?<%s>\d+)/, bigint => q/(?<%s>\d+)/, tinyint => q/(?<%s>\d+)/, timestamp => q/(?<%s>\d+)/, double => q/(?<%s>[^,]+)/, # This could use a better pattern varbinary => q/'(?<%s>.*?)'/, # There is some remote chance that tinyblob => q/'(?<%s>.*?)'/, # this will mismatch if there is blob => q/'(?<%s>.*?)'/, # specially crafted code in one of char => q/'(?<%s>.*?)'/, # the quoted columns that resembles varchar => q/'(?<%s>.*?)'/, # the dump format, in that case the enum => q/'(?<%s>.*?)'/, # parse should ultimately fail. }; use constant NeedsUnescaping => { varbinary => 1, tinyblob => 1, blob => 1, char => 1, varchar => 1, enum => 1, }; sub new { my ($class, $args) = @_; my $self = bless { %$args }, $class; return $self; } sub parse_file { my ($self, $path) = @_; my $sql = MediaWiki::DumpFile::SQL->new($path); my @schema = $sql->schema; my $table = $sql->table_name; if ($self->{start_document}) { $self->{start_document}->($table, \@schema); } undef $sql; my $io = IO::File->new($path, '<:utf8'); my $re = ""; foreach my $col (@schema) { my ($name, $type) = @$col; # The names of named captures cannot be escaped and allow for only # a rather limited set of characters, so we do not escape them # here. my $inner = sprintf Type2ReTmpl->{$type}, $name; $re .= ',' if $re; $re .= "(?:NULL|$inner)"; } # Keys where the value needs unescaping my %unkeys = map { $_->[0] => 1 } grep { NeedsUnescaping->{ $_->[1] } } @schema; tie my %row, "MediaWiki::DumpFile::SQL::Callback::TiedPlus", tied %+, \%unkeys; my $seen_insert = 0; my $buffer = ""; # Read and parse until we have all parsable entries while (!$io->eof) { # Read in 1MB blocks $io->read(my $tmp, 1024 * 1024); die unless defined $tmp; $buffer .= $tmp; # Skip to the INSERT statemement if (!$seen_insert && $buffer =~ s/.*?INSERT INTO .*? VALUES //s) { $seen_insert = 1; } next unless $seen_insert; my $cb = $self->{row}; # Use /c to memorize the position after match failure while ($buffer =~ /\($re\)[,;]/gc) { $cb->(\%row) } # Keep only the remainder substr $buffer, 0, pos($buffer), ''; } if ($self->{end_document}) { $self->{end_document}->() } } 1; Usage would go like this: #!perl -w use MediaWiki::DumpFile::SQL::Callback; use strict; use warnings; use YAML; my $wp = MediaWiki::DumpFile::SQL::Callback->new({ "row" => sub { print YAML::Dump($_[0]); } }); $wp->parse_file('dewiki-20091208-page.sql'); Which would print --- page_counter: 156 page_id: 1 page_is_new: 0 page_is_redirect: 0 page_latest: 67447568 page_len: 4260 page_namespace: 0 page_no_title_convert: 0 page_random: 0.0864337124735431 page_restrictions: '' page_title: Alan_Smithee page_touched: 20091130142149 --- page_counter: 271 page_id: 3 page_is_new: 0 page_is_redirect: 0 page_latest: 67126559 page_len: 11137 page_namespace: 0 page_no_title_convert: 0 page_random: 0.445624627587967 page_restrictions: '' page_title: Actinium page_touched: 20091205140714 ... (Tyler Riddle confirmed that my code is very much faster than his and will look into improving performance of the original module, so I'll not push a module like this onto CPAN for the moment). -- Björn Höhrmann · mailto:bjoern@hoehrmann.de · http://bjoern.hoehrmann.de Am Badedeich 7 · Telefon: +49(0)160/4415681 · http://www.bjoernsworld.de 25899 Dagebüll · PGP Pub. KeyID: 0xA4357E78 · http://www.websitedev.de/
Received on Monday, 4 January 2010 15:58:59 UTC