- From: Bjoern Hoehrmann <derhoermi@gmx.net>
- Date: Mon, 04 Jan 2010 16:58:29 +0100
- To: www-archive@w3.org
* Bjoern Hoehrmann wrote:
>I am wondering whether it would make sense to adopt a different approach
>to parsing the INSERT statement, namely create a regular expression from
>the schema and use that to match. I've been toying around with this and
>came up with the following script. It parses the page table, uses named
>captures for convenient key access and a tied hash to do the unescaping.
I've now put revised code into the form of a module
package MediaWiki::DumpFile::SQL::Callback::TiedPlus;
use strict;
use warnings;
our %ESCAPE_MAP = (
'\\' => '\\',
"'" => "'",
'"' => '"',
'n' => "\n",
't' => "\t",
);
BEGIN {
no strict 'refs';
# Install forwarding methods
for my $meth (qw/NEXTKEY FIRSTKEY STORE DELETE
CLEAR EXISTS SCALAR/) {
*{$meth} = sub { shift()->{tiedplus}->$meth(@_) }
}
}
sub TIEHASH {
my ($class, $plus, $unescape) = @_;
my $self = bless {
tiedplus => $plus,
unescape => $unescape
}, $class;
$self;
}
sub FETCH {
my ($self, $key) = @_;
my $value = $self->{tiedplus}->FETCH($key);
return $value unless $self->{unescape}{$key};
$value = "$value";
$value =~ s/\\(.)/$ESCAPE_MAP{$1}/g;
$value;
}
package MediaWiki::DumpFile::SQL::Callback;
use strict;
use warnings;
use MediaWiki::DumpFile::SQL;
use IO::File;
use constant Type2ReTmpl => {
int => q/(?<%s>\d+)/,
bigint => q/(?<%s>\d+)/,
tinyint => q/(?<%s>\d+)/,
timestamp => q/(?<%s>\d+)/,
double => q/(?<%s>[^,]+)/, # This could use a better pattern
varbinary => q/'(?<%s>.*?)'/, # There is some remote chance that
tinyblob => q/'(?<%s>.*?)'/, # this will mismatch if there is
blob => q/'(?<%s>.*?)'/, # specially crafted code in one of
char => q/'(?<%s>.*?)'/, # the quoted columns that resembles
varchar => q/'(?<%s>.*?)'/, # the dump format, in that case the
enum => q/'(?<%s>.*?)'/, # parse should ultimately fail.
};
use constant NeedsUnescaping => {
varbinary => 1,
tinyblob => 1,
blob => 1,
char => 1,
varchar => 1,
enum => 1,
};
sub new {
my ($class, $args) = @_;
my $self = bless { %$args }, $class;
return $self;
}
sub parse_file {
my ($self, $path) = @_;
my $sql = MediaWiki::DumpFile::SQL->new($path);
my @schema = $sql->schema;
my $table = $sql->table_name;
if ($self->{start_document}) {
$self->{start_document}->($table, \@schema);
}
undef $sql;
my $io = IO::File->new($path, '<:utf8');
my $re = "";
foreach my $col (@schema) {
my ($name, $type) = @$col;
# The names of named captures cannot be escaped and allow for only
# a rather limited set of characters, so we do not escape them
# here.
my $inner = sprintf Type2ReTmpl->{$type}, $name;
$re .= ',' if $re;
$re .= "(?:NULL|$inner)";
}
# Keys where the value needs unescaping
my %unkeys = map { $_->[0] => 1 }
grep { NeedsUnescaping->{ $_->[1] } }
@schema;
tie my %row,
"MediaWiki::DumpFile::SQL::Callback::TiedPlus", tied %+, \%unkeys;
my $seen_insert = 0;
my $buffer = "";
# Read and parse until we have all parsable entries
while (!$io->eof) {
# Read in 1MB blocks
$io->read(my $tmp, 1024 * 1024);
die unless defined $tmp;
$buffer .= $tmp;
# Skip to the INSERT statemement
if (!$seen_insert && $buffer =~ s/.*?INSERT INTO .*? VALUES //s) {
$seen_insert = 1;
}
next unless $seen_insert;
my $cb = $self->{row};
# Use /c to memorize the position after match failure
while ($buffer =~ /\($re\)[,;]/gc) {
$cb->(\%row)
}
# Keep only the remainder
substr $buffer, 0, pos($buffer), '';
}
if ($self->{end_document}) {
$self->{end_document}->()
}
}
1;
Usage would go like this:
#!perl -w
use MediaWiki::DumpFile::SQL::Callback;
use strict;
use warnings;
use YAML;
my $wp = MediaWiki::DumpFile::SQL::Callback->new({
"row" => sub {
print YAML::Dump($_[0]);
}
});
$wp->parse_file('dewiki-20091208-page.sql');
Which would print
---
page_counter: 156
page_id: 1
page_is_new: 0
page_is_redirect: 0
page_latest: 67447568
page_len: 4260
page_namespace: 0
page_no_title_convert: 0
page_random: 0.0864337124735431
page_restrictions: ''
page_title: Alan_Smithee
page_touched: 20091130142149
---
page_counter: 271
page_id: 3
page_is_new: 0
page_is_redirect: 0
page_latest: 67126559
page_len: 11137
page_namespace: 0
page_no_title_convert: 0
page_random: 0.445624627587967
page_restrictions: ''
page_title: Actinium
page_touched: 20091205140714
...
(Tyler Riddle confirmed that my code is very much faster than his and
will look into improving performance of the original module, so I'll not
push a module like this onto CPAN for the moment).
--
Björn Höhrmann · mailto:bjoern@hoehrmann.de · http://bjoern.hoehrmann.de
Am Badedeich 7 · Telefon: +49(0)160/4415681 · http://www.bjoernsworld.de
25899 Dagebüll · PGP Pub. KeyID: 0xA4357E78 · http://www.websitedev.de/
Received on Monday, 4 January 2010 15:58:59 UTC