Faster `sdiff` based on `compact_diff` for Algorithm::Diff::XS

Hi,

  As noted in http://search.cpan.org/dist/Algorithm-Diff-XS v0.04, "only
the LCSidx function is optimized in XS at the moment, which means only
compact_diff will get significantly faster for large data sets, while
diff and sdiff will run in identical speed as Algorithm::Diff." Now the
`sdiff` function can be implemented on top of `compact_diff`. The code
below does that, including verification against random inputs by using
the `Algorithm::Diff::sdiff` function and comparing the result. Please
feel free to use the code under the same terms as Algorithm::Diff::XS. I
have seen speedups of an order of magnitude and greater.

  #!perl -w
  use Modern::Perl;
  use Algorithm::Diff::XS;
  use Test::More;
  use Data::Random qw/rand_chars/;
  use POSIX qw//;
  
  sub fast_sdiff {
    my $x = shift;
    my $y = shift;
    my $keyfunc = shift;
    my @keyargs = @_;
    my @cdiff;
    
    if ($keyfunc) {
      my @dx = map { $keyfunc->($_, @keyargs) } @$x;
      my @dy = map { $keyfunc->($_, @keyargs) } @$y;
      @cdiff = Algorithm::Diff::XS::compact_diff(\@dx, \@dy);
    } else {
      @cdiff = Algorithm::Diff::XS::compact_diff($x, $y);
    }
    
    _compact_diff_to_sdiff($x, $y, @cdiff);
  }
  
  sub _compact_diff_to_sdiff {
    my ($a, $b, @cdiff) = @_;
    my $MIN = -(POSIX::DBL_MAX);
    my @temp;
    my $add = sub {
      my ($op, $ax, $bx, $count) = @_;
      push @temp, [$op, $ax + $_, $bx + $_] for 0 .. $count - 1;
    };
  
    for (my $ix = 0; $ix < @cdiff - 2; $ix += 2) {
      my ($a_from, $b_from, $a2_from, $b2_from) =
        @cdiff[ $ix .. $ix + 3 ];
  
      my $a_len = ($a2_from - 1) - $a_from + 1;
      my $b_len = ($b2_from - 1) - $b_from + 1;
  
      if ($ix & 2) {
        # modified
        if ($a_from == $a2_from) {
          # addition
          $add->('+', $MIN, $b_from, $b_len);
        } elsif ($b_from == $b2_from) {
          # removal
          $add->('-', $a_from, $MIN, $a_len);
        } else {
          # change
          if ($a_len < $b_len) {
            $add->('c', $a_from, $b_from, $a_len);
            $add->('+', $MIN, $b_from + $a_len, $b_len - $a_len);
          } elsif ($a_len > $b_len) {
            $add->('c', $a_from, $b_from, $b_len);
            $add->('-', $a_from + $b_len, $MIN, $a_len - $b_len);
          } else {
            $add->('c', $a_from, $b_from, $a_len);
          }
        }
      } else {
        # unchanged
        $add->('u', $a_from, $b_from, $a_len);
      }
    }
  
    $_->[1] = $_->[1] >= 0 ? $a->[$_->[1]] : '' for @temp;
    $_->[2] = $_->[2] >= 0 ? $b->[$_->[2]] : '' for @temp;
    @temp;
  }
  
  for (0 .. 1000) {
    my @x = map { rand_chars(set => 'all', max => 26) } 0 .. 2;
    my @y = map { rand_chars(set => 'all', max => 26) } 0 .. 2;
    my @sdiff = fast_sdiff(\@x, \@y);
    my @sdiff_orig = Algorithm::Diff::XS::sdiff(\@x, \@y);
    is_deeply(\@sdiff, \@sdiff_orig,
      'fast_sdiff(...) == sdiff(...)');
  }
  
regards,
-- 
Björn Höhrmann · mailto:bjoern@hoehrmann.de · http://bjoern.hoehrmann.de
Am Badedeich 7 · Telefon: +49(0)160/4415681 · http://www.bjoernsworld.de
25899 Dagebüll · PGP Pub. KeyID: 0xA4357E78 · http://www.websitedev.de/ 

Received on Friday, 25 October 2013 19:56:26 UTC