W3C home > Mailing lists > Public > public-ws-desc-eds@w3.org > November 2005

2002/ws/desc/tools diffmk,NONE,1.1 diffmk.xml,NONE,1.1

From: Hugo Haas via cvs-syncmail <cvsmail@w3.org>
Date: Fri, 18 Nov 2005 15:44:28 +0000
To: public-ws-desc-eds@w3.org
Message-Id: <20051118154428.BD7C94F3BD@homer.w3.org>

Update of /sources/public/2002/ws/desc/tools
In directory homer:/tmp/cvs-serv5684

Added Files:
	diffmk diffmk.xml 
Log Message:
DiffMk

--- NEW FILE: diffmk.xml ---
<?xml version='1.0'?>
<!DOCTYPE diffmk SYSTEM "diffmk.dtd">
<!-- Markup profiles read by the diffmk script.

     Each <doctype> is looked up by its "name" and supplies:
       attribute  the attribute the script sets on marked-up nodes
       changed    the attribute value written for changed content
       added      the attribute value written for added content
       deleted    the attribute value written for deleted content

     Each <wrapper> names the element used to enclose a text node that has
     to be marked.  A wrapper with a "parent" attribute applies to text
     whose parent element has that name; a wrapper without "parent" is the
     fallback for every other parent. -->
<diffmk xmlns='http://www.sun.com/xml/diffmk'
        xmlns:xsi='http://www.w3.org/2000/10/XMLSchema-instance'
        xsi:schemaLocation='http://www.sun.com/xml/diffmk
                            diffmk.xsd'>

<!-- Profile "docbook": status goes in revisionflag; text is wrapped in
     <para> under the listed parents and in <phrase> everywhere else. -->
<doctype name="docbook"
         attribute="revisionflag"
         changed="changed"
         added="added"
         deleted="deleted">
  <wrapper element="phrase"/>
  <wrapper parent="article" element="para"/>
  <wrapper parent="section" element="para"/>
  <wrapper parent="sect1" element="para"/>
  <wrapper parent="sect2" element="para"/>
  <wrapper parent="sect3" element="para"/>
  <wrapper parent="sect4" element="para"/>
  <wrapper parent="sect5" element="para"/>
  <wrapper parent="refsect1" element="para"/>
  <wrapper parent="refsect2" element="para"/>
  <wrapper parent="refsect3" element="para"/>
</doctype>

<!-- Profile "xmlspec": status goes in diff (chg/add/del); text is wrapped
     in <p> under the listed parents and in <phrase> everywhere else. -->
<doctype name="xmlspec"
         attribute="diff"
         changed="chg"
         added="add"
         deleted="del">
  <wrapper element="phrase"/>
  <wrapper parent="status" element="p"/>
  <wrapper parent="div1" element="p"/>
  <wrapper parent="div2" element="p"/>
  <wrapper parent="div3" element="p"/>
  <wrapper parent="inform-div1" element="p"/>
  <wrapper parent="back" element="p"/>
</doctype>

<!-- Profile "xhtml": status goes in class; text is wrapped in <div>
     directly under <body> and in <span> everywhere else. -->
<doctype name="xhtml"
	 attribute="class"
         changed="diff-chg"
         added="diff-add"
         deleted="diff-del">
  <wrapper element="span"/>
  <wrapper parent="body" element="div"/>
</doctype>

</diffmk>

--- NEW FILE: diffmk ---
#!/usr/bin/perl -- # -*- Perl -*-

# diffmk -- Calculate the differences between two XML documents
# Copyright (C) 2000 Sun Microsystems, Inc., Norman Walsh
#
# For more information, see README.{xml,html} and refentry.{xml,html}

# TODO: Add option to ignore some markup (empty elements only?)
#       POD?

use XML::DOM;
use Algorithm::Diff qw(traverse_sequences);
use Getopt::Long;
use strict;

# Make STDOUT the selected handle and turn on autoflush for it.
select(STDOUT);
$| = 1;

# Identification strings and the usage message printed on bad invocations.
my $RCSID   = '$Id: diffmk,v 1.1 2005/11/18 15:44:26 hugo Exp $ ';
my $VERSION = '0.1';
my $usage   = "Usage: $0 [options] oldfile.xml newfile.xml\n";

# A single parser instance is reused for the configuration file and for
# both input documents.
my $parser  = XML::DOM::Parser->new(NoExpand => 0);

# %doctypes holds the markup profiles loaded from diffmk.xml, keyed by
# doctype name; $diffmkxml is the path of the diffmk.xml that was found
# (it stays undefined when none exists).
my %doctypes;
my $diffmkxml;

# Find the diffmk.xml data file.  Candidates are tried in priority order:
# .diffmk.xml in $HOME, then "<path of this script>.xml", then a
# diffmk.xml in each @INC directory.  The first one that is a plain file
# is used.
my @candidates = ($ENV{'HOME'} . "/.diffmk.xml", "$0.xml");
push(@candidates, map { "$_/diffmk.xml" } @INC);

foreach my $candidate (@candidates) {
    next unless -f $candidate;
    $diffmkxml = $candidate;
    last;
}

if ($diffmkxml) {
    # Load the diffmk.xml file; building the doctypes hash...
    #
    # my %doctypes = ('xmlspec' => { 'attr'    => 'diff',
    #                                'changed' => 'chg',
    #                                'added'   => 'add',
    #                                'deleted' => 'del',
    #                                'wrapper' => {'status' => 'p',
    #                                              'div1' => 'p',
    #                                              'div2' => 'p',
    #                                              'div3' => 'p',
    #                                              'inform-div1' => 'p',
    #                                              'back' => 'p',
    #                                              '*' => 'phrase'}},
    my $configDoc = $parser->parsefile($diffmkxml);
    my $configRoot = $configDoc->getDocumentElement();

    # Scalar assignment on purpose: we want the node list object.
    my $doctypeNodes = $configRoot->getElementsByTagName('doctype');

    foreach my $dtIndex (0 .. $doctypeNodes->getLength() - 1) {
        my $doctypeNode = $doctypeNodes->item($dtIndex);
        my $name = $doctypeNode->getAttribute('name');

        # The XML attribute is called "attribute" but is stored as 'attr';
        # the three value attributes keep their names.
        my %profile = ('wrapper' => {});
        $profile{'attr'} = $doctypeNode->getAttribute('attribute');
        foreach my $field ('changed', 'added', 'deleted') {
            $profile{$field} = $doctypeNode->getAttribute($field);
        }

        # A <wrapper> without a parent attribute is filed under '*'.
        my $wrapperNodes = $doctypeNode->getElementsByTagName('wrapper');
        foreach my $wIndex (0 .. $wrapperNodes->getLength() - 1) {
            my $wrapperNode = $wrapperNodes->item($wIndex);
            my $parentName = $wrapperNode->getAttribute('parent') || '*';
            $profile{'wrapper'}->{$parentName}
                = $wrapperNode->getAttribute('element');
        }

        $doctypes{$name} = \%profile;
    }
}

# Built-in defaults for every supported option.
my %option = (
    'debug'                  => 0,
    'verbose'                => 1,
    'diff'                   => 'text',
    'output'                 => '',
    'showdelete'             => '1',
    'includedeletedelements' => '0',
    'ignorewhitespace'       => '1',
    'usechanged'             => '1',
    'doctype'                => '',
    'attribute'              => '',
    'changed'                => '',
    'added'                  => '',
    'deleted'                => '',
    'wrapper'                => '',
);

# Command-line values are collected in a scratch hash first, so that only
# the options actually given override a default.
my %opt = ();
my @optionSpecs = qw(
    debug+ verbose+
    diff=s output=s
    showdelete! includedeletedelements! ignorewhitespace! usechanged!
    doctype=s attribute=s changed=s added=s deleted=s wrapper=s
);
&GetOptions(\%opt, @optionSpecs) || die $usage;

foreach my $given (grep { exists($opt{$_}) } keys %option) {
    $option{$given} = $opt{$given};
}

# check the options...
die "Unknown diff type: " . $option{'diff'} . "\n$usage"
    unless grep { $option{'diff'} eq $_ } ('text', 'element', 'both');

# Markup parameters.  They start out undefined, are filled in from the
# selected doctype profile (if any), and can then be overridden one by one
# from the command line.
my $diffType = $option{'diff'};
my ($diffAttr, $diffChanged, $diffAdded, $diffDeleted, $diffWrap);
my $showDelete = $option{'showdelete'};

# Running totals reported at the end of the run.
my ($countAdded, $countChanged, $countDeleted) = (0, 0, 0);

if ($option{'doctype'}) {
    my $chosen = $option{'doctype'};

    die "Unknown doctype: " . $chosen . "\n$usage"
        if !exists($doctypes{$chosen});

    ($diffAttr, $diffChanged, $diffAdded, $diffDeleted, $diffWrap)
        = @{ $doctypes{$chosen} }{'attr', 'changed', 'added', 'deleted',
                                  'wrapper'};
}

# An explicit (true) command-line value wins over the profile.
foreach my $override ([\$diffAttr,    'attribute'],
                      [\$diffChanged, 'changed'],
                      [\$diffAdded,   'added'],
                      [\$diffDeleted, 'deleted'],
                      [\$diffWrap,    'wrapper']) {
    my ($target, $key) = @{$override};
    ${$target} = $option{$key} if $option{$key};
}

die "Some markup parameter(s) undefined.\n$usage"
    if grep { !defined($_) } ($diffAttr, $diffChanged, $diffAdded,
                              $diffDeleted, $diffWrap);

# The two input documents are mandatory positional arguments.
my $file1 = shift(@ARGV) || die $usage;
my $file2 = shift(@ARGV) || die $usage;

# Output target: the output option if given, otherwise a third positional
# argument, otherwise "-".
my $output = "-";

if ($option{'output'}) {
    $output = $option{'output'};
} elsif (@ARGV) {
    $output = shift(@ARGV);
}

# Parse both documents.
print "Loading $file1...\n";
my $dom1 = $parser->parsefile($file1);

print "Loading $file2...\n";
my $dom2 = $parser->parsefile($file2);

# Flatten each document into the list of nodes that will be compared.
print "Calculating node list for $file1...";
my @elements1 = &inorder_nodes($dom1->getDocumentElement());
print scalar(@elements1), " nodes\n";

print "Calculating node list for $file2...";
my @elements2 = &inorder_nodes($dom2->getDocumentElement());
print scalar(@elements2), " nodes\n";

# The diff callbacks append one record per step to @actions.
my @actions = ();

print "Calculating differences...\n";
my %diffCallbacks = (
    MATCH     => \&diff_match,
    DISCARD_A => \&diff_discard_a,
    DISCARD_B => \&diff_discard_b,
);
&traverse_sequences(\@elements1, \@elements2, \%diffCallbacks,
                    \&diff_gen_key);

# With debugging on, dump the raw action list and both node lists.
if ($option{'debug'}) {
    print "Actions:\n";
    &show_actions(@actions);

    foreach my $listing (["List 1:\n", \@elements1],
                         ["List 2:\n", \@elements2]) {
        my ($heading, $nodes) = @{$listing};
        print $heading;
        &show_sequence(@{$nodes});
    }
}

# Optionally turn balanced delete/add runs into changes, then collapse
# runs of matches.
if ($option{'usechanged'}) {
    @actions = &find_changes(@actions);
}

@actions = &trim_matches(@actions);

if ($option{'debug'}) {
    print "Actions (after merge):\n";
    &show_actions(@actions);
}

# now apply the actions to $dom2
#
# Every action pairs an index into @elements1 ('a') with an index into
# @elements2 ('b'):
#   match     nothing to do
#   disca     node exists only in the old document; if deletions are shown
#             a copy is inserted into $dom2 and marked as deleted
#   discb     node exists only in the new document; marked as added
#   changeab  produced by find_changes; the new node is marked as changed
#
# Either index can point one past the end of its list (a deletion after
# the last new node, or an addition after the last old node), in which
# case the corresponding node is undefined.

# Name of the element used to wrap a text node whose parent is $parent:
# the plain string from the wrapper option, or the profile entry for the
# parent's tag name with '*' as fallback.
my $wrapperNameFor = sub {
    my $parent = shift;

    return $diffWrap if ref $diffWrap ne 'HASH';
    return $diffWrap->{$parent->getTagName()} || $diffWrap->{'*'};
};

for (my $count = 0; $count <= $#actions; $count++) {
    my $action = $actions[$count];

    next if $action->{'action'} eq 'match';

    my $node1 = $elements1[$action->{'a'}];
    my $node2 = $elements2[$action->{'b'}];

    if ($option{'debug'}) {
        print "Processing action $count [", $action->{'action'}, "]:\n";
        print "\tNode1: ";
        if (defined($node1)) {
            &show_node($node1);
        } else {
            print "(none)\n";
        }
        print "\tNode2: ";
        if (defined($node2)) {
            &show_node($node2);
        } else {
            print "(none)\n";
        }
    }

    # disca is a special case, we have to add a 'deleted' text node
    # into dom2
    if ($action->{'action'} eq 'disca') {
        if ($showDelete) {
            # whitespace?
            if ($node1->getNodeType() == XML::DOM::TEXT_NODE) {
                my $text = $node1->getData();
                if ($text =~ /^\s*$/s && $option{'ignorewhitespace'}) {
                    next;
                }
            }

            # Wait, what if this node is inside a node that's already
            # been copied over and marked deleted. Then don't bother
            # copying this one too...
            my $ancestor = $node1->getParentNode();
            my $skip = 0;
            while (!$skip
                   && $ancestor
                   && $ancestor->getNodeType() == XML::DOM::ELEMENT_NODE) {
                $skip = $ancestor->getAttribute($diffAttr) eq $diffDeleted;
                $ancestor = $ancestor->getParentNode();
            }

            next if $skip;

            # The deleted content goes immediately before $node2 (yes,
            # node2: we are editing the new document).  When the deletion
            # follows the last node of the new document there is no
            # $node2, so it is appended to the document element instead.
            my $parent = defined($node2)
                ? $node2->getParentNode()
                : $dom2->getDocumentElement();
            my $insertDeleted = sub {
                my $newNode = shift;

                if (defined($node2)) {
                    $parent->insertBefore($newNode, $node2);
                } else {
                    $parent->appendChild($newNode);
                }
            };

            # if node1 is a text node, or we're exposing deleted elements,
            # go ahead and insert it...
            if ($node1->getNodeType() == XML::DOM::TEXT_NODE) {
                my $textNode = $dom2->createTextNode($node1->getData());
                my $wrap = $dom2->createElement($wrapperNameFor->($parent));

                $insertDeleted->($wrap);
                $wrap->appendChild($textNode);
                $wrap->setAttribute($diffAttr, $diffDeleted);
                $countDeleted++;
            } elsif ($node1->getNodeType() == XML::DOM::ELEMENT_NODE
                     && $option{'includedeletedelements'}) {
                my $elemNode = $node1->cloneNode(1);

                # every node in @elements2 belongs to $dom2
                $elemNode->setOwnerDocument($dom2);
                $elemNode->setAttribute($diffAttr, $diffDeleted);
                $countDeleted++;

                # hack: mark node1 as "deleted" so we can tell when we're
                # looking at its children
                $node1->setAttribute($diffAttr, $diffDeleted);

                $insertDeleted->($elemNode);
            }
        }
        next;
    }

    # whitespace?
    if ($node2->getNodeType() == XML::DOM::TEXT_NODE) {
        my $text = $node2->getData();
        if ($text =~ /^\s*$/s && $option{'ignorewhitespace'}) {
            next;
        }
    }

    # If $node2 is a text node, and it is the only child of an element,
    # then this action can safely be applied to the parent element
    if ($node2->getNodeType() == XML::DOM::TEXT_NODE
        && !defined($node2->getPreviousSibling())
        && !defined($node2->getNextSibling())) {
        $node2 = $node2->getParentNode();
    }

    # if we're adding something, and one of our parents is already
    # marked as an addition; we don't need to mark this one...
    if ($action->{'action'} eq 'discb') {
        my $ancestor = $node2->getParentNode();
        my $found = 0;
        while (!$found && $ancestor) {
            $found = 1
                if ($ancestor->getNodeType() == XML::DOM::ELEMENT_NODE
                    && $ancestor->getAttribute($diffAttr) eq $diffAdded);
            $ancestor = $ancestor->getParentNode();
        }
        next if $found;
    }

    # If $node2 is still a text node, wrap it in an element
    if ($node2->getNodeType() == XML::DOM::TEXT_NODE) {
        my $parent = $node2->getParentNode();
        my $wrap = $dom2->createElement($wrapperNameFor->($parent));

        $parent->insertBefore($wrap, $node2);
        $node2 = $parent->removeChild($node2);
        $wrap->appendChild($node2);
        $node2 = $wrap;
    }

    if ($action->{'action'} eq 'discb') {
        $node2->setAttribute($diffAttr, $diffAdded);
        $countAdded++;
    } elsif ($action->{'action'} eq 'changeab') {
        $node2->setAttribute($diffAttr, $diffChanged);
        $countChanged++;
    } else {
        die "Unexpected action: " . $action->{'action'} . "\n";
    }
}

# Add our PI to the dom.  It records the version, both input file names,
# the markup parameters and the option values used for this run.
my $pidata = "version='$VERSION'\n"
    . "\toldfile='$file1'\n"
    . "\tnewfile='$file2'\n"
    . "\tattribute='$diffAttr'\n"
    . "\tchanged='$diffChanged'\n"
    . "\tadded='$diffAdded'\n"
    . "\tdeleted='$diffDeleted'\n";

# A wrapper taken from a doctype profile is a hash and is not recorded.
$pidata .= "\twrapper='$diffWrap'\n" if ref $diffWrap ne 'HASH';

foreach my $name ('diff', 'showdelete', 'includedeletedelements',
                  'ignorewhitespace', 'usechanged') {
    $pidata .= "\t$name='" . $option{$name} . "'\n";
}

my $pi = $dom2->createProcessingInstruction('diffmk', $pidata);
$dom2->insertBefore($pi, $dom2->getDocumentElement());

# Write the marked-up document.  "-" means standard output; it is
# duplicated rather than opened by name so that closing our handle leaves
# STDOUT usable for the summary below.  Failures are fatal instead of
# silently producing no output.
my $out;
if ($output eq '-') {
    open($out, '>&', \*STDOUT)
        || die "Cannot write to standard output: $!\n";
} else {
    open($out, '>', $output)
        || die "Cannot open $output for writing: $!\n";
}
$dom2->printToFileHandle($out);
close($out) || die "Error while writing $output: $!\n";

print "Diff'd: $countAdded additions, $countDeleted deletions, and ";
print "$countChanged changes.\n";

# ======================================================================
# Functions to build and traverse the lists

# inorder_nodes($node, @nodes)
#
# Returns @nodes followed by the nodes of the subtree rooted at $node, in
# document order.  Which node types are listed depends on $diffType:
# elements for 'element', text nodes for 'text', both kinds for 'both'.
# Children are only visited below element nodes.
sub inorder_nodes {
    my $node = shift;
    my @nodes = @_;

    # Collect through a reference so the growing list is not copied into
    # and out of every recursive call.
    _collect_inorder_nodes($node, \@nodes);

    return @nodes;
}

# Helper for inorder_nodes: appends the selected nodes of the subtree
# rooted at $node to the array referenced by $found.
sub _collect_inorder_nodes {
    my ($node, $found) = @_;

    my $type = $node->getNodeType();
    my $wanted
        = ($type == XML::DOM::ELEMENT_NODE
           && ($diffType eq 'element' || $diffType eq 'both'))
       || ($type == XML::DOM::TEXT_NODE
           && ($diffType eq 'text' || $diffType eq 'both'));

    push(@{$found}, $node) if $wanted;

    if ($type == XML::DOM::ELEMENT_NODE) {
        for (my $child = $node->getFirstChild();
             $child;
             $child = $child->getNextSibling()) {
            _collect_inorder_nodes($child, $found);
        }
    }

    return;
}

# MATCH callback for traverse_sequences: records that the node at index
# $oldIndex of the old list matches the node at $newIndex of the new list.
# Any further arguments are ignored.
sub diff_match {
    my ($oldIndex, $newIndex) = @_;

    push(@actions, { 'action' => 'match',
                     'a'      => $oldIndex,
                     'b'      => $newIndex });
}

# DISCARD_A callback for traverse_sequences: records that the node at
# index $oldIndex of the old list has no counterpart in the new list;
# $newIndex is the current position in the new list.  Any further
# arguments are ignored.
sub diff_discard_a {
    my ($oldIndex, $newIndex) = @_;

    push(@actions, { 'action' => 'disca',
                     'a'      => $oldIndex,
                     'b'      => $newIndex });
}

# DISCARD_B callback for traverse_sequences: records that the node at
# index $newIndex of the new list has no counterpart in the old list;
# $oldIndex is the current position in the old list.  Any further
# arguments are ignored.
sub diff_discard_b {
    my ($oldIndex, $newIndex) = @_;

    push(@actions, { 'action' => 'discb',
                     'a'      => $oldIndex,
                     'b'      => $newIndex });
}

# diff_gen_key($node)
#
# Returns the string used to compare nodes: two nodes are treated as equal
# when their keys are equal.
#   - Element nodes: 'E: ' followed by the tag name and every attribute as
#     " name=value", sorted by attribute name so that the order in which
#     the attributes were written does not matter.  Runs of whitespace in
#     attribute values are collapsed to one space when the
#     ignorewhitespace option is on.
#   - Any other node (text): 'T: ' followed by the node's data with every
#     run of whitespace collapsed to one space.
sub diff_gen_key {
    my $elem = shift;

    if ($elem->getNodeType() != XML::DOM::ELEMENT_NODE) {
        # text nodes are the same if they're equal, modulo whitespace
        my $text = $elem->getData();
        $text =~ s/\s+/ /sg; # normalize all space...
        return 'T: ' . $text;
    }

    # element nodes are the same if they've got the same name
    # and attribute values
    my $attrs = $elem->getAttributes();
    my @pairs = ();
    foreach my $index (0 .. $attrs->getLength() - 1) {
        my $attr = $attrs->item($index);
        my $val = $attr->getValue();

        # normalize all space, if we're ignoring whitespace
        $val =~ s/\s+/ /sg if $option{'ignorewhitespace'};

        push(@pairs, [$attr->getName(), $val]);
    }

    my $elemkey = $elem->getTagName();
    foreach my $pair (sort { $a->[0] cmp $b->[0] } @pairs) {
        $elemkey .= " $pair->[0]=$pair->[1]";
    }

    return 'E: ' . $elemkey;
}

# ======================================================================
# These are debugging functions...

# Debugging aid: prints one line per action in the form
# "NNN: <action> <a> <b>", numbered from 000, followed by a blank line.
sub show_actions {
    my @steps = @_;

    foreach my $index (0 .. $#steps) {
        my $step = $steps[$index];

        printf "%03d: ", $index;
        print join(" ", $step->{'action'}, $step->{'a'}, $step->{'b'}),
              "\n";
    }

    print "\n";
}

# Debugging aid: prints every node of a sequence on its own line, prefixed
# with its three-digit index, followed by a blank line.
sub show_sequence {
    my @seq = @_;

    foreach my $index (0 .. $#seq) {
        printf "%03d: ", $index;
        &show_node($seq[$index]);
    }

    print "\n";
}

# Debugging aid: prints the comparison key of a node on one line.  The key
# of an element is printed in full; for any other node only the first 70
# characters of the key are shown.
sub show_node {
    my $elem = shift;

    my $isElement = $elem->getNodeType() == XML::DOM::ELEMENT_NODE;
    my $label = &diff_gen_key($elem);

    $label = substr($label, 0, 70) unless $isElement;

    print $label;
    print "\n";
}


# ======================================================================
# Functions to manipulate the @action list

# find_changes(@actions)
#
# Rewrites the action list so that a balanced run of deletions and
# additions directly after a match becomes a run of changes.
#
# The sequence 'match', 'disca'+, 'discb'+, 'match' is a change...
#
# When balanced_pairs reports a run of N 'disca' actions followed by at
# least N 'discb' actions, the N deletions are turned into 'changeab'
# actions and the first N additions are consumed.  The i-th change keeps
# the 'a' index of the i-th deletion and takes the 'b' index of the i-th
# addition, so old and new nodes are paired in document order.  Additions
# beyond the first N stay in the list unchanged.  A run that is not
# preceded by a match (at the very start of the list) is left alone.
#
# The deletion records are modified in place; the new list is returned.
sub find_changes {
    my @actions = @_;
    my @newactions = ();

    while (@actions) {
        my $action = shift @actions;
        push(@newactions, $action);

        next if $action->{'action'} ne 'match';

        # maybe we're at the start of a change...
        my $runlength = &balanced_pairs(@actions);
        next if $runlength <= 0;

        my @deletions = splice(@actions, 0, $runlength);
        my @additions = splice(@actions, 0, $runlength);

        foreach my $index (0 .. $runlength - 1) {
            my $change = $deletions[$index];
            $change->{'action'} = 'changeab';
            $change->{'b'} = $additions[$index]->{'b'};
            push(@newactions, $change);
        }
    }

    return @newactions;
}

# balanced_pairs(@actions)
#
# Looks at the start of the given action list.  If it begins with N > 0
# 'disca' actions immediately followed by at least N 'discb' actions,
# returns N; otherwise returns 0.
sub balanced_pairs {
    my @pending = @_;

    # length of the leading run of deletions
    my $deleted = 0;
    $deleted++
        while ($deleted < @pending
               && $pending[$deleted]->{'action'} eq 'disca');

    return 0 if $deleted == 0;

    # count additions right after that run, but no more than are needed
    my $added = 0;
    $added++
        while ($added < $deleted
               && $deleted + $added < @pending
               && $pending[$deleted + $added]->{'action'} eq 'discb');

    return $added == $deleted ? $deleted : 0;
}

# trim_matches(@actions)
#
# Returns the action list with every run of consecutive 'match' actions
# reduced to a single one: the last match of the run.  All other actions
# are passed through in order.
sub trim_matches {
    my @trimmed = ();

    foreach my $step (@_) {
        if (@trimmed
            && $step->{'action'} eq 'match'
            && $trimmed[-1]->{'action'} eq 'match') {
            # still inside a run of matches: keep only the newest one
            $trimmed[-1] = $step;
        } else {
            push(@trimmed, $step);
        }
    }

    return @trimmed;
}

# ======================================================================
Received on Friday, 18 November 2005 15:44:46 GMT

This archive was generated by hypermail 2.2.0 + w3c-0.30 : Friday, 18 November 2005 15:44:48 GMT