- From: Hugo Haas via cvs-syncmail <cvsmail@w3.org>
- Date: Fri, 18 Nov 2005 15:44:28 +0000
- To: public-ws-desc-eds@w3.org
Update of /sources/public/2002/ws/desc/tools
In directory homer:/tmp/cvs-serv5684
Added Files:
diffmk diffmk.xml
Log Message:
DiffMk
--- NEW FILE: diffmk.xml ---
<?xml version='1.0'?>
<!DOCTYPE diffmk SYSTEM "diffmk.dtd">
<diffmk xmlns='http://www.sun.com/xml/diffmk'
xmlns:xsi='http://www.w3.org/2000/10/XMLSchema-instance'
xsi:schemaLocation='http://www.sun.com/xml/diffmk
diffmk.xsd'>
<doctype name="docbook"
attribute="revisionflag"
changed="changed"
added="added"
deleted="deleted">
<wrapper element="phrase"/>
<wrapper parent="article" element="para"/>
<wrapper parent="section" element="para"/>
<wrapper parent="sect1" element="para"/>
<wrapper parent="sect2" element="para"/>
<wrapper parent="sect3" element="para"/>
<wrapper parent="sect4" element="para"/>
<wrapper parent="sect5" element="para"/>
<wrapper parent="refsect1" element="para"/>
<wrapper parent="refsect2" element="para"/>
<wrapper parent="refsect3" element="para"/>
</doctype>
<doctype name="xmlspec"
attribute="diff"
changed="chg"
added="add"
deleted="del">
<wrapper element="phrase"/>
<wrapper parent="status" element="p"/>
<wrapper parent="div1" element="p"/>
<wrapper parent="div2" element="p"/>
<wrapper parent="div3" element="p"/>
<wrapper parent="inform-div1" element="p"/>
<wrapper parent="back" element="p"/>
</doctype>
<doctype name="xhtml"
attribute="class"
changed="diff-chg"
added="diff-add"
deleted="diff-del">
<wrapper element="span"/>
<wrapper parent="body" element="div"/>
</doctype>
</diffmk>
--- NEW FILE: diffmk ---
#!/usr/bin/perl -- # -*- Perl -*-
# diffmk -- Calculate the differences between two XML documents
# Copyright (C) 2000 Sun Microsystems, Inc., Norman Walsh
#
# For more information, see README.{xml,html} and refentry.{xml,html}
# TODO: Add option to ignore some markup (empty elements only?)
# POD?
use XML::DOM;
use Algorithm::Diff qw(traverse_sequences);
use Getopt::Long;
use strict;
select(STDOUT); $| = 1;
my $RCSID = '$Id: diffmk,v 1.1 2005/11/18 15:44:26 hugo Exp $ ';
my $VERSION = '0.1';
my $usage = "Usage: $0 [options] oldfile.xml newfile.xml\n";
my $parser = new XML::DOM::Parser (NoExpand => 0);
my %doctypes = ();
my $diffmkxml = undef;
# Find the diffmk.xml data file...
foreach my $file ($ENV{'HOME'} . "/.diffmk.xml",
"$0.xml") {
$diffmkxml = $file if -f $file && !defined($diffmkxml);
}
foreach my $dir (@INC) {
my $file = "$dir/diffmk.xml";
$diffmkxml = $file if -f $file && !defined($diffmkxml);
}
if ($diffmkxml) {
# Load the diffmk.xml file; building the doctypes hash...
#
# my %doctypes = ('xmlspec' => { 'attr' => 'diff',
# 'changed' => 'chg',
# 'added' => 'add',
# 'deleted' => 'del',
# 'wrapper' => {'status' => 'p',
# 'div1' => 'p',
# 'div2' => 'p',
# 'div3' => 'p',
# 'inform-div1' => 'p',
# 'back' => 'p',
# '*' => 'phrase'}},
my $diffmkdom = $parser->parsefile($diffmkxml);
my $diffmk = $diffmkdom->getDocumentElement();
my $doctypelist = $diffmk->getElementsByTagName('doctype');
for (my $count = 0; $count < $doctypelist->getLength(); $count++) {
my $doctype = $doctypelist->item($count);
my $name = $doctype->getAttribute('name');
$doctypes{$name} = {};
$doctypes{$name}->{'attr'} = $doctype->getAttribute('attribute');
$doctypes{$name}->{'changed'} = $doctype->getAttribute('changed');
$doctypes{$name}->{'added'} = $doctype->getAttribute('added');
$doctypes{$name}->{'deleted'} = $doctype->getAttribute('deleted');
$doctypes{$name}->{'wrapper'} = {};
my $wrapperlist = $doctype->getElementsByTagName('wrapper');
for (my $wcount = 0; $wcount < $wrapperlist->getLength(); $wcount++) {
my $wrapper = $wrapperlist->item($wcount);
my $parent = $wrapper->getAttribute('parent') || '*';
my $element = $wrapper->getAttribute('element');
$doctypes{$name}->{'wrapper'}->{$parent} = $element;
}
}
}
my %option = ('debug' => 0,
'verbose' => 1,
'diff' => 'text',
'output' => '',
'showdelete' => '1',
'includedeletedelements' => '0',
'ignorewhitespace' => '1',
'usechanged' => '1',
'doctype' => '',
'attribute' => '',
'changed' => '',
'added' => '',
'deleted' => '',
'wrapper' => '');
my %opt = ();
&GetOptions(\%opt,
'debug+',
'verbose+',
'diff=s',
'output=s',
'showdelete!',
'includedeletedelements!',
'ignorewhitespace!',
'usechanged!',
'doctype=s',
'attribute=s',
'changed=s',
'added=s',
'deleted=s',
'wrapper=s') || die $usage;
foreach my $key (keys %option) {
$option{$key} = $opt{$key} if exists($opt{$key});
}
# check the options...
die "Unknown diff type: " . $option{'diff'} . "\n$usage"
if ($option{'diff'} ne 'text'
&& $option{'diff'} ne 'element'
&& $option{'diff'} ne 'both');
my $diffType = $option{'diff'};
my $diffAttr = undef;
my $diffChanged = undef;
my $diffAdded = undef;
my $diffDeleted = undef;
my $diffWrap = undef;
my $showDelete = $option{'showdelete'};
my $countAdded = 0;
my $countChanged = 0;
my $countDeleted = 0;
if ($option{'doctype'}) {
die "Unknown doctype: " . $option{'doctype'} . "\n$usage"
if !exists($doctypes{$option{'doctype'}});
$diffAttr = $doctypes{$option{'doctype'}}->{'attr'};
$diffChanged = $doctypes{$option{'doctype'}}->{'changed'};
$diffAdded = $doctypes{$option{'doctype'}}->{'added'};
$diffDeleted = $doctypes{$option{'doctype'}}->{'deleted'};
$diffWrap = $doctypes{$option{'doctype'}}->{'wrapper'};
}
$diffAttr = $option{'attribute'} if $option{'attribute'};
$diffChanged = $option{'changed'} if $option{'changed'};
$diffAdded = $option{'added'} if $option{'added'};
$diffDeleted = $option{'deleted'} if $option{'deleted'};
$diffWrap = $option{'wrapper'} if $option{'wrapper'};
die "Some markup parameter(s) undefined.\n$usage"
if (!defined($diffAttr)
|| !defined($diffChanged)
|| !defined($diffAdded)
|| !defined($diffDeleted)
|| !defined($diffWrap));
my $file1 = shift @ARGV || die $usage;
my $file2 = shift @ARGV || die $usage;
my $output = undef;
if ($option{'output'}) {
$output = $option{'output'};
} else {
if (@ARGV) {
$output = shift @ARGV;
} else {
$output = "-";
}
}
print "Loading $file1...\n";
my $dom1 = $parser->parsefile($file1);
print "Loading $file2...\n";
my $dom2 = $parser->parsefile($file2);
print "Calculating node list for $file1...";
my @elements1 = &inorder_nodes($dom1->getDocumentElement());
printf "%d nodes\n", $#elements1+1;
print "Calculating node list for $file2...";
my @elements2 = &inorder_nodes($dom2->getDocumentElement());
printf "%d nodes\n", $#elements2+1;
my @actions = ();
print "Calculating differences...\n";
&traverse_sequences(\@elements1, \@elements2,
{ MATCH => \&diff_match,
DISCARD_A => \&diff_discard_a,
DISCARD_B => \&diff_discard_b,
},
\&diff_gen_key);
if ($option{'debug'}) {
print "Actions:\n";
&show_actions(@actions);
print "List 1:\n";
&show_sequence(@elements1);
print "List 2:\n";
&show_sequence(@elements2);
}
@actions = &find_changes(@actions) if $option{'usechanged'};
@actions = &trim_matches(@actions);
if ($option{'debug'}) {
print "Actions (after merge):\n";
&show_actions(@actions);
}
# now apply the actions to $dom2
for (my $count = 0; $count <= $#actions; $count++) {
my $action = $actions[$count];
next if $action->{'action'} eq 'match';
my $node1 = $elements1[$action->{'a'}];
my $node2 = $elements2[$action->{'b'}];
if ($option{'debug'}) {
print "Processing action $count [", $action->{'action'}, "]:\n";
print "\tNode1: ";
&show_node($node1);
print "\tNode2: ";
&show_node($node1);
}
# disca is a special case, we have to add a 'deleted' text node
# into dom2
if ($action->{'action'} eq 'disca') {
if ($showDelete) {
# whitespace?
if ($node1->getNodeType() == XML::DOM::TEXT_NODE) {
my $text = $node1->getData();
if ($text =~ /^\s*$/sg && $option{'ignorewhitespace'}) {
next;
}
}
# Wait, what if this node is inside a node that's already
# been copied over and marked deleted. Then don't bother
# copying this one too...
my $parent = $node1->getParentNode();
my $skip = 0;
while (!$skip
&& $parent->getNodeType() == XML::DOM::ELEMENT_NODE) {
$skip = $parent->getAttribute($diffAttr) eq $diffDeleted;
$parent = $parent->getParentNode();
}
next if $skip;
# if node1 is a text node, or we're exposing deleted elements,
# go ahead and insert it...
if ($node1->getNodeType() == XML::DOM::TEXT_NODE) {
my $textNode = $dom2->createTextNode($node1->getData());
my $parent = $node2->getParentNode(); # yes, node2
my $wrapName = $diffWrap;
if (ref $diffWrap eq 'HASH') {
$wrapName = $diffWrap->{'*'};
$wrapName = $diffWrap->{$parent->getTagName()}
if $diffWrap->{$parent->getTagName()}
}
my $wrap = $dom2->createElement($wrapName);
if (!$skip) {
$parent->insertBefore($wrap, $node2);
$wrap->appendChild($textNode);
$wrap->setAttribute($diffAttr, $diffDeleted);
$countDeleted++;
}
} elsif ($node1->getNodeType == XML::DOM::ELEMENT_NODE
&& $option{'includedeletedelements'}) {
my $elemNode = $node1->cloneNode(1);
my $parent = $node2->getParentNode();
$elemNode->setOwnerDocument($node2->getOwnerDocument());
$elemNode->setAttribute($diffAttr, $diffDeleted);
$countDeleted++;
# hack: mark node1 as "deleted" so we can tell when we're
# looking at its children
$node1->setAttribute($diffAttr, $diffDeleted);
$parent->insertBefore($elemNode, $node2);
}
}
next;
}
# whitespace?
if ($node2->getNodeType() == XML::DOM::TEXT_NODE) {
my $text = $node2->getData();
if ($text =~ /^\s*$/sg && $option{'ignorewhitespace'}) {
next;
}
}
# If $node2 is a text node, and it is the only child of an element,
# then this action can safely be applied to the parent element
if ($node2->getNodeType() == XML::DOM::TEXT_NODE
&& $node2->getPreviousSibling() == undef
&& $node2->getNextSibling() == undef) {
$node2 = $node2->getParentNode();
}
# if we're adding something, and one of our parents is already
# marked as an addition; we don't need to mark this one...
if ($action->{'action'} eq 'discb') {
my $parent = $node2->getParentNode();
my $found = 0;
while (!$found && $parent) {
$found = 1 if ($parent->getNodeType() == XML::DOM::ELEMENT_NODE
&& $parent->getAttribute($diffAttr) eq $diffAdded);
$parent = $parent->getParentNode();
}
next if $found;
}
# If $node2 is still a text node, wrap it in an element
if ($node2->getNodeType() == XML::DOM::TEXT_NODE) {
my $parent = $node2->getParentNode();
my $wrapName = $diffWrap;
if (ref $diffWrap eq 'HASH') {
$wrapName = $diffWrap->{'*'};
$wrapName = $diffWrap->{$parent->getTagName()}
if $diffWrap->{$parent->getTagName()}
}
my $wrap = $dom2->createElement($wrapName);
$parent->insertBefore($wrap, $node2);
$node2 = $parent->removeChild($node2);
$wrap->appendChild($node2);
$node2 = $wrap;
}
if ($action->{'action'} eq 'discb') {
$node2->setAttribute($diffAttr, $diffAdded);
$countAdded++;
} elsif ($action->{'action'} eq 'changeab') {
$node2->setAttribute($diffAttr, $diffChanged);
$countChanged++;
} else {
die "Unexpected action: " . $action->{'action'} . "\n";
}
}
# Add our PI to the dom
my $pidata = "version='$VERSION'\n"
. "\toldfile='$file1'\n"
. "\tnewfile='$file2'\n"
. "\tattribute='$diffAttr'\n"
. "\tchanged='$diffChanged'\n"
. "\tadded='$diffAdded'\n"
. "\tdeleted='$diffDeleted'\n";
$pidata .= "\twrapper='$diffWrap'\n" if ref $diffWrap ne 'HASH';
foreach my $opt ('diff', 'showdelete', 'includedeletedelements',
'ignorewhitespace', 'usechanged') {
$pidata .= "\t$opt='" . $option{$opt} . "'\n";
}
my $pi = $dom2->createProcessingInstruction('diffmk', $pidata);
$dom2->insertBefore($pi, $dom2->getDocumentElement());
open (OUT, ">$output");
$dom2->printToFileHandle(\*OUT);
close (OUT);
print "Diff'd: $countAdded additions, $countDeleted deletions, and ";
print "$countChanged changes.\n";
# ======================================================================
# Functions to build and traverse the lists
sub inorder_nodes {
my $node = shift;
my @nodes = @_;
if (($node->getNodeType() == XML::DOM::ELEMENT_NODE
&& ($diffType eq 'element' || $diffType eq 'both'))
|| ($node->getNodeType() == XML::DOM::TEXT_NODE
&& ($diffType eq 'text' || $diffType eq 'both'))) {
push (@nodes, $node);
}
if ($node->getNodeType() == XML::DOM::ELEMENT_NODE) {
my $child = $node->getFirstChild();
while ($child) {
@nodes = &inorder_nodes($child, @nodes);
$child = $child->getNextSibling();
}
}
return @nodes;
}
sub diff_match {
my $a_index = shift;
my $b_index = shift;
my @rest = @_;
my $action = {};
$action->{'action'} = 'match';
$action->{'a'} = $a_index;
$action->{'b'} = $b_index;
push (@actions, $action);
}
sub diff_discard_a {
my $a_index = shift;
my $b_index = shift;
my @rest = @_;
my $action = {};
$action->{'action'} = 'disca';
$action->{'a'} = $a_index;
$action->{'b'} = $b_index;
push (@actions, $action);
}
sub diff_discard_b {
my $a_index = shift;
my $b_index = shift;
my @rest = @_;
my $action = {};
$action->{'action'} = 'discb';
$action->{'a'} = $a_index;
$action->{'b'} = $b_index;
push (@actions, $action);
}
sub diff_gen_key {
my $elem = shift;
if ($elem->getNodeType() == XML::DOM::ELEMENT_NODE) {
# element nodes are the same if they've got the same name
# and attribute values
my $elemkey = $elem->getTagName();
my $attr = $elem->getAttributes();
for (my $count = 0; $count < $attr->getLength(); $count++) {
my $name = $attr->item($count)->getName();
my $val = $attr->item($count)->getValue();
# normalize all space, if we're ignoring whitespace
$val =~ s/\s+/ /sg if $option{'ignorewhitespace'};
$elemkey .= " $name=$val";
}
return 'E: ' . $elemkey;
} else {
# text nodes are the same if they're equal, modulo whitespace
my $text = $elem->getData();
$text =~ s/\s+/ /sg; # normalize all space...
return 'T: ' . $text;
}
}
# ======================================================================
# These are debugging functions...
sub show_actions {
my @actions = @_;
my $count = 0;
foreach my $act (@actions) {
printf "%03d: ", $count;
$count++;
print $act->{'action'}, " ", $act->{'a'}, " ", $act->{'b'}, "\n";
}
print "\n";
}
sub show_sequence {
my @seq = @_;
my $count = 0;
for (my $count = 0; $count <= $#seq; $count++) {
my $elem = $seq[$count];
printf "%03d: ", $count;
&show_node($elem);
}
print "\n";
}
sub show_node {
my $elem = shift;
if ($elem->getNodeType() == XML::DOM::ELEMENT_NODE) {
print &diff_gen_key($elem);
} else {
my $text = &diff_gen_key($elem);
$text = substr($text, 0, 70);
print $text;
}
print "\n";
}
# ======================================================================
# Functions to manipulate the @action list
sub find_changes {
my @actions = @_;
my @newactions = ();
# The sequence 'match', 'disca'+, 'discb'+, 'match' is a change...
while (@actions) {
my $action = shift @actions;
push (@newactions, $action);
if ($action->{'action'} eq 'match') {
# maybe we're at the start of a change...
my $runlength = &balanced_pairs(@actions);
if ($runlength > 0) {
my @new = ();
for (my $count = 0; $count < $runlength; $count++) {
my $act = shift @actions;
$act->{'action'} = 'changeab';
unshift(@new, $act);
}
for (my $count = 0; $count < $runlength; $count++) {
my $act1 = shift @new;
my $act2 = shift @actions;
$act1->{'b'} = $act2->{'b'};
push (@newactions, $act1);
}
}
}
}
return @newactions;
}
sub balanced_pairs {
my @actions = @_;
my $count = 0;
my $pos = 0;
while ($actions[$pos]->{'action'} eq 'disca') {
$count++;
$pos++;
}
if ($count > 0) {
my $runlength = $count;
while ($actions[$pos]->{'action'} eq 'discb' && $count > 0) {
$count--;
$pos++;
}
if ($count eq '0') {
return $runlength;
} else {
return 0;
}
} else {
return 0;
}
}
sub trim_matches {
my @actions = @_;
my @newactions = ();
my $lastaction = undef;
while (@actions) {
# Reduce strings of matches to a single match...
my $action = shift @actions;
if ($action->{'action'} eq 'match'
&& $lastaction eq 'match') {
pop @newactions;
push (@newactions, $action);
} else {
push (@newactions, $action);
$lastaction = $action->{'action'};
}
}
return @newactions;
}
# ======================================================================
Received on Friday, 18 November 2005 15:44:46 UTC