- From: Hugo Haas via cvs-syncmail <cvsmail@w3.org>
- Date: Fri, 18 Nov 2005 15:44:28 +0000
- To: public-ws-desc-eds@w3.org
Update of /sources/public/2002/ws/desc/tools In directory homer:/tmp/cvs-serv5684 Added Files: diffmk diffmk.xml Log Message: DiffMk --- NEW FILE: diffmk.xml --- <?xml version='1.0'?> <!DOCTYPE diffmk SYSTEM "diffmk.dtd"> <diffmk xmlns='http://www.sun.com/xml/diffmk' xmlns:xsi='http://www.w3.org/2000/10/XMLSchema-instance' xsi:schemaLocation='http://www.sun.com/xml/diffmk diffmk.xsd'> <doctype name="docbook" attribute="revisionflag" changed="changed" added="added" deleted="deleted"> <wrapper element="phrase"/> <wrapper parent="article" element="para"/> <wrapper parent="section" element="para"/> <wrapper parent="sect1" element="para"/> <wrapper parent="sect2" element="para"/> <wrapper parent="sect3" element="para"/> <wrapper parent="sect4" element="para"/> <wrapper parent="sect5" element="para"/> <wrapper parent="refsect1" element="para"/> <wrapper parent="refsect2" element="para"/> <wrapper parent="refsect3" element="para"/> </doctype> <doctype name="xmlspec" attribute="diff" changed="chg" added="add" deleted="del"> <wrapper element="phrase"/> <wrapper parent="status" element="p"/> <wrapper parent="div1" element="p"/> <wrapper parent="div2" element="p"/> <wrapper parent="div3" element="p"/> <wrapper parent="inform-div1" element="p"/> <wrapper parent="back" element="p"/> </doctype> <doctype name="xhtml" attribute="class" changed="diff-chg" added="diff-add" deleted="diff-del"> <wrapper element="span"/> <wrapper parent="body" element="div"/> </doctype> </diffmk> --- NEW FILE: diffmk --- #!/usr/bin/perl -- # -*- Perl -*- # diffmk -- Calculate the differences between two XML documents # Copyright (C) 2000 Sun Microsystems, Inc., Norman Walsh # # For more information, see README.{xml,html} and refentry.{xml,html} # TODO: Add option to ignore some markup (empty elements only?) # POD? use XML::DOM; use Algorithm::Diff qw(traverse_sequences); use Getopt::Long; use strict; select(STDOUT); $| = 1; my $RCSID = '$Id: diffmk,v 1.1 2005/11/18 15:44:26 hugo Exp $ '; my $VERSION = '0.1'; my $usage = "Usage: $0 [options] oldfile.xml newfile.xml\n"; my $parser = new XML::DOM::Parser (NoExpand => 0); my %doctypes = (); my $diffmkxml = undef; # Find the diffmk.xml data file... foreach my $file ($ENV{'HOME'} . "/.diffmk.xml", "$0.xml") { $diffmkxml = $file if -f $file && !defined($diffmkxml); } foreach my $dir (@INC) { my $file = "$dir/diffmk.xml"; $diffmkxml = $file if -f $file && !defined($diffmkxml); } if ($diffmkxml) { # Load the diffmk.xml file; building the doctypes hash... # # my %doctypes = ('xmlspec' => { 'attr' => 'diff', # 'changed' => 'chg', # 'added' => 'add', # 'deleted' => 'del', # 'wrapper' => {'status' => 'p', # 'div1' => 'p', # 'div2' => 'p', # 'div3' => 'p', # 'inform-div1' => 'p', # 'back' => 'p', # '*' => 'phrase'}}, my $diffmkdom = $parser->parsefile($diffmkxml); my $diffmk = $diffmkdom->getDocumentElement(); my $doctypelist = $diffmk->getElementsByTagName('doctype'); for (my $count = 0; $count < $doctypelist->getLength(); $count++) { my $doctype = $doctypelist->item($count); my $name = $doctype->getAttribute('name'); $doctypes{$name} = {}; $doctypes{$name}->{'attr'} = $doctype->getAttribute('attribute'); $doctypes{$name}->{'changed'} = $doctype->getAttribute('changed'); $doctypes{$name}->{'added'} = $doctype->getAttribute('added'); $doctypes{$name}->{'deleted'} = $doctype->getAttribute('deleted'); $doctypes{$name}->{'wrapper'} = {}; my $wrapperlist = $doctype->getElementsByTagName('wrapper'); for (my $wcount = 0; $wcount < $wrapperlist->getLength(); $wcount++) { my $wrapper = $wrapperlist->item($wcount); my $parent = $wrapper->getAttribute('parent') || '*'; my $element = $wrapper->getAttribute('element'); $doctypes{$name}->{'wrapper'}->{$parent} = $element; } } } my %option = ('debug' => 0, 'verbose' => 1, 'diff' => 'text', 'output' => '', 'showdelete' => '1', 'includedeletedelements' => '0', 'ignorewhitespace' => '1', 'usechanged' => '1', 'doctype' => '', 'attribute' => '', 'changed' => '', 'added' => '', 'deleted' => '', 'wrapper' => ''); my %opt = (); &GetOptions(\%opt, 'debug+', 'verbose+', 'diff=s', 'output=s', 'showdelete!', 'includedeletedelements!', 'ignorewhitespace!', 'usechanged!', 'doctype=s', 'attribute=s', 'changed=s', 'added=s', 'deleted=s', 'wrapper=s') || die $usage; foreach my $key (keys %option) { $option{$key} = $opt{$key} if exists($opt{$key}); } # check the options... die "Unknown diff type: " . $option{'diff'} . "\n$usage" if ($option{'diff'} ne 'text' && $option{'diff'} ne 'element' && $option{'diff'} ne 'both'); my $diffType = $option{'diff'}; my $diffAttr = undef; my $diffChanged = undef; my $diffAdded = undef; my $diffDeleted = undef; my $diffWrap = undef; my $showDelete = $option{'showdelete'}; my $countAdded = 0; my $countChanged = 0; my $countDeleted = 0; if ($option{'doctype'}) { die "Unknown doctype: " . $option{'doctype'} . "\n$usage" if !exists($doctypes{$option{'doctype'}}); $diffAttr = $doctypes{$option{'doctype'}}->{'attr'}; $diffChanged = $doctypes{$option{'doctype'}}->{'changed'}; $diffAdded = $doctypes{$option{'doctype'}}->{'added'}; $diffDeleted = $doctypes{$option{'doctype'}}->{'deleted'}; $diffWrap = $doctypes{$option{'doctype'}}->{'wrapper'}; } $diffAttr = $option{'attribute'} if $option{'attribute'}; $diffChanged = $option{'changed'} if $option{'changed'}; $diffAdded = $option{'added'} if $option{'added'}; $diffDeleted = $option{'deleted'} if $option{'deleted'}; $diffWrap = $option{'wrapper'} if $option{'wrapper'}; die "Some markup parameter(s) undefined.\n$usage" if (!defined($diffAttr) || !defined($diffChanged) || !defined($diffAdded) || !defined($diffDeleted) || !defined($diffWrap)); my $file1 = shift @ARGV || die $usage; my $file2 = shift @ARGV || die $usage; my $output = undef; if ($option{'output'}) { $output = $option{'output'}; } else { if (@ARGV) { $output = shift @ARGV; } else { $output = "-"; } } print "Loading $file1...\n"; my $dom1 = $parser->parsefile($file1); print "Loading $file2...\n"; my $dom2 = $parser->parsefile($file2); print "Calculating node list for $file1..."; my @elements1 = &inorder_nodes($dom1->getDocumentElement()); printf "%d nodes\n", $#elements1+1; print "Calculating node list for $file2..."; my @elements2 = &inorder_nodes($dom2->getDocumentElement()); printf "%d nodes\n", $#elements2+1; my @actions = (); print "Calculating differences...\n"; &traverse_sequences(\@elements1, \@elements2, { MATCH => \&diff_match, DISCARD_A => \&diff_discard_a, DISCARD_B => \&diff_discard_b, }, \&diff_gen_key); if ($option{'debug'}) { print "Actions:\n"; &show_actions(@actions); print "List 1:\n"; &show_sequence(@elements1); print "List 2:\n"; &show_sequence(@elements2); } @actions = &find_changes(@actions) if $option{'usechanged'}; @actions = &trim_matches(@actions); if ($option{'debug'}) { print "Actions (after merge):\n"; &show_actions(@actions); } # now apply the actions to $dom2 for (my $count = 0; $count <= $#actions; $count++) { my $action = $actions[$count]; next if $action->{'action'} eq 'match'; my $node1 = $elements1[$action->{'a'}]; my $node2 = $elements2[$action->{'b'}]; if ($option{'debug'}) { print "Processing action $count [", $action->{'action'}, "]:\n"; print "\tNode1: "; &show_node($node1); print "\tNode2: "; &show_node($node1); } # disca is a special case, we have to add a 'deleted' text node # into dom2 if ($action->{'action'} eq 'disca') { if ($showDelete) { # whitespace? if ($node1->getNodeType() == XML::DOM::TEXT_NODE) { my $text = $node1->getData(); if ($text =~ /^\s*$/sg && $option{'ignorewhitespace'}) { next; } } # Wait, what if this node is inside a node that's already # been copied over and marked deleted. Then don't bother # copying this one too... my $parent = $node1->getParentNode(); my $skip = 0; while (!$skip && $parent->getNodeType() == XML::DOM::ELEMENT_NODE) { $skip = $parent->getAttribute($diffAttr) eq $diffDeleted; $parent = $parent->getParentNode(); } next if $skip; # if node1 is a text node, or we're exposing deleted elements, # go ahead and insert it... if ($node1->getNodeType() == XML::DOM::TEXT_NODE) { my $textNode = $dom2->createTextNode($node1->getData()); my $parent = $node2->getParentNode(); # yes, node2 my $wrapName = $diffWrap; if (ref $diffWrap eq 'HASH') { $wrapName = $diffWrap->{'*'}; $wrapName = $diffWrap->{$parent->getTagName()} if $diffWrap->{$parent->getTagName()} } my $wrap = $dom2->createElement($wrapName); if (!$skip) { $parent->insertBefore($wrap, $node2); $wrap->appendChild($textNode); $wrap->setAttribute($diffAttr, $diffDeleted); $countDeleted++; } } elsif ($node1->getNodeType == XML::DOM::ELEMENT_NODE && $option{'includedeletedelements'}) { my $elemNode = $node1->cloneNode(1); my $parent = $node2->getParentNode(); $elemNode->setOwnerDocument($node2->getOwnerDocument()); $elemNode->setAttribute($diffAttr, $diffDeleted); $countDeleted++; # hack: mark node1 as "deleted" so we can tell when we're # looking at its children $node1->setAttribute($diffAttr, $diffDeleted); $parent->insertBefore($elemNode, $node2); } } next; } # whitespace? if ($node2->getNodeType() == XML::DOM::TEXT_NODE) { my $text = $node2->getData(); if ($text =~ /^\s*$/sg && $option{'ignorewhitespace'}) { next; } } # If $node2 is a text node, and it is the only child of an element, # then this action can safely be applied to the parent element if ($node2->getNodeType() == XML::DOM::TEXT_NODE && $node2->getPreviousSibling() == undef && $node2->getNextSibling() == undef) { $node2 = $node2->getParentNode(); } # if we're adding something, and one of our parents is already # marked as an addition; we don't need to mark this one... if ($action->{'action'} eq 'discb') { my $parent = $node2->getParentNode(); my $found = 0; while (!$found && $parent) { $found = 1 if ($parent->getNodeType() == XML::DOM::ELEMENT_NODE && $parent->getAttribute($diffAttr) eq $diffAdded); $parent = $parent->getParentNode(); } next if $found; } # If $node2 is still a text node, wrap it in an element if ($node2->getNodeType() == XML::DOM::TEXT_NODE) { my $parent = $node2->getParentNode(); my $wrapName = $diffWrap; if (ref $diffWrap eq 'HASH') { $wrapName = $diffWrap->{'*'}; $wrapName = $diffWrap->{$parent->getTagName()} if $diffWrap->{$parent->getTagName()} } my $wrap = $dom2->createElement($wrapName); $parent->insertBefore($wrap, $node2); $node2 = $parent->removeChild($node2); $wrap->appendChild($node2); $node2 = $wrap; } if ($action->{'action'} eq 'discb') { $node2->setAttribute($diffAttr, $diffAdded); $countAdded++; } elsif ($action->{'action'} eq 'changeab') { $node2->setAttribute($diffAttr, $diffChanged); $countChanged++; } else { die "Unexpected action: " . $action->{'action'} . "\n"; } } # Add our PI to the dom my $pidata = "version='$VERSION'\n" . "\toldfile='$file1'\n" . "\tnewfile='$file2'\n" . "\tattribute='$diffAttr'\n" . "\tchanged='$diffChanged'\n" . "\tadded='$diffAdded'\n" . "\tdeleted='$diffDeleted'\n"; $pidata .= "\twrapper='$diffWrap'\n" if ref $diffWrap ne 'HASH'; foreach my $opt ('diff', 'showdelete', 'includedeletedelements', 'ignorewhitespace', 'usechanged') { $pidata .= "\t$opt='" . $option{$opt} . "'\n"; } my $pi = $dom2->createProcessingInstruction('diffmk', $pidata); $dom2->insertBefore($pi, $dom2->getDocumentElement()); open (OUT, ">$output"); $dom2->printToFileHandle(\*OUT); close (OUT); print "Diff'd: $countAdded additions, $countDeleted deletions, and "; print "$countChanged changes.\n"; # ====================================================================== # Functions to build and traverse the lists sub inorder_nodes { my $node = shift; my @nodes = @_; if (($node->getNodeType() == XML::DOM::ELEMENT_NODE && ($diffType eq 'element' || $diffType eq 'both')) || ($node->getNodeType() == XML::DOM::TEXT_NODE && ($diffType eq 'text' || $diffType eq 'both'))) { push (@nodes, $node); } if ($node->getNodeType() == XML::DOM::ELEMENT_NODE) { my $child = $node->getFirstChild(); while ($child) { @nodes = &inorder_nodes($child, @nodes); $child = $child->getNextSibling(); } } return @nodes; } sub diff_match { my $a_index = shift; my $b_index = shift; my @rest = @_; my $action = {}; $action->{'action'} = 'match'; $action->{'a'} = $a_index; $action->{'b'} = $b_index; push (@actions, $action); } sub diff_discard_a { my $a_index = shift; my $b_index = shift; my @rest = @_; my $action = {}; $action->{'action'} = 'disca'; $action->{'a'} = $a_index; $action->{'b'} = $b_index; push (@actions, $action); } sub diff_discard_b { my $a_index = shift; my $b_index = shift; my @rest = @_; my $action = {}; $action->{'action'} = 'discb'; $action->{'a'} = $a_index; $action->{'b'} = $b_index; push (@actions, $action); } sub diff_gen_key { my $elem = shift; if ($elem->getNodeType() == XML::DOM::ELEMENT_NODE) { # element nodes are the same if they've got the same name # and attribute values my $elemkey = $elem->getTagName(); my $attr = $elem->getAttributes(); for (my $count = 0; $count < $attr->getLength(); $count++) { my $name = $attr->item($count)->getName(); my $val = $attr->item($count)->getValue(); # normalize all space, if we're ignoring whitespace $val =~ s/\s+/ /sg if $option{'ignorewhitespace'}; $elemkey .= " $name=$val"; } return 'E: ' . $elemkey; } else { # text nodes are the same if they're equal, modulo whitespace my $text = $elem->getData(); $text =~ s/\s+/ /sg; # normalize all space... return 'T: ' . $text; } } # ====================================================================== # These are debugging functions... sub show_actions { my @actions = @_; my $count = 0; foreach my $act (@actions) { printf "%03d: ", $count; $count++; print $act->{'action'}, " ", $act->{'a'}, " ", $act->{'b'}, "\n"; } print "\n"; } sub show_sequence { my @seq = @_; my $count = 0; for (my $count = 0; $count <= $#seq; $count++) { my $elem = $seq[$count]; printf "%03d: ", $count; &show_node($elem); } print "\n"; } sub show_node { my $elem = shift; if ($elem->getNodeType() == XML::DOM::ELEMENT_NODE) { print &diff_gen_key($elem); } else { my $text = &diff_gen_key($elem); $text = substr($text, 0, 70); print $text; } print "\n"; } # ====================================================================== # Functions to manipulate the @action list sub find_changes { my @actions = @_; my @newactions = (); # The sequence 'match', 'disca'+, 'discb'+, 'match' is a change... while (@actions) { my $action = shift @actions; push (@newactions, $action); if ($action->{'action'} eq 'match') { # maybe we're at the start of a change... my $runlength = &balanced_pairs(@actions); if ($runlength > 0) { my @new = (); for (my $count = 0; $count < $runlength; $count++) { my $act = shift @actions; $act->{'action'} = 'changeab'; unshift(@new, $act); } for (my $count = 0; $count < $runlength; $count++) { my $act1 = shift @new; my $act2 = shift @actions; $act1->{'b'} = $act2->{'b'}; push (@newactions, $act1); } } } } return @newactions; } sub balanced_pairs { my @actions = @_; my $count = 0; my $pos = 0; while ($actions[$pos]->{'action'} eq 'disca') { $count++; $pos++; } if ($count > 0) { my $runlength = $count; while ($actions[$pos]->{'action'} eq 'discb' && $count > 0) { $count--; $pos++; } if ($count eq '0') { return $runlength; } else { return 0; } } else { return 0; } } sub trim_matches { my @actions = @_; my @newactions = (); my $lastaction = undef; while (@actions) { # Reduce strings of matches to a single match... my $action = shift @actions; if ($action->{'action'} eq 'match' && $lastaction eq 'match') { pop @newactions; push (@newactions, $action); } else { push (@newactions, $action); $lastaction = $action->{'action'}; } } return @newactions; } # ======================================================================
Received on Friday, 18 November 2005 15:44:46 UTC