validator/httpd/cgi-bin check,1.483,1.484

Update of /sources/public/validator/httpd/cgi-bin
In directory hutz:/tmp/cvs-serv13633/httpd/cgi-bin

Modified Files:
	check 
Log Message:
Adding XML well-formedness parsing for XML docs,
with ideas from Jacques Distler: http://golem.ph.utexas.edu/~distler/blog/archives/001054.html

We could use only the XML parser and avoid parsing twice, but
at the moment I am tempted to keep using opensp as well, for as long
as we have the best library of error messages and explanations for it.

The code is not really pretty, and should be amended at some point to
use a real SAX ErrorHandler. That said, it very much signals the end of the
"Validator XML support has some limitations." message.



Index: check
===================================================================
RCS file: /sources/public/validator/httpd/cgi-bin/check,v
retrieving revision 1.483
retrieving revision 1.484
diff -u -d -r1.483 -r1.484
--- check	16 Mar 2007 12:42:36 -0000	1.483
+++ check	19 Mar 2007 00:58:44 -0000	1.484
@@ -604,6 +604,79 @@
 # Sanity check Charset information and add any warnings necessary.
 $File = &charset_conflicts($File);
 
+
+
+# before we start the parsing, clean slate
+$File->{'Is Valid'} = TRUE;
+$File->{Errors} = [];
+
+# preparse with XML parser if necessary
+# we should really be using a SAX ErrorHandler, but I can't find 
+# a way to make it work with XML::LibXML::SAX::Parser... ** FIXME **
+# ditto, we should try using W3C::Validator::ErrorHandler,
+# but it's badly linked to opensp at the moment
+if (&is_xml($File)) {
+  
+       use XML::LibXML;
+       my $xmlparser = XML::LibXML->new();
+       $xmlparser->line_numbers(1);
+       eval {
+         $xmlparser->parse_string(join"\n",@{$File->{Content}});
+       };
+       my $xml_parse_errors_line = undef;
+       my @xmlwf_error_list;
+       if ($@) {
+         
+         my $xmlwf_errors = $@;
+         my $xmlwf_error_line = undef;
+         my $xmlwf_error_col = undef;
+         my $xmlwf_error_msg = undef;
+         my $num_xmlwf_error = 0;
+         my $last_err_msg = undef;
+         my $err;
+         foreach my $msg_line (split "\n", $xmlwf_errors){
+          $msg_line =~ s{[^\x0d\x0a](:\d+:)}{\n$1}g;
+          $msg_line =~ s{[^\x0d\x0a]+[\x0d\x0a]$}{};
+          if ($msg_line =~ /(:\d+:)(.*)/ ){
+            $xmlwf_error_line = $1;
+            $xmlwf_error_msg = $2;
+            $xmlwf_error_line =~ s/:(\d+):/$1/;
+            $xmlwf_error_msg =~ s/ parser error :/XML Parsing Error: /;
+          }
+          if ($msg_line =~ /(.+)\^/){
+            $xmlwf_error_col = length($1);
+          }
+          
+          if ((defined $xmlwf_error_line) and (defined $xmlwf_error_col) and (defined $xmlwf_error_msg)){
+            $err->{src}  = '...'; # do this with show_open_entities()?
+            $err->{line} = $xmlwf_error_line;
+            $err->{char} = $xmlwf_error_col;
+            $err->{num}  = 0;
+            $err->{type} = "E";
+            $err->{msg}  = $xmlwf_error_msg;
+
+            # ...
+            $last_err_msg = $err;
+            push (@xmlwf_error_list, $err);
+            $err = undef;
+            $xmlwf_error_line = undef;
+            $xmlwf_error_col = undef;
+            $xmlwf_error_msg = undef;
+            $num_xmlwf_error++;
+            
+          }                  
+         }
+         foreach my $errmsg (@xmlwf_error_list){
+           $File->{'Is Valid'} = FALSE;
+           push @{$File->{Errors}}, $errmsg;
+         }
+
+       }
+}
+
+
+
+
 #
 # Abandon all hope ye who enter here...
 $File = &parse($File);
@@ -631,6 +704,7 @@
     # FIXME when fixed s:p:o gets released
   }
 
+
   #
   # Parser configuration
   $opensp->search_dirs($CFG->{Paths}->{SGML}->{Library});
@@ -680,11 +754,12 @@
   #
   # Set Version to be the FPI initially.
   $File->{Version} = $File->{DOCTYPE};
-
   return $File;
 }
 
 
+
+
 #
 # Force "XML" if type is an XML type and an FPI was not found.
 # Otherwise set the type to be the FPI.
@@ -2052,10 +2127,6 @@
 
   my $self = { _file => $File, _parser => $parser };
 
-  # ...
-  $File->{'Is Valid'} = TRUE;
-  $File->{Errors} = [];
-
   bless $self, $class;
 }
 

Received on Monday, 19 March 2007 00:58:50 UTC