validator/httpd/cgi-bin check,1.644,1.645

Update of /sources/public/validator/httpd/cgi-bin
In directory hutz:/tmp/cvs-serv2350

Modified Files:
	check 
Log Message:
* first implementation of XML::LibXML structured errors. 
    Will only work well with XML::LibXML > 0.69, since versions 0.67 to 0.69 have a bug that reports only a single error instead of all of them.
    Also, there appear to be a couple of bugs with respect to the offset/column position of the errors - 
    see http://lists.w3.org/Archives/Public/public-qa-dev/2009Feb/0007.html

* fixing a bug whereby the error column position reported by SGML::Parser::OpenSP would start at 0

* fixing a bug in direct input mode where no content-type would be passed to the html5 validation engine, resulting in a crash and... no error reporting



Index: check
===================================================================
RCS file: /sources/public/validator/httpd/cgi-bin/check,v
retrieving revision 1.644
retrieving revision 1.645
diff -u -d -r1.644 -r1.645
--- check	4 Feb 2009 20:06:48 -0000	1.644
+++ check	5 Feb 2009 21:54:08 -0000	1.645
@@ -708,8 +708,43 @@
     $xml_string = undef;
     my $xml_parse_errors_line = undef;
     my @xmlwf_error_list;
+    my @xmlwf_obj_error_list;
+    if (ref($@)) {
+        # handle a structured error (XML::LibXML::Error object)
+        # (lib XML::LibXML > 0.66, but will work MUCH better > 0.69 )
+        push (@xmlwf_obj_error_list, $@);
+        # my $prev_err = $@->_prev();
+        # die($prev_err->{msg});
+        my $err_obj = $@;
+        while($err_obj->_prev()) {
+            $err_obj = $err_obj->_prev();
+            unshift(@xmlwf_obj_error_list, $err_obj);
+        }
+        my $num_xmlwf_error = 0;
+        foreach my $err_obj (@xmlwf_obj_error_list){ 
+            #die($err_obj->dump());
+            my $err;
+            my $offset;
+            #if($err_obj->int2()) {$offset = $err_obj->int2();} # this should be the location of the column per http://xmlsoft.org/html/libxml-xmlerror.html
+            if ($err_obj->num2()) {$offset = $err_obj->num2();} # this is bogus but seems to be the actual behavior
+            $err->{src}  = '...'; # do this with show_open_entities()?
+            $err->{line} = $err_obj->line();
+            $err->{char} = $offset;
+            $err->{num}  = "libxml2-".$err_obj->code();
+            $err->{type} = "E";
+            $err->{msg}  = $err_obj->message();
+            # The validator will sometimes fail to dereference entities files
+            # we're filtering the bogus resulting error
+            if ($err->{msg} =~ /Entity '\w+' not defined/) {
+              $err = undef;
+              next;
+            }
+            push (@xmlwf_error_list, $err);
+            $num_xmlwf_error++;
+        }
+    } 
+    
     if ($@) {
-
       my $xmlwf_errors = $@;
       my $xmlwf_error_line = undef;
       my $xmlwf_error_col = undef;
@@ -789,10 +824,10 @@
 
         }
       }
-      foreach my $errmsg (@xmlwf_error_list){
-        $File->{'Is Valid'} = FALSE;
-        push @{$File->{WF_Errors}}, $errmsg;
-      }
+    }
+    foreach my $errmsg (@xmlwf_error_list){
+      $File->{'Is Valid'} = FALSE;
+      push @{$File->{WF_Errors}}, $errmsg;
     }
   }
 }
@@ -837,8 +872,12 @@
     # most likely be a source of errors about internal/actual charset
     # differences as long as our transcoding process does not "fix" the
     # charset info in XML declaration and meta http-equiv (any others?).
-
-    $req->content_type("$File->{ContentType}; charset=UTF-8");
+    if($File->{'Direct Input'}) { # sane default when using html5 validator by direct input
+        $req->content_type("text/html; charset=UTF-8");        
+    }
+    else {
+        $req->content_type("$File->{ContentType}; charset=UTF-8");
+    }
     $req->content(Encode::encode_utf8(join("\n", @{$File->{Content}})));
   }
   else {
@@ -906,7 +945,7 @@
 
           }
           if($attribute->name eq "last-column") {
-            $html5_error_col = $attribute->getValue()-1;
+            $html5_error_col = $attribute->getValue();
           }
           if($attribute->name eq "last-line") {
             $html5_error_line = $attribute->getValue();
@@ -2018,7 +2057,7 @@
     } elsif ($col == length $line) { # If error is at EOL...
       $length = $col - 1; # ...leave last char to indicate position.
     } else { # Otherwise grab everything up to pos of error.
-      $length = $col;
+      $length = $col-1;
     }
     $left = substr $line, $offset, $length;
   }
@@ -2033,7 +2072,7 @@
     if ($col == length $line) { # If err is at EOL...
       $offset = $col - 1; # ...then grab last char on line instead.
     } else {
-      $offset = $col; # Otherwise just grab the char.
+      $offset = $col-1; # Otherwise just grab the char.
     }
     $char = substr $line, $offset, $length;
     $char = &ent($char);
@@ -2050,7 +2089,7 @@
     if ($col == length $line) { # If at EOL...
       $offset = 0; # Don't bother as there is nothing left to grab.
     } else {
-      $offset = $col + 1; # Otherwise get everything from char-after-error.
+      $offset = $col; # Otherwise get everything from char-after-error.
     }
 
     # Length...
@@ -2926,7 +2965,7 @@
 
   $err->{src}  = '...'; # do this with show_open_entities()?
   $err->{line} = $mess->{primary_message}{LineNumber};
-  $err->{char} = $mess->{primary_message}{ColumnNumber};
+  $err->{char} = $mess->{primary_message}{ColumnNumber}+1;
   $err->{num}  = $mess->{primary_message}{Number};
   $err->{type} = $mess->{primary_message}{Severity};
   $err->{msg}  = $mess->{primary_message}{Text};

Received on Thursday, 5 February 2009 21:54:20 UTC