W3C home > Mailing lists > Public > www-validator-cvs@w3.org > June 2010

validator/httpd/cgi-bin check,1.776,1.777

From: Ville Skytta via cvs-syncmail <cvsmail@w3.org>
Date: Thu, 10 Jun 2010 22:15:49 +0000
To: www-validator-cvs@w3.org
Message-Id: <E1OMq2P-0003CN-Ox@lionel-hutz.w3.org>
Update of /sources/public/validator/httpd/cgi-bin
In directory hutz:/tmp/cvs-serv12149/httpd/cgi-bin

Modified Files:
	check 
Log Message:
Enable external entity loading in XML::LibXML to fix #9899.


Index: check
===================================================================
RCS file: /sources/public/validator/httpd/cgi-bin/check,v
retrieving revision 1.776
retrieving revision 1.777
diff -u -d -r1.776 -r1.777
--- check	7 May 2010 17:41:29 -0000	1.776
+++ check	10 Jun 2010 22:15:47 -0000	1.777
@@ -43,6 +43,7 @@
 
 use CGI 2.81 qw(-newstyle_urls -private_tempfiles redirect);
 use CGI::Carp qw(carp croak fatalsToBrowser);
+use Config qw(%Config);
 use Config::General 2.32 qw();    # Need 2.32 for <msg 0>, rt.cpan.org#17852
 use Encode qw();
 use Encode::Alias qw();
@@ -61,7 +62,7 @@
 use SGML::Parser::OpenSP 0.991 qw();
 use URI qw();
 use URI::Escape qw(uri_escape);
-use XML::LibXML 1.70 qw();         # Need 1.70 for (working) structured errors
+use URI::file;
 
 ###############################################################################
 #### Constant definitions. ####################################################
@@ -207,6 +208,22 @@
         require Encode::JIS2K;    # for optional extra Japanese encodings
     };
 
+    # Tell libxml to load _only_ our XML catalog.  This is because our entity
+    # load jailing may trap the libxml internal default catalog (which is
+    # automatically loaded).  Preventing loading that from the input callback
+    # will cause libxml to not see the document content at all but to throw
+    # weird "Document is empty" errors, at least as of XML::LibXML 1.70 and
+    # libxml 2.7.7.  XML_CATALOG_FILES needs to be in effect at XML::LibXML
+    # load time which is why we're using "require" here instead of pulling it
+    # in with "use" as usual.  And finally, libxml should have support for
+    # SGML open catalogs but they don't seem to work (again as of 1.70 and
+    # 2.7.7); if we use xml.soc here, no entities seem to end up being resolved
+    # from it - so we use a (redundant) XML catalog which works.
+    local $ENV{XML_CATALOG_FILES} =
+        catfile($CFG->{Paths}->{SGML}->{Library}, 'catalog.xml');
+    require XML::LibXML;
+    XML::LibXML->VERSION(1.70);
+
 }    # end of BEGIN block.
 
 #
@@ -606,13 +623,17 @@
         my $xmlparser = XML::LibXML->new();
         $xmlparser->line_numbers(1);
         $xmlparser->validation(0);
-        $xmlparser->load_ext_dtd(0);
         $xmlparser->base_uri($File->{URI})
             unless ($File->{'Direct Input'} || $File->{'Is Upload'});
 
-        # [NOT] loading the XML catalog for entities resolution as it seems to
-        # cause a lot of unnecessary DTD/entities fetching
-        #$xmlparser->load_catalog(catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc'));
+        # Restrict file reading similar to what SGML::Parser::OpenSP does.
+        # Note that all inputs go through the callback so if we were passing
+        # a URI/filename to the parser, it would be affected as well and would
+        # break fetching the initial document.  As long as we pass the doc as
+        # string, this should work.
+        my $cb = XML::LibXML::InputCallback->new();
+        $cb->register_callbacks([\&xml_jail_match, sub { }, sub { }, sub { }]);
+        $xmlparser->input_callbacks($cb);
 
         &override_charset($File, "UTF-8");
 
@@ -1892,6 +1913,36 @@
 }
 
 #
+# XML::LibXML::InputCallback matcher using our SGML search path jail.
+sub xml_jail_match
+{
+    my $arg = shift;
+
+    # Ensure we have a file:// URI if we get a file.
+    my $uri = URI->new($arg);
+    if (!$uri->scheme()) {
+        $uri = URI::file->new_abs($arg);
+    }
+    $uri = $uri->canonical();
+
+    # Do not trap non-file URIs.
+    return 0 unless ($uri->scheme() eq "file");
+
+    # Do not trap file URIs within our jail.
+    for my $dir ($CFG->{Paths}->{SGML}->{Library},
+        split(/\Q$Config{path_sep}\E/o, $ENV{SGML_SEARCH_PATH} || ''))
+    {
+        next unless $dir;
+        my $dir_uri = URI::file->new_abs($dir)->canonical()->as_string();
+        $dir_uri =~ s|/*$|/|;    # ensure it ends with a slash
+        return 0 if ($uri =~ /^\Q$dir_uri\E/);
+    }
+
+    # We have a match (a file outside the jail).
+    return 1;
+}
+
+#
 # Escape text to be included in markup comment.
 sub escape_comment
 {
Received on Thursday, 10 June 2010 22:15:53 GMT

This archive was generated by hypermail 2.2.0+W3C-0.50 : Thursday, 26 April 2012 12:55:20 GMT