From: Michael Smith via cvs-syncmail <cvsmail@w3.org>
Date: Mon, 16 Feb 2009 10:45:23 +0000
To: public-html-commits@w3.org
Update of /sources/public/html5/spec/static
In directory hutz:/tmp/cvs-serv17540

Modified Files:
	Makefile Overview.html spec-splitter.py
Log Message:
checkpointing WD version

Index: spec-splitter.py
===================================================================
RCS file: /sources/public/html5/spec/static/spec-splitter.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -d -r1.2 -r1.3
--- spec-splitter.py	10 Jun 2008 09:25:37 -0000	1.2
+++ spec-splitter.py	16 Feb 2009 10:45:21 -0000	1.3
@@ -1,13 +1,4 @@
-try:
-    import psyco
-    psyco.full() # make html5lib faster
-except ImportError:
-    pass
-
 import sys
-import html5lib
-import html5lib.serializer
-import html5lib.treewalkers
 import re
 from lxml import etree # requires lxml 2.0
 from copy import deepcopy
@@ -16,6 +7,8 @@
 
 absolute_uris = False
 w3c = False
+use_html5lib_parser = False
+use_html5lib_serialiser = False
 file_args = []
 
 for arg in sys.argv[1:]:
@@ -23,6 +16,10 @@
         absolute_uris = True
     elif arg == '--w3c':
         w3c = True
+    elif arg == '--html5lib-parser':
+        use_html5lib_parser = True
+    elif arg == '--html5lib-serialiser':
+        use_html5lib_serialiser = True
     else:
         file_args.append(arg)
 
@@ -31,10 +28,17 @@
     print '(The directory "multipage" must already exist)'
     print
     print 'Options:'
-    print '  --absolute  convert relative URIs to absolute (e.g. for images)'
-    print '  --w3c       use W3C variant instead of WHATWG'
+    print '  --absolute ............. convert relative URLs to absolute (e.g. for images)'
+    print '  --w3c .................. use W3C variant instead of WHATWG'
+    print '  --html5lib-parser ...... use html5lib parser instead of lxml'
+    print '  --html5lib-serialiser .. use html5lib serialiser instead of lxml'
     sys.exit()
 
+if use_html5lib_parser or use_html5lib_serialiser:
+    import html5lib
+    import html5lib.serializer
+    import html5lib.treewalkers
+
 if w3c:
     index_page = 'Overview'
 else:
@@ -44,17 +48,21 @@
 # (which were chosen to split any pages that were larger than about 100-200KB, and
 # may need to be adjusted as the spec changes):
 split_exceptions = [
-    'offline', 'history', 'structured',
-    'the-root', 'text-level', 'embedded0', 'video', 'the-canvas', 'tabular', 'interactive-elements',
-    'parsing', 'tokenisation', 'tree-construction', 'serializing', 'named',
+    'text-level-semantics', 'embedded-content-0', 'video', 'the-canvas-element', 'tabular-data', 'forms', 'interactive-elements',
+    'offline', 'history', 'structured-client-side-storage',
+    'parsing', 'tokenization', 'tree-construction', 'serializing-html-fragments', 'named-character-references',
 ]
 
 print "Parsing..."
 
 # Parse document
-parser = html5lib.html5parser.HTMLParser(tree = html5lib.treebuilders.getTreeBuilder('lxml'))
-doc = parser.parse(open(file_args[0]), encoding='utf-8')
+if use_html5lib_parser:
+    parser = html5lib.html5parser.HTMLParser(tree = html5lib.treebuilders.getTreeBuilder('lxml'))
+    doc = parser.parse(open(file_args[0]), encoding='utf-8')
+else:
+    parser = etree.HTMLParser(encoding='utf-8', recover=False)
+    doc = etree.parse(open(file_args[0]), parser)
 
 print "Splitting..."

@@ -86,7 +94,7 @@
 # Prepare the link-fixup script
 if not w3c:
-    link_fixup_script = etree.XML('<script src="fragment-links.js"/>')
+    link_fixup_script = etree.XML('<script src="link-fixup.js"/>')
     doc.find('head')[-1].tail = '\n  '
     doc.find('head').append(link_fixup_script)
     link_fixup_script.tail = '\n  '
 
@@ -105,7 +113,7 @@
         id_pages[e.get('id')] = page
 
 # Updates all the href="#id" to point to page#id
-missing_warnings = []
+missing_warnings = set()
 def fix_refs(page, node):
     for e in node.findall('.//a[@href]'):
         if e.get('href')[0] == '#':
@@ -114,9 +122,11 @@
                 if id_pages[id] != page: # only do non-local links
                     e.set('href', '%s#%s' % (get_page_filename(id_pages[id]), id))
             else:
-                if id not in missing_warnings:
-                    print "warning: can't find target for #%s" % id
-                    missing_warnings.append(id)
+                missing_warnings.add(id)
+
+def report_broken_refs():
+    for id in sorted(missing_warnings):
+        print "warning: can't find target for #%s" % id
 
 pages = [] # for saving all the output, so fix_refs can be called in a second pass
 
@@ -147,7 +157,7 @@
     title = getNodeText(heading)
     name = heading.get('id')
     if name == index_page: name = 'section-%s' % name
-    print '  %s' % name
+    print '  <%s> %s' % (heading.tag, name)
 
     page = deepcopy(doc)
     page_body = page.find('body')
@@ -219,6 +229,8 @@
 
 doc.find('body').insert(1, nav) # after the header
 
+report_broken_refs()
+
 print "Outputting..."
 
 # Output all the pages
@@ -228,14 +240,17 @@
         f.write('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">\n')
     else:
         f.write('<!DOCTYPE HTML>\n')
-    tokens = html5lib.treewalkers.getTreeWalker('lxml')(doc)
-    serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=True, inject_meta_charset=False)
-    for text in serializer.serialize(tokens, encoding='us-ascii'):
-        f.write(text)
+    if use_html5lib_serialiser:
+        tokens = html5lib.treewalkers.getTreeWalker('lxml')(doc)
+        serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=True, inject_meta_charset=False)
+        for text in serializer.serialize(tokens, encoding='us-ascii'):
+            f.write(text)
+    else:
+        f.write(etree.tostring(doc, pretty_print=False, method="html"))
 
 # Generate the script to fix broken links
 f = open('%s/fragment-links.js' % (file_args[1]), 'w')
-f.write('var fragment_links = { ' + ','.join("'%s':'%s'" % (k,v) for (k,v) in id_pages.items()) + ' };\n')
+f.write('var fragment_links = { ' + ','.join("'%s':'%s'" % (k.replace("\\", "\\\\").replace("'", "\\'"), v) for (k,v) in id_pages.items()) + ' };\n')
 f.write("""
 var fragid = window.location.hash.substr(1);
 if (!fragid) {
     /* handle section-foo.html links from the old multipage version,
        and broken foo.html from the new version */
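A note for readers following the spec-splitter.py changes above: the net effect is that lxml now handles both parsing and serialising by default, and html5lib is only imported when one of the new flags is passed. A condensed, standalone sketch of that selection logic (flag spellings and API calls as in the script; argument handling simplified, error handling omitted):

    import sys
    from lxml import etree  # requires lxml 2.0

    use_html5lib_parser = '--html5lib-parser' in sys.argv[1:]
    filename = sys.argv[-1]  # input file, as in the script's usage

    if use_html5lib_parser:
        # html5lib parses like a browser but is slower; it still builds
        # an lxml tree, so the rest of the script is unaffected.
        import html5lib
        parser = html5lib.html5parser.HTMLParser(
            tree=html5lib.treebuilders.getTreeBuilder('lxml'))
        doc = parser.parse(open(filename), encoding='utf-8')
    else:
        # recover=False makes lxml reject malformed markup outright
        # instead of silently repairing it.
        parser = etree.HTMLParser(encoding='utf-8', recover=False)
        doc = etree.parse(open(filename), parser)

In other words, lxml becomes a hard dependency while html5lib becomes optional, and with recover=False a malformed Overview.html now fails the split instead of being quietly fixed up.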
Index: Overview.html
===================================================================
RCS file: /sources/public/html5/spec/static/Overview.html,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -d -r1.7 -r1.8
--- Overview.html	10 Jun 2008 16:44:07 -0000	1.7
+++ Overview.html	16 Feb 2009 10:45:21 -0000	1.8
@@ -1,6389 +1,5735 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
-<!-- when publishing, change bits marked ZZZ -->
-
-<html lang=en-US>
- <head>
-  <title>HTML 5</title>
-
-  <style type="text/css">
-   dt, dfn { font-weight: bold; font-style: normal; }
-   img.extra { float: right; }
-   body ins, body del { display: block; }
[...88651 lines suppressed...]
+     and "base-sixteen", the order that 0-9 A-Z a-z is mentioned,
+     and the detail to which the spec explains how to interpret a
+     string as a hexadecimal number.
+ XXX * expose the form data set, either as an object (on which one can
+       invoke the JSON serialiser), or in the form of a method on
+       HTMLFormElement that returns the form data set serialised
+       according to a particular encoding (defaulting to the form's
+       enctype="" one, probably). This would allow forms to be used
+       with XHR-like systems without having to manually construct the
+       form data set the way that is done today.
+ XXX * placeholder="" for <textarea>, e.g. as seen on:
+       http://code.google.com/p/support/issues/detail?id=1#makechanges
+ XXX * become more consistent about what markup we use to mark up
+       productions (nothing? <i>? <code>?)
+ XXX * expose the value of a radio button group
+       - either on the NodeList returned by HTMLFormControlCollection
+       - or on the radio button itself
+       - or both, so it works even when the form controls have names
+         that vary more than HTMLFormControlCollection allows?
+-->

Index: Makefile
===================================================================
RCS file: /sources/public/html5/spec/static/Makefile,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -d -r1.2 -r1.3
--- Makefile	10 Jun 2008 12:56:19 -0000	1.2
+++ Makefile	16 Feb 2009 10:45:21 -0000	1.3
@@ -26,12 +26,19 @@
 
 all: images multipage/fragment-links.js multipage/images
 
-release: all valid check
+release: all valid CHECKLOG single-page/Overview.html
 
 multipage/fragment-links.js: Overview.html
 	-mkdir multipage
 	$(PYTHON) $(PYTHONFLAGS) $(SPLITTER) $(SPLITTERFLAGS) $< multipage
 
+multipage/single-page/Overview.html: Overview.html multipage/single-page/images
+	cp -p $< $@
+
+multipage/single-page/images: images
+	-mkdir multipage/single-page
+	cp -pR $< $@
+
 valid: Overview.html
 	$(CURL) $(CURLFLAGS) $(VNUFLAGS) -F doc=@$< $(VNU)
 
@@ -54,11 +61,14 @@
 images: ../images
 	-mkdir images
 	cp -pR $</*.png $@
+	cp -pR $</*.svg $@
 
 multipage/images: ../images
 	-mkdir -p multipage/images
 	cp -pR $</*.png $@
+	cp -pR $</*.svg $@
 
 clean:
 	$(RM) -r multipage
 	$(RM) -r images
+	$(RM) CHECKLOG
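One more note, on the spec-splitter.py hunk above that writes fragment-links.js: fragment IDs are now escaped before being interpolated into the JavaScript object literal, so an ID containing a backslash or an apostrophe can no longer terminate the string early and break the generated script. A minimal sketch of that quoting (the js_escape helper name and the sample IDs are illustrative; the script itself inlines the replace calls):

    def js_escape(s):
        # Double backslashes first; otherwise the backslash added while
        # escaping an apostrophe would itself get doubled.
        return s.replace("\\", "\\\\").replace("'", "\\'")

    id_pages = {"don't-panic": 'page-a', 'parsing': 'page-b'}  # made-up ids
    pairs = ','.join("'%s':'%s'" % (js_escape(k), v)
                     for (k, v) in sorted(id_pages.items()))
    print 'var fragment_links = { ' + pairs + ' };'
    # -> var fragment_links = { 'don\'t-panic':'page-a','parsing':'page-b' };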
Received on Monday, 16 February 2009 10:45:48 UTC