W3C home > Mailing lists > Public > public-html-commits@w3.org > May 2010

html5/spec/tools spec-splitter.py,1.6,1.7

From: Michael Smith via cvs-syncmail <cvsmail@w3.org>
Date: Fri, 14 May 2010 10:09:29 +0000
To: public-html-commits@w3.org
Message-Id: <E1OCrph-0007ZX-PV@lionel-hutz.w3.org>
Update of /sources/public/html5/spec/tools
In directory hutz:/tmp/cvs-serv29064/tools

Modified Files:
	spec-splitter.py 
Log Message:
Remove the experimental timed tracks stuff from the W3C copy, by request of the chairs.

[updated by splitter]


Index: spec-splitter.py
===================================================================
RCS file: /sources/public/html5/spec/tools/spec-splitter.py,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -d -r1.6 -r1.7
--- spec-splitter.py	4 Mar 2010 15:14:52 -0000	1.6
+++ spec-splitter.py	14 May 2010 10:09:27 -0000	1.7
@@ -42,15 +42,27 @@
 if w3c:
     index_page = 'Overview'
 else:
-    index_page = 'spec'
+    index_page =  'spec'
 
 # The document is split on all <h2> elements, plus the following specific elements
 # (which were chosen to split any pages that were larger than about 100-200KB, and
 # may need to be adjusted as the spec changes):
 split_exceptions = [
-    'text-level-semantics', 'embedded-content-0', 'video', 'the-canvas-element', 'tabular-data', 'forms', 'interactive-elements',
-    'offline', 'history', 'structured-client-side-storage',
-    'parsing', 'tokenization', 'tree-construction', 'serializing-html-fragments', 'named-character-references',
+    'common-microsyntaxes', 'urls', 'fetching-resources', 'common-dom-interfaces', 'namespaces', # <-- infrastructure
+    'elements', 'content-models', 'apis-in-html-documents', # <-- dom
+
+    'scripting-1', 'sections', 'grouping-content', 'text-level-semantics', 'edits',
+    'embedded-content-1', 'the-iframe-element', 'video', 'the-canvas-element', 'the-map-element', 'tabular-data',
+    'forms', 'the-input-element', 'states-of-the-type-attribute', 'number-state', 'common-input-element-attributes', 'the-button-element', 'association-of-controls-and-forms',
+    'interactive-elements', 'commands', # <-- semantics
+
+    'predefined-vocabularies-0', 'converting-html-to-other-formats', # <-- microdata
+    'origin-0', 'timers', 'offline', 'history', 'links', # <-- browsers
+    'dnd', # <-- editing
+
+    'workers', 'network', 'web-messaging', 'webstorage',
+
+    'parsing', 'tokenization', 'tree-construction', 'the-end', 'named-character-references', # <-- syntax
 ]
 
 
@@ -61,11 +73,13 @@
     parser = html5lib.html5parser.HTMLParser(tree = html5lib.treebuilders.getTreeBuilder('lxml'))
     doc = parser.parse(open(file_args[0]), encoding='utf-8')
 else:
-    parser = etree.HTMLParser(encoding='utf-8', recover=False)
+    parser = etree.HTMLParser(encoding='utf-8')
     doc = etree.parse(open(file_args[0]), parser)
 
 print "Splitting..."
 
+doctitle = doc.find('.//title').text
+
 # Absolutise some references, so the spec can be hosted elsewhere
 if absolute_uris:
     for a in ('href', 'src'):
@@ -86,15 +100,27 @@
 original_body.getparent().replace(original_body, default_body)
 
 # Extract the header, so we can reuse it in every page
-header = original_body.find('.//div[@class="head"]')
+header = original_body.find('.//*[@class="head"]')
 
 # Make a stripped-down version of it
 short_header = deepcopy(header)
-del short_header[3:]
+del short_header[2:]
+
+# Extract the items in the TOC (remembering their nesting depth)
+def extract_toc_items(items, ol, depth):
+    for li in ol.iterchildren():
+        for c in li.iterchildren():
+            if c.tag == 'a':
+                assert c.get('href')[0] == '#'
+                items.append( (depth, c.get('href')[1:], c) )
+            elif c.tag == 'ol':
+                extract_toc_items(items, c, depth+1)
+toc_items = []
+extract_toc_items(toc_items, original_body.find('.//ol[@class="toc"]'), 0)
 
 # Prepare the link-fixup script
 if not w3c:
-    link_fixup_script = etree.XML('<script src="link-fixup.js" type="text/javascript"/>')
+    link_fixup_script = etree.XML('<script src="link-fixup.js"/>')
     doc.find('head')[-1].tail = '\n  '
     doc.find('head').append(link_fixup_script)
     link_fixup_script.tail = '\n  '
@@ -133,9 +159,16 @@
 # Iterator over the full spec's body contents
 child_iter = original_body.iterchildren()
 
+def add_class(e, cls):
+    if e.get('class'):
+        e.set('class', e.get('class') + ' ' + cls)
+    else:
+        e.set('class', cls)
+
 # Contents/intro page:
 
 page = deepcopy(doc)
+add_class(page.getroot(), 'split index')
 page_body = page.find('body')
 
 # Keep copying stuff from the front of the source document into this
@@ -149,20 +182,35 @@
 
 # Section/subsection pages:
 
-def getNodeText(node):
-    return re.sub('\s+', ' ', etree.tostring(node, method='text').strip())
+def should_split(e):
+    if e.tag == 'h2': return True
+    if e.get('id') in split_exceptions: return True
+    if e.tag == 'div':
+        c = e.getchildren()
+        if len(c):
+            if c[0].tag == 'h2': return True
+            if c[0].get('id') in split_exceptions: return True
+    return False
+
+def get_heading_text_and_id(e):
+    if e.tag == 'div':
+        node = e.getchildren()[0]
+    else:
+        node = e
+    title = re.sub('\s+', ' ', etree.tostring(node, method='text').strip())
+    return title, node.get('id')
 
 for heading in child_iter:
     # Handle the heading for this section
-    title = getNodeText(heading)
-    name = heading.get('id')
+    title, name = get_heading_text_and_id(heading)
     if name == index_page: name = 'section-%s' % name
     print '  <%s> %s - %s' % (heading.tag, name, title)
 
     page = deepcopy(doc)
+    add_class(page.getroot(), 'split chapter')
     page_body = page.find('body')
 
-    page.find('//title').text = title + u' \u2014 HTML 5'
+    page.find('//title').text = title + u' \u2014 HTML5 '
 
     # Add the header
     page_body.append(deepcopy(short_header))
@@ -174,9 +222,7 @@
     # Keep copying stuff from the source, until we reach the end of the
     # document or find a header to split on
     e = heading
-    while e.getnext() is not None and not (
-            e.getnext().tag == 'h2' or e.getnext().get('id') in split_exceptions
-        ):
+    while e.getnext() is not None and not should_split(e.getnext()):
         e = child_iter.next()
         extract_ids(name, e)
         page_body.append(deepcopy(e))
@@ -226,6 +272,36 @@
         link.tail = '\n  '
         head.append(link)
 
+    # Add a subset of the TOC to each page:
+
+    # Find the items that are on this page
+    new_toc_items = [ (d, id, e) for (d, id, e) in toc_items if id_pages[id] == name ]
+    if len(new_toc_items) > 1: # don't bother if there's only one item, since it looks silly
+        # Construct the new toc <ol>
+        new_toc = etree.XML(u'<ol class="toc"/>')
+        cur_ol = new_toc
+        cur_li = None
+        cur_depth = 0
+        # Add each item, reconstructing the nested <ol>s and <li>s to preserve
+        # the nesting depth of each item
+        for (d, id, e) in new_toc_items:
+            while d > cur_depth:
+                if cur_li is None:
+                    cur_li = etree.XML(u'<li/>')
+                    cur_ol.append(cur_li)
+                cur_ol = etree.XML('<ol/>')
+                cur_li.append(cur_ol)
+                cur_li = None
+                cur_depth += 1
+            while d < cur_depth:
+                cur_li = cur_ol.getparent()
+                cur_ol = cur_li.getparent()
+                cur_depth -= 1
+            cur_li = etree.XML(u'<li/>')
+            cur_li.append(deepcopy(e))
+            cur_ol.append(cur_li)
+        nav.append(new_toc)
+
     doc.find('body').insert(1, nav) # after the header
 
 report_broken_refs()
Received on Friday, 14 May 2010 10:09:32 GMT

This archive was generated by hypermail 2.2.0+W3C-0.50 : Friday, 14 May 2010 10:09:34 GMT