From: Michael Smith via cvs-syncmail <cvsmail@w3.org>
Date: Fri, 14 May 2010 10:09:29 +0000
To: public-html-commits@w3.org
Update of /sources/public/html5/spec/tools
In directory hutz:/tmp/cvs-serv29064/tools
Modified Files:
spec-splitter.py
Log Message:
Remove the experimental timed tracks stuff from the W3C copy, by request of the chairs.
[updated by splitter]
Index: spec-splitter.py
===================================================================
RCS file: /sources/public/html5/spec/tools/spec-splitter.py,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -d -r1.6 -r1.7
--- spec-splitter.py 4 Mar 2010 15:14:52 -0000 1.6
+++ spec-splitter.py 14 May 2010 10:09:27 -0000 1.7
@@ -42,15 +42,27 @@
 if w3c:
     index_page = 'Overview'
 else:
-    index_page = 'spec'
+    index_page = 'spec'
 
 # The document is split on all <h2> elements, plus the following specific elements
 # (which were chosen to split any pages that were larger than about 100-200KB, and
 # may need to be adjusted as the spec changes):
 split_exceptions = [
-    'text-level-semantics', 'embedded-content-0', 'video', 'the-canvas-element', 'tabular-data', 'forms', 'interactive-elements',
-    'offline', 'history', 'structured-client-side-storage',
-    'parsing', 'tokenization', 'tree-construction', 'serializing-html-fragments', 'named-character-references',
+    'common-microsyntaxes', 'urls', 'fetching-resources', 'common-dom-interfaces', 'namespaces', # <-- infrastructure
+    'elements', 'content-models', 'apis-in-html-documents', # <-- dom
+
+    'scripting-1', 'sections', 'grouping-content', 'text-level-semantics', 'edits',
+    'embedded-content-1', 'the-iframe-element', 'video', 'the-canvas-element', 'the-map-element', 'tabular-data',
+    'forms', 'the-input-element', 'states-of-the-type-attribute', 'number-state', 'common-input-element-attributes', 'the-button-element', 'association-of-controls-and-forms',
+    'interactive-elements', 'commands', # <-- semantics
+
+    'predefined-vocabularies-0', 'converting-html-to-other-formats', # <-- microdata
+    'origin-0', 'timers', 'offline', 'history', 'links', # <-- browsers
+    'dnd', # <-- editing
+
+    'workers', 'network', 'web-messaging', 'webstorage',
+
+    'parsing', 'tokenization', 'tree-construction', 'the-end', 'named-character-references', # <-- syntax
 ]
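
(For orientation, here is a minimal sketch, not part of the commit, of how split_exceptions takes effect: every top-level <h2>, plus every element whose id appears in the list above, starts a new output page. It assumes doc is the lxml tree the script parses; 'spec.html' is a stand-in filename.)

    # Hypothetical illustration only:
    from lxml import etree

    doc = etree.parse(open('spec.html'), etree.HTMLParser(encoding='utf-8'))
    for e in doc.find('body').iterchildren():
        if e.tag == 'h2' or e.get('id') in split_exceptions:
            print e.get('id')   # each of these ids becomes its own page
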
@@ -61,11 +73,13 @@
     parser = html5lib.html5parser.HTMLParser(tree = html5lib.treebuilders.getTreeBuilder('lxml'))
     doc = parser.parse(open(file_args[0]), encoding='utf-8')
 else:
-    parser = etree.HTMLParser(encoding='utf-8', recover=False)
+    parser = etree.HTMLParser(encoding='utf-8')
     doc = etree.parse(open(file_args[0]), parser)
 
 print "Splitting..."
 
+doctitle = doc.find('.//title').text
+
 # Absolutise some references, so the spec can be hosted elsewhere
 if absolute_uris:
     for a in ('href', 'src'):
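
Two notes on this hunk: dropping recover=False falls back to lxml's default recovering HTML parser, so minor well-formedness problems in the source no longer abort the split, and the new doctitle line captures the document's <title> text. A small sketch, not part of the commit ('spec.html' is a stand-in filename):

    # Hypothetical illustration only:
    from lxml import etree

    parser = etree.HTMLParser(encoding='utf-8')   # recover=True is lxml's default
    doc = etree.parse(open('spec.html'), parser)
    doctitle = doc.find('.//title').text          # e.g. u'HTML5' for this spec
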
@@ -86,15 +100,27 @@
     original_body.getparent().replace(original_body, default_body)
 
 # Extract the header, so we can reuse it in every page
-header = original_body.find('.//div[@class="head"]')
+header = original_body.find('.//*[@class="head"]')
 
 # Make a stripped-down version of it
 short_header = deepcopy(header)
-del short_header[3:]
+del short_header[2:]
+
+# Extract the items in the TOC (remembering their nesting depth)
+def extract_toc_items(items, ol, depth):
+    for li in ol.iterchildren():
+        for c in li.iterchildren():
+            if c.tag == 'a':
+                assert c.get('href')[0] == '#'
+                items.append( (depth, c.get('href')[1:], c) )
+            elif c.tag == 'ol':
+                extract_toc_items(items, c, depth+1)
+toc_items = []
+extract_toc_items(toc_items, original_body.find('.//ol[@class="toc"]'), 0)
 
 # Prepare the link-fixup script
 if not w3c:
-    link_fixup_script = etree.XML('<script src="link-fixup.js" type="text/javascript"/>')
+    link_fixup_script = etree.XML('<script src="link-fixup.js"/>')
     doc.find('head')[-1].tail = '\n '
     doc.find('head').append(link_fixup_script)
     link_fixup_script.tail = '\n '
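
The new extract_toc_items walks the spec's <ol class="toc"> and flattens it into (depth, fragment id, link element) tuples. A self-contained usage sketch, not part of the commit, with invented sample markup, assuming the definition above is in scope:

    # Hypothetical usage only; the sample TOC is invented:
    from lxml import etree

    sample = etree.XML(
        '<ol class="toc">'
        '<li><a href="#intro">1 Introduction</a>'
        '<ol><li><a href="#scope">1.1 Scope</a></li></ol></li>'
        '</ol>')
    items = []
    extract_toc_items(items, sample, 0)
    for depth, id, a in items:
        print depth, id   # prints "0 intro" then "1 scope"
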
@@ -133,9 +159,16 @@
 # Iterator over the full spec's body contents
 child_iter = original_body.iterchildren()
 
+def add_class(e, cls):
+    if e.get('class'):
+        e.set('class', e.get('class') + ' ' + cls)
+    else:
+        e.set('class', cls)
+
 # Contents/intro page:
 page = deepcopy(doc)
+add_class(page.getroot(), 'split index')
 page_body = page.find('body')
 
 # Keep copying stuff from the front of the source document into this
@@ -149,20 +182,35 @@
 # Section/subsection pages:
 
-def getNodeText(node):
-    return re.sub('\s+', ' ', etree.tostring(node, method='text').strip())
+def should_split(e):
+    if e.tag == 'h2': return True
+    if e.get('id') in split_exceptions: return True
+    if e.tag == 'div':
+        c = e.getchildren()
+        if len(c):
+            if c[0].tag == 'h2': return True
+            if c[0].get('id') in split_exceptions: return True
+    return False
+
+def get_heading_text_and_id(e):
+    if e.tag == 'div':
+        node = e.getchildren()[0]
+    else:
+        node = e
+    title = re.sub('\s+', ' ', etree.tostring(node, method='text').strip())
+    return title, node.get('id')
 
 for heading in child_iter:
     # Handle the heading for this section
-    title = getNodeText(heading)
-    name = heading.get('id')
+    title, name = get_heading_text_and_id(heading)
     if name == index_page: name = 'section-%s' % name
     print ' <%s> %s - %s' % (heading.tag, name, title)
 
     page = deepcopy(doc)
+    add_class(page.getroot(), 'split chapter')
     page_body = page.find('body')
 
-    page.find('//title').text = title + u' \u2014 HTML 5'
+    page.find('//title').text = title + u' \u2014 HTML5 '
 
     # Add the header
     page_body.append(deepcopy(short_header))
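
should_split generalises the old inline test: a page break now also happens at a wrapper <div> whose first child is an <h2> or carries a split-exception id, and get_heading_text_and_id reads the title and id through that wrapper. A sketch, not part of the commit, with invented markup and section numbers, assuming the two definitions above (and the script's re and etree imports) are in scope:

    # Hypothetical illustration only; ids and numbers are invented:
    h = etree.XML('<h2 id="parsing">8 Parsing HTML documents</h2>')
    d = etree.XML('<div><h3 id="dnd">7.9 Drag and drop</h3></div>')
    print should_split(h)              # True: it is an <h2>
    print should_split(d)              # True: 'dnd' is in split_exceptions
    print get_heading_text_and_id(d)   # ('7.9 Drag and drop', 'dnd')
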
@@ -174,9 +222,7 @@
     # Keep copying stuff from the source, until we reach the end of the
     # document or find a header to split on
     e = heading
-    while e.getnext() is not None and not (
-        e.getnext().tag == 'h2' or e.getnext().get('id') in split_exceptions
-    ):
+    while e.getnext() is not None and not should_split(e.getnext()):
         e = child_iter.next()
         extract_ids(name, e)
         page_body.append(deepcopy(e))
@@ -226,6 +272,36 @@
         link.tail = '\n '
         head.append(link)
 
+    # Add a subset of the TOC to each page:
+
+    # Find the items that are on this page
+    new_toc_items = [ (d, id, e) for (d, id, e) in toc_items if id_pages[id] == name ]
+    if len(new_toc_items) > 1: # don't bother if there's only one item, since it looks silly
+        # Construct the new toc <ol>
+        new_toc = etree.XML(u'<ol class="toc"/>')
+        cur_ol = new_toc
+        cur_li = None
+        cur_depth = 0
+        # Add each item, reconstructing the nested <ol>s and <li>s to preserve
+        # the nesting depth of each item
+        for (d, id, e) in new_toc_items:
+            while d > cur_depth:
+                if cur_li is None:
+                    cur_li = etree.XML(u'<li/>')
+                    cur_ol.append(cur_li)
+                cur_ol = etree.XML('<ol/>')
+                cur_li.append(cur_ol)
+                cur_li = None
+                cur_depth += 1
+            while d < cur_depth:
+                cur_li = cur_ol.getparent()
+                cur_ol = cur_li.getparent()
+                cur_depth -= 1
+            cur_li = etree.XML(u'<li/>')
+            cur_li.append(deepcopy(e))
+            cur_ol.append(cur_li)
+        nav.append(new_toc)
+
     doc.find('body').insert(1, nav) # after the header
 
 report_broken_refs()
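
The nested-<ol> reconstruction in the last hunk is worth a worked example: the first while loop opens a fresh <ol> inside the current <li> on the way down, and the second climbs back out via getparent() on the way up, turning the flat (depth, id, element) list back into properly nested lists. A standalone trace, not part of the commit, with invented links:

    # Hypothetical illustration only; the links are invented:
    from copy import deepcopy
    from lxml import etree

    flat = [(0, 'a',  etree.XML('<a href="#a">A</a>')),
            (1, 'a1', etree.XML('<a href="#a1">A.1</a>')),
            (0, 'b',  etree.XML('<a href="#b">B</a>'))]
    new_toc = etree.XML(u'<ol class="toc"/>')
    cur_ol, cur_li, cur_depth = new_toc, None, 0
    for (d, id, e) in flat:
        while d > cur_depth:              # descend: open a nested <ol>
            if cur_li is None:            # no <li> yet at this level
                cur_li = etree.XML(u'<li/>')
                cur_ol.append(cur_li)
            cur_ol = etree.XML('<ol/>')
            cur_li.append(cur_ol)
            cur_li = None
            cur_depth += 1
        while d < cur_depth:              # ascend: climb back out
            cur_li = cur_ol.getparent()
            cur_ol = cur_li.getparent()
            cur_depth -= 1
        cur_li = etree.XML(u'<li/>')
        cur_li.append(deepcopy(e))
        cur_ol.append(cur_li)
    print etree.tostring(new_toc)
    # <ol class="toc"><li><a href="#a">A</a><ol><li><a href="#a1">A.1</a></li>
    # </ol></li><li><a href="#b">B</a></li></ol>
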