From: Michael Smith via cvs-syncmail <cvsmail@w3.org>
Date: Fri, 14 May 2010 10:09:29 +0000
To: public-html-commits@w3.org
Update of /sources/public/html5/spec/tools
In directory hutz:/tmp/cvs-serv29064/tools

Modified Files:
	spec-splitter.py
Log Message:
Remove the experimental timed tracks stuff from the W3C copy, by request of the chairs. [updated by splitter]

Index: spec-splitter.py
===================================================================
RCS file: /sources/public/html5/spec/tools/spec-splitter.py,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -d -r1.6 -r1.7
--- spec-splitter.py	4 Mar 2010 15:14:52 -0000	1.6
+++ spec-splitter.py	14 May 2010 10:09:27 -0000	1.7
@@ -42,15 +42,27 @@
 if w3c:
     index_page = 'Overview'
 else:
-    index_page = 'spec'
+    index_page = 'spec'
 
 # The document is split on all <h2> elements, plus the following specific elements
 # (which were chosen to split any pages that were larger than about 100-200KB, and
 # may need to be adjusted as the spec changes):
 split_exceptions = [
-    'text-level-semantics', 'embedded-content-0', 'video', 'the-canvas-element', 'tabular-data', 'forms', 'interactive-elements',
-    'offline', 'history', 'structured-client-side-storage',
-    'parsing', 'tokenization', 'tree-construction', 'serializing-html-fragments', 'named-character-references',
+    'common-microsyntaxes', 'urls', 'fetching-resources', 'common-dom-interfaces', 'namespaces', # <-- infrastructure
+    'elements', 'content-models', 'apis-in-html-documents', # <-- dom
+
+    'scripting-1', 'sections', 'grouping-content', 'text-level-semantics', 'edits',
+    'embedded-content-1', 'the-iframe-element', 'video', 'the-canvas-element', 'the-map-element', 'tabular-data',
+    'forms', 'the-input-element', 'states-of-the-type-attribute', 'number-state', 'common-input-element-attributes', 'the-button-element', 'association-of-controls-and-forms',
+    'interactive-elements', 'commands', # <-- semantics
+
+    'predefined-vocabularies-0', 'converting-html-to-other-formats', # <-- microdata
+    'origin-0', 'timers', 'offline', 'history', 'links', # <-- browsers
+    'dnd', # <-- editing
+
+    'workers', 'network', 'web-messaging', 'webstorage',
+
+    'parsing', 'tokenization', 'tree-construction', 'the-end', 'named-character-references', # <-- syntax
 ]
@@ -61,11 +73,13 @@
     parser = html5lib.html5parser.HTMLParser(tree = html5lib.treebuilders.getTreeBuilder('lxml'))
     doc = parser.parse(open(file_args[0]), encoding='utf-8')
 else:
-    parser = etree.HTMLParser(encoding='utf-8', recover=False)
+    parser = etree.HTMLParser(encoding='utf-8')
     doc = etree.parse(open(file_args[0]), parser)
 
 print "Splitting..."
 
+doctitle = doc.find('.//title').text
+
 # Absolutise some references, so the spec can be hosted elsewhere
 if absolute_uris:
     for a in ('href', 'src'):
@@ -86,15 +100,27 @@
     original_body.getparent().replace(original_body, default_body)
 
 # Extract the header, so we can reuse it in every page
-header = original_body.find('.//div[@class="head"]')
+header = original_body.find('.//*[@class="head"]')
 
 # Make a stripped-down version of it
 short_header = deepcopy(header)
-del short_header[3:]
+del short_header[2:]
+
+# Extract the items in the TOC (remembering their nesting depth)
+def extract_toc_items(items, ol, depth):
+    for li in ol.iterchildren():
+        for c in li.iterchildren():
+            if c.tag == 'a':
+                assert c.get('href')[0] == '#'
+                items.append( (depth, c.get('href')[1:], c) )
+            elif c.tag == 'ol':
+                extract_toc_items(items, c, depth+1)
+toc_items = []
+extract_toc_items(toc_items, original_body.find('.//ol[@class="toc"]'), 0)
 
 # Prepare the link-fixup script
 if not w3c:
-    link_fixup_script = etree.XML('<script src="link-fixup.js" type="text/javascript"/>')
+    link_fixup_script = etree.XML('<script src="link-fixup.js"/>')
     doc.find('head')[-1].tail = '\n '
     doc.find('head').append(link_fixup_script)
     link_fixup_script.tail = '\n '
@@ -133,9 +159,16 @@
 # Iterator over the full spec's body contents
 child_iter = original_body.iterchildren()
 
+def add_class(e, cls):
+    if e.get('class'):
+        e.set('class', e.get('class') + ' ' + cls)
+    else:
+        e.set('class', cls)
+
 # Contents/intro page:
 page = deepcopy(doc)
+add_class(page.getroot(), 'split index')
 page_body = page.find('body')
 
 # Keep copying stuff from the front of the source document into this
@@ -149,20 +182,35 @@
 
 # Section/subsection pages:
 
-def getNodeText(node):
-    return re.sub('\s+', ' ', etree.tostring(node, method='text').strip())
+def should_split(e):
+    if e.tag == 'h2': return True
+    if e.get('id') in split_exceptions: return True
+    if e.tag == 'div':
+        c = e.getchildren()
+        if len(c):
+            if c[0].tag == 'h2': return True
+            if c[0].get('id') in split_exceptions: return True
+    return False
+
+def get_heading_text_and_id(e):
+    if e.tag == 'div':
+        node = e.getchildren()[0]
+    else:
+        node = e
+    title = re.sub('\s+', ' ', etree.tostring(node, method='text').strip())
+    return title, node.get('id')
 
 for heading in child_iter:
     # Handle the heading for this section
-    title = getNodeText(heading)
-    name = heading.get('id')
+    title, name = get_heading_text_and_id(heading)
     if name == index_page: name = 'section-%s' % name
     print ' <%s> %s - %s' % (heading.tag, name, title)
 
     page = deepcopy(doc)
+    add_class(page.getroot(), 'split chapter')
     page_body = page.find('body')
 
-    page.find('//title').text = title + u' \u2014 HTML 5'
+    page.find('//title').text = title + u' \u2014 HTML5'
 
     # Add the header
     page_body.append(deepcopy(short_header))
@@ -174,9 +222,7 @@
     # Keep copying stuff from the source, until we reach the end of the
     # document or find a header to split on
     e = heading
-    while e.getnext() is not None and not (
-        e.getnext().tag == 'h2' or e.getnext().get('id') in split_exceptions
-    ):
+    while e.getnext() is not None and not should_split(e.getnext()):
         e = child_iter.next()
         extract_ids(name, e)
         page_body.append(deepcopy(e))
@@ -226,6 +272,36 @@
         link.tail = '\n '
         head.append(link)
 
+    # Add a subset of the TOC to each page:
+
+    # Find the items that are on this page
+    new_toc_items = [ (d, id, e) for (d, id, e) in toc_items if id_pages[id] == name ]
+    if len(new_toc_items) > 1: # don't bother if there's only one item, since it looks silly
+        # Construct the new toc <ol>
+        new_toc = etree.XML(u'<ol class="toc"/>')
+        cur_ol = new_toc
+        cur_li = None
+        cur_depth = 0
+        # Add each item, reconstructing the nested <ol>s and <li>s to preserve
+        # the nesting depth of each item
+        for (d, id, e) in new_toc_items:
+            while d > cur_depth:
+                if cur_li is None:
+                    cur_li = etree.XML(u'<li/>')
+                    cur_ol.append(cur_li)
+                cur_ol = etree.XML('<ol/>')
+                cur_li.append(cur_ol)
+                cur_li = None
+                cur_depth += 1
+            while d < cur_depth:
+                cur_li = cur_ol.getparent()
+                cur_ol = cur_li.getparent()
+                cur_depth -= 1
+            cur_li = etree.XML(u'<li/>')
+            cur_li.append(deepcopy(e))
+            cur_ol.append(cur_li)
+        nav.append(new_toc)
+
     doc.find('body').insert(1, nav) # after the header
 
 report_broken_refs()
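Aside from pruning the timed-tracks split points, the main structural change above is that the old inline h2-or-exception test in the copy loop is factored into should_split(), which now also recognises headings wrapped in a <div>. A minimal standalone sketch of that rule, not part of the commit (the sample markup and the shortened exception list are invented; lxml is assumed to be available, and list(e) stands in for the script's getchildren()):

    from lxml import etree

    split_exceptions = ['video', 'parsing']  # abbreviated for illustration

    def should_split(e):
        # Split on <h2>, on an explicitly listed id, or on a <div> whose
        # first child matches either test.
        if e.tag == 'h2': return True
        if e.get('id') in split_exceptions: return True
        if e.tag == 'div':
            c = list(e)
            if len(c):
                if c[0].tag == 'h2': return True
                if c[0].get('id') in split_exceptions: return True
        return False

    body = etree.XML('<body><p/><h2 id="intro"/><div><h4 id="video"/></div></body>')
    print([should_split(child) for child in body])  # [False, True, True]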
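The other substantial addition is the per-page TOC: extract_toc_items() flattens the full table of contents into (depth, id, element) tuples, and each page rebuilds a nested <ol>/<li> list from the tuples that belong to it. A small self-contained sketch of that depth-walk, with invented sample data (lxml assumed; etree.SubElement is used here for brevity where the commit builds elements with etree.XML):

    from lxml import etree

    def build_toc(items):
        # items: (depth, id) pairs in document order
        new_toc = etree.XML('<ol class="toc"/>')
        cur_ol, cur_li, cur_depth = new_toc, None, 0
        for depth, id in items:
            while depth > cur_depth:   # descend: open a nested <ol>
                if cur_li is None:
                    cur_li = etree.SubElement(cur_ol, 'li')
                cur_ol = etree.SubElement(cur_li, 'ol')
                cur_li = None
                cur_depth += 1
            while depth < cur_depth:   # ascend: climb back out
                cur_li = cur_ol.getparent()
                cur_ol = cur_li.getparent()
                cur_depth -= 1
            cur_li = etree.SubElement(cur_ol, 'li')
            etree.SubElement(cur_li, 'a', href='#' + id).text = id
        return new_toc

    toc = build_toc([(0, 'video'), (1, 'the-video-element'), (0, 'audio')])
    print(etree.tostring(toc))

which yields an <ol class="toc"> with 'the-video-element' nested one level inside the 'video' item.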
Received on Friday, 14 May 2010 10:09:32 UTC