W3C home > Mailing lists > Public > public-html-commits@w3.org > July 2009

html5/spec/tools spec-splitter.py,NONE,1.1

From: Michael Smith via cvs-syncmail <cvsmail@w3.org>
Date: Sat, 18 Jul 2009 09:45:14 +0000
To: public-html-commits@w3.org
Message-Id: <E1MS6Ti-0004r6-4U@lionel-hutz.w3.org>
Update of /sources/public/html5/spec/tools
In directory hutz:/tmp/cvs-serv18399/tools

Added Files:
	spec-splitter.py
Log Message:
checked in a multipage version of Hixie's draft, and made it the default view

--- NEW FILE: spec-splitter.py ---
import sys
import re
from lxml import etree # requires lxml 2.0
from copy import deepcopy

print("HTML5 Spec Splitter")

# Command-line state: four boolean switches, plus the positional arguments
# (input file, then output directory) collected in file_args.
absolute_uris = False
w3c = False
use_html5lib_parser = False
use_html5lib_serialiser = False
file_args = []

for arg in sys.argv[1:]:
    if arg == '--absolute':
        absolute_uris = True
    elif arg == '--w3c':
        w3c = True
    elif arg == '--html5lib-parser':
        use_html5lib_parser = True
    elif arg == '--html5lib-serialiser':
        use_html5lib_serialiser = True
    else:
        # Anything that is not a recognised switch is a positional argument
        file_args.append(arg)

# Exactly two positional arguments are required; file_args[0] and
# file_args[1] are used unconditionally later on, so stop here otherwise.
if len(file_args) != 2:
    print('Run like "python spec-splitter.py [options] index multipage"')
    print('(The directory "multipage" must already exist)')
    print('Options:')
    print('  --absolute ............. convert relative URLs to absolute (e.g. for images)')
    print('  --w3c .................. use W3C variant instead of WHATWG')
    print('  --html5lib-parser ...... use html5lib parser instead of lxml')
    print('  --html5lib-serialiser .. use html5lib serialiser instead of lxml')
    sys.exit(1)

# html5lib is only needed when one of the --html5lib-* switches was given,
# so it is imported on demand here rather than at the top of the file.
if use_html5lib_parser or use_html5lib_serialiser:
    import html5lib
    import html5lib.serializer
    import html5lib.treewalkers

# Name (without the ".html" suffix) of the front page, which also holds the
# table of contents that the navigation links point back to.
if w3c:
    index_page = 'Overview'
else:
    index_page = 'index'

# The document is split on all <h2> elements, plus the following specific elements
# (which were chosen to split any pages that were larger than about 100-200KB, and
# may need to be adjusted as the spec changes):
split_exceptions = [
    'text-level-semantics', 'embedded-content-0', 'video', 'the-canvas-element', 'tabular-data', 'forms', 'interactive-elements',
    'offline', 'history', 'structured-client-side-storage',
    'parsing', 'tokenization', 'tree-construction', 'serializing-html-fragments', 'named-character-references',
]

print("Parsing...")

# Parse document: exactly one of the two parsers is used, chosen by the
# --html5lib-parser switch. The input file is closed once parsing is done.
source_file = open(file_args[0])
try:
    if use_html5lib_parser:
        parser = html5lib.html5parser.HTMLParser(tree = html5lib.treebuilders.getTreeBuilder('lxml'))
        doc = parser.parse(source_file, encoding='utf-8')
    else:
        parser = etree.HTMLParser(encoding='utf-8', recover=False)
        doc = etree.parse(source_file, parser)
finally:
    source_file.close()

print("Splitting...")

# Absolutise some references, so the spec can be hosted elsewhere
if absolute_uris:
    for a in ('href', 'src'):
        for t in ('link', 'script', 'img'):
            for e in doc.findall('//%s[@%s]' % (t, a)):
                url = e.get(a)
                # Root-relative URLs only need the host; everything else is
                # resolved against the spec's directory. The two cases are
                # mutually exclusive so that a URL is prefixed exactly once.
                # startswith() also copes with an empty attribute value.
                # NOTE(review): URLs that are already absolute would be
                # prefixed by the else branch too - confirm none occur.
                if url.startswith('/'):
                    e.set(a, 'http://www.whatwg.org' + url)
                else:
                    e.set(a, 'http://www.whatwg.org/specs/web-apps/current-work/' + url)

# Pull the real <body> out of the source document...
original_body = doc.find('body')

# ...and put an empty <body> in its place, for the page content to be added
# into later. The class is carried over as-is; an existing onload handler is
# kept but run after fixBrokenLink().
default_body = etree.Element('body')
for attr, template in (('class', '%s'), ('onload', 'fixBrokenLink(); %s')):
    value = original_body.get(attr)
    if value:
        default_body.set(attr, template % value)
original_body.getparent().replace(original_body, default_body)

# The header is reused on every page, so look it up once
header = original_body.find('.//div[@class="head"]')

# Stripped-down copy of the header: only its first three children are kept
short_header = deepcopy(header)
del short_header[3:]

# Prepare the link-fixup script (WHATWG variant only) and add it as the last
# child of <head>. The previous last child's tail and the script's own tail
# are set so the serialised markup stays indented.
if not w3c:
    link_fixup_script = etree.XML('<script src="link-fixup.js"/>')
    doc.find('head')[-1].tail = '\n  '
    # Without this the element is built but never becomes part of the document
    doc.find('head').append(link_fixup_script)
    link_fixup_script.tail = '\n  '

# Stuff for fixing up references:

def get_page_filename(name):
    """Return the output file name for the page called *name*.

    The file name is simply the page name followed by ".html".
    """
    filename = '%s.html' % name
    return filename

# Maps every id seen so far to the page it was found on
id_pages = {}
def extract_ids(page, node):
    """Record in id_pages that every id in *node*'s subtree lives on *page*.

    Covers the id of *node* itself (when it has a non-empty one) and the ids
    of all of its descendants. An id that was already recorded is
    overwritten with the new page.
    """
    own_id = node.get('id')
    if own_id:
        id_pages[own_id] = page
    id_pages.update((e.get('id'), page) for e in node.findall('.//*[@id]'))

# Updates all the href="#id" to point to page#id
missing_warnings = set()
def fix_refs(page, node):
    """Rewrite same-document links under *node* that now cross pages.

    For every <a href="#id"> below *node*: if the id is known in id_pages
    and lives on a page other than *page*, the href becomes
    "<that page's file>#id". Links whose target is on *page* itself are left
    alone. Ids that are not in id_pages at all are added to
    missing_warnings, which report_broken_refs() prints.
    """
    for e in node.findall('.//a[@href]'):
        href = e.get('href')
        # startswith() also copes with an empty href, which [0] would not
        if not href.startswith('#'):
            continue
        target = href[1:]
        if target in id_pages:
            if id_pages[target] != page: # only do non-local links
                e.set('href', '%s#%s' % (get_page_filename(id_pages[target]), target))
        else:
            missing_warnings.add(target)

def report_broken_refs():
    """Print one warning line per id in missing_warnings, in sorted order."""
    unresolved = sorted(missing_warnings)
    for fragment in unresolved:
        print("warning: can't find target for #%s" % fragment)

pages = [] # for saving all the output, so fix_refs can be called in a second pass

# Iterator over the full spec's body contents
child_iter = original_body.iterchildren()

# Contents/intro page:

# Every page starts out as a full copy of the document; at this point the
# document's <body> is the empty placeholder that replaced the original one.
page = deepcopy(doc)
page_body = page.find('body')

# Keep copying stuff from the front of the source document into this
# page, until we find the first heading that isn't class="no-toc"
# NOTE(review): the `if` below has no body in this copy of the file, and
# nothing in the loop adds `e` to page_body - the loop body appears to have
# been lost. Restore it from the original check-in before running.
for e in child_iter:
    if e.getnext().tag == 'h2' and 'no-toc' not in (e.getnext().get('class') or '').split(' '):

# Each entry in pages is a (page name, document, title) tuple
pages.append( (index_page, page, 'Front cover') )

# Section/subsection pages:

def getNodeText(node):
    """Return the text of *node* with whitespace normalised.

    The node is serialised with method='text', leading and trailing
    whitespace is stripped, and every remaining run of whitespace is
    replaced by a single space.
    """
    # NOTE(review): tostring() is called without an explicit encoding -
    # confirm that headings containing non-ASCII text are handled.
    text = etree.tostring(node, method='text').strip()
    # Raw string: '\s' in a normal string literal is an invalid escape
    return re.sub(r'\s+', ' ', text)

# child_iter is the same iterator the front-page loop consumed from, so this
# loop continues with whatever elements that loop left unread.
for heading in child_iter:
    # Handle the heading for this section
    title = getNodeText(heading)
    name = heading.get('id')
    # Keep this section's page name distinct from the front page's name
    if name == index_page: name = 'section-%s' % name
    print '  <%s> %s' % (heading.tag, name)

    page = deepcopy(doc)
    page_body = page.find('body')

    # Add the header
    # NOTE(review): no statement follows this comment, and page_body is never
    # appended to anywhere in this loop - lines appear to be missing from
    # this copy of the file.

    # Add the page heading
    extract_ids(name, heading)

    # Keep copying stuff from the source, until we reach the end of the
    # document or find a header to split on
    # NOTE(review): the parenthesis opened on the `while` line below is never
    # closed in this copy, so the condition is incomplete as written.
    e = heading
    while e.getnext() is not None and not (
            e.getnext().tag == 'h2' or e.getnext().get('id') in split_exceptions
        e = child_iter.next()
        extract_ids(name, e)

    pages.append( (name, page, title) )

# Fix the links, and add some navigation:
#
# Second pass over all pages. By now id_pages knows where every id ended up,
# so cross-page links can be rewritten. Every page except the front page also
# gets a navigation element inserted into <body> (previous / table of
# contents / next) and matching <link rel=...> elements appended to <head>.
# NOTE(review): page titles are interpolated into XML snippets unescaped; a
# title containing '&' or '<' would make etree.XML() fail - confirm none do.

for i in range(len(pages)):
    name, doc, title = pages[i]

    fix_refs(name, doc)

    if name == index_page: continue # don't add nav links to the TOC page

    head = doc.find('head')

    if w3c:
        nav = etree.Element('div') # HTML 4 compatibility
    else:
        nav = etree.Element('nav')
    nav.text = '\n   '
    nav.tail = '\n\n  '

    # i > 1 skips the "previous" link on the first section page, whose
    # predecessor is the front cover
    if i > 1:
        href = get_page_filename(pages[i-1][0])
        title = pages[i-1][2]
        a = etree.XML(u'<a href="%s">\u2190 %s</a>' % (href, title))
        a.tail = u' \u2013\n   '
        nav.append(a)
        link = etree.XML('<link href="%s" title="%s" rel="prev"/>' % (href, title))
        link.tail = '\n  '
        head.append(link)

    a = etree.XML('<a href="%s.html#contents">Table of contents</a>' % index_page)
    a.tail = '\n  '
    nav.append(a)
    link = etree.XML('<link href="%s.html#contents" title="Table of contents" rel="index"/>' % index_page)
    link.tail = '\n  '
    head.append(link)

    if i != len(pages)-1:
        href = get_page_filename(pages[i+1][0])
        title = pages[i+1][2]
        a = etree.XML(u'<a href="%s">%s \u2192</a>' % (href, title))
        a.tail = '\n  '
        # The link must be inside nav before getprevious() can find the
        # table-of-contents link, whose tail becomes the dash separator
        nav.append(a)
        a.getprevious().tail = u' \u2013\n   '
        link = etree.XML('<link href="%s" title="%s" rel="next"/>' % (href, title))
        link.tail = '\n  '
        head.append(link)

    doc.find('body').insert(1, nav) # after the header


print("Outputting...")

# Output all the pages: each one goes to <output dir>/<page name>.html,
# starting with the doctype of the selected variant and followed by the
# document serialised with either html5lib or lxml.
for name, doc, title in pages:
    f = open('%s/%s' % (file_args[1], get_page_filename(name)), 'w')
    try:
        # Exactly one doctype per file
        if w3c:
            f.write('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">\n')
        else:
            f.write('<!DOCTYPE HTML>\n')
        if use_html5lib_serialiser:
            tokens = html5lib.treewalkers.getTreeWalker('lxml')(doc)
            serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=True, inject_meta_charset=False)
            # serialize() yields the output piece by piece
            for text in serializer.serialize(tokens, encoding='us-ascii'):
                f.write(text)
        else:
            f.write(etree.tostring(doc, pretty_print=False, method="html"))
    finally:
        # Close each page's file even if serialisation fails
        f.close()

# Generate the script to fix broken links
# The write below emits a JavaScript object literal that maps every collected
# id to the name of the page it ended up on. Backslashes and single quotes in
# the ids (the keys) are escaped before they go into the single-quoted strings.
f = open('%s/fragment-links.js' % (file_args[1]), 'w')
f.write('var fragment_links = { ' + ','.join("'%s':'%s'" % (k.replace("\\", "\\\\").replace("'", "\\'"), v) for (k,v) in id_pages.items()) + ' };\n')
# NOTE(review): the lines from here down to `if (page) {` are JavaScript, not
# Python. They look like the contents of a string literal meant to be written
# to the same file; the Python statement wrapping them, and apparently part
# of the script itself, is missing from this copy of the file.
var fragid = window.location.hash.substr(1);
if (!fragid) { /* handle section-foo.html links from the old multipage version, and broken foo.html from the new version */
    var m = window.location.pathname.match(/\/(?:section-)?([\w\-]+)\.html/);
    if (m) fragid = m[1];
var page = fragment_links[fragid];
if (page) {

print "Done."
Received on Saturday, 18 July 2009 09:45:24 GMT

This archive was generated by hypermail 2.2.0+W3C-0.50 : Saturday, 18 July 2009 09:45:24 GMT