- From: Michael Smith via cvs-syncmail <cvsmail@w3.org>
- Date: Sat, 10 Dec 2011 06:15:51 +0000
- To: public-html-commits@w3.org
Update of /sources/public/html5/spec/tools
In directory hutz:/tmp/cvs-serv12849/tools

Modified Files:
        spec-splitter.py
Log Message:
Remove some copypasta. (whatwg r6869) [updated by splitter]

Index: spec-splitter.py
===================================================================
RCS file: /sources/public/html5/spec/tools/spec-splitter.py,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -d -r1.8 -r1.9
--- spec-splitter.py    5 Dec 2010 09:40:04 -0000   1.8
+++ spec-splitter.py    10 Dec 2011 06:15:49 -0000  1.9
@@ -9,6 +9,7 @@
 w3c = False
 use_html5lib_parser = False
 use_html5lib_serialiser = False
+make_index_of_terms = False
 file_args = []
 
 for arg in sys.argv[1:]:
@@ -20,6 +21,8 @@
         use_html5lib_parser = True
     elif arg == '--html5lib-serialiser':
         use_html5lib_serialiser = True
+    elif arg == '--make-index-of-terms':
+        make_index_of_terms = True
     else:
         file_args.append(arg)
 
@@ -42,25 +45,92 @@
 if w3c:
     index_page = 'Overview'
 else:
-    index_page = 'spec'
+    index_page = 'index'
 
 # The document is split on all <h2> elements, plus the following specific elements
 # (which were chosen to split any pages that were larger than about 100-200KB, and
 # may need to be adjusted as the spec changes):
 split_exceptions = [
-    'common-microsyntaxes', 'urls', 'fetching-resources', 'common-dom-interfaces', 'namespaces', # <-- infrastructure
+    'the-a-element', 'the-abbr-element', 'the-address-element',
+    'the-area-element', 'the-article-element', 'the-aside-element',
+    'the-audio-element', 'the-b-element', 'the-base-element',
+    'the-bdi-element', 'the-bdo-element', 'the-blockquote-element',
+    'the-body-element', 'the-br-element', 'the-button-element',
+    'the-canvas-element', 'the-caption-element', 'the-cite-element',
+    'the-code-element', 'the-col-element', 'the-colgroup-element',
+    'the-command-element', 'the-datalist-element', 'the-dd-element',
+    'the-del-element', 'the-details-element', 'the-dfn-element',
+    'the-dir-element', 'the-div-element', 'the-dl-element',
+    'the-dt-element', 'the-em-element', 'the-embed-element',
+    'the-fieldset-element', 'the-figcaption-element', 'the-figure-element',
+    'the-footer-element', 'the-form-element',
+    'the-h1-h2-h3-h4-h5-and-h6-elements', 'the-head-element',
+    'the-header-element', 'the-hgroup-element', 'the-hr-element',
+    'the-html-element', 'the-i-element', 'the-iframe-element',
+    'the-img-element', 'the-input-element', 'the-ins-element',
+    'the-kbd-element', 'the-keygen-element', 'the-label-element',
+    'the-legend-element', 'the-li-element', 'the-link-element',
+    'the-map-element', 'the-mark-element', 'the-menu-element',
+    'the-meta-element', 'the-meter-element', 'the-nav-element',
+    'the-noscript-element', 'the-object-element', 'the-ol-element',
+    'the-optgroup-element', 'the-option-element', 'the-output-element',
+    'the-p-element', 'the-param-element', 'the-pre-element',
+    'the-progress-element', 'the-q-element', 'the-rp-element',
+    'the-rt-element', 'the-ruby-element', 'the-s-element',
+    'the-samp-element', 'the-script-element', 'the-section-element',
+    'the-select-element', 'the-small-element', 'the-source-element',
+    'the-span-element', 'the-strong-element', 'the-style-element',
+    'the-sub-and-sup-elements', 'the-summary-element', 'the-table-element',
+    'the-tbody-element', 'the-td-element', 'the-textarea-element',
+    'the-tfoot-element', 'the-th-element', 'the-thead-element',
+    'the-time-element', 'the-title-element', 'the-tr-element',
+    'the-track-element', 'the-u-element', 'the-ul-element',
+    'the-var-element', 'the-video-element', 'the-wbr-element',
+
+    'styling',
+    'usage-summary',
+    'attributes-common-to-ins-and-del-elements',
+    'edits-and-paragraphs',
+    'edits-and-lists',
+    'media-elements',
+    'image-maps',
+    'mathml',
+    'svg-0',
+    'dimension-attributes',
+    'attributes-common-to-td-and-th-elements',
+    'examples',
+    'common-input-element-apis',
+
+    'global-attributes',
+    'element-definitions',
+    'common-dom-interfaces',
+    'namespaces',
+    'requirements-relating-to-bidirectional-algorithm-formatting-characters',
+    'wai-aria',
+    'headings-and-sections',
+
+    'dynamic-markup-insertion',
+    'common-microsyntaxes', 'urls', # <-- infrastructure
     'elements', 'content-models', 'apis-in-html-documents', # <-- dom
+    'attributes-common-to-form-controls',
+    'textFieldSelection',
+    'constraints',
+    'form-submission',
+
+    'common-idioms-without-dedicated-elements',
+
     'scripting-1', 'sections', 'grouping-content', 'text-level-semantics', 'edits',
-    'embedded-content-1', 'the-iframe-element', 'video', 'the-canvas-element', 'the-map-element', 'tabular-data',
-    'forms', 'the-input-element', 'states-of-the-type-attribute', 'number-state', 'common-input-element-attributes', 'the-button-element', 'association-of-controls-and-forms',
+    'embedded-content-1', 'tabular-data',
+    'forms', 'states-of-the-type-attribute', 'number-state', 'common-input-element-attributes', 'the-button-element', 'association-of-controls-and-forms',
     'interactive-elements', 'commands', # <-- semantics
 
     'predefined-vocabularies-0', 'converting-html-to-other-formats', # <-- microdata
 
     'origin-0', 'timers', 'offline', 'history', 'links', # <-- browsers
+    'user-prompts',
+    'system-state-and-capabilities',
     'dnd', # <-- editing
-
-    'workers', 'network', 'web-messaging', 'webstorage',
+    'editing-apis',
 
     'parsing', 'tokenization', 'tree-construction', 'the-end', 'named-character-references', # <-- syntax
 ]
@@ -80,6 +150,10 @@
 
 doctitle = doc.find('.//title').text
 
+if make_index_of_terms:
+    # get all the nodes from the index of terms (if any) and save for later
+    index_of_terms = doc.xpath("//*[@class='index-of-terms']//dl")
+
 # Absolutise some references, so the spec can be hosted elsewhere
 if absolute_uris:
     for a in ('href', 'src'):
@@ -104,14 +178,14 @@
 
 # Make a stripped-down version of it
 short_header = deepcopy(header)
-del short_header[2:]
+del short_header[4:]
 
 # Extract the items in the TOC (remembering their nesting depth)
 def extract_toc_items(items, ol, depth):
     for li in ol.iterchildren():
         for c in li.iterchildren():
             if c.tag == 'a':
-                assert c.get('href')[0] == '#'
+                if c.get('href')[0] == '#':
                     items.append( (depth, c.get('href')[1:], c) )
             elif c.tag == 'ol':
                 extract_toc_items(items, c, depth+1)
@@ -168,8 +242,8 @@
 
 # Contents/intro page:
 page = deepcopy(doc)
-add_class(page.getroot(), 'split index')
 page_body = page.find('body')
+add_class(page_body, 'split index')
 
 # Keep copying stuff from the front of the source document into this
 # page, until we find the first heading that isn't class="no-toc"
@@ -185,7 +259,7 @@
 def should_split(e):
     if e.tag == 'h2': return True
     if e.get('id') in split_exceptions: return True
-    if e.tag == 'div':
+    if e.tag == 'div' and e.get('class') == 'impl':
         c = e.getchildren()
         if len(c):
             if c[0].tag == 'h2': return True
@@ -193,7 +267,7 @@
     return False
 
 def get_heading_text_and_id(e):
-    if e.tag == 'div':
+    if e.tag == 'div' and e.get('class') == 'impl':
         node = e.getchildren()[0]
     else:
         node = e
@@ -207,10 +281,10 @@
     print ' <%s> %s - %s' % (heading.tag, name, title)
 
     page = deepcopy(doc)
-    add_class(page.getroot(), 'split chapter')
     page_body = page.find('body')
+    add_class(page_body, 'split chapter')
 
-    page.find('//title').text = title + u' \u2014 HTML5 '
+    page.find('//title').text = title + u' \u2014 ' + doctitle
 
     # Add the header
     page_body.append(deepcopy(short_header))
@@ -239,17 +313,19 @@
     if name == index_page: continue # don't add nav links to the TOC page
 
     head = doc.find('head')
-    style = etree.XML('<link href="style.css" rel="stylesheet"/>')
-    head.append(style)
 
-    nav = etree.Element('div') # HTML 4 compatibility
+    if w3c:
+        nav = etree.Element('div') # HTML 4 compatibility
+    else:
+        nav = etree.Element('nav')
+    nav.set('class', 'prev_next')
     nav.text = '\n '
     nav.tail = '\n\n '
 
     if i > 1:
         href = get_page_filename(pages[i-1][0])
         title = pages[i-1][2]
-        a = etree.XML(u'<a href="%s" class="prev">%s</a>' % (href, title))
+        a = etree.XML(u'<a href="%s">\u2190 %s</a>' % (href, title))
         a.tail = u' \u2013\n '
         nav.append(a)
         link = etree.XML('<link href="%s" title="%s" rel="prev"/>' % (href, title))
@@ -259,14 +335,14 @@
     a = etree.XML('<a href="%s.html#contents">Table of contents</a>' % index_page)
     a.tail = '\n '
     nav.append(a)
-    link = etree.XML('<link href="%s.html#contents" title="Table of contents" rel="index"/>' % index_page)
+    link = etree.XML('<link href="%s.html#contents" title="Table of contents" rel="contents"/>' % index_page)
     link.tail = '\n '
    head.append(link)
 
     if i != len(pages)-1:
         href = get_page_filename(pages[i+1][0])
         title = pages[i+1][2]
-        a = etree.XML(u'<a href="%s" class="next">%s</a>' % (href, title))
+        a = etree.XML(u'<a href="%s">%s \u2192</a>' % (href, title))
         a.tail = '\n '
         nav.append(a)
         a.getprevious().tail = u' \u2013\n '
@@ -306,6 +382,22 @@
 
     doc.find('body').insert(1, nav) # after the header
 
+if make_index_of_terms:
+# Write additional separate files for each term entry in the index of terms.
+# Each term entry should be a <dl> with an id attribute whose value is an id of
+# a <dfn>, with the string "_index" appended to it.
+# For now, the subdirectory for the files is hardcoded here as "index-of-terms".
+    for term in index_of_terms:
+        # the firstChild <dt> here is a name and link for the defining instance of
+        # each index term; we don't need that in this context, so just remove it
+        term.remove(term.find("./dt"))
+        fix_refs('DUMMY', term)
+        # we use the ID of the term as the base for the filename, minus the last six
+        # characters ("_index")
+        id = term.get("id")[:-6]
+        f = open('%s/%s' % ("index-of-terms", id+".html"), 'w')
+        f.write(etree.tostring(term, pretty_print=True, method="html"))
+
 report_broken_refs()
 
 print "Outputting..."
@@ -313,12 +405,13 @@
 # Output all the pages
 for name, doc, title in pages:
     f = open('%s/%s' % (file_args[1], get_page_filename(name)), 'w')
-    f.write('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">\n')
     if use_html5lib_serialiser:
         tokens = html5lib.treewalkers.getTreeWalker('lxml')(doc)
         serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=True, inject_meta_charset=False)
+        f.write("<!doctype html>\n")
         for text in serializer.serialize(tokens, encoding='us-ascii'):
-            f.write(text)
+            if text != '<!DOCTYPE html>': # some versions of lxml emit this; get rid of it if so
+                f.write(text)
     else:
         f.write(etree.tostring(doc, pretty_print=False, method="html"))
 
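For readers skimming the patch, here is a minimal standalone sketch (not part of the commit) of what the new --make-index-of-terms stage does. The function name write_index_of_terms and its out_dir parameter are invented for illustration, the fix_refs('DUMMY', term) step is omitted because it depends on the rest of the script, and, as in the patched code, the index-of-terms output directory is assumed to already exist. Like the script itself, the sketch is written for Python 2 with lxml.

    from lxml import etree

    def write_index_of_terms(doc, out_dir='index-of-terms'):
        # Per the patch comments: each term entry is a <dl> under an element with
        # class="index-of-terms", and its id is the id of the defining <dfn> with
        # the suffix "_index" appended.
        for term in doc.xpath("//*[@class='index-of-terms']//dl"):
            dt = term.find('./dt')
            if dt is not None:
                term.remove(dt)  # drop the link to the defining instance; not needed here
            term_id = term.get('id')[:-len('_index')]  # strip the "_index" suffix
            f = open('%s/%s.html' % (out_dir, term_id), 'w')
            f.write(etree.tostring(term, pretty_print=True, method='html'))
            f.close()

With the flag wired in as above, an invocation would presumably look something like "python spec-splitter.py --make-index-of-terms source.html output" (the exact arguments depend on how the spec is built), since the script reads the input document and the output directory from its two positional file arguments.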
Received on Saturday, 10 December 2011 06:15:56 UTC