- From: Michael Smith via cvs-syncmail <cvsmail@w3.org>
- Date: Sat, 10 Dec 2011 06:15:51 +0000
- To: public-html-commits@w3.org
Update of /sources/public/html5/spec/tools
In directory hutz:/tmp/cvs-serv12849/tools
Modified Files:
spec-splitter.py
Log Message:
Remove some copypasta. (whatwg r6869)
[updated by splitter]
Index: spec-splitter.py
===================================================================
RCS file: /sources/public/html5/spec/tools/spec-splitter.py,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -d -r1.8 -r1.9
--- spec-splitter.py 5 Dec 2010 09:40:04 -0000 1.8
+++ spec-splitter.py 10 Dec 2011 06:15:49 -0000 1.9
@@ -9,6 +9,7 @@
w3c = False
use_html5lib_parser = False
use_html5lib_serialiser = False
+make_index_of_terms = False
file_args = []
for arg in sys.argv[1:]:
@@ -20,6 +21,8 @@
use_html5lib_parser = True
elif arg == '--html5lib-serialiser':
use_html5lib_serialiser = True
+ elif arg == '--make-index-of-terms':
+ make_index_of_terms = True
else:
file_args.append(arg)
@@ -42,25 +45,92 @@
if w3c:
index_page = 'Overview'
else:
- index_page = 'spec'
+ index_page = 'index'
# The document is split on all <h2> elements, plus the following specific elements
# (which were chosen to split any pages that were larger than about 100-200KB, and
# may need to be adjusted as the spec changes):
split_exceptions = [
- 'common-microsyntaxes', 'urls', 'fetching-resources', 'common-dom-interfaces', 'namespaces', # <-- infrastructure
+ 'the-a-element', 'the-abbr-element', 'the-address-element',
+ 'the-area-element', 'the-article-element', 'the-aside-element',
+ 'the-audio-element', 'the-b-element', 'the-base-element',
+ 'the-bdi-element', 'the-bdo-element', 'the-blockquote-element',
+ 'the-body-element', 'the-br-element', 'the-button-element',
+ 'the-canvas-element', 'the-caption-element', 'the-cite-element',
+ 'the-code-element', 'the-col-element', 'the-colgroup-element',
+ 'the-command-element', 'the-datalist-element', 'the-dd-element',
+ 'the-del-element', 'the-details-element', 'the-dfn-element',
+ 'the-dir-element', 'the-div-element', 'the-dl-element',
+ 'the-dt-element', 'the-em-element', 'the-embed-element',
+ 'the-fieldset-element', 'the-figcaption-element', 'the-figure-element',
+ 'the-footer-element', 'the-form-element',
+ 'the-h1-h2-h3-h4-h5-and-h6-elements', 'the-head-element',
+ 'the-header-element', 'the-hgroup-element', 'the-hr-element',
+ 'the-html-element', 'the-i-element', 'the-iframe-element',
+ 'the-img-element', 'the-input-element', 'the-ins-element',
+ 'the-kbd-element', 'the-keygen-element', 'the-label-element',
+ 'the-legend-element', 'the-li-element', 'the-link-element',
+ 'the-map-element', 'the-mark-element', 'the-menu-element',
+ 'the-meta-element', 'the-meter-element', 'the-nav-element',
+ 'the-noscript-element', 'the-object-element', 'the-ol-element',
+ 'the-optgroup-element', 'the-option-element', 'the-output-element',
+ 'the-p-element', 'the-param-element', 'the-pre-element',
+ 'the-progress-element', 'the-q-element', 'the-rp-element',
+ 'the-rt-element', 'the-ruby-element', 'the-s-element',
+ 'the-samp-element', 'the-script-element', 'the-section-element',
+ 'the-select-element', 'the-small-element', 'the-source-element',
+ 'the-span-element', 'the-strong-element', 'the-style-element',
+ 'the-sub-and-sup-elements', 'the-summary-element', 'the-table-element',
+ 'the-tbody-element', 'the-td-element', 'the-textarea-element',
+ 'the-tfoot-element', 'the-th-element', 'the-thead-element',
+ 'the-time-element', 'the-title-element', 'the-tr-element',
+ 'the-track-element', 'the-u-element', 'the-ul-element',
+ 'the-var-element', 'the-video-element', 'the-wbr-element',
+
+ 'styling',
+ 'usage-summary',
+ 'attributes-common-to-ins-and-del-elements',
+ 'edits-and-paragraphs',
+ 'edits-and-lists',
+ 'media-elements',
+ 'image-maps',
+ 'mathml',
+ 'svg-0',
+ 'dimension-attributes',
+ 'attributes-common-to-td-and-th-elements',
+ 'examples',
+ 'common-input-element-apis',
+
+ 'global-attributes',
+ 'element-definitions',
+ 'common-dom-interfaces',
+ 'namespaces',
+ 'requirements-relating-to-bidirectional-algorithm-formatting-characters',
+ 'wai-aria',
+ 'headings-and-sections',
+
+ 'dynamic-markup-insertion',
+ 'common-microsyntaxes', 'urls', # <-- infrastructure
'elements', 'content-models', 'apis-in-html-documents', # <-- dom
+ 'attributes-common-to-form-controls',
+ 'textFieldSelection',
+ 'constraints',
+ 'form-submission',
+
+ 'common-idioms-without-dedicated-elements',
+
'scripting-1', 'sections', 'grouping-content', 'text-level-semantics', 'edits',
- 'embedded-content-1', 'the-iframe-element', 'video', 'the-canvas-element', 'the-map-element', 'tabular-data',
- 'forms', 'the-input-element', 'states-of-the-type-attribute', 'number-state', 'common-input-element-attributes', 'the-button-element', 'association-of-controls-and-forms',
+ 'embedded-content-1', 'tabular-data',
+ 'forms', 'states-of-the-type-attribute', 'number-state', 'common-input-element-attributes', 'the-button-element', 'association-of-controls-and-forms',
'interactive-elements', 'commands', # <-- semantics
'predefined-vocabularies-0', 'converting-html-to-other-formats', # <-- microdata
'origin-0', 'timers', 'offline', 'history', 'links', # <-- browsers
+ 'user-prompts',
+ 'system-state-and-capabilities',
'dnd', # <-- editing
-
- 'workers', 'network', 'web-messaging', 'webstorage',
+ 'editing-apis',
'parsing', 'tokenization', 'tree-construction', 'the-end', 'named-character-references', # <-- syntax
]
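Not part of the patch, but since the split list above is maintained by hand, a quick check that each id still exists in the source document can catch entries that go stale when the spec renames sections. A minimal sketch, assuming lxml and a local single-page copy of the spec; the filename and the shortened list are placeholders:

# Sanity-check sketch (hypothetical, not in the repository): report any
# split_exceptions entry that no longer matches an id in the source spec.
from lxml import etree

split_exceptions = ['the-a-element', 'the-abbr-element']  # ... full list as above

doc = etree.parse('single-page.html', etree.HTMLParser())  # placeholder path
present = set(doc.xpath('//@id'))
stale = [name for name in split_exceptions if name not in present]
if stale:
    print 'stale split_exceptions entries:', ', '.join(stale)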
@@ -80,6 +150,10 @@
doctitle = doc.find('.//title').text
+if make_index_of_terms:
+ # get all the nodes from the index of terms (if any) and save for later
+ index_of_terms = doc.xpath("//*[@class='index-of-terms']//dl")
+
# Absolutise some references, so the spec can be hosted elsewhere
if absolute_uris:
for a in ('href', 'src'):
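As an aside, the new xpath collects every dl inside the index-of-terms container. A minimal sketch of the markup shape it assumes, with made-up ids; the "_index" suffix follows the convention described further down in the patch:

# Illustration only: the kind of fragment //*[@class='index-of-terms']//dl matches.
from lxml import etree

fragment = etree.HTML('''
  <div class="index-of-terms">
    <dl id="foo-term_index">
      <dt><a href="#foo-term">foo term</a></dt>
      <dd><a href="page.html#use-1">first reference</a></dd>
    </dl>
  </div>''')

terms = fragment.xpath("//*[@class='index-of-terms']//dl")
print [dl.get('id') for dl in terms]   # ['foo-term_index']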
@@ -104,14 +178,14 @@
# Make a stripped-down version of it
short_header = deepcopy(header)
-del short_header[2:]
+del short_header[4:]
# Extract the items in the TOC (remembering their nesting depth)
def extract_toc_items(items, ol, depth):
for li in ol.iterchildren():
for c in li.iterchildren():
if c.tag == 'a':
- assert c.get('href')[0] == '#'
+ if c.get('href')[0] == '#':
items.append( (depth, c.get('href')[1:], c) )
elif c.tag == 'ol':
extract_toc_items(items, c, depth+1)
@@ -168,8 +242,8 @@
# Contents/intro page:
page = deepcopy(doc)
-add_class(page.getroot(), 'split index')
page_body = page.find('body')
+add_class(page_body, 'split index')
# Keep copying stuff from the front of the source document into this
# page, until we find the first heading that isn't class="no-toc"
@@ -185,7 +259,7 @@
def should_split(e):
if e.tag == 'h2': return True
if e.get('id') in split_exceptions: return True
- if e.tag == 'div':
+ if e.tag == 'div' and e.get('class') == 'impl':
c = e.getchildren()
if len(c):
if c[0].tag == 'h2': return True
@@ -193,7 +267,7 @@
return False
def get_heading_text_and_id(e):
- if e.tag == 'div':
+ if e.tag == 'div' and e.get('class') == 'impl':
node = e.getchildren()[0]
else:
node = e
@@ -207,10 +281,10 @@
print ' <%s> %s - %s' % (heading.tag, name, title)
page = deepcopy(doc)
- add_class(page.getroot(), 'split chapter')
page_body = page.find('body')
+ add_class(page_body, 'split chapter')
- page.find('//title').text = title + u' \u2014 HTML5 '
+ page.find('//title').text = title + u' \u2014 ' + doctitle
# Add the header
page_body.append(deepcopy(short_header))
@@ -239,17 +313,19 @@
if name == index_page: continue # don't add nav links to the TOC page
head = doc.find('head')
- style = etree.XML('<link href="style.css" rel="stylesheet"/>')
- head.append(style)
- nav = etree.Element('div') # HTML 4 compatibility
+ if w3c:
+ nav = etree.Element('div') # HTML 4 compatibility
+ else:
+ nav = etree.Element('nav')
+ nav.set('class', 'prev_next')
nav.text = '\n '
nav.tail = '\n\n '
if i > 1:
href = get_page_filename(pages[i-1][0])
title = pages[i-1][2]
- a = etree.XML(u'<a href="%s" class="prev">%s</a>' % (href, title))
+ a = etree.XML(u'<a href="%s">\u2190 %s</a>' % (href, title))
a.tail = u' \u2013\n '
nav.append(a)
link = etree.XML('<link href="%s" title="%s" rel="prev"/>' % (href, title))
@@ -259,14 +335,14 @@
a = etree.XML('<a href="%s.html#contents">Table of contents</a>' % index_page)
a.tail = '\n '
nav.append(a)
- link = etree.XML('<link href="%s.html#contents" title="Table of contents" rel="index"/>' % index_page)
+ link = etree.XML('<link href="%s.html#contents" title="Table of contents" rel="contents"/>' % index_page)
link.tail = '\n '
head.append(link)
if i != len(pages)-1:
href = get_page_filename(pages[i+1][0])
title = pages[i+1][2]
- a = etree.XML(u'<a href="%s" class="next">%s</a>' % (href, title))
+ a = etree.XML(u'<a href="%s">%s \u2192</a>' % (href, title))
a.tail = '\n '
nav.append(a)
a.getprevious().tail = u' \u2013\n '
@@ -306,6 +382,22 @@
doc.find('body').insert(1, nav) # after the header
+if make_index_of_terms:
+# Write additional separate files for each term entry in the index of terms.
+# Each term entry should be a <dl> with an id attribute whose value is an id of
+# a <dfn>, with the string "_index" appended to it.
+# For now, the subdirectory for the files is hardcoded here as "index-of-terms".
+ for term in index_of_terms:
+ # the firstChild <dt> here is a name and link for the defining instance of
+ # each index term; we don't need that in this context, so just remove it
+ term.remove(term.find("./dt"))
+ fix_refs('DUMMY', term)
+ # we use the ID of the term as the base for the filename, minus the last six
+ # characters ("_index")
+ id = term.get("id")[:-6]
+ f = open('%s/%s' % ("index-of-terms", id+".html"), 'w')
+ f.write(etree.tostring(term, pretty_print=True, method="html"))
+
report_broken_refs()
print "Outputting..."
@@ -313,12 +405,13 @@
# Output all the pages
for name, doc, title in pages:
f = open('%s/%s' % (file_args[1], get_page_filename(name)), 'w')
- f.write('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">\n')
if use_html5lib_serialiser:
tokens = html5lib.treewalkers.getTreeWalker('lxml')(doc)
serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=True, inject_meta_charset=False)
+ f.write("<!doctype html>\n")
for text in serializer.serialize(tokens, encoding='us-ascii'):
- f.write(text)
+ if text != '<!DOCTYPE html>': # some versions of lxml emit this; get rid of it if so
+ f.write(text)
else:
f.write(etree.tostring(doc, pretty_print=False, method="html"))
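For anyone trying the new option locally: judging from the argument handling at the top of the script and the hardcoded "index-of-terms" path above, an invocation along the lines of "python spec-splitter.py --html5lib-serialiser --make-index-of-terms single-page.html out" should exercise it, assuming the first positional argument is the source document and the second the output directory, and assuming an index-of-terms directory already exists in the working directory, since the script opens files there but does not create the directory itself.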