html5/spec/tools spec-splitter.py,1.8,1.9

Update of /sources/public/html5/spec/tools
In directory hutz:/tmp/cvs-serv12849/tools

Modified Files:
	spec-splitter.py 
Log Message:
Remove some copypasta. (whatwg r6869)

[updated by splitter]


Index: spec-splitter.py
===================================================================
RCS file: /sources/public/html5/spec/tools/spec-splitter.py,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -d -r1.8 -r1.9
--- spec-splitter.py	5 Dec 2010 09:40:04 -0000	1.8
+++ spec-splitter.py	10 Dec 2011 06:15:49 -0000	1.9
@@ -9,6 +9,7 @@
 w3c = False
 use_html5lib_parser = False
 use_html5lib_serialiser = False
+make_index_of_terms = False
 file_args = []
 
 for arg in sys.argv[1:]:
@@ -20,6 +21,8 @@
         use_html5lib_parser = True
     elif arg == '--html5lib-serialiser':
         use_html5lib_serialiser = True
+    elif arg == '--make-index-of-terms':
+        make_index_of_terms = True
     else:
         file_args.append(arg)
 
@@ -42,25 +45,92 @@
 if w3c:
     index_page = 'Overview'
 else:
-    index_page =  'spec'
+    index_page = 'index'
 
 # The document is split on all <h2> elements, plus the following specific elements
 # (which were chosen to split any pages that were larger than about 100-200KB, and
 # may need to be adjusted as the spec changes):
 split_exceptions = [
-    'common-microsyntaxes', 'urls', 'fetching-resources', 'common-dom-interfaces', 'namespaces', # <-- infrastructure
+    'the-a-element', 'the-abbr-element', 'the-address-element',
+    'the-area-element', 'the-article-element', 'the-aside-element',
+    'the-audio-element', 'the-b-element', 'the-base-element',
+    'the-bdi-element', 'the-bdo-element', 'the-blockquote-element',
+    'the-body-element', 'the-br-element', 'the-button-element',
+    'the-canvas-element', 'the-caption-element', 'the-cite-element',
+    'the-code-element', 'the-col-element', 'the-colgroup-element',
+    'the-command-element', 'the-datalist-element', 'the-dd-element',
+    'the-del-element', 'the-details-element', 'the-dfn-element',
+    'the-dir-element', 'the-div-element', 'the-dl-element',
+    'the-dt-element', 'the-em-element', 'the-embed-element',
+    'the-fieldset-element', 'the-figcaption-element', 'the-figure-element',
+    'the-footer-element', 'the-form-element',
+    'the-h1-h2-h3-h4-h5-and-h6-elements', 'the-head-element',
+    'the-header-element', 'the-hgroup-element', 'the-hr-element',
+    'the-html-element', 'the-i-element', 'the-iframe-element',
+    'the-img-element', 'the-input-element', 'the-ins-element',
+    'the-kbd-element', 'the-keygen-element', 'the-label-element',
+    'the-legend-element', 'the-li-element', 'the-link-element',
+    'the-map-element', 'the-mark-element', 'the-menu-element',
+    'the-meta-element', 'the-meter-element', 'the-nav-element',
+    'the-noscript-element', 'the-object-element', 'the-ol-element',
+    'the-optgroup-element', 'the-option-element', 'the-output-element',
+    'the-p-element', 'the-param-element', 'the-pre-element',
+    'the-progress-element', 'the-q-element', 'the-rp-element',
+    'the-rt-element', 'the-ruby-element', 'the-s-element',
+    'the-samp-element', 'the-script-element', 'the-section-element',
+    'the-select-element', 'the-small-element', 'the-source-element',
+    'the-span-element', 'the-strong-element', 'the-style-element',
+    'the-sub-and-sup-elements', 'the-summary-element', 'the-table-element',
+    'the-tbody-element', 'the-td-element', 'the-textarea-element',
+    'the-tfoot-element', 'the-th-element', 'the-thead-element',
+    'the-time-element', 'the-title-element', 'the-tr-element',
+    'the-track-element', 'the-u-element', 'the-ul-element',
+    'the-var-element', 'the-video-element', 'the-wbr-element',
+
+    'styling',
+    'usage-summary',
+    'attributes-common-to-ins-and-del-elements',
+    'edits-and-paragraphs',
+    'edits-and-lists',
+    'media-elements',
+    'image-maps',
+    'mathml',
+    'svg-0',
+    'dimension-attributes',
+    'attributes-common-to-td-and-th-elements',
+    'examples',
+    'common-input-element-apis',
+
+    'global-attributes',
+    'element-definitions',
+    'common-dom-interfaces',
+    'namespaces',
+    'requirements-relating-to-bidirectional-algorithm-formatting-characters',
+    'wai-aria',
+    'headings-and-sections',
+
+    'dynamic-markup-insertion',
+    'common-microsyntaxes', 'urls', # <-- infrastructure
     'elements', 'content-models', 'apis-in-html-documents', # <-- dom
 
+    'attributes-common-to-form-controls',
+    'textFieldSelection',
+    'constraints',
+    'form-submission',
+
+    'common-idioms-without-dedicated-elements',
+
     'scripting-1', 'sections', 'grouping-content', 'text-level-semantics', 'edits',
-    'embedded-content-1', 'the-iframe-element', 'video', 'the-canvas-element', 'the-map-element', 'tabular-data',
-    'forms', 'the-input-element', 'states-of-the-type-attribute', 'number-state', 'common-input-element-attributes', 'the-button-element', 'association-of-controls-and-forms',
+    'embedded-content-1', 'tabular-data',
+    'forms', 'states-of-the-type-attribute', 'number-state', 'common-input-element-attributes', 'the-button-element', 'association-of-controls-and-forms',
     'interactive-elements', 'commands', # <-- semantics
 
     'predefined-vocabularies-0', 'converting-html-to-other-formats', # <-- microdata
     'origin-0', 'timers', 'offline', 'history', 'links', # <-- browsers
+    'user-prompts',
+    'system-state-and-capabilities',
     'dnd', # <-- editing
-
-    'workers', 'network', 'web-messaging', 'webstorage',
+    'editing-apis',
 
     'parsing', 'tokenization', 'tree-construction', 'the-end', 'named-character-references', # <-- syntax
 ]
@@ -80,6 +150,10 @@
 
 doctitle = doc.find('.//title').text
 
+if make_index_of_terms:
+  # get all the nodes from the index of terms (if any) and save for later
+  index_of_terms = doc.xpath("//*[@class='index-of-terms']//dl")
+
 # Absolutise some references, so the spec can be hosted elsewhere
 if absolute_uris:
     for a in ('href', 'src'):
@@ -104,14 +178,14 @@
 
 # Make a stripped-down version of it
 short_header = deepcopy(header)
-del short_header[2:]
+del short_header[4:]
 
 # Extract the items in the TOC (remembering their nesting depth)
 def extract_toc_items(items, ol, depth):
     for li in ol.iterchildren():
         for c in li.iterchildren():
             if c.tag == 'a':
-                assert c.get('href')[0] == '#'
+              if c.get('href')[0] == '#':
                 items.append( (depth, c.get('href')[1:], c) )
             elif c.tag == 'ol':
                 extract_toc_items(items, c, depth+1)
@@ -168,8 +242,8 @@
 # Contents/intro page:
 
 page = deepcopy(doc)
-add_class(page.getroot(), 'split index')
 page_body = page.find('body')
+add_class(page_body, 'split index')
 
 # Keep copying stuff from the front of the source document into this
 # page, until we find the first heading that isn't class="no-toc"
@@ -185,7 +259,7 @@
 def should_split(e):
     if e.tag == 'h2': return True
     if e.get('id') in split_exceptions: return True
-    if e.tag == 'div':
+    if e.tag == 'div' and e.get('class') == 'impl':
         c = e.getchildren()
         if len(c):
             if c[0].tag == 'h2': return True
@@ -193,7 +267,7 @@
     return False
 
 def get_heading_text_and_id(e):
-    if e.tag == 'div':
+    if e.tag == 'div' and e.get('class') == 'impl':
         node = e.getchildren()[0]
     else:
         node = e
@@ -207,10 +281,10 @@
     print '  <%s> %s - %s' % (heading.tag, name, title)
 
     page = deepcopy(doc)
-    add_class(page.getroot(), 'split chapter')
     page_body = page.find('body')
+    add_class(page_body, 'split chapter')
 
-    page.find('//title').text = title + u' \u2014 HTML5 '
+    page.find('//title').text = title + u' \u2014 ' + doctitle
 
     # Add the header
     page_body.append(deepcopy(short_header))
@@ -239,17 +313,19 @@
     if name == index_page: continue # don't add nav links to the TOC page
 
     head = doc.find('head')
-    style = etree.XML('<link href="style.css" rel="stylesheet"/>')
-    head.append(style)
 
-    nav = etree.Element('div') # HTML 4 compatibility
+    if w3c:
+        nav = etree.Element('div') # HTML 4 compatibility
+    else:
+        nav = etree.Element('nav')
+    nav.set('class', 'prev_next')
     nav.text = '\n   '
     nav.tail = '\n\n  '
 
     if i > 1:
         href = get_page_filename(pages[i-1][0])
         title = pages[i-1][2]
-        a = etree.XML(u'<a href="%s" class="prev">%s</a>' % (href, title))
+        a = etree.XML(u'<a href="%s">\u2190 %s</a>' % (href, title))
         a.tail = u' \u2013\n   '
         nav.append(a)
         link = etree.XML('<link href="%s" title="%s" rel="prev"/>' % (href, title))
@@ -259,14 +335,14 @@
     a = etree.XML('<a href="%s.html#contents">Table of contents</a>' % index_page)
     a.tail = '\n  '
     nav.append(a)
-    link = etree.XML('<link href="%s.html#contents" title="Table of contents" rel="index"/>' % index_page)
+    link = etree.XML('<link href="%s.html#contents" title="Table of contents" rel="contents"/>' % index_page)
     link.tail = '\n  '
     head.append(link)
 
     if i != len(pages)-1:
         href = get_page_filename(pages[i+1][0])
         title = pages[i+1][2]
-        a = etree.XML(u'<a href="%s" class="next">%s</a>' % (href, title))
+        a = etree.XML(u'<a href="%s">%s \u2192</a>' % (href, title))
         a.tail = '\n  '
         nav.append(a)
         a.getprevious().tail = u' \u2013\n   '
@@ -306,6 +382,22 @@
 
     doc.find('body').insert(1, nav) # after the header
 
+if make_index_of_terms:
+# Write additional separate files for each term entry in the index of terms.
+# Each term entry should be a <dl> with an id attribute whose value is an id of
+# a <dfn>, with the string "_index" appended to it.
+# For now, the subdirectory for the files is hardcoded here as "index-of-terms".
+  for term in index_of_terms:
+  # the firstChild <dt> here is a name and link for the defining instance of
+  # each index term; we don't need that in this context, so just remove it
+      term.remove(term.find("./dt"))
+      fix_refs('DUMMY', term)
+      # we use the ID of the term as the base for the filename, minus the last six
+      # characters ("_index")
+      id = term.get("id")[:-6]
+      f = open('%s/%s' % ("index-of-terms", id+".html"), 'w')
+      f.write(etree.tostring(term, pretty_print=True, method="html"))
+
 report_broken_refs()
 
 print "Outputting..."
@@ -313,12 +405,13 @@
 # Output all the pages
 for name, doc, title in pages:
     f = open('%s/%s' % (file_args[1], get_page_filename(name)), 'w')
-    f.write('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">\n')
     if use_html5lib_serialiser:
         tokens = html5lib.treewalkers.getTreeWalker('lxml')(doc)
         serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=True, inject_meta_charset=False)
+        f.write("<!doctype html>\n")
         for text in serializer.serialize(tokens, encoding='us-ascii'):
-            f.write(text)
+            if text != '<!DOCTYPE html>': # some versions of lxml emit this; get rid of it if so
+                f.write(text)
     else:
         f.write(etree.tostring(doc, pretty_print=False, method="html"))
 

Received on Saturday, 10 December 2011 06:15:56 UTC