- From: Michael Smith via cvs-syncmail <cvsmail@w3.org>
- Date: Sun, 15 Aug 2010 22:55:08 +0000
- To: public-html-commits@w3.org
Update of /sources/public/html5/spec-author-view
In directory hutz:/tmp/cvs-serv21531
Modified Files:
patch.anolis
Log Message:
added mechanism for generating an index of terms; checkpointing
Index: patch.anolis
===================================================================
RCS file: /sources/public/html5/spec-author-view/patch.anolis,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- patch.anolis 10 Aug 2010 10:45:42 -0000 1.1
+++ patch.anolis 15 Aug 2010 22:55:06 -0000 1.2
@@ -1,6 +1,30 @@
+diff -r 16550726fd0d anolis
+--- anolis Sun Aug 30 16:53:19 2009 -0500
++++ anolis Sun Aug 15 20:17:47 2010 +0900
+@@ -206,7 +206,7 @@
+ dest="output_encoding", help="Output encoding")
+
+ parser.set_defaults(
+- processes=["filter", "sub", "toc", "xref", "annotate"],
++ processes=["filter", "sub", "toc", "xref", "annotate", "terms"],
+ parser="html5lib",
+ serializer="html5lib",
+ newline_char=u"\n",
+diff -r 16550726fd0d anolislib/generator.py
+--- anolis/anolislib/generator.py Sun Aug 30 16:53:19 2009 -0500
++++ anolis/anolislib/generator.py Sun Aug 15 20:17:47 2010 +0900
+@@ -28,7 +28,7 @@
+ from lxml import etree
+
+
+-def process(tree, processes=["sub", "toc", "xref"], **kwargs):
++def process(tree, processes=["sub", "toc", "xref", "terms"], **kwargs):
+ """ Process the given tree. """
+
+ # Find number of passes to do
diff -r 16550726fd0d anolislib/processes/filter.py
--- anolis/anolislib/processes/filter.py Sun Aug 30 16:53:19 2009 -0500
-+++ anolis/anolislib/processes/filter.py Tue Aug 10 19:18:46 2010 +0900
++++ anolis/anolislib/processes/filter.py Sun Aug 15 20:17:47 2010 +0900
@@ -5,23 +5,4 @@
return
selector = cssselect.CSSSelector(kwargs["filter"])
@@ -26,34 +50,270 @@
-
-
+ element.drop_tree()
-diff -r 16550726fd0d anolislib/processes/xref.py
---- anolis/anolislib/processes/xref.py Sun Aug 30 16:53:19 2009 -0500
-+++ anolis/anolislib/processes/xref.py Tue Aug 10 19:18:46 2010 +0900
-@@ -25,7 +25,7 @@
-
- from anolislib import utils
-
--instance_elements = frozenset([u"span", u"abbr", u"code", u"var", u"i"])
-+instance_elements = frozenset([u"a", u"span", u"abbr", u"code", u"var", u"i"])
- w3c_instance_elements = frozenset([u"abbr", u"acronym", u"b", u"bdo", u"big",
- u"code", u"del", u"em", u"i", u"ins",
- u"kbd", u"label", u"legend", u"q", u"samp",
-@@ -96,13 +96,17 @@
- break
-
- if goodParentingAndChildren:
-- if element.tag == u"span":
-+ id = utils.generateID(element, **kwargs)
-+ if element.tag == u"span"\
-+ or element.tag == u"a":
- element.tag = u"a"
- element.set(u"href", u"#" + self.dfns[term])
-+ element.set(u"id", id)
- else:
- link = etree.Element(u"a",
- {u"href":
- u"#" + self.dfns[term]})
-+ link.set(u"id", id)
- if w3c_compat or w3c_compat_xref_a_placement:
- for node in element:
- link.append(node)
+diff -r 16550726fd0d anolislib/processes/terms.py
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ anolis/anolislib/processes/terms.py Sun Aug 15 20:17:47 2010 +0900
+@@ -0,0 +1,263 @@
++# coding=UTF-8
++# Copyright (c) 2010 Michael(tm) Smith
++#
++# Permission is hereby granted, free of charge, to any person obtaining a copy
++# of this software and associated documentation files (the "Software"), to deal
++# in the Software without restriction, including without limitation the rights
++# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
++# copies of the Software, and to permit persons to whom the Software is
++# furnished to do so, subject to the following conditions:
++#
++# The above copyright notice and this permission notice shall be included in
++# all copies or substantial portions of the Software.
++#
++# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
++# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
++# THE SOFTWARE.
++
++import re
++
++from lxml import etree
++from copy import deepcopy
++
++from anolislib import utils
++
++class terms(object):
++ """Build and add an index of terms."""
++
++ terms = None
++
++ def __init__(self, ElementTree, **kwargs):
++ self.terms = etree.Element(u"div")
++ self.buildTerms(ElementTree, **kwargs)
++ self.addTerms(ElementTree, **kwargs)
++
++ def buildTerms(self, ElementTree, w3c_compat=False, **kwargs):
++ self.terms.text = "\n"
++ # make a list of all the defining instances of "terms" in the document
++ # -- <dfn> elements
++ dfnList = ElementTree.findall("//dfn")
++ if dfnList:
++ # sort the list of <dfn> terms by the lowercase value of the DOM
++ # textContent of the <dfn> element (concatenation of the <dfn>
++ # text nodes and that of any of its descendant elements)
++ dfnList.sort(key=lambda dfn: dfn.text_content().lower())
++ for dfn in dfnList:
++ # we don't need the tail, so copy the <dfn> and drop the tail
++ term = deepcopy(dfn)
++ term.tail = None
++ termID = None
++ if dfn.get("id"):
++ # if this <dfn> itself has an id, we'll use it as part of the
++ # id on the index entry for this term
++ termID = dfn.get("id")
++ elif dfn.getparent().get("id"):
++ # if this <dfn> itself has no id, use the id of its parent
++ # node as the id on the index entry for this term
++ termID = dfn.getparent().get("id")
++ # if we found an id, then create an index entry for this <dfn>
++ # term; otherwise, do nothing further
++ if termID:
++ indexEntry = etree.Element(u"dl",{u"id": termID+"_index"})
++ indexEntry.text = "\n"
++ # termName is the name of the term as it appears in the index
++ termName = etree.Element(u"dt")
++ # textContent of the DOM textContent of this <dfn> element
++ textContent = dfn.text_content()
++ # normalize the text content of each <dfn> in the document
++ # and then normalize the text content of this <dfn>, then
++ # do a case-insensitive comparison of them and count how
++ # many matches we have
++ expr = "count(//dfn\
++ [normalize-space(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'))\
++ =normalize-space(translate($content,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'))])"
++ if ElementTree.xpath(expr, content = textContent) > 1:
++ # we have more than one <dfn> in the document whose
++ # content is a case-insensitive match for the
++ # textContent of this <dfn>; so, we qualify the name of
++ # each such term in the index listing by appending the
++ # id of the <dfn> to it
++ termName.text = textContent+" "
++ indexEntryID = etree.Element(u"span",{u"class": u"index-id", u"title": u""})
++ indexEntryID.text = "("+termID+")"
++ indexEntryID.tail = " "
++ termName.append(indexEntryID)
++ else:
++ # otherwise, the textContent of this <dfn> is unique
++ # among the <dfn>s in this document, so we just use the
++ # textContent as the name of the term
++ termName.text = textContent+" "
++ indexEntry.append(termName)
++ # add a hyperlink back to the <dfn> that is the defining
++ # instance of the term
++ dfnLink = etree.Element(u"a", {u"href": "#"+termID, u"class": "dfn-ref"})
++ dfnLink.text = u"\u203B"
++ termName.append(dfnLink)
++ termName.tail = "\n"
++ # #########################################################
++ # make a list of all the instances of terms in the document
++ # that are hyperlinked references back to the <dfn> term
++ # that is the defining instance of the term
++ instanceList = ElementTree.xpath("//a[substring-after(@href,'#') = $targetID]", targetID = termID)
++ # if we found any instances of hyperlinked references to
++ # this <dfn> term, then we process the list
++ if instanceList:
++ instanceItem = None
++ lastLinkToHeading = None
++ lastInstanceItem = None
++ for instance in instanceList:
++ # each of these term instances is an <a> hyperlink
++ # without an id attribute, but we need each of
++ # these <a> instance hyperlinks to have an id
++ # attribute so that we can link back to it from the
++ # index of terms; so we create an id for each
++ instanceID = utils.generateID(instance, **kwargs)
++ instance.set(u"id",instanceID)
++ # make a copy of the node of the h1-h6 heading for the
++ # section that contains this instance hyperlink
++ linkToHeading = deepcopy(self.getAncestorHeading(instance))
++ # some headings may have id attributes, but we don't
++ # want the id attribute, so drop it if we find one
++ if "id" in linkToHeading.attrib:
++ del linkToHeading.attrib["id"]
++ # some headings may contain descendants that are <a>
++ # links, and/or that have id attributes
++ embeddedLinks = linkToHeading.xpath(".//*[@href or @id]")
++ # because we later transform the copy of this
++ # heading itself into a hyperlink, it can't
++ # contain descendant links; so, we un-linkify
++ # any elements that have @href attributes
++ # by turning it into an @href-less <span>
++ for element in embeddedLinks:
++ if "href" in element.attrib:
++ del element.attrib["href"]
++ element.tag = "span"
++ # this might be an <a> element that we added an
++ # id attribute to earlier and/or may be a
++ # <dfn> that already had an id attribute; but we
++ # don't want to copy the id attributes here, so
++ # drop any id attribute we find
++ if "id" in element.attrib:
++ del element.attrib["id"]
++ # if this heading is not the same as one that we've
++ # already added to the index entry for this term,
++ # then process the heading
++ if lastLinkToHeading is None or linkToHeading.text_content() != lastLinkToHeading.text_content():
++ instanceItem = etree.Element(u"dd")
++ instanceItem.text = "\n"
++ lastLinkToHeading = linkToHeading
++ n = 1
++ # change this copy from being an h1-h6 node, to
++ # just being an <a> hyperlink
++ linkToHeading.tag = "a"
++ # make this item link back to the actual place
++ # in the document where we found this particular
++ # instance of the term; we use the value of the
++ # id attribute that we added to the instance earlier
++ linkToHeading.set(u"href","#"+instanceID)
++ # we wait to add the item for the previous
++ # instance at this point because we need to
++ # delay adding in order to see if for this
++ # instance there are multiple references to the
++ # same ancestor heading (if there are, we append
++ # link numbers to them, instead of repeating the
++ # heading; see below)
++ if lastInstanceItem is not None:
++ #print(etree.tostring(lastInstanceItem,method="text"))
++ indexEntry.append(lastInstanceItem)
++ lastInstanceItem = instanceItem
++ linkToHeading.tail = "\n"
++ instanceItem.append(linkToHeading)
++ instanceItem.tail = "\n"
++ # otherwise, this heading is the same as one that
++ # we've already added to the index entry for this
++ # term; so instead of reprocessing the heading, we
++ # just append one or more link numbers to it
++ else:
++ n += 1
++ counterLink = etree.Element(u"a",{u"href": instanceID, u"class": "index-counter"})
++ counterLink.text = "("+str(n)+")"
++ counterLink.tail = "\n"
++ instanceItem.append(counterLink)
++ # if the value of our n counter is still at 1 at
++ # this point, it means the document contains only
++ # one instance of a reference to this term, so we need
++ # to add that instance now
++ if n == 1:
++ indexEntry.append(instanceItem)
++ # otherwise, the document contains no hyperlinked references
++ # to this term at all, so we just add a note to indicate that
++ else:
++ noRefsNote = etree.Element(u"dd",{u"class": "index-norefs"})
++ noRefsNote.text = "No references in this file."
++ indexEntry.append(noRefsNote)
++ self.terms.append(indexEntry)
++ indexEntry.tail = "\n"
++ self.terms.tail = "\n"
++
++ def getAncestorHeading(self, descendantNode):
++ """ Given a node, return the node of the heading for the section that contains it."""
++ node = descendantNode
++ while (node is not None):
++ if isinstance(node.tag,str) and re.match("^[hH][1-6]$",node.tag):
++ return node
++ elif node.getprevious() == None:
++ node = node.getparent()
++ else:
++ node = node.getprevious()
++ if isinstance(node.tag,str) and node.get("class") == "impl":
++ node = xpath("($thisnode/node())[last()]", thisNode = node)
++ return None
++
++ def addTerms(self, ElementTree, **kwargs):
++ to_remove = set()
++ in_terms = False
++ for node in ElementTree.iter():
++ if in_terms:
++ if node.tag is etree.Comment and \
++ node.text.strip(utils.spaceCharacters) == u"end-index-terms":
++ if node.getparent() is not terms_parent:
++ raise DifferentParentException(u"begin-index-terms and end-index-terms have different parents")
++ in_terms = False
++ else:
++ to_remove.add(node)
++ elif node.tag is etree.Comment:
++ if node.text.strip(utils.spaceCharacters) == u"begin-index-terms":
++ terms_parent = node.getparent()
++ in_terms = True
++ node.tail = None
++ node.addnext(deepcopy(self.terms))
++ self.indentNode(node.getnext(), 0, **kwargs)
++ elif node.text.strip(utils.spaceCharacters) == u"index-terms":
++ node.addprevious(etree.Comment(u"begin-index-terms"))
++ self.indentNode(node.getprevious(), 0, **kwargs)
++ node.addprevious(deepcopy(self.terms))
++ self.indentNode(node.getprevious(), 0, **kwargs)
++ node.addprevious(etree.Comment(u"end-index-terms"))
++ self.indentNode(node.getprevious(), 0, **kwargs)
++ node.getprevious().tail = node.tail
++ to_remove.add(node)
++ for node in to_remove:
++ node.getparent().remove(node)
++
++ def indentNode(self, node, indent=0, newline_char=u"\n", indent_char=u" ",
++ **kwargs):
++ whitespace = newline_char + indent_char * indent
++ if node.getprevious() is not None:
++ if node.getprevious().tail is None:
++ node.getprevious().tail = whitespace
++ else:
++ node.getprevious().tail += whitespace
++ else:
++ if node.getparent().text is None:
++ node.getparent().text = whitespace
++ else:
++ node.getparent().text += whitespace
++
++class DifferentParentException(utils.AnolisException):
++ """begin-index-terms and end-index-terms do not have the same parent."""
++ pass
Received on Sunday, 15 August 2010 22:55:09 UTC