- From: Michael Smith via cvs-syncmail <cvsmail@w3.org>
- Date: Sun, 15 Aug 2010 22:55:08 +0000
- To: public-html-commits@w3.org
Update of /sources/public/html5/spec-author-view In directory hutz:/tmp/cvs-serv21531 Modified Files: patch.anolis Log Message: added mechanism for generating an index of terms; checkpointing Index: patch.anolis =================================================================== RCS file: /sources/public/html5/spec-author-view/patch.anolis,v retrieving revision 1.1 retrieving revision 1.2 diff -u -d -r1.1 -r1.2 --- patch.anolis 10 Aug 2010 10:45:42 -0000 1.1 +++ patch.anolis 15 Aug 2010 22:55:06 -0000 1.2 @@ -1,6 +1,30 @@ +diff -r 16550726fd0d anolis +--- anolis Sun Aug 30 16:53:19 2009 -0500 ++++ anolis Sun Aug 15 20:17:47 2010 +0900 +@@ -206,7 +206,7 @@ + dest="output_encoding", help="Output encoding") + + parser.set_defaults( +- processes=["filter", "sub", "toc", "xref", "annotate"], ++ processes=["filter", "sub", "toc", "xref", "annotate", "terms"], + parser="html5lib", + serializer="html5lib", + newline_char=u"\n", +diff -r 16550726fd0d anolislib/generator.py +--- anolis/anolislib/generator.py Sun Aug 30 16:53:19 2009 -0500 ++++ anolis/anolislib/generator.py Sun Aug 15 20:17:47 2010 +0900 +@@ -28,7 +28,7 @@ + from lxml import etree + + +-def process(tree, processes=["sub", "toc", "xref"], **kwargs): ++def process(tree, processes=["sub", "toc", "xref", "terms"], **kwargs): + """ Process the given tree. 
""" + + # Find number of passes to do diff -r 16550726fd0d anolislib/processes/filter.py --- anolis/anolislib/processes/filter.py Sun Aug 30 16:53:19 2009 -0500 -+++ anolis/anolislib/processes/filter.py Tue Aug 10 19:18:46 2010 +0900 ++++ anolis/anolislib/processes/filter.py Sun Aug 15 20:17:47 2010 +0900 @@ -5,23 +5,4 @@ return selector = cssselect.CSSSelector(kwargs["filter"]) @@ -26,34 +50,270 @@ - - + element.drop_tree() -diff -r 16550726fd0d anolislib/processes/xref.py ---- anolis/anolislib/processes/xref.py Sun Aug 30 16:53:19 2009 -0500 -+++ anolis/anolislib/processes/xref.py Tue Aug 10 19:18:46 2010 +0900 -@@ -25,7 +25,7 @@ - - from anolislib import utils - --instance_elements = frozenset([u"span", u"abbr", u"code", u"var", u"i"]) -+instance_elements = frozenset([u"a", u"span", u"abbr", u"code", u"var", u"i"]) - w3c_instance_elements = frozenset([u"abbr", u"acronym", u"b", u"bdo", u"big", - u"code", u"del", u"em", u"i", u"ins", - u"kbd", u"label", u"legend", u"q", u"samp", -@@ -96,13 +96,17 @@ - break - - if goodParentingAndChildren: -- if element.tag == u"span": -+ id = utils.generateID(element, **kwargs) -+ if element.tag == u"span"\ -+ or element.tag == u"a": - element.tag = u"a" - element.set(u"href", u"#" + self.dfns[term]) -+ element.set(u"id", id) - else: - link = etree.Element(u"a", - {u"href": - u"#" + self.dfns[term]}) -+ link.set(u"id", id) - if w3c_compat or w3c_compat_xref_a_placement: - for node in element: - link.append(node) +diff -r 16550726fd0d anolislib/processes/terms.py +--- /dev/null Thu Jan 01 00:00:00 1970 +0000 ++++ anolis/anolislib/processes/terms.py Sun Aug 15 20:17:47 2010 +0900 +@@ -0,0 +1,263 @@ ++# coding=UTF-8 ++# Copyright (c) 2010 Michael(tm) Smith ++# ++# Permission is hereby granted, free of charge, to any person obtaining a copy ++# of this software and associated documentation files (the "Software"), to deal ++# in the Software without restriction, including without limitation the rights ++# to use, copy, modify, merge, 
publish, distribute, sublicense, and/or sell ++# copies of the Software, and to permit persons to whom the Software is ++# furnished to do so, subject to the following conditions: ++# ++# The above copyright notice and this permission notice shall be included in ++# all copies or substantial portions of the Software. ++# ++# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++# THE SOFTWARE. ++ ++import re ++ ++from lxml import etree ++from copy import deepcopy ++ ++from anolislib import utils ++ ++class terms(object): ++ """Build and add an index of terms.""" ++ ++ terms = None ++ ++ def __init__(self, ElementTree, **kwargs): ++ self.terms = etree.Element(u"div") ++ self.buildTerms(ElementTree, **kwargs) ++ self.addTerms(ElementTree, **kwargs) ++ ++ def buildTerms(self, ElementTree, w3c_compat=False, **kwargs): ++ self.terms.text = "\n" ++ # make a list of all the defining instances of "terms" in the document ++ # -- <dfn> elements ++ dfnList = ElementTree.findall("//dfn") ++ if dfnList: ++ # sort the list of <dfn> terms by the lowercase value of the DOM ++ # textContent of the <dfn> element (concantentation of the <dfn> ++ # text nodes and that of any of its descendant elements) ++ dfnList.sort(key=lambda dfn: dfn.text_content().lower()) ++ for dfn in dfnList: ++ # we don't need the tail, so copy the <dfn> and drop the tail ++ term = deepcopy(dfn) ++ term.tail = None ++ termID = None ++ if dfn.get("id"): ++ # if this <dfn> itself has an id, we'll us it as part of the ++ # id on the index entry for this term ++ termID = dfn.get("id") ++ elif 
dfn.getparent().get("id"): ++ # if this <dfn> itself has no id, use the id of its parent ++ # node as the id on the index entry for this term ++ termID = dfn.getparent().get("id") ++ # if we found an id, then create an index entry for this <dfn> ++ # term; otherwise, do nothing further ++ if termID: ++ indexEntry = etree.Element(u"dl",{u"id": termID+"_index"}) ++ indexEntry.text = "\n" ++ # termName is the name of the term as it appears in the index ++ termName = etree.Element(u"dt") ++ # textContent of the DOM textContent of this <dfn> element ++ textContent = dfn.text_content() ++ # normalize the text content of each <dfn> in the document ++ # and the normalize the text content of this <dfn>, then ++ # do a case-insensitive comparison of them and count how ++ # many matches we have ++ expr = "count(//dfn\ ++ [normalize-space(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'))\ ++ =normalize-space(translate($content,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'))])" ++ if ElementTree.xpath(expr, content = textContent) > 1: ++ # we have more than one <dfn> in the document whose ++ # content is a case-insensitive match for the ++ # textContent of this <dfn>; so, we qualify the name of ++ # each such term in the index listing by appending the ++ # id of the <dfn> to it ++ termName.text = textContent+" " ++ indexEntryID = etree.Element(u"span",{u"class": u"index-id", u"title": u""}) ++ indexEntryID.text = "("+termID+")" ++ indexEntryID.tail = " " ++ termName.append(indexEntryID) ++ else: ++ # otherwise, the textContent of this <dfn> is unique ++ # among the <dfn>s in this document, so we just use the ++ # textContent as the name of the term ++ termName.text = textContent+" " ++ indexEntry.append(termName) ++ # add a hyperlink back to the <dfn> that is the defining ++ # instance of the term ++ dfnLink = etree.Element(u"a", {u"href": "#"+termID, u"class": "dfn-ref"}) ++ dfnLink.text = u"\u203B" ++ termName.append(dfnLink) ++ termName.tail 
= "\n" ++ # ######################################################### ++ # make a list of all the instances of terms in the document ++ # that are hyperlinked references back to the <dfn> term ++ # that is the defining instance of the term ++ instanceList = ElementTree.xpath("//a[substring-after(@href,'#') = $targetID]", targetID = termID) ++ # if we found any instances of hyperlinked references to ++ # this <dfn> term, then we process the list ++ if instanceList: ++ instanceItem = None ++ lastLinkToHeading = None ++ lastInstanceItem = None ++ for instance in instanceList: ++ # each of these term instances is an <a> hyperlink ++ # without an id attributes, but we need each of ++ # these <a> instance hyperlinks to have an id ++ # attribute so that we can link back to it from the ++ # index of terms; so we create an id for each ++ instanceID = utils.generateID(instance, **kwargs) ++ instance.set(u"id",instanceID) ++ # make a copy of the node of the h1-h6 heading for the ++ # section that contains this instance hyperlink ++ linkToHeading = deepcopy(self.getAncestorHeading(instance)) ++ # some headings may have id attributes, but we don't ++ # want the id attribute, so drop it if we find one ++ if "id" in linkToHeading.attrib: ++ del linkToHeading.attrib["id"] ++ # some headings may contain descendants that are <a> ++ # links, and/or that have id attributeds ++ embeddedLinks = linkToHeading.xpath(".//*[@href or @id]") ++ # because we later transform the copy of this ++ # heading itself into a hyperlink, it can't ++ # contain descendant links; so, we un-linkify ++ # any elements that have @href attributes ++ # by turning it into an @href-less <span> ++ for element in embeddedLinks: ++ if "href" in element.attrib: ++ del element.attrib["href"] ++ element.tag = "span" ++ # this might be an <a> element that we added an ++ # id attribute to earlier and/or maybe be a ++ # <dfn> that already had an id attribute; but we ++ # don't want to copy the id attributes here, so ++ # 
drop any id attribute we find ++ if "id" in element.attrib: ++ del element.attrib["id"] ++ # if this heading is not the same as one that we've ++ # already added to the index entry for this term, ++ # then process the heading ++ if lastLinkToHeading is None or linkToHeading.text_content() != lastLinkToHeading.text_content(): ++ instanceItem = etree.Element(u"dd") ++ instanceItem.text = "\n" ++ lastLinkToHeading = linkToHeading ++ n = 1 ++ # change this copy from being an h1-h6 node, to ++ # just being an <a> hyperlink ++ linkToHeading.tag = "a" ++ # make this item link back to the actual place ++ # in the document where we found this particular ++ # instance of the term; we use the value of the ++ # id attribute that we added to the instance earlier ++ linkToHeading.set(u"href","#"+instanceID) ++ # we wait to add the item for the previous ++ # instance at this point because we need to ++ # delay adding in order to see if for this ++ # instance there are multiple references to the ++ # same ancestor heading (if there are, we append ++ # link numbers to them, instead of repeating the ++ # heading; see below) ++ if lastInstanceItem is not None: ++ #print(etree.tostring(lastInstanceItem,method="text")) ++ indexEntry.append(lastInstanceItem) ++ lastInstanceItem = instanceItem ++ linkToHeading.tail = "\n" ++ instanceItem.append(linkToHeading) ++ instanceItem.tail = "\n" ++ # otherwise, this heading is the same as one that ++ # we've already added to the index entry for this ++ # term; so instead of reprocessing the heading, we ++ # just append one or more link numbers to it ++ else: ++ n += 1 ++ counterLink = etree.Element(u"a",{u"href": instanceID, u"class": "index-counter"}) ++ counterLink.text = "("+str(n)+")" ++ counterLink.tail = "\n" ++ instanceItem.append(counterLink) ++ # if the value of our n counter is still at 1 at ++ # this point, it means the document contains only ++ # one instance of a reference this term, so we need ++ # to add that instance now ++ if n 
== 1: ++ indexEntry.append(instanceItem) ++ # otherwise, the document contains no hyperlinked references ++ # to this term at all, so we just add a note to indicate that ++ else: ++ noRefsNote = etree.Element(u"dd",{u"class": "index-norefs"}) ++ noRefsNote.text = "No references in this file." ++ indexEntry.append(noRefsNote) ++ self.terms.append(indexEntry) ++ indexEntry.tail = "\n" ++ self.terms.tail = "\n" ++ ++ def getAncestorHeading(self, descendantNode): ++ """ Given a node, return the node of the heading for the section that contains it.""" ++ node = descendantNode ++ while (node is not None): ++ if isinstance(node.tag,str) and re.match("^[hH][1-6]$",node.tag): ++ return node ++ elif node.getprevious() == None: ++ node = node.getparent() ++ else: ++ node = node.getprevious() ++ if isinstance(node.tag,str) and node.get("class") == "impl": ++ node = xpath("($thisnode/node())[last()]", thisNode = node) ++ return None ++ ++ def addTerms(self, ElementTree, **kwargs): ++ to_remove = set() ++ in_terms = False ++ for node in ElementTree.iter(): ++ if in_terms: ++ if node.tag is etree.Comment and \ ++ node.text.strip(utils.spaceCharacters) == u"end-index-terms": ++ if node.getparent() is not terms_parent: ++ raise DifferentParentException(u"begin-index-terms and end-index-terms have different parents") ++ in_terms = False ++ else: ++ to_remove.add(node) ++ elif node.tag is etree.Comment: ++ if node.text.strip(utils.spaceCharacters) == u"begin-index-terms": ++ terms_parent = node.getparent() ++ in_terms = True ++ node.tail = None ++ node.addnext(deepcopy(self.terms)) ++ self.indentNode(node.getnext(), 0, **kwargs) ++ elif node.text.strip(utils.spaceCharacters) == u"index-terms": ++ node.addprevious(etree.Comment(u"begin-index-terms")) ++ self.indentNode(node.getprevious(), 0, **kwargs) ++ node.addprevious(deepcopy(self.terms)) ++ self.indentNode(node.getprevious(), 0, **kwargs) ++ node.addprevious(etree.Comment(u"end-index-terms")) ++ 
self.indentNode(node.getprevious(), 0, **kwargs) ++ node.getprevious().tail = node.tail ++ to_remove.add(node) ++ for node in to_remove: ++ node.getparent().remove(node) ++ ++ def indentNode(self, node, indent=0, newline_char=u"\n", indent_char=u" ", ++ **kwargs): ++ whitespace = newline_char + indent_char * indent ++ if node.getprevious() is not None: ++ if node.getprevious().tail is None: ++ node.getprevious().tail = whitespace ++ else: ++ node.getprevious().tail += whitespace ++ else: ++ if node.getparent().text is None: ++ node.getparent().text = whitespace ++ else: ++ node.getparent().text += whitespace ++ ++class DifferentParentException(utils.AnolisException): ++ """begin-index-terms and end-index-terms do not have the same parent.""" ++ pass
Received on Sunday, 15 August 2010 22:55:09 UTC