W3C home > Mailing lists > Public > public-html-commits@w3.org > July 2008

html5/html-author/utils entity-processor.py,NONE,1.1 footer.html,NONE,1.1 generate.sh,NONE,1.1 header.html,NONE,1.1 legacy.txt,NONE,1.1

From: Lachlan Hunt via cvs-syncmail <cvsmail@w3.org>
Date: Tue, 22 Jul 2008 14:39:38 +0000
To: public-html-commits@w3.org
Message-Id: <E1KLJ1e-0002wX-6i@lionel-hutz.w3.org>

Update of /sources/public/html5/html-author/utils
In directory hutz:/tmp/cvs-serv11262/utils

Added Files:
	entity-processor.py footer.html generate.sh header.html 
	legacy.txt 
Log Message:
Utilities for generating character reference chart

--- NEW FILE: legacy.txt ---
AElig
AMP
Aacute
Acirc
Agrave
Aring
Atilde
Auml
COPY
Ccedil
ETH
Eacute
Ecirc
Egrave
Euml
GT
Iacute
Icirc
Igrave
Iuml
LT
Ntilde
Oacute
Ocirc
Ograve
Oslash
Otilde
Ouml
QUOT
REG
THORN
Uacute
Ucirc
Ugrave
Uuml
Yacute
aacute
acirc
acute
aelig
agrave
amp
aring
atilde
auml
brvbar
ccedil
cedil
cent
copy
curren
deg
divide
eacute
ecirc
egrave
eth
euml
frac12
frac14
frac34
gt
iacute
icirc
iexcl
igrave
iquest
iuml
laquo
lt
macr
micro
middot
nbsp
not
ntilde
oacute
ocirc
ograve
ordf
ordm
oslash
otilde
ouml
para
plusmn
pound
quot
raquo
reg
sect
shy
sup1
sup2
sup3
szlig
thorn
times
uacute
ucirc
ugrave
uml
uuml
yacute
yen
yuml

--- NEW FILE: entity-processor.py ---
import xml.dom.minidom

from xml.sax.saxutils import escape

# Only entity sets that belong to these <group> elements are charted.
CHARTED_GROUPS = ('html5', 'mathml')

# One chart row; the cells are intentionally left unclosed, as in the
# generated HTML fragment the row is appended to.
ROW_TEMPLATE = (
    "<tr title=\"U+%s %s\" data-block=\"%s\" data-category=\"%s\" data-set=\"%s\">"
    "<td class=\"character\">%s"
    "<td class=\"named\"><code>%s</code>"
    "<td class=\"hex\"><code>%s</code>"
    "<td class=\"dec\"><code>%s</code>"
    "<td class=\"desc\">%s"
)


def load_blocks(document):
    """Return every <block> element of *document* as a dict.

    Each dict has the keys "name" (string), "start" and "end" (integers
    parsed from the hexadecimal "start"/"end" attributes).
    """
    return [
        {
            "name": block.getAttribute("name"),
            "start": int(block.getAttribute("start"), 16),
            "end": int(block.getAttribute("end"), 16),
        }
        for block in document.getElementsByTagName('block')
    ]


def load_sets(document):
    """Return the names of all <set> elements inside the charted groups."""
    names = set()
    for group in document.getElementsByTagName('group'):
        if group.getAttribute('name') in CHARTED_GROUPS:
            for entity_set in group.getElementsByTagName('set'):
                names.add(entity_set.getAttribute('name'))
    return names


def collect_entities(document, sets):
    """Group the <entity> elements of *document* by their character.

    Only entities whose "set" attribute is in *sets* are kept, and
    characters whose id contains "-" are skipped.  The result maps the
    character id (e.g. "U000E9") to a dict holding the pre-formatted
    pieces of its chart row: "hex-value", "dec-value", "character",
    "named", "set", "hex", "dec", "desc" and "category".
    """
    entities = {}
    for entity in document.getElementsByTagName('entity'):
        character = entity.parentNode
        # Sanity checks on the structure of the input document.
        assert character.tagName == 'character'
        assert entity.hasAttribute('set')
        set_name = entity.getAttribute('set')
        if set_name not in sets:
            continue

        assert entity.hasAttribute('id')
        name = entity.getAttribute('id')
        assert len(name) > 0
        assert character.hasAttribute('id')
        value = character.getAttribute('id')
        if '-' in value:
            continue

        if value not in entities:
            hex_value = value[1:]  # drop the leading "U"
            dec_value = int(hex_value, 16)
            entities[value] = {
                "hex-value": hex_value,
                "dec-value": dec_value,
                # Emitted raw so the browser renders the character itself.
                "character": "&#x%s;" % hex_value,
                "named": [],
                "set": [],
                # "&amp;" makes the reference show up as literal text.
                "hex": "&amp;#x%s;" % hex_value,
                "dec": "&amp;#%s;" % dec_value,
                "desc": character.getElementsByTagName("description")[0].childNodes[0].data,
                "category": character.getElementsByTagName("unicodedata")[0].getAttribute("category"),
            }

        record = entities[value]
        entity_ref = "&amp;%s;" % name
        if entity_ref not in record["named"]:
            record["named"].append(entity_ref)
        if set_name not in record["set"]:
            record["set"].append(set_name)
    return entities


def find_block_name(blocks, code_point, default=""):
    """Return the name of the block containing *code_point*.

    *default* is returned when no block covers the code point, instead of
    running off the end of the block list or reporting a neighbouring block.
    """
    for block in blocks:
        if block["start"] <= code_point <= block["end"]:
            return block["name"]
    return default


def format_row(char, blocks):
    """Return the HTML table row for one record built by collect_entities."""
    def text(value):
        # Safe for both element content and double-quoted attribute values.
        return escape(value, {'"': "&quot;"})

    desc = text(char["desc"])
    return ROW_TEMPLATE % (
        char["hex-value"],
        desc,
        text(find_block_name(blocks, char["dec-value"])),
        text(char["category"]),
        " ".join(text(name) for name in char["set"]),
        char["character"],
        " ".join(char["named"]),
        char["hex"],
        char["dec"],
        desc,
    )


def main():
    """Read unicode.xml from the current directory and print one row per character."""
    document = xml.dom.minidom.parse('unicode.xml')
    blocks = load_blocks(document)
    entities = collect_entities(document, load_sets(document))
    for value in entities:
        # A single parenthesised argument prints the same on Python 2 and 3.
        print(format_row(entities[value], blocks))


if __name__ == "__main__":
    main()

--- NEW FILE: generate.sh ---
# Fetches the latest copy of unicode.xml, processes it with the entity-processor
# if and only if it's an updated copy, and then generates a new copy of the
# character reference chart.

# -N re-downloads only when the remote file is newer than the local copy;
# -o /dev/null discards wget's log output.
wget -o /dev/null -N http://www.w3.org/2003/entities/2007xml/unicode.xml

# Regenerate the rows when unicode.xml is newer than the last generated copy,
# or when nothing has been generated yet (in some shells -nt is false when the
# second file does not exist).
if [ ! -e entities-unicode.inc ] || [ unicode.xml -nt entities-unicode.inc ]; then
    # Write to a temporary file and replace the previous copy only when the
    # processor succeeded and produced output, so a failed run never installs
    # partial data.
    if nice -n 19 python entity-processor.py > .new-entities-unicode.inc &&
       [ -s .new-entities-unicode.inc ]; then
        mv -f .new-entities-unicode.inc entities-unicode.inc
    else
        rm -f .new-entities-unicode.inc
    fi
fi

# Without any generated rows the chart would be overwritten with an empty
# table, so stop here and keep the existing chart instead.
if [ ! -s entities-unicode.inc ]; then
    echo "generate.sh: entities-unicode.inc is missing or empty; chart not updated" >&2
    exit 1
fi

# Sort the rows and drop duplicates, then assemble the final page.
sort entities-unicode.inc | uniq > entities.inc
cat header.html entities.inc footer.html > ../charref.html

--- NEW FILE: footer.html ---
</table>
--- NEW FILE: header.html ---
<!DOCTYPE html>
<meta charset="UTF-8">
<title>Character Entity Reference Chart</title>
<style>
/* Each table row is rendered as a fixed-size inline tile. */
table {
    font: .8em Monaco;
    display: block;
}
tr {
    background: #EEE;
    display: inline-block;
    text-align: center;
    width: 10em;
    height: 10em;
    margin: .5em;
    padding: 1px;
    position: relative;
}
td {
    display: block;
}

/* The character cell uses a much larger serif font than the rest of the tile. */
.character {
    font: 4em "Georgia", "Apple Symbols", serif;
    line-height: 1.1;
}

/* Numeric references keep their space but stay invisible until hover;
   the description is taken out of the layout entirely. */
.hex,
.dec {
    visibility: hidden;
}
.desc {
    display: none;
    position: absolute;
    text-align: center;
    background: silver;
    color: black;
}

/* On hover the character shrinks and the numeric references appear. */
tr:hover .character {
    font-size: 2em;
}
tr:hover .hex,
tr:hover .dec {
    visibility: visible;
}
/* "xtr" matches no element in this chart, so this rule never applies
   (presumably disabled on purpose by the "x" prefix). */
xtr:hover .desc {
    display: block;
    top: 100%;
    left: 0;
}
</style>
<table>
Received on Tuesday, 22 July 2008 14:40:11 GMT

This archive was generated by hypermail 2.2.0+W3C-0.50 : Thursday, 9 October 2008 20:32:58 GMT