#!/usr/bin/python
#
# Attempt at writing functions that extract HTML5 element and attribute
# information from the HTML5 specification.

# URL of the HTML5 specification document that is passed to extract() below.
SPECIFICATION = "http://www.whatwg.org/specs/web-apps/current-work/"

import sys
import urllib

def _scan_spec_lines(lines, last_element):
    """Collect element-specific attribute names from specification lines.

    Args:
        lines: Iterable of lines (text or UTF-8 encoded bytes), each still
            carrying its trailing newline.
        last_element: Element name after whose attribute section scanning
            stops, or None to scan every line.

    Returns:
        A dict mapping each element name found to the list of attribute
        names collected for it.
    """
    end_of_attributes = ("   <dd>None.</dd>\n", "   <dt>DOM interface:</dt>\n")
    result = {}
    name = ""
    looking_for_attributes = False
    attributes_coming = False

    for line in lines:
        # Python 3 responses yield bytes, Python 2 responses yield str;
        # all comparisons below are against native str literals.
        if not isinstance(line, str):
            line = line.decode("utf-8", "replace")

        # This does not catch h1,h2,h3,h4,h5,h6,sub,sup
        if line.endswith("</code></dfn> element</h4>\n"):
            name = line.split("<code>")[-1].split("</code>")[0]
            result[name] = []
            looking_for_attributes = True
            # A new heading ends any attribute section left open by the
            # previous element, so its lines are not misattributed.
            attributes_coming = False
            continue

        if (looking_for_attributes
                and line == "   <dt>Element-specific attributes:</dt>\n"):
            looking_for_attributes = False
            attributes_coming = True
            continue

        if not attributes_coming:
            continue

        if line in end_of_attributes:
            attributes_coming = False
            # Stop only once the attribute section of the last wanted
            # element has been read completely.
            if name == last_element:
                break
            continue

        # This does not catch that embed allows any other non-namespaced
        # attribute too
        if "</a></code>" in line:
            # The attribute name is the link text just before the marker.
            result[name].append(line.split("</a></code>")[0].split(">")[-1])

    return result


def extract(link, last_element="div"):
    """Return the element-specific attributes listed in a specification page.

    The document at *link* is fetched and scanned line by line for element
    headings and the "Element-specific attributes" entries that follow them.

    Args:
        link: URL of the specification document to fetch.
        last_element: Name of the element after whose attribute section
            scanning stops (default "div"). Pass None to scan the whole
            document.

    Returns:
        The ``str()`` form of a dict mapping each element name found to the
        list of its element-specific attribute names.
    """
    # urlopen lives in different modules on Python 2 and Python 3.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    doc = urlopen(link)
    try:
        return str(_scan_spec_lines(doc, last_element))
    finally:
        # try/finally rather than ``with``: the Python 2 response object
        # is not a context manager.
        doc.close()

# Fetch the specification and dump the extracted element/attribute mapping
# to standard output (no trailing newline is written).
html5_elements = extract(SPECIFICATION)
sys.stdout.write(html5_elements)
