- From: Richard A. O'Keefe <ok@atlas.otago.ac.nz>
- Date: Thu, 9 Nov 2000 16:53:32 +1300 (NZDT)
- To: html-tidy@w3.org
Ok, I've implemented two new options: drop-unknown-atts: yes|no Tidy always reports unknown attributes. If this option is yes, it deletes them as well (from the start-tag token, before it even gets put into the tree). drop-duplicate-atts: never|keep-first|keep-last|if-equal Tidy always reports duplicated attributes. (Now it reports them once, instead of once for each instance.) If never: they are retained as they are now. If keep-first: the first instance is kept and the others deleted. If keep-last: the last instance is kept and the others deleted. If if-equal: redundant instances with exactly the same value (alphabetic case and all) are deleted, others are kept. Attributes are deleted from the start-tag token, before it even gets put into the tree. There's still some clean-up to be done, removing dead code and improving some of the Check* functions in attrs.c. At the moment, old code is surrounded by #if OLDCHECK. The heart is the new function CheckAndPruneAttributes. Here is a "diff -C 5" of all the affected files. 
*** bak/html.h Thu Nov 9 15:39:07 2000 --- html.h Thu Nov 9 14:44:47 2000 *************** *** 469,478 **** --- 469,479 ---- Attribute *FindAttribute(AttVal *attval); AttVal *GetAttrByName(Node *node, char *name); void AddAttribute(Node *node, char *name, char *value); void CheckUniqueAttributes(Lexer *lexer, Node *node); void CheckAttributes(Lexer *lexer, Node *node); + void CheckAndPruneAttributes(Lexer *lexer, Node *node); Attribute *CheckAttribute(Lexer *lexer, Node *node, AttVal *attval); Bool IsUrl(char *attrname); Bool IsScript(char *attrname); /* istack.c */ *************** *** 752,761 **** --- 753,768 ---- extern Bool WrapPhp; extern Bool FixBackslash; extern Bool IndentAttributes; extern Bool Word2000; extern Bool Emacs; /* sasdjb 01May00 GNU Emacs error output format */ + extern Bool DropUnknownAtts; + extern int DropDuplicateAtts; + #define Drop_Never 0 + #define Drop_First 1 + #define Drop_Last 2 + #define Drop_Equal 3 /* Parser methods for tags */ Parser ParseHTML; Parser ParseHead; *** bak/attrs.c Thu Nov 9 15:38:58 2000 --- attrs.c Thu Nov 9 15:29:50 2000 *************** *** 329,338 **** --- 329,342 ---- } return attr; } + /* If you want to drop unknown attributes, DON'T try to do it here. + Tidy uses this function ONLY when it wants to add an attribute + (with 'name' a known string at compile time) to a node. + */ void AddAttribute(Node *node, char *name, char *value) { AttVal *av = NewAttribute(); av->delim = '"'; av->attribute = wstrdup(name); *************** *** 417,426 **** --- 421,523 ---- hashtab[i] = null; } } + /* Each attribute should be known and appear at most once. + Tidy doesn't know which attributes go with what element types, + so we don't detect misplaced attributes. That's an obvious + enhancement. If in the future, anyone wants attributes sorted, + this would be the single place to change. 
There's one tricky + point here: Tidy builds the list of attributes *BACKWARDS*, + so to keep the first/last *source* attribute, keep the last/first + *stored* attribute instead. This also explains why Tidy reports + attribute warnings back to front. + */ + void CheckAndPruneAttributes(Lexer *lexer, Node *node) { + AttVal *p; /* current attribute=value pair */ + AttVal *q; /* next attribute=value pair */ + AttVal**e; /* where to put attributes that are retained */ + AttVal *r; /* scans remainder of list (for uniqueness) */ + AttVal *s; /* next pair, for r scan */ + AttVal**h; /* for deleting duplicates */ + Bool d; /* duplicate of p found? */ + Bool unk = /* check for unknown attributes? */ + !XmlTags && !(node->tag != 0 && + (node->tag->versions & VERS_PROPRIETARY) != 0); + + e = &node->attributes; + for (p = *e; p; p = q) { + q = p->next; + if (p->asp != null || p->php != null) { + /* ASP and PHP attributes are heavy magic and are not */ + /* to be checked like other attributes */ + *e = p; + e = &p->next; + } else + if (p->dict == null && unk) { + ReportAttrError(lexer, node, p->attribute, UNKNOWN_ATTRIBUTE); + if (DropUnknownAtts) { + FreeAttribute(p); + } else { + *e = p; + e = &p->next; + } + } else { + /* This is an ordinary attribute=value pair that is a known */ + /* or proprietary or XML attribute; check uniqueness */ + d = no; + h = &q; + for (r = q; r != null; r = s) { + s = r->next; + if (r->asp == null && r->php == null + && wstrcasecmp(r->attribute, p->attribute) == 0 + ) { + d = yes; + switch (DropDuplicateAtts) { + case Drop_Never: + h = &r->next; + break; + case Drop_Last: /* Tidy reversed the list! */ + FreeAttribute(p); + p = r; + *h = s; + break; + case Drop_First: /* Tidy reversed the list! */ + FreeAttribute(r); + *h = s; + break; + case Drop_Equal: + if (p->value == 0 ? 
r->value == 0 + : r->value != 0 && + wstrcasecmp(p->value, r->value) == 0 + ) { + FreeAttribute(r); + *h = s; + } else { + h = &r->next; + } + break; + } + } else { + h = &r->next; + } + } + *h = 0; + if (d) + ReportAttrError(lexer, node, p->attribute, REPEATED_ATTRIBUTE); + *e = p; + e = &p->next; + } + } + *e = 0; + } + + #if OLDCHECK + /* CheckUniqueAttributes duplicates part of the work done by + CheckAndPruneAttributes; we don't need it any more. + */ /* the same attribute name can't be used more than once in each element */ *************** *** 448,465 **** --- 545,566 ---- { if (attval->asp == null && attval->php == null) CheckUniqueAttribute(lexer, node, attval); } } + #endif /* ignore unknown attributes for proprietary elements */ Attribute *CheckAttribute(Lexer *lexer, Node *node, AttVal *attval) { Attribute *attribute; + #if OLDCHECK + /* This is superseded by the check in CheckAndPruneAttributes */ if (attval->asp == null && attval->php == null) CheckUniqueAttribute(lexer, node, attval); + #endif if ((attribute = attval->dict) != null) { /* title is vers 2.0 for A and LINK otherwise vers 4.0 */ if (attribute == attr_title && *************** *** 474,486 **** --- 575,590 ---- lexer->versions &= attribute->versions; if (attribute->attrchk) attribute->attrchk(lexer, node, attval); } + #if OLDCHECK + /* This is superseded by the check in CheckAndPruneAttributes */ else if (!XmlTags && !(node->tag == null) && attval->asp == null && !(node->tag && (node->tag->versions & VERS_PROPRIETARY))) ReportAttrError(lexer, node, attval->attribute, UNKNOWN_ATTRIBUTE); + #endif return attribute; } Bool IsBoolAttribute(AttVal *attval) *** bak/config.c Thu Nov 9 15:38:58 2000 --- config.c Thu Nov 9 12:08:35 2000 *************** *** 38,47 **** --- 38,49 ---- ParseProperty ParseString; /* a string including whitespace */ ParseProperty ParseTagNames; /* a space separated list of tag names */ ParseProperty ParseCharEncoding; /* RAW, ASCII, LATIN1, UTF8 or ISO2022 */ ParseProperty 
ParseIndent; /* specific to the indent option */ ParseProperty ParseDocType; /* omit | auto | strict | loose | <fpi> */ + ParseProperty ParseDups; /* never | keep-first | keep-last | if-equal */ + /* | drop-last | drop-first */ uint spaces = 2; /* default indentation */ uint wraplen = 68; /* default wrap margin */ int CharEncoding = ASCII; int tabsize = 4; *************** *** 91,100 **** --- 93,105 ---- Bool EncloseBlockText = no; /* if yes text in blocks is wrapped in <p>'s */ Bool KeepFileTimes = yes; /* if yes last modied time is preserved */ Bool Word2000 = no; /* draconian cleaning for Word2000 */ Bool TidyMark = yes; /* add meta element indicating tidied doc */ Bool Emacs = no; /* if true format error output for GNU Emacs */ + Bool DropUnknownAtts = no; /* discard non-standard attributes when true */ + int DropDuplicateAtts = /* duplicate attributes: discard Never/first */ + Drop_Never; /* last/only if equal values. */ typedef struct _lex PLex; static uint c; /* current char in input stream */ static FILE *fin; /* file pointer for input stream */ *************** *** 182,194 **** {"new-pre-tags", {(int *)&pre_tags}, ParseTagNames}, {"char-encoding", {(int *)&CharEncoding}, ParseCharEncoding}, {"doctype", {(int *)&doctype_str}, ParseDocType}, {"fix-backslash", {(int *)&FixBackslash}, ParseBool}, {"gnu-emacs", {(int *)&Emacs}, ParseBool}, ! /* this must be the final entry */ ! {0, 0, 0} }; static unsigned hash(char *s) { unsigned hashval; --- 187,202 ---- {"new-pre-tags", {(int *)&pre_tags}, ParseTagNames}, {"char-encoding", {(int *)&CharEncoding}, ParseCharEncoding}, {"doctype", {(int *)&doctype_str}, ParseDocType}, {"fix-backslash", {(int *)&FixBackslash}, ParseBool}, {"gnu-emacs", {(int *)&Emacs}, ParseBool}, ! {"drop-unknown-atts", ! {(int*)&DropUnknownAtts}, ParseBool}, ! {"drop-duplicate-atts", ! {(int*)&DropDuplicateAtts}, ParseDups}, /* this must be the final entry */ ! 
{(char*)0, {(int*)0}, 0} }; static unsigned hash(char *s) { unsigned hashval; *************** *** 289,329 **** return *config_text++; return EOF; } ! static int AdvanceChar() { if (c != EOF) c = (uint)GetC(fin); return c; } ! static int SkipWhite() { while (IsWhite((uint) c)) c = (uint)GetC(fin); return c; } /* skip until end of line */ ! static void SkipToEndofLine() { while (c != EOF) { c = (uint)GetC(fin); if (c == '\n' || c == '\r') break; } } /* skip over line continuations to start of next property */ ! static int NextProperty() { do { /* skip to end of line */ while (c != '\n' && c != '\r' && c != EOF) --- 297,339 ---- return *config_text++; return EOF; } ! static int AdvanceChar(void) { if (c != EOF) c = (uint)GetC(fin); return c; } ! static int SkipWhite(void) { while (IsWhite((uint) c)) c = (uint)GetC(fin); return c; } + #if 0 /* skip until end of line */ ! static void SkipToEndofLine(void) { while (c != EOF) { c = (uint)GetC(fin); if (c == '\n' || c == '\r') break; } } + #endif /* skip over line continuations to start of next property */ ! 
static int NextProperty(void) { do { /* skip to end of line */ while (c != '\n' && c != '\r' && c != EOF) *************** *** 753,762 **** --- 763,803 ---- *location.number = ISO2022; else if (wstrcasecmp(buf, "mac") == 0) *location.number = MACROMAN; else ReportBadArgument(option); + + NextProperty(); + } + + void ParseDups(Location location, char *option) + { + char buf[64]; + int i = 0; + + SkipWhite(); + + while (i < 62 && c != EOF && !IsWhite(c)) + { + buf[i++] = c; + AdvanceChar(); + } + + buf[i] = '\0'; + + if (wstrcasecmp(buf, "never") == 0) + *location.number = Drop_Never; + else if (wstrcasecmp(buf, "keep-first") == 0 + || wstrcasecmp(buf, "drop-last") == 0) + *location.number = Drop_Last; + else if (wstrcasecmp(buf, "keep-last") == 0 + || wstrcasecmp(buf, "drop-first") == 0) + *location.number = Drop_First; + else if (wstrcasecmp(buf, "if-equal") == 0) + *location.number = Drop_Equal; + else + ReportBadArgument(option); NextProperty(); } /* slight hack to avoid changes to pprint.c */ *** bak/lexer.c Thu Nov 9 15:38:58 2000 --- lexer.c Thu Nov 9 15:28:54 2000 *************** *** 955,966 **** attr = NewAttribute(); attr->delim = '"'; attr->attribute = wstrdup("xmlns"); attr->value = wstrdup(profile); attr->dict = FindAttribute(attr); ! attr->next = node->attributes; ! node->attributes = attr; } } } Bool SetXHTMLDocType(Lexer *lexer, Node *root) --- 955,968 ---- attr = NewAttribute(); attr->delim = '"'; attr->attribute = wstrdup("xmlns"); attr->value = wstrdup(profile); attr->dict = FindAttribute(attr); ! if (attr->dict != null) { ! attr->next = node->attributes; ! node->attributes = attr; ! 
} } } } Bool SetXHTMLDocType(Lexer *lexer, Node *root) *************** *** 1833,1849 **** --- 1835,1859 ---- if (!MakeClean && (lexer->token->tag == tag_nobr || lexer->token->tag == tag_wbr)) ReportWarning(lexer, null, lexer->token, PROPRIETARY_ELEMENT); } + #if OLDCHECK if (lexer->token->tag->chkattrs) { CheckUniqueAttributes(lexer, lexer->token); lexer->token->tag->chkattrs(lexer, lexer->token); } else CheckAttributes(lexer, lexer->token); + #else + CheckAndPruneAttributes(lexer, lexer->token); + if (lexer->token->tag->chkattrs) + lexer->token->tag->chkattrs(lexer, lexer->token); + else + CheckAttributes(lexer, lexer->token); + #endif } return lexer->token; /* return start tag */ case LEX_COMMENT: /* seen <!-- so look for --> */ *** bak/tidy.c Thu Nov 9 15:38:58 2000 --- tidy.c Thu Nov 9 12:01:47 2000 *************** *** 801,810 **** --- 801,820 ---- OnlyErrors = yes; else if (strcmp(arg, "quiet") == 0) Quiet = yes; else if (strcmp(arg, "slides") == 0) BurstSlides = yes; + else if (strncmp(arg, "drop-", 5) == 0) + switch (arg[5]) { + case 'u': DropUnknownAtts = yes; break; + case 'f': DropDuplicateAtts = Drop_First; break; + case 'l': DropDuplicateAtts = Drop_Last; break; + case 'e': /* drop-[if-]equal */ + case 'i': DropDuplicateAtts = Drop_Equal; break; + case 'n': DropDuplicateAtts = Drop_Never; break; + default : UnknownOption(stderr, 'd'); break; + } else if (strcmp(arg, "help") == 0 || argv[1][1] == '?'|| argv[1][1] == 'h') { HelpText(stdout, prog); return 1; *** bak/Overview.html Thu Nov 9 15:43:19 2000 --- Overview.html Thu Nov 9 16:30:03 2000 *************** *** 523,532 **** --- 523,553 ---- <p>I would be interested in hearing from anyone who can offer help with using JavaScript for adding dynamic effects to slides, for instance similar to those available in Microsoft PowerPoint.</p> + <h3>Removing unknown or duplicated attributes</h3> + + <p>Some HTML editors insert attributes that are not in any W3C + recommendation and are not widely understood. 
Tidy has always + warned about such attributes. If you provide + the <code>-drop-unknown</code> command line option, or have + <code>drop-unknown-atts: yes</code> in your configuration file, + Tidy will automatically turn <code><p dark-side=yes></code> + into <code><p></code></p> + + <p>Some HTML editors will also insert duplicates of existing attributes. + This is particularly troublesome because some Web browsers act on the + first instance, while others act on the last, so a document may look very + different in a browser other than the one the editor is part of/was + designed for. Tidy has always warned about duplicate attributes. + With the <code>-drop-(first|last|equal)</code> command line option + (and the corresponding <code>drop-duplicate-atts</code> entry in + configuration files) you can control whether duplicates are retained, + duplicates are eliminated if they are equal, + the first of them retained, or the last of them retained.</p> + <h3>Indenting text for a better layout</h3> <p>Indenting the content of elements makes the markup easier to read. Tidy can do this for all elements or just for those where it's needed. The auto-indent mode has been used below to avoid *************** *** 951,963 **** <dt>drop-font-tags: <em>bool</em></dt> <dd>If set to <em>yes</em> together with the clean option (see above), Tidy will discard font and center tags rather than ! creating the corresponding style rules. The default is <em> ! no</em>.</dd> <dt>enclose-text: <em>bool</em></dt> <dd>If set to <em>yes</em>, this causes Tidy to enclose any text it finds in the body element within a p element. This is useful when you want to take an existing html file and use it with a --- 972,1010 ---- <dt>drop-font-tags: <em>bool</em></dt> <dd>If set to <em>yes</em> together with the clean option (see above), Tidy will discard font and center tags rather than ! creating the corresponding style rules. 
The default is <em>no</em>.</dd> + <dt>drop-unknown-atts: <em>bool</em></dt> + + <dd>If set to <em>yes</em>, Tidy will discard unknown attributes. + The default is <em>no</em>. + The command line option <code>-drop-unknown</code> is equivalent to + specifying <em>yes</em>.</dd> + + <dt>drop-duplicate-atts: <em>never, drop-first, keep-last, + drop-last, keep-first,</em> or <em>if-equal</em></dt> + + <dd>Determines what Tidy does with duplicate attributes. + Consider <code><p align=left align=right></code> + Tidy will always warn about duplicates. With the default, + <em>never</em>, both copies are kept. With <em>keep-first</em> + or its synonym <em>drop-last</em>, Tidy will keep the first + copy (<code>align=left</code>) and drop any others. With + <em>keep-last</em> or its synonym <em>drop-first</em>, + Tidy will keep the last copy (<code>align=right</code>) and + drop any others. With <em>if-equal</em>, Tidy will drop + later copies that provide the same value as the first. + Currently that heeds alphabetic case, so + <code>align=left align=left</code> will be tidied to + <code>align=left</code>, but <code>align="left" align="LEFT"</code> + will be left unchanged. + The command-line options <code>-drop-first</code>, <code>-drop-last</code>, + <code>-drop-equal</code>, and <code>drop-never</code> set this property.</dd> + <dt>enclose-text: <em>bool</em></dt> <dd>If set to <em>yes</em>, this causes Tidy to enclose any text it finds in the body element within a p element. 
This is useful when you want to take an existing html file and use it with a *************** *** 1100,1109 **** --- 1147,1158 ---- quote-ampersand: no break-before-br: no uppercase-tags: no uppercase-attributes: no char-encoding: latin1 + drop-unknown-atts: no + drop-duplicate-atts: if-equal new-inline-tags: cfif, cfelse, math, mroot, mrow, mi, mn, mo, msqrt, mfrac, msubsup, munderover, munder, mover, mmultiscripts, msup, msub, mtext, mprescripts, mtable, mtr, mtd, mth new-blocklevel-tags: cfoutput, cfquery
Received on Wednesday, 8 November 2000 22:54:15 UTC