Module: html-internals Synopsis: HTML parser and printer Author: Scott McKay Copyright: Original Code is Copyright (c) 1995-2004 Functional Objects, Inc. All rights reserved. License: Functional Objects Library Public License Version 1.0 Dual-license: GNU Lesser General Public License Warranty: Distributed WITHOUT WARRANTY OF ANY KIND /// HTML parser // #t means to create a DOM object // #f means to do nothing // A function means to call that function as a callback define constant = type-union(, ); define sealed method parse-html-from-file (file :: , #key text-action :: = #t, markup-action :: = #t) => (html :: ) with-open-file (stream = file, direction: #"input") let document = create-html-document(file); parse-html-from-stream(stream, document: document, text-action: text-action, markup-action: markup-action) end end method parse-html-from-file; define thread variable *html-document* :: false-or() = #f; define sealed method parse-html-from-stream (stream :: , #key document, text-action :: = #t, markup-action :: = #t) => (html :: ) let document :: = document | create-html-document(""); dynamic-bind (*html-document* = document) block (end-of-html-stream) parse-html(make(, inner-stream: stream), context: document, text-action: text-action, markup-action: markup-action, end-of-html-stream: end-of-html-stream) end end; document end method parse-html-from-stream; /// Condition classes define abstract class (, ) end class ; define sealed domain make (subclass()); define sealed domain initialize (); define sealed class () sealed constant slot invalid-character-code, required-init-keyword: code:; end class ; define sealed class () end class ; define sealed class () end class ; define sealed class () end class ; define sealed class () end class ; define sealed class () end class ; /// String resources define constant $result-strings :: = make(); //---*** This isn't thread-safe... define macro with-string-buffer { with-string-buffer (?result:name) ?:body end } => { let ?result :: limited(, of: ) = if (empty?($result-strings)) make(limited(, of: )) else pop($result-strings) end; block () ?result.size :=0; ?body cleanup push($result-strings, ?result) end } end macro with-string-buffer; /// HTML parsing streams define sealed class () sealed slot xstream-buffer :: = #(); end class ; define sealed domain make (subclass()); define sealed domain initialize (); define sealed inline method readch (x :: , #key eof-error? = #t) => (char :: false-or()) if (~empty?(xstream-buffer(x))) pop!(xstream-buffer(x)) else read-element(inner-stream(x), on-end-of-stream: #f) end end method readch; define sealed inline method peekch (x :: , #key eof-error? = #t) => (char :: false-or()) if (~empty?(xstream-buffer(x))) head(xstream-buffer(x)) else peek(inner-stream(x), on-end-of-stream: #f) end end method peekch; define sealed inline method unreadch (x :: , char :: ) => (char :: ) push!(xstream-buffer(x), char); char end method unreadch; /// HTML parser context management define thread variable *html-context* :: false-or() = #f; define thread variable *html-context-stack* :: = #(); define macro with-html-context { with-html-context (?context:expression) ?:body end } => { begin let with-html-context-body = method () ?body end; do-with-html-context(?context, with-html-context-body) end } end macro with-html-context; define sealed method do-with-html-context (context, continuation :: ) => (#rest values) block (context-thunk) dynamic-bind (*html-context-stack* = pair(pair(context, context-thunk), *html-context-stack*), *html-context* = context) continuation() end end end method do-with-html-context; define sealed method exit-html-context (name :: , putback) => () let context = find(*html-context-stack*, name, test: method (name, elt) node-name(head(elt)) = name end method); if (context) // If found, it's a non-local exit thunk... let thunk = tail(context); thunk(putback) else error(make(, format-string: " seen but no <%s> pending", format-arguments: vector(name, name))) end end method exit-html-context; define sealed method maybe-exit-html-context (name :: , putback) => (putback) local method any-context (names) position-if(*html-context-stack*, method (elt) member?(node-name(head(elt)), names, test: \=) end method) end method; select (name by \=) "OPTION" => let pos = any-context(#["OPTION", "SELECT"]); let key = pos & node-name(head(*html-context-stack*[pos])); when (key == "OPTION") exit-html-context(key, putback) end; "LI" => let pos = any-context(#["LI", "UL", "OL"]); let key = pos & node-name(head(*html-context-stack*[pos])); when (key = "LI") exit-html-context(key, putback) end; "P" => let pos = any-context(#["UL", "OL", "LI", "DL", "DT", "DD", "TABLE", "TH", "TD", "P", "PRE", "CENTER", "DIV"]); let key = pos & node-name(head(*html-context-stack*[pos])); when (key = "P") exit-html-context(key, putback) end; "DT", "DD" => let pos = any-context(#["DD", "DT", "DL"]); let key = pos & node-name(head(*html-context-stack*[pos])); when (key = "DT" | key = "DD") exit-html-context(key, putback) end; "TD", "TH" => let pos = any-context(#["TD", "TH", "TR", "TABLE", "CAPTION"]); let key = pos & node-name(head(*html-context-stack*[pos])); when (key = "TD" | key = "TH") exit-html-context(key, putback) end; "TR" => let pos = any-context(#["TR", "TABLE", "CAPTION"]); let key = pos & node-name(head(*html-context-stack*[pos])); when (key = "TR") exit-html-context(key, putback) end; otherwise => #f end; putback end method maybe-exit-html-context; /// The HTML parser define sealed method parse-html (xstream :: , #key context = *html-context*, end-of-html-stream :: , text-action :: = #t, markup-action :: = #t) let putback = #f; // We exit this loop via the 'end-of-html-stream' thunk... while (#t) let item = putback | parse-html-content(xstream, text-action: text-action, markup-action: markup-action); putback := #f; case instance?(item, ) => doctype(*html-document*) := item; ~item => end-of-html-stream(#f); //--- KMP's code had 'object-empty?', but I don't know what it's supposed to do! ~instance?(item, ) | html-empty-element-name?(node-name(item)) => when (select (item by instance?) => text-action == #t; => markup-action == #t; otherwise => #f; end) append-child(context, item) end; otherwise => putback := begin // Store the new item first -- we won't get another chance to do it when (select (item by instance?) => text-action == #t; => markup-action == #t; otherwise => #f; end) append-child(context, item) end; with-html-context (item) // This normally returns by non-local exit to 'end-of-html-stream' parse-html(xstream, text-action: text-action, markup-action: markup-action, end-of-html-stream: end-of-html-stream) end end; end end; context end method parse-html; define sealed method parse-html-content (xstream :: , #key text-action :: = #t, markup-action :: = #t) => (elt :: false-or(type-union(, , , ))) let c = peekch(xstream, eof-error?: #f); case ~c => #f; c == '<' => parse-html-element(xstream, markup-action: markup-action); otherwise => if (text-action) let text = parse-html-text(xstream); if (text-action == #t) create-text-node(*html-document*, text) else text-action(text); text end; else skip-html-text(xstream); parse-html-element(xstream, markup-action: markup-action) end end end method parse-html-content; define thread variable *parsing-html-tag?* :: = #f; define sealed method parse-html-element (xstream :: , #key markup-action :: = #t) => (elt :: false-or(type-union(, , ))) block (return) let c = readch(xstream, eof-error?: #f); case ~c => #f; c ~== '<' => error(make(, format-string: "Missing '<' at start of HTML element")); otherwise => when (peekch(xstream, eof-error?: #f) == '!') let markup = parse-sgml-markup(xstream); return(markup) end; skip-html-whitespace(xstream); let c0 = peekch(xstream); let open? = (c0 ~== '/'); unless (open?) readch(xstream) end; let name = parse-html-name(xstream); assert(name, "Missing element name within <...>"); //--- KMP's code used to pass 'empty?: html-empty-element-name?(name)' //---*** The context stack should have triple of {name, elt, thunk} //---*** so that we don't have to create these useless elements! let elt = create-element(*html-document*, name); let c = peekch(xstream); assert(c ~== '=', "Character '%c' seen where not expected", c); let attributes :: = make(); for (tag = parse-html-name(xstream) then parse-html-name(xstream), until: ~tag) let char = peekch(xstream); let val = if (char == '=') readch(xstream); parse-html-attribute-value(xstream) else #f end; if (open?) when (markup-action) if (markup-action == #t) let attribute :: = create-attribute(*html-document*, tag); value(attribute) := val; set-attribute-node(elt, attribute) else add!(attributes, tag); add!(attributes, val) end end else error(make(, format-string: "Found attributes in ", format-arguments: vector(name))) end; finally if (open?) when (markup-action & markup-action ~== #t) markup-action(name, attributes) end; maybe-exit-html-context(name, elt) else exit-html-context(name, #f) end end; end end end method parse-html-element; define sealed method parse-html-name (xstream :: ) => (name :: false-or()) skip-html-whitespace(xstream); let c0 = readch(xstream); dynamic-bind (*parsing-html-tag?* = #t) case c0 = '>' => #f; otherwise => with-string-buffer (result) for (c :: = c0 then readch(xstream), until: html-whitespace?(c) | c == '=' | c == '>') add!(result, c); finally if (c == '=' | c == '>') unreadch(xstream, c) else skip-html-whitespace(xstream) end; as-uppercase!(as(, result)) end end; end end end method parse-html-name; define constant $html-empty-elements :: = #["META", "BR", "IMG", "HR", "INPUT"]; define inline method html-empty-element-name? (name :: ) => (empty? :: ) member?(name, $html-empty-elements, test: \=) & #t end method html-empty-element-name?; define inline method html-whitespace? (c :: ) => (whitespace? :: ) c == ' ' | c == '\t' | c == '\r' | c == '\n' end method html-whitespace?; define sealed method skip-html-whitespace (xstream :: ) => () block (return) for (c = readch(xstream, eof-error?: #f) then readch(xstream, eof-error?: #f), until: ~c) unless (html-whitespace?(c)) unreadch(xstream, c); return(#f) end end end end method skip-html-whitespace; define sealed method parse-html-attribute-value (xstream :: ) => (value :: false-or()) skip-html-whitespace(xstream); let c0 = readch(xstream); dynamic-bind (*parsing-html-tag?* = #t) with-string-buffer (result) if (c0 == '"' | c0 == '\'') block (return) for (c :: = readch(xstream) then readch(xstream), until: c == c0) when (c == '>') // What the heck, it's not fatal signal(make(, format-string: "Unbalanced '\"' while parsing attribute value")); unreadch(xstream, c); return(as(, result)) end; // '&' is handled as quoted in strings by some browsers, but it's // really supposed to be escaped if (c == '&') add!(result, parse-html-entityref(xstream)) else add!(result, c) end; finally return(as(, result)) end end else for (c :: = c0 then readch(xstream), until: html-whitespace?(c) | c == '>') if (c == '&') add!(result, parse-html-entityref(xstream)) else add!(result, c) end; finally when (c == '>') unreadch(xstream, c) end; as(, result) end end end end end method parse-html-attribute-value; define sealed method parse-html-text (xstream :: ) => (text :: ) block (return) with-string-buffer (result) for (c = readch(xstream, eof-error?: #f) then readch(xstream, eof-error?: #f), until: ~c) local method done () unreadch(xstream, c); return(as(, result)) end method; let c :: = c; case c == '<' => done(); c == '&' => if (~empty?(result)) done() else let ref = parse-html-entityref(xstream); return(if (instance?(ref, )) make(, size: 1, fill: ref) else ref end) end; otherwise => add!(result, c) end; finally return(as(, result)) end end end end method parse-html-text; define sealed method skip-html-text (xstream :: ) => () block (return) for (c = readch(xstream, eof-error?: #f) then readch(xstream, eof-error?: #f), until: ~c) case c == '<' => unreadch(xstream, c); return(#f); c == '&' => skip-html-entityref(xstream) end end end end method skip-html-text; define variable *unknown-entity-treatment* = #"unparse"; define sealed method parse-html-entityref (xstream :: ) => (ref :: type-union(, )) if (peekch(xstream, eof-error?: #f) = '#') parse-html-charref(xstream) else let semi? = #f; let name = as(, block (return) let result :: = make(); for (c :: = readch(xstream, eof-error?: #f) then readch(xstream, eof-error?: #f), until: c == ';') add!(result, c); unless (c & alpha-char?(c)) when (c) unreadch(xstream, c) end; return(result) end; finally semi? := #t; return(result) end end); let character = element(*entity-names->character*, name, default: #f); case character => character; *parsing-html-tag?* | *unknown-entity-treatment* == #"unparse" => when (semi?) unreadch(xstream, ';') end; for (i :: from size(name) - 1 to 0) unreadch(xstream, name[i]) end; '&'; *unknown-entity-treatment* == #"error" => error(make(, format-string: "Undefined HTML character: '%='", format-arguments: vector(name))); otherwise => create-entity-reference(*html-document*, name); end end end method parse-html-entityref; define sealed method parse-html-charref (xstream :: ) => (ref :: type-union(, )) readch(xstream); let semi? = #f; let name = as(, block (return) let result :: = make(); for (c :: = readch(xstream, eof-error?: #f) then readch(xstream, eof-error?: #f), until: c == ';') add!(result, c); unless (c & digit-char?(c)) when (c) unreadch(xstream, c) end; return(result) end; finally semi? := #t; return(result) end end); let code = string-to-integer(name, default: #f); case code & code >= 0 & code < 256 => as(, code); *parsing-html-tag?* | *unknown-entity-treatment* == #"unparse" => when (semi?) unreadch(xstream, ';') end; for (i :: from size(name) - 1 to 0) unreadch(xstream, name[i]) end; unreadch(xstream, '#'); '&'; *unknown-entity-treatment* == #"error" => error(make(, format-string: "Undefined HTML character: '%='", format-arguments: vector(name))); otherwise => create-entity-reference(*html-document*, concatenate-as(, "#", name)); end end method parse-html-charref; define inline method skip-html-entityref (xstream :: ) => () for (c = readch(xstream) then readch(xstream), until: c == ';') #f end end method skip-html-entityref; /// Random SGML markup parsing define constant $SGML-markup-node :: = 100; define sealed class () keyword type: = $SGML-markup-node; end class ; //--- Is the comment parsing really completely correct? define sealed method parse-sgml-markup (xstream :: ) => (markup :: false-or(type-union(, ))) assert(readch(xstream) == '!', "Missing '!' at start of SGML-like markup"); let stream = make(, direction: #"output"); let level :: = 1; let markup = block (return) local method test-char(ch) select (ch) '>' => dec!(level); when (level = 0) return(stream-contents(stream)) end; '<' => inc!(level); otherwise => #f; end; write-element(stream, ch); end method; let ch1 = readch(xstream); test-char(ch1); let ch2 = readch(xstream); test-char(ch2); if (ch1 = '-' & ch2 = '-') // Process a comment let first-dash? = #f; let second-dash? = #f; for (ch = readch(xstream) then readch(xstream)) when (first-dash? & second-dash? & ch = '>') return(stream-contents(stream)) end; if (ch = '-') if (~first-dash?) first-dash? := #t else when (~second-dash?) second-dash? := #t end end else first-dash? := #f; second-dash? := #f end; write-element(stream, ch); end else for (ch = readch(xstream) then readch(xstream)) test-char(ch) end end; cleanup close(stream) end; case string-equal?(markup, "--", end1: min(2, markup.size)) => assert(string-equal?(markup, "--", start1: max(size(markup) - 2, 0)), "Missing '-->' at the end of a comment"); make(, document: *html-document*, name: "#comment", // not #"comment"! value: copy-sequence(markup, start: 2, end: size(markup) - 2)); string-equal?(markup, "DOCTYPE", end1: min(7, markup.size)) => make(, document: *html-document*, name: "#doctype", // not #"doctype"! value: markup); otherwise => make(, document: *html-document*, name: "#markup", // not #"markup"! value: markup); end end method parse-sgml-markup; /// HTML entity names define variable *entity-names->character* :: = make(); define variable *character->entity-names* :: = make(, size: 256); define function initialize-html-entity-names () local method code (x) if (instance?(x, )) as(, x) else x end end method; for (entry in #[#("lt", '<'), #("gt", '>'), #("amp", '&'), #("quot", '"'), #("nbsp", 160, 160), // Non-breaking space #("iexcl", 161, 161), // Inverted exclamation point #("cent", 162, 162), // Cent sign #("pound", 163, 163), // Pounds Sterling #("curren", 164, 164),// General currency #("yen", 165, 165), // Yen #("brvbar", 166, 166),// Broken vertical bar #("sect", 167, 167), // Section #("uml", 168, 168), // Umlaut (dieresis) #("copy", 169, 169), // Copyright #("ordf", 170, 170), // Ordinal indicator, feminine #("laquo", 171, 171), // guillemotleft #("not", 172, 172), // logical not #("shy", 173, 173), // soft hyphen #("reg", 174, 174), // registered trademark #("macr", 175, 175), // macron #("deg", 176, 176), // degree #("plusmn", 177, 177),// plus-or-minus #("sup2", 178, 178), // superscript 2 #("sup3", 179, 179), // superscript 3 #("acute", 180, 180), // acute accent #("micro", 181, 181), // micro #("para", 182, 182), // para #("middot", 183, 183), // Vertically-centered period (dot accent) #("cedil", 184, 184), // cedilla #("sup1", 185, 185), // superscript 1 #("ordm", 186, 186), // ordinal indicator, masculine #("raquo", 187, 187), // guillemotright #("frac14", 188, 188),// fractional 1/4 #("frac12", 189, 189),// fractional 1/2 #("frac34", 190, 190),// fractional 3/4 #("iquest", 191, 191),// inverted question mark #("Agrave", 192), // capital A, grave accent #("Aacute", 193), // capital A, acute accent #("Acirc", 194), // capital A, circumflex accent #("Atilde", 195), // capital A, tilde #("Auml", 196), // capital A, dieresis or umlaut mark #("Aring", 197), // capital A, ring #("AElig", 198), // capital AE diphthong (ligature) #("Ccedil", 199), // capital C, cedilla #("Egrave", 200), // capital E, grave accent #("Eacute", 201), // capital E, acute accent #("Ecirc", 202), // capital E, circumflex accent #("Euml", 203), // capital E, dieresis or umlaut mark #("Igrave", 204), // capital I, grave accent #("Iacute", 205), // capital I, acute accent #("Icirc", 206), // capital I, circumflex accent #("Iuml", 207), // capital I, dieresis or umlaut mark #("ETH", 208), // capital ETH, Icelandic #("Ntilde", 209), // capital N, tilde #("Ograve", 210), // capital O, grave accent #("Oacute", 211), // capital O, acute accent #("Ocirc", 212), // capital O, circumflex accent #("Otilde", 213), // capital O, tilde #("Ouml", 214), // capital O, dieresis or umlaut mark #("times", 215), // multiply sign #("Oslash", 216), // capital O, slash #("Ugrave", 217), // capital U, grave accent #("Uacute", 218), // capital U, acute accent #("Ucirc", 219), // capital U, circumflex accent #("Uuml", 220), // capital U, dieresis or umlaut mark #("Yacute", 221), // capital Y, acute accent #("THORN", 222), // capital THORN, Icelandic #("szlig", 223), // small sharp s, German (sz ligature) #("agrave", 224), // small a, grave accent #("aacute", 225), // small a, acute accent #("acirc", 226), // small a, circumflex accent #("atilde", 227), // small a, tilde #("auml", 228), // small a, dieresis or umlaut mark #("aring", 229), // small a, ring #("aelig", 230), // small ae diphthong (ligature) #("ccedil", 231), // small c, cedilla #("egrave", 232), // small e, grave accent #("eacute", 233), // small e, acute accent #("ecirc", 234), // small e, circumflex accent #("euml", 235), // small e, dieresis or umlaut mark #("igrave", 236), // small i, grave accent #("iacute", 237), // small i, acute accent #("icirc", 238), // small i, circumflex accent #("iuml", 239), // small i, dieresis or umlaut mark #("eth", 240), // small Icelandic eth #("ntilde", 241), // small n, tilde #("ograve", 242), // small o, grave accent #("oacute", 243), // small o, acute accent #("ocirc", 244), // small o, circumflex accent #("otilde", 245), // small o, tilde #("ouml", 246), // small o, dieresis or umlaut mark #("divide", 247), // divide sign #("oslash", 248), // small o, slash #("ugrave", 249), // small u, grave accent #("uacute", 250), // small u, acute accent #("ucirc", 251), // small u, circumflex accent #("uuml", 252), // small u, dieresis or umlaut mark #("yacute", 253), // small y, acute accent #("thorn", 254), // small thorn, Icelandic #("yuml", 255)]) let name = head(entry); let codes = tail(entry); //--- KMP's code asserted this: '(apply #'= (mapcar #'code codes))' *entity-names->character*[name] := as(, head(codes)); *character->entity-names*[as(, head(codes))] := name; end end function initialize-html-entity-names; initialize-html-entity-names();