{
open Html
}
(* Simplified lexical classes: Latin-1 is the only character set that is
 * recognized here.
 *)

(* ASCII letters plus the Latin-1 codes 192-255, leaving out 215 and 247
 * (the multiplication and division signs in Latin-1).
 *)
let letter =
  ['a'-'z' 'A'-'Z' '\192'-'\214' '\216'-'\246' '\248'-'\255']

(* Code 183 (the middle dot in Latin-1) may continue a name. *)
let extender = '\183'

let digit = ['0'-'9']

let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F']

(* Any character that may occur after the first character of a name. *)
let namechar = letter | digit | extender | ['.' ':' '-' '_']

(* A name starts with a letter, underscore or colon; digits, dots, hyphens
 * and the extender are only allowed from the second character on.
 *)
let name = (letter | ['_' ':']) namechar*

(* A name token is any non-empty run of name characters. *)
let nmtoken = namechar+

let ws = [' ' '\t' '\n' '\r']

(* Quoted strings, delimiters included. Besides the closing quote, the
 * body must not contain an angle bracket or a newline.
 *)
let string_literal1 = '"' [^ '"' '<' '>' '\n']* '"'
let string_literal2 = '\'' [^ '\'' '<' '>' '\n']* '\''
(* The following rules reflect HTML as it is actually used, not the SGML
 * rules.
 *)
(* First entry point of the lexer. As written, it splits the input into
 * comment-related tokens: a single hyphen or a run of non-hyphen
 * characters yields Mcomment, end of input yields Eof, and the empty
 * pattern yields Rcomment. The token constructors are not defined in this
 * file; presumably they come from the Html module opened in the header.
 *
 * NOTE(review): an empty pattern returning a closing-comment token, in a
 * rule named scan_document that never emits any document-level token,
 * suggests that some patterns (e.g. the comment delimiters) and possibly
 * the header of a separate comment rule were lost from this text.
 * Please verify against the upstream file before relying on this rule.
 *)
rule scan_document = parse
(* NOTE(review): every input either is at eof or starts with a character
 * matched by one of the arms below, so with longest-match selection this
 * empty pattern looks unreachable -- confirm.
 *)
| ""
{ Rcomment }
(* A lone hyphen is reported as comment material. *)
| "-"
{ Mcomment }
| eof
{ Eof }
(* Longest possible run of characters other than a hyphen. *)
| [^ '-']+
{ Mcomment }
(* Scanner for the body of a doctype declaration.
 *
 * Everything up to the next '>' is returned as one Mdoctype chunk, the
 * '>' itself as Rdoctype, and end of input as Eof. Occurrences of '>'
 * inside strings or [ ] brackets are not treated specially: the first
 * '>' always produces Rdoctype.
 *)
and scan_doctype = parse
  | eof
      { Eof }
  | [^ '>']+
      { Mdoctype }
  | '>'
      { Rdoctype }
(* Scanner for the inside of an element tag.
 *
 * Tokens produced:
 *   Relement   for '>'
 *   Is         for '='
 *   Space n    for a run of n whitespace characters
 *   Name s     for text matching the name regexp
 *   Literal s  for a single- or double-quoted string, quotes removed
 *   Eof        at end of input
 *   Other      for any other single character
 *)
and scan_element = parse
  | '>'
      { Relement }
  | '='
      { Is }
  | ws+
      { let blanks = Lexing.lexeme lexbuf in
        Space (String.length blanks) }
  | name
      { Name (Lexing.lexeme lexbuf) }
  | (string_literal1 | string_literal2)
      { (* The lexeme still carries its two delimiting quotes; drop the
           first and the last character. *)
        let quoted = Lexing.lexeme lexbuf in
        let inner_len = String.length quoted - 2 in
        Literal (String.sub quoted 1 inner_len) }
  | eof
      { Eof }
  (* Must stay last: it ties with the one-character matches above, and
     the earlier arm wins a tie. *)
  | _
      { Other }