(* Title: Pure/Thy/thy_scan.ML ID: $Id: thy_scan.ML,v 1.22 2005/03/03 11:44:25 skalberg Exp $ Author: Markus Wenzel, TU Muenchen Lexer for *old-style* outer syntax. *) signature THY_SCAN = sig datatype token_kind = Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF val name_of_kind: token_kind -> string val tokenize: Scan.lexicon -> string list -> (token_kind * string * int) list end; structure ThyScan: THY_SCAN = struct (** token kinds **) datatype token_kind = Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF; fun name_of_kind Keyword = "keyword" | name_of_kind Ident = "identifier" | name_of_kind LongIdent = "long identifier" | name_of_kind Var = "schematic variable" | name_of_kind TypeVar = "type variable" | name_of_kind Nat = "natural number" | name_of_kind String = "string" | name_of_kind Verbatim = "verbatim text" | name_of_kind Ignore = "ignore" | name_of_kind EOF = "end-of-file"; (** scanners **) (* diagnostics *) fun lex_err NONE msg = "Outer lexical error: " ^ msg | lex_err (SOME n) msg = "Outer lexical error on line " ^ string_of_int n ^ ": " ^ msg; (* line numbering *) val incr = Option.map (fn n:int => n + 1); fun incr_line scan = Scan.depend (fn n => scan >> pair (incr n)); val keep_line = Scan.lift; val scan_blank = incr_line ($$ "\n") || keep_line (Scan.one Symbol.is_blank); (* scan ML-style strings *) val scan_ctrl = $$ "^" ^^ Scan.one (fn c => Symbol.is_ascii c andalso ord c >= 64 andalso ord c <= 95); val scan_dig = Scan.one Symbol.is_digit; val scan_esc = keep_line ($$ "\\") ^^ !! (fn ((n, _), _) => lex_err n "bad escape sequence in string") (keep_line ($$ "n" || $$ "t" || $$ "\"" || $$ "\\" || scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig) || (Scan.repeat1 scan_blank >> implode) ^^ keep_line ($$ "\\")); val scan_str = scan_esc || scan_blank >> K Symbol.space || keep_line (Scan.one (not_equal "\"" andf Symbol.not_eof)); val scan_string = keep_line ($$ "\"") ^^ !! (fn ((n, _), _) => lex_err n "missing quote at end of string") ((Scan.repeat scan_str >> implode) ^^ keep_line ($$ "\"")); (* scan verbatim text *) val scan_verb = scan_blank || keep_line ($$ "|" --| Scan.ahead (Scan.one (not_equal "}"))) || keep_line (Scan.one (not_equal "|" andf Symbol.not_eof)); val scan_verbatim = keep_line ($$ "{" -- $$ "|") |-- !! (fn ((n, _), _) => lex_err n "missing end of verbatim text") ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "|" -- $$ "}")); (* scan nested comments *) val scan_cmt = Scan.lift scan_blank || Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) || Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) || Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")")))) || Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_eof))); val scan_comment = keep_line ($$ "(" -- $$ "*") |-- !! (fn ((n, _), _) => lex_err n "missing end of comment") (Scan.pass 0 (Scan.repeat scan_cmt) |-- keep_line ($$ "*" -- $$ ")") >> K ""); (* scan token *) fun token k NONE x = (k, x, 0) | token k (SOME n) x = (k, x, n); fun scan_tok lex (n, cs) = (scan_string >> token String n || scan_verbatim >> token Verbatim n || Scan.repeat1 scan_blank >> (token Ignore n o implode) || scan_comment >> token Ignore n || keep_line (Scan.max (fn ((_, x, _), (_, x', _)) => x <= x') (Scan.literal lex >> (token Keyword n o implode)) (Syntax.scan_longid >> token LongIdent n || Syntax.scan_id >> token Ident n || Syntax.scan_var >> token Var n || Syntax.scan_tid >> token TypeVar n || Syntax.scan_nat >> token Nat n))) (n, cs); val scan_rest = Scan.any Symbol.not_eof >> implode; fun scan_token lex x = (case scan_tok lex x of ((Keyword, "ML", n), x') => (keep_line scan_rest >> token Verbatim (SOME n)) x' | tok_x' => tok_x'); (* tokenize *) fun tokenize lex chs = let val scan_toks = Scan.repeat (scan_token lex); val ignore = fn (Ignore, _, _) => true | _ => false; in (case Scan.error (Scan.finite' Symbol.stopper scan_toks) (SOME 1, chs) of (toks, (n, [])) => filter_out ignore toks @ [token EOF n "end-of-file"] | (_, (n, cs)) => error (lex_err n ("Bad input " ^ quote (Symbol.beginning 10 cs)))) end; end;