(**************************************************************************) (* The CDuce compiler *) (* Alain Frisch and the CDuce team *) (* Copyright CNRS,INRIA, 2003,2004,2005,2006 (see LICENSE for details) *) (**************************************************************************) module L = Ulexing let keywords : (string,unit) Hashtbl.t = Hashtbl.create 17 exception Error of int * int * string let error i j s = raise (Error (i,j,s)) (* Buffer for string literals *) let string_buff = Buffer.create 1024 let store_lexeme lexbuf = Buffer.add_string string_buff (Ulexing.utf8_lexeme lexbuf) let store_ascii = Buffer.add_char string_buff let store_code = Utf8.store string_buff let clear_buff () = Buffer.clear string_buff let get_stored_string () = let s = Buffer.contents string_buff in clear_buff (); Buffer.clear string_buff; s let enc = ref L.Latin1 (* Parse characters literals \123; \x123; *) let hexa_digit = function | '0'..'9' as c -> (Char.code c) - (Char.code '0') | 'a'..'f' as c -> (Char.code c) - (Char.code 'a') + 10 | 'A'..'F' as c -> (Char.code c) - (Char.code 'A') + 10 | _ -> -1 let parse_char lexbuf base i = let s = L.latin1_sub_lexeme lexbuf i (L.lexeme_length lexbuf - i - 1) in let r = ref 0 in for i = 0 to String.length s - 1 do let c = hexa_digit s.[i] in if (c >= base) || (c < 0) then error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "invalid digit"; r := !r * base + c; done; !r let regexp ncname_char = xml_letter | xml_digit | [ '-' '_' ] | xml_combining_char | xml_extender | "\\." let regexp ncname = (xml_letter ncname_char*) | ('_' ncname_char+) let regexp qname = (ncname ':')? ncname let illegal lexbuf = error (L.lexeme_start lexbuf) (L.lexeme_end lexbuf) "Illegal character" let in_comment = ref false let return lexbuf tok = (tok, L.loc lexbuf) let return_loc i j tok = (tok, (i,j)) let rec token = lexer | xml_blank+ -> token lexbuf | qname -> let s = L.utf8_lexeme lexbuf in return lexbuf (if Hashtbl.mem keywords s then "",s else "IDENT",s) | ncname ":*" -> let s = L.utf8_sub_lexeme lexbuf 0 (L.lexeme_length lexbuf - 2) in return lexbuf ("ANY_IN_NS", s) | ".:*" -> return lexbuf ("ANY_IN_NS", "") | '-'? ['0'-'9']+ -> return lexbuf ("INT", L.utf8_lexeme lexbuf) | [ "<>=.,:;+-*/@&{}[]()|?`!" ] | "->" | "::" | ";;" | "--" | "//" | "/@" | ":=" | "\\" | "++" | "<=" | ">=" | "<<" | ">>" | "||" | "&&" | "**" | "_" | ".." | ["?+*"] "?" | "#" -> return lexbuf ("", L.utf8_lexeme lexbuf) | '"' | "'" -> let start = L.lexeme_start lexbuf in let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in string (L.lexeme_start lexbuf) double_quote lexbuf; return_loc start (L.lexeme_end lexbuf) ((if double_quote then "STRING2" else "STRING1"), (get_stored_string())) | "(*" -> in_comment := true; comment (L.lexeme_start lexbuf) lexbuf; in_comment := false; token lexbuf | "/*" -> in_comment := true; tcomment (L.lexeme_start lexbuf) lexbuf; in_comment := false; token lexbuf | eof -> return lexbuf ("EOI","") | _ -> illegal lexbuf and comment start = lexer | "(*" -> comment (L.lexeme_start lexbuf) lexbuf; comment start lexbuf | "*)" -> () | '"' | "'" -> let double_quote = L.latin1_lexeme_char lexbuf 0 = '"' in string (L.lexeme_start lexbuf) double_quote lexbuf; clear_buff (); comment start lexbuf | eof -> error start (start+2) "Unterminated comment" | _ -> comment start lexbuf and tcomment start = lexer | "*/" -> () | eof -> error start (start+2) "Unterminated comment" | _ -> tcomment start lexbuf and string start double = lexer | '"' | "'" -> let d = L.latin1_lexeme_char lexbuf 0 = '"' in if d != double then (store_lexeme lexbuf; string start double lexbuf) | '\\' ['\\' '"' '\''] -> store_ascii (L.latin1_lexeme_char lexbuf 1); string start double lexbuf | "\\n" -> store_ascii '\n'; string start double lexbuf | "\\t" -> store_ascii '\t'; string start double lexbuf | "\\r" -> store_ascii '\r'; string start double lexbuf | '\\' ['0'-'9']+ ';' -> store_code (parse_char lexbuf 10 1); string start double lexbuf | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F']+ ';' -> store_code (parse_char lexbuf 16 2); string start double lexbuf | '\\' -> illegal lexbuf; | eof -> error start (start+1) "Unterminated string" | _ -> store_lexeme lexbuf; string start double lexbuf let lexbuf = ref None let last_tok = ref ("","") let raise_clean e = clear_buff (); in_comment := false; (* reinit encoding ? *) raise e let pos_of_int i = { Lexing.pos_fname = ""; Lexing.pos_lnum = 0; Lexing.pos_bol = 0; Lexing.pos_cnum = i } let make_stream_and_location f = Token.make_stream_and_flocation (fun () -> let (tok,(i,j)) = f () in (tok, (pos_of_int i, pos_of_int j)) ) let tok_func cs = let lb = L.from_var_enc_stream enc cs in (lexer ("#!" [^ '\n']* "\n")? -> ()) lb; lexbuf := Some lb; let next () = let tok = try token lb with | Ulexing.Error -> raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb, "Unexpected character")) | Ulexing.InvalidCodepoint i -> raise_clean (Error (Ulexing.lexeme_end lb, Ulexing.lexeme_end lb, "Code point invalid for the current encoding")) | e -> raise_clean e in last_tok := fst tok; tok in make_stream_and_location next let register_kw (s1,s2) = if s1 = "" then match s2.[0] with | 'a' .. 'z' when not (Hashtbl.mem keywords s2) -> Hashtbl.add keywords s2 () | _ -> () let lex = { Token.tok_func = tok_func; Token.tok_using = register_kw; Token.tok_removing = (fun _ -> ()); Token.tok_match = Token.default_match; Token.tok_text = Token.lexer_text; Token.tok_comm = None; } let dump_file f = let ic = open_in f in let lexbuf = L.from_var_enc_channel enc ic in (try while true do let ((a,b),_) = token lexbuf in Printf.printf "%s: \"%s\"\n" a b; if a = "EOI" then exit 0 done with | Ulexing.Error -> Printf.eprintf "Lexing error at offset %i\n:Unexpected character\n" (Ulexing.lexeme_end lexbuf) | Error (i,j,s) -> Printf.eprintf "Lexing error at offset %i-%i:\n%s\n" i j s | Ulexing.InvalidCodepoint i -> Printf.eprintf "Lexing error at offset %i\n:Invalid code point for the current encoding\n" (Ulexing.lexeme_end lexbuf) ); close_in ic