(* $Id: main.ml 674 2004-06-13 12:33:48Z gerd $ * ---------------------------------------------------------------------- * *) open Uni_types open Ucs2_to_utf8 open Printf type lextool = [ `OCAMLLEX | `WLEX | `ULEX ] type config = { mutable char_classes_file : string; mutable encoding : Netconversion.encoding; mutable encoding_name : string; mutable lex_src_file : string; mutable link_src_file : string; mutable out_format : lextool; mutable out_multiple : bool; mutable out_lex_prefix : string; mutable out_link_prefix : string; } type char_classes = CC_generic of (definition list * string) (* regexp definitions, section LET *) | CC_wlex of (string * string) (* section CLASSES, section LETS *) let get_char_classes cfg = let filename = cfg.char_classes_file in let cc_sections = Lexpp_file.read_sections filename in let type_generic = List.mem_assoc "TYPE_GENERIC" cc_sections in let type_wlex = List.mem_assoc "TYPE_WLEX" cc_sections in ( match type_generic, type_wlex with false, false -> failwith ("File " ^ filename ^ ": type indicator is missing"); | true, false | false, true -> () | _, _ -> failwith ("File " ^ filename ^ ": type indicator is ambiguous"); ); let let_section = try List.assoc "LET" cc_sections with Not_found -> failwith ("File " ^ filename ^ ": no LET section") in match () with () when type_generic -> let let_unicode_section = try List.assoc "LET_UNICODE" cc_sections with Not_found -> failwith ("File " ^ filename ^ ": no LET_UNICODE section") in CC_generic(Lexpp_file.parse_char_classes let_unicode_section, let_section); | () when type_wlex -> let classes_section = try List.assoc "CLASSES" cc_sections with Not_found -> failwith ("File " ^ filename ^ ": no CLASSES section") in CC_wlex(classes_section, let_section) | () -> assert false (* programming error *) ;; let recode_char_classes_8bit cfg cc = (* Works only for 8 bit character encodings. *) let out_enc = cfg.encoding in let recode_char n = try let s = Netconversion.makechar out_enc n in (* or Not_found *) if String.length s <> 1 then failwith("Character " ^ string_of_int n ^ " has a multibyte representation"); Some(Char.code s.[0]) with Not_found -> None in let rec recode_interval cur_ival cur_code last_code = if cur_code <= last_code then begin match recode_char cur_code with None -> recode_interval cur_ival (cur_code+1) last_code | Some p -> ( match cur_ival with None -> recode_interval (Some(p,p)) (cur_code+1) last_code | Some(m,n) -> if n+1 = p then recode_interval (Some(m,p)) (cur_code+1) last_code else (m,n) :: recode_interval (Some(p,p)) (cur_code+1) last_code ) end else match cur_ival with None -> [] | Some(m,n) -> [m,n] in let rec recode_regexp re = (* recodes a regexp to a regexp list *) match re with Char n -> ( match recode_char n with Some p -> [Char p] | None -> [] ) | Interval(m,n) -> List.map (fun (m,n) -> if m=n then Char m else Interval(m,n) ) (recode_interval None m n) | Identifier id -> [Identifier id] | Concat _ -> assert false (* not used *) in let recode_def { id = id; rel = rel } = { id = id; rel = List.flatten (List.map recode_regexp rel) } in match cc with CC_wlex(_,_) -> assert false | CC_generic(defs, let_section) -> CC_generic(List.map recode_def defs, let_section) ;; let recode_char_classes cfg cc = (* FUTURE: Use Netconversion.is_ascii_compatible and is_single_byte *) printf "[Recoding character classes to %s]\n" (Netconversion.string_of_encoding cfg.encoding); flush stdout; match cfg.encoding with `Enc_utf8 -> ( match cc with CC_generic(defs, let_section) -> CC_generic(List.map ucs2_to_utf8 defs, let_section) | CC_wlex(_,_) -> failwith "Char classes of type wlex not compatible with output format" ) | (`Enc_java | `Enc_utf16 | `Enc_utf16_le | `Enc_utf16_be) -> failwith "This character encoding is not supported!" | _ -> (* May not work... *) recode_char_classes_8bit cfg cc ;; let space_re = Netstring_str.regexp "[ \t\r\n]+" ;; let name_of_rule rule_str = (* Extract the name of the rule definition. The name is the first word * in [rule_str] outside of comments. * FIXME: Comments containing inner comments or code are not supported. *) let rec skip_comments l = match l with "*)" :: l' -> l' | _ :: l' -> skip_comments l' | [] -> failwith ("Unfinished comment in: " ^ rule_str) and find_first_word l = match l with "(*" :: l' -> find_first_word (skip_comments l') | w :: l' -> w | [] -> failwith ("Cannot determine rule name in: " ^ rule_str) in let words = Netstring_str.split space_re rule_str in find_first_word words ;; type mlltok = Mll_lexer.mlltok let parse_term str = let rec norm need_sep term = match term with (`Sep s1) :: (`Sep s2) :: term' -> norm need_sep (`Sep(s1^s2) :: term') | (`Sep _ as sep) :: term' -> sep :: norm false term' | tok :: term' -> if need_sep then (`Sep "") :: tok :: norm false term' else tok :: norm true term' | [] -> [] in let rec rem_comments term = match term with `Comment _ :: term' -> rem_comments term' | tok :: term' -> tok :: rem_comments term' | [] -> [] in let lexbuf = Lexing.from_string str in let term = Mll_lexer.recurse_until `EOF Mll_lexer.definition lexbuf in norm true (rem_comments term) ;; let string_of_term (term : mlltok list) = let buf = Buffer.create 1000 in let rec print outermost last_tok (term : mlltok list) = match term with tok :: term' -> ( match tok with `Sep s -> if not outermost || (last_tok <> `EOF && term' <> []) then Buffer.add_string buf s (* don't print at the beginning and at the end *) | `Comment l -> Buffer.add_char buf ' ' | `Brace(_,l) -> Buffer.add_char buf '{'; print false `EOF l; Buffer.add_char buf '}'; | `Paren l -> Buffer.add_char buf '('; print false `EOF l; Buffer.add_char buf ')' | `Bracket l -> Buffer.add_char buf '['; print false `EOF l; Buffer.add_char buf ']' | `Stringliteral s -> Buffer.add_char buf '"'; Buffer.add_string buf s; Buffer.add_char buf '"'; | `Charliteral s -> Buffer.add_char buf '\''; Buffer.add_string buf s; Buffer.add_char buf '\''; | `Char c -> Buffer.add_char buf c | `Ident name -> Buffer.add_string buf name | `EOF | `E_Brace | `E_Bracket | `E_Comment | `E_Paren -> () ); print outermost tok term' | [] -> () in print true `EOF term; Buffer.contents buf ;; let transform_let_to_ulex let_str = (* Transforms a regexp "let" definition to ulex syntax *) let term = parse_term let_str in let rec transform term = match term with `Ident "let" :: term' -> `Sep "\n" :: `Ident "let" :: `Sep " " :: `Ident "regexp" :: transform term' | tok :: term' -> tok :: transform term' | [] -> [] in string_of_term (transform term) ;; let transform_rule_to_ulex rule_str = (* Transforms the rule into ulex syntax *) let term = parse_term rule_str in let rec transform term = match term with | `Ident "parse" :: term' -> `Ident "lexer" :: transform term' | `Brace (_,l) :: term' -> `Sep " " :: `Char '-' :: `Char '>' :: `Sep " " :: `Paren l :: transform term' | tok :: term' -> tok :: transform term' | [] -> [] in string_of_term (transform term) ;; let subst_re = Netstring_str.regexp "[$][{]\\([a-zA-Z0-9_]+\\)[}]";; let subst_link_pattern lookup link_str = (* Substitutes the pattern ${name} by the value returned by the function * [lookup], applied on the name. *) Netstring_str.global_substitute subst_re (fun r s -> let name = Netstring_str.matched_group r 1 s in lookup name ) link_str ;; let open_out_ann name = printf "[writing %s]\n" name; flush stdout; let f = open_out name in output_string f "(* THIS FILE IS GENERATED BY LEXPP. DO NOT EDIT MANUALLY! *)\n\n"; f ;; let write_output_files cfg cc = let lex_src = Lexpp_file.read_sections cfg.lex_src_file in let link_src = Lexpp_file.read_sections cfg.link_src_file in let write_header out = match cc with CC_generic(defs,let_str) -> ( match cfg.out_format with `OCAMLLEX -> output_string out "{\n"; if List.mem_assoc "HEADER" lex_src then ( output_string out (List.assoc "HEADER" lex_src); ); if List.mem_assoc "HEADER_OCAMLLEX" lex_src then ( output_string out (List.assoc "HEADER_OCAMLLEX" lex_src); ); output_string out "}\n"; List.iter (Lexpp_file.print_definition out) defs; output_string out let_str; if List.mem_assoc "LET" lex_src then output_string out (List.assoc "LET" lex_src); | `ULEX -> if List.mem_assoc "HEADER" lex_src then ( output_string out (List.assoc "HEADER" lex_src); ); if List.mem_assoc "HEADER_ULEX" lex_src then ( output_string out (List.assoc "HEADER_ULEX" lex_src); ); List.iter (Lexpp_file.print_ulex_definition out) defs; output_string out (transform_let_to_ulex let_str); if List.mem_assoc "LET" lex_src then let s = List.assoc "LET" lex_src in output_string out (transform_let_to_ulex s) | `WLEX -> failwith "Output format wlex is incompatible with generic char classes" ) | CC_wlex(classes_str,let_str) -> if cfg.out_format <> `WLEX then failwith "This output format is incompatible with wlex char classes"; output_string out classes_str; output_string out "{\n"; if List.mem_assoc "HEADER" lex_src then ( output_string out (List.assoc "HEADER" lex_src); ); if List.mem_assoc "HEADER_WLEX" lex_src then ( output_string out (List.assoc "HEADER_WLEX" lex_src); ); output_string out "}\n"; output_string out let_str; if List.mem_assoc "LET" lex_src then output_string out (List.assoc "LET" lex_src); in let write_rule out is_first_rule rule_str = match cfg.out_format with `OCAMLLEX | `WLEX -> if is_first_rule then output_string out "rule " else output_string out "and "; output_string out rule_str | `ULEX -> output_string out "\nlet "; output_string out (transform_rule_to_ulex rule_str) in let suffix = match cfg.out_format with `OCAMLLEX | `WLEX -> ".mll" | `ULEX -> ".ml" in let module_of_rule = Hashtbl.create 10 in if cfg.out_multiple then begin let n = ref 1 in List.iter (fun (_,rule_str) -> let rule_name = name_of_rule rule_str in let mod_name = sprintf "%s_%02d" cfg.out_lex_prefix !n in Hashtbl.add module_of_rule rule_name mod_name; let out = open_out_ann (mod_name ^ suffix) in write_header out; write_rule out true rule_str; close_out out; incr n; ) (List.filter (fun (name,str) -> name = "RULE") lex_src); end else begin let out = open_out_ann (cfg.out_lex_prefix ^ "_01" ^ suffix) in write_header out; let is_first = ref true in List.iter (fun (_,rule_str) -> let rule_name = name_of_rule rule_str in Hashtbl.add module_of_rule rule_name (cfg.out_lex_prefix ^ "_01"); write_rule out !is_first rule_str; is_first := false ) (List.filter (fun (name,str) -> name = "RULE") lex_src); close_out out end; (* Hashtbl.iter (fun r m -> printf "rule %s => module %s\n" r m) module_of_rule; *) let link_str = try List.assoc "LINK" link_src with Not_found -> failwith ("Section LINK is missing in " ^ cfg.link_src_file) in let lookup name = if name = "encoding" then cfg.encoding_name else let filename = try Hashtbl.find module_of_rule name with Not_found -> failwith ("No such rule: " ^ name) in String.capitalize (Filename.basename filename) in let link_str' = subst_link_pattern lookup link_str in let out = open_out_ann (cfg.out_link_prefix ^ ".ml") in output_string out link_str'; close_out out ;; let main() = let cfg = { char_classes_file = "cc.def"; encoding = `Enc_iso88591; encoding_name = "iso88591"; lex_src_file = "lex.src"; link_src_file = "link.src"; out_format = `OCAMLLEX; out_multiple = false; out_lex_prefix = "out"; out_link_prefix = "out_link"; } in let pformat s = match s with "ocamllex" -> `OCAMLLEX | "wlex" -> `WLEX | "ulex" -> `ULEX | _ -> raise (Arg.Bad ("Unknown output format: " ^ s)) in Arg.parse [ "-charclasses", Arg.String (fun s -> cfg.char_classes_file <- s), " The name of the character classes input file"; "-lexsrc", Arg.String (fun s -> cfg.lex_src_file <- s), " The name of the lex source input file"; "-linksrc", Arg.String (fun s -> cfg.link_src_file <- s), " The name of the link source input file"; "-encoding", Arg.String (fun s -> cfg.encoding <- Netconversion.encoding_of_string s; cfg.encoding_name <- s; ), " The character encoding"; "-outformat", Arg.String (fun s -> cfg.out_format <- pformat s), "(ocamllex|wlex|ulex) Output format"; "-outlexprefix", Arg.String (fun s -> cfg.out_lex_prefix <- s), " The common prefix of the lexer files"; "-outlinkprefix", Arg.String (fun s -> cfg.out_link_prefix <- s), " The prefix of the link file"; "-multiple", Arg.Unit (fun () -> cfg.out_multiple <- true), " Generate multiple output files"; ] (fun _ -> raise(Arg.Bad("Bad usage!"))) "usage: lexpp "; let cc = get_char_classes cfg in let cc' = match cfg.out_format with `OCAMLLEX -> recode_char_classes cfg cc | _ -> cc in write_output_files cfg cc' ;; main();;