(* $Id: main.ml 674 2004-06-13 12:33:48Z gerd $
 * ----------------------------------------------------------------------
 *
 *)


open Uni_types
open Ucs2_to_utf8
open Printf

type lextool =
    [ `OCAMLLEX | `WLEX | `ULEX ]

type config =
    { mutable char_classes_file : string;
      mutable encoding : Netconversion.encoding;
      mutable encoding_name : string;
      mutable lex_src_file : string;
      mutable link_src_file : string;
      mutable out_format : lextool;
      mutable out_multiple : bool;
      mutable out_lex_prefix : string;
      mutable out_link_prefix : string;
    }

type char_classes =
    CC_generic of (definition list * string)
      (* regexp definitions, section LET *)
  | CC_wlex of (string * string)
      (* section CLASSES, section LETS *)

let get_char_classes cfg =
  let filename =  cfg.char_classes_file in
  let cc_sections = Lexpp_file.read_sections filename in
  let type_generic = List.mem_assoc "TYPE_GENERIC" cc_sections in
  let type_wlex = List.mem_assoc "TYPE_WLEX" cc_sections in
  ( match type_generic, type_wlex with
	false, false ->
	  failwith ("File " ^ filename ^ ": type indicator is missing");
      | true, false
      | false, true ->
	  ()
      | _, _ ->
	  failwith ("File " ^ filename ^ ": type indicator is ambiguous");
  );

  let let_section =
    try List.assoc "LET" cc_sections
    with Not_found -> failwith ("File " ^ filename ^ ": no LET section") in

  match () with
      () when type_generic ->
	let let_unicode_section =
	  try List.assoc "LET_UNICODE" cc_sections
	  with Not_found -> failwith ("File " ^ filename ^ ": no LET_UNICODE section") in

	CC_generic(Lexpp_file.parse_char_classes let_unicode_section, let_section);
    | () when type_wlex ->
	let classes_section =
	  try List.assoc "CLASSES" cc_sections
	  with Not_found -> failwith ("File " ^ filename ^ ": no CLASSES section")
	in
	CC_wlex(classes_section, let_section)

    | () ->
	assert false (* programming error *)
;;


let recode_char_classes_8bit cfg cc =
  (* Works only for 8 bit character encodings. *)
  let out_enc = cfg.encoding in

  let recode_char n =
    try
      let s = Netconversion.makechar out_enc n in (* or Not_found *)
      if String.length s <> 1 then
	failwith("Character " ^  string_of_int n ^
		 " has a multibyte representation");
      Some(Char.code s.[0])
    with
	Not_found ->
	  None
  in

  let rec recode_interval cur_ival cur_code last_code =
    if cur_code <= last_code then begin
      match recode_char cur_code with
	  None ->
	    recode_interval cur_ival (cur_code+1) last_code
	| Some p ->
	    ( match cur_ival with
		  None ->
		    recode_interval (Some(p,p)) (cur_code+1) last_code
		| Some(m,n) ->
		    if n+1 = p then
		      recode_interval (Some(m,p)) (cur_code+1) last_code
		    else
		      (m,n) :: recode_interval (Some(p,p)) (cur_code+1) last_code
	    )
    end
    else
      match cur_ival with
	  None ->
	    []
	| Some(m,n) ->
	    [m,n]
  in

  let rec recode_regexp re =
    (* recodes a regexp to a regexp list *)
    match re with
	Char n ->
	  ( match recode_char n with
		Some p -> [Char p]
	      | None   -> []
	  )
      | Interval(m,n) ->
	  List.map
	    (fun (m,n) ->
	       if m=n then Char m else Interval(m,n)
	    )
	    (recode_interval None m n)
      | Identifier id ->
	  [Identifier id]
      | Concat _ ->
	  assert false      (* not used *)
  in

  let recode_def { id = id; rel = rel } =
    { id = id; rel = List.flatten (List.map recode_regexp rel) }
  in

  match cc with
      CC_wlex(_,_) ->
	assert false
    | CC_generic(defs, let_section) ->
	CC_generic(List.map recode_def defs, let_section)
;;


let recode_char_classes cfg cc =
  (* FUTURE: Use Netconversion.is_ascii_compatible and is_single_byte *)
  printf
    "[Recoding character classes to %s]\n"
    (Netconversion.string_of_encoding cfg.encoding);
  flush stdout;
  match cfg.encoding with
      `Enc_utf8 ->
	( match cc with
	      CC_generic(defs, let_section) ->
		CC_generic(List.map ucs2_to_utf8 defs, let_section)
	    | CC_wlex(_,_) ->
		failwith "Char classes of type wlex not compatible with output format"
	)
    | (`Enc_java | `Enc_utf16 | `Enc_utf16_le | `Enc_utf16_be) ->
	failwith "This character encoding is not supported!"
    | _ ->
	(* May not work... *)
	recode_char_classes_8bit cfg cc
;;


let space_re =
  Netstring_str.regexp "[ \t\r\n]+" ;;


let name_of_rule rule_str =
  (* Extract the name of the rule definition. The name is the first word
   * in [rule_str] outside of comments.
   * FIXME: Comments containing inner comments or code are not supported.
   *)
  let rec skip_comments l =
    match l with
	"*)" :: l' -> l'
      | _ :: l' -> skip_comments l'
      | [] -> failwith ("Unfinished comment in: " ^ rule_str)
  and find_first_word l =
    match l with
	"(*" :: l' -> find_first_word (skip_comments l')
      | w :: l' -> w
      | [] -> failwith ("Cannot determine rule name in: " ^ rule_str)
  in

  let words = Netstring_str.split space_re rule_str in
  find_first_word words
;;


type mlltok = Mll_lexer.mlltok


let parse_term str =
  let rec norm need_sep term =
    match term with
	(`Sep s1) :: (`Sep s2) :: term' ->
	  norm need_sep (`Sep(s1^s2) :: term')
      | (`Sep _  as sep) :: term' ->
	  sep :: norm false term'
      | tok :: term' ->
	  if need_sep then
	    (`Sep "") :: tok :: norm false term'
	  else
	    tok :: norm true term'
      | [] ->
	  []
  in
  let rec rem_comments term =
    match term with
	`Comment _ :: term' ->
	  rem_comments term'
      | tok :: term' ->
	  tok :: rem_comments term'
      | [] ->
	  []
  in
  let lexbuf = Lexing.from_string str in
  let term = Mll_lexer.recurse_until `EOF Mll_lexer.definition lexbuf in
  norm true (rem_comments term)
;;


let string_of_term (term : mlltok list) =
  let buf = Buffer.create 1000 in

  let rec print outermost last_tok (term : mlltok list) =
    match term with
	tok :: term' ->
	  ( match tok with
		`Sep s ->
		  if not outermost || (last_tok <> `EOF && term' <> []) then
		    Buffer.add_string buf s
		      (* don't print at the beginning and at the end *)
	      | `Comment l ->
		  Buffer.add_char buf ' '
	      | `Brace(_,l) ->
		  Buffer.add_char buf '{';
		  print false `EOF l;
		  Buffer.add_char buf '}';
	      | `Paren l ->
		  Buffer.add_char buf '(';
		  print false `EOF l;
		  Buffer.add_char buf ')'
	      | `Bracket l ->
		  Buffer.add_char buf '[';
		  print false `EOF l;
		  Buffer.add_char buf ']'
	      | `Stringliteral s ->
		  Buffer.add_char buf '"';
		  Buffer.add_string buf s;
		  Buffer.add_char buf '"';
	      | `Charliteral s ->
		  Buffer.add_char buf '\'';
		  Buffer.add_string buf s;
		  Buffer.add_char buf '\'';
	      | `Char c ->
		  Buffer.add_char buf c
	      | `Ident name ->
		  Buffer.add_string buf name
	      | `EOF
	      | `E_Brace
	      | `E_Bracket
	      | `E_Comment
	      | `E_Paren ->
		  ()
	  );
	  print outermost tok term'
      | [] ->
	  ()
  in
  print true `EOF term;
  Buffer.contents buf
;;


let transform_let_to_ulex let_str =
  (* Transforms a regexp "let" definition to ulex syntax *)
  let term = parse_term let_str in
  let rec transform term =
    match term with
	`Ident "let" :: term' ->
	  `Sep "\n" :: `Ident "let" :: `Sep " " :: `Ident "regexp" ::
	  transform term'
      | tok :: term' ->
	  tok :: transform term'
      | [] ->
	  []
  in
  string_of_term (transform term)
;;


let transform_rule_to_ulex rule_str =
  (* Transforms the rule into ulex syntax *)
  let term = parse_term rule_str in
  let rec transform term =
    match term with
      | `Ident "parse" :: term' ->
	  `Ident "lexer" :: transform term'
      | `Brace (_,l) :: term' ->
	  `Sep " " :: `Char '-' :: `Char '>' :: `Sep " " :: `Paren l ::
	  transform term'
      | tok :: term' ->
	  tok :: transform term'
      | [] ->
	  []
  in
  string_of_term (transform term)
;;


let subst_re = Netstring_str.regexp "[$][{]\\([a-zA-Z0-9_]+\\)[}]";;

let subst_link_pattern lookup link_str =
  (* Substitutes the pattern ${name} by the value returned by the function
   * [lookup], applied on the name.
   *)
  Netstring_str.global_substitute
    subst_re
    (fun r s ->
       let name = Netstring_str.matched_group r 1 s in
       lookup name
    )
    link_str
;;


let open_out_ann name =
  printf "[writing %s]\n" name;
  flush stdout;
  let f = open_out name in
  output_string f
    "(* THIS FILE IS GENERATED BY LEXPP. DO NOT EDIT MANUALLY! *)\n\n";
  f
;;


let write_output_files cfg cc =
  let lex_src  = Lexpp_file.read_sections cfg.lex_src_file in
  let link_src = Lexpp_file.read_sections cfg.link_src_file in

  let write_header out =
    match cc with
	CC_generic(defs,let_str) ->
	  ( match cfg.out_format with
		`OCAMLLEX ->
		  output_string out "{\n";
		  if List.mem_assoc "HEADER" lex_src then (
		    output_string out (List.assoc "HEADER" lex_src);
		  );
		  if List.mem_assoc "HEADER_OCAMLLEX" lex_src then (
		    output_string out (List.assoc "HEADER_OCAMLLEX" lex_src);
		  );
		  output_string out "}\n";
		  List.iter (Lexpp_file.print_definition out) defs;
		  output_string out let_str;
		  if List.mem_assoc "LET" lex_src then
		    output_string out (List.assoc "LET" lex_src);
	      | `ULEX ->
		  if List.mem_assoc "HEADER" lex_src then (
		    output_string out (List.assoc "HEADER" lex_src);
		  );
		  if List.mem_assoc "HEADER_ULEX" lex_src then (
		    output_string out (List.assoc "HEADER_ULEX" lex_src);
		  );
		  List.iter (Lexpp_file.print_ulex_definition out) defs;
		  output_string out (transform_let_to_ulex let_str);
		  if List.mem_assoc "LET" lex_src then
		    let s = List.assoc "LET" lex_src in
		    output_string out (transform_let_to_ulex s)
	      | `WLEX ->
		  failwith "Output format wlex is incompatible with generic char classes"
	  )
      | CC_wlex(classes_str,let_str) ->
	  if cfg.out_format <> `WLEX then
	    failwith "This output format is incompatible with wlex char classes";
	  output_string out classes_str;
	  output_string out "{\n";
	  if List.mem_assoc "HEADER" lex_src then (
	    output_string out (List.assoc "HEADER" lex_src);
	  );
	  if List.mem_assoc "HEADER_WLEX" lex_src then (
	    output_string out (List.assoc "HEADER_WLEX" lex_src);
	  );
	  output_string out "}\n";
	  output_string out let_str;
	  if List.mem_assoc "LET" lex_src then
	    output_string out (List.assoc "LET" lex_src);
  in

  let write_rule out is_first_rule rule_str =
    match cfg.out_format with
	`OCAMLLEX
      | `WLEX ->
	  if is_first_rule then
	    output_string out "rule "
	  else
	    output_string out "and ";
	  output_string out rule_str
      | `ULEX ->
	  output_string out "\nlet ";
	  output_string out (transform_rule_to_ulex rule_str)
  in

  let suffix =
    match cfg.out_format with
	`OCAMLLEX
      | `WLEX     ->  ".mll"
      | `ULEX     ->  ".ml"
  in

  let module_of_rule = Hashtbl.create 10 in

  if cfg.out_multiple then begin
    let n = ref 1 in
    List.iter
      (fun (_,rule_str) ->
	 let rule_name = name_of_rule rule_str in
	 let mod_name = sprintf "%s_%02d" cfg.out_lex_prefix !n in
	 Hashtbl.add module_of_rule rule_name mod_name;
	 let out = open_out_ann (mod_name ^ suffix) in
	 write_header out;
	 write_rule out true rule_str;
	 close_out out;
	 incr n;
      )
      (List.filter (fun (name,str) -> name = "RULE") lex_src);
  end
  else begin
    let out = open_out_ann (cfg.out_lex_prefix ^ "_01" ^ suffix) in
    write_header out;
    let is_first = ref true in
    List.iter
      (fun (_,rule_str) ->
	 let rule_name = name_of_rule rule_str in
	 Hashtbl.add module_of_rule rule_name (cfg.out_lex_prefix ^ "_01");
	 write_rule out !is_first rule_str;
	 is_first := false
      )
      (List.filter (fun (name,str) -> name = "RULE") lex_src);
    close_out out
  end;

  (*
    Hashtbl.iter
    (fun r m ->
    printf "rule %s => module %s\n" r m)
    module_of_rule;
  *)

  let link_str =
    try List.assoc "LINK" link_src
    with Not_found ->
      failwith ("Section LINK is missing in " ^ cfg.link_src_file) in

  let lookup name =
    if name = "encoding" then
      cfg.encoding_name
    else
      let filename =
	try Hashtbl.find module_of_rule name
	with Not_found ->
	  failwith ("No such rule: " ^  name)
      in
      String.capitalize (Filename.basename filename)
  in

  let link_str' = subst_link_pattern lookup link_str in
  let out = open_out_ann (cfg.out_link_prefix ^ ".ml") in
  output_string out link_str';
  close_out out
;;


let main() =
  let cfg =
    { char_classes_file = "cc.def";
      encoding = `Enc_iso88591;
      encoding_name = "iso88591";
      lex_src_file = "lex.src";
      link_src_file = "link.src";
      out_format = `OCAMLLEX;
      out_multiple = false;
      out_lex_prefix = "out";
      out_link_prefix = "out_link";
    }
  in

  let pformat s =
    match s with
	"ocamllex" -> `OCAMLLEX
      | "wlex"     -> `WLEX
      | "ulex"     -> `ULEX
      | _          -> raise (Arg.Bad ("Unknown output format: " ^ s))
  in

  Arg.parse
      [ "-charclasses",  Arg.String (fun s -> cfg.char_classes_file <- s),
	             "<file>     The name of the character classes input file";
	"-lexsrc", Arg.String (fun s -> cfg.lex_src_file <- s),
	        "<file>          The name of the lex source input file";
	"-linksrc", Arg.String (fun s -> cfg.link_src_file <- s),
	         "<file>         The name of the link source input file";
	"-encoding", Arg.String (fun s ->
				   cfg.encoding <-
				     Netconversion.encoding_of_string s;
				   cfg.encoding_name <- s;
				),
	          "<name>        The character encoding";
	"-outformat", Arg.String (fun s -> cfg.out_format <- pformat s),
	           "(ocamllex|wlex|ulex) Output format";
	"-outlexprefix", Arg.String (fun s -> cfg.out_lex_prefix <- s),
	              "<prefix>  The common prefix of the lexer files";
	"-outlinkprefix", Arg.String (fun s -> cfg.out_link_prefix <- s),
	               "<prefix> The prefix of the link file";
	"-multiple", Arg.Unit (fun () -> cfg.out_multiple <- true),
	          "              Generate multiple output files";
      ]
      (fun _ -> raise(Arg.Bad("Bad usage!")))
      "usage: lexpp <options>";

  let cc = get_char_classes cfg in
  let cc' =
    match cfg.out_format with
	`OCAMLLEX ->
	  recode_char_classes cfg cc
      | _ ->
	  cc in
  write_output_files cfg cc'
;;


main();;