(* $Id: pxp_reader.ml 689 2004-08-07 17:01:52Z gerd $ * ---------------------------------------------------------------------- * PXP: The polymorphic XML parser for Objective Caml. * Copyright by Gerd Stolpmann. See LICENSE for details. *) open Pxp_core_types;; open Netchannels;; exception Not_competent = Pxp_core_types.Not_competent;; exception Not_resolvable (* of exn *) = Pxp_core_types.Not_resolvable;; type lexer_source = { lsrc_lexbuf : Lexing.lexbuf Lazy.t; lsrc_unicode_lexbuf : Netulex.ULB.unicode_lexbuf Lazy.t; } let ensure_space_minimum p f g = (* The functional [f] is called with a function as argument that refills * a buffer string (think f = Lexing.from_function). This argument is [g], * i.e. in most cases this is the same as [f g]. However, it is ensured * that the string buffer has the minimum free space [p]. This is achieved * by using an auxiliary buffer. *) let buf = String.create p in let bufpos = ref 0 in let buflen = ref 0 in f (fun s n -> assert(n>0); if !buflen > 0 then ( let m = min n !buflen in String.blit buf !bufpos s 0 m; bufpos := !bufpos + m; buflen := !buflen - m; m ) else if n < p then ( let l = g buf p in let m = min l n in String.blit buf 0 s 0 m; bufpos := m; buflen := l-m; m ) else g s n ) ;; class type resolver = object method init_rep_encoding : rep_encoding -> unit method init_warner : symbolic_warnings option -> collect_warnings -> unit method rep_encoding : rep_encoding method open_in : ext_id -> lexer_source method open_rid : resolver_id -> lexer_source method active_id : resolver_id method close_in : unit (* method close_all : unit *) (* [close_all] is no longer supported in PXP 1.2 *) method change_encoding : string -> unit method clone : resolver end ;; let null_resolver = resolver_id_of_ext_id Anonymous ;; (* All components are None *) class virtual resolve_general = object (self) val mutable internal_encoding = `Enc_utf8 val mutable is_open = false val mutable mode = None (* Whether the [lexbuf_reader] or the [unicode_lexbuf_reader] * is used. One can only invoke one of them. *) val mutable encoding = `Enc_utf8 val mutable encoding_requested = false val mutable encoding_request_post = (fun () -> ()) (* post action *) val mutable active_id = null_resolver val mutable warner = new drop_warnings val mutable swarner = None val mutable enc_initialized = false val mutable wrn_initialized = false method init_rep_encoding e = internal_encoding <- e; enc_initialized <- true; method init_warner sw w = warner <- w; swarner <- sw; wrn_initialized <- true; method rep_encoding = (internal_encoding :> rep_encoding) method private warn (k:int) = (* Called if a character not representable has been found. * k is the character code. *) if k < 0xd800 or (k >= 0xe000 & k <= 0xfffd) or (k >= 0x10000 & k <= 0x10ffff) then begin warn swarner warner (`W_code_point_cannot_be_represented k); end else raise (WF_error("Code point " ^ string_of_int k ^ " outside the accepted range of code points")) method private autodetect s = (* s must be at least 4 bytes long. The slot 'encoding' is * set to: * "UTF-16-BE": UTF-16/UCS-2 encoding big endian * "UTF-16-LE": UTF-16/UCS-2 encoding little endian * "UTF-8": UTF-8 encoding * * Note: Four bytes are required for cases not yet handled * (e.g. UTF-32). *) if String.sub s 0 2 = "\254\255" then encoding <- `Enc_utf16 (* Note: Netconversion.recode will detect the big endianess, too *) else if String.sub s 0 2 = "\255\254" then encoding <- `Enc_utf16 (* Note: Netconversion.recode will detect the little endianess, too *) else encoding <- `Enc_utf8 method private virtual next_string : string -> int -> int -> int method private virtual init_in : resolver_id -> unit method virtual close_in : unit (* must reset is_open! *) method private lexbuf_reader () = if mode <> None && mode <> Some `Lexbuf then failwith "lexbuf_reader: other reader already working"; mode <- Some `Lexbuf; let direct_reader = ref false in (* whether to bypass the buffer *) let buf_max = 4096 in let buf = ref (String.make buf_max ' ') in let buf_beg = ref 0 in let buf_end = ref 0 in let buf_eof = ref false in (* The buffer is used if [not direct_reader]. [buf_beg] is the * beginning of the filled part of the buffer, [buf_end] is the * end (plus 1). [buf_eof] indicates that EOF was already signaled, * and should be processed after the current contents. *) let refill() = (* Refill the buffer from [next_string]. * It is important to refill as much as possible for the * algorithm below. *) while not !buf_eof && !buf_end < buf_max do let n = self # next_string !buf !buf_end (buf_max - !buf_end) in if n=0 then buf_eof := true else buf_end := !buf_end + n done in let convert s n max_chars = (* Convert characters from [buf] to [s], [n]. The number of characters * it limited by [max_chars] *) assert(n>=6); if !buf_beg + 6 > !buf_end && not !buf_eof then ( (* Less than 6 bytes in [buf]. We need at least this * number of bytes, which is the length of the longest * UTF-8 char, otherwise we cannot ensure to convert * at least one character. *) let m = !buf_end - !buf_beg in String.blit !buf !buf_beg !buf 0 m; buf_beg := 0; buf_end := m; refill(); ); (* It is still possible that there are less than 6 bytes * in [buf], but only if [buf_eof] is true at the same * time. *) let m = !buf_end - !buf_beg in if m=0 then ( assert !buf_eof; 0 (* EOF! *) ) else ( let (n_in, n_out, encoding') = Netconversion.recode ~in_enc:encoding ~in_buf:!buf ~in_pos:!buf_beg ~in_len:m ~out_enc:(internal_encoding : rep_encoding :> encoding) ~out_buf:s ~out_pos:0 ~out_len:n ~max_chars ~subst:(fun k -> self # warn k; "") in if n_in = 0 then ( (* An incomplete character at the end of the stream. * Note: This test assumes that there is one character to * convert in [buf], and that there is enough free space in [s]. * This is the case because we ensure [m>=6] and [n>=6]. *) assert !buf_eof; raise Netconversion.Malformed_code; ); encoding <- encoding'; buf_beg := !buf_beg + n_in; assert(n_out > 0); n_out ) in (* Fill the buffer initially. We start always with buffered reading, * but we try to switch to direct reading later. *) refill(); if !buf_end >= 4 && not encoding_requested then self # autodetect !buf; (* Ensure that [n >= 6], the longest UTF-8 character, so we can always * put at least one character into [s] *) ensure_space_minimum 6 Lexing.from_function (fun s n -> (* Fill the string [s] with at most [n] bytes. Return the number * of bytes, or 0 to signal EOF. *) if not is_open then failwith "trying to read from resolver, but resolver is not open"; if !direct_reader then self # next_string s 0 n else ( if encoding_requested then ( (* In this case, the encoding will not change any more. We * can read any number of characters at once. *) if encoding = (internal_encoding : rep_encoding :> encoding) then ( (* No conversion is needed. In order to speed up reading, * we are going to enable [direct_reader]. *) if !buf_beg < !buf_end then ( (* There are still bytes in [buf], return them first *) let m = min n (!buf_end - !buf_beg) in String.blit !buf !buf_beg s 0 m; buf_beg := !buf_beg + m; m ) else ( (* Either we are already at EOF, or we can switch to * [direct_reader]. *) buf := ""; (* Free buf, it will never be used again *) if !buf_eof then 0 else ( direct_reader := true; self # next_string s 0 n ) ) ) else ( (* Character conversion is needed from [encoding] to * [internal_encoding]. *) convert s n max_int ) ) else ( (* In this case, the encoding might change at any time. * Because of this, we only read one character at a time. *) convert s n 1 ) ) ) method private unicode_lexbuf_reader () = if mode <> None && mode <> Some `Unicode_lexbuf then failwith "unicode_lexbuf_reader: other reader already working"; mode <- Some `Unicode_lexbuf; let buf = Netbuffer.create 4 in (* Only used for autodetection! *) let buf_eof = ref false in if not encoding_requested then ( while not !buf_eof && Netbuffer.length buf < 4 do let n = Netbuffer.add_inplace ~len:(4-Netbuffer.length buf) buf self#next_string in buf_eof := (n=0) done; if Netbuffer.length buf >= 4 then self # autodetect (Netbuffer.contents buf); ); let lexbuf = Netulex.ULB.from_function ~enc_change_hook:( fun ulb -> encoding <- ulb.Netulex.ULB.ulb_encoding ) ~refill:( fun s p n -> (* Fill the string [s] at position [p] with at most [n] bytes. * Return the number of bytes, or 0 to signal EOF. *) if not is_open then failwith "trying to read from resolver, but resolver is not open"; if encoding_requested && Netbuffer.length buf = 0 then ( (* In this case, the encoding will not change any more. We * can read any number of characters at once. *) self # next_string s p n ) else ( (* In this case, the encoding might change at any time. * Because of this, we only read one character at a time. * If there are still characters in [buf], take these first. *) if Netbuffer.length buf > 0 then ( let c = (Netbuffer.contents buf).[0] in s.[p] <- c; Netbuffer.delete buf 0 1; 1 ) else ( if !buf_eof then 0 (* EOF already seen *) else self # next_string s p 1 ) ) ) encoding in encoding_request_post <- ( fun () -> (* This function is called after the encoding was requested for * the first time *) Netulex.ULB.set_encoding encoding lexbuf ); lexbuf method open_in xid = self # open_rid (resolver_id_of_ext_id xid) method open_rid rid = assert(enc_initialized && wrn_initialized); encoding <- `Enc_utf8; encoding_requested <- false; self # init_in rid; (* may raise Not_competent *) (* init_in: may already set 'encoding' *) is_open <- true; { lsrc_lexbuf = lazy(self # lexbuf_reader() ); lsrc_unicode_lexbuf = lazy(self # unicode_lexbuf_reader() ); } method change_encoding enc = if not is_open then failwith "#change_encoding: resolver is not open"; if not encoding_requested then begin if enc <> "" then begin match Netconversion.encoding_of_string enc with `Enc_utf16 -> (match encoding with (`Enc_utf16_le | `Enc_utf16_be) -> () | `Enc_utf16 -> assert false | _ -> raise(WF_error "Encoding of data stream and encoding declaration mismatch") ) | e -> encoding <- e end; (* else: the autodetected encoding counts *) encoding_requested <- true; encoding_request_post(); end; method active_id = if not is_open then failwith "#active_id: resolver is not open"; active_id end ;; type accepted_id = in_obj_channel * encoding option * resolver_id option ;; let close_ch (ch : in_obj_channel) = ch # close_in() ;; class resolve_to_any_obj_channel ?(close=close_ch) ~channel_of_id () = object(self) inherit resolve_general as super val f_open = channel_of_id val mutable current_channel = None val close = close method private init_in (id:resolver_id) = if current_channel <> None then failwith "Pxp_reader.resolve_to_any_obj_channel # init_in"; let ch, enc_opt, rid_opt = f_open id in (* may raise Not_competent *) begin match enc_opt with None -> () | Some enc -> encoding <- enc; encoding_requested <- true end; begin match rid_opt with None -> active_id <- id | Some r -> active_id <- r end; current_channel <- Some ch; method private next_string s ofs len = match current_channel with None -> failwith "Pxp_reader.resolve_read_any_channel # next_string" | Some ch -> try ch # input s ofs len with End_of_file -> 0 method close_in = is_open <- false; match current_channel with None -> () | Some ch -> close ch; current_channel <- None method clone = let c = new resolve_to_any_obj_channel ?close:(Some close) ~channel_of_id:f_open () in c # init_rep_encoding internal_encoding; c # init_warner swarner warner; (* clones <- c :: clones; *) (c :> resolver) end ;; let rid_rid_intersection bound_rid actual_rid = (* Returns a resolver_id where unequal IDs are reset to None. The * rid_system_base is set corresponding to rid_system. * * Notes: * (1) an empty SYSTEM name does not match another empty SYSTEM name * (2) PUBLIC names must be normalized *) let isect opt1 opt2 = if opt1 = opt2 then opt1 else None in let sys_isect opt1 opt2 = if opt1 = opt2 && opt1 <> Some "" then opt1 else None in { rid_private = isect bound_rid.rid_private actual_rid.rid_private; rid_public = isect bound_rid.rid_public actual_rid.rid_public; rid_system = sys_isect bound_rid.rid_system actual_rid.rid_system; rid_system_base = if bound_rid.rid_system = actual_rid.rid_system && bound_rid.rid_system <> None && bound_rid.rid_system <> Some "" then actual_rid.rid_system_base else None; } ;; let rid_matches_rid bound_rid actual_rid = (* definition: * rid_matches_rid r1 r2 =def= * rid_rid_intersection r1 r2 <> null_resolver * * See also the notes for rid_rid_intersection *) (bound_rid.rid_private <> None && bound_rid.rid_private = actual_rid.rid_private) || (bound_rid.rid_public <> None && bound_rid.rid_public = actual_rid.rid_public) || (bound_rid.rid_system <> None && bound_rid.rid_system <> Some "" && bound_rid.rid_system = actual_rid.rid_system) ;; let xid_rid_intersection bound_xid actual_rid = rid_rid_intersection (resolver_id_of_ext_id bound_xid) actual_rid ;; let xid_matches_rid bound_xid actual_rid = (* definition: * xid_matches_rid x r =def= * xid_rid_intersection x r <> null_resolver * * See also the notes for rid_rid_intersection *) match bound_xid with System sys -> sys <> "" && actual_rid.rid_system = Some sys | Public(pub,sys) -> (actual_rid.rid_public = Some pub) || (sys <> "" && actual_rid.rid_system = Some sys) | Anonymous -> false | Private p -> actual_rid.rid_private = Some p ;; let id_intersection bound_rid_opt bound_xid_opt actual_rid = (* Intersections the actual_rid with both bound_rid_opt and bound_xid_opt * in turn *) let rid1 = match bound_rid_opt with Some bound_rid -> rid_rid_intersection bound_rid actual_rid | None -> actual_rid in let rid2 = match bound_xid_opt with Some bound_xid -> xid_rid_intersection bound_xid rid1 | None -> rid1 in rid2 ;; class resolve_to_this_obj_channel1 is_stale ?id ?rid ?fixenc ?close ch = let getchannel = ref (fun rid -> assert false) in object (self) inherit resolve_to_any_obj_channel ?close ~channel_of_id:(fun rid -> !getchannel rid) () as super val is_stale = is_stale (* The channel can only be read once. To avoid that the channel * is opened several times, the flag 'is_stale' is set after the * first time. *) val fixid = id val fixrid = rid val fixenc = fixenc val fixch = ch initializer getchannel := self # getchannel method private getchannel rid = let m_xid = match fixid with None -> false | Some bound_xid -> xid_matches_rid bound_xid rid in let m_rid = match fixrid with None -> false | Some bound_rid -> rid_matches_rid bound_rid rid in (* By definition, if both fixid and fixrid are None, the resolver * will match always. *) if (fixid <> None || fixrid <> None) && not m_xid && not m_rid then raise Not_competent; let final_id = id_intersection fixrid fixid rid in ch, fixenc, Some final_id method private init_in (rid:resolver_id) = if !is_stale then raise Not_competent else begin super # init_in rid; is_stale := true end method clone = let c = new resolve_to_this_obj_channel1 is_stale ?id:fixid ?rid:fixrid ?fixenc:fixenc ?close:(Some close) fixch in c # init_rep_encoding internal_encoding; c # init_warner swarner warner; (* clones <- c :: clones; *) (c :> resolver) end ;; class resolve_to_this_obj_channel ?id ?rid ?fixenc ?close ch = let is_stale = ref false in resolve_to_this_obj_channel1 is_stale ?id ?rid ?fixenc ?close ch ;; class resolve_to_url_obj_channel ?close ~url_of_id ~base_url_of_id ~channel_of_url () = let channel_of_id rid = let rel_url = url_of_id rid in (* may raise Not_competent *) try (* Now compute the absolute URL: *) let abs_url = if Neturl.url_provides ~scheme:true rel_url then rel_url else let base_url = base_url_of_id rid in Neturl.apply_relative_url base_url rel_url in (* may raise Malformed_URL *) (* Simple check whether 'abs_url' is really absolute: *) if not(Neturl.url_provides ~scheme:true abs_url) then raise Not_competent; let rid' = { rid with rid_system = Some(Neturl.string_of_url abs_url) } in (* Get and return the channel: *) let ch, enc_opt, active_id_opt = channel_of_url rid' abs_url (* may raise Not_competent *) in (ch, enc_opt, (match active_id_opt with None -> Some rid' | _ -> active_id_opt )) with Neturl.Malformed_URL -> raise (Not_resolvable Neturl.Malformed_URL) | Not_competent -> raise (Not_resolvable Not_found) in resolve_to_any_obj_channel ?close ~channel_of_id () ;; let base_url_syntax = { Neturl.null_url_syntax with Neturl.url_enable_scheme = Neturl.Url_part_required; Neturl.url_enable_host = Neturl.Url_part_allowed; Neturl.url_enable_path = Neturl.Url_part_required; Neturl.url_accepts_8bits = true; } ;; type spec = [ `Not_recognized | `Allowed | `Required ] class resolve_as_file ?(file_prefix = (`Allowed :> spec)) ?(host_prefix = (`Allowed :> spec)) ?(system_encoding = `Enc_utf8) ?(map_private_id = (fun _ -> raise Not_competent)) ?(open_private_id = (fun _ -> raise Not_competent)) ?(base_url_defaults_to_cwd = false) ?(not_resolvable_if_not_found = true) () = let url_syntax = let enable_if = function `Not_recognized -> Neturl.Url_part_not_recognized | `Allowed -> Neturl.Url_part_allowed | `Required -> Neturl.Url_part_required in { Neturl.null_url_syntax with Neturl.url_enable_scheme = enable_if file_prefix; Neturl.url_enable_host = enable_if host_prefix; Neturl.url_enable_path = Neturl.Url_part_required; Neturl.url_accepts_8bits = true; } in let default_base_url = if base_url_defaults_to_cwd then begin let cwd = Sys.getcwd() in let cwd_utf8 = Netconversion.recode_string ~in_enc: system_encoding ~out_enc: `Enc_utf8 cwd in let l = String.length cwd_utf8 in let cwd_utf8 = if cwd_utf8 = "" || cwd_utf8.[l-1] <> '/' then cwd_utf8 ^ "/" else cwd_utf8 in Some(Neturl.file_url_of_local_path cwd_utf8) end else None in let use_private_id = ref false in let url_of_id rid = let file_url_of_sysname sysname = (* By convention, we can assume that sysname is a URL conforming * to RFC 1738 with the exception that it may contain non-ASCII * UTF-8 characters. *) try Neturl.url_of_string url_syntax sysname (* may raise Malformed_URL *) with Neturl.Malformed_URL -> raise Not_competent in use_private_id := false; let url = match rid.rid_system with None -> ( match rid.rid_private with None -> raise Not_competent | Some p -> let url = map_private_id p in use_private_id := true; url ) | Some sysname -> file_url_of_sysname sysname in let scheme = try Neturl.url_scheme url with Not_found -> "file" in let host = try Neturl.url_host url with Not_found -> "" in if scheme <> "file" then raise Not_competent; if host <> "" && host <> "localhost" then raise Not_competent; url in let base_url_of_id rid = match rid.rid_system_base with Some sysname -> Neturl.url_of_string base_url_syntax sysname | None -> ( match default_base_url with Some url -> url | None -> raise Not_competent ) in let channel_of_url rid url = if !use_private_id then begin match rid.rid_private with Some p -> let ch, enc_opt = open_private_id p in (new input_channel ch, enc_opt, None) | None -> assert false end else begin try let path_utf8 = Neturl.local_path_of_file_url url (* may fail *) in (* Note: it is only assumed that the path is UTF-8 *) let path = Netconversion.recode_string ~in_enc: `Enc_utf8 ~out_enc: system_encoding path_utf8 in (* May raise Malformed_code *) if (not not_resolvable_if_not_found) && not(Sys.file_exists path) then raise Not_competent; (new input_channel(open_in_bin path), None, None) (* May raise Sys_error *) with | Netconversion.Malformed_code as e -> raise (Not_resolvable e) | Sys_error _ as e -> raise (Not_resolvable e) end in resolve_to_url_obj_channel ~url_of_id ~base_url_of_id ~channel_of_url () ;; let make_file_url ?(system_encoding = `Enc_utf8) ?(enc = `Enc_utf8) filename = let utf8_filename = Netconversion.recode_string ~in_enc: enc ~out_enc: `Enc_utf8 filename in let getcwd() = let cwd = Sys.getcwd() in let cwd_utf8 = Netconversion.recode_string ~in_enc: system_encoding ~out_enc: `Enc_utf8 cwd in cwd_utf8 in Neturl.file_url_of_local_path ~getcwd utf8_filename ;; class lookup_id_nonorm (catalog : (ext_id * resolver) list) = ( object (self) val cat = catalog val mutable internal_encoding = `Enc_utf8 val mutable warner = new drop_warnings val mutable swarner = None val mutable active_resolver = None method init_rep_encoding enc = internal_encoding <- enc method init_warner sw w = swarner <- sw; warner <- w; method rep_encoding = internal_encoding (* CAUTION: This may not be the truth! *) method open_in xid = self # open_rid (resolver_id_of_ext_id xid) method open_rid rid = if active_resolver <> None then failwith "Pxp_reader.lookup_* # open_rid"; let selected_xid, r = try List.find (fun (xid,r) -> xid_matches_rid xid rid ) cat with Not_found -> raise Not_competent in let r' = r # clone in r' # init_rep_encoding internal_encoding; r' # init_warner swarner warner; let lb = r' # open_rid rid in (* may raise Not_competent *) active_resolver <- Some (selected_xid,r'); lb method close_in = match active_resolver with None -> () | Some(_,r) -> r # close_in; active_resolver <- None method active_id = match active_resolver with None -> failwith "#active_id: resolver is not open" | Some(selected_xid, r) -> ( match selected_xid with Private p -> { null_resolver with rid_private = Some p } | System sysid -> { null_resolver with rid_system = Some sysid } (* Note: Relative URLs do not make sense in catalogs, * so ignore this case here *) | Public(pubid,sysid) -> { null_resolver with rid_public = Some pubid; rid_system = if sysid = "" then None else Some sysid; } | Anonymous -> assert false ) (* method close_all = (* CHECK: Müssen nicht die Klone auch geschlossen werden? *) self # close_in *) method change_encoding (enc:string) = match active_resolver with None -> failwith "Pxp_reader.lookup_* # change_encoding" | Some(_,r) -> r # change_encoding enc method clone = let c = new lookup_id_nonorm cat in c # init_rep_encoding internal_encoding; c # init_warner swarner warner; c end : resolver ) ;; class lookup_id (catalog : (ext_id * resolver) list) = let norm_catalog = (* catalog with normalized PUBLIC ids *) List.map (fun (id,s) -> match id with Public(pubid,sysid) -> let norm_pubid = Pxp_aux.normalize_public_id pubid in (Public(norm_pubid,sysid), s) | _ -> (id,s) ) catalog in lookup_id_nonorm norm_catalog ;; class lookup_id_as_file ?(fixenc:encoding option) catalog = let ch_of_id filename id = let ch = open_in_bin filename in (* may raise Sys_error *) (new input_channel ch, fixenc, None) in let catalog' = List.map (fun (id,s) -> (id, new resolve_to_any_obj_channel ~channel_of_id:(ch_of_id s) () ) ) catalog in lookup_id catalog' ;; class lookup_id_as_string ?(fixenc:encoding option) catalog = let ch_of_id s rid = (new input_string s, fixenc, None) in let catalog' = List.map (fun (id,s) -> (id, new resolve_to_any_obj_channel ~channel_of_id:(ch_of_id s) () ) ) catalog in lookup_id catalog' ;; let map_public_id catalog = List.map (fun (pubid,x) -> (Public(pubid,""), x)) catalog ;; let map_system_id catalog = List.map (fun (sysid,x) -> (System sysid, x)) catalog ;; class lookup_public_id catalog = lookup_id (map_public_id catalog) ;; class lookup_public_id_as_file ?fixenc catalog = lookup_id_as_file ?fixenc (map_public_id catalog) ;; class lookup_public_id_as_string ?fixenc catalog = lookup_id_as_string ?fixenc (map_public_id catalog) ;; class lookup_system_id catalog = lookup_id (map_system_id catalog) ;; class lookup_system_id_as_file ?fixenc catalog = lookup_id_as_file ?fixenc (map_system_id catalog) ;; class lookup_system_id_as_string ?fixenc catalog = lookup_id_as_string ?fixenc (map_system_id catalog) ;; type combination_mode = Public_before_system | System_before_public ;; class combine ?mode rl = object (self) val mode = mode val resolvers = (rl : resolver list) val mutable internal_encoding = `Enc_utf8 val mutable warner = new drop_warnings val mutable swarner = None val mutable active_resolver = None (* (* needed to support close_all: *) val mutable clones = [] *) method init_rep_encoding enc = List.iter (fun r -> r # init_rep_encoding enc) rl; internal_encoding <- enc method init_warner sw w = List.iter (fun r -> r # init_warner sw w) rl; swarner <- sw; warner <- w; method rep_encoding = internal_encoding (* CAUTION: This may not be the truth! *) method open_in xid = self # open_rid (resolver_id_of_ext_id xid) method open_rid rid = let rec find_competent_resolver_for rid' rl = match rl with r :: rl' -> begin try r, (r # open_rid rid') with Not_competent -> find_competent_resolver_for rid' rl' end; | [] -> raise Not_competent in let find_competent_resolver rl = match mode with None -> find_competent_resolver_for rid rl | Some Public_before_system -> ( try find_competent_resolver_for { rid with rid_system = None } rl with Not_competent -> find_competent_resolver_for { rid with rid_public = None } rl ) | Some System_before_public -> ( try find_competent_resolver_for { rid with rid_public = None } rl with Not_competent -> find_competent_resolver_for { rid with rid_system = None } rl ) in if active_resolver <> None then failwith "Pxp_reader.combine # open_rid"; let r, lb = find_competent_resolver resolvers in active_resolver <- Some r; lb method close_in = match active_resolver with None -> () | Some r -> r # close_in; active_resolver <- None (* method close_all = List.iter (fun r -> r # close_in) clones *) method change_encoding (enc:string) = match active_resolver with None -> failwith "Pxp_reader.combine # change_encoding" | Some r -> r # change_encoding enc method active_id = match active_resolver with None -> failwith "#active_id: resolver not open" | Some r -> r # active_id method clone = let c = match active_resolver with None -> new combine ?mode (List.map (fun q -> q # clone) resolvers) | Some r -> let r' = r # clone in new combine ?mode (List.map (fun q -> if q == r then r' else q # clone) resolvers) in c # init_rep_encoding internal_encoding; c # init_warner swarner warner; (* clones <- c :: clones; *) c end ;; let norm_url_syntax = { Neturl.null_url_syntax with Neturl.url_enable_scheme = Neturl.Url_part_allowed; Neturl.url_enable_user = Neturl.Url_part_allowed; Neturl.url_enable_password = Neturl.Url_part_allowed; Neturl.url_enable_host = Neturl.Url_part_allowed; Neturl.url_enable_port = Neturl.Url_part_allowed; Neturl.url_enable_path = Neturl.Url_part_required; (* rest: Url_part_not_recognized *) Neturl.url_accepts_8bits = true; } ;; class norm_system_id (subresolver : resolver) = object(self) val subresolver = subresolver val mutable current_rid = null_resolver (* for rewrite_system_id *) method init_rep_encoding enc = subresolver # init_rep_encoding enc method init_warner sw w = subresolver # init_warner sw w; method rep_encoding = subresolver # rep_encoding method open_in xid = (* It is not possible to normalize the SYSTEM id of a xid *) subresolver # open_in xid method open_rid rid = (* (1) check that the system name is a URL * (2) if the URL is relative: make it absolute (use system base name) * (3) remove .. and . from the URL path as much as possible * (4) all other names are left unmodified *) let norm sysname = try (* prerr_endline ("sysname=" ^ sysname); *) let sysurl = Neturl.url_of_string norm_url_syntax sysname in let sysurl_abs = if Neturl.url_provides ~scheme:true sysurl then sysurl else match rid.rid_system_base with None -> (* The sysurl is relative, but we do not have a base URL. * There is no way to interpret this case, so we reject * it. *) raise Not_competent | Some sysbase -> let baseurl = Neturl.url_of_string norm_url_syntax sysbase in Neturl.apply_relative_url baseurl sysurl in let path = Neturl.url_path sysurl_abs in let path' = Neturl.norm_path path in (* remove .., ., // *) let sysurl' = Neturl.modify_url ~path:path' sysurl_abs in (* prerr_endline ("Before rewrite: " ^ Neturl.string_of_url sysurl'); *) let sysurl'' = self # rewrite sysurl' in let sysname' = Neturl.string_of_url sysurl'' in (* prerr_endline ("sysname'=" ^ sysname'); *) sysname' with Neturl.Malformed_URL -> raise Not_competent in let rid' = { rid with rid_system = ( match rid.rid_system with None -> None | Some sysname -> Some(norm sysname) ) } in let lex = subresolver # open_rid rid' in current_rid <- rid; (* the original, unmodified version! *) lex method private rewrite sysurl = sysurl method close_in = subresolver # close_in (* method close_all = subresolver # close_all *) method change_encoding enc = subresolver # change_encoding enc method active_id = subresolver # active_id method clone = let c = subresolver # clone in ( {< subresolver = c >} :> resolver ) end ;; let try_to_get f arg = try Some(f arg) with Not_found -> None ;; let remove_trailing_slash p = match p with [] -> [] | [""] -> [""] | _ -> let p' = List.rev p in if List.hd p' = "" then List.rev(List.tl p') else p ;; let rec path_matches pattern p = match (pattern, p) with ( [], [] ) -> (* Case: pattern = p *) true | ( [""], (_::_) ) -> (* Case: pattern ends with slash, and is a prefix of p *) true | ( (pat :: pattern'), (p0 :: p') ) when pat = p0 -> path_matches pattern' p' | _ -> false ;; let rec path_subst pattern subst p = match (pattern, p) with ( [], [] ) -> (* Case: pattern = p *) subst | ( [""], (_::_) ) -> (* Case: pattern ends with slash, and is a prefix of p *) (* If subst ends with a slash, remove it *) let subst' = remove_trailing_slash subst in subst' @ p | ( (pat :: pattern'), (p0 :: p') ) when pat = p0 -> path_subst pattern' subst p' | _ -> assert false (* no match *) ;; class rewrite_system_id ?(forward_unmatching_urls=false) rw_spec subresolver = object(self) inherit norm_system_id subresolver val forward_unmatching_urls = forward_unmatching_urls val rw_spec = List.map (fun (sysfrom, systo) -> let sysfrom_url = Neturl.url_of_string norm_url_syntax sysfrom in let systo_url = Neturl.url_of_string norm_url_syntax systo in (* if sysfrom_url ends with a slash, systo_url must end with it, too *) let ends_with_slash url = let path = Neturl.url_path url in List.hd (List.rev path) = "" in if ends_with_slash sysfrom_url && not(ends_with_slash systo_url) then failwith "Illegal rewrite specification: Cannot map directory to non-directory"; (sysfrom_url, systo_url) ) rw_spec method private rewrite url = try let sysfrom_url, systo_url = List.find (* may raise Not_found *) (fun (sysfrom_url, systo_url) -> (* Check whether url matches sysfrom_url *) (try_to_get Neturl.url_scheme sysfrom_url = try_to_get Neturl.url_scheme url) && (try_to_get Neturl.url_user sysfrom_url = try_to_get Neturl.url_user url) && (try_to_get Neturl.url_password sysfrom_url = try_to_get Neturl.url_password url) && (try_to_get Neturl.url_host sysfrom_url = try_to_get Neturl.url_host url) && (try_to_get Neturl.url_port sysfrom_url = try_to_get Neturl.url_port url) && (let sysfrom_p = Neturl.url_path sysfrom_url in let p = Neturl.url_path url in (List.hd sysfrom_p = "") && (* i.e. sysfrom_p is absolute *) (List.hd p = "") && (* i.e. p is absolute *) (path_matches sysfrom_p p)) ) rw_spec in (* prerr_endline("sysfrom_url=" ^ Neturl.string_of_url sysfrom_url); prerr_endline("systo_url=" ^ Neturl.string_of_url systo_url); *) let sysfrom_p = Neturl.url_path sysfrom_url in let systo_p = Neturl.url_path systo_url in let p = Neturl.url_path url in let p' = path_subst sysfrom_p systo_p p in Neturl.modify_url ~path:p' systo_url with Not_found -> if forward_unmatching_urls then url else raise Not_competent method active_id = (* hide the rewritten URL *) let aid = subresolver # active_id in { aid with rid_system = current_rid.rid_system; rid_system_base = current_rid.rid_system_base; } end ;; (**********************************************************************) (* EMULATION OF DEPRECATED CLASSES *) (**********************************************************************) let rec try_several f l = (* Applies the function f to all elements of l in turn. The function can * return a result value, or can raise Not_competent. The elements are * tried until a result value is found. If no element leads to a result, * the exception Not_competent is raised. *) match l with [] -> raise Not_competent | x :: l' -> ( try f x with Not_competent -> try_several f l') ;; let xid_list_of_rid rid = (* Returns a list of ext_ids that are compatible to the rid *) (match rid.rid_private with Some p -> [ Private p ] | None -> [] ) @ (match rid.rid_public, rid.rid_system with (Some pub, Some sys) -> [ Public(pub,sys) ] | (Some pub, None) -> [ Public(pub,"") ] | (None, Some sys) -> [ System(sys) ] | (None, None) -> [] ) @ [ Anonymous ] (* We always append Anonymous, because this xid is included in all rids *) ;; class resolve_read_any_channel ?(close=close_in) ~channel_of_id () = (* reduce resolve_read_any_channel to resolve_to_any_obj_channel *) let current_ch = ref None in let obj_channel_of_id rid = try_several (fun xid -> let ch, enc_opt = channel_of_id xid in (* may raise Not_competent *) current_ch := Some ch; (new input_channel ch, enc_opt, Some (resolver_id_of_ext_id xid)) ) (xid_list_of_rid rid) in let close_obj _ = match !current_ch with Some ch -> close ch; current_ch := None | None -> () in resolve_to_any_obj_channel ~close:close_obj ~channel_of_id:obj_channel_of_id () ;; class resolve_read_this_channel ?id ?fixenc ?close ch = (* reduce resolve_read_this_channel to resolve_to_this_obj_channel *) let obj_ch = new input_channel ch in let close_obj _ = match close with Some f -> f ch | None -> () in resolve_to_this_obj_channel ?id ?fixenc ~close:close_obj obj_ch ;; class resolve_read_any_string ~string_of_id () = (* reduce resolve_read_any_string to resolve_to_any_obj_channel *) let obj_channel_of_id rid = try_several (fun xid -> let s, enc_opt = string_of_id xid in (* may raise Not_competent *) (new input_string s, enc_opt, Some(resolver_id_of_ext_id xid)) ) (xid_list_of_rid rid) in resolve_to_any_obj_channel ~channel_of_id:obj_channel_of_id () ;; class resolve_read_this_string ?id ?fixenc s = let string_of_id tried_xid = match id with None -> (* Open always! *) (s, fixenc) | Some my_xid -> if my_xid = tried_xid then (s, fixenc) else raise Not_competent in resolve_read_any_string ~string_of_id () ;; class resolve_read_url_channel (* reduce resolve_read_url_channel to resolve_to_url_obj_channel *) ?(base_url = Neturl.null_url) ?(close = close_in) ~url_of_id ~channel_of_url () = let current_ch = ref None in let current_xid = ref Anonymous in let url_of_id' rid = try_several (fun xid -> let url = url_of_id xid in (* or Not_competent *) current_xid := xid; url ) (xid_list_of_rid rid) in let base_url_of_id rid = ( match rid.rid_system_base with Some sys -> Neturl.url_of_string base_url_syntax sys | None -> raise Not_competent ) in let channel_of_url' rid url = let ch, enc_opt = channel_of_url !current_xid url in (* may raise Not_competent *) current_ch := Some ch; let active_id = resolver_id_of_ext_id !current_xid in let active_id' = { active_id with rid_system = rid.rid_system } in (new input_channel ch, enc_opt, Some active_id') in let close_obj _ = match !current_ch with Some ch -> close ch; current_ch := None | None -> () in resolve_to_url_obj_channel ~close:close_obj ~url_of_id:url_of_id' ~base_url_of_id ~channel_of_url:channel_of_url' () ;; let lookup_public_id_as_file ?fixenc catalog = new lookup_public_id_as_file ?fixenc catalog ;; let lookup_public_id_as_string ?fixenc catalog = new lookup_public_id_as_string ?fixenc catalog ;; let lookup_system_id_as_file ?fixenc catalog = new lookup_system_id_as_file ?fixenc catalog ;; let lookup_system_id_as_string ?fixenc catalog = new lookup_system_id_as_string ?fixenc catalog ;;