(***********************************************************************)
(*                                                                     *)
(*                 SpamOracle -- a Bayesian spam filter                *)
(*                                                                     *)
(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
(*                                                                     *)
(*  Copyright 2002 Institut National de Recherche en Informatique et   *)
(*  en Automatique.  This file is distributed under the terms of the   *)
(*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
(*                                                                     *)
(***********************************************************************)

(* $Id: mail.ml,v 1.8 2003/08/28 14:38:38 xleroy Exp $ *)

(** Parsing of e-mail messages, including attachments *)

type message =
  { headers: (string * string) list;
    body: string;
    parts: message list }

let base64_decode_char c =
  match c with
    'A' .. 'Z' -> Char.code c - 65
  | 'a' .. 'z' -> Char.code c - 97 + 26
  | '0' .. '9' -> Char.code c - 48 + 52
  | '+' -> 62
  | '/' -> 63
  | _ -> -1

let decode_base64 s =
  let d = Buffer.create (String.length s * 3 / 4) in
  let buf = Array.create 4 0 in
  let pos = ref 0 in
  for i = 0 to String.length s - 1 do
    let n = base64_decode_char s.[i] in
    if n >= 0 then begin
      buf.(!pos) <- n;
      incr pos;
      if !pos = 4 then begin
        Buffer.add_char d (Char.chr(buf.(0) lsl 2 + buf.(1) lsr 4));
        Buffer.add_char d (Char.chr((buf.(1) land 15) lsl 4 + buf.(2) lsr 2));
        Buffer.add_char d (Char.chr((buf.(2) land 3) lsl 6 + buf.(3)));
        pos := 0
      end
    end
  done;
  begin match !pos with
    2 ->
      Buffer.add_char d (Char.chr(buf.(0) lsl 2 + buf.(1) lsr 4))
  | 3 ->
      Buffer.add_char d (Char.chr(buf.(0) lsl 2 + buf.(1) lsr 4));
      Buffer.add_char d (Char.chr((buf.(1) land 15) lsl 4 + buf.(2) lsr 2))
  | _ ->
      ()
  end;
  Buffer.contents d

let hexa_digit c =
  if c >= '0' && c <= '9' then Char.code c - 48
  else if c >= 'A' && c <= 'F' then Char.code c - 65 + 10
  else if c >= 'a' && c <= 'f' then Char.code c - 97 + 10
  else raise Not_found

let decode_qp s =
  let len = String.length s in
  let d = Buffer.create (String.length s) in
  let pos = ref 0 in
  while !pos < len do
    let c = s.[!pos] in
    if c = '=' && !pos + 1 < len && s.[!pos + 1] = '\n' then begin
      pos := !pos + 2
    end else if c = '=' && !pos + 2 < len then begin
      try
        let h1 = hexa_digit s.[!pos + 1]
        and h2 = hexa_digit s.[!pos + 2] in
        Buffer.add_char d (Char.chr(h1 lsl 4 + h2));
        pos := !pos + 3
      with Not_found ->
        Buffer.add_char d c;
        incr pos
    end else begin
      Buffer.add_char d c;
      incr pos
    end
  done;
  Buffer.contents d

let re_base64 = Str.regexp_case_fold "base64"
let re_qp = Str.regexp_case_fold "quoted-printable"

let decode encoding s =
  if Str.string_match re_base64 encoding 0 then
    decode_base64 s
  else if Str.string_match re_qp encoding 0 then
    decode_qp s
  else
    s

let re_encoded_header =
  Str.regexp "=\\?[_A-Za-z0-9-]+\\?\\([BbQq]\\)\\?\\([^?]*\\)\\?="

let decode_header s =
  let decode_group s =
    let enc = Str.matched_group 1 s
    and txt = Str.matched_group 2 s in
    match enc with
      "B" | "b" -> decode_base64 txt
    | "Q" | "q" -> decode_qp txt
    | _ -> assert false in
  Str.global_substitute re_encoded_header decode_group s

let re_continuation = Str.regexp "\n[ \t]+"
let re_nl = Str.regexp "\n"
let re_field = Str.regexp "\\([A-Za-z-]+[: ]\\)[ \t]*\\(.*\\)"

let parse_header s =
  let rec parse_field accu = function
    [] -> List.rev accu
  | line :: rem ->
      if Str.string_match re_field line 0 then begin
        let field_name = String.lowercase (Str.matched_group 1 line)
        and field_val  = Str.matched_group 2 line in
        parse_field ((field_name, decode_header field_val) :: accu) rem
      end else
        parse_field accu rem in
  parse_field [] (Str.split re_nl (Str.global_replace re_continuation " " s))

let find_header name headers =
  try List.assoc name headers with Not_found -> ""

let re_nl_nl = Str.regexp "\n\n"
let re_multipart =
  Str.regexp_case_fold
    "multipart/.*boundary *= *\\(\"\\([^\"]+\\)\"\\|\\([^ \t]+\\)\\)"

let rec parse_message s =
  try
    let pos_sep = Str.search_forward re_nl_nl s 0 in
    let headers = parse_header (String.sub s 0 pos_sep) in
    let body = String.sub s (pos_sep + 2) (String.length s - pos_sep - 2) in
    let encoding = find_header "content-transfer-encoding:" headers in
    let ctype = find_header "content-type:" headers in
    if Str.string_match re_multipart ctype 0 then begin
      let boundary =
        try
          Str.matched_group 2 ctype
        with Not_found -> try
          Str.matched_group 3 ctype
        with Not_found ->
          assert false in
      let re_bound =
        Str.regexp ("--" ^ Str.quote boundary ^ "[ \t\n]*") in
      match Str.split_delim re_bound body with
        [] ->
          { headers = headers;
            body = decode encoding body;
            parts = [] }
      | blurb :: parts ->
          { headers = headers;
            body = decode encoding blurb;
            parts = List.map parse_message parts }
    end else
      { headers = headers;
        body = decode encoding body;
        parts = [] }
  with Not_found ->
    { headers = [];
      body = s;
      parts = [] }

let header s msg =
  let rec hdr = function
    [] -> []
  | (h,v) :: rem -> if h = s then v :: hdr rem else hdr rem in
  String.concat "\n" (hdr msg.headers)

let header_matches s re msg =
  let rec hmatch = function
    [] -> false
  | (h,v) :: rem -> (h = s && Str.string_match re v 0) || hmatch rem
  in hmatch msg.headers

let re_content_text =
  Str.regexp_case_fold "text/plain\\|text/enriched\\|text;\\|text$"
let re_content_html =
  Str.regexp_case_fold "text/html"
let re_content_message_rfc822 =
  Str.regexp_case_fold "message/rfc822"
let re_content_multipart =
  Str.regexp_case_fold "multipart/"

let rec iter_text_parts fn m =
  if header_matches "content-type:" re_content_text m 
  || not(List.mem_assoc "content-type:" m.headers) then
    fn m
  else if header_matches "content-type:" re_content_html m then
    fn {m with body = Htmlscan.extract_text m.body}
  else if header_matches "content-type:" re_content_multipart m then begin
    fn m;
    List.iter (iter_text_parts fn) m.parts
  end else if header_matches "content-type:" re_content_message_rfc822 m then
    iter_text_parts fn (parse_message m.body)
  else
    ()

let iter_message fn msg =
  List.iter
    (fun (h, v) -> if Str.string_match !Config.mail_headers h 0 then fn v)
    msg.headers;
  iter_text_parts
    (fun m -> fn m.body)
    msg