(***********************************************************************)
(*                                                                     *)
(*                 SpamOracle -- a Bayesian spam filter                *)
(*                                                                     *)
(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
(*                                                                     *)
(*  Copyright 2002 Institut National de Recherche en Informatique et   *)
(*  en Automatique.  This file is distributed under the terms of the   *)
(*  GNU Public License version 2.                                      *)
(*                                                                     *)
(***********************************************************************)

(* $Id: wordsplit.mlp,v 1.8 2003/08/31 16:57:49 xleroy Exp $ *)

(* Decompose a string into words. *)

{
(* Map uppercase to lowercase.  Remove ISO Latin 1 accents. *)

let tbl = "\
\000\001\002\003\004\005\006\007\008\t\n\011\012\013\014\015\
\016\017\018\019\020\021\022\023\024\025\026\027\028\029\030\031 \
!\"#$%&'()*+,-./\
0123456789:;<=>?\
@abcdefghijklmno\
pqrstuvwxyz[\\]^_\
`abcdefghijklmno\
pqrstuvwxyz{|}~\127\
\128\129\130\131\132\133\134\135\136\137\138\139\140\141\142\143\
\144\145\146\147\148\149\150\151\152\153\154\155\156\157\158\159\
\160¡¢£¤¥¦§¨©ª«¬­®¯\
°±²³´µ¶·¸¹º»¼½¾¿\
aaaaaaeceeeeiiii\
Ðnooooo×ouuuuypß\
aaaaaaeceeeeiiii\
ðnooooo÷ouuuuypy"

let normalize s = s
  (* for i = 0 to String.length s - 1 do
    s.[i] <- tbl.[Char.code s.[i]]
  done *)

let all_uppercase s =
  try
    for i = 0 to String.length s - 1 do
      let c = s.[i] in
      if (c >= 'A' && c <= 'Z')
      || (c >= '\192' && c <= '\214')
      || (c >= '\216' && c <= '\222')
      then ()
      else raise Exit
    done;
    true
  with Exit ->
    false

let adjust s n =
  if String.length s <= n then s else String.sub s 0 n

let unstretch s =
  let t = String.create ((String.length s + 1) / 2) in
  for i = 0 to String.length t - 1 do
    t.[i] <- s.[i + i]
  done;
  t

}

let letter = [
   'a'-'z' 'A'-'Z'
#ifdef FRENCH
   'À' 'Â' 'Ç' 'È' 'É' 'Ê' 'Ë' 'Î' 'Ï' 'Ô' 'Ù' 'Û' 'Ü'
   'à' 'â' 'ç' 'è' 'é' 'ê' 'ë' 'î' 'ï' 'ô' 'ù' 'û' 'ü' 'ÿ'
#endif
#ifdef SPANISH
   'Á' 'É' 'Í' 'Ó' 'Ú' 'Ü' 'Ñ'
   'á' 'é' 'í' 'ó' 'ú' 'ü' 'ñ'
#endif
#ifdef ITALIAN
   'É' 'Í' 'Ú' 'À' 'È' 'Ò'
   'é' 'í' 'ú' 'à' 'è' 'ò'
#endif
#ifdef GERMAN
   'Ä' 'Ö' 'Ü'
   'ä' 'ö' 'ü' 'ß' 
#endif
#ifdef PORTUGUESE
   'À' 'Á' 'Â' 'Ã' 'Ç' 'É' 'Ê' 'Í' 'Ó' 'Ô' 'Õ' 'Ú' 'Ü'
   'à' 'á' 'â' 'ã' 'ç' 'é' 'ê' 'í' 'ó' 'ô' 'õ' 'ú' 'ü'
#endif
#ifdef RUSSIAN
  'Á' 'Â' '×' 'Ç' 'Ä' 'Å' '£' 'Ö' 'Ú' 'É' 'Ê' 'Ë' 'Ì' 'Í' 'Î' 'Ï' 'Ð' 'Ò' 'Ó' 'Ô' 'Õ' 'Æ' 'È' 'Ã' 'Þ' 'Û' 'Ý' 'ß' 'Ù' 'Ø' 'Ü' 'À' 'Ñ'
  'á' 'â' '÷' 'ç' 'ä' 'å' '³' 'ö' 'ú' 'é' 'ê' 'ë' 'ì' 'í' 'î' 'ï' 'ð' 'ò' 'ó' 'ô' 'õ' 'æ' 'è' 'ã' 'þ' 'û' 'ý' 'ÿ' 'ù' 'ø' 'ü' 'à' 'ñ'
#endif
]

let word_constituent = letter | ['-' '\'']

let stretched_word =
    letter ' ' letter ' ' letter ' ' letter (' ' letter) *
  | letter '-' letter '-' letter '-' letter ('-' letter) *
  | letter '_' letter '_' letter '_' letter ('_' letter) *
  | letter '.' letter '.' letter '.' letter ('.' letter) *
  | letter '*' letter '*' letter '*' letter ('*' letter) *
  | letter '\'' letter '\'' letter '\'' letter ('\'' letter) *

let numeric = ['0'-'9' '.' ',' '$' '%' '\164' (* Euro *)]

let weird_character = ['\127' - '\255']

#ifdef JAPANESE
let jis_snd =
  ['\032'-'\127']

let kanji =
  ['\048'-'\116'] jis_snd

let katakana =
  (['\037'] jis_snd | "\033\060")

let jis_char =
  ['\032'-'\116'] jis_snd

let jis_in =
  "\027$" '(' ? ['@' 'B']

let jis_out =
  "\027(" ['B' 'J' 'H']
#endif

rule split = parse
    stretched_word
      { fun action ->
          let s = unstretch (Lexing.lexeme lexbuf) in
          if all_uppercase s then
            action ("U" ^ string_of_int (String.length s));
          if String.length s <= 12 then (normalize s; action s);
          split lexbuf action }
  | word_constituent word_constituent word_constituent word_constituent *
      { fun action ->
          let s = Lexing.lexeme lexbuf in
          if all_uppercase s then
            action ("U" ^ string_of_int (String.length s));
          if String.length s <= 12 then (normalize s; action s);
          split lexbuf action }
  | numeric numeric numeric numeric *
      { fun action ->
          let s = Lexing.lexeme lexbuf in
          if String.length s <= 8 then (normalize s; action s);
          split lexbuf action }
#ifdef JAPANESE
  | jis_in
      { split_jis lexbuf }
#endif
  | weird_character weird_character weird_character weird_character *
      { fun action ->
          let s = Lexing.lexeme lexbuf in
          action ("W" ^ string_of_int (String.length s));
          split lexbuf action }
  | eof
      { fun action -> () }
  | _
      { split lexbuf }

#ifdef JAPANESE
and split_jis = parse
    kanji kanji *
      { fun action ->
          let s = Lexing.lexeme lexbuf in
          action ("\027$B" ^ adjust s 8 ^ "\027(B");
          split_jis lexbuf action }
  | katakana katakana katakana katakana *
      { fun action ->
          let s = Lexing.lexeme lexbuf in
          action ("\027$B" ^ adjust s 12 ^ "\027(B");
          split_jis lexbuf action }
  | jis_char
      { split_jis lexbuf }
  | eof
      { fun action -> () }
  | jis_out
      { split lexbuf }
  | _
      { split lexbuf }
#endif

{

let iter action txt =
  split (Lexing.from_string txt) action

}