% -*- Erlang -*-
% File:     mail_html.erl
% Author:   Johan Bevemyr
% Created:  Sat Jun 19 15:13:49 2004
% Purpose:  Transform HTML to text

-module('mail_html').
-author('jb@mor.bevemyr.com').

-export([html_to_text/1]).

%% html_to_text(Input) -> Chardata
%%
%% Converts HTML to plain text. Input is flattened with lists:flatten/1,
%% tokenized, parsed into a tree and then rendered. The result is a list
%% whose elements are single characters (line breaks inserted for tags)
%% and strings (reformatted text runs); it is not flattened here.
html_to_text(Input) ->
    Tokens = tokenize(lists:flatten(Input), [], [], 1),
    Ehtml = parse(Tokens),
    RevText = ehtml_to_text(Ehtml, []),
    lists:reverse(RevText).

%% ehtml_to_text(Nodes, Acc) -> Acc1
%%
%% Renders a parsed node list onto Acc in reverse order. Nodes are
%% {Tag, Args} leaves, {Tag, Args, Body} elements, or text strings.
ehtml_to_text([], Acc) ->
    Acc;
ehtml_to_text([{Tag, _Opts} | Rest], Acc) ->
    ehtml_to_text(Rest, add_tag_space(Tag, Acc));
ehtml_to_text([{script, _Opts, _Body} | Rest], Acc) ->
    %% Script source is not text content; drop it.
    ehtml_to_text(Rest, Acc);
ehtml_to_text([{style, _Opts, _Body} | Rest], Acc) ->
    %% The tokenizer keeps <style> content as raw data exactly like
    %% <script>; CSS is not text content either, so drop it as well.
    ehtml_to_text(Rest, Acc);
ehtml_to_text([{Tag, _Opts, Body} | Rest], Acc) ->
    Acc1 = add_tag_space(Tag, Acc),
    Acc2 = ehtml_to_text(Body, Acc1),
    ehtml_to_text(Rest, Acc2);
ehtml_to_text([Text | Rest], Acc) ->
    ehtml_to_text(Rest, [text_reformat(Text, []) | Acc]).

%% add_tag_space(Tag, Acc) -> Acc1
%%
%% Pushes a line break for p, br and hr. Acc is reversed, so pushing
%% "\n\r" here yields "\r\n" in the final output.
add_tag_space(p, Acc)  -> [$\n, $\r | Acc];
add_tag_space(br, Acc) -> [$\n, $\r | Acc];
add_tag_space(hr, Acc) -> [$\n, $\r | Acc];
add_tag_space(_, Acc)  -> Acc.

%% text_reformat(Text, Acc) -> String
%%
%% Replaces each newline with a space and removes carriage returns.
%% A newline that directly follows a space (including a space produced
%% by a previous newline) adds nothing, so runs collapse to one space.
text_reformat([], Acc) ->
    lists:reverse(Acc);
text_reformat([$\n | R], [$\s | _] = Acc) ->
    %% Already have a separating space: keep it, do not add another.
    %% (Previously the existing space was removed here, gluing the
    %% surrounding words together.)
    text_reformat(R, Acc);
text_reformat([$\n | R], Acc) ->
    text_reformat(R, [$\s | Acc]);
text_reformat([$\r | R], Acc) ->
    text_reformat(R, Acc);
text_reformat([C | R], Acc) ->
    text_reformat(R, [C | Acc]).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
%% Alternative parser, recursive as hell
%%

%% parse(Tokens) -> Nodes
%%
%% Turns the flat token list into a node list: {Tag, Args} for leaves,
%% {Tag, Args, Body} for elements with a matching end tag, and the bare
%% data string for data tokens.
parse(Tokens) ->
    parse(Tokens, []).

parse([], Acc) ->
    lists:reverse(Acc);
parse([{begin_tag, T, A, _L} | Rest], Acc) ->
    case tag_type(T) of
        leaf ->
            parse(Rest, [{T, A} | Acc]);
        node ->
            case find_body(T, Rest, []) of
                {error, _Reason} ->
                    %% no body found, assume leaf
                    parse(Rest, [{T, A} | Acc]);
                {Body, Rest2} ->
                    ParsedBody = parse(Body),
                    parse(Rest2, [{T, A, ParsedBody} | Acc])
            end
    end;
parse([{end_tag, _T, _A, _L} | Rest], Acc) ->
    %% erroneous end tag, ignore
    parse(Rest, Acc);
parse([{data, Data, _L} | Rest], Acc) ->
    parse(Rest, [Data | Acc]).
%% find_body(Tag, Tokens, Seen) -> {Body, Rest} | {error, Reason}
%%
%% Collects the tokens up to the end tag that closes Tag. Body is the
%% token list before that end tag and Rest is what follows it. A nested
%% begin tag with the same name is matched with its own end tag first
%% and kept inside Body (its end tag is re-created with line -1).
%% Returns {error, Reason} when the tokens run out first.
find_body(Tag, [], _Seen) ->
    {error, "Missing end tag for " ++ atom_to_list(Tag)};
find_body(Tag, [{end_tag, Tag, _, _} | After], Seen) ->
    {lists:reverse(Seen), After};
find_body(Tag, [{begin_tag, Tag, _, _} = Open | After], Seen) ->
    case find_body(Tag, After, []) of
        {error, _} = Failure ->
            %% no body found for the nested tag
            Failure;
        {Inner, After1} ->
            Close = {end_tag, Tag, [], -1},
            Seen1 = [Close | lists:reverse(Inner) ++ [Open | Seen]],
            find_body(Tag, After1, Seen1)
    end;
find_body(Tag, [Tok | After], Seen) ->
    find_body(Tag, After, [Tok | Seen]).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% tag_type(Tag) -> leaf | node
%%
%% Tags listed here never get a body; every other tag is a node.
%% (`option' is not in the list, so it is parsed as a node.)
tag_type(Tag) ->
    Leaves = [p, hr, input, base, img, '!doctype', meta, link, br, param],
    case lists:member(Tag, Leaves) of
        true  -> leaf;
        false -> node
    end.

%% tokenize(Input, DataAcc, TokenAcc, LineNr) -> Tokens
%%
%% Splits a flat character list into {begin_tag, Name, Args, Line},
%% {end_tag, Name, Args, Line} and {data, String, Line} tokens.
%% Comments ("<!-- ... -->") are skipped. Both $\n and $\r bump the
%% line counter.
tokenize([], [], Tokens, _Line) ->
    lists:reverse(Tokens);
tokenize([], Pending, Tokens, Line) ->
    lists:reverse(push_data(Pending, Line, Tokens));
tokenize("<!--" ++ AfterOpen, Pending, Tokens, Line) ->
    {Rest, Line1} = skip_comment(AfterOpen, Line),
    tokenize(Rest, Pending, Tokens, Line1);
tokenize([$< | AfterLt], Pending, Tokens, Line) ->
    {Tag, Rest, Line1} = scan_tag(AfterLt, Line),
    %% Pending text is emitted before the tag, stamped with the line
    %% on which the tag starts.
    next_token(Tag, Rest, [Tag | push_data(Pending, Line, Tokens)], Line1);
tokenize([Ch | Rest], Pending, Tokens, Line) when Ch =:= $\n; Ch =:= $\r ->
    tokenize(Rest, [Ch | Pending], Tokens, Line + 1);
tokenize([Ch | Rest], Pending, Tokens, Line) ->
    tokenize(Rest, [Ch | Pending], Tokens, Line).

%% push_data(RevChars, Line, Tokens) -> Tokens1
%%
%% Prepends a data token for the reversed character run, unless the
%% run is empty.
push_data([], _Line, Tokens) ->
    Tokens;
push_data(RevChars, Line, Tokens) ->
    [{data, lists:reverse(RevChars), Line} | Tokens].

%% next_token(Tag, Rest, Tokens, Line) -> Tokens1
%%
%% Continues tokenizing after Tag. After a script or style begin tag
%% everything up to the matching end tag is taken verbatim as a single
%% data token; the end tag itself is left in the input.
next_token({begin_tag, Raw, _, _}, Rest, Tokens, Line)
  when Raw =:= script; Raw =:= style ->
    {Data, Rest1, Line1} = scan_endtag(Rest, atom_to_list(Raw), Line),
    tokenize(Rest1, [], [{data, Data, Line} | Tokens], Line1);
next_token(_Tag, Rest, Tokens, Line) ->
    tokenize(Rest, [], Tokens, Line).
%% Tag scanner. Input starts just after the opening '<'. Accepted shape:
%%   ['/'] <space>* <name> ( <space>* <name> [ '=' <value> ] )* ['/'] '>'

%% scan_tag(Input, Line) -> {Token, Rest, Line1}
%%
%% Token is {begin_tag | end_tag, NameAtom, Args, NameLine}. A leading
%% '/' selects end_tag. Args is a list of {NameAtom, Value} pairs and
%% bare name strings, in source order.
%%
%% NOTE(review): tag and attribute names are turned into atoms with
%% list_to_atom/1. On untrusted markup this can grow the atom table
%% without bound; it is kept because the rest of the module matches
%% tag names as atoms.
scan_tag([$/ | I], L) ->
    scan_tag_body(end_tag, I, L);
scan_tag(I, L) ->
    scan_tag_body(begin_tag, I, L).

%% Shared body of scan_tag/2 for both tag kinds.
scan_tag_body(Kind, I, L) ->
    {R0, L0} = skip_space(I, L),
    %% Scan the name from R0, i.e. after the skipped whitespace.
    %% (The name used to be scanned from I, so leading whitespace
    %% produced an empty tag name.)
    {Name, R1, L1} = scan_tag_name(R0, L0),
    {R2, L2} = skip_space(R1, L1),
    {Args, R3, L3} = scan_tag_args(R2, L2),
    {{Kind, list_to_atom(lowercase(Name)), Args, L0}, R3, L3}.

%% scan_tag_name(Input, Line) -> {Name, Rest, Line1}
scan_tag_name(I, L) ->
    scan_token(I, [], L).

%% scan_tag_args(Input, Line) -> {Args, Rest, Line1}
%%
%% Scans attributes up to and including the closing '>' (or "/>").
%% Stops without consuming at a '<' (bad html) and at end of input.
scan_tag_args(I, L) ->
    scan_tag_args(I, [], L).

scan_tag_args([], Acc, L) ->
    {lists:reverse(Acc), [], L};
scan_tag_args([$> | R], Acc, L) ->
    {lists:reverse(Acc), R, L};
scan_tag_args([$/, $> | R], Acc, L) ->
    %% self-closing tag, e.g. <br/>
    {lists:reverse(Acc), R, L};
scan_tag_args([$< | _] = R, Acc, L) ->
    %% bad html
    {lists:reverse(Acc), R, L};
scan_tag_args(R0, Acc, L0) ->
    {Name, R1, L1} = scan_value(R0, L0),
    {R2, L2} = skip_space(R1, L1),
    case R2 of
        [$= | R3] ->
            {R4, L4} = skip_space(R3, L2),
            {Value, R5, L5} = scan_value(R4, L4),
            {R6, L6} = skip_space(R5, L5),
            OptName = list_to_atom(lowercase(Name)),
            scan_tag_args(R6, [{OptName, Value} | Acc], L6);
        _ ->
            scan_tag_args(R2, [Name | Acc], L2)
    end.

%% scan_value(Input, Line) -> {Value, Rest, Line1}
%%
%% A value is either quoted with " or ' (quotes stripped) or a bare
%% token.
scan_value([$" | R], L) -> scan_quote(R, [], $", L);
scan_value([$' | R], L) -> scan_quote(R, [], $', L);
scan_value(R, L)        -> scan_token(R, [], L).

%% scan_token(Input, Acc, Line) -> {Token, Rest, Line1}
%%
%% Reads characters up to a delimiter. '>', '<', '=' and "/>" end the
%% token and stay in Rest; a space or newline ends the token and is
%% consumed (a newline also bumps the line counter).
scan_token([], Acc, L) ->
    {lists:reverse(Acc), [], L};
scan_token([$> | _] = R, Acc, L) ->
    {lists:reverse(Acc), R, L};
scan_token([$/, $> | _] = R, Acc, L) ->
    %% Leave "/>" for scan_tag_args so that <br/> yields the name "br"
    %% rather than "br/". A bare token can therefore not end in '/'
    %% directly before '>'.
    {lists:reverse(Acc), R, L};
scan_token([$< | _] = R, Acc, L) ->
    %% bad html
    {lists:reverse(Acc), R, L};
scan_token([$= | _] = R, Acc, L) ->
    %% bad html
    {lists:reverse(Acc), R, L};
scan_token([C | R], Acc, L0) ->
    case char_class(C) of
        space -> {lists:reverse(Acc), R, L0};
        nl    -> {lists:reverse(Acc), R, L0 + 1};
        _     -> scan_token(R, [C | Acc], L0)
    end.

%% scan_quote(Input, Acc, Quote, Line) -> {String, Rest, Line1}
%%
%% Reads up to the closing Quote character, which is consumed but not
%% included. An unterminated quote runs to the end of input.
scan_quote([], Acc, _Q, L) ->
    {lists:reverse(Acc), [], L};
scan_quote([Q | R], Acc, Q, L) ->
    {lists:reverse(Acc), R, L};
scan_quote([C | R], Acc, Q, L) when C =:= $\n; C =:= $\r ->
    scan_quote(R, [C | Acc], Q, L + 1);
scan_quote([C | R], Acc, Q, L) ->
    scan_quote(R, [C | Acc], Q, L).
%% scan_endtag(Input, Tag, Line) -> {Data, Rest, Line1}
%%
%% Reads raw text up to the end tag "</Tag" <space>* ">", compared
%% case-insensitively. Tag must be a lowercase string. Data is the text
%% before the end tag; Rest starts at the "</" of the end tag (the end
%% tag is not consumed). If no end tag is found, all input becomes Data.
scan_endtag(R, Tag, L) ->
    scan_endtag(R, Tag, [], L).

scan_endtag([], _Tag, Acc, L) ->
    {lists:reverse(Acc), [], L};
scan_endtag([$<, $/ | R0] = R, Tag, Acc, L0) ->
    case closes_tag(Tag, R0) of
        true ->
            {lists:reverse(Acc), R, L0};
        false ->
            %% Some other "</": it is part of the data, so keep it.
            %% (These two characters used to be dropped.)
            scan_endtag(R0, Tag, [$/, $< | Acc], L0)
    end;
scan_endtag([C | R], Tag, Acc, L) when C =:= $\n; C =:= $\r ->
    scan_endtag(R, Tag, [C | Acc], L + 1);
scan_endtag([C | R], Tag, Acc, L) ->
    scan_endtag(R, Tag, [C | Acc], L).

%% closes_tag(Tag, Input) -> boolean()
%%
%% True when Input (the text right after "</") is Tag, optional
%% whitespace, then '>'.
closes_tag(Tag, Input) ->
    case casecmp(Tag, Input) of
        {true, AfterName} ->
            case skip_space(AfterName, 0) of
                {[$> | _], _} -> true;
                _             -> false
            end;
        false ->
            false
    end.

%% casecmp(Lower, Input) -> {true, Rest} | false
%%
%% Checks whether Input starts with the lowercase string Lower,
%% ignoring the case of Input. Rest is Input after the matched prefix.
casecmp([], R) ->
    {true, R};
casecmp(_, []) ->
    %% Input ended before the whole prefix was matched. Without this
    %% clause truncated input such as "</scr" raised function_clause.
    false;
casecmp([C1 | T1], [C2 | T2]) ->
    case lowercase_ch(C2) of
        C1 -> casecmp(T1, T2);
        _  -> false
    end.

%% char_class(Char) -> nl | space | alpha | digit | other
char_class($\n) -> nl;
char_class($\r) -> nl;
char_class($\s) -> space;
char_class($\t) -> space;
char_class(C) when C >= $a, C =< $z -> alpha;
char_class(C) when C >= $A, C =< $Z -> alpha;
char_class(C) when C >= $0, C =< $9 -> digit;
char_class(_) -> other.

%% skip_space(Input, Line) -> {Rest, Line1}
%%
%% Drops leading spaces, tabs, $\n and $\r. Each $\n and each $\r
%% bumps the line counter.
skip_space([], L) ->
    {[], L};
skip_space([C | R0] = R, L) ->
    case char_class(C) of
        nl    -> skip_space(R0, L + 1);
        space -> skip_space(R0, L);
        _     -> {R, L}
    end.

%% skip_comment(Input, Line) -> {Rest, Line1}
%%
%% Input starts just after "<!--". Skips up to and including "-->";
%% an unterminated comment swallows the rest of the input.
skip_comment([], L) ->
    {[], L};
skip_comment("-->" ++ R, L) ->
    {R, L};
skip_comment([C | R], L) when C =:= $\n; C =:= $\r ->
    skip_comment(R, L + 1);
skip_comment([_ | R], L) ->
    skip_comment(R, L).

%% lowercase(String) -> String1
%%
%% Lowercases the characters A-Z; everything else is unchanged.
lowercase(Str) ->
    [lowercase_ch(S) || S <- Str].

lowercase_ch(C) when C >= $A, C =< $Z -> C + 32;
lowercase_ch(C) -> C.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%