#!/usr/bin/perl -w
use strict;
# textmail - mail filter to replace MS Word/HTML attachments with plain text
#
# Copyright (C) 2003-2007 raf <raf@raf.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
# or visit http://www.gnu.org/copyleft/gpl.html
#
# 20070803 raf <raf@raf.org>
=head1 NAME
I<textmail> - mail filter to replace MS Word/HTML attachments with plain text
=head1 SYNOPSIS
usage: textmail [options]
options:
-h - Print the help message then exit
-m - Print the manpage then exit
-w - Print the manpage in html format then exit
-r - Print the manpage in nroff format then exit
-M - Output in mailbox format (mboxrd)
-T - Output in raw mail format (for smtp)
-W - Don't replace MS Word attachments with text
-E - Don't replace MS Excel attachments with csv
-H - Don't replace HTML attachments with text
-R - Don't replace RTF attachments with text
-P - Don't replace PDF attachments with text
-U - Don't translate winmail.dat attachments
-L - Don't reduce appledouble attachments
-I - Don't delete image attachments
-A - Don't delete audio attachments
-V - Don't delete video attachments
-X - Don't delete MS Windows executable attachments
-B - Don't recode text that was base64-encoded
-S - Don't replace spaces in filenames with underscores
-Z - Do translate signed content (discards signatures)
-O - Delete all application/octet-stream attachments
-! - Delete all application/* attachments
-D hdrs - Delete headers (list of header prefixes and filenames)
-K types - Keep attachments (list of mimetypes and filenames)
-f - On translation error, keep translation, not original
-? - Print paths of helper applications then exit
=head1 DESCRIPTION
I<textmail> filters a mail message or mbox, replacing MS Word, MS
Excel, HTML, RTF and PDF attachments with the plain text contained therein.
By default, the following attachments are also deleted: image, audio, video
and MS Windows executables. MS C<winmail.dat> attachments are replaced by
any attachments contained therein which are then replaced by text or deleted
in the same fashion. Any of these actions can be suppressed with the command
line options. Mail headers can also be selectively deleted.
This is useful for increasing the accessibility of mail messages (by
reducing their dependence on proprietary file formats), for dramatically
reducing their size (and the time it takes to download them and the time it
takes to read them), and for dramatically reducing the risk of mail-borne
viruses. Its intended use is as a preprocessor for mailing lists. This is
more friendly than a strict "No Attachments" policy.
=head1 OPTIONS
=over 4
=item C<-h>
Print the help message then exit.
=item C<-m>
Print the manpage then exit. This is equivalent to executing C<man textmail>
but this works even when the manpage isn't installed.
=item C<-w>
Print the manpage in html format then exit. This lets you install the
manpage in html format with a command like:
mkdir -p /usr/local/share/doc/textmail/html &&
textmail -w > /usr/local/share/doc/textmail/html/textmail.1.html
=item C<-r>
Print the manpage in nroff format then exit. This lets you install the
manpage with a command like:
textmail -r > /usr/local/share/man/man1/textmail.1
=item C<-M>
This option causes the output to be in mboxrd format by adding a mailbox
C<From> line at the top if there isn't one already and ensures that there is
a blank line at the bottom of the output. It also performs mailbox quoting
on any lines in the body that look like mailbox C<From> headers. Use this
when the output is to be stored directly in a mailbox file. It is not
necessary when I<textmail> is being used as a mail filter by I<procmail(1)>.
=item C<-T>
This option causes the output to be in raw mail format by removing any
mailbox C<From> line and by not performing mailbox quoting. Use this when
the output is to be sent directly to an SMTP server. It is not necessary
when I<textmail> is being used as a mail filter by I<procmail(1)>.
=item C<-W>
By default, I<textmail> replaces MS Word attachments with inline plain text
attachments that contain just the plain text within the original document.
This option leaves MS Word attachments intact.
=item C<-E>
By default, I<textmail> replaces MS Excel attachments with CSV file
attachments that contain just the data within the original document. This
option leaves MS Excel attachments intact.
=item C<-H>
By default, I<textmail> replaces HTML attachments with inline plain text
attachments that contain just the text within the original document. It also
reduces text-versus-html alternative attachments to just the text
attachment. This option leaves HTML (and alternative) attachments intact.
=item C<-R>
By default, I<textmail> replaces RTF attachments with inline plain text
attachments that contain just the plain text within the original document.
This option leaves RTF attachments intact.
=item C<-P>
By default, I<textmail> replaces PDF attachments with inline plain text
attachments that contain just the plain text within the original document.
This option leaves PDF attachments intact.
=item C<-U>
By default, I<textmail> replaces MS TNEF (i.e. C<winmail.dat>) attachments
with the attachments contained therein which are then translated to text as
normal. This option leaves C<winmail.dat> attachments intact. This option,
together with the C<-!> option will cause winmail.dat attachments to be
deleted rather than translated.
=item C<-L>
By default, I<textmail> replaces C<multipart/appledouble> attachments with
just the data fork attachment contained therein which is then translated to
text as normal. This option leaves appledouble attachments intact. However,
the data fork attachment will still be translated as normal resulting in a
probably inappropriate and possibly broken resource fork attachment.
Therefore, this option should probably only be used in conjunction with
other options that suppress the translation of the data fork attachment.
=item C<-I>
By default, I<textmail> deletes image attachments. This option leaves image
attachments intact.
=item C<-A>
By default, I<textmail> deletes audio attachments. This option leaves audio
attachments intact.
=item C<-V>
By default, I<textmail> deletes video attachments. This option leaves video
attachments intact.
=item C<-X>
By default, I<textmail> deletes attachments containing MS Windows
executables. That means C<application/octet-stream> attachments with the
following filename extensions: C<com>, C<exe>, C<pif>, C<dll>, C<ocx>,
C<scr>, C<vbs> and C<js>. This option leaves MS Windows executable
attachments intact. To delete C<zip> files as well, you could use either the
C<-O> option or the C<-!> option.
=item C<-B>
By default, when text is encountered that is C<base64>-encoded, I<textmail>
will recode it as either C<7bit> or C<quoted-printable>, whichever is
appropriate. This option suppresses this recoding. Note that if the text is
large enough and contains a high enough proportion of non-ASCII characters,
it will remain C<base64>-encoded to minimise space.
=item C<-S>
When translating attachments, I<textmail> replaces bad filename characters
such as space characters with the underscore character. This option causes
underscore characters to subsequently be converted into space characters. In
other words, you can use this option to preserve space characters in
attachment filenames (other bad filename characters will then be converted
to spaces as well).
=item C<-Z>
By default, I<textmail> will not translate C<multipart/signed> attachments.
This option causes C<multipart/signed> attachments to be replaced by the
signed attachment contained therein, discarding the signature control data.
The no-longer-signed data is then translated to text as normal. Note that
C<multipart/encrypted> attachments are never translated.
=item C<-O>
Delete all C<application/octet-stream> attachments, not just MS Windows
executables. Note that this overrides C<-X> but C<-K> overrides this.
=item C<-!>
Delete all C<application/*> attachments. Note that this overrides C<-X> but
C<-K> overrides this. Also note that translated documents are no longer
C<application/*> attachments so they aren't deleted unless their translation
is suppressed with the appropriate command line option.
=item C<-D> I<hdrs>
Delete particular headers. The I<hdrs> argument is a comma separated list of
header name prefixes and/or the names of files containing header name
prefixes (blank lines, whitespace and shell style comments are ignored). For
example, C<textmail -DX-> deletes all headers whose names begin with C<X->.
=item C<-K> I<types>
By default, I<textmail> deletes several types of non-text attachment. The
C<-O> and C<-!> options delete even more. This option specifies, by mimetype
and/or filename extension, a list of attachments not to delete. This
overrides all deletions.
The I<types> argument is a comma separated list of mimetypes and/or filename
extensions and/or the names of files containing mimetypes and/or filename
extensions (blank lines, whitespace and shell style comments are ignored).
Note that the elements are interpreted as a complete mimetype, if they
contain a slash character, or as either the C<*> in C<application/*> or as a
filename extension if they do not contain a slash character. For example,
C<textmail -Wf!Kdoc> deletes all C<application/*> attachments except MS Word
documents.
=item C<-f>
Whenever I<textmail> is unable to translate any attachment into text, it
will leave the attachment intact. This happens when the requisite
translation software can't be found, when it runs but returns an error code,
and when it produces an empty file. It also happens when C<winmail.dat>
attachments are corrupt. This option causes the empty translation to take
the place of the original attachment. Only the name of the attachment is
preserved. This is needed to ensure plain text even in the face of an MS
Word document that contains no text (e.g. only images).
=item C<-?>
Print the paths of all helper applications then exit.
=back
=head1 EXAMPLES
A I<procmail(1)> recipe that insists on pure text and no C<X-> headers (with
output in mailbox format):
:0 fw
| textmail -Mf!DX-
Do the same but to an existing mailbox file:
textmail -Mf!DX- < mailbox > mailbox-as-text
Delete all C<application/*> attachments except for PostScript and PDF (and
don't translate PDF into text):
textmail -!PKps,pdf
Delete all C<application/*> attachments except for zip files and gzipped tar
files:
textmail -!Ktar.gz,zip
A I<procmail(1)> recipe that just unpacks winmail.dat attachments but
doesn't translate the attachments contained therein into text and doesn't
delete windows executables (with output in mailbox format):
:0 fw
| textmail -MWEHRPLIAVXS
=head1 REQUIREMENTS
MS Word and RTF documents are translated into plain text using
I<antiword(1)> or I<catdoc(1)>. If I<textmail> can't find I<antiword(1)> or
I<catdoc(1)>, then MS Word and RTF attachments are left intact. So make sure
that I<antiword(1)> or I<catdoc(1)> is installed and in the C<$PATH>.
MS Excel documents are translated into csv files using I<xls2csv(1)>. If
I<textmail> can't find I<xls2csv(1)>, then MS Excel attachments are left
intact. So make sure that I<xls2csv(1)> is installed and in the C<$PATH>.
HTML documents are translated into plain text using I<lynx(1)>. If
I<textmail> can't find I<lynx(1)>, then HTML attachments are left intact. So
make sure that I<lynx(1)> is installed and in the C<$PATH>.
PDF documents are translated into plain text using I<pdftotext(1)>. If
I<textmail> can't find I<pdftotext(1)>, then PDF attachments are left
intact. So make sure that I<pdftotext(1)> is installed and in the C<$PATH>.
I<textmail> also requires I<perl(1)> and I<pod2man(1)> and I<pod2html(1)>
(which come with I<perl(1)>) and I<mktemp(1)>.
If I<textmail> fails to create a temporary directory, or if it is instructed
to do nothing (i.e. C<-WEHRPULIAVX>), then it degenerates into I<cat(1)>.
=head1 CAVEAT
The latest version of I<xls2csv(1)> at the time of writing (i.e.
catdoc-0.93.3) loses data.
If I<textmail> is unable to create a temporary directory (in C</tmp>), then
it degenerates into I<cat(1)>. Without a temporary directory, no attachments
will be translated or deleted no matter what options (even C<-f>) were given
to I<textmail>. So make sure that C</tmp> is writable. Also make sure that
I<mktemp(1)> is available otherwise an insecure temporary directory will be
created.
=head1 SEE ALSO
I<procmail(1)>,
I<antiword(1)>,
I<catdoc(1)>,
I<xls2csv(1)>,
I<lynx(1)>,
I<pdftotext(1)>,
I<pod2man(1)>,
I<pod2html(1)>,
C<http://raf.org/minimail/>
=head1 AUTHOR
20070803 raf <raf@raf.org>
=head1 URL
C<http://raf.org/textmail/>
=cut
# Functions from minimail: see http://raf.org/minimail/
sub formail # rfc2822 + mboxrd format (see http://www.qmail.org/man/man5/mbox.html)
{
sub mime # rfc2045, rfc2046
{
my ($mail, $parent) = @_;
return $mail unless exists $mail->{header} && exists $mail->{header}->{'content-type'} || defined $parent && exists $parent->{mime_type} && $parent->{mime_type} =~ /^multipart\/digest$/i;
my ($content_type) = (exists $mail->{header} && exists $mail->{header}->{'content-type'}) ? @{$mail->{header}->{'content-type'}} : "Content-Type: message/rfc822\n";
my ($type) = $content_type =~ /^content-type:\s*([\w\/.-]+)/i;
my $boundary = param($mail, 'content-type', 'boundary') if $type =~ /^multipart\//i;
return $mail unless defined $type && ($type =~ /^multipart\//i && $boundary || $type =~ /^message\/rfc822$/i);
($mail->{mime_boundary}) = $boundary =~ /^(.*\S)/ if $boundary;
$mail->{mime_type} = $type;
$mail->{mime_message} = mimepart(delete $mail->{body} || '', $mail), return $mail if $type =~ /^message\/(?:rfc822|external-body)$/i;
return mimeparts($mail, $parent);
}
sub mimeparts
{
my ($mail, $parent) = @_;
my $state = 'preamble';
my $text = '';
for (split /(?<=\n)/, delete $mail->{body} || '')
{
if (/^--\Q$mail->{mime_boundary}\E(--)?/)
{
if ($state eq 'preamble')
{
$state = 'part';
$mail->{mime_preamble} = $text if length $text;
}
elsif ($state eq 'part')
{
$state = 'epilogue' if defined $1 && $1 eq '--';
push @{$mail->{mime_parts}}, mimepart($text, $mail);
}
$text = '', next;
}
$text .= $_;
}
push @{$mail->{mime_parts}}, mimepart($text, $mail) if $state eq 'part' && length $text;
$mail->{mime_epilogue} = $text if $state eq 'epilogue' && length $text;
return $mail;
}
sub mimepart
{
my ($mail, $parent) = @_;
my @lines = split /(?<=\n)/, $mail;
formail(sub { shift @lines }, sub { $mail = shift }, $parent);
return $mail;
}
my ($rd, $act, $parent) = @_;
my $state = 'header';
my $mail; my $last;
while (defined($_ = $rd->()))
{
s/\r(?=\n)//g; #, tr/\r/\n/;
if (!defined $parent && /^From (?:\S[^\n]+)?\s+[a-zA-Z]+\s+[a-zA-Z]+\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+(?:[A-Z]+\s+)?\d{4}/) # mbox header
{
$mail->{body} =~ s/\n\n\z/\n/ if $mail && exists $mail->{mbox} && exists $mail->{body};
my $mbox = $_; $act->(mime($mail, $parent)) or return if $mail;
$mail = { mbox => $mbox }, $state = 'header', undef $last, next;
}
if ($state eq 'header')
{
if (/^([\w-]+):/) # mail header
{
push @{$mail->{headers}}, $_;
push @{$mail->{header}->{$last = lc $1}}, $_;
}
elsif (/^$/) # blank line after mail headers
{
$mail->{body} = '', $state = 'body';
}
else # mail header continuation or error
{
${$mail->{headers}}[$#{$mail->{headers}}] .= $_ if defined $last;
${$mail->{header}->{$last}}[$#{$mail->{header}->{$last}}] .= $_ if defined $last;
}
}
elsif ($state eq 'body')
{
s/^>(>*From )/$1/ if exists $mail->{mbox};
$mail->{body} .= $_;
}
}
$mail->{body} =~ s/\n\n\z/\n/ if $mail && exists $mail->{mbox} && exists $mail->{body};
$act->(mime($mail, $parent)) if $mail;
}
sub mail2str
{
my $mail = shift;
my $head = '';
$head .= $mail->{mbox} if exists $mail->{mbox};
$head .= join '', @{$mail->{headers}} if exists $mail->{headers};
my $body = '';
$body .= $mail->{body} if exists $mail->{body};
$body .= "$mail->{mime_preamble}" if exists $mail->{mime_preamble};
$body .= "--$mail->{mime_boundary}\n" if exists $mail->{mime_boundary} && !exists $mail->{mime_parts};
$body .= join("", map { "--$mail->{mime_boundary}\n" . mail2str($_) } @{$mail->{mime_parts}}) if exists $mail->{mime_parts};
$body .= "--$mail->{mime_boundary}--\n" if exists $mail->{mime_boundary};
$body .= "$mail->{mime_epilogue}" if exists $mail->{mime_epilogue};
$body .= mail2str($mail->{mime_message}) if exists $mail->{mime_message};
$body =~ s/^(>*From )/>$1/mg, $body =~ s/([^\n])\n?\z/$1\n\n/ if exists $mail->{mbox};
return $head . "\n" . $body;
}
my $bchar = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'()+_,-.\/:=?";
sub mail2multipart
{
my $m = shift;
return $m if exists $m->{mime_type} && $m->{mime_type} =~ /^multipart\//i;
my $p = {};
append_header($p, $_) for grep { /^content-/i } @{$m->{headers}};
$p->{body} = delete $m->{body} if exists $m->{body};
$p->{mime_message} = delete $m->{mime_message} if exists $m->{mime_message};
$p->{mime_type} = $m->{mime_type} if exists $m->{mime_type};
$m->{mime_type} = 'multipart/mixed';
$m->{mime_boundary} = exists $m->{mime_prev_boundary} ? delete $m->{mime_prev_boundary} : join '', map { substr $bchar, int(rand(length $bchar)), 1 } 0..30;
$m->{mime_preamble} = delete $m->{mime_prev_preamble} if exists $m->{mime_prev_preamble};
$m->{mime_epilogue} = delete $m->{mime_prev_epilogue} if exists $m->{mime_prev_epilogue};
delete_header($m, qr/content-[^:]*/i);
append_header($m, 'MIME-Version: 1.0') unless exists $m->{header} && exists $m->{header}->{'mime-version'};
append_header($m, "Content-Type: $m->{mime_type}; boundary=\"$m->{mime_boundary}\"");
$m->{mime_parts} = [$p];
return $m;
}
sub mail2singlepart
{
my $m = shift;
$m->{mime_message} = mail2singlepart($m->{mime_message}), return $m if exists $m->{mime_type} && $m->{mime_type} =~ /^message\//i;
return $m unless exists $m->{mime_type} && $m->{mime_type} =~ /^multipart\//i && @{$m->{mime_parts}} <= 1;
my $p = shift @{$m->{mime_parts}};
$m->{mime_prev_boundary} = delete $m->{mime_boundary} if exists $m->{mime_boundary};
$m->{mime_prev_preamble} = delete $m->{mime_preamble} if exists $m->{mime_preamble};
$m->{mime_prev_epilogue} = delete $m->{mime_epilogue} if exists $m->{mime_epilogue};
$m->{body} = $p->{body} if exists $p->{body};
$m->{mime_message} = $p->{mime_message} if exists $p->{mime_message};
delete $m->{mime_type}; $m->{mime_type} = $p->{mime_type} if exists $p->{mime_type};
delete $m->{mime_parts}; $m->{mime_parts} = $p->{mime_parts} if exists $p->{mime_parts};
$m->{mime_boundary} = $p->{mime_boundary} if exists $p->{mime_boundary};
$m->{mime_preamble} = $p->{mime_preamble} if exists $p->{mime_preamble};
$m->{mime_epilogue} = $p->{mime_epilogue} if exists $p->{mime_epilogue};
my $explicit = 0;
delete_header($m, qr/content-[^:]*/i);
append_header($m, $_), ++$explicit for grep { /^content-/i } @{$p->{headers}};
delete_header($m, 'mime-version') unless $explicit;
return mail2singlepart($m);
}
sub mail2mbox
{
my $m = shift;
return $m if exists $m->{mbox};
my ($f) = header($m, 'sender');
($f) = header($m, 'from') unless defined $f;
$f =~ s/"(?:\\[^\r\n]|[^\\"])*"//g, $f =~ s/\s*;.*//, $f =~s/^[^:]+:\s*//, $f =~ s/\s*,.*$//, $f =~ s/^[^<]*<\s*//, $f =~ s/\s*>.*$// if defined $f;
$f = 'unknown' unless defined $f;
use POSIX; $m->{mbox} = "From $f " . ctime(time());
return $m;
}
sub append_header
{
my ($m, $h, $l, $c) = @_;
$h = header_format($h, $l, $c);
my ($n) = $h =~ /^([^:]+):/;
push @{$m->{headers}}, $h;
push @{$m->{header}->{lc $n}}, $h;
}
sub delete_header
{
my ($m, $h, $r) = @_;
return undef unless exists $m->{header};
@{$m->{headers}} = grep { !/^$h:/i } @{$m->{headers}};
delete $m->{header}->{$_} for grep { /^$h$/i } keys %{$m->{header}};
if ($r && exists $m->{mime_parts}) { delete_header($_, $h, $r) for @{$m->{mime_parts}} }
if ($r && exists $m->{mime_message}) { delete_header($m->{mime_message}, $h, $r) }
}
sub header
{
my ($m, $h) = @_;
return () unless exists $m->{header} && exists $m->{header}->{lc $h};
return map { s/\n\s+/ /g; $_ = header_display($_); /^$h:\s*(.*)\s*$/i; $1 } @{$m->{header}->{lc $h}};
}
my $encword = qr/=\?(us-ascii|iso-8859-\d)(?:\*\w+)?\?(q|b)\?([^? ]+)\?=/i; # encoded words to display (should really only decode ascii)
sub header_display # rfc2047, rfc2231
{
return join '',
map { tr/ \t/ /s; $_ } # finally, squeeze multiple whitespace
map { tr/\x00-\x08\x0b-\x1f\x7f//d; $_ } # strip control characters
map { s/$encword/lc $2 eq 'q' ? join ' ', split '_', decode_quoted_printable($3), -1 : decode_base64($3)/ieg; $_ } # decode encoded words
map { s/($encword)\s+($encword)/$1$5/g while /$encword\s+$encword/; $_ } # strip space between encoded words that we're about to decode
map { s/\((?:\\[^\r\n]|[^\\()])*\)//g unless /^".*"$/; $_ } # strip (comments) outside "quoted strings"
split /("(?:\\[^\r\n]|[^\\"])*")/, shift; # split on "quoted strings"
}
sub header_format # rfc2822, rfc2047
{
my ($h, $l, $c) = @_;
$h =~ s/^\s+//, $h =~ s/\s+$//, $h =~ tr/ \t\n\r/ /s;
$h = join ' ', map { /^".*"$/ ? $_ : !tr/\x80-\xff// ? $_ : tr/a-zA-Z0-9!*\/+-//c > length >> 1 ? join(' ', map { '=?' . ($c || 'iso-8859-1') . ($l ? "*$l" : '') . '?b?' . substr(encode_base64($_), 0, -1) . '?=' } (split /\n/, (s/([^\r\n]{38})/$1\n/g, $_))) : join(' ', map { '=?' . ($c || 'iso-8859-1') . ($l ? "*$l" : '') . '?q?' . substr(encode_quoted_printable($_), 0, -2) . '?=' } (split /\n/, (s/([^\r\n]{17})/$1\n/g, $_))) } map { /^[^\s"]*".*"[^\s"]*$/ ? $_ : split / / } split /(\S*"(?:\\[^\r\n]|[^\\"])*"\S*)/, $h;
my ($f, $p, $lf) = ('', 0); $lf = length $f, $f .= ($lf && $lf + ($lf ? 1 : 0) + length($_) - $p > 78) ? ($p = $lf, "\n") : '', $f .= $f ? ' ' : '', $f .= $_ for map { /^\S*".*"\S*$/ ? $_ : grep { length } split / / } split /(\S*"(?:\\[^\r\n]|[^\\"\r\n])*"\S*)/, $h; # fold
return $f . "\n";
}
sub param # rfc2231, rfc2045
{
my ($m, $h, $p) = @_;
my @p; my $decode = 0;
for (header($m, $h))
{
while (/(\b\Q$p\E(?:\*|\*\d\*?)?)=("(?:\\[^\n]|[^"\n])*"|[^\x00-\x20()<>@,;:\\"\/\[\]?=]+)/ig)
{
my ($n, $v) = ($1, $2);
$v =~ s/^"//, $v =~ s/"$//, $v =~ s/\\(.)/$1/g if $v =~ /^".*"$/;
$v =~ s/^(?:us-ascii|iso-8859-\d)'\w+'//i and $decode = 1;
$v =~ s/%([\da-fA-f]{2})/chr hex $1/eg if $decode && substr($n, -1) eq '*';
push @p, [lc $n, $v];
}
}
return join '', map { $_->[1] } sort { my ($ad) = $a->[0] =~ /(\d+)/; my ($bd) = $b->[0] =~ /(\d+)/; $ad <=> $bd } @p;
}
sub mimetype # rfc2045, rfc2046
{
my ($m, $p) = @_;
my ($e) = header($m, 'content-transfer-encoding');
return 'application/octet-stream' if defined $e && $e !~ /^(?:[78]bit|binary|quoted-printable|base64)$/i;
my ($type) = header($m, 'content-type');
return lc $1 if defined $type && $type =~ /^((?:text|image|audio|video|application|message|multipart)\/[^\s;]+)/i;
return 'message/rfc822' if !defined $type && defined $p && exists $p->{mime_type} && $p->{mime_type} =~ /^multipart\/digest/i;
return 'text/plain';
}
sub encoding # rfc2045
{
my $m = shift;
my ($e) = header($m, 'content-transfer-encoding');
return (defined $e && $e =~ /^([78]bit|binary|quoted-printable|base64)$/i) ? lc $1 : (exists $m->{body} && $m->{body} =~ tr/\x80-\xff//) ? '8bit' : '7bit';
}
my $unique;
sub filename # rfc2183, rfc2045?
{
my $p = shift;
my $fn = param($p, 'content-disposition', 'filename') || param($p, 'content-type', 'name') || 'attachment' . ++$unique;
$fn =~ s/^.*[\\\/]//, $fn =~ tr/\x00-\x1f !"#\$%&'()*\/:;<=>?@[\\]^`{|}~\x7f/_/s;
return $fn;
}
sub body
{
my $m = shift;
return exists $m->{body} ? decode($m->{body}, encoding($m)) : undef;
}
sub parts
{
my ($m, $p) = @_;
return [@{$m->{mime_parts}}] unless defined $p;
$m->{mime_parts} = [@$p];
}
sub newparam # rfc2231, rfc2045
{
my ($n, $v, $l, $c) = (@_, '', '');
my $high = $v =~ tr/\x80-\xff//;
my $ctrl = $v =~ tr/\x00-\x06\x0e-\x1f\x7f//;
my $enc = $high || $ctrl ? '*' : '';
$c = ('high' ? 'iso-8859-1' : 'us-ascii') if $enc && !$c;
$l = 'en' if $c && !$l;
$v = "$c'$l'$v" if $enc;
my @p; push @p, $_ while $_ = substr $v, 0, 40, '';
s/([\x00-\x20\x7f-\xff])/sprintf '%%%02X', ord $1/eg for grep { tr/\x00-\x06\x0e-\x1f\x7f-\xff// } @p;
s/"/\\"/g, s/^/"/g, s/$/"/g for grep { tr/\x00-\x06\x0e-\x1f\x7f ()<>@,;:\\"\/[]?=// } @p;
return "; $n$enc=$p[0]" if @p == 1;
return join '', map { "; $n*$_$enc=$p[$_]" } 0..$#p;
}
sub newmail # rfc2822, rfc2045, rfc2046, rfc2183 (also rfc3282, rfc3066, rfc2424, rfc2557, rfc2110, rfc3297, rfc2912, rfc2533, rfc1864)
{
my @a = @_; my %a = @_; my $m = {};
sub rfc822date { use POSIX; return strftime '%a, %d %b %Y %H:%M:%S +0000', gmtime shift; }
my $type = $a{type} || (exists $a{parts} ? 'multipart/mixed' : exists $a{message} ? 'message/rfc822' : 'text/plain');
my $multi = $type =~ /^multipart\//i;
my $msg = $type =~ /^message\/rfc822$/i;
($a{body}, $a{modified}, $a{read}, $a{size}) = (do { local $/; my $b = <F>; close F; $b }, exists $a{modified} ? $a{modified} : rfc822date((stat _)[9]), exists $a{read} ? $a{read} : rfc822date((stat _)[8]), (stat _)[7]) if exists $a{filename} && !exists $a{body} && !exists $a{message} && !exists $a{parts} && -r $a{filename} && stat($a{filename}) && open F, $a{filename};
($a{filename}) = $a{filename} =~ /([^\\\/]+)$/ if $a{filename};
my $bound = $multi ? join '', map { substr $bchar, int(rand(length $bchar)), 1 } 0..30 : '';
my $disp = $a{disposition} || ($type =~ /^(?:text\/|message\/rfc822)/i ? 'inline' : 'attachment');
my $char = $a{charset} || ($a{body} && $a{body} =~ tr/\x80-\xff// ? 'iso-8859-1' : 'us-ascii');
my $enc = $a{encoding} || ($multi || $msg ? '7bit' : $a{body} ? choose_encoding($a{body}) : '7bit');
append_header($m, $a[$_] . ': ' . $a[$_ + 1]) for grep { $_ % 2 == 0 && $a[$_] =~ /^[A-Z]/ } 0..$#a;
append_header($m, 'Date: ' . rfc822date(time)) if grep { /^(?:date|from|sender|reply-to)$/i } keys %a and !grep { /^date$/i } keys %a;
append_header($m, 'MIME-Version: 1.0') if grep { /^(?:date|from|sender|reply-to)$/i } keys %a and !grep { /^mime-version$/ } keys %a;
use Sys::Hostname; append_header($m, "Message-ID: <@{[time]}.$$.@{[++$unique]}\@@{[hostname]}>") if grep { /^(?:date|from|sender|reply-to)$/i } keys %a and !grep { /^message-id$/i } keys %a;
append_header($m, "Content-Type: $type" . ($bound ? newparam('boundary', $bound) : '') . ($char =~ /^us-ascii$/i ? '' : newparam('charset', $char))) unless $type =~ /^text\/plain$/i && $char =~ /^us-ascii$/i;
append_header($m, "Content-Transfer-Encoding: $enc") unless $enc =~ /^7bit$/i;
append_header($m, "Content-Disposition: $disp" . ($a{filename} ? newparam('filename', $a{filename}) : '') . ($a{size} ? newparam('size', $a{size}) : '')) if $a{filename} || $a{size};
append_header($m, "Content-@{[ucfirst $_]}: $a{$_}") for grep { $a{$_} } qw(description language duration location base features alternative);
append_header($m, "Content-@{[uc $_]}: $a{$_}") for grep { $a{$_} } qw(id md5);
($m->{mime_type}, $m->{mime_boundary}, $m->{mime_parts}) = ($type =~ /^\s*([\w\/.-]+)/, $bound, $a{parts} || []) if $multi;
($m->{mime_type}, $m->{mime_message}) = ($type =~ /^\s*([\w\/.-]+)/, $a{message} || {}) if $msg;
$m->{body} = encode($a{body} || '', $enc) unless $multi || $msg;
$m->{mbox} = $a{mbox} if exists $a{mbox} && defined $a{mbox} && length $a{mbox};
return $m;
}
sub decode
{
my ($d, $e) = @_;
return $e =~ /^base64$/i ? decode_base64($d) : $e =~ /^quoted-printable$/i ? decode_quoted_printable($d) : substr($d, 0, -1);
}
sub encode
{
my ($d, $e) = @_;
return $e =~ /^base64$/i ? encode_base64($d) : $e =~ /^quoted-printable$/i ? encode_quoted_printable($d) : $d . "\n";
}
sub choose_encoding # rfc2822, rfc2045
{
my $len = length $_[0];
my $high = $_[0] =~ tr/\x80-\xff//;
my $ctrl = $_[0] =~ tr/\x00-\x06\x0e-\x1f\x7f//;
my ($maxlen, $pos, $next) = (0, 0, 0);
for (; ($next = index($_[0], "\n", $pos)) != -1; $pos = $next + 1)
{
$maxlen = $next - $pos if $next - $pos > $maxlen;
}
$maxlen = $len - $pos if $len - $pos > $maxlen;
return $ctrl ? 'base64' : $high ? $len > 1024 && $high > $len * 0.167 ? 'base64' : 'quoted-printable' : $maxlen > 998 ? 'quoted-printable' : '7bit';
}
sub encode_base64 # MIME::Base64 (Gisle Aas)
{
pos $_[0] = 0; # Note: Text must be in canonical form (i.e. with "\r\n")
my $padlen = (3 - length($_[0]) % 3) % 3;
my $encoded = join '', map { pack('u', $_) =~ /^.(\S*)/ } $_[0] =~ /(.{1,45})/gs;
$encoded =~ tr{` -_}{AA-Za-z0-9+/};
$encoded =~ s/.{$padlen}$/'=' x $padlen/e if $padlen;
$encoded =~ s/(.{1,76})/$1\n/g;
return $encoded;
}
sub decode_base64 # MIME::Base64 (Gisle Aas)
{
my $data = shift;
$data =~ tr{A-Za-z0-9+=/}{}cd;
$data =~ s/=+$//;
$data =~ tr{A-Za-z0-9+/}{ -_};
return join '', map { unpack("u", chr(32 + length($_) * 3 / 4) . $_) } $data =~ /(.{1,60})/gs;
}
sub encode_quoted_printable
{
my $quoted = shift;
my $binary = ($quoted =~ tr/\x00-\x06\x0e-\x1f\x7f//) ? '' : '\r\n';
$quoted =~ s/([^!-<>-~ \t$binary])/sprintf '=%02X', ord $1/eg;
$quoted =~ s/((?:[^\r\n]{73,75})(?=[=])|(?:[^\r\n]{75}(?=[ \t]))|(?:[^\r\n]{75})(?=[^\r\n]{2})|(?:[^\r\n]{75})(?=[^\r\n]$))/$1=\n/g;
$quoted =~ s/([ \t])$/sprintf '=%02X', ord $1/emg;
$quoted .= "=\n" unless $quoted =~ /\n$/;
return $quoted;
}
sub decode_quoted_printable
{
my $quoted = shift;
$quoted =~ tr/\x00-\x08\x0b-\x0c\x0e-\x19\x7f-\xff//d;
$quoted =~ s/=\n//g;
$quoted =~ s/=([0-9A-Fa-f]{2})/chr hex $1/eg;
return $quoted;
}
my %mimetype =
(
txt => 'text/plain', csv => 'text/plain', htm => 'text/html', html => 'text/html', vcf => 'text/x-vcard',
gif => 'image/gif', jpg => 'image/jpeg', jpeg => 'image/jpeg', jpe => 'image/jpeg', png => 'image/png', bmp => 'image/bmp', tiff => 'image/tiff', tif => 'image/tiff', jp2 => 'image/jp2', jpf => 'image/jpx', jpm => 'image/jpm',
mp2 => 'audio/mpeg', mp3 => 'audio/mpeg', au => 'audio/au', aif => 'audio/x-aiff', wav => 'audio/wav',
mpeg => 'video/mpeg', mpg => 'video/mpeg', mpe => 'video/mpeg', qt => 'video/quicktime', mov => 'video/quicktime', avi => 'video/x-msvideo', mj2 => 'video/mj2',
rtf => 'application/rtf', doc => 'application/vnd.ms-word', wri => 'application/vnd.ms-word', xls => 'application/vnd.ms-excel', ppt => 'application/vnd.ms-powerpoint',
pdf => 'application/pdf', ps => 'application/ps', eps => 'application/ps', zip => 'application/zip', other => 'application/octet-stream'
);
sub add_mimetypes
{
open M, '/etc/mime.types' or return;
while (<M>)
{
s/#.*$//, s/^\s+//, s/\s+$//; next unless $_;
my ($mimetype, $ext) = /^(\S+)\s+(.*)$/; next unless $ext;
$mimetype{$_} = $mimetype for split /\s+/, $ext;
}
close M;
}
sub MESSAGE { 1 }
sub ATTACHMENT { 2 }
sub MESSAGE_CLASS { 0x00078008 }
sub ATTACH_ATTACHMENT { 0x00069005 }
sub ATTACH_DATA { 0x0006800f }
sub ATTACH_FILENAME { 0x00018010 }
sub ATTACH_RENDDATA { 0x00069002 }
sub ATTACH_MODIFIED { 0x00038013 }
my $data; my @attachment; my $attachment; my $pos; my $badtnef;
sub winmail
{
sub read_message_attribute
{
my $type = unpack 'C', substr $data, $pos, 1;
return 0 unless defined $type && $type == MESSAGE; ++$pos;
my $id = unpack 'V', substr $data, $pos, 4; $pos += 4;
my $len = unpack 'V', substr $data, $pos, 4; $pos += 4;
++$badtnef, return 0 if $pos + $len > length $data;
my $buf = substr $data, $pos, $len; $pos += $len;
my $chk = unpack 'v', substr $data, $pos, 2; $pos += 2;
my $tot = unpack '%16C*', $buf;
++$badtnef unless $chk == $tot;
return $chk == $tot;
}
sub read_attribute_message_class
{
my $type = unpack 'C', substr $data, $pos, 1;
return unless defined $type && $type == MESSAGE;
my $id = unpack 'V', substr $data, $pos + 1, 4;
return unless $id == MESSAGE_CLASS; $pos += 5;
my $len = unpack 'V', substr $data, $pos, 4; $pos += 4;
++$badtnef, return if $pos + $len > length $data;
my $buf = substr $data, $pos, $len; $pos += $len;
my $chk = unpack 'v', substr $data, $pos, 2; $pos += 2;
my $tot = unpack '%16C*', $buf;
++$badtnef unless $chk == $tot;
}
sub read_attachment_attribute
{
my $type = unpack 'C', substr $data, $pos, 1;
return 0 unless defined $type && $type == ATTACHMENT; ++$pos;
my $id = unpack 'V', substr $data, $pos, 4; $pos += 4;
++$badtnef if $id == ATTACH_RENDDATA && @attachment && !exists $attachment->{body};
push @attachment, $attachment = {} if $id == ATTACH_RENDDATA;
my $len = unpack 'V', substr $data, $pos, 4; $pos += 4;
++$badtnef, return 0 if $pos + $len > length $data;
my $buf = substr $data, $pos, $len; $pos += $len;
my $chk = unpack 'v', substr $data, $pos, 2; $pos += 2;
my $tot = unpack '%16C*', $buf;
++$badtnef, return 0 unless $chk == $tot;
$attachment->{body} = $buf, $attachment->{size} = length $buf if $id == ATTACH_DATA;
$buf =~ s/\x00+$//, $attachment->{filename} = $buf, $attachment->{type} = $mimetype{($attachment->{filename} =~ /\.([^.]+)$/) || 'other'} || 'application/octet-stream' if $id == ATTACH_FILENAME && !exists $attachment->{filename};
my $fname; $attachment->{filename} = $fname, $attachment->{type} = $mimetype{($attachment->{filename} =~ /\.([^.]+)$/) || 'other'} || 'application/octet-stream' if $id == ATTACH_ATTACHMENT && ($fname = realname($buf));
use POSIX; sub word { unpack 'v', substr($_[0], $_[1] * 2, 2) }
$attachment->{modified} = strftime '%a, %d %b %Y %H:%M:%S +0000', gmtime mktime word($buf, 5), word($buf, 4), word($buf, 3), word($buf, 2), word($buf, 1) - 1, word($buf, 0) - 1900 if $id == ATTACH_MODIFIED;
return 1;
}
sub realname
{
my $buf = shift;
my $pos = index $buf, "\x1e\x00\x01\x30\x01"; return unless $pos >= 0; $pos += 8;
my $len = unpack 'V', substr($buf, $pos, 4); $pos += 4;
my $name = substr($buf, $pos, $len) or return;
$name =~ s/\x00+$//;
return $name;
}
my $m = shift;
$pos = 0; $data = body($m); @attachment = (); $badtnef = 0;
my $signature = unpack 'V', substr($data, $pos, 4); $pos += 4;
return $m unless $signature == 0x223E9F78;
my $key = unpack 'v', substr($data, $pos, 2); $pos += 2;
my $type = unpack 'C', substr($data, $pos, 1);
return $m unless $type == MESSAGE || $type == ATTACHMENT;
do {} while read_message_attribute();
read_attribute_message_class();
do {} while read_message_attribute();
do {} while read_attachment_attribute();
++$badtnef if @attachment && !exists $attachment->{body};
return ($badtnef) ? $m : map { newmail(%$_) } @attachment;
}
# Doco functions: usage and manpage (via $PAGER or as nroff or html)
$ENV{LANG} = 'C';
sub help
{
print
"usage: textmail [options]\n",
"options:\n",
" -h - Print the help message then exit\n",
" -m - Print the manpage then exit\n",
" -w - Print the manpage in html format then exit\n",
" -r - Print the manpage in nroff format then exit\n",
" -M - Output in mailbox format\n",
" -T - Output in raw mail format (for smtp)\n",
" -W - Don't replace MS Word attachments with text\n",
" -E - Don't replace MS Excel attachments with csv\n",
" -H - Don't replace HTML attachments with text\n",
" -R - Don't replace RTF attachments with text\n",
" -P - Don't replace PDF attachments with text\n",
" -U - Don't translate winmail.dat attachments\n",
" -L - Don't reduce appledouble attachments\n",
" -I - Don't delete image attachments\n",
" -A - Don't delete audio attachments\n",
" -V - Don't delete video attachments\n",
" -X - Don't delete MS Windows executable attachments\n",
" -B - Don't recode text that was base64-encoded\n",
" -S - Don't replace spaces in filenames with underscores\n",
" -Z - Do translate signed content (discards signatures)\n",
" -O - Delete all application/octet-stream attachments\n",
" -! - Delete all application/* attachments\n",
" -D hdrs - Delete headers (list of header prefixes and filenames)\n",
" -K types - Keep attachments (list of mimetypes and filenames)\n",
" -f - On translation error, keep translation, not original\n",
" -? - Print paths of helper applications then exit\n",
"\n",
"Filters a mail message or mbox, replacing MS Word, MS Excel, HTML, RTF and PDF\n",
"attachments with the plain text contained therein. By default, the following\n",
"attachments are also deleted: image, audio, video and MS Windows executables.\n",
"MS winmail.dat attachments are replaced by any attachments contained therein\n",
"which are then replaced by text or deleted in the same fashion. Any of these\n",
"actions can be suppressed with the command line options. Mail headers can also\n",
"be selectively deleted.\n";
exit;
}
sub man
{
my $noquotes = (`pod2man -h 2>&1` =~ /--quotes=/) ? '--quotes=none' : '';
system "pod2man $noquotes $0 | nroff -man | " . ($ENV{PAGER} || 'more');
exit;
}
sub nroff
{
my $noquotes = (`pod2man -h 2>&1` =~ /--quotes=/) ? '--quotes=none' : '';
system "pod2man $noquotes $0";
exit;
}
sub html
{
system "pod2html --noindex $0";
unlink glob 'pod2htm*';
exit;
}
# Parse command line
my %opt;
use Getopt::Std;
help unless getopts 'hmrwMTWEHRPLUIAVXBSZO!D:K:f?', \%opt;
help if exists $opt{h};
man if exists $opt{m};
nroff if exists $opt{r};
html if exists $opt{w};
my $mailbox = exists $opt{M};
my $raw = exists $opt{T};
die "textmail: The -M and -T options are incompatible\n" if $mailbox && $raw;
my $catdoc = find('catdoc');
my $antiword = find('antiword');
$antiword = $antiword ? $catdoc ? "$antiword|$catdoc" : $antiword : $catdoc;
my $xls2csv = find('xls2csv');
my $lynx = find('lynx');
my $pdftotext = find('pdftotext');
my $mktemp = find('mktemp');
paths() if exists $opt{'?'};
my @exe = qw(com exe pif dll ocx scr vbs js);
my @image = qw(gif jpg jpeg jpe png bmp tiff tif jp2 jpf jpm);
my @audio = qw(mp2 mp3 au aif wav ogg flac);
my @video = qw(mpeg mpg mpe qt mov avi mj2);
my $force = exists $opt{f};
my $remove_word = (defined $antiword || $force) && ! exists $opt{W};
my $remove_excel = (defined $xls2csv || $force) && ! exists $opt{E};
my $remove_html = (defined $lynx || $force) && ! exists $opt{H};
my $remove_rtf = (defined $catdoc || $force) && ! exists $opt{R};
my $remove_pdf = (defined $pdftotext || $force) && ! exists $opt{P};
my $remove_tnef = ! exists $opt{U};
my $remove_apple = ! exists $opt{L};
my $remove_images = ! exists $opt{I};
my $remove_audio = ! exists $opt{A};
my $remove_video = ! exists $opt{V};
my $remove_exe = ! exists $opt{X};
my $recode_base64_text = ! exists $opt{B};
my $replace_space = ' ' if exists $opt{S};
my $remove_signed = exists $opt{Z};
my $remove_octet = exists $opt{O};
my $remove_application = exists $opt{'!'};
my $remove_headers = exists $opt{D};
my @headers = get_file($opt{D}) if $remove_headers;
my $keep_attachments = exists $opt{K};
my @keep = get_file($opt{K}) if $keep_attachments;
my $removing = $remove_word || $remove_excel || $remove_html || $remove_rtf || $remove_pdf || $remove_tnef || $remove_apple || $remove_images || $remove_audio || $remove_video || $remove_exe || $recode_base64_text || $remove_signed || $remove_octet || $remove_application || $remove_headers || $mailbox || $raw;
chop(my $tmp = `$mktemp -dq /tmp/textmail.XXXXXX`) if $removing && defined $mktemp;
if (!$removing || (($? || !defined $tmp || ! -d $tmp) && !mkdir($tmp = "/tmp/textmail.$$", 0700)))
{
exec '/bin/cat' or print STDERR ''; # suppress warning
print do { undef $/; <STDIN> }; # slow cat if exec fails
exit;
};
# Filter mail message(s) on stdin into text on stdout
formail(sub { <> }, sub
{
my $m = mail2singlepart(textmail(mail2multipart(shift)));
delete_header($m, qr/(?:content-length|lines)/i);
delete $m->{mbox} if $raw;
print mail2str($mailbox ? mail2mbox($m) : $m);
});
rmdir $tmp or system "rm -rf $tmp";
BEGIN { $SIG{INT} = $SIG{QUIT} = $SIG{TERM} = sub { rmdir $tmp or system "rm -rf $tmp" if defined $tmp; exit } }
# Print paths to helper applications then exit
sub paths
{
print(defined $antiword ? $antiword : "antiword/catdoc not found: MS Word will not be translated", "\n");
print(defined $catdoc ? $catdoc : "catdoc not found: MS RTF will not be translated", "\n");
print(defined $xls2csv ? $xls2csv : "xls2csv not found: MS Excel with not be translated", "\n");
print(defined $lynx ? $lynx : "lynx not found: HTML will not be translated", "\n");
print(defined $pdftotext ? $pdftotext : "pdftotext not found: PDF will not be translated", "\n");
print(defined $mktemp ? $mktemp : "mktemp not found: insecure temp directory will be used", "\n");
exit;
}
# Translate a multipart mail message
sub textmail
{
my $entity = shift;
my $isapart = shift || 0;
my @parts = @{parts($entity)};
my $mbox = $entity->{mbox} if exists $entity->{mbox};
# Do nothing if this is encrypted (or signed unless -Z)
return $entity if isa($entity, qr/multipart\/encrypted/i);
return $entity if !$remove_signed && isa($entity, qr/multipart\/signed/i);
# Remove headers
delete_header($entity, qr/(?:@{[join '|', @headers]})[^:]*/i) if $remove_headers;
delete_header($entity, 'X-MS-TNEF-Correlator') if $remove_tnef;
# Reduce alternative text-versus-html to just the text
if ($remove_html && isa($entity, 'multipart/alternative') && @parts == 2)
{
if (isa($parts[0], 'text/plain') && isa($parts[1], 'text/html', qr/\.html?$/i) || isa($parts[1], 'text/plain') && isa($parts[0], 'text/html', qr/\.html?$/i))
{
my $plain = $parts[isa($parts[0], 'text/plain') ? 0 : 1];
@{$plain->{headers}} = (grep(!/^content-/i, @{$entity->{headers}}), grep { /^content-/i } @{$plain->{headers}});
%{$plain->{header}} = (map { ($_, $entity->{header}->{$_}) } grep { !/^content-/i } keys %{$entity->{header}}), (map { ($_, $plain->{header}->{$_}) } grep { /^content-/i } keys %{$plain->{header}});
$plain->{mbox} = $mbox if defined $mbox;
return debase64($plain);
}
}
# Reduce appledouble attachments to just the data fork attachment
if ($remove_apple && isa($entity, 'multipart/appledouble') && @parts == 2)
{
if (isa($parts[0], 'application/applefile'))
{
my $data = $parts[1];
@{$data->{headers}} = (grep(!/^content-/i, @{$entity->{headers}}), grep { /^content-/i } @{$data->{headers}});
%{$data->{header}} = (map { ($_, $entity->{header}->{$_}) } grep { !/^content-/i } keys %{$entity->{header}}), (map { ($_, $data->{header}->{$_}) } grep { /^content-/i } keys %{$data->{header}});
$data->{mbox} = $mbox if defined $mbox;
return mail2singlepart(textmail(mail2multipart($parts[1]), 0));
}
}
# Reduce signed attachments to just the signed data attachment
if ($remove_signed && isa($entity, 'multipart/signed') && @parts == 2)
{
if (isa($parts[1], param($entity, 'content-type', 'protocol')))
{
my $data = $parts[0];
@{$data->{headers}} = (grep(!/^content-/i, @{$entity->{headers}}), grep { /^content-/i } @{$data->{headers}});
%{$data->{header}} = (map { ($_, $entity->{header}->{$_}) } grep { !/^content-/i } keys %{$entity->{header}}), (map { ($_, $data->{header}->{$_}) } grep { /^content-/i } keys %{$data->{header}});
$data->{mbox} = $mbox if defined $mbox;
return mail2singlepart(textmail(mail2multipart($parts[0]), 0));
}
}
# Process parts
for (my $i = 0; $i < @parts; ++$i)
{
# Replace MS Word attachments with plain text (via antiword/catdoc)
if ($remove_word && isa($parts[$i], qr/.*ms-?word/i, qr/\.doc$/i))
{
$parts[$i] = translate($parts[$i], 'doc', 'txt', $antiword);
next;
}
# Replace MS Excel attachments with csv (via xls2csv)
if ($remove_excel && isa($parts[$i], qr/.*ms-?excel/i, qr/\.xls$/i))
{
$parts[$i] = translate($parts[$i], 'xls', 'csv', $xls2csv);
next;
}
# Replace HTML attachments with plain text (via lynx -dump)
if ($remove_html && isa($parts[$i], 'text/html', qr/\.html?$/i))
{
$parts[$i] = translate($parts[$i], 'html,htm', 'txt', (defined $lynx) ? "$lynx -dump -force_html" : undef);
next;
}
# Replace RTF attachments with plain text (via catdoc)
if ($remove_rtf && isa($parts[$i], qr/rtf/i, qr/\.rtf$/i))
{
$parts[$i] = translate($parts[$i], 'rtf', 'txt', $catdoc);
next;
}
# Replace PDF attachments with plain text (via pdftotext)
if ($remove_pdf && isa($parts[$i], qr/pdf/i, qr/\.pdf$/i))
{
$parts[$i] = translate($parts[$i], 'pdf', 'txt', $pdftotext);
next;
}
# Replace TNEF attachments with the attachments contained therein
if ($remove_tnef && isa($parts[$i], qr/ms-tnef/i, qr/winmail\.dat$/i))
{
my @a = winmail($parts[$i]);
my $failed = @a == 1 && $a[0] == $parts[$i];
@a = () if $failed && $force;
splice @parts, $i, 1, @a;
--$i if !$failed || $force;
next;
}
# Remove images, audio, video, MS Windows executables, octet streams, application/*
if (!protected($parts[$i]) &&
($remove_images && (mimetype($parts[$i]) =~ /^image\// || filename($parts[$i]) =~ /\.(?:@{[join '|', @image]})(?:\?=)?$/i) ||
$remove_audio && (mimetype($parts[$i]) =~ /^audio\// || filename($parts[$i]) =~ /\.(?:@{[join '|', @video]})(?:\?=)?$/i) ||
$remove_video && (mimetype($parts[$i]) =~ /^video\// || filename($parts[$i]) =~ /\.(?:@{[join '|', @audio]})(?:\?=)?$/i) ||
$remove_exe && mimetype($parts[$i]) =~ /^application\/octet-stream/ && filename($parts[$i]) =~ /\.(?:@{[join '|', @exe]})(?:\?=)?$/i ||
$remove_octet && mimetype($parts[$i]) =~ /^application\/octet-stream/ ||
$remove_application && mimetype($parts[$i]) =~ /^application\//))
{
splice @parts, $i--, 1;
next;
}
# Don't use base64 encoding for text
$parts[$i] = debase64($parts[$i]);
# Nest
$parts[$i]->{mime_message} = mail2singlepart(textmail(mail2multipart($parts[$i]->{mime_message}), 1)) if exists $parts[$i]->{mime_message};
$parts[$i] = textmail($parts[$i], 1) if exists $parts[$i]->{mime_parts};
}
# Replace original parts with processed parts
@parts = grep { exists $_->{mime_type} || exists $_->{body} } @parts;
parts($entity, \@parts);
return $entity;
}
# Do we need to keep this attachment?
sub protected
{
my $entity = shift;
return 0 unless @keep;
for my $spec (map { quotemeta } @keep)
{
return 1 if $spec =~ /\// && mimetype($entity) =~ /^$spec/i;
return 1 if $spec !~ /\// && mimetype($entity) =~ /^application\/$spec/i;
return 1 if $spec !~ /\// && filename($entity) =~ /\.$spec(?:\?=)?$/i;
}
return 0;
}
# Check if a part is of the desired type
sub isa
{
my $entity = shift;
my $type = shift;
my $ext = shift;
return
mimetype($entity) =~ /$type/i ||
mimetype($entity) =~ /^application\/$type/i ||
mimetype($entity) =~ /^application\/octet-stream/i &&
defined $ext && filename($entity) =~ /$ext/i;
}
# Return a translated part
sub translate
{
my $part = shift;
my @ext = split /,/, shift;
my $fmt = shift;
my $cmd = shift;
return $part if !defined $cmd && !$force;
my $origpath = filename($part);
$origpath =~ s/_+/$replace_space/g if defined $replace_space;
$origpath .= '.' . $ext[0] unless $origpath =~ /\.(?:@{[join '|', @ext, $fmt]})$/i;
my $textpath = $origpath;
$textpath =~ s/\.(?:@{[join '|', @ext]})$/.$fmt/i;
$textpath .= ".$fmt" if $textpath eq $origpath && $textpath !~ /\.$fmt$/i;
return newmail(filename => $textpath, body => '') if !defined $cmd && $force;
my $origdata = body($part);
open A, ">$tmp/$origpath" and do { print A $origdata; close A };
my $failed; $failed = $origpath ne $textpath && system($_ . ' ' . quotemeta("$tmp/$origpath") . ' > ' . quotemeta("$tmp/$textpath")) || -s "$tmp/$origpath" && -z "$tmp/$textpath" or last for split /\|/, $cmd;
unlink "$tmp/$origpath" unless $origpath eq $textpath;
unlink("$tmp/$textpath"), return $part if $failed && !$force;
$part = newmail(filename => "$tmp/$textpath"); unlink "$tmp/$textpath";
return $part;
}
# Recode base64-encoded text as 7bit or quoted-printable
sub debase64
{
my $entity = shift;
return $entity unless $recode_base64_text;
my $type = mimetype($entity);
return $entity unless $type =~ /^text\//i && encoding($entity) =~ /^base64$/i;
my $body = body($entity); $body =~ tr/\r//d;
my $name = filename($entity);
my $mbox = $entity->{mbox} if exists $entity->{mbox};
return newmail(type => $type, body => $body, (defined $name ? (name => $name) : ()), (defined $mbox ? (mbox => $mbox) : ()));
}
# Parse a data file
sub get_file
{
my $spec = shift;
my @list;
for my $pat (split /[,\s]+/, $spec)
{
if (-r $pat)
{
open HDRS, $pat or next;
while (<HDRS>)
{
s/#.*$//, s/^\s+//, s/\s+$//; next unless $_;
push @list, $_;
}
close HDRS;
}
else
{
push @list, $pat;
}
}
return @list;
}
# Locate a command in the $PATH
sub find
{
my $cmd = shift;
return $_ for (grep { -x $_ } map { "$_/$cmd" } split /:/, $ENV{PATH});
return undef;
}
# vi:set ts=4 sw=4:
syntax highlighted by Code2HTML, v. 0.9.1