#!/usr/local/bin/perl -w
#
# ARCHMBOX: a simple mailbox archiver.
# Copyright (C) 2001-2005 Alessandro Dotti Contra
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# For any questions related to this software, please write me at:
#
# Alessandro Dotti Contra
# v. Verne, 6
# 40128 Bologna ITALY
#
# or email me at: adotti@users.sourceforge.net
#==============================================================================
# Archive all selected messages from the specified mailbox(es).
#
# usage:
# archmbox [-h|--version]
# archmbox MODE [OPTIONS] -d <date> <mailbox> [<mailbox>, ...]
# archmbox MODE [OPTIONS] -o <days> <mailbox> [<mailbox>, ...]
#
# MODES
# -a, --archive archive mode: archive selected messages
# -k, --kill kill mode: delete selected mesages
# -l, --list list mode: list what messages will be
# selected
# -y, --copy copy mode: copy selected messages
#
# OPTIONS
# -b, --backup backup the original mailbox before execution
# --bzip2 compress the archive mailbox using bzip2 (use with -c)
# -c, --compress compress the archive mailbox
# -d, --date <date> treshold date for messages
# -D, --date-header use 'Date:' header to age the messages
# -e, --extension <extension> suffix for the archive mailbox, "none" means no
# extension (default: "archived")
# -f, --full-name prepend the path of the mailbox to the name of the
# archive mailbox
# --format <format> specify mailboxes format (default: mbox)
# Legal values are: mbox, mbx
# -h, --help prints help
# -i, --ignore <regexp> skip any mailbox/directory matching the regular
# expression when archiving
# --keep-flagged keep flagged messages
# --keep-unread keep unread messages
# -m, --minsize set the minimum mailbox size to be archived (in KB)
# --nosymlik do not follow symlinks when processing mailboxes
# --nowarnings suppress mailbox related warnings
# -o, --offset <days> offset (in days) from today for treshold date
# -1 means "all messages"
# --omit-prefix <prefix> omit <prefix> from archive mailbox name (use with -f)
# -p, --path <directory> where to store archive mailbox (full path)
# (default: ".")
# -r, --reverse offset means messages newer rather than older
# -R, --recursive act recursively on directories (implies -f)
# -t, --tmpdir <directory> directory to store temporary files (full path)
# --time <time> treshold time to refine treshold date
# --totals print an overall summary
# -v, --verbose <level> set the verbosity level (1 or 2) for list mode
# default is 1
# --version prints version number
# -x, --regexp <header=regexp> archive messages using regular expressions
# (can be specified more than once)
#
# <date> must be supplied using the following format: yyyy-mm-dd.
# <time> must be supplied using the following format: hh:mm:ss (24h)
# <mailbox> must be specified with full path.
#==============================================================================
# Shell helpers
#==============================================================================
my $FUSER = "/usr/sbin/lsof";
my $RM = "/usr/local/bin/rm";
my $CAT = "/usr/local/bin/cat";
my $CP = "/usr/local/bin/cp";
my $GZIP = "/usr/local/bin/gzip";
my $GUNZIP = "/usr/local/bin/gunzip";
my $BZIP2 = "/usr/local/bin/bzip2";
my $BUNZIP2 = "/usr/local/bin/bunzip2";
my $MBXCVT = "no";
#==============================================================================
# Compile time options and variables
#==============================================================================
my $TMP_DIR=""; # Working directory for temporary
# mailboxes
#==============================================================================
# Modules
#==============================================================================
use strict;
use File::Basename;
use Getopt::Long;
use Time::Local;
# Modules configuration
Getopt::Long::Configure qw(no_ignore_case);
#==============================================================================
# Functions
#==============================================================================
sub print_help(); # Print help
sub print_version(); # Print version
sub date_from_offset($); # Get a date from an offset
sub collect_regexp_rules($$); # Collect rules for regexp archiviation
sub match_regexp($$); # Verifies of a mesages should be regexp archived
sub get_mailboxes($$$$$$$); # Get mailboxes to parse for archiving
sub clean_header($); # Return name and address parts of an header field
sub check_mailbox_format($$); # Checks the format of the mailbox
sub convert_mailbox($$$$); # Convert mailbox between formats
sub mailbox_in_use($); # Determines if a mailbox is currently in use
sub cleanup(); # Clean all temporary files
sub is_flagged($); # Checks if a message is flagged
sub is_unread($); # Checks if a message is unread
sub print_summary(); # Print an overall summary
#==============================================================================
# Constants
#==============================================================================
#
# Last day of month
#
my %LAST_DAY = ( "01" => 31, "02" => 29, "03" => 31, "04" => 30,
"05" => 31, "06" => 30, "07" => 31, "08" => 31,
"09" => 30, "10" => 31, "11" => 30, "12" => 31
);
#
# Month convertion from char to digit
# Since the return value must be use with timelocal, the count starts
# from zero
#
my %MONTH_TO_DIGIT = ( Jan => 0, Feb => 1, Mar => 2, Apr => 3,
May => 4, Jun => 5, Jul => 6, Aug => 7,
Sep => 8, Oct => 9, Nov => 10, Dec => 11
);
#
# Patterns
#
my $PATTERN_NEW; # Pattern for messages begining
$PATTERN_NEW = '^From\s+(.*)\s+(...)\s+(...)\s+(\d+)\s(..:..:..)\s+(.*\s+)?(\d{4})';
# $1 = who
# $2 = day of week
# $3 = month
# $4 = day of month
# $5 = time
# $6 = timezone
# $7 = year
my $PATTERN_DATE; # Pattern for 'Date:' header
$PATTERN_DATE = '(\d+)\s(...)\s(\d{4})\s(..:..:..)';
# $1 = day of month
# $2 = month
# $3 = year
# $4 = time
#
# Version information
#
my $VERSION = "4.9.0";
#
# Formatting parameters
#
my $FMT_ID = "5"; # message id field's size
my $FMT_FROM = "25"; # from field's size
my $FMT_SUBJ = "42"; # subject field's size
my $FMT_DATE = "25"; # date field's size
#==============================================================================
# Parameters checking
#==============================================================================
#
# Options parsing
#
my $BACKUP; # Backup the original mailbox before execution
my $COMPRESS_BZIP; # Compress the archive mailbox using bzip2
my $DATE; # Treshold date for messages
my $DATE_HEADER; # Use 'Date:' header to age the message
my $DO_ARCHIVE; # Force archive mode, even if -l was specified
my $DO_COMPRESS; # Compress the archive mailbox
my $DO_COPY; # Copy rather than archive; this is a sub mode of archive
my $DO_LIST; # List only messages, do not archive
my $EXTENSION; # Suffix for the archive mailbox
my $FULL_NAME; # Prepend the path of the mailbox the the archive mailbox
my $FORMAT; # Format of maiboxes
my $IGNORE; # Skip these mailboxes/directories (regexp match)
my $DO_KILL; # Just kill messages instead of archiving them
my $KEEP_FLAGGED; # Keep flagged messages
my $KEEP_UNREAD; # Keep unread messages
my $NOSYMLINK; # Do not follow symlonks when processing mailboxes
my $NOWARNINGS; # Suppress mailbox related warnings
my $OFFSET; # Offset (in days) from today for treshold date
my $OFFSET_SENSE; # Older or newer messages
my $OMIT_PREFIX; # Prefix to omit from archive mailbox name when full path is used
my $PATH; # Where to store archive mailbox
my $PRINT_HELP; # Print help
my $PRINT_VERSION; # Print version number
my $RECURSIVE; # Recurse if directories are specified as arguments
my $MINSIZE; # Minimum size, in KB, of mailbox to be archived
my @REGEXP; # Regular expression for messages (array)
my $TIME; # Treshold time to refine treshold date
my $TOTALS; # Print an overall summary
my $VERBOSE; # Verbosity level
unless ( GetOptions (
"a" => \$DO_ARCHIVE,
"archive" => \$DO_ARCHIVE,
"b" => \$BACKUP,
"backup" => \$BACKUP,
"bzip2" => \$COMPRESS_BZIP,
"c" => \$DO_COMPRESS,
"compress" => \$DO_COMPRESS,
"d=s" => \$DATE,
"date=s" => \$DATE,
"D" => \$DATE_HEADER,
"date-header" => \$DATE_HEADER,
"e=s" => \$EXTENSION,
"extension=s" => \$EXTENSION,
"f" => \$FULL_NAME,
"full-name" => \$FULL_NAME,
"format=s" => \$FORMAT,
"h" => \$PRINT_HELP,
"help" => \$PRINT_HELP,
"i=s" => \$IGNORE,
"ignore=s" => \$IGNORE,
"k" => \$DO_KILL,
"kill" => \$DO_KILL,
"keep-flagged" => \$KEEP_FLAGGED,
"keep-unread" => \$KEEP_UNREAD,
"l" => \$DO_LIST,
"list" => \$DO_LIST,
"m=i" => \$MINSIZE,
"minsize=i" => \$MINSIZE,
"nosymlink" => \$NOSYMLINK,
"nowarnings" => \$NOWARNINGS,
"o=i" => \$OFFSET,
"offset=i" => \$OFFSET,
"omit-prefix=s" => \$OMIT_PREFIX,
"p=s" => \$PATH,
"path=s" => \$PATH,
"r" => \$OFFSET_SENSE,
"reverse" => \$OFFSET_SENSE,
"v=i" => \$VERBOSE,
"verbose=i" => \$VERBOSE,
"version" => \$PRINT_VERSION,
"R" => \$RECURSIVE,
"recursive" => \$RECURSIVE,
"x=s@" => \@REGEXP,
"y" => \$DO_COPY,
"copy" => \$DO_COPY,
"regexp=s@" => \@REGEXP,
"t=s" => \$TMP_DIR,
"tmpdir=s" => \$TMP_DIR,
"time=s" => \$TIME,
"totals" => \$TOTALS
))
{ print_help(); exit 1; }
if ($PRINT_HELP) { print_help(); exit 0; } # Print help
if ($PRINT_VERSION) { print_version(); exit 0; } # Print version
#
# Check for mandatory options
#
unless ($DATE xor $OFFSET) { die "You must specify a date (-d) OR an offset (-o).\n"; }
unless ($DO_ARCHIVE or $DO_COPY or $DO_LIST or $DO_KILL) { die "You must specify a mode!\n"; }
#
# Check for parameters
#
if ((scalar @ARGV) == 0) { die "Missing mailbox.\n"; }
#
# Get date (from offset if necessary)
#
$OFFSET = 0 unless ($OFFSET);
if ($OFFSET < -1) { die "Offset must be a positive integer (or -1 for all messages).\n"; }
$DATE = date_from_offset($OFFSET) if $OFFSET; # Treshold date specified as an offset
my $YEAR; my $MONTH; my $DAY;
if ($DATE =~ /(\d{4})-(\d{2})-(\d{2})/)
{
if (($2 < 01) || ($2 > 12)) { die "Month is not valid.\n"; }
if (($3 < 01) || ($3 > $LAST_DAY{$2})) { die "Day is not valid.\n"; }
$YEAR = $1;
$MONTH = $2;
$DAY = $3;
}
#
# Incorrect date format
#
else { die "Date was not specified in the right format!\n"; }
#
# Check time format if specified
#
my $HOUR = 0; my $MINUTE = 0; my $SECOND = 0;
if ($TIME) {
if ($TIME =~ /^(\d{2}):(\d{2}):(\d{2})$/) {
if (($1 < 0) || ($1 > 23)) { die "Hours are not valid.\n"; }
if (($2 < 0) || ($2 > 59)) { die "Minutes are not valid.\n"; }
if (($3 < 0) || ($3 > 59)) { die "Seconds are not valid.\n"; }
$HOUR = $1;
$MINUTE = $2;
$SECOND = $3;
}
else {
# Incorrect time format
die "Time was not specified in the right format!\n";
}
}
#
# Handle helpers for bzip2 compression
#
unless ($BZIP2 =~ /^\//)
{
warn "'bzip2' is not installed... using 'gzip' instead.\n";
$COMPRESS_BZIP = 0;
}
unless ($BUNZIP2 =~ /^\//)
{
warn "'bunzip2' is not installed... using 'gunzip' instead.\n";
$COMPRESS_BZIP = 0;
}
#
# Check archive path
#
if ($PATH)
{
unless ($PATH =~ /^\//) { die "Alternate path must be a full path.\n"; }
unless (-d $PATH) { die "The specified path is not valid... $PATH: $!\n"; }
}
#
# Check temporary directory path
#
if ($TMP_DIR)
{
unless ($TMP_DIR =~ /^\//) { die "$TMP_DIR: the temporary directory must be specified using full path.\n"; }
unless (-d $TMP_DIR) { die "The specified path for the temporary directory is not valid. $TMP_DIR: $!\n"; }
}
#
# IGNORE regexp
#
$IGNORE = 0 unless $IGNORE;
#
# Set verbosity level
#
$VERBOSE = 1 unless $VERBOSE;
#
# Set minimum file size
#
$MINSIZE = 0 unless $MINSIZE;
#
# Suppress mailbox related warnings
#
$NOWARNINGS = 0 unless $NOWARNINGS;
#
# Set running mode (archive, copy, kill, list).
# A better way should be found...
#
$DO_LIST = 0 unless ($DO_LIST && ! $DO_ARCHIVE && ! $DO_COPY && ! $DO_KILL);
$DO_KILL = 0 unless ($DO_KILL && ! $DO_ARCHIVE && ! $DO_COPY);
$DO_COPY = 0 unless ($DO_COPY && ! $DO_ARCHIVE);
#
# Regular expressions
#
foreach my $regexp_rule (@REGEXP)
{
if ($regexp_rule !~ /.*=.*/) { die "Invalid regexp rule given.\n"; }
}
# if no regexp has been specified, a default 'match all' regexp is used
push (@REGEXP, 'Subject=.*') unless scalar(@REGEXP);
#
# Offset sense... newer or older messages
#
$OFFSET_SENSE = -1 unless $OFFSET_SENSE;
#
# Examine all messages?
#
my $ALL_MEX = ( $OFFSET == -1 ? 1 : 0 );
#
# Suffix for the archive mailbox
#
if ($EXTENSION) {
if ($EXTENSION eq "none") { $EXTENSION = ""; }
else { $EXTENSION = ".$EXTENSION"; }
}
else { $EXTENSION = ".archived"; }
#
# Check if the prefix to omit is specified as a full path
#
if ($OMIT_PREFIX) {
if ($OMIT_PREFIX !~ /^\//) {
die "Prefix to omit must be specified as a full path!\n";
}
else {
# Windows system use "\" instad of "/".
# "\" need to be escaped to use $OMIT_PREFIX in regexps
$OMIT_PREFIX =~ s/(^|[^\\])\\([^\\]|$)/$1\\\\$2/g;
}
}
#
# Compression method
#
my $ZEXTENSION = ($COMPRESS_BZIP ? "bz2" : "gz");
my $COMPRESS = ($COMPRESS_BZIP ? $BZIP2 : $GZIP);
my $UNCOMPRESS = ($COMPRESS_BZIP ? $BUNZIP2 : $GUNZIP);
#
# Mailboxes format?
#
$FORMAT = "mbox" unless $FORMAT;
die "$FORMAT: unknown mailbox format.\n" unless $FORMAT =~ /^(mbox|mbx)$/;
die "mbx mailbox format is not supported.\n" if (($FORMAT eq "mbx") && ($MBXCVT eq "no"));
#
# Set total counters
#
my $total_parsed_mailboxes = 0; # Total parsed mailboxes
my $total_skipped_mailboxes = 0; # Total skipped mailboxes
my $total_inuse_mailboxes = 0; # Total in use/modified mailboxes (skipped)
my $total_invalid_mailboxes = 0; # Total invalid mailboxes (skipped)
my $total_nonexistent_mailboxes = 0; # Total non existent mailboxes
my $total_empty_mailboxes = 0; # Total empty mailboxes (skipped)
my $total_parsed_messages = 0; # Total parsed messages
my $total_messages_size = 0; # Total size (in bytes) of parsed messages
my $total_archived_messages = 0; # Total archived messages
my $total_saved_space = 0; # Total saved space (in bytes)
#
# Get mailbox(es)
#
my @MBOXES;
my ($v, $i, $s, $n, $e) =
get_mailboxes(\@MBOXES, \@ARGV, $FORMAT, $RECURSIVE, $MINSIZE, $NOSYMLINK, $IGNORE);
# Update total counters
$total_invalid_mailboxes += $i;
$total_skipped_mailboxes += $s;
$total_nonexistent_mailboxes += $n;
$total_empty_mailboxes += $e;
#==============================================================================
# Prepare for parsing...
#==============================================================================
#
# Compute treshold date for messages age
#
my $TRESHOLD_DATE = (timelocal($SECOND, $MINUTE, $HOUR, $DAY, ($MONTH - 1), ($YEAR - 1900)));
#
# Extract rule from regexp string
#
my %REGEXP_RULES; # Rules for regexp archiviation
collect_regexp_rules(\@REGEXP, \%REGEXP_RULES);
#
# Prepare output formats for listing mode
#
my $OUTPUT_CHANNEL = "STDOUT"; # Safe default
my $OLDFH; # Original output channel
#
# Use full name for the archive maibox(es) if recursion is specified
#
$FULL_NAME = 1 if $RECURSIVE;
#
# Use 'Date:' header to age messages
#
$DATE_HEADER = 0 unless $DATE_HEADER;
#==============================================================================
# Parse mailbox(es)
#==============================================================================
my %OD; # Output data. Used in formats if -l specified
my $messages; # Messages counter (per mailbox)
my $archived; # Archived messages counter (per mailbox)
my $MBOX; # Current mailbox
my $mbox_size; # Total size of the mailbox (bytes)
my $mbox_saved; # Saved space from the mailbox (bytes)
my $tmp_dir; # Temporary working directory
my $PID = $$; # PID of this process. We use it for the name of
# temporary mailboxes
my $SIZE_FACTOR = 1048576; # Bytes to Megabytes
$tmp_dir = ($TMP_DIR ? "$TMP_DIR" : "/tmp");
#
# Create temporary and backup mailboxes with 0600 permissions
#
umask(0177);
#
# Parse mailbox(es)
#
MAILBOX: foreach $MBOX (@MBOXES)
{
#
# Mailbox backup
#
if ($BACKUP) {
qx{ $CP \"$MBOX\" \"$MBOX.backup\" 2\> /dev/null };
if ($?) { die "Unable to create the $MBOX.backup backup file.\n"; }
}
#
# Check if the mailbox is in use
#
# 0: in use
# 1: not in use
# 2: error
my $check = mailbox_in_use($MBOX);
die "Cannot check mailbox's status. Quitting.\n" if ($check == 2);
unless ($check)
{
$NOWARNINGS || warn "\nMailbox $MBOX is in use. Skipped!\n\n";
$total_inuse_mailboxes++;
#
# If the maibox is in use, skip to the next one (if any)
#
next;
}
#
# Define the archive mailbox name
#
my $MBOX_NAME;
if ($FULL_NAME)
{
#
# We want to keep the path in the archive mailbox name
#
$MBOX_NAME = $MBOX;
# Omit the prefix if required
if ($OMIT_PREFIX) { $MBOX_NAME =~ s/^$OMIT_PREFIX//; }
# First we strip the leading "/" from the name
if ($MBOX_NAME =~ /^\//) { $MBOX_NAME = substr($MBOX_NAME, 1); }
# Some substitutions:
# "/" are replaced with "-"
# " " are replaced with "_"
$MBOX_NAME =~ tr/\//-/s;
$MBOX_NAME =~ tr/ /_/s;
# Mozilla style foders have ".sbd" as a suffix.
# We strip it!
$MBOX_NAME =~ s/\.sbd//gi;
}
else
{
$MBOX_NAME = basename($MBOX);
}
#
# Get some info about the mailbox
#
my ($mode, $uid, $gid, $mtime) = (stat $MBOX)[2, 4, 5, 9];
#
# Define temporary files to work on
#
my $TMP_MBOX = "$tmp_dir/" . "archmbox-$PID-" . basename($MBOX) . ".tmp";
$DO_COPY or my $KEPT_MBOX = "$tmp_dir/" . "archmbox-$PID-" . basename($MBOX) . ".kept";
$DO_KILL or my $SAVED_MBOX = "$tmp_dir/" . "archmbox-$PID-" . basename($MBOX) . ".saved";
$DO_KILL or my $ARCHIVED_MBOX = ($PATH ? "$PATH/$MBOX_NAME$EXTENSION" : "$MBOX_NAME$EXTENSION");
#
# Since no extension may be required for the archive mailbox, we have to check
# if we are trying to archive messages to the source mailbox itself.
# We check the inode numbers.
# This check is meaningless in list or kill mode.
#
unless ($DO_LIST or $DO_KILL) {
if ($ARCHIVED_MBOX && -e $ARCHIVED_MBOX && ((stat($MBOX))[1] == (stat($ARCHIVED_MBOX))[1])) {
die "Archive mailbox is the same as source. Quitting!\n";
}
}
#
# Create a temporary mailbox
#
convert_mailbox($MBOX, $FORMAT, "mbox", $TMP_MBOX) or die "Unable to create temporary mailbox.\n";
#
# Open maibox(es)
#
unless(open (MBOX, $TMP_MBOX)) {
warn "Unable to open $TMP_MBOX: $!\n";
cleanup();
die "Quitting now!\n";
}
unless($DO_LIST)
{
unless($DO_COPY) {
unless(open (KEPT_MBOX, ">$KEPT_MBOX")) {
warn "Unable to create $KEPT_MBOX: $!\n";
cleanup();
die "Quitting now!\n";
}
}
unless($DO_KILL)
{
unless(open (SAVED_MBOX, ">$SAVED_MBOX")) {
warn "Unable to create $SAVED_MBOX: $!\n";
cleanup();
die "Quitting now!\n";
}
}
}
my $ARCHIVE = 0; # Archive mode for messages
my $SKIPLINE = 0; # Empty line between message headers and body
$messages = 0;
$archived = 0;
$mbox_size = 0;
$mbox_saved = 0;
#
# Print headers if in list mode
#
if ($DO_LIST) {
print "\n";
printf "%-${FMT_ID}s ", "ID";
if ($VERBOSE > 1) {
printf "%-${FMT_DATE}s ", "DATE";
print "\n";
print " " x ($FMT_ID + 2)
}
printf "%-${FMT_FROM}s ", "FROM";
printf "%-${FMT_SUBJ}s ", "SUBJECT";
print "\n";
print "-" x 80;
print "\n\n";
}
while (<MBOX>)
{
my $line = $_;
my $line_length = length $line;
$mbox_size += $line_length;
chomp $line;
if ( $line =~ /$PATTERN_NEW/) # This is a new message, read headers
{
$messages++;
my $headers_size = 0;
$headers_size += $line_length;
chomp (my $line = $_); # This is not $line from the loop outside!
#
# COLLECT MESSAGE'S HEADERS
#
my @MESSAGE_HEADER; # Message headers collected here
my %HEADER_INFORMATION; # "Structure" filled with parsed header data,
# this is used by regexp based archiving
# Safe default must be provided for the following headers, since they are always printed
# in list mode. They must not be undefined; a meaninfull value should override the default
$HEADER_INFORMATION{'From'} = "(undefined)";
$HEADER_INFORMATION{'Date'} = "(undefined)";
$HEADER_INFORMATION{'Subject'} = "(undefined)";
# Collect this header.
# We use an odd key for the hash since the keyword 'From' would be
# otherwise duplicated.
$HEADER_INFORMATION{'_From'} = $line;
#
# Save the first line of the message
#
push(@MESSAGE_HEADER, $line);
#
# Parse headers
#
my $current_header = 'none';
while (<MBOX>)
{
my $line_length = length $_;
$mbox_size += $line_length;
$headers_size += $line_length;
chomp;
push (@MESSAGE_HEADER, $_ );
# Collect header. We must consider repeated headers (ie: 'Received')
# or folding headers.
if(/(.*?):\s(.*)/) {
# A header. Let's see if it's unique or repeated
if ($1 eq $current_header) { $HEADER_INFORMATION{$1} .= " $2"; }
else {
$HEADER_INFORMATION{$1} = $2;
$current_header = $1;
}
}
elsif (/^\s+(.+)$/) {
# This is part of a folding header
$HEADER_INFORMATION{$current_header} .= " $1";
}
#
# Message content begins, headers are over
#
($SKIPLINE = 1, last) if (/^$|^\r$/);
}
# Get some date/time informations from the message
my $year; my $month; my $day; my $time;
if ($DATE_HEADER && $HEADER_INFORMATION{'Date'} =~ /$PATTERN_DATE/) {
# Use the "Date:" header to get date/time infos
# but only if it is not corrupt
#
$year = ($3 ? $3 : "");
$month = ($2 ? $2 : "");
$day = ($1 ? $1 : "");
$time = ($4 ? $4 : "");
}
else {
# use the beginning of the message (the first line)
#
$HEADER_INFORMATION{'_From'} =~ /$PATTERN_NEW/;
$year = ($7 ? $7 : "");
$month = ($3 ? $3 : "");
$day = ($4 ? $4 : "");
$time = ($5 ? $5 : "");
}
# Get time of the message
my $hh = 0; my $mm = 0; my $ss = 0;
if ($time =~ /^(\d{2}):(\d{2}):(\d{2})$/) {
$hh = $1;
$mm = $2;
$ss = $3;
}
# Get age of the message
my $message_date = timelocal($ss, $mm, $hh, $day, $MONTH_TO_DIGIT{$month}, ($year - 1900));
#
# Should the message be archived?
# (this is based on the from line)
#
$ARCHIVE = 0;
my $compar = $message_date <=> $TRESHOLD_DATE;
$ARCHIVE = 1 if ($ALL_MEX || ($compar == 0 || ($compar == $OFFSET_SENSE)));
#
# Keep flagged messages?
#
if ($ARCHIVE) {
$ARCHIVE = 0 if ($KEEP_FLAGGED && is_flagged(\%HEADER_INFORMATION));
}
#
# Keep unread messages?
#
if ($ARCHIVE) {
$ARCHIVE = 0 if ($KEEP_UNREAD && is_unread(\%HEADER_INFORMATION));
}
#
# Regexp archiving?
#
if ($ARCHIVE)
{
$ARCHIVE = 0 unless match_regexp(\%HEADER_INFORMATION, \%REGEXP_RULES);
}
#
# Messages with mailbox's internal data MUST always be kept.
# We decrease the total messages count as we want this to be
# transparent for the user.
# (it's pretty dirty and may change in future releases)
#
if ($HEADER_INFORMATION{'From'} =~ /Mail System Internal Data/) {
$ARCHIVE = 0;
$messages--;
}
#
# Increase counters
#
$archived++ if $ARCHIVE;
$mbox_saved += $headers_size if $ARCHIVE;
unless ( $DO_LIST )
{
#
# Manage headers
#
if ($ARCHIVE)
{
foreach (@MESSAGE_HEADER)
{
if (! $DO_KILL) {
unless(print SAVED_MBOX $_ , "\n") {
cleanup();
die "Error while archiving message header.\n";
}
}
}
}
else
{
foreach (@MESSAGE_HEADER)
{
if (! $DO_COPY) {
unless(print KEPT_MBOX $_ , "\n") {
cleanup();
die "Error while keeping message header.\n";
}
}
}
}
}
if ($DO_LIST)
{
if ($ARCHIVE) {
printf "%-${FMT_ID}s ", $messages;
if ($VERBOSE > 1) {
printf "%-${FMT_DATE}s ", substr($HEADER_INFORMATION{'Date'}, 0, $FMT_DATE);
print "\n";
print " " x ($FMT_ID + 2)
}
printf "%-${FMT_FROM}s ", substr(clean_header($HEADER_INFORMATION{'From'}), 0, $FMT_FROM);
printf "%-${FMT_SUBJ}s", substr(clean_header($HEADER_INFORMATION{'Subject'}), 0, $FMT_SUBJ);
print "\n";
}
}
undef @MESSAGE_HEADER;
undef %HEADER_INFORMATION;
}
#
# If we already processed $line in inner loop
#
if ($SKIPLINE) { $SKIPLINE = 0; goto ENDLINE; }
# We do not (currently) provide option to list message contents, so this is
# outside the above loop (and no fear from large mbox files)
unless ( $DO_LIST )
{
if ($ARCHIVE)
{
if (! $DO_KILL) {
unless(print SAVED_MBOX "$line\n") {
cleanup();
die "Error while archiving message body.\n";
}
}
}
else
{
if (! $DO_COPY) {
unless(print KEPT_MBOX "$line\n") {
cleanup();
die "Error while keeping message body.\n";
}
}
}
}
$mbox_saved += $line_length if $ARCHIVE;
ENDLINE:
}
#
# Mailbox parsing is over, print summary
#
#
# Print grand total for the mailbox
#
print "\n";
printf "Mailbox $MBOX (%5.2f MB)\n", ($mbox_size / $SIZE_FACTOR);
SWITCH:
{
# Slightly different output for different modes...
if ($DO_COPY)
{
printf "Copied $archived messages out of $messages (%5.2f MB)\n", ($mbox_saved / $SIZE_FACTOR);
last SWITCH;
}
if ($DO_LIST)
{
printf "For archive $archived messages out of $messages (%5.2f MB)\n", ($mbox_saved / $SIZE_FACTOR);
last SWITCH;
}
if ($DO_ARCHIVE)
{
printf "Archived $archived messages out of $messages (%5.2f MB)\n", ($mbox_saved / $SIZE_FACTOR);
last SWITCH;
}
if ($DO_KILL)
{
printf "Deleted $archived messages out of $messages (%5.2f MB)\n", ($mbox_saved / $SIZE_FACTOR);
last SWITCH;
}
}
unless ($DO_LIST || $DO_KILL) {
# Print target mailbox
print "Archive mailbox: $ARCHIVED_MBOX\n";
}
close (MBOX);
unless ($DO_LIST)
{
$DO_COPY or close (KEPT_MBOX);
$DO_KILL or close (SAVED_MBOX);
}
#
# Update mailbox (not in listing mode)
#
unless ($DO_LIST)
{
#
# Is the mailbox in use now or was it modified during operations?
# It's meaningless in copy mode.
unless ($DO_COPY) {
# Check if the mailbox is in use or not
#
# 0: in use
# 1: not in use
# 2: error
#
my $not_used = mailbox_in_use($MBOX);
die "Cannot check mailbox's status. Quitting.\n" if $not_used == 2;
# Check if the mailbox was modified or not
#
my $not_modified = ($mtime == (stat $MBOX)[9]) ? 1: 0;
unless ($not_used and $not_modified)
{
$NOWARNINGS || warn "Mailbox $MBOX was modified or is in use now.\n";
$NOWARNINGS || warn "It's not safe to complete the operation...\n";
# Update counters
$total_skipped_mailboxes++;
#
# Remove temporary mailboxes
#
cleanup();
next MAILBOX;
}
}
#
# If messages were archived:
# replace the original mailbox (not in copy mode) and
# create/update the archive mailbox (not in kill mode)
if ($archived)
{
#
# If kill mode is selected, there's nothing to archive.
#
unless ($DO_KILL)
{
if ($DO_COMPRESS && -e "$ARCHIVED_MBOX.$ZEXTENSION") {
qx{$UNCOMPRESS \"$ARCHIVED_MBOX.$ZEXTENSION\" 2\> /dev/null };
if ($?) {
cleanup();
die "Unable to uncompress the $ARCHIVED_MBOX.$ZEXTENSION file.\n";
}
}
qx{ $CAT \"$SAVED_MBOX\" >> \"$ARCHIVED_MBOX\" 2\> /dev/null };
if ($?) {
cleanup();
die "Unable to merge data from $SAVED_MBOX to $ARCHIVED_MBOX.";
}
# Restore permissions and ownership
chown $uid, $gid, $ARCHIVED_MBOX;
chmod $mode, $ARCHIVED_MBOX;
if ($DO_COMPRESS) {
qx{$COMPRESS \"$ARCHIVED_MBOX\" 2\> /dev/null };
if ($?) {
cleanup();
die "Unable to compress the $ARCHIVED_MBOX file.\n";
}
# Set correct ownership for the compressed mailbox
chown $uid, $gid, $ARCHIVED_MBOX.$ZEXTENSION;
}
}
#
# Replace the original mailbox (but not in copy mode).
#
unless($DO_COPY) {
# If $MBOX format is mbx we need to convert $KEPT_MBOX before
# substitution is performed.
#
if ($FORMAT eq "mbx")
{
unless(convert_mailbox($KEPT_MBOX, "mbox", "mbx", "$KEPT_MBOX.mbx")) {
cleanup();
die "Unable to convert $KEPT_MBOX to $FORMAT format. Quitting!\n";
}
$KEPT_MBOX = "$KEPT_MBOX.mbx";
}
#
# Substitute original mailbox and restore ownership and permissions
#
qx{$CP \"$KEPT_MBOX\" \"$MBOX\" 2\> /dev/null };
if ($?) {
cleanup();
die "Unable to replace the $KEPT_MBOX temporary file.\n";
}
chown $uid, $gid, $MBOX;
chmod $mode, $MBOX;
qx{$RM \"$KEPT_MBOX\" 2\> /dev/null };
if ($?) {
die "Unable to remove $KEPT_MBOX temporary file.\n";
}
}
}
}
#
# Update total counters
#
$total_parsed_mailboxes++;
$total_parsed_messages += $messages;
$total_archived_messages += $archived;
$total_messages_size += $mbox_size;
$total_saved_space += $mbox_saved;
#
# Remove temporary mailboxes
#
cleanup();
}
print_summary() if $TOTALS;
exit 0;
#==============================================================================
# Functions
#==============================================================================
sub print_help()
{
#
# Print help
#
print <<__HELP__;
Archmbox: a simple mailbox archiver.
Archive all selected messages from the specified mailbox(es).
usage:
archmbox [-h|--version]
archmbox MODE [OPTIONS] -d <date> <mailbox> [<mailbox>, ...]
archmbox MODE [OPTIONS] -o <days> <mailbox> [<mailbox>, ...]
MODES
-a, --archive archive mode: archive selected messages
-k, --kill kill mode: delete selected mesages
-l, --list list mode: list what messages will be
selected
-y, --copy copy mode: copy selected messages
OPTIONS
-b, --backup backup the original mailbox before execution
--bzip2 compress the archive mailbox using bzip2
(use with -c)
-c, --compress compress the archive mailbox
-d, --date <date> treshold date for messages
-D, --date-header use 'Date:' header to age the messages
-e, --extension <extension> suffix for the archive mailbox, "none" means no
extension (default: archived)
-f, --full-name prepend the path of the mailbox to the name of
the archive mailbox
--format <format> specify mailboxes format (default: mbox)
Legal values are: mbox, mbx
-h, --help prints help
-i, --ignore <regexp> skip any mailbox/directory matching the regular
expression when archiving
--keep-flagged keep flagged messages
--keep-unread keep unread messages
-m, --minsize the minimum mailbox size to be archived (KB)
--nosymlink do not follow symlinks when processing mailboxes
--nowarnings suppress mailbox related warnings
-o, --offset <days> offset (in days) from today for treshold date
-1 means "all messages"
--omit-prefix <prefix> omit <prefix> from archive mailbox name
(use with -f)
-p, --path <directory> where to store archive mailbox (full path)
(default: ".")
-r, --reverse offset means messages newer rather than older
-R, --recursive act recursively on directories (implies -f)
-t, --tmpdir <directory> directory to store temporary files (full path)
--time <time> treshold time to refine treshold date
--totals print an overall summary
-v, --verbose <level> set the verbosity level (1 or 2) for list mode
default is 1
--version prints version number
-x, --regexp <header=regexp> archive messages using regular expressions
(can be specified more than once)
<date> must be supplied using the following format: yyyy-mm-dd
<time> must be supplied using the following format: hh:mm:ss (24h)
<mailbox> must be specified with full path.
Report bugs to <adotti\@users.sourceforge.net>
__HELP__
}
sub print_version()
{
#
# Print script version
#
print "Archmbox $VERSION\n";
}
sub date_from_offset($)
{
#
# Return the date as today - offset
#
my $SECS_X_DAY = 86400; # Seconds in a day
my $offset = (shift @_);
my $offset_secs = $offset * $SECS_X_DAY;
my ($mday, $mon, $year, $yday) = (localtime(time - $offset_secs))[3,4,5,7];
$year += 1900;
$mon += 1;
$mon = "0" . $mon if ($mon =~ /^\d$/);
$mday = "0" . $mday if ($mday =~ /^\d$/);
return "$year-$mon-$mday";
}
sub collect_regexp_rules($$)
{
#
# Collect rules for regexp archiviation
#
my $regexpref = $_[0]; # Array with the rules (raw format)
my $hashref = $_[1];
my $p; # Position of the rule delimiter
my $r_field; # Filed on which the regexp must be applied
my $r_regexp; # Regexp pattern
for (my $i = 0; $i < scalar(@$regexpref); $i++)
{
$p = index($$regexpref[$i], "=");
$r_field = substr($$regexpref[$i], 0, $p);
$r_regexp = substr($$regexpref[$i], $p+1);
# More than one rule can be specified for a single header, so we use an array for the patterns
push (@{$$hashref{$r_field}}, $r_regexp);
}
}
sub match_regexp($$)
{
#
# Verifies if a message should be regexp archived
#
my $headerref = $_[0];
my $regexpref = $_[1]; # Hash of arrays; keys are the headers
foreach my $field (keys %$regexpref)
{
next unless exists $$headerref{$field};
for (my $i = 0; $i < scalar(@{$$regexpref{$field}}); $i++)
{
my $rule = ${$$regexpref{$field}}[$i];
return 1 if $$headerref{$field} =~ /$rule/i;
}
}
return 0;
}
sub get_mailboxes($$$$$$$)
{
#
# Get mailboxes to parse for archiving
#
# Returns the number of valid/invalid/skipped/non existent/empty mailboxes
my $mboxesref = $_[0]; # Store mailboxes names and paths (array)
my $argsref = $_[1]; # Arguments (array)
my $format = $_[2]; # Format of the mailboxes
my $recursion = $_[3]; # Is recursion active?
my $minsize = $_[4]; # Minimum size of the mailbox
my $nosymlink = $_[5]; # Follow the symbolic links?
my $ignore = $_[6]; # Which mailboxes/directories must be ignored
my $size = 0;
my $KBsize = 0;
my $valid = 0; # Valid mailboxes
my $invalid = 0; # Invalid mailboxes
my $skipped = 0; # Skipped mailboxes
my $nonexistent = 0; # Non existent mailboxes
my $empty = 0; # Empty mailboxes
foreach my $mailbox (@$argsref)
{
# Full path must be used...
if ($mailbox !~ /^\//) {
$NOWARNINGS || warn "$mailbox: use full path!\n";
$invalid++;
next;
}
# Check if the mailbox is a symbolic link and if we have to process it
if ((-l $mailbox) && $nosymlink) {
$NOWARNINGS || warn "$mailbox is a symbolic link... skipped!\n";
$skipped++;
next;
}
# Check if the mailbox has to be ignored
#
if ($ignore && ($mailbox =~ /$ignore/)) {
$NOWARNINGS || warn "$mailbox will be ignored as requested.\n";
$skipped++;
next;
}
stat ($mailbox);
# Skip to next mailbox if the mailbox doesn't exists
unless (-e _) {
$NOWARNINGS || warn "$mailbox does not exists!\n";
$nonexistent++;
next;
}
if (-f _)
{
# Skip to next mailbox if the mailbox is empty
unless (-s _) {
$NOWARNINGS || warn "$mailbox is empty... skipped!\n";
$empty++;
next;
}
# Check mailbox size: if the size is smaller than $minsize
# skip to the next mailbox
$size = -s _;
$KBsize = int ($size/1024);
if ($KBsize < $minsize ) {
$NOWARNINGS || warn "$mailbox is smaller than $minsize KB ...Skipped!\n";
$skipped++;
next;
}
# Check mailbox format: if the format is not correct or the check
# cannot be performed skip to the next mailbox
my $check = check_mailbox_format($mailbox, $format);
if ($check == 0) {
$NOWARNINGS || warn "$mailbox: not a valid $format mailbox.\n";
$invalid++;
next;
}
if ($check == 2) {
$NOWARNINGS || warn "$mailbox: unable to check format.\n";
$invalid++;
next;
}
push (@$mboxesref, $mailbox);
$valid++;
}
elsif (-d _ && $recursion)
{
opendir DIR, $mailbox;
my @mailboxes = map "$mailbox/$_", grep !/^\./, readdir DIR;
my ($v, $i, $s, $n, $e) = get_mailboxes($mboxesref, \@mailboxes, $format, $recursion, $minsize, $nosymlink, $ignore);
$valid += $v;
$invalid += $i;
$skipped += $s;
$nonexistent += $n;
$empty += $e;
}
else
{
$NOWARNINGS || warn "$mailbox is not a valid mailbox.\n";
$invalid++;
}
}
return $valid, $invalid, $skipped, $nonexistent, $empty;
}
sub clean_header($)
{
#
# Return name and address parts of an header field
#
my $header = $_[0]; # Header field with address information
# Clean header
$header =~ tr/\"//d;
$header =~ s/=\?iso-.*?\?q\?//;
$header =~ s/\?=//;
# specific coded chars replacement
#
$header =~ s/=20/ /;
return $header;
}
sub check_mailbox_format($$)
{
#
# Checks the format of the mailbox
# returns 1 if the format is correct, 0 if the format is wrong, 2 if an error occours
#
my $mbox = $_[0]; # Mailbox
my $format = $_[1]; # Mailbox format
my $data;
# Open the mailbox
open (MBOX, "$mbox") or return 2; # Error: unable to open file
if ($format eq "mbox") { sysread(MBOX, $data, 5); close(MBOX); return $data eq "From "; }
if ($format eq "mbx") { sysread(MBOX, $data, 5); close(MBOX); return $data eq "*mbx*"; }
}
sub convert_mailbox($$$$)
{
#
# Convert mailbox between formats. A new mailbox is created.
#
my $mailbox = $_[0]; # Mailbox
my $format = $_[1]; # Format of the mailbox
my $new_format = $_[2]; # Format of temporary mailbox
my $new_mailbox = $_[3]; # Temporary mailbox
SWITCH:
{
#
# 2 mbox conversion
#
if ($new_format eq "mbox")
{
if ($format eq "mbox") {
qx{ $CP \"$mailbox\" \"$new_mailbox\" 2\> /dev/null };
}
if ($format eq "mbx")
{
if (basename($MBXCVT) eq "mbxcvt")
{
qx{ $MBXCVT \"$mailbox\" unix \"$new_mailbox\" 2\> /dev/null };
}
if (basename($MBXCVT) eq "mailutil")
{
qx{ $MBXCVT copy \"$mailbox\" \"#driver.unix:$new_mailbox\" 2\> /dev/null };
}
}
last SWITCH;
}
#
# 2 mbx conversion
#
if ($new_format eq "mbx")
{
if ($format eq "mbx") {
qx{ $CP \"$mailbox\" \"$new_mailbox\" 2\> /dev/null };
}
if ($format eq "mbox")
{
if (basename($MBXCVT) eq "mbxcvt")
{
qx{ $MBXCVT \"$mailbox\" mbx \"$new_mailbox\" 2\> /dev/null };
}
if (basename($MBXCVT) eq "mailutil")
{
qx{ $MBXCVT copy \"$mailbox\" \"#driver.mbx:$new_mailbox\" 2\> /dev/null };
}
}
last SWITCH;
}
}
return -f $new_mailbox;
}
sub mailbox_in_use($) {
# Determines if a mailbox is currently in use e.g. opened by another program.
#
# $_[0]: mailbox to check
#
# Returns:
# 0 if mailbox is in use
# 1 if the mailbox is not in use
# 2 if some error occurs
my $mbox = $_[0];
# This should not happen, but who's to say?
return 2 if $FUSER eq "no";
# We use fuser to check if the mailbox is in use
if ($FUSER =~ /.*fuser$/) {
return system("$FUSER -s \"$mbox\"");
}
elsif ($FUSER =~ /.*lsof$/) {
return system("$FUSER \"$mbox\" 1>/dev/null 2>&1");
}
elsif ($FUSER =~ /.*fstat$/) {
#
# fstat support under *BSD systems
#
# Weird return, but faster than a switch (?)
# Explanation :
# x = fuser(...)|wc -l
# if x == 0 then error
# if x == 1 then file not used
# if x > 1 then file in use
#
my $res = 2 - qx($FUSER \"$mbox\" | /usr/bin/wc -l);
return ($res + abs($res)) / 2;
}
else { return 2; }
}
sub cleanup() {
# Clean all temporary files.
# Returns 0 upon success, 1 otherwise.
qx{ $RM $tmp_dir/*$PID* 2\> /dev/null };
if($?) {
warn "Unable to remove temporary files. Clean $tmp_dir by hand.\n";
return 1;
}
return 0;
}
sub is_unread($) {
# Check if a message is unread.
#
# $_[0]: headers information (reference)
#
# Returns 1 if the message is unread
my $headerref = $_[0];
# No 'Status' header. Assume the message is unread.
return 1 unless exists $$headerref{'Status'};
# Check the status
return 1 if $$headerref{'Status'} !~ /R.+/;
return 0
}
sub is_flagged($) {
# Check if a message is flagged.
#
# $_[0]: headers information (reference)
#
# Returns 1 if the message is flagged
my $headerref = $_[0];
# No 'X-Status' header. Assume the message is not flagged
return 0 unless exists $$headerref{'X-Status'};
# Check the flag
return 1 if $$headerref{'X-Status'} =~ /F/;
return 0
}
sub print_summary() {
#
# Print overall summary.
#
# Counters and mode are read directly from global variables.
# Define the action
my $action = "";
SWITCH:
{
if( $DO_LIST ) { $action = "For archive"; last SWITCH; }
if( $DO_ARCHIVE ) { $action = "Archived"; last SWITCH; }
if( $DO_KILL ) { $action = "Deleted"; last SWITCH; }
if( $DO_COPY ) { $action = "Copied"; last SWITCH; }
}
print "\n\n";
print "Overall summary\n";
print "=" x 50, "\n";
printf "%-30s%20d\n", "Parsed mailboxes:", $total_parsed_mailboxes;
printf "%-30s%20d\n", "Skipped mailboxes:", $total_skipped_mailboxes;
printf "%-30s%20d\n", "Mailboxes in use:", $total_inuse_mailboxes;
printf "%-30s%20d\n", "Invalid mailboxes:", $total_invalid_mailboxes;
printf "%-30s%20d\n", "Non existent mailboxes:", $total_nonexistent_mailboxes;
printf "%-30s%20d\n", "Empty mailboxes:", $total_empty_mailboxes;
printf "%-30s%20d\n", "Parsed messages:", $total_parsed_messages;
printf "%-30s%20.2f\n", "Total used space (MB):", $total_messages_size / $SIZE_FACTOR;
printf "%-30s%20d\n", "$action messages:", $total_archived_messages;
printf "%-30s%20.2f\n", "Total saved space (MB):", $total_saved_space / $SIZE_FACTOR;
print "=" x 50, "\n";
print "\n";
return 0;
}
syntax highlighted by Code2HTML, v. 0.9.1