#!/usr/bin/ruby
#
# A script to split an mbox file into two files, each holding a randomly
# chosen subset of the emails. Used to create test cases
# for trying out different sets of filter parameters.
#
begin
  require 'md5' # Ruby 1.8 standard library; removed in Ruby 1.9
rescue LoadError
  require 'digest/md5'
end
# Usage: splitmail percentage outfileprefix [filename...]
#
#   percentage    - integer; chance out of 100 that a copied message goes to
#                   "<outfileprefix>.1" instead of "<outfileprefix>.2"
#   outfileprefix - path prefix of the two output files (both are truncated)
#   filename...   - mbox files to read, processed in order
#
# A message is dropped rather than copied when its digest was already written
# (duplicates are tracked across all input files), when it is the
# folder-internal placeholder message, or when it loses a random draw against
# skip_pct. One summary line per input file is printed to stdout.
if ARGV.length < 2
  STDERR.puts "usage: splitmail percentage outfileprefix [filename...]"
  exit 1
end

# Stop reading an input file once this many of its messages have been written.
max_messages = 8000
# Body lines feed the digest only while the accumulated message (separator
# line and headers included) is shorter than this many bytes.
body_digest_limit = 4096
# Subject of the folder bookkeeping message that must never be copied.
ignore_exp = /^Subject:\s+DON.T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA/
# Header lines that take part in the duplicate-detection digest.
header_exp = /(^To:)|(^From:)|(^Date:)|(^Subject:)|(^Cc:)/
# mbox "From " separator line: sender followed by a ctime-style timestamp.
from_exp = /^From\s+\S+\s+\w\w\w\s+\w\w\w\s+\d\d?\s+\d\d\s*:\s*\d\d\s*:\s*\d\d\s+\d\d\d\d/
# Percentage (5..14, chosen once per run) of messages that are randomly dropped.
skip_pct = 5 + rand(10)
pct = ARGV.shift.to_i
prefix = ARGV.shift

files = []
begin
  # Binary mode: mail is not guaranteed to be valid UTF-8, and the bytes must
  # be copied through untouched.
  files << File.open(prefix + ".1", "wb")
  files << File.open(prefix + ".2", "wb")
  first = [true, true]
  digests = {}

  # Copies one complete message to a randomly chosen output file.
  # Returns nil for an empty message, false when the message is dropped
  # (duplicate, folder-internal or randomly skipped) and true when written.
  write_message = lambda do |message, digest, ignore_it|
    if message.empty?
      nil
    elsif digests.key?(digest) || ignore_it || rand(100) < skip_pct
      false
    else
      # rand(100) is 0..99, so "< pct" sends exactly pct percent to file 1.
      filenum = rand(100) < pct ? 0 : 1
      if first[filenum]
        first[filenum] = false
      else
        files[filenum].print("\n")
      end
      files[filenum].print(message)
      # Remember every written message, including the last one of a file, so
      # that later copies are recognised in this and all following files.
      digests[digest] = true
      true
    end
  end

  ARGV.each do |filename|
    msg_count = 0
    # Counts every non-empty message that was not written: real duplicates,
    # the folder-internal message and randomly skipped messages.
    dup_count = 0
    digester = Digest::MD5.new
    message = String.new
    # Starts out true so the very first line may open a message. It is set
    # again by any blank line and cleared only when a new message starts.
    on_blank = true
    in_body = false
    ignore_it = false

    # Hands the message collected so far to write_message and updates the
    # per-file counters.
    finish = lambda do
      outcome = write_message.call(message, digester.hexdigest, ignore_it)
      msg_count += 1 if outcome == true
      dup_count += 1 if outcome == false
    end

    File.open(filename, "rb") do |input|
      input.each_line do |line|
        if on_blank && line =~ from_exp
          finish.call
          message = String.new
          on_blank = false
          in_body = false
          ignore_it = false
          digester = Digest::MD5.new
          break if msg_count >= max_messages
        elsif line.chomp.empty?
          # chomp also strips "\r\n", so CRLF mailboxes are split correctly.
          on_blank = true
          in_body = true
        end

        if in_body
          digester.update(line) if message.length < body_digest_limit
        elsif header_exp =~ line
          digester.update(line)
          ignore_it = true if ignore_exp =~ line
        end
        message << line
      end
    end

    # The last message of a file has no following separator line.
    finish.call
    printf("split file %s with %d unique messages and %d dups\n", filename, msg_count, dup_count)
  end
ensure
  files.each { |f| f.close }
end
# syntax highlighted by Code2HTML, v. 0.9.1