mariadb/tests/mail_to_db.pl

311 lines
7.9 KiB
Perl
Raw Normal View History

2000-07-31 21:29:14 +02:00
#!/usr/bin/perl
# Copyright Abandoned 1998 TCX DataKonsult AB & Monty Program KB & Detron HB
# This file is public domain and comes with NO WARRANTY of any kind
#
# This program is brought to you by Janne-Petteri Koilo with the
# administration of Michael Widenius.
# This program takes your mails and puts them into your database. It ignores
# messages with the same from, date and message text.
# You can use mail-files that are compressed or gzipped and ends with
# -.gz or -.Z.
use DBI;
use Getopt::Long;
$VER = "1.6";
$opt_db = "mail";
$opt_table = "mails";
$opt_max_mail_size = 65536;
$opt_db_engine = "mysql";
$opt_host = "localhost";
$opt_user = $opt_password = "";
$opt_help = $opt_version = $opt_test=0;
GetOptions("help","version","user=s","password=s",
"db_engine=s","db=s","host=s","max_mail_size=s","test") || usage();
usage($VER) if ($opt_help || $opt_version || !$ARGV[0]);
%months= ('Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5,
'Jun' => 6, 'Jul' => 7, 'Aug' => 8, 'Sep' => 9, 'Oct' => 10,
'Nov' => 11, 'Des' => 12);
$count_no_from = $count_no_txt = $count_too_big = 0;
$count_forwarded_msgs = $count_duplicates = $no_subject = 0;
$inserted_mails = 0;
$dbh=0;
$dbh = DBI->connect("DBI:$opt_db_engine:$opt_db:$opt_host",$opt_user,
$opt_password,{ PrintError => 0}) || die $DBI::errstr;
if (!$opt_test)
{
create_table_if_needed($dbh);
}
foreach (@ARGV)
{
if (/^(.*)\.(gz|Z)$/) #checks if the file is compressed or gzipped
{
open(FILE, "zcat $_ |");
process_mail_file($dbh,$1);
}
else
{
open(FILE,$_);
process_mail_file($dbh,$_);
}
}
$dbh->disconnect if (!$opt_test);
$ignored = $count_no_from + $count_no_txt + $count_too_big + $count_duplicates + $no_subject;
print "Mails inserted:\t\t\t$inserted_mails\n";
print "Mails ignored:\t\t\t$ignored\n";
print "Mails without \"From:\" -field:\t$count_no_from\n";
print "Mails without message:\t\t$count_no_txt\n";
print "Too big mails (> $opt_max_mail_size):\t$count_too_big\n";
print "Duplicate mails:\t\t$count_duplicates\n";
print "Forwarded mails:\t\t$count_forwarded_msgs\n";
print "No subject:\t\t\t$no_subject\n";
print "Mails altogether:\t\t";
print $inserted_mails+$ignored;
print "\n";
exit(0);
sub usage
{
my($VER)=@_;
$0 =~ s/.\/(.+)/$1/;
if ($opt_version)
{
print "$0 version $VER\n";
}
else
{
print <<EOF;
$0 version $VER
Usage: $0 [options] file1 [file2 file3 ...]
Description: Inserts mails from file(s) into a database
Options:
--help show this help and exit
--version shows the version of the program
--db_engine=... database server (default: $opt_db_engine)
--db=... database to be used (default: $opt_db)
--host=... hostname to be used (default: $opt_host)
--password=... user password for the db server
--user=... username for the db server
--max_mail_size=# max size of a mail to be inserted into the db.
mail will be ignored if it exceeds this size
(default $opt_max_mail_size)
--test Don\'t connect to the database, just write the
queries to stdout
EOF
}
exit(0);
}
sub create_table_if_needed
{
my ($dbh)=@_;
my ($sth,$create);
$sth = $dbh->prepare("select count(*) from $opt_table") or die $dbh->errstr;
if (!$sth->execute)
{
$create = "CREATE TABLE $opt_table (msg_nro mediumint unsigned not null ";
$create .= "auto_increment, date DATETIME NOT NULL, time_zone CHAR(6) ";
$create .= "NOT NULL, mail_from char(120) not null, reply char(120), ";
$create .= "mail_to TEXT, cc TEXT, sbj char(200), txt MEDIUMTEXT NOT ";
$create .= "NULL, file char(32) noT NULL, hash INT NOT NULL, key ";
$create .= "(msg_nro), primary key (mail_from, date, time_zone, hash))";
$sth = $dbh->prepare($create) or die $dbh->errstr;
$sth->execute() or die $dbh->errstr;
}
}
sub process_mail_file
{
my ($dbh,$file_name)= @_;
my (%values,$type,$check);
%values=(); $type="";
$check=0;
while (<FILE>)
{
chop;
if ($type ne "message")
{
if (/^Reply-To: (.*)/i) # finding different fields from file
{
$type="reply";
$values{$type}= $1;
}
elsif (/^From: (.*)/i)
{
$type="from";
$values{$type}= $1;
}
elsif (/^To: (.*)/i)
{
$type="to";
$values{$type}= $1;
}
elsif (/^Cc: (.*)/i)
{
$type="cc";
$values{$type}= $1;
}
elsif (/^Subject: (.*)/i)
{
$type="subject";
$values{$type}= $1;
}
elsif (/^Date: (.*)/i)
{
date_parser($1,\%values);
$type="rubbish";
}
elsif (/^[\w\W-]+:\s/)
{
$type="rubbish";
}
elsif ($_ eq "")
{
$type="message";
$values{$type}="";
}
else
{
s/^\s*/ /;
$values{$type}.= $_;
}
}
elsif ($check!=0 && $_ ne "") # in case of forwarded messages
{
$values{$type}.= "\n" . $_;
$check--;
}
elsif (/^From .* \d\d:\d\d:\d\d\s\d\d\d\d$/)
{
$values{'hash'}= checksum("$values{'message'}");
update_table($dbh,$file_name,\%values);
%values=(); $type="";
$check=0;
}
elsif (/-* forwarded message .*-*/i) # in case of forwarded messages
{
$values{$type}.= "\n" . $_;
$check++;
$count_forwarded_msgs++;
}
else
{
$values{$type}.= "\n" . $_;
}
}
$values{'hash'}= checksum("$values{'message'}");
update_table($dbh,$file_name,\%values);
}
########
# converts date to the right form
sub date_parser
{
my ($date_raw,$values)=@_;
$date_raw =~ /\s*(\d{1,2}) (\w+) (\d{2,4}) (\d+:\d+:\d+)\s*([\w-+]{3-5})?/;
$values->{'date'}=$3 . "-" . $months{$2} . "-" . "$1 $4";
$values->{'time_zone'}=$5;
}
#########
# this is runned when the whole mail is gathered.
# this actually puts the mail to the database.
sub update_table
{
my($dbh,$file_name,$values)=@_;
my($query);
if (! defined($values->{'subject'}) || !defined($values->{'to'}))
{
$no_subject++;
return; # Ignore these
}
$values->{'message'} =~ s/^\s*//; #removes whitespaces from the beginning
$values->{'message'} =~ s/\s*$//; #removes whitespaces from the end
$query = "insert into $opt_table values (NULL,'" . $values->{'date'};
$query .= "','" . $values->{'time_zone'} . "',";
$query .= (defined($values->{'from'}) ? $dbh->quote($values->{'from'}) : "NULL") . ",";
$query .= (defined($values->{'reply'}) ? $dbh->quote($values->{'reply'}) : "NULL") . ",";
$query .= (defined($values->{'to'}) ? $dbh->quote($values->{'to'}) : "NULL") . ",";
$query .= (defined($values->{'cc'}) ? $dbh->quote($values->{'cc'}) : "NULL") . ",";
$query .= $dbh->quote($values->{'subject'}) . ",";
$query .= $dbh->quote($values->{'message'}) . "," . $dbh->quote($file_name);
$query .= ",'" . $values->{'hash'} . "')";
if (length($values->{'message'}) > $opt_max_mail_size) #disables big message
{
$count_too_big++;
}
elsif ($values->{'from'} eq "") #disables mails with no from field
{
$count_no_from++;
}
elsif ($opt_test)
{
print "$query\n";
$inserted_mails++;
}
elsif ($values->{'message'} eq "") #disables mails with no message text
{
$count_no_msg_text++;
}
elsif ($dbh->do($query))
{
$inserted_mails++;
}
elsif (!($dbh->errstr =~ /Duplicate entry /)) #disables duplicates
{
die "Aborting: Got error '" . $dbh->errstr ."' for query: '$query'\n";
}
else
{
$count_duplicates++;
}
$query="";
}
##########
# In case you have two identical messages we wanted to identify them
# and remove additionals; We do this by calculating a hash number of the
# message and ignoring messages with the same from, date and hash.
# This function calculates a simple 32 bit hash value for the message.
sub checksum
{
my ($txt)= @_;
my ($crc,$i,$count);
$count = length($txt);
for ($crc = $i = 0; $i < $count ; $i++)
{
$crc = (($crc << 1) + (ord (substr ($txt, $i, 1)))) +
(($crc & (1 << 30)) ? 1 : 0);
$crc &= ((1 << 31) -1);
}
return $crc;
}