#!/usr/bin/perl -w # # parp -- Perl Anti-spam Replacement for Procmail # # Copyright (c) 1999--2000 Adam Spiers . All rights # reserved. This program is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # # $Id: parp,v 1.91 2001/05/07 14:33:06 adam Exp $ # use strict; my $VERSION = '0.44'; use Carp qw(:DEFAULT confess); use DB_File; use Data::Dumper; use Fcntl qw(:DEFAULT :flock); use Getopt::Std; use Mail::Internet; use Mail::Address; use Mail::Filter; use Mail::Folder; use Mail::Folder::Mbox; use lib "$ENV{HOME}/.parp"; use MyFilter qw(%CONFIG %RE); local $SIG{__DIE__} = \&die_handler; # Process options my %opts = (); getopts('edfhmprs:tvw', \%opts); usage() if $opts{h}; my ($no_dups, $extract_friends, $filter_files, $Mail_Folder, $dry_pipes, $do_RBL, $sample, $test_run, $verbose, $wrong_class) = @opts{qw/d e f m p r s t v w/}; $extract_friends ||= 0; $test_run ||= 0; $wrong_class ||= 0; usage() if ($extract_friends + $test_run + $wrong_class) > 1; $test_run++ if $extract_friends; my $Net_DNS_loaded = 0; if ($do_RBL) { eval { require Net::DNS; } && $Net_DNS_loaded++; } # Share some stuff between diffrent packages *Mail::Filterable::LOG = \*LOG; *Mail::Filterable::vprint = \*vprint; *MyFilter::log_to_file = \*log_to_file; *fatal = \*Mail::Filterable::fatal; *check_file_dir = \*Mail::Filterable::check_file_dir; # Prepare the filter my $filter = new Mail::Filter(\&filter); # Prepare for output $| = 1; unless ($test_run) { die "No log_file specified in %CONFIG\n" unless $CONFIG{log_file}; check_file_dir($CONFIG{log_file}); open(LOG, ">>$CONFIG{log_file}") or die "Couldn't open log file `$CONFIG{log_file}' for writing: $!\n"; } sub vprint (@); sub log_to_file (@); my $WIDTH = 78; my %out_folders = (); # Get a lock as soon as we mean business global_lock(); # Prepare friends database use vars qw(%friends); %friends = (); if (exists $CONFIG{friends_db}) { check_file_dir($CONFIG{friends_db}); tie %friends, 'DB_File', $CONFIG{friends_db}, O_RDWR | O_CREAT, 0600; } use vars qw(@dup_ids); @dup_ids = (); if (exists $CONFIG{id_cache}) { tie @dup_ids, 'DB_File', $CONFIG{id_cache}, O_CREAT | O_RDWR, 0600, $DB_RECNO; } ## ## This is the core of the program. ## if (! $filter_files) { # # Behave like a filter; take one e-mail from STDIN # usage() if -t; usage() if @ARGV; my $mail = new Mail::Internet( [<>] ); $filter->filter($mail); log_to_file '-' x $WIDTH, "\n"; } else { # # Filter all folders given in @ARGV # usage() unless @ARGV; log_to_file "###\n### Run started at ", localtime(), "\n###\n\n"; my ($total, $parsed, $spam, $dups, $main, $aux, $special) = (0, 0, 0, 0, 0, 0, 0); my %inodes_seen = (); foreach my $file (@ARGV) { unless (-f $file) { # TODO: allow symlinks vprint "Skipping non-file $file.\n"; next; } my ($inode, $size) = (stat $file)[1, 7]; if ($size == 0) { vprint "Skipping empty file $file.\n"; next; } if ($inodes_seen{$inode}) { vprint "Skipping $file\n" . " (already seen file $inodes_seen{$inode} with inode $inode\n"; next; } $inodes_seen{$inode} = $file; vprint "Reading $file ... "; my $folder = new Mail::Folder('AUTODETECT', $file); vprint "done.\nSetting $file read-only ... "; $folder->set_readonly(); vprint "done.\n"; my $msg_num = $folder->first_message(); my ($file_total, $friends) = (0, 0); do { $total++; my $mail = $folder->get_message($msg_num); $mail->{parp_foldername} = $file; if (! $mail) { fatal('Mail::Folder::get_message failed', "\$mail:\n", Dumper $mail, ); next; } my $rv = $filter->filter($mail); $parsed++ if $rv; $dups++ if $rv =~ /IS_DUPLICATE/; $spam++ if $rv =~ /IS_SPAM/; $main++ if $rv =~ /TO_MAIN/; $aux++ if $rv =~ /TO_AUX/; $special++ if $rv =~ /IS_SPECIAL/; $friends++ if $rv eq 'EXTRACTED_FRIEND'; $file_total++; log_to_file '-' x $WIDTH, "\n"; } while ($msg_num = $folder->next_message($msg_num)) && ( ((! $sample) || ($file_total < $sample)) || ((! $extract_friends) || ($friends == 0)) ); } log_to_file <sync(); vprint "done.\n"; vprint "Closing $name ... "; $folder->close(); vprint "done.\n"; } else { close($folder); } } global_unlock(); } exit 0; ############################################################################## # # The main filtering logic. # sub filter { my ($filter, $mail) = @_; my $folder; $folder = $mail->{parp_foldername} if $mail->{parp_foldername}; if (! $mail) { fatal('message parsing failed', "\$folder:\n", Dumper($folder), "\n", "\$mail:\n", Dumper($mail), ); return 0; } my $m = new Mail::Filterable $mail; vprint "Parp-ID: $m->{parp_id}\n"; log_to_file <{from} To: $m->{to} EOF log_to_file "Cc: $m->{cc}\n" if $m->{cc}; log_to_file "Subject: $m->{subject}\n"; if ($m->{parp_id}) { } elsif (1) { fatal("Parp-ID not defined", "\$m:\n", Dumper($m), ); } # the following cases should never happen elsif ($m->{id} && $m->{id} ne '<>') { vprint "Message-ID: $m->{id}\n"; } elsif ($m->{date}) { vprint "Date: $m->{date}\n"; } elsif ($m->{subject}) { vprint "Subject: $m->{subject}\n"; } elsif ($m->{from}) { vprint "From: $m->{from}\n"; } else { vprint "From $m->{env_from}\n"; } log_to_file "\n"; return $m->extract_friends($folder) if $extract_friends; if ($no_dups && ! $wrong_class && $m->is_duplicate()) { $m->reject_mail('was duplicate by message id'); # $m->deliver_to_inbox('duplicates'); $m->{backup} = 0; return 'IS_DUPLICATE'; } $m->check_for_old_addresses(); # FIXME: There could be more than one X-Loop header. if (($m->{header}->get('X-Loop') || '') eq $CONFIG{loop_value} and ! $wrong_class) { $m->accept_mail('looped'); return 'LOOPED'; } $m->{filter_category} = $m->categorize(); if (! $wrong_class) { $m->parse_received_headers(); if ($m->{recvd_parses_failed}) { if ($m->{filter_category} eq 'IS_SPAM') { $m->deliver_to('spam_recvds'); } else { vprint $m->{recvd_parses_out}; $m->deliver_to('bad_recvds'); } } if ($m->{filter_category} eq 'TO_MAIN') { $m->deliver_mail(); } elsif ($m->{filter_category} eq 'IS_SPAM') { if ($m->{complain}) { # TODO: write and send a rude letter log_to_file "Would complain\n"; } } elsif ($m->{filter_category} eq 'TO_AUX') { # list mail; already delivered to primary target # - maybe back up though $m->maybe_backup(); } elsif ($m->{filter_category} eq 'IS_SPECIAL') { # special case mail; already delivered to primary target # - maybe back up though $m->maybe_backup(); } else { die "Oh dear."; } } else { # The user's telling us that the filter_category we've just # calculated is wrong. if ($m->{filter_category} eq 'IS_SPAM') { vprint "Reclassification: was incorrectly identified as spam\n"; $m->{filter_category} = 'UNKNOWN_NOT_SPAM'; } elsif ($m->{filter_category} ne 'IS_SPAM') { vprint "Reclassification: was incorrectly identified as bona-fide\n"; $m->{filter_category} = 'IS_SPAM'; } } return $m->{filter_category}; } ############################################################################## # # Miscellaneous routines. # sub log_to_file (@) { Mail::Filterable::log_to_file(@_); } sub vprint (@) { # Deal with messages to be printed/logged when user specifies -v my (@msgs) = @_; if ($verbose) { print @msgs unless $test_run; } log_to_file @msgs; } sub global_lock { die "No lock_file specified in %CONFIG\n" unless $CONFIG{lock_file}; check_file_dir($CONFIG{lock_file}); if (! -e $CONFIG{lock_file}) { unless (open(LOCK, ">$CONFIG{lock_file}")) { fatal("Couldn't create lock file $CONFIG{lock_file}: $!"); exit 3; } } else { unless (open(LOCK, $CONFIG{lock_file})) { fatal("Couldn't open lock file $CONFIG{lock_file}: $!"); exit 4; } } my $wait = 0; until (flock LOCK, LOCK_EX | LOCK_NB) { vprint "\n" if $wait; vprint "Waiting for exclusive lock on $CONFIG{lock_file} ... "; $wait++; sleep 3; } vprint "got it!\n" if $wait; } sub global_unlock { # Don't do anything; LOCK_UN introduces races. } sub usage { warn < Usage: parp [ options ] < email parp [ options ] -f folder1 [ folder2 ... ] Options: -d enable discarding of duplicates (by Message-Id header) -e only extract e-mail addresses and add them to friends database -f operate on given files rather than as a filter -m use Mail::Folder rather than >> for appending (much slower) -p dry run for pipes - don't actually execute pipe commands -r enable RBL checking -s with -f, only sample a maximum of messages per folder -t test run - just show what would have been done -v increase verbosity -w only log that filter's spam detection heuristics failed on the supplied mails; don't do anything else EOF exit 2; } sub die_handler { my ($error) = @_; fatal($error, "Called via DIE handler\n"); exit 255; } ############################################################################## # # Routines for parsing the e-mail being filtered and calculating various # bits of data which will be used in the filtering process. # package Mail::Filterable; use Data::Dumper; use POSIX qw(tmpnam); use File::Path; use Time::Local; use Digest::MD5 qw(md5_base64); use Socket; use Mail::Field::Received; use MyFilter qw(%CONFIG %RE %lists &is_special &is_list_mail &is_from_daemon &deliver_mail); use subs qw(vprint log_to_file); local $SIG{__DIE__} = \&::die_handler; sub new { my $this = shift; my $class = ref($this) || $this; my ($mail, $props_hashref) = @_; my %m = (); my $header = $mail->head(); my $body = $mail->body; $m{content_type} = $header->get('Content-Type') || ''; # Deal with MIME multipart messages without using a very slow # parser from CPAN ... if ($m{content_type} =~ m!^multipart/.*boundary=(.*)\n!s) { my $boundary = $1; $boundary =~ s/^"(.*)"$/$1/; $boundary = quotemeta $boundary; log_to_file qq{Message is multipart; splitting on boundary "$1".\n}; my @parts = split /--$boundary(?:--)?\n?/m, join('', @$body); log_to_file "Deleting non-text parts ... \n"; my @body_lines = (); foreach my $part (@parts) { my @lines = split /(?<=\n)/, $part; my $part_mail = new Mail::Internet(\@lines); next unless @lines; my $content_type = $part_mail->get('Content-Type'); if ($content_type) { chomp $content_type; log_to_file "Content-Type: $content_type"; if ($content_type !~ m!^text/\b!) { log_to_file "; skipping ...\n"; next; } else { log_to_file "\n"; } } else { $content_type = '_unspecified_'; log_to_file "Warning: Content-Type was unspecified; assuming plain text.\n"; } push @body_lines, @{ $part_mail->body() }; } $body = \@body_lines; } $m{mail} = $mail; $m{header} = $header; $m{body} = $body; $m{body_scalar} = join '', map { s/^>From/From/; $_ } @{$m{body}}; # envelope from $header->mail_from('KEEP'); $m{mail_from} = $header->get('From ') || $header->get('Mail-From') || ''; $m{parp_id} = $header->get('X-Parp-Id') || ''; # Don't fold `From ' header, as we want that to match a date regexp below $header->fold(79); $m{from} = $header->get('From') || ''; $m{to} = $header->get('To') || ''; $m{cc} = $header->get('Cc') || ''; $m{subject} = $header->get('Subject') || ''; $m{return_path} = $header->get('Return-Path') || ''; $m{reply_to} = $header->get('Reply-To') || ''; $m{list} = $header->get('X-Mailing-List') || ''; $m{sender} = $header->get('Sender') || ''; $m{in_reply_to} = $header->get('In-Reply-To') || ''; $m{references} = $header->get('References') || ''; $m{id} = $header->get('Message-ID') || ''; $m{date} = $header->get('Date') || ''; $m{status} = $header->get('Status') || ''; $m{a_to} = $header->get('Apparently-To') || ''; $m{mailer} = $header->get('User-Agent') || $header->get('X-Mailer') || ''; # From RFC822: # # --------- 8< --------- 8< --------- 8< --------- 8< --------- 8< --------- # 4.2. FORWARDING # # Some systems permit mail recipients to forward a message, # retaining the original headers, by adding some new fields. This # standard supports such a service, through the "Resent-" prefix to # field names. # # Whenever the string "Resent-" begins a field name, the field # has the same semantics as a field whose name does not have the # prefix. However, the message is assumed to have been forwarded # by an original recipient who attached the "Resent-" field. This # new field is treated as being more recent than the equivalent, # original field. For example, the "Resent-From", indicates the # person that forwarded the message, whereas the "From" field indi- # cates the original author. # # Use of such precedence information depends upon partici- # pants' communication needs. For example, this standard does not # dictate when a "Resent-From:" address should receive replies, in # lieu of sending them to the "From:" address. # # Note: In general, the "Resent-" fields should be treated as con- # taining a set of information that is independent of the # set of original fields. Information for one set should # not automatically be taken from the other. The interpre- # tation of multiple "Resent-" fields, of the same type, is # undefined. # --------- 8< --------- 8< --------- 8< --------- 8< --------- 8< --------- # # So we only take values from Resent- headers when we can't get them # any other way but we really would prefer to have them. my %resent_headers = ( id => 'Message-ID' ); foreach my $header_key (keys %resent_headers) { my $header_name = $resent_headers{$header_key}; $m{$header_key} ||= $header->get("Resent-$header_name") || ''; } foreach my $prop (qw/mail_from parp_id from to cc subject return_path reply_to list in_reply_to references id date status a_to mailer/) { chomp $m{$prop} if $m{$prop}; } if (! $m{mail_from}) { fatal(< $i++ } qw/Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec/; $m{env_from_time} = timelocal($sec, $min, $hour, $mday, $months{$month}, $year); @{$m{ftc}} = extract_addrs_to_array(@m{qw/env_from from to cc list sender/}); $m{from_addr} = extract_addr_to_scalar($m{from}); $m{env_from_addr} = extract_addr_to_scalar($m{env_from}); @{$m{from_addrs}} = extract_addrs_to_array(@m{qw/env_from from/}); @{$m{all_from_addrs}} = extract_addrs_to_array($m{env_from}, $m{from}, $m{reply_to}, $m{return_path}, $m{id}); $m{recvds_array} = [ $header->get('Received') ]; $m{recvds} = join '', @{$m{recvds_array}}; my @comments = $header->get('Comments'); $m{comments} = \@comments; $m{auth_sender} = ''; foreach my $comment (@comments) { if ($comment =~ /^Authenticated sender is (.*)/i) { $m{auth_sender} = $1; last; } } # Remove all previously existing parp headers except X-Parp-Id my @parp_headers = grep /^X-Parp-(?!Id)/, $m{header}->tags(); foreach my $parp_header (@parp_headers) { $m{header}->delete($parp_header); } # Add a header for the process id to try to chase down obscure bugs. $m{header}->add('X-Parp-pid', $$); system("/bin/date >> ~/mail/.parp.pstree"); system("pstree >> ~/mail/.parp.pstree"); # Calculate a unique id which parp can always refer to. We # calculate the MD5 digest of enough of the whole mail to ensure a # unique id, but without any bits which might change in some way # during the e-mail's life-span, so as to guarantee that during any # subsequent reclassification of the e-mail as a false # positive/negative (in the spam detection sense) this unique id # will match with the original, so that the statistics calculation # program will work. my $immutable_header = join '', map "$_: $m{$_}\n", (qw/mail_from from to cc subject return_path reply_to in_reply_to references id date mailer recvds/); my $immutable = $immutable_header . "\n" . $m{body_scalar}; $immutable =~ s/\n+$/\n/; my $parp_id = $m{env_from_time} . "/" . md5_base64($immutable); # This one was a PAIN to get right. I don't feel safe enough yet to # remove the debugging. my $immutables_dir = "$ENV{HOME}/mail/.immutables"; if (-d $immutables_dir) { my $id_file = $parp_id; $id_file =~ s!/!_!g; $id_file = "$immutables_dir/$id_file"; while (-e $id_file) { # Generate a unique suffix $id_file =~ s/(?:\.(\d+))?$/"." . (($1 || 0) + 1)/e; } if (open(FOO, ">$id_file")) { print FOO $immutable; close(FOO); } else { fatal("Couldn't open $id_file for writing: $!"); } } if ($m{parp_id}) { # This e-mail has already been run through parp, so it already has # an X-Parp-Id header. if ($m{parp_id} ne $parp_id) { # Better check that the id we've just calculated is the same, # otherwise our calculation algorithm is in trouble. fatal("Message already had a parp id of: $m{parp_id}\n" . " but recalculation yielded: $parp_id", #"\%m:\n", Dumper(\%m), ); } } else { # This e-mail hasn't been touched by parp before, so stamp it with # a parp id. $m{header}->add('X-Parp-Id', $parp_id); $m{parp_id} = $parp_id; } $m{backup} = 1; # back up by default $m{complain} = 1; # allow complaining by default my $self = \%m; bless $self, $class; } sub extract_addrs_to_array { my (@lines) = @_; my @addrs = (); foreach my $line (@lines) { my @new_addrs = Mail::Address->parse($line); push @addrs, map { $_->address() } @new_addrs; } return @addrs; } sub extract_addr_to_scalar { my ($line) = @_; my @addrs = (); my @new_addrs = Mail::Address->parse($line); push @addrs, map { $_->address() } @new_addrs; fatal("header passed to extract_addr_to_scalar had " . "more than one address\n", map(" $_\n", @addrs), "line: $line\n", ) if @addrs > 1; return @addrs ? $addrs[0] : undef; } ############################################################################## # # Routines for performing tests on the e-mail being filtered, and # categorizing it accordingly. # use DB_File; use Fcntl qw(:DEFAULT); sub matches { my $m = shift; my ($category, $re, $debug) = @_; foreach my $addr (@{$m->{$category}}) { print "Testing $addr =~ /$re/\n" if $debug; my $matches = $addr =~ $re; print " -- matched!\n" if $debug and $matches; return ($matches, $1, $2, $3, $4, $5, $6, $7, $8, $9) if $matches; } return; } # This subroutine is the heart of the filtering strategy. It places # the mail currently being filtered into one of the following # categories: # # IS_SPAM -- spam # TO_MAIN -- good mail destined for main inbox # TO_AUX -- good mail destined for auxiliary inboxes # SPECIAL -- leave subroutines to do their own thang sub categorize { my $m = shift; # Ignore real lusers. return 'IS_SPAM' if $m->has_spam_from_addresses(); # Allow config file to deal with special cases in its own way. my $rv = $m->is_special(); return $rv if $rv; return 'TO_MAIN' if $m->is_from_daemon(); # Any good signs which indicate that the mail should definitely # NOT be treated as junk? my $grace = $m->is_passworded() || $m->is_from_good_person() || $m->is_from_good_domain() || $m->has_good_headers(); return 'IS_SPAM' if ! $grace && ($m->has_spam_headers() || $m->has_spam_domains_anywhere() || $m->has_spam_content()); if ($m->is_list_mail()) { $m->{complain} = 0; return 'TO_AUX'; } return 'TO_MAIN' if $grace; # We put this one after the check for list mail, because on average, # mail from lists tends to be lower grade than personal mail. return 'IS_SPAM' if $m->has_suspicious_headers(); return 'IS_SPAM' if ! $m->for_me(); $m->accept_mail('passed all tests'); return 'TO_MAIN'; } sub extract_friends { my $m = shift; my ($folder) = @_; my $folder_name = $folder ? qq.`$folder'. : 'unknown'; my %addrs = ( # env_from => { descr => 'envelope From' }, from => { descr => 'From' }, to => { descr => 'to' }, cc => { descr => 'cc' }, reply_to => { descr => 'Reply-To' }, # return_path => { descr => 'Return-Path' }, ); foreach my $addr_type (keys %addrs) { $addrs{$addr_type}{$_} ||= 0 foreach qw/me not_me total/; next unless $m->{$addr_type}; my @addrs = Mail::Address->parse($m->{$addr_type}); foreach my $parsed (@addrs) { my $paddr = $parsed->address(); if ($paddr =~ $RE{me} || $paddr =~ $RE{old_me}) { $addrs{$addr_type}{me}++; } else { $addrs{$addr_type}{not_me}++; } $addrs{$addr_type}{total}++; push @{$addrs{$addr_type}{addrs}}, $paddr; } } # for my $type (qw/from to cc reply_to/) { # vprint "type $type: "; # for my $count (qw/me not_me total/) { # vprint "[$count $addrs{$type}{$count}]"; # } # vprint "\n"; # } my @maybe_new_friends = (); if ($addrs{from}{me} == 1 && $addrs{to}{total} == 1 && $addrs{cc}{total} == 0) { push @maybe_new_friends, { addr => $addrs{to}{addrs}[0], header => 'to' }; log_to_file "Found friend in `To:' header.\n"; } elsif ( # $addrs{to}{me} + $addrs{cc}{me} >= 1 && # could be on a list $addrs{from}{not_me} == 1) { push @maybe_new_friends, { addr => $addrs{from}{addrs}[0], header => 'From' }; push @maybe_new_friends, { addr => $addrs{reply_to}{addrs}[0], header => 'Reply-To' } if $addrs{reply_to}{total} == 1; log_to_file "Found friend in `From' and `Reply-To:' headers.\n"; } my $added = 0; foreach my $maybe_new_friend (@maybe_new_friends) { next if $::friends{$maybe_new_friend->{addr}}; vprint "Adding `$maybe_new_friend->{addr}' to friends database ... \n"; my $source = "friend extracted from `$maybe_new_friend->{header}' " . "header of message"; if ($m->{parp_id}) { $source .= " parp id $m->{parp_id}"; } elsif ($m->{id}) { $source .= " id `$m->{id}'"; } elsif ($m->{date}) { $source .= " dated $m->{date}"; } $source .= " in $folder_name folder"; $m->make_friend($maybe_new_friend->{addr}, $source); $added++; } return $added ? q[EXTRACTED_FRIEND] : q[DIDN'T_EXTRACT_FRIEND]; } sub is_duplicate { my $m = shift; my $found = 0; # Ugh. Wish there were tie-able dual array/hash data structures. # TODO: Maybe there are. Find out. foreach my $cached_id (@::dup_ids) { if ($m->{id} eq $cached_id) { $found++; last; } } return 1 if $found; push @::dup_ids, $m->{id}; shift @::dup_ids if @::dup_ids > $CONFIG{max_cache_ids}; log_to_file "Added id to duplicates cache.\n"; return 0; } sub check_for_old_addresses { my $m = shift; my $found = 0; if ($m->{to} =~ $RE{old_me}) { log_to_file "*** Old address found:\n ", $m->{to}, "\n"; $found++; } if ($m->{cc} =~ $RE{old_me}) { log_to_file "*** Old address found:\n ", $m->{cc}, "\n"; $found++; } $m->deliver_to_inbox('old_addresses') if $found; } sub is_passworded { my $m = shift; if ($m->{subject} =~ /$CONFIG{password}/ or ($m->{header}->get($CONFIG{password_header}) || '') =~ /$CONFIG{password}/) { $m->accept_mail('contains good password'); $m->make_friend($m->{from_addr}, 'gave password'); return 1; } return 0; } sub make_friend { my $m = shift; my ($address, $reason) = @_; $::friends{$address} = $reason; } sub is_from_good_domain { my $m = shift; if ($m->{from} =~ $RE{good_domains} && ($m->{env_from} =~ $RE{good_domains} || $m->{id} =~ $RE{good_domains} || $m->{sender} =~ $RE{good_domains})) { my $good_domain = $1; $m->accept_mail('good domain', $good_domain); return 1; } return 0; } sub has_good_headers { my $m = shift; # Could cross-check In-Reply-To: with good domains, but # no spammers seem to be setting this header yet, which # makes it an even more powerful test. if ($m->{in_reply_to}) { $m->accept_mail('had In-Reply-To: header'); return 1; } if ($m->{references} =~ $RE{good_domains}) { $m->accept_mail('References: had good domain', $1); return 1; } if ($m->{subject} =~ $RE{subject_buzzwords}) { $m->accept_mail('subject had buzzword', $1); return 1; } if ($m->{mailer} =~ /(mozilla.*linux)/i) { $m->accept_mail('good X-Mailer header', $1); return 1; } return 0; } sub is_from_good_person { my $m = shift; if (tied %::friends) { foreach my $addr (@$m{qw/from_addr env_from_addr/}) { if (exists $::friends{$addr}) { $m->accept_mail('from friend', "`$addr' -- $::friends{$addr}"); return 1; } } } return 0; } sub has_spam_headers { my $m = shift; # Many thanks to Mark-Jason Dominus and to the authors of junkfilter # and the NAGS filter for some of the ideas contained herein. my $octet_RE = '([12]?\d\d|\d\d|\d)'; my $ipv4_RE = ("$octet_RE\\." x 3) . $octet_RE; my $foo_RE = qr![\w.%\#\$+-/]+\*?!; if ($m->{id} !~ m/^< ($foo_RE|"$foo_RE") \@ ( [\w-]+ (\. [\w-]+){0,6} | \[ $ipv4_RE \] ) >/x) { $m->reject_junk_mail('invalid Message-ID: header', "`$m->{id}'"); return 1; } if (my @m = $m->matches('ftc', $RE{decoys})) { $m->{complain} = 0; # don't let them wise up to me subscribing to # stuff using a dud address $m->reject_junk_mail('not sent to a proper address', $m[1] || undef); return 1; } foreach my $bad_header (qw/PMFLAGS Advertisement X-Advertisement X-Shock/) { if ($m->{header}->get($bad_header)) { $m->reject_junk_mail('found bad header', $bad_header); return 1; } } my $uidl = $m->{header}->get('X-UIDL') || ''; chomp $uidl; if ($uidl and $uidl !~ /^([0-9a-f]{32}|.{20})$/i) { $m->reject_junk_mail('invalid X-UIDL: header', "`$uidl'"); return 1; } if ($m->{status} =~ /MC/i) { $m->reject_junk_mail('MaxAnnon! mailer'); return 1; } if (($m->{header}->get('X-Distribution') || '') =~ /mass/i) { $m->reject_junk_mail('bulk mail sent with Pegasus'); return 1; } if ($m->{from} =~ /^(<(_?\@_)?>)$/) { $m->reject_junk_mail("bad From: header", "`$1'"); return 1; } if ($m->{return_path} =~ /^(<(_?\@_)?>)$/) { $m->reject_junk_mail("bad Return-Path: header", "contained `$1'"); return 1; } if ($m->{from} eq '') { $m->reject_junk_mail('From: header is blank or missing'); return 1; } if (($m->{subject} =~ tr/\x80-\xff//) > 3) { $m->reject_junk_mail('Subject: header had too many 8-bit characters'); return 1; } if ($m->{date} =~ m![^\w:,()+/ \t-]!) { $m->reject_junk_mail('bad Date: header', "`$m->{date}'"); return 1; } if ($m->{recvds} =~ /(-0600 \(EST\)|-0[57]00 \(EDT\))/) { $m->reject_junk_mail('bad Received: header date', "`$1'"); return 1; } if ($m->{mailer} =~ $RE{bad_words}) { $m->reject_junk_mail("bad X-Mailer: header", "contained `$1'"); return 1; } if ($m->{recvds} =~ $RE{bad_words}) { $m->reject_junk_mail("bad Received: header", "contained `$1'"); return 1; } my $organisation = $m->{header}->get('Organisation') || $m->{header}->get('Organization') || ''; if ($organisation =~ $RE{bad_words}) { $m->reject_junk_mail("bad organisation header", "contained `$1'"); return 1; } if (@{$m->{ftc}} > $CONFIG{max_recipients}) { $m->reject_junk_mail('too_many_recipients'); return 1; } if ($m->{from_addr} =~ /(\@{2,})/) { $m->reject_junk_mail('bad From: address', "contained `$1'"); return 1; } } sub has_suspicious_headers { my $m = shift; if ($m->{to} eq '' and $m->{cc} eq '') { $m->reject_junk_mail('To: and Cc: headers both blank or missing'); return 1; } if ($m->{to} eq '') { $m->reject_junk_mail('To: header blank or missing'); return 1; } if ($m->{to} =~ $RE{bad_to}) { $m->reject_junk_mail("bad To: header", "contained `$1'"); return 1; } if ($m->{subject} =~ $RE{bad_subjects}) { $m->reject_junk_mail("bad Subject: header", "contained `$1'"); return 1; } if (($m->{subject} =~ tr/!/!/) >= 5 || $m->{subject} =~ /!!!!/) { $m->reject_junk_mail("Subject: header contained too many exclamation marks"); return 1; } if ((my @words = $m->{subject} =~ /\b[A-Z]+\b/g) >= 6) { $m->reject_junk_mail('Subject: header had too many all-caps words'); return 1; } # This one is a bit extreme ... # if ($m->{subject} eq '') { # $m->reject_junk_mail('Subject: header is blank or missing'); # return 1; # } # This one is a bit extreme too ... # if ($m->{from} =~ /^(\d+)\@/ || # $m->{from} =~ /^(\d+)\@/) # { # $m->reject_junk_mail('username is all digits', "`$1'"); # return 1; # } return 0; } sub has_spam_from_addresses { my $m = shift; if (my @m = $m->matches('from_addrs', $RE{bad_from})) { $m->reject_junk_mail('bad from address', "contained `$m[1]'"); return 1; } return 0; } sub has_spam_domains_anywhere { my $m = shift; if (my @m = $m->matches('all_from_addrs', $RE{bad_origins})) { $m->reject_junk_mail('bad from/return address', "`$m[1]'"); return 1; } $m->parse_received_headers(); my $debug = 0; my %ips = (); if ($do_RBL) { foreach my $recv (@{$m->{recvds_array}}) { # Avoid various false positives $recv =~ s/JetMail \d\.\d\.\d\.\d\b//g; my @ips = $recv =~ m@(?reject_junk_mail(@reject); return 1; } else { vprint "not found\n" if $debug; } } } } return 0; } sub rbl_lookup { my ($ip) = @_; my @octets = split /\./, $ip; my $name = (join '.', reverse @octets) . '.rbl.maps.vix.com'; my $naddr = gethostbyname($name) or return 0; my $A_RR = inet_ntoa($naddr); return 0 unless $A_RR eq '127.0.0.2'; my $TXT; if ($Net_DNS_loaded) { my $res = Net::DNS::Resolver->new(); my $query = $res->query($name, "TXT"); if ($query) { foreach my $rr ($query->answer) { next unless $rr->type eq "TXT"; $TXT = $rr->txtdata; } } else { print "failed: ", $res->errorstring, "\n"; } $TXT =~ s/Blackholed - //; } else { # vprint "Net::DNS not loaded; won't find out TXT RR\n"; } return $TXT || 1; } sub parse_received_headers { my $m = shift; return if $m->{recvd_parses_done}; my $failed_parses_output = ''; foreach my $recv (@{$m->{recvds_array}}) { $recv =~ s/\s*\n\s*/ /gm; my $obj = Mail::Field->new('Received', $recv); $obj->debug(5); if (! $obj->parsed_ok()) { # Output follows in order ... # First, preamble before parser errors $failed_parses_output .= <{from} To: $m->{to} EOF $failed_parses_output .= "Cc: $m->{cc}\n" if $m->{cc}; $failed_parses_output .= <{subject} Message-ID: $m->{id} EOF # Finally, the incomplete parse tree $failed_parses_output .= Dumper($obj->parse_tree()) . "\n"; $m->{recvd_parses_failed}++; $failed_parses_output .= $obj->diagnostics(); } $m->{recvd_parse_trees}{$recv} = $obj->parse_tree(); } $m->{recvd_parses_out} = $failed_parses_output; $m->{recvd_parses_done} = 1; } sub for_me { my $m = shift; if ($m->{to} =~ $RE{me} or $m->{cc} =~ $RE{me}) { return 1; } $m->reject_junk_mail('not addressed to me'); return 0; } sub has_spam_content { my $m = shift; # Copy body to a single scalar my $all = $m->{body_scalar}; # Strip blank and quoted lines my @body_lines = grep ! /^\s*$|^> /, @{$m->{body}}; my $not_quoted = join '', @body_lines; my $max = $RE{max_forwards}; my @matches = ($all =~ /^\s*(>\s*){$max,}/mg); # log_to_file "Lines exceeding max_forwards: ", scalar(@matches), "\n"; if (@matches > $RE{max_forwards_lines}) { $m->reject_junk_mail("forwarded more than $RE{max_forwards} times"); return 1; } # Check first few for spam my $first_how_many = 4; my ($start, $end) = (0, $first_how_many); $end = $#body_lines if $#body_lines < $first_how_many; my $first_few = join '', @body_lines[$start .. $end]; if ($first_few =~ /^\s* (Dear\ ( friend | .* surfer | $RE{me} )) /imx) { $m->reject_junk_mail('Suspicious method of address', "`$1'"); return 1; } # Check last few for spam my $last_how_many = 12; ($start, $end) = (-$last_how_many, -1); if (@body_lines < $last_how_many) { ($start, $end) = (0, $#body_lines); } my $last_few = join '', @body_lines[$start .. $end]; if ($last_few =~ /\bremoved?\b/i && $last_few =~ /respond|notify|reply|send|forward|click|software| mailto|type/ix && $last_few =~ /subjec?t|process|automatically/i) { $m->reject_junk_mail('body confessed it was junk'); return 1; } if ($last_few =~ /group.mail/i) { $m->reject_junk_mail('body suggested that a group mailer was used'); return 1; } if ($not_quoted =~ $RE{very_bad_words}) { $m->reject_junk_mail('body contained a very bad word', "`$1'"); return 1; } @matches = ($not_quoted =~ /$RE{quite_bad_words}/g); if (@matches > $RE{max_quite_bad_words}) { my %uniques = map { lc $_ => $_ } @matches; log_to_file "Quite bad words found in body: ", scalar(@matches), " (", scalar(keys %uniques), " unique)\n"; if (scalar(keys %uniques) > $RE{max_unique_quite_bad_words}) { $m->reject_junk_mail('body contained too many bad words', join ', ', map { "`$_'" } values %uniques); return 1; } } return 0; } ############################################################################## # # Routines providing actions to taken on the e-mail being filtered. # sub ditch_mail { my $m = shift; log_to_file "Delivered to /dev/null", @_ ? " (@_)" : '', "\n"; } sub deliver_to_inbox { my $m = shift; my $inbox = shift; $m->deliver_to("$CONFIG{inbox_dir}/$inbox", @_); } sub maybe_backup { my $m = shift; $m->deliver_to($CONFIG{backup_folder}) if $m->{backup}; } sub deliver_to { return if $wrong_class; my $m = shift; my ($folder) = @_; my $file = ($folder =~ m!^/!) ? $folder : "$CONFIG{mail_dir}/$folder"; if ($test_run) { log_to_file "Would deliver to $file\n"; return; } if (! exists $out_folders{$file}) { check_file_dir($file); if ($Mail_Folder) { # Use Mail::Folder if (-e $file) { vprint "Opening $file for appending ... "; $out_folders{$file} = new Mail::Folder('AUTODETECT', $file); vprint "done.\n"; } else { $out_folders{$file} = new Mail::Folder('mbox', $file, Create => 1); } } else { # Don't use Mail::Folder local *FH; unless (open(FH, ">>$file")) { fatal("Couldn't open $file for delivery: $!"); exit 5; } vprint "Opened $file for appending.\n"; $out_folders{$file} = *FH; } } my $out = $out_folders{$file}; if (ref($out) eq 'Mail::Folder') { $out->append_message($m->{mail}); } else { my $text = $m->{mail}->as_mbox_string(); $text =~ s/^Mail-From: /From /; $text =~ s/\n+$/\n\n/; print $out $text; } log_to_file "Delivered to $file\n"; } sub accept_mail { return if $wrong_class; my $m = shift; my ($reason_ident, @details) = @_; $m->{accepted} = [ $reason_ident, @details ]; my $text = "$reason_ident" . (@details ? " (@details)" : ''); $m->{header}->add('X-Parp-Accepted', $text); log_to_file "Accepted: $text\n"; } sub reject_junk_mail { return if $wrong_class; my $m = shift; $m->reject_mail(@_); # $m->{backup} = 0; $m->deliver_to_inbox('junk-mail'); } sub reject_mail { return if $wrong_class; my $m = shift; my ($reason_ident, @details) = @_; $m->{rejected} = [ $reason_ident, @details ]; my $text = "$reason_ident" . (@details ? " (@details)" : '') . "\n"; $m->{header}->add('X-Parp-Rejected', $text); log_to_file "REJECTED: $text"; } sub pipe_forward { return if $wrong_class; my $m = shift; my ($pipe_command) = @_; if ($dry_pipes || $test_run) { log_to_file "Would pipe | $pipe_command\n"; } else { log_to_file "Piping | $pipe_command ... "; if (! open(PIPE, "| $pipe_command")) { fatal("Couldn't open pipe command $pipe_command: $!"); } else { print PIPE $m->{mail}->as_mbox_string(); close(PIPE); log_to_file "done.\n"; } } } ############################################################################## # # Miscellaneous routines # sub check_file_dir { my ($file) = @_; my $umask = 0700; my ($dir) = $file =~ m!(.*)/!; return unless $dir; unless (-d $dir) { mkpath([$dir], 0, $umask); vprint sprintf "Created directory $dir with umask %04o.\n", $umask if fileno(LOG); } } sub fatal { my ($error, @context) = @_; my $long_message = < To: "$realname" <$username\@localhost> Date: $date Subject: parp experienced a fatal error $long_message End of report if (open FATALS, ">>$CONFIG{fatals_folder}") { print FATALS $report; close FATALS; } } } sub log_to_file { my (@msgs) = @_; if ($test_run) { print @msgs; } else { if (fileno LOG) { print LOG @msgs; } else { open(BROKEN, ">>/home/adam/mail/.parp.broken") or die "argh!!! $!"; print BROKEN "fileno(LOG) undef:\n@msgs\n"; close(BROKEN); } } }