#!/usr/bin/perl -w # # stats -- statistics calculation program for parp # # Copyright (c) 2000 Adam Spiers . All rights # reserved. This program is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # # $Id: stats,v 1.16 2002/04/11 13:59:55 adams Exp $ # use strict; use Time::Local; use Parp::Config qw(config); my $program = $0; $program =~ s!.*/!!; die "Usage: $program [ log file ]\n" if @ARGV > 1; my $log_file = $ARGV[0] || config->log_file; my %totals = (); my %dates = (); my %messages = (); my %tests = (); read_log_file($log_file); output_stats($log_file); exit 0; ############################################################################## sub read_log_file { my ($log_file) = @_; open(LOGS, $log_file) or die "Couldn't open log file $log_file: $!\n"; my $id = ''; my %date = (); my $class = 'unknown'; while () { if (! $id) { next unless /^Parp-ID: (.*)/; $id = $1; my ($time, $md5) = $id =~ m,(\d+)/(.*),; @date{qw/sec min hour mday mon year wday yday isdst/} = localtime($time); $dates{earliest} = $time if ! $dates{earliest} or $time < $dates{earliest}; $dates{latest} = $time if ! $dates{latest} or $time > $dates{latest}; $totals{year}{$date{year}+1900}++; $totals{month}{$date{mon}}++; $totals{mday}{$date{mday}}++; $totals{wday}{$date{wday}}++; # counts mails by wday my $day_time = timelocal(0, 0, 12, $date{mday}, $date{mon}, $date{year}); $totals{wday2}{$date{wday}}{$day_time}++; # counts number of each wday $totals{date}{$day_time}++; next; } if (/^--------------------/) { $messages{$id}{class} ||= $class; $totals{class}{$class}++; $id = ''; %date = (); $class = 'unknown'; next; } if (/^(Accepted|REJECTED): (.*?)(?: \((.*)\))?\s*$/) { my ($action, $test, $details) = ($1, $2, $3); $totals{test}{$test}++; $totals{'tested_' . (($action eq 'Accepted') ? 'ok' : 'spam')}{$test}++; $tests{$test}{$details || '__undef__' }++; $class = 'not_spam' if $action eq 'Accepted'; next; } if (/^Delivered to (.*)\s*$/) { my $delivered_to = $1; $totals{delivered_to}{$delivered_to}++; $class = 'not_spam'; $class = 'spam' if $delivered_to =~ /junk|spam/i; next; } if (/^(Would complain|Complained)/) { $messages{$id}{complaint}++; $totals{complaints}++; next; } if (/^Reclassification: was incorrectly identified as spam/) { $messages{$id}{real_class} = 'not_spam'; } elsif (/^Reclassification: was incorrectly identified as bona-fide/) { $messages{$id}{real_class} = 'spam'; } } close(LOGS); $totals{days} = keys %{ $totals{date} }; } ## sub output_stats { my ($log_file) = @_; print "Statistics for parp log file $log_file\n"; print '=' x 78, "\n\n"; my $left_col = "%17s"; my $template = "$left_col %s\n"; printf "$left_col %d bytes\n", 'Size of log file:', (stat $log_file)[7]; unless (scalar keys %messages) { print "No e-mails found.\n"; return; } printf $template, 'Earliest entry:', scalar localtime($dates{earliest}) if $dates{earliest}; printf $template, 'Latest entry:', scalar localtime($dates{latest}) if $dates{latest}; printf $template, 'Total mails:', scalar keys %messages; printf $template, 'Number of days:', $totals{days}; print "\n"; output_class_stats(); output_test_stats(); output_date_stats(); } sub output_class_stats { $totals{false}{negatives} = 0; $totals{false}{positives} = 0; $totals{class}{spam} ||= 0; $totals{class}{not_spam} ||= 0; $totals{real_class}{spam} = 0; $totals{real_class}{not_spam} = 0; $totals{complaints} ||= 0; $totals{false_complaints} ||= 0; $totals{missing_complaints} ||= 0; foreach my $id (keys %messages) { my $class = $messages{$id}{class}; if ($messages{$id}{real_class}) { my $real_class = $messages{$id}{real_class}; $totals{real_class}{$real_class}++; if ($class eq 'not_spam' and $real_class eq 'spam') { $totals{missing_complaints}++ unless $messages{$id}{complaint}; $totals{false}{negatives}++; # not a disaster } elsif ($class eq 'spam' and $real_class eq 'not_spam') { $totals{false_complaints}++ if $messages{$id}{complaint}; $totals{false}{positives}++; # oops! } } else { $totals{real_class}{$class}++; } } my $wrong_class = $totals{false}{positives} + $totals{false}{negatives}; my $total_mails = keys %messages; my $correct_class = $total_mails - $wrong_class; my $accuracy = sprintf "%.4f%%", $correct_class / $total_mails * 100; my $every = $wrong_class ? (sprintf "%.0f", $total_mails / $wrong_class) : '\infinity'; header('Classification success', 0); print < $totals{delivered_to}{$a} } keys %{$totals{delivered_to}}) { printf "%5d %s\n", $totals{delivered_to}{$dest}, $dest; } } sub output_test_stats { header('Tests by frequency'); foreach my $test (sort { $totals{test}{$b} <=> $totals{test}{$a} } keys %{$totals{test}} ) { printf "%5d %s\n", $totals{test}{$test}, $test; } print "\nSpam:\n"; foreach my $test (sort { $totals{tested_spam}{$b} <=> $totals{tested_spam}{$a} } keys %{$totals{tested_spam}} ) { printf "%5d %s\n", $totals{tested_spam}{$test}, $test; } print "\nOK:\n"; foreach my $test (sort { $totals{tested_ok}{$b} <=> $totals{tested_ok}{$a} } keys %{$totals{tested_ok}} ) { printf "%5d %s\n", $totals{tested_ok}{$test}, $test; } header('Lexical breakdown of tests'); print "All:\n"; foreach my $test (sort keys %{$totals{test}} ) { printf "%5d %s\n", $totals{test}{$test}, $test; foreach my $details (sort { $tests{$test}{$b} <=> $tests{$test}{$a} } keys %{$tests{$test}}) { printf " %5d %s\n", $tests{$test}{$details}, $details; } print "\n"; } print "\nSpam:\n"; foreach my $test (sort keys %{$totals{tested_spam}} ) { printf "%5d %s\n", $totals{tested_spam}{$test}, $test; foreach my $details (sort { $tests{$test}{$b} <=> $tests{$test}{$a} } keys %{$tests{$test}}) { printf " %5d %s\n", $tests{$test}{$details}, $details; } print "\n"; } print "\nOK:\n"; foreach my $test (sort keys %{$totals{tested_ok}} ) { printf "%5d %s\n", $totals{tested_ok}{$test}, $test; foreach my $details (sort { $tests{$test}{$b} <=> $tests{$test}{$a} } keys %{$tests{$test}}) { printf " %5d %s\n", $tests{$test}{$details}, $details; } print "\n"; } } sub by_time_prefix { my ($c, $d) = ($a, $b); $c =~ s,!.*,,; $d =~ s,!.*,,; return $c <=> $d; } sub output_date_stats { header('Mails by day of week, most frequent first'); my @days_of_week = qw/Sunday Monday Tuesday Wednesday Thursday Friday Saturday/; foreach my $wday (sort { $totals{wday}{$b} <=> $totals{wday}{$a} } keys %{$totals{wday}}) { my $wday_freq = keys %{ $totals{wday2}{$wday} }; my $average = $totals{wday}{$wday} / $wday_freq; printf "%5d in %3d %-10s (average %d)\n", $totals{wday}{$wday}, $wday_freq, $days_of_week[$wday] . ($wday_freq == 1 ? '' : 's'), $average; } my $average = keys(%messages) / $totals{days}; printf "\nAverage mails per day: %d\n", $average; header('Mails by date, most frequent first'); foreach my $day_time (sort { $totals{date}{$b} <=> $totals{date}{$a} } keys %{$totals{date}}) { printf "%5d %s\n", $totals{date}{$day_time}, scalar(localtime $day_time); } header('Mails by date, earliest first'); foreach my $day_time (sort by_time_prefix keys %{$totals{date}}) { printf "%5d %s\n", $totals{date}{$day_time}, scalar(localtime $day_time); } header('Mails by day of month, most frequent first'); foreach my $mday (sort { $totals{mday}{$b} <=> $totals{mday}{$a} } keys %{$totals{mday}}) { printf "%5d %d\n", $totals{mday}{$mday}, $mday; } header('Mails by day of month, earliest first'); foreach my $mday (sort { $a <=> $b } keys %{$totals{mday}}) { printf "%5d %d\n", $totals{mday}{$mday}, $mday; } header('Mails by month'); foreach my $month (sort { $a <=> $b } keys %{$totals{month}}) { printf "%5d %s\n", $totals{month}{$month}, (qw/Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec/)[$month]; } header('Mails by year'); foreach my $year (sort { $b <=> $a } keys %{$totals{year}}) { printf "%5d %d\n", $totals{year}{$year}, $year; } } sub header { my ($header, $pre_lines) = @_; $pre_lines = 2 unless defined $pre_lines; for my $i (1 .. $pre_lines) { print "\f" if $i == $pre_lines; print "\n"; } print "$header:\n", ('-' x length $header), "\n\n"; }