#!/usr/bin/perl -w
#
# Naive keyword-weighted spam classifier.
#
# Usage: <script> FILE [FILE ...]
#
# Each FILE is read as a mail message: header lines, a blank line, then the
# body.  For every file one line is printed to STDOUT: "YES" when the
# computed score exceeds 0.8, otherwise "NO".

use strict;
use warnings;

# Total number of word tokens found by the most recent word_counts() call.
# Nothing currently reads it (the alternative normalisation in spam_score
# is commented out); it is kept so that option remains available.
my $num_words;

# Per-word weights.  Positive values raise the score, negative values
# lower it.  Keys are lower-case; word_counts() lower-cases its tokens.
my %wordscores = qw(
    advertising    40
    service        20
    solution       15
    guarantee      30
    visa           20
    mastercard     30
    american       10
    express        20
    e-commerce     50
    commerce       50
    free           30
    website        15
    pay            20
    buy            20
    cpan         -100
    perl         -100
    mlm           100
    xxx           100
    inconvenience  50
    homeowner      30
    cash           30
    loan           30
    income         20
    rich           40
    financial      20
    millennium     50
    millenium      50
    executive      30
    opportunity    40
    legal          30
    illegal        30
    viagra         70
    exotic         70
    payment        30
    satisfaction   30
    friend         20
    copyright      20
    invest         40
    freedom        20
    quick          20
    descrambler    70
    cable          40
    dollar         30
    dollars        30
    chain          30
    adult          40
    spam           40
    very           10
    extremely      10
);

# Classify every file named on the command line.  `defined` keeps a file
# literally named "0" from ending the loop early.
while (defined(my $filename = shift @ARGV)) {
    open(my $fh, '<', $filename)
        or die "Unable to open $filename. This is not our fault: $!\n";
    my @message = <$fh>;
    close($fh);

    my $spam_score = spam_score(@message);
    # print "$filename\t$spam_score\t";
    if ($spam_score > 0.8) {
        print "YES\n";
    }
    else {
        print "NO\n";
    }
}

# spam_score(@lines)
#
# Takes the lines of one message (headers, blank separator line, body) and
# returns a numeric score: the sum of sqrt(occurrences) * weight over the
# words listed in %wordscores, plus fixed bonuses for phone numbers,
# divided by the number of distinct words in the body.
#
# Returns 0 when the body contains no words at all, instead of dying with
# a division by zero.
sub spam_score {
    my @message = @_;

    # Skip the headers: consume lines up to and including the first blank
    # one.  Line endings are stripped explicitly so that CRLF-terminated
    # mail is handled, and `defined` is used so a "0" or empty element
    # cannot stop the scan prematurely.
    # TODO: headers are currently ignored for scoring.
    HEADER:
    while (defined(my $line = shift @message)) {
        $line =~ s/[\r\n]+\z//;
        last HEADER if $line eq '';
    }

    my $body = join(' ', @message);
    my $seen = word_counts($body);

    # No word tokens means no digits either (digits match \w), so neither
    # phone check could fire: the score is simply 0.
    my $distinct = scalar keys %{$seen};
    return 0 if $distinct == 0;

    my $score = 0;
    foreach my $word (keys %wordscores) {
        my $count = $seen->{$word} or next;
        # sqrt dampens the effect of one word repeated many times.
        $score += sqrt($count) * $wordscores{$word};
    }

    $score += 40 if phone_number($body);    # in first 2/3 of file
    $score += 70 if toll_free($body);

    # Alternative normalisation: $score /= $num_words;
    return $score / $distinct;
}

# phone_number($str)
#
# True when the first 66% of $str contains three digits, any single
# character, then four digits (e.g. "555-1234").
sub phone_number {
    my $str   = shift @_;
    my $start = substr($str, 0, int(0.66 * length($str)));
    return ($start =~ /\d{3}.\d{4}/);
}

# toll_free($str)
#
# True when $str contains an "8" followed by two characters from [087],
# one to three arbitrary characters, three digits, one arbitrary
# character and four digits (e.g. "800-555-1234", "1 (888) 555 1234").
sub toll_free {
    my $str = shift @_;
    return ($str =~ /8[087]{2}.{1,3}\d{3}.\d{4}/);
}

# word_counts($str)
#
# Splits $str into word tokens (a word character followed by any run of
# word characters, apostrophes or hyphens), lower-cases them and returns a
# reference to a hash mapping each distinct token to its occurrence count.
# Also sets the file-level $num_words to the total number of tokens.
sub word_counts {
    my $str = shift @_;
    my %seen;
    $num_words = 0;
    while ($str =~ /(\w[\'\w-]*)/g) {
        $seen{lc $1}++;
        $num_words += 1;
    }
    return \%seen;
}