#!/usr/sbin/perl -w
 
use strict ;

# Need at least $minrecent transactions before taking action on an IP
my $minrecent = 10 ;  
# Trim the list when it gets this big (usually same as min)
my $maxrecent = 10 ;  
# how many "bad" transactions qualify an IP for listing
my $threshhold = 9 ;
# How long we should keep entries on the list
my $expire_secs = (2*60*60) ;

# Name of file to read, expecting lines like 
#   "time=Jan 16 00:29:03, ip=221.15.80.188, result=No such user: alfisi@mfg.sgi.com"
my $inputfile = "/logs/cuda/summarize" ;
# Should we reopen the file if it is moved? (0=quit 1=reopen,continue)
my $reopen = 1 ;
# Should we start at the very end (0) or X bytes from the end, or "top" for the whole file
my $rewind = 0 ;
# Note: script assumes we are running in real time, so if you rewind or
# replay old entries for analysis, IPs will be exipred according to the 
# time we actually add them, not the log time
# If you are rewinding the file for analysis, you can:
#   write the BL to another file, if you want to see it, or
#   set $bl_write_seconds to 0 below and we won't write, just output stats

# where to write the actual RBL we are building
my $bl_outfile = "/logs/cuda/myrbl" ;
# don't use /dev/null because the new file is renamed to this after it is written
# write BL file every N seconds (if $bl_write_seconds is 0 file will not be written)
my $bl_write_seconds = (5*60) ;  

# Debug output level
# 0 = no diagnostics, 1 = errors and unexpected input, 2 = status ticker, 3 = each line detail
my $debug = 1 ; 
# print stats to STDERR every X transactions (0 for no stats)
# Stats will also be printed if we receive a USR1 signal and at quit time
my $stats_every = 0 ;  

# $cachesize is how many IPs we keep details about before forgetting some
# about 1M memory needed per 1000 cache
my $cachesize = 20000 ;  
# remove this many at a time when the limit is reached
my $cachepurge = $cachesize/10 ;

# If you want to process only part of the file, set this to a time pattern, like:
#     my $stoptime = '01:\d\d:\d\d' ;  # stop at logtime 1 am
# Best used with $rewind = "top"
my $stoptime = undef ;

# $tail - Wait for more at end of file instead of stopping
my $tail = 1 ;		
# $mysleep - counter, to sleep for an increasing number of secs (1-30)
my $mysleep = 0 ;	

# Pattern matches - ALL activity should fall into one of these patterns
# If we find lines that don't match, we complain to STDERR (debug>=1) and treat as OK

# $rbl_result_expr: Match expression that finds other RBL already blocked this IP
# RBL activity is tracked so we can compare our BL to others in a dry run
# RBL results don't count as good or bad, they only affect stats
# After we go live we will probably stop logging RBL to our input file anyway (myscanner show rbl)
my $rbl_result_expr = "Blocked: using" ;

# $bad_result_expr: Match expression that finds any "bad" activity
my $bad_result_expr = "Recipient address rejected|Sender address rejected|No such user|Blocked: spam" ; 
# $ok_result_expr: Match expression that finds "good" or "neutral" activity
my $ok_result_expr = "Delivered|Tagged|Quarantined" ; 



######################## BE CAREFUL - change anything below here at your own risk :)

my ( $line, $partial ) ;	# current line, and any leftover from incomplete line read

sub printstats() ;
sub write_bl_file() ;

my $count_total=0 ;		# count all transaction (matched) lines
my $count_ok=0 ;		# 	ok
my $count_rbl=0 ;		#	existing rbl
my $count_bad=0 ;		#	bad
my $count_unknown=0 ;		#	wtf
my $count_would_block=0 ;	# count transactions that should have been blocked by our RBL
my $count_would_block_ok=0 ;	#	ok
my $count_would_block_rbl=0 ;	#	overlap with existing RBL
my $count_would_block_bad=0 ;	#	bad
my $count_would_block_unknown=0 ;  #	wtf

my ( $timestamp, $starttime, $endtime ) ;	# time info from log
my ( $hashref, %ip_seen_cache ) ;	# %ip_seen_cache is a "hash of hashes" (man perlref)
# to track items once we list them, we use these three - they should stay in sync
my %ip_knownbad ;		# hash of just IP addresses (value is just a counter)
my @knownbad_fifo_ip ;		# list of IPs blocked so we can expire them in order w/o sort()
my @knownbad_fifo_expire ;	# list of expire times to match list of ips

# On receiving ^C we will print final stats and exit
my $intr;
sub catch_intr { $intr=1; }
$SIG{INT} = \&catch_intr ;

# On receiving USR1 signal we will output stats so far to STDERR
my $stats_requested = 0 ;
sub catch_usr1 { $SIG{USR1} = \&catch_usr1 ; $stats_requested = 1 ; }
$SIG{USR1} = \&catch_usr1 ;

 
# this was a test to see if expire worked correctly
	#$rewind = "top" ; 
	#$bl_write_seconds = 60 ;
	#$bl_outfile = "/logs/cuda/myrbl.test" ;
	#$expire_secs = 120 ;

# Open the input file for reading
open (INPUTFILE, "< $inputfile" )  or die "Unable to open $inputfile : $!" ;

# Seek to the end so we can start reading recent activity
if ($tail) {
	seek (INPUTFILE, 0, 2) ;
	# the first line may be cut off a bit but we don't care much
	# this is mostly just to keep the debug output down
	sleep 1 ; my $line = <INPUTFILE> ;
}

# Rewind if requested
if ($rewind eq "top")  {
        seek (INPUTFILE, 0, 0) ;
} elsif ($rewind)  {
        print STDERR "Rewinding $rewind characters\n" ;
        seek (INPUTFILE, -($rewind), 2) ;
        my $line = <INPUTFILE> ;
}

# Preset some counters
my $bl_next_write = time()+$bl_write_seconds ;
$partial = "" ;

# LINE - start of main event loop
LINE:
while ( !$intr ) {
	my ( $ip, $result ) ;

	# Do some housekeeping

	my $now = time() ;
	# for each trip through, we may pop one entry if it is expired
	# that should be enough to keep the list pruned
	if ( scalar(@knownbad_fifo_expire) > 0 && $now > $knownbad_fifo_expire[0] ) {
		shift(@knownbad_fifo_expire) ;
		$ip=shift(@knownbad_fifo_ip) ;
		delete($ip_knownbad{$ip}) ;
	}
	# write BL to file every so often
	if ( $bl_write_seconds && ($now>($bl_next_write)) ) {
		write_bl_file() ;
		$bl_next_write = time()+$bl_write_seconds ;
	}

	# Actually read a line from the file, and sleep on EOF or partial line
	# Check if file moved/recreated after (4+3+2+1) seconds

	$line = <INPUTFILE> ;
	if ( defined($line) && ! ($line =~ /\n/) ) {
		$partial = $partial . $line ;
		$line = undef ;
	}
	if ( !defined($line) ) {
		if (!$tail) { $intr = 1; next LINE ;}
		sleep ($mysleep++) ;
		if ($mysleep > 4) {
			# If our input file (name) is more recent than the open file (handle)
			# then it must have moved.  To trigger a reopen, file must exist 
			# and be newer than the file we have already open
			my ( $mtime1, $mtime2 ) ;
			$mtime1 = (stat(INPUTFILE))[9] ;
			$mtime2 = (stat($inputfile))[9] ;
			if ( defined($mtime2) && ($mtime2>$mtime1) ) {
				if (!$reopen) {
					($debug>=2) && print "File seems to have moved, exiting\n";
					$intr=1 ;
					next LINE ;
				}
				($debug>=2) && print "File seems to have moved, reopening\n" ;
				close(INPUTFILE) ;
				open (INPUTFILE, "< $inputfile" )  or die "Unable to reopen $inputfile : $!" ;
			}

		}
		if ($mysleep > 30) { 	# Don't sleep longer than 30 sec each time
			$mysleep = 30 ;
		}
		next LINE ;
	} else {
		# If a line was actually read, reset sleeptime counter
		$mysleep=0 ;
	}
	# tack on previous bit if it was an incomplete line
	if ($partial) { $line = $partial . $line ; $partial = "" ; }

	# More housekeeping, trim cache down to size if limit is reached
	# otherwise memory will grow forever :)

	if (scalar(keys(%ip_seen_cache)) >= $cachesize) {
		my ( @keyslist, @deletelist ) ;
		($debug>=2) && print "Cache is ", $cachesize, ", trimming ", $cachepurge, " oldest entries\n" ;
		# Sort the keys (IPs) according to the last line number they appeared on
		# For this we need to pass our own compare function to sort (see man perlfunc)
		@keyslist = sort {$ip_seen_cache{$a}->{"lastline"} <=> $ip_seen_cache{$b}->{"lastline"}} (keys(%ip_seen_cache));

		# Partial list or "slice" contains the first $cachepurge items (oldest)
		@deletelist = @keyslist[0..($cachepurge-1)] ;

		# If debug>=2 we will output the timestamps of the stuff we are purging
		my $start = $ip_seen_cache{$deletelist[0]}->{"timestamp"} ;
		my $end = $ip_seen_cache{$deletelist[($#deletelist)]}->{"timestamp"} ;
		($debug>=2) && print "purged  \tfrom $start \tto $end\n" ;

		# Actually do the delete
		delete @ip_seen_cache{@deletelist} ;
		
		# If debug>=2 we will output the timestamps of the stuff we are *keeping*
		$start = $ip_seen_cache{$keyslist[$cachepurge]}->{"timestamp"} ;
		$end = $ip_seen_cache{$keyslist[($#keyslist)]}->{"timestamp"} ;
		($debug>=2) && print "leaving \tfrom $start \tto $end\n" ;
		
	}


	# Now we get to actually pattern-matching the line

	chomp $line ;

	#   "time=Jan 16 00:29:03, ip=221.15.80.188, result=No such user: alfisi@mfg.sgi.com"
	#         ^^ $1 ^^^^^^^^^  ^^ $2 ^^^^^^^^^^         ^^ $3 ^^^^^^^^^^^^^^^^^^^^^^^^^^
	if ( $line =~ m"^time=(\w+\s+\d+ \d+:\d+:\d+), ip=(\d+\.\d+\.\d+\.\d+), result=(.*)$" ) {
		$timestamp = $1 ;
		$ip = $2 ;
		$result = $3 ;
	} else {
		# Complain on non-matching line.  We assume the file is just for us, but
		# if we are reading from some other file like maillog, we should either
		# comment this out or use ($debug>=3)
		$debug && print "Unknown line: $line\n" ;
		next LINE ;
	} # end if $line

	# Keep track of start time and end time.  Stop if we have reached $stoptime.
	if ( !defined($starttime) ) { $starttime = $timestamp; }
	$endtime = $timestamp ;
	if ( defined($stoptime) && $timestamp =~ m".*$stoptime.*"o ) {
		$intr = 1 ;
	}


	# If we haven't seen activity from this IP, add it to our hash of hashes

	if (!defined($ip_seen_cache{$ip})) {
		#Create a new anonymous hash to hold data about the transaction (see man perlref)
		$ip_seen_cache{$ip} = {} ;
		$hashref = $ip_seen_cache{$ip} ;
		$hashref->{"lines"} = 1;
		$hashref->{"recent"} = [];	# a list within a hash within a hash :)
	} else {
		$hashref = $ip_seen_cache{$ip} ;
		$hashref->{"lines"}++ ;
	}
	$hashref->{"lastline"} = $count_total ;
	$hashref->{"timestamp"} = $timestamp ;

	# Output stats if we have reached our interval or if we received USR1
	if ( $stats_every && $count_total && ( ($count_total % $stats_every) == 0 ) ) {
		printstats() ;  $stats_requested = 0 ;
	} elsif ( $stats_requested ) {
		printstats() ;  $stats_requested = 0 ;
	}

	# Matching section.  Check for RBL, Bad, OK.  If not found, count as "wtf"

	if ( $result =~ m"$rbl_result_expr"o ) {
		# RBL activity is tracked mostly for statistical purposes, so we can see
		# where our RBL overlaps other existing RBLs.  We don't count RBL hits
		# as either good or bad, though we do keep the IPs in our cache
		# When detailed evaluation is done and we have been live for a while
		# we will probably turn off rbl lines in our output (myscanner show rbl)

		($debug>=3) && print "$ip\tRBL\t$result\n" ;
		$count_total++; $count_rbl++;
		## no push
		if ( defined($ip_knownbad{$ip}) ) {
			($debug>=2) && print "$ip\tAlready blocked\t($result)\n";
			$ip_knownbad{$ip}++ ; $count_would_block++; $count_would_block_rbl++;
			next LINE ;
		}
		# putting next here short-circuits any further analysis on this IP this time
		next LINE ;

	} elsif ( $result =~ m"$bad_result_expr"o ) {
		# BAD activity is added to the list-within-a-hash
		# We will analyze the activity only if the IP is not already blocked

		($debug>=3) && print "$ip\tBad\t$result\n" ;
		$count_total++; $count_bad++;
		push ( @{$hashref->{"recent"}}, "bad" ) ;
		if ( defined($ip_knownbad{$ip}) ) {
			($debug>=2) && print "$ip\tAlready blocked\t($result)\n";
			$ip_knownbad{$ip}++ ; $count_would_block++; $count_would_block_bad++;
			next LINE ;
		}

	} elsif (  $result =~ m"$ok_result_expr"o ) {
		# OK activity is added to the list-within-a-hash
		# We will analyze the activity only if the IP is not already blocked

		($debug>=3) && print "$ip\tOK\t$result\n" ;
		$count_total++; $count_ok++;
		push ( @{$hashref->{"recent"}}, "ok" ) ;
		if ( defined($ip_knownbad{$ip}) ) {
			($debug>=2) && print "$ip\tAlready blocked\t($result)\n";
			$ip_knownbad{$ip}++ ; $count_would_block++; $count_would_block_ok++;
			next LINE ;
		}

	} else  {
		# If nothing matched above, record the line as "wtf" activity
		# This counts as "OK" for tracking/blocking for safety ,
		# but should also output complaints on $debug = 1

		$debug && print "Unknown result: $line\n" ;
		$count_total++; $count_unknown++;
		push ( @{$hashref->{"recent"}}, "wtf" ) ;
		if ( defined($ip_knownbad{$ip}) ) {
			($debug>=2) && print "$ip\tAlready blocked\t($result)\n";
			$ip_knownbad{$ip}++ ; $count_would_block++; $count_would_block_unknown++;
			next LINE ;
		}
	} # end if $result

	# shift off oldest activity (only one) if $maxrecent is reached
	if ( scalar(@{$hashref->{"recent"}}) > $maxrecent ) {
		shift(@{$hashref->{"recent"}}) ;
	}
	
	# for detailed debugging, show the list so far in order occurred
	($debug>=3) && print "$ip : ", join(",", @{$hashref->{"recent"}}), "\n" ;

	# If we have enough activity, see if at least $threshhold of them are bad
	# We do this by:
	#	sorting the recent activity ("bad" first by alpha)
	#	taking the first $threshhold items and joining with commas to make a list
	#	comparing that to a list of "bad" repeated $threshhold times
	# Note that the same $threshhold applies whether we are at min or max
	# so if we want a consistent percentage, we keep min = max

	if ( scalar(@{$hashref->{"recent"}}) >= $minrecent ) {
		my @recent = @{$hashref->{"recent"}} ;
		@recent = sort(@recent) ; #depends on sort order (bad,ok,wtf)
		@recent = @recent[0..($threshhold-1)] ;
		($debug>=3) && print join(",",@recent), " == ", join( ",", ("bad")x$threshhold ), "\n" ;
		if ( join(",",@recent) eq join( ",", ("bad")x$threshhold ) ) {
			# Here is where we actually add the IP to the known bad list
			# Must add to hash and two fifo lists at the same time, to keep consistent
			$ip_knownbad{$ip}=0;
			push( @knownbad_fifo_ip, $ip ) ;
			push( @knownbad_fifo_expire, (time()+$expire_secs) ) ;
			($debug>=2) && print "$ip\tAdded to block list\n" ;
		}
	}

} # end while LINE

printstats() ;

exit 0 ;


sub printstats() {

print STDERR "\n" ;
print STDERR "From: $starttime  To: $endtime\n" ;

print STDERR "total = $count_total (100%) (",
	"rbl = ", int(100*($count_rbl/$count_total)), ", ",
	"ok = ", int(100*($count_ok/$count_total)), ", ",
	"bad = ", int(100*($count_bad/$count_total)),
	")\n" ;

print STDERR "would block = $count_would_block (", int(100*($count_would_block/$count_total)), "%) ",
	"(",
	"rbl = ", int(100*($count_would_block_rbl/$count_total)), ", ",
	"ok = ", int(100*($count_would_block_ok/$count_total)), ", ",
	"bad = ", int(100*($count_would_block_bad/$count_total)),
	")\n" ;

print STDERR "cache size = ", scalar(keys(%ip_seen_cache)), ", blocks size = ", scalar(keys(%ip_knownbad)), "\n" ;

my ($user,$system,$cuser,$csystem) ;
($user,$system,$cuser,$csystem) = times();

# this ps output is specific to IRIX; change if you like, or 
# change to if (0) if you don't need SIZE in your stats output.
my $ps = `ps -l -p $$ ` ;
if ( $ps =~ /(\d\d\d+:\d\d\d+)/ ) {
	print STDERR "usertime=$user, systime=$system, SZ:RSS=$1\n" ;
} else {
	print STDERR "usertime=$user, systime=$system\n SZ:RSS=??" ;
}

} # end sub printstats()


sub write_bl_file() {
my $newfile = "${bl_outfile}.new" ;
open ( BLFILE, "> $newfile" ) or die "Can't write to $newfile" ;

print BLFILE  join( "\n", @knownbad_fifo_ip ), "\n" ;
close BLFILE ;

rename( $newfile, $bl_outfile ) ;

}
