#!/usr/bin/perl
# Process mail from the IMAP server's shared spam and not-spam folders
# (named by $spamfolder and $hamfolder below) through sa-learn
# Heavily reworked from original:
# dmz@dmzs.com - March 19, 2004
# http://www.dmzs.com/tools/files/spam.phtml
# LGPL
# by:
# Kris Deugau <kdeugau@deepnet.cx> 2009/01/21
##wrapreq libmail-imapclient-perl (>= 3.11), libio-socket-ssl-perl, install-sa
##wrapver 0.2.4
##wrapsum Learn from spam and ham IMAP folders
# $Id: imap-learner 82 2010-04-30 18:43:32Z kdeugau $

use strict;
use warnings;
use File::Temp qw(tempdir);
use IO::Socket::SSL;
use Mail::IMAPClient;

# --- Diagnostics -----------------------------------------------------------
my $debug = 0;		# print folder counts, temp dir listings and pyzor output
my $sadebug = 0;	# adds -D to sa-learn; very noisy, only enable when specifically needed
my $imapdebug = 0;	# passed to Mail::IMAPClient as Debug; dumps the full content of every message read
my $verbose = 1;	# print what was written and what sa-learn reported

# --- Behaviour -------------------------------------------------------------
my $delete_after_learning = 0;	# set to 1 if you want to delete mail right away after learning
				# - note this makes it rather harder to deep-scan the messages to create local rules

# --- Paths and sa-learn invocation -----------------------------------------
my $tmpdir = '/var/tmp';	# parent directory for the per-run scratch directories
my $salearn = '/usr/local/bin/sa-learn';
my $learnargs = ($sadebug ? ' -D' : '').' --showdots ';

# Fail early: a missing or non-executable sa-learn would otherwise only show
# up later as empty backtick output.
die "eeep!  $salearn doesn't exist!\n" if ! -e $salearn;
die "eeep!  $salearn isn't executable!\n" if ! -x $salearn;

# --- IMAP folders to learn from ---------------------------------------------
my $spamfolder = 'confirmed';
my $hamfolder = 'notspam';

# --- Result buffers ---------------------------------------------------------
my $learnresults;	# combined stdout/stderr of the most recent sa-learn run
my $pylearn = '';	# pyzor report output; stays empty while the pyzor call is disabled,
			# but must be defined because it is interpolated when $debug is on

# non-SSL IMAP settings:
#my $imap = Mail::IMAPClient->new( Server=> 'imapmailhost:143',
#                                  User => 'imapspamuser',
#                                  Password => 'imapspamuserpassword',
#                                  Debug => $imapdebug);

# SSL IMAP settings: open the TLS socket ourselves so a connection failure is
# reported with its real cause instead of being passed on as an undef Socket.
my $imapsock = IO::Socket::SSL->new(
        Proto    => 'tcp',
        PeerAddr => 'imapserver.example.com',
        PeerPort => 993, # IMAP over SSL standard port
        )
  or die "IMAP connection failed: ".IO::Socket::SSL::errstr()."\n";

my $imap = Mail::IMAPClient->new(
        User     => 'imapspamuser',
        Password => 'imapspamuserpassword',
        Socket   => $imapsock,
        Debug    => $imapdebug,
        );

# When a ready-made Socket is supplied, new() still returns an object if the
# login over that socket is refused, so test the authentication state too.
if (!defined($imap) || !$imap->IsAuthenticated) {
  my $why = defined($imap) ? $imap->LastError : $@;
  die "IMAP Login Failed".($why ? ": $why" : '');
}

# Ask the server how many messages each folder holds.  message_count() gives
# undef when the request fails (e.g. the folder does not exist); report that
# and fall back to 0 so undef never reaches the comparisons and prints below.
my $spamcount = $imap->message_count($spamfolder);
if (!defined $spamcount) {
  warn "couldn't get message count for '$spamfolder': ".($imap->LastError || 'unknown error')."\n";
  $spamcount = 0;
}

my $hamcount = $imap->message_count($hamfolder);
if (!defined $hamcount) {
  warn "couldn't get message count for '$hamfolder': ".($imap->LastError || 'unknown error')."\n";
  $hamcount = 0;
}

# If debugging, print out the total counts for each mailbox
if ($debug) {
  print $spamcount, " spam(s) to process\n";
  print $hamcount, " ham(s) to process\n";
}

## Process the spam mailbox
# Copy every message in $spamfolder into a private scratch directory, let
# sa-learn --spam iterate over that directory (a mockery of a maildir), then
# optionally delete the messages that were learned.
my @spammsgs;	# message IDs found in the spam folder
my $spamtmp;	# scratch directory handed to sa-learn

SPAM: {
  # Nothing useful can happen if the folder can't be opened; say why and move on.
  if (!$imap->select($spamfolder)) {
    warn "couldn't select spam folder '$spamfolder': ".($imap->LastError || 'unknown error')."\n";
    last SPAM;
  }

  # The search result, not an earlier folder count, is the authoritative list
  # of what is in the folder right now.  Drop any undef a failed search returns.
  @spammsgs = grep { defined } $imap->search("ALL");

  # tempdir() picks an unpredictable name and creates the directory private to
  # this user; CLEANUP removes it at exit if we die before tidying up ourselves.
  $spamtmp = eval { tempdir('spam.XXXXXXXX', DIR => $tmpdir, CLEANUP => 1) }
    or die "couldn't create temporary pen for spam: $@";

  my @spamfiles;	# files successfully written into $spamtmp
  my @spamlearned;	# message IDs whose content is in @spamfiles
  for my $msgid (@spammsgs) {
    my $file = "$spamtmp/".scalar(@spamfiles);
    if ($imap->message_to_file($file, $msgid)) {
      push @spamfiles, $file;
      push @spamlearned, $msgid;
      # leave until we get a local pyzor server working
      #  $pylearn .= `pyzor report < $file`;
    } else {
      warn "couldn't fetch spam message $msgid: ".($imap->LastError || 'unknown error')."\n";
      unlink $file;	# don't let sa-learn see a partial message
    }
  }
  if ($verbose) {
    print "wrote ".scalar(@spamfiles)." spams in $spamtmp:\n";
    print `ls -lt $spamtmp` if $debug;
  }

  # Larn'em
  $learnresults = `$salearn $learnargs --spam $spamtmp 2>&1`;
  my $learned_ok = ($? == 0);
  warn "sa-learn --spam failed (wait status $?)\n" if !$learned_ok;
  $learnresults = '' if !defined $learnresults;
  #$pylearn = `find $spamtmp -type f -exec 'pyzor report < {}' \;`;
  print "-------\nSpam:\n".$learnresults.($debug ? "\n".(defined $pylearn ? $pylearn : '') : '')."\n-------\n" if $verbose;

  # Clean 'em up
  unlink @spamfiles;
  rmdir $spamtmp or warn "couldn't remove $spamtmp: $!\n";

  # Only delete mail that was fetched AND handed to a successful sa-learn run;
  # anything else stays on the server for the next pass.
  if ($delete_after_learning && $learned_ok) {
    for my $msgid (@spamlearned) { $imap->delete_message($msgid); }
    $imap->expunge();
  }

  $imap->close();
}


## Process the not-spam mailbox
# Copy every message in $hamfolder into a private scratch directory, let
# sa-learn --ham iterate over that directory (a mockery of a maildir), then
# optionally delete the messages that were learned.
my @hammsgs;	# message IDs found in the ham folder
my $hamtmp;	# scratch directory handed to sa-learn

HAM: {
  # Nothing useful can happen if the folder can't be opened; say why and move on.
  if (!$imap->select($hamfolder)) {
    warn "couldn't select ham folder '$hamfolder': ".($imap->LastError || 'unknown error')."\n";
    last HAM;
  }

  # The search result, not an earlier folder count, is the authoritative list
  # of what is in the folder right now.  Drop any undef a failed search returns.
  @hammsgs = grep { defined } $imap->search("ALL");

  # tempdir() picks an unpredictable name and creates the directory private to
  # this user; CLEANUP removes it at exit if we die before tidying up ourselves.
  $hamtmp = eval { tempdir('ham.XXXXXXXX', DIR => $tmpdir, CLEANUP => 1) }
    or die "couldn't create temporary pen for ham: $@";

  my @hamfiles;		# files successfully written into $hamtmp
  my @hamlearned;	# message IDs whose content is in @hamfiles
  for my $msgid (@hammsgs) {
    my $file = "$hamtmp/".scalar(@hamfiles);
    if ($imap->message_to_file($file, $msgid)) {
      push @hamfiles, $file;
      push @hamlearned, $msgid;
    } else {
      warn "couldn't fetch ham message $msgid: ".($imap->LastError || 'unknown error')."\n";
      unlink $file;	# don't let sa-learn see a partial message
    }
  }
  if ($verbose) {
    print "wrote ".scalar(@hamfiles)." hams in $hamtmp:\n";
    print `ls -lt $hamtmp` if $debug;
  }

  # Larn'em
  $learnresults = `$salearn $learnargs --ham $hamtmp 2>&1`;
  my $learned_ok = ($? == 0);
  warn "sa-learn --ham failed (wait status $?)\n" if !$learned_ok;
  $learnresults = '' if !defined $learnresults;
  print "-------\nHam:\n",$learnresults,"\n-------\n" if $verbose;

  # Clean 'em up
  unlink @hamfiles;
  rmdir $hamtmp or warn "couldn't remove $hamtmp: $!\n";

  # Only delete mail that was fetched AND handed to a successful sa-learn run;
  # anything else stays on the server for the next pass.
  if ($delete_after_learning && $learned_ok) {
    for my $msgid (@hamlearned) { $imap->delete_message($msgid); }
    $imap->expunge();
  }

  $imap->close();
}

# Close IMAP connection cleanly.
$imap->logout();

# integrate learned stuff - journal sync etc IFF bayes_journal is set AND you want to sync right away
# irrelevant for SQL Bayes
#my $sarebuild = `/usr/bin/sa-learn --rebuild`;
#print "-------\nRebuild: ",$sarebuild,"\n-------\n" if $debug;

