#!/usr/bin/perl -w

# Script to take a list of directories and attempt to unify files, listing
# files with matching names (in different directories) but different
# contents, as well as distinct files with matching contents.
#
# With the -m option, also sort messages in mbox files

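# options:	[ options apply to files/dirs later on the command line ]
#	-r	report new content, nothing else
#	-f	report only duplicate content, nothing else
#	-b	--preload	report nothing; preload for comparisons
#	-m	check mboxes too
#	-n	don't check names
#	-u	don't sort the output; print reports as they are found
#	-v	verbose (-vv for more)
#	--stats	print the number of reports at the end
#	-x	--exclude	exclude the files named after this option
#	-o	--noexclude	stop excluding files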

use strict;
use Digest;
use Digest::MD5;
use Mail::Box::Manager;
use Fcntl;
use BerkeleyDB;
use Cwd 'abs_path';

# Persistent checksum cache.  Tied below to an on-disk BerkeleyDB hash;
# do_file stores "<md5> <size> <ctime>" under each file's absolute path.
my %database;

# parameter stuff (changed by the command-line switches)
my $report_new        = 0;	# emit "New content" lines
my $report_fname      = 1;	# emit "Dup filename" lines
my $report_dup        = 1;	# emit "Dup content" lines
my $check_names       = 1;	# emit "Bad filename" lines
my $sort_mail         = 0;	# also digest individual mbox messages
my $sort_report       = 1;	# queue report lines and print them sorted at exit
my $verbose           = 0;	# 0 = quiet, 1 = cache activity, 2 = everything
my $exclude           = 0;	# true while names on the command line are exclusions
my $report_statistics = 0;	# print the number of reports at exit

my ($fname,$arg);
my $files = {};		# name => full name of the last entry reported under it
my $sums = {};		# digest => full name of the last entry with that content
my %excludes;		# name => number of times it is still to be skipped
my $digest;
my $mailboxmanager;

# Names that are never reported as duplicate file names.
my $ignorefiles = {
	'logo.gif' => 1,
	'spacer.gif' => 1,
	'prototype.js' => 1,
	'style.css' => 1,
};

# Digests that are never reported as duplicate content.
my $ignoresums = {
	'56398e76be6355ad5999b262208a17c9' => 1,	# 1x1 gif
	'221d8352905f2c38b3cb2bd191d630b0' => 1,	# another 1x1 gif
	'df3e567d6f16d040326c7a0ea29a4f41' => 1,	# still another 1x1 gif
	'd41d8cd98f00b204e9800998ecf8427e' => 1,	# zero-length
};

# Report lines queued for sorted output (see $sort_report).
my @all_reports;

$digest = Digest->new('MD5');
$mailboxmanager = Mail::Box::Manager->new;

# Expand ~ once, into a scalar, so the file name can be shown in the warning
# and so glob's list-context result cannot shift the tie arguments.
my ($db_file) = glob("~/.var/filesort.db");

# A failed tie is not fatal: %database then stays an ordinary in-memory hash
# and the run simply goes uncached.  Say which file failed and why.
tie %database, 'BerkeleyDB::Hash',
		-Filename	=> $db_file,
		-Flags		=> DB_CREATE
or warn "Couldn't tie DB $db_file: $! $BerkeleyDB::Error";

# With no arguments at all, scan the current directory.
if (!@ARGV)
{
	@ARGV = ('.');
}

# Walk the command line in order.  A switch changes the settings for
# everything that follows it; any other word is either recorded as an
# exclusion (while -x is in effect) or scanned immediately.
my %switch_handlers = (
	# any new content?
	'-r'      => sub { $report_new = 1; $report_fname = 0; $report_dup = 0; },
	# don't care about filenames, just content
	'-f'      => sub { $report_new = 0; $report_fname = 0; $report_dup = 1; },
	# just preloading for comparison
	'-b'      => sub { $report_new = 0; $report_fname = 0; $report_dup = 0; },
	# check individual message bodies where in mbox format
	# (keep in mind that the numbers are for unsorted mailboxes)
	'-m'      => sub { $sort_mail = 1; },
	# don't check names
	'-n'      => sub { $check_names = 0; },
	# u = unsorted: don't sort output
	'-u'      => sub { $sort_report = 0; },
	# verbose, one or two levels at a time
	'-v'      => sub { $verbose++; },
	'-vv'     => sub { $verbose += 2; },
	'--stats' => sub { $report_statistics = 1; },
	# start / stop treating names as exclusions
	'-x'      => sub { $exclude = 1; },
	'-o'      => sub { $exclude = 0; },
);

# long spellings share the handler of their short form
$switch_handlers{'--preload'}   = $switch_handlers{'-b'};
$switch_handlers{'--exclude'}   = $switch_handlers{'-x'};
$switch_handlers{'--noexclude'} = $switch_handlers{'-o'};

foreach my $word (@ARGV)
{
	my $handler = $switch_handlers{$word};
	if (defined($handler))
	{
		$handler->();
		next;
	}

	if (!$exclude)
	{
		# an ordinary file or directory name: scan it now
		do_entry('.', $word);
		next;
	}

	# exclusion mode: remember the name, once per mention
	$excludes{$word}++;
	print "Excluding $word\n" if $verbose >= 1;
}

# Emit one report line.
#   $line - complete report text, including its trailing newline
# With sorted output enabled ($sort_report) the line is queued in
# @all_reports; otherwise it is printed straight away.
sub report_line {
	my ($text) = @_;

	if (!$sort_report)
	{
		return print $text;
	}
	return push @all_reports, $text;
}

# Report on one piece of content, then remember it for later comparisons.
#   $fname - name shown in the report lines
#   $file  - name used as the key for duplicate-name detection
#   $sum   - digest used as the key for duplicate-content detection
# Which lines are produced depends on $report_new, $report_dup and
# $report_fname; entries listed in $ignoresums / $ignorefiles are never
# reported as duplicates.
sub do_report {
	my ($fname, $file, $sum) = @_;

	# what was recorded under the same keys before this entry, if anything
	my $earlier_by_sum  = $sums->{$sum};
	my $earlier_by_name = $files->{$file};
	my $content_known   = defined($earlier_by_sum);

	if ($report_new && !$content_known)
	{
		report_line("New content: $fname\n");
	}

	my $is_dup_content = $report_dup
		&& $content_known
		&& !defined($ignoresums->{$sum});
	my $is_dup_name = $report_fname
		&& defined($earlier_by_name)
		&& !defined($ignorefiles->{$file});

	# a duplicate-content line takes precedence over a duplicate-name line
	if ($is_dup_content)
	{
		report_line("Dup content: $fname, $earlier_by_sum\n");
	}
	elsif ($is_dup_name)
	{
		report_line("Dup filename: $file ($fname, $earlier_by_name)\n");
	}

	# this entry becomes the one later duplicates are compared against
	$sums->{$sum} = $fname;
	$files->{$file} = $fname;
}

# Checksum one regular file and report on it.
#   $path - directory holding the file ('.' means $file is used as given)
#   $file - name of the file within $path
#
# Steps:
#   1. report the name as bad if it fails the sanity patterns (unless -n);
#   2. with -m, open the file as a mailbox and report every message body
#      under the name "<n>:<name>";
#   3. look the file up in the %database cache, keyed by absolute path, and
#      reuse the stored digest when size and ctime are unchanged; otherwise
#      digest the file and store "<md5> <size> <ctime>";
#   4. hand the digest to do_report.
#
# Dies if the name cannot be resolved to an absolute path, if a value needed
# for the cache record is undefined, or if the record cannot be read back
# after it has been written.  A file that cannot be opened only warns.
sub do_file {
	my $path = shift @_;
	my $file = shift @_;
	my $sum;

	# something's wrong if these aren't defined..
	if (!defined($path) || !defined($file))
	{
		die "Filename undefined";
	}

	# get full name: local dir is just $file, otherwise concat path,file
	my $fname = ($path eq '.') ? $file : $path . '/' . $file;

	# Resolve the cache key from $fname itself.  Building it from
	# "$path/$file" turned an absolute argument such as /x/y into .//x/y,
	# which resolves relative to the current directory.  Check the result
	# before it is used as a hash key.
	my $abs_path = abs_path($fname);
	if (!defined($abs_path))
	{
		die "abs_path undefined";
	}

	# check for sane names
	my $bad = 1;
	if ($file =~ /^[\/0-9a-zA-Z\._=][= :~,+\/a-zA-Z0-9_\.\-]*$/
		|| $file =~ /^[0-9A-Za-z\(\) \-]*\.(mp3|pdf|epub)$/)
	{
		$bad = 0;
	}

	# only size (7) and ctime (10) are needed; both are undef if stat fails
	my ($size, $ctime) = (stat($fname))[7, 10];

	if ($bad && $check_names)
	{
		report_line("Bad filename: $fname\n");
	}

	if ($sort_mail)
	{
		my $count = 0;

		my $mbox = $mailboxmanager->open($fname,
			 access => 'r');
		if (defined($mbox))
		{
			foreach my $message ($mbox->messages)
			{
				$count++;
				$digest->add($message->body->string);
				# resets $digest
				my $message_sum = $digest->hexdigest;
				do_report($count . ":" . $fname, $count . ":" . $file, $message_sum);
			}
			$mbox->close;	# readonly; ignore close result
		}
		elsif ($verbose >= 1)
		{
			# not every file is a mailbox; carry on with the file itself
			print "Not a mailbox: $fname\n";
		}
	}

	my $cache_valid = 0;
	my $cache_sum;

	my $db_result = $database{$abs_path};
	if (!defined($db_result))
	{
		if ($verbose >= 1)
		{
			print "Cache invalid: $fname: undef\n";
		}
	}
	else
	{
		my ($cache_size, $cache_ctime);
		($cache_sum, $cache_size, $cache_ctime) = split(' ', $db_result);

		# a truncated record or a failed stat can never match
		if (defined($cache_ctime) && defined($size) && defined($ctime)
			&& ($cache_size == $size) && ($cache_ctime == $ctime))
		{
			$cache_valid = 1;
		}
		elsif ($verbose >= 1)
		{
			print "Cache invalid: $fname: changed\n";
		}
	}

	if ($cache_valid)
	{
		$sum = $cache_sum;
		if ($verbose >= 2)
		{
			print "Cached: $fname: $sum\n";
		}
	}
	elsif (open(my $fh, '<', $fname))
	{
		# Three-argument open: the name is only ever a file to read, even
		# if it starts with '>' or ends with '|'.
		binmode($fh);
		$digest->addfile($fh);
		close($fh);
		$sum = $digest->hexdigest;	# resets $digest

		if (!defined($sum))
		{
			die "cache-write sum undefined";
		}
		if (!defined($size))
		{
			die "cache-write size undefined";
		}
		if (!defined($ctime))
		{
			die "cache-write ctime undefined";
		}

		my $dbval = $sum . ' ' . $size . ' ' . $ctime;
		$database{$abs_path} = $dbval;
		if ($verbose >= 1)
		{
			print "Stat update [pre]: $fname: $abs_path: $dbval\n";
		}
		# read the record back to make sure the store really took
		if (!defined($database{$abs_path}))
		{
			die "db(abs_path) [$abs_path] undefined after write";
		}
		if ($verbose >= 1)
		{
			print "Stat update: $fname: $abs_path: $database{$abs_path}\n";
		}
	}
	else
	{
		warn "Couldn't open $fname: $!";
		$sum = undef;
	}

	if (defined($sum))
	{
		do_report($fname, $file, $sum);
	}
}

# Process every entry of one directory.
#   $dir - directory to read
# The whole listing is read and the handle closed before any entry is
# processed, so no directory handle stays open while do_entry runs.  The
# entries '.', '..' and '.svn' are skipped.  A directory that cannot be
# opened produces a warning only.
sub do_dir {
	my ($dir) = @_;

	my $handle;
	if (!opendir($handle, $dir))
	{
		warn "Can't open dir $dir";
		return;
	}
	my @entries = readdir($handle);
	closedir($handle);

	my @wanted = grep { $_ ne '.' && $_ ne '..' && $_ ne '.svn' } @entries;
	foreach my $entry (@wanted)
	{
		do_entry($dir, $entry);
	}
}

# Process one command-line argument or directory entry.
#   $path - directory containing the entry ('.' for command-line arguments)
#   $arg  - name of the entry within $path
#
# A name with a pending exclusion in %excludes uses up one exclusion and is
# skipped.  Otherwise symbolic links are ignored, plain files go to do_file,
# directories go to do_dir, any other existing type gets a warning, and a
# name that does not exist is reported.
sub do_entry {
	my $path = shift @_;
	my $arg = shift @_;

	# Strip trailing slashes BEFORE building the name.  Doing it afterwards
	# turned an argument like "dir/" into "dir//name", which never matched
	# an exclusion given as "dir/name" and leaked into later reports.
	$path =~ s{/+\z}{};

	# entries of the current directory are known by their bare name
	my $file = ($path eq '.') ? $arg : $path . '/' . $arg;

	if ($verbose >= 2)
	{
		print "Test $file for exclusion\n";
	}

	if (defined($excludes{$file}) && ($excludes{$file} > 0))
	{
		if ($verbose >= 1)
		{
			print "Excluded $file\n";
		}
		# each mention after -x excludes one occurrence
		$excludes{$file}--;
		return;
	}

	if (-l $file)
	{
		#link: ignore
	}
	elsif (-f $file)
	{
		do_file($path, $arg);
	}
	elsif (-d $file)
	{
		do_dir($file);
	}
	elsif (-e $file)
	{
		warn "Unexpected type: $file";
	}
	else
	{
		report_line("Nonexistent file: $file\n");
	}
}

# Print the queued report lines in sorted order.  The queue only holds lines
# collected while sorted output was enabled; lines printed directly by
# report_line are not in it and are not counted below.
my @ordered_reports = sort @all_reports;
print @ordered_reports;

if ($report_statistics)
{
	my $report_count = scalar(@all_reports);
	print "$report_count reports.\n";
}

# release the checksum cache
untie %database;
