#!/usr/bin/perl
# dbxrecover, a program for recovering mail from damaged Outlook dbx files.
# Copyright (C) 2005 Piotr Pawlow <pp@siedziba.pl>
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
use strict;
use warnings;
{
package DBX::Chain;
sub new
{
my $class = shift;
my $self = {};
my $arrayref = shift;
my @ca = @{$arrayref};
$self->{chunks} = \@ca;
$self->{idx} = scalar @ca;
bless ($self, $class);
return $self;
}
# we assume here the single message is small enough to fit into memory
# this may not be a case for publishing companies or other businesses sending huge amounts of data by mail :)
sub data
{
my $self = shift;
my $data = "";
for(@{$self->{chunks}})
{
$data .= $_->data;
}
return $data;
}
sub broken
{
my $self = shift;
return ($self->{chunks}->[$self->{idx}-1]->next ne "\x00\x00\x00\x00");
}
}
{
package DBX::ChainFinder;
sub new
{
my $class = shift;
my $self = {};
bless ($self, $class);
$self->reset;
return $self;
}
sub reset
{
my $self = shift;
$self->{chunks} = {};
$self->{first} = {};
$self->{stats} =
{ 'chunkstot' => 0,
'chunksdupl' => 0,
'chunkidsdupl' => 0,
'chainstot' => 0,
'chainloops' => 0,
'chainsign' => 0,
'chainsbroken' => 0,
};
}
sub add
{
my $self = shift;
my $chunk = shift;
my $id = $chunk->id;
my $next = $chunk->next;
$self->{stats}->{chunkstot}++;
my $chunks = $self->{chunks}->{$id};
if (defined($chunks))
{
my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks);
for (@chunks)
{
$self->{stats}->{chunksdupl}++,return if (!$chunk->differ($_)); # don't add chunk if there is a duplicate present
}
$self->{stats}->{chunkidsdupl}++;
push @chunks, $chunk;
$self->{chunks}->{$id} = \@chunks;
}
else
{
$self->{chunks}->{$id} = $chunk;
}
# we store if there is any other chunk pointing to this one
if (!defined($self->{first}->{$id})) { $self->{first}->{$id} = 1 };
$self->{first}->{$next} = 0;
}
sub find
{
my $self = shift;
my $first = $self->{first};
my @ids;
while (my ($key, $value) = each %$first)
{
push @ids,$key if $value == 1;
}
return \@ids;
}
sub walk_int
{
no warnings qw(recursion);
my $self = shift;
my $id = shift;
my $chain = shift;
my $chains = shift;
my $seen = shift;
my $combinations = shift;
$self->{stats}->{chainsign}++,return if ($combinations > 16); # we don't extract the message if there are too many branches
$seen->{$id} = 1; # note chunk identifiers for infinite loop testing
my $chunks = $self->{chunks}->{$id};
my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks);
for(@chunks)
{
my $chunk = $_;
push @{$chain}, $chunk;
if (defined($self->{chunks}->{$chunk->next}) && !$seen->{$chunk->next})
{
my $nextchunks = $self->{chunks}->{$chunk->next};
my $nextcount = (ref($nextchunks) eq "ARRAY") ? scalar @{$nextchunks} : 1;
$self->walk_int($chunk->next, $chain, $chains, $seen, $combinations * $nextcount);
}
else
{
$self->{stats}->{chainloops}++ if $seen->{$chunk->next};
my $chainobj = DBX::Chain->new($chain);
$self->{stats}->{chainsbroken}++ if $chainobj->broken;
$self->{stats}->{chainstot}++;
push @{$chains}, $chainobj;
}
pop @{$chain};
}
$seen->{$id} = 0;
}
sub walk
{
my $self = shift;
my $id = shift;
my $chains = [];
$self->walk_int($id, [], $chains, {}, 1);
return $chains;
}
sub stats
{
my $self = shift;
return $self->{stats};
}
}
{
package DBX::Chunk;
use Digest::SHA1 qw(sha1);
sub new
{
my $class = shift;
my $self = {};
$self->{f} = shift;
return if ($self->{f}->read($self->{h}, 16) != 16);
my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $self->{h});
# test for a valid header
# chunk size is always 512 bytes in all dbx files I have seen
if (
($chunksize == 512) &&
($datasize <= $chunksize) &&
($datasize > 0) &&
($id != 0) &&
(($next == 0) || ($datasize == $chunksize))
)
{
$self->{p} = $self->{f}->getpos;
# if the header seems valid, we skip the whole chunk
# the chance we miss a start of another chunk is low, because
# the test above is pretty strict and false positives are quite rare
# it also helps in cases when there are dbx files contained inside dbx
$self->{f}->seek($chunksize, 1);
bless ($self, $class);
return $self;
}
else
{
# skip 4 bytes and try again
# headers were always at 4 byte boundary in every dbx file I have seen
$self->{f}->seek(-12, 1);
return 0;
}
}
sub datahash
{
my $self = shift;
if (!defined($self->{hash})) { $self->{hash} = sha1($self->data) };
return $self->{hash};
}
sub differ
{
my $self = shift;
my $other = shift;
return 1 if (($self->{h}) ne ($other->{h}));
# use a hash for comparision, to avoid storing message data, or retrieving message data on each comparision
return 1 if ($self->datahash ne $other->datahash);
return 0;
}
sub data
{
my $self = shift;
my $datasize = unpack("V", substr($self->{h}, 8, 4));
my $data;
my $pos = $self->{f}->getpos;
$self->{f}->setpos($self->{p});
$self->{f}->read($data, $datasize);
$self->{f}->setpos($pos);
return $data;
}
sub id
{
my $self = shift;
return substr($self->{h}, 0, 4);
}
sub next
{
my $self = shift;
return substr($self->{h}, 12, 4);
}
}
{
package DBX::Scan;
sub new
{
my $class = shift;
my $self = {};
$self->{file} = shift;
$self->{cf} = DBX::ChainFinder->new;
bless ($self, $class);
return $self;
}
sub scan
{
my $self = shift;
$self->{file}->binmode();
$self->{file}->seek(0, 0);
$self->{cf}->reset;
my $time = 0;
while (defined(my $chunk = DBX::Chunk->new($self->{file})))
{
if ($chunk)
{
$self->{cf}->add($chunk);
if (time - $time > 1)
{
$self->printchunkstats;
$time = time;
}
}
}
$self->printchunkstats;
print STDERR "\n";
}
sub chains
{
my $self = shift;
my $chainids = $self->{cf}->find;
my $time = 0;
my $tot = scalar @{$chainids};
my $idx = 0;
for (@{$chainids})
{
my $chains = $self->{cf}->walk($_);
$idx++;
for(@{$chains})
{
my $message = $_->data;
$message =~ s/^>*From />$&/mg; # mbox "From " escaping
# we don't have the original envelope address, so just output a fake one
print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n";
print $message;
print "\r\n\r\n";
}
if (time - $time > 1)
{
$self->printchainstats(int($idx/$tot*100));
$time = time;
}
}
$self->printchainstats(100);
}
sub printchunkstats
{
my $self = shift;
my $stats = $self->{cf}->stats;
print STDERR "Chunks found: ".$stats->{chunkstot}."; ";
print STDERR "Duplicates: ".$stats->{chunksdupl}."; ";
print STDERR "Duplicate IDs: ".$stats->{chunkidsdupl}."\r";
}
sub printchainstats
{
my $self = shift;
my $percdone = shift;
my $stats = $self->{cf}->stats;
print STDERR "Chains found: ".$stats->{chainstot}."; ";
print STDERR "Broken: ".$stats->{chainsbroken}."; ";
print STDERR "Loops: ".$stats->{chainloops}."; ";
print STDERR "Dropped: ".$stats->{chainsign}."; ";
print STDERR $percdone."% done\r";
}
}
use IO::File;
(my $dbxname=$ARGV[0]) and ((scalar @ARGV) == 1) or print(STDERR "Usage: $0 input.dbx >output.mbox\n"),exit;
print STDERR "Opening $dbxname for reading...\n";
my $dbxfile = IO::File->new($dbxname, "r") or die("Error opening file!\n");
my $dbxscanner = DBX::Scan->new($dbxfile);
print STDERR "Searching for chunks...\n";
$dbxscanner->scan;
print STDERR "Searching for chains and printing...\n";
$dbxscanner->chains;
print STDERR "\n";