- 1 pass version. Faster, uses less memory, works as a filter
- 1 pass version. Faster, uses less memory, works as a filter
git-svn-id: https://siedziba.pl:790/svn/repos/dbxrecover@235 455248ca-bdda-0310-9134-f4ebb693071a
#!/usr/bin/perl
# dbxrecover, a program for recovering mail from damaged Outlook dbx files.
# Copyright (C) 2005 Piotr Pawlow <pp@siedziba.pl>
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
use strict;
{
package DBX::Chain;
sub new
{
my $class = shift;
my $self = {};
my $arrayref = shift;
my @ca = @{$arrayref};
$self->{chunks} = \@ca;
$self->{idx} = scalar @ca;
bless ($self, $class);
return $self;
}
# we assume here the single message is small enough to fit into memory
# this may not be a case for publishing companies or other businesses sending huge amounts of data by mail :)
sub data
{
my $self = shift;
my $data = "";
for(@{$self->{chunks}})
{
$data .= $_->data;
}
return $data;
}
sub broken
{
my $self = shift;
return ($self->{chunks}->[$self->{idx}-1]->next != 0);
}
}
{
package DBX::ChainFinder;
sub new
{
my $class = shift;
my $self = {};
bless ($self, $class);
$self->reset;
return $self;
}
sub reset
{
my $self = shift;
$self->{chunks} = {};
$self->{first} = {};
}
sub add
{
my $self = shift;
my $chunk = shift;
my $id = $chunk->id;
my $next = $chunk->next;
if (!defined($self->{chunks}->{$id}))
{
$self->{chunks}->{$id} = [];
}
else
{
for (@{$self->{chunks}->{$id}})
{
return if (!$chunk->differ($_)); # don't add chunk if there is a duplicate present
}
}
push @{$self->{chunks}->{$id}}, $chunk;
# we store if there is any other chunk pointing to this one
if (!defined($self->{first}->{$id})) { $self->{first}->{$id} = 1 };
$self->{first}->{$next} = 0;
# adding a new chunk requires rescanning for chains
$self->{chains} = undef;
}
sub find
{
my $self = shift;
$self->{chains} = [];
for(keys %{$self->{first}})
{
if ($self->{first}->{$_} == 1)
{
# if there is no other chunk pointing to this one, start following it
$self->walk($_);
}
}
return $self->{chains};
}
sub walk
{
my $self = shift;
my $id = shift;
my $chain = shift;
my $seen = shift;
my $combinations = shift;
return if ($combinations > 16); # we don't extract the message if there are too many branches
if (!defined($chain)) { $chain = [] };
if (!defined($seen)) { $seen = {} };
if (!defined($combinations)) { $combinations = 1 };
$seen->{$id} = 1; # note chunk identifiers for infinite loop testing
for(@{$self->{chunks}->{$id}})
{
my $chunk = $_;
push @{$chain}, $chunk;
if (defined($self->{chunks}->{$chunk->next}) && !$seen->{$chunk->next})
{
$self->walk($chunk->next, $chain, $seen, $combinations * scalar @{$self->{chunks}->{$chunk->next}});
}
else
{
push @{$self->{chains}}, DBX::Chain->new($chain);
}
pop @{$chain};
}
$seen->{$id} = 0;
}
}
{
package DBX::Chunk;
use Digest::SHA1 qw(sha1);
sub new
{
my $class = shift;
my $self = {};
$self->{file} = shift;
return if ($self->{file}->read($self->{header}, 16) != 16);
my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $self->{header});
# test for a valid header
# chunk size is always 512 bytes in all dbx files I have seen
if (
($chunksize == 512) &&
($datasize <= $chunksize) &&
($datasize > 0) &&
($id != 0) &&
(($next == 0) || ($datasize == $chunksize))
)
{
$self->{filepos} = $self->{file}->getpos;
# if the header seems valid, we skip the whole chunk
# the chance we miss a start of another chunk is low, because
# the test above is pretty strict and false positives are quite rare
# it also helps in cases when there are dbx files contained inside dbx
$self->{file}->seek($chunksize, 1);
$self->{hash} = undef;
($self->{id}, $self->{chunksize}, $self->{datasize}, $self->{next}) = ($id, $chunksize, $datasize, $next);
bless ($self, $class);
return $self;
}
else
{
# skip 4 bytes and try again
# headers were always at 4 byte boundary in every dbx file I have seen
$self->{file}->seek(-12, 1);
return 0;
}
}
sub datahash
{
my $self = shift;
if (!defined($self->{hash})) { $self->{hash} = sha1($self->data) };
return $self->{hash};
}
sub differ
{
my $self = shift;
my $other = shift;
return 1 if (($self->{header}) ne ($other->{header}));
# use a hash for comparision, to avoid storing message data, or retrieving message data on each comparision
return 1 if ($self->datahash ne $other->datahash);
return 0;
}
sub data
{
my $self = shift;
my $data;
$self->{file}->setpos($self->{filepos});
$self->{file}->read($data, $self->{datasize});
return $data;
}
sub id
{
my $self = shift;
return $self->{id};
}
sub next
{
my $self = shift;
return $self->{next};
}
}
{
package DBX::Scan;
sub new
{
my $class = shift;
my $self = {};
$self->{file} = shift;
$self->{cf} = DBX::ChainFinder->new;
bless ($self, $class);
return $self;
}
sub scan
{
my $self = shift;
$self->{file}->binmode();
$self->{file}->seek(0, 0);
$self->{cf}->reset;
while (defined(my $chunk = DBX::Chunk->new($self->{file})))
{
$self->{cf}->add($chunk) if ($chunk);
}
}
sub chains
{
my $self = shift;
return $self->{cf}->find;
}
}
use IO::File;
(my $dbxname=$ARGV[0]) and ((scalar @ARGV) == 1) or print(STDERR "Usage: $0 input.dbx >output.mbox\n"),exit;
print STDERR "Opening $dbxname for reading...\n";
my $dbxfile = IO::File->new($dbxname, "r") or die("Error opening file!\n");
my $dbxscanner = DBX::Scan->new($dbxfile);
print STDERR "Searching for chunks...\n";
$dbxscanner->scan;
print STDERR "Reconstructing chains...\n";
my $chains = $dbxscanner->chains;
print STDERR "Printing...\n";
for(@{$chains})
{
my $message = $_->data;
$message =~ s/^>*From />$&/m; # mbox "From " escaping
# we don't have the original envelope address, so just output a fake one
print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n";
print $message;
print "\r\n\r\n";
}