summaryrefslogtreecommitdiff
path: root/dbxrecover-2p
diff options
context:
space:
mode:
authorpp <pp@455248ca-bdda-0310-9134-f4ebb693071a>2006-05-09 18:11:30 +0000
committerpp <pp@455248ca-bdda-0310-9134-f4ebb693071a>2006-05-09 18:11:30 +0000
commitd79a49d882243ba777014c605af60e843b91fbc5 (patch)
treef337cb456d3d0b76513205251a2dd4c55153382f /dbxrecover-2p
parent79a1cc8cecba5a66b3462e7228b2adb37051ad0f (diff)
- 1 pass version. Faster, uses less memory, works as a filter
git-svn-id: https://siedziba.pl:790/svn/repos/dbxrecover@235 455248ca-bdda-0310-9134-f4ebb693071a
Diffstat (limited to 'dbxrecover-2p')
-rwxr-xr-xdbxrecover-2p368
1 files changed, 368 insertions, 0 deletions
diff --git a/dbxrecover-2p b/dbxrecover-2p
new file mode 100755
index 0000000..6d2f71c
--- /dev/null
+++ b/dbxrecover-2p
@@ -0,0 +1,368 @@
+#!/usr/bin/perl
+
+# dbxrecover, a program for recovering mail from damaged Outlook dbx files.
+# Copyright (C) 2005 Piotr Pawlow <pp@siedziba.pl>
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+use strict;
+use warnings;
+
+{
+ package DBX::Chain;
+
+ sub new
+ {
+ my $class = shift;
+ my $self = {};
+ my $arrayref = shift;
+ my @ca = @{$arrayref};
+ $self->{chunks} = \@ca;
+ $self->{idx} = scalar @ca;
+ bless ($self, $class);
+ return $self;
+ }
+
+ # we assume here the single message is small enough to fit into memory
+ # this may not be a case for publishing companies or other businesses sending huge amounts of data by mail :)
+ sub data
+ {
+ my $self = shift;
+ my $data = "";
+ for(@{$self->{chunks}})
+ {
+ $data .= $_->data;
+ }
+ return $data;
+ }
+
+ sub broken
+ {
+ my $self = shift;
+ return ($self->{chunks}->[$self->{idx}-1]->next ne "\x00\x00\x00\x00");
+ }
+}
+
+{
+ package DBX::ChainFinder;
+
+ sub new
+ {
+ my $class = shift;
+ my $self = {};
+ bless ($self, $class);
+ $self->reset;
+ return $self;
+ }
+
+ sub reset
+ {
+ my $self = shift;
+ $self->{chunks} = {};
+ $self->{first} = {};
+ $self->{stats} =
+ { 'chunkstot' => 0,
+ 'chunksdupl' => 0,
+ 'chunkidsdupl' => 0,
+ 'chainstot' => 0,
+ 'chainloops' => 0,
+ 'chainsign' => 0,
+ 'chainsbroken' => 0,
+ };
+ }
+
+ sub add
+ {
+ my $self = shift;
+ my $chunk = shift;
+ my $id = $chunk->id;
+ my $next = $chunk->next;
+ $self->{stats}->{chunkstot}++;
+ my $chunks = $self->{chunks}->{$id};
+ if (defined($chunks))
+ {
+ my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks);
+ for (@chunks)
+ {
+ $self->{stats}->{chunksdupl}++,return if (!$chunk->differ($_)); # don't add chunk if there is a duplicate present
+ }
+ $self->{stats}->{chunkidsdupl}++;
+ push @chunks, $chunk;
+ $self->{chunks}->{$id} = \@chunks;
+ }
+ else
+ {
+ $self->{chunks}->{$id} = $chunk;
+ }
+
+ # we store if there is any other chunk pointing to this one
+ if (!defined($self->{first}->{$id})) { $self->{first}->{$id} = 1 };
+ $self->{first}->{$next} = 0;
+ }
+
+ sub find
+ {
+ my $self = shift;
+ my $first = $self->{first};
+ my @ids;
+ while (my ($key, $value) = each %$first)
+ {
+ push @ids,$key if $value == 1;
+ }
+ return \@ids;
+ }
+
+ sub walk_int
+ {
+ no warnings qw(recursion);
+
+ my $self = shift;
+ my $id = shift;
+ my $chain = shift;
+ my $chains = shift;
+ my $seen = shift;
+ my $combinations = shift;
+ $self->{stats}->{chainsign}++,return if ($combinations > 16); # we don't extract the message if there are too many branches
+ $seen->{$id} = 1; # note chunk identifiers for infinite loop testing
+ my $chunks = $self->{chunks}->{$id};
+ my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks);
+ for(@chunks)
+ {
+ my $chunk = $_;
+ push @{$chain}, $chunk;
+ if (defined($self->{chunks}->{$chunk->next}) && !$seen->{$chunk->next})
+ {
+ my $nextchunks = $self->{chunks}->{$chunk->next};
+ my $nextcount = (ref($nextchunks) eq "ARRAY") ? scalar @{$nextchunks} : 1;
+ $self->walk_int($chunk->next, $chain, $chains, $seen, $combinations * $nextcount);
+ }
+ else
+ {
+ $self->{stats}->{chainloops}++ if $seen->{$chunk->next};
+ my $chainobj = DBX::Chain->new($chain);
+ $self->{stats}->{chainsbroken}++ if $chainobj->broken;
+ $self->{stats}->{chainstot}++;
+ push @{$chains}, $chainobj;
+ }
+ pop @{$chain};
+ }
+ $seen->{$id} = 0;
+ }
+
+ sub walk
+ {
+ my $self = shift;
+ my $id = shift;
+ my $chains = [];
+ $self->walk_int($id, [], $chains, {}, 1);
+ return $chains;
+ }
+
+ sub stats
+ {
+ my $self = shift;
+ return $self->{stats};
+ }
+}
+
+{
+ package DBX::Chunk;
+ use Digest::SHA1 qw(sha1);
+
+ sub new
+ {
+ my $class = shift;
+ my $self = {};
+ $self->{f} = shift;
+ return if ($self->{f}->read($self->{h}, 16) != 16);
+ my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $self->{h});
+
+ # test for a valid header
+ # chunk size is always 512 bytes in all dbx files I have seen
+
+ if (
+ ($chunksize == 512) &&
+ ($datasize <= $chunksize) &&
+ ($datasize > 0) &&
+ ($id != 0) &&
+ (($next == 0) || ($datasize == $chunksize))
+ )
+ {
+ $self->{p} = $self->{f}->getpos;
+
+ # if the header seems valid, we skip the whole chunk
+
+ # the chance we miss a start of another chunk is low, because
+ # the test above is pretty strict and false positives are quite rare
+
+ # it also helps in cases when there are dbx files contained inside dbx
+
+ $self->{f}->seek($chunksize, 1);
+ bless ($self, $class);
+ return $self;
+ }
+ else
+ {
+
+ # skip 4 bytes and try again
+ # headers were always at 4 byte boundary in every dbx file I have seen
+
+ $self->{f}->seek(-12, 1);
+ return 0;
+ }
+ }
+
+ sub datahash
+ {
+ my $self = shift;
+ if (!defined($self->{hash})) { $self->{hash} = sha1($self->data) };
+ return $self->{hash};
+ }
+
+ sub differ
+ {
+ my $self = shift;
+ my $other = shift;
+ return 1 if (($self->{h}) ne ($other->{h}));
+ # use a hash for comparision, to avoid storing message data, or retrieving message data on each comparision
+ return 1 if ($self->datahash ne $other->datahash);
+ return 0;
+ }
+
+ sub data
+ {
+ my $self = shift;
+ my $datasize = unpack("V", substr($self->{h}, 8, 4));
+ my $data;
+ my $pos = $self->{f}->getpos;
+ $self->{f}->setpos($self->{p});
+ $self->{f}->read($data, $datasize);
+ $self->{f}->setpos($pos);
+ return $data;
+ }
+
+ sub id
+ {
+ my $self = shift;
+ return substr($self->{h}, 0, 4);
+ }
+
+ sub next
+ {
+ my $self = shift;
+ return substr($self->{h}, 12, 4);
+ }
+
+}
+
+{
+ package DBX::Scan;
+
+ sub new
+ {
+ my $class = shift;
+ my $self = {};
+ $self->{file} = shift;
+ $self->{cf} = DBX::ChainFinder->new;
+ bless ($self, $class);
+ return $self;
+ }
+
+ sub scan
+ {
+ my $self = shift;
+ $self->{file}->binmode();
+ $self->{file}->seek(0, 0);
+ $self->{cf}->reset;
+ my $time = 0;
+
+ while (defined(my $chunk = DBX::Chunk->new($self->{file})))
+ {
+ if ($chunk)
+ {
+ $self->{cf}->add($chunk);
+ if (time - $time > 1)
+ {
+ $self->printchunkstats;
+ $time = time;
+ }
+ }
+ }
+ $self->printchunkstats;
+ print STDERR "\n";
+ }
+
+ sub chains
+ {
+ my $self = shift;
+ my $chainids = $self->{cf}->find;
+ my $time = 0;
+ my $tot = scalar @{$chainids};
+ my $idx = 0;
+ for (@{$chainids})
+ {
+ my $chains = $self->{cf}->walk($_);
+ $idx++;
+ for(@{$chains})
+ {
+ my $message = $_->data;
+ $message =~ s/^>*From />$&/m; # mbox "From " escaping
+ # we don't have the original envelope address, so just output a fake one
+ print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n";
+ print $message;
+ print "\r\n\r\n";
+ }
+ if (time - $time > 1)
+ {
+ $self->printchainstats(int($idx/$tot*100));
+ $time = time;
+ }
+ }
+ $self->printchainstats(100);
+ }
+
+ sub printchunkstats
+ {
+ my $self = shift;
+ my $stats = $self->{cf}->stats;
+ print STDERR "Chunks found: ".$stats->{chunkstot}."; ";
+ print STDERR "Duplicates: ".$stats->{chunksdupl}."; ";
+ print STDERR "Duplicate IDs: ".$stats->{chunkidsdupl}."\r";
+ }
+
+ sub printchainstats
+ {
+ my $self = shift;
+ my $percdone = shift;
+ my $stats = $self->{cf}->stats;
+ print STDERR "Chains found: ".$stats->{chainstot}."; ";
+ print STDERR "Broken: ".$stats->{chainsbroken}."; ";
+ print STDERR "Loops: ".$stats->{chainloops}."; ";
+ print STDERR "Dropped: ".$stats->{chainsign}."; ";
+ print STDERR $percdone."% done\r";
+ }
+}
+
+use IO::File;
+
+(my $dbxname=$ARGV[0]) and ((scalar @ARGV) == 1) or print(STDERR "Usage: $0 input.dbx >output.mbox\n"),exit;
+print STDERR "Opening $dbxname for reading...\n";
+my $dbxfile = IO::File->new($dbxname, "r") or die("Error opening file!\n");
+my $dbxscanner = DBX::Scan->new($dbxfile);
+print STDERR "Searching for chunks...\n";
+$dbxscanner->scan;
+print STDERR "Searching for chains and printing...\n";
+$dbxscanner->chains;
+print STDERR "\n";