diff options
| author | pp <pp@455248ca-bdda-0310-9134-f4ebb693071a> | 2006-05-09 18:11:30 +0000 |
|---|---|---|
| committer | pp <pp@455248ca-bdda-0310-9134-f4ebb693071a> | 2006-05-09 18:11:30 +0000 |
| commit | d79a49d882243ba777014c605af60e843b91fbc5 (patch) | |
| tree | f337cb456d3d0b76513205251a2dd4c55153382f /dbxrecover-2p | |
| parent | 79a1cc8cecba5a66b3462e7228b2adb37051ad0f (diff) | |
- 1 pass version. Faster, uses less memory, works as a filter
git-svn-id: https://siedziba.pl:790/svn/repos/dbxrecover@235 455248ca-bdda-0310-9134-f4ebb693071a
Diffstat (limited to 'dbxrecover-2p')
| -rwxr-xr-x | dbxrecover-2p | 368 |
1 files changed, 368 insertions, 0 deletions
diff --git a/dbxrecover-2p b/dbxrecover-2p new file mode 100755 index 0000000..6d2f71c --- /dev/null +++ b/dbxrecover-2p @@ -0,0 +1,368 @@ +#!/usr/bin/perl + +# dbxrecover, a program for recovering mail from damaged Outlook dbx files. +# Copyright (C) 2005 Piotr Pawlow <pp@siedziba.pl> + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +use strict; +use warnings; + +{ + package DBX::Chain; + + sub new + { + my $class = shift; + my $self = {}; + my $arrayref = shift; + my @ca = @{$arrayref}; + $self->{chunks} = \@ca; + $self->{idx} = scalar @ca; + bless ($self, $class); + return $self; + } + + # we assume here the single message is small enough to fit into memory + # this may not be a case for publishing companies or other businesses sending huge amounts of data by mail :) + sub data + { + my $self = shift; + my $data = ""; + for(@{$self->{chunks}}) + { + $data .= $_->data; + } + return $data; + } + + sub broken + { + my $self = shift; + return ($self->{chunks}->[$self->{idx}-1]->next ne "\x00\x00\x00\x00"); + } +} + +{ + package DBX::ChainFinder; + + sub new + { + my $class = shift; + my $self = {}; + bless ($self, $class); + $self->reset; + return $self; + } + + sub reset + { + my $self = shift; + $self->{chunks} = {}; + $self->{first} = {}; + $self->{stats} = + { 'chunkstot' => 0, + 'chunksdupl' => 0, + 'chunkidsdupl' => 0, + 'chainstot' => 0, + 'chainloops' => 0, + 'chainsign' => 0, + 'chainsbroken' => 0, + }; + } + + sub add + { + my $self = shift; + my $chunk = shift; + my $id = $chunk->id; + my $next = $chunk->next; + $self->{stats}->{chunkstot}++; + my $chunks = $self->{chunks}->{$id}; + if (defined($chunks)) + { + my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks); + for (@chunks) + { + $self->{stats}->{chunksdupl}++,return if (!$chunk->differ($_)); # don't add chunk if there is a duplicate present + } + $self->{stats}->{chunkidsdupl}++; + push @chunks, $chunk; + $self->{chunks}->{$id} = \@chunks; + } + else + { + $self->{chunks}->{$id} = $chunk; + } + + # we store if there is any other chunk pointing to this one + if (!defined($self->{first}->{$id})) { $self->{first}->{$id} = 1 }; + $self->{first}->{$next} = 0; + } + + sub find + { + my $self = shift; + my $first = $self->{first}; + my @ids; + while (my ($key, $value) = each %$first) + { + push @ids,$key if $value == 1; + } + return \@ids; + } + + sub walk_int + { + no warnings qw(recursion); + + my $self = shift; + my $id = shift; + my $chain = shift; + my $chains = shift; + my $seen = shift; + my $combinations = shift; + $self->{stats}->{chainsign}++,return if ($combinations > 16); # we don't extract the message if there are too many branches + $seen->{$id} = 1; # note chunk identifiers for infinite loop testing + my $chunks = $self->{chunks}->{$id}; + my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks); + for(@chunks) + { + my $chunk = $_; + push @{$chain}, $chunk; + if (defined($self->{chunks}->{$chunk->next}) && !$seen->{$chunk->next}) + { + my $nextchunks = $self->{chunks}->{$chunk->next}; + my $nextcount = (ref($nextchunks) eq "ARRAY") ? scalar @{$nextchunks} : 1; + $self->walk_int($chunk->next, $chain, $chains, $seen, $combinations * $nextcount); + } + else + { + $self->{stats}->{chainloops}++ if $seen->{$chunk->next}; + my $chainobj = DBX::Chain->new($chain); + $self->{stats}->{chainsbroken}++ if $chainobj->broken; + $self->{stats}->{chainstot}++; + push @{$chains}, $chainobj; + } + pop @{$chain}; + } + $seen->{$id} = 0; + } + + sub walk + { + my $self = shift; + my $id = shift; + my $chains = []; + $self->walk_int($id, [], $chains, {}, 1); + return $chains; + } + + sub stats + { + my $self = shift; + return $self->{stats}; + } +} + +{ + package DBX::Chunk; + use Digest::SHA1 qw(sha1); + + sub new + { + my $class = shift; + my $self = {}; + $self->{f} = shift; + return if ($self->{f}->read($self->{h}, 16) != 16); + my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $self->{h}); + + # test for a valid header + # chunk size is always 512 bytes in all dbx files I have seen + + if ( + ($chunksize == 512) && + ($datasize <= $chunksize) && + ($datasize > 0) && + ($id != 0) && + (($next == 0) || ($datasize == $chunksize)) + ) + { + $self->{p} = $self->{f}->getpos; + + # if the header seems valid, we skip the whole chunk + + # the chance we miss a start of another chunk is low, because + # the test above is pretty strict and false positives are quite rare + + # it also helps in cases when there are dbx files contained inside dbx + + $self->{f}->seek($chunksize, 1); + bless ($self, $class); + return $self; + } + else + { + + # skip 4 bytes and try again + # headers were always at 4 byte boundary in every dbx file I have seen + + $self->{f}->seek(-12, 1); + return 0; + } + } + + sub datahash + { + my $self = shift; + if (!defined($self->{hash})) { $self->{hash} = sha1($self->data) }; + return $self->{hash}; + } + + sub differ + { + my $self = shift; + my $other = shift; + return 1 if (($self->{h}) ne ($other->{h})); + # use a hash for comparision, to avoid storing message data, or retrieving message data on each comparision + return 1 if ($self->datahash ne $other->datahash); + return 0; + } + + sub data + { + my $self = shift; + my $datasize = unpack("V", substr($self->{h}, 8, 4)); + my $data; + my $pos = $self->{f}->getpos; + $self->{f}->setpos($self->{p}); + $self->{f}->read($data, $datasize); + $self->{f}->setpos($pos); + return $data; + } + + sub id + { + my $self = shift; + return substr($self->{h}, 0, 4); + } + + sub next + { + my $self = shift; + return substr($self->{h}, 12, 4); + } + +} + +{ + package DBX::Scan; + + sub new + { + my $class = shift; + my $self = {}; + $self->{file} = shift; + $self->{cf} = DBX::ChainFinder->new; + bless ($self, $class); + return $self; + } + + sub scan + { + my $self = shift; + $self->{file}->binmode(); + $self->{file}->seek(0, 0); + $self->{cf}->reset; + my $time = 0; + + while (defined(my $chunk = DBX::Chunk->new($self->{file}))) + { + if ($chunk) + { + $self->{cf}->add($chunk); + if (time - $time > 1) + { + $self->printchunkstats; + $time = time; + } + } + } + $self->printchunkstats; + print STDERR "\n"; + } + + sub chains + { + my $self = shift; + my $chainids = $self->{cf}->find; + my $time = 0; + my $tot = scalar @{$chainids}; + my $idx = 0; + for (@{$chainids}) + { + my $chains = $self->{cf}->walk($_); + $idx++; + for(@{$chains}) + { + my $message = $_->data; + $message =~ s/^>*From />$&/m; # mbox "From " escaping + # we don't have the original envelope address, so just output a fake one + print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n"; + print $message; + print "\r\n\r\n"; + } + if (time - $time > 1) + { + $self->printchainstats(int($idx/$tot*100)); + $time = time; + } + } + $self->printchainstats(100); + } + + sub printchunkstats + { + my $self = shift; + my $stats = $self->{cf}->stats; + print STDERR "Chunks found: ".$stats->{chunkstot}."; "; + print STDERR "Duplicates: ".$stats->{chunksdupl}."; "; + print STDERR "Duplicate IDs: ".$stats->{chunkidsdupl}."\r"; + } + + sub printchainstats + { + my $self = shift; + my $percdone = shift; + my $stats = $self->{cf}->stats; + print STDERR "Chains found: ".$stats->{chainstot}."; "; + print STDERR "Broken: ".$stats->{chainsbroken}."; "; + print STDERR "Loops: ".$stats->{chainloops}."; "; + print STDERR "Dropped: ".$stats->{chainsign}."; "; + print STDERR $percdone."% done\r"; + } +} + +use IO::File; + +(my $dbxname=$ARGV[0]) and ((scalar @ARGV) == 1) or print(STDERR "Usage: $0 input.dbx >output.mbox\n"),exit; +print STDERR "Opening $dbxname for reading...\n"; +my $dbxfile = IO::File->new($dbxname, "r") or die("Error opening file!\n"); +my $dbxscanner = DBX::Scan->new($dbxfile); +print STDERR "Searching for chunks...\n"; +$dbxscanner->scan; +print STDERR "Searching for chains and printing...\n"; +$dbxscanner->chains; +print STDERR "\n"; |
