#!/usr/bin/perl # dbxrecover, a program for recovering mail from damaged Outlook dbx files. # Copyright (C) 2005 Piotr Pawlow <pp@siedziba.pl> # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. use strict; use warnings; { package DBX::Chain; sub new { my $class = shift; my $self = {}; my $arrayref = shift; my @ca = @{$arrayref}; $self->{chunks} = \@ca; $self->{idx} = scalar @ca; bless ($self, $class); return $self; } # we assume here the single message is small enough to fit into memory # this may not be a case for publishing companies or other businesses sending huge amounts of data by mail :) sub data { my $self = shift; my $data = ""; for(@{$self->{chunks}}) { $data .= $_->data; } return $data; } sub broken { my $self = shift; return ($self->{chunks}->[$self->{idx}-1]->next ne "\x00\x00\x00\x00"); } } { package DBX::ChainFinder; sub new { my $class = shift; my $self = {}; bless ($self, $class); $self->reset; return $self; } sub reset { my $self = shift; $self->{chunks} = {}; $self->{first} = {}; $self->{stats} = { 'chunkstot' => 0, 'chunksdupl' => 0, 'chunkidsdupl' => 0, 'chainstot' => 0, 'chainloops' => 0, 'chainsign' => 0, 'chainsbroken' => 0, }; } sub add { my $self = shift; my $chunk = shift; my $id = $chunk->id; my $next = $chunk->next; $self->{stats}->{chunkstot}++; my $chunks = $self->{chunks}->{$id}; if (defined($chunks)) { my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks); for (@chunks) { $self->{stats}->{chunksdupl}++,return if (!$chunk->differ($_)); # don't add chunk if there is a duplicate present } $self->{stats}->{chunkidsdupl}++; push @chunks, $chunk; $self->{chunks}->{$id} = \@chunks; } else { $self->{chunks}->{$id} = $chunk; } # we store if there is any other chunk pointing to this one if (!defined($self->{first}->{$id})) { $self->{first}->{$id} = 1 }; $self->{first}->{$next} = 0; } sub find { my $self = shift; my $first = $self->{first}; my @ids; while (my ($key, $value) = each %$first) { push @ids,$key if $value == 1; } return \@ids; } sub walk_int { no warnings qw(recursion); my $self = shift; my $id = shift; my $chain = shift; my $chains = shift; my $seen = shift; my $combinations = shift; $self->{stats}->{chainsign}++,return if ($combinations > 16); # we don't extract the message if there are too many branches $seen->{$id} = 1; # note chunk identifiers for infinite loop testing my $chunks = $self->{chunks}->{$id}; my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks); for(@chunks) { my $chunk = $_; push @{$chain}, $chunk; if (defined($self->{chunks}->{$chunk->next}) && !$seen->{$chunk->next}) { my $nextchunks = $self->{chunks}->{$chunk->next}; my $nextcount = (ref($nextchunks) eq "ARRAY") ? scalar @{$nextchunks} : 1; $self->walk_int($chunk->next, $chain, $chains, $seen, $combinations * $nextcount); } else { $self->{stats}->{chainloops}++ if $seen->{$chunk->next}; my $chainobj = DBX::Chain->new($chain); $self->{stats}->{chainsbroken}++ if $chainobj->broken; $self->{stats}->{chainstot}++; push @{$chains}, $chainobj; } pop @{$chain}; } $seen->{$id} = 0; } sub walk { my $self = shift; my $id = shift; my $chains = []; $self->walk_int($id, [], $chains, {}, 1); return $chains; } sub stats { my $self = shift; return $self->{stats}; } } { package DBX::Chunk; use Digest::SHA1 qw(sha1); sub new { my $class = shift; my $self = {}; $self->{f} = shift; return if ($self->{f}->read($self->{h}, 16) != 16); my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $self->{h}); # test for a valid header # chunk size is always 512 bytes in all dbx files I have seen if ( ($chunksize == 512) && ($datasize <= $chunksize) && ($datasize > 0) && ($id != 0) && (($next == 0) || ($datasize == $chunksize)) ) { $self->{p} = $self->{f}->getpos; # if the header seems valid, we skip the whole chunk # the chance we miss a start of another chunk is low, because # the test above is pretty strict and false positives are quite rare # it also helps in cases when there are dbx files contained inside dbx $self->{f}->seek($chunksize, 1); bless ($self, $class); return $self; } else { # skip 4 bytes and try again # headers were always at 4 byte boundary in every dbx file I have seen $self->{f}->seek(-12, 1); return 0; } } sub datahash { my $self = shift; if (!defined($self->{hash})) { $self->{hash} = sha1($self->data) }; return $self->{hash}; } sub differ { my $self = shift; my $other = shift; return 1 if (($self->{h}) ne ($other->{h})); # use a hash for comparision, to avoid storing message data, or retrieving message data on each comparision return 1 if ($self->datahash ne $other->datahash); return 0; } sub data { my $self = shift; my $datasize = unpack("V", substr($self->{h}, 8, 4)); my $data; my $pos = $self->{f}->getpos; $self->{f}->setpos($self->{p}); $self->{f}->read($data, $datasize); $self->{f}->setpos($pos); return $data; } sub id { my $self = shift; return substr($self->{h}, 0, 4); } sub next { my $self = shift; return substr($self->{h}, 12, 4); } } { package DBX::Scan; sub new { my $class = shift; my $self = {}; $self->{file} = shift; $self->{cf} = DBX::ChainFinder->new; bless ($self, $class); return $self; } sub scan { my $self = shift; $self->{file}->binmode(); $self->{file}->seek(0, 0); $self->{cf}->reset; my $time = 0; while (defined(my $chunk = DBX::Chunk->new($self->{file}))) { if ($chunk) { $self->{cf}->add($chunk); if (time - $time > 1) { $self->printchunkstats; $time = time; } } } $self->printchunkstats; print STDERR "\n"; } sub chains { my $self = shift; my $chainids = $self->{cf}->find; my $time = 0; my $tot = scalar @{$chainids}; my $idx = 0; for (@{$chainids}) { my $chains = $self->{cf}->walk($_); $idx++; for(@{$chains}) { my $message = $_->data; $message =~ s/^>*From />$&/m; # mbox "From " escaping # we don't have the original envelope address, so just output a fake one print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n"; print $message; print "\r\n\r\n"; } if (time - $time > 1) { $self->printchainstats(int($idx/$tot*100)); $time = time; } } $self->printchainstats(100); } sub printchunkstats { my $self = shift; my $stats = $self->{cf}->stats; print STDERR "Chunks found: ".$stats->{chunkstot}."; "; print STDERR "Duplicates: ".$stats->{chunksdupl}."; "; print STDERR "Duplicate IDs: ".$stats->{chunkidsdupl}."\r"; } sub printchainstats { my $self = shift; my $percdone = shift; my $stats = $self->{cf}->stats; print STDERR "Chains found: ".$stats->{chainstot}."; "; print STDERR "Broken: ".$stats->{chainsbroken}."; "; print STDERR "Loops: ".$stats->{chainloops}."; "; print STDERR "Dropped: ".$stats->{chainsign}."; "; print STDERR $percdone."% done\r"; } } use IO::File; (my $dbxname=$ARGV[0]) and ((scalar @ARGV) == 1) or print(STDERR "Usage: $0 input.dbx >output.mbox\n"),exit; print STDERR "Opening $dbxname for reading...\n"; my $dbxfile = IO::File->new($dbxname, "r") or die("Error opening file!\n"); my $dbxscanner = DBX::Scan->new($dbxfile); print STDERR "Searching for chunks...\n"; $dbxscanner->scan; print STDERR "Searching for chains and printing...\n"; $dbxscanner->chains; print STDERR "\n";