#!/usr/bin/perl # dbxrecover, a program for recovering mail from damaged Outlook dbx files. # Copyright (C) 2005 Piotr Pawlow <pp@siedziba.pl> # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. use strict; { package DBX::Chain; sub new { my $class = shift; my $self = {}; my $arrayref = shift; my @ca = @{$arrayref}; $self->{chunks} = \@ca; $self->{idx} = scalar @ca; bless ($self, $class); return $self; } # we assume here the single message is small enough to fit into memory # this may not be a case for publishing companies or other businesses sending huge amounts of data by mail :) sub data { my $self = shift; my $data = ""; for(@{$self->{chunks}}) { $data .= $_->data; } return $data; } sub broken { my $self = shift; return ($self->{chunks}->[$self->{idx}-1]->next != 0); } } { package DBX::ChainFinder; sub new { my $class = shift; my $self = {}; bless ($self, $class); $self->reset; return $self; } sub reset { my $self = shift; $self->{chunks} = {}; $self->{first} = {}; $self->{stats} = { 'chunkstot' => 0, 'chunksdupl' => 0, 'chunkidsdupl' => 0, 'chainstot' => 0, 'chainloops' => 0, 'chainsign' => 0, }; } sub add { my $self = shift; my $chunk = shift; my $id = $chunk->id; my $next = $chunk->next; $self->{stats}->{chunkstot}++; if (!defined($self->{chunks}->{$id})) { $self->{chunks}->{$id} = []; } else { for (@{$self->{chunks}->{$id}}) { $self->{stats}->{chunksdupl}++,return if (!$chunk->differ($_)); # don't add chunk if there is a duplicate present } } push @{$self->{chunks}->{$id}}, $chunk; # we store if there is any other chunk pointing to this one if (!defined($self->{first}->{$id})) { $self->{first}->{$id} = 1 }; $self->{first}->{$next} = 0; # adding a new chunk requires rescanning for chains $self->{chains} = undef; } sub find { my $self = shift; $self->{chains} = []; for(keys %{$self->{first}}) { if ($self->{first}->{$_} == 1) { # if there is no other chunk pointing to this one, start following it $self->walk($_); } } return $self->{chains}; } sub walk { my $self = shift; my $id = shift; my $chain = shift; my $seen = shift; my $combinations = shift; $self->{stats}->{chainsign}++,return if ($combinations > 16); # we don't extract the message if there are too many branches if (!defined($chain)) { $chain = [] }; if (!defined($seen)) { $seen = {} }; if (!defined($combinations)) { $combinations = 1 }; $seen->{$id} = 1; # note chunk identifiers for infinite loop testing for(@{$self->{chunks}->{$id}}) { my $chunk = $_; push @{$chain}, $chunk; if (defined($self->{chunks}->{$chunk->next}) && !$seen->{$chunk->next}) { $self->walk($chunk->next, $chain, $seen, $combinations * scalar @{$self->{chunks}->{$chunk->next}}); } else { $self->{stats}->{chainloops}++ if $seen->{$chunk->next}; push @{$self->{chains}}, DBX::Chain->new($chain); } pop @{$chain}; } $seen->{$id} = 0; } sub stats { my $self = shift; $self->{stats}->{chunkidsdupl} = $self->{stats}->{chunkstot} - scalar keys %{$self->{chunks}}; $self->{stats}->{chainstot} = scalar @{$self->{chains}} if ($self->{chains}); return $self->{stats}; } } { package DBX::Chunk; use Digest::SHA1 qw(sha1); sub new { my $class = shift; my $self = {}; $self->{file} = shift; return if ($self->{file}->read($self->{header}, 16) != 16); my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $self->{header}); # test for a valid header # chunk size is always 512 bytes in all dbx files I have seen if ( ($chunksize == 512) && ($datasize <= $chunksize) && ($datasize > 0) && ($id != 0) && (($next == 0) || ($datasize == $chunksize)) ) { $self->{filepos} = $self->{file}->getpos; # if the header seems valid, we skip the whole chunk # the chance we miss a start of another chunk is low, because # the test above is pretty strict and false positives are quite rare # it also helps in cases when there are dbx files contained inside dbx $self->{file}->seek($chunksize, 1); bless ($self, $class); return $self; } else { # skip 4 bytes and try again # headers were always at 4 byte boundary in every dbx file I have seen $self->{file}->seek(-12, 1); return 0; } } sub datahash { my $self = shift; if (!defined($self->{hash})) { $self->{hash} = sha1($self->data) }; return $self->{hash}; } sub differ { my $self = shift; my $other = shift; return 1 if (($self->{header}) ne ($other->{header})); # use a hash for comparision, to avoid storing message data, or retrieving message data on each comparision return 1 if ($self->datahash ne $other->datahash); return 0; } sub data { my $self = shift; my $datasize = unpack("V", substr($self->{header}, 8, 4)); my $data; my $pos = $self->{file}->getpos; $self->{file}->setpos($self->{filepos}); $self->{file}->read($data, $datasize); $self->{file}->setpos($pos); return $data; } sub id { my $self = shift; return scalar unpack("V", $self->{header}); } sub next { my $self = shift; return scalar unpack("V", substr($self->{header}, 12, 4)); } } { package DBX::Scan; sub new { my $class = shift; my $self = {}; $self->{file} = shift; $self->{cf} = DBX::ChainFinder->new; bless ($self, $class); return $self; } sub scan { my $self = shift; $self->{file}->binmode(); $self->{file}->seek(0, 0); $self->{cf}->reset; my $time; while (defined(my $chunk = DBX::Chunk->new($self->{file}))) { $self->{cf}->add($chunk) if ($chunk); if (time - $time > 1) { $self->printchunkstats; $time = time; } } $self->printchunkstats; print STDERR "\n"; } sub chains { my $self = shift; my $chains = $self->{cf}->find; $self->printchainstats; return $chains; } sub printchunkstats { my $self = shift; my $stats = $self->{cf}->stats; print STDERR "Chunks found: ".$stats->{chunkstot}."; "; print STDERR "Duplicates: ".$stats->{chunksdupl}."; "; print STDERR "Duplicate IDs: ".$stats->{chunkidsdupl}."\r"; } sub printchainstats { my $self = shift; my $stats = $self->{cf}->stats; print STDERR "Chains found: ".$stats->{chainstot}."\n"; print STDERR "Chain loops found: ".$stats->{chainloops}."\n"; print STDERR "Chains ignored due to duplicate chunk ids: ".$stats->{chainsign}."\n"; } } use IO::File; (my $dbxname=$ARGV[0]) and ((scalar @ARGV) == 1) or print(STDERR "Usage: $0 input.dbx >output.mbox\n"),exit; print STDERR "Opening $dbxname for reading...\n"; my $dbxfile = IO::File->new($dbxname, "r") or die("Error opening file!\n"); my $dbxscanner = DBX::Scan->new($dbxfile); print STDERR "Searching for chunks...\n"; $dbxscanner->scan; print STDERR "Reconstructing chains...\n"; my $chains = $dbxscanner->chains; print STDERR "Printing...\n"; for(@{$chains}) { my $message = $_->data; $message =~ s/^>*From />$&/m; # mbox "From " escaping # we don't have the original envelope address, so just output a fake one print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n"; print $message; print "\r\n\r\n"; }