- initial import

git-svn-id: https://siedziba.pl:790/svn/repos/dbxrecover@222 455248ca-bdda-0310-9134-f4ebb693071a
author: pp <pp@455248ca-bdda-0310-9134-f4ebb693071a> 2005-11-30 20:36:50 +0000
committer: pp <pp@455248ca-bdda-0310-9134-f4ebb693071a> 2005-11-30 20:36:50 +0000
commit: 98badafe03c0707ca73833e285e619ba0fda8270 (patch)
tree: d864bd739498b68975fd9dadbbb12d82a6940178 /dbxrecover
1 files changed, 290 insertions, 0 deletions
diff --git a/dbxrecover b/dbxrecover
new file mode 100755
index 0000000..0dffecc
--- /dev/null
+++ b/dbxrecover
@@ -0,0 +1,290 @@
+#!/usr/bin/perl
+
+# dbxrecover, a program for recovering mail from damaged Outlook dbx files.
+# Copyright (C) 2005  Piotr Pawłow <pp@siedziba.pl>
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+use strict;
+
+{
+  package DBX::Chain;
+
+  sub new
+  {
+    my $class = shift;
+    my $self = {};
+    my $arrayref = shift;
+    my @ca = @{$arrayref};
+    $self->{chunks} = \@ca;
+    $self->{idx} = scalar @ca;
+    bless ($self, $class);
+    return $self;
+  }
+
+  # we assume here the single message is small enough to fit into memory
+  # this may not be a case for publishing companies or other businesses sending huge amounts of data by mail :)
+  sub data
+  {
+    my $self = shift;
+    my $data = "";
+    for(@{$self->{chunks}})
+    {
+      $data .= $_->data;
+    }
+    return $data;
+  }
+
+  sub broken
+  {
+    my $self = shift;
+    return ($self->{chunks}->[$self->{idx}-1]->next != 0);
+  }
+}
+
+{
+  package DBX::ChainFinder;
+
+  sub new
+  {
+    my $class = shift;
+    my $self = {};
+    bless ($self, $class);
+    $self->reset;
+    return $self;
+  }
+
+  sub reset
+  {
+    my $self = shift;
+    $self->{chunks} = {};
+    $self->{first} = {};
+  }
+
+  sub add
+  {
+    my $self = shift;
+    my $chunk = shift;
+    my $id = $chunk->id;
+    my $next = $chunk->next;
+    if (!defined($self->{chunks}->{$id}))
+    {
+      $self->{chunks}->{$id} = [];
+    }
+    else
+    {
+      for (@{$self->{chunks}->{$id}})
+      {
+        return if (!$chunk->differ($_)); # don't add chunk if there is a duplicate present
+      }
+    }
+    push @{$self->{chunks}->{$id}}, $chunk;
+
+    # we store if there is any other chunk pointing to this one
+    if (!defined($self->{first}->{$id})) { $self->{first}->{$id} = 1 };
+    $self->{first}->{$next} = 0;
+
+    # adding a new chunk requires rescanning for chains
+    $self->{chains} = undef;
+  }
+
+  sub find
+  {
+    my $self = shift;
+    $self->{chains} = [];
+
+    for(keys %{$self->{first}})
+    {
+      if ($self->{first}->{$_} == 1)
+      {
+        # if there is no other chunk pointing to this one, start following it
+        $self->walk($_);
+      }
+    }
+    return $self->{chains};
+  }
+
+  sub walk
+  {
+    my $self = shift;
+    my $id = shift;
+    my $chain = shift;
+    my $seen = shift;
+    my $combinations = shift;
+    return if ($combinations > 16); # we don't extract the message if there are too many branches
+    if (!defined($chain)) { $chain = [] };
+    if (!defined($seen)) { $seen = {} };
+    if (!defined($combinations)) { $combinations = 1 };
+    $seen->{$id} = 1; # note chunk identifiers for infinite loop testing
+    for(@{$self->{chunks}->{$id}})
+    {
+      my $chunk = $_;
+      push @{$chain}, $chunk;
+      if (defined($self->{chunks}->{$chunk->next}) && !$seen->{$chunk->next})
+      {
+        $self->walk($chunk->next, $chain, $seen, $combinations * scalar @{$self->{chunks}->{$chunk->next}});
+      }
+      else
+      {
+        push @{$self->{chains}}, DBX::Chain->new($chain);
+      }
+      pop @{$chain};
+    }
+    $seen->{$id} = 0;
+  }
+}
+
+{
+  package DBX::Chunk;
+  use Digest::SHA1 qw(sha1);
+
+  sub new
+  {
+    my $class = shift;
+    my $self = {};
+    $self->{file} = shift;
+    return if ($self->{file}->read($self->{header}, 16) != 16);
+    my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $self->{header});
+
+    # test for a valid header
+    # chunk size is always 512 bytes in all dbx files I have seen
+
+    if (
+          ($chunksize == 512) &&
+          ($datasize <= $chunksize) && 
+          ($datasize > 0) &&
+          ($id != 0) &&
+          (($next == 0) || ($datasize == $chunksize))
+         )
+    {
+      $self->{filepos} = $self->{file}->getpos;
+
+      # if the header seems valid, we skip the whole chunk
+
+      # there's a low chance we miss a start of another chunk,
+      # as the test above is pretty strict and false positives are quite rare
+
+      # it also helps in cases when there are dbx files contained inside dbx
+
+      $self->{file}->seek($chunksize, 1);
+      $self->{hash} = undef;
+      ($self->{id}, $self->{chunksize}, $self->{datasize}, $self->{next}) = ($id, $chunksize, $datasize, $next);
+      bless ($self, $class);
+      return $self;
+    }
+    else
+    {
+
+      # skip 4 bytes and try again
+      # headers were always at 4 byte boundary in every dbx file I have seen
+
+      $self->{file}->seek(-12, 1);
+      return 0;
+    }
+  }
+
+  sub datahash
+  {
+    my $self = shift;
+    if (!defined($self->{hash})) { $self->{hash} = sha1($self->data) };
+    return $self->{hash};
+  }
+
+  sub differ
+  {
+    my $self = shift;
+    my $other = shift;
+    return 1 if (($self->{header}) ne ($other->{header}));
+    # use a hash for comparision, to avoid storing message data, or retrieving message data on each comparision
+    return 1 if ($self->datahash ne $other->datahash);
+    return 0;
+  }
+
+  sub data
+  {
+    my $self = shift;
+    my $data;
+    $self->{file}->setpos($self->{filepos});
+    $self->{file}->read($data, $self->{datasize});
+    return $data;
+  }
+
+  sub id
+  {
+    my $self = shift;
+    return $self->{id};
+  }
+
+  sub next
+  {
+    my $self = shift;
+    return $self->{next};
+  }
+
+}
+
+{
+  package DBX::Scan;
+
+  sub new
+  {
+    my $class = shift;
+    my $self = {};
+    $self->{file} = shift;
+    $self->{cf} = DBX::ChainFinder->new;
+    bless ($self, $class);
+    return $self;
+  }
+
+  sub scan
+  {
+    my $self = shift;
+    $self->{file}->binmode();
+    $self->{file}->seek(0, 0);
+    $self->{cf}->reset;
+
+    while (defined(my $chunk = DBX::Chunk->new($self->{file})))
+    {
+      $self->{cf}->add($chunk) if ($chunk);
+    }
+  }
+
+  sub chains
+  {
+    my $self = shift;
+    return $self->{cf}->find;
+  }
+}
+
+use IO::File;
+
+(my $dbxname=$ARGV[0]) and ((scalar @ARGV) == 1) or print(STDERR "Usage: $0 input.dbx >output.mbox\n"),exit;
+print STDERR "Opening $dbxname for reading...\n";
+my $dbxfile = IO::File->new($dbxname, "r") or die("Error opening file!\n");
+my $dbxscanner = DBX::Scan->new($dbxfile);
+print STDERR "Searching for chunks...\n";
+$dbxscanner->scan;
+print STDERR "Reconstructing chains...\n";
+my $chains = $dbxscanner->chains;
+print STDERR "Printing...\n";
+for(@{$chains})
+{
+  my $message = $_->data;
+  $message =~ s/^>*From />$&/m;		# mbox "From " escaping
+  # we don't have the original envelope address, so just output a fake one
+  print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n";
+  print $message;
+  print "\r\n\r\n";
+}
author	pp <pp@455248ca-bdda-0310-9134-f4ebb693071a>	2005-11-30 20:36:50 +0000
committer	pp <pp@455248ca-bdda-0310-9134-f4ebb693071a>	2005-11-30 20:36:50 +0000
commit	98badafe03c0707ca73833e285e619ba0fda8270 (patch)
tree	d864bd739498b68975fd9dadbbb12d82a6940178 /dbxrecover