Last commit for dbxrecover: d79a49d882243ba777014c605af60e843b91fbc5

- 1 pass version. Faster, uses less memory, works as a filter

pp [2006-05-09 18:11:30]

- 1 pass version. Faster, uses less memory, works as a filter


git-svn-id: https://siedziba.pl:790/svn/repos/dbxrecover@235 455248ca-bdda-0310-9134-f4ebb693071a

#!/usr/bin/perl

# dbxrecover, a program for recovering mail from damaged Outlook dbx files.
# Copyright (C) 2005  Piotr Pawlow <pp@siedziba.pl>

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

use strict;

{
  package DBX::Chain;

  sub new
  {
    my $class = shift;
    my $self = {};
    my $arrayref = shift;
    my @ca = @{$arrayref};
    $self->{chunks} = \@ca;
    $self->{idx} = scalar @ca;
    bless ($self, $class);
    return $self;
  }

  # we assume here the single message is small enough to fit into memory
  # this may not be a case for publishing companies or other businesses sending huge amounts of data by mail :)
  sub data
  {
    my $self = shift;
    my $data = "";
    for(@{$self->{chunks}})
    {
      $data .= $_->data;
    }
    return $data;
  }

  sub broken
  {
    my $self = shift;
    return ($self->{chunks}->[$self->{idx}-1]->next != 0);
  }
}

{
  package DBX::ChainFinder;

  sub new
  {
    my $class = shift;
    my $self = {};
    bless ($self, $class);
    $self->reset;
    return $self;
  }

  sub reset
  {
    my $self = shift;
    $self->{chunks} = {};
    $self->{first} = {};
    $self->{stats} =
    { 'chunkstot'    => 0,
      'chunksdupl'   => 0,
      'chunkidsdupl' => 0,
      'chainstot'    => 0,
      'chainloops'   => 0,
      'chainsign'    => 0,
    };
  }

  sub add
  {
    my $self = shift;
    my $chunk = shift;
    my $id = $chunk->id;
    my $next = $chunk->next;
    $self->{stats}->{chunkstot}++;
    if (!defined($self->{chunks}->{$id}))
    {
      $self->{chunks}->{$id} = [];
    }
    else
    {
      for (@{$self->{chunks}->{$id}})
      {
        $self->{stats}->{chunksdupl}++,return if (!$chunk->differ($_)); # don't add chunk if there is a duplicate present
      }
    }
    push @{$self->{chunks}->{$id}}, $chunk;

    # we store if there is any other chunk pointing to this one
    if (!defined($self->{first}->{$id})) { $self->{first}->{$id} = 1 };
    $self->{first}->{$next} = 0;

    # adding a new chunk requires rescanning for chains
    $self->{chains} = undef;
  }

  sub find
  {
    my $self = shift;
    $self->{chains} = [];

    for(keys %{$self->{first}})
    {
      if ($self->{first}->{$_} == 1)
      {
        # if there is no other chunk pointing to this one, start following it
        $self->walk($_);
      }
    }
    return $self->{chains};
  }

  sub walk
  {
    my $self = shift;
    my $id = shift;
    my $chain = shift;
    my $seen = shift;
    my $combinations = shift;
    $self->{stats}->{chainsign}++,return if ($combinations > 16); # we don't extract the message if there are too many branches
    if (!defined($chain)) { $chain = [] };
    if (!defined($seen)) { $seen = {} };
    if (!defined($combinations)) { $combinations = 1 };
    $seen->{$id} = 1; # note chunk identifiers for infinite loop testing
    for(@{$self->{chunks}->{$id}})
    {
      my $chunk = $_;
      push @{$chain}, $chunk;
      if (defined($self->{chunks}->{$chunk->next}) && !$seen->{$chunk->next})
      {
        $self->walk($chunk->next, $chain, $seen, $combinations * scalar @{$self->{chunks}->{$chunk->next}});
      }
      else
      {
        $self->{stats}->{chainloops}++ if $seen->{$chunk->next};
        push @{$self->{chains}}, DBX::Chain->new($chain);
      }
      pop @{$chain};
    }
    $seen->{$id} = 0;
  }

  sub stats
  {
    my $self = shift;
    $self->{stats}->{chunkidsdupl} = $self->{stats}->{chunkstot} - scalar keys %{$self->{chunks}};
    $self->{stats}->{chainstot} = scalar @{$self->{chains}} if ($self->{chains});
    return $self->{stats};
  }
}

{
  package DBX::Chunk;
  use Digest::SHA1 qw(sha1);

  sub new
  {
    my $class = shift;
    my $self = {};
    $self->{file} = shift;
    return if ($self->{file}->read($self->{header}, 16) != 16);
    my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $self->{header});

    # test for a valid header
    # chunk size is always 512 bytes in all dbx files I have seen

    if (
          ($chunksize == 512) &&
          ($datasize <= $chunksize) &&
          ($datasize > 0) &&
          ($id != 0) &&
          (($next == 0) || ($datasize == $chunksize))
         )
    {
      $self->{filepos} = $self->{file}->getpos;

      # if the header seems valid, we skip the whole chunk

      # the chance we miss a start of another chunk is low, because
      # the test above is pretty strict and false positives are quite rare

      # it also helps in cases when there are dbx files contained inside dbx

      $self->{file}->seek($chunksize, 1);
      bless ($self, $class);
      return $self;
    }
    else
    {

      # skip 4 bytes and try again
      # headers were always at 4 byte boundary in every dbx file I have seen

      $self->{file}->seek(-12, 1);
      return 0;
    }
  }

  sub datahash
  {
    my $self = shift;
    if (!defined($self->{hash})) { $self->{hash} = sha1($self->data) };
    return $self->{hash};
  }

  sub differ
  {
    my $self = shift;
    my $other = shift;
    return 1 if (($self->{header}) ne ($other->{header}));
    # use a hash for comparision, to avoid storing message data, or retrieving message data on each comparision
    return 1 if ($self->datahash ne $other->datahash);
    return 0;
  }

  sub data
  {
    my $self = shift;
    my $datasize = unpack("V", substr($self->{header}, 8, 4));
    my $data;
    my $pos = $self->{file}->getpos;
    $self->{file}->setpos($self->{filepos});
    $self->{file}->read($data, $datasize);
    $self->{file}->setpos($pos);
    return $data;
  }

  sub id
  {
    my $self = shift;
    return scalar unpack("V", $self->{header});
  }

  sub next
  {
    my $self = shift;
    return scalar unpack("V", substr($self->{header}, 12, 4));
  }

}

{
  package DBX::Scan;

  sub new
  {
    my $class = shift;
    my $self = {};
    $self->{file} = shift;
    $self->{cf} = DBX::ChainFinder->new;
    bless ($self, $class);
    return $self;
  }

  sub scan
  {
    my $self = shift;
    $self->{file}->binmode();
    $self->{file}->seek(0, 0);
    $self->{cf}->reset;
    my $time;

    while (defined(my $chunk = DBX::Chunk->new($self->{file})))
    {
      $self->{cf}->add($chunk) if ($chunk);
      if (time - $time > 1)
      {
        $self->printchunkstats;
        $time = time;
      }
    }
    $self->printchunkstats;
    print STDERR "\n";
  }

  sub chains
  {
    my $self = shift;
    my $chains = $self->{cf}->find;
    $self->printchainstats;
    return $chains;
  }

  sub printchunkstats
  {
    my $self = shift;
    my $stats = $self->{cf}->stats;
    print STDERR "Chunks found: ".$stats->{chunkstot}."; ";
    print STDERR "Duplicates: ".$stats->{chunksdupl}."; ";
    print STDERR "Duplicate IDs: ".$stats->{chunkidsdupl}."\r";
  }

  sub printchainstats
  {
    my $self = shift;
    my $stats = $self->{cf}->stats;
    print STDERR "Chains found: ".$stats->{chainstot}."\n";
    print STDERR "Chain loops found: ".$stats->{chainloops}."\n";
    print STDERR "Chains ignored due to duplicate chunk ids: ".$stats->{chainsign}."\n";
  }
}

use IO::File;

(my $dbxname=$ARGV[0]) and ((scalar @ARGV) == 1) or print(STDERR "Usage: $0 input.dbx >output.mbox\n"),exit;
print STDERR "Opening $dbxname for reading...\n";
my $dbxfile = IO::File->new($dbxname, "r") or die("Error opening file!\n");
my $dbxscanner = DBX::Scan->new($dbxfile);
print STDERR "Searching for chunks...\n";
$dbxscanner->scan;
print STDERR "Reconstructing chains...\n";
my $chains = $dbxscanner->chains;
print STDERR "Printing...\n";
for(@{$chains})
{
  my $message = $_->data;
  $message =~ s/^>*From />$&/m;		# mbox "From " escaping
  # we don't have the original envelope address, so just output a fake one
  print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n";
  print $message;
  print "\r\n\r\n";
}

ViewGit