From d79a49d882243ba777014c605af60e843b91fbc5 Mon Sep 17 00:00:00 2001
From: pp <pp@455248ca-bdda-0310-9134-f4ebb693071a>
Date: Tue, 9 May 2006 18:11:30 +0000
Subject: - 1 pass version. Faster, uses less memory, works as a filter

git-svn-id: https://siedziba.pl:790/svn/repos/dbxrecover@235 455248ca-bdda-0310-9134-f4ebb693071a
---
 dbxrecover    | 368 ----------------------------------------------------------
 dbxrecover-1p | 310 +++++++++++++++++++++++++++++++++++++++++++++++++
 dbxrecover-2p | 368 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 678 insertions(+), 368 deletions(-)
 delete mode 100755 dbxrecover
 create mode 100755 dbxrecover-1p
 create mode 100755 dbxrecover-2p

diff --git a/dbxrecover b/dbxrecover
deleted file mode 100755
index 6d2f71c..0000000
--- a/dbxrecover
+++ /dev/null
@@ -1,368 +0,0 @@
-#!/usr/bin/perl
-
-# dbxrecover, a program for recovering mail from damaged Outlook dbx files.
-# Copyright (C) 2005  Piotr Pawlow <pp@siedziba.pl>
-
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License
-# as published by the Free Software Foundation; either version 2
-# of the License, or (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
-
-use strict;
-use warnings;
-
-{
-  package DBX::Chain;
-
-  sub new
-  {
-    my $class = shift;
-    my $self = {};
-    my $arrayref = shift;
-    my @ca = @{$arrayref};
-    $self->{chunks} = \@ca;
-    $self->{idx} = scalar @ca;
-    bless ($self, $class);
-    return $self;
-  }
-
-  # we assume here the single message is small enough to fit into memory
-  # this may not be a case for publishing companies or other businesses sending huge amounts of data by mail :)
-  sub data
-  {
-    my $self = shift;
-    my $data = "";
-    for(@{$self->{chunks}})
-    {
-      $data .= $_->data;
-    }
-    return $data;
-  }
-
-  sub broken
-  {
-    my $self = shift;
-    return ($self->{chunks}->[$self->{idx}-1]->next ne "\x00\x00\x00\x00");
-  }
-}
-
-{
-  package DBX::ChainFinder;
-
-  sub new
-  {
-    my $class = shift;
-    my $self = {};
-    bless ($self, $class);
-    $self->reset;
-    return $self;
-  }
-
-  sub reset
-  {
-    my $self = shift;
-    $self->{chunks} = {};
-    $self->{first} = {};
-    $self->{stats} = 
-    { 'chunkstot'    => 0,
-      'chunksdupl'   => 0,
-      'chunkidsdupl' => 0,
-      'chainstot'    => 0,
-      'chainloops'   => 0,
-      'chainsign'    => 0,
-      'chainsbroken' => 0,
-    };
-  }
-
-  sub add
-  {
-    my $self = shift;
-    my $chunk = shift;
-    my $id = $chunk->id;
-    my $next = $chunk->next;
-    $self->{stats}->{chunkstot}++;
-    my $chunks = $self->{chunks}->{$id};
-    if (defined($chunks))
-    {
-      my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks);
-      for (@chunks)
-      {
-        $self->{stats}->{chunksdupl}++,return if (!$chunk->differ($_)); # don't add chunk if there is a duplicate present
-      }
-      $self->{stats}->{chunkidsdupl}++;
-      push @chunks, $chunk;
-      $self->{chunks}->{$id} = \@chunks;
-    }
-    else
-    {
-      $self->{chunks}->{$id} = $chunk;
-    }
-
-    # we store if there is any other chunk pointing to this one
-    if (!defined($self->{first}->{$id})) { $self->{first}->{$id} = 1 };
-    $self->{first}->{$next} = 0;
-  }
-
-  sub find
-  {
-    my $self = shift;
-    my $first = $self->{first};
-    my @ids;
-    while (my ($key, $value) = each %$first)
-    {
-      push @ids,$key if $value == 1;
-    }
-    return \@ids;
-  }
-
-  sub walk_int
-  {
-    no warnings qw(recursion);
-
-    my $self = shift;
-    my $id = shift;
-    my $chain = shift;
-    my $chains = shift;
-    my $seen = shift;
-    my $combinations = shift;
-    $self->{stats}->{chainsign}++,return if ($combinations > 16); # we don't extract the message if there are too many branches
-    $seen->{$id} = 1; # note chunk identifiers for infinite loop testing
-    my $chunks = $self->{chunks}->{$id};
-    my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks);
-    for(@chunks)
-    {
-      my $chunk = $_;
-      push @{$chain}, $chunk;
-      if (defined($self->{chunks}->{$chunk->next}) && !$seen->{$chunk->next})
-      {
-        my $nextchunks = $self->{chunks}->{$chunk->next};
-        my $nextcount = (ref($nextchunks) eq "ARRAY") ? scalar @{$nextchunks} : 1;
-        $self->walk_int($chunk->next, $chain, $chains, $seen, $combinations * $nextcount);
-      }
-      else
-      {
-        $self->{stats}->{chainloops}++ if $seen->{$chunk->next};
-        my $chainobj = DBX::Chain->new($chain);
-        $self->{stats}->{chainsbroken}++ if $chainobj->broken;
-        $self->{stats}->{chainstot}++;
-        push @{$chains}, $chainobj;
-      }
-      pop @{$chain};
-    }
-    $seen->{$id} = 0;
-  }
-
-  sub walk
-  {
-    my $self = shift;
-    my $id = shift;
-    my $chains = [];
-    $self->walk_int($id, [], $chains, {}, 1);
-    return $chains;
-  }
-
-  sub stats
-  {
-    my $self = shift;
-    return $self->{stats};
-  }
-}
-
-{
-  package DBX::Chunk;
-  use Digest::SHA1 qw(sha1);
-
-  sub new
-  {
-    my $class = shift;
-    my $self = {};
-    $self->{f} = shift;
-    return if ($self->{f}->read($self->{h}, 16) != 16);
-    my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $self->{h});
-
-    # test for a valid header
-    # chunk size is always 512 bytes in all dbx files I have seen
-
-    if (
-          ($chunksize == 512) &&
-          ($datasize <= $chunksize) && 
-          ($datasize > 0) &&
-          ($id != 0) &&
-          (($next == 0) || ($datasize == $chunksize))
-         )
-    {
-      $self->{p} = $self->{f}->getpos;
-
-      # if the header seems valid, we skip the whole chunk
-
-      # the chance we miss a start of another chunk is low, because
-      # the test above is pretty strict and false positives are quite rare
-
-      # it also helps in cases when there are dbx files contained inside dbx
-
-      $self->{f}->seek($chunksize, 1);
-      bless ($self, $class);
-      return $self;
-    }
-    else
-    {
-
-      # skip 4 bytes and try again
-      # headers were always at 4 byte boundary in every dbx file I have seen
-
-      $self->{f}->seek(-12, 1);
-      return 0;
-    }
-  }
-
-  sub datahash
-  {
-    my $self = shift;
-    if (!defined($self->{hash})) { $self->{hash} = sha1($self->data) };
-    return $self->{hash};
-  }
-
-  sub differ
-  {
-    my $self = shift;
-    my $other = shift;
-    return 1 if (($self->{h}) ne ($other->{h}));
-    # use a hash for comparision, to avoid storing message data, or retrieving message data on each comparision
-    return 1 if ($self->datahash ne $other->datahash);
-    return 0;
-  }
-
-  sub data
-  {
-    my $self = shift;
-    my $datasize = unpack("V", substr($self->{h}, 8, 4));
-    my $data;
-    my $pos = $self->{f}->getpos;
-    $self->{f}->setpos($self->{p});
-    $self->{f}->read($data, $datasize);
-    $self->{f}->setpos($pos);
-    return $data;
-  }
-
-  sub id
-  {
-    my $self = shift;
-    return substr($self->{h}, 0, 4);
-  }
-
-  sub next
-  {
-    my $self = shift;
-    return substr($self->{h}, 12, 4);
-  }
-
-}
-
-{
-  package DBX::Scan;
-
-  sub new
-  {
-    my $class = shift;
-    my $self = {};
-    $self->{file} = shift;
-    $self->{cf} = DBX::ChainFinder->new;
-    bless ($self, $class);
-    return $self;
-  }
-
-  sub scan
-  {
-    my $self = shift;
-    $self->{file}->binmode();
-    $self->{file}->seek(0, 0);
-    $self->{cf}->reset;
-    my $time = 0;
-
-    while (defined(my $chunk = DBX::Chunk->new($self->{file})))
-    {
-      if ($chunk)
-      {
-        $self->{cf}->add($chunk);
-        if (time - $time > 1)
-        {
-          $self->printchunkstats;
-          $time = time;
-        }
-      }
-    }
-    $self->printchunkstats;
-    print STDERR "\n";
-  }
-
-  sub chains
-  {
-    my $self = shift;
-    my $chainids = $self->{cf}->find;
-    my $time = 0;
-    my $tot = scalar @{$chainids};
-    my $idx = 0;
-    for (@{$chainids})
-    {
-      my $chains = $self->{cf}->walk($_);
-      $idx++;
-      for(@{$chains})
-      {
-        my $message = $_->data;
-        $message =~ s/^>*From />$&/m;		# mbox "From " escaping
-        # we don't have the original envelope address, so just output a fake one
-        print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n";
-        print $message;
-        print "\r\n\r\n";
-      }
-      if (time - $time > 1)
-      {
-        $self->printchainstats(int($idx/$tot*100));
-        $time = time;
-      }
-    }
-    $self->printchainstats(100);
-  }
-
-  sub printchunkstats
-  {
-    my $self = shift;
-    my $stats = $self->{cf}->stats;
-    print STDERR "Chunks found: ".$stats->{chunkstot}."; ";
-    print STDERR "Duplicates: ".$stats->{chunksdupl}."; ";
-    print STDERR "Duplicate IDs: ".$stats->{chunkidsdupl}."\r";
-  }
-
-  sub printchainstats
-  {
-    my $self = shift;
-    my $percdone = shift;
-    my $stats = $self->{cf}->stats;
-    print STDERR "Chains found: ".$stats->{chainstot}."; ";
-    print STDERR "Broken: ".$stats->{chainsbroken}."; ";
-    print STDERR "Loops: ".$stats->{chainloops}."; ";
-    print STDERR "Dropped: ".$stats->{chainsign}."; ";
-    print STDERR $percdone."% done\r";
-  }
-}
-
-use IO::File;
-
-(my $dbxname=$ARGV[0]) and ((scalar @ARGV) == 1) or print(STDERR "Usage: $0 input.dbx >output.mbox\n"),exit;
-print STDERR "Opening $dbxname for reading...\n";
-my $dbxfile = IO::File->new($dbxname, "r") or die("Error opening file!\n");
-my $dbxscanner = DBX::Scan->new($dbxfile);
-print STDERR "Searching for chunks...\n";
-$dbxscanner->scan;
-print STDERR "Searching for chains and printing...\n";
-$dbxscanner->chains;
-print STDERR "\n";
diff --git a/dbxrecover-1p b/dbxrecover-1p
new file mode 100755
index 0000000..db37ab6
--- /dev/null
+++ b/dbxrecover-1p
@@ -0,0 +1,310 @@
+#!/usr/bin/perl
+
+# dbxrecover, a program for recovering mail from damaged Outlook dbx files.
+# Copyright (C) 2005  Piotr Pawlow <pp@siedziba.pl>
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+use strict;
+use warnings;
+
+{
+  package DBX::ChunkList;
+
+  sub new
+  {
+    my $class = shift;
+    my $self = {};
+    bless ($self, $class);
+    $self->{list} = [ @_ ];
+    return $self;
+  }
+
+  sub join
+  {
+    my $self = shift;
+    my $data = '';
+    $self->join_int(\$data);
+    return $data;
+  }
+
+  sub join_int
+  {
+    no warnings qw(recursion);
+
+    my $self = shift;
+    my $dataref = shift;
+    my $list = $self->{list};
+    for my $elem (@{$list})
+    {
+      if (ref($elem))
+      {
+        $elem->join_int($dataref)
+      }
+      else
+      {
+        $$dataref .= $elem;
+      }
+    }
+  }
+
+  sub first
+  {
+    my $self = shift;
+    my $list = $self->{list};
+    my $elem = ${$list}[0];
+    $elem = $elem->first() if ref($elem);
+    return $elem;
+  }
+
+  sub push
+  {
+    my $self = shift;
+    my $a1 = $self->{list};
+    my $a2 = shift;
+    my $a2_list = $a2->{list};
+    if (scalar @{$a2_list} < 32) # FIXME: Arbitrary value choosen without much care. Probably not optimal.
+    {
+      push @{$a1}, @{$a2_list};
+    }
+    else
+    {
+      push @{$a1}, $a2;
+    }
+  }
+}
+
+{
+  package DBX::ChainFinder;
+  use BerkeleyDB;
+  use File::Temp;
+
+  sub new
+  {
+    my $class = shift;
+    my $self = {};
+    bless ($self, $class);
+    $self->reset;
+    return $self;
+  }
+
+  sub reset
+  {
+    my $self = shift;
+
+    $self->{id} = {};
+    $self->{next} = {};
+    $self->{stats} = 
+    { 'chunkstot'    => 0,
+      'chainfrag'    => 0,
+      'chainscompl'   => 0,
+    };
+  }
+
+  # FIXME: this function is too convoluted
+  sub combine
+  {
+    my $self = shift;
+    my $hash = shift;
+    my $otherhash = shift;
+    my $key = shift;
+    my $data = shift;
+    my $idx = shift;
+
+    while (defined($hash->{$key}))
+    {
+      my @chunkkeys = sort keys %{$hash->{$key}};
+      my $chunkkey = pop @chunkkeys;
+      my $chunkref = $hash->{$key}->{$chunkkey};
+      my $chunkid = $chunkref->{head}->[$idx];
+      delete $hash->{$key}->{$chunkkey};
+      delete $hash->{$key} if (!scalar %{$hash->{$key}});
+      delete $otherhash->{$chunkid}->{$chunkkey};
+      delete $otherhash->{$chunkid} if (!scalar %{$otherhash->{$chunkid}});
+      if ($idx == 1)
+      {
+        $data->push($chunkref);
+      }
+      else
+      {
+        $chunkref->push($data);
+        $data = $chunkref;
+      }
+      $key = $chunkid;
+      $self->{stats}->{chainfrag}--;
+    }
+    return ($key, $data);
+  }
+
+  sub add
+  {
+    my $self = shift;
+    my $id = shift;
+    my $next = shift;
+    my $data = new DBX::ChunkList(shift);
+    my $chunkid = $self->{stats}->{chunkstot}++;
+    my $idhash = $self->{id};
+    my $nexthash = $self->{next};
+
+    ($next, $data) = $self->combine($idhash, $nexthash, $next, $data, 1);
+    ($id, $data) = $self->combine($nexthash, $idhash, $id, $data, 0);
+
+    # the test for first chunk is very simple and naive, but it works well enough
+    $self->{stats}->{chainscompl}++,return($data->join()) if (($next == 0) && ($data->first() =~ /^(From|From:|Return-Path:|Received:) /));
+    $idhash->{$id} = {} if (!defined($idhash->{$id}));
+    $nexthash->{$next} = {} if (!defined($nexthash->{$next}));
+    $data->{head} = [$id, $next];
+    $idhash->{$id}->{$chunkid} = $data;
+    $nexthash->{$next}->{$chunkid} = $data;
+    $self->{stats}->{chainfrag}++;
+    return undef;
+  }
+
+  sub remaining
+  {
+    my $self = shift;
+    return undef if (!scalar %{$self->{id}});
+    while (1)
+    {
+      (my $id, my $list) = each %{$self->{id}} or next;
+      (my $cid, my $chain) = each %{$list} or next;
+      my $data = $chain->join();
+      delete $self->{id}->{$id}->{$cid};
+      delete $self->{id}->{$id} if (!scalar %{$self->{id}->{$id}});
+      return $data;
+    }
+  }
+
+  sub stats
+  {
+    my $self = shift;
+    return $self->{stats};
+  }
+}
+
+{
+  package DBX::Scan;
+  sub new
+  {
+    my $class = shift;
+    my $self = {};
+    $self->{file} = shift;
+    $self->{cf} = DBX::ChainFinder->new;
+    bless ($self, $class);
+    return $self;
+  }
+
+  sub scan
+  {
+    my $self = shift;
+    my $file = $self->{file};
+    $file->binmode();
+    $self->{cf}->reset;
+    my $time = 0;
+    my $header;
+
+    $file->read($header, 16);
+
+    while ((length $header) == 16)
+    {
+      my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $header);
+
+      # test for a valid header
+      # chunk size is always 512 bytes in all dbx files I have seen
+      if (
+           ($chunksize == 512) &&
+           ($datasize <= $chunksize) && 
+           ($datasize > 0) &&
+           ($id != 0) &&
+           (($next == 0) || ($datasize == $chunksize))
+         )
+      {
+        my $data;
+        $file->read($data, $datasize);
+
+        # if the header seems valid, we skip the whole chunk
+
+        # the chance we miss a start of another chunk is low, because
+        # the test above is pretty strict and false positives are quite rare
+
+        # it also helps in cases when there are dbx files contained inside dbx
+
+        my $tmp;
+        $file->read($tmp, $chunksize-$datasize);
+
+        my $message = $self->{cf}->add($id, $next, $data);
+        $self->printmsg($message);
+
+        if (time - $time > 1)
+        {
+          $self->printstats;
+          $time = time;
+        }
+
+        $file->read($header, 16);
+      }
+      else
+      {
+        # skip 4 bytes and try again
+        # headers were always at 4 byte boundary in every dbx file I have seen
+
+        my $tmp;
+        $file->read($tmp, 4);
+        $header = substr($header, 4).$tmp;
+      }
+    }
+    $self->printstats;
+    print STDERR "\nPrinting remaining chains...";
+    my $message;
+    do {
+      $message = $self->{cf}->remaining;
+      $self->printmsg($message);
+    } while defined($message);
+    print STDERR "\n";
+  }
+
+  sub printmsg
+  {
+    my $self = shift;
+    my $message = shift;
+    if (defined($message))
+    {
+      $message =~ s/^>*From />$&/m;		# mbox "From " escaping
+      # we don't have the original envelope address, so just output a fake one
+      print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n";
+      print $message;
+      print "\r\n\r\n";
+    }
+  }
+
+  sub printstats
+  {
+    my $self = shift;
+    my $stats = $self->{cf}->stats;
+    print STDERR "Chunks found: ".$stats->{chunkstot}."; ";
+    print STDERR "Chain fragments: ".$stats->{chainfrag}."; ";
+    print STDERR "Chains completed: ".$stats->{chainscompl}."\r";
+  }
+}
+
+use IO::File;
+
+my $dbxname=($ARGV[0] || "-");
+print STDERR "Opening $dbxname for reading...\n";
+my $dbxfile = IO::File->new("<$dbxname") or die("Error opening file!\n");
+my $dbxscanner = DBX::Scan->new($dbxfile);
+print STDERR "Searching for messages...\n";
+$dbxscanner->scan;
+print STDERR "\n";
diff --git a/dbxrecover-2p b/dbxrecover-2p
new file mode 100755
index 0000000..6d2f71c
--- /dev/null
+++ b/dbxrecover-2p
@@ -0,0 +1,368 @@
+#!/usr/bin/perl
+
+# dbxrecover, a program for recovering mail from damaged Outlook dbx files.
+# Copyright (C) 2005  Piotr Pawlow <pp@siedziba.pl>
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
+use strict;
+use warnings;
+
+{
+  package DBX::Chain;
+
+  sub new
+  {
+    my $class = shift;
+    my $self = {};
+    my $arrayref = shift;
+    my @ca = @{$arrayref};
+    $self->{chunks} = \@ca;
+    $self->{idx} = scalar @ca;
+    bless ($self, $class);
+    return $self;
+  }
+
+  # we assume here the single message is small enough to fit into memory
+  # this may not be a case for publishing companies or other businesses sending huge amounts of data by mail :)
+  sub data
+  {
+    my $self = shift;
+    my $data = "";
+    for(@{$self->{chunks}})
+    {
+      $data .= $_->data;
+    }
+    return $data;
+  }
+
+  sub broken
+  {
+    my $self = shift;
+    return ($self->{chunks}->[$self->{idx}-1]->next ne "\x00\x00\x00\x00");
+  }
+}
+
+{
+  package DBX::ChainFinder;
+
+  sub new
+  {
+    my $class = shift;
+    my $self = {};
+    bless ($self, $class);
+    $self->reset;
+    return $self;
+  }
+
+  sub reset
+  {
+    my $self = shift;
+    $self->{chunks} = {};
+    $self->{first} = {};
+    $self->{stats} = 
+    { 'chunkstot'    => 0,
+      'chunksdupl'   => 0,
+      'chunkidsdupl' => 0,
+      'chainstot'    => 0,
+      'chainloops'   => 0,
+      'chainsign'    => 0,
+      'chainsbroken' => 0,
+    };
+  }
+
+  sub add
+  {
+    my $self = shift;
+    my $chunk = shift;
+    my $id = $chunk->id;
+    my $next = $chunk->next;
+    $self->{stats}->{chunkstot}++;
+    my $chunks = $self->{chunks}->{$id};
+    if (defined($chunks))
+    {
+      my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks);
+      for (@chunks)
+      {
+        $self->{stats}->{chunksdupl}++,return if (!$chunk->differ($_)); # don't add chunk if there is a duplicate present
+      }
+      $self->{stats}->{chunkidsdupl}++;
+      push @chunks, $chunk;
+      $self->{chunks}->{$id} = \@chunks;
+    }
+    else
+    {
+      $self->{chunks}->{$id} = $chunk;
+    }
+
+    # we store if there is any other chunk pointing to this one
+    if (!defined($self->{first}->{$id})) { $self->{first}->{$id} = 1 };
+    $self->{first}->{$next} = 0;
+  }
+
+  sub find
+  {
+    my $self = shift;
+    my $first = $self->{first};
+    my @ids;
+    while (my ($key, $value) = each %$first)
+    {
+      push @ids,$key if $value == 1;
+    }
+    return \@ids;
+  }
+
+  sub walk_int
+  {
+    no warnings qw(recursion);
+
+    my $self = shift;
+    my $id = shift;
+    my $chain = shift;
+    my $chains = shift;
+    my $seen = shift;
+    my $combinations = shift;
+    $self->{stats}->{chainsign}++,return if ($combinations > 16); # we don't extract the message if there are too many branches
+    $seen->{$id} = 1; # note chunk identifiers for infinite loop testing
+    my $chunks = $self->{chunks}->{$id};
+    my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks);
+    for(@chunks)
+    {
+      my $chunk = $_;
+      push @{$chain}, $chunk;
+      if (defined($self->{chunks}->{$chunk->next}) && !$seen->{$chunk->next})
+      {
+        my $nextchunks = $self->{chunks}->{$chunk->next};
+        my $nextcount = (ref($nextchunks) eq "ARRAY") ? scalar @{$nextchunks} : 1;
+        $self->walk_int($chunk->next, $chain, $chains, $seen, $combinations * $nextcount);
+      }
+      else
+      {
+        $self->{stats}->{chainloops}++ if $seen->{$chunk->next};
+        my $chainobj = DBX::Chain->new($chain);
+        $self->{stats}->{chainsbroken}++ if $chainobj->broken;
+        $self->{stats}->{chainstot}++;
+        push @{$chains}, $chainobj;
+      }
+      pop @{$chain};
+    }
+    $seen->{$id} = 0;
+  }
+
+  sub walk
+  {
+    my $self = shift;
+    my $id = shift;
+    my $chains = [];
+    $self->walk_int($id, [], $chains, {}, 1);
+    return $chains;
+  }
+
+  sub stats
+  {
+    my $self = shift;
+    return $self->{stats};
+  }
+}
+
+{
+  package DBX::Chunk;
+  use Digest::SHA1 qw(sha1);
+
+  sub new
+  {
+    my $class = shift;
+    my $self = {};
+    $self->{f} = shift;
+    return if ($self->{f}->read($self->{h}, 16) != 16);
+    my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $self->{h});
+
+    # test for a valid header
+    # chunk size is always 512 bytes in all dbx files I have seen
+
+    if (
+          ($chunksize == 512) &&
+          ($datasize <= $chunksize) && 
+          ($datasize > 0) &&
+          ($id != 0) &&
+          (($next == 0) || ($datasize == $chunksize))
+         )
+    {
+      $self->{p} = $self->{f}->getpos;
+
+      # if the header seems valid, we skip the whole chunk
+
+      # the chance we miss a start of another chunk is low, because
+      # the test above is pretty strict and false positives are quite rare
+
+      # it also helps in cases when there are dbx files contained inside dbx
+
+      $self->{f}->seek($chunksize, 1);
+      bless ($self, $class);
+      return $self;
+    }
+    else
+    {
+
+      # skip 4 bytes and try again
+      # headers were always at 4 byte boundary in every dbx file I have seen
+
+      $self->{f}->seek(-12, 1);
+      return 0;
+    }
+  }
+
+  sub datahash
+  {
+    my $self = shift;
+    if (!defined($self->{hash})) { $self->{hash} = sha1($self->data) };
+    return $self->{hash};
+  }
+
+  sub differ
+  {
+    my $self = shift;
+    my $other = shift;
+    return 1 if (($self->{h}) ne ($other->{h}));
+    # use a hash for comparision, to avoid storing message data, or retrieving message data on each comparision
+    return 1 if ($self->datahash ne $other->datahash);
+    return 0;
+  }
+
+  sub data
+  {
+    my $self = shift;
+    my $datasize = unpack("V", substr($self->{h}, 8, 4));
+    my $data;
+    my $pos = $self->{f}->getpos;
+    $self->{f}->setpos($self->{p});
+    $self->{f}->read($data, $datasize);
+    $self->{f}->setpos($pos);
+    return $data;
+  }
+
+  sub id
+  {
+    my $self = shift;
+    return substr($self->{h}, 0, 4);
+  }
+
+  sub next
+  {
+    my $self = shift;
+    return substr($self->{h}, 12, 4);
+  }
+
+}
+
+{
+  package DBX::Scan;
+
+  sub new
+  {
+    my $class = shift;
+    my $self = {};
+    $self->{file} = shift;
+    $self->{cf} = DBX::ChainFinder->new;
+    bless ($self, $class);
+    return $self;
+  }
+
+  sub scan
+  {
+    my $self = shift;
+    $self->{file}->binmode();
+    $self->{file}->seek(0, 0);
+    $self->{cf}->reset;
+    my $time = 0;
+
+    while (defined(my $chunk = DBX::Chunk->new($self->{file})))
+    {
+      if ($chunk)
+      {
+        $self->{cf}->add($chunk);
+        if (time - $time > 1)
+        {
+          $self->printchunkstats;
+          $time = time;
+        }
+      }
+    }
+    $self->printchunkstats;
+    print STDERR "\n";
+  }
+
+  sub chains
+  {
+    my $self = shift;
+    my $chainids = $self->{cf}->find;
+    my $time = 0;
+    my $tot = scalar @{$chainids};
+    my $idx = 0;
+    for (@{$chainids})
+    {
+      my $chains = $self->{cf}->walk($_);
+      $idx++;
+      for(@{$chains})
+      {
+        my $message = $_->data;
+        $message =~ s/^>*From />$&/m;		# mbox "From " escaping
+        # we don't have the original envelope address, so just output a fake one
+        print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n";
+        print $message;
+        print "\r\n\r\n";
+      }
+      if (time - $time > 1)
+      {
+        $self->printchainstats(int($idx/$tot*100));
+        $time = time;
+      }
+    }
+    $self->printchainstats(100);
+  }
+
+  sub printchunkstats
+  {
+    my $self = shift;
+    my $stats = $self->{cf}->stats;
+    print STDERR "Chunks found: ".$stats->{chunkstot}."; ";
+    print STDERR "Duplicates: ".$stats->{chunksdupl}."; ";
+    print STDERR "Duplicate IDs: ".$stats->{chunkidsdupl}."\r";
+  }
+
+  sub printchainstats
+  {
+    my $self = shift;
+    my $percdone = shift;
+    my $stats = $self->{cf}->stats;
+    print STDERR "Chains found: ".$stats->{chainstot}."; ";
+    print STDERR "Broken: ".$stats->{chainsbroken}."; ";
+    print STDERR "Loops: ".$stats->{chainloops}."; ";
+    print STDERR "Dropped: ".$stats->{chainsign}."; ";
+    print STDERR $percdone."% done\r";
+  }
+}
+
+use IO::File;
+
+(my $dbxname=$ARGV[0]) and ((scalar @ARGV) == 1) or print(STDERR "Usage: $0 input.dbx >output.mbox\n"),exit;
+print STDERR "Opening $dbxname for reading...\n";
+my $dbxfile = IO::File->new($dbxname, "r") or die("Error opening file!\n");
+my $dbxscanner = DBX::Scan->new($dbxfile);
+print STDERR "Searching for chunks...\n";
+$dbxscanner->scan;
+print STDERR "Searching for chains and printing...\n";
+$dbxscanner->chains;
+print STDERR "\n";
-- 
cgit v1.2.3