From d79a49d882243ba777014c605af60e843b91fbc5 Mon Sep 17 00:00:00 2001 From: pp Date: Tue, 9 May 2006 18:11:30 +0000 Subject: - 1 pass version. Faster, uses less memory, works as a filter git-svn-id: https://siedziba.pl:790/svn/repos/dbxrecover@235 455248ca-bdda-0310-9134-f4ebb693071a --- dbxrecover | 368 ---------------------------------------------------------- dbxrecover-1p | 310 +++++++++++++++++++++++++++++++++++++++++++++++++ dbxrecover-2p | 368 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 678 insertions(+), 368 deletions(-) delete mode 100755 dbxrecover create mode 100755 dbxrecover-1p create mode 100755 dbxrecover-2p diff --git a/dbxrecover b/dbxrecover deleted file mode 100755 index 6d2f71c..0000000 --- a/dbxrecover +++ /dev/null @@ -1,368 +0,0 @@ -#!/usr/bin/perl - -# dbxrecover, a program for recovering mail from damaged Outlook dbx files. -# Copyright (C) 2005 Piotr Pawlow - -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - -use strict; -use warnings; - -{ - package DBX::Chain; - - sub new - { - my $class = shift; - my $self = {}; - my $arrayref = shift; - my @ca = @{$arrayref}; - $self->{chunks} = \@ca; - $self->{idx} = scalar @ca; - bless ($self, $class); - return $self; - } - - # we assume here the single message is small enough to fit into memory - # this may not be a case for publishing companies or other businesses sending huge amounts of data by mail :) - sub data - { - my $self = shift; - my $data = ""; - for(@{$self->{chunks}}) - { - $data .= $_->data; - } - return $data; - } - - sub broken - { - my $self = shift; - return ($self->{chunks}->[$self->{idx}-1]->next ne "\x00\x00\x00\x00"); - } -} - -{ - package DBX::ChainFinder; - - sub new - { - my $class = shift; - my $self = {}; - bless ($self, $class); - $self->reset; - return $self; - } - - sub reset - { - my $self = shift; - $self->{chunks} = {}; - $self->{first} = {}; - $self->{stats} = - { 'chunkstot' => 0, - 'chunksdupl' => 0, - 'chunkidsdupl' => 0, - 'chainstot' => 0, - 'chainloops' => 0, - 'chainsign' => 0, - 'chainsbroken' => 0, - }; - } - - sub add - { - my $self = shift; - my $chunk = shift; - my $id = $chunk->id; - my $next = $chunk->next; - $self->{stats}->{chunkstot}++; - my $chunks = $self->{chunks}->{$id}; - if (defined($chunks)) - { - my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks); - for (@chunks) - { - $self->{stats}->{chunksdupl}++,return if (!$chunk->differ($_)); # don't add chunk if there is a duplicate present - } - $self->{stats}->{chunkidsdupl}++; - push @chunks, $chunk; - $self->{chunks}->{$id} = \@chunks; - } - else - { - $self->{chunks}->{$id} = $chunk; - } - - # we store if there is any other chunk pointing to this one - if (!defined($self->{first}->{$id})) { $self->{first}->{$id} = 1 }; - $self->{first}->{$next} = 0; - } - - sub find - { - my $self = shift; - my $first = $self->{first}; - my @ids; - while (my ($key, $value) = each %$first) - { - push @ids,$key if $value == 1; - } - return \@ids; - } - - sub walk_int - { - no warnings qw(recursion); - - my $self = shift; - my $id = shift; - my $chain = shift; - my $chains = shift; - my $seen = shift; - my $combinations = shift; - $self->{stats}->{chainsign}++,return if ($combinations > 16); # we don't extract the message if there are too many branches - $seen->{$id} = 1; # note chunk identifiers for infinite loop testing - my $chunks = $self->{chunks}->{$id}; - my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks); - for(@chunks) - { - my $chunk = $_; - push @{$chain}, $chunk; - if (defined($self->{chunks}->{$chunk->next}) && !$seen->{$chunk->next}) - { - my $nextchunks = $self->{chunks}->{$chunk->next}; - my $nextcount = (ref($nextchunks) eq "ARRAY") ? scalar @{$nextchunks} : 1; - $self->walk_int($chunk->next, $chain, $chains, $seen, $combinations * $nextcount); - } - else - { - $self->{stats}->{chainloops}++ if $seen->{$chunk->next}; - my $chainobj = DBX::Chain->new($chain); - $self->{stats}->{chainsbroken}++ if $chainobj->broken; - $self->{stats}->{chainstot}++; - push @{$chains}, $chainobj; - } - pop @{$chain}; - } - $seen->{$id} = 0; - } - - sub walk - { - my $self = shift; - my $id = shift; - my $chains = []; - $self->walk_int($id, [], $chains, {}, 1); - return $chains; - } - - sub stats - { - my $self = shift; - return $self->{stats}; - } -} - -{ - package DBX::Chunk; - use Digest::SHA1 qw(sha1); - - sub new - { - my $class = shift; - my $self = {}; - $self->{f} = shift; - return if ($self->{f}->read($self->{h}, 16) != 16); - my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $self->{h}); - - # test for a valid header - # chunk size is always 512 bytes in all dbx files I have seen - - if ( - ($chunksize == 512) && - ($datasize <= $chunksize) && - ($datasize > 0) && - ($id != 0) && - (($next == 0) || ($datasize == $chunksize)) - ) - { - $self->{p} = $self->{f}->getpos; - - # if the header seems valid, we skip the whole chunk - - # the chance we miss a start of another chunk is low, because - # the test above is pretty strict and false positives are quite rare - - # it also helps in cases when there are dbx files contained inside dbx - - $self->{f}->seek($chunksize, 1); - bless ($self, $class); - return $self; - } - else - { - - # skip 4 bytes and try again - # headers were always at 4 byte boundary in every dbx file I have seen - - $self->{f}->seek(-12, 1); - return 0; - } - } - - sub datahash - { - my $self = shift; - if (!defined($self->{hash})) { $self->{hash} = sha1($self->data) }; - return $self->{hash}; - } - - sub differ - { - my $self = shift; - my $other = shift; - return 1 if (($self->{h}) ne ($other->{h})); - # use a hash for comparision, to avoid storing message data, or retrieving message data on each comparision - return 1 if ($self->datahash ne $other->datahash); - return 0; - } - - sub data - { - my $self = shift; - my $datasize = unpack("V", substr($self->{h}, 8, 4)); - my $data; - my $pos = $self->{f}->getpos; - $self->{f}->setpos($self->{p}); - $self->{f}->read($data, $datasize); - $self->{f}->setpos($pos); - return $data; - } - - sub id - { - my $self = shift; - return substr($self->{h}, 0, 4); - } - - sub next - { - my $self = shift; - return substr($self->{h}, 12, 4); - } - -} - -{ - package DBX::Scan; - - sub new - { - my $class = shift; - my $self = {}; - $self->{file} = shift; - $self->{cf} = DBX::ChainFinder->new; - bless ($self, $class); - return $self; - } - - sub scan - { - my $self = shift; - $self->{file}->binmode(); - $self->{file}->seek(0, 0); - $self->{cf}->reset; - my $time = 0; - - while (defined(my $chunk = DBX::Chunk->new($self->{file}))) - { - if ($chunk) - { - $self->{cf}->add($chunk); - if (time - $time > 1) - { - $self->printchunkstats; - $time = time; - } - } - } - $self->printchunkstats; - print STDERR "\n"; - } - - sub chains - { - my $self = shift; - my $chainids = $self->{cf}->find; - my $time = 0; - my $tot = scalar @{$chainids}; - my $idx = 0; - for (@{$chainids}) - { - my $chains = $self->{cf}->walk($_); - $idx++; - for(@{$chains}) - { - my $message = $_->data; - $message =~ s/^>*From />$&/m; # mbox "From " escaping - # we don't have the original envelope address, so just output a fake one - print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n"; - print $message; - print "\r\n\r\n"; - } - if (time - $time > 1) - { - $self->printchainstats(int($idx/$tot*100)); - $time = time; - } - } - $self->printchainstats(100); - } - - sub printchunkstats - { - my $self = shift; - my $stats = $self->{cf}->stats; - print STDERR "Chunks found: ".$stats->{chunkstot}."; "; - print STDERR "Duplicates: ".$stats->{chunksdupl}."; "; - print STDERR "Duplicate IDs: ".$stats->{chunkidsdupl}."\r"; - } - - sub printchainstats - { - my $self = shift; - my $percdone = shift; - my $stats = $self->{cf}->stats; - print STDERR "Chains found: ".$stats->{chainstot}."; "; - print STDERR "Broken: ".$stats->{chainsbroken}."; "; - print STDERR "Loops: ".$stats->{chainloops}."; "; - print STDERR "Dropped: ".$stats->{chainsign}."; "; - print STDERR $percdone."% done\r"; - } -} - -use IO::File; - -(my $dbxname=$ARGV[0]) and ((scalar @ARGV) == 1) or print(STDERR "Usage: $0 input.dbx >output.mbox\n"),exit; -print STDERR "Opening $dbxname for reading...\n"; -my $dbxfile = IO::File->new($dbxname, "r") or die("Error opening file!\n"); -my $dbxscanner = DBX::Scan->new($dbxfile); -print STDERR "Searching for chunks...\n"; -$dbxscanner->scan; -print STDERR "Searching for chains and printing...\n"; -$dbxscanner->chains; -print STDERR "\n"; diff --git a/dbxrecover-1p b/dbxrecover-1p new file mode 100755 index 0000000..db37ab6 --- /dev/null +++ b/dbxrecover-1p @@ -0,0 +1,310 @@ +#!/usr/bin/perl + +# dbxrecover, a program for recovering mail from damaged Outlook dbx files. +# Copyright (C) 2005 Piotr Pawlow + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +use strict; +use warnings; + +{ + package DBX::ChunkList; + + sub new + { + my $class = shift; + my $self = {}; + bless ($self, $class); + $self->{list} = [ @_ ]; + return $self; + } + + sub join + { + my $self = shift; + my $data = ''; + $self->join_int(\$data); + return $data; + } + + sub join_int + { + no warnings qw(recursion); + + my $self = shift; + my $dataref = shift; + my $list = $self->{list}; + for my $elem (@{$list}) + { + if (ref($elem)) + { + $elem->join_int($dataref) + } + else + { + $$dataref .= $elem; + } + } + } + + sub first + { + my $self = shift; + my $list = $self->{list}; + my $elem = ${$list}[0]; + $elem = $elem->first() if ref($elem); + return $elem; + } + + sub push + { + my $self = shift; + my $a1 = $self->{list}; + my $a2 = shift; + my $a2_list = $a2->{list}; + if (scalar @{$a2_list} < 32) # FIXME: Arbitrary value choosen without much care. Probably not optimal. + { + push @{$a1}, @{$a2_list}; + } + else + { + push @{$a1}, $a2; + } + } +} + +{ + package DBX::ChainFinder; + use BerkeleyDB; + use File::Temp; + + sub new + { + my $class = shift; + my $self = {}; + bless ($self, $class); + $self->reset; + return $self; + } + + sub reset + { + my $self = shift; + + $self->{id} = {}; + $self->{next} = {}; + $self->{stats} = + { 'chunkstot' => 0, + 'chainfrag' => 0, + 'chainscompl' => 0, + }; + } + + # FIXME: this function is too convoluted + sub combine + { + my $self = shift; + my $hash = shift; + my $otherhash = shift; + my $key = shift; + my $data = shift; + my $idx = shift; + + while (defined($hash->{$key})) + { + my @chunkkeys = sort keys %{$hash->{$key}}; + my $chunkkey = pop @chunkkeys; + my $chunkref = $hash->{$key}->{$chunkkey}; + my $chunkid = $chunkref->{head}->[$idx]; + delete $hash->{$key}->{$chunkkey}; + delete $hash->{$key} if (!scalar %{$hash->{$key}}); + delete $otherhash->{$chunkid}->{$chunkkey}; + delete $otherhash->{$chunkid} if (!scalar %{$otherhash->{$chunkid}}); + if ($idx == 1) + { + $data->push($chunkref); + } + else + { + $chunkref->push($data); + $data = $chunkref; + } + $key = $chunkid; + $self->{stats}->{chainfrag}--; + } + return ($key, $data); + } + + sub add + { + my $self = shift; + my $id = shift; + my $next = shift; + my $data = new DBX::ChunkList(shift); + my $chunkid = $self->{stats}->{chunkstot}++; + my $idhash = $self->{id}; + my $nexthash = $self->{next}; + + ($next, $data) = $self->combine($idhash, $nexthash, $next, $data, 1); + ($id, $data) = $self->combine($nexthash, $idhash, $id, $data, 0); + + # the test for first chunk is very simple and naive, but it works well enough + $self->{stats}->{chainscompl}++,return($data->join()) if (($next == 0) && ($data->first() =~ /^(From|From:|Return-Path:|Received:) /)); + $idhash->{$id} = {} if (!defined($idhash->{$id})); + $nexthash->{$next} = {} if (!defined($nexthash->{$next})); + $data->{head} = [$id, $next]; + $idhash->{$id}->{$chunkid} = $data; + $nexthash->{$next}->{$chunkid} = $data; + $self->{stats}->{chainfrag}++; + return undef; + } + + sub remaining + { + my $self = shift; + return undef if (!scalar %{$self->{id}}); + while (1) + { + (my $id, my $list) = each %{$self->{id}} or next; + (my $cid, my $chain) = each %{$list} or next; + my $data = $chain->join(); + delete $self->{id}->{$id}->{$cid}; + delete $self->{id}->{$id} if (!scalar %{$self->{id}->{$id}}); + return $data; + } + } + + sub stats + { + my $self = shift; + return $self->{stats}; + } +} + +{ + package DBX::Scan; + sub new + { + my $class = shift; + my $self = {}; + $self->{file} = shift; + $self->{cf} = DBX::ChainFinder->new; + bless ($self, $class); + return $self; + } + + sub scan + { + my $self = shift; + my $file = $self->{file}; + $file->binmode(); + $self->{cf}->reset; + my $time = 0; + my $header; + + $file->read($header, 16); + + while ((length $header) == 16) + { + my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $header); + + # test for a valid header + # chunk size is always 512 bytes in all dbx files I have seen + if ( + ($chunksize == 512) && + ($datasize <= $chunksize) && + ($datasize > 0) && + ($id != 0) && + (($next == 0) || ($datasize == $chunksize)) + ) + { + my $data; + $file->read($data, $datasize); + + # if the header seems valid, we skip the whole chunk + + # the chance we miss a start of another chunk is low, because + # the test above is pretty strict and false positives are quite rare + + # it also helps in cases when there are dbx files contained inside dbx + + my $tmp; + $file->read($tmp, $chunksize-$datasize); + + my $message = $self->{cf}->add($id, $next, $data); + $self->printmsg($message); + + if (time - $time > 1) + { + $self->printstats; + $time = time; + } + + $file->read($header, 16); + } + else + { + # skip 4 bytes and try again + # headers were always at 4 byte boundary in every dbx file I have seen + + my $tmp; + $file->read($tmp, 4); + $header = substr($header, 4).$tmp; + } + } + $self->printstats; + print STDERR "\nPrinting remaining chains..."; + my $message; + do { + $message = $self->{cf}->remaining; + $self->printmsg($message); + } while defined($message); + print STDERR "\n"; + } + + sub printmsg + { + my $self = shift; + my $message = shift; + if (defined($message)) + { + $message =~ s/^>*From />$&/m; # mbox "From " escaping + # we don't have the original envelope address, so just output a fake one + print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n"; + print $message; + print "\r\n\r\n"; + } + } + + sub printstats + { + my $self = shift; + my $stats = $self->{cf}->stats; + print STDERR "Chunks found: ".$stats->{chunkstot}."; "; + print STDERR "Chain fragments: ".$stats->{chainfrag}."; "; + print STDERR "Chains completed: ".$stats->{chainscompl}."\r"; + } +} + +use IO::File; + +my $dbxname=($ARGV[0] || "-"); +print STDERR "Opening $dbxname for reading...\n"; +my $dbxfile = IO::File->new("<$dbxname") or die("Error opening file!\n"); +my $dbxscanner = DBX::Scan->new($dbxfile); +print STDERR "Searching for messages...\n"; +$dbxscanner->scan; +print STDERR "\n"; diff --git a/dbxrecover-2p b/dbxrecover-2p new file mode 100755 index 0000000..6d2f71c --- /dev/null +++ b/dbxrecover-2p @@ -0,0 +1,368 @@ +#!/usr/bin/perl + +# dbxrecover, a program for recovering mail from damaged Outlook dbx files. +# Copyright (C) 2005 Piotr Pawlow + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +use strict; +use warnings; + +{ + package DBX::Chain; + + sub new + { + my $class = shift; + my $self = {}; + my $arrayref = shift; + my @ca = @{$arrayref}; + $self->{chunks} = \@ca; + $self->{idx} = scalar @ca; + bless ($self, $class); + return $self; + } + + # we assume here the single message is small enough to fit into memory + # this may not be a case for publishing companies or other businesses sending huge amounts of data by mail :) + sub data + { + my $self = shift; + my $data = ""; + for(@{$self->{chunks}}) + { + $data .= $_->data; + } + return $data; + } + + sub broken + { + my $self = shift; + return ($self->{chunks}->[$self->{idx}-1]->next ne "\x00\x00\x00\x00"); + } +} + +{ + package DBX::ChainFinder; + + sub new + { + my $class = shift; + my $self = {}; + bless ($self, $class); + $self->reset; + return $self; + } + + sub reset + { + my $self = shift; + $self->{chunks} = {}; + $self->{first} = {}; + $self->{stats} = + { 'chunkstot' => 0, + 'chunksdupl' => 0, + 'chunkidsdupl' => 0, + 'chainstot' => 0, + 'chainloops' => 0, + 'chainsign' => 0, + 'chainsbroken' => 0, + }; + } + + sub add + { + my $self = shift; + my $chunk = shift; + my $id = $chunk->id; + my $next = $chunk->next; + $self->{stats}->{chunkstot}++; + my $chunks = $self->{chunks}->{$id}; + if (defined($chunks)) + { + my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks); + for (@chunks) + { + $self->{stats}->{chunksdupl}++,return if (!$chunk->differ($_)); # don't add chunk if there is a duplicate present + } + $self->{stats}->{chunkidsdupl}++; + push @chunks, $chunk; + $self->{chunks}->{$id} = \@chunks; + } + else + { + $self->{chunks}->{$id} = $chunk; + } + + # we store if there is any other chunk pointing to this one + if (!defined($self->{first}->{$id})) { $self->{first}->{$id} = 1 }; + $self->{first}->{$next} = 0; + } + + sub find + { + my $self = shift; + my $first = $self->{first}; + my @ids; + while (my ($key, $value) = each %$first) + { + push @ids,$key if $value == 1; + } + return \@ids; + } + + sub walk_int + { + no warnings qw(recursion); + + my $self = shift; + my $id = shift; + my $chain = shift; + my $chains = shift; + my $seen = shift; + my $combinations = shift; + $self->{stats}->{chainsign}++,return if ($combinations > 16); # we don't extract the message if there are too many branches + $seen->{$id} = 1; # note chunk identifiers for infinite loop testing + my $chunks = $self->{chunks}->{$id}; + my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks); + for(@chunks) + { + my $chunk = $_; + push @{$chain}, $chunk; + if (defined($self->{chunks}->{$chunk->next}) && !$seen->{$chunk->next}) + { + my $nextchunks = $self->{chunks}->{$chunk->next}; + my $nextcount = (ref($nextchunks) eq "ARRAY") ? scalar @{$nextchunks} : 1; + $self->walk_int($chunk->next, $chain, $chains, $seen, $combinations * $nextcount); + } + else + { + $self->{stats}->{chainloops}++ if $seen->{$chunk->next}; + my $chainobj = DBX::Chain->new($chain); + $self->{stats}->{chainsbroken}++ if $chainobj->broken; + $self->{stats}->{chainstot}++; + push @{$chains}, $chainobj; + } + pop @{$chain}; + } + $seen->{$id} = 0; + } + + sub walk + { + my $self = shift; + my $id = shift; + my $chains = []; + $self->walk_int($id, [], $chains, {}, 1); + return $chains; + } + + sub stats + { + my $self = shift; + return $self->{stats}; + } +} + +{ + package DBX::Chunk; + use Digest::SHA1 qw(sha1); + + sub new + { + my $class = shift; + my $self = {}; + $self->{f} = shift; + return if ($self->{f}->read($self->{h}, 16) != 16); + my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $self->{h}); + + # test for a valid header + # chunk size is always 512 bytes in all dbx files I have seen + + if ( + ($chunksize == 512) && + ($datasize <= $chunksize) && + ($datasize > 0) && + ($id != 0) && + (($next == 0) || ($datasize == $chunksize)) + ) + { + $self->{p} = $self->{f}->getpos; + + # if the header seems valid, we skip the whole chunk + + # the chance we miss a start of another chunk is low, because + # the test above is pretty strict and false positives are quite rare + + # it also helps in cases when there are dbx files contained inside dbx + + $self->{f}->seek($chunksize, 1); + bless ($self, $class); + return $self; + } + else + { + + # skip 4 bytes and try again + # headers were always at 4 byte boundary in every dbx file I have seen + + $self->{f}->seek(-12, 1); + return 0; + } + } + + sub datahash + { + my $self = shift; + if (!defined($self->{hash})) { $self->{hash} = sha1($self->data) }; + return $self->{hash}; + } + + sub differ + { + my $self = shift; + my $other = shift; + return 1 if (($self->{h}) ne ($other->{h})); + # use a hash for comparision, to avoid storing message data, or retrieving message data on each comparision + return 1 if ($self->datahash ne $other->datahash); + return 0; + } + + sub data + { + my $self = shift; + my $datasize = unpack("V", substr($self->{h}, 8, 4)); + my $data; + my $pos = $self->{f}->getpos; + $self->{f}->setpos($self->{p}); + $self->{f}->read($data, $datasize); + $self->{f}->setpos($pos); + return $data; + } + + sub id + { + my $self = shift; + return substr($self->{h}, 0, 4); + } + + sub next + { + my $self = shift; + return substr($self->{h}, 12, 4); + } + +} + +{ + package DBX::Scan; + + sub new + { + my $class = shift; + my $self = {}; + $self->{file} = shift; + $self->{cf} = DBX::ChainFinder->new; + bless ($self, $class); + return $self; + } + + sub scan + { + my $self = shift; + $self->{file}->binmode(); + $self->{file}->seek(0, 0); + $self->{cf}->reset; + my $time = 0; + + while (defined(my $chunk = DBX::Chunk->new($self->{file}))) + { + if ($chunk) + { + $self->{cf}->add($chunk); + if (time - $time > 1) + { + $self->printchunkstats; + $time = time; + } + } + } + $self->printchunkstats; + print STDERR "\n"; + } + + sub chains + { + my $self = shift; + my $chainids = $self->{cf}->find; + my $time = 0; + my $tot = scalar @{$chainids}; + my $idx = 0; + for (@{$chainids}) + { + my $chains = $self->{cf}->walk($_); + $idx++; + for(@{$chains}) + { + my $message = $_->data; + $message =~ s/^>*From />$&/m; # mbox "From " escaping + # we don't have the original envelope address, so just output a fake one + print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n"; + print $message; + print "\r\n\r\n"; + } + if (time - $time > 1) + { + $self->printchainstats(int($idx/$tot*100)); + $time = time; + } + } + $self->printchainstats(100); + } + + sub printchunkstats + { + my $self = shift; + my $stats = $self->{cf}->stats; + print STDERR "Chunks found: ".$stats->{chunkstot}."; "; + print STDERR "Duplicates: ".$stats->{chunksdupl}."; "; + print STDERR "Duplicate IDs: ".$stats->{chunkidsdupl}."\r"; + } + + sub printchainstats + { + my $self = shift; + my $percdone = shift; + my $stats = $self->{cf}->stats; + print STDERR "Chains found: ".$stats->{chainstot}."; "; + print STDERR "Broken: ".$stats->{chainsbroken}."; "; + print STDERR "Loops: ".$stats->{chainloops}."; "; + print STDERR "Dropped: ".$stats->{chainsign}."; "; + print STDERR $percdone."% done\r"; + } +} + +use IO::File; + +(my $dbxname=$ARGV[0]) and ((scalar @ARGV) == 1) or print(STDERR "Usage: $0 input.dbx >output.mbox\n"),exit; +print STDERR "Opening $dbxname for reading...\n"; +my $dbxfile = IO::File->new($dbxname, "r") or die("Error opening file!\n"); +my $dbxscanner = DBX::Scan->new($dbxfile); +print STDERR "Searching for chunks...\n"; +$dbxscanner->scan; +print STDERR "Searching for chains and printing...\n"; +$dbxscanner->chains; +print STDERR "\n"; -- cgit v1.2.3