From 37f8f895cd6f67687bbaa80e96469bd4363159fb Mon Sep 17 00:00:00 2001 From: pp Date: Thu, 27 Apr 2006 18:30:13 +0000 Subject: - memory usage optimizations - live statistics while searching for chains git-svn-id: https://siedziba.pl:790/svn/repos/dbxrecover@232 455248ca-bdda-0310-9134-f4ebb693071a --- dbxrecover | 162 ++++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 96 insertions(+), 66 deletions(-) diff --git a/dbxrecover b/dbxrecover index 2d703aa..081556a 100755 --- a/dbxrecover +++ b/dbxrecover @@ -18,6 +18,7 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. use strict; +use warnings; { package DBX::Chain; @@ -50,7 +51,7 @@ use strict; sub broken { my $self = shift; - return ($self->{chunks}->[$self->{idx}-1]->next != 0); + return ($self->{chunks}->[$self->{idx}-1]->next ne "\x00\x00\x00\x00"); } } @@ -78,6 +79,7 @@ use strict; 'chainstot' => 0, 'chainloops' => 0, 'chainsign' => 0, + 'chainsbroken' => 0, }; } @@ -88,78 +90,88 @@ use strict; my $id = $chunk->id; my $next = $chunk->next; $self->{stats}->{chunkstot}++; - if (!defined($self->{chunks}->{$id})) + my $chunks = $self->{chunks}->{$id}; + if (defined($chunks)) { - $self->{chunks}->{$id} = []; - } - else - { - for (@{$self->{chunks}->{$id}}) + my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks); + for (@chunks) { $self->{stats}->{chunksdupl}++,return if (!$chunk->differ($_)); # don't add chunk if there is a duplicate present } + $self->{stats}->{chunkidsdupl}++; + push @chunks, $chunk; + $self->{chunks}->{$id} = \@chunks; + } + else + { + $self->{chunks}->{$id} = $chunk; } - push @{$self->{chunks}->{$id}}, $chunk; # we store if there is any other chunk pointing to this one if (!defined($self->{first}->{$id})) { $self->{first}->{$id} = 1 }; $self->{first}->{$next} = 0; - - # adding a new chunk requires rescanning for chains - $self->{chains} = undef; } sub find { my $self = shift; - $self->{chains} = []; - - for(keys %{$self->{first}}) + my $first = $self->{first}; + my @ids; + while (my ($key, $value) = each %$first) { - if ($self->{first}->{$_} == 1) - { - # if there is no other chunk pointing to this one, start following it - $self->walk($_); - } + push @ids,$key if $value == 1; } - return $self->{chains}; + return \@ids; } - sub walk + sub walk_int { + no warnings qw(recursion); + my $self = shift; my $id = shift; my $chain = shift; my $seen = shift; my $combinations = shift; $self->{stats}->{chainsign}++,return if ($combinations > 16); # we don't extract the message if there are too many branches - if (!defined($chain)) { $chain = [] }; - if (!defined($seen)) { $seen = {} }; - if (!defined($combinations)) { $combinations = 1 }; $seen->{$id} = 1; # note chunk identifiers for infinite loop testing - for(@{$self->{chunks}->{$id}}) + my $chunks = $self->{chunks}->{$id}; + my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks); + for(@chunks) { my $chunk = $_; push @{$chain}, $chunk; if (defined($self->{chunks}->{$chunk->next}) && !$seen->{$chunk->next}) { - $self->walk($chunk->next, $chain, $seen, $combinations * scalar @{$self->{chunks}->{$chunk->next}}); + my $nextchunks = $self->{chunks}->{$chunk->next}; + my $nextcount = (ref($nextchunks) eq "ARRAY") ? scalar @{$nextchunks} : 1; + $self->walk_int($chunk->next, $chain, $seen, $combinations * $nextcount); } else { $self->{stats}->{chainloops}++ if $seen->{$chunk->next}; - push @{$self->{chains}}, DBX::Chain->new($chain); + my $chainobj = DBX::Chain->new($chain); + $self->{stats}->{chainsbroken}++ if $chainobj->broken; + $self->{stats}->{chainstot}++; + push @{$self->{chains}}, $chainobj; } pop @{$chain}; } $seen->{$id} = 0; } + sub walk + { + my $self = shift; + my $id = shift; + $self->{chains} = []; + $self->walk_int($id, [], {}, 1); + return $self->{chains}; + } + sub stats { my $self = shift; - $self->{stats}->{chunkidsdupl} = $self->{stats}->{chunkstot} - scalar keys %{$self->{chunks}}; - $self->{stats}->{chainstot} = scalar @{$self->{chains}} if ($self->{chains}); return $self->{stats}; } } @@ -172,9 +184,9 @@ use strict; { my $class = shift; my $self = {}; - $self->{file} = shift; - return if ($self->{file}->read($self->{header}, 16) != 16); - my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $self->{header}); + $self->{f} = shift; + return if ($self->{f}->read($self->{h}, 16) != 16); + my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $self->{h}); # test for a valid header # chunk size is always 512 bytes in all dbx files I have seen @@ -187,7 +199,7 @@ use strict; (($next == 0) || ($datasize == $chunksize)) ) { - $self->{filepos} = $self->{file}->getpos; + $self->{p} = $self->{f}->getpos; # if the header seems valid, we skip the whole chunk @@ -196,7 +208,7 @@ use strict; # it also helps in cases when there are dbx files contained inside dbx - $self->{file}->seek($chunksize, 1); + $self->{f}->seek($chunksize, 1); bless ($self, $class); return $self; } @@ -206,7 +218,7 @@ use strict; # skip 4 bytes and try again # headers were always at 4 byte boundary in every dbx file I have seen - $self->{file}->seek(-12, 1); + $self->{f}->seek(-12, 1); return 0; } } @@ -222,7 +234,7 @@ use strict; { my $self = shift; my $other = shift; - return 1 if (($self->{header}) ne ($other->{header})); + return 1 if (($self->{h}) ne ($other->{h})); # use a hash for comparision, to avoid storing message data, or retrieving message data on each comparision return 1 if ($self->datahash ne $other->datahash); return 0; @@ -231,25 +243,25 @@ use strict; sub data { my $self = shift; - my $datasize = unpack("V", substr($self->{header}, 8, 4)); + my $datasize = unpack("V", substr($self->{h}, 8, 4)); my $data; - my $pos = $self->{file}->getpos; - $self->{file}->setpos($self->{filepos}); - $self->{file}->read($data, $datasize); - $self->{file}->setpos($pos); + my $pos = $self->{f}->getpos; + $self->{f}->setpos($self->{p}); + $self->{f}->read($data, $datasize); + $self->{f}->setpos($pos); return $data; } sub id { my $self = shift; - return scalar unpack("V", $self->{header}); + return substr($self->{h}, 0, 4); } sub next { my $self = shift; - return scalar unpack("V", substr($self->{header}, 12, 4)); + return substr($self->{h}, 12, 4); } } @@ -273,15 +285,18 @@ use strict; $self->{file}->binmode(); $self->{file}->seek(0, 0); $self->{cf}->reset; - my $time; + my $time = 0; while (defined(my $chunk = DBX::Chunk->new($self->{file}))) { - $self->{cf}->add($chunk) if ($chunk); - if (time - $time > 1) + if ($chunk) { - $self->printchunkstats; - $time = time; + $self->{cf}->add($chunk); + if (time - $time > 1) + { + $self->printchunkstats; + $time = time; + } } } $self->printchunkstats; @@ -291,9 +306,30 @@ use strict; sub chains { my $self = shift; - my $chains = $self->{cf}->find; - $self->printchainstats; - return $chains; + my $chainids = $self->{cf}->find; + my $time = 0; + my $tot = scalar @{$chainids}; + my $idx = 0; + for (@{$chainids}) + { + my $chains = $self->{cf}->walk($_); + $idx++; + for(@{$chains}) + { + my $message = $_->data; + $message =~ s/^>*From />$&/m; # mbox "From " escaping + # we don't have the original envelope address, so just output a fake one + print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n"; + print $message; + print "\r\n\r\n"; + } + if (time - $time > 1) + { + $self->printchainstats(int($idx/$tot*100)); + $time = time; + } + } + $self->printchainstats(100); } sub printchunkstats @@ -308,10 +344,13 @@ use strict; sub printchainstats { my $self = shift; + my $percdone = shift; my $stats = $self->{cf}->stats; - print STDERR "Chains found: ".$stats->{chainstot}."\n"; - print STDERR "Chain loops found: ".$stats->{chainloops}."\n"; - print STDERR "Chains ignored due to duplicate chunk ids: ".$stats->{chainsign}."\n"; + print STDERR "Chains found: ".$stats->{chainstot}."; "; + print STDERR "Broken: ".$stats->{chainsbroken}."; "; + print STDERR "Loops: ".$stats->{chainloops}."; "; + print STDERR "Dropped: ".$stats->{chainsign}."; "; + print STDERR $percdone."% done\r"; } } @@ -323,15 +362,6 @@ my $dbxfile = IO::File->new($dbxname, "r") or die("Error opening file!\n"); my $dbxscanner = DBX::Scan->new($dbxfile); print STDERR "Searching for chunks...\n"; $dbxscanner->scan; -print STDERR "Reconstructing chains...\n"; -my $chains = $dbxscanner->chains; -print STDERR "Printing...\n"; -for(@{$chains}) -{ - my $message = $_->data; - $message =~ s/^>*From />$&/m; # mbox "From " escaping - # we don't have the original envelope address, so just output a fake one - print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n"; - print $message; - print "\r\n\r\n"; -} +print STDERR "Searching for chains and printing...\n"; +$dbxscanner->chains; +print STDERR "\n"; -- cgit v1.2.3