- memory usage optimizations

pp [2006-04-27 18:30:13]
- memory usage optimizations
- live statistics while searching for chains


git-svn-id: https://siedziba.pl:790/svn/repos/dbxrecover@232 455248ca-bdda-0310-9134-f4ebb693071a
Filename
dbxrecover
diff --git a/dbxrecover b/dbxrecover
index 2d703aa..081556a 100755
--- a/dbxrecover
+++ b/dbxrecover
@@ -18,6 +18,7 @@
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

 use strict;
+use warnings;

 {
   package DBX::Chain;
@@ -50,7 +51,7 @@ use strict;
   sub broken
   {
     my $self = shift;
-    return ($self->{chunks}->[$self->{idx}-1]->next != 0);
+    return ($self->{chunks}->[$self->{idx}-1]->next ne "\x00\x00\x00\x00");
   }
 }

@@ -78,6 +79,7 @@ use strict;
       'chainstot'    => 0,
       'chainloops'   => 0,
       'chainsign'    => 0,
+      'chainsbroken' => 0,
     };
   }

@@ -88,78 +90,88 @@ use strict;
     my $id = $chunk->id;
     my $next = $chunk->next;
     $self->{stats}->{chunkstot}++;
-    if (!defined($self->{chunks}->{$id}))
+    my $chunks = $self->{chunks}->{$id};
+    if (defined($chunks))
     {
-      $self->{chunks}->{$id} = [];
-    }
-    else
-    {
-      for (@{$self->{chunks}->{$id}})
+      my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks);
+      for (@chunks)
       {
         $self->{stats}->{chunksdupl}++,return if (!$chunk->differ($_)); # don't add chunk if there is a duplicate present
       }
+      $self->{stats}->{chunkidsdupl}++;
+      push @chunks, $chunk;
+      $self->{chunks}->{$id} = \@chunks;
+    }
+    else
+    {
+      $self->{chunks}->{$id} = $chunk;
     }
-    push @{$self->{chunks}->{$id}}, $chunk;

     # we store if there is any other chunk pointing to this one
     if (!defined($self->{first}->{$id})) { $self->{first}->{$id} = 1 };
     $self->{first}->{$next} = 0;
-
-    # adding a new chunk requires rescanning for chains
-    $self->{chains} = undef;
   }

   sub find
   {
     my $self = shift;
-    $self->{chains} = [];
-
-    for(keys %{$self->{first}})
+    my $first = $self->{first};
+    my @ids;
+    while (my ($key, $value) = each %$first)
     {
-      if ($self->{first}->{$_} == 1)
-      {
-        # if there is no other chunk pointing to this one, start following it
-        $self->walk($_);
-      }
+      push @ids,$key if $value == 1;
     }
-    return $self->{chains};
+    return \@ids;
   }

-  sub walk
+  sub walk_int
   {
+    no warnings qw(recursion);
+
     my $self = shift;
     my $id = shift;
     my $chain = shift;
     my $seen = shift;
     my $combinations = shift;
     $self->{stats}->{chainsign}++,return if ($combinations > 16); # we don't extract the message if there are too many branches
-    if (!defined($chain)) { $chain = [] };
-    if (!defined($seen)) { $seen = {} };
-    if (!defined($combinations)) { $combinations = 1 };
     $seen->{$id} = 1; # note chunk identifiers for infinite loop testing
-    for(@{$self->{chunks}->{$id}})
+    my $chunks = $self->{chunks}->{$id};
+    my @chunks = (ref($chunks) eq "ARRAY") ? @{$chunks} : ($chunks);
+    for(@chunks)
     {
       my $chunk = $_;
       push @{$chain}, $chunk;
       if (defined($self->{chunks}->{$chunk->next}) && !$seen->{$chunk->next})
       {
-        $self->walk($chunk->next, $chain, $seen, $combinations * scalar @{$self->{chunks}->{$chunk->next}});
+        my $nextchunks = $self->{chunks}->{$chunk->next};
+        my $nextcount = (ref($nextchunks) eq "ARRAY") ? scalar @{$nextchunks} : 1;
+        $self->walk_int($chunk->next, $chain, $seen, $combinations * $nextcount);
       }
       else
       {
         $self->{stats}->{chainloops}++ if $seen->{$chunk->next};
-        push @{$self->{chains}}, DBX::Chain->new($chain);
+        my $chainobj = DBX::Chain->new($chain);
+        $self->{stats}->{chainsbroken}++ if $chainobj->broken;
+        $self->{stats}->{chainstot}++;
+        push @{$self->{chains}}, $chainobj;
       }
       pop @{$chain};
     }
     $seen->{$id} = 0;
   }

+  sub walk
+  {
+    my $self = shift;
+    my $id = shift;
+    $self->{chains} = [];
+    $self->walk_int($id, [], {}, 1);
+    return $self->{chains};
+  }
+
   sub stats
   {
     my $self = shift;
-    $self->{stats}->{chunkidsdupl} = $self->{stats}->{chunkstot} - scalar keys %{$self->{chunks}};
-    $self->{stats}->{chainstot} = scalar @{$self->{chains}} if ($self->{chains});
     return $self->{stats};
   }
 }
@@ -172,9 +184,9 @@ use strict;
   {
     my $class = shift;
     my $self = {};
-    $self->{file} = shift;
-    return if ($self->{file}->read($self->{header}, 16) != 16);
-    my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $self->{header});
+    $self->{f} = shift;
+    return if ($self->{f}->read($self->{h}, 16) != 16);
+    my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $self->{h});

     # test for a valid header
     # chunk size is always 512 bytes in all dbx files I have seen
@@ -187,7 +199,7 @@ use strict;
           (($next == 0) || ($datasize == $chunksize))
          )
     {
-      $self->{filepos} = $self->{file}->getpos;
+      $self->{p} = $self->{f}->getpos;

       # if the header seems valid, we skip the whole chunk

@@ -196,7 +208,7 @@ use strict;

       # it also helps in cases when there are dbx files contained inside dbx

-      $self->{file}->seek($chunksize, 1);
+      $self->{f}->seek($chunksize, 1);
       bless ($self, $class);
       return $self;
     }
@@ -206,7 +218,7 @@ use strict;
       # skip 4 bytes and try again
       # headers were always at 4 byte boundary in every dbx file I have seen

-      $self->{file}->seek(-12, 1);
+      $self->{f}->seek(-12, 1);
       return 0;
     }
   }
@@ -222,7 +234,7 @@ use strict;
   {
     my $self = shift;
     my $other = shift;
-    return 1 if (($self->{header}) ne ($other->{header}));
+    return 1 if (($self->{h}) ne ($other->{h}));
     # use a hash for comparision, to avoid storing message data, or retrieving message data on each comparision
     return 1 if ($self->datahash ne $other->datahash);
     return 0;
@@ -231,25 +243,25 @@ use strict;
   sub data
   {
     my $self = shift;
-    my $datasize = unpack("V", substr($self->{header}, 8, 4));
+    my $datasize = unpack("V", substr($self->{h}, 8, 4));
     my $data;
-    my $pos = $self->{file}->getpos;
-    $self->{file}->setpos($self->{filepos});
-    $self->{file}->read($data, $datasize);
-    $self->{file}->setpos($pos);
+    my $pos = $self->{f}->getpos;
+    $self->{f}->setpos($self->{p});
+    $self->{f}->read($data, $datasize);
+    $self->{f}->setpos($pos);
     return $data;
   }

   sub id
   {
     my $self = shift;
-    return scalar unpack("V", $self->{header});
+    return substr($self->{h}, 0, 4);
   }

   sub next
   {
     my $self = shift;
-    return scalar unpack("V", substr($self->{header}, 12, 4));
+    return substr($self->{h}, 12, 4);
   }

 }
@@ -273,15 +285,18 @@ use strict;
     $self->{file}->binmode();
     $self->{file}->seek(0, 0);
     $self->{cf}->reset;
-    my $time;
+    my $time = 0;

     while (defined(my $chunk = DBX::Chunk->new($self->{file})))
     {
-      $self->{cf}->add($chunk) if ($chunk);
-      if (time - $time > 1)
+      if ($chunk)
       {
-        $self->printchunkstats;
-        $time = time;
+        $self->{cf}->add($chunk);
+        if (time - $time > 1)
+        {
+          $self->printchunkstats;
+          $time = time;
+        }
       }
     }
     $self->printchunkstats;
@@ -291,9 +306,30 @@ use strict;
   sub chains
   {
     my $self = shift;
-    my $chains = $self->{cf}->find;
-    $self->printchainstats;
-    return $chains;
+    my $chainids = $self->{cf}->find;
+    my $time = 0;
+    my $tot = scalar @{$chainids};
+    my $idx = 0;
+    for (@{$chainids})
+    {
+      my $chains = $self->{cf}->walk($_);
+      $idx++;
+      for(@{$chains})
+      {
+        my $message = $_->data;
+        $message =~ s/^>*From />$&/m;		# mbox "From " escaping
+        # we don't have the original envelope address, so just output a fake one
+        print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n";
+        print $message;
+        print "\r\n\r\n";
+      }
+      if (time - $time > 1)
+      {
+        $self->printchainstats(int($idx/$tot*100));
+        $time = time;
+      }
+    }
+    $self->printchainstats(100);
   }

   sub printchunkstats
@@ -308,10 +344,13 @@ use strict;
   sub printchainstats
   {
     my $self = shift;
+    my $percdone = shift;
     my $stats = $self->{cf}->stats;
-    print STDERR "Chains found: ".$stats->{chainstot}."\n";
-    print STDERR "Chain loops found: ".$stats->{chainloops}."\n";
-    print STDERR "Chains ignored due to duplicate chunk ids: ".$stats->{chainsign}."\n";
+    print STDERR "Chains found: ".$stats->{chainstot}."; ";
+    print STDERR "Broken: ".$stats->{chainsbroken}."; ";
+    print STDERR "Loops: ".$stats->{chainloops}."; ";
+    print STDERR "Dropped: ".$stats->{chainsign}."; ";
+    print STDERR $percdone."% done\r";
   }
 }

@@ -323,15 +362,6 @@ my $dbxfile = IO::File->new($dbxname, "r") or die("Error opening file!\n");
 my $dbxscanner = DBX::Scan->new($dbxfile);
 print STDERR "Searching for chunks...\n";
 $dbxscanner->scan;
-print STDERR "Reconstructing chains...\n";
-my $chains = $dbxscanner->chains;
-print STDERR "Printing...\n";
-for(@{$chains})
-{
-  my $message = $_->data;
-  $message =~ s/^>*From />$&/m;		# mbox "From " escaping
-  # we don't have the original envelope address, so just output a fake one
-  print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n";
-  print $message;
-  print "\r\n\r\n";
-}
+print STDERR "Searching for chains and printing...\n";
+$dbxscanner->chains;
+print STDERR "\n";
ViewGit