Last commit for dbxrecover-1p: 2ce5dc7d37a6437476d43a1d7b9f2b326ef71610

- bugfix: "From" escaping regexp was not global

pp [2007-02-08 04:08:56]
- bugfix: "From" escaping regexp was not global


git-svn-id: https://siedziba.pl:790/svn/repos/dbxrecover@271 455248ca-bdda-0310-9134-f4ebb693071a
#!/usr/bin/perl

# dbxrecover, a program for recovering mail from damaged Outlook dbx files.
# Copyright (C) 2005  Piotr Pawlow <pp@siedziba.pl>

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

use strict;
use warnings;

{
  package DBX::ChunkList;

  sub new
  {
    my $class = shift;
    my $self = {};
    bless ($self, $class);
    $self->{list} = [ @_ ];
    return $self;
  }

  sub join
  {
    my $self = shift;
    my $data = '';
    $self->join_int(\$data);
    return $data;
  }

  sub join_int
  {
    no warnings qw(recursion);

    my $self = shift;
    my $dataref = shift;
    my $list = $self->{list};
    for my $elem (@{$list})
    {
      if (ref($elem))
      {
        $elem->join_int($dataref)
      }
      else
      {
        $$dataref .= $elem;
      }
    }
  }

  sub first
  {
    my $self = shift;
    my $list = $self->{list};
    my $elem = ${$list}[0];
    $elem = $elem->first() if ref($elem);
    return $elem;
  }

  sub push
  {
    my $self = shift;
    my $a1 = $self->{list};
    my $a2 = shift;
    my $a2_list = $a2->{list};
    if (scalar @{$a2_list} < 32) # FIXME: Arbitrary value choosen without much care. Probably not optimal.
    {
      push @{$a1}, @{$a2_list};
    }
    else
    {
      push @{$a1}, $a2;
    }
  }
}

{
  package DBX::ChainFinder;
  use BerkeleyDB;
  use File::Temp;

  sub new
  {
    my $class = shift;
    my $self = {};
    bless ($self, $class);
    $self->reset;
    return $self;
  }

  sub reset
  {
    my $self = shift;

    $self->{id} = {};
    $self->{next} = {};
    $self->{stats} =
    { 'chunkstot'    => 0,
      'chainfrag'    => 0,
      'chainscompl'   => 0,
    };
  }

  # FIXME: this function is too convoluted
  sub combine
  {
    my $self = shift;
    my $hash = shift;
    my $otherhash = shift;
    my $key = shift;
    my $data = shift;
    my $idx = shift;

    while (defined($hash->{$key}))
    {
      my @chunkkeys = sort keys %{$hash->{$key}};
      my $chunkkey = pop @chunkkeys;
      my $chunkref = $hash->{$key}->{$chunkkey};
      my $chunkid = $chunkref->{head}->[$idx];
      delete $hash->{$key}->{$chunkkey};
      delete $hash->{$key} if (!scalar %{$hash->{$key}});
      delete $otherhash->{$chunkid}->{$chunkkey};
      delete $otherhash->{$chunkid} if (!scalar %{$otherhash->{$chunkid}});
      if ($idx == 1)
      {
        $data->push($chunkref);
      }
      else
      {
        $chunkref->push($data);
        $data = $chunkref;
      }
      $key = $chunkid;
      $self->{stats}->{chainfrag}--;
    }
    return ($key, $data);
  }

  sub add
  {
    my $self = shift;
    my $id = shift;
    my $next = shift;
    my $data = new DBX::ChunkList(shift);
    my $chunkid = $self->{stats}->{chunkstot}++;
    my $idhash = $self->{id};
    my $nexthash = $self->{next};

    ($next, $data) = $self->combine($idhash, $nexthash, $next, $data, 1);
    ($id, $data) = $self->combine($nexthash, $idhash, $id, $data, 0);

    # TODO: better test for first chunk. Maybe a grace period to keep seemingly complete messages, in case of false positives.
    $self->{stats}->{chainscompl}++,return($data->join()) if (($next == 0) && ($data->first() =~ /^(From|From:|Return-Path:|Received:) /));
    $idhash->{$id} = {} if (!defined($idhash->{$id}));
    $nexthash->{$next} = {} if (!defined($nexthash->{$next}));
    $data->{head} = [$id, $next];
    $idhash->{$id}->{$chunkid} = $data;
    $nexthash->{$next}->{$chunkid} = $data;
    $self->{stats}->{chainfrag}++;
    return undef;
  }

  sub remaining
  {
    my $self = shift;
    return undef if (!scalar %{$self->{id}});
    while (1)
    {
      (my $id, my $list) = each %{$self->{id}} or next;
      (my $cid, my $chain) = each %{$list} or next;
      my $data = $chain->join();
      delete $self->{id}->{$id}->{$cid};
      delete $self->{id}->{$id} if (!scalar %{$self->{id}->{$id}});
      return $data;
    }
  }

  sub stats
  {
    my $self = shift;
    return $self->{stats};
  }
}

{
  package DBX::Scan;
  sub new
  {
    my $class = shift;
    my $self = {};
    $self->{file} = shift;
    $self->{cf} = DBX::ChainFinder->new;
    bless ($self, $class);
    return $self;
  }

  sub scan
  {
    my $self = shift;
    my $file = $self->{file};
    $file->binmode();
    $self->{cf}->reset;
    my $time = 0;
    my $tmp;
    my $buffer = '';
    my $bufsize = 64*1024;

    while ($file->read($tmp, $bufsize))
    {
      $buffer .= $tmp;
      my $buflen = length $buffer;
      my $idx = 0;
      my $header;
      while (($buflen - $idx) >= 16)
      {
        $header = substr($buffer, $idx, 16);
        my ($id, $chunksize, $datasize, $next) = unpack("VVVV", $header);

        # test for a valid header
        # chunk size is always 512 bytes in all dbx files I have seen
        if (
             ($chunksize == 512) &&
             ($datasize <= $chunksize) &&
             ($datasize > 0) &&
             ($id != 0) &&
             (($next == 0) || ($datasize == $chunksize))
           )
        {
          last if ($buflen - $idx - 16 < $chunksize);
          my $data = substr($buffer, $idx+16, $datasize);

          # if the header seems valid, we skip the whole chunk

          # the chance we miss a start of another chunk is low, because
          # the test above is pretty strict and false positives are quite rare

          # it also helps in cases when there are dbx files contained inside dbx

          $idx += $chunksize + 16;

          my $message = $self->{cf}->add($id, $next, $data);
          $self->printmsg($message);

          if (time - $time > 1)
          {
            $self->printstats;
            $time = time;
          }
        }
        else
        {
          # skip 4 bytes and try again
          # headers were always at 4 byte boundary in every dbx file I have seen

          $idx += 4;
        }
      }
      $buffer = substr($buffer, $idx);
    }
    $self->printstats;
    print STDERR "\nPrinting remaining chains...";
    my $message;
    do {
      $message = $self->{cf}->remaining;
      $self->printmsg($message);
    } while defined($message);
    print STDERR "\n";
  }

  sub printmsg
  {
    my $self = shift;
    my $message = shift;
    if (defined($message))
    {
      $message =~ s/^>*From />$&/mg;		# mbox "From " escaping
      # we don't have the original envelope address, so just output a fake one
      print "From unknown\@unknown.invalid Mon Jan 1 00:00:00 1970\r\n";
      print $message;
      print "\r\n\r\n";
    }
  }

  sub printstats
  {
    my $self = shift;
    my $stats = $self->{cf}->stats;
    print STDERR "Chunks found: ".$stats->{chunkstot}."; ";
    print STDERR "Chain fragments: ".$stats->{chainfrag}."; ";
    print STDERR "Chains completed: ".$stats->{chainscompl}."\r";
  }
}

use IO::File;

my $dbxname=($ARGV[0] || "-");
print STDERR "Opening $dbxname for reading...\n";
my $dbxfile = IO::File->new("<$dbxname") or die("Error opening file!\n");
my $dbxscanner = DBX::Scan->new($dbxfile);
print STDERR "Searching for messages...\n";
$dbxscanner->scan;
print STDERR "\n";
ViewGit