2004. március 29., hétfő

Search for duplicate files on large drives


Problem/Question/Abstract:

I have a list of items that I will need to deal with (looking for duplicate files on a drive) that can be upto 2 to 5 million in size. So I will be populating this list and then searching it over and over and over. Normally when dealing with smaller lists like this I would simply use a TStringList and attach an object. However, this seems a little large for TStringList and the main reason, the searching with IndexOf, I don't think is reason enough to use it. So what I am looking for is a list of some sort (TObjectList ?) that is fast and good to deal with and can easily handle this size of entries. It would be really nice it there was a way to create multiple indexes into the in-memory list, as that would greatly speed up my processing of the information. The best solution would be an in-memory database of some sort (at least I think it would be), but my issue with in-memory databases is that of the String sizes for File Name and directory. If I use a regular String variable in an object, then the size can be variable. If I use a standard DB field, then the size in the ones I have seen are all static. So I have to define a huge field to handle all file names which wastes space on all other entries in the list. Any thoughts as the best container to handle this sort of thing?

Answer:

{$A+,B-,D-,E+,F-,G+,I-,L-,N-,O-,P-,Q-,R-,S-,T-,V-,X-,Y-}

program FindDup;

{$APPTYPE CONSOLE} // 2002-12-16 - Delphi 32 console app

{Usage: FINDDUP [D:]

This program finds all duplicate files on the specified drive. The algorithm uses a hash table where each hash value contains a linked list of files that match a given hash. Every file found is inserted into the hash table linked list unless it's already there and therefore a duplicate. All duplicates are added to a red-black tree, where each node contains the file ame and a pointer to a linked list of records containing size, date, and path information.

This program can be compiled in either real or protected modes. Protected mode will allow it to search larger drives. Informal benchmarks on a 1.6 Gig drive with > 900 directories, Pentium 90, compiled for p-mode:

TEST 16 sec. (simple do-nothing recursive dir searcher)
FINDDUP 18 sec. - 16 = 2 sec.
REP-DPMI 36 sec. - 16 = 20 sec.

Subtracting the overhead of FindFirst/ FindNext, this program runs in about 2 seconds as compared to REP-DPMI's 20 seconds. Without ignoring the overhead, it's still twice as fast. Please note that this program is pure pascal.}

{$DEFINE DRBOB} // Do not search hidden directories or find hidden files
{.$DEFINE SAFE}// Checks memory allocations for out of memory conditions
{.$DEFINE CLEANUP}// Frees allocated memory - slows things down a bit

uses
  SysUtils;

const
  DosDelimSet: set of Char = ['\', ':', #0];
  MaxHash = 16381; {largest prime number < 16K}

type
  PathStr = string;
  NameString = string[12];
  St2 = string[2];

  pPath = ^PathRecord; {we will only keep one copy of each unique directory path}
  PathRecord = record
    Next: pPath;
    Path: PathStr;
  end;

  pDataRec = ^DataRec; {detailed information unique to each dupliacte}
  DataRec = record
    Time: longint;
    Size: longint;
    Path: pPath;
    Next: pDataRec;
  end;

  link = ^RBTreeNode;
  RBTreeNode = record {Red/ black tree node}
    Key: NameString; {Name of file}
    red: boolean;
    l, r: link;
    DataP: pDataRec; {linked list of detail information}
  end;

  pFileRec = ^MyFileRec;
  MyFileRec = record
    {Hash table record. There will only be one record for each duplicate file name}
    Name: NameString;
    Time: longint;
    Size: longint;
    Path: pPath;
    Node: link; {let's quickly insert duplicate into tree}
    next: pFileRec; {next record in linked list}
  end;

  tHashTable = array[0..MAXHASH] of pFileRec;

var
  Head, z: link;
  HashTable: ^tHashTable;
  PathHead, TempPathHead: pPath;
  OldName: NameString;

procedure RBTreeInitialize;
{Initialize red/ black tree}
begin
  New(Z);
{$IFDEF SAFE}
  if Z = nil then
    exit;
{$ENDIF}
  z^.l := z;
  z^.r := z;
  z^.red := false;
  New(Head);
{$IFDEF SAFE}
  if Head = nil then
    exit;
{$ENDIF}
  Head^.r := z;
  Head^.l := z;
  Head^.Key := '';
  Head^.Red := false;
end;

function Rotate(const Value: MyFileRec; y: link): link;
var
  c, gc: link;
begin
  if Value.Name < Y^.Key then
    c := y^.l
  else
    c := y^.r;
  if Value.Name < c^.Key then
  begin
    gc := c^.l;
    c^.l := gc^.r;
    gc^.r := c;
  end
  else
  begin
    gc := c^.r;
    c^.r := gc^.l;
    gc^.l := c;
  end;
  if Value.Name < Y^.Key then
    y^.l := gc
  else
    y^.r := gc;
  Rotate := gc;
end;

function Split(const Value: MyFileRec; gg, g, p, x: link): link;
begin
  x^.red := true;
  x^.l^.red := false;
  x^.r^.red := false;
  if p^.red then
  begin
    g^.red := true;
    if (Value.Name < g^.Key) <> (Value.Name < p^.Key) then
      p := Rotate(Value, g);
    x := rotate(Value, gg);
    x^.red := false;
  end;
  Head^.r^.red := false;
  split := x;
end;

function RBTreeInsert(const Value: MyFileRec; x: link): link;
{Insert file record into red/ black tree}
var
  gg, g, p: link;
begin
  p := x;
  g := x;
  repeat
    gg := g;
    g := p;
    p := x;
    if Value.Name < x^.Key then
      x := x^.l
    else
      x := x^.r;
    if x^.l^.red and x^.r^.red then
      x := split(Value, gg, g, p, x);
  until
    x = z;
  new(x);
{$IFDEF SAFE}
  if x = nil then
    exit;
{$ENDIF}
  x^.Key := Value.Name;
  New(x^.DataP);
{$IFDEF SAFE}
  if x^.DataP = nil then
    exit;
{$ENDIF}
  x^.DataP^.Next := nil;
  x^.DataP^.Time := Value.Time;
  x^.DataP^.Size := Value.Size;
  x^.DataP^.Path := Value.Path;
  x^.l := z;
  x^.r := z;
  if Value.Name < p^.Key then
    p^.l := x
  else
    p^.r := x;
  RbTreeInsert := x;
  x := Split(Value, gg, g, p, x);
end;

procedure Traverse(p: link);
{Traverse red/ black tree, printing out results}
var
  TempQ, q: pDataRec;
begin
  if (p^.l <> z) and (p^.l <> nil) then
    Traverse(p^.l);
  if (p <> head) then
  begin
    if p^.Key <> OldName then
    begin
      OldName := p^.Key;
      writeln(OldName);
    end;
    q := p^.DataP;
    while q <> nil do
    begin
      with q^ do
        writeln(size: 10, '   ', FormatDateTime('yyyy-mm-dd hh:nn:ss',
          FileDateToDateTime(Time)), '   ', Path^.Path);
{$IFDEF CLEANUP}
      TempQ := q;
      q := q^.Next;
      Dispose(TempQ);
{$ELSE}
      q := q^.Next;
{$ENDIF}
    end;
    writeln;
  end;
  if (p^.r <> z) and (p^.r <> nil) then
    Traverse(p^.r);
{$IFDEF CLEANUP}
  Dispose(p);
{$ENDIF}
end;

function AddBackSlash(const DirName: string): string;
{Add a default backslash to a directory name}
begin
  if DirName[Length(DirName)] in DosDelimSet then
    AddBackSlash := DirName
  else
    AddBackSlash := DirName + '\';
end;

function Hash(const Key: NameString): word;
var
  h: word;
  j: integer;
  Len: integer;
begin
  Len := Length(Key);
  h := ord(Key[1]);
  for j := 2 to Len do
  begin
    h := ((h * 32) + Ord(Key[j])) mod MAXHASH;
  end;
  Hash := h;
end;

procedure Add(var SR: tSearchRec; DirP: pPath);
{Add a new search record/ path to the hash table}
var
  p, q, r: pFileRec;
  h: word;
  TempData: pDataRec;
begin
  h := Hash(SR.Name);
  New(r);
{$IFDEF SAFE}
  if r = nil then
    exit;
{$ENDIF}
  r^.Name := SR.Name;
  r^.Time := SR.Time;
  r^.Size := SR.Size;
  r^.Path := DirP;
  r^.Next := nil;
  r^.Node := nil;
  p := HashTable^[H];
  if p = nil then
  begin {Hash slot not used}
    HashTable^[h] := r;
  end
  else
  begin
    q := p;
    while (p <> nil) and (p^.Name < SR.Name) do
    begin
      q := p;
      p := p^.Next;
    end;
    if (p <> nil) and (p^.Name = SR.Name) then
    begin {Found duplicate file}
      if p^.Node = nil then
      begin {was not already in tree}
        p^.Node := RBTreeInsert(p^, Head);
          {save link so we don't have to search tree next time}
      end;
      New(TempData);
{$IFDEF SAFE}
      if TempData = nil then
        exit;
{$ENDIF}
      TempData^.Time := Sr.Time;
      TempData^.Size := Sr.Size;
      TempData^.Path := DirP;
      TempData^.Next := p^.Node^.DataP; {Add to linked list on tree node}
      p^.Node^.DataP := TempData;
      Dispose(r); {didn't need it after all}
    end
    else
    begin {Not a duplicate}
      if p = q then
      begin {Add at start of linked list}
        HashTable^[H] := r;
        r^.Next := P;
      end
      else
      begin {Insert into linked list}
        q^.Next := r;
        r^.Next := p;
      end;
    end;
  end;
end;

procedure Find(const Path: PathStr);
{Recursive file/directory searcher}
var
  Sr: tSearchRec;
  DirP: pPath;
  r: integer;
begin
  New(DirP);
{$IFDEF SAFE}
  if DirP = nil then
    exit;
{$ENDIF}
  DirP^.Path := Path;
  DirP^.Next := PathHead;
  PathHead := DirP;
  r := FindFirst(AddBackSlash(Path) + '*.*', faAnyFile, Sr);
  while r = 0 do
  begin
{$IFDEF DRBOB} {only do non-hidden directories}
    if ((Sr.Attr and faDirectory) <> 0) and ((Sr.Attr and faHidden) = 0) then
    begin
{$ELSE} {do them all}
    if (Sr.Attr and Directory) <> 0 then
    begin
{$ENDIF}
      if Sr.Name[1] <> '.' then
        Find(AddBackSlash(Path) + Sr.Name);
      r := 0;
    end
    else
    begin
{$IFDEF DRBOB}
      if (Sr.Attr and faHidden) = 0 then {Only do non-hidden files}
{$ENDIF}
        Add(Sr, DirP);
    end;
    r := FindNext(Sr);
  end;
end;

function HeapFunc(Size: Word): Integer; far;
begin
  HeapFunc := 1;
end;

procedure Init;
begin
  OldName := '';
  PathHead := nil;
{$IFDEF SAFE}
  HeapError := @HeapFunc;
{$ENDIF}
  New(HashTable);
{$IFDEF SAFE}
  if HashTable = nil then
    halt;
{$ENDIF}
  FillChar(HashTable^, sizeof(HashTable^), 0);
  RBTreeInitialize;
end;

procedure Process;
begin
  Find(ParamStr(1) + '\');
  Traverse(Head);
end;

procedure Done;
var
  i: integer;
  q, tempq: pFileRec;
begin
{$IFDEF CLEANUP}
  Dispose(Z);
  for i := 0 to MAXHASH - 1 do
  begin
    if HashTable^[i] <> nil then
    begin
      q := HashTable^[i];
      while q <> nil do
      begin
        tempq := q^.next;
        Dispose(q);
        q := tempq;
      end;
    end;
  end;
  Dispose(HashTable);
  TempPathHead := PathHead;
  while PathHead <> nil do
  begin
    TempPathHead := PathHead^.Next;
    FreeMem(PathHead, Length(PathHead^.Path) + 5);
    PathHead := TempPathHead;
  end;
{$ENDIF}
end;

begin
  Init;
  Process;
  Done;
end.

Nincsenek megjegyzések:

Megjegyzés küldése