2004. március 29., hétfő
Search for duplicate files on large drives
Problem/Question/Abstract:
I have a list of items that I will need to deal with (looking for duplicate files on a drive) that can be upto 2 to 5 million in size. So I will be populating this list and then searching it over and over and over. Normally when dealing with smaller lists like this I would simply use a TStringList and attach an object. However, this seems a little large for TStringList and the main reason, the searching with IndexOf, I don't think is reason enough to use it. So what I am looking for is a list of some sort (TObjectList ?) that is fast and good to deal with and can easily handle this size of entries. It would be really nice it there was a way to create multiple indexes into the in-memory list, as that would greatly speed up my processing of the information. The best solution would be an in-memory database of some sort (at least I think it would be), but my issue with in-memory databases is that of the String sizes for File Name and directory. If I use a regular String variable in an object, then the size can be variable. If I use a standard DB field, then the size in the ones I have seen are all static. So I have to define a huge field to handle all file names which wastes space on all other entries in the list. Any thoughts as the best container to handle this sort of thing?
Answer:
{$A+,B-,D-,E+,F-,G+,I-,L-,N-,O-,P-,Q-,R-,S-,T-,V-,X-,Y-}
program FindDup;
{$APPTYPE CONSOLE} // 2002-12-16 - Delphi 32 console app
{Usage: FINDDUP [D:]
This program finds all duplicate files on the specified drive. The algorithm uses a hash table where each hash value contains a linked list of files that match a given hash. Every file found is inserted into the hash table linked list unless it's already there and therefore a duplicate. All duplicates are added to a red-black tree, where each node contains the file ame and a pointer to a linked list of records containing size, date, and path information.
This program can be compiled in either real or protected modes. Protected mode will allow it to search larger drives. Informal benchmarks on a 1.6 Gig drive with > 900 directories, Pentium 90, compiled for p-mode:
TEST 16 sec. (simple do-nothing recursive dir searcher)
FINDDUP 18 sec. - 16 = 2 sec.
REP-DPMI 36 sec. - 16 = 20 sec.
Subtracting the overhead of FindFirst/ FindNext, this program runs in about 2 seconds as compared to REP-DPMI's 20 seconds. Without ignoring the overhead, it's still twice as fast. Please note that this program is pure pascal.}
{$DEFINE DRBOB} // Do not search hidden directories or find hidden files
{.$DEFINE SAFE}// Checks memory allocations for out of memory conditions
{.$DEFINE CLEANUP}// Frees allocated memory - slows things down a bit
uses
SysUtils;
const
DosDelimSet: set of Char = ['\', ':', #0];
MaxHash = 16381; {largest prime number < 16K}
type
PathStr = string;
NameString = string[12];
St2 = string[2];
pPath = ^PathRecord; {we will only keep one copy of each unique directory path}
PathRecord = record
Next: pPath;
Path: PathStr;
end;
pDataRec = ^DataRec; {detailed information unique to each dupliacte}
DataRec = record
Time: longint;
Size: longint;
Path: pPath;
Next: pDataRec;
end;
link = ^RBTreeNode;
RBTreeNode = record {Red/ black tree node}
Key: NameString; {Name of file}
red: boolean;
l, r: link;
DataP: pDataRec; {linked list of detail information}
end;
pFileRec = ^MyFileRec;
MyFileRec = record
{Hash table record. There will only be one record for each duplicate file name}
Name: NameString;
Time: longint;
Size: longint;
Path: pPath;
Node: link; {let's quickly insert duplicate into tree}
next: pFileRec; {next record in linked list}
end;
tHashTable = array[0..MAXHASH] of pFileRec;
var
Head, z: link;
HashTable: ^tHashTable;
PathHead, TempPathHead: pPath;
OldName: NameString;
procedure RBTreeInitialize;
{Initialize red/ black tree}
begin
New(Z);
{$IFDEF SAFE}
if Z = nil then
exit;
{$ENDIF}
z^.l := z;
z^.r := z;
z^.red := false;
New(Head);
{$IFDEF SAFE}
if Head = nil then
exit;
{$ENDIF}
Head^.r := z;
Head^.l := z;
Head^.Key := '';
Head^.Red := false;
end;
function Rotate(const Value: MyFileRec; y: link): link;
var
c, gc: link;
begin
if Value.Name < Y^.Key then
c := y^.l
else
c := y^.r;
if Value.Name < c^.Key then
begin
gc := c^.l;
c^.l := gc^.r;
gc^.r := c;
end
else
begin
gc := c^.r;
c^.r := gc^.l;
gc^.l := c;
end;
if Value.Name < Y^.Key then
y^.l := gc
else
y^.r := gc;
Rotate := gc;
end;
function Split(const Value: MyFileRec; gg, g, p, x: link): link;
begin
x^.red := true;
x^.l^.red := false;
x^.r^.red := false;
if p^.red then
begin
g^.red := true;
if (Value.Name < g^.Key) <> (Value.Name < p^.Key) then
p := Rotate(Value, g);
x := rotate(Value, gg);
x^.red := false;
end;
Head^.r^.red := false;
split := x;
end;
function RBTreeInsert(const Value: MyFileRec; x: link): link;
{Insert file record into red/ black tree}
var
gg, g, p: link;
begin
p := x;
g := x;
repeat
gg := g;
g := p;
p := x;
if Value.Name < x^.Key then
x := x^.l
else
x := x^.r;
if x^.l^.red and x^.r^.red then
x := split(Value, gg, g, p, x);
until
x = z;
new(x);
{$IFDEF SAFE}
if x = nil then
exit;
{$ENDIF}
x^.Key := Value.Name;
New(x^.DataP);
{$IFDEF SAFE}
if x^.DataP = nil then
exit;
{$ENDIF}
x^.DataP^.Next := nil;
x^.DataP^.Time := Value.Time;
x^.DataP^.Size := Value.Size;
x^.DataP^.Path := Value.Path;
x^.l := z;
x^.r := z;
if Value.Name < p^.Key then
p^.l := x
else
p^.r := x;
RbTreeInsert := x;
x := Split(Value, gg, g, p, x);
end;
procedure Traverse(p: link);
{Traverse red/ black tree, printing out results}
var
TempQ, q: pDataRec;
begin
if (p^.l <> z) and (p^.l <> nil) then
Traverse(p^.l);
if (p <> head) then
begin
if p^.Key <> OldName then
begin
OldName := p^.Key;
writeln(OldName);
end;
q := p^.DataP;
while q <> nil do
begin
with q^ do
writeln(size: 10, ' ', FormatDateTime('yyyy-mm-dd hh:nn:ss',
FileDateToDateTime(Time)), ' ', Path^.Path);
{$IFDEF CLEANUP}
TempQ := q;
q := q^.Next;
Dispose(TempQ);
{$ELSE}
q := q^.Next;
{$ENDIF}
end;
writeln;
end;
if (p^.r <> z) and (p^.r <> nil) then
Traverse(p^.r);
{$IFDEF CLEANUP}
Dispose(p);
{$ENDIF}
end;
function AddBackSlash(const DirName: string): string;
{Add a default backslash to a directory name}
begin
if DirName[Length(DirName)] in DosDelimSet then
AddBackSlash := DirName
else
AddBackSlash := DirName + '\';
end;
function Hash(const Key: NameString): word;
var
h: word;
j: integer;
Len: integer;
begin
Len := Length(Key);
h := ord(Key[1]);
for j := 2 to Len do
begin
h := ((h * 32) + Ord(Key[j])) mod MAXHASH;
end;
Hash := h;
end;
procedure Add(var SR: tSearchRec; DirP: pPath);
{Add a new search record/ path to the hash table}
var
p, q, r: pFileRec;
h: word;
TempData: pDataRec;
begin
h := Hash(SR.Name);
New(r);
{$IFDEF SAFE}
if r = nil then
exit;
{$ENDIF}
r^.Name := SR.Name;
r^.Time := SR.Time;
r^.Size := SR.Size;
r^.Path := DirP;
r^.Next := nil;
r^.Node := nil;
p := HashTable^[H];
if p = nil then
begin {Hash slot not used}
HashTable^[h] := r;
end
else
begin
q := p;
while (p <> nil) and (p^.Name < SR.Name) do
begin
q := p;
p := p^.Next;
end;
if (p <> nil) and (p^.Name = SR.Name) then
begin {Found duplicate file}
if p^.Node = nil then
begin {was not already in tree}
p^.Node := RBTreeInsert(p^, Head);
{save link so we don't have to search tree next time}
end;
New(TempData);
{$IFDEF SAFE}
if TempData = nil then
exit;
{$ENDIF}
TempData^.Time := Sr.Time;
TempData^.Size := Sr.Size;
TempData^.Path := DirP;
TempData^.Next := p^.Node^.DataP; {Add to linked list on tree node}
p^.Node^.DataP := TempData;
Dispose(r); {didn't need it after all}
end
else
begin {Not a duplicate}
if p = q then
begin {Add at start of linked list}
HashTable^[H] := r;
r^.Next := P;
end
else
begin {Insert into linked list}
q^.Next := r;
r^.Next := p;
end;
end;
end;
end;
procedure Find(const Path: PathStr);
{Recursive file/directory searcher}
var
Sr: tSearchRec;
DirP: pPath;
r: integer;
begin
New(DirP);
{$IFDEF SAFE}
if DirP = nil then
exit;
{$ENDIF}
DirP^.Path := Path;
DirP^.Next := PathHead;
PathHead := DirP;
r := FindFirst(AddBackSlash(Path) + '*.*', faAnyFile, Sr);
while r = 0 do
begin
{$IFDEF DRBOB} {only do non-hidden directories}
if ((Sr.Attr and faDirectory) <> 0) and ((Sr.Attr and faHidden) = 0) then
begin
{$ELSE} {do them all}
if (Sr.Attr and Directory) <> 0 then
begin
{$ENDIF}
if Sr.Name[1] <> '.' then
Find(AddBackSlash(Path) + Sr.Name);
r := 0;
end
else
begin
{$IFDEF DRBOB}
if (Sr.Attr and faHidden) = 0 then {Only do non-hidden files}
{$ENDIF}
Add(Sr, DirP);
end;
r := FindNext(Sr);
end;
end;
function HeapFunc(Size: Word): Integer; far;
begin
HeapFunc := 1;
end;
procedure Init;
begin
OldName := '';
PathHead := nil;
{$IFDEF SAFE}
HeapError := @HeapFunc;
{$ENDIF}
New(HashTable);
{$IFDEF SAFE}
if HashTable = nil then
halt;
{$ENDIF}
FillChar(HashTable^, sizeof(HashTable^), 0);
RBTreeInitialize;
end;
procedure Process;
begin
Find(ParamStr(1) + '\');
Traverse(Head);
end;
procedure Done;
var
i: integer;
q, tempq: pFileRec;
begin
{$IFDEF CLEANUP}
Dispose(Z);
for i := 0 to MAXHASH - 1 do
begin
if HashTable^[i] <> nil then
begin
q := HashTable^[i];
while q <> nil do
begin
tempq := q^.next;
Dispose(q);
q := tempq;
end;
end;
end;
Dispose(HashTable);
TempPathHead := PathHead;
while PathHead <> nil do
begin
TempPathHead := PathHead^.Next;
FreeMem(PathHead, Length(PathHead^.Path) + 5);
PathHead := TempPathHead;
end;
{$ENDIF}
end;
begin
Init;
Process;
Done;
end.
Feliratkozás:
Megjegyzések küldése (Atom)
Nincsenek megjegyzések:
Megjegyzés küldése