2004. november 23., kedd

How to find a string in a file


Problem/Question/Abstract:

I have an array of char called FBuffer1. Let's say: StrCopy(FBuffer1,'Test'). I also have a file, let's say File1.exe. I would like to find a very quick way to be able to localize the string "test" in the file1.exe

Answer:

Solve 1:

One way is to remove the file access problems. Load the whole file into a TMemoryStream, then search the stream. Example:

{ ... }
var
  tmem: TMemoryStream;
  buf: array[1..4] of Char;
begin
  zeromemory(@buf, 4);
  tmem := TMemoryStream.Create;
  tmem.loadfromfile('test1.exe');
  tmem.position := 0;
  while tmem.position <> tmem.size do
  begin
    buf[1] := buf[2];
    buf[2] := buf[3];
    buf[3] := buf[4];
    tmem.read(buf[4], 1);
    if compare(buf, 'hello') then
      Memo1.Lines.Add('match found at position ' + Inttostr(tmem.position));
  end;
  tmem.destroy;
end;


Solve 2:

I was working on just that some time ago. Here is my project file with some alternative functions and a time test. Just paste the following listing into a text file, rename the file to Project1.dpr, open the file in Delphi and run it.

{$APPTYPE CONSOLE}

program Project1;

uses
  Windows, SysUtils;

function ScanString(SourceStart, SourceEnd, Search: PChar; CaseSensitive: Boolean): PChar;
var
  SourcePtr: PChar;
  SourceChr: Char;
  SearchPos: DWord;
  SearchPtr: PChar;
begin
  Result := nil;
  if SourceStart > SourceEnd then
    Exit;
  if not CaseSensitive then
    CharUpperBuff(Search, Length(Search));
  SourcePtr := SourceStart;
  SearchPos := 0;
  SearchPtr := Search;
  while SourcePtr <= SourceEnd do
  begin
    SourceChr := SourcePtr^;
    if not CaseSensitive then
      CharUpperBuff(@SourceChr, 1);
    if SourceChr = SearchPtr^ then
    begin
      Inc(SearchPtr);
      if SearchPtr^ = #0 then
      begin
        Result := SourcePtr - SearchPos;
        Break;
      end;
      Inc(SearchPos);
    end
    else if SearchPos > 0 then
    begin
      SearchPos := 0;
      SearchPtr := Search;
    end;
    Inc(SourcePtr);
  end;
end;

function ScanStringNew(SourceStart, SourceEnd, SearchStr: PChar;
  CaseSensitive: Boolean): PChar;
var
  SourcePtr: PChar;
  ScanLen: DWord;
  ScanPos: DWord;
  ScanStr: PChar;
  ScanPtr: PChar;
  ScanUppStr: PChar;
  ScanUppPtr: PChar;
  ScanLowStr: PChar;
  ScanLowPtr: PChar;
begin
  Result := nil;
  if SourceStart > SourceEnd then
    Exit;
  ScanLen := Length(SearchStr);
  if not CaseSensitive then
  begin
    GetMem(ScanUppStr, ScanLen);
    CopyMemory(ScanUppStr, SearchStr, ScanLen);
    CharUpperBuff(ScanUppStr, ScanLen);
    GetMem(ScanLowStr, ScanLen);
    CopyMemory(ScanLowStr, SearchStr, ScanLen);
    CharLowerBuff(ScanLowStr, ScanLen);
  end
  else
  begin
    ScanUppStr := SearchStr;
    ScanLowStr := SearchStr;
  end;
  ScanPos := 0;
  ScanUppPtr := ScanUppStr;
  ScanLowPtr := ScanLowStr;
  SourcePtr := SourceStart;
  ScanPtr := ScanStr;
  while SourcePtr <= SourceEnd do
  begin
    if (SourcePtr^ = ScanUppPtr^) or (SourcePtr^ = ScanLowPtr^) then
    begin
      Inc(ScanPos);
      if ScanPos = ScanLen then
      begin
        Result := SourcePtr - ScanPos + 1;
        Break;
      end;
      Inc(ScanUppPtr);
      Inc(ScanLowPtr);
    end
    else if ScanPos > 0 then
    begin
      ScanPos := 0;
      ScanUppPtr := ScanUppStr;
      ScanLowPtr := ScanLowStr;
    end;
    Inc(SourcePtr);
  end;
  if not CaseSensitive then
  begin
    FreeMem(ScanUppStr, ScanLen);
    FreeMem(ScanLowStr, ScanLen);
  end;
end;


function ScanStringAsm(SourceStart, SourceEnd, SearchStr: PChar;
  CaseSensitive: Boolean): PChar;
var
  ScanLen: DWord;
  ScanPos: DWord;
  ScanStr: PChar;
  ScanPtr: PChar;
  ScanUppStr: PChar;
  ScanUppPtr: PChar;
  ScanLowStr: PChar;
  ScanLowPtr: PChar;
begin
  if SourceStart > SourceEnd then
    Exit;
  ScanLen := Length(SearchStr);
  if not CaseSensitive then
  begin
    GetMem(ScanUppStr, ScanLen);
    CopyMemory(ScanUppStr, SearchStr, ScanLen);
    CharUpperBuff(ScanUppStr, ScanLen);
    GetMem(ScanLowStr, ScanLen);
    CopyMemory(ScanLowStr, SearchStr, ScanLen);
    CharLowerBuff(ScanLowStr, ScanLen);
  end
  else
  begin
    ScanUppStr := SearchStr;
    ScanLowStr := SearchStr;
  end;
  GetMem(ScanStr, ScanLen * 2 + 2);
  ScanPos := ScanLen;
  ScanPtr := ScanStr;
  ScanUppPtr := ScanUppStr;
  ScanLowPtr := ScanLowStr;
  while ScanPos > 0 do
  begin
    ScanPtr^ := ScanUppPtr^;
    Inc(ScanPtr);
    Inc(ScanUppPtr);
    ScanPtr^ := ScanLowPtr^;
    Inc(ScanPtr);
    Inc(ScanLowPtr);
    Dec(ScanPos);
  end;
  ScanPtr^ := #0;

  asm
      {Register use:
      EDI - pointer to source char
      ESI - pointer to par of scan chars
      AL - current source char
      EBX - match length counter
      ECX - source length counter
      DX - current par of scan chars}

  end;
  


if not CaseSensitive then
begin
FreeMem(ScanUppStr, ScanLen);
  

FreeMem(ScanLowStr, ScanLen);
end;
  

FreeMem(ScanStr, ScanLen * 2 + 2);
  
end;
  

end;

if not CaseSensitive then
begin
  FreeMem(ScanUppStr, ScanLen);

  FreeMem(ScanLowStr, ScanLen);

end;

FreeMem(ScanStr, ScanLen * 2 + 2);

end;


{Preserve registers:}
PUSH EBX {Preserve registers EBX, EDI, ESI:}
PUSH EDI
PUSH ESI
{Initialize registers:}
MOV EDI, SourceStart {Move addr SourceStart to EDI}
MOV ECX, SourceEnd {Calculate source length in ECX:}
SUB ECX, EDI
INC ECX
MOV ESI, ScanStr {Move addr ScanStr to ESI}
MOV DX, WORD[ESI] {Move first par of scan chars to DX}
xor EBX, EBX {Set EBX (match counter) to 0}
@01: {Main test loop:}
MOV AL, BYTE[EDI] {Move current source char to AL}
INC EDI {Inc EDI to point to next source char}
CMP AL, DL {Compare AL with scan char in DL (uppcase)}
JE@10 {Jump to @10 if equal (match)}
CMP AL, DH {Compare AL with scan char in DH (lowcase)}
JE@10 {Jump to @10 if equal (match)}
TEST EBX, EBX {Test EBX (match counter)}
JZ@02 {Jump to @02 if zero (i.e. first scan char)}
SUB ESI, EBX {Move ESI back to start of scan string:}
SUB ESI, EBX
MOV DX, WORD[ESI] {Move first par of scan chars to DX}
xor EBX, EBX {Set EBX to 0}
@02: {Next loop:}
DEC ECX {Dec ECX (source length counter)}
JNZ@01 {Jump back to @01 if not zero}
MOV Result, 0 {Move nil to Result (match not found)}
JMP@99 {Jump to @99}
@10: {Char match found:}
INC EBX {Inc EBX (match length counter):}
ADD ESI, 2 {Move ESI to next par of scan chars:}
MOV DX, WORD[ESI] {Move this par of scan chars to DX}
CMP DL, 0 {Compare char in DL with #0 (end of string)}
JNE@02 {Jump to @02 if not equal (test next char)}
{Match found:}
SUB EDI, EBX {Move EDI back to first char in match}
MOV Result, EDI {Move addr of match to Result}
@99: {Restore registers:}
POP ESI
POP EDI
POP EBX
end;

if not CaseSensitive then
begin
  FreeMem(ScanUppStr, ScanLen);
  FreeMem(ScanLowStr, ScanLen);
end;
FreeMem(ScanStr, ScanLen * 2 + 2);
end;


procedure TimeTest2;
var
  Time1: DWord;
  Time2: DWord;
  Search: string;
  TestName: string;
  TestFile: file;
  TestSize: DWord;
  TestBuff: PChar;
  TestScan: PChar;
  TestPtr: PChar;
  TestPos: Integer;
  HitCount: Integer;
  n, i, j: Integer;
  c: Char;
  Show: Boolean;
begin
  n := 20;
  Show := false;
  Search := 'WINDOWS';

  {TestBuff := PChar(Search);
  TestScan := TestBuff;
  c := TestScan^;
  Time1 := GetTickCount;
  for i := 1 to 10000000 do
  begin
    if TestBuff^ = c then
    begin
    end;
  end;
  Time2 := GetTickCount;
  WriteLn('Tickcount : ', Time2 - Time1);
  Exit;}

  TestName := 'c:\windows\help\getstart.chm';
  AssignFile(TestFile, TestName);
  Reset(TestFile, 1);
  TestSize := FileSize(TestFile);
  GetMem(TestBuff, TestSize);
  BlockRead(TestFile, TestBuff^, TestSize);
  CloseFile(TestFile);

  WriteLn;
  WriteLn('Scaning for "', Search, '"  ', n, ' times');
  WriteLn('in file: ', TestName, '  size: ', TestSize, ' bytes');

  HitCount := 0;
  Time1 := GetTickCount;
  for i := 1 to n do
  begin
    TestScan := TestBuff;
    repeat
      if TestScan <> TestBuff then
        Inc(TestScan, Length(Search));
      TestScan := ScanString(TestScan, TestBuff + TestSize - 1, PChar(Search), false);
      if TestScan <> nil then
      begin
        Inc(HitCount);
        if Show then
        begin
          Write(HitCount, '  ');
          TestPtr := TestScan;
          for TestPos := 1 to Length(Search) do
          begin
            Write(TestPtr^);
            Inc(TestPtr);
          end;
          WriteLn;
          ReadLn;
        end;
      end;
    until TestScan = nil;
  end;
  Time2 := GetTickCount;
  WriteLn('  Tickcount ScanString   : ', Time2 - Time1: 5, 'ms', '  hitcount:', HitCount);
  HitCount := 0;
  Time1 := GetTickCount;
  for i := 1 to n do
  begin
    TestScan := TestBuff;
    repeat
      if TestScan <> TestBuff then
        Inc(TestScan, Length(Search));
      TestScan := ScanStringNew(TestScan, TestBuff + TestSize - 1, PChar(Search), false);
      if TestScan <> nil then
      begin
        Inc(HitCount);
        if Show then
        begin
          Write(HitCount, '  ');
          TestPtr := TestScan;
          for TestPos := 1 to Length(Search) do
          begin
            Write(TestPtr^);
            Inc(TestPtr);
          end;
          WriteLn;
          ReadLn;
        end;
      end;
    until TestScan = nil;
  end;
  Time2 := GetTickCount;
  WriteLn('  Tickcount ScanStringNew: ', Time2 - Time1: 5, 'ms', '  hitcount:', HitCount);
  HitCount := 0;
  Time1 := GetTickCount;
  for i := 1 to n do
  begin
    TestScan := TestBuff;
    repeat
      if TestScan <> TestBuff then
        Inc(TestScan, Length(Search));
      TestScan := ScanStringAsm(TestScan, TestBuff + TestSize - 1, PChar(Search), false);
      if TestScan <> nil then
      begin
        Inc(HitCount);
        if Show then
        begin
          Write(HitCount, '  ');
          TestPtr := TestScan;
          for TestPos := 1 to Length(Search) do
          begin
            Write(TestPtr^);
            Inc(TestPtr);
          end;
          WriteLn;
          ReadLn;
        end;
      end;
    until TestScan = nil;
  end;
  Time2 := GetTickCount;
  WriteLn('  Tickcount ScanStringAsm: ', Time2 - Time1: 5, 'ms', '  hitcount:', HitCount);
  FreeMem(TestBuff, TestSize);
end;

begin
  TimeTest2;
  WriteLn;
  WriteLn('** press enter to close **');
  ReadLn;
end.


Solve 3:

function ScanFile(const filename: string; const forString: string; caseSensitive: Boolean): LongInt;
{ returns position of string in file or -1, if not found }
const
  BufferSize = $8001; { 32K + 1 bytes }
var
  pBuf, pEnd, pScan, pPos: Pchar;
  filesize: LongInt;
  bytesRemaining: LongInt;
  bytesToRead: Integer;
  F: file;
  SearchFor: Pchar;
  oldMode: Word;
begin
  Result := -1; { assume failure }
  if (Length(forString) = 0) or (Length(filename) = 0) then
    Exit;
  SearchFor := nil;
  pBuf := nil;
  { open file as binary, 1 byte recordsize }
  AssignFile(F, filename);
  oldMode := FileMode;
  FileMode := 0; { read-only access }
  Reset(F, 1);
  FileMode := oldMode;
  try { allocate memory for buffer and pchar search string }
    SearchFor := StrAlloc(Length(forString) + 1);
    StrPCopy(SearchFor, forString);
    if not caseSensitive then { convert to upper case }
      AnsiUpper(SearchFor);
    GetMem(pBuf, BufferSize);
    filesize := System.Filesize(F);
    bytesRemaining := filesize;
    pPos := nil;
    while bytesRemaining > 0 do
    begin
      { calc how many bytes to read this round }
      if bytesRemaining >= BufferSize then
        bytesToRead := Pred(BufferSize)
      else
        bytesToRead := bytesRemaining;
      { read a buffer full and zero-terminate the buffer }
      BlockRead(F, pBuf^, bytesToRead, bytesToRead);
      pEnd := @pBuf[bytesToRead];
      pEnd^ := #0;
      { scan the buffer. Problem: buffer may contain #0 chars! So we treat it as
      a concatenation of zero-terminated strings. }
      pScan := pBuf;
      while pScan < pEnd do
      begin
        if not caseSensitive then { convert to upper case }
          AnsiUpper(pScan);
        pPos := StrPos(pScan, SearchFor); { search for substring }
        if pPos <> nil then
        begin { Found it! }
          Result := FileSize - bytesRemaining + LongInt(pPos) - LongInt(pBuf);
          Break;
        end;
        pScan := StrEnd(pScan);
        Inc(pScan);
      end;
      if pPos <> nil then
        Break;
      bytesRemaining := bytesRemaining - bytesToRead;
      if bytesRemaining > 0 then
      begin
        { no luck in this buffers load. We need to handle the case of the search
        string spanning two chunks of file now. We simply go back a bit in the file
        and read from there, thus inspecting some characters twice }
        Seek(F, FilePos(F) - Length(forString));
        bytesRemaining := bytesRemaining + Length(forString);
      end;
    end;
  finally
    CloseFile(F);
    if SearchFor <> nil then
      StrDispose(SearchFor);
    if pBuf <> nil then
      FreeMem(pBuf, BufferSize);
  end;
end;


Solve 4:

One option is to just read the entire file into a single string. The old-fashioned way is to use BlockRead. You could also use a file stream. Once you have it in a single string you can use any normal string operations, even if there are embedded null bytes or CR/LF's.

procedure TForm1.Button1Click(Sender: TObject);
var
  s: string;
  f: file;
  p: integer;
begin
  AssignFile(f, 'c:\winnt\system32\mspaint.exe');
  FileMode := 0;
  Reset(f, 1);
  SetLength(s, FileSize(f));
  BlockRead(f, s[1], FileSize(f));
  CloseFile(f);
  p := pos('This program cannot be run in DOS mode', s);
  Label1.Caption := 'Found at : ' + IntToStr(p);
end;

Nincsenek megjegyzések:

Megjegyzés küldése