2004. május 9., vasárnap

How to search for a pattern in a file


Problem/Question/Abstract:

I need to locate a pattern in a file (both text and binary) - just like the Pos function does with the strings. Preferably, it should deal with FileStream. Straightforward solution first seemed kind of expensive - that is to just plainly go through the stream comparing patterns on every step.

Answer:

Solve 1:

You can do it that way but it is much faster to load chunks of data into a sizeable buffer and do the search in the buffer. Here is an example:

function ScanFile(const filename: string; const forString: string; caseSensitive:
  Boolean): LongInt;
{ returns position of string in file or -1, if not found }
const
  BufferSize = $8001; { 32K + 1 bytes }
var
  pBuf, pEnd, pScan, pPos: Pchar;
  filesize: LongInt;
  bytesRemaining: LongInt;
  bytesToRead: Word;
  F: file;
  SearchFor: Pchar;
  oldMode: Word;
begin
  Result := -1; { assume failure }
  if (Length(forString) = 0) or (Length(filename) = 0) then
    Exit;
  SearchFor := nil;
  pBuf := nil;
  { open file as binary, 1 byte recordsize }
  AssignFile(F, filename);
  oldMode := FileMode;
  FileMode := 0; { read-only access }
  Reset(F, 1);
  FileMode := oldMode;
  try { allocate memory for buffer and pchar search string }
    SearchFor := StrAlloc(Length(forString) + 1);
    StrPCopy(SearchFor, forString);
    if not caseSensitive then { convert to upper case }
      AnsiUpper(SearchFor);
    GetMem(pBuf, BufferSize);
    filesize := System.Filesize(F);
    bytesRemaining := filesize;
    pPos := nil;
    while bytesRemaining > 0 do
    begin
      { calc how many bytes to read this round }
      if bytesRemaining >= BufferSize then
        bytesToRead := Pred(BufferSize)
      else
        bytesToRead := bytesRemaining;
      { read a buffer full and zero-terminate the buffer }
      BlockRead(F, pBuf^, bytesToRead, bytesToRead);
      pEnd := @pBuf[bytesToRead];
      pEnd^ := #0;
      { scan the buffer. Problem: buffer may contain #0 chars! So we
      treat it as a concatenation of zero-terminated strings. }
      pScan := pBuf;
      while pScan < pEnd do
      begin
        if not caseSensitive then { convert to upper case }
          AnsiUpper(pScan);
        pPos := StrPos(pScan, SearchFor); { search for substring }
        if pPos <> nil then
        begin { Found it! }
          Result := FileSize - bytesRemaining + LongInt(pPos) - LongInt(pBuf);
          Break;
        end;
        pScan := StrEnd(pScan);
        Inc(pScan);
      end;
      if pPos <> nil then
        Break;
      bytesRemaining := bytesRemaining - bytesToRead;
      if bytesRemaining > 0 then
      begin
        { no luck in this buffers load. We need to handle the case of the
                                search string spanning two chunks of file now. We simply go back a bit in
                                the file and read from there, thus inspecting some characters twice }
        Seek(F, FilePos(F) - Length(forString));
        bytesRemaining := bytesRemaining + Length(forString);
      end;
    end;
  finally
    CloseFile(F);
    if SearchFor <> nil then
      StrDispose(SearchFor);
    if pBuf <> nil then
      FreeMem(pBuf, BufferSize);
  end;
end;


Solve 2:

procedure TForm1.Button1Click(Sender: TObject);
var
  s: string;
  hFile: THandle;
  hFileMapObj: THandle;
  pSharedBuf: Pointer;
  Time0: Integer;
  p: PChar;
begin
  if not OpenDialog1.Execute then
    Exit;
  s := InputBox('Find', 'Match', '');
  Time0 := GetTickCount;
  hfile := 0;
  hFileMapObj := 0;
  pSharedBuf := nil;
  try
    hFile := FileOpen(OpenDialog1.FileName, fmOpenRead);
    Win32Check(hFileMapObj <> INVALID_HANDLE_VALUE);
    hFileMapObj := CreateFileMapping(hFile, nil, PAGE_READONLY, 0, 0, nil);
    Win32Check(hFileMapObj <> 0);
    pSharedBuf := MapViewOfFile(hFileMapObj, FILE_MAP_READ, 0, 0, 0);
    Win32Check(pSharedBuf <> nil);
    P := StrPos(PChar(pSharedBuf), PChar(s));
  finally
    if pSharedBuf <> nil then
      UnMapViewOfFile(pSharedBuf);
    if hFileMapObj <> 0 then
      CloseHandle(hFileMapObj);
    if hFile <> 0 then
      CloseHandle(hFile);
  end;
  if P = nil then
    Caption := Format('Not found, ticks=%d', [GetTickCount - Time0])
  else
    Caption := Format('Found it at pos %d, ticks=%d', [Integer(P - PChar(pSharedBuf)),
      GetTickCount - Time0]);
end;

Nincsenek megjegyzések:

Megjegyzés küldése