2008. november 11., kedd

Boyer-Moore string searching


Problem/Question/Abstract:

Boyer-Moore string searching

Answer:

Solve 1:

unit BMSearch;

interface

type
{$IFDEF WINDOWS}
  size_t = Word;
{$ELSE}
  size_t = LongInt;
{$ENDIF}

type
  TTranslationTable = array[char] of char; { translation table }
  TSearchBM = class(TObject)
  private
    FTranslate: TTranslationTable; { translation table }
    FJumpTable: array[char] of Byte; { Jumping table }
    FShift_1: integer;
    FPattern: pchar;
    FPatternLen: size_t;
  public
    procedure Prepare(Pattern: pchar; PatternLen: size_t; IgnoreCase: Boolean);
    procedure PrepareStr(const Pattern: string; IgnoreCase: Boolean);
    function Search(Text: pchar; TextLen: size_t): pchar;
    function Pos(const S: string): integer;
  end;

implementation

uses
  SysUtils;

{Ignore Case Table Translation}

procedure CreateTranslationTable(var T: TTranslationTable; IgnoreCase: Boolean);
var
  c: char;
begin
  for c := #0 to #255 do
    T[c] := c;
  if not IgnoreCase then
    exit;
  for c := 'a' to 'z' do
    T[c] := UpCase(c);

  { Mapping all accented characters to their uppercase equivalent }

  T['�'] := 'A';
  T['�'] := 'A';
  T['�'] := 'A';
  T['�'] := 'A';

  T['�'] := 'A';
  T['�'] := 'A';
  T['�'] := 'A';
  T['�'] := 'A';

  T['�'] := 'E';
  T['�'] := 'E';
  T['�'] := 'E';
  T['�'] := 'E';

  T['�'] := 'E';
  T['�'] := 'E';
  T['�'] := 'E';
  T['�'] := 'E';

  T['�'] := 'I';
  T['�'] := 'I';
  T['�'] := 'I';
  T['�'] := 'I';

  T['�'] := 'I';
  T['�'] := 'I';
  T['�'] := 'I';
  T['�'] := 'I';

  T['�'] := 'O';
  T['�'] := 'O';
  T['�'] := 'O';
  T['�'] := 'O';

  T['�'] := 'O';
  T['�'] := 'O';
  T['�'] := 'O';
  T['�'] := 'O';

  T['�'] := 'U';
  T['�'] := 'U';
  T['�'] := 'U';
  T['�'] := 'U';

  T['�'] := 'U';
  T['�'] := 'U';
  T['�'] := 'U';
  T['�'] := 'U';

  T['�'] := '�';
end;

{Preparation of the jumping table}

procedure TSearchBM.Prepare(Pattern: pchar; PatternLen: size_t; IgnoreCase: Boolean);
var
  i: integer;
  c, lastc: char;
begin
  FPattern := Pattern;
  FPatternLen := PatternLen;
  if FPatternLen < 1 then
    FPatternLen := strlen(FPattern);
  {This algorythm is based on a character set of 256}
  if FPatternLen > 256 then
    exit;
  {1. Preparing translating table}
  CreateTranslationTable(FTranslate, IgnoreCase);
  {2. Preparing jumping table}
  for c := #0 to #255 do
    FJumpTable[c] := FPatternLen;
  for i := FPatternLen - 1 downto 0 do
  begin
    c := FTranslate[FPattern[i]];
    if FJumpTable[c] >= FPatternLen - 1 then
      FJumpTable[c] := FPatternLen - 1 - i;
  end;
  FShift_1 := FPatternLen - 1;
  lastc := FTranslate[Pattern[FPatternLen - 1]];
  for i := FPatternLen - 2 downto 0 do
    if FTranslate[FPattern[i]] = lastc then
    begin
      FShift_1 := FPatternLen - 1 - i;
      break;
    end;
  if FShift_1 = 0 then
    FShift_1 := 1;
end;

procedure TSearchBM.PrepareStr(const Pattern: string; IgnoreCase: Boolean);
var
  str: pchar;
begin
  if Pattern <> '' then
  begin
{$IFDEF Windows}
    str := @Pattern[1];
{$ELSE}
    str := pchar(Pattern);
{$ENDIF}
    Prepare(str, Length(Pattern), IgnoreCase);
  end;
end;

{Searching Last char & scanning right to left}

function TSearchBM.Search(Text: pchar; TextLen: size_t): pchar;
var
  shift, m1, j: integer;
  jumps: size_t;
begin
  result := nil;
  if FPatternLen > 256 then
    exit;
  if TextLen < 1 then
    TextLen := strlen(Text);
  m1 := FPatternLen - 1;
  shift := 0;
  jumps := 0;
  {Searching the last character}
  while jumps <= TextLen do
  begin
    Inc(Text, shift);
    shift := FJumpTable[FTranslate[Text^]];
    while shift <> 0 do
    begin
      Inc(jumps, shift);
      if jumps > TextLen then
        exit;
      Inc(Text, shift);
      shift := FJumpTable[FTranslate[Text^]];
    end;
    { Compare right to left FPatternLen - 1 characters }
    if jumps >= m1 then
    begin
      j := 0;
      while FTranslate[FPattern[m1 - j]] = FTranslate[(Text - j)^] do
      begin
        Inc(j);
        if j = FPatternLen then
        begin
          result := Text - m1;
          exit;
        end;
      end;
    end;
    shift := FShift_1;
    Inc(jumps, shift);
  end;
end;

function TSearchBM.Pos(const S: string): integer;
var
  str, p: pchar;
begin
  result := 0;
  if S <> '' then
  begin
{$IFDEF Windows}
    str := @S[1];
{$ELSE}
    str := pchar(S);
{$ENDIF}
    p := Search(str, Length(S));
    if p <> nil then
      result := 1 + p - str;
  end;
end;

end.


Solve 2:

Here's a demo program of the Boyer-Moore search algorithm. The basic idea is to first create a Boyer-Moore index table for the string you want to search for, and then call the BMsearch routine. Remember to turn-off Range Checking {$R-} in your finished program, otherwise the BMSearch will take 3-4 times longer than it should.


{Public-domain demo of Boyer-Moore search algorithm.
Guy McLoughlin - May 1, 1993.}

program DemoBMSearch;

{Boyer-Moore index table data definition}
type
  BMTable = array[0..127] of byte;

  {Create a Boyer-Moore index table to search with.}

procedure Create_BMTable(Pattern: string; var BMT: BMTable);
var
  Index: byte;
begin
  fillchar(BMT, sizeof(BMT), length(Pattern));
  for Index := 1 to length(Pattern) do
    BMT[ord(Pattern[Index])] := (length(Pattern) - Index)
end;

{Boyer-Moore Search function. Returns 0 if string is not found. Returns 65,535 if
BufferSize is too large, ie: greater than 65,520 bytes.}

function BMsearch(var Buffer; BuffSize: word; var BMT: BMTable; Pattern: string): word;
var
  Buffer2: array[1..65520] of char absolute Buffer;
  Index1, Index2, PatSize: word;
begin
  if (BuffSize > 65520) then
  begin
    BMsearch := $FFFF;
    exit
  end;
  PatSize := length(Pattern);
  Index1 := PatSize;
  Index2 := PatSize;
  repeat
    if (Buffer2[Index1] = Pattern[Index2]) then
    begin
      dec(Index1);
      dec(Index2)
    end
    else
    begin
      if (succ(PatSize - Index2) > (BMT[ord(Buffer2[Index1])])) then
        inc(Index1, succ(PatSize - Index2))
      else
        inc(Index1, BMT[ord(Buffer2[Index1])]);
      Index2 := PatSize
    end;
  until
    (Index2 < 1) or (Index1 > BuffSize);
  if (Index1 > BuffSize) then
    BMsearch := 0
  else
    BMsearch := succ(Index1)
end;

type
  arby_64K = array[1..65520] of byte;

var
  Index: word;
  st_Temp: string[10];
  Buffer: ^arby_64K;
  BMT: BMTable;

begin
  new(Buffer);
  fillchar(Buffer^, sizeof(Buffer^), 0);
  st_Temp := 'Gumby';
  move(st_Temp[1], Buffer^[65516], length(st_Temp));
  Create_BMTable(st_Temp, BMT);
  Index := BMSearch(Buffer^, sizeof(Buffer^), BMT, st_Temp);
  writeln(st_Temp, ' found at offset ', Index)
end.

Nincsenek megjegyzések:

Megjegyzés küldése