2010. október 12., kedd

Implement fuzzy search


Problem/Question/Abstract:

How to implement fuzzy search

Answer:

Solve 1:

This DLL calculates the Levenshtein Distance between two strings. Please note that ShareMem must be the first unit in the Uses clause of the Interface section of your unit, if your DLL exports procedures or functions, which pass string parameters or function results. ShareMem is the interface to delphimm.dll, which you have to distribute together with your own DLL. To avoid using delphimm.dll, pass string parameters by using PChar or ShortString parameters.

library Levensh;

uses
  ShareMem, SysUtils;

var
  FiR0: integer;
  FiP0: integer;
  FiQ0: integer;

function Min(X, Y, Z: Integer): Integer;
begin
  if (X < Y) then
    Result := X
  else
    Result := Y;
  if (Result > Z) then
    Result := Z;
end;

procedure LevenshteinPQR(p, q, r: integer);
begin
  FiP0 := p;
  FiQ0 := q;
  FiR0 := r;
end;

function LevenshteinDistance(const sString, sPattern: string): Integer;
const
  MAX_SIZE = 50;
var
  aiDistance: array[0..MAX_SIZE, 0..MAX_SIZE] of Integer;
  i, j, iStringLength, iPatternLength, iMaxI, iMaxJ: Integer;
  chChar: Char;
  iP, iQ, iR, iPP: Integer;
begin
  iStringLength := length(sString);
  if (iStringLength > MAX_SIZE) then
    iMaxI := MAX_SIZE
  else
    iMaxI := iStringLength;
  iPatternLength := length(sPattern);
  if (iPatternLength > MAX_SIZE) then
    iMaxJ := MAX_SIZE
  else
    iMaxJ := iPatternLength;
  aiDistance[0, 0] := 0;
  for i := 1 to iMaxI do
    aiDistance[i, 0] := aiDistance[i - 1, 0] + FiR0;
  for j := 1 to iMaxJ do
  begin
    chChar := sPattern[j];
    if ((chChar = '*') or (chChar = '?')) then
      iP := 0
    else
      iP := FiP0;
    if (chChar = '*') then
      iQ := 0
    else
      iQ := FiQ0;
    if (chChar = '*') then
      iR := 0
    else
      iR := FiR0;
    aiDistance[0, j] := aiDistance[0, j - 1] + iQ;
    for i := 1 to iMaxI do
    begin
      if (sString[i] = sPattern[j]) then
        iPP := 0
      else
        iPP := iP;
      {aiDistance[i, j] := Minimum of 3 values}
      aiDistance[i, j] := Min(aiDistance[i - 1, j - 1] + iPP,
        aiDistance[i, j - 1] + iQ,
        aiDistance[i - 1, j] + iR);
    end;
  end;
  Result := aiDistance[iMaxI, iMaxJ];
end;

exports
  LevenshteinDistance Index 1,
  LevenshteinPQR Index 2;

begin
  FiR0 := 1;
  FiP0 := 1;
  FiQ0 := 1;
end.


Solve 2:

This is an old Pascal code snippet, which is based on a C project published in the C't magazine somewhen back in the 1990's. Can't remember where I found it on the WWW. Please note that the code below accesses a simple *.txt file to search in.

program FuzzySearch;
{Translation from C to Pascal by Karsten Paulini and Simon Reinhardt}
const
  MaxParLen = 255;
var
  InFile: Text;
  Filename: string;
  InputStr: string;
  SearchStr: string;
  Treshold: Integer;

function PrepareTheString(OriginStr: string; var ConvStr: string): Integer;
var
  i: Integer;
begin
  ConvStr := OriginStr;
  for i := 1 to Length(OriginStr) do
  begin
    ConvStr[i] := UpCase(ConvStr[i]);
    if ConvStr[i] < '0' then
      ConvStr[i] := ' '
    else
      case ConvStr[i] of
        Chr(196): ConvStr[i] := Chr(228);
        Chr(214): ConvStr[i] := Chr(246);
        Chr(220): ConvStr[i] := Chr(252);
        Chr(142): ConvStr[i] := Chr(132);
        Chr(153): ConvStr[i] := Chr(148);
        Chr(154): ConvStr[i] := Chr(129);
        ':': ConvStr[i] := ' ';
        ';': ConvStr[i] := ' ';
        '<': ConvStr[i] := ' ';
        '>': ConvStr[i] := ' ';
        '=': ConvStr[i] := ' ';
        '?': ConvStr[i] := ' ';
        '[': ConvStr[i] := ' ';
        ']': ConvStr[i] := ' ';
      end;
  end;
  PrepareTheString := i;
end;

function NGramMatch(TextPara, SearchStr: string; SearchStrLen, NGramLen: Integer;
  var MaxMatch: Integer): Integer;
var
  NGram: string[8];
  NGramCount: Integer;
  i, Count: Integer;
begin
  NGramCount := SearchStrLen - NGramLen + 1;
  Count := 0;
  MaxMatch := 0;
  for i := 1 to NGramCount do
  begin
    NGram := Copy(SearchStr, i, NGramLen);
    if (NGram[NGramLen - 1] = ' ') and (NGram[1] < > ' ') then
      Inc(i, NGramLen - 3) {will be increased in the loop}
    else
    begin
      Inc(MaxMatch, NGramLen);
      if Pos(NGram, TextPara) > 0 then
        Inc(Count);
    end;
  end;
  NGramMatch := Count * NGramLen;
end;

procedure FuzzyMatching(SearchStr: string; Treshold: Integer; var InFile: Text);
var
  TextPara: string;
  TextBuffer: string;
  TextLen: Integer;
  SearchStrLen: Integer;
  NGram1Len: Integer;
  NGram2Len: Integer;
  MatchCount1: Integer;
  MatchCount2: Integer;
  MaxMatch1: Integer;
  MaxMatch2: Integer;
  Similarity: Real;
  BestSim: Real;
begin
  BestSim := 0.0;
  SearchStrLen := PrepareTheString(SearchStr, SearchStr);
  NGram1Len := 3;
  if SearchStrLen < 7 then
    NGram2Len := 2
  else
    NGram2Len := 5;
  while not Eof(InFile) do
  begin
    Readln(InFile, TextBuffer);
    TextLen := PrepareTheString(TextBuffer, TextPara) + 1;
    TextPara := Concat(' ', TextPara);
    if TextLen < MaxParLen - 2 then
    begin
      MatchCount1 := NGramMatch(TextPara, SearchStr, SearchStrLen, NGram1Len, MaxMatch1);
      MatchCount2 := NGramMatch(TextPara, SearchStr, SearchStrLen, NGram2Len, MaxMatch2);
      Similarity := 100.0 * (MatchCount1 + MatchCount2) / (MaxMatch1 + MaxMatch2);
      if Similarity > BestSim then
        BestSim := Similarity;
      if Similarity >= Treshold then
      begin
        Writeln;
        Writeln('[', Similarity, '] ', TextBuffer);
      end;
    end;
  else
    Writeln('Paragraph too long');
end;
if BestSim < Treshold then
  Writeln('No match; Best Match was ', BestSim);
end;

begin
  Writeln;
  Writeln('+------------------------------------------+');
  Writeln('| Fuzzy Search in Information Retrieval |');
  Writeln('|         (C) 1997 Reinhard Rapp           |');
  Writeln('+------------------------------------------+');
  Writeln;
  Write('Name of file to search in: ');
  Readln(Filename);
  Write('Search string: ');
  Readln(InputStr);
  SearchStr := Concat(' ', InputStr, ' ');
  Write('Minimum hit quality in % : ');
  Readln(Treshold);
  if (Treshold > 0) and (Treshold <= 100) and (SearchStr < > '') and (Filename < > '') then
  begin
    Assign(InFile, Filename);
    Reset(InFile);
    FuzzyMatching(SearchStr, Treshold, InFile);
    Close(InFile);
  end;
  Writeln;
  Writeln('Bye!');
end.


Solve 3:

unit FuzzyMatch;

{This unit provides a basic 'fuzzy match' index on how alike two strings are
     The result is of type 'single': near 0 - poor match
                                     near 1 - close match
     The intention is that HowAlike(s1,s2)=HowAlike(s2,s1)
     The Function is not case sensitive}

interface

uses sysutils;

function HowAlike(s1, s2: string): single;

implementation

function instr(start: integer; ToSearch, ToFind: string): integer;
begin
  //This is a quick implementation of the VB InStr, since Pos just doesn't do what is needed!!
  //NB - case sensitive!!
  if start > 1 then
    Delete(ToSearch, 1, start - 1);
  result := pos(ToFind, ToSearch);
  if (result > 0) and (start > 1) then
    inc(result, start);
end;

function HowAlike(s1, s2: string): single;
var
  l1, l2, pass, position, size, foundpos, maxscore: integer;
  score, scored, string1pos, string2pos, bestmatchpos: single;
  swapstring, searchblock: string;
begin
  s1 := Uppercase(trim(s1));
  s2 := Uppercase(trim(s2));

  score := 0;
  maxscore := 0;
  scored := 0;

  //deal with zero length strings...
  if (s1 = '') and (s2 = '') then
  begin
    result := 1;
    exit;
  end
  else if (s1 = '') or (s2 = '') then
  begin
    result := 0;
    exit;
  end;

  //why perform any mathematics is the result is clear?
  if s1 = s2 then
  begin
    result := 1;
    exit;
  end;

  //make two passes,
  //     with s1 and s2 each way round to ensure
  //     consistent results
  for pass := 1 to 2 do
  begin
    l1 := length(s1);
    l2 := length(s2);
    for size := l1 downto 1 do
    begin
      for position := 1 to (l1 - size + 1) do
      begin
        //try to find implied block in the other string
        //Big blocks score much better than small blocks
        searchblock := copy(s1, position, size);
        foundpos := pos(searchblock, s2);

        if size = l1 then
          string1pos := 0.5
        else
          string1pos := (position - 1) / (l1 - size);

        if foundpos > 0 then
        begin
          //the string is in somewhere in there
          //    - find the 'closest' one.
          bestmatchpos := -100; //won't find anything that far away!

          repeat
            if size = l2 then
              string2pos := 0.5
            else
              string2pos := (foundpos - 1) / (l2 - size);

            //If this closer than the previous best?
            if abs(string2pos - string1pos) < abs(bestmatchpos - string1pos) then
              bestmatchpos := string2pos;

            foundpos := instr(foundpos + 1, s2, searchblock);
          until foundpos = 0; //loop while foundpos>0..

          //The closest position is now known: Score it!
          //Score as follows: (1-distance of best match)
          score := score + (1 - abs(string1pos - bestmatchpos));
        end;

        //Keep track if the maximum possible score
        //BE CAREFUL IF CHANGING THIS FUNCTION!!!

        //maxscore:=maxscore+1;
        inc(maxscore);
      end; //for position..
    end; //for size..

    if pass = 1 then
    begin
      //swap the strings around
      swapstring := s1;
      s1 := s2;
      s2 := swapstring;
    end;

    //Each pass is weighted equally

    scored := scored + (0.5 * (score / maxscore));
    score := 0;
    maxscore := 0;
  end; //for pass..

  //HowAlike=score/maxscore
  result := scored;
end;


Solve 4:

A Delphi implementation of the Levenshtein Distance Algorithm

unit Levenshtein;

{Objeto que calcula la distancia de Levenshtein entre 2 cadenas.
Alvaro Jeria Madariaga. 04/10/2002
barbaro@hotpop.com}

interface

uses
  sysutils, Math;

type
  Tdistance = class(TObject)
  private
    function minimum(a, b, c: Integer): Integer;
  public
    function LD(s, t: string): Integer;
  end;

implementation

function Tdistance.minimum(a, b, c: Integer): Integer;
var
  mi: Integer;
begin
  mi := a;
  if (b < mi) then
    mi := b;
  if (c < mi) then
    mi := c;
  Result := mi;
end;

function Tdistance.LD(s, t: string): Integer;
var
  d: array of array of Integer;
  n, m, i, j, costo: Integer;
  s_i, t_j: char;
begin
  n := Length(s);
  m := Length(t);
  if (n = 0) then
  begin
    Result := m;
    Exit;
  end;
  if m = 0 then
  begin
    Result := n;
    Exit;
  end;
  setlength(d, n + 1, m + 1);
  for i := 0 to n do
    d[i, 0] := i;
  for j := 0 to m do
    d[0, j] := j;
  for i := 1 to n do
  begin
    s_i := s[i];
    for j := 1 to m do
    begin
      t_j := t[j];
      if s_i = t_j then
        costo := 0
      else
        costo := 1;
      d[i, j] := Minimum(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + costo);
    end;
  end;
  Result := d[n, m];
end;

end.

Nincsenek megjegyzések:

Megjegyzés küldése