## 2010. október 12., kedd

### Implement fuzzy search

Problem/Question/Abstract:

How to implement fuzzy search

Solve 1:

This DLL calculates the Levenshtein Distance between two strings. Please note that ShareMem must be the first unit in the Uses clause of the Interface section of your unit, if your DLL exports procedures or functions, which pass string parameters or function results. ShareMem is the interface to delphimm.dll, which you have to distribute together with your own DLL. To avoid using delphimm.dll, pass string parameters by using PChar or ShortString parameters.

library Levensh;

uses
ShareMem, SysUtils;

var
FiR0: integer;
FiP0: integer;
FiQ0: integer;

function Min(X, Y, Z: Integer): Integer;
begin
if (X < Y) then
Result := X
else
Result := Y;
if (Result > Z) then
Result := Z;
end;

procedure LevenshteinPQR(p, q, r: integer);
begin
FiP0 := p;
FiQ0 := q;
FiR0 := r;
end;

function LevenshteinDistance(const sString, sPattern: string): Integer;
const
MAX_SIZE = 50;
var
aiDistance: array[0..MAX_SIZE, 0..MAX_SIZE] of Integer;
i, j, iStringLength, iPatternLength, iMaxI, iMaxJ: Integer;
chChar: Char;
iP, iQ, iR, iPP: Integer;
begin
iStringLength := length(sString);
if (iStringLength > MAX_SIZE) then
iMaxI := MAX_SIZE
else
iMaxI := iStringLength;
iPatternLength := length(sPattern);
if (iPatternLength > MAX_SIZE) then
iMaxJ := MAX_SIZE
else
iMaxJ := iPatternLength;
aiDistance[0, 0] := 0;
for i := 1 to iMaxI do
aiDistance[i, 0] := aiDistance[i - 1, 0] + FiR0;
for j := 1 to iMaxJ do
begin
chChar := sPattern[j];
if ((chChar = '*') or (chChar = '?')) then
iP := 0
else
iP := FiP0;
if (chChar = '*') then
iQ := 0
else
iQ := FiQ0;
if (chChar = '*') then
iR := 0
else
iR := FiR0;
aiDistance[0, j] := aiDistance[0, j - 1] + iQ;
for i := 1 to iMaxI do
begin
if (sString[i] = sPattern[j]) then
iPP := 0
else
iPP := iP;
{aiDistance[i, j] := Minimum of 3 values}
aiDistance[i, j] := Min(aiDistance[i - 1, j - 1] + iPP,
aiDistance[i, j - 1] + iQ,
aiDistance[i - 1, j] + iR);
end;
end;
Result := aiDistance[iMaxI, iMaxJ];
end;

exports
LevenshteinDistance Index 1,
LevenshteinPQR Index 2;

begin
FiR0 := 1;
FiP0 := 1;
FiQ0 := 1;
end.

Solve 2:

This is an old Pascal code snippet, which is based on a C project published in the C't magazine somewhen back in the 1990's. Can't remember where I found it on the WWW. Please note that the code below accesses a simple *.txt file to search in.

program FuzzySearch;
{Translation from C to Pascal by Karsten Paulini and Simon Reinhardt}
const
MaxParLen = 255;
var
InFile: Text;
Filename: string;
InputStr: string;
SearchStr: string;
Treshold: Integer;

function PrepareTheString(OriginStr: string; var ConvStr: string): Integer;
var
i: Integer;
begin
ConvStr := OriginStr;
for i := 1 to Length(OriginStr) do
begin
ConvStr[i] := UpCase(ConvStr[i]);
if ConvStr[i] < '0' then
ConvStr[i] := ' '
else
case ConvStr[i] of
Chr(196): ConvStr[i] := Chr(228);
Chr(214): ConvStr[i] := Chr(246);
Chr(220): ConvStr[i] := Chr(252);
Chr(142): ConvStr[i] := Chr(132);
Chr(153): ConvStr[i] := Chr(148);
Chr(154): ConvStr[i] := Chr(129);
':': ConvStr[i] := ' ';
';': ConvStr[i] := ' ';
'<': ConvStr[i] := ' ';
'>': ConvStr[i] := ' ';
'=': ConvStr[i] := ' ';
'?': ConvStr[i] := ' ';
'[': ConvStr[i] := ' ';
']': ConvStr[i] := ' ';
end;
end;
PrepareTheString := i;
end;

function NGramMatch(TextPara, SearchStr: string; SearchStrLen, NGramLen: Integer;
var MaxMatch: Integer): Integer;
var
NGram: string[8];
NGramCount: Integer;
i, Count: Integer;
begin
NGramCount := SearchStrLen - NGramLen + 1;
Count := 0;
MaxMatch := 0;
for i := 1 to NGramCount do
begin
NGram := Copy(SearchStr, i, NGramLen);
if (NGram[NGramLen - 1] = ' ') and (NGram[1] < > ' ') then
Inc(i, NGramLen - 3) {will be increased in the loop}
else
begin
Inc(MaxMatch, NGramLen);
if Pos(NGram, TextPara) > 0 then
Inc(Count);
end;
end;
NGramMatch := Count * NGramLen;
end;

procedure FuzzyMatching(SearchStr: string; Treshold: Integer; var InFile: Text);
var
TextPara: string;
TextBuffer: string;
TextLen: Integer;
SearchStrLen: Integer;
NGram1Len: Integer;
NGram2Len: Integer;
MatchCount1: Integer;
MatchCount2: Integer;
MaxMatch1: Integer;
MaxMatch2: Integer;
Similarity: Real;
BestSim: Real;
begin
BestSim := 0.0;
SearchStrLen := PrepareTheString(SearchStr, SearchStr);
NGram1Len := 3;
if SearchStrLen < 7 then
NGram2Len := 2
else
NGram2Len := 5;
while not Eof(InFile) do
begin
TextLen := PrepareTheString(TextBuffer, TextPara) + 1;
TextPara := Concat(' ', TextPara);
if TextLen < MaxParLen - 2 then
begin
MatchCount1 := NGramMatch(TextPara, SearchStr, SearchStrLen, NGram1Len, MaxMatch1);
MatchCount2 := NGramMatch(TextPara, SearchStr, SearchStrLen, NGram2Len, MaxMatch2);
Similarity := 100.0 * (MatchCount1 + MatchCount2) / (MaxMatch1 + MaxMatch2);
if Similarity > BestSim then
BestSim := Similarity;
if Similarity >= Treshold then
begin
Writeln;
Writeln('[', Similarity, '] ', TextBuffer);
end;
end;
else
Writeln('Paragraph too long');
end;
if BestSim < Treshold then
Writeln('No match; Best Match was ', BestSim);
end;

begin
Writeln;
Writeln('+------------------------------------------+');
Writeln('| Fuzzy Search in Information Retrieval |');
Writeln('|         (C) 1997 Reinhard Rapp           |');
Writeln('+------------------------------------------+');
Writeln;
Write('Name of file to search in: ');
Write('Search string: ');
SearchStr := Concat(' ', InputStr, ' ');
Write('Minimum hit quality in % : ');
if (Treshold > 0) and (Treshold <= 100) and (SearchStr < > '') and (Filename < > '') then
begin
Assign(InFile, Filename);
Reset(InFile);
FuzzyMatching(SearchStr, Treshold, InFile);
Close(InFile);
end;
Writeln;
Writeln('Bye!');
end.

Solve 3:

unit FuzzyMatch;

{This unit provides a basic 'fuzzy match' index on how alike two strings are
The result is of type 'single': near 0 - poor match
near 1 - close match
The intention is that HowAlike(s1,s2)=HowAlike(s2,s1)
The Function is not case sensitive}

interface

uses sysutils;

function HowAlike(s1, s2: string): single;

implementation

function instr(start: integer; ToSearch, ToFind: string): integer;
begin
//This is a quick implementation of the VB InStr, since Pos just doesn't do what is needed!!
//NB - case sensitive!!
if start > 1 then
Delete(ToSearch, 1, start - 1);
result := pos(ToFind, ToSearch);
if (result > 0) and (start > 1) then
inc(result, start);
end;

function HowAlike(s1, s2: string): single;
var
l1, l2, pass, position, size, foundpos, maxscore: integer;
score, scored, string1pos, string2pos, bestmatchpos: single;
swapstring, searchblock: string;
begin
s1 := Uppercase(trim(s1));
s2 := Uppercase(trim(s2));

score := 0;
maxscore := 0;
scored := 0;

//deal with zero length strings...
if (s1 = '') and (s2 = '') then
begin
result := 1;
exit;
end
else if (s1 = '') or (s2 = '') then
begin
result := 0;
exit;
end;

//why perform any mathematics is the result is clear?
if s1 = s2 then
begin
result := 1;
exit;
end;

//make two passes,
//     with s1 and s2 each way round to ensure
//     consistent results
for pass := 1 to 2 do
begin
l1 := length(s1);
l2 := length(s2);
for size := l1 downto 1 do
begin
for position := 1 to (l1 - size + 1) do
begin
//try to find implied block in the other string
//Big blocks score much better than small blocks
searchblock := copy(s1, position, size);
foundpos := pos(searchblock, s2);

if size = l1 then
string1pos := 0.5
else
string1pos := (position - 1) / (l1 - size);

if foundpos > 0 then
begin
//the string is in somewhere in there
//    - find the 'closest' one.
bestmatchpos := -100; //won't find anything that far away!

repeat
if size = l2 then
string2pos := 0.5
else
string2pos := (foundpos - 1) / (l2 - size);

//If this closer than the previous best?
if abs(string2pos - string1pos) < abs(bestmatchpos - string1pos) then
bestmatchpos := string2pos;

foundpos := instr(foundpos + 1, s2, searchblock);
until foundpos = 0; //loop while foundpos>0..

//The closest position is now known: Score it!
//Score as follows: (1-distance of best match)
score := score + (1 - abs(string1pos - bestmatchpos));
end;

//Keep track if the maximum possible score
//BE CAREFUL IF CHANGING THIS FUNCTION!!!

//maxscore:=maxscore+1;
inc(maxscore);
end; //for position..
end; //for size..

if pass = 1 then
begin
//swap the strings around
swapstring := s1;
s1 := s2;
s2 := swapstring;
end;

//Each pass is weighted equally

scored := scored + (0.5 * (score / maxscore));
score := 0;
maxscore := 0;
end; //for pass..

//HowAlike=score/maxscore
result := scored;
end;

Solve 4:

A Delphi implementation of the Levenshtein Distance Algorithm

unit Levenshtein;

{Objeto que calcula la distancia de Levenshtein entre 2 cadenas.
barbaro@hotpop.com}

interface

uses
sysutils, Math;

type
Tdistance = class(TObject)
private
function minimum(a, b, c: Integer): Integer;
public
function LD(s, t: string): Integer;
end;

implementation

function Tdistance.minimum(a, b, c: Integer): Integer;
var
mi: Integer;
begin
mi := a;
if (b < mi) then
mi := b;
if (c < mi) then
mi := c;
Result := mi;
end;

function Tdistance.LD(s, t: string): Integer;
var
d: array of array of Integer;
n, m, i, j, costo: Integer;
s_i, t_j: char;
begin
n := Length(s);
m := Length(t);
if (n = 0) then
begin
Result := m;
Exit;
end;
if m = 0 then
begin
Result := n;
Exit;
end;
setlength(d, n + 1, m + 1);
for i := 0 to n do
d[i, 0] := i;
for j := 0 to m do
d[0, j] := j;
for i := 1 to n do
begin
s_i := s[i];
for j := 1 to m do
begin
t_j := t[j];
if s_i = t_j then
costo := 0
else
costo := 1;
d[i, j] := Minimum(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + costo);
end;
end;
Result := d[n, m];
end;

end.