2004. január 12., hétfő

How to search for a string using the Soundex algorithm


Problem/Question/Abstract:

How to search for a string using the Soundex algorithm

Answer:

Solve 1:

unit SndxAlgs;

interface

uses
  SysUtils;

function Soundex(in_str: string): string;
function NumericSoundex(in_str: string): Smallint;
function ExtendedSoundex(in_str: string): string;

implementation

{Calculate a normal Soundex encoding.}

function Soundex(in_str: string): string;
var
  no_vowels, coded, out_str: string;
  ch: Char;
  i: Integer;
begin
  {Make upper case and remove leading and trailing spaces.}
  in_str := Trim(UpperCase(in_str));
  {Remove vowels, spaces, H, W, and Y except for the first character.}
  no_vowels := in_str[1];
  for i := 2 to Length(in_str) do
  begin
    ch := in_str[i];
    case ch of
      'A', 'E', 'I', 'O', 'U', ' ', 'H', 'W', 'Y':
        ; {Do nothing.}
    else
      no_vowels := no_vowels + ch;
    end;
  end;
  {Encode the characters.}
  for i := 1 to Length(no_vowels) do
  begin
    ch := no_vowels[i];
    case ch of
      'B', 'F', 'P', 'V': ch := '1';
      'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z': ch := '2';
      'D', 'T': ch := '3';
      'L': ch := '4';
      'M', 'N': ch := '5';
      'R': ch := '6';
    else {Vowels, H, W, and Y as the 1st letter.}
      ch := '0';
    end;
    coded := coded + ch;
  end;
  {Use the first letter.}
  out_str := no_vowels[1];
  {Find three non-repeating codes.}
  for i := 2 to Length(no_vowels) do
  begin
    {Look for a non-repeating code.}
    if (coded[i] <> coded[i - 1]) then
    begin
      {This one works.}
      out_str := out_str + coded[i];
      if (Length(out_str) >= 4) then
        Break;
    end;
  end;
  Soundex := out_str;
end;

{Calculate a numeric Soundex encoding.}

function NumericSoundex(in_str: string): Smallint;
var
  value: Integer;
begin
  {Calculate the normal Soundex encoding.}
  in_str := Soundex(in_str);
  {Convert this into a numeric value.}
  value := (Ord(in_str[1]) - Ord('A')) * 1000;
  if (Length(in_str) > 1) then
    value := value + StrToInt(Copy(in_str, 2, Length(in_str) - 1));
  NumericSoundex := value;
end;

{Calculate an extended Soundex encoding.}

function ExtendedSoundex(in_str: string): string;

{Replace instances of fr_str with to_str in str.}
  procedure ReplaceString(var str: string; fr_str, to_str: string);
  var
    fr_len, i: Integer;
  begin
    fr_len := Length(fr_str);
    i := Pos(fr_str, str);
    while (i > 0) do
    begin
      str := Copy(str, 1, i - 1) + to_str + Copy(str, i + fr_len, Length(str) - i - fr_len + 1);
      i := Pos(fr_str, str);
    end;
  end;

var
  no_vowels: string;
  ch, last_ch: Char;
  i: Integer;
begin
  {Make upper case and remove leading and trailing spaces.}
  in_str := Trim(UpperCase(in_str));
  {Remove internal spaces.}
  ReplaceString(in_str, ' ', '');
  {Convert CHR to CR.}
  ReplaceString(in_str, 'CHR', 'CR');
  {Convert PH to F.}
  ReplaceString(in_str, 'PH', 'F');
  {Convert Z to S.}
  ReplaceString(in_str, 'Z', 'S');
  {Remove vowels and repeats.}
  last_ch := in_str[1]; {The last character used.}
  no_vowels := last_ch;
  for i := 2 to Length(in_str) do
  begin
    ch := in_str[i];
    case ch of
      'A', 'E', 'I', 'O', 'U':
        ; {Do nothing.}
    else
      {Skip it if it's a duplicate.}
      if (ch <> last_ch) then
      begin
        no_vowels := no_vowels + ch;
        last_ch := ch;
      end;
    end;
  end;
  ExtendedSoundex := no_vowels;
end;

end.

Used like this:

unit Sndx;

interface

uses
  Windows, Messages, SysUtils, Classes, Graphics, Controls, Forms, Dialogs,
  StdCtrls, ExtCtrls, SndxAlgs;

type
  TForm1 = class(TForm)
    InputText: TEdit;
    Label1: TLabel;
    CmdEncode: TButton;
    Label2: TLabel;
    Label3: TLabel;
    Panel1: TPanel;
    SoundexLabel: TLabel;
    Panel2: TPanel;
    NumericLabel: TLabel;
    Label4: TLabel;
    Panel3: TPanel;
    ExtendedLabel: TLabel;
    procedure CmdEncodeClick(Sender: TObject);
  private
    { Private declarations }
  public
    { Public declarations }
  end;

var
  Form1: TForm1;

implementation

{$R *.DFM}

procedure TForm1.CmdEncodeClick(Sender: TObject);
begin
  SoundexLabel.Caption := Soundex(InputText.Text);
  NumericLabel.Caption := Format('%d', [NumericSoundex(InputText.Text)]);
  ExtendedLabel.Caption := ExtendedSoundex(InputText.Text);
end;

end.


Solve 2:

The code below is designed for use in English language and does not work for special characters like French accents or German Umlauts

function StrSoundEx(const OrgString: string): string;
var
  s: string;
  PrevCh: Char;
  Ch: Char;
  i: Integer;
begin
  s := UpperCase(Trim(OrgString));
  if s <> '' then
  begin
    PrevCh := #0;
    result := s[1];
    for i := 2 to Length(s) do
    begin
      if Length(result) = 4 then
        break;
      Ch := s[i];
      if (Ch <> PrevCh) then
      begin
        if Ch in ['B', 'P', 'F', 'V'] then
          result := result + '1'
        else if Ch in ['C', 'S', 'K', 'G', 'J', 'Q', 'X', 'Z'] then
          result := result + '2'
        else if Ch in ['D', 'T'] then
          result := result + '3'
        else if Ch in ['L'] then
          result := result + '4'
        else if Ch in ['M', 'N'] then
          result := result + '5'
        else if Ch in ['R'] then
          result := result + '6';
        PrevCh := Ch;
      end;
    end;
  end;
  while Length(result) < 4 do
    result := result + '0';
end;


Solve 3:

The following differs from the standard Russell Soundex algorithm in that it lets you set the size of the Soundex code to something other than four characters:

{Given a string this fuction returns the Russell Soundex code for that string. Although the Russell Soundex code is limited to four characters this function allows you to get a code up to 16 characters in length. For names a six to eight character code reduces the number of false matches significantly.

Parameters:
TheWord: The string to be encoded.
SoundexSize: The number of characters in the returned code.

Returns: The Soundex code.}

function dgGetSoundexCode(TheWord: string; SoundexSize: Integer): string;
const
  MaxSize = 16;
var
  I: Integer;
  WorkString1, WorkString2: string;
begin
  {Raise an exception if the SoundexSize parameter is not in the allowed range}
  if not SoundexSize in [1..MaxSize] then
    raise Exception.Create('Soundex size must in the range 1 - 16.');
  {Convert the word to upper case}
  TheWord := UpperCase(TheWord);
  {Copy the first letter}
  WorkString1 := TheWord[1];
  {Copy the rest of the word to WordString1 deleting duplicate letters}
  for I := 2 to Length(TheWord) do
    if TheWord[I - 1] <> TheWord[I] then
      AppendStr(WorkString1, TheWord[I]);
  {Move the first letter to WorkString2}
  WorkString2 := WorkString1[1];
  {Compute the Soundex codes for the remaining letters}
  for I := 2 to Length(WorkString1) do
    case WorkString1[I] of
      'B', 'F', 'P', 'V':
        AppendStr(WorkString2, '1');
      'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z':
        Appendstr(WorkString2, '2');
      'D', 'T':
        Appendstr(WorkString2, '3');
      'L':
        Appendstr(WorkString2, '4');
      'M', 'N':
        Appendstr(WorkString2, '5');
      'R':
        Appendstr(WorkString2, '6');
    end;
  {Pad the string with zeros}
  WorkString1 := '';
  WorkString1 := dgFillString('0', MaxSize);
  AppendStr(WorkString2, WorkString1);
  Result := Copy(WorkString2, 1, SoundexSize);
end;

Nincsenek megjegyzések:

Megjegyzés küldése