2004. október 7., csütörtök

How to convert accented characters to unaccented ones


Problem/Question/Abstract:

Is there a way to convert accented characters to unaccented (meaning ASCII A - Z, a - z)?

Answer:

The classical way is to keep a conversion table and look each character up in it. The drawback is that such a table is specific to one character set (encoding), for example Windows-1252 (often called Windows Latin-1). To get around this limitation you could build a table for a range of Unicode (WideChar) characters and convert the strings to WideStrings before removing the accents. The routine below uses ANSI characters in the Windows Western (code page 1252, Latin-1) encoding.

{ Returns the unaccented ASCII base letter for an accented Latin letter.

  _ch     the character to simplify.
  Result  the base letter ('A'..'Z', 'a'..'z') when _ch is one of the
          accented letters in the range #192..#255; otherwise _ch unchanged.

  Only codes #192..#255 are rewritten. Codes #128..#191 are kept as they
  are: they are punctuation/symbols, and the meaning of #128..#159 depends
  on the code page in use. Letters that have no single-letter ASCII
  equivalent (AE ligature, Eth, Thorn, sharp s, O with stroke) are also
  returned unchanged. }
function SimplifyChar(const _ch: char): char;
const
  Charmap: array[#128..#255] of Char = (
    { #128..#159: code-page specific, left untouched }
    #128, #129, #130, #131, #132, #133, #134, #135,
    #136, #137, #138, #139, #140, #141, #142, #143,
    #144, #145, #146, #147, #148, #149, #150, #151,
    #152, #153, #154, #155, #156, #157, #158, #159,
    { #160..#191: spacing, punctuation and symbols, left untouched }
    #160, #161, #162, #163, #164, #165, #166, #167,
    #168, #169, #170, #171, #172, #173, #174, #175,
    #176, #177, #178, #179, #180, #181, #182, #183,
    #184, #185, #186, #187, #188, #189, #190, #191,
    { #192..#199: A grave, acute, circumflex, tilde, diaeresis, ring;
                  AE ligature (kept); C cedilla }
    'A', 'A', 'A', 'A', 'A', 'A', #198, 'C',
    { #200..#207: E grave, acute, circumflex, diaeresis;
                  I grave, acute, circumflex, diaeresis }
    'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
    { #208..#215: Eth (kept); N tilde; O grave, acute, circumflex, tilde,
                  diaeresis; multiplication sign (kept) }
    #208, 'N', 'O', 'O', 'O', 'O', 'O', #215,
    { #216..#223: O with stroke (kept); U grave, acute, circumflex,
                  diaeresis; Y acute; Thorn (kept); sharp s (kept) }
    #216, 'U', 'U', 'U', 'U', 'Y', #222, #223,
    { #224..#231: a grave, acute, circumflex, tilde, diaeresis, ring;
                  ae ligature (kept); c cedilla }
    'a', 'a', 'a', 'a', 'a', 'a', #230, 'c',
    { #232..#239: e grave, acute, circumflex, diaeresis;
                  i grave, acute, circumflex, diaeresis }
    'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
    { #240..#247: eth (kept); n tilde; o grave, acute, circumflex, tilde,
                  diaeresis; division sign (kept) }
    #240, 'n', 'o', 'o', 'o', 'o', 'o', #247,
    { #248..#255: o with stroke (kept); u grave, acute, circumflex,
                  diaeresis; y acute; thorn (kept); y diaeresis }
    #248, 'u', 'u', 'u', 'u', 'y', #254, 'y'
    );
begin
  { Both bounds are checked so that a character beyond #255 (possible when
    Char is wider than 8 bits) never indexes outside the table. }
  case _ch of
    #128..#255:
      Result := Charmap[_ch];
  else
    Result := _ch;
  end;
end;

The charmap table was created by this little routine and then edited:

{ Builds the Pascal source text of an identity lookup table named Charmap
  covering fromchar..tochar and puts that text on the clipboard.

  Each element is written as "#nnn { x }", where the comment shows the
  character the code stands for, and elements are wrapped onto lines of
  roughly 66 characters. The generated text is meant to be pasted into a
  unit and then edited by hand.

  fromchar  first character of the table; must not be greater than tochar
            (checked with Assert).
  tochar    last character of the table (inclusive). }
procedure CreateCharacterMap(fromchar, tochar: Char);

  { Returns the text shown inside the comment next to a table element.
    Control characters (below #32) are shown in caret notation. The brace
    characters are spelled out in words: a literal brace inside the
    generated brace comment would end it early or leave it unclosed,
    making the generated source invalid. }
  function DisplayStr(const ch: Char): string;
  begin
    case ch of
      #0..#31:
        Result := '^' + Chr(Ord('A') - 1 + Ord(ch));
      '{':
        Result := 'left brace';
      '}':
        Result := 'right brace';
    else
      Result := ch;
    end;
  end;

var
  sl: TStringList;
  line, element: string;
  ch: Char;
begin
  Assert(fromchar <= tochar);
  sl := TStringList.Create;
  try
    sl.Add('Const');
    line := Format('  Charmap: array [#%d..#%d] of Char = (', [Ord(fromchar),
      Ord(tochar)]);
    sl.Add(line);
    line := '';
    for ch := fromchar to tochar do
    begin
      element := Format('#%3.3d { %s }', [Ord(ch), DisplayStr(ch)]);
      { Start a new output line once the next element would push the
        current one past 66 characters. }
      if (Length(line) + Length(element)) > 66 then
      begin
        sl.Add('    ' + line);
        line := '';
      end;
      line := line + element;
      { No separator after the final element of the table. }
      if ch <> tochar then
        line := line + ', ';
    end;
    sl.Add('    ' + line);
    sl.Add('    );');
    Clipboard.AsText := sl.Text;
  finally
    sl.Free;
  end;
end;

Nincsenek megjegyzések:

Megjegyzés küldése