2006. március 8., szerda

Hyphenation - Dividing Spanish words into syllables


Problem/Question/Abstract:

A simple hyphenation algorithm to syllabicate Spanish words.

Answer:

Sometimes we need to display or print a text, and we'd like to hyphenate long words that don't fit at the end of a line, to prevent them from falling entirely into the next line leaving too much space unused.

The main problem that arises is how to divide a Spanish word into syllables. If you are interested in syllabication for English words, read the note at the end of this article.

procedure Syllabify(Syllables: TStringList; s: string);
// Splits the Spanish word "s" into syllables.
//
// Syllables: receives the result; it is cleared first. Its items, when
//            concatenated in order, reproduce "s". A literal '-' inside
//            the word is returned as a separate item.
// s:         the word to divide. An empty word yields an empty list.
//
// The word is scanned left to right; at each position the letter and its
// two neighbours decide whether a syllable boundary falls before the
// current letter (hyphen = 1) or right after it (hyphen = 2).
const
  Consonants = ['b', 'B', 'c', 'C', 'd', 'D', 'f', 'F', 'g', 'G',
    'h', 'H', 'j', 'J', 'k', 'K', 'l', 'L', 'm', 'M', 'n', 'N',
    'ñ', 'Ñ', 'p', 'P', 'q', 'Q', 'r', 'R', 's', 'S', 't', 'T',
    'v', 'V', 'w', 'W', 'x', 'X', 'y', 'Y', 'z', 'Z'];
  // Accented i/u are listed as strong vowels: the accent breaks the
  // diphthong, so they separate from a neighbouring strong vowel.
  StrongVowels = ['a', 'A', 'á', 'Á', 'e', 'E', 'é', 'É',
    'í', 'Í', 'o', 'ó', 'O', 'Ó', 'ú', 'Ú'];
  WeakVowels = ['i', 'I', 'u', 'U', 'ü', 'Ü'];
  Vowels = StrongVowels + WeakVowels;
var
  i, j, n, m, hyphen: integer;

  // Adds a piece to the result, ignoring empty pieces (these would
  // otherwise appear for an empty word or around a leading/trailing '-').
  procedure AddPiece(const piece: string);
  begin
    if piece <> '' then
      Syllables.Add(piece);
  end;

begin
  j := 2; // Start of the syllable being built
  // Sentinels on both sides let us read s[i - 1] and s[i + 1] safely
  s := #0 + s + #0;
  n := Length(s) - 1; // Index of the last real letter
  i := 2; // Index of the first real letter
  Syllables.Clear;
  while i <= n do
  begin
    hyphen := 0; // Do not hyphenate
    if s[i] in Consonants then
    begin
      if s[i + 1] in Vowels then
      begin
        // Vowel + consonant + vowel: the consonant opens a new syllable
        if s[i - 1] in Vowels then
          hyphen := 1;
      end
      else if (s[i + 1] in Consonants) and
        (s[i - 1] in Vowels) then
      begin
        // Vowel + two consonants: keep inseparable groups together
        // (break before them); otherwise break between the consonants
        if s[i + 1] in ['r', 'R'] then
        begin
          if s[i] in ['b', 'B', 'c', 'C', 'd', 'D', 'f', 'F', 'g',
            'G', 'k', 'K', 'p', 'P', 'r', 'R', 't', 'T', 'v', 'V'] then
            hyphen := 1
          else
            hyphen := 2;
        end
        else if s[i + 1] in ['l', 'L'] then
        begin
          if s[i] in ['b', 'B', 'c', 'C', 'd', 'D', 'f', 'F', 'g',
            'G', 'k', 'K', 'l', 'L', 'p', 'P', 't', 'T', 'v', 'V'] then
            hyphen := 1
          else
            hyphen := 2;
        end
        else if s[i + 1] in ['h', 'H'] then
        begin
          if s[i] in ['c', 'C', 's', 'S', 'p', 'P'] then
            hyphen := 1
          else
            hyphen := 2;
        end
        else
          hyphen := 2;
      end;
    end
    else if s[i] in StrongVowels then
    begin
      // Two strong vowels in a row belong to different syllables
      if (s[i - 1] in StrongVowels) then
        hyphen := 1
    end
    else if s[i] = '-' then
    begin
      // A literal hyphen closes the current syllable and is returned
      // as an item of its own
      AddPiece(Copy(s, j, i - j));
      Syllables.Add('-');
      inc(i);
      j := i;
    end;
    if hyphen = 1 then
    begin // Hyphenate here
      AddPiece(Copy(s, j, i - j));
      j := i;
    end
    else if hyphen = 2 then
    begin // Hyphenate after
      inc(i);
      AddPiece(Copy(s, j, i - j));
      j := i;
    end;
    inc(i);
  end;
  m := Syllables.Count - 1;
  // A single trailing consonant cannot form a syllable: attach it to the
  // previous one, unless the previous item is a literal '-' separator
  if (j = n) and (m >= 0) and (s[n] in Consonants)
    and (Syllables[m] <> '-') then
    Syllables[m] := Syllables[m] + s[n] // Last letter
  else
    AddPiece(Copy(s, j, n - j + 1)); // Last syllable
end;

To test the procedure you can drop an Edit box and a Label on a form and, in the OnChange event of the Edit box, write:

procedure TForm1.Edit1Change(Sender: TObject);
// Shows the text of Edit1 in Label1 with its syllables separated by '-'.
var
  Parts: TStringList;
  Joined: string;
begin
  Parts := TStringList.Create;
  try
    Syllabify(Parts, Edit1.Text);
    // TStringList.Text puts a line break after every item; drop the
    // trailing one, then turn the remaining breaks into hyphens
    Joined := Trim(Parts.Text);
    Label1.Caption := StringReplace(Joined, #13#10, '-', [rfReplaceAll]);
  finally
    Parts.Free;
  end;
end;

Now that we have a syllabication procedure, we have to note that we can't hyphenate a word at just any syllable break. It is usually correct and/or desirable to join small syllables at the left and/or right ends of a word, to guarantee for example that there are at least two syllables on either side of the break when the word gets hyphenated, or - as in the following example - to make sure that there are at least four characters on either side:

procedure ApplyRules(Syllables: TStringList);
// Merges short syllables at both ends of the word so that the first and
// the last item of "Syllables" are each at least four characters long
// (or until only one item is left).
const
  MinPartLength = 4;
var
  last: integer;
begin
  // Left end: absorb the following syllable while the first is too short
  while (Syllables.Count > 1)
    and (Length(Syllables[0]) < MinPartLength) do
  begin
    Syllables[0] := Syllables[0] + Syllables[1];
    Syllables.Delete(1);
  end;
  // Right end: fold the last syllable into the previous one while it is
  // too short
  while (Syllables.Count > 1)
    and (Length(Syllables[Syllables.Count - 1]) < MinPartLength) do
  begin
    last := Syllables.Count - 1;
    Syllables[last - 1] := Syllables[last - 1] + Syllables[last];
    Syllables.Delete(last);
  end;
end;

Finally, it is time to parse the text, breaking each paragraph into lines and determining which words should be hyphenated. The following example does that with a text to be displayed in a Memo:

procedure Hyphenate(Memo: TMemo; OriginalText: TStrings);
// Fills "Memo" with the paragraphs of "OriginalText", breaking each
// paragraph into lines that fit the Memo's client width and hyphenating
// (with Syllabify + ApplyRules) the word that falls on a line end.
//
// Memo:         destination; its lines are cleared first. Its font and
//               client width determine where lines are broken.
// OriginalText: one paragraph per item.
//
// Every pass of the inner loop removes at least one character from the
// pending text, so the procedure terminates even when the Memo is
// narrower than a single syllable.
var
  paragraph, i, j, k, m, n, MaxLineWidth: integer;
  s, line: string;
  Bitmap: TBitmap;
  Canvas: TCanvas;
  Syllables: TStringList;
begin
  Syllables := TStringList.Create;
  try
    // We need a canvas to use its TextWidth method to get the width
    // of the text to see if it fits in the client area or not. The
    // TMemo class doesn't have a Canvas property, so we have to
    // create one of our own.
    Bitmap := TBitmap.Create;
    Canvas := Bitmap.Canvas;
    try
      Canvas.Font := Memo.Font;
      MaxLineWidth := Memo.ClientWidth - 6; // Maximum width
      Memo.Lines.Clear;
      for paragraph := 0 to OriginalText.Count - 1 do
      begin
        // For each paragraph
        s := OriginalText[paragraph]; // Get the original paragraph
        // Get the lines in which we have to break the paragraph.
        // The (s <> '') test keeps us from indexing an empty string
        // when MaxLineWidth is negative.
        while (s <> '') and (Canvas.TextWidth(s) > MaxLineWidth) do
        begin
          // First we find (in "j") the index of the start of the
          // first word that doesn't fit (the one to hyphenate)
          // and (in "i") the first character that doesn't fit
          j := 1;
          n := Length(s);
          i := 2;
          while i <= n do
          begin
            if (s[i - 1] = ' ') and (s[i] <> ' ') then
              j := i; // last beginning of a word
            if Canvas.TextWidth(Copy(s, 1, i)) > MaxLineWidth then
              break; // reached a width that doesn't fit
            inc(i);
          end;
          // Where does the break occur? ("i" can be n + 1 when the
          // text is a single character, hence the range test)
          if (i <= n) and (s[i] = ' ') then
          begin
            // Great! We break on a space
            Memo.Lines.Add(Copy(s, 1, i - 1)); // Add the line
            s := Copy(s, i + 1, n - i); // Remove the line
          end
          else
          begin
            // We break somewhere in a word. Now, we find (in "k")
            // the first space after the word (k)
            k := j + 1;
            while (k <= n) and (s[k] <> ' ') do
              inc(k);
            // Divide the word in Syllables
            Syllabify(Syllables, Copy(s, j, k - j));
            ApplyRules(Syllables);
            // Check (in "m") how many syllables fit
            m := 0;
            Line := Copy(s, 1, j - 1);
            while (m < Syllables.Count) and
              (Canvas.TextWidth(Line + Syllables[m] + '-')
              <= MaxLineWidth) do
            begin
              Line := Line + Syllables[m];
              inc(m);
            end;
            if m = 0 then
            begin
              // No syllable fits. If there is text before the word,
              // the line simply ends there ("j" already points at the
              // word). If the word starts the line, nothing would be
              // consumed, so force a break at the last character that
              // fits (at least one) to guarantee progress.
              if j = 1 then
              begin
                Line := Copy(s, 1, i - 1);
                j := i;
              end;
            end
            else if (m < Syllables.Count)
              and (Syllables[m - 1] <> '-') then
            begin
              // Hyphenate
              Line := Line + '-';
              // The added '-' makes Length(Line) equal to the index
              // of the first character not yet consumed
              j := Length(Line);
              // The word's own '-' is replaced by the one just added
              if Syllables[m] = '-' then
                inc(j);
            end
            else
              // The line already ends in the word's own '-' (or the
              // whole word fit): continue right after what was used
              j := Length(Line) + 1;
            Memo.Lines.Add(Line); // Add the line
            s := Copy(s, j, n - j + 1); // Remove the line
          end;
        end;
        Memo.Lines.Add(s); // Add the last line (it fits)
      end;
    finally
      Bitmap.Free;
    end;
  finally
    Syllables.Free;
  end;
end;

To test the procedure, drop a Memo component on a form, align it for example to the top of the form (Align = alTop) and write the following code in the OnResize event of the form:

procedure TForm1.FormResize(Sender: TObject);
// Re-flows the sample text into Memo1 whenever the form is resized.
var
  OriginalText: TStringList;
begin
  OriginalText := TStringList.Create;
  try
    // Each Add call supplies one paragraph of Spanish sample text
    OriginalText.Add('Si se ha preguntado cómo hacen los '
      + 'programas procesamiento de textos para dividir palabras '
      + 'con de guiones al final de una línea, he aquí un '
      + 'ejemplo sencillo (en comparación con los que usan las '
      + 'aplicaciones de procesamiento de textos).');
    OriginalText.Add('Este es un segundo párrafo que se provee '
      + 'con fines de ejemplo.');
    Hyphenate(Memo1, OriginalText);
  finally
    OriginalText.Free;
  end;
end;

NOTE:

English words are hyphenated phonetically, so the process would have two phases:

produce a phonetic representation of the word using pronunciation rules; and
perform the hyphenation of the phonetic representation using hyphenation rules (and apply the same breaks, in parallel, to the original word).

There are rules for both things, and also exceptions, so a small exceptions dictionary may be needed. Of course, it's all easier said than done. I realize it is somewhat complex, but I still believe it is possible to syllabicate English words algorithmically.

Copyright (c) 2001 Ernesto De Spirito
Visit: http://www.latiumsoftware.com/delphi-newsletter.php

Nincsenek megjegyzések:

Megjegyzés küldése