extensions/spellcheck/hunspell/src/phonet.cpp

   1 /******* BEGIN LICENSE BLOCK *******
   2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   3  *
   4  * The contents of this file are subject to the Mozilla Public License Version
   5  * 1.1 (the "License"); you may not use this file except in compliance with
   6  * the License. You may obtain a copy of the License at
   7  * http://www.mozilla.org/MPL/
   8  *
   9  * Software distributed under the License is distributed on an "AS IS" basis,
  10  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  11  * for the specific language governing rights and limitations under the
  12  * License.
  13  *
  14  * The Initial Developer of the Original Code is Björn Jacke. Portions created
  15  * by the Initial Developers are Copyright (C) 2000-2007 the Initial
  16  * Developers. All Rights Reserved.
  17  *
  18  * Contributor(s): Björn Jacke (bjoern.jacke@gmx.de)
  19  *                 László Németh (nemethl@gyorsposta.hu)
  20  *
  21  * Alternatively, the contents of this file may be used under the terms of
  22  * either the GNU General Public License Version 2 or later (the "GPL"), or
  23  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  24  * in which case the provisions of the GPL or the LGPL are applicable instead
  25  * of those above. If you wish to allow use of your version of this file only
  26  * under the terms of either the GPL or the LGPL, and not to allow others to
  27  * use your version of this file under the terms of the MPL, indicate your
  28  * decision by deleting the provisions above and replace them with the notice
  29  * and other provisions required by the GPL or the LGPL. If you do not delete
  30  * the provisions above, a recipient may use your version of this file under
  31  * the terms of any one of the MPL, the GPL or the LGPL.
  32  *
  33  * Changelog:
  34  *  2000-01-05  Björn Jacke <bjoern.jacke AT gmx.de>
   35  *              Initial Release inspired by the article about phonetic
  36  *              transformations out of c't 25/1999
  37  *
  38  *  2007-07-26  Björn Jacke <bjoern.jacke AT gmx.de>
  39  *              Released under MPL/GPL/LGPL tri-license for Hunspell
  40  *
  41  *  2007-08-23  László Németh <nemeth at OOo>
  42  *              Porting from Aspell to Hunspell using C-like structs
  43  *
  44  ******* END LICENSE BLOCK *******/
  45
  46 #ifndef MOZILLA_CLIENT
  47 #include <cstdlib>
  48 #include <cstring>
  49 #include <cstdio>
  50 #include <cctype>
  51 #else
  52 #include <stdlib.h>
  53 #include <string.h>
  54 #include <stdio.h>
  55 #include <ctype.h>
  56 #endif
  57
  58 #include "csutil.hxx"
  59 #include "phonet.hxx"
  60
  61 void init_phonet_hash(phonetable & parms)
  62   {
  63     int i, k;
  64
  65     for (i = 0; i < HASHSIZE; i++) {
  66       parms.hash[i] = -1;
  67     }
  68
  69     for (i = 0; parms.rules[i][0] != '\0'; i += 2) {
  70       /**  set hash value  **/
  71       k = (unsigned char) parms.rules[i][0];
  72
  73       if (parms.hash[k] < 0) {
  74         parms.hash[k] = i;
  75       }
  76     }
  77   }
  78
  // Copy the NUL-terminated string at src (terminator included) to dest.
  // Unlike strcpy, the two ranges may overlap, and unlike a forward
  // byte-by-byte loop this is correct whichever of dest/src comes first,
  // because memmove handles both overlap directions.
  static inline void strmove(char * dest, const char * src) {
    memmove(dest, src, strlen(src) + 1);
  }
  86
  87 int myisalpha(char ch) {
  88   if ((unsigned char) ch < 128) return isalpha(ch);
  89   return 1;
  90 }
  91
  92 /*  phonetic transcription algorithm                   */
  93 /*  see: http://aspell.net/man-html/Phonetic-Code.html */
  94 /*  convert string to uppercase before this call       */
  95 int phonet (const char * inword, char * target,
  96               int len,
  97               phonetable & parms)
  98   {
  99     /**       Do phonetic transformation.       **/
 100     /**  "len" = length of "inword" incl. '\0'. **/
 101
 102     /**  result:  >= 0:  length of "target"    **/
 103     /**            otherwise:  error            **/
 104
 105     int  i,j,k=0,n,p,z;
 106     int  k0,n0,p0=-333,z0;
 107     char c, c0;
 108     const char * s;
 109     typedef unsigned char uchar;
 110     char word[MAXPHONETUTF8LEN + 1];
 111     if (len == -1) len = strlen(inword);
 112     if (len > MAXPHONETUTF8LEN) return 0;
 113     strcpy(word, inword);
 114
 115     /**  check word  **/
 116     i = j = z = 0;
 117     while ((c = word[i]) != '\0') {
 118       n = parms.hash[(uchar) c];
 119       z0 = 0;
 120
 121       if (n >= 0) {
 122         /**  check all rules for the same letter  **/
 123         while (parms.rules[n][0] == c) {
 124
 125           /**  check whole string  **/
 126           k = 1;   /** number of found letters  **/
 127           p = 5;   /** default priority  **/
 128           s = parms.rules[n];
 129           s++;     /**  important for (see below)  "*(s-1)"  **/
 130
 131           while (*s != '\0'  &&  word[i+k] == *s
 132                  &&  !isdigit ((unsigned char) *s)  &&  strchr ("(-<^$", *s) == NULL) {
 133             k++;
 134             s++;
 135           }
 136           if (*s == '(') {
 137             /**  check letters in "(..)"  **/
 138             if (myisalpha(word[i+k])  // ...could be implied?
 139                 && strchr(s+1, word[i+k]) != NULL) {
 140               k++;
 141               while (*s != ')')
 142                 s++;
 143               s++;
 144             }
 145           }
 146           p0 = (int) *s;
 147           k0 = k;
 148           while (*s == '-'  &&  k > 1) {
 149             k--;
 150             s++;
 151           }
 152           if (*s == '<')
 153             s++;
 154           if (isdigit ((unsigned char) *s)) {
 155             /**  determine priority  **/
 156             p = *s - '0';
 157             s++;
 158           }
 159           if (*s == '^'  &&  *(s+1) == '^')
 160             s++;
 161
 162           if (*s == '\0'
 163               || (*s == '^'
 164                   && (i == 0  ||  ! myisalpha(word[i-1]))
 165                   && (*(s+1) != '$'
 166                       || (! myisalpha(word[i+k0]) )))
 167               || (*s == '$'  &&  i > 0
 168                   &&  myisalpha(word[i-1])
 169                   && (! myisalpha(word[i+k0]) )))
 170           {
 171             /**  search for followup rules, if:     **/
 172             /**  parms.followup and k > 1  and  NO '-' in searchstring **/
 173             c0 = word[i+k-1];
 174             n0 = parms.hash[(uchar) c0];
 175
 176 //            if (parms.followup  &&  k > 1  &&  n0 >= 0
 177             if (k > 1  &&  n0 >= 0
 178                 &&  p0 != (int) '-'  &&  word[i+k] != '\0') {
 179               /**  test follow-up rule for "word[i+k]"  **/
 180               while (parms.rules[n0][0] == c0) {
 181
 182                 /**  check whole string  **/
 183                 k0 = k;
 184                 p0 = 5;
 185                 s = parms.rules[n0];
 186                 s++;
 187                 while (*s != '\0'  &&  word[i+k0] == *s
 188                        && ! isdigit((unsigned char) *s)  &&  strchr("(-<^$",*s) == NULL) {
 189                   k0++;
 190                   s++;
 191                 }
 192                 if (*s == '(') {
 193                   /**  check letters  **/
 194                   if (myisalpha(word[i+k0])
 195                       &&  strchr (s+1, word[i+k0]) != NULL) {
 196                     k0++;
 197                     while (*s != ')'  &&  *s != '\0')
 198                       s++;
 199                     if (*s == ')')
 200                       s++;
 201                   }
 202                 }
 203                 while (*s == '-') {
 204                   /**  "k0" gets NOT reduced   **/
 205                   /**  because "if (k0 == k)"  **/
 206                   s++;
 207                 }
 208                 if (*s == '<')
 209                   s++;
 210                 if (isdigit ((unsigned char) *s)) {
 211                   p0 = *s - '0';
 212                   s++;
 213                 }
 214
 215                 if (*s == '\0'
 216                     /**  *s == '^' cuts  **/
 217                     || (*s == '$'  &&  ! myisalpha(word[i+k0])))
 218                 {
 219                   if (k0 == k) {
 220                     /**  this is just a piece of the string  **/
 221                     n0 += 2;
 222                     continue;
 223                   }
 224
 225                   if (p0 < p) {
 226                     /**  priority too low  **/
 227                     n0 += 2;
 228                     continue;
 229                   }
 230                   /**  rule fits; stop search  **/
 231                   break;
 232                 }
 233                 n0 += 2;
 234               } /**  End of "while (parms.rules[n0][0] == c0)"  **/
 235
 236               if (p0 >= p  && parms.rules[n0][0] == c0) {
 237                 n += 2;
 238                 continue;
 239               }
 240             } /** end of follow-up stuff **/
 241
 242             /**  replace string  **/
 243             s = parms.rules[n+1];
 244             p0 = (parms.rules[n][0] != '\0'
 245                  &&  strchr (parms.rules[n]+1,'<') != NULL) ? 1:0;
 246             if (p0 == 1 &&  z == 0) {
 247               /**  rule with '<' is used  **/
 248               if (j > 0  &&  *s != '\0'
 249                  && (target[j-1] == c  ||  target[j-1] == *s)) {
 250                 j--;
 251               }
 252               z0 = 1;
 253               z = 1;
 254               k0 = 0;
 255               while (*s != '\0'  &&  word[i+k0] != '\0') {
 256                 word[i+k0] = *s;
 257                 k0++;
 258                 s++;
 259               }
 260               if (k > k0)
 261                 strmove (&word[0]+i+k0, &word[0]+i+k);
 262
 263               /**  new "actual letter"  **/
 264               c = word[i];
 265             }
 266             else { /** no '<' rule used **/
 267               i += k - 1;
 268               z = 0;
 269               while (*s != '\0'
 270                      &&  *(s+1) != '\0'  &&  j < len) {
 271                 if (j == 0  ||  target[j-1] != *s) {
 272                   target[j] = *s;
 273                   j++;
 274                 }
 275                 s++;
 276               }
 277               /**  new "actual letter"  **/
 278               c = *s;
 279               if (parms.rules[n][0] != '\0'
 280                  &&  strstr (parms.rules[n]+1, "^^") != NULL) {
 281                 if (c != '\0') {
 282                   target[j] = c;
 283                   j++;
 284                 }
 285                 strmove (&word[0], &word[0]+i+1);
 286                 i = 0;
 287                 z0 = 1;
 288               }
 289             }
 290             break;
 291           }  /** end of follow-up stuff **/
 292           n += 2;
 293         } /**  end of while (parms.rules[n][0] == c)  **/
 294       } /**  end of if (n >= 0)  **/
 295       if (z0 == 0) {
 296 //        if (k && (assert(p0!=-333),!p0) &&  j < len &&  c != '\0'
 297 //           && (!parms.collapse_result  ||  j == 0  ||  target[j-1] != c)){
 298         if (k && !p0 && j < len &&  c != '\0'
 299            && (1 || j == 0  ||  target[j-1] != c)){
 300            /**  condense only double letters  **/
 301           target[j] = c;
 302           ///printf("\n setting \n");
 303           j++;
 304         }
 305
 306         i++;
 307         z = 0;
 308         k=0;
 309       }
 310     }  /**  end of   while ((c = word[i]) != '\0')  **/
 311
 312     target[j] = '\0';
 313     return (j);
 314
 315   }  /**  end of function "phonet"  **/