external/bsd/openldap/dist/servers/slapd/phonetic.c

   1 /* phonetic.c - routines to do phonetic matching */
   2 /* $OpenLDAP: pkg/ldap/servers/slapd/phonetic.c,v 1.22.2.3 2008/02/11 23:26:44 kurt Exp $ */
   3 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
   4  *
   5  * Copyright 1998-2008 The OpenLDAP Foundation.
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted only as authorized by the OpenLDAP
  10  * Public License.
  11  *
  12  * A copy of this license is available in the file LICENSE in the
  13  * top-level directory of the distribution or, alternatively, at
  14  * <http://www.OpenLDAP.org/license.html>.
  15  */
  16 /* Portions Copyright (c) 1995 Regents of the University of Michigan.
  17  * All rights reserved.
  18  *
  19  * Redistribution and use in source and binary forms are permitted
  20  * provided that this notice is preserved and that due credit is given
  21  * to the University of Michigan at Ann Arbor. The name of the University
  22  * may not be used to endorse or promote products derived from this
  23  * software without specific prior written permission. This software
  24  * is provided ``as is'' without express or implied warranty.
  25  */
  26
  27 #include "portable.h"
  28
  29 #include <stdio.h>
  30
  31 #include <ac/ctype.h>
  32 #include <ac/string.h>
  33 #include <ac/socket.h>
  34 #include <ac/time.h>
  35
  36 #include "slap.h"
  37
  38 #if !defined(SLAPD_METAPHONE) && !defined(SLAPD_PHONETIC)
  39 #define SLAPD_METAPHONE
  40 #endif
  41
  42 #define iswordbreak(x)  (!isascii(x) || isspace((unsigned char) (x)) || \
  43                          ispunct((unsigned char) (x)) || \
  44                          isdigit((unsigned char) (x)) || (x) == '\0')
  45
  46 #if 0
  47 static char *
  48 first_word( char *s )
  49 {
  50         if ( s == NULL ) {
  51                 return( NULL );
  52         }
  53
  54         while ( iswordbreak( *s ) ) {
  55                 if ( *s == '\0' ) {
  56                         return( NULL );
  57                 } else {
  58                         s++;
  59                 }
  60         }
  61
  62         return( s );
  63 }
  64
  65 static char *
  66 next_word( char *s )
  67 {
  68         if ( s == NULL ) {
  69                 return( NULL );
  70         }
  71
  72         while ( ! iswordbreak( *s ) ) {
  73                 s++;
  74         }
  75
  76         while ( iswordbreak( *s ) ) {
  77                 if ( *s == '\0' ) {
  78                         return( NULL );
  79                 } else {
  80                         s++;
  81                 }
  82         }
  83
  84         return( s );
  85 }
  86
  87 static char *
  88 word_dup( char *w )
  89 {
  90         char    *s, *ret;
  91         char    save;
  92
  93         for ( s = w; !iswordbreak( *s ); s++ )
  94                 ;       /* NULL */
  95         save = *s;
  96         *s = '\0';
  97         ret = ch_strdup( w );
  98         *s = save;
  99
 100         return( ret );
 101 }
 102 #endif /* 0 */
 103
 104 #ifndef MAXPHONEMELEN
 105 #define MAXPHONEMELEN   4
 106 #endif
 107
 108 #if defined(SLAPD_PHONETIC)
 109
 110 /* lifted from isode-8.0 */
 111 char *
 112 phonetic( char *s )
 113 {
 114         char    code, adjacent, ch;
 115         char    *p;
 116         int     i;
 117         char    phoneme[MAXPHONEMELEN + 1];
 118
 119         p = s;
 120         if (  p == NULL || *p == '\0' ) {
 121                 return( NULL );
 122         }
 123
 124         adjacent = '0';
 125         phoneme[0] = TOUPPER((unsigned char)*p);
 126
 127         phoneme[1]  = '\0';
 128         for ( i = 0; i < 99 && (! iswordbreak(*p)); p++ ) {
 129                 ch = TOUPPER ((unsigned char)*p);
 130
 131                 code = '0';
 132
 133                 switch (ch) {
 134                 case 'B':
 135                 case 'F':
 136                 case 'P':
 137                 case 'V':
 138                         code = (adjacent != '1') ? '1' : '0';
 139                         break;
 140                 case 'S':
 141                 case 'C':
 142                 case 'G':
 143                 case 'J':
 144                 case 'K':
 145                 case 'Q':
 146                 case 'X':
 147                 case 'Z':
 148                         code = (adjacent != '2') ? '2' : '0';
 149                         break;
 150                 case 'D':
 151                 case 'T':
 152                         code = (adjacent != '3') ? '3' : '0';
 153                         break;
 154                 case 'L':
 155                         code = (adjacent != '4') ? '4' : '0';
 156                         break;
 157                 case 'M':
 158                 case 'N':
 159                         code = (adjacent != '5') ? '5' : '0';
 160                         break;
 161                 case 'R':
 162                         code = (adjacent != '6') ? '6' : '0';
 163                         break;
 164                 default:
 165                         adjacent = '0';
 166                 }
 167
 168                 if ( i == 0 ) {
 169                         adjacent = code;
 170                         i++;
 171                 } else if ( code != '0' ) {
 172                         if ( i == MAXPHONEMELEN )
 173                                 break;
 174                         adjacent = phoneme[i] = code;
 175                         i++;
 176                 }
 177         }
 178
 179         if ( i > 0 )
 180                 phoneme[i] = '\0';
 181
 182         return( ch_strdup( phoneme ) );
 183 }
 184
 185 #elif defined(SLAPD_METAPHONE)
 186
 187 /*
 188  * Metaphone was originally developed by Lawrence Philips and
 189  * published in the "Computer Language" magazine in 1990.
 190  */
 191 /*
 192  * Metaphone copied from C Gazette, June/July 1991, pp 56-57,
 193  * author Gary A. Parker, with changes by Bernard Tiffany of the
 194  * University of Michigan, and more changes by Tim Howes of the
 195  * University of Michigan.
 196  */
 197
 198 /* Character coding array */
 199 static const char  vsvfn[26] = {
 200            1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2,
 201         /* A   B  C   D  E  F  G   H  I  J  K  L  M  */
 202            2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0};
 203         /* N  O  P  Q  R  S  T  U  V  W  X  Y  Z  */
 204
 205 /* Macros to access character coding array */
 206 #define vowel(x)        ((x) != '\0' && vsvfn[(x) - 'A'] & 1)   /* AEIOU */
 207 #define same(x)         ((x) != '\0' && vsvfn[(x) - 'A'] & 2)   /* FJLMNR */
 208 #define varson(x)       ((x) != '\0' && vsvfn[(x) - 'A'] & 4)   /* CGPST */
 209 #define frontv(x)       ((x) != '\0' && vsvfn[(x) - 'A'] & 8)   /* EIY */
 210 #define noghf(x)        ((x) != '\0' && vsvfn[(x) - 'A'] & 16)  /* BDH */
 211
 212 char *
 213 phonetic( char *Word )
 214 {
 215         char           *n, *n_start, *n_end;    /* pointers to string */
 216         char           *metaph_end;     /* pointers to metaph */
 217         char            ntrans[40];     /* word with uppercase letters */
 218         int             KSflag; /* state flag for X -> KS */
 219         char            buf[MAXPHONEMELEN + 2];
 220         char            *Metaph;
 221
 222         /*
 223          * Copy Word to internal buffer, dropping non-alphabetic characters
 224          * and converting to upper case
 225          */
 226
 227         for (n = ntrans + 4, n_end = ntrans + 35; !iswordbreak( *Word ) &&
 228             n < n_end; Word++) {
 229                 if (isalpha((unsigned char)*Word))
 230                         *n++ = TOUPPER((unsigned char)*Word);
 231         }
 232         Metaph = buf;
 233         *Metaph = '\0';
 234         if (n == ntrans + 4) {
 235                 return( ch_strdup( buf ) );             /* Return if null */
 236         }
 237         n_end = n;              /* Set n_end to end of string */
 238
 239         /* ntrans[0] will always be == 0 */
 240         ntrans[0] = '\0';
 241         ntrans[1] = '\0';
 242         ntrans[2] = '\0';
 243         ntrans[3] = '\0';
 244         *n++ = 0;
 245         *n++ = 0;
 246         *n++ = 0;
 247         *n = 0;                 /* Pad with nulls */
 248         n = ntrans + 4;         /* Assign pointer to start */
 249
 250         /* Check for PN, KN, GN, AE, WR, WH, and X at start */
 251         switch (*n) {
 252         case 'P':
 253         case 'K':
 254         case 'G':
 255                 /* 'PN', 'KN', 'GN' becomes 'N' */
 256                 if (*(n + 1) == 'N')
 257                         *n++ = 0;
 258                 break;
 259         case 'A':
 260                 /* 'AE' becomes 'E' */
 261                 if (*(n + 1) == 'E')
 262                         *n++ = 0;
 263                 break;
 264         case 'W':
 265                 /* 'WR' becomes 'R', and 'WH' to 'H' */
 266                 if (*(n + 1) == 'R')
 267                         *n++ = 0;
 268                 else if (*(n + 1) == 'H') {
 269                         *(n + 1) = *n;
 270                         *n++ = 0;
 271                 }
 272                 break;
 273         case 'X':
 274                 /* 'X' becomes 'S' */
 275                 *n = 'S';
 276                 break;
 277         }
 278
 279         /*
 280          * Now, loop step through string, stopping at end of string or when
 281          * the computed 'metaph' is MAXPHONEMELEN characters long
 282          */
 283
 284         KSflag = 0;             /* state flag for KS translation */
 285         for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n;
 286              n <= n_end && Metaph < metaph_end; n++) {
 287                 if (KSflag) {
 288                         KSflag = 0;
 289                         *Metaph++ = 'S';
 290                 } else {
 291                         /* Drop duplicates except for CC */
 292                         if (*(n - 1) == *n && *n != 'C')
 293                                 continue;
 294                         /* Check for F J L M N R or first letter vowel */
 295                         if (same(*n) || (n == n_start && vowel(*n)))
 296                                 *Metaph++ = *n;
 297                         else
 298                                 switch (*n) {
 299                                 case 'B':
 300
 301                                         /*
 302                                          * B unless in -MB
 303                                          */
 304                                         if (n == (n_end - 1) && *(n - 1) != 'M')
 305                                                 *Metaph++ = *n;
 306                                         break;
 307                                 case 'C':
 308
 309                                         /*
 310                                          * X if in -CIA-, -CH- else S if in
 311                                          * -CI-, -CE-, -CY- else dropped if
 312                                          * in -SCI-, -SCE-, -SCY- else K
 313                                          */
 314                                         if (*(n - 1) != 'S' || !frontv(*(n + 1))) {
 315                                                 if (*(n + 1) == 'I' && *(n + 2) == 'A')
 316                                                         *Metaph++ = 'X';
 317                                                 else if (frontv(*(n + 1)))
 318                                                         *Metaph++ = 'S';
 319                                                 else if (*(n + 1) == 'H')
 320                                                         *Metaph++ = ((n == n_start && !vowel(*(n + 2)))
 321                                                          || *(n - 1) == 'S')
 322                                                             ? (char) 'K' : (char) 'X';
 323                                                 else
 324                                                         *Metaph++ = 'K';
 325                                         }
 326                                         break;
 327                                 case 'D':
 328
 329                                         /*
 330                                          * J if in DGE or DGI or DGY else T
 331                                          */
 332                                         *Metaph++ = (*(n + 1) == 'G' && frontv(*(n + 2)))
 333                                             ? (char) 'J' : (char) 'T';
 334                                         break;
 335                                 case 'G':
 336
 337                                         /*
 338                                          * F if in -GH and not B--GH, D--GH,
 339                                          * -H--GH, -H---GH else dropped if
 340                                          * -GNED, -GN, -DGE-, -DGI-, -DGY-
 341                                          * else J if in -GE-, -GI-, -GY- and
 342                                          * not GG else K
 343                                          */
 344                                         if ((*(n + 1) != 'J' || vowel(*(n + 2))) &&
 345                                             (*(n + 1) != 'N' || ((n + 1) < n_end &&
 346                                                                  (*(n + 2) != 'E' || *(n + 3) != 'D'))) &&
 347                                             (*(n - 1) != 'D' || !frontv(*(n + 1))))
 348                                                 *Metaph++ = (frontv(*(n + 1)) &&
 349                                                              *(n + 2) != 'G') ? (char) 'G' : (char) 'K';
 350                                         else if (*(n + 1) == 'H' && !noghf(*(n - 3)) &&
 351                                                  *(n - 4) != 'H')
 352                                                 *Metaph++ = 'F';
 353                                         break;
 354                                 case 'H':
 355
 356                                         /*
 357                                          * H if before a vowel and not after
 358                                          * C, G, P, S, T else dropped
 359                                          */
 360                                         if (!varson(*(n - 1)) && (!vowel(*(n - 1)) ||
 361                                                            vowel(*(n + 1))))
 362                                                 *Metaph++ = 'H';
 363                                         break;
 364                                 case 'K':
 365
 366                                         /*
 367                                          * dropped if after C else K
 368                                          */
 369                                         if (*(n - 1) != 'C')
 370                                                 *Metaph++ = 'K';
 371                                         break;
 372                                 case 'P':
 373
 374                                         /*
 375                                          * F if before H, else P
 376                                          */
 377                                         *Metaph++ = *(n + 1) == 'H' ?
 378                                             (char) 'F' : (char) 'P';
 379                                         break;
 380                                 case 'Q':
 381
 382                                         /*
 383                                          * K
 384                                          */
 385                                         *Metaph++ = 'K';
 386                                         break;
 387                                 case 'S':
 388
 389                                         /*
 390                                          * X in -SH-, -SIO- or -SIA- else S
 391                                          */
 392                                         *Metaph++ = (*(n + 1) == 'H' ||
 393                                                      (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
 394                                                           *(n + 2) == 'A')))
 395                                             ? (char) 'X' : (char) 'S';
 396                                         break;
 397                                 case 'T':
 398
 399                                         /*
 400                                          * X in -TIA- or -TIO- else 0 (zero)
 401                                          * before H else dropped if in -TCH-
 402                                          * else T
 403                                          */
 404                                         if (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
 405                                                            *(n + 2) == 'A'))
 406                                                 *Metaph++ = 'X';
 407                                         else if (*(n + 1) == 'H')
 408                                                 *Metaph++ = '0';
 409                                         else if (*(n + 1) != 'C' || *(n + 2) != 'H')
 410                                                 *Metaph++ = 'T';
 411                                         break;
 412                                 case 'V':
 413
 414                                         /*
 415                                          * F
 416                                          */
 417                                         *Metaph++ = 'F';
 418                                         break;
 419                                 case 'W':
 420
 421                                         /*
 422                                          * W after a vowel, else dropped
 423                                          */
 424                                 case 'Y':
 425
 426                                         /*
 427                                          * Y unless followed by a vowel
 428                                          */
 429                                         if (vowel(*(n + 1)))
 430                                                 *Metaph++ = *n;
 431                                         break;
 432                                 case 'X':
 433
 434                                         /*
 435                                          * KS
 436                                          */
 437                                         if (n == n_start)
 438                                                 *Metaph++ = 'S';
 439                                         else {
 440                                                 *Metaph++ = 'K';        /* Insert K, then S */
 441                                                 KSflag = 1;
 442                                         }
 443                                         break;
 444                                 case 'Z':
 445
 446                                         /*
 447                                          * S
 448                                          */
 449                                         *Metaph++ = 'S';
 450                                         break;
 451                                 }
 452                 }
 453         }
 454
 455         *Metaph = 0;            /* Null terminate */
 456         return( ch_strdup( buf ) );
 457 }
 458
 459 #endif /* SLAPD_METAPHONE */