Patrick Welche <prlw1@cam.ac.uk>
[netbsd-mini2440.git] / external / bsd / openldap / dist / servers / slapd / phonetic.c
blob10c394672be7dfa55a6abaebc45ffcc27e8968e0
1 /* phonetic.c - routines to do phonetic matching */
2 /* $OpenLDAP: pkg/ldap/servers/slapd/phonetic.c,v 1.22.2.3 2008/02/11 23:26:44 kurt Exp $ */
3 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5 * Copyright 1998-2008 The OpenLDAP Foundation.
6 * All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted only as authorized by the OpenLDAP
10 * Public License.
12 * A copy of this license is available in the file LICENSE in the
13 * top-level directory of the distribution or, alternatively, at
14 * <http://www.OpenLDAP.org/license.html>.
16 /* Portions Copyright (c) 1995 Regents of the University of Michigan.
17 * All rights reserved.
19 * Redistribution and use in source and binary forms are permitted
20 * provided that this notice is preserved and that due credit is given
21 * to the University of Michigan at Ann Arbor. The name of the University
22 * may not be used to endorse or promote products derived from this
23 * software without specific prior written permission. This software
24 * is provided ``as is'' without express or implied warranty.
27 #include "portable.h"
29 #include <stdio.h>
31 #include <ac/ctype.h>
32 #include <ac/string.h>
33 #include <ac/socket.h>
34 #include <ac/time.h>
36 #include "slap.h"
38 #if !defined(SLAPD_METAPHONE) && !defined(SLAPD_PHONETIC)
39 #define SLAPD_METAPHONE
40 #endif
42 #define iswordbreak(x) (!isascii(x) || isspace((unsigned char) (x)) || \
43 ispunct((unsigned char) (x)) || \
44 isdigit((unsigned char) (x)) || (x) == '\0')
46 #if 0
47 static char *
48 first_word( char *s )
50 if ( s == NULL ) {
51 return( NULL );
54 while ( iswordbreak( *s ) ) {
55 if ( *s == '\0' ) {
56 return( NULL );
57 } else {
58 s++;
62 return( s );
65 static char *
66 next_word( char *s )
68 if ( s == NULL ) {
69 return( NULL );
72 while ( ! iswordbreak( *s ) ) {
73 s++;
76 while ( iswordbreak( *s ) ) {
77 if ( *s == '\0' ) {
78 return( NULL );
79 } else {
80 s++;
84 return( s );
87 static char *
88 word_dup( char *w )
90 char *s, *ret;
91 char save;
93 for ( s = w; !iswordbreak( *s ); s++ )
94 ; /* NULL */
95 save = *s;
96 *s = '\0';
97 ret = ch_strdup( w );
98 *s = save;
100 return( ret );
102 #endif /* 0 */
104 #ifndef MAXPHONEMELEN
105 #define MAXPHONEMELEN 4
106 #endif
108 #if defined(SLAPD_PHONETIC)
110 /* lifted from isode-8.0 */
111 char *
112 phonetic( char *s )
114 char code, adjacent, ch;
115 char *p;
116 int i;
117 char phoneme[MAXPHONEMELEN + 1];
119 p = s;
120 if ( p == NULL || *p == '\0' ) {
121 return( NULL );
124 adjacent = '0';
125 phoneme[0] = TOUPPER((unsigned char)*p);
127 phoneme[1] = '\0';
128 for ( i = 0; i < 99 && (! iswordbreak(*p)); p++ ) {
129 ch = TOUPPER ((unsigned char)*p);
131 code = '0';
133 switch (ch) {
134 case 'B':
135 case 'F':
136 case 'P':
137 case 'V':
138 code = (adjacent != '1') ? '1' : '0';
139 break;
140 case 'S':
141 case 'C':
142 case 'G':
143 case 'J':
144 case 'K':
145 case 'Q':
146 case 'X':
147 case 'Z':
148 code = (adjacent != '2') ? '2' : '0';
149 break;
150 case 'D':
151 case 'T':
152 code = (adjacent != '3') ? '3' : '0';
153 break;
154 case 'L':
155 code = (adjacent != '4') ? '4' : '0';
156 break;
157 case 'M':
158 case 'N':
159 code = (adjacent != '5') ? '5' : '0';
160 break;
161 case 'R':
162 code = (adjacent != '6') ? '6' : '0';
163 break;
164 default:
165 adjacent = '0';
168 if ( i == 0 ) {
169 adjacent = code;
170 i++;
171 } else if ( code != '0' ) {
172 if ( i == MAXPHONEMELEN )
173 break;
174 adjacent = phoneme[i] = code;
175 i++;
179 if ( i > 0 )
180 phoneme[i] = '\0';
182 return( ch_strdup( phoneme ) );
185 #elif defined(SLAPD_METAPHONE)
188 * Metaphone was originally developed by Lawrence Philips and
189 * published in the "Computer Language" magazine in 1990.
192 * Metaphone copied from C Gazette, June/July 1991, pp 56-57,
193 * author Gary A. Parker, with changes by Bernard Tiffany of the
194 * University of Michigan, and more changes by Tim Howes of the
195 * University of Michigan.
198 /* Character coding array */
199 static const char vsvfn[26] = {
200 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2,
201 /* A B C D E F G H I J K L M */
202 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0};
203 /* N O P Q R S T U V W X Y Z */
205 /* Macros to access character coding array */
206 #define vowel(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 1) /* AEIOU */
207 #define same(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 2) /* FJLMNR */
208 #define varson(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 4) /* CGPST */
209 #define frontv(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 8) /* EIY */
210 #define noghf(x) ((x) != '\0' && vsvfn[(x) - 'A'] & 16) /* BDH */
212 char *
213 phonetic( char *Word )
215 char *n, *n_start, *n_end; /* pointers to string */
216 char *metaph_end; /* pointers to metaph */
217 char ntrans[40]; /* word with uppercase letters */
218 int KSflag; /* state flag for X -> KS */
219 char buf[MAXPHONEMELEN + 2];
220 char *Metaph;
223 * Copy Word to internal buffer, dropping non-alphabetic characters
224 * and converting to upper case
227 for (n = ntrans + 4, n_end = ntrans + 35; !iswordbreak( *Word ) &&
228 n < n_end; Word++) {
229 if (isalpha((unsigned char)*Word))
230 *n++ = TOUPPER((unsigned char)*Word);
232 Metaph = buf;
233 *Metaph = '\0';
234 if (n == ntrans + 4) {
235 return( ch_strdup( buf ) ); /* Return if null */
237 n_end = n; /* Set n_end to end of string */
239 /* ntrans[0] will always be == 0 */
240 ntrans[0] = '\0';
241 ntrans[1] = '\0';
242 ntrans[2] = '\0';
243 ntrans[3] = '\0';
244 *n++ = 0;
245 *n++ = 0;
246 *n++ = 0;
247 *n = 0; /* Pad with nulls */
248 n = ntrans + 4; /* Assign pointer to start */
250 /* Check for PN, KN, GN, AE, WR, WH, and X at start */
251 switch (*n) {
252 case 'P':
253 case 'K':
254 case 'G':
255 /* 'PN', 'KN', 'GN' becomes 'N' */
256 if (*(n + 1) == 'N')
257 *n++ = 0;
258 break;
259 case 'A':
260 /* 'AE' becomes 'E' */
261 if (*(n + 1) == 'E')
262 *n++ = 0;
263 break;
264 case 'W':
265 /* 'WR' becomes 'R', and 'WH' to 'H' */
266 if (*(n + 1) == 'R')
267 *n++ = 0;
268 else if (*(n + 1) == 'H') {
269 *(n + 1) = *n;
270 *n++ = 0;
272 break;
273 case 'X':
274 /* 'X' becomes 'S' */
275 *n = 'S';
276 break;
280 * Now, loop step through string, stopping at end of string or when
281 * the computed 'metaph' is MAXPHONEMELEN characters long
284 KSflag = 0; /* state flag for KS translation */
285 for (metaph_end = Metaph + MAXPHONEMELEN, n_start = n;
286 n <= n_end && Metaph < metaph_end; n++) {
287 if (KSflag) {
288 KSflag = 0;
289 *Metaph++ = 'S';
290 } else {
291 /* Drop duplicates except for CC */
292 if (*(n - 1) == *n && *n != 'C')
293 continue;
294 /* Check for F J L M N R or first letter vowel */
295 if (same(*n) || (n == n_start && vowel(*n)))
296 *Metaph++ = *n;
297 else
298 switch (*n) {
299 case 'B':
302 * B unless in -MB
304 if (n == (n_end - 1) && *(n - 1) != 'M')
305 *Metaph++ = *n;
306 break;
307 case 'C':
310 * X if in -CIA-, -CH- else S if in
311 * -CI-, -CE-, -CY- else dropped if
312 * in -SCI-, -SCE-, -SCY- else K
314 if (*(n - 1) != 'S' || !frontv(*(n + 1))) {
315 if (*(n + 1) == 'I' && *(n + 2) == 'A')
316 *Metaph++ = 'X';
317 else if (frontv(*(n + 1)))
318 *Metaph++ = 'S';
319 else if (*(n + 1) == 'H')
320 *Metaph++ = ((n == n_start && !vowel(*(n + 2)))
321 || *(n - 1) == 'S')
322 ? (char) 'K' : (char) 'X';
323 else
324 *Metaph++ = 'K';
326 break;
327 case 'D':
330 * J if in DGE or DGI or DGY else T
332 *Metaph++ = (*(n + 1) == 'G' && frontv(*(n + 2)))
333 ? (char) 'J' : (char) 'T';
334 break;
335 case 'G':
338 * F if in -GH and not B--GH, D--GH,
339 * -H--GH, -H---GH else dropped if
340 * -GNED, -GN, -DGE-, -DGI-, -DGY-
341 * else J if in -GE-, -GI-, -GY- and
342 * not GG else K
344 if ((*(n + 1) != 'J' || vowel(*(n + 2))) &&
345 (*(n + 1) != 'N' || ((n + 1) < n_end &&
346 (*(n + 2) != 'E' || *(n + 3) != 'D'))) &&
347 (*(n - 1) != 'D' || !frontv(*(n + 1))))
348 *Metaph++ = (frontv(*(n + 1)) &&
349 *(n + 2) != 'G') ? (char) 'G' : (char) 'K';
350 else if (*(n + 1) == 'H' && !noghf(*(n - 3)) &&
351 *(n - 4) != 'H')
352 *Metaph++ = 'F';
353 break;
354 case 'H':
357 * H if before a vowel and not after
358 * C, G, P, S, T else dropped
360 if (!varson(*(n - 1)) && (!vowel(*(n - 1)) ||
361 vowel(*(n + 1))))
362 *Metaph++ = 'H';
363 break;
364 case 'K':
367 * dropped if after C else K
369 if (*(n - 1) != 'C')
370 *Metaph++ = 'K';
371 break;
372 case 'P':
375 * F if before H, else P
377 *Metaph++ = *(n + 1) == 'H' ?
378 (char) 'F' : (char) 'P';
379 break;
380 case 'Q':
385 *Metaph++ = 'K';
386 break;
387 case 'S':
390 * X in -SH-, -SIO- or -SIA- else S
392 *Metaph++ = (*(n + 1) == 'H' ||
393 (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
394 *(n + 2) == 'A')))
395 ? (char) 'X' : (char) 'S';
396 break;
397 case 'T':
400 * X in -TIA- or -TIO- else 0 (zero)
401 * before H else dropped if in -TCH-
402 * else T
404 if (*(n + 1) == 'I' && (*(n + 2) == 'O' ||
405 *(n + 2) == 'A'))
406 *Metaph++ = 'X';
407 else if (*(n + 1) == 'H')
408 *Metaph++ = '0';
409 else if (*(n + 1) != 'C' || *(n + 2) != 'H')
410 *Metaph++ = 'T';
411 break;
412 case 'V':
417 *Metaph++ = 'F';
418 break;
419 case 'W':
422 * W after a vowel, else dropped
424 case 'Y':
427 * Y unless followed by a vowel
429 if (vowel(*(n + 1)))
430 *Metaph++ = *n;
431 break;
432 case 'X':
435 * KS
437 if (n == n_start)
438 *Metaph++ = 'S';
439 else {
440 *Metaph++ = 'K'; /* Insert K, then S */
441 KSflag = 1;
443 break;
444 case 'Z':
449 *Metaph++ = 'S';
450 break;
455 *Metaph = 0; /* Null terminate */
456 return( ch_strdup( buf ) );
459 #endif /* SLAPD_METAPHONE */