Revert commit 66c0185a3 and follow-on patches.
[pgsql.git] / contrib / fuzzystrmatch / fuzzystrmatch.c
blob72ae2ab91b5a9a9dd8248e580fcab90fc3de9aeb
1 /*
2 * fuzzystrmatch.c
4 * Functions for "fuzzy" comparison of strings
6 * Joe Conway <mail@joeconway.com>
8 * contrib/fuzzystrmatch/fuzzystrmatch.c
9 * Copyright (c) 2001-2024, PostgreSQL Global Development Group
10 * ALL RIGHTS RESERVED;
12 * metaphone()
13 * -----------
14 * Modified for PostgreSQL by Joe Conway.
15 * Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
16 * Code slightly modified for use as PostgreSQL function (palloc, elog, etc).
17 * Metaphone was originally created by Lawrence Philips and presented in article
18 * in "Computer Language" December 1990 issue.
20 * Permission to use, copy, modify, and distribute this software and its
21 * documentation for any purpose, without fee, and without a written agreement
22 * is hereby granted, provided that the above copyright notice and this
23 * paragraph and the following two paragraphs appear in all copies.
25 * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
26 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
27 * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
28 * DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
31 * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
33 * AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
34 * ON AN "AS IS" BASIS, AND THE AUTHOR AND DISTRIBUTORS HAS NO OBLIGATIONS TO
35 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
39 #include "postgres.h"
41 #include <ctype.h>
43 #include "mb/pg_wchar.h"
44 #include "utils/builtins.h"
45 #include "utils/varlena.h"
46 #include "varatt.h"
48 PG_MODULE_MAGIC;
51 * Soundex
53 static void _soundex(const char *instr, char *outstr);
55 #define SOUNDEX_LEN 4
57 /* ABCDEFGHIJKLMNOPQRSTUVWXYZ */
58 static const char *const soundex_table = "01230120022455012623010202";
60 static char
61 soundex_code(char letter)
63 letter = toupper((unsigned char) letter);
64 /* Defend against non-ASCII letters */
65 if (letter >= 'A' && letter <= 'Z')
66 return soundex_table[letter - 'A'];
67 return letter;
71 * Metaphone
73 #define MAX_METAPHONE_STRLEN 255
76 * Original code by Michael G Schwern starts here.
77 * Code slightly modified for use as PostgreSQL function.
81 /**************************************************************************
82 metaphone -- Breaks english phrases down into their phonemes.
84 Input
85 word -- An english word to be phonized
86 max_phonemes -- How many phonemes to calculate. If 0, then it
87 will phonize the entire phrase.
88 phoned_word -- The final phonized word. (We'll allocate the
89 memory.)
90 Output
91 error -- A simple error flag, returns true or false
93 NOTES: ALL non-alpha characters are ignored, this includes whitespace,
94 although non-alpha characters will break up phonemes.
95 ****************************************************************************/
98 /* I add modifications to the traditional metaphone algorithm that you
99 might find in books. Define this if you want metaphone to behave
100 traditionally */
101 #undef USE_TRADITIONAL_METAPHONE
103 /* Special encodings */
104 #define SH 'X'
105 #define TH '0'
107 static char Lookahead(char *word, int how_far);
108 static void _metaphone(char *word, int max_phonemes, char **phoned_word);
110 /* Metachar.h ... little bits about characters for metaphone */
113 /*-- Character encoding array & accessing macros --*/
114 /* Stolen directly out of the book... */
115 static const char _codes[26] = {
116 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
117 /* a b c d e f g h i j k l m n o p q r s t u v w x y z */
120 static int
121 getcode(char c)
123 if (isalpha((unsigned char) c))
125 c = toupper((unsigned char) c);
126 /* Defend against non-ASCII letters */
127 if (c >= 'A' && c <= 'Z')
128 return _codes[c - 'A'];
130 return 0;
133 #define isvowel(c) (getcode(c) & 1) /* AEIOU */
135 /* These letters are passed through unchanged */
136 #define NOCHANGE(c) (getcode(c) & 2) /* FJMNR */
138 /* These form diphthongs when preceding H */
139 #define AFFECTH(c) (getcode(c) & 4) /* CGPST */
141 /* These make C and G soft */
142 #define MAKESOFT(c) (getcode(c) & 8) /* EIY */
144 /* These prevent GH from becoming F */
145 #define NOGHTOF(c) (getcode(c) & 16) /* BDH */
147 PG_FUNCTION_INFO_V1(levenshtein_with_costs);
148 Datum
149 levenshtein_with_costs(PG_FUNCTION_ARGS)
151 text *src = PG_GETARG_TEXT_PP(0);
152 text *dst = PG_GETARG_TEXT_PP(1);
153 int ins_c = PG_GETARG_INT32(2);
154 int del_c = PG_GETARG_INT32(3);
155 int sub_c = PG_GETARG_INT32(4);
156 const char *s_data;
157 const char *t_data;
158 int s_bytes,
159 t_bytes;
161 /* Extract a pointer to the actual character data */
162 s_data = VARDATA_ANY(src);
163 t_data = VARDATA_ANY(dst);
164 /* Determine length of each string in bytes */
165 s_bytes = VARSIZE_ANY_EXHDR(src);
166 t_bytes = VARSIZE_ANY_EXHDR(dst);
168 PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
169 ins_c, del_c, sub_c, false));
173 PG_FUNCTION_INFO_V1(levenshtein);
174 Datum
175 levenshtein(PG_FUNCTION_ARGS)
177 text *src = PG_GETARG_TEXT_PP(0);
178 text *dst = PG_GETARG_TEXT_PP(1);
179 const char *s_data;
180 const char *t_data;
181 int s_bytes,
182 t_bytes;
184 /* Extract a pointer to the actual character data */
185 s_data = VARDATA_ANY(src);
186 t_data = VARDATA_ANY(dst);
187 /* Determine length of each string in bytes */
188 s_bytes = VARSIZE_ANY_EXHDR(src);
189 t_bytes = VARSIZE_ANY_EXHDR(dst);
191 PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
192 1, 1, 1, false));
196 PG_FUNCTION_INFO_V1(levenshtein_less_equal_with_costs);
197 Datum
198 levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
200 text *src = PG_GETARG_TEXT_PP(0);
201 text *dst = PG_GETARG_TEXT_PP(1);
202 int ins_c = PG_GETARG_INT32(2);
203 int del_c = PG_GETARG_INT32(3);
204 int sub_c = PG_GETARG_INT32(4);
205 int max_d = PG_GETARG_INT32(5);
206 const char *s_data;
207 const char *t_data;
208 int s_bytes,
209 t_bytes;
211 /* Extract a pointer to the actual character data */
212 s_data = VARDATA_ANY(src);
213 t_data = VARDATA_ANY(dst);
214 /* Determine length of each string in bytes */
215 s_bytes = VARSIZE_ANY_EXHDR(src);
216 t_bytes = VARSIZE_ANY_EXHDR(dst);
218 PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
219 t_data, t_bytes,
220 ins_c, del_c, sub_c,
221 max_d, false));
225 PG_FUNCTION_INFO_V1(levenshtein_less_equal);
226 Datum
227 levenshtein_less_equal(PG_FUNCTION_ARGS)
229 text *src = PG_GETARG_TEXT_PP(0);
230 text *dst = PG_GETARG_TEXT_PP(1);
231 int max_d = PG_GETARG_INT32(2);
232 const char *s_data;
233 const char *t_data;
234 int s_bytes,
235 t_bytes;
237 /* Extract a pointer to the actual character data */
238 s_data = VARDATA_ANY(src);
239 t_data = VARDATA_ANY(dst);
240 /* Determine length of each string in bytes */
241 s_bytes = VARSIZE_ANY_EXHDR(src);
242 t_bytes = VARSIZE_ANY_EXHDR(dst);
244 PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
245 t_data, t_bytes,
246 1, 1, 1,
247 max_d, false));
252 * Calculates the metaphone of an input string.
253 * Returns number of characters requested
254 * (suggested value is 4)
256 PG_FUNCTION_INFO_V1(metaphone);
257 Datum
258 metaphone(PG_FUNCTION_ARGS)
260 char *str_i = TextDatumGetCString(PG_GETARG_DATUM(0));
261 size_t str_i_len = strlen(str_i);
262 int reqlen;
263 char *metaph;
265 /* return an empty string if we receive one */
266 if (!(str_i_len > 0))
267 PG_RETURN_TEXT_P(cstring_to_text(""));
269 if (str_i_len > MAX_METAPHONE_STRLEN)
270 ereport(ERROR,
271 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
272 errmsg("argument exceeds the maximum length of %d bytes",
273 MAX_METAPHONE_STRLEN)));
275 reqlen = PG_GETARG_INT32(1);
276 if (reqlen > MAX_METAPHONE_STRLEN)
277 ereport(ERROR,
278 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
279 errmsg("output exceeds the maximum length of %d bytes",
280 MAX_METAPHONE_STRLEN)));
282 if (!(reqlen > 0))
283 ereport(ERROR,
284 (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
285 errmsg("output cannot be empty string")));
287 _metaphone(str_i, reqlen, &metaph);
288 PG_RETURN_TEXT_P(cstring_to_text(metaph));
293 * Original code by Michael G Schwern starts here.
294 * Code slightly modified for use as PostgreSQL
295 * function (palloc, etc).
298 /* I suppose I could have been using a character pointer instead of
299 * accessing the array directly... */
301 /* Look at the next letter in the word */
302 #define Next_Letter (toupper((unsigned char) word[w_idx+1]))
303 /* Look at the current letter in the word */
304 #define Curr_Letter (toupper((unsigned char) word[w_idx]))
305 /* Go N letters back. */
306 #define Look_Back_Letter(n) \
307 (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')
308 /* Previous letter. I dunno, should this return null on failure? */
309 #define Prev_Letter (Look_Back_Letter(1))
310 /* Look two letters down. It makes sure you don't walk off the string. */
311 #define After_Next_Letter \
312 (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0')
313 #define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n))
316 /* Allows us to safely look ahead an arbitrary # of letters */
317 /* I probably could have just used strlen... */
318 static char
319 Lookahead(char *word, int how_far)
321 char letter_ahead = '\0'; /* null by default */
322 int idx;
324 for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
325 /* Edge forward in the string... */
327 letter_ahead = word[idx]; /* idx will be either == to how_far or at the
328 * end of the string */
329 return letter_ahead;
333 /* phonize one letter */
334 #define Phonize(c) do {(*phoned_word)[p_idx++] = c;} while (0)
335 /* Slap a null character on the end of the phoned word */
336 #define End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0)
337 /* How long is the phoned word? */
338 #define Phone_Len (p_idx)
340 /* Note is a letter is a 'break' in the word */
341 #define Isbreak(c) (!isalpha((unsigned char) (c)))
344 static void
345 _metaphone(char *word, /* IN */
346 int max_phonemes,
347 char **phoned_word) /* OUT */
349 int w_idx = 0; /* point in the phonization we're at. */
350 int p_idx = 0; /* end of the phoned phrase */
352 /*-- Parameter checks --*/
355 * Shouldn't be necessary, but left these here anyway jec Aug 3, 2001
358 /* Negative phoneme length is meaningless */
359 if (!(max_phonemes > 0))
360 /* internal error */
361 elog(ERROR, "metaphone: Requested output length must be > 0");
363 /* Empty/null string is meaningless */
364 if ((word == NULL) || !(strlen(word) > 0))
365 /* internal error */
366 elog(ERROR, "metaphone: Input string length must be > 0");
368 /*-- Allocate memory for our phoned_phrase --*/
369 if (max_phonemes == 0)
370 { /* Assume largest possible */
371 *phoned_word = palloc(sizeof(char) * strlen(word) + 1);
373 else
375 *phoned_word = palloc(sizeof(char) * max_phonemes + 1);
378 /*-- The first phoneme has to be processed specially. --*/
379 /* Find our first letter */
380 for (; !isalpha((unsigned char) (Curr_Letter)); w_idx++)
382 /* On the off chance we were given nothing but crap... */
383 if (Curr_Letter == '\0')
385 End_Phoned_Word;
386 return;
390 switch (Curr_Letter)
392 /* AE becomes E */
393 case 'A':
394 if (Next_Letter == 'E')
396 Phonize('E');
397 w_idx += 2;
399 /* Remember, preserve vowels at the beginning */
400 else
402 Phonize('A');
403 w_idx++;
405 break;
406 /* [GKP]N becomes N */
407 case 'G':
408 case 'K':
409 case 'P':
410 if (Next_Letter == 'N')
412 Phonize('N');
413 w_idx += 2;
415 break;
418 * WH becomes H, WR becomes R W if followed by a vowel
420 case 'W':
421 if (Next_Letter == 'H' ||
422 Next_Letter == 'R')
424 Phonize(Next_Letter);
425 w_idx += 2;
427 else if (isvowel(Next_Letter))
429 Phonize('W');
430 w_idx += 2;
432 /* else ignore */
433 break;
434 /* X becomes S */
435 case 'X':
436 Phonize('S');
437 w_idx++;
438 break;
439 /* Vowels are kept */
442 * We did A already case 'A': case 'a':
444 case 'E':
445 case 'I':
446 case 'O':
447 case 'U':
448 Phonize(Curr_Letter);
449 w_idx++;
450 break;
451 default:
452 /* do nothing */
453 break;
458 /* On to the metaphoning */
459 for (; Curr_Letter != '\0' &&
460 (max_phonemes == 0 || Phone_Len < max_phonemes);
461 w_idx++)
464 * How many letters to skip because an earlier encoding handled
465 * multiple letters
467 unsigned short int skip_letter = 0;
471 * THOUGHT: It would be nice if, rather than having things like...
472 * well, SCI. For SCI you encode the S, then have to remember to skip
473 * the C. So the phonome SCI invades both S and C. It would be
474 * better, IMHO, to skip the C from the S part of the encoding. Hell,
475 * I'm trying it.
478 /* Ignore non-alphas */
479 if (!isalpha((unsigned char) (Curr_Letter)))
480 continue;
482 /* Drop duplicates, except CC */
483 if (Curr_Letter == Prev_Letter &&
484 Curr_Letter != 'C')
485 continue;
487 switch (Curr_Letter)
489 /* B -> B unless in MB */
490 case 'B':
491 if (Prev_Letter != 'M')
492 Phonize('B');
493 break;
496 * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
497 * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-,
498 * SCE-, -SCY- (handed in S) else K
500 case 'C':
501 if (MAKESOFT(Next_Letter))
502 { /* C[IEY] */
503 if (After_Next_Letter == 'A' &&
504 Next_Letter == 'I')
505 { /* CIA */
506 Phonize(SH);
508 /* SC[IEY] */
509 else if (Prev_Letter == 'S')
511 /* Dropped */
513 else
514 Phonize('S');
516 else if (Next_Letter == 'H')
518 #ifndef USE_TRADITIONAL_METAPHONE
519 if (After_Next_Letter == 'R' ||
520 Prev_Letter == 'S')
521 { /* Christ, School */
522 Phonize('K');
524 else
525 Phonize(SH);
526 #else
527 Phonize(SH);
528 #endif
529 skip_letter++;
531 else
532 Phonize('K');
533 break;
536 * J if in -DGE-, -DGI- or -DGY- else T
538 case 'D':
539 if (Next_Letter == 'G' &&
540 MAKESOFT(After_Next_Letter))
542 Phonize('J');
543 skip_letter++;
545 else
546 Phonize('T');
547 break;
550 * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
551 * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
552 * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
553 * else K
555 case 'G':
556 if (Next_Letter == 'H')
558 if (!(NOGHTOF(Look_Back_Letter(3)) ||
559 Look_Back_Letter(4) == 'H'))
561 Phonize('F');
562 skip_letter++;
564 else
566 /* silent */
569 else if (Next_Letter == 'N')
571 if (Isbreak(After_Next_Letter) ||
572 (After_Next_Letter == 'E' &&
573 Look_Ahead_Letter(3) == 'D'))
575 /* dropped */
577 else
578 Phonize('K');
580 else if (MAKESOFT(Next_Letter) &&
581 Prev_Letter != 'G')
582 Phonize('J');
583 else
584 Phonize('K');
585 break;
586 /* H if before a vowel and not after C,G,P,S,T */
587 case 'H':
588 if (isvowel(Next_Letter) &&
589 !AFFECTH(Prev_Letter))
590 Phonize('H');
591 break;
594 * dropped if after C else K
596 case 'K':
597 if (Prev_Letter != 'C')
598 Phonize('K');
599 break;
602 * F if before H else P
604 case 'P':
605 if (Next_Letter == 'H')
606 Phonize('F');
607 else
608 Phonize('P');
609 break;
614 case 'Q':
615 Phonize('K');
616 break;
619 * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
621 case 'S':
622 if (Next_Letter == 'I' &&
623 (After_Next_Letter == 'O' ||
624 After_Next_Letter == 'A'))
625 Phonize(SH);
626 else if (Next_Letter == 'H')
628 Phonize(SH);
629 skip_letter++;
631 #ifndef USE_TRADITIONAL_METAPHONE
632 else if (Next_Letter == 'C' &&
633 Look_Ahead_Letter(2) == 'H' &&
634 Look_Ahead_Letter(3) == 'W')
636 Phonize(SH);
637 skip_letter += 2;
639 #endif
640 else
641 Phonize('S');
642 break;
645 * 'sh' in -TIA- or -TIO- else 'th' before H else T
647 case 'T':
648 if (Next_Letter == 'I' &&
649 (After_Next_Letter == 'O' ||
650 After_Next_Letter == 'A'))
651 Phonize(SH);
652 else if (Next_Letter == 'H')
654 Phonize(TH);
655 skip_letter++;
657 else
658 Phonize('T');
659 break;
660 /* F */
661 case 'V':
662 Phonize('F');
663 break;
664 /* W before a vowel, else dropped */
665 case 'W':
666 if (isvowel(Next_Letter))
667 Phonize('W');
668 break;
669 /* KS */
670 case 'X':
671 Phonize('K');
672 if (max_phonemes == 0 || Phone_Len < max_phonemes)
673 Phonize('S');
674 break;
675 /* Y if followed by a vowel */
676 case 'Y':
677 if (isvowel(Next_Letter))
678 Phonize('Y');
679 break;
680 /* S */
681 case 'Z':
682 Phonize('S');
683 break;
684 /* No transformation */
685 case 'F':
686 case 'J':
687 case 'L':
688 case 'M':
689 case 'N':
690 case 'R':
691 Phonize(Curr_Letter);
692 break;
693 default:
694 /* nothing */
695 break;
696 } /* END SWITCH */
698 w_idx += skip_letter;
699 } /* END FOR */
701 End_Phoned_Word;
702 } /* END metaphone */
706 * SQL function: soundex(text) returns text
708 PG_FUNCTION_INFO_V1(soundex);
710 Datum
711 soundex(PG_FUNCTION_ARGS)
713 char outstr[SOUNDEX_LEN + 1];
714 char *arg;
716 arg = text_to_cstring(PG_GETARG_TEXT_PP(0));
718 _soundex(arg, outstr);
720 PG_RETURN_TEXT_P(cstring_to_text(outstr));
723 static void
724 _soundex(const char *instr, char *outstr)
726 int count;
728 Assert(instr);
729 Assert(outstr);
731 /* Skip leading non-alphabetic characters */
732 while (*instr && !isalpha((unsigned char) *instr))
733 ++instr;
735 /* If no string left, return all-zeroes buffer */
736 if (!*instr)
738 memset(outstr, '\0', SOUNDEX_LEN + 1);
739 return;
742 /* Take the first letter as is */
743 *outstr++ = (char) toupper((unsigned char) *instr++);
745 count = 1;
746 while (*instr && count < SOUNDEX_LEN)
748 if (isalpha((unsigned char) *instr) &&
749 soundex_code(*instr) != soundex_code(*(instr - 1)))
751 *outstr = soundex_code(*instr);
752 if (*outstr != '0')
754 ++outstr;
755 ++count;
758 ++instr;
761 /* Fill with 0's */
762 while (count < SOUNDEX_LEN)
764 *outstr = '0';
765 ++outstr;
766 ++count;
769 /* And null-terminate */
770 *outstr = '\0';
773 PG_FUNCTION_INFO_V1(difference);
775 Datum
776 difference(PG_FUNCTION_ARGS)
778 char sndx1[SOUNDEX_LEN + 1],
779 sndx2[SOUNDEX_LEN + 1];
780 int i,
781 result;
783 _soundex(text_to_cstring(PG_GETARG_TEXT_PP(0)), sndx1);
784 _soundex(text_to_cstring(PG_GETARG_TEXT_PP(1)), sndx2);
786 result = 0;
787 for (i = 0; i < SOUNDEX_LEN; i++)
789 if (sndx1[i] == sndx2[i])
790 result++;
793 PG_RETURN_INT32(result);