4 * Functions for "fuzzy" comparison of strings
6 * Joe Conway <mail@joeconway.com>
8 * contrib/fuzzystrmatch/fuzzystrmatch.c
9 * Copyright (c) 2001-2024, PostgreSQL Global Development Group
10 * ALL RIGHTS RESERVED;
14 * Modified for PostgreSQL by Joe Conway.
15 * Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
16 * Code slightly modified for use as PostgreSQL function (palloc, elog, etc).
17 * Metaphone was originally created by Lawrence Philips and presented in article
18 * in "Computer Language" December 1990 issue.
20 * Permission to use, copy, modify, and distribute this software and its
21 * documentation for any purpose, without fee, and without a written agreement
22 * is hereby granted, provided that the above copyright notice and this
23 * paragraph and the following two paragraphs appear in all copies.
25 * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
26 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
27 * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
28 * DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
31 * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
33 * AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
34 * ON AN "AS IS" BASIS, AND THE AUTHOR AND DISTRIBUTORS HAS NO OBLIGATIONS TO
35 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
43 #include "mb/pg_wchar.h"
44 #include "utils/builtins.h"
45 #include "utils/varlena.h"
53 static void _soundex(const char *instr
, char *outstr
);
57 /* ABCDEFGHIJKLMNOPQRSTUVWXYZ */
58 static const char *const soundex_table
= "01230120022455012623010202";
61 soundex_code(char letter
)
63 letter
= toupper((unsigned char) letter
);
64 /* Defend against non-ASCII letters */
65 if (letter
>= 'A' && letter
<= 'Z')
66 return soundex_table
[letter
- 'A'];
73 #define MAX_METAPHONE_STRLEN 255
76 * Original code by Michael G Schwern starts here.
77 * Code slightly modified for use as PostgreSQL function.
81 /**************************************************************************
82 metaphone -- Breaks english phrases down into their phonemes.
85 word -- An english word to be phonized
86 max_phonemes -- How many phonemes to calculate. If 0, then it
87 will phonize the entire phrase.
88 phoned_word -- The final phonized word. (We'll allocate the
91 error -- A simple error flag, returns true or false
93 NOTES: ALL non-alpha characters are ignored, this includes whitespace,
94 although non-alpha characters will break up phonemes.
95 ****************************************************************************/
98 /* I add modifications to the traditional metaphone algorithm that you
99 might find in books. Define this if you want metaphone to behave
101 #undef USE_TRADITIONAL_METAPHONE
103 /* Special encodings */
107 static char Lookahead(char *word
, int how_far
);
108 static void _metaphone(char *word
, int max_phonemes
, char **phoned_word
);
110 /* Metachar.h ... little bits about characters for metaphone */
113 /*-- Character encoding array & accessing macros --*/
114 /* Stolen directly out of the book... */
115 static const char _codes
[26] = {
116 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
117 /* a b c d e f g h i j k l m n o p q r s t u v w x y z */
123 if (isalpha((unsigned char) c
))
125 c
= toupper((unsigned char) c
);
126 /* Defend against non-ASCII letters */
127 if (c
>= 'A' && c
<= 'Z')
128 return _codes
[c
- 'A'];
133 #define isvowel(c) (getcode(c) & 1) /* AEIOU */
135 /* These letters are passed through unchanged */
136 #define NOCHANGE(c) (getcode(c) & 2) /* FJMNR */
138 /* These form diphthongs when preceding H */
139 #define AFFECTH(c) (getcode(c) & 4) /* CGPST */
141 /* These make C and G soft */
142 #define MAKESOFT(c) (getcode(c) & 8) /* EIY */
144 /* These prevent GH from becoming F */
145 #define NOGHTOF(c) (getcode(c) & 16) /* BDH */
147 PG_FUNCTION_INFO_V1(levenshtein_with_costs
);
149 levenshtein_with_costs(PG_FUNCTION_ARGS
)
151 text
*src
= PG_GETARG_TEXT_PP(0);
152 text
*dst
= PG_GETARG_TEXT_PP(1);
153 int ins_c
= PG_GETARG_INT32(2);
154 int del_c
= PG_GETARG_INT32(3);
155 int sub_c
= PG_GETARG_INT32(4);
161 /* Extract a pointer to the actual character data */
162 s_data
= VARDATA_ANY(src
);
163 t_data
= VARDATA_ANY(dst
);
164 /* Determine length of each string in bytes */
165 s_bytes
= VARSIZE_ANY_EXHDR(src
);
166 t_bytes
= VARSIZE_ANY_EXHDR(dst
);
168 PG_RETURN_INT32(varstr_levenshtein(s_data
, s_bytes
, t_data
, t_bytes
,
169 ins_c
, del_c
, sub_c
, false));
173 PG_FUNCTION_INFO_V1(levenshtein
);
175 levenshtein(PG_FUNCTION_ARGS
)
177 text
*src
= PG_GETARG_TEXT_PP(0);
178 text
*dst
= PG_GETARG_TEXT_PP(1);
184 /* Extract a pointer to the actual character data */
185 s_data
= VARDATA_ANY(src
);
186 t_data
= VARDATA_ANY(dst
);
187 /* Determine length of each string in bytes */
188 s_bytes
= VARSIZE_ANY_EXHDR(src
);
189 t_bytes
= VARSIZE_ANY_EXHDR(dst
);
191 PG_RETURN_INT32(varstr_levenshtein(s_data
, s_bytes
, t_data
, t_bytes
,
196 PG_FUNCTION_INFO_V1(levenshtein_less_equal_with_costs
);
198 levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS
)
200 text
*src
= PG_GETARG_TEXT_PP(0);
201 text
*dst
= PG_GETARG_TEXT_PP(1);
202 int ins_c
= PG_GETARG_INT32(2);
203 int del_c
= PG_GETARG_INT32(3);
204 int sub_c
= PG_GETARG_INT32(4);
205 int max_d
= PG_GETARG_INT32(5);
211 /* Extract a pointer to the actual character data */
212 s_data
= VARDATA_ANY(src
);
213 t_data
= VARDATA_ANY(dst
);
214 /* Determine length of each string in bytes */
215 s_bytes
= VARSIZE_ANY_EXHDR(src
);
216 t_bytes
= VARSIZE_ANY_EXHDR(dst
);
218 PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data
, s_bytes
,
225 PG_FUNCTION_INFO_V1(levenshtein_less_equal
);
227 levenshtein_less_equal(PG_FUNCTION_ARGS
)
229 text
*src
= PG_GETARG_TEXT_PP(0);
230 text
*dst
= PG_GETARG_TEXT_PP(1);
231 int max_d
= PG_GETARG_INT32(2);
237 /* Extract a pointer to the actual character data */
238 s_data
= VARDATA_ANY(src
);
239 t_data
= VARDATA_ANY(dst
);
240 /* Determine length of each string in bytes */
241 s_bytes
= VARSIZE_ANY_EXHDR(src
);
242 t_bytes
= VARSIZE_ANY_EXHDR(dst
);
244 PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data
, s_bytes
,
252 * Calculates the metaphone of an input string.
253 * Returns number of characters requested
254 * (suggested value is 4)
256 PG_FUNCTION_INFO_V1(metaphone
);
258 metaphone(PG_FUNCTION_ARGS
)
260 char *str_i
= TextDatumGetCString(PG_GETARG_DATUM(0));
261 size_t str_i_len
= strlen(str_i
);
265 /* return an empty string if we receive one */
266 if (!(str_i_len
> 0))
267 PG_RETURN_TEXT_P(cstring_to_text(""));
269 if (str_i_len
> MAX_METAPHONE_STRLEN
)
271 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
272 errmsg("argument exceeds the maximum length of %d bytes",
273 MAX_METAPHONE_STRLEN
)));
275 reqlen
= PG_GETARG_INT32(1);
276 if (reqlen
> MAX_METAPHONE_STRLEN
)
278 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
279 errmsg("output exceeds the maximum length of %d bytes",
280 MAX_METAPHONE_STRLEN
)));
284 (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING
),
285 errmsg("output cannot be empty string")));
287 _metaphone(str_i
, reqlen
, &metaph
);
288 PG_RETURN_TEXT_P(cstring_to_text(metaph
));
293 * Original code by Michael G Schwern starts here.
294 * Code slightly modified for use as PostgreSQL
295 * function (palloc, etc).
298 /* I suppose I could have been using a character pointer instead of
299 * accessing the array directly... */
301 /* Look at the next letter in the word */
302 #define Next_Letter (toupper((unsigned char) word[w_idx+1]))
303 /* Look at the current letter in the word */
304 #define Curr_Letter (toupper((unsigned char) word[w_idx]))
305 /* Go N letters back. */
306 #define Look_Back_Letter(n) \
307 (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')
308 /* Previous letter. I dunno, should this return null on failure? */
309 #define Prev_Letter (Look_Back_Letter(1))
310 /* Look two letters down. It makes sure you don't walk off the string. */
311 #define After_Next_Letter \
312 (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0')
313 #define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n))
316 /* Allows us to safely look ahead an arbitrary # of letters */
317 /* I probably could have just used strlen... */
319 Lookahead(char *word
, int how_far
)
321 char letter_ahead
= '\0'; /* null by default */
324 for (idx
= 0; word
[idx
] != '\0' && idx
< how_far
; idx
++);
325 /* Edge forward in the string... */
327 letter_ahead
= word
[idx
]; /* idx will be either == to how_far or at the
328 * end of the string */
333 /* phonize one letter */
334 #define Phonize(c) do {(*phoned_word)[p_idx++] = c;} while (0)
335 /* Slap a null character on the end of the phoned word */
336 #define End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0)
337 /* How long is the phoned word? */
338 #define Phone_Len (p_idx)
340 /* Note is a letter is a 'break' in the word */
341 #define Isbreak(c) (!isalpha((unsigned char) (c)))
345 _metaphone(char *word
, /* IN */
347 char **phoned_word
) /* OUT */
349 int w_idx
= 0; /* point in the phonization we're at. */
350 int p_idx
= 0; /* end of the phoned phrase */
352 /*-- Parameter checks --*/
355 * Shouldn't be necessary, but left these here anyway jec Aug 3, 2001
358 /* Negative phoneme length is meaningless */
359 if (!(max_phonemes
> 0))
361 elog(ERROR
, "metaphone: Requested output length must be > 0");
363 /* Empty/null string is meaningless */
364 if ((word
== NULL
) || !(strlen(word
) > 0))
366 elog(ERROR
, "metaphone: Input string length must be > 0");
368 /*-- Allocate memory for our phoned_phrase --*/
369 if (max_phonemes
== 0)
370 { /* Assume largest possible */
371 *phoned_word
= palloc(sizeof(char) * strlen(word
) + 1);
375 *phoned_word
= palloc(sizeof(char) * max_phonemes
+ 1);
378 /*-- The first phoneme has to be processed specially. --*/
379 /* Find our first letter */
380 for (; !isalpha((unsigned char) (Curr_Letter
)); w_idx
++)
382 /* On the off chance we were given nothing but crap... */
383 if (Curr_Letter
== '\0')
394 if (Next_Letter
== 'E')
399 /* Remember, preserve vowels at the beginning */
406 /* [GKP]N becomes N */
410 if (Next_Letter
== 'N')
418 * WH becomes H, WR becomes R W if followed by a vowel
421 if (Next_Letter
== 'H' ||
424 Phonize(Next_Letter
);
427 else if (isvowel(Next_Letter
))
439 /* Vowels are kept */
442 * We did A already case 'A': case 'a':
448 Phonize(Curr_Letter
);
458 /* On to the metaphoning */
459 for (; Curr_Letter
!= '\0' &&
460 (max_phonemes
== 0 || Phone_Len
< max_phonemes
);
464 * How many letters to skip because an earlier encoding handled
467 unsigned short int skip_letter
= 0;
471 * THOUGHT: It would be nice if, rather than having things like...
472 * well, SCI. For SCI you encode the S, then have to remember to skip
473 * the C. So the phonome SCI invades both S and C. It would be
474 * better, IMHO, to skip the C from the S part of the encoding. Hell,
478 /* Ignore non-alphas */
479 if (!isalpha((unsigned char) (Curr_Letter
)))
482 /* Drop duplicates, except CC */
483 if (Curr_Letter
== Prev_Letter
&&
489 /* B -> B unless in MB */
491 if (Prev_Letter
!= 'M')
496 * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
497 * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-,
498 * SCE-, -SCY- (handed in S) else K
501 if (MAKESOFT(Next_Letter
))
503 if (After_Next_Letter
== 'A' &&
509 else if (Prev_Letter
== 'S')
516 else if (Next_Letter
== 'H')
518 #ifndef USE_TRADITIONAL_METAPHONE
519 if (After_Next_Letter
== 'R' ||
521 { /* Christ, School */
536 * J if in -DGE-, -DGI- or -DGY- else T
539 if (Next_Letter
== 'G' &&
540 MAKESOFT(After_Next_Letter
))
550 * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
551 * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
552 * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
556 if (Next_Letter
== 'H')
558 if (!(NOGHTOF(Look_Back_Letter(3)) ||
559 Look_Back_Letter(4) == 'H'))
569 else if (Next_Letter
== 'N')
571 if (Isbreak(After_Next_Letter
) ||
572 (After_Next_Letter
== 'E' &&
573 Look_Ahead_Letter(3) == 'D'))
580 else if (MAKESOFT(Next_Letter
) &&
586 /* H if before a vowel and not after C,G,P,S,T */
588 if (isvowel(Next_Letter
) &&
589 !AFFECTH(Prev_Letter
))
594 * dropped if after C else K
597 if (Prev_Letter
!= 'C')
602 * F if before H else P
605 if (Next_Letter
== 'H')
619 * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
622 if (Next_Letter
== 'I' &&
623 (After_Next_Letter
== 'O' ||
624 After_Next_Letter
== 'A'))
626 else if (Next_Letter
== 'H')
631 #ifndef USE_TRADITIONAL_METAPHONE
632 else if (Next_Letter
== 'C' &&
633 Look_Ahead_Letter(2) == 'H' &&
634 Look_Ahead_Letter(3) == 'W')
645 * 'sh' in -TIA- or -TIO- else 'th' before H else T
648 if (Next_Letter
== 'I' &&
649 (After_Next_Letter
== 'O' ||
650 After_Next_Letter
== 'A'))
652 else if (Next_Letter
== 'H')
664 /* W before a vowel, else dropped */
666 if (isvowel(Next_Letter
))
672 if (max_phonemes
== 0 || Phone_Len
< max_phonemes
)
675 /* Y if followed by a vowel */
677 if (isvowel(Next_Letter
))
684 /* No transformation */
691 Phonize(Curr_Letter
);
698 w_idx
+= skip_letter
;
702 } /* END metaphone */
706 * SQL function: soundex(text) returns text
708 PG_FUNCTION_INFO_V1(soundex
);
711 soundex(PG_FUNCTION_ARGS
)
713 char outstr
[SOUNDEX_LEN
+ 1];
716 arg
= text_to_cstring(PG_GETARG_TEXT_PP(0));
718 _soundex(arg
, outstr
);
720 PG_RETURN_TEXT_P(cstring_to_text(outstr
));
724 _soundex(const char *instr
, char *outstr
)
731 /* Skip leading non-alphabetic characters */
732 while (*instr
&& !isalpha((unsigned char) *instr
))
735 /* If no string left, return all-zeroes buffer */
738 memset(outstr
, '\0', SOUNDEX_LEN
+ 1);
742 /* Take the first letter as is */
743 *outstr
++ = (char) toupper((unsigned char) *instr
++);
746 while (*instr
&& count
< SOUNDEX_LEN
)
748 if (isalpha((unsigned char) *instr
) &&
749 soundex_code(*instr
) != soundex_code(*(instr
- 1)))
751 *outstr
= soundex_code(*instr
);
762 while (count
< SOUNDEX_LEN
)
769 /* And null-terminate */
773 PG_FUNCTION_INFO_V1(difference
);
776 difference(PG_FUNCTION_ARGS
)
778 char sndx1
[SOUNDEX_LEN
+ 1],
779 sndx2
[SOUNDEX_LEN
+ 1];
783 _soundex(text_to_cstring(PG_GETARG_TEXT_PP(0)), sndx1
);
784 _soundex(text_to_cstring(PG_GETARG_TEXT_PP(1)), sndx2
);
787 for (i
= 0; i
< SOUNDEX_LEN
; i
++)
789 if (sndx1
[i
] == sndx2
[i
])
793 PG_RETURN_INT32(result
);