6 Copyright (c) 1991-2001, Be Incorporated. All rights reserved.
8 Permission is hereby granted, free of charge, to any person obtaining a copy of
9 this software and associated documentation files (the "Software"), to deal in
10 the Software without restriction, including without limitation the rights to
11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
12 of the Software, and to permit persons to whom the Software is furnished to do
13 so, subject to the following conditions:
15 The above copyright notice and this permission notice applies to all licensees
16 and shall be included in all copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION
23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 Except as contained in this notice, the name of Be Incorporated shall not be
26 used in advertising or otherwise to promote the sale, use or other dealings in
27 this Software without prior written authorization from Be Incorporated.
29 BeMail(TM), Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks
30 of Be Incorporated in the United States and other countries. Other brand product
31 names are registered trademarks or trademarks of their respective holders.
51 MAXMETAPH is the length of the Metaphone code.
53 Four is a good compromise value for English names. For comparing words
54 which are not names or for some non-English names, use a longer code
55 length for more precise matches.
57 The default here is 5.
63 // Character coding array, A-Z
64 static char vsvfn
[26] = { 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0,
65 2, 4, 4, 1, 0, 0, 0, 8, 0};
68 static const char* gCmpKey
;
72 word_cmp(BString
** firstArg
, BString
** secondArg
)
74 return word_match(gCmpKey
, (*firstArg
)->String()) - word_match(gCmpKey
,
75 (*secondArg
)->String());
79 Words::Words(bool useMetaphone
)
81 fUseMetaphone(useMetaphone
)
87 Words::Words(BPositionIO
* thes
, bool useMetaphone
)
90 fUseMetaphone(useMetaphone
)
102 Words::Words(const char* dataPath
, const char* indexPath
, bool useMetaphone
)
104 fUseMetaphone(useMetaphone
)
107 fEntrySize
= sizeof(uint32
);
108 SetTo(dataPath
, indexPath
);
113 Words::BuildIndex(void)
115 // Parse the Words file...
125 char entryName
[256], *namePtr
= entryName
;
126 char suffixName
[256];
127 char flags
[32], *flagsPtr
= flags
;
130 int32 state
= FIND_WORD
;
132 // Make sure we are at start of file
133 fDataFile
->Seek(0, SEEK_SET
);
136 // Read blocks from thes until eof
139 blockOffset
= fDataFile
->Position();
140 if ((blockSize
= fDataFile
->Read(buffer
, 16384)) == 0)
144 for (nptr
= buffer
, eptr
= buffer
+ blockSize
; nptr
< eptr
; nptr
++) {
145 // Looking for start of word?
146 if (state
== FIND_WORD
) {
148 if (isalpha(*nptr
)) {
150 *namePtr
++ = *nptr
; // copy word
151 entry
.offset
= blockOffset
+ (nptr
- buffer
);
155 } else if ((*nptr
== '\n') || (*nptr
== '\r')) {
158 if (namePtr
!= entryName
) {
159 // Add previous entry to word index
160 *namePtr
= 0; // terminate word
161 *flagsPtr
= 0; // terminate flags
162 NormalizeWord(entryName
, entryName
);
164 entry
.key
= GetKey(entryName
);
167 // Add suffixed words if any
168 if (flagsPtr
!= flags
) {
169 // printf("Base: %s, flags: %s\n", entryName, flags);
170 for (flagsPtr
= flags
; *flagsPtr
!= 0; flagsPtr
++) {
171 if (suffix_word(suffixName
, entryName
, *flagsPtr
)) {
172 // printf("Suffix: %s\n", suffixName);
173 entry
.key
= GetKey(suffixName
);
183 } else if (state
== GET_WORD
) {
186 *namePtr
= 0; // terminate word
187 // printf("Found word: %s\n", entryName);
191 *namePtr
++ = *nptr
; // copy word
193 } else if (state
== GET_FLAGS
) // Are we getting the flags?
194 *flagsPtr
++ = *nptr
; // copy flag
195 } // End for (nptr = buffer, eptr = buffer + blockSize;
196 // nptr < eptr; nptr++, entry.size++)
197 } // End while (true)
205 Words::GetKey(const char* s
)
214 metaphone(s
, Metaph
, GENERATE
);
215 // Compact Metaphone from 6 bytes to 4
217 // printf("%s -> %s: \n", s, Metaph);
219 for (sPtr
= Metaph
, offset
= 25; *sPtr
; sPtr
++, offset
-= 5) {
221 // printf("%d,", int16(c));
222 key
|= int32(c
) << offset
;
224 for (; offset
>= 0; offset
-= 5)
225 key
|= int32(31) << offset
;
226 // printf(": %ld\n", key);
229 return WIndex::GetKey(s
);
234 // Macros to access the character coding array
235 #define vowel(x) (vsvfn[(x) - 'A'] & 1) // AEIOU
236 #define same(x) (vsvfn[(x) - 'A'] & 2) // FJLMNR
237 #define varson(x) (vsvfn[(x) - 'A'] & 4) // CGPST
238 #define frontv(x) (vsvfn[(x) - 'A'] & 8) // EIY
239 #define noghf(x) (vsvfn[(x) - 'A'] & 16) // BDH
246 1 - The word to be converted to a metaphone code.
247 2 - A MAXMETAPH + 1 char field for the result.
249 If 0: Compute the Metaphone code for the first argument,
250 then compare it to the Metaphone code passed in the second argument.
251 If 1: Compute the Metaphone code for the first argument,
252 then store the result in the area pointed to by the second argument.
255 If function code is 0, returns Success_ for a match, else Error_.
256 If function code is 1, returns Success_.
261 metaphone(const char* Word
, char* Metaph
, metaphlag Flag
)
263 char *n
, *n_start
, *n_end
; // Pointers to string
264 char *metaph
= NULL
, *metaph_end
; // Pointers to metaph
265 char ntrans
[512]; // Word with uppercase letters
266 char newm
[MAXMETAPH
+ 4]; // New metaph for comparison
267 int KSflag
; // State flag for X translation
269 // Copy word to internal buffer, dropping non-alphabetic characters
270 // and converting to upper case.
272 for (n
= ntrans
+ 1, n_end
= ntrans
+ sizeof(ntrans
) - 2;
273 *Word
&& n
< n_end
; ++Word
) {
275 *n
++ = toupper(*Word
);
279 return false; // Return if zero characters
281 n_end
= n
; // Set end of string pointer
283 // Pad with NULs, front and rear
290 // If doing comparison, redirect pointers
292 if (COMPARE
== Flag
) {
297 // Check for PN, KN, GN, WR, WH, and X at start
313 if ('R' == *(n
+ 1)) {
315 } else if ('H' == *(n
+ 1)) {
326 // Now loop through the string, stopping at the end of the string
327 // or when the computed Metaphone code is MAXMETAPH characters long.
329 KSflag
= false; // State flag for KStranslation
331 for (metaph_end
= Metaph
+ MAXMETAPH
, n_start
= n
;
332 n
<= n_end
&& Metaph
< metaph_end
; ++n
) {
337 // Drop duplicates except for CC
339 if (*(n
- 1) == *n
&& *n
!= 'C')
342 // Check for F J L M N R or first letter vowel
344 if (same(*n
) || (n
== n_start
&& vowel(*n
))) {
349 if (n
< n_end
|| *(n
- 1) != 'M')
354 if (*(n
- 1) != 'S' || !frontv(*(n
+ 1))) {
355 if ('I' == *(n
+ 1) && 'A' == *(n
+ 2))
357 else if (frontv(*(n
+ 1)))
359 else if ('H' == *(n
+ 1)) {
360 *Metaph
++ = ((n
== n_start
&& !vowel(*(n
+ 2)))
361 || 'S' == *(n
- 1)) ? 'K' : 'X';
369 *Metaph
++ = ('G' == *(n
+ 1) && frontv(*(n
+ 2)))
374 if ((*(n
+ 1) != 'H' || vowel(*(n
+ 2)))
375 && (*(n
+ 1) != 'N' || ((n
+ 1) < n_end
376 && (*(n
+ 2) != 'E' || *(n
+ 3) != 'D')))
377 && (*(n
- 1) != 'D' || !frontv(*(n
+ 1)))) {
378 *Metaph
++ = (frontv(*(n
+ 1))
379 && *(n
+ 2) != 'G') ? 'J' : 'K';
380 } else if ('H' == *(n
+ 1) && !noghf(*(n
- 3))
381 && *(n
- 4) != 'H') {
387 if (!varson(*(n
- 1))
388 && (!vowel(*(n
- 1)) || vowel(*(n
+ 1)))) {
399 *Metaph
++ = ('H' == *(n
+ 1)) ? 'F' : 'P';
407 *Metaph
++ = ('H' == *(n
+ 1) || ('I' == *(n
+ 1)
408 && ('O' == *(n
+ 2) || 'A' == *(n
+ 2))))
414 && ('O' == *(n
+ 2) || 'A' == *(n
+ 2))) {
416 } else if ('H' == *(n
+ 1)) {
418 } else if (*(n
+ 1) != 'C' || *(n
+ 2) != 'H') {
449 // Compare new Metaphone code with old
451 && *(Metaph
- 1) != metaph
[(Metaph
- newm
) - 1]) {
456 // If comparing, check if Metaphone codes were equal in length
457 if (COMPARE
== Flag
&& metaph
[Metaph
- newm
])
466 word_match(const char* reference
, const char* test
)
482 a
= (tolower(s1
[1]) == c2
);
483 b
= (tolower(s2
[1]) == c1
);
500 // Equivalent Character
501 else if (vsvfn
[(unsigned)c1
] == vsvfn
[(unsigned)c2
])
503 // Unrelated Character
522 suffix_word(char* dst
, const char* src
, char flag
)
526 end
= stpcpy(dst
, src
);
527 flag
= toupper(flag
);
532 end
= stpcpy(end
- 1, "ive");
535 end
= stpcpy(end
, "ive");
542 end
= stpcpy(end
- 1, "ion");
545 end
= stpcpy(end
- 1, "ication");
548 end
= stpcpy(end
, "en");
555 end
= stpcpy(end
- 1, "ions");
558 end
= stpcpy(end
- 1, "ications");
561 end
= stpcpy(end
, "ens");
568 end
= stpcpy(end
- 1, "ieth");
571 end
= stpcpy(end
, "th");
576 end
= stpcpy(end
, "ly");
581 end
= stpcpy(end
- 1, "ing");
584 end
= stpcpy(end
, "ing");
591 end
= stpcpy(end
- 1, "ings");
594 end
= stpcpy(end
, "ings");
601 end
= stpcpy(end
- 1, "ed");
604 if (!strchr("aeiou", end
[-2])) {
605 end
= stpcpy(end
- 1, "ied");
610 end
= stpcpy(end
, "ed");
617 end
= stpcpy(end
- 1, "est");
620 if (!strchr("aeiou", end
[-2])) {
621 end
= stpcpy(end
- 1, "iest");
626 end
= stpcpy(end
, "est");
633 end
= stpcpy(end
- 1, "er");
636 if (!strchr("aeiou", end
[-2])) {
637 end
= stpcpy(end
- 1, "ier");
642 end
= stpcpy(end
, "er");
649 end
= stpcpy(end
- 1, "ers");
652 if (!strchr("aeiou", end
[-2])) {
653 end
= stpcpy(end
- 1, "iers");
658 end
= stpcpy(end
, "ers");
668 end
= stpcpy(end
, "es");
671 if (!strchr("aeiou", end
[-2])) {
672 end
= stpcpy(end
- 1, "ies");
677 end
= stpcpy(end
, "s");
684 if (!strchr("aeiou", end
[-2])) {
685 end
= stpcpy(end
- 1, "iness");
690 end
= stpcpy(end
, "ness");
695 end
= stpcpy(end
, "'s");
705 Words::FindBestMatches(BList
* matches
, const char* s
)
708 // printf("*** Looking for %s: ***\n", s);
710 if ((index
= FindFirst(s
)) >= 0) {
713 WIndexEntry
* indexEntry
;
715 int32 key
= (ItemAt(index
))->key
;
717 char word
[128], suffixWord
[128];
718 const char *src
, *testWord
;
719 const char *suffixFlags
;
722 gCmpKey
= srcWord
.String();
725 uint8 hashValue
, highHash
, lowHash
;
726 for (int32 i
= 0; i
< 32; i
++)
730 indexEntry
= ItemAt(index
);
731 // Hash the entry offset; we use this to make sure we don't add
732 // the same word file entry twice;
733 // It is possible for the same entry in the words file to have
734 // multiple entries in the index.
736 hashValue
= indexEntry
->offset
% 256;
737 highHash
= hashValue
>> 3;
738 lowHash
= 0x01 << (hashValue
& 0x07);
740 // printf("Testing Entry: %ld: hash=%d, highHash=%d, lowHash=%d\n",
741 // indexEntry->offset, hashValue, (uint16)highHash,
744 // Has this entry offset been seen before?
745 if (!(hashTable
[highHash
] & lowHash
)) {
746 // printf("New Entry\n");
747 hashTable
[highHash
] |= lowHash
; // Mark this offset so we don't add it twice
749 entry
= GetEntry(index
);
750 src
= entry
->String();
751 while (*src
&& !isalpha(*src
))
754 while (*src
&& *src
!= '/')
758 suffixFlags
= src
+ 1;
762 // printf("Base Word: %s\n", word);
763 // printf("Flags: %s\n", suffixFlags);
764 testWord
= word
; // Test the base word first
766 // printf("Testing: %s\n", testWord);
767 // Does this word match the key
768 if ((GetKey(testWord
) == key
)
769 // And does it look close enough to the compare key?
770 // word_match(gCmpKey, testWord)
771 // <= int32((strlen(gCmpKey)-1)/2))
772 && word_match(gCmpKey
, testWord
)
773 <= int32(float(strlen(gCmpKey
)-1)*.75)) {
774 // printf("Added: %s\n", testWord);
775 matches
->AddItem((void*)(new BString(testWord
)));
778 // If suffix, transform and test
780 // Repeat until valid suffix found or end is reached
783 && !(suffixLength
= suffix_word(suffixWord
,
784 word
, *suffixFlags
++))) {}
786 testWord
= suffixWord
;
795 // else printf("Redundant entry\n");
798 } while (key
== (ItemAt(index
))->key
);
800 return matches
->CountItems();
808 sort_word_list(BList
* matches
, const char* reference
)
810 if (matches
->CountItems() > 0) {
811 BString
srcWord(reference
);
812 gCmpKey
= srcWord
.String();
813 matches
->SortItems((int(*)(const void*, const void*))word_cmp
);