libroot/posix/stdio: Remove unused portions.
[haiku.git] / src / apps / mail / Words.cpp
blob3bc987d979f1909b0fb42127c766bf68f3ab34fd
1 /*
2 Open Tracker License
4 Terms and Conditions
6 Copyright (c) 1991-2001, Be Incorporated. All rights reserved.
8 Permission is hereby granted, free of charge, to any person obtaining a copy of
9 this software and associated documentation files (the "Software"), to deal in
10 the Software without restriction, including without limitation the rights to
11 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
12 of the Software, and to permit persons to whom the Software is furnished to do
13 so, subject to the following conditions:
15 The above copyright notice and this permission notice applies to all licensees
16 and shall be included in all copies or substantial portions of the Software.
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF TITLE, MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 BE INCORPORATED BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
22 AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION
23 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 Except as contained in this notice, the name of Be Incorporated shall not be
26 used in advertising or otherwise to promote the sale, use or other dealings in
27 this Software without prior written authorization from Be Incorporated.
29 BeMail(TM), Tracker(TM), Be(R), BeOS(R), and BeIA(TM) are trademarks or registered trademarks
30 of Be Incorporated in the United States and other countries. Other brand product
31 names are registered trademarks or trademarks of their respective holders.
32 All rights reserved.
36 #include "Words.h"
38 #include <ctype.h>
39 #include <stdio.h>
40 #include <string.h>
43 enum {
44 FIND_WORD,
45 GET_WORD,
46 GET_FLAGS
51 MAXMETAPH is the length of the Metaphone code.
53 Four is a good compromise value for English names. For comparing words
54 which are not names or for some non-English names, use a longer code
55 length for more precise matches.
57 The default here is 5.
60 #define MAXMETAPH 6
63 // Character coding array, A-Z
64 static char vsvfn[26] = { 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0,
65 2, 4, 4, 1, 0, 0, 0, 8, 0};
68 static const char* gCmpKey;
71 static int
72 word_cmp(BString** firstArg, BString** secondArg)
74 return word_match(gCmpKey, (*firstArg)->String()) - word_match(gCmpKey,
75 (*secondArg)->String());
79 Words::Words(bool useMetaphone)
81 fUseMetaphone(useMetaphone)
87 Words::Words(BPositionIO* thes, bool useMetaphone)
89 WIndex(thes),
90 fUseMetaphone(useMetaphone)
96 Words::~Words(void)
102 Words::Words(const char* dataPath, const char* indexPath, bool useMetaphone)
104 fUseMetaphone(useMetaphone)
106 if (!useMetaphone)
107 fEntrySize = sizeof(uint32);
108 SetTo(dataPath, indexPath);
112 status_t
113 Words::BuildIndex(void)
115 // Parse the Words file...
117 // Buffer Stuff
118 char buffer[16384];
119 char *nptr, *eptr;
120 int64 blockOffset;
121 int32 blockSize;
123 // The Word Entry
124 WIndexEntry entry;
125 char entryName[256], *namePtr = entryName;
126 char suffixName[256];
127 char flags[32], *flagsPtr = flags;
129 // State Info
130 int32 state = FIND_WORD;
132 // Make sure we are at start of file
133 fDataFile->Seek(0, SEEK_SET);
134 entry.offset = -1;
136 // Read blocks from thes until eof
137 while (true) {
138 // Get next block
139 blockOffset = fDataFile->Position();
140 if ((blockSize = fDataFile->Read(buffer, 16384)) == 0)
141 break;
143 // parse block
144 for (nptr = buffer, eptr = buffer + blockSize; nptr < eptr; nptr++) {
145 // Looking for start of word?
146 if (state == FIND_WORD) {
147 // Is start of word?
148 if (isalpha(*nptr)) {
149 state = GET_WORD;
150 *namePtr++ = *nptr; // copy word
151 entry.offset = blockOffset + (nptr - buffer);
152 } else {
153 entry.offset++;
155 } else if ((*nptr == '\n') || (*nptr == '\r')) {
156 // End of word?
158 if (namePtr != entryName) {
159 // Add previous entry to word index
160 *namePtr = 0; // terminate word
161 *flagsPtr = 0; // terminate flags
162 NormalizeWord(entryName, entryName);
163 // Add base word
164 entry.key = GetKey(entryName);
165 AddItem(&entry);
167 // Add suffixed words if any
168 if (flagsPtr != flags) {
169 // printf("Base: %s, flags: %s\n", entryName, flags);
170 for (flagsPtr = flags; *flagsPtr != 0; flagsPtr++) {
171 if (suffix_word(suffixName, entryName, *flagsPtr)) {
172 // printf("Suffix: %s\n", suffixName);
173 entry.key = GetKey(suffixName);
174 AddItem(&entry);
179 // Init new entry
180 state = FIND_WORD;
181 namePtr = entryName;
182 flagsPtr = flags;
183 } else if (state == GET_WORD) {
184 // Start of flags?
185 if (*nptr == '/') {
186 *namePtr = 0; // terminate word
187 // printf("Found word: %s\n", entryName);
189 state = GET_FLAGS;
190 } else {
191 *namePtr++ = *nptr; // copy word
193 } else if (state == GET_FLAGS) // Are we getting the flags?
194 *flagsPtr++ = *nptr; // copy flag
195 } // End for (nptr = buffer, eptr = buffer + blockSize;
196 // nptr < eptr; nptr++, entry.size++)
197 } // End while (true)
199 SortItems();
200 return B_OK;
204 int32
205 Words::GetKey(const char* s)
207 if (fUseMetaphone) {
208 char Metaph[12];
209 const char *sPtr;
210 int32 key = 0;
211 int32 offset;
212 char c;
214 metaphone(s, Metaph, GENERATE);
215 // Compact Metaphone from 6 bytes to 4
217 // printf("%s -> %s: \n", s, Metaph);
219 for (sPtr = Metaph, offset = 25; *sPtr; sPtr++, offset -= 5) {
220 c = *sPtr - 'A';
221 // printf("%d,", int16(c));
222 key |= int32(c) << offset;
224 for (; offset >= 0; offset -= 5)
225 key |= int32(31) << offset;
226 // printf(": %ld\n", key);
227 return key;
228 } else {
229 return WIndex::GetKey(s);
234 // Macros to access the character coding array
235 #define vowel(x) (vsvfn[(x) - 'A'] & 1) // AEIOU
236 #define same(x) (vsvfn[(x) - 'A'] & 2) // FJLMNR
237 #define varson(x) (vsvfn[(x) - 'A'] & 4) // CGPST
238 #define frontv(x) (vsvfn[(x) - 'A'] & 8) // EIY
239 #define noghf(x) (vsvfn[(x) - 'A'] & 16) // BDH
240 #define NUL '\0'
243 metaphone()
245 Arguments:
246 1 - The word to be converted to a metaphone code.
247 2 - A MAXMETAPH + 1 char field for the result.
248 3 - Function flag:
249 If 0: Compute the Metaphone code for the first argument,
250 then compare it to the Metaphone code passed in the second argument.
251 If 1: Compute the Metaphone code for the first argument,
252 then store the result in the area pointed to by the second argument.
254 Returns:
255 If function code is 0, returns Success_ for a match, else Error_.
256 If function code is 1, returns Success_.
260 bool
261 metaphone(const char* Word, char* Metaph, metaphlag Flag)
263 char *n, *n_start, *n_end; // Pointers to string
264 char *metaph = NULL, *metaph_end; // Pointers to metaph
265 char ntrans[512]; // Word with uppercase letters
266 char newm[MAXMETAPH + 4]; // New metaph for comparison
267 int KSflag; // State flag for X translation
269 // Copy word to internal buffer, dropping non-alphabetic characters
270 // and converting to upper case.
272 for (n = ntrans + 1, n_end = ntrans + sizeof(ntrans) - 2;
273 *Word && n < n_end; ++Word) {
274 if (isalpha(*Word))
275 *n++ = toupper(*Word);
278 if (n == ntrans + 1)
279 return false; // Return if zero characters
280 else
281 n_end = n; // Set end of string pointer
283 // Pad with NULs, front and rear
285 *n++ = NUL;
286 *n = NUL;
287 n = ntrans;
288 *n++ = NUL;
290 // If doing comparison, redirect pointers
292 if (COMPARE == Flag) {
293 metaph = Metaph;
294 Metaph = newm;
297 // Check for PN, KN, GN, WR, WH, and X at start
299 switch (*n) {
300 case 'P':
301 case 'K':
302 case 'G':
303 if ('N' == *(n + 1))
304 *n++ = NUL;
305 break;
307 case 'A':
308 if ('E' == *(n + 1))
309 *n++ = NUL;
310 break;
312 case 'W':
313 if ('R' == *(n + 1)) {
314 *n++ = NUL;
315 } else if ('H' == *(n + 1)) {
316 *(n + 1) = *n;
317 *n++ = NUL;
319 break;
321 case 'X':
322 *n = 'S';
323 break;
326 // Now loop through the string, stopping at the end of the string
327 // or when the computed Metaphone code is MAXMETAPH characters long.
329 KSflag = false; // State flag for KStranslation
331 for (metaph_end = Metaph + MAXMETAPH, n_start = n;
332 n <= n_end && Metaph < metaph_end; ++n) {
333 if (KSflag) {
334 KSflag = false;
335 *Metaph++ = *n;
336 } else {
337 // Drop duplicates except for CC
339 if (*(n - 1) == *n && *n != 'C')
340 continue;
342 // Check for F J L M N R or first letter vowel
344 if (same(*n) || (n == n_start && vowel(*n))) {
345 *Metaph++ = *n;
346 } else {
347 switch (*n) {
348 case 'B':
349 if (n < n_end || *(n - 1) != 'M')
350 *Metaph++ = *n;
351 break;
353 case 'C':
354 if (*(n - 1) != 'S' || !frontv(*(n + 1))) {
355 if ('I' == *(n + 1) && 'A' == *(n + 2))
356 *Metaph++ = 'X';
357 else if (frontv(*(n + 1)))
358 *Metaph++ = 'S';
359 else if ('H' == *(n + 1)) {
360 *Metaph++ = ((n == n_start && !vowel(*(n + 2)))
361 || 'S' == *(n - 1)) ? 'K' : 'X';
362 } else {
363 *Metaph++ = 'K';
366 break;
368 case 'D':
369 *Metaph++ = ('G' == *(n + 1) && frontv(*(n + 2)))
370 ? 'J' : 'T';
371 break;
373 case 'G':
374 if ((*(n + 1) != 'H' || vowel(*(n + 2)))
375 && (*(n + 1) != 'N' || ((n + 1) < n_end
376 && (*(n + 2) != 'E' || *(n + 3) != 'D')))
377 && (*(n - 1) != 'D' || !frontv(*(n + 1)))) {
378 *Metaph++ = (frontv(*(n + 1))
379 && *(n + 2) != 'G') ? 'J' : 'K';
380 } else if ('H' == *(n + 1) && !noghf(*(n - 3))
381 && *(n - 4) != 'H') {
382 *Metaph++ = 'F';
384 break;
386 case 'H':
387 if (!varson(*(n - 1))
388 && (!vowel(*(n - 1)) || vowel(*(n + 1)))) {
389 *Metaph++ = 'H';
391 break;
393 case 'K':
394 if (*(n - 1) != 'C')
395 *Metaph++ = 'K';
396 break;
398 case 'P':
399 *Metaph++ = ('H' == *(n + 1)) ? 'F' : 'P';
400 break;
402 case 'Q':
403 *Metaph++ = 'K';
404 break;
406 case 'S':
407 *Metaph++ = ('H' == *(n + 1) || ('I' == *(n + 1)
408 && ('O' == *(n + 2) || 'A' == *(n + 2))))
409 ? 'X' : 'S';
410 break;
412 case 'T':
413 if ('I' == *(n + 1)
414 && ('O' == *(n + 2) || 'A' == *(n + 2))) {
415 *Metaph++ = 'X';
416 } else if ('H' == *(n + 1)) {
417 *Metaph++ = 'O';
418 } else if (*(n + 1) != 'C' || *(n + 2) != 'H') {
419 *Metaph++ = 'T';
421 break;
423 case 'V':
424 *Metaph++ = 'F';
425 break;
427 case 'W':
428 case 'Y':
429 if (vowel(*(n + 1)))
430 *Metaph++ = *n;
431 break;
433 case 'X':
434 if (n == n_start) {
435 *Metaph++ = 'S';
436 } else {
437 *Metaph++ = 'K';
438 KSflag = true;
440 break;
442 case 'Z':
443 *Metaph++ = 'S';
444 break;
449 // Compare new Metaphone code with old
450 if (COMPARE == Flag
451 && *(Metaph - 1) != metaph[(Metaph - newm) - 1]) {
452 return false;
456 // If comparing, check if Metaphone codes were equal in length
457 if (COMPARE == Flag && metaph[Metaph - newm])
458 return false;
460 *Metaph = NUL;
461 return true;
466 word_match(const char* reference, const char* test)
468 const char *s1, *s2;
469 int32 x = 0;
470 char c1, c2;
471 s1 = test;
472 s2 = reference;
474 bool a, b;
476 while (*s2 || *s1) {
477 c1 = tolower(*s1);
478 c2 = tolower(*s2);
480 if (*s2 && *s1) {
481 if (c1 != c2) {
482 a = (tolower(s1[1]) == c2);
483 b = (tolower(s2[1]) == c1);
484 // Reversed pair
485 if (a && b) {
486 x += 1;
487 s1++;
488 s2++;
490 // Extra character
491 if (a) {
492 x += 1;
493 s1++;
495 // Missing Character
496 else if (b) {
497 x += 1;
498 s2++;
500 // Equivalent Character
501 else if (vsvfn[(unsigned)c1] == vsvfn[(unsigned)c2])
502 x++;
503 // Unrelated Character
504 else
505 x += 3;
507 } else {
508 x += 1;
511 if (*s2)
512 s2++;
513 if (*s1)
514 s1++;
517 return x;
521 int32
522 suffix_word(char* dst, const char* src, char flag)
524 char* end;
526 end = stpcpy(dst, src);
527 flag = toupper(flag);
528 switch(flag) {
529 case 'V':
530 switch(end[-1]) {
531 case 'e':
532 end = stpcpy(end - 1, "ive");
533 break;
534 default:
535 end = stpcpy(end, "ive");
536 break;
538 break;
539 case 'N':
540 switch(end[-1]) {
541 case 'e':
542 end = stpcpy(end - 1, "ion");
543 break;
544 case 'y':
545 end = stpcpy(end - 1, "ication");
546 break;
547 default:
548 end = stpcpy(end, "en");
549 break;
551 break;
552 case 'X':
553 switch(end[-1]) {
554 case 'e':
555 end = stpcpy(end - 1, "ions");
556 break;
557 case 'y':
558 end = stpcpy(end - 1, "ications");
559 break;
560 default:
561 end = stpcpy(end, "ens");
562 break;
564 break;
565 case 'H':
566 switch(end[-1]) {
567 case 'y':
568 end = stpcpy(end - 1, "ieth");
569 break;
570 default:
571 end = stpcpy(end, "th");
572 break;
574 break;
575 case 'Y':
576 end = stpcpy(end, "ly");
577 break;
578 case 'G':
579 switch(end[-1]) {
580 case 'e':
581 end = stpcpy(end - 1, "ing");
582 break;
583 default:
584 end = stpcpy(end, "ing");
585 break;
587 break;
588 case 'J':
589 switch(end[-1]) {
590 case 'e':
591 end = stpcpy(end - 1, "ings");
592 break;
593 default:
594 end = stpcpy(end, "ings");
595 break;
597 break;
598 case 'D':
599 switch(end[-1]) {
600 case 'e':
601 end = stpcpy(end - 1, "ed");
602 break;
603 case 'y':
604 if (!strchr("aeiou", end[-2])) {
605 end = stpcpy(end - 1, "ied");
606 break;
608 // Fall through
609 default:
610 end = stpcpy(end, "ed");
611 break;
613 break;
614 case 'T':
615 switch(end[-1]) {
616 case 'e':
617 end = stpcpy(end - 1, "est");
618 break;
619 case 'y':
620 if (!strchr("aeiou", end[-2])) {
621 end = stpcpy(end - 1, "iest");
622 break;
624 // Fall through
625 default:
626 end = stpcpy(end, "est");
627 break;
629 break;
630 case 'R':
631 switch(end[-1]) {
632 case 'e':
633 end = stpcpy(end - 1, "er");
634 break;
635 case 'y':
636 if (!strchr("aeiou", end[-2])) {
637 end = stpcpy(end - 1, "ier");
638 break;
640 // Fall through
641 default:
642 end = stpcpy(end, "er");
643 break;
645 break;
646 case 'Z':
647 switch(end[-1]) {
648 case 'e':
649 end = stpcpy(end - 1, "ers");
650 break;
651 case 'y':
652 if (!strchr("aeiou", end[-2])) {
653 end = stpcpy(end - 1, "iers");
654 break;
656 // Fall through
657 default:
658 end = stpcpy(end, "ers");
659 break;
661 break;
662 case 'S':
663 switch(end[-1]) {
664 case 's':
665 case 'x':
666 case 'z':
667 case 'h':
668 end = stpcpy(end, "es");
669 break;
670 case 'y':
671 if (!strchr("aeiou", end[-2])) {
672 end = stpcpy(end - 1, "ies");
673 break;
675 // Fall through
676 default:
677 end = stpcpy(end, "s");
678 break;
680 break;
681 case 'P':
682 switch(end[-1]) {
683 case 'y':
684 if (!strchr("aeiou", end[-2])) {
685 end = stpcpy(end - 1, "iness");
686 break;
688 // Fall through
689 default:
690 end = stpcpy(end, "ness");
691 break;
693 break;
694 case 'M':
695 end = stpcpy(end, "'s");
696 break;
697 default:
698 return 0;
700 return end - dst;
704 int32
705 Words::FindBestMatches(BList* matches, const char* s)
707 int32 index;
708 // printf("*** Looking for %s: ***\n", s);
710 if ((index = FindFirst(s)) >= 0) {
711 BString srcWord(s);
712 FileEntry* entry;
713 WIndexEntry* indexEntry;
715 int32 key = (ItemAt(index))->key;
716 int32 suffixLength;
717 char word[128], suffixWord[128];
718 const char *src, *testWord;
719 const char *suffixFlags;
720 char *dst;
722 gCmpKey = srcWord.String();
724 uint8 hashTable[32];
725 uint8 hashValue, highHash, lowHash;
726 for (int32 i = 0; i < 32; i++)
727 hashTable[i] = 0;
729 do {
730 indexEntry = ItemAt(index);
731 // Hash the entry offset; we use this to make sure we don't add
732 // the same word file entry twice;
733 // It is possible for the same entry in the words file to have
734 // multiple entries in the index.
736 hashValue = indexEntry->offset % 256;
737 highHash = hashValue >> 3;
738 lowHash = 0x01 << (hashValue & 0x07);
740 // printf("Testing Entry: %ld: hash=%d, highHash=%d, lowHash=%d\n",
741 // indexEntry->offset, hashValue, (uint16)highHash,
742 // (uint16)lowHash);
744 // Has this entry offset been seen before?
745 if (!(hashTable[highHash] & lowHash)) {
746 // printf("New Entry\n");
747 hashTable[highHash] |= lowHash; // Mark this offset so we don't add it twice
749 entry = GetEntry(index);
750 src = entry->String();
751 while (*src && !isalpha(*src))
752 src++;
753 dst = word;
754 while (*src && *src != '/')
755 *dst++ = *src++;
756 *dst = 0;
757 if (*src == '/')
758 suffixFlags = src + 1;
759 else
760 suffixFlags = src;
762 // printf("Base Word: %s\n", word);
763 // printf("Flags: %s\n", suffixFlags);
764 testWord = word; // Test the base word first
765 do {
766 // printf("Testing: %s\n", testWord);
767 // Does this word match the key
768 if ((GetKey(testWord) == key)
769 // And does it look close enough to the compare key?
770 // word_match(gCmpKey, testWord)
771 // <= int32((strlen(gCmpKey)-1)/2))
772 && word_match(gCmpKey, testWord)
773 <= int32(float(strlen(gCmpKey)-1)*.75)) {
774 // printf("Added: %s\n", testWord);
775 matches->AddItem((void*)(new BString(testWord)));
778 // If suffix, transform and test
779 if (*suffixFlags) {
780 // Repeat until valid suffix found or end is reached
781 suffixLength = 0;
782 while (*suffixFlags
783 && !(suffixLength = suffix_word(suffixWord,
784 word, *suffixFlags++))) {}
785 if (suffixLength)
786 testWord = suffixWord;
787 else
788 testWord = NULL;
789 } else {
790 testWord = NULL;
792 } while (testWord);
793 delete entry;
795 // else printf("Redundant entry\n");
797 index++;
798 } while (key == (ItemAt(index))->key);
800 return matches->CountItems();
801 } else {
802 return 0;
807 void
808 sort_word_list(BList* matches, const char* reference)
810 if (matches->CountItems() > 0) {
811 BString srcWord(reference);
812 gCmpKey = srcWord.String();
813 matches->SortItems((int(*)(const void*, const void*))word_cmp);