8354 sync regcomp(3C) with upstream (fix make catalog)
[unleashed/tickless.git] / usr / src / cmd / spell / spellprog.c
blob7635f8678654f2e5a8556efd3628fb0ed4e98eb6
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
20 * CDDL HEADER END
23 * Copyright 2015 Gary Mills
24 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
25 * Use is subject to license terms.
28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 /* All Rights Reserved */
31 #include <stdlib.h>
32 #include <unistd.h>
33 #include <limits.h>
34 #include <string.h>
35 #include <stdio.h>
36 #include <ctype.h>
37 #include <locale.h>
38 #include "hash.h"
/*
 * Tolower(c): lower-case c if it is an upper-case letter, otherwise yield c
 * unchanged.  The argument is converted to unsigned char before it is handed
 * to the <ctype.h> routines, because passing a negative char value to them
 * is undefined.  The argument is evaluated more than once, so it must not
 * have side effects.
 */
#define	Tolower(c)	\
	(isupper((unsigned char)(c)) ? tolower((unsigned char)(c)) : (c))

/* amount by which trysuff() advances the deriv[] level on each call */
#define	DLEV		2
/*
 * ANSI prototypes
 *
 * The (char *, char *, char *, int) functions are the suffix handlers that
 * trysuff() calls through the suffix tables.
 */
static int	ily(char *, char *, char *, int);
static int	s(char *, char *, char *, int);
static int	es(char *, char *, char *, int);
static int	subst(char *, char *, char *, int);
static int	nop(void);
static int	bility(char *, char *, char *, int);
static int	i_to_y(char *, char *, char *, int);
static int	CCe(char *, char *, char *, int);
static int	y_to_e(char *, char *, char *, int);
static int	strip(char *, char *, char *, int);
static int	ize(char *, char *, char *, int);
static int	tion(char *, char *, char *, int);
static int	an(char *, char *, char *, int);
int		prime(char *);
static int	tryword(char *, char *, int);
static int	trypref(char *, char *, int);
static int	trysuff(char *, int);
static int	vowel(int);
static int	dict(char *, char *);
static int	monosyl(char *, char *);
static int	VCe(char *, char *, char *, int);
static char	*skipv(char *);
/*
 * One suffix rule.  trysuff() compares suf against the end of the word
 * backwards, so suf is spelled in reverse ("ssen" matches "...ness").
 *
 * When the suffix matches, p1 is called with the end pointer moved back by
 * n1 characters and the notes d1 and a1; if that fails and p2 is set, p2 is
 * called the same way with n2, d2 and a2.
 *
 * NOTE(review): the handlers are stored through unprototyped function
 * pointers because nop() takes no arguments while every other handler takes
 * (char *, char *, char *, int).
 */
struct suftab {
	char *suf;	/* reversed suffix text; NULL ends the table */
	int (*p1)();	/* first handler to try */
	int n1;		/* characters to step back before calling p1 */
	char *d1;	/* derivation note passed to p1 as "d" */
	char *a1;	/* derivation note passed to p1 as "a" */
	int (*p2)();	/* optional second handler, or NULL */
	int n2;		/* characters to step back before calling p2 */
	char *d2;	/* derivation note passed to p2 as "d" */
	char *a2;	/* derivation note passed to p2 as "a" */
};
81 static struct suftab sufa[] = {
82 {"ssen", ily, 4, "-y+iness", "+ness" },
83 {"ssel", ily, 4, "-y+i+less", "+less" },
84 {"se", s, 1, "", "+s", es, 2, "-y+ies", "+es" },
85 {"s'", s, 2, "", "+'s"},
86 {"s", s, 1, "", "+s"},
87 {"ecn", subst, 1, "-t+ce", ""},
88 {"ycn", subst, 1, "-t+cy", ""},
89 {"ytilb", nop, 0, "", ""},
90 {"ytilib", bility, 5, "-le+ility", ""},
91 {"elbaif", i_to_y, 4, "-y+iable", ""},
92 {"elba", CCe, 4, "-e+able", "+able"},
93 {"yti", CCe, 3, "-e+ity", "+ity"},
94 {"ylb", y_to_e, 1, "-e+y", ""},
95 {"yl", ily, 2, "-y+ily", "+ly"},
96 {"laci", strip, 2, "", "+al"},
97 {"latnem", strip, 2, "", "+al"},
98 {"lanoi", strip, 2, "", "+al"},
99 {"tnem", strip, 4, "", "+ment"},
100 {"gni", CCe, 3, "-e+ing", "+ing"},
101 {"reta", nop, 0, "", ""},
102 {"retc", nop, 0, "", ""},
103 {"re", strip, 1, "", "+r", i_to_y, 2, "-y+ier", "+er"},
104 {"de", strip, 1, "", "+d", i_to_y, 2, "-y+ied", "+ed"},
105 {"citsi", strip, 2, "", "+ic"},
106 {"citi", ize, 1, "-ic+e", ""},
107 {"cihparg", i_to_y, 1, "-y+ic", ""},
108 {"tse", strip, 2, "", "+st", i_to_y, 3, "-y+iest", "+est"},
109 {"cirtem", i_to_y, 1, "-y+ic", ""},
110 {"yrtem", subst, 0, "-er+ry", ""},
111 {"cigol", i_to_y, 1, "-y+ic", ""},
112 {"tsigol", i_to_y, 2, "-y+ist", ""},
113 {"tsi", CCe, 3, "-e+ist", "+ist"},
114 {"msi", CCe, 3, "-e+ism", "+ist"},
115 {"noitacifi", i_to_y, 6, "-y+ication", ""},
116 {"noitazi", ize, 4, "-e+ation", ""},
117 {"rota", tion, 2, "-e+or", ""},
118 {"rotc", tion, 2, "", "+or"},
119 {"noit", tion, 3, "-e+ion", "+ion"},
120 {"naino", an, 3, "", "+ian"},
121 {"na", an, 1, "", "+n"},
122 {"evi", subst, 0, "-ion+ive", ""},
123 {"ezi", CCe, 3, "-e+ize", "+ize"},
124 {"pihs", strip, 4, "", "+ship"},
125 {"dooh", ily, 4, "-y+ihood", "+hood"},
126 {"luf", ily, 3, "-y+iful", "+ful"},
127 {"ekil", strip, 4, "", "+like"},
131 static struct suftab sufb[] = {
132 {"ssen", ily, 4, "-y+iness", "+ness" },
133 {"ssel", ily, 4, "-y+i+less", "+less" },
134 {"se", s, 1, "", "+s", es, 2, "-y+ies", "+es" },
135 {"s'", s, 2, "", "+'s"},
136 {"s", s, 1, "", "+s"},
137 {"ecn", subst, 1, "-t+ce", ""},
138 {"ycn", subst, 1, "-t+cy", ""},
139 {"ytilb", nop, 0, "", ""},
140 {"ytilib", bility, 5, "-le+ility", ""},
141 {"elbaif", i_to_y, 4, "-y+iable", ""},
142 {"elba", CCe, 4, "-e+able", "+able"},
143 {"yti", CCe, 3, "-e+ity", "+ity"},
144 {"ylb", y_to_e, 1, "-e+y", ""},
145 {"yl", ily, 2, "-y+ily", "+ly"},
146 {"laci", strip, 2, "", "+al"},
147 {"latnem", strip, 2, "", "+al"},
148 {"lanoi", strip, 2, "", "+al"},
149 {"tnem", strip, 4, "", "+ment"},
150 {"gni", CCe, 3, "-e+ing", "+ing"},
151 {"reta", nop, 0, "", ""},
152 {"retc", nop, 0, "", ""},
153 {"re", strip, 1, "", "+r", i_to_y, 2, "-y+ier", "+er"},
154 {"de", strip, 1, "", "+d", i_to_y, 2, "-y+ied", "+ed"},
155 {"citsi", strip, 2, "", "+ic"},
156 {"citi", ize, 1, "-ic+e", ""},
157 {"cihparg", i_to_y, 1, "-y+ic", ""},
158 {"tse", strip, 2, "", "+st", i_to_y, 3, "-y+iest", "+est"},
159 {"cirtem", i_to_y, 1, "-y+ic", ""},
160 {"yrtem", subst, 0, "-er+ry", ""},
161 {"cigol", i_to_y, 1, "-y+ic", ""},
162 {"tsigol", i_to_y, 2, "-y+ist", ""},
163 {"tsi", CCe, 3, "-e+ist", "+ist"},
164 {"msi", CCe, 3, "-e+ism", "+ist"},
165 {"noitacifi", i_to_y, 6, "-y+ication", ""},
166 {"noitasi", ize, 4, "-e+ation", ""},
167 {"rota", tion, 2, "-e+or", ""},
168 {"rotc", tion, 2, "", "+or"},
169 {"noit", tion, 3, "-e+ion", "+ion"},
170 {"naino", an, 3, "", "+ian"},
171 {"na", an, 1, "", "+n"},
172 {"evi", subst, 0, "-ion+ive", ""},
173 {"esi", CCe, 3, "-e+ise", "+ise"},
174 {"pihs", strip, 4, "", "+ship"},
175 {"dooh", ily, 4, "-y+ihood", "+hood"},
176 {"luf", ily, 3, "-y+iful", "+ful"},
177 {"ekil", strip, 4, "", "+like"},
/*
 * Prefixes that lookuppref() will strip from the front of a word.  The list
 * is scanned in order and the first match wins, so a prefix that begins
 * with another prefix must be listed first.  The list ends with a NULL
 * pointer.
 */
static char *preftab[] = {
	"anti",
	"auto",
	"bio",
	"counter",
	"dis",
	"electro",
	"en",
	"fore",
	"geo",
	"hyper",
	"intra",
	"inter",
	"iso",
	"kilo",
	"magneto",
	"meta",
	"micro",
	"mid",
	"milli",
	"mis",
	"mono",
	"multi",
	"non",
	"out",
	"over",
	"photo",
	"poly",
	"pre",
	"pseudo",
	"psycho",
	"re",
	"semi",
	"stereo",
	"sub",
	"super",
	"tele",
	"thermo",
	"ultra",
	"under",	/* must precede un */
	"un",
	0		/* terminator */
};
225 static int bflag;
226 static int vflag;
227 static int xflag;
228 static struct suftab *suftab;
229 static char *prog;
230 static char word[LINE_MAX];
231 static char original[LINE_MAX];
232 static char *deriv[LINE_MAX];
233 static char affix[LINE_MAX];
234 static FILE *file, *found;
236 * deriv is stack of pointers to notes like +micro +ed
237 * affix is concatenated string of notes
238 * the buffer size 141 stems from the sizes of original and affix.
/*
 * in an attempt to defray future maintenance misunderstandings, here is
 * an attempt to describe the input/output expectations of the spell
 * program.
 *
 * spellprog is intended to be called from the shell file spell.
 * because of this, there is little error checking (this is historical, not
 * necessarily advisable).
 *
 *	spellprog options hashed-list pass
 *
 * the hashed-list is a list of the form made by spellin.
 * there are 2 types of hashed lists:
 *	1. a stop list: this specifies words that by the rules embodied
 *	   in spellprog would be recognized as correct, BUT are really
 *	   errors.
 *	2. a dictionary of correctly spelled words.
 * the pass number determines how the words found in the specified
 * hashed-list are treated. If the pass number is 1, the hashed-list is
 * treated as the stop-list, otherwise, it is treated as the regular
 * dictionary list. in this case, the value of "pass" is a filename. Found
 * words are written to this file.
 *
 * In the normal case, the filename = /dev/null. However, if the v option
 * is specified, the derivations are written to this file.
 * The spellprog looks up words in the hashed-list; if a word is found, it
 * is printed to the stdout. If the hashed-list was the stop-list, the
 * words found are presumed to be misspellings. in this case,
 * a control character is printed (a "-" is appended to the word.
 * a hyphen will never occur naturally in the input list because deroff
 * is used in the shell file before calling spellprog.)
 * If the regular spelling list was used (hlista or hlistb), the words
 * are correct, and may be ditched. (unless the -v option was used -
 * see the manual page).
 *
 * spellprog should be called twice : first with the stop-list, to flag all
 * a priori incorrectly spelled words; second with the dictionary.
 *
 *	spellprog hstop 1 |\
 *	spellprog hlista /dev/null
 *
 * for a complete scenario, see the shell file: spell.
 *
 */
287 main(int argc, char **argv)
289 char *ep, *cp;
290 char *dp;
291 int fold;
292 int c, j;
293 int pass;
295 /* Set locale environment variables local definitions */
296 (void) setlocale(LC_ALL, "");
297 #if !defined(TEXT_DOMAIN) /* Should be defined by cc -D */
298 #define TEXT_DOMAIN "SYS_TEST" /* Use this only if it wasn't */
299 #endif
300 (void) textdomain(TEXT_DOMAIN);
303 prog = argv[0];
304 while ((c = getopt(argc, argv, "bvx")) != EOF) {
305 switch (c) {
306 case 'b':
307 bflag++;
308 break;
309 case 'v':
310 vflag++;
311 break;
312 case 'x':
313 xflag++;
314 break;
318 argc -= optind;
319 argv = &argv[optind];
321 if ((argc < 2) || !prime(*argv)) {
322 (void) fprintf(stderr,
323 gettext("%s: cannot initialize hash table\n"), prog);
324 exit(1);
326 argc--;
327 argv++;
329 /* Select the correct suffix table */
330 suftab = (bflag == 0) ? sufa : sufb;
333 * if pass is not 1, it is assumed to be a filename.
334 * found words are written to this file.
336 pass = **argv;
337 if (pass != '1')
338 found = fopen(*argv, "w");
340 for (;;) {
341 affix[0] = 0;
342 file = stdout;
343 for (ep = word; (*ep = j = getchar()) != '\n'; ep++)
344 if (j == EOF)
345 exit(0);
347 * here is the hyphen processing. these words were found in the stop
348 * list. however, if they exist as is, (no derivations tried) in the
349 * dictionary, let them through as correct.
352 if (ep[-1] == '-') {
353 *--ep = 0;
354 if (!tryword(word, ep, 0))
355 (void) fprintf(file, "%s\n", word);
356 continue;
358 for (cp = word, dp = original; cp < ep; )
359 *dp++ = *cp++;
360 *dp = 0;
361 fold = 0;
362 for (cp = word; cp < ep; cp++)
363 if (islower(*cp))
364 goto lcase;
365 if (((ep - word) == 1) &&
366 ((word[0] == 'A') || (word[0] == 'I')))
367 continue;
368 if (trypref(ep, ".", 0))
369 goto foundit;
370 ++fold;
371 for (cp = original+1, dp = word+1; dp < ep; dp++, cp++)
372 *dp = Tolower(*cp);
373 lcase:
374 if (((ep - word) == 1) && (word[0] == 'a'))
375 continue;
376 if (trypref(ep, ".", 0)||trysuff(ep, 0))
377 goto foundit;
378 if (isupper(word[0])) {
379 for (cp = original, dp = word; *dp = *cp++; dp++)
380 if (fold) *dp = Tolower(*dp);
381 word[0] = Tolower(word[0]);
382 goto lcase;
384 (void) fprintf(file, "%s\n", original);
385 continue;
387 foundit:
388 if (pass == '1')
389 (void) fprintf(file, "%s-\n", original);
390 else if (affix[0] != 0 && affix[0] != '.') {
391 file = found;
392 (void) fprintf(file, "%s\t%s\n", affix,
393 original);
399 * strip exactly one suffix and do
400 * indicated routine(s), which may recursively
401 * strip suffixes
404 static int
405 trysuff(char *ep, int lev)
407 struct suftab *t;
408 char *cp, *sp;
410 lev += DLEV;
411 deriv[lev] = deriv[lev-1] = 0;
412 for (t = &suftab[0]; (t != 0 && (sp = t->suf) != 0); t++) {
413 cp = ep;
414 while (*sp)
415 if (*--cp != *sp++)
416 goto next;
417 for (sp = cp; --sp >= word && !vowel(*sp); )
419 if (sp < word)
420 return (0);
421 if ((*t->p1)(ep-t->n1, t->d1, t->a1, lev+1))
422 return (1);
423 if (t->p2 != 0) {
424 deriv[lev] = deriv[lev+1] = 0;
425 return ((*t->p2)(ep-t->n2, t->d2, t->a2, lev));
427 return (0);
428 next:;
430 return (0);
/*
 * Suffix handler that never accepts.  The suffix tables use it for endings
 * ("ytilb", "reta", "retc") where a match should end suffix processing
 * without accepting the word: trysuff() returns 0 as soon as this handler
 * fails and the entry has no second handler.
 */
static int
nop(void)
{
	return (0);
}
/*
 * Suffix handler: accept the stem that ends at ep if it is a word (possibly
 * after prefix stripping, recording note a), or if a further suffix can be
 * stripped from it.  d is unused.
 */
/* ARGSUSED */
static int
strip(char *ep, char *d, char *a, int lev)
{
	return (trypref(ep, a, lev) || trysuff(ep, lev));
}
446 static int
447 s(char *ep, char *d, char *a, int lev)
449 if (lev > DLEV+1)
450 return (0);
451 if (*ep == 's' && ep[-1] == 's')
452 return (0);
453 return (strip(ep, d, a, lev));
456 /* ARGSUSED */
457 static int
458 an(char *ep, char *d, char *a, int lev)
460 if (!isupper(*word)) /* must be proper name */
461 return (0);
462 return (trypref(ep, a, lev));
/*
 * Suffix handler: overwrite the last character of the stem with 'e' and
 * try the result, recording note d.  The change to word[] is not undone.
 * a is unused.
 */
/* ARGSUSED */
static int
ize(char *ep, char *d, char *a, int lev)
{
	ep[-1] = 'e';
	return (strip(ep, "", d, lev));
}
/*
 * Suffix handler: append 'e' to the stem by storing it at *ep, then try the
 * lengthened stem, recording note d.  The change to word[] is not undone.
 * a is unused.
 */
/* ARGSUSED */
static int
y_to_e(char *ep, char *d, char *a, int lev)
{
	*ep++ = 'e';
	return (strip(ep, "", d, lev));
}
/*
 * Suffix handler: if the stem ends in 'i', treat it as a changed 'y'
 * (i_to_y); otherwise strip the suffix as it stands.
 */
static int
ily(char *ep, char *d, char *a, int lev)
{
	if (ep[-1] == 'i')
		return (i_to_y(ep, d, a, lev));
	else
		return (strip(ep, d, a, lev));
}
/*
 * Suffix handler for "-ibility": store 'l' at *ep and let y_to_e() add the
 * 'e' after it, so that the stem ends in "le" (note "-le+ility").
 */
static int
bility(char *ep, char *d, char *a, int lev)
{
	*ep++ = 'l';
	return (y_to_e(ep, d, a, lev));
}
/*
 * Suffix handler: if the stem ends in 'i', change that letter to 'y' and
 * use note d; otherwise leave the stem alone and use note a.  The change to
 * word[] is not undone.
 */
static int
i_to_y(char *ep, char *d, char *a, int lev)
{
	if (ep[-1] == 'i') {
		ep[-1] = 'y';
		a = d;
	}
	return (strip(ep, "", a, lev));
}
507 static int
508 es(char *ep, char *d, char *a, int lev)
510 if (lev > DLEV)
511 return (0);
512 switch (ep[-1]) {
513 default:
514 return (0);
515 case 'i':
516 return (i_to_y(ep, d, a, lev));
517 case 's':
518 case 'h':
519 case 'z':
520 case 'x':
521 return (strip(ep, d, a, lev));
525 /* ARGSUSED */
526 static int
527 subst(char *ep, char *d, char *a, int lev)
529 char *u, *t;
531 if (skipv(skipv(ep-1)) < word)
532 return (0);
533 for (t = d; *t != '+'; t++)
534 continue;
535 for (u = ep; *--t != '-'; )
536 *--u = *t;
537 return (strip(ep, "", d, lev));
/*
 * Suffix handler for "-tion"/"-tor" style endings, chosen by the character
 * two places before ep: 'c' or 'r' tries the stem as it stands (no further
 * suffixes), 'a' tries it with an 'e' added, anything else is rejected.
 */
static int
tion(char *ep, char *d, char *a, int lev)
{
	switch (ep[-2]) {
	case 'c':
	case 'r':
		return (trypref(ep, a, lev));
	case 'a':
		return (y_to_e(ep, d, a, lev));
	}
	return (0);
}
/*
 * possible consonant-consonant-e ending
 *
 * Suffix handler that decides, from the last two letters of the stem,
 * whether to try the stem with an 'e' added (y_to_e).  Cases that break out
 * of the switch fall through to VCe().  y_to_e() stores 'e' at *ep and does
 * not undo it, so the order of the tests below matters.
 */
static int
CCe(char *ep, char *d, char *a, int lev)
{
	switch (ep[-1]) {
	case 'r':
		if (ep[-2] == 't')
			return (y_to_e(ep, d, a, lev));
		break;
	case 'l':
		if (vowel(ep[-2]))
			break;
		switch (ep[-2]) {
		case 'l':
		case 'r':
		case 'w':
			break;
		default:
			return (y_to_e(ep, d, a, lev));
		}
		break;
	case 's':
		if (ep[-2] == 's')
			break;
		if (*ep == 'a')
			return (0);
		if (vowel(ep[-2]))
			break;
		if (y_to_e(ep, d, a, lev))
			return (1);
		if (!(ep[-2] == 'n' && ep[-1] == 'g'))
			return (0);
		break;
	case 'c':
	case 'g':
		if (*ep == 'a')
			return (0);
		if (vowel(ep[-2]))
			break;
		if (y_to_e(ep, d, a, lev))
			return (1);
		/* stems ending in "ng" also get the VCe() attempt */
		if (!(ep[-2] == 'n' && ep[-1] == 'g'))
			return (0);
		break;
	case 'v':
	case 'z':
		if (vowel(ep[-2]))
			break;
		if (y_to_e(ep, d, a, lev))
			return (1);
		if (!(ep[-2] == 'n' && ep[-1] == 'g'))
			return (0);
		break;
	case 'u':
		if (y_to_e(ep, d, a, lev))
			return (1);
		if (!(ep[-2] == 'n' && ep[-1] == 'g'))
			return (0);
		break;
	}
	return (VCe(ep, d, a, lev));
}
/*
 * possible consonant-vowel-consonant-e ending
 *
 * A stem that already ends in 'e' is rejected.  If the stem ends in a
 * vowel followed by a consonant, it is first tried with an 'e' added
 * (note d); if that fails the overwritten character is put back and the
 * stem is tried as it stands (note a).
 */
static int
VCe(char *ep, char *d, char *a, int lev)
{
	char c;

	c = ep[-1];
	if (c == 'e')
		return (0);
	if (!vowel(c) && vowel(ep[-2])) {
		c = *ep;	/* saved so the 'e' can be taken out again */
		*ep++ = 'e';
		if (trypref(ep, d, lev) || trysuff(ep, lev))
			return (1);
		ep--;
		*ep = c;
	}
	return (strip(ep, d, a, lev));
}
636 static char *
637 lookuppref(char **wp, char *ep)
639 char **sp;
640 char *bp, *cp;
642 for (sp = preftab; *sp; sp++) {
643 bp = *wp;
644 for (cp = *sp; *cp; cp++, bp++)
645 if (Tolower(*bp) != *cp)
646 goto next;
647 for (cp = bp; cp < ep; cp++)
648 if (vowel(*cp)) {
649 *wp = bp;
650 return (*sp);
652 next:;
654 return (0);
658 * while word is not in dictionary try stripping
659 * prefixes. Fail if no more prefixes.
661 static int
662 trypref(char *ep, char *a, int lev)
664 char *cp;
665 char *bp;
666 char *pp;
667 int val = 0;
668 char space[LINE_MAX * 2];
669 deriv[lev] = a;
670 if (tryword(word, ep, lev))
671 return (1);
672 bp = word;
673 pp = space;
674 deriv[lev+1] = pp;
675 while (cp = lookuppref(&bp, ep)) {
676 *pp++ = '+';
677 while (*pp = *cp++)
678 pp++;
679 if (tryword(bp, ep, lev+1)) {
680 val = 1;
681 break;
684 deriv[lev+1] = deriv[lev+2] = 0;
685 return (val);
688 static int
689 tryword(char *bp, char *ep, int lev)
691 int i, j;
692 char duple[3];
693 if (ep-bp <= 1)
694 return (0);
695 if (vowel(*ep)) {
696 if (monosyl(bp, ep))
697 return (0);
699 i = dict(bp, ep);
700 if (i == 0 && vowel(*ep) && ep[-1] == ep[-2] && monosyl(bp, ep-1)) {
701 ep--;
702 deriv[++lev] = duple;
703 duple[0] = '+';
704 duple[1] = *ep;
705 duple[2] = 0;
706 i = dict(bp, ep);
708 if (vflag == 0 || i == 0)
709 return (i);
711 * when derivations are wanted, collect them
712 * for printing
714 j = lev;
715 do {
716 if (deriv[j])
717 (void) strcat(affix, deriv[j]);
718 } while (--j > 0);
719 return (i);
/*
 * Return 1 if the characters from bp up to ep end in a single vowel
 * followed by a single consonant other than 'x' or 'w', and contain no
 * other vowel before that; return 0 otherwise (including when fewer than
 * two characters are given).
 */
static int
monosyl(char *bp, char *ep)
{
	if (ep < bp+2)
		return (0);
	/* last letter must be a consonant, the one before it a vowel */
	if (vowel(*--ep) || !vowel(*--ep) || ep[1] == 'x' || ep[1] == 'w')
		return (0);
	/* no vowel anywhere earlier; ep never moves below bp */
	while (ep > bp) {
		if (vowel(*--ep))
			return (0);
	}
	return (1);
}
736 static char *
737 skipv(char *s)
739 if (s >= word&&vowel(*s))
740 s--;
741 while (s >= word && !vowel(*s))
742 s--;
743 return (s);
/*
 * Return 1 if c is one of the letters a, e, i, o, u or y in either case,
 * 0 otherwise.  c usually comes from a plain char and so may be negative;
 * it is reduced to unsigned char before being given to the <ctype.h>
 * routines, which must not be passed negative values.
 */
static int
vowel(int c)
{
	unsigned char uc = (unsigned char)c;

	if (isupper(uc))
		c = tolower(uc);
	switch (c) {
	case 'a':
	case 'e':
	case 'i':
	case 'o':
	case 'u':
	case 'y':
		return (1);
	}
	return (0);
}
761 static int
762 dict(char *bp, char *ep)
764 int temp, result;
765 if (xflag)
766 (void) fprintf(stdout, "=%.*s\n", ep-bp, bp);
767 temp = *ep;
768 *ep = 0;
769 result = hashlook(bp);
770 *ep = temp;
771 return (result);