Fix oversight in previous error-reporting patch; mustn't pfree path string
[PostgreSQL.git] / src / backend / tsearch / wparser_def.c
bloba07fedc0a7a0a00f0862e38e2928bbf6a59a0045
1 /*-------------------------------------------------------------------------
3 * wparser_def.c
4 * Default text search parser
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
9 * IDENTIFICATION
10 * $PostgreSQL$
12 *-------------------------------------------------------------------------
15 #include "postgres.h"
17 #include "commands/defrem.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_public.h"
20 #include "tsearch/ts_type.h"
21 #include "tsearch/ts_utils.h"
22 #include "utils/builtins.h"
25 /* Define me to enable tracing of parser behavior */
26 /* #define WPARSER_TRACE */
/*
 * Output token categories.
 *
 * The numbering matches the order of the entries in tok_alias[] and
 * lex_descr[] (whose slot 0 is an empty placeholder).  LASTNUM is the
 * highest category number in use.
 */
#define ASCIIWORD		1
#define WORD_T			2
#define NUMWORD			3
#define EMAIL			4
#define URL_T			5
#define HOST			6
#define SCIENTIFIC		7
#define VERSIONNUMBER	8
#define NUMPARTHWORD	9
#define PARTHWORD		10
#define ASCIIPARTHWORD	11
#define SPACE			12
#define TAG_T			13
#define PROTOCOL		14
#define NUMHWORD		15
#define ASCIIHWORD		16
#define HWORD			17
#define URLPATH			18
#define FILEPATH		19
#define DECIMAL			20
#define SIGNEDINT		21
#define UNSIGNEDINT		22
#define XMLENTITY		23

#define LASTNUM			23
/*
 * Short alias for each token category, in category-number order.
 * Entry 0 is an empty placeholder, so the array has 24 entries.
 */
static const char *const tok_alias[] = {
	"",
	"asciiword",
	"word",
	"numword",
	"email",
	"url",
	"host",
	"sfloat",
	"version",
	"hword_numpart",
	"hword_part",
	"hword_asciipart",
	"blank",
	"tag",
	"protocol",
	"numhword",
	"asciihword",
	"hword",
	"url_path",
	"file",
	"float",
	"int",
	"uint",
	"entity"
};
/*
 * Human-readable description for each token category, in the same order
 * as tok_alias[].  Entry 0 is an empty placeholder.
 */
static const char *const lex_descr[] = {
	"",
	"Word, all ASCII",
	"Word, all letters",
	"Word, letters and digits",
	"Email address",
	"URL",
	"Host",
	"Scientific notation",
	"Version number",
	"Hyphenated word part, letters and digits",
	"Hyphenated word part, all letters",
	"Hyphenated word part, all ASCII",
	"Space symbols",
	"XML tag",
	"Protocol head",
	"Hyphenated word, letters and digits",
	"Hyphenated word, all ASCII",
	"Hyphenated word, all letters",
	"URL path",
	"File or path name",
	"Decimal notation",
	"Signed integer",
	"Unsigned integer",
	"XML entity"
};
/*
 * Parser states.
 *
 * TPS_Base is 0 and the remaining values follow in declaration order;
 * TPS_Null is a fake value that only marks the end of the list.
 */
typedef enum
{
	TPS_Base = 0,
	TPS_InNumWord,
	TPS_InAsciiWord,
	TPS_InWord,
	TPS_InUnsignedInt,
	TPS_InSignedIntFirst,
	TPS_InSignedInt,
	TPS_InSpace,
	TPS_InUDecimalFirst,
	TPS_InUDecimal,
	TPS_InDecimalFirst,
	TPS_InDecimal,
	TPS_InVerVersion,
	TPS_InSVerVersion,
	TPS_InVersionFirst,
	TPS_InVersion,
	TPS_InMantissaFirst,
	TPS_InMantissaSign,
	TPS_InMantissa,
	TPS_InXMLEntityFirst,
	TPS_InXMLEntity,
	TPS_InXMLEntityNumFirst,
	TPS_InXMLEntityNum,
	TPS_InXMLEntityHexNumFirst,
	TPS_InXMLEntityHexNum,
	TPS_InXMLEntityEnd,
	TPS_InTagFirst,
	TPS_InXMLBegin,
	TPS_InTagCloseFirst,
	TPS_InTagName,
	TPS_InTagBeginEnd,
	TPS_InTag,
	TPS_InTagEscapeK,
	TPS_InTagEscapeKK,
	TPS_InTagBackSleshed,
	TPS_InTagEnd,
	TPS_InCommentFirst,
	TPS_InCommentLast,
	TPS_InComment,
	TPS_InCloseCommentFirst,
	TPS_InCloseCommentLast,
	TPS_InCommentEnd,
	TPS_InHostFirstDomain,
	TPS_InHostDomainSecond,
	TPS_InHostDomain,
	TPS_InPortFirst,
	TPS_InPort,
	TPS_InHostFirstAN,
	TPS_InHost,
	TPS_InEmail,
	TPS_InFileFirst,
	TPS_InFileTwiddle,
	TPS_InPathFirst,
	TPS_InPathFirstFirst,
	TPS_InPathSecond,
	TPS_InFile,
	TPS_InFileNext,
	TPS_InURLPathFirst,
	TPS_InURLPathStart,
	TPS_InURLPath,
	TPS_InFURL,
	TPS_InProtocolFirst,
	TPS_InProtocolSecond,
	TPS_InProtocolEnd,
	TPS_InHyphenAsciiWordFirst,
	TPS_InHyphenAsciiWord,
	TPS_InHyphenWordFirst,
	TPS_InHyphenWord,
	TPS_InHyphenNumWordFirst,
	TPS_InHyphenNumWord,
	TPS_InHyphenDigitLookahead,
	TPS_InParseHyphen,
	TPS_InParseHyphenHyphen,
	TPS_InHyphenWordPart,
	TPS_InHyphenAsciiWordPart,
	TPS_InHyphenNumWordPart,
	TPS_InHyphenUnsignedInt,
	TPS_Null					/* last state (fake value) */
} TParserState;
/* forward declaration */
struct TParser;

/*
 * Character test callback: any of the p_is* functions except p_iseq
 * (which takes an extra argument and so does not fit this signature).
 */
typedef int (*TParserCharTest) (struct TParser *);

/* Special handler callback for special cases. */
typedef void (*TParserSpecial) (struct TParser *);
204 typedef struct
206 TParserCharTest isclass;
207 char c;
208 uint16 flags;
209 TParserState tostate;
210 int type;
211 TParserSpecial special;
212 } TParserStateActionItem;
/*
 * Flag bits in TParserStateActionItem.flags.
 *
 * A_NEXT is zero, i.e. "no flag bits set"; it cannot be detected with a
 * bitwise AND.  The others are distinct single bits and may be OR'ed.
 */
#define A_NEXT		0x0000
#define A_BINGO		0x0001
#define A_POP		0x0002
#define A_PUSH		0x0004
#define A_RERUN		0x0008
#define A_CLEAR		0x0010
#define A_MERGE		0x0020
#define A_CLRALL	0x0040
224 typedef struct TParserPosition
226 int posbyte; /* position of parser in bytes */
227 int poschar; /* position of parser in characters */
228 int charlen; /* length of current char */
229 int lenbytetoken; /* length of token-so-far in bytes */
230 int lenchartoken; /* and in chars */
231 TParserState state;
232 struct TParserPosition *prev;
233 const TParserStateActionItem *pushedAtAction;
234 } TParserPosition;
236 typedef struct TParser
238 /* string and position information */
239 char *str; /* multibyte string */
240 int lenstr; /* length of mbstring */
241 #ifdef USE_WIDE_UPPER_LOWER
242 wchar_t *wstr; /* wide character string */
243 int lenwstr; /* length of wsting */
244 #endif
246 /* State of parse */
247 int charmaxlen;
248 bool usewide;
249 TParserPosition *state;
250 bool ignore;
251 bool wanthost;
253 /* silly char */
254 char c;
256 /* out */
257 char *token;
258 int lenbytetoken;
259 int lenchartoken;
260 int type;
261 } TParser;
264 /* forward decls here */
265 static bool TParserGet(TParser *prs);
268 static TParserPosition *
269 newTParserPosition(TParserPosition *prev)
271 TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
273 if (prev)
274 memcpy(res, prev, sizeof(TParserPosition));
275 else
276 memset(res, 0, sizeof(TParserPosition));
278 res->prev = prev;
280 res->pushedAtAction = NULL;
282 return res;
285 static TParser *
286 TParserInit(char *str, int len)
288 TParser *prs = (TParser *) palloc0(sizeof(TParser));
290 prs->charmaxlen = pg_database_encoding_max_length();
291 prs->str = str;
292 prs->lenstr = len;
294 #ifdef USE_WIDE_UPPER_LOWER
297 * Use wide char code only when max encoding length > 1.
299 if (prs->charmaxlen > 1)
301 prs->usewide = true;
302 prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
303 prs->lenwstr = char2wchar(prs->wstr, prs->lenstr + 1,
304 prs->str, prs->lenstr);
306 else
307 #endif
308 prs->usewide = false;
310 prs->state = newTParserPosition(NULL);
311 prs->state->state = TPS_Base;
313 #ifdef WPARSER_TRACE
314 fprintf(stderr, "parsing \"%.*s\"\n", len, str);
315 #endif
317 return prs;
320 static void
321 TParserClose(TParser *prs)
323 while (prs->state)
325 TParserPosition *ptr = prs->state->prev;
327 pfree(prs->state);
328 prs->state = ptr;
331 #ifdef USE_WIDE_UPPER_LOWER
332 if (prs->wstr)
333 pfree(prs->wstr);
334 #endif
336 pfree(prs);
/*
 * Character-type support functions, equivalent to the is* macros, but
 * working with any possible encoding and locale.  Note that with a
 * multibyte encoding and the C locale, the isw* functions may fail or
 * give wrong results.  Note 2: a multibyte encoding together with the
 * C locale is often used for Asian languages.
 */
347 #ifdef USE_WIDE_UPPER_LOWER
/*
 * p_iswhat(type) defines p_is<type>() and its negation p_isnot<type>().
 *
 * In wide mode with the C locale the low 8 bits of the wide character are
 * passed to is<type>(); in wide mode otherwise the character goes to
 * isw<type>(); in non-wide mode the current byte goes to is<type>().
 *
 * The wide character is converted to wint_t by value, as p_isalnum() and
 * p_isalpha() do.  Reading it through a (wint_t *) cast would access a
 * wchar_t object through an incompatible pointer type.
 */
#define p_iswhat(type) \
static int \
p_is##type(TParser *prs) { \
	Assert( prs->state ); \
	if ( prs->usewide ) \
	{ \
		if ( lc_ctype_is_c() ) \
			return is##type( 0xff & *( prs->wstr + prs->state->poschar) ); \
 \
		return isw##type( (wint_t) *( prs->wstr + prs->state->poschar ) ); \
	} \
 \
	return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
} \
 \
static int \
p_isnot##type(TParser *prs) { \
	return !p_is##type(prs); \
}
369 static int
370 p_isalnum(TParser *prs)
372 Assert(prs->state);
374 if (prs->usewide)
376 if (lc_ctype_is_c())
378 unsigned int c = *(prs->wstr + prs->state->poschar);
381 * any non-ascii symbol with multibyte encoding with C-locale is
382 * an alpha character
384 if (c > 0x7f)
385 return 1;
387 return isalnum(0xff & c);
390 return iswalnum((wint_t) *(prs->wstr + prs->state->poschar));
393 return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
395 static int
396 p_isnotalnum(TParser *prs)
398 return !p_isalnum(prs);
401 static int
402 p_isalpha(TParser *prs)
404 Assert(prs->state);
406 if (prs->usewide)
408 if (lc_ctype_is_c())
410 unsigned int c = *(prs->wstr + prs->state->poschar);
413 * any non-ascii symbol with multibyte encoding with C-locale is
414 * an alpha character
416 if (c > 0x7f)
417 return 1;
419 return isalpha(0xff & c);
422 return iswalpha((wint_t) *(prs->wstr + prs->state->poschar));
425 return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
428 static int
429 p_isnotalpha(TParser *prs)
431 return !p_isalpha(prs);
434 /* p_iseq should be used only for ascii symbols */
436 static int
437 p_iseq(TParser *prs, char c)
439 Assert(prs->state);
440 return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
442 #else /* USE_WIDE_UPPER_LOWER */
/*
 * p_iswhat(type) defines p_is<type>() and its negation p_isnot<type>().
 * This variant (no USE_WIDE_UPPER_LOWER) always passes the current byte,
 * as unsigned char, to is<type>().
 */
#define p_iswhat(type) \
static int \
p_is##type(TParser *prs) { \
	Assert( prs->state ); \
	return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
} \
 \
static int \
p_isnot##type(TParser *prs) { \
	return !p_is##type(prs); \
}
457 static int
458 p_iseq(TParser *prs, char c)
460 Assert(prs->state);
461 return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
464 p_iswhat(alnum)
465 p_iswhat(alpha)
466 #endif /* USE_WIDE_UPPER_LOWER */
468 p_iswhat(digit)
469 p_iswhat(lower)
470 p_iswhat(print)
471 p_iswhat(punct)
472 p_iswhat(space)
473 p_iswhat(upper)
474 p_iswhat(xdigit)
476 static int
477 p_isEOF(TParser *prs)
479 Assert(prs->state);
480 return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
483 static int
484 p_iseqC(TParser *prs)
486 return p_iseq(prs, prs->c);
489 static int
490 p_isneC(TParser *prs)
492 return !p_iseq(prs, prs->c);
495 static int
496 p_isascii(TParser *prs)
498 return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
501 static int
502 p_isasclet(TParser *prs)
504 return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
/*
 * deliberately suppress unused-function complaints for the above
 *
 * This function exists only so that every generated p_is*()/p_isnot*()
 * function is referenced.  It passes NULL to functions that dereference
 * their argument, so it must never actually be called.
 */
void		_make_compiler_happy(void);
void
_make_compiler_happy(void)
{
	p_isalnum(NULL);
	p_isnotalnum(NULL);
	p_isalpha(NULL);
	p_isnotalpha(NULL);
	p_isdigit(NULL);
	p_isnotdigit(NULL);
	p_islower(NULL);
	p_isnotlower(NULL);
	p_isprint(NULL);
	p_isnotprint(NULL);
	p_ispunct(NULL);
	p_isnotpunct(NULL);
	p_isspace(NULL);
	p_isnotspace(NULL);
	p_isupper(NULL);
	p_isnotupper(NULL);
	p_isxdigit(NULL);
	p_isnotxdigit(NULL);
	p_isEOF(NULL);
	p_iseqC(NULL);
	p_isneC(NULL);
}
537 static void
538 SpecialTags(TParser *prs)
540 switch (prs->state->lenchartoken)
542 case 8: /* </script */
543 if (pg_strncasecmp(prs->token, "</script", 8) == 0)
544 prs->ignore = false;
545 break;
546 case 7: /* <script || </style */
547 if (pg_strncasecmp(prs->token, "</style", 7) == 0)
548 prs->ignore = false;
549 else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
550 prs->ignore = true;
551 break;
552 case 6: /* <style */
553 if (pg_strncasecmp(prs->token, "<style", 6) == 0)
554 prs->ignore = true;
555 break;
556 default:
557 break;
561 static void
562 SpecialFURL(TParser *prs)
564 prs->wanthost = true;
565 prs->state->posbyte -= prs->state->lenbytetoken;
566 prs->state->poschar -= prs->state->lenchartoken;
569 static void
570 SpecialHyphen(TParser *prs)
572 prs->state->posbyte -= prs->state->lenbytetoken;
573 prs->state->poschar -= prs->state->lenchartoken;
576 static void
577 SpecialVerVersion(TParser *prs)
579 prs->state->posbyte -= prs->state->lenbytetoken;
580 prs->state->poschar -= prs->state->lenchartoken;
581 prs->state->lenbytetoken = 0;
582 prs->state->lenchartoken = 0;
585 static int
586 p_isstophost(TParser *prs)
588 if (prs->wanthost)
590 prs->wanthost = false;
591 return 1;
593 return 0;
596 static int
597 p_isignore(TParser *prs)
599 return (prs->ignore) ? 1 : 0;
602 static int
603 p_ishost(TParser *prs)
605 TParser *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte);
606 int res = 0;
608 if (TParserGet(tmpprs) && tmpprs->type == HOST)
610 prs->state->posbyte += tmpprs->lenbytetoken;
611 prs->state->poschar += tmpprs->lenchartoken;
612 prs->state->lenbytetoken += tmpprs->lenbytetoken;
613 prs->state->lenchartoken += tmpprs->lenchartoken;
614 prs->state->charlen = tmpprs->state->charlen;
615 res = 1;
617 TParserClose(tmpprs);
619 return res;
622 static int
623 p_isURLPath(TParser *prs)
625 TParser *tmpprs = TParserInit(prs->str + prs->state->posbyte, prs->lenstr - prs->state->posbyte);
626 int res = 0;
628 tmpprs->state = newTParserPosition(tmpprs->state);
629 tmpprs->state->state = TPS_InFileFirst;
631 if (TParserGet(tmpprs) && (tmpprs->type == URLPATH || tmpprs->type == FILEPATH))
633 prs->state->posbyte += tmpprs->lenbytetoken;
634 prs->state->poschar += tmpprs->lenchartoken;
635 prs->state->lenbytetoken += tmpprs->lenbytetoken;
636 prs->state->lenchartoken += tmpprs->lenchartoken;
637 prs->state->charlen = tmpprs->state->charlen;
638 res = 1;
640 TParserClose(tmpprs);
642 return res;
646 * Table of state/action of parser
649 static const TParserStateActionItem actionTPS_Base[] = {
650 {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
651 {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
652 {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
653 {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
654 {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
655 {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
656 {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
657 {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
658 {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
659 {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
660 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
661 {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
662 {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
666 static const TParserStateActionItem actionTPS_InNumWord[] = {
667 {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
668 {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
669 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
670 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
671 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
672 {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
673 {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
676 static const TParserStateActionItem actionTPS_InAsciiWord[] = {
677 {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
678 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
679 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
680 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
681 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
682 {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
683 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
684 {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
685 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
686 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
687 {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
688 {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
689 {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
692 static const TParserStateActionItem actionTPS_InWord[] = {
693 {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
694 {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
695 {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
696 {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
697 {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
700 static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
701 {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
702 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
703 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
704 {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
705 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
706 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
707 {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
708 {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
709 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
710 {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
713 static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
714 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
715 {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
716 {NULL, 0, A_POP, TPS_Null, 0, NULL}
719 static const TParserStateActionItem actionTPS_InSignedInt[] = {
720 {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
721 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
722 {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
723 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
724 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
725 {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
728 static const TParserStateActionItem actionTPS_InSpace[] = {
729 {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
730 {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
731 {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
732 {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
733 {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
734 {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
735 {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
736 {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
737 {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
740 static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
741 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
742 {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
743 {NULL, 0, A_POP, TPS_Null, 0, NULL}
746 static const TParserStateActionItem actionTPS_InUDecimal[] = {
747 {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
748 {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
749 {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
750 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
751 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
752 {NULL, 0, A_BINGO, TPS_Base, DECIMAL, NULL}
755 static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
756 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
757 {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
758 {NULL, 0, A_POP, TPS_Null, 0, NULL}
761 static const TParserStateActionItem actionTPS_InDecimal[] = {
762 {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
763 {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
764 {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
765 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
766 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
767 {NULL, 0, A_BINGO, TPS_Base, DECIMAL, NULL}
770 static const TParserStateActionItem actionTPS_InVerVersion[] = {
771 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
772 {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
773 {NULL, 0, A_POP, TPS_Null, 0, NULL}
776 static const TParserStateActionItem actionTPS_InSVerVersion[] = {
777 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
778 {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
779 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
783 static const TParserStateActionItem actionTPS_InVersionFirst[] = {
784 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
785 {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
786 {NULL, 0, A_POP, TPS_Null, 0, NULL}
789 static const TParserStateActionItem actionTPS_InVersion[] = {
790 {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
791 {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
792 {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
793 {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
796 static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
797 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
798 {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
799 {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
800 {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
801 {NULL, 0, A_POP, TPS_Null, 0, NULL}
804 static const TParserStateActionItem actionTPS_InMantissaSign[] = {
805 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
806 {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
807 {NULL, 0, A_POP, TPS_Null, 0, NULL}
810 static const TParserStateActionItem actionTPS_InMantissa[] = {
811 {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
812 {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
813 {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
816 static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
817 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
818 {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
819 {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
820 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
821 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
822 {NULL, 0, A_POP, TPS_Null, 0, NULL}
825 static const TParserStateActionItem actionTPS_InXMLEntity[] = {
826 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
827 {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
828 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
829 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
830 {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
831 {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
832 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
833 {NULL, 0, A_POP, TPS_Null, 0, NULL}
836 static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
837 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
838 {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
839 {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
840 {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
841 {NULL, 0, A_POP, TPS_Null, 0, NULL}
844 static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
845 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
846 {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
847 {NULL, 0, A_POP, TPS_Null, 0, NULL}
850 static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
851 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
852 {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
853 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
854 {NULL, 0, A_POP, TPS_Null, 0, NULL}
857 static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
858 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
859 {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
860 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
861 {NULL, 0, A_POP, TPS_Null, 0, NULL}
864 static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
865 {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
868 static const TParserStateActionItem actionTPS_InTagFirst[] = {
869 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
870 {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
871 {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
872 {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
873 {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
874 {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
875 {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
876 {NULL, 0, A_POP, TPS_Null, 0, NULL}
879 static const TParserStateActionItem actionTPS_InXMLBegin[] = {
880 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
881 /* <?xml ... */
882 /* XXX do we wants states for the m and l ? Right now this accepts <?xZ */
883 {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
884 {NULL, 0, A_POP, TPS_Null, 0, NULL}
887 static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
888 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
889 {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
890 {NULL, 0, A_POP, TPS_Null, 0, NULL}
893 static const TParserStateActionItem actionTPS_InTagName[] = {
894 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
895 /* <br/> case */
896 {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
897 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
898 {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
899 {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
900 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
901 {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
902 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
903 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
904 {NULL, 0, A_POP, TPS_Null, 0, NULL}
907 static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
908 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
909 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
910 {NULL, 0, A_POP, TPS_Null, 0, NULL}
913 static const TParserStateActionItem actionTPS_InTag[] = {
914 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
915 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
916 {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
917 {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
918 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
919 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
920 {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
921 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
922 {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
923 {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
924 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
925 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
926 {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
927 {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
928 {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
929 {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
930 {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
931 {NULL, 0, A_POP, TPS_Null, 0, NULL}
934 static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
935 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
936 {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
937 {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
938 {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
941 static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
942 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
943 {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
944 {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
945 {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
948 static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
949 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
950 {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
953 static const TParserStateActionItem actionTPS_InTagEnd[] = {
954 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
957 static const TParserStateActionItem actionTPS_InCommentFirst[] = {
958 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
959 {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
960 /* <!DOCTYPE ...> */
961 {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
962 {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
963 {NULL, 0, A_POP, TPS_Null, 0, NULL}
966 static const TParserStateActionItem actionTPS_InCommentLast[] = {
967 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
968 {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
969 {NULL, 0, A_POP, TPS_Null, 0, NULL}
972 static const TParserStateActionItem actionTPS_InComment[] = {
973 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
974 {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
975 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
978 static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
979 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
980 {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
981 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
984 static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
985 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
986 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
987 {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
988 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
991 static const TParserStateActionItem actionTPS_InCommentEnd[] = {
992 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
995 static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
996 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
997 {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
998 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
999 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1002 static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
1003 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1004 {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1005 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1006 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1007 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1008 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1009 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1012 static const TParserStateActionItem actionTPS_InHostDomain[] = {
1013 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1014 {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1015 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1016 {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1017 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1018 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1019 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1020 {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1021 {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1022 {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1023 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1026 static const TParserStateActionItem actionTPS_InPortFirst[] = {
1027 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1028 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1029 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1032 static const TParserStateActionItem actionTPS_InPort[] = {
1033 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1034 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1035 {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1036 {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1037 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1040 static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
1041 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1042 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1043 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1044 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1047 static const TParserStateActionItem actionTPS_InHost[] = {
1048 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1049 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1050 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1051 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1052 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1053 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1054 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1057 static const TParserStateActionItem actionTPS_InEmail[] = {
1058 {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1059 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1062 static const TParserStateActionItem actionTPS_InFileFirst[] = {
1063 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1064 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1065 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1066 {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1067 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1068 {p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL},
1069 {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1070 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1073 static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
1074 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1075 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1076 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1077 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1078 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1079 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1082 static const TParserStateActionItem actionTPS_InPathFirst[] = {
1083 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1084 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1085 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1086 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1087 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1088 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1089 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1092 static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
1093 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1094 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1095 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1096 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1099 static const TParserStateActionItem actionTPS_InPathSecond[] = {
1100 {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1101 {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1102 {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1103 {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1104 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1107 static const TParserStateActionItem actionTPS_InFile[] = {
1108 {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1109 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1110 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1111 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1112 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1113 {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1114 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1115 {p_iseqC, '?', A_PUSH, TPS_InURLPathFirst, 0, NULL},
1116 {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1119 static const TParserStateActionItem actionTPS_InFileNext[] = {
1120 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1121 {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1122 {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1123 {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1124 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1127 static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
1128 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1129 {p_iseqC, '"', A_POP, TPS_Null, 0, NULL},
1130 {p_iseqC, '\'', A_POP, TPS_Null, 0, NULL},
1131 {p_isnotspace, 0, A_CLEAR, TPS_InURLPath, 0, NULL},
1132 {NULL, 0, A_POP, TPS_Null, 0, NULL},
1135 static const TParserStateActionItem actionTPS_InURLPathStart[] = {
1136 {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1139 static const TParserStateActionItem actionTPS_InURLPath[] = {
1140 {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1141 {p_iseqC, '"', A_BINGO, TPS_Base, URLPATH, NULL},
1142 {p_iseqC, '\'', A_BINGO, TPS_Base, URLPATH, NULL},
1143 {p_isnotspace, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1144 {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1147 static const TParserStateActionItem actionTPS_InFURL[] = {
1148 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1149 {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
1150 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1153 static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
1154 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1155 {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1156 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1159 static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
1160 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1161 {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1162 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1165 static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
1166 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1169 static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
1170 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1171 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1172 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1173 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1174 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1177 static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
1178 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
1179 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1180 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1181 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1182 {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1183 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
1186 static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
1187 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1188 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1189 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1190 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1193 static const TParserStateActionItem actionTPS_InHyphenWord[] = {
1194 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
1195 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1196 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1197 {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1198 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
1201 static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
1202 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1203 {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1204 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1205 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1208 static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
1209 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
1210 {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1211 {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1212 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
1215 static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
1216 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1217 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1218 {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1219 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1222 static const TParserStateActionItem actionTPS_InParseHyphen[] = {
1223 {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1224 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1225 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1226 {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1227 {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1228 {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1231 static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
1232 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1233 {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1234 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1237 static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
1238 {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1239 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1240 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1241 {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1244 static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
1245 {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1246 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1247 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1248 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1249 {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1252 static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
1253 {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1254 {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1255 {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1258 static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
1259 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1260 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1261 {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1262 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1267 * main table of per-state parser actions
1269 typedef struct
1271 const TParserStateActionItem *action; /* the actual state info */
1272 TParserState state; /* only for Assert crosscheck */
1273 #ifdef WPARSER_TRACE
1274 const char *state_name; /* only for debug printout */
1275 #endif
1276 } TParserStateAction;
1278 #ifdef WPARSER_TRACE
1279 #define TPARSERSTATEACTION(state) \
1280 { CppConcat(action,state), state, CppAsString(state) }
1281 #else
1282 #define TPARSERSTATEACTION(state) \
1283 { CppConcat(action,state), state }
1284 #endif
1287 * order must be the same as in typedef enum {} TParserState!!
1290 static const TParserStateAction Actions[] = {
1291 TPARSERSTATEACTION(TPS_Base),
1292 TPARSERSTATEACTION(TPS_InNumWord),
1293 TPARSERSTATEACTION(TPS_InAsciiWord),
1294 TPARSERSTATEACTION(TPS_InWord),
1295 TPARSERSTATEACTION(TPS_InUnsignedInt),
1296 TPARSERSTATEACTION(TPS_InSignedIntFirst),
1297 TPARSERSTATEACTION(TPS_InSignedInt),
1298 TPARSERSTATEACTION(TPS_InSpace),
1299 TPARSERSTATEACTION(TPS_InUDecimalFirst),
1300 TPARSERSTATEACTION(TPS_InUDecimal),
1301 TPARSERSTATEACTION(TPS_InDecimalFirst),
1302 TPARSERSTATEACTION(TPS_InDecimal),
1303 TPARSERSTATEACTION(TPS_InVerVersion),
1304 TPARSERSTATEACTION(TPS_InSVerVersion),
1305 TPARSERSTATEACTION(TPS_InVersionFirst),
1306 TPARSERSTATEACTION(TPS_InVersion),
1307 TPARSERSTATEACTION(TPS_InMantissaFirst),
1308 TPARSERSTATEACTION(TPS_InMantissaSign),
1309 TPARSERSTATEACTION(TPS_InMantissa),
1310 TPARSERSTATEACTION(TPS_InXMLEntityFirst),
1311 TPARSERSTATEACTION(TPS_InXMLEntity),
1312 TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
1313 TPARSERSTATEACTION(TPS_InXMLEntityNum),
1314 TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
1315 TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
1316 TPARSERSTATEACTION(TPS_InXMLEntityEnd),
1317 TPARSERSTATEACTION(TPS_InTagFirst),
1318 TPARSERSTATEACTION(TPS_InXMLBegin),
1319 TPARSERSTATEACTION(TPS_InTagCloseFirst),
1320 TPARSERSTATEACTION(TPS_InTagName),
1321 TPARSERSTATEACTION(TPS_InTagBeginEnd),
1322 TPARSERSTATEACTION(TPS_InTag),
1323 TPARSERSTATEACTION(TPS_InTagEscapeK),
1324 TPARSERSTATEACTION(TPS_InTagEscapeKK),
1325 TPARSERSTATEACTION(TPS_InTagBackSleshed),
1326 TPARSERSTATEACTION(TPS_InTagEnd),
1327 TPARSERSTATEACTION(TPS_InCommentFirst),
1328 TPARSERSTATEACTION(TPS_InCommentLast),
1329 TPARSERSTATEACTION(TPS_InComment),
1330 TPARSERSTATEACTION(TPS_InCloseCommentFirst),
1331 TPARSERSTATEACTION(TPS_InCloseCommentLast),
1332 TPARSERSTATEACTION(TPS_InCommentEnd),
1333 TPARSERSTATEACTION(TPS_InHostFirstDomain),
1334 TPARSERSTATEACTION(TPS_InHostDomainSecond),
1335 TPARSERSTATEACTION(TPS_InHostDomain),
1336 TPARSERSTATEACTION(TPS_InPortFirst),
1337 TPARSERSTATEACTION(TPS_InPort),
1338 TPARSERSTATEACTION(TPS_InHostFirstAN),
1339 TPARSERSTATEACTION(TPS_InHost),
1340 TPARSERSTATEACTION(TPS_InEmail),
1341 TPARSERSTATEACTION(TPS_InFileFirst),
1342 TPARSERSTATEACTION(TPS_InFileTwiddle),
1343 TPARSERSTATEACTION(TPS_InPathFirst),
1344 TPARSERSTATEACTION(TPS_InPathFirstFirst),
1345 TPARSERSTATEACTION(TPS_InPathSecond),
1346 TPARSERSTATEACTION(TPS_InFile),
1347 TPARSERSTATEACTION(TPS_InFileNext),
1348 TPARSERSTATEACTION(TPS_InURLPathFirst),
1349 TPARSERSTATEACTION(TPS_InURLPathStart),
1350 TPARSERSTATEACTION(TPS_InURLPath),
1351 TPARSERSTATEACTION(TPS_InFURL),
1352 TPARSERSTATEACTION(TPS_InProtocolFirst),
1353 TPARSERSTATEACTION(TPS_InProtocolSecond),
1354 TPARSERSTATEACTION(TPS_InProtocolEnd),
1355 TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
1356 TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
1357 TPARSERSTATEACTION(TPS_InHyphenWordFirst),
1358 TPARSERSTATEACTION(TPS_InHyphenWord),
1359 TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
1360 TPARSERSTATEACTION(TPS_InHyphenNumWord),
1361 TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
1362 TPARSERSTATEACTION(TPS_InParseHyphen),
1363 TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
1364 TPARSERSTATEACTION(TPS_InHyphenWordPart),
1365 TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
1366 TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
1367 TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
1371 static bool
1372 TParserGet(TParser *prs)
1374 const TParserStateActionItem *item = NULL;
1376 Assert(prs->state);
1378 if (prs->state->posbyte >= prs->lenstr)
1379 return false;
1381 prs->token = prs->str + prs->state->posbyte;
1382 prs->state->pushedAtAction = NULL;
1384 /* look at string */
1385 while (prs->state->posbyte <= prs->lenstr)
1387 if (prs->state->posbyte == prs->lenstr)
1388 prs->state->charlen = 0;
1389 else
1390 prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1391 pg_mblen(prs->str + prs->state->posbyte);
1393 Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1394 Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1395 Assert(Actions[prs->state->state].state == prs->state->state);
1397 if (prs->state->pushedAtAction)
1399 /* After a POP, pick up at the next test */
1400 item = prs->state->pushedAtAction + 1;
1401 prs->state->pushedAtAction = NULL;
1403 else
1405 item = Actions[prs->state->state].action;
1406 Assert(item != NULL);
1409 /* find action by character class */
1410 while (item->isclass)
1412 prs->c = item->c;
1413 if (item->isclass(prs) != 0)
1414 break;
1415 item++;
1418 #ifdef WPARSER_TRACE
1420 TParserPosition *ptr;
1422 fprintf(stderr, "state ");
1423 /* indent according to stack depth */
1424 for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1425 fprintf(stderr, " ");
1426 fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1427 if (prs->state->posbyte < prs->lenstr)
1428 fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1429 else
1430 fprintf(stderr, "at EOF");
1431 fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1432 (int) (item - Actions[prs->state->state].action),
1433 (item->flags & A_BINGO) ? " BINGO" : "",
1434 (item->flags & A_POP) ? " POP" : "",
1435 (item->flags & A_PUSH) ? " PUSH" : "",
1436 (item->flags & A_RERUN) ? " RERUN" : "",
1437 (item->flags & A_CLEAR) ? " CLEAR" : "",
1438 (item->flags & A_MERGE) ? " MERGE" : "",
1439 (item->flags & A_CLRALL) ? " CLRALL" : "",
1440 (item->tostate != TPS_Null) ? " tostate " : "",
1441 (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1442 (item->type > 0) ? " type " : "",
1443 tok_alias[item->type]);
1445 #endif
1447 /* call special handler if exists */
1448 if (item->special)
1449 item->special(prs);
1451 /* BINGO, token is found */
1452 if (item->flags & A_BINGO)
1454 Assert(item->type > 0);
1455 prs->lenbytetoken = prs->state->lenbytetoken;
1456 prs->lenchartoken = prs->state->lenchartoken;
1457 prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1458 prs->type = item->type;
1461 /* do various actions by flags */
1462 if (item->flags & A_POP)
1463 { /* pop stored state in stack */
1464 TParserPosition *ptr = prs->state->prev;
1466 pfree(prs->state);
1467 prs->state = ptr;
1468 Assert(prs->state);
1470 else if (item->flags & A_PUSH)
1471 { /* push (store) state in stack */
1472 prs->state->pushedAtAction = item; /* remember where we push */
1473 prs->state = newTParserPosition(prs->state);
1475 else if (item->flags & A_CLEAR)
1476 { /* clear previous pushed state */
1477 TParserPosition *ptr;
1479 Assert(prs->state->prev);
1480 ptr = prs->state->prev->prev;
1481 pfree(prs->state->prev);
1482 prs->state->prev = ptr;
1484 else if (item->flags & A_CLRALL)
1485 { /* clear all previous pushed state */
1486 TParserPosition *ptr;
1488 while (prs->state->prev)
1490 ptr = prs->state->prev->prev;
1491 pfree(prs->state->prev);
1492 prs->state->prev = ptr;
1495 else if (item->flags & A_MERGE)
1496 { /* merge posinfo with current and pushed state */
1497 TParserPosition *ptr = prs->state;
1499 Assert(prs->state->prev);
1500 prs->state = prs->state->prev;
1502 prs->state->posbyte = ptr->posbyte;
1503 prs->state->poschar = ptr->poschar;
1504 prs->state->charlen = ptr->charlen;
1505 prs->state->lenbytetoken = ptr->lenbytetoken;
1506 prs->state->lenchartoken = ptr->lenchartoken;
1507 pfree(ptr);
1510 /* set new state if pointed */
1511 if (item->tostate != TPS_Null)
1512 prs->state->state = item->tostate;
1514 /* check for go away */
1515 if ((item->flags & A_BINGO) ||
1516 (prs->state->posbyte >= prs->lenstr &&
1517 (item->flags & A_RERUN) == 0))
1518 break;
1520 /* go to beginning of loop if we should rerun or we just restore state */
1521 if (item->flags & (A_RERUN | A_POP))
1522 continue;
1524 /* move forward */
1525 if (prs->state->charlen)
1527 prs->state->posbyte += prs->state->charlen;
1528 prs->state->lenbytetoken += prs->state->charlen;
1529 prs->state->poschar++;
1530 prs->state->lenchartoken++;
1534 return (item && (item->flags & A_BINGO)) ? true : false;
1537 Datum
1538 prsd_lextype(PG_FUNCTION_ARGS)
1540 LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1541 int i;
1543 for (i = 1; i <= LASTNUM; i++)
1545 descr[i - 1].lexid = i;
1546 descr[i - 1].alias = pstrdup(tok_alias[i]);
1547 descr[i - 1].descr = pstrdup(lex_descr[i]);
1550 descr[LASTNUM].lexid = 0;
1552 PG_RETURN_POINTER(descr);
1555 Datum
1556 prsd_start(PG_FUNCTION_ARGS)
1558 PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
1561 Datum
1562 prsd_nexttoken(PG_FUNCTION_ARGS)
1564 TParser *p = (TParser *) PG_GETARG_POINTER(0);
1565 char **t = (char **) PG_GETARG_POINTER(1);
1566 int *tlen = (int *) PG_GETARG_POINTER(2);
1568 if (!TParserGet(p))
1569 PG_RETURN_INT32(0);
1571 *t = p->token;
1572 *tlen = p->lenbytetoken;
1574 PG_RETURN_INT32(p->type);
1577 Datum
1578 prsd_end(PG_FUNCTION_ARGS)
1580 TParser *p = (TParser *) PG_GETARG_POINTER(0);
1582 TParserClose(p);
1583 PG_RETURN_VOID();
/*
 * Token-type classification used by the headline code below.  Each macro
 * takes a token type number and yields nonzero when the type belongs to
 * the class.
 */
#define LEAVETOKEN(x)	( (x)==SPACE )
#define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define ENDPUNCTOKEN(x) ( (x)==SPACE )

#define TS_IDIGNORE(x)	( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
/* types flagged "replace" by the headline markers (XML variant omits TAG_T) */
#define HLIDIGNORE(x)	( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define XMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* types that are not counted as words when measuring a headline */
#define NONWORDTOKEN(x) ( (x)==SPACE || HLIDIGNORE(x) )
/* types a headline should preferably not begin or end with */
#define NOENDTOKEN(x)	( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || \
						  (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || \
						  TS_IDIGNORE(x) )
1596 typedef struct
1598 HeadlineWordEntry *words;
1599 int len;
1600 } hlCheck;
1602 static bool
1603 checkcondition_HL(void *checkval, QueryOperand *val)
1605 int i;
1607 for (i = 0; i < ((hlCheck *) checkval)->len; i++)
1609 if (((hlCheck *) checkval)->words[i].item == val)
1610 return true;
1612 return false;
1616 static bool
1617 hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
1619 int i,
1621 QueryItem *item = GETQUERY(query);
1622 int pos = *p;
1624 *q = -1;
1625 *p = 0x7fffffff;
1627 for (j = 0; j < query->size; j++)
1629 if (item->type != QI_VAL)
1631 item++;
1632 continue;
1634 for (i = pos; i < prs->curwords; i++)
1636 if (prs->words[i].item == &item->operand)
1638 if (i > *q)
1639 *q = i;
1640 break;
1643 item++;
1646 if (*q < 0)
1647 return false;
1649 item = GETQUERY(query);
1650 for (j = 0; j < query->size; j++)
1652 if (item->type != QI_VAL)
1654 item++;
1655 continue;
1657 for (i = *q; i >= pos; i--)
1659 if (prs->words[i].item == &item->operand)
1661 if (i < *p)
1662 *p = i;
1663 break;
1666 item++;
1669 if (*p <= *q)
1671 hlCheck ch;
1673 ch.words = &(prs->words[*p]);
1674 ch.len = *q - *p + 1;
1675 if (TS_execute(GETQUERY(query), &ch, false, checkcondition_HL))
1676 return true;
1677 else
1679 (*p)++;
1680 return hlCover(prs, query, p, q);
1684 return false;
1687 static void
1688 mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
1690 int i;
1692 for (i = startpos; i <= endpos; i++)
1694 if (prs->words[i].item)
1695 prs->words[i].selected = 1;
1696 if (highlight == 0)
1698 if (HLIDIGNORE(prs->words[i].type))
1699 prs->words[i].replace = 1;
1701 else
1703 if (XMLHLIDIGNORE(prs->words[i].type))
1704 prs->words[i].replace = 1;
1707 prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
1711 typedef struct
1713 int4 startpos;
1714 int4 endpos;
1715 int4 poslen;
1716 int4 curlen;
1717 int2 in;
1718 int2 excluded;
1719 } CoverPos;
1721 static void
1722 get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
1723 int *curlen, int *poslen, int max_words)
1725 int i;
1726 /* Objective: Generate a fragment of words between startpos and endpos
1727 * such that it has at most max_words and both ends has query words.
1728 * If the startpos and endpos are the endpoints of the cover and the
1729 * cover has fewer words than max_words, then this function should
1730 * just return the cover
1732 /* first move startpos to an item */
1733 for(i = *startpos; i <= *endpos; i++)
1735 *startpos = i;
1736 if (prs->words[i].item && !prs->words[i].repeated)
1737 break;
1739 /* cut endpos to have only max_words */
1740 *curlen = 0;
1741 *poslen = 0;
1742 for(i = *startpos; i <= *endpos && *curlen < max_words; i++)
1744 if (!NONWORDTOKEN(prs->words[i].type))
1745 *curlen += 1;
1746 if (prs->words[i].item && !prs->words[i].repeated)
1747 *poslen += 1;
1749 /* if the cover was cut then move back endpos to a query item */
1750 if (*endpos > i)
1752 *endpos = i;
1753 for(i = *endpos; i >= *startpos; i --)
1755 *endpos = i;
1756 if (prs->words[i].item && !prs->words[i].repeated)
1757 break;
1758 if (!NONWORDTOKEN(prs->words[i].type))
1759 *curlen -= 1;
1764 static void
1765 mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
1766 int shortword, int min_words,
1767 int max_words, int max_fragments)
1769 int4 poslen, curlen, i, f, num_f = 0;
1770 int4 stretch, maxstretch, posmarker;
1772 int4 startpos = 0,
1773 endpos = 0,
1774 p = 0,
1775 q = 0;
1777 int4 numcovers = 0,
1778 maxcovers = 32;
1780 int4 minI, minwords, maxitems;
1781 CoverPos *covers;
1783 covers = palloc(maxcovers * sizeof(CoverPos));
1785 /* get all covers */
1786 while (hlCover(prs, query, &p, &q))
1788 startpos = p;
1789 endpos = q;
1791 /* Break the cover into smaller fragments such that each fragment
1792 * has at most max_words. Also ensure that each end of the fragment
1793 * is a query word. This will allow us to stretch the fragment in
1794 * either direction
1797 while (startpos <= endpos)
1799 get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
1800 if (numcovers >= maxcovers)
1802 maxcovers *= 2;
1803 covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
1805 covers[numcovers].startpos = startpos;
1806 covers[numcovers].endpos = endpos;
1807 covers[numcovers].curlen = curlen;
1808 covers[numcovers].poslen = poslen;
1809 covers[numcovers].in = 0;
1810 covers[numcovers].excluded = 0;
1811 numcovers ++;
1812 startpos = endpos + 1;
1813 endpos = q;
1815 /* move p to generate the next cover */
1816 p++;
1819 /* choose best covers */
1820 for (f = 0; f < max_fragments; f++)
1822 maxitems = 0;
1823 minwords = 0x7fffffff;
1824 minI = -1;
1825 /* Choose the cover that contains max items.
1826 * In case of tie choose the one with smaller
1827 * number of words.
1829 for (i = 0; i < numcovers; i ++)
1831 if (!covers[i].in && !covers[i].excluded &&
1832 (maxitems < covers[i].poslen || (maxitems == covers[i].poslen
1833 && minwords > covers[i].curlen)))
1835 maxitems = covers[i].poslen;
1836 minwords = covers[i].curlen;
1837 minI = i;
1840 /* if a cover was found mark it */
1841 if (minI >= 0)
1843 covers[minI].in = 1;
1844 /* adjust the size of cover */
1845 startpos = covers[minI].startpos;
1846 endpos = covers[minI].endpos;
1847 curlen = covers[minI].curlen;
1848 /* stretch the cover if cover size is lower than max_words */
1849 if (curlen < max_words)
1851 /* divide the stretch on both sides of cover */
1852 maxstretch = (max_words - curlen)/2;
1853 /* first stretch the startpos
1854 * stop stretching if
1855 * 1. we hit the beginning of document
1856 * 2. exceed maxstretch
1857 * 3. we hit an already marked fragment
1859 stretch = 0;
1860 posmarker = startpos;
1861 for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
1863 if (!NONWORDTOKEN(prs->words[i].type))
1865 curlen ++;
1866 stretch ++;
1868 posmarker = i;
1870 /* cut back startpos till we find a non short token */
1871 for (i = posmarker; i < startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++)
1873 if (!NONWORDTOKEN(prs->words[i].type))
1874 curlen --;
1876 startpos = i;
1877 /* now stretch the endpos as much as possible*/
1878 posmarker = endpos;
1879 for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
1881 if (!NONWORDTOKEN(prs->words[i].type))
1882 curlen ++;
1883 posmarker = i;
1885 /* cut back endpos till we find a non-short token */
1886 for ( i = posmarker; i > endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--)
1888 if (!NONWORDTOKEN(prs->words[i].type))
1889 curlen --;
1891 endpos = i;
1893 covers[minI].startpos = startpos;
1894 covers[minI].endpos = endpos;
1895 covers[minI].curlen = curlen;
1896 /* Mark the chosen fragments (covers) */
1897 mark_fragment(prs, highlight, startpos, endpos);
1898 num_f ++;
1899 /* exclude overlapping covers */
1900 for (i = 0; i < numcovers; i ++)
1902 if (i != minI && ( (covers[i].startpos >= covers[minI].startpos && covers[i].startpos <= covers[minI].endpos) || (covers[i].endpos >= covers[minI].startpos && covers[i].endpos <= covers[minI].endpos)))
1903 covers[i].excluded = 1;
1906 else
1907 break;
1910 /* show at least min_words we have not marked anything*/
1911 if (num_f <= 0)
1913 startpos = endpos = curlen = 0;
1914 for (i = 0; i < prs->curwords && curlen < min_words; i++)
1916 if (!NONWORDTOKEN(prs->words[i].type))
1917 curlen++;
1918 endpos = i;
1920 mark_fragment(prs, highlight, startpos, endpos);
1922 pfree(covers);
1924 static void
1925 mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
1926 int shortword, int min_words, int max_words)
1928 int p = 0,
1929 q = 0;
1930 int bestb = -1,
1931 beste = -1;
1932 int bestlen = -1;
1933 int pose = 0,
1934 posb,
1935 poslen,
1936 curlen;
1938 int i;
1940 if (highlight == 0)
1942 while (hlCover(prs, query, &p, &q))
1944 /* find cover len in words */
1945 curlen = 0;
1946 poslen = 0;
1947 for (i = p; i <= q && curlen < max_words; i++)
1949 if (!NONWORDTOKEN(prs->words[i].type))
1950 curlen++;
1951 if (prs->words[i].item && !prs->words[i].repeated)
1952 poslen++;
1953 pose = i;
1956 if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
1958 /* best already finded, so try one more cover */
1959 p++;
1960 continue;
1963 posb = p;
1964 if (curlen < max_words)
1965 { /* find good end */
1966 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
1968 if (i != q)
1970 if (!NONWORDTOKEN(prs->words[i].type))
1971 curlen++;
1972 if (prs->words[i].item && !prs->words[i].repeated)
1973 poslen++;
1975 pose = i;
1976 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1977 continue;
1978 if (curlen >= min_words)
1979 break;
1981 if (curlen < min_words && i >= prs->curwords)
1982 { /* got end of text and our cover is shoter
1983 * than min_words */
1984 for (i = p; i >= 0; i--)
1986 if (!NONWORDTOKEN(prs->words[i].type))
1987 curlen++;
1988 if (prs->words[i].item && !prs->words[i].repeated)
1989 poslen++;
1990 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
1991 continue;
1992 if (curlen >= min_words)
1993 break;
1995 posb = (i >= 0) ? i : 0;
1998 else
1999 { /* shorter cover :((( */
2000 for (; curlen > min_words; i--)
2002 if (!NONWORDTOKEN(prs->words[i].type))
2003 curlen--;
2004 if (prs->words[i].item && !prs->words[i].repeated)
2005 poslen--;
2006 pose = i;
2007 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
2008 continue;
2009 break;
2013 if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
2014 (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
2015 (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
2017 bestb = posb;
2018 beste = pose;
2019 bestlen = poslen;
2022 p++;
2025 if (bestlen < 0)
2027 curlen = 0;
2028 for (i = 0; i < prs->curwords && curlen < min_words; i++)
2030 if (!NONWORDTOKEN(prs->words[i].type))
2031 curlen++;
2032 pose = i;
2034 bestb = 0;
2035 beste = pose;
2038 else
2040 bestb = 0;
2041 beste = prs->curwords - 1;
2044 for (i = bestb; i <= beste; i++)
2046 if (prs->words[i].item)
2047 prs->words[i].selected = 1;
2048 if (highlight == 0)
2050 if (HLIDIGNORE(prs->words[i].type))
2051 prs->words[i].replace = 1;
2053 else
2055 if (XMLHLIDIGNORE(prs->words[i].type))
2056 prs->words[i].replace = 1;
2059 prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2064 Datum
2065 prsd_headline(PG_FUNCTION_ARGS)
2067 HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
2068 List *prsoptions = (List *) PG_GETARG_POINTER(1);
2069 TSQuery query = PG_GETARG_TSQUERY(2);
2071 /* from opt + start and and tag */
2072 int min_words = 15;
2073 int max_words = 35;
2074 int shortword = 3;
2075 int max_fragments = 0;
2076 int highlight = 0;
2077 ListCell *l;
2079 /* config */
2080 prs->startsel = NULL;
2081 prs->stopsel = NULL;
2082 foreach(l, prsoptions)
2084 DefElem *defel = (DefElem *) lfirst(l);
2085 char *val = defGetString(defel);
2087 if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
2088 max_words = pg_atoi(val, sizeof(int32), 0);
2089 else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
2090 min_words = pg_atoi(val, sizeof(int32), 0);
2091 else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
2092 shortword = pg_atoi(val, sizeof(int32), 0);
2093 else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
2094 max_fragments = pg_atoi(val, sizeof(int32), 0);
2095 else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2096 prs->startsel = pstrdup(val);
2097 else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
2098 prs->stopsel = pstrdup(val);
2099 else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2100 prs->fragdelim = pstrdup(val);
2101 else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
2102 highlight = (pg_strcasecmp(val, "1") == 0 ||
2103 pg_strcasecmp(val, "on") == 0 ||
2104 pg_strcasecmp(val, "true") == 0 ||
2105 pg_strcasecmp(val, "t") == 0 ||
2106 pg_strcasecmp(val, "y") == 0 ||
2107 pg_strcasecmp(val, "yes") == 0);
2108 else
2109 ereport(ERROR,
2110 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2111 errmsg("unrecognized headline parameter: \"%s\"",
2112 defel->defname)));
2115 if (highlight == 0)
2117 if (min_words >= max_words)
2118 ereport(ERROR,
2119 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2120 errmsg("MinWords should be less than MaxWords")));
2121 if (min_words <= 0)
2122 ereport(ERROR,
2123 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2124 errmsg("MinWords should be positive")));
2125 if (shortword < 0)
2126 ereport(ERROR,
2127 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2128 errmsg("ShortWord should be >= 0")));
2129 if (max_fragments < 0)
2130 ereport(ERROR,
2131 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2132 errmsg("MaxFragments should be >= 0")));
2135 if (max_fragments == 0)
2136 /* call the default headline generator */
2137 mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
2138 else
2139 mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);
2141 if (!prs->startsel)
2142 prs->startsel = pstrdup("<b>");
2143 if (!prs->stopsel)
2144 prs->stopsel = pstrdup("</b>");
2145 if (!prs->fragdelim)
2146 prs->fragdelim = pstrdup(" ... ");
2147 prs->startsellen = strlen(prs->startsel);
2148 prs->stopsellen = strlen(prs->stopsel);
2149 prs->fragdelimlen = strlen(prs->fragdelim);
2151 PG_RETURN_POINTER(prs);