1 /*-------------------------------------------------------------------------
4 * Default text search parser
6 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
12 *-------------------------------------------------------------------------
17 #include "commands/defrem.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_public.h"
20 #include "tsearch/ts_type.h"
21 #include "tsearch/ts_utils.h"
22 #include "utils/builtins.h"
25 /* Define me to enable tracing of parser behavior */
26 /* #define WPARSER_TRACE */
29 /* Output token categories */
38 #define VERSIONNUMBER 8
39 #define NUMPARTHWORD 9
41 #define ASCIIPARTHWORD 11
52 #define UNSIGNEDINT 22
57 static const char *const tok_alias
[] = {
84 static const char *const lex_descr
[] = {
88 "Word, letters and digits",
92 "Scientific notation",
94 "Hyphenated word part, letters and digits",
95 "Hyphenated word part, all letters",
96 "Hyphenated word part, all ASCII",
100 "Hyphenated word, letters and digits",
101 "Hyphenated word, all ASCII",
102 "Hyphenated word, all letters",
121 TPS_InSignedIntFirst
,
135 TPS_InXMLEntityFirst
,
137 TPS_InXMLEntityNumFirst
,
139 TPS_InXMLEntityHexNumFirst
,
140 TPS_InXMLEntityHexNum
,
150 TPS_InTagBackSleshed
,
155 TPS_InCloseCommentFirst
,
156 TPS_InCloseCommentLast
,
158 TPS_InHostFirstDomain
,
159 TPS_InHostDomainSecond
,
169 TPS_InPathFirstFirst
,
178 TPS_InProtocolSecond
,
180 TPS_InHyphenAsciiWordFirst
,
181 TPS_InHyphenAsciiWord
,
182 TPS_InHyphenWordFirst
,
184 TPS_InHyphenNumWordFirst
,
186 TPS_InHyphenDigitLookahead
,
188 TPS_InParseHyphenHyphen
,
189 TPS_InHyphenWordPart
,
190 TPS_InHyphenAsciiWordPart
,
191 TPS_InHyphenNumWordPart
,
192 TPS_InHyphenUnsignedInt
,
193 TPS_Null
/* last state (fake value) */
196 /* forward declaration */
199 typedef int (*TParserCharTest
) (struct TParser
*); /* any p_is* functions
201 typedef void (*TParserSpecial
) (struct TParser
*); /* special handler for
202 * special cases... */
206 TParserCharTest isclass
;
209 TParserState tostate
;
211 TParserSpecial special
;
212 } TParserStateActionItem
;
214 /* Flag bits in TParserStateActionItem.flags */
215 #define A_NEXT 0x0000
216 #define A_BINGO 0x0001
218 #define A_PUSH 0x0004
219 #define A_RERUN 0x0008
220 #define A_CLEAR 0x0010
221 #define A_MERGE 0x0020
222 #define A_CLRALL 0x0040
224 typedef struct TParserPosition
226 int posbyte
; /* position of parser in bytes */
227 int poschar
; /* position of parser in characters */
228 int charlen
; /* length of current char */
229 int lenbytetoken
; /* length of token-so-far in bytes */
230 int lenchartoken
; /* and in chars */
232 struct TParserPosition
*prev
;
233 const TParserStateActionItem
*pushedAtAction
;
236 typedef struct TParser
238 /* string and position information */
239 char *str
; /* multibyte string */
240 int lenstr
; /* length of mbstring */
241 #ifdef USE_WIDE_UPPER_LOWER
242 wchar_t *wstr
; /* wide character string */
243 int lenwstr
; /* length of the wide string */
249 TParserPosition
*state
;
264 /* forward decls here */
265 static bool TParserGet(TParser
*prs
);
268 static TParserPosition
*
269 newTParserPosition(TParserPosition
*prev
)
271 TParserPosition
*res
= (TParserPosition
*) palloc(sizeof(TParserPosition
));
274 memcpy(res
, prev
, sizeof(TParserPosition
));
276 memset(res
, 0, sizeof(TParserPosition
));
280 res
->pushedAtAction
= NULL
;
286 TParserInit(char *str
, int len
)
288 TParser
*prs
= (TParser
*) palloc0(sizeof(TParser
));
290 prs
->charmaxlen
= pg_database_encoding_max_length();
294 #ifdef USE_WIDE_UPPER_LOWER
297 * Use wide char code only when max encoding length > 1.
299 if (prs
->charmaxlen
> 1)
302 prs
->wstr
= (wchar_t *) palloc(sizeof(wchar_t) * (prs
->lenstr
+ 1));
303 prs
->lenwstr
= char2wchar(prs
->wstr
, prs
->lenstr
+ 1,
304 prs
->str
, prs
->lenstr
);
308 prs
->usewide
= false;
310 prs
->state
= newTParserPosition(NULL
);
311 prs
->state
->state
= TPS_Base
;
314 fprintf(stderr
, "parsing \"%.*s\"\n", len
, str
);
321 TParserClose(TParser
*prs
)
325 TParserPosition
*ptr
= prs
->state
->prev
;
331 #ifdef USE_WIDE_UPPER_LOWER
/*
 * Character-type support functions, equivalent to the is* macros, but
 * working with any encoding and locale.
 *
 * Note 1: with a multibyte encoding and the C locale, the isw* functions
 * may fail or give a wrong result.
 * Note 2: a multibyte encoding together with the C locale is often used
 * for Asian languages.
 */
347 #ifdef USE_WIDE_UPPER_LOWER
/*
 * p_iswhat(type) expands to two static character-class tests on the
 * parser's current character:
 *	 p_is<type>(prs)	- result of the class test
 *	 p_isnot<type>(prs) - logical negation of p_is<type>(prs)
 *
 * When prs->usewide is set the wide string is consulted: if
 * lc_ctype_is_c() is true the low byte of the wide character is handed to
 * is<type>(), otherwise the wide character is handed to isw<type>().
 * When usewide is not set, the byte at the current byte position is
 * handed to is<type>().
 *
 * The wide character is converted to wint_t by value, the same way
 * p_isalnum() and p_isalpha() do it.  Dereferencing through a (wint_t *)
 * cast would read the wrong number of bytes on any platform where wint_t
 * and wchar_t differ in size.
 */
#define p_iswhat(type) \
static int \
p_is##type(TParser *prs) { \
	Assert( prs->state ); \
	if ( prs->usewide ) \
	{ \
		if ( lc_ctype_is_c() ) \
			return is##type( 0xff & *( prs->wstr + prs->state->poschar) ); \
 \
		return isw##type( (wint_t) *( prs->wstr + prs->state->poschar ) ); \
	} \
 \
	return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
} \
 \
static int \
p_isnot##type(TParser *prs) { \
	return !p_is##type(prs); \
}
370 p_isalnum(TParser
*prs
)
378 unsigned int c
= *(prs
->wstr
+ prs
->state
->poschar
);
381 * any non-ascii symbol with multibyte encoding with C-locale is
387 return isalnum(0xff & c
);
390 return iswalnum((wint_t) *(prs
->wstr
+ prs
->state
->poschar
));
393 return isalnum(*(unsigned char *) (prs
->str
+ prs
->state
->posbyte
));
396 p_isnotalnum(TParser
*prs
)
398 return !p_isalnum(prs
);
402 p_isalpha(TParser
*prs
)
410 unsigned int c
= *(prs
->wstr
+ prs
->state
->poschar
);
413 * any non-ascii symbol with multibyte encoding with C-locale is
419 return isalpha(0xff & c
);
422 return iswalpha((wint_t) *(prs
->wstr
+ prs
->state
->poschar
));
425 return isalpha(*(unsigned char *) (prs
->str
+ prs
->state
->posbyte
));
429 p_isnotalpha(TParser
*prs
)
431 return !p_isalpha(prs
);
434 /* p_iseq should be used only for ascii symbols */
437 p_iseq(TParser
*prs
, char c
)
440 return ((prs
->state
->charlen
== 1 && *(prs
->str
+ prs
->state
->posbyte
) == c
)) ? 1 : 0;
442 #else /* USE_WIDE_UPPER_LOWER */
/*
 * p_iswhat(type), single-byte build: expands to
 *	 p_is<type>(prs)	- is<type>() applied to the byte at the current
 *						  byte position, taken as unsigned char
 *	 p_isnot<type>(prs) - logical negation of p_is<type>(prs)
 */
#define p_iswhat(type) \
static int \
p_is##type(TParser *prs) { \
	unsigned char ch; \
 \
	Assert( prs->state ); \
	ch = (unsigned char) prs->str[prs->state->posbyte]; \
	return is##type( ch ); \
} \
 \
static int \
p_isnot##type(TParser *prs) { \
	return p_is##type(prs) ? 0 : 1; \
}
458 p_iseq(TParser
*prs
, char c
)
461 return (*(prs
->str
+ prs
->state
->posbyte
) == c
) ? 1 : 0;
466 #endif /* USE_WIDE_UPPER_LOWER */
477 p_isEOF(TParser
*prs
)
480 return (prs
->state
->posbyte
== prs
->lenstr
|| prs
->state
->charlen
== 0) ? 1 : 0;
484 p_iseqC(TParser
*prs
)
486 return p_iseq(prs
, prs
->c
);
490 p_isneC(TParser
*prs
)
492 return !p_iseq(prs
, prs
->c
);
496 p_isascii(TParser
*prs
)
498 return (prs
->state
->charlen
== 1 && isascii((unsigned char) *(prs
->str
+ prs
->state
->posbyte
))) ? 1 : 0;
502 p_isasclet(TParser
*prs
)
504 return (p_isascii(prs
) && p_isalpha(prs
)) ? 1 : 0;
508 /* deliberately suppress unused-function complaints for the above */
509 void _make_compiler_happy(void);
511 _make_compiler_happy(void)
538 SpecialTags(TParser
*prs
)
540 switch (prs
->state
->lenchartoken
)
542 case 8: /* </script */
543 if (pg_strncasecmp(prs
->token
, "</script", 8) == 0)
546 case 7: /* <script || </style */
547 if (pg_strncasecmp(prs
->token
, "</style", 7) == 0)
549 else if (pg_strncasecmp(prs
->token
, "<script", 7) == 0)
553 if (pg_strncasecmp(prs
->token
, "<style", 6) == 0)
562 SpecialFURL(TParser
*prs
)
564 prs
->wanthost
= true;
565 prs
->state
->posbyte
-= prs
->state
->lenbytetoken
;
566 prs
->state
->poschar
-= prs
->state
->lenchartoken
;
570 SpecialHyphen(TParser
*prs
)
572 prs
->state
->posbyte
-= prs
->state
->lenbytetoken
;
573 prs
->state
->poschar
-= prs
->state
->lenchartoken
;
577 SpecialVerVersion(TParser
*prs
)
579 prs
->state
->posbyte
-= prs
->state
->lenbytetoken
;
580 prs
->state
->poschar
-= prs
->state
->lenchartoken
;
581 prs
->state
->lenbytetoken
= 0;
582 prs
->state
->lenchartoken
= 0;
586 p_isstophost(TParser
*prs
)
590 prs
->wanthost
= false;
597 p_isignore(TParser
*prs
)
599 return (prs
->ignore
) ? 1 : 0;
603 p_ishost(TParser
*prs
)
605 TParser
*tmpprs
= TParserInit(prs
->str
+ prs
->state
->posbyte
, prs
->lenstr
- prs
->state
->posbyte
);
608 if (TParserGet(tmpprs
) && tmpprs
->type
== HOST
)
610 prs
->state
->posbyte
+= tmpprs
->lenbytetoken
;
611 prs
->state
->poschar
+= tmpprs
->lenchartoken
;
612 prs
->state
->lenbytetoken
+= tmpprs
->lenbytetoken
;
613 prs
->state
->lenchartoken
+= tmpprs
->lenchartoken
;
614 prs
->state
->charlen
= tmpprs
->state
->charlen
;
617 TParserClose(tmpprs
);
623 p_isURLPath(TParser
*prs
)
625 TParser
*tmpprs
= TParserInit(prs
->str
+ prs
->state
->posbyte
, prs
->lenstr
- prs
->state
->posbyte
);
628 tmpprs
->state
= newTParserPosition(tmpprs
->state
);
629 tmpprs
->state
->state
= TPS_InFileFirst
;
631 if (TParserGet(tmpprs
) && (tmpprs
->type
== URLPATH
|| tmpprs
->type
== FILEPATH
))
633 prs
->state
->posbyte
+= tmpprs
->lenbytetoken
;
634 prs
->state
->poschar
+= tmpprs
->lenchartoken
;
635 prs
->state
->lenbytetoken
+= tmpprs
->lenbytetoken
;
636 prs
->state
->lenchartoken
+= tmpprs
->lenchartoken
;
637 prs
->state
->charlen
= tmpprs
->state
->charlen
;
640 TParserClose(tmpprs
);
646 * Table of state/action of parser
649 static const TParserStateActionItem actionTPS_Base
[] = {
650 {p_isEOF
, 0, A_NEXT
, TPS_Null
, 0, NULL
},
651 {p_iseqC
, '<', A_PUSH
, TPS_InTagFirst
, 0, NULL
},
652 {p_isignore
, 0, A_NEXT
, TPS_InSpace
, 0, NULL
},
653 {p_isasclet
, 0, A_NEXT
, TPS_InAsciiWord
, 0, NULL
},
654 {p_isalpha
, 0, A_NEXT
, TPS_InWord
, 0, NULL
},
655 {p_isdigit
, 0, A_NEXT
, TPS_InUnsignedInt
, 0, NULL
},
656 {p_iseqC
, '-', A_PUSH
, TPS_InSignedIntFirst
, 0, NULL
},
657 {p_iseqC
, '+', A_PUSH
, TPS_InSignedIntFirst
, 0, NULL
},
658 {p_iseqC
, '&', A_PUSH
, TPS_InXMLEntityFirst
, 0, NULL
},
659 {p_iseqC
, '~', A_PUSH
, TPS_InFileTwiddle
, 0, NULL
},
660 {p_iseqC
, '/', A_PUSH
, TPS_InFileFirst
, 0, NULL
},
661 {p_iseqC
, '.', A_PUSH
, TPS_InPathFirstFirst
, 0, NULL
},
662 {NULL
, 0, A_NEXT
, TPS_InSpace
, 0, NULL
}
666 static const TParserStateActionItem actionTPS_InNumWord
[] = {
667 {p_isEOF
, 0, A_BINGO
, TPS_Base
, NUMWORD
, NULL
},
668 {p_isalnum
, 0, A_NEXT
, TPS_InNumWord
, 0, NULL
},
669 {p_iseqC
, '@', A_PUSH
, TPS_InEmail
, 0, NULL
},
670 {p_iseqC
, '/', A_PUSH
, TPS_InFileFirst
, 0, NULL
},
671 {p_iseqC
, '.', A_PUSH
, TPS_InFileNext
, 0, NULL
},
672 {p_iseqC
, '-', A_PUSH
, TPS_InHyphenNumWordFirst
, 0, NULL
},
673 {NULL
, 0, A_BINGO
, TPS_Base
, NUMWORD
, NULL
}
676 static const TParserStateActionItem actionTPS_InAsciiWord
[] = {
677 {p_isEOF
, 0, A_BINGO
, TPS_Base
, ASCIIWORD
, NULL
},
678 {p_isasclet
, 0, A_NEXT
, TPS_Null
, 0, NULL
},
679 {p_iseqC
, '.', A_PUSH
, TPS_InHostFirstDomain
, 0, NULL
},
680 {p_iseqC
, '.', A_PUSH
, TPS_InFileNext
, 0, NULL
},
681 {p_iseqC
, '-', A_PUSH
, TPS_InHostFirstAN
, 0, NULL
},
682 {p_iseqC
, '-', A_PUSH
, TPS_InHyphenAsciiWordFirst
, 0, NULL
},
683 {p_iseqC
, '@', A_PUSH
, TPS_InEmail
, 0, NULL
},
684 {p_iseqC
, ':', A_PUSH
, TPS_InProtocolFirst
, 0, NULL
},
685 {p_iseqC
, '/', A_PUSH
, TPS_InFileFirst
, 0, NULL
},
686 {p_isdigit
, 0, A_PUSH
, TPS_InHost
, 0, NULL
},
687 {p_isdigit
, 0, A_NEXT
, TPS_InNumWord
, 0, NULL
},
688 {p_isalpha
, 0, A_NEXT
, TPS_InWord
, 0, NULL
},
689 {NULL
, 0, A_BINGO
, TPS_Base
, ASCIIWORD
, NULL
}
692 static const TParserStateActionItem actionTPS_InWord
[] = {
693 {p_isEOF
, 0, A_BINGO
, TPS_Base
, WORD_T
, NULL
},
694 {p_isalpha
, 0, A_NEXT
, TPS_Null
, 0, NULL
},
695 {p_isdigit
, 0, A_NEXT
, TPS_InNumWord
, 0, NULL
},
696 {p_iseqC
, '-', A_PUSH
, TPS_InHyphenWordFirst
, 0, NULL
},
697 {NULL
, 0, A_BINGO
, TPS_Base
, WORD_T
, NULL
}
700 static const TParserStateActionItem actionTPS_InUnsignedInt
[] = {
701 {p_isEOF
, 0, A_BINGO
, TPS_Base
, UNSIGNEDINT
, NULL
},
702 {p_isdigit
, 0, A_NEXT
, TPS_Null
, 0, NULL
},
703 {p_iseqC
, '.', A_PUSH
, TPS_InHostFirstDomain
, 0, NULL
},
704 {p_iseqC
, '.', A_PUSH
, TPS_InUDecimalFirst
, 0, NULL
},
705 {p_iseqC
, 'e', A_PUSH
, TPS_InMantissaFirst
, 0, NULL
},
706 {p_iseqC
, 'E', A_PUSH
, TPS_InMantissaFirst
, 0, NULL
},
707 {p_isasclet
, 0, A_PUSH
, TPS_InHost
, 0, NULL
},
708 {p_isalpha
, 0, A_NEXT
, TPS_InNumWord
, 0, NULL
},
709 {p_iseqC
, '/', A_PUSH
, TPS_InFileFirst
, 0, NULL
},
710 {NULL
, 0, A_BINGO
, TPS_Base
, UNSIGNEDINT
, NULL
}
713 static const TParserStateActionItem actionTPS_InSignedIntFirst
[] = {
714 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
715 {p_isdigit
, 0, A_NEXT
| A_CLEAR
, TPS_InSignedInt
, 0, NULL
},
716 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
719 static const TParserStateActionItem actionTPS_InSignedInt
[] = {
720 {p_isEOF
, 0, A_BINGO
, TPS_Base
, SIGNEDINT
, NULL
},
721 {p_isdigit
, 0, A_NEXT
, TPS_Null
, 0, NULL
},
722 {p_iseqC
, '.', A_PUSH
, TPS_InDecimalFirst
, 0, NULL
},
723 {p_iseqC
, 'e', A_PUSH
, TPS_InMantissaFirst
, 0, NULL
},
724 {p_iseqC
, 'E', A_PUSH
, TPS_InMantissaFirst
, 0, NULL
},
725 {NULL
, 0, A_BINGO
, TPS_Base
, SIGNEDINT
, NULL
}
728 static const TParserStateActionItem actionTPS_InSpace
[] = {
729 {p_isEOF
, 0, A_BINGO
, TPS_Base
, SPACE
, NULL
},
730 {p_iseqC
, '<', A_BINGO
, TPS_Base
, SPACE
, NULL
},
731 {p_isignore
, 0, A_NEXT
, TPS_Null
, 0, NULL
},
732 {p_iseqC
, '-', A_BINGO
, TPS_Base
, SPACE
, NULL
},
733 {p_iseqC
, '+', A_BINGO
, TPS_Base
, SPACE
, NULL
},
734 {p_iseqC
, '&', A_BINGO
, TPS_Base
, SPACE
, NULL
},
735 {p_iseqC
, '/', A_BINGO
, TPS_Base
, SPACE
, NULL
},
736 {p_isnotalnum
, 0, A_NEXT
, TPS_InSpace
, 0, NULL
},
737 {NULL
, 0, A_BINGO
, TPS_Base
, SPACE
, NULL
}
740 static const TParserStateActionItem actionTPS_InUDecimalFirst
[] = {
741 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
742 {p_isdigit
, 0, A_CLEAR
, TPS_InUDecimal
, 0, NULL
},
743 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
746 static const TParserStateActionItem actionTPS_InUDecimal
[] = {
747 {p_isEOF
, 0, A_BINGO
, TPS_Base
, DECIMAL
, NULL
},
748 {p_isdigit
, 0, A_NEXT
, TPS_InUDecimal
, 0, NULL
},
749 {p_iseqC
, '.', A_PUSH
, TPS_InVersionFirst
, 0, NULL
},
750 {p_iseqC
, 'e', A_PUSH
, TPS_InMantissaFirst
, 0, NULL
},
751 {p_iseqC
, 'E', A_PUSH
, TPS_InMantissaFirst
, 0, NULL
},
752 {NULL
, 0, A_BINGO
, TPS_Base
, DECIMAL
, NULL
}
755 static const TParserStateActionItem actionTPS_InDecimalFirst
[] = {
756 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
757 {p_isdigit
, 0, A_CLEAR
, TPS_InDecimal
, 0, NULL
},
758 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
761 static const TParserStateActionItem actionTPS_InDecimal
[] = {
762 {p_isEOF
, 0, A_BINGO
, TPS_Base
, DECIMAL
, NULL
},
763 {p_isdigit
, 0, A_NEXT
, TPS_InDecimal
, 0, NULL
},
764 {p_iseqC
, '.', A_PUSH
, TPS_InVerVersion
, 0, NULL
},
765 {p_iseqC
, 'e', A_PUSH
, TPS_InMantissaFirst
, 0, NULL
},
766 {p_iseqC
, 'E', A_PUSH
, TPS_InMantissaFirst
, 0, NULL
},
767 {NULL
, 0, A_BINGO
, TPS_Base
, DECIMAL
, NULL
}
770 static const TParserStateActionItem actionTPS_InVerVersion
[] = {
771 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
772 {p_isdigit
, 0, A_RERUN
, TPS_InSVerVersion
, 0, SpecialVerVersion
},
773 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
776 static const TParserStateActionItem actionTPS_InSVerVersion
[] = {
777 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
778 {p_isdigit
, 0, A_BINGO
| A_CLRALL
, TPS_InUnsignedInt
, SPACE
, NULL
},
779 {NULL
, 0, A_NEXT
, TPS_Null
, 0, NULL
}
783 static const TParserStateActionItem actionTPS_InVersionFirst
[] = {
784 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
785 {p_isdigit
, 0, A_CLEAR
, TPS_InVersion
, 0, NULL
},
786 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
789 static const TParserStateActionItem actionTPS_InVersion
[] = {
790 {p_isEOF
, 0, A_BINGO
, TPS_Base
, VERSIONNUMBER
, NULL
},
791 {p_isdigit
, 0, A_NEXT
, TPS_InVersion
, 0, NULL
},
792 {p_iseqC
, '.', A_PUSH
, TPS_InVersionFirst
, 0, NULL
},
793 {NULL
, 0, A_BINGO
, TPS_Base
, VERSIONNUMBER
, NULL
}
796 static const TParserStateActionItem actionTPS_InMantissaFirst
[] = {
797 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
798 {p_isdigit
, 0, A_CLEAR
, TPS_InMantissa
, 0, NULL
},
799 {p_iseqC
, '+', A_NEXT
, TPS_InMantissaSign
, 0, NULL
},
800 {p_iseqC
, '-', A_NEXT
, TPS_InMantissaSign
, 0, NULL
},
801 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
804 static const TParserStateActionItem actionTPS_InMantissaSign
[] = {
805 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
806 {p_isdigit
, 0, A_CLEAR
, TPS_InMantissa
, 0, NULL
},
807 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
810 static const TParserStateActionItem actionTPS_InMantissa
[] = {
811 {p_isEOF
, 0, A_BINGO
, TPS_Base
, SCIENTIFIC
, NULL
},
812 {p_isdigit
, 0, A_NEXT
, TPS_InMantissa
, 0, NULL
},
813 {NULL
, 0, A_BINGO
, TPS_Base
, SCIENTIFIC
, NULL
}
816 static const TParserStateActionItem actionTPS_InXMLEntityFirst
[] = {
817 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
818 {p_iseqC
, '#', A_NEXT
, TPS_InXMLEntityNumFirst
, 0, NULL
},
819 {p_isasclet
, 0, A_NEXT
, TPS_InXMLEntity
, 0, NULL
},
820 {p_iseqC
, ':', A_NEXT
, TPS_InXMLEntity
, 0, NULL
},
821 {p_iseqC
, '_', A_NEXT
, TPS_InXMLEntity
, 0, NULL
},
822 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
825 static const TParserStateActionItem actionTPS_InXMLEntity
[] = {
826 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
827 {p_isalnum
, 0, A_NEXT
, TPS_InXMLEntity
, 0, NULL
},
828 {p_iseqC
, ':', A_NEXT
, TPS_InXMLEntity
, 0, NULL
},
829 {p_iseqC
, '_', A_NEXT
, TPS_InXMLEntity
, 0, NULL
},
830 {p_iseqC
, '.', A_NEXT
, TPS_InXMLEntity
, 0, NULL
},
831 {p_iseqC
, '-', A_NEXT
, TPS_InXMLEntity
, 0, NULL
},
832 {p_iseqC
, ';', A_NEXT
, TPS_InXMLEntityEnd
, 0, NULL
},
833 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
836 static const TParserStateActionItem actionTPS_InXMLEntityNumFirst
[] = {
837 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
838 {p_iseqC
, 'x', A_NEXT
, TPS_InXMLEntityHexNumFirst
, 0, NULL
},
839 {p_iseqC
, 'X', A_NEXT
, TPS_InXMLEntityHexNumFirst
, 0, NULL
},
840 {p_isdigit
, 0, A_NEXT
, TPS_InXMLEntityNum
, 0, NULL
},
841 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
844 static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst
[] = {
845 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
846 {p_isxdigit
, 0, A_NEXT
, TPS_InXMLEntityHexNum
, 0, NULL
},
847 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
850 static const TParserStateActionItem actionTPS_InXMLEntityNum
[] = {
851 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
852 {p_isdigit
, 0, A_NEXT
, TPS_InXMLEntityNum
, 0, NULL
},
853 {p_iseqC
, ';', A_NEXT
, TPS_InXMLEntityEnd
, 0, NULL
},
854 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
857 static const TParserStateActionItem actionTPS_InXMLEntityHexNum
[] = {
858 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
859 {p_isxdigit
, 0, A_NEXT
, TPS_InXMLEntityHexNum
, 0, NULL
},
860 {p_iseqC
, ';', A_NEXT
, TPS_InXMLEntityEnd
, 0, NULL
},
861 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
864 static const TParserStateActionItem actionTPS_InXMLEntityEnd
[] = {
865 {NULL
, 0, A_BINGO
| A_CLEAR
, TPS_Base
, XMLENTITY
, NULL
}
868 static const TParserStateActionItem actionTPS_InTagFirst
[] = {
869 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
870 {p_iseqC
, '/', A_PUSH
, TPS_InTagCloseFirst
, 0, NULL
},
871 {p_iseqC
, '!', A_PUSH
, TPS_InCommentFirst
, 0, NULL
},
872 {p_iseqC
, '?', A_PUSH
, TPS_InXMLBegin
, 0, NULL
},
873 {p_isasclet
, 0, A_PUSH
, TPS_InTagName
, 0, NULL
},
874 {p_iseqC
, ':', A_PUSH
, TPS_InTagName
, 0, NULL
},
875 {p_iseqC
, '_', A_PUSH
, TPS_InTagName
, 0, NULL
},
876 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
879 static const TParserStateActionItem actionTPS_InXMLBegin
[] = {
880 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
882 /* XXX do we wants states for the m and l ? Right now this accepts <?xZ */
883 {p_iseqC
, 'x', A_NEXT
, TPS_InTag
, 0, NULL
},
884 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
887 static const TParserStateActionItem actionTPS_InTagCloseFirst
[] = {
888 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
889 {p_isasclet
, 0, A_NEXT
, TPS_InTagName
, 0, NULL
},
890 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
893 static const TParserStateActionItem actionTPS_InTagName
[] = {
894 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
896 {p_iseqC
, '/', A_NEXT
, TPS_InTagBeginEnd
, 0, NULL
},
897 {p_iseqC
, '>', A_NEXT
, TPS_InTagEnd
, 0, SpecialTags
},
898 {p_isspace
, 0, A_NEXT
, TPS_InTag
, 0, SpecialTags
},
899 {p_isalnum
, 0, A_NEXT
, TPS_Null
, 0, NULL
},
900 {p_iseqC
, ':', A_NEXT
, TPS_Null
, 0, NULL
},
901 {p_iseqC
, '_', A_NEXT
, TPS_Null
, 0, NULL
},
902 {p_iseqC
, '.', A_NEXT
, TPS_Null
, 0, NULL
},
903 {p_iseqC
, '-', A_NEXT
, TPS_Null
, 0, NULL
},
904 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
907 static const TParserStateActionItem actionTPS_InTagBeginEnd
[] = {
908 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
909 {p_iseqC
, '>', A_NEXT
, TPS_InTagEnd
, 0, NULL
},
910 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
913 static const TParserStateActionItem actionTPS_InTag
[] = {
914 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
915 {p_iseqC
, '>', A_NEXT
, TPS_InTagEnd
, 0, SpecialTags
},
916 {p_iseqC
, '\'', A_NEXT
, TPS_InTagEscapeK
, 0, NULL
},
917 {p_iseqC
, '"', A_NEXT
, TPS_InTagEscapeKK
, 0, NULL
},
918 {p_isasclet
, 0, A_NEXT
, TPS_Null
, 0, NULL
},
919 {p_isdigit
, 0, A_NEXT
, TPS_Null
, 0, NULL
},
920 {p_iseqC
, '=', A_NEXT
, TPS_Null
, 0, NULL
},
921 {p_iseqC
, '-', A_NEXT
, TPS_Null
, 0, NULL
},
922 {p_iseqC
, '#', A_NEXT
, TPS_Null
, 0, NULL
},
923 {p_iseqC
, '/', A_NEXT
, TPS_Null
, 0, NULL
},
924 {p_iseqC
, ':', A_NEXT
, TPS_Null
, 0, NULL
},
925 {p_iseqC
, '.', A_NEXT
, TPS_Null
, 0, NULL
},
926 {p_iseqC
, '&', A_NEXT
, TPS_Null
, 0, NULL
},
927 {p_iseqC
, '?', A_NEXT
, TPS_Null
, 0, NULL
},
928 {p_iseqC
, '%', A_NEXT
, TPS_Null
, 0, NULL
},
929 {p_iseqC
, '~', A_NEXT
, TPS_Null
, 0, NULL
},
930 {p_isspace
, 0, A_NEXT
, TPS_Null
, 0, SpecialTags
},
931 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
934 static const TParserStateActionItem actionTPS_InTagEscapeK
[] = {
935 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
936 {p_iseqC
, '\\', A_PUSH
, TPS_InTagBackSleshed
, 0, NULL
},
937 {p_iseqC
, '\'', A_NEXT
, TPS_InTag
, 0, NULL
},
938 {NULL
, 0, A_NEXT
, TPS_InTagEscapeK
, 0, NULL
}
941 static const TParserStateActionItem actionTPS_InTagEscapeKK
[] = {
942 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
943 {p_iseqC
, '\\', A_PUSH
, TPS_InTagBackSleshed
, 0, NULL
},
944 {p_iseqC
, '"', A_NEXT
, TPS_InTag
, 0, NULL
},
945 {NULL
, 0, A_NEXT
, TPS_InTagEscapeKK
, 0, NULL
}
948 static const TParserStateActionItem actionTPS_InTagBackSleshed
[] = {
949 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
950 {NULL
, 0, A_MERGE
, TPS_Null
, 0, NULL
}
953 static const TParserStateActionItem actionTPS_InTagEnd
[] = {
954 {NULL
, 0, A_BINGO
| A_CLRALL
, TPS_Base
, TAG_T
, NULL
}
957 static const TParserStateActionItem actionTPS_InCommentFirst
[] = {
958 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
959 {p_iseqC
, '-', A_NEXT
, TPS_InCommentLast
, 0, NULL
},
961 {p_iseqC
, 'D', A_NEXT
, TPS_InTag
, 0, NULL
},
962 {p_iseqC
, 'd', A_NEXT
, TPS_InTag
, 0, NULL
},
963 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
966 static const TParserStateActionItem actionTPS_InCommentLast
[] = {
967 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
968 {p_iseqC
, '-', A_NEXT
, TPS_InComment
, 0, NULL
},
969 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
972 static const TParserStateActionItem actionTPS_InComment
[] = {
973 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
974 {p_iseqC
, '-', A_NEXT
, TPS_InCloseCommentFirst
, 0, NULL
},
975 {NULL
, 0, A_NEXT
, TPS_Null
, 0, NULL
}
978 static const TParserStateActionItem actionTPS_InCloseCommentFirst
[] = {
979 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
980 {p_iseqC
, '-', A_NEXT
, TPS_InCloseCommentLast
, 0, NULL
},
981 {NULL
, 0, A_NEXT
, TPS_InComment
, 0, NULL
}
984 static const TParserStateActionItem actionTPS_InCloseCommentLast
[] = {
985 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
986 {p_iseqC
, '-', A_NEXT
, TPS_Null
, 0, NULL
},
987 {p_iseqC
, '>', A_NEXT
, TPS_InCommentEnd
, 0, NULL
},
988 {NULL
, 0, A_NEXT
, TPS_InComment
, 0, NULL
}
991 static const TParserStateActionItem actionTPS_InCommentEnd
[] = {
992 {NULL
, 0, A_BINGO
| A_CLRALL
, TPS_Base
, TAG_T
, NULL
}
995 static const TParserStateActionItem actionTPS_InHostFirstDomain
[] = {
996 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
997 {p_isasclet
, 0, A_NEXT
, TPS_InHostDomainSecond
, 0, NULL
},
998 {p_isdigit
, 0, A_NEXT
, TPS_InHost
, 0, NULL
},
999 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
1002 static const TParserStateActionItem actionTPS_InHostDomainSecond
[] = {
1003 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1004 {p_isasclet
, 0, A_NEXT
, TPS_InHostDomain
, 0, NULL
},
1005 {p_isdigit
, 0, A_PUSH
, TPS_InHost
, 0, NULL
},
1006 {p_iseqC
, '-', A_PUSH
, TPS_InHostFirstAN
, 0, NULL
},
1007 {p_iseqC
, '.', A_PUSH
, TPS_InHostFirstDomain
, 0, NULL
},
1008 {p_iseqC
, '@', A_PUSH
, TPS_InEmail
, 0, NULL
},
1009 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
1012 static const TParserStateActionItem actionTPS_InHostDomain
[] = {
1013 {p_isEOF
, 0, A_BINGO
| A_CLRALL
, TPS_Base
, HOST
, NULL
},
1014 {p_isasclet
, 0, A_NEXT
, TPS_InHostDomain
, 0, NULL
},
1015 {p_isdigit
, 0, A_PUSH
, TPS_InHost
, 0, NULL
},
1016 {p_iseqC
, ':', A_PUSH
, TPS_InPortFirst
, 0, NULL
},
1017 {p_iseqC
, '-', A_PUSH
, TPS_InHostFirstAN
, 0, NULL
},
1018 {p_iseqC
, '.', A_PUSH
, TPS_InHostFirstDomain
, 0, NULL
},
1019 {p_iseqC
, '@', A_PUSH
, TPS_InEmail
, 0, NULL
},
1020 {p_isdigit
, 0, A_POP
, TPS_Null
, 0, NULL
},
1021 {p_isstophost
, 0, A_BINGO
| A_CLRALL
, TPS_InURLPathStart
, HOST
, NULL
},
1022 {p_iseqC
, '/', A_PUSH
, TPS_InFURL
, 0, NULL
},
1023 {NULL
, 0, A_BINGO
| A_CLRALL
, TPS_Base
, HOST
, NULL
}
1026 static const TParserStateActionItem actionTPS_InPortFirst
[] = {
1027 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1028 {p_isdigit
, 0, A_NEXT
, TPS_InPort
, 0, NULL
},
1029 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
1032 static const TParserStateActionItem actionTPS_InPort
[] = {
1033 {p_isEOF
, 0, A_BINGO
| A_CLRALL
, TPS_Base
, HOST
, NULL
},
1034 {p_isdigit
, 0, A_NEXT
, TPS_InPort
, 0, NULL
},
1035 {p_isstophost
, 0, A_BINGO
| A_CLRALL
, TPS_InURLPathStart
, HOST
, NULL
},
1036 {p_iseqC
, '/', A_PUSH
, TPS_InFURL
, 0, NULL
},
1037 {NULL
, 0, A_BINGO
| A_CLRALL
, TPS_Base
, HOST
, NULL
}
1040 static const TParserStateActionItem actionTPS_InHostFirstAN
[] = {
1041 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1042 {p_isdigit
, 0, A_NEXT
, TPS_InHost
, 0, NULL
},
1043 {p_isasclet
, 0, A_NEXT
, TPS_InHost
, 0, NULL
},
1044 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
1047 static const TParserStateActionItem actionTPS_InHost
[] = {
1048 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1049 {p_isdigit
, 0, A_NEXT
, TPS_InHost
, 0, NULL
},
1050 {p_isasclet
, 0, A_NEXT
, TPS_InHost
, 0, NULL
},
1051 {p_iseqC
, '@', A_PUSH
, TPS_InEmail
, 0, NULL
},
1052 {p_iseqC
, '.', A_PUSH
, TPS_InHostFirstDomain
, 0, NULL
},
1053 {p_iseqC
, '-', A_PUSH
, TPS_InHostFirstAN
, 0, NULL
},
1054 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
1057 static const TParserStateActionItem actionTPS_InEmail
[] = {
1058 {p_ishost
, 0, A_BINGO
| A_CLRALL
, TPS_Base
, EMAIL
, NULL
},
1059 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
1062 static const TParserStateActionItem actionTPS_InFileFirst
[] = {
1063 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1064 {p_isasclet
, 0, A_NEXT
, TPS_InFile
, 0, NULL
},
1065 {p_isdigit
, 0, A_NEXT
, TPS_InFile
, 0, NULL
},
1066 {p_iseqC
, '.', A_NEXT
, TPS_InPathFirst
, 0, NULL
},
1067 {p_iseqC
, '_', A_NEXT
, TPS_InFile
, 0, NULL
},
1068 {p_iseqC
, '?', A_PUSH
, TPS_InURLPathFirst
, 0, NULL
},
1069 {p_iseqC
, '~', A_PUSH
, TPS_InFileTwiddle
, 0, NULL
},
1070 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
1073 static const TParserStateActionItem actionTPS_InFileTwiddle
[] = {
1074 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1075 {p_isasclet
, 0, A_NEXT
, TPS_InFile
, 0, NULL
},
1076 {p_isdigit
, 0, A_NEXT
, TPS_InFile
, 0, NULL
},
1077 {p_iseqC
, '_', A_NEXT
, TPS_InFile
, 0, NULL
},
1078 {p_iseqC
, '/', A_NEXT
, TPS_InFileFirst
, 0, NULL
},
1079 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
1082 static const TParserStateActionItem actionTPS_InPathFirst
[] = {
1083 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1084 {p_isasclet
, 0, A_NEXT
, TPS_InFile
, 0, NULL
},
1085 {p_isdigit
, 0, A_NEXT
, TPS_InFile
, 0, NULL
},
1086 {p_iseqC
, '_', A_NEXT
, TPS_InFile
, 0, NULL
},
1087 {p_iseqC
, '.', A_NEXT
, TPS_InPathSecond
, 0, NULL
},
1088 {p_iseqC
, '/', A_NEXT
, TPS_InFileFirst
, 0, NULL
},
1089 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
1092 static const TParserStateActionItem actionTPS_InPathFirstFirst
[] = {
1093 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1094 {p_iseqC
, '.', A_NEXT
, TPS_InPathSecond
, 0, NULL
},
1095 {p_iseqC
, '/', A_NEXT
, TPS_InFileFirst
, 0, NULL
},
1096 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
1099 static const TParserStateActionItem actionTPS_InPathSecond
[] = {
1100 {p_isEOF
, 0, A_BINGO
| A_CLEAR
, TPS_Base
, FILEPATH
, NULL
},
1101 {p_iseqC
, '/', A_NEXT
| A_PUSH
, TPS_InFileFirst
, 0, NULL
},
1102 {p_iseqC
, '/', A_BINGO
| A_CLEAR
, TPS_Base
, FILEPATH
, NULL
},
1103 {p_isspace
, 0, A_BINGO
| A_CLEAR
, TPS_Base
, FILEPATH
, NULL
},
1104 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
1107 static const TParserStateActionItem actionTPS_InFile
[] = {
1108 {p_isEOF
, 0, A_BINGO
, TPS_Base
, FILEPATH
, NULL
},
1109 {p_isasclet
, 0, A_NEXT
, TPS_InFile
, 0, NULL
},
1110 {p_isdigit
, 0, A_NEXT
, TPS_InFile
, 0, NULL
},
1111 {p_iseqC
, '.', A_PUSH
, TPS_InFileNext
, 0, NULL
},
1112 {p_iseqC
, '_', A_NEXT
, TPS_InFile
, 0, NULL
},
1113 {p_iseqC
, '-', A_NEXT
, TPS_InFile
, 0, NULL
},
1114 {p_iseqC
, '/', A_PUSH
, TPS_InFileFirst
, 0, NULL
},
1115 {p_iseqC
, '?', A_PUSH
, TPS_InURLPathFirst
, 0, NULL
},
1116 {NULL
, 0, A_BINGO
, TPS_Base
, FILEPATH
, NULL
}
1119 static const TParserStateActionItem actionTPS_InFileNext
[] = {
1120 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1121 {p_isasclet
, 0, A_CLEAR
, TPS_InFile
, 0, NULL
},
1122 {p_isdigit
, 0, A_CLEAR
, TPS_InFile
, 0, NULL
},
1123 {p_iseqC
, '_', A_CLEAR
, TPS_InFile
, 0, NULL
},
1124 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
1127 static const TParserStateActionItem actionTPS_InURLPathFirst
[] = {
1128 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1129 {p_iseqC
, '"', A_POP
, TPS_Null
, 0, NULL
},
1130 {p_iseqC
, '\'', A_POP
, TPS_Null
, 0, NULL
},
1131 {p_isnotspace
, 0, A_CLEAR
, TPS_InURLPath
, 0, NULL
},
1132 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
},
1135 static const TParserStateActionItem actionTPS_InURLPathStart
[] = {
1136 {NULL
, 0, A_NEXT
, TPS_InURLPath
, 0, NULL
}
1139 static const TParserStateActionItem actionTPS_InURLPath
[] = {
1140 {p_isEOF
, 0, A_BINGO
, TPS_Base
, URLPATH
, NULL
},
1141 {p_iseqC
, '"', A_BINGO
, TPS_Base
, URLPATH
, NULL
},
1142 {p_iseqC
, '\'', A_BINGO
, TPS_Base
, URLPATH
, NULL
},
1143 {p_isnotspace
, 0, A_NEXT
, TPS_InURLPath
, 0, NULL
},
1144 {NULL
, 0, A_BINGO
, TPS_Base
, URLPATH
, NULL
}
1147 static const TParserStateActionItem actionTPS_InFURL
[] = {
1148 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1149 {p_isURLPath
, 0, A_BINGO
| A_CLRALL
, TPS_Base
, URL_T
, SpecialFURL
},
1150 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
1153 static const TParserStateActionItem actionTPS_InProtocolFirst
[] = {
1154 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1155 {p_iseqC
, '/', A_NEXT
, TPS_InProtocolSecond
, 0, NULL
},
1156 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
1159 static const TParserStateActionItem actionTPS_InProtocolSecond
[] = {
1160 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1161 {p_iseqC
, '/', A_NEXT
, TPS_InProtocolEnd
, 0, NULL
},
1162 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
1165 static const TParserStateActionItem actionTPS_InProtocolEnd
[] = {
1166 {NULL
, 0, A_BINGO
| A_CLRALL
, TPS_Base
, PROTOCOL
, NULL
}
1169 static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst
[] = {
1170 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1171 {p_isasclet
, 0, A_NEXT
, TPS_InHyphenAsciiWord
, 0, NULL
},
1172 {p_isalpha
, 0, A_NEXT
, TPS_InHyphenWord
, 0, NULL
},
1173 {p_isdigit
, 0, A_NEXT
, TPS_InHyphenDigitLookahead
, 0, NULL
},
1174 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
/*
 * Rules for state TPS_InHyphenAsciiWord, tried in order:
 *   - EOF: emit ASCIIHWORD, drop all pushed states, go to TPS_InParseHyphen
 *     (special handler SpecialHyphen);
 *   - p_isasclet: stay in this state;
 *   - p_isalpha: go to TPS_InHyphenWord;
 *   - p_isdigit: go to TPS_InHyphenNumWord;
 *   - '-': A_PUSH and go to TPS_InHyphenAsciiWordFirst;
 *   - anything else: same as the EOF rule.
 */
1177 static const TParserStateActionItem actionTPS_InHyphenAsciiWord
[] = {
1178 {p_isEOF
, 0, A_BINGO
| A_CLRALL
, TPS_InParseHyphen
, ASCIIHWORD
, SpecialHyphen
},
1179 {p_isasclet
, 0, A_NEXT
, TPS_InHyphenAsciiWord
, 0, NULL
},
1180 {p_isalpha
, 0, A_NEXT
, TPS_InHyphenWord
, 0, NULL
},
1181 {p_isdigit
, 0, A_NEXT
, TPS_InHyphenNumWord
, 0, NULL
},
1182 {p_iseqC
, '-', A_PUSH
, TPS_InHyphenAsciiWordFirst
, 0, NULL
},
1183 {NULL
, 0, A_BINGO
| A_CLRALL
, TPS_InParseHyphen
, ASCIIHWORD
, SpecialHyphen
}
/*
 * Rules for state TPS_InHyphenWordFirst (entered via A_PUSH from
 * TPS_InHyphenWord on a '-'), tried in order:
 *   - EOF: A_POP;
 *   - p_isalpha: go to TPS_InHyphenWord;
 *   - p_isdigit: go to TPS_InHyphenDigitLookahead;
 *   - anything else: A_POP.
 */
1186 static const TParserStateActionItem actionTPS_InHyphenWordFirst
[] = {
1187 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1188 {p_isalpha
, 0, A_NEXT
, TPS_InHyphenWord
, 0, NULL
},
1189 {p_isdigit
, 0, A_NEXT
, TPS_InHyphenDigitLookahead
, 0, NULL
},
1190 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
/*
 * Rules for state TPS_InHyphenWord, tried in order:
 *   - EOF: emit HWORD, drop all pushed states, go to TPS_InParseHyphen
 *     (special handler SpecialHyphen);
 *   - p_isalpha: stay in this state;
 *   - p_isdigit: go to TPS_InHyphenNumWord;
 *   - '-': A_PUSH and go to TPS_InHyphenWordFirst;
 *   - anything else: same as the EOF rule.
 */
1193 static const TParserStateActionItem actionTPS_InHyphenWord
[] = {
1194 {p_isEOF
, 0, A_BINGO
| A_CLRALL
, TPS_InParseHyphen
, HWORD
, SpecialHyphen
},
1195 {p_isalpha
, 0, A_NEXT
, TPS_InHyphenWord
, 0, NULL
},
1196 {p_isdigit
, 0, A_NEXT
, TPS_InHyphenNumWord
, 0, NULL
},
1197 {p_iseqC
, '-', A_PUSH
, TPS_InHyphenWordFirst
, 0, NULL
},
1198 {NULL
, 0, A_BINGO
| A_CLRALL
, TPS_InParseHyphen
, HWORD
, SpecialHyphen
}
/*
 * Rules for state TPS_InHyphenNumWordFirst (entered via A_PUSH from
 * TPS_InHyphenNumWord on a '-'), tried in order:
 *   - EOF: A_POP;
 *   - p_isalpha: go to TPS_InHyphenNumWord;
 *   - p_isdigit: go to TPS_InHyphenDigitLookahead;
 *   - anything else: A_POP.
 */
1201 static const TParserStateActionItem actionTPS_InHyphenNumWordFirst
[] = {
1202 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1203 {p_isalpha
, 0, A_NEXT
, TPS_InHyphenNumWord
, 0, NULL
},
1204 {p_isdigit
, 0, A_NEXT
, TPS_InHyphenDigitLookahead
, 0, NULL
},
1205 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
/*
 * Rules for state TPS_InHyphenNumWord, tried in order:
 *   - EOF: emit NUMHWORD, drop all pushed states, go to TPS_InParseHyphen
 *     (special handler SpecialHyphen);
 *   - p_isalnum: stay in this state;
 *   - '-': A_PUSH and go to TPS_InHyphenNumWordFirst;
 *   - anything else: same as the EOF rule.
 */
1208 static const TParserStateActionItem actionTPS_InHyphenNumWord
[] = {
1209 {p_isEOF
, 0, A_BINGO
| A_CLRALL
, TPS_InParseHyphen
, NUMHWORD
, SpecialHyphen
},
1210 {p_isalnum
, 0, A_NEXT
, TPS_InHyphenNumWord
, 0, NULL
},
1211 {p_iseqC
, '-', A_PUSH
, TPS_InHyphenNumWordFirst
, 0, NULL
},
1212 {NULL
, 0, A_BINGO
| A_CLRALL
, TPS_InParseHyphen
, NUMHWORD
, SpecialHyphen
}
/*
 * Rules for state TPS_InHyphenDigitLookahead, tried in order:
 *   - EOF: A_POP;
 *   - p_isdigit: stay in this state;
 *   - p_isalpha: go to TPS_InHyphenNumWord;
 *   - anything else: A_POP.
 * So a run of digits is accepted here only if a letter eventually follows.
 */
1215 static const TParserStateActionItem actionTPS_InHyphenDigitLookahead
[] = {
1216 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1217 {p_isdigit
, 0, A_NEXT
, TPS_InHyphenDigitLookahead
, 0, NULL
},
1218 {p_isalpha
, 0, A_NEXT
, TPS_InHyphenNumWord
, 0, NULL
},
1219 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
/*
 * Rules for state TPS_InParseHyphen (the state the hyphenated-word rules
 * switch to after emitting ASCIIHWORD/HWORD/NUMHWORD), tried in order:
 *   - EOF: A_RERUN in TPS_Base;
 *   - p_isasclet: go to TPS_InHyphenAsciiWordPart;
 *   - p_isalpha: go to TPS_InHyphenWordPart;
 *   - p_isdigit: A_PUSH and go to TPS_InHyphenUnsignedInt;
 *   - '-': A_PUSH and go to TPS_InParseHyphenHyphen;
 *   - anything else: A_RERUN in TPS_Base.
 */
1222 static const TParserStateActionItem actionTPS_InParseHyphen
[] = {
1223 {p_isEOF
, 0, A_RERUN
, TPS_Base
, 0, NULL
},
1224 {p_isasclet
, 0, A_NEXT
, TPS_InHyphenAsciiWordPart
, 0, NULL
},
1225 {p_isalpha
, 0, A_NEXT
, TPS_InHyphenWordPart
, 0, NULL
},
1226 {p_isdigit
, 0, A_PUSH
, TPS_InHyphenUnsignedInt
, 0, NULL
},
1227 {p_iseqC
, '-', A_PUSH
, TPS_InParseHyphenHyphen
, 0, NULL
},
1228 {NULL
, 0, A_RERUN
, TPS_Base
, 0, NULL
}
/*
 * Rules for state TPS_InParseHyphenHyphen (entered via A_PUSH from
 * TPS_InParseHyphen on a '-'), tried in order:
 *   - EOF: A_POP;
 *   - p_isalnum: emit a SPACE token, discard the pushed state
 *     (A_BINGO | A_CLEAR) and go back to TPS_InParseHyphen;
 *   - anything else: A_POP.
 */
1231 static const TParserStateActionItem actionTPS_InParseHyphenHyphen
[] = {
1232 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1233 {p_isalnum
, 0, A_BINGO
| A_CLEAR
, TPS_InParseHyphen
, SPACE
, NULL
},
1234 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
/*
 * Rules for state TPS_InHyphenWordPart, tried in order:
 *   - EOF: emit PARTHWORD and go to TPS_Base;
 *   - p_isalpha: stay in this state;
 *   - p_isdigit: go to TPS_InHyphenNumWordPart;
 *   - anything else: emit PARTHWORD and go back to TPS_InParseHyphen.
 */
1237 static const TParserStateActionItem actionTPS_InHyphenWordPart
[] = {
1238 {p_isEOF
, 0, A_BINGO
, TPS_Base
, PARTHWORD
, NULL
},
1239 {p_isalpha
, 0, A_NEXT
, TPS_InHyphenWordPart
, 0, NULL
},
1240 {p_isdigit
, 0, A_NEXT
, TPS_InHyphenNumWordPart
, 0, NULL
},
1241 {NULL
, 0, A_BINGO
, TPS_InParseHyphen
, PARTHWORD
, NULL
}
/*
 * Rules for state TPS_InHyphenAsciiWordPart, tried in order:
 *   - EOF: emit ASCIIPARTHWORD and go to TPS_Base;
 *   - p_isasclet: stay in this state;
 *   - p_isalpha: go to TPS_InHyphenWordPart;
 *   - p_isdigit: go to TPS_InHyphenNumWordPart;
 *   - anything else: emit ASCIIPARTHWORD and go back to TPS_InParseHyphen.
 */
1244 static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart
[] = {
1245 {p_isEOF
, 0, A_BINGO
, TPS_Base
, ASCIIPARTHWORD
, NULL
},
1246 {p_isasclet
, 0, A_NEXT
, TPS_InHyphenAsciiWordPart
, 0, NULL
},
1247 {p_isalpha
, 0, A_NEXT
, TPS_InHyphenWordPart
, 0, NULL
},
1248 {p_isdigit
, 0, A_NEXT
, TPS_InHyphenNumWordPart
, 0, NULL
},
1249 {NULL
, 0, A_BINGO
, TPS_InParseHyphen
, ASCIIPARTHWORD
, NULL
}
/*
 * Rules for state TPS_InHyphenNumWordPart, tried in order:
 *   - EOF: emit NUMPARTHWORD and go to TPS_Base;
 *   - p_isalnum: stay in this state;
 *   - anything else: emit NUMPARTHWORD and go back to TPS_InParseHyphen.
 */
1252 static const TParserStateActionItem actionTPS_InHyphenNumWordPart
[] = {
1253 {p_isEOF
, 0, A_BINGO
, TPS_Base
, NUMPARTHWORD
, NULL
},
1254 {p_isalnum
, 0, A_NEXT
, TPS_InHyphenNumWordPart
, 0, NULL
},
1255 {NULL
, 0, A_BINGO
, TPS_InParseHyphen
, NUMPARTHWORD
, NULL
}
/*
 * Rules for state TPS_InHyphenUnsignedInt (entered via A_PUSH from
 * TPS_InParseHyphen on a digit), tried in order:
 *   - EOF: A_POP;
 *   - p_isdigit: A_NEXT with tostate TPS_Null, i.e. the state is left
 *     unchanged (TParserGet only switches state when tostate != TPS_Null);
 *   - p_isalpha: discard the pushed state (A_CLEAR) and go to
 *     TPS_InHyphenNumWordPart;
 *   - anything else: A_POP.
 */
1258 static const TParserStateActionItem actionTPS_InHyphenUnsignedInt
[] = {
1259 {p_isEOF
, 0, A_POP
, TPS_Null
, 0, NULL
},
1260 {p_isdigit
, 0, A_NEXT
, TPS_Null
, 0, NULL
},
1261 {p_isalpha
, 0, A_CLEAR
, TPS_InHyphenNumWordPart
, 0, NULL
},
1262 {NULL
, 0, A_POP
, TPS_Null
, 0, NULL
}
1267 * main table of per-state parser actions
/*
 * TParserStateAction: one entry of the per-state dispatch table.  It pairs a
 * state's rule array with the state value itself and, when WPARSER_TRACE is
 * defined, with a printable state name.
 */
1271 const TParserStateActionItem
*action
; /* the actual state info */
1272 TParserState state
; /* only for Assert crosscheck */
1273 #ifdef WPARSER_TRACE
1274 const char *state_name
; /* only for debug printout */
1276 } TParserStateAction
;
/*
 * TPARSERSTATEACTION(state) builds a TParserStateAction initializer: the
 * rule array is found by pasting "action" onto the state name, followed by
 * the state value, plus the stringified state name in WPARSER_TRACE builds.
 */
1278 #ifdef WPARSER_TRACE
1279 #define TPARSERSTATEACTION(state) \
1280 { CppConcat(action,state), state, CppAsString(state) }
1282 #define TPARSERSTATEACTION(state) \
1283 { CppConcat(action,state), state }
1287 * order must be the same as in typedef enum {} TParserState!!
/*
 * Actions: dispatch table indexed by TParserState value.  TParserGet looks
 * up Actions[state].action and asserts Actions[state].state == state, so the
 * entries below must appear in exactly the enum's order.
 */
1290 static const TParserStateAction Actions
[] = {
1291 TPARSERSTATEACTION(TPS_Base
),
1292 TPARSERSTATEACTION(TPS_InNumWord
),
1293 TPARSERSTATEACTION(TPS_InAsciiWord
),
1294 TPARSERSTATEACTION(TPS_InWord
),
1295 TPARSERSTATEACTION(TPS_InUnsignedInt
),
1296 TPARSERSTATEACTION(TPS_InSignedIntFirst
),
1297 TPARSERSTATEACTION(TPS_InSignedInt
),
1298 TPARSERSTATEACTION(TPS_InSpace
),
1299 TPARSERSTATEACTION(TPS_InUDecimalFirst
),
1300 TPARSERSTATEACTION(TPS_InUDecimal
),
1301 TPARSERSTATEACTION(TPS_InDecimalFirst
),
1302 TPARSERSTATEACTION(TPS_InDecimal
),
1303 TPARSERSTATEACTION(TPS_InVerVersion
),
1304 TPARSERSTATEACTION(TPS_InSVerVersion
),
1305 TPARSERSTATEACTION(TPS_InVersionFirst
),
1306 TPARSERSTATEACTION(TPS_InVersion
),
1307 TPARSERSTATEACTION(TPS_InMantissaFirst
),
1308 TPARSERSTATEACTION(TPS_InMantissaSign
),
1309 TPARSERSTATEACTION(TPS_InMantissa
),
1310 TPARSERSTATEACTION(TPS_InXMLEntityFirst
),
1311 TPARSERSTATEACTION(TPS_InXMLEntity
),
1312 TPARSERSTATEACTION(TPS_InXMLEntityNumFirst
),
1313 TPARSERSTATEACTION(TPS_InXMLEntityNum
),
1314 TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst
),
1315 TPARSERSTATEACTION(TPS_InXMLEntityHexNum
),
1316 TPARSERSTATEACTION(TPS_InXMLEntityEnd
),
1317 TPARSERSTATEACTION(TPS_InTagFirst
),
1318 TPARSERSTATEACTION(TPS_InXMLBegin
),
1319 TPARSERSTATEACTION(TPS_InTagCloseFirst
),
1320 TPARSERSTATEACTION(TPS_InTagName
),
1321 TPARSERSTATEACTION(TPS_InTagBeginEnd
),
1322 TPARSERSTATEACTION(TPS_InTag
),
1323 TPARSERSTATEACTION(TPS_InTagEscapeK
),
1324 TPARSERSTATEACTION(TPS_InTagEscapeKK
),
1325 TPARSERSTATEACTION(TPS_InTagBackSleshed
),
1326 TPARSERSTATEACTION(TPS_InTagEnd
),
1327 TPARSERSTATEACTION(TPS_InCommentFirst
),
1328 TPARSERSTATEACTION(TPS_InCommentLast
),
1329 TPARSERSTATEACTION(TPS_InComment
),
1330 TPARSERSTATEACTION(TPS_InCloseCommentFirst
),
1331 TPARSERSTATEACTION(TPS_InCloseCommentLast
),
1332 TPARSERSTATEACTION(TPS_InCommentEnd
),
1333 TPARSERSTATEACTION(TPS_InHostFirstDomain
),
1334 TPARSERSTATEACTION(TPS_InHostDomainSecond
),
1335 TPARSERSTATEACTION(TPS_InHostDomain
),
1336 TPARSERSTATEACTION(TPS_InPortFirst
),
1337 TPARSERSTATEACTION(TPS_InPort
),
1338 TPARSERSTATEACTION(TPS_InHostFirstAN
),
1339 TPARSERSTATEACTION(TPS_InHost
),
1340 TPARSERSTATEACTION(TPS_InEmail
),
1341 TPARSERSTATEACTION(TPS_InFileFirst
),
1342 TPARSERSTATEACTION(TPS_InFileTwiddle
),
1343 TPARSERSTATEACTION(TPS_InPathFirst
),
1344 TPARSERSTATEACTION(TPS_InPathFirstFirst
),
1345 TPARSERSTATEACTION(TPS_InPathSecond
),
1346 TPARSERSTATEACTION(TPS_InFile
),
1347 TPARSERSTATEACTION(TPS_InFileNext
),
1348 TPARSERSTATEACTION(TPS_InURLPathFirst
),
1349 TPARSERSTATEACTION(TPS_InURLPathStart
),
1350 TPARSERSTATEACTION(TPS_InURLPath
),
1351 TPARSERSTATEACTION(TPS_InFURL
),
1352 TPARSERSTATEACTION(TPS_InProtocolFirst
),
1353 TPARSERSTATEACTION(TPS_InProtocolSecond
),
1354 TPARSERSTATEACTION(TPS_InProtocolEnd
),
1355 TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst
),
1356 TPARSERSTATEACTION(TPS_InHyphenAsciiWord
),
1357 TPARSERSTATEACTION(TPS_InHyphenWordFirst
),
1358 TPARSERSTATEACTION(TPS_InHyphenWord
),
1359 TPARSERSTATEACTION(TPS_InHyphenNumWordFirst
),
1360 TPARSERSTATEACTION(TPS_InHyphenNumWord
),
1361 TPARSERSTATEACTION(TPS_InHyphenDigitLookahead
),
1362 TPARSERSTATEACTION(TPS_InParseHyphen
),
1363 TPARSERSTATEACTION(TPS_InParseHyphenHyphen
),
1364 TPARSERSTATEACTION(TPS_InHyphenWordPart
),
1365 TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart
),
1366 TPARSERSTATEACTION(TPS_InHyphenNumWordPart
),
1367 TPARSERSTATEACTION(TPS_InHyphenUnsignedInt
)
/*
 * TParserGet: drive the state machine over prs->str starting at the current
 * position (prs->state->posbyte) until a rule flagged A_BINGO matches or the
 * input is used up.
 *
 * Before scanning, prs->token is pointed at the current byte.  When an
 * A_BINGO rule matches, prs->lenbytetoken, prs->lenchartoken and prs->type
 * describe the token just recognized.
 *
 * The final expression evaluates to true when the last matched rule carried
 * A_BINGO, false otherwise.
 *
 * NOTE(review): several lines of this function (braces and the
 * else/break/continue/return statements that follow some tests) are not
 * visible in this view; the comments below describe only visible statements.
 */
1372 TParserGet(TParser
*prs
)
1374 const TParserStateActionItem
*item
= NULL
;
/* test: already at or past the end of the input */
1378 if (prs
->state
->posbyte
>= prs
->lenstr
)
/* the token, if any, starts at the current byte position */
1381 prs
->token
= prs
->str
+ prs
->state
->posbyte
;
1382 prs
->state
->pushedAtAction
= NULL
;
1384 /* look at string */
1385 while (prs
->state
->posbyte
<= prs
->lenstr
)
/* charlen is 0 at EOF, otherwise the byte length of the current character */
1387 if (prs
->state
->posbyte
== prs
->lenstr
)
1388 prs
->state
->charlen
= 0;
1390 prs
->state
->charlen
= (prs
->charmaxlen
== 1) ? prs
->charmaxlen
:
1391 pg_mblen(prs
->str
+ prs
->state
->posbyte
);
1393 Assert(prs
->state
->posbyte
+ prs
->state
->charlen
<= prs
->lenstr
);
1394 Assert(prs
->state
->state
>= TPS_Base
&& prs
->state
->state
< TPS_Null
);
1395 Assert(Actions
[prs
->state
->state
].state
== prs
->state
->state
);
1397 if (prs
->state
->pushedAtAction
)
1399 /* After a POP, pick up at the next test */
1400 item
= prs
->state
->pushedAtAction
+ 1;
1401 prs
->state
->pushedAtAction
= NULL
;
/* otherwise start from the first rule of the current state */
1405 item
= Actions
[prs
->state
->state
].action
;
1406 Assert(item
!= NULL
);
1409 /* find action by character class */
/* a rule whose isclass is NULL ends the scan and acts as the default rule */
1410 while (item
->isclass
)
1413 if (item
->isclass(prs
) != 0)
1418 #ifdef WPARSER_TRACE
1420 TParserPosition
*ptr
;
1422 fprintf(stderr
, "state ");
1423 /* indent according to stack depth */
1424 for (ptr
= prs
->state
->prev
; ptr
; ptr
= ptr
->prev
)
1425 fprintf(stderr
, " ");
1426 fprintf(stderr
, "%s ", Actions
[prs
->state
->state
].state_name
);
1427 if (prs
->state
->posbyte
< prs
->lenstr
)
1428 fprintf(stderr
, "at %c", *(prs
->str
+ prs
->state
->posbyte
));
1430 fprintf(stderr
, "at EOF");
1431 fprintf(stderr
, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1432 (int) (item
- Actions
[prs
->state
->state
].action
),
1433 (item
->flags
& A_BINGO
) ? " BINGO" : "",
1434 (item
->flags
& A_POP
) ? " POP" : "",
1435 (item
->flags
& A_PUSH
) ? " PUSH" : "",
1436 (item
->flags
& A_RERUN
) ? " RERUN" : "",
1437 (item
->flags
& A_CLEAR
) ? " CLEAR" : "",
1438 (item
->flags
& A_MERGE
) ? " MERGE" : "",
1439 (item
->flags
& A_CLRALL
) ? " CLRALL" : "",
1440 (item
->tostate
!= TPS_Null
) ? " tostate " : "",
1441 (item
->tostate
!= TPS_Null
) ? Actions
[item
->tostate
].state_name
: "",
1442 (item
->type
> 0) ? " type " : "",
1443 tok_alias
[item
->type
]);
1447 /* call special handler if exists */
1451 /* BINGO, token is found */
1452 if (item
->flags
& A_BINGO
)
1454 Assert(item
->type
> 0);
/* publish the accumulated token length and reset the per-state counters */
1455 prs
->lenbytetoken
= prs
->state
->lenbytetoken
;
1456 prs
->lenchartoken
= prs
->state
->lenchartoken
;
1457 prs
->state
->lenbytetoken
= prs
->state
->lenchartoken
= 0;
1458 prs
->type
= item
->type
;
1461 /* do various actions by flags */
1462 if (item
->flags
& A_POP
)
1463 { /* pop stored state in stack */
1464 TParserPosition
*ptr
= prs
->state
->prev
;
1470 else if (item
->flags
& A_PUSH
)
1471 { /* push (store) state in stack */
1472 prs
->state
->pushedAtAction
= item
; /* remember where we push */
1473 prs
->state
= newTParserPosition(prs
->state
);
1475 else if (item
->flags
& A_CLEAR
)
1476 { /* clear previous pushed state */
1477 TParserPosition
*ptr
;
1479 Assert(prs
->state
->prev
);
1480 ptr
= prs
->state
->prev
->prev
;
1481 pfree(prs
->state
->prev
);
1482 prs
->state
->prev
= ptr
;
1484 else if (item
->flags
& A_CLRALL
)
1485 { /* clear all previous pushed state */
1486 TParserPosition
*ptr
;
1488 while (prs
->state
->prev
)
1490 ptr
= prs
->state
->prev
->prev
;
1491 pfree(prs
->state
->prev
);
1492 prs
->state
->prev
= ptr
;
1495 else if (item
->flags
& A_MERGE
)
1496 { /* merge posinfo with current and pushed state */
1497 TParserPosition
*ptr
= prs
->state
;
1499 Assert(prs
->state
->prev
);
1500 prs
->state
= prs
->state
->prev
;
1502 prs
->state
->posbyte
= ptr
->posbyte
;
1503 prs
->state
->poschar
= ptr
->poschar
;
1504 prs
->state
->charlen
= ptr
->charlen
;
1505 prs
->state
->lenbytetoken
= ptr
->lenbytetoken
;
1506 prs
->state
->lenchartoken
= ptr
->lenchartoken
;
1510 /* set new state if pointed */
1511 if (item
->tostate
!= TPS_Null
)
1512 prs
->state
->state
= item
->tostate
;
1514 /* check for go away */
/* test: a token was found, or input is exhausted and no rerun is requested */
1515 if ((item
->flags
& A_BINGO
) ||
1516 (prs
->state
->posbyte
>= prs
->lenstr
&&
1517 (item
->flags
& A_RERUN
) == 0))
1520 /* go to beginning of loop if we should rerun or we just restore state */
1521 if (item
->flags
& (A_RERUN
| A_POP
))
/* otherwise step over the current character, in bytes and in characters */
1525 if (prs
->state
->charlen
)
1527 prs
->state
->posbyte
+= prs
->state
->charlen
;
1528 prs
->state
->lenbytetoken
+= prs
->state
->charlen
;
1529 prs
->state
->poschar
++;
1530 prs
->state
->lenchartoken
++;
1534 return (item
&& (item
->flags
& A_BINGO
)) ? true : false;
/*
 * prsd_lextype: build and return a palloc'd array describing the token
 * types.  The array has LASTNUM + 1 entries: entry i - 1 (for i = 1..LASTNUM)
 * holds lexid i plus copies of tok_alias[i] and lex_descr[i]; the final
 * entry has lexid 0.
 * NOTE(review): the lexid 0 entry looks like an end-of-list marker -- confirm
 * against the consumer of this array.
 */
1538 prsd_lextype(PG_FUNCTION_ARGS
)
1540 LexDescr
*descr
= (LexDescr
*) palloc(sizeof(LexDescr
) * (LASTNUM
+ 1));
1543 for (i
= 1; i
<= LASTNUM
; i
++)
1545 descr
[i
- 1].lexid
= i
;
1546 descr
[i
- 1].alias
= pstrdup(tok_alias
[i
]);
1547 descr
[i
- 1].descr
= pstrdup(lex_descr
[i
]);
1550 descr
[LASTNUM
].lexid
= 0;
1552 PG_RETURN_POINTER(descr
);
/*
 * prsd_start: create a parser via TParserInit from argument 0 (a char
 * pointer) and argument 1 (an int32), and return it as a pointer.
 * NOTE(review): argument 1 is presumably the byte length of the text --
 * confirm against TParserInit.
 */
1556 prsd_start(PG_FUNCTION_ARGS
)
1558 PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
/*
 * prsd_nexttoken: arguments are the TParser (0), an output char pointer (1)
 * and an output int length (2).  The visible statements store
 * p->lenbytetoken into *tlen and return p->type as an int32.
 * NOTE(review): the statements that advance the parser and fill *t are not
 * visible in this view.
 */
1562 prsd_nexttoken(PG_FUNCTION_ARGS
)
1564 TParser
*p
= (TParser
*) PG_GETARG_POINTER(0);
1565 char **t
= (char **) PG_GETARG_POINTER(1);
1566 int *tlen
= (int *) PG_GETARG_POINTER(2);
1572 *tlen
= p
->lenbytetoken
;
1574 PG_RETURN_INT32(p
->type
);
/*
 * prsd_end: takes the TParser as argument 0.
 * NOTE(review): the rest of the body is not visible in this view;
 * presumably it releases the parser -- confirm.
 */
1578 prsd_end(PG_FUNCTION_ARGS
)
1580 TParser
*p
= (TParser
*) PG_GETARG_POINTER(0);
/*
 * Token-type classification macros used by the headline code below.
 * Each takes a token type number and yields nonzero when it is in the set.
 */
/* LEAVETOKEN: SPACE only */
1586 #define LEAVETOKEN(x) ( (x)==SPACE )
/* COMPLEXTOKEN: URLs and whole hyphenated words */
1587 #define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* ENDPUNCTOKEN: SPACE only */
1588 #define ENDPUNCTOKEN(x) ( (x)==SPACE )
/* TS_IDIGNORE: tags, protocol prefixes, space and XML entities */
1590 #define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
/* HLIDIGNORE: COMPLEXTOKEN types plus TAG_T; such words get replace = 1 when marked */
1591 #define HLIDIGNORE(x) ( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* XMLHLIDIGNORE: same as HLIDIGNORE but without TAG_T */
1592 #define XMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* NONWORDTOKEN: tokens that are not counted as words when measuring headline length */
1593 #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDIGNORE(x) )
/* NOENDTOKEN: token types that are rejected as the first or last token of a headline */
1594 #define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
/*
 * Field of the hlCheck structure: start of the window of headline words that
 * checkcondition_HL searches (hlCover sets it to &prs->words[*p] together
 * with a len field).
 */
1598 HeadlineWordEntry
*words
;
/*
 * checkcondition_HL: callback passed to TS_execute by hlCover.  checkval is
 * an hlCheck; the function scans its len words looking for one whose item
 * pointer is the query operand val.
 * NOTE(review): the return statements are not visible in this view;
 * presumably true on a match and false otherwise -- confirm.
 */
1603 checkcondition_HL(void *checkval
, QueryOperand
*val
)
1607 for (i
= 0; i
< ((hlCheck
*) checkval
)->len
; i
++)
1609 if (((hlCheck
*) checkval
)->words
[i
].item
== val
)
/*
 * hlCover: search prs->words for the next "cover", a window [*p, *q] of
 * words for which the query evaluates to true.
 *
 * Visible steps:
 *   1. for every QI_VAL item of the query, scan forward from pos for a word
 *      whose item pointer is that operand;
 *   2. for every QI_VAL item again, scan backward from *q down to pos for a
 *      matching word;
 *   3. run TS_execute over the window &prs->words[*p] of length *q - *p + 1,
 *      using checkcondition_HL as the operand test;
 *   4. otherwise recurse with the updated *p and *q.
 *
 * NOTE(review): the bookkeeping between these loops (how pos, *p and *q are
 * updated, and the return values) is not visible in this view.
 */
1617 hlCover(HeadlineParsedText
*prs
, TSQuery query
, int *p
, int *q
)
1621 QueryItem
*item
= GETQUERY(query
);
1627 for (j
= 0; j
< query
->size
; j
++)
1629 if (item
->type
!= QI_VAL
)
1634 for (i
= pos
; i
< prs
->curwords
; i
++)
1636 if (prs
->words
[i
].item
== &item
->operand
)
1649 item
= GETQUERY(query
);
1650 for (j
= 0; j
< query
->size
; j
++)
1652 if (item
->type
!= QI_VAL
)
1657 for (i
= *q
; i
>= pos
; i
--)
1659 if (prs
->words
[i
].item
== &item
->operand
)
1673 ch
.words
= &(prs
->words
[*p
]);
1674 ch
.len
= *q
- *p
+ 1;
1675 if (TS_execute(GETQUERY(query
), &ch
, false, checkcondition_HL
))
1680 return hlCover(prs
, query
, p
, q
);
/*
 * mark_fragment: flag the words startpos..endpos (inclusive) of prs->words
 * as belonging to the headline.
 *
 * For each word in the range:
 *   - words tied to a query item get selected = 1;
 *   - words whose type is in HLIDIGNORE or XMLHLIDIGNORE get replace = 1;
 *   - in is set to 1 unless the word is marked repeated.
 *
 * NOTE(review): the conditions choosing between the HLIDIGNORE and
 * XMLHLIDIGNORE tests (presumably depending on highlight) are not visible in
 * this view.
 */
1688 mark_fragment(HeadlineParsedText
*prs
, int highlight
, int startpos
, int endpos
)
1692 for (i
= startpos
; i
<= endpos
; i
++)
1694 if (prs
->words
[i
].item
)
1695 prs
->words
[i
].selected
= 1;
1698 if (HLIDIGNORE(prs
->words
[i
].type
))
1699 prs
->words
[i
].replace
= 1;
1703 if (XMLHLIDIGNORE(prs
->words
[i
].type
))
1704 prs
->words
[i
].replace
= 1;
1707 prs
->words
[i
].in
= (prs
->words
[i
].repeated
) ? 0 : 1;
/*
 * get_next_fragment: narrow [*startpos, *endpos] to the next fragment of at
 * most max_words words; *curlen and *poslen are output counters.
 * NOTE(review): the statements that update the counters and positions are
 * not visible in this view; presumably *curlen counts words (tokens that are
 * not NONWORDTOKEN) and *poslen counts non-repeated query items -- confirm.
 */
1722 get_next_fragment(HeadlineParsedText
*prs
, int *startpos
, int *endpos
,
1723 int *curlen
, int *poslen
, int max_words
)
1726 /* Objective: Generate a fragment of words between startpos and endpos
1727 * such that it has at most max_words and both ends have query words.
1728 * If the startpos and endpos are the endpoints of the cover and the
1729 * cover has fewer words than max_words, then this function should
1730 * just return the cover
1732 /* first move startpos to an item */
1733 for(i
= *startpos
; i
<= *endpos
; i
++)
1736 if (prs
->words
[i
].item
&& !prs
->words
[i
].repeated
)
1739 /* cut endpos to have only max_words */
1742 for(i
= *startpos
; i
<= *endpos
&& *curlen
< max_words
; i
++)
1744 if (!NONWORDTOKEN(prs
->words
[i
].type
))
1746 if (prs
->words
[i
].item
&& !prs
->words
[i
].repeated
)
1749 /* if the cover was cut then move back endpos to a query item */
1753 for(i
= *endpos
; i
>= *startpos
; i
--)
1756 if (prs
->words
[i
].item
&& !prs
->words
[i
].repeated
)
1758 if (!NONWORDTOKEN(prs
->words
[i
].type
))
/*
 * mark_hl_fragments: fragment-based headline generator (used by
 * prsd_headline when MaxFragments is nonzero).
 *
 * Outline of the visible logic:
 *   1. collect every cover reported by hlCover and split each into
 *      fragments with get_next_fragment, storing them in the growable
 *      covers array;
 *   2. up to max_fragments times, pick the not-yet-used, not-excluded cover
 *      with the most query items (fewest words on a tie), stretch it on both
 *      sides up to max_words while avoiding already marked words, trim
 *      short/unsuitable tokens at the ends, mark it with mark_fragment and
 *      exclude covers that overlap it;
 *   3. when nothing was marked, mark the first min_words words instead.
 *
 * NOTE(review): the overlap test only excludes covers with an endpoint
 * inside the chosen cover; a cover that strictly contains the chosen one is
 * not excluded -- confirm whether that is intended.
 * NOTE(review): many control lines are not visible in this view.
 */
1765 mark_hl_fragments(HeadlineParsedText
*prs
, TSQuery query
, int highlight
,
1766 int shortword
, int min_words
,
1767 int max_words
, int max_fragments
)
1769 int4 poslen
, curlen
, i
, f
, num_f
= 0;
1770 int4 stretch
, maxstretch
, posmarker
;
1780 int4 minI
, minwords
, maxitems
;
1783 covers
= palloc(maxcovers
* sizeof(CoverPos
));
1785 /* get all covers */
1786 while (hlCover(prs
, query
, &p
, &q
))
1791 /* Break the cover into smaller fragments such that each fragment
1792 * has at most max_words. Also ensure that each end of the fragment
1793 * is a query word. This will allow us to stretch the fragment in
1797 while (startpos
<= endpos
)
1799 get_next_fragment(prs
, &startpos
, &endpos
, &curlen
, &poslen
, max_words
);
1800 if (numcovers
>= maxcovers
)
1803 covers
= repalloc(covers
, sizeof(CoverPos
) * maxcovers
);
1805 covers
[numcovers
].startpos
= startpos
;
1806 covers
[numcovers
].endpos
= endpos
;
1807 covers
[numcovers
].curlen
= curlen
;
1808 covers
[numcovers
].poslen
= poslen
;
1809 covers
[numcovers
].in
= 0;
1810 covers
[numcovers
].excluded
= 0;
1812 startpos
= endpos
+ 1;
1815 /* move p to generate the next cover */
1819 /* choose best covers */
1820 for (f
= 0; f
< max_fragments
; f
++)
1823 minwords
= 0x7fffffff;
1825 /* Choose the cover that contains max items.
1826 * In case of tie choose the one with smaller
1829 for (i
= 0; i
< numcovers
; i
++)
1831 if (!covers
[i
].in
&& !covers
[i
].excluded
&&
1832 (maxitems
< covers
[i
].poslen
|| (maxitems
== covers
[i
].poslen
1833 && minwords
> covers
[i
].curlen
)))
1835 maxitems
= covers
[i
].poslen
;
1836 minwords
= covers
[i
].curlen
;
1840 /* if a cover was found mark it */
1843 covers
[minI
].in
= 1;
1844 /* adjust the size of cover */
1845 startpos
= covers
[minI
].startpos
;
1846 endpos
= covers
[minI
].endpos
;
1847 curlen
= covers
[minI
].curlen
;
1848 /* stretch the cover if cover size is lower than max_words */
1849 if (curlen
< max_words
)
1851 /* divide the stretch on both sides of cover */
1852 maxstretch
= (max_words
- curlen
)/2;
1853 /* first stretch the startpos
1854 * stop stretching if
1855 * 1. we hit the beginning of document
1856 * 2. exceed maxstretch
1857 * 3. we hit an already marked fragment
1860 posmarker
= startpos
;
1861 for (i
= startpos
- 1; i
>= 0 && stretch
< maxstretch
&& !prs
->words
[i
].in
; i
--)
1863 if (!NONWORDTOKEN(prs
->words
[i
].type
))
1870 /* cut back startpos till we find a non short token */
1871 for (i
= posmarker
; i
< startpos
&& (NOENDTOKEN(prs
->words
[i
].type
) || prs
->words
[i
].len
<= shortword
); i
++)
1873 if (!NONWORDTOKEN(prs
->words
[i
].type
))
1877 /* now stretch the endpos as much as possible*/
1879 for (i
= endpos
+ 1; i
< prs
->curwords
&& curlen
< max_words
&& !prs
->words
[i
].in
; i
++)
1881 if (!NONWORDTOKEN(prs
->words
[i
].type
))
1885 /* cut back endpos till we find a non-short token */
1886 for ( i
= posmarker
; i
> endpos
&& (NOENDTOKEN(prs
->words
[i
].type
) || prs
->words
[i
].len
<= shortword
); i
--)
1888 if (!NONWORDTOKEN(prs
->words
[i
].type
))
1893 covers
[minI
].startpos
= startpos
;
1894 covers
[minI
].endpos
= endpos
;
1895 covers
[minI
].curlen
= curlen
;
1896 /* Mark the chosen fragments (covers) */
1897 mark_fragment(prs
, highlight
, startpos
, endpos
);
1899 /* exclude overlapping covers */
1900 for (i
= 0; i
< numcovers
; i
++)
1902 if (i
!= minI
&& ( (covers
[i
].startpos
>= covers
[minI
].startpos
&& covers
[i
].startpos
<= covers
[minI
].endpos
) || (covers
[i
].endpos
>= covers
[minI
].startpos
&& covers
[i
].endpos
<= covers
[minI
].endpos
)))
1903 covers
[i
].excluded
= 1;
1910 /* show at least min_words if we have not marked anything */
1913 startpos
= endpos
= curlen
= 0;
1914 for (i
= 0; i
< prs
->curwords
&& curlen
< min_words
; i
++)
1916 if (!NONWORDTOKEN(prs
->words
[i
].type
))
1920 mark_fragment(prs
, highlight
, startpos
, endpos
);
/*
 * mark_hl_words: default headline generator (used by prsd_headline when
 * MaxFragments is 0).
 *
 * Outline of the visible logic:
 *   1. for every cover reported by hlCover, measure it in words (tokens that
 *      are not NONWORDTOKEN) and query items, then extend or shrink it
 *      toward the min_words..max_words range, preferring ends that are
 *      neither NOENDTOKEN nor of length <= shortword;
 *   2. keep the best candidate seen so far (bestb..beste);
 *   3. when no cover exists, take words from the start of the text up to
 *      min_words;
 *   4. mark words bestb..beste the same way mark_fragment does (selected,
 *      replace, in).
 *
 * NOTE(review): many control lines are not visible in this view.
 */
1925 mark_hl_words(HeadlineParsedText
*prs
, TSQuery query
, int highlight
,
1926 int shortword
, int min_words
, int max_words
)
1942 while (hlCover(prs
, query
, &p
, &q
))
1944 /* find cover len in words */
1947 for (i
= p
; i
<= q
&& curlen
< max_words
; i
++)
1949 if (!NONWORDTOKEN(prs
->words
[i
].type
))
1951 if (prs
->words
[i
].item
&& !prs
->words
[i
].repeated
)
1956 if (poslen
< bestlen
&& !(NOENDTOKEN(prs
->words
[beste
].type
) || prs
->words
[beste
].len
<= shortword
))
1958 /* best already found, so try one more cover */
1964 if (curlen
< max_words
)
1965 { /* find good end */
1966 for (i
= i
- 1; i
< prs
->curwords
&& curlen
< max_words
; i
++)
1970 if (!NONWORDTOKEN(prs
->words
[i
].type
))
1972 if (prs
->words
[i
].item
&& !prs
->words
[i
].repeated
)
1976 if (NOENDTOKEN(prs
->words
[i
].type
) || prs
->words
[i
].len
<= shortword
)
1978 if (curlen
>= min_words
)
1981 if (curlen
< min_words
&& i
>= prs
->curwords
)
1982 { /* got end of text and our cover is shorter
1984 for (i
= p
; i
>= 0; i
--)
1986 if (!NONWORDTOKEN(prs
->words
[i
].type
))
1988 if (prs
->words
[i
].item
&& !prs
->words
[i
].repeated
)
1990 if (NOENDTOKEN(prs
->words
[i
].type
) || prs
->words
[i
].len
<= shortword
)
1992 if (curlen
>= min_words
)
1995 posb
= (i
>= 0) ? i
: 0;
1999 { /* shorter cover :((( */
2000 for (; curlen
> min_words
; i
--)
2002 if (!NONWORDTOKEN(prs
->words
[i
].type
))
2004 if (prs
->words
[i
].item
&& !prs
->words
[i
].repeated
)
2007 if (NOENDTOKEN(prs
->words
[i
].type
) || prs
->words
[i
].len
<= shortword
)
2013 if (bestlen
< 0 || (poslen
> bestlen
&& !(NOENDTOKEN(prs
->words
[pose
].type
) || prs
->words
[pose
].len
<= shortword
)) ||
2014 (bestlen
>= 0 && !(NOENDTOKEN(prs
->words
[pose
].type
) || prs
->words
[pose
].len
<= shortword
) &&
2015 (NOENDTOKEN(prs
->words
[beste
].type
) || prs
->words
[beste
].len
<= shortword
)))
2028 for (i
= 0; i
< prs
->curwords
&& curlen
< min_words
; i
++)
2030 if (!NONWORDTOKEN(prs
->words
[i
].type
))
2041 beste
= prs
->curwords
- 1;
2044 for (i
= bestb
; i
<= beste
; i
++)
2046 if (prs
->words
[i
].item
)
2047 prs
->words
[i
].selected
= 1;
2050 if (HLIDIGNORE(prs
->words
[i
].type
))
2051 prs
->words
[i
].replace
= 1;
2055 if (XMLHLIDIGNORE(prs
->words
[i
].type
))
2056 prs
->words
[i
].replace
= 1;
2059 prs
->words
[i
].in
= (prs
->words
[i
].repeated
) ? 0 : 1;
/*
 * prsd_headline: headline entry point of the default parser.
 *
 * Arguments: the HeadlineParsedText to mark (0), a List of DefElem options
 * (1) and the TSQuery (2).  Returns the same HeadlineParsedText pointer.
 *
 * Recognized options (names compared case-insensitively): MaxWords,
 * MinWords, ShortWord, MaxFragments (parsed with pg_atoi), StartSel,
 * StopSel, FragmentDelimiter (copied with pstrdup) and HighlightAll (true
 * for "1", "on", "true", "t", "y", "yes").  Any other name is reported as
 * "unrecognized headline parameter".
 *
 * The numeric settings are validated with ERRCODE_INVALID_PARAMETER_VALUE
 * errors; then mark_hl_words is used when max_fragments is 0 and
 * mark_hl_fragments otherwise.  Missing selectors/delimiter default to
 * "<b>", "</b>" and " ... ", and their lengths are cached in prs.
 *
 * NOTE(review): unlike startsel/stopsel, prs->fragdelim is tested but not
 * reset to NULL in the visible lines -- confirm the caller zeroes the
 * structure.
 */
2065 prsd_headline(PG_FUNCTION_ARGS
)
2067 HeadlineParsedText
*prs
= (HeadlineParsedText
*) PG_GETARG_POINTER(0);
2068 List
*prsoptions
= (List
*) PG_GETARG_POINTER(1);
2069 TSQuery query
= PG_GETARG_TSQUERY(2);
2071 /* from opt + start and end tag */
2075 int max_fragments
= 0;
2080 prs
->startsel
= NULL
;
2081 prs
->stopsel
= NULL
;
2082 foreach(l
, prsoptions
)
2084 DefElem
*defel
= (DefElem
*) lfirst(l
);
2085 char *val
= defGetString(defel
);
2087 if (pg_strcasecmp(defel
->defname
, "MaxWords") == 0)
2088 max_words
= pg_atoi(val
, sizeof(int32
), 0);
2089 else if (pg_strcasecmp(defel
->defname
, "MinWords") == 0)
2090 min_words
= pg_atoi(val
, sizeof(int32
), 0);
2091 else if (pg_strcasecmp(defel
->defname
, "ShortWord") == 0)
2092 shortword
= pg_atoi(val
, sizeof(int32
), 0);
2093 else if (pg_strcasecmp(defel
->defname
, "MaxFragments") == 0)
2094 max_fragments
= pg_atoi(val
, sizeof(int32
), 0);
2095 else if (pg_strcasecmp(defel
->defname
, "StartSel") == 0)
2096 prs
->startsel
= pstrdup(val
);
2097 else if (pg_strcasecmp(defel
->defname
, "StopSel") == 0)
2098 prs
->stopsel
= pstrdup(val
);
2099 else if (pg_strcasecmp(defel
->defname
, "FragmentDelimiter") == 0)
2100 prs
->fragdelim
= pstrdup(val
);
2101 else if (pg_strcasecmp(defel
->defname
, "HighlightAll") == 0)
2102 highlight
= (pg_strcasecmp(val
, "1") == 0 ||
2103 pg_strcasecmp(val
, "on") == 0 ||
2104 pg_strcasecmp(val
, "true") == 0 ||
2105 pg_strcasecmp(val
, "t") == 0 ||
2106 pg_strcasecmp(val
, "y") == 0 ||
2107 pg_strcasecmp(val
, "yes") == 0);
2110 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
2111 errmsg("unrecognized headline parameter: \"%s\"",
2117 if (min_words
>= max_words
)
2119 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
2120 errmsg("MinWords should be less than MaxWords")));
2123 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
2124 errmsg("MinWords should be positive")));
2127 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
2128 errmsg("ShortWord should be >= 0")));
2129 if (max_fragments
< 0)
2131 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
2132 errmsg("MaxFragments should be >= 0")));
2135 if (max_fragments
== 0)
2136 /* call the default headline generator */
2137 mark_hl_words(prs
, query
, highlight
, shortword
, min_words
, max_words
);
2139 mark_hl_fragments(prs
, query
, highlight
, shortword
, min_words
, max_words
, max_fragments
);
2142 prs
->startsel
= pstrdup("<b>");
2144 prs
->stopsel
= pstrdup("</b>");
2145 if (!prs
->fragdelim
)
2146 prs
->fragdelim
= pstrdup(" ... ");
2147 prs
->startsellen
= strlen(prs
->startsel
);
2148 prs
->stopsellen
= strlen(prs
->stopsel
);
2149 prs
->fragdelimlen
= strlen(prs
->fragdelim
);
2151 PG_RETURN_POINTER(prs
);