2 * regexp.c: generic and extensible Regular Expression engine
4 * Basically designed with the purpose of compiling regexps for
5 * the variety of validation/shemas mechanisms now available in
6 * XML related specifications thise includes:
7 * - XML-1.0 DTD validation
8 * - XML Schemas structure part 1
9 * - XML Schemas Datatypes part 2 especially Appendix F
10 * - RELAX-NG/TREX i.e. the counter proposal
12 * See Copyright for the status of this software.
14 * Daniel Veillard <veillard@redhat.com>
20 #ifdef LIBXML_REGEXP_ENABLED
24 #include <libxml/tree.h>
25 #include <libxml/parserInternals.h>
26 #include <libxml/xmlregexp.h>
27 #include <libxml/xmlautomata.h>
28 #include <libxml/xmlunicode.h>
30 /* #define DEBUG_REGEXP_GRAPH */
31 /* #define DEBUG_REGEXP_EXEC */
32 /* #define DEBUG_PUSH */
33 /* #define DEBUG_COMPACTION */
35 #define ERROR(str) ctxt->error = 1; \
36 xmlGenericError(xmlGenericErrorContext, "Regexp: %s: %s\n", str, ctxt->cur)
37 #define NEXT ctxt->cur++
38 #define CUR (*(ctxt->cur))
39 #define NXT(index) (ctxt->cur[index])
41 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
42 #define NEXTL(l) ctxt->cur += l;
47 * macro to flag unimplemented blocks
50 xmlGenericError(xmlGenericErrorContext, \
51 "Unimplemented block at %s:%d\n", \
55 /************************************************************************
57 * Datatypes and structures *
59 ************************************************************************/
62 XML_REGEXP_EPSILON
= 1,
67 XML_REGEXP_ANYCHAR
, /* . */
68 XML_REGEXP_ANYSPACE
, /* \s */
69 XML_REGEXP_NOTSPACE
, /* \S */
70 XML_REGEXP_INITNAME
, /* \l */
71 XML_REGEXP_NOTINITNAME
, /* \l */
72 XML_REGEXP_NAMECHAR
, /* \c */
73 XML_REGEXP_NOTNAMECHAR
, /* \C */
74 XML_REGEXP_DECIMAL
, /* \d */
75 XML_REGEXP_NOTDECIMAL
, /* \d */
76 XML_REGEXP_REALCHAR
, /* \w */
77 XML_REGEXP_NOTREALCHAR
, /* \w */
79 XML_REGEXP_LETTER_UPPERCASE
,
80 XML_REGEXP_LETTER_LOWERCASE
,
81 XML_REGEXP_LETTER_TITLECASE
,
82 XML_REGEXP_LETTER_MODIFIER
,
83 XML_REGEXP_LETTER_OTHERS
,
85 XML_REGEXP_MARK_NONSPACING
,
86 XML_REGEXP_MARK_SPACECOMBINING
,
87 XML_REGEXP_MARK_ENCLOSING
,
89 XML_REGEXP_NUMBER_DECIMAL
,
90 XML_REGEXP_NUMBER_LETTER
,
91 XML_REGEXP_NUMBER_OTHERS
,
93 XML_REGEXP_PUNCT_CONNECTOR
,
94 XML_REGEXP_PUNCT_DASH
,
95 XML_REGEXP_PUNCT_OPEN
,
96 XML_REGEXP_PUNCT_CLOSE
,
97 XML_REGEXP_PUNCT_INITQUOTE
,
98 XML_REGEXP_PUNCT_FINQUOTE
,
99 XML_REGEXP_PUNCT_OTHERS
,
101 XML_REGEXP_SEPAR_SPACE
,
102 XML_REGEXP_SEPAR_LINE
,
103 XML_REGEXP_SEPAR_PARA
,
105 XML_REGEXP_SYMBOL_MATH
,
106 XML_REGEXP_SYMBOL_CURRENCY
,
107 XML_REGEXP_SYMBOL_MODIFIER
,
108 XML_REGEXP_SYMBOL_OTHERS
,
110 XML_REGEXP_OTHER_CONTROL
,
111 XML_REGEXP_OTHER_FORMAT
,
112 XML_REGEXP_OTHER_PRIVATE
,
114 XML_REGEXP_BLOCK_NAME
118 XML_REGEXP_QUANT_EPSILON
= 1,
119 XML_REGEXP_QUANT_ONCE
,
120 XML_REGEXP_QUANT_OPT
,
121 XML_REGEXP_QUANT_MULT
,
122 XML_REGEXP_QUANT_PLUS
,
123 XML_REGEXP_QUANT_ONCEONLY
,
124 XML_REGEXP_QUANT_ALL
,
125 XML_REGEXP_QUANT_RANGE
129 XML_REGEXP_START_STATE
= 1,
130 XML_REGEXP_FINAL_STATE
,
131 XML_REGEXP_TRANS_STATE
135 XML_REGEXP_MARK_NORMAL
= 0,
136 XML_REGEXP_MARK_START
,
137 XML_REGEXP_MARK_VISITED
140 typedef struct _xmlRegRange xmlRegRange
;
141 typedef xmlRegRange
*xmlRegRangePtr
;
143 struct _xmlRegRange
{
151 typedef struct _xmlRegAtom xmlRegAtom
;
152 typedef xmlRegAtom
*xmlRegAtomPtr
;
154 typedef struct _xmlAutomataState xmlRegState
;
155 typedef xmlRegState
*xmlRegStatePtr
;
160 xmlRegQuantType quant
;
168 xmlRegStatePtr start
;
172 xmlRegRangePtr
*ranges
;
176 typedef struct _xmlRegCounter xmlRegCounter
;
177 typedef xmlRegCounter
*xmlRegCounterPtr
;
179 struct _xmlRegCounter
{
184 typedef struct _xmlRegTrans xmlRegTrans
;
185 typedef xmlRegTrans
*xmlRegTransPtr
;
187 struct _xmlRegTrans
{
194 struct _xmlAutomataState
{
195 xmlRegStateType type
;
196 xmlRegMarkedType mark
;
197 xmlRegMarkedType reached
;
205 typedef struct _xmlAutomata xmlRegParserCtxt
;
206 typedef xmlRegParserCtxt
*xmlRegParserCtxtPtr
;
208 struct _xmlAutomata
{
215 xmlRegStatePtr start
;
217 xmlRegStatePtr state
;
223 xmlRegAtomPtr
*atoms
;
227 xmlRegStatePtr
*states
;
231 xmlRegCounter
*counters
;
239 xmlRegStatePtr
*states
;
241 xmlRegAtomPtr
*atoms
;
243 xmlRegCounter
*counters
;
246 * That's the compact form for determinists automatas
255 typedef struct _xmlRegExecRollback xmlRegExecRollback
;
256 typedef xmlRegExecRollback
*xmlRegExecRollbackPtr
;
258 struct _xmlRegExecRollback
{
259 xmlRegStatePtr state
;/* the current state */
260 int index
; /* the index in the input stack */
261 int nextbranch
; /* the next transition to explore in that state */
262 int *counts
; /* save the automate state if it has some */
265 typedef struct _xmlRegInputToken xmlRegInputToken
;
266 typedef xmlRegInputToken
*xmlRegInputTokenPtr
;
268 struct _xmlRegInputToken
{
273 struct _xmlRegExecCtxt
{
274 int status
; /* execution status != 0 indicate an error */
275 int determinist
; /* did we found an inderterministic behaviour */
276 xmlRegexpPtr comp
; /* the compiled regexp */
277 xmlRegExecCallbacks callback
;
280 xmlRegStatePtr state
;/* the current state */
281 int transno
; /* the current transition on that state */
282 int transcount
; /* the number of char in char counted transitions */
285 * A stack of rollback states
289 xmlRegExecRollback
*rollbacks
;
292 * The state of the automata if any
303 const xmlChar
*inputString
; /* when operating on characters */
304 xmlRegInputTokenPtr inputStack
;/* when operating on strings */
308 #define REGEXP_ALL_COUNTER 0x123456
309 #define REGEXP_ALL_LAX_COUNTER 0x123457
311 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt
, int top
);
312 static void xmlRegFreeState(xmlRegStatePtr state
);
313 static void xmlRegFreeAtom(xmlRegAtomPtr atom
);
315 /************************************************************************
317 * Allocation/Deallocation *
319 ************************************************************************/
321 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt
);
323 * xmlRegEpxFromParse:
324 * @ctxt: the parser context used to build it
326 * Allocate a new regexp and fill it with the reult from the parser
328 * Returns the new regexp or NULL in case of error
331 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt
) {
334 ret
= (xmlRegexpPtr
) xmlMalloc(sizeof(xmlRegexp
));
337 memset(ret
, 0, sizeof(xmlRegexp
));
338 ret
->string
= ctxt
->string
;
340 ret
->nbStates
= ctxt
->nbStates
;
342 ret
->states
= ctxt
->states
;
344 ret
->nbAtoms
= ctxt
->nbAtoms
;
346 ret
->atoms
= ctxt
->atoms
;
348 ret
->nbCounters
= ctxt
->nbCounters
;
349 ctxt
->nbCounters
= 0;
350 ret
->counters
= ctxt
->counters
;
351 ctxt
->counters
= NULL
;
352 ret
->determinist
= ctxt
->determinist
;
354 if ((ret
->determinist
!= 0) &&
355 (ret
->nbCounters
== 0) &&
356 (ret
->atoms
!= NULL
) &&
357 (ret
->atoms
[0] != NULL
) &&
358 (ret
->atoms
[0]->type
== XML_REGEXP_STRING
)) {
359 int i
, j
, nbstates
= 0, nbatoms
= 0;
368 * Switch to a compact representation
369 * 1/ counting the effective number of states left
370 * 2/ conting the unique number of atoms, and check that
371 * they are all of the string type
372 * 3/ build a table state x atom for the transitions
375 stateRemap
= xmlMalloc(ret
->nbStates
* sizeof(int));
376 for (i
= 0;i
< ret
->nbStates
;i
++) {
377 if (ret
->states
[i
] != NULL
) {
378 stateRemap
[i
] = nbstates
;
384 #ifdef DEBUG_COMPACTION
385 printf("Final: %d states\n", nbstates
);
387 stringMap
= xmlMalloc(ret
->nbAtoms
* sizeof(char *));
388 stringRemap
= xmlMalloc(ret
->nbAtoms
* sizeof(int));
389 for (i
= 0;i
< ret
->nbAtoms
;i
++) {
390 if ((ret
->atoms
[i
]->type
== XML_REGEXP_STRING
) &&
391 (ret
->atoms
[i
]->quant
== XML_REGEXP_QUANT_ONCE
)) {
392 value
= ret
->atoms
[i
]->valuep
;
393 for (j
= 0;j
< nbatoms
;j
++) {
394 if (xmlStrEqual(stringMap
[j
], value
)) {
400 stringRemap
[i
] = nbatoms
;
401 stringMap
[nbatoms
] = xmlStrdup(value
);
406 xmlFree(stringRemap
);
407 for (i
= 0;i
< nbatoms
;i
++)
408 xmlFree(stringMap
[i
]);
413 #ifdef DEBUG_COMPACTION
414 printf("Final: %d atoms\n", nbatoms
);
418 * Allocate the transition table. The first entry for each
419 * state correspond to the state type.
421 transitions
= (int *) xmlMalloc(nbstates
* (nbatoms
+ 1) * sizeof(int));
423 memset(transitions
, 0, nbstates
* (nbatoms
+ 1) * sizeof(int));
425 for (i
= 0;i
< ret
->nbStates
;i
++) {
426 int stateno
, atomno
, targetno
, prev
;
427 xmlRegStatePtr state
;
428 xmlRegTransPtr trans
;
430 stateno
= stateRemap
[i
];
433 state
= ret
->states
[i
];
435 transitions
[stateno
* (nbatoms
+ 1)] = state
->type
;
437 for (j
= 0;j
< state
->nbTrans
;j
++) {
438 trans
= &(state
->trans
[j
]);
439 if ((trans
->to
== -1) || (trans
->atom
== NULL
))
441 atomno
= stringRemap
[trans
->atom
->no
];
442 if ((trans
->atom
->data
!= NULL
) && (transdata
== NULL
)) {
443 transdata
= (void **) xmlMalloc(nbstates
* nbatoms
*
445 if (transdata
!= NULL
)
447 nbstates
* nbatoms
* sizeof(void *));
449 targetno
= stateRemap
[trans
->to
];
451 * if the same atome can generate transition to 2 different
452 * states then it means the automata is not determinist and
453 * the compact form can't be used !
455 prev
= transitions
[stateno
* (nbatoms
+ 1) + atomno
+ 1];
457 if (prev
!= targetno
+ 1) {
458 printf("not determinist\n");
459 ret
->determinist
= 0;
460 #ifdef DEBUG_COMPACTION
461 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
462 i
, j
, trans
->atom
->no
, trans
->to
, atomno
, targetno
);
463 printf(" previous to is %d\n", prev
);
465 ret
->determinist
= 0;
466 if (transdata
!= NULL
)
468 xmlFree(transitions
);
470 xmlFree(stringRemap
);
471 for (i
= 0;i
< nbatoms
;i
++)
472 xmlFree(stringMap
[i
]);
478 printf("State %d trans %d: atom %d to %d : %d to %d\n",
479 i
, j
, trans
->atom
->no
, trans
->to
, atomno
, targetno
);
481 transitions
[stateno
* (nbatoms
+ 1) + atomno
+ 1] =
482 targetno
+ 1; /* to avoid 0 */
483 if (transdata
!= NULL
)
484 transdata
[stateno
* nbatoms
+ atomno
] =
489 ret
->determinist
= 1;
490 #ifdef DEBUG_COMPACTION
494 for (i
= 0;i
< nbstates
;i
++) {
495 for (j
= 0;j
< nbatoms
+ 1;j
++) {
496 printf("%02d ", transitions
[i
* (nbatoms
+ 1) + j
]);
503 * Cleanup of the old data
505 if (ret
->states
!= NULL
) {
506 for (i
= 0;i
< ret
->nbStates
;i
++)
507 xmlRegFreeState(ret
->states
[i
]);
508 xmlFree(ret
->states
);
512 if (ret
->atoms
!= NULL
) {
513 for (i
= 0;i
< ret
->nbAtoms
;i
++)
514 xmlRegFreeAtom(ret
->atoms
[i
]);
520 ret
->compact
= transitions
;
521 ret
->transdata
= transdata
;
522 ret
->stringMap
= stringMap
;
523 ret
->nbstrings
= nbatoms
;
524 ret
->nbstates
= nbstates
;
526 xmlFree(stringRemap
);
533 * xmlRegNewParserCtxt:
534 * @string: the string to parse
536 * Allocate a new regexp parser context
538 * Returns the new context or NULL in case of error
540 static xmlRegParserCtxtPtr
541 xmlRegNewParserCtxt(const xmlChar
*string
) {
542 xmlRegParserCtxtPtr ret
;
544 ret
= (xmlRegParserCtxtPtr
) xmlMalloc(sizeof(xmlRegParserCtxt
));
547 memset(ret
, 0, sizeof(xmlRegParserCtxt
));
549 ret
->string
= xmlStrdup(string
);
550 ret
->cur
= ret
->string
;
553 ret
->determinist
= -1;
559 * @ctxt: the regexp parser context
560 * @neg: is that negative
561 * @type: the type of range
562 * @start: the start codepoint
563 * @end: the end codepoint
565 * Allocate a new regexp range
567 * Returns the new range or NULL in case of error
569 static xmlRegRangePtr
570 xmlRegNewRange(xmlRegParserCtxtPtr ctxt
,
571 int neg
, xmlRegAtomType type
, int start
, int end
) {
574 ret
= (xmlRegRangePtr
) xmlMalloc(sizeof(xmlRegRange
));
576 ERROR("failed to allocate regexp range");
588 * @range: the regexp range
590 * Free a regexp range
593 xmlRegFreeRange(xmlRegRangePtr range
) {
597 if (range
->blockName
!= NULL
)
598 xmlFree(range
->blockName
);
604 * @ctxt: the regexp parser context
605 * @type: the type of atom
607 * Allocate a new regexp range
609 * Returns the new atom or NULL in case of error
612 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt
, xmlRegAtomType type
) {
615 ret
= (xmlRegAtomPtr
) xmlMalloc(sizeof(xmlRegAtom
));
617 ERROR("failed to allocate regexp atom");
620 memset(ret
, 0, sizeof(xmlRegAtom
));
622 ret
->quant
= XML_REGEXP_QUANT_ONCE
;
630 * @atom: the regexp atom
635 xmlRegFreeAtom(xmlRegAtomPtr atom
) {
641 for (i
= 0;i
< atom
->nbRanges
;i
++)
642 xmlRegFreeRange(atom
->ranges
[i
]);
643 if (atom
->ranges
!= NULL
)
644 xmlFree(atom
->ranges
);
645 if (atom
->type
== XML_REGEXP_STRING
)
646 xmlFree(atom
->valuep
);
650 static xmlRegStatePtr
651 xmlRegNewState(xmlRegParserCtxtPtr ctxt
) {
654 ret
= (xmlRegStatePtr
) xmlMalloc(sizeof(xmlRegState
));
656 ERROR("failed to allocate regexp state");
659 memset(ret
, 0, sizeof(xmlRegState
));
660 ret
->type
= XML_REGEXP_TRANS_STATE
;
661 ret
->mark
= XML_REGEXP_MARK_NORMAL
;
667 * @state: the regexp state
669 * Free a regexp state
672 xmlRegFreeState(xmlRegStatePtr state
) {
676 if (state
->trans
!= NULL
)
677 xmlFree(state
->trans
);
682 * xmlRegFreeParserCtxt:
683 * @ctxt: the regexp parser context
685 * Free a regexp parser context
688 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt
) {
693 if (ctxt
->string
!= NULL
)
694 xmlFree(ctxt
->string
);
695 if (ctxt
->states
!= NULL
) {
696 for (i
= 0;i
< ctxt
->nbStates
;i
++)
697 xmlRegFreeState(ctxt
->states
[i
]);
698 xmlFree(ctxt
->states
);
700 if (ctxt
->atoms
!= NULL
) {
701 for (i
= 0;i
< ctxt
->nbAtoms
;i
++)
702 xmlRegFreeAtom(ctxt
->atoms
[i
]);
703 xmlFree(ctxt
->atoms
);
705 if (ctxt
->counters
!= NULL
)
706 xmlFree(ctxt
->counters
);
710 /************************************************************************
712 * Display of Data structures *
714 ************************************************************************/
717 xmlRegPrintAtomType(FILE *output
, xmlRegAtomType type
) {
719 case XML_REGEXP_EPSILON
:
720 fprintf(output
, "epsilon "); break;
721 case XML_REGEXP_CHARVAL
:
722 fprintf(output
, "charval "); break;
723 case XML_REGEXP_RANGES
:
724 fprintf(output
, "ranges "); break;
725 case XML_REGEXP_SUBREG
:
726 fprintf(output
, "subexpr "); break;
727 case XML_REGEXP_STRING
:
728 fprintf(output
, "string "); break;
729 case XML_REGEXP_ANYCHAR
:
730 fprintf(output
, "anychar "); break;
731 case XML_REGEXP_ANYSPACE
:
732 fprintf(output
, "anyspace "); break;
733 case XML_REGEXP_NOTSPACE
:
734 fprintf(output
, "notspace "); break;
735 case XML_REGEXP_INITNAME
:
736 fprintf(output
, "initname "); break;
737 case XML_REGEXP_NOTINITNAME
:
738 fprintf(output
, "notinitname "); break;
739 case XML_REGEXP_NAMECHAR
:
740 fprintf(output
, "namechar "); break;
741 case XML_REGEXP_NOTNAMECHAR
:
742 fprintf(output
, "notnamechar "); break;
743 case XML_REGEXP_DECIMAL
:
744 fprintf(output
, "decimal "); break;
745 case XML_REGEXP_NOTDECIMAL
:
746 fprintf(output
, "notdecimal "); break;
747 case XML_REGEXP_REALCHAR
:
748 fprintf(output
, "realchar "); break;
749 case XML_REGEXP_NOTREALCHAR
:
750 fprintf(output
, "notrealchar "); break;
751 case XML_REGEXP_LETTER
:
752 fprintf(output
, "LETTER "); break;
753 case XML_REGEXP_LETTER_UPPERCASE
:
754 fprintf(output
, "LETTER_UPPERCASE "); break;
755 case XML_REGEXP_LETTER_LOWERCASE
:
756 fprintf(output
, "LETTER_LOWERCASE "); break;
757 case XML_REGEXP_LETTER_TITLECASE
:
758 fprintf(output
, "LETTER_TITLECASE "); break;
759 case XML_REGEXP_LETTER_MODIFIER
:
760 fprintf(output
, "LETTER_MODIFIER "); break;
761 case XML_REGEXP_LETTER_OTHERS
:
762 fprintf(output
, "LETTER_OTHERS "); break;
763 case XML_REGEXP_MARK
:
764 fprintf(output
, "MARK "); break;
765 case XML_REGEXP_MARK_NONSPACING
:
766 fprintf(output
, "MARK_NONSPACING "); break;
767 case XML_REGEXP_MARK_SPACECOMBINING
:
768 fprintf(output
, "MARK_SPACECOMBINING "); break;
769 case XML_REGEXP_MARK_ENCLOSING
:
770 fprintf(output
, "MARK_ENCLOSING "); break;
771 case XML_REGEXP_NUMBER
:
772 fprintf(output
, "NUMBER "); break;
773 case XML_REGEXP_NUMBER_DECIMAL
:
774 fprintf(output
, "NUMBER_DECIMAL "); break;
775 case XML_REGEXP_NUMBER_LETTER
:
776 fprintf(output
, "NUMBER_LETTER "); break;
777 case XML_REGEXP_NUMBER_OTHERS
:
778 fprintf(output
, "NUMBER_OTHERS "); break;
779 case XML_REGEXP_PUNCT
:
780 fprintf(output
, "PUNCT "); break;
781 case XML_REGEXP_PUNCT_CONNECTOR
:
782 fprintf(output
, "PUNCT_CONNECTOR "); break;
783 case XML_REGEXP_PUNCT_DASH
:
784 fprintf(output
, "PUNCT_DASH "); break;
785 case XML_REGEXP_PUNCT_OPEN
:
786 fprintf(output
, "PUNCT_OPEN "); break;
787 case XML_REGEXP_PUNCT_CLOSE
:
788 fprintf(output
, "PUNCT_CLOSE "); break;
789 case XML_REGEXP_PUNCT_INITQUOTE
:
790 fprintf(output
, "PUNCT_INITQUOTE "); break;
791 case XML_REGEXP_PUNCT_FINQUOTE
:
792 fprintf(output
, "PUNCT_FINQUOTE "); break;
793 case XML_REGEXP_PUNCT_OTHERS
:
794 fprintf(output
, "PUNCT_OTHERS "); break;
795 case XML_REGEXP_SEPAR
:
796 fprintf(output
, "SEPAR "); break;
797 case XML_REGEXP_SEPAR_SPACE
:
798 fprintf(output
, "SEPAR_SPACE "); break;
799 case XML_REGEXP_SEPAR_LINE
:
800 fprintf(output
, "SEPAR_LINE "); break;
801 case XML_REGEXP_SEPAR_PARA
:
802 fprintf(output
, "SEPAR_PARA "); break;
803 case XML_REGEXP_SYMBOL
:
804 fprintf(output
, "SYMBOL "); break;
805 case XML_REGEXP_SYMBOL_MATH
:
806 fprintf(output
, "SYMBOL_MATH "); break;
807 case XML_REGEXP_SYMBOL_CURRENCY
:
808 fprintf(output
, "SYMBOL_CURRENCY "); break;
809 case XML_REGEXP_SYMBOL_MODIFIER
:
810 fprintf(output
, "SYMBOL_MODIFIER "); break;
811 case XML_REGEXP_SYMBOL_OTHERS
:
812 fprintf(output
, "SYMBOL_OTHERS "); break;
813 case XML_REGEXP_OTHER
:
814 fprintf(output
, "OTHER "); break;
815 case XML_REGEXP_OTHER_CONTROL
:
816 fprintf(output
, "OTHER_CONTROL "); break;
817 case XML_REGEXP_OTHER_FORMAT
:
818 fprintf(output
, "OTHER_FORMAT "); break;
819 case XML_REGEXP_OTHER_PRIVATE
:
820 fprintf(output
, "OTHER_PRIVATE "); break;
821 case XML_REGEXP_OTHER_NA
:
822 fprintf(output
, "OTHER_NA "); break;
823 case XML_REGEXP_BLOCK_NAME
:
824 fprintf(output
, "BLOCK "); break;
829 xmlRegPrintQuantType(FILE *output
, xmlRegQuantType type
) {
831 case XML_REGEXP_QUANT_EPSILON
:
832 fprintf(output
, "epsilon "); break;
833 case XML_REGEXP_QUANT_ONCE
:
834 fprintf(output
, "once "); break;
835 case XML_REGEXP_QUANT_OPT
:
836 fprintf(output
, "? "); break;
837 case XML_REGEXP_QUANT_MULT
:
838 fprintf(output
, "* "); break;
839 case XML_REGEXP_QUANT_PLUS
:
840 fprintf(output
, "+ "); break;
841 case XML_REGEXP_QUANT_RANGE
:
842 fprintf(output
, "range "); break;
843 case XML_REGEXP_QUANT_ONCEONLY
:
844 fprintf(output
, "onceonly "); break;
845 case XML_REGEXP_QUANT_ALL
:
846 fprintf(output
, "all "); break;
850 xmlRegPrintRange(FILE *output
, xmlRegRangePtr range
) {
851 fprintf(output
, " range: ");
853 fprintf(output
, "negative ");
854 xmlRegPrintAtomType(output
, range
->type
);
855 fprintf(output
, "%c - %c\n", range
->start
, range
->end
);
859 xmlRegPrintAtom(FILE *output
, xmlRegAtomPtr atom
) {
860 fprintf(output
, " atom: ");
862 fprintf(output
, "NULL\n");
865 xmlRegPrintAtomType(output
, atom
->type
);
866 xmlRegPrintQuantType(output
, atom
->quant
);
867 if (atom
->quant
== XML_REGEXP_QUANT_RANGE
)
868 fprintf(output
, "%d-%d ", atom
->min
, atom
->max
);
869 if (atom
->type
== XML_REGEXP_STRING
)
870 fprintf(output
, "'%s' ", (char *) atom
->valuep
);
871 if (atom
->type
== XML_REGEXP_CHARVAL
)
872 fprintf(output
, "char %c\n", atom
->codepoint
);
873 else if (atom
->type
== XML_REGEXP_RANGES
) {
875 fprintf(output
, "%d entries\n", atom
->nbRanges
);
876 for (i
= 0; i
< atom
->nbRanges
;i
++)
877 xmlRegPrintRange(output
, atom
->ranges
[i
]);
878 } else if (atom
->type
== XML_REGEXP_SUBREG
) {
879 fprintf(output
, "start %d end %d\n", atom
->start
->no
, atom
->stop
->no
);
881 fprintf(output
, "\n");
886 xmlRegPrintTrans(FILE *output
, xmlRegTransPtr trans
) {
887 fprintf(output
, " trans: ");
889 fprintf(output
, "NULL\n");
893 fprintf(output
, "removed\n");
896 if (trans
->counter
>= 0) {
897 fprintf(output
, "counted %d, ", trans
->counter
);
899 if (trans
->count
== REGEXP_ALL_COUNTER
) {
900 fprintf(output
, "all transition, ");
901 } else if (trans
->count
>= 0) {
902 fprintf(output
, "count based %d, ", trans
->count
);
904 if (trans
->atom
== NULL
) {
905 fprintf(output
, "epsilon to %d\n", trans
->to
);
908 if (trans
->atom
->type
== XML_REGEXP_CHARVAL
)
909 fprintf(output
, "char %c ", trans
->atom
->codepoint
);
910 fprintf(output
, "atom %d, to %d\n", trans
->atom
->no
, trans
->to
);
914 xmlRegPrintState(FILE *output
, xmlRegStatePtr state
) {
917 fprintf(output
, " state: ");
919 fprintf(output
, "NULL\n");
922 if (state
->type
== XML_REGEXP_START_STATE
)
923 fprintf(output
, "START ");
924 if (state
->type
== XML_REGEXP_FINAL_STATE
)
925 fprintf(output
, "FINAL ");
927 fprintf(output
, "%d, %d transitions:\n", state
->no
, state
->nbTrans
);
928 for (i
= 0;i
< state
->nbTrans
; i
++) {
929 xmlRegPrintTrans(output
, &(state
->trans
[i
]));
933 #ifdef DEBUG_REGEXP_GRAPH
935 xmlRegPrintCtxt(FILE *output
, xmlRegParserCtxtPtr ctxt
) {
938 fprintf(output
, " ctxt: ");
940 fprintf(output
, "NULL\n");
943 fprintf(output
, "'%s' ", ctxt
->string
);
945 fprintf(output
, "error ");
947 fprintf(output
, "neg ");
948 fprintf(output
, "\n");
949 fprintf(output
, "%d atoms:\n", ctxt
->nbAtoms
);
950 for (i
= 0;i
< ctxt
->nbAtoms
; i
++) {
951 fprintf(output
, " %02d ", i
);
952 xmlRegPrintAtom(output
, ctxt
->atoms
[i
]);
954 if (ctxt
->atom
!= NULL
) {
955 fprintf(output
, "current atom:\n");
956 xmlRegPrintAtom(output
, ctxt
->atom
);
958 fprintf(output
, "%d states:", ctxt
->nbStates
);
959 if (ctxt
->start
!= NULL
)
960 fprintf(output
, " start: %d", ctxt
->start
->no
);
961 if (ctxt
->end
!= NULL
)
962 fprintf(output
, " end: %d", ctxt
->end
->no
);
963 fprintf(output
, "\n");
964 for (i
= 0;i
< ctxt
->nbStates
; i
++) {
965 xmlRegPrintState(output
, ctxt
->states
[i
]);
967 fprintf(output
, "%d counters:\n", ctxt
->nbCounters
);
968 for (i
= 0;i
< ctxt
->nbCounters
; i
++) {
969 fprintf(output
, " %d: min %d max %d\n", i
, ctxt
->counters
[i
].min
,
970 ctxt
->counters
[i
].max
);
975 /************************************************************************
977 * Finite Automata structures manipulations *
979 ************************************************************************/
982 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt
, xmlRegAtomPtr atom
,
983 int neg
, xmlRegAtomType type
, int start
, int end
,
984 xmlChar
*blockName
) {
985 xmlRegRangePtr range
;
988 ERROR("add range: atom is NULL");
991 if (atom
->type
!= XML_REGEXP_RANGES
) {
992 ERROR("add range: atom is not ranges");
995 if (atom
->maxRanges
== 0) {
997 atom
->ranges
= (xmlRegRangePtr
*) xmlMalloc(atom
->maxRanges
*
998 sizeof(xmlRegRangePtr
));
999 if (atom
->ranges
== NULL
) {
1000 ERROR("add range: allocation failed");
1001 atom
->maxRanges
= 0;
1004 } else if (atom
->nbRanges
>= atom
->maxRanges
) {
1005 xmlRegRangePtr
*tmp
;
1006 atom
->maxRanges
*= 2;
1007 tmp
= (xmlRegRangePtr
*) xmlRealloc(atom
->ranges
, atom
->maxRanges
*
1008 sizeof(xmlRegRangePtr
));
1010 ERROR("add range: allocation failed");
1011 atom
->maxRanges
/= 2;
1016 range
= xmlRegNewRange(ctxt
, neg
, type
, start
, end
);
1019 range
->blockName
= blockName
;
1020 atom
->ranges
[atom
->nbRanges
++] = range
;
1025 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt
) {
1026 if (ctxt
->maxCounters
== 0) {
1027 ctxt
->maxCounters
= 4;
1028 ctxt
->counters
= (xmlRegCounter
*) xmlMalloc(ctxt
->maxCounters
*
1029 sizeof(xmlRegCounter
));
1030 if (ctxt
->counters
== NULL
) {
1031 ERROR("reg counter: allocation failed");
1032 ctxt
->maxCounters
= 0;
1035 } else if (ctxt
->nbCounters
>= ctxt
->maxCounters
) {
1037 ctxt
->maxCounters
*= 2;
1038 tmp
= (xmlRegCounter
*) xmlRealloc(ctxt
->counters
, ctxt
->maxCounters
*
1039 sizeof(xmlRegCounter
));
1041 ERROR("reg counter: allocation failed");
1042 ctxt
->maxCounters
/= 2;
1045 ctxt
->counters
= tmp
;
1047 ctxt
->counters
[ctxt
->nbCounters
].min
= -1;
1048 ctxt
->counters
[ctxt
->nbCounters
].max
= -1;
1049 return(ctxt
->nbCounters
++);
1053 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt
, xmlRegAtomPtr atom
) {
1055 ERROR("atom push: atom is NULL");
1058 if (ctxt
->maxAtoms
== 0) {
1060 ctxt
->atoms
= (xmlRegAtomPtr
*) xmlMalloc(ctxt
->maxAtoms
*
1061 sizeof(xmlRegAtomPtr
));
1062 if (ctxt
->atoms
== NULL
) {
1063 ERROR("atom push: allocation failed");
1067 } else if (ctxt
->nbAtoms
>= ctxt
->maxAtoms
) {
1069 ctxt
->maxAtoms
*= 2;
1070 tmp
= (xmlRegAtomPtr
*) xmlRealloc(ctxt
->atoms
, ctxt
->maxAtoms
*
1071 sizeof(xmlRegAtomPtr
));
1073 ERROR("atom push: allocation failed");
1074 ctxt
->maxAtoms
/= 2;
1079 atom
->no
= ctxt
->nbAtoms
;
1080 ctxt
->atoms
[ctxt
->nbAtoms
++] = atom
;
1084 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt
, xmlRegStatePtr state
,
1085 xmlRegAtomPtr atom
, xmlRegStatePtr target
,
1086 int counter
, int count
) {
1087 if (state
== NULL
) {
1088 ERROR("add state: state is NULL");
1091 if (target
== NULL
) {
1092 ERROR("add state: target is NULL");
1095 if (state
->maxTrans
== 0) {
1096 state
->maxTrans
= 4;
1097 state
->trans
= (xmlRegTrans
*) xmlMalloc(state
->maxTrans
*
1098 sizeof(xmlRegTrans
));
1099 if (state
->trans
== NULL
) {
1100 ERROR("add range: allocation failed");
1101 state
->maxTrans
= 0;
1104 } else if (state
->nbTrans
>= state
->maxTrans
) {
1106 state
->maxTrans
*= 2;
1107 tmp
= (xmlRegTrans
*) xmlRealloc(state
->trans
, state
->maxTrans
*
1108 sizeof(xmlRegTrans
));
1110 ERROR("add range: allocation failed");
1111 state
->maxTrans
/= 2;
1116 #ifdef DEBUG_REGEXP_GRAPH
1117 printf("Add trans from %d to %d ", state
->no
, target
->no
);
1118 if (count
== REGEXP_ALL_COUNTER
)
1119 printf("all transition");
1120 else if (count
>= 0)
1121 printf("count based %d", count
);
1122 else if (counter
>= 0)
1123 printf("counted %d", counter
);
1124 else if (atom
== NULL
)
1125 printf("epsilon transition");
1129 state
->trans
[state
->nbTrans
].atom
= atom
;
1130 state
->trans
[state
->nbTrans
].to
= target
->no
;
1131 state
->trans
[state
->nbTrans
].counter
= counter
;
1132 state
->trans
[state
->nbTrans
].count
= count
;
1137 xmlRegStatePush(xmlRegParserCtxtPtr ctxt
, xmlRegStatePtr state
) {
1138 if (ctxt
->maxStates
== 0) {
1139 ctxt
->maxStates
= 4;
1140 ctxt
->states
= (xmlRegStatePtr
*) xmlMalloc(ctxt
->maxStates
*
1141 sizeof(xmlRegStatePtr
));
1142 if (ctxt
->states
== NULL
) {
1143 ERROR("add range: allocation failed");
1144 ctxt
->maxStates
= 0;
1147 } else if (ctxt
->nbStates
>= ctxt
->maxStates
) {
1148 xmlRegStatePtr
*tmp
;
1149 ctxt
->maxStates
*= 2;
1150 tmp
= (xmlRegStatePtr
*) xmlRealloc(ctxt
->states
, ctxt
->maxStates
*
1151 sizeof(xmlRegStatePtr
));
1153 ERROR("add range: allocation failed");
1154 ctxt
->maxStates
/= 2;
1159 state
->no
= ctxt
->nbStates
;
1160 ctxt
->states
[ctxt
->nbStates
++] = state
;
1164 * xmlFAGenerateAllTransition:
1165 * @ctxt: a regexp parser context
1166 * @from: the from state
1167 * @to: the target state or NULL for building a new one
1172 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt
,
1173 xmlRegStatePtr from
, xmlRegStatePtr to
,
1176 to
= xmlRegNewState(ctxt
);
1177 xmlRegStatePush(ctxt
, to
);
1181 xmlRegStateAddTrans(ctxt
, from
, NULL
, to
, -1, REGEXP_ALL_LAX_COUNTER
);
1183 xmlRegStateAddTrans(ctxt
, from
, NULL
, to
, -1, REGEXP_ALL_COUNTER
);
1187 * xmlFAGenerateEpsilonTransition:
1188 * @ctxt: a regexp parser context
1189 * @from: the from state
1190 * @to: the target state or NULL for building a new one
1194 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt
,
1195 xmlRegStatePtr from
, xmlRegStatePtr to
) {
1197 to
= xmlRegNewState(ctxt
);
1198 xmlRegStatePush(ctxt
, to
);
1201 xmlRegStateAddTrans(ctxt
, from
, NULL
, to
, -1, -1);
1205 * xmlFAGenerateCountedEpsilonTransition:
1206 * @ctxt: a regexp parser context
1207 * @from: the from state
1208 * @to: the target state or NULL for building a new one
1209 * counter: the counter for that transition
1213 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt
,
1214 xmlRegStatePtr from
, xmlRegStatePtr to
, int counter
) {
1216 to
= xmlRegNewState(ctxt
);
1217 xmlRegStatePush(ctxt
, to
);
1220 xmlRegStateAddTrans(ctxt
, from
, NULL
, to
, counter
, -1);
1224 * xmlFAGenerateCountedTransition:
1225 * @ctxt: a regexp parser context
1226 * @from: the from state
1227 * @to: the target state or NULL for building a new one
1228 * counter: the counter for that transition
1232 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt
,
1233 xmlRegStatePtr from
, xmlRegStatePtr to
, int counter
) {
1235 to
= xmlRegNewState(ctxt
);
1236 xmlRegStatePush(ctxt
, to
);
1239 xmlRegStateAddTrans(ctxt
, from
, NULL
, to
, -1, counter
);
1243 * xmlFAGenerateTransitions:
1244 * @ctxt: a regexp parser context
1245 * @from: the from state
1246 * @to: the target state or NULL for building a new one
1247 * @atom: the atom generating the transition
1251 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt
, xmlRegStatePtr from
,
1252 xmlRegStatePtr to
, xmlRegAtomPtr atom
) {
1254 ERROR("genrate transition: atom == NULL");
1257 if (atom
->type
== XML_REGEXP_SUBREG
) {
1259 * this is a subexpression handling one should not need to
1260 * create a new node excep for XML_REGEXP_QUANT_RANGE.
1262 xmlRegAtomPush(ctxt
, atom
);
1263 if ((to
!= NULL
) && (atom
->stop
!= to
) &&
1264 (atom
->quant
!= XML_REGEXP_QUANT_RANGE
)) {
1266 * Generate an epsilon transition to link to the target
1268 xmlFAGenerateEpsilonTransition(ctxt
, atom
->stop
, to
);
1270 switch (atom
->quant
) {
1271 case XML_REGEXP_QUANT_OPT
:
1272 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1273 xmlFAGenerateEpsilonTransition(ctxt
, atom
->start
, atom
->stop
);
1275 case XML_REGEXP_QUANT_MULT
:
1276 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1277 xmlFAGenerateEpsilonTransition(ctxt
, atom
->start
, atom
->stop
);
1278 xmlFAGenerateEpsilonTransition(ctxt
, atom
->stop
, atom
->start
);
1280 case XML_REGEXP_QUANT_PLUS
:
1281 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1282 xmlFAGenerateEpsilonTransition(ctxt
, atom
->stop
, atom
->start
);
1284 case XML_REGEXP_QUANT_RANGE
: {
1286 xmlRegStatePtr newstate
;
1289 * This one is nasty:
1290 * 1/ register a new counter
1291 * 2/ register an epsilon transition associated to
1292 * this counter going from atom->stop to atom->start
1293 * 3/ create a new state
1294 * 4/ generate a counted transition from atom->stop to
1297 counter
= xmlRegGetCounter(ctxt
);
1298 ctxt
->counters
[counter
].min
= atom
->min
- 1;
1299 ctxt
->counters
[counter
].max
= atom
->max
- 1;
1302 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1303 xmlFAGenerateCountedEpsilonTransition(ctxt
, atom
->stop
,
1304 atom
->start
, counter
);
1308 newstate
= xmlRegNewState(ctxt
);
1309 xmlRegStatePush(ctxt
, newstate
);
1310 ctxt
->state
= newstate
;
1312 xmlFAGenerateCountedTransition(ctxt
, atom
->stop
,
1321 to
= xmlRegNewState(ctxt
);
1322 xmlRegStatePush(ctxt
, to
);
1324 xmlRegStateAddTrans(ctxt
, from
, atom
, to
, -1, -1);
1325 xmlRegAtomPush(ctxt
, atom
);
1328 switch (atom
->quant
) {
1329 case XML_REGEXP_QUANT_OPT
:
1330 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1331 xmlFAGenerateEpsilonTransition(ctxt
, from
, to
);
1333 case XML_REGEXP_QUANT_MULT
:
1334 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1335 xmlFAGenerateEpsilonTransition(ctxt
, from
, to
);
1336 xmlRegStateAddTrans(ctxt
, to
, atom
, to
, -1, -1);
1338 case XML_REGEXP_QUANT_PLUS
:
1339 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1340 xmlRegStateAddTrans(ctxt
, to
, atom
, to
, -1, -1);
1348 * xmlFAReduceEpsilonTransitions:
1349 * @ctxt: a regexp parser context
1350 * @fromnr: the from state
1351 * @tonr: the to state
1352 * @cpunter: should that transition be associted to a counted
1356 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt
, int fromnr
,
1357 int tonr
, int counter
) {
1359 xmlRegStatePtr from
;
1362 #ifdef DEBUG_REGEXP_GRAPH
1363 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr
, tonr
);
1365 from
= ctxt
->states
[fromnr
];
1368 to
= ctxt
->states
[tonr
];
1371 if ((to
->mark
== XML_REGEXP_MARK_START
) ||
1372 (to
->mark
== XML_REGEXP_MARK_VISITED
))
1375 to
->mark
= XML_REGEXP_MARK_VISITED
;
1376 if (to
->type
== XML_REGEXP_FINAL_STATE
) {
1377 #ifdef DEBUG_REGEXP_GRAPH
1378 printf("State %d is final, so %d becomes final\n", tonr
, fromnr
);
1380 from
->type
= XML_REGEXP_FINAL_STATE
;
1382 for (transnr
= 0;transnr
< to
->nbTrans
;transnr
++) {
1383 if (to
->trans
[transnr
].atom
== NULL
) {
1385 * Don't remove counted transitions
1388 if (to
->trans
[transnr
].to
!= fromnr
) {
1389 if (to
->trans
[transnr
].count
>= 0) {
1390 int newto
= to
->trans
[transnr
].to
;
1392 xmlRegStateAddTrans(ctxt
, from
, NULL
,
1393 ctxt
->states
[newto
],
1394 -1, to
->trans
[transnr
].count
);
1396 #ifdef DEBUG_REGEXP_GRAPH
1397 printf("Found epsilon trans %d from %d to %d\n",
1398 transnr
, tonr
, to
->trans
[transnr
].to
);
1400 if (to
->trans
[transnr
].counter
>= 0) {
1401 xmlFAReduceEpsilonTransitions(ctxt
, fromnr
,
1402 to
->trans
[transnr
].to
,
1403 to
->trans
[transnr
].counter
);
1405 xmlFAReduceEpsilonTransitions(ctxt
, fromnr
,
1406 to
->trans
[transnr
].to
,
1412 int newto
= to
->trans
[transnr
].to
;
1414 if (to
->trans
[transnr
].counter
>= 0) {
1415 xmlRegStateAddTrans(ctxt
, from
, to
->trans
[transnr
].atom
,
1416 ctxt
->states
[newto
],
1417 to
->trans
[transnr
].counter
, -1);
1419 xmlRegStateAddTrans(ctxt
, from
, to
->trans
[transnr
].atom
,
1420 ctxt
->states
[newto
], counter
, -1);
1424 to
->mark
= XML_REGEXP_MARK_NORMAL
;
1428 * xmlFAEliminateEpsilonTransitions:
1429 * @ctxt: a regexp parser context
1433 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt
) {
1434 int statenr
, transnr
;
1435 xmlRegStatePtr state
;
1438 * build the completed transitions bypassing the epsilons
1439 * Use a marking algorithm to avoid loops
1441 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
1442 state
= ctxt
->states
[statenr
];
1445 for (transnr
= 0;transnr
< state
->nbTrans
;transnr
++) {
1446 if ((state
->trans
[transnr
].atom
== NULL
) &&
1447 (state
->trans
[transnr
].to
>= 0)) {
1448 if (state
->trans
[transnr
].to
== statenr
) {
1449 state
->trans
[transnr
].to
= -1;
1450 #ifdef DEBUG_REGEXP_GRAPH
1451 printf("Removed loopback epsilon trans %d on %d\n",
1454 } else if (state
->trans
[transnr
].count
< 0) {
1455 int newto
= state
->trans
[transnr
].to
;
1457 #ifdef DEBUG_REGEXP_GRAPH
1458 printf("Found epsilon trans %d from %d to %d\n",
1459 transnr
, statenr
, newto
);
1461 state
->mark
= XML_REGEXP_MARK_START
;
1462 xmlFAReduceEpsilonTransitions(ctxt
, statenr
,
1463 newto
, state
->trans
[transnr
].counter
);
1464 state
->mark
= XML_REGEXP_MARK_NORMAL
;
1465 #ifdef DEBUG_REGEXP_GRAPH
1467 printf("Found counted transition %d on %d\n",
1475 * Eliminate the epsilon transitions
1477 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
1478 state
= ctxt
->states
[statenr
];
1481 for (transnr
= 0;transnr
< state
->nbTrans
;transnr
++) {
1482 if ((state
->trans
[transnr
].atom
== NULL
) &&
1483 (state
->trans
[transnr
].count
< 0) &&
1484 (state
->trans
[transnr
].to
>= 0)) {
1485 state
->trans
[transnr
].to
= -1;
1491 * Use this pass to detect unreachable states too
1493 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
1494 state
= ctxt
->states
[statenr
];
1498 state
= ctxt
->states
[0];
1501 while (state
!= NULL
) {
1502 xmlRegStatePtr target
= NULL
;
1505 * Mark all state reachable from the current reachable state
1507 for (transnr
= 0;transnr
< state
->nbTrans
;transnr
++) {
1508 if ((state
->trans
[transnr
].to
>= 0) &&
1509 ((state
->trans
[transnr
].atom
!= NULL
) ||
1510 (state
->trans
[transnr
].count
>= 0))) {
1511 int newto
= state
->trans
[transnr
].to
;
1513 if (ctxt
->states
[newto
] == NULL
)
1515 if (ctxt
->states
[newto
]->reached
== 0) {
1516 ctxt
->states
[newto
]->reached
= 1;
1517 target
= ctxt
->states
[newto
];
1522 * find the next accessible state not explored
1524 if (target
== NULL
) {
1525 for (statenr
= 1;statenr
< ctxt
->nbStates
;statenr
++) {
1526 state
= ctxt
->states
[statenr
];
1527 if ((state
!= NULL
) && (state
->reached
== 1)) {
1535 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
1536 state
= ctxt
->states
[statenr
];
1537 if ((state
!= NULL
) && (state
->reached
== 0)) {
1538 #ifdef DEBUG_REGEXP_GRAPH
1539 printf("Removed unreachable state %d\n", statenr
);
1541 xmlRegFreeState(state
);
1542 ctxt
->states
[statenr
] = NULL
;
1549 * xmlFACompareAtoms:
1553 * Compares two atoms to check whether they are equivatents
1555 * Returns 1 if yes and 0 otherwise
1558 xmlFACompareAtoms(xmlRegAtomPtr atom1
, xmlRegAtomPtr atom2
) {
1561 if ((atom1
== NULL
) || (atom2
== NULL
))
1564 if (atom1
->type
!= atom2
->type
)
1566 switch (atom1
->type
) {
1567 case XML_REGEXP_STRING
:
1568 return(xmlStrEqual((xmlChar
*)atom1
->valuep
,
1569 (xmlChar
*)atom2
->valuep
));
1570 case XML_REGEXP_EPSILON
:
1572 case XML_REGEXP_CHARVAL
:
1573 return(atom1
->codepoint
== atom2
->codepoint
);
1574 case XML_REGEXP_RANGES
:
1584 * xmlFARecurseDeterminism:
1585 * @ctxt: a regexp parser context
1587 * Check whether the associated regexp is determinist,
1588 * should be called after xmlFAEliminateEpsilonTransitions()
1592 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt
, xmlRegStatePtr state
,
1593 int to
, xmlRegAtomPtr atom
) {
1600 for (transnr
= 0;transnr
< state
->nbTrans
;transnr
++) {
1601 t1
= &(state
->trans
[transnr
]);
1603 * check transitions conflicting with the one looked at
1605 if (t1
->atom
== NULL
) {
1608 ret
= xmlFARecurseDeterminism(ctxt
, ctxt
->states
[t1
->to
],
1616 if (xmlFACompareAtoms(t1
->atom
, atom
))
1623 * xmlFAComputesDeterminism:
1624 * @ctxt: a regexp parser context
1626 * Check whether the associated regexp is determinist,
1627 * should be called after xmlFAEliminateEpsilonTransitions()
1631 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt
) {
1632 int statenr
, transnr
;
1633 xmlRegStatePtr state
;
1634 xmlRegTransPtr t1
, t2
;
1638 #ifdef DEBUG_REGEXP_GRAPH
1639 printf("xmlFAComputesDeterminism\n");
1640 xmlRegPrintCtxt(stdout
, ctxt
);
1642 if (ctxt
->determinist
!= -1)
1643 return(ctxt
->determinist
);
1646 * Check for all states that there isn't 2 transitions
1647 * with the same atom and a different target.
1649 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
1650 state
= ctxt
->states
[statenr
];
1653 for (transnr
= 0;transnr
< state
->nbTrans
;transnr
++) {
1654 t1
= &(state
->trans
[transnr
]);
1656 * Determinism checks in case of counted or all transitions
1657 * will have to be handled separately
1659 if (t1
->atom
== NULL
)
1661 if (t1
->to
== -1) /* eliminated */
1663 for (i
= 0;i
< transnr
;i
++) {
1664 t2
= &(state
->trans
[i
]);
1665 if (t2
->to
== -1) /* eliminated */
1667 if (t2
->atom
!= NULL
) {
1668 if (t1
->to
== t2
->to
) {
1669 if (xmlFACompareAtoms(t1
->atom
, t2
->atom
))
1670 t2
->to
= -1; /* eliminate */
1672 /* not determinist ! */
1673 if (xmlFACompareAtoms(t1
->atom
, t2
->atom
))
1676 } else if (t1
->to
!= -1) {
1678 * do the closure in case of remaining specific
1679 * epsilon transitions like choices or all
1681 ret
= xmlFARecurseDeterminism(ctxt
, ctxt
->states
[t1
->to
],
1693 ctxt
->determinist
= ret
;
1697 /************************************************************************
1699 * Routines to check input against transition atoms *
1701 ************************************************************************/
1704 xmlRegCheckCharacterRange(xmlRegAtomType type
, int codepoint
, int neg
,
1705 int start
, int end
, const xmlChar
*blockName
) {
1709 case XML_REGEXP_STRING
:
1710 case XML_REGEXP_SUBREG
:
1711 case XML_REGEXP_RANGES
:
1712 case XML_REGEXP_EPSILON
:
1714 case XML_REGEXP_ANYCHAR
:
1715 ret
= ((codepoint
!= '\n') && (codepoint
!= '\r'));
1717 case XML_REGEXP_CHARVAL
:
1718 ret
= ((codepoint
>= start
) && (codepoint
<= end
));
1720 case XML_REGEXP_NOTSPACE
:
1722 case XML_REGEXP_ANYSPACE
:
1723 ret
= ((codepoint
== '\n') || (codepoint
== '\r') ||
1724 (codepoint
== '\t') || (codepoint
== ' '));
1726 case XML_REGEXP_NOTINITNAME
:
1728 case XML_REGEXP_INITNAME
:
1729 ret
= (xmlIsLetter(codepoint
) ||
1730 (codepoint
== '_') || (codepoint
== ':'));
1732 case XML_REGEXP_NOTNAMECHAR
:
1734 case XML_REGEXP_NAMECHAR
:
1735 ret
= (xmlIsLetter(codepoint
) || xmlIsDigit(codepoint
) ||
1736 (codepoint
== '.') || (codepoint
== '-') ||
1737 (codepoint
== '_') || (codepoint
== ':') ||
1738 xmlIsCombining(codepoint
) || xmlIsExtender(codepoint
));
1740 case XML_REGEXP_NOTDECIMAL
:
1742 case XML_REGEXP_DECIMAL
:
1743 ret
= xmlUCSIsCatNd(codepoint
);
1745 case XML_REGEXP_REALCHAR
:
1747 case XML_REGEXP_NOTREALCHAR
:
1748 ret
= xmlUCSIsCatP(codepoint
);
1750 ret
= xmlUCSIsCatZ(codepoint
);
1752 ret
= xmlUCSIsCatC(codepoint
);
1754 case XML_REGEXP_LETTER
:
1755 ret
= xmlUCSIsCatL(codepoint
);
1757 case XML_REGEXP_LETTER_UPPERCASE
:
1758 ret
= xmlUCSIsCatLu(codepoint
);
1760 case XML_REGEXP_LETTER_LOWERCASE
:
1761 ret
= xmlUCSIsCatLl(codepoint
);
1763 case XML_REGEXP_LETTER_TITLECASE
:
1764 ret
= xmlUCSIsCatLt(codepoint
);
1766 case XML_REGEXP_LETTER_MODIFIER
:
1767 ret
= xmlUCSIsCatLm(codepoint
);
1769 case XML_REGEXP_LETTER_OTHERS
:
1770 ret
= xmlUCSIsCatLo(codepoint
);
1772 case XML_REGEXP_MARK
:
1773 ret
= xmlUCSIsCatM(codepoint
);
1775 case XML_REGEXP_MARK_NONSPACING
:
1776 ret
= xmlUCSIsCatMn(codepoint
);
1778 case XML_REGEXP_MARK_SPACECOMBINING
:
1779 ret
= xmlUCSIsCatMc(codepoint
);
1781 case XML_REGEXP_MARK_ENCLOSING
:
1782 ret
= xmlUCSIsCatMe(codepoint
);
1784 case XML_REGEXP_NUMBER
:
1785 ret
= xmlUCSIsCatN(codepoint
);
1787 case XML_REGEXP_NUMBER_DECIMAL
:
1788 ret
= xmlUCSIsCatNd(codepoint
);
1790 case XML_REGEXP_NUMBER_LETTER
:
1791 ret
= xmlUCSIsCatNl(codepoint
);
1793 case XML_REGEXP_NUMBER_OTHERS
:
1794 ret
= xmlUCSIsCatNo(codepoint
);
1796 case XML_REGEXP_PUNCT
:
1797 ret
= xmlUCSIsCatP(codepoint
);
1799 case XML_REGEXP_PUNCT_CONNECTOR
:
1800 ret
= xmlUCSIsCatPc(codepoint
);
1802 case XML_REGEXP_PUNCT_DASH
:
1803 ret
= xmlUCSIsCatPd(codepoint
);
1805 case XML_REGEXP_PUNCT_OPEN
:
1806 ret
= xmlUCSIsCatPs(codepoint
);
1808 case XML_REGEXP_PUNCT_CLOSE
:
1809 ret
= xmlUCSIsCatPe(codepoint
);
1811 case XML_REGEXP_PUNCT_INITQUOTE
:
1812 ret
= xmlUCSIsCatPi(codepoint
);
1814 case XML_REGEXP_PUNCT_FINQUOTE
:
1815 ret
= xmlUCSIsCatPf(codepoint
);
1817 case XML_REGEXP_PUNCT_OTHERS
:
1818 ret
= xmlUCSIsCatPo(codepoint
);
1820 case XML_REGEXP_SEPAR
:
1821 ret
= xmlUCSIsCatZ(codepoint
);
1823 case XML_REGEXP_SEPAR_SPACE
:
1824 ret
= xmlUCSIsCatZs(codepoint
);
1826 case XML_REGEXP_SEPAR_LINE
:
1827 ret
= xmlUCSIsCatZl(codepoint
);
1829 case XML_REGEXP_SEPAR_PARA
:
1830 ret
= xmlUCSIsCatZp(codepoint
);
1832 case XML_REGEXP_SYMBOL
:
1833 ret
= xmlUCSIsCatS(codepoint
);
1835 case XML_REGEXP_SYMBOL_MATH
:
1836 ret
= xmlUCSIsCatSm(codepoint
);
1838 case XML_REGEXP_SYMBOL_CURRENCY
:
1839 ret
= xmlUCSIsCatSc(codepoint
);
1841 case XML_REGEXP_SYMBOL_MODIFIER
:
1842 ret
= xmlUCSIsCatSk(codepoint
);
1844 case XML_REGEXP_SYMBOL_OTHERS
:
1845 ret
= xmlUCSIsCatSo(codepoint
);
1847 case XML_REGEXP_OTHER
:
1848 ret
= xmlUCSIsCatC(codepoint
);
1850 case XML_REGEXP_OTHER_CONTROL
:
1851 ret
= xmlUCSIsCatCc(codepoint
);
1853 case XML_REGEXP_OTHER_FORMAT
:
1854 ret
= xmlUCSIsCatCf(codepoint
);
1856 case XML_REGEXP_OTHER_PRIVATE
:
1857 ret
= xmlUCSIsCatCo(codepoint
);
1859 case XML_REGEXP_OTHER_NA
:
1860 /* ret = xmlUCSIsCatCn(codepoint); */
1861 /* Seems it doesn't exist anymore in recent Unicode releases */
1864 case XML_REGEXP_BLOCK_NAME
:
1865 ret
= xmlUCSIsBlock(codepoint
, (const char *) blockName
);
1874 xmlRegCheckCharacter(xmlRegAtomPtr atom
, int codepoint
) {
1876 xmlRegRangePtr range
;
1878 if ((atom
== NULL
) || (!xmlIsChar(codepoint
)))
1881 switch (atom
->type
) {
1882 case XML_REGEXP_SUBREG
:
1883 case XML_REGEXP_EPSILON
:
1885 case XML_REGEXP_CHARVAL
:
1886 return(codepoint
== atom
->codepoint
);
1887 case XML_REGEXP_RANGES
: {
1889 for (i
= 0;i
< atom
->nbRanges
;i
++) {
1890 range
= atom
->ranges
[i
];
1892 ret
= xmlRegCheckCharacterRange(range
->type
, codepoint
,
1893 0, range
->start
, range
->end
,
1896 return(0); /* excluded char */
1898 ret
= xmlRegCheckCharacterRange(range
->type
, codepoint
,
1899 0, range
->start
, range
->end
,
1902 accept
= 1; /* might still be excluded */
1907 case XML_REGEXP_STRING
:
1908 printf("TODO: XML_REGEXP_STRING\n");
1910 case XML_REGEXP_ANYCHAR
:
1911 case XML_REGEXP_ANYSPACE
:
1912 case XML_REGEXP_NOTSPACE
:
1913 case XML_REGEXP_INITNAME
:
1914 case XML_REGEXP_NOTINITNAME
:
1915 case XML_REGEXP_NAMECHAR
:
1916 case XML_REGEXP_NOTNAMECHAR
:
1917 case XML_REGEXP_DECIMAL
:
1918 case XML_REGEXP_NOTDECIMAL
:
1919 case XML_REGEXP_REALCHAR
:
1920 case XML_REGEXP_NOTREALCHAR
:
1921 case XML_REGEXP_LETTER
:
1922 case XML_REGEXP_LETTER_UPPERCASE
:
1923 case XML_REGEXP_LETTER_LOWERCASE
:
1924 case XML_REGEXP_LETTER_TITLECASE
:
1925 case XML_REGEXP_LETTER_MODIFIER
:
1926 case XML_REGEXP_LETTER_OTHERS
:
1927 case XML_REGEXP_MARK
:
1928 case XML_REGEXP_MARK_NONSPACING
:
1929 case XML_REGEXP_MARK_SPACECOMBINING
:
1930 case XML_REGEXP_MARK_ENCLOSING
:
1931 case XML_REGEXP_NUMBER
:
1932 case XML_REGEXP_NUMBER_DECIMAL
:
1933 case XML_REGEXP_NUMBER_LETTER
:
1934 case XML_REGEXP_NUMBER_OTHERS
:
1935 case XML_REGEXP_PUNCT
:
1936 case XML_REGEXP_PUNCT_CONNECTOR
:
1937 case XML_REGEXP_PUNCT_DASH
:
1938 case XML_REGEXP_PUNCT_OPEN
:
1939 case XML_REGEXP_PUNCT_CLOSE
:
1940 case XML_REGEXP_PUNCT_INITQUOTE
:
1941 case XML_REGEXP_PUNCT_FINQUOTE
:
1942 case XML_REGEXP_PUNCT_OTHERS
:
1943 case XML_REGEXP_SEPAR
:
1944 case XML_REGEXP_SEPAR_SPACE
:
1945 case XML_REGEXP_SEPAR_LINE
:
1946 case XML_REGEXP_SEPAR_PARA
:
1947 case XML_REGEXP_SYMBOL
:
1948 case XML_REGEXP_SYMBOL_MATH
:
1949 case XML_REGEXP_SYMBOL_CURRENCY
:
1950 case XML_REGEXP_SYMBOL_MODIFIER
:
1951 case XML_REGEXP_SYMBOL_OTHERS
:
1952 case XML_REGEXP_OTHER
:
1953 case XML_REGEXP_OTHER_CONTROL
:
1954 case XML_REGEXP_OTHER_FORMAT
:
1955 case XML_REGEXP_OTHER_PRIVATE
:
1956 case XML_REGEXP_OTHER_NA
:
1957 case XML_REGEXP_BLOCK_NAME
:
1958 ret
= xmlRegCheckCharacterRange(atom
->type
, codepoint
, 0, 0, 0,
1959 (const xmlChar
*)atom
->valuep
);
1967 /************************************************************************
1969 * Saving an restoring state of an execution context *
1971 ************************************************************************/
1973 #ifdef DEBUG_REGEXP_EXEC
1975 xmlFARegDebugExec(xmlRegExecCtxtPtr exec
) {
1976 printf("state: %d:%d:idx %d", exec
->state
->no
, exec
->transno
, exec
->index
);
1977 if (exec
->inputStack
!= NULL
) {
1980 for (i
= 0;(i
< 3) && (i
< exec
->inputStackNr
);i
++)
1981 printf("%s ", exec
->inputStack
[exec
->inputStackNr
- (i
+ 1)]);
1983 printf(": %s", &(exec
->inputString
[exec
->index
]));
1990 xmlFARegExecSave(xmlRegExecCtxtPtr exec
) {
1991 #ifdef DEBUG_REGEXP_EXEC
1994 xmlFARegDebugExec(exec
);
1998 if (exec
->maxRollbacks
== 0) {
1999 exec
->maxRollbacks
= 4;
2000 exec
->rollbacks
= (xmlRegExecRollback
*) xmlMalloc(exec
->maxRollbacks
*
2001 sizeof(xmlRegExecRollback
));
2002 if (exec
->rollbacks
== NULL
) {
2003 fprintf(stderr
, "exec save: allocation failed");
2004 exec
->maxRollbacks
= 0;
2007 memset(exec
->rollbacks
, 0,
2008 exec
->maxRollbacks
* sizeof(xmlRegExecRollback
));
2009 } else if (exec
->nbRollbacks
>= exec
->maxRollbacks
) {
2010 xmlRegExecRollback
*tmp
;
2011 int len
= exec
->maxRollbacks
;
2013 exec
->maxRollbacks
*= 2;
2014 tmp
= (xmlRegExecRollback
*) xmlRealloc(exec
->rollbacks
,
2015 exec
->maxRollbacks
* sizeof(xmlRegExecRollback
));
2017 fprintf(stderr
, "exec save: allocation failed");
2018 exec
->maxRollbacks
/= 2;
2021 exec
->rollbacks
= tmp
;
2022 tmp
= &exec
->rollbacks
[len
];
2023 memset(tmp
, 0, (exec
->maxRollbacks
- len
) * sizeof(xmlRegExecRollback
));
2025 exec
->rollbacks
[exec
->nbRollbacks
].state
= exec
->state
;
2026 exec
->rollbacks
[exec
->nbRollbacks
].index
= exec
->index
;
2027 exec
->rollbacks
[exec
->nbRollbacks
].nextbranch
= exec
->transno
+ 1;
2028 if (exec
->comp
->nbCounters
> 0) {
2029 if (exec
->rollbacks
[exec
->nbRollbacks
].counts
== NULL
) {
2030 exec
->rollbacks
[exec
->nbRollbacks
].counts
= (int *)
2031 xmlMalloc(exec
->comp
->nbCounters
* sizeof(int));
2032 if (exec
->rollbacks
[exec
->nbRollbacks
].counts
== NULL
) {
2033 fprintf(stderr
, "exec save: allocation failed");
2038 memcpy(exec
->rollbacks
[exec
->nbRollbacks
].counts
, exec
->counts
,
2039 exec
->comp
->nbCounters
* sizeof(int));
2041 exec
->nbRollbacks
++;
2045 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec
) {
2046 if (exec
->nbRollbacks
<= 0) {
2048 #ifdef DEBUG_REGEXP_EXEC
2049 printf("rollback failed on empty stack\n");
2053 exec
->nbRollbacks
--;
2054 exec
->state
= exec
->rollbacks
[exec
->nbRollbacks
].state
;
2055 exec
->index
= exec
->rollbacks
[exec
->nbRollbacks
].index
;
2056 exec
->transno
= exec
->rollbacks
[exec
->nbRollbacks
].nextbranch
;
2057 if (exec
->comp
->nbCounters
> 0) {
2058 if (exec
->rollbacks
[exec
->nbRollbacks
].counts
== NULL
) {
2059 fprintf(stderr
, "exec save: allocation failed");
2063 memcpy(exec
->counts
, exec
->rollbacks
[exec
->nbRollbacks
].counts
,
2064 exec
->comp
->nbCounters
* sizeof(int));
2067 #ifdef DEBUG_REGEXP_EXEC
2068 printf("restored ");
2069 xmlFARegDebugExec(exec
);
2073 /************************************************************************
2075 * Verifyer, running an input against a compiled regexp *
2077 ************************************************************************/
2080 xmlFARegExec(xmlRegexpPtr comp
, const xmlChar
*content
) {
2081 xmlRegExecCtxt execval
;
2082 xmlRegExecCtxtPtr exec
= &execval
;
2083 int ret
, codepoint
, len
;
2085 exec
->inputString
= content
;
2087 exec
->determinist
= 1;
2088 exec
->maxRollbacks
= 0;
2089 exec
->nbRollbacks
= 0;
2090 exec
->rollbacks
= NULL
;
2093 exec
->state
= comp
->states
[0];
2095 exec
->transcount
= 0;
2096 if (comp
->nbCounters
> 0) {
2097 exec
->counts
= (int *) xmlMalloc(comp
->nbCounters
* sizeof(int));
2098 if (exec
->counts
== NULL
)
2100 memset(exec
->counts
, 0, comp
->nbCounters
* sizeof(int));
2102 exec
->counts
= NULL
;
2103 while ((exec
->status
== 0) &&
2104 ((exec
->inputString
[exec
->index
] != 0) ||
2105 (exec
->state
->type
!= XML_REGEXP_FINAL_STATE
))) {
2106 xmlRegTransPtr trans
;
2110 * End of input on non-terminal state, rollback, however we may
2111 * still have epsilon like transition for counted transitions
2112 * on counters, in that case don't break too early.
2114 if ((exec
->inputString
[exec
->index
] == 0) && (exec
->counts
== NULL
))
2117 exec
->transcount
= 0;
2118 for (;exec
->transno
< exec
->state
->nbTrans
;exec
->transno
++) {
2119 trans
= &exec
->state
->trans
[exec
->transno
];
2124 if (trans
->count
>= 0) {
2126 xmlRegCounterPtr counter
;
2129 * A counted transition.
2132 count
= exec
->counts
[trans
->count
];
2133 counter
= &exec
->comp
->counters
[trans
->count
];
2134 #ifdef DEBUG_REGEXP_EXEC
2135 printf("testing count %d: val %d, min %d, max %d\n",
2136 trans
->count
, count
, counter
->min
, counter
->max
);
2138 ret
= ((count
>= counter
->min
) && (count
<= counter
->max
));
2139 } else if (atom
== NULL
) {
2140 fprintf(stderr
, "epsilon transition left at runtime\n");
2143 } else if (exec
->inputString
[exec
->index
] != 0) {
2144 codepoint
= CUR_SCHAR(&(exec
->inputString
[exec
->index
]), len
);
2145 ret
= xmlRegCheckCharacter(atom
, codepoint
);
2146 if ((ret
== 1) && (atom
->min
> 0) && (atom
->max
> 0)) {
2147 xmlRegStatePtr to
= comp
->states
[trans
->to
];
2150 * this is a multiple input sequence
2152 if (exec
->state
->nbTrans
> exec
->transno
+ 1) {
2153 xmlFARegExecSave(exec
);
2155 exec
->transcount
= 1;
2158 * Try to progress as much as possible on the input
2160 if (exec
->transcount
== atom
->max
) {
2165 * End of input: stop here
2167 if (exec
->inputString
[exec
->index
] == 0) {
2171 if (exec
->transcount
>= atom
->min
) {
2172 int transno
= exec
->transno
;
2173 xmlRegStatePtr state
= exec
->state
;
2176 * The transition is acceptable save it
2178 exec
->transno
= -1; /* trick */
2180 xmlFARegExecSave(exec
);
2181 exec
->transno
= transno
;
2182 exec
->state
= state
;
2184 codepoint
= CUR_SCHAR(&(exec
->inputString
[exec
->index
]),
2186 ret
= xmlRegCheckCharacter(atom
, codepoint
);
2189 if (exec
->transcount
< atom
->min
)
2193 * If the last check failed but one transition was found
2194 * possible, rollback
2204 if (exec
->state
->nbTrans
> exec
->transno
+ 1) {
2205 xmlFARegExecSave(exec
);
2207 if (trans
->counter
>= 0) {
2208 #ifdef DEBUG_REGEXP_EXEC
2209 printf("Increasing count %d\n", trans
->counter
);
2211 exec
->counts
[trans
->counter
]++;
2213 #ifdef DEBUG_REGEXP_EXEC
2214 printf("entering state %d\n", trans
->to
);
2216 exec
->state
= comp
->states
[trans
->to
];
2218 if (trans
->atom
!= NULL
) {
2222 } else if (ret
< 0) {
2227 if ((exec
->transno
!= 0) || (exec
->state
->nbTrans
== 0)) {
2230 * Failed to find a way out
2232 exec
->determinist
= 0;
2233 xmlFARegExecRollBack(exec
);
2238 if (exec
->rollbacks
!= NULL
) {
2239 if (exec
->counts
!= NULL
) {
2242 for (i
= 0;i
< exec
->maxRollbacks
;i
++)
2243 if (exec
->rollbacks
[i
].counts
!= NULL
)
2244 xmlFree(exec
->rollbacks
[i
].counts
);
2246 xmlFree(exec
->rollbacks
);
2248 if (exec
->counts
!= NULL
)
2249 xmlFree(exec
->counts
);
2250 if (exec
->status
== 0)
2252 if (exec
->status
== -1)
2254 return(exec
->status
);
2257 /************************************************************************
2259 * Progressive interface to the verifyer one atom at a time *
2261 ************************************************************************/
2264 * xmlRegExecCtxtPtr:
2265 * @comp: a precompiled regular expression
2266 * @callback: a callback function used for handling progresses in the
2267 * automata matching phase
2268 * @data: the context data associated to the callback in this context
2270 * Build a context used for progressive evaluation of a regexp.
2273 xmlRegNewExecCtxt(xmlRegexpPtr comp
, xmlRegExecCallbacks callback
, void *data
) {
2274 xmlRegExecCtxtPtr exec
;
2278 exec
= (xmlRegExecCtxtPtr
) xmlMalloc(sizeof(xmlRegExecCtxt
));
2282 memset(exec
, 0, sizeof(xmlRegExecCtxt
));
2283 exec
->inputString
= NULL
;
2285 exec
->determinist
= 1;
2286 exec
->maxRollbacks
= 0;
2287 exec
->nbRollbacks
= 0;
2288 exec
->rollbacks
= NULL
;
2291 if (comp
->compact
== NULL
)
2292 exec
->state
= comp
->states
[0];
2294 exec
->transcount
= 0;
2295 exec
->callback
= callback
;
2297 if (comp
->nbCounters
> 0) {
2298 exec
->counts
= (int *) xmlMalloc(comp
->nbCounters
* sizeof(int));
2299 if (exec
->counts
== NULL
) {
2303 memset(exec
->counts
, 0, comp
->nbCounters
* sizeof(int));
2305 exec
->counts
= NULL
;
2306 exec
->inputStackMax
= 0;
2307 exec
->inputStackNr
= 0;
2308 exec
->inputStack
= NULL
;
2313 * xmlRegFreeExecCtxt:
2314 * @exec: a regular expression evaulation context
2316 * Free the structures associated to a regular expression evaulation context.
2319 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec
) {
2323 if (exec
->rollbacks
!= NULL
) {
2324 if (exec
->counts
!= NULL
) {
2327 for (i
= 0;i
< exec
->maxRollbacks
;i
++)
2328 if (exec
->rollbacks
[i
].counts
!= NULL
)
2329 xmlFree(exec
->rollbacks
[i
].counts
);
2331 xmlFree(exec
->rollbacks
);
2333 if (exec
->counts
!= NULL
)
2334 xmlFree(exec
->counts
);
2335 if (exec
->inputStack
!= NULL
) {
2338 for (i
= 0;i
< exec
->inputStackNr
;i
++) {
2339 if (exec
->inputStack
[i
].value
!= NULL
)
2340 xmlFree(exec
->inputStack
[i
].value
);
2342 xmlFree(exec
->inputStack
);
2348 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec
, const xmlChar
*value
,
2351 printf("saving value: %d:%s\n", exec
->inputStackNr
, value
);
2353 if (exec
->inputStackMax
== 0) {
2354 exec
->inputStackMax
= 4;
2355 exec
->inputStack
= (xmlRegInputTokenPtr
)
2356 xmlMalloc(exec
->inputStackMax
* sizeof(xmlRegInputToken
));
2357 if (exec
->inputStack
== NULL
) {
2358 fprintf(stderr
, "push input: allocation failed");
2359 exec
->inputStackMax
= 0;
2362 } else if (exec
->inputStackNr
+ 1 >= exec
->inputStackMax
) {
2363 xmlRegInputTokenPtr tmp
;
2365 exec
->inputStackMax
*= 2;
2366 tmp
= (xmlRegInputTokenPtr
) xmlRealloc(exec
->inputStack
,
2367 exec
->inputStackMax
* sizeof(xmlRegInputToken
));
2369 fprintf(stderr
, "push input: allocation failed");
2370 exec
->inputStackMax
/= 2;
2373 exec
->inputStack
= tmp
;
2375 exec
->inputStack
[exec
->inputStackNr
].value
= xmlStrdup(value
);
2376 exec
->inputStack
[exec
->inputStackNr
].data
= data
;
2377 exec
->inputStackNr
++;
2378 exec
->inputStack
[exec
->inputStackNr
].value
= NULL
;
2379 exec
->inputStack
[exec
->inputStackNr
].data
= NULL
;
2384 * xmlRegCompactPushString:
2385 * @exec: a regexp execution context
2386 * @comp: the precompiled exec with a compact table
2387 * @value: a string token input
2388 * @data: data associated to the token to reuse in callbacks
2390 * Push one input token in the execution context
2392 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2393 * a negative value in case of error.
2396 xmlRegCompactPushString(xmlRegExecCtxtPtr exec
,
2398 const xmlChar
*value
,
2400 int state
= exec
->index
;
2403 if ((comp
== NULL
) || (comp
->compact
== NULL
) || (comp
->stringMap
== NULL
))
2406 if (value
== NULL
) {
2408 * are we at a final state ?
2410 if (comp
->compact
[state
* (comp
->nbstrings
+ 1)] ==
2411 XML_REGEXP_FINAL_STATE
)
2417 printf("value pushed: %s\n", value
);
2421 * Examine all outside transition from current state
2423 for (i
= 0;i
< comp
->nbstrings
;i
++) {
2424 target
= comp
->compact
[state
* (comp
->nbstrings
+ 1) + i
+ 1];
2425 if ((target
> 0) && (target
<= comp
->nbstates
)) {
2426 target
--; /* to avoid 0 */
2427 if (xmlStrEqual(comp
->stringMap
[i
], value
)) {
2428 exec
->index
= target
;
2429 if ((exec
->callback
!= NULL
) && (comp
->transdata
!= NULL
)) {
2430 exec
->callback(exec
->data
, value
,
2431 comp
->transdata
[state
* comp
->nbstrings
+ i
], data
);
2434 printf("entering state %d\n", target
);
2436 if (comp
->compact
[target
* (comp
->nbstrings
+ 1)] ==
2437 XML_REGEXP_FINAL_STATE
)
2444 * Failed to find an exit transition out from current state for the
2448 printf("failed to find a transition for %s on state %d\n", value
, state
);
2455 * xmlRegExecPushString:
2456 * @exec: a regexp execution context
2457 * @value: a string token input
2458 * @data: data associated to the token to reuse in callbacks
2460 * Push one input token in the execution context
2462 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2463 * a negative value in case of error.
2466 xmlRegExecPushString(xmlRegExecCtxtPtr exec
, const xmlChar
*value
,
2468 xmlRegTransPtr trans
;
2475 if (exec
->comp
== NULL
)
2477 if (exec
->status
!= 0)
2478 return(exec
->status
);
2480 if (exec
->comp
->compact
!= NULL
)
2481 return(xmlRegCompactPushString(exec
, exec
->comp
, value
, data
));
2483 if (value
== NULL
) {
2484 if (exec
->state
->type
== XML_REGEXP_FINAL_STATE
)
2490 printf("value pushed: %s\n", value
);
2493 * If we have an active rollback stack push the new value there
2494 * and get back to where we were left
2496 if ((value
!= NULL
) && (exec
->inputStackNr
> 0)) {
2497 xmlFARegExecSaveInputString(exec
, value
, data
);
2498 value
= exec
->inputStack
[exec
->index
].value
;
2499 data
= exec
->inputStack
[exec
->index
].data
;
2501 printf("value loaded: %s\n", value
);
2505 while ((exec
->status
== 0) &&
2508 (exec
->state
->type
!= XML_REGEXP_FINAL_STATE
)))) {
2511 * End of input on non-terminal state, rollback, however we may
2512 * still have epsilon like transition for counted transitions
2513 * on counters, in that case don't break too early.
2515 if ((value
== NULL
) && (exec
->counts
== NULL
))
2518 exec
->transcount
= 0;
2519 for (;exec
->transno
< exec
->state
->nbTrans
;exec
->transno
++) {
2520 trans
= &exec
->state
->trans
[exec
->transno
];
2525 if (trans
->count
== REGEXP_ALL_LAX_COUNTER
) {
2529 xmlRegCounterPtr counter
;
2534 printf("testing all lax %d\n", trans
->count
);
2537 * Check all counted transitions from the current state
2539 if ((value
== NULL
) && (final
)) {
2541 } else if (value
!= NULL
) {
2542 for (i
= 0;i
< exec
->state
->nbTrans
;i
++) {
2543 t
= &exec
->state
->trans
[i
];
2544 if ((t
->counter
< 0) || (t
== trans
))
2546 counter
= &exec
->comp
->counters
[t
->counter
];
2547 count
= exec
->counts
[t
->counter
];
2548 if ((count
< counter
->max
) &&
2549 (t
->atom
!= NULL
) &&
2550 (xmlStrEqual(value
, t
->atom
->valuep
))) {
2554 if ((count
>= counter
->min
) &&
2555 (count
< counter
->max
) &&
2556 (xmlStrEqual(value
, t
->atom
->valuep
))) {
2562 } else if (trans
->count
== REGEXP_ALL_COUNTER
) {
2566 xmlRegCounterPtr counter
;
2571 printf("testing all %d\n", trans
->count
);
2574 * Check all counted transitions from the current state
2576 for (i
= 0;i
< exec
->state
->nbTrans
;i
++) {
2577 t
= &exec
->state
->trans
[i
];
2578 if ((t
->counter
< 0) || (t
== trans
))
2580 counter
= &exec
->comp
->counters
[t
->counter
];
2581 count
= exec
->counts
[t
->counter
];
2582 if ((count
< counter
->min
) || (count
> counter
->max
)) {
2587 } else if (trans
->count
>= 0) {
2589 xmlRegCounterPtr counter
;
2592 * A counted transition.
2595 count
= exec
->counts
[trans
->count
];
2596 counter
= &exec
->comp
->counters
[trans
->count
];
2598 printf("testing count %d: val %d, min %d, max %d\n",
2599 trans
->count
, count
, counter
->min
, counter
->max
);
2601 ret
= ((count
>= counter
->min
) && (count
<= counter
->max
));
2602 } else if (atom
== NULL
) {
2603 fprintf(stderr
, "epsilon transition left at runtime\n");
2606 } else if (value
!= NULL
) {
2607 ret
= xmlStrEqual(value
, atom
->valuep
);
2608 if ((ret
== 1) && (trans
->counter
>= 0)) {
2609 xmlRegCounterPtr counter
;
2612 count
= exec
->counts
[trans
->counter
];
2613 counter
= &exec
->comp
->counters
[trans
->counter
];
2614 if (count
>= counter
->max
)
2618 if ((ret
== 1) && (atom
->min
> 0) && (atom
->max
> 0)) {
2619 xmlRegStatePtr to
= exec
->comp
->states
[trans
->to
];
2622 * this is a multiple input sequence
2624 if (exec
->state
->nbTrans
> exec
->transno
+ 1) {
2625 if (exec
->inputStackNr
<= 0) {
2626 xmlFARegExecSaveInputString(exec
, value
, data
);
2628 xmlFARegExecSave(exec
);
2630 exec
->transcount
= 1;
2633 * Try to progress as much as possible on the input
2635 if (exec
->transcount
== atom
->max
) {
2639 value
= exec
->inputStack
[exec
->index
].value
;
2640 data
= exec
->inputStack
[exec
->index
].data
;
2642 printf("value loaded: %s\n", value
);
2646 * End of input: stop here
2648 if (value
== NULL
) {
2652 if (exec
->transcount
>= atom
->min
) {
2653 int transno
= exec
->transno
;
2654 xmlRegStatePtr state
= exec
->state
;
2657 * The transition is acceptable save it
2659 exec
->transno
= -1; /* trick */
2661 if (exec
->inputStackNr
<= 0) {
2662 xmlFARegExecSaveInputString(exec
, value
, data
);
2664 xmlFARegExecSave(exec
);
2665 exec
->transno
= transno
;
2666 exec
->state
= state
;
2668 ret
= xmlStrEqual(value
, atom
->valuep
);
2671 if (exec
->transcount
< atom
->min
)
2675 * If the last check failed but one transition was found
2676 * possible, rollback
2686 if ((exec
->callback
!= NULL
) && (atom
!= NULL
)) {
2687 exec
->callback(exec
->data
, atom
->valuep
,
2690 if (exec
->state
->nbTrans
> exec
->transno
+ 1) {
2691 if (exec
->inputStackNr
<= 0) {
2692 xmlFARegExecSaveInputString(exec
, value
, data
);
2694 xmlFARegExecSave(exec
);
2696 if (trans
->counter
>= 0) {
2698 printf("Increasing count %d\n", trans
->counter
);
2700 exec
->counts
[trans
->counter
]++;
2703 printf("entering state %d\n", trans
->to
);
2705 exec
->state
= exec
->comp
->states
[trans
->to
];
2707 if (trans
->atom
!= NULL
) {
2708 if (exec
->inputStack
!= NULL
) {
2710 if (exec
->index
< exec
->inputStackNr
) {
2711 value
= exec
->inputStack
[exec
->index
].value
;
2712 data
= exec
->inputStack
[exec
->index
].data
;
2714 printf("value loaded: %s\n", value
);
2720 printf("end of input\n");
2727 printf("end of input\n");
2732 } else if (ret
< 0) {
2737 if ((exec
->transno
!= 0) || (exec
->state
->nbTrans
== 0)) {
2740 * Failed to find a way out
2742 exec
->determinist
= 0;
2743 xmlFARegExecRollBack(exec
);
2744 if (exec
->status
== 0) {
2745 value
= exec
->inputStack
[exec
->index
].value
;
2746 data
= exec
->inputStack
[exec
->index
].data
;
2748 printf("value loaded: %s\n", value
);
2755 if (exec
->status
== 0) {
2756 return(exec
->state
->type
== XML_REGEXP_FINAL_STATE
);
2758 return(exec
->status
);
2763 xmlRegExecPushChar(xmlRegExecCtxtPtr exec
, int UCS
) {
2764 xmlRegTransPtr trans
;
2771 if (exec
->status
!= 0)
2772 return(exec
->status
);
2774 while ((exec
->status
== 0) &&
2775 ((exec
->inputString
[exec
->index
] != 0) ||
2776 (exec
->state
->type
!= XML_REGEXP_FINAL_STATE
))) {
2779 * End of input on non-terminal state, rollback, however we may
2780 * still have epsilon like transition for counted transitions
2781 * on counters, in that case don't break too early.
2783 if ((exec
->inputString
[exec
->index
] == 0) && (exec
->counts
== NULL
))
2786 exec
->transcount
= 0;
2787 for (;exec
->transno
< exec
->state
->nbTrans
;exec
->transno
++) {
2788 trans
= &exec
->state
->trans
[exec
->transno
];
2793 if (trans
->count
>= 0) {
2795 xmlRegCounterPtr counter
;
2798 * A counted transition.
2801 count
= exec
->counts
[trans
->count
];
2802 counter
= &exec
->comp
->counters
[trans
->count
];
2803 #ifdef DEBUG_REGEXP_EXEC
2804 printf("testing count %d: val %d, min %d, max %d\n",
2805 trans
->count
, count
, counter
->min
, counter
->max
);
2807 ret
= ((count
>= counter
->min
) && (count
<= counter
->max
));
2808 } else if (atom
== NULL
) {
2809 fprintf(stderr
, "epsilon transition left at runtime\n");
2812 } else if (exec
->inputString
[exec
->index
] != 0) {
2813 codepoint
= CUR_SCHAR(&(exec
->inputString
[exec
->index
]), len
);
2814 ret
= xmlRegCheckCharacter(atom
, codepoint
);
2815 if ((ret
== 1) && (atom
->min
> 0) && (atom
->max
> 0)) {
2816 xmlRegStatePtr to
= exec
->comp
->states
[trans
->to
];
2819 * this is a multiple input sequence
2821 if (exec
->state
->nbTrans
> exec
->transno
+ 1) {
2822 xmlFARegExecSave(exec
);
2824 exec
->transcount
= 1;
2827 * Try to progress as much as possible on the input
2829 if (exec
->transcount
== atom
->max
) {
2834 * End of input: stop here
2836 if (exec
->inputString
[exec
->index
] == 0) {
2840 if (exec
->transcount
>= atom
->min
) {
2841 int transno
= exec
->transno
;
2842 xmlRegStatePtr state
= exec
->state
;
2845 * The transition is acceptable save it
2847 exec
->transno
= -1; /* trick */
2849 xmlFARegExecSave(exec
);
2850 exec
->transno
= transno
;
2851 exec
->state
= state
;
2853 codepoint
= CUR_SCHAR(&(exec
->inputString
[exec
->index
]),
2855 ret
= xmlRegCheckCharacter(atom
, codepoint
);
2858 if (exec
->transcount
< atom
->min
)
2862 * If the last check failed but one transition was found
2863 * possible, rollback
2873 if (exec
->state
->nbTrans
> exec
->transno
+ 1) {
2874 xmlFARegExecSave(exec
);
2876 if (trans
->counter
>= 0) {
2877 #ifdef DEBUG_REGEXP_EXEC
2878 printf("Increasing count %d\n", trans
->counter
);
2880 exec
->counts
[trans
->counter
]++;
2882 #ifdef DEBUG_REGEXP_EXEC
2883 printf("entering state %d\n", trans
->to
);
2885 exec
->state
= exec
->comp
->states
[trans
->to
];
2887 if (trans
->atom
!= NULL
) {
2891 } else if (ret
< 0) {
2896 if ((exec
->transno
!= 0) || (exec
->state
->nbTrans
== 0)) {
2899 * Failed to find a way out
2901 exec
->determinist
= 0;
2902 xmlFARegExecRollBack(exec
);
2909 /************************************************************************
2911 * Parser for the Shemas Datatype Regular Expressions *
2912 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
2914 ************************************************************************/
2918 * @ctxt: a regexp parser context
2920 * [10] Char ::= [^.\?*+()|#x5B#x5D]
2923 xmlFAIsChar(xmlRegParserCtxtPtr ctxt
) {
2927 cur
= CUR_SCHAR(ctxt
->cur
, len
);
2928 if ((cur
== '.') || (cur
== '\\') || (cur
== '?') ||
2929 (cur
== '*') || (cur
== '+') || (cur
== '(') ||
2930 (cur
== ')') || (cur
== '|') || (cur
== 0x5B) ||
2931 (cur
== 0x5D) || (cur
== 0))
2937 * xmlFAParseCharProp:
2938 * @ctxt: a regexp parser context
2940 * [27] charProp ::= IsCategory | IsBlock
2941 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
2942 * Separators | Symbols | Others
2943 * [29] Letters ::= 'L' [ultmo]?
2944 * [30] Marks ::= 'M' [nce]?
2945 * [31] Numbers ::= 'N' [dlo]?
2946 * [32] Punctuation ::= 'P' [cdseifo]?
2947 * [33] Separators ::= 'Z' [slp]?
2948 * [34] Symbols ::= 'S' [mcko]?
2949 * [35] Others ::= 'C' [cfon]?
2950 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
2953 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt
) {
2955 xmlRegAtomType type
= 0;
2956 xmlChar
*blockName
= NULL
;
2964 type
= XML_REGEXP_LETTER_UPPERCASE
;
2965 } else if (cur
== 'l') {
2967 type
= XML_REGEXP_LETTER_LOWERCASE
;
2968 } else if (cur
== 't') {
2970 type
= XML_REGEXP_LETTER_TITLECASE
;
2971 } else if (cur
== 'm') {
2973 type
= XML_REGEXP_LETTER_MODIFIER
;
2974 } else if (cur
== 'o') {
2976 type
= XML_REGEXP_LETTER_OTHERS
;
2978 type
= XML_REGEXP_LETTER
;
2980 } else if (cur
== 'M') {
2986 type
= XML_REGEXP_MARK_NONSPACING
;
2987 } else if (cur
== 'c') {
2989 /* spacing combining */
2990 type
= XML_REGEXP_MARK_SPACECOMBINING
;
2991 } else if (cur
== 'e') {
2994 type
= XML_REGEXP_MARK_ENCLOSING
;
2997 type
= XML_REGEXP_MARK
;
2999 } else if (cur
== 'N') {
3005 type
= XML_REGEXP_NUMBER_DECIMAL
;
3006 } else if (cur
== 'l') {
3009 type
= XML_REGEXP_NUMBER_LETTER
;
3010 } else if (cur
== 'o') {
3013 type
= XML_REGEXP_NUMBER_OTHERS
;
3016 type
= XML_REGEXP_NUMBER
;
3018 } else if (cur
== 'P') {
3024 type
= XML_REGEXP_PUNCT_CONNECTOR
;
3025 } else if (cur
== 'd') {
3028 type
= XML_REGEXP_PUNCT_DASH
;
3029 } else if (cur
== 's') {
3032 type
= XML_REGEXP_PUNCT_OPEN
;
3033 } else if (cur
== 'e') {
3036 type
= XML_REGEXP_PUNCT_CLOSE
;
3037 } else if (cur
== 'i') {
3040 type
= XML_REGEXP_PUNCT_INITQUOTE
;
3041 } else if (cur
== 'f') {
3044 type
= XML_REGEXP_PUNCT_FINQUOTE
;
3045 } else if (cur
== 'o') {
3048 type
= XML_REGEXP_PUNCT_OTHERS
;
3050 /* all punctuation */
3051 type
= XML_REGEXP_PUNCT
;
3053 } else if (cur
== 'Z') {
3059 type
= XML_REGEXP_SEPAR_SPACE
;
3060 } else if (cur
== 'l') {
3063 type
= XML_REGEXP_SEPAR_LINE
;
3064 } else if (cur
== 'p') {
3067 type
= XML_REGEXP_SEPAR_PARA
;
3069 /* all separators */
3070 type
= XML_REGEXP_SEPAR
;
3072 } else if (cur
== 'S') {
3077 type
= XML_REGEXP_SYMBOL_MATH
;
3079 } else if (cur
== 'c') {
3081 type
= XML_REGEXP_SYMBOL_CURRENCY
;
3083 } else if (cur
== 'k') {
3085 type
= XML_REGEXP_SYMBOL_MODIFIER
;
3087 } else if (cur
== 'o') {
3089 type
= XML_REGEXP_SYMBOL_OTHERS
;
3093 type
= XML_REGEXP_SYMBOL
;
3095 } else if (cur
== 'C') {
3101 type
= XML_REGEXP_OTHER_CONTROL
;
3102 } else if (cur
== 'f') {
3105 type
= XML_REGEXP_OTHER_FORMAT
;
3106 } else if (cur
== 'o') {
3109 type
= XML_REGEXP_OTHER_PRIVATE
;
3110 } else if (cur
== 'n') {
3113 type
= XML_REGEXP_OTHER_NA
;
3116 type
= XML_REGEXP_OTHER
;
3118 } else if (cur
== 'I') {
3119 const xmlChar
*start
;
3123 ERROR("IsXXXX expected");
3129 if (((cur
>= 'a') && (cur
<= 'z')) ||
3130 ((cur
>= 'A') && (cur
<= 'Z')) ||
3131 ((cur
>= '0') && (cur
<= '9')) ||
3135 while (((cur
>= 'a') && (cur
<= 'z')) ||
3136 ((cur
>= 'A') && (cur
<= 'Z')) ||
3137 ((cur
>= '0') && (cur
<= '9')) ||
3143 type
= XML_REGEXP_BLOCK_NAME
;
3144 blockName
= xmlStrndup(start
, ctxt
->cur
- start
);
3146 ERROR("Unknown char property");
3149 if (ctxt
->atom
== NULL
) {
3150 ctxt
->atom
= xmlRegNewAtom(ctxt
, type
);
3151 if (ctxt
->atom
!= NULL
)
3152 ctxt
->atom
->valuep
= blockName
;
3153 } else if (ctxt
->atom
->type
== XML_REGEXP_RANGES
) {
3154 xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
3155 type
, 0, 0, blockName
);
3160 * xmlFAParseCharClassEsc:
3161 * @ctxt: a regexp parser context
3163 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
3164 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
3165 * [25] catEsc ::= '\p{' charProp '}'
3166 * [26] complEsc ::= '\P{' charProp '}'
3167 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
3170 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt
) {
3174 if (ctxt
->atom
== NULL
) {
3175 ctxt
->atom
= xmlRegNewAtom(ctxt
, XML_REGEXP_ANYCHAR
);
3176 } else if (ctxt
->atom
->type
== XML_REGEXP_RANGES
) {
3177 xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
3178 XML_REGEXP_ANYCHAR
, 0, 0, NULL
);
3184 ERROR("Escaped sequence: expecting \\");
3192 ERROR("Expecting '{'");
3196 xmlFAParseCharProp(ctxt
);
3198 ERROR("Expecting '}'");
3202 } else if (cur
== 'P') {
3205 ERROR("Expecting '{'");
3209 xmlFAParseCharProp(ctxt
);
3210 ctxt
->atom
->neg
= 1;
3212 ERROR("Expecting '}'");
3216 } else if ((cur
== 'n') || (cur
== 'r') || (cur
== 't') || (cur
== '\\') ||
3217 (cur
== '|') || (cur
== '.') || (cur
== '?') || (cur
== '*') ||
3218 (cur
== '+') || (cur
== '(') || (cur
== ')') || (cur
== '{') ||
3219 (cur
== '}') || (cur
== 0x2D) || (cur
== 0x5B) || (cur
== 0x5D) ||
3221 if (ctxt
->atom
== NULL
) {
3222 ctxt
->atom
= xmlRegNewAtom(ctxt
, XML_REGEXP_CHARVAL
);
3223 if (ctxt
->atom
!= NULL
)
3224 ctxt
->atom
->codepoint
= cur
;
3225 } else if (ctxt
->atom
->type
== XML_REGEXP_RANGES
) {
3226 xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
3227 XML_REGEXP_CHARVAL
, cur
, cur
, NULL
);
3230 } else if ((cur
== 's') || (cur
== 'S') || (cur
== 'i') || (cur
== 'I') ||
3231 (cur
== 'c') || (cur
== 'C') || (cur
== 'd') || (cur
== 'D') ||
3232 (cur
== 'w') || (cur
== 'W')) {
3233 xmlRegAtomType type
= XML_REGEXP_ANYSPACE
;
3237 type
= XML_REGEXP_ANYSPACE
;
3240 type
= XML_REGEXP_NOTSPACE
;
3243 type
= XML_REGEXP_INITNAME
;
3246 type
= XML_REGEXP_NOTINITNAME
;
3249 type
= XML_REGEXP_NAMECHAR
;
3252 type
= XML_REGEXP_NOTNAMECHAR
;
3255 type
= XML_REGEXP_DECIMAL
;
3258 type
= XML_REGEXP_NOTDECIMAL
;
3261 type
= XML_REGEXP_REALCHAR
;
3264 type
= XML_REGEXP_NOTREALCHAR
;
3268 if (ctxt
->atom
== NULL
) {
3269 ctxt
->atom
= xmlRegNewAtom(ctxt
, type
);
3270 } else if (ctxt
->atom
->type
== XML_REGEXP_RANGES
) {
3271 xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
3278 * xmlFAParseCharRef:
3279 * @ctxt: a regexp parser context
3281 * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' )
3284 xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt
) {
3287 if ((CUR
!= '&') || (NXT(1) != '#'))
3295 if (((cur
>= '0') && (cur
<= '9')) ||
3296 ((cur
>= 'a') && (cur
<= 'f')) ||
3297 ((cur
>= 'A') && (cur
<= 'F'))) {
3298 while (((cur
>= '0') && (cur
<= '9')) ||
3299 ((cur
>= 'A') && (cur
<= 'F'))) {
3300 if ((cur
>= '0') && (cur
<= '9'))
3301 ret
= ret
* 16 + cur
- '0';
3302 else if ((cur
>= 'a') && (cur
<= 'f'))
3303 ret
= ret
* 16 + 10 + (cur
- 'a');
3305 ret
= ret
* 16 + 10 + (cur
- 'A');
3310 ERROR("Char ref: expecting [0-9A-F]");
3314 if ((cur
>= '0') && (cur
<= '9')) {
3315 while ((cur
>= '0') && (cur
<= '9')) {
3316 ret
= ret
* 10 + cur
- '0';
3321 ERROR("Char ref: expecting [0-9]");
3326 ERROR("Char ref: expecting ';'");
3335 * xmlFAParseCharRange:
3336 * @ctxt: a regexp parser context
3338 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
3339 * [18] seRange ::= charOrEsc '-' charOrEsc
3340 * [20] charOrEsc ::= XmlChar | SingleCharEsc
3341 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
3342 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
3345 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt
) {
3350 if ((CUR
== '&') && (NXT(1) == '#')) {
3351 end
= start
= xmlFAParseCharRef(ctxt
);
3352 xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
3353 XML_REGEXP_CHARVAL
, start
, end
, NULL
);
3361 case 'n': start
= 0xA; break;
3362 case 'r': start
= 0xD; break;
3363 case 't': start
= 0x9; break;
3364 case '\\': case '|': case '.': case '-': case '^': case '?':
3365 case '*': case '+': case '{': case '}': case '(': case ')':
3369 ERROR("Invalid escape value");
3373 } else if ((cur
!= 0x5B) && (cur
!= 0x5D)) {
3376 ERROR("Expecting a char range");
3385 xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
3386 XML_REGEXP_CHARVAL
, start
, end
, NULL
);
3395 case 'n': end
= 0xA; break;
3396 case 'r': end
= 0xD; break;
3397 case 't': end
= 0x9; break;
3398 case '\\': case '|': case '.': case '-': case '^': case '?':
3399 case '*': case '+': case '{': case '}': case '(': case ')':
3403 ERROR("Invalid escape value");
3406 } else if ((cur
!= 0x5B) && (cur
!= 0x5D)) {
3409 ERROR("Expecting the end of a char range");
3413 /* TODO check that the values are acceptable character ranges for XML */
3415 ERROR("End of range is before start of range");
3417 xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
3418 XML_REGEXP_CHARVAL
, start
, end
, NULL
);
3424 * xmlFAParsePosCharGroup:
3425 * @ctxt: a regexp parser context
3427 * [14] posCharGroup ::= ( charRange | charClassEsc )+
3430 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt
) {
3432 if ((CUR
== '\\') || (CUR
== '.')) {
3433 xmlFAParseCharClassEsc(ctxt
);
3435 xmlFAParseCharRange(ctxt
);
3437 } while ((CUR
!= ']') && (CUR
!= '^') && (CUR
!= '-') &&
3438 (ctxt
->error
== 0));
3442 * xmlFAParseCharGroup:
3443 * @ctxt: a regexp parser context
3445 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
3446 * [15] negCharGroup ::= '^' posCharGroup
3447 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
3448 * [12] charClassExpr ::= '[' charGroup ']'
3451 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt
) {
3453 while ((CUR
!= ']') && (ctxt
->error
== 0)) {
3455 int neg
= ctxt
->neg
;
3458 ctxt
->neg
= !ctxt
->neg
;
3459 xmlFAParsePosCharGroup(ctxt
);
3461 } else if (CUR
== '-') {
3463 ctxt
->neg
= !ctxt
->neg
;
3465 ERROR("charClassExpr: '[' expected");
3469 xmlFAParseCharGroup(ctxt
);
3473 ERROR("charClassExpr: ']' expected");
3477 } else if (CUR
!= ']') {
3478 xmlFAParsePosCharGroup(ctxt
);
3485 * xmlFAParseCharClass:
3486 * @ctxt: a regexp parser context
3488 * [11] charClass ::= charClassEsc | charClassExpr
3489 * [12] charClassExpr ::= '[' charGroup ']'
3492 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt
) {
3495 ctxt
->atom
= xmlRegNewAtom(ctxt
, XML_REGEXP_RANGES
);
3496 if (ctxt
->atom
== NULL
)
3498 xmlFAParseCharGroup(ctxt
);
3502 ERROR("xmlFAParseCharClass: ']' expected");
3505 xmlFAParseCharClassEsc(ctxt
);
3510 * xmlFAParseQuantExact:
3511 * @ctxt: a regexp parser context
3513 * [8] QuantExact ::= [0-9]+
3516 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt
) {
3520 while ((CUR
>= '0') && (CUR
<= '9')) {
3521 ret
= ret
* 10 + (CUR
- '0');
3532 * xmlFAParseQuantifier:
3533 * @ctxt: a regexp parser context
3535 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
3536 * [5] quantity ::= quantRange | quantMin | QuantExact
3537 * [6] quantRange ::= QuantExact ',' QuantExact
3538 * [7] quantMin ::= QuantExact ','
3539 * [8] QuantExact ::= [0-9]+
3542 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt
) {
3546 if ((cur
== '?') || (cur
== '*') || (cur
== '+')) {
3547 if (ctxt
->atom
!= NULL
) {
3549 ctxt
->atom
->quant
= XML_REGEXP_QUANT_OPT
;
3550 else if (cur
== '*')
3551 ctxt
->atom
->quant
= XML_REGEXP_QUANT_MULT
;
3552 else if (cur
== '+')
3553 ctxt
->atom
->quant
= XML_REGEXP_QUANT_PLUS
;
3559 int min
= 0, max
= 0;
3562 cur
= xmlFAParseQuantExact(ctxt
);
3567 cur
= xmlFAParseQuantExact(ctxt
);
3574 ERROR("Unterminated quantifier");
3578 if (ctxt
->atom
!= NULL
) {
3579 ctxt
->atom
->quant
= XML_REGEXP_QUANT_RANGE
;
3580 ctxt
->atom
->min
= min
;
3581 ctxt
->atom
->max
= max
;
3590 * @ctxt: a regexp parser context
3592 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
3595 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt
) {
3598 codepoint
= xmlFAIsChar(ctxt
);
3599 if (codepoint
> 0) {
3600 ctxt
->atom
= xmlRegNewAtom(ctxt
, XML_REGEXP_CHARVAL
);
3601 if (ctxt
->atom
== NULL
)
3603 codepoint
= CUR_SCHAR(ctxt
->cur
, len
);
3604 ctxt
->atom
->codepoint
= codepoint
;
3607 } else if (CUR
== '|') {
3609 } else if (CUR
== 0) {
3611 } else if (CUR
== ')') {
3613 } else if (CUR
== '(') {
3614 xmlRegStatePtr start
, oldend
;
3617 xmlFAGenerateEpsilonTransition(ctxt
, ctxt
->state
, NULL
);
3618 start
= ctxt
->state
;
3622 xmlFAParseRegExp(ctxt
, 0);
3626 ERROR("xmlFAParseAtom: expecting ')'");
3628 ctxt
->atom
= xmlRegNewAtom(ctxt
, XML_REGEXP_SUBREG
);
3629 if (ctxt
->atom
== NULL
)
3631 ctxt
->atom
->start
= start
;
3632 ctxt
->atom
->stop
= ctxt
->state
;
3635 } else if ((CUR
== '[') || (CUR
== '\\') || (CUR
== '.')) {
3636 xmlFAParseCharClass(ctxt
);
3644 * @ctxt: a regexp parser context
3646 * [3] piece ::= atom quantifier?
3649 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt
) {
3653 ret
= xmlFAParseAtom(ctxt
);
3656 if (ctxt
->atom
== NULL
) {
3657 ERROR("internal: no atom generated");
3659 xmlFAParseQuantifier(ctxt
);
3665 * @ctxt: a regexp parser context
3666 * @first: is taht the first
3668 * [2] branch ::= piece*
3671 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt
, int first
) {
3672 xmlRegStatePtr previous
;
3673 xmlRegAtomPtr prevatom
= NULL
;
3676 previous
= ctxt
->state
;
3677 ret
= xmlFAParsePiece(ctxt
);
3680 xmlFAGenerateTransitions(ctxt
, previous
, NULL
, ctxt
->atom
);
3681 previous
= ctxt
->state
;
3683 prevatom
= ctxt
->atom
;
3687 while ((ret
!= 0) && (ctxt
->error
== 0)) {
3688 ret
= xmlFAParsePiece(ctxt
);
3691 xmlFAGenerateTransitions(ctxt
, previous
, NULL
, ctxt
->atom
);
3693 xmlFAGenerateTransitions(ctxt
, previous
, NULL
, prevatom
);
3694 prevatom
= ctxt
->atom
;
3696 previous
= ctxt
->state
;
3701 xmlFAGenerateTransitions(ctxt
, previous
, ctxt
->end
, prevatom
);
3707 * @ctxt: a regexp parser context
3708 * @top: is that the top-level expressions ?
3710 * [1] regExp ::= branch ( '|' branch )*
3713 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt
, int top
) {
3714 xmlRegStatePtr start
, end
, oldend
;
3718 start
= ctxt
->state
;
3719 xmlFAParseBranch(ctxt
, (ctxt
->end
== NULL
));
3721 ctxt
->end
= ctxt
->state
;
3725 while ((CUR
== '|') && (ctxt
->error
== 0)) {
3727 ctxt
->state
= start
;
3729 xmlFAParseBranch(ctxt
, 0);
3735 /************************************************************************
3739 ************************************************************************/
3743 * @output: the file for the output debug
3744 * @regexp: the compiled regexp
3746 * Print the content of the compiled regular expression
3749 xmlRegexpPrint(FILE *output
, xmlRegexpPtr regexp
) {
3752 fprintf(output
, " regexp: ");
3753 if (regexp
== NULL
) {
3754 fprintf(output
, "NULL\n");
3757 fprintf(output
, "'%s' ", regexp
->string
);
3758 fprintf(output
, "\n");
3759 fprintf(output
, "%d atoms:\n", regexp
->nbAtoms
);
3760 for (i
= 0;i
< regexp
->nbAtoms
; i
++) {
3761 fprintf(output
, " %02d ", i
);
3762 xmlRegPrintAtom(output
, regexp
->atoms
[i
]);
3764 fprintf(output
, "%d states:", regexp
->nbStates
);
3765 fprintf(output
, "\n");
3766 for (i
= 0;i
< regexp
->nbStates
; i
++) {
3767 xmlRegPrintState(output
, regexp
->states
[i
]);
3769 fprintf(output
, "%d counters:\n", regexp
->nbCounters
);
3770 for (i
= 0;i
< regexp
->nbCounters
; i
++) {
3771 fprintf(output
, " %d: min %d max %d\n", i
, regexp
->counters
[i
].min
,
3772 regexp
->counters
[i
].max
);
3778 * @regexp: a regular expression string
3780 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
3781 * Appendix F and build an automata suitable for testing strings against
3782 * that regular expression
3784 * Returns the compiled expression or NULL in case of error
3787 xmlRegexpCompile(const xmlChar
*regexp
) {
3789 xmlRegParserCtxtPtr ctxt
;
3791 ctxt
= xmlRegNewParserCtxt(regexp
);
3795 /* initialize the parser */
3797 ctxt
->start
= ctxt
->state
= xmlRegNewState(ctxt
);
3798 xmlRegStatePush(ctxt
, ctxt
->start
);
3800 /* parse the expression building an automata */
3801 xmlFAParseRegExp(ctxt
, 1);
3803 ERROR("xmlFAParseRegExp: extra characters");
3805 ctxt
->end
= ctxt
->state
;
3806 ctxt
->start
->type
= XML_REGEXP_START_STATE
;
3807 ctxt
->end
->type
= XML_REGEXP_FINAL_STATE
;
3809 /* remove the Epsilon except for counted transitions */
3810 xmlFAEliminateEpsilonTransitions(ctxt
);
3813 if (ctxt
->error
!= 0) {
3814 xmlRegFreeParserCtxt(ctxt
);
3817 ret
= xmlRegEpxFromParse(ctxt
);
3818 xmlRegFreeParserCtxt(ctxt
);
3824 * @comp: the compiled regular expression
3825 * @content: the value to check against the regular expression
3827 * Check if the regular expression generate the value
3829 * Returns 1 if it matches, 0 if not and a negativa value in case of error
3832 xmlRegexpExec(xmlRegexpPtr comp
, const xmlChar
*content
) {
3833 if ((comp
== NULL
) || (content
== NULL
))
3835 return(xmlFARegExec(comp
, content
));
3839 * xmlRegexpIsDeterminist:
3840 * @comp: the compiled regular expression
3842 * Check if the regular expression is determinist
3844 * Returns 1 if it yes, 0 if not and a negativa value in case of error
3847 xmlRegexpIsDeterminist(xmlRegexpPtr comp
) {
3853 if (comp
->determinist
!= -1)
3854 return(comp
->determinist
);
3856 am
= xmlNewAutomata();
3857 if (am
->states
!= NULL
) {
3860 for (i
= 0;i
< am
->nbStates
;i
++)
3861 xmlRegFreeState(am
->states
[i
]);
3862 xmlFree(am
->states
);
3864 am
->nbAtoms
= comp
->nbAtoms
;
3865 am
->atoms
= comp
->atoms
;
3866 am
->nbStates
= comp
->nbStates
;
3867 am
->states
= comp
->states
;
3868 am
->determinist
= -1;
3869 ret
= xmlFAComputesDeterminism(am
);
3872 xmlFreeAutomata(am
);
3878 * @regexp: the regexp
3883 xmlRegFreeRegexp(xmlRegexpPtr regexp
) {
3888 if (regexp
->string
!= NULL
)
3889 xmlFree(regexp
->string
);
3890 if (regexp
->states
!= NULL
) {
3891 for (i
= 0;i
< regexp
->nbStates
;i
++)
3892 xmlRegFreeState(regexp
->states
[i
]);
3893 xmlFree(regexp
->states
);
3895 if (regexp
->atoms
!= NULL
) {
3896 for (i
= 0;i
< regexp
->nbAtoms
;i
++)
3897 xmlRegFreeAtom(regexp
->atoms
[i
]);
3898 xmlFree(regexp
->atoms
);
3900 if (regexp
->counters
!= NULL
)
3901 xmlFree(regexp
->counters
);
3902 if (regexp
->compact
!= NULL
)
3903 xmlFree(regexp
->compact
);
3904 if (regexp
->transdata
!= NULL
)
3905 xmlFree(regexp
->transdata
);
3906 if (regexp
->stringMap
!= NULL
) {
3907 for (i
= 0; i
< regexp
->nbstrings
;i
++)
3908 xmlFree(regexp
->stringMap
[i
]);
3909 xmlFree(regexp
->stringMap
);
3915 #ifdef LIBXML_AUTOMATA_ENABLED
3916 /************************************************************************
3918 * The Automata interface *
3920 ************************************************************************/
3925 * Create a new automata
3927 * Returns the new object or NULL in case of failure
3930 xmlNewAutomata(void) {
3931 xmlAutomataPtr ctxt
;
3933 ctxt
= xmlRegNewParserCtxt(NULL
);
3937 /* initialize the parser */
3939 ctxt
->start
= ctxt
->state
= xmlRegNewState(ctxt
);
3940 xmlRegStatePush(ctxt
, ctxt
->start
);
3952 xmlFreeAutomata(xmlAutomataPtr am
) {
3955 xmlRegFreeParserCtxt(am
);
3959 * xmlAutomataGetInitState:
3962 * Returns the initial state of the automata
3965 xmlAutomataGetInitState(xmlAutomataPtr am
) {
3972 * xmlAutomataSetFinalState:
3974 * @state: a state in this automata
3976 * Makes that state a final state
3978 * Returns 0 or -1 in case of error
3981 xmlAutomataSetFinalState(xmlAutomataPtr am
, xmlAutomataStatePtr state
) {
3982 if ((am
== NULL
) || (state
== NULL
))
3984 state
->type
= XML_REGEXP_FINAL_STATE
;
3989 * xmlAutomataNewTransition:
3991 * @from: the starting point of the transition
3992 * @to: the target point of the transition or NULL
3993 * @token: the input string associated to that transition
3994 * @data: data passed to the callback function if the transition is activated
3996 * If @to is NULL, this create first a new target state in the automata
3997 * and then adds a transition from the @from state to the target state
3998 * activated by the value of @token
4000 * Returns the target state or NULL in case of error
4003 xmlAutomataNewTransition(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
4004 xmlAutomataStatePtr to
, const xmlChar
*token
,
4008 if ((am
== NULL
) || (from
== NULL
) || (token
== NULL
))
4010 atom
= xmlRegNewAtom(am
, XML_REGEXP_STRING
);
4014 atom
->valuep
= xmlStrdup(token
);
4016 xmlFAGenerateTransitions(am
, from
, to
, atom
);
4023 * xmlAutomataNewCountTrans:
4025 * @from: the starting point of the transition
4026 * @to: the target point of the transition or NULL
4027 * @token: the input string associated to that transition
4028 * @min: the minimum successive occurences of token
4029 * @min: the maximum successive occurences of token
4031 * If @to is NULL, this create first a new target state in the automata
4032 * and then adds a transition from the @from state to the target state
4033 * activated by a succession of input of value @token and whose number
4034 * is between @min and @max
4036 * Returns the target state or NULL in case of error
4039 xmlAutomataNewCountTrans(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
4040 xmlAutomataStatePtr to
, const xmlChar
*token
,
4041 int min
, int max
, void *data
) {
4044 if ((am
== NULL
) || (from
== NULL
) || (token
== NULL
))
4048 if ((max
< min
) || (max
< 1))
4050 atom
= xmlRegNewAtom(am
, XML_REGEXP_STRING
);
4053 atom
->valuep
= xmlStrdup(token
);
4061 xmlFAGenerateTransitions(am
, from
, to
, atom
);
4067 xmlFAGenerateEpsilonTransition(am
, from
, to
);
4072 * xmlAutomataNewOnceTrans:
4074 * @from: the starting point of the transition
4075 * @to: the target point of the transition or NULL
4076 * @token: the input string associated to that transition
4077 * @min: the minimum successive occurences of token
4078 * @min: the maximum successive occurences of token
4080 * If @to is NULL, this create first a new target state in the automata
4081 * and then adds a transition from the @from state to the target state
4082 * activated by a succession of input of value @token and whose number
4083 * is between @min and @max, moreover that transistion can only be crossed
4086 * Returns the target state or NULL in case of error
4089 xmlAutomataNewOnceTrans(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
4090 xmlAutomataStatePtr to
, const xmlChar
*token
,
4091 int min
, int max
, void *data
) {
4095 if ((am
== NULL
) || (from
== NULL
) || (token
== NULL
))
4099 if ((max
< min
) || (max
< 1))
4101 atom
= xmlRegNewAtom(am
, XML_REGEXP_STRING
);
4104 atom
->valuep
= xmlStrdup(token
);
4106 atom
->quant
= XML_REGEXP_QUANT_ONCEONLY
;
4113 * associate a counter to the transition.
4115 counter
= xmlRegGetCounter(am
);
4116 am
->counters
[counter
].min
= 1;
4117 am
->counters
[counter
].max
= 1;
4119 /* xmlFAGenerateTransitions(am, from, to, atom); */
4121 to
= xmlRegNewState(am
);
4122 xmlRegStatePush(am
, to
);
4124 xmlRegStateAddTrans(am
, from
, atom
, to
, counter
, -1);
4125 xmlRegAtomPush(am
, atom
);
4135 * xmlAutomataNewState:
4138 * Create a new disconnected state in the automata
4140 * Returns the new state or NULL in case of error
4143 xmlAutomataNewState(xmlAutomataPtr am
) {
4144 xmlAutomataStatePtr to
;
4148 to
= xmlRegNewState(am
);
4149 xmlRegStatePush(am
, to
);
4154 * xmlAutomataNewTransition:
4156 * @from: the starting point of the transition
4157 * @to: the target point of the transition or NULL
4159 * If @to is NULL, this create first a new target state in the automata
4160 * and then adds a an epsilon transition from the @from state to the
4163 * Returns the target state or NULL in case of error
4166 xmlAutomataNewEpsilon(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
4167 xmlAutomataStatePtr to
) {
4168 if ((am
== NULL
) || (from
== NULL
))
4170 xmlFAGenerateEpsilonTransition(am
, from
, to
);
4177 * xmlAutomataNewAllTrans:
4179 * @from: the starting point of the transition
4180 * @to: the target point of the transition or NULL
4182 * If @to is NULL, this create first a new target state in the automata
4183 * and then adds a an ALL transition from the @from state to the
4184 * target state. That transition is an epsilon transition allowed only when
4185 * all transitions from the @from node have been activated.
4187 * Returns the target state or NULL in case of error
4190 xmlAutomataNewAllTrans(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
4191 xmlAutomataStatePtr to
, int lax
) {
4192 if ((am
== NULL
) || (from
== NULL
))
4194 xmlFAGenerateAllTransition(am
, from
, to
, lax
);
4201 * xmlAutomataNewCounter:
4203 * @min: the minimal value on the counter
4204 * @max: the maximal value on the counter
4206 * Create a new counter
4208 * Returns the counter number or -1 in case of error
4211 xmlAutomataNewCounter(xmlAutomataPtr am
, int min
, int max
) {
4217 ret
= xmlRegGetCounter(am
);
4220 am
->counters
[ret
].min
= min
;
4221 am
->counters
[ret
].max
= max
;
4226 * xmlAutomataNewCountedTrans:
4228 * @from: the starting point of the transition
4229 * @to: the target point of the transition or NULL
4230 * @counter: the counter associated to that transition
4232 * If @to is NULL, this create first a new target state in the automata
4233 * and then adds an epsilon transition from the @from state to the target state
4234 * which will increment the counter provided
4236 * Returns the target state or NULL in case of error
4239 xmlAutomataNewCountedTrans(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
4240 xmlAutomataStatePtr to
, int counter
) {
4241 if ((am
== NULL
) || (from
== NULL
) || (counter
< 0))
4243 xmlFAGenerateCountedEpsilonTransition(am
, from
, to
, counter
);
4250 * xmlAutomataNewCounterTrans:
4252 * @from: the starting point of the transition
4253 * @to: the target point of the transition or NULL
4254 * @counter: the counter associated to that transition
4256 * If @to is NULL, this create first a new target state in the automata
4257 * and then adds an epsilon transition from the @from state to the target state
4258 * which will be allowed only if the counter is within the right range.
4260 * Returns the target state or NULL in case of error
4263 xmlAutomataNewCounterTrans(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
4264 xmlAutomataStatePtr to
, int counter
) {
4265 if ((am
== NULL
) || (from
== NULL
) || (counter
< 0))
4267 xmlFAGenerateCountedTransition(am
, from
, to
, counter
);
4274 * xmlAutomataCompile:
4277 * Compile the automata into a Reg Exp ready for being executed.
4278 * The automata should be free after this point.
4280 * Returns the compiled regexp or NULL in case of error
4283 xmlAutomataCompile(xmlAutomataPtr am
) {
4286 xmlFAEliminateEpsilonTransitions(am
);
4287 /* xmlFAComputesDeterminism(am); */
4288 ret
= xmlRegEpxFromParse(am
);
4294 * xmlAutomataIsDeterminist:
4297 * Checks if an automata is determinist.
4299 * Returns 1 if true, 0 if not, and -1 in case of error
4302 xmlAutomataIsDeterminist(xmlAutomataPtr am
) {
4308 ret
= xmlFAComputesDeterminism(am
);
4311 #endif /* LIBXML_AUTOMATA_ENABLED */
4312 #endif /* LIBXML_REGEXP_ENABLED */