new release
[cxml-rng.git] / nppcre.lisp
blobed624a7d3ed3356ad35302c405d4383adccb2685
1 ;;; -*- show-trailing-whitespace: t; indent-tabs: nil -*-
3 ;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
5 ;;; hacked for XSD regular expressions by David Lichteblau in 2007:
7 ;;; - no comments and extended stuff
8 ;;; - no (?
9 ;;; - no greedyness modifier
10 ;;; - fewer and different backslash-escapes: \i \I \c \C \.
11 ;;; - character set substraction: [foo-[bar]]
12 ;;; - no ^ and $, but always wrap those around the complete parse tree
13 ;;; - ...
15 ;;; Derived from:
16 ;;; /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.24 2005/04/01 21:29:09 edi Exp
17 ;;; and
18 ;;; /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.21 2005/08/03 21:11:27 edi Exp
20 ;;; Redistribution and use in source and binary forms, with or without
21 ;;; modification, are permitted provided that the following conditions
22 ;;; are met:
24 ;;; * Redistributions of source code must retain the above copyright
25 ;;; notice, this list of conditions and the following disclaimer.
27 ;;; * Redistributions in binary form must reproduce the above
28 ;;; copyright notice, this list of conditions and the following
29 ;;; disclaimer in the documentation and/or other materials
30 ;;; provided with the distribution.
32 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
33 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
34 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35 ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
36 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
38 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
39 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
40 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
42 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44 (in-package :cxml-types)
46 (eval-when (:compile-toplevel :load-toplevel :execute)
47 (defparameter *standard-optimize-settings* '(optimize)))
49 (defvar *in-pattern-parser-p* nil)
51 (defun signal-ppcre-syntax-error (fmt &rest args)
52 (error "invalid pattern: ~?" fmt args))
54 (defun signal-ppcre-syntax-error* (pos fmt &rest args)
55 (error "invalid pattern at ~D: ~?" pos fmt args))
57 (defmacro maybe-coerce-to-simple-string (string)
58 (let ((=string= (gensym)))
59 `(let ((,=string= ,string))
60 (cond ((simple-string-p ,=string=)
61 ,=string=)
63 (coerce ,=string= 'simple-string))))))
65 (defun map-char-to-special-char-class (chr lexer)
66 (declare #.*standard-optimize-settings*)
67 "Maps escaped characters like \"\\d\" to the tokens which represent
68 their associated character classes."
69 (case chr
70 (#\. '\.)
71 (#\s '\\s) (#\i '\\i) (#\c '\\c) (#\d '\\d) (#\w '\\w)
72 (#\S '^s) (#\I '^i) (#\C '^c) (#\D '^d) (#\W '^w)
73 (#\p
74 (unless (eql (next-char lexer) #\{)
75 (signal-ppcre-syntax-error "Missing open brace after \\p"))
76 (let* ((bag (loop
77 for c = (next-char lexer)
78 for last = (eql c #\})
79 and done = nil then last
80 until done
81 unless c do
82 (signal-ppcre-syntax-error
83 "Missing close brace after \\p")
84 collect c))
85 (bag (coerce (list* #\p #\{ bag) 'string)))
86 (or (find-symbol bag 'cxml-types)
87 (signal-ppcre-syntax-error "Invalid character property: ~A"
88 bag))))))
90 (locally
91 (declare #.*standard-optimize-settings*)
92 (defstruct (lexer (:constructor make-lexer-internal))
93 "LEXER structures are used to hold the regex string which is
94 currently lexed and to keep track of the lexer's state."
95 (str ""
96 :type string
97 :read-only t)
98 (len 0
99 :type fixnum
100 :read-only t)
101 (pos 0
102 :type fixnum)
103 (last-pos nil
104 :type list)))
106 (defun make-lexer (string)
107 (declare (inline make-lexer-internal)
108 #-genera (type string string))
109 (make-lexer-internal :str (maybe-coerce-to-simple-string string)
110 :len (length string)))
112 (declaim (inline end-of-string-p))
113 (defun end-of-string-p (lexer)
114 (declare #.*standard-optimize-settings*)
115 "Tests whether we're at the end of the regex string."
116 (<= (lexer-len lexer)
117 (lexer-pos lexer)))
119 (declaim (inline looking-at-p))
120 (defun looking-at-p (lexer chr)
121 (declare #.*standard-optimize-settings*)
122 "Tests whether the next character the lexer would see is CHR.
123 Does not respect extended mode."
124 (and (not (end-of-string-p lexer))
125 (char= (schar (lexer-str lexer) (lexer-pos lexer))
126 chr)))
128 (declaim (inline next-char-non-extended))
129 (defun next-char-non-extended (lexer)
130 (declare #.*standard-optimize-settings*)
131 "Returns the next character which is to be examined and updates the
132 POS slot. Does not respect extended mode."
133 (cond ((end-of-string-p lexer)
134 nil)
136 (prog1
137 (schar (lexer-str lexer) (lexer-pos lexer))
138 (incf (lexer-pos lexer))))))
140 (defun next-char (lexer)
141 (declare #.*standard-optimize-settings*)
142 (next-char-non-extended lexer))
144 (declaim (inline fail))
145 (defun fail (lexer)
146 (declare #.*standard-optimize-settings*)
147 "Moves (LEXER-POS LEXER) back to the last position stored in
148 \(LEXER-LAST-POS LEXER) and pops the LAST-POS stack."
149 (unless (lexer-last-pos lexer)
150 (signal-ppcre-syntax-error "LAST-POS stack of LEXER ~A is empty" lexer))
151 (setf (lexer-pos lexer) (pop (lexer-last-pos lexer)))
152 nil)
154 (defun get-number (lexer &key (radix 10) max-length no-whitespace-p)
155 (declare #.*standard-optimize-settings*)
156 "Read and consume the number the lexer is currently looking at and
157 return it. Returns NIL if no number could be identified.
158 RADIX is used as in PARSE-INTEGER. If MAX-LENGTH is not NIL we'll read
159 at most the next MAX-LENGTH characters. If NO-WHITESPACE-P is not NIL
160 we don't tolerate whitespace in front of the number."
161 (when (or (end-of-string-p lexer)
162 (and no-whitespace-p
163 (not (find (schar (lexer-str lexer) (lexer-pos lexer))
164 "0123456789"))))
165 (return-from get-number nil))
166 (multiple-value-bind (integer new-pos)
167 (parse-integer (lexer-str lexer)
168 :start (lexer-pos lexer)
169 :end (if max-length
170 (let ((end-pos (+ (lexer-pos lexer)
171 (the fixnum max-length)))
172 (lexer-len (lexer-len lexer)))
173 (if (< end-pos lexer-len)
174 end-pos
175 lexer-len))
176 (lexer-len lexer))
177 :radix radix
178 :junk-allowed t)
179 (cond ((and integer (>= (the fixnum integer) 0))
180 (setf (lexer-pos lexer) new-pos)
181 integer)
182 (t nil))))
184 (declaim (inline make-char-from-code))
185 (defun make-char-from-code (number error-pos)
186 (declare #.*standard-optimize-settings*)
187 "Create character from char-code NUMBER. NUMBER can be NIL
188 which is interpreted as 0. ERROR-POS is the position where
189 the corresponding number started within the regex string."
190 ;; only look at rightmost eight bits in compliance with Perl
191 (let ((code (logand #o377 (the fixnum (or number 0)))))
192 (or (and (< code char-code-limit)
193 (code-char code))
194 (signal-ppcre-syntax-error*
195 error-pos
196 "No character for hex-code ~X"
197 number))))
199 (defun unescape-char (lexer)
200 (declare #.*standard-optimize-settings*)
201 "Convert the characters(s) following a backslash into a token
202 which is returned. This function is to be called when the backslash
203 has already been consumed. Special character classes like \\W are
204 handled elsewhere."
205 (when (end-of-string-p lexer)
206 (signal-ppcre-syntax-error "String ends with backslash"))
207 (let ((chr (next-char-non-extended lexer)))
208 (case chr
209 ;; the following five character names are 'semi-standard'
210 ;; according to the CLHS but I'm not aware of any implementation
211 ;; that doesn't implement them
212 ((#\t)
213 #\Tab)
214 ((#\n)
215 #\Newline)
216 ((#\r)
217 #\Return)
218 (otherwise
219 ;; all other characters aren't affected by a backslash
220 chr))))
222 (defun convert-substraction (r s)
223 (flet ((rangify (x)
224 (etypecase x
225 (character `((:range ,x ,x)))
226 (list
227 (ecase (car x)
228 (:range (list x))
229 (:property (copy-list (symbol-value (second x))))))
230 (symbol (copy-list (symbol-value x))))))
231 (ranges- (mapcan #'rangify r) (mapcan #'rangify s))))
233 (defun collect-char-class (lexer)
234 (declare #.*standard-optimize-settings*)
235 "Reads and consumes characters from regex string until a right
236 bracket is seen. Assembles them into a list \(which is returned) of
237 characters, character ranges, like \(:RANGE #\\A #\\E) for a-e, and
238 tokens representing special character classes."
239 (let ((start-pos (lexer-pos lexer)) ; remember start for error message
240 hyphen-seen
241 last-char
242 list)
243 (flet ((handle-char (c)
244 "Do the right thing with character C depending on whether
245 we're inside a range or not."
246 (cond ((and hyphen-seen last-char)
247 (setf (car list) (list :range last-char c)
248 last-char nil))
250 (push c list)
251 (setq last-char c)))
252 (setq hyphen-seen nil)))
253 (loop for first = t then nil
254 for c = (next-char-non-extended lexer)
255 ;; leave loop if at end of string
256 while c
257 do (cond
258 ((char= c #\\)
259 ;; we've seen a backslash
260 (let ((next-char (next-char-non-extended lexer)))
261 (case next-char
262 ((#\. #\i #\I #\c #\C #\d #\D #\w #\W #\s #\S #\p)
263 ;; a special character class
264 (push (list :property
265 (map-char-to-special-char-class
266 next-char
267 lexer))
268 list)
269 ;; if the last character was a hyphen
270 ;; just collect it literally
271 (when hyphen-seen
272 (push #\- list))
273 ;; if the next character is a hyphen do the same
274 (when (looking-at-p lexer #\-)
275 (incf (lexer-pos lexer))
276 (when (looking-at-p lexer #\[)
277 (incf (lexer-pos lexer))
278 (return-from collect-char-class
279 (prog1
280 (convert-substraction
281 (nreverse list)
282 (collect-char-class lexer))
283 (unless
284 (eql (next-char-non-extended lexer) #\])
285 (signal-ppcre-syntax-error*
286 start-pos
287 "Missing right bracket to close character class")))))
288 (push #\- list))
289 (setq hyphen-seen nil))
290 (otherwise
291 ;; otherwise unescape the following character(s)
292 (decf (lexer-pos lexer))
293 (handle-char (unescape-char lexer))))))
294 (first
295 ;; the first character must not be a right bracket
296 ;; and isn't treated specially if it's a hyphen
297 (handle-char c))
298 ((char= c #\])
299 ;; end of character class
300 ;; make sure we collect a pending hyphen
301 (when hyphen-seen
302 (setq hyphen-seen nil)
303 (handle-char #\-))
304 ;; reverse the list to preserve the order intended
305 ;; by the author of the regex string
306 (return-from collect-char-class (nreverse list)))
307 ((and hyphen-seen (char= c #\[))
308 (return-from collect-char-class
309 (prog1
310 (convert-substraction
311 (nreverse list)
312 (collect-char-class lexer))
313 (unless (eql (next-char-non-extended lexer) #\])
314 (signal-ppcre-syntax-error*
315 start-pos
316 "Missing right bracket to close character class")))))
317 ((and (char= c #\-)
318 last-char
319 (not hyphen-seen))
320 ;; if the last character was 'just a character'
321 ;; we expect to be in the middle of a range
322 (setq hyphen-seen t))
323 ((char= c #\-)
324 ;; otherwise this is just an ordinary hyphen
325 (handle-char #\-))
327 ;; default case - just collect the character
328 (handle-char c))))
329 ;; we can only exit the loop normally if we've reached the end
330 ;; of the regex string without seeing a right bracket
331 (signal-ppcre-syntax-error*
332 start-pos
333 "Missing right bracket to close character class"))))
335 (defun get-quantifier (lexer)
336 (declare #.*standard-optimize-settings*)
337 "Returns a list of two values (min max) if what the lexer is looking
338 at can be interpreted as a quantifier. Otherwise returns NIL and
339 resets the lexer to its old position."
340 ;; remember starting position for FAIL and UNGET-TOKEN functions
341 (push (lexer-pos lexer) (lexer-last-pos lexer))
342 (let ((next-char (next-char lexer)))
343 (case next-char
344 ((#\*)
345 ;; * (Kleene star): match 0 or more times
346 '(0 nil))
347 ((#\+)
348 ;; +: match 1 or more times
349 '(1 nil))
350 ((#\?)
351 ;; ?: match 0 or 1 times
352 '(0 1))
353 ((#\{)
354 ;; one of
355 ;; {n}: match exactly n times
356 ;; {n,}: match at least n times
357 ;; {n,m}: match at least n but not more than m times
358 ;; note that anything not matching one of these patterns will
359 ;; be interpreted literally - even whitespace isn't allowed
360 (let ((num1 (get-number lexer :no-whitespace-p t)))
361 (if num1
362 (let ((next-char (next-char-non-extended lexer)))
363 (case next-char
364 ((#\,)
365 (let* ((num2 (get-number lexer :no-whitespace-p t))
366 (next-char (next-char-non-extended lexer)))
367 (case next-char
368 ((#\})
369 ;; this is the case {n,} (NUM2 is NIL) or {n,m}
370 (list num1 num2))
371 (otherwise
372 (fail lexer)))))
373 ((#\})
374 ;; this is the case {n}
375 (list num1 num1))
376 (otherwise
377 (fail lexer))))
378 ;; no number following left curly brace, so we treat it
379 ;; like a normal character
380 (fail lexer))))
381 ;; cannot be a quantifier
382 (otherwise
383 (fail lexer)))))
385 (defun get-token (lexer)
386 (declare #.*standard-optimize-settings*)
387 "Returns and consumes the next token from the regex string (or NIL)."
388 ;; remember starting position for UNGET-TOKEN function
389 (push (lexer-pos lexer)
390 (lexer-last-pos lexer))
391 (let ((next-char (next-char lexer)))
392 (cond (next-char
393 (case next-char
394 ;; the easy cases first - the following six characters
395 ;; always have a special meaning and get translated
396 ;; into tokens immediately
397 ((#\))
398 :close-paren)
399 ((#\|)
400 :vertical-bar)
401 ((#\?)
402 :question-mark)
403 ((#\.)
404 :everything)
405 ((#\+ #\*)
406 ;; quantifiers will always be consumend by
407 ;; GET-QUANTIFIER, they must not appear here
408 (signal-ppcre-syntax-error*
409 (1- (lexer-pos lexer))
410 "Quantifier '~A' not allowed"
411 next-char))
412 ((#\{)
413 ;; left brace isn't a special character in it's own
414 ;; right but we must check if what follows might
415 ;; look like a quantifier
416 (let ((this-pos (lexer-pos lexer))
417 (this-last-pos (lexer-last-pos lexer)))
418 (unget-token lexer)
419 (when (get-quantifier lexer)
420 (signal-ppcre-syntax-error*
421 (car this-last-pos)
422 "Quantifier '~A' not allowed"
423 (subseq (lexer-str lexer)
424 (car this-last-pos)
425 (lexer-pos lexer))))
426 (setf (lexer-pos lexer) this-pos
427 (lexer-last-pos lexer) this-last-pos)
428 next-char))
429 ((#\[)
430 ;; left bracket always starts a character class
431 (cons (cond ((looking-at-p lexer #\^)
432 (incf (lexer-pos lexer))
433 :inverted-char-class)
435 :char-class))
436 (collect-char-class lexer)))
437 ((#\\)
438 ;; backslash might mean different things so we have
439 ;; to peek one char ahead:
440 (let ((next-char (next-char-non-extended lexer)))
441 (case next-char
442 ((#\. #\i #\I #\c #\C #\d #\D #\w #\W #\s #\S #\p)
443 ;; these will be treated like character classes
444 (list :property
445 (map-char-to-special-char-class next-char lexer)))
446 (otherwise
447 ;; in all other cases just unescape the
448 ;; character
449 (decf (lexer-pos lexer))
450 (unescape-char lexer)))))
451 ((#\()
452 :open-paren)
453 (otherwise
454 ;; all other characters are their own tokens
455 next-char)))
456 ;; we didn't get a character (this if the "else" branch from
457 ;; the first IF), so we don't return a token but NIL
459 (pop (lexer-last-pos lexer))
460 nil))))
462 (declaim (notinline unget-token)) ;FIXME: else AVER in GET-TOKEN
463 (defun unget-token (lexer)
464 (declare #.*standard-optimize-settings*)
465 "Moves the lexer back to the last position stored in the LAST-POS stack."
466 (if (lexer-last-pos lexer)
467 (setf (lexer-pos lexer)
468 (pop (lexer-last-pos lexer)))
469 (error "No token to unget \(this should not happen)")))
471 (declaim (inline start-of-subexpr-p))
472 (defun start-of-subexpr-p (lexer)
473 (declare #.*standard-optimize-settings*)
474 "Tests whether the next token can start a valid sub-expression, i.e.
475 a stand-alone regex."
476 (let* ((pos (lexer-pos lexer))
477 (next-char (next-char lexer)))
478 (not (or (null next-char)
479 (prog1
480 (member (the character next-char)
481 '(#\) #\|)
482 :test #'char=)
483 (setf (lexer-pos lexer) pos))))))
485 (defun group (lexer)
486 (declare #.*standard-optimize-settings*)
487 "Parses and consumes a <group>.
488 The productions are: <group> -> \"(\"<regex>\")\"
489 <legal-token>
490 Will return <parse-tree> or (<grouping-type> <parse-tree>) where
491 <grouping-type> is one of six keywords - see source for details."
492 (let ((open-token (get-token lexer)))
493 (cond ((eq open-token :open-paren)
494 (let* ((open-paren-pos (car (lexer-last-pos lexer)))
495 (reg-expr (reg-expr lexer))
496 (close-token (get-token lexer)))
497 (unless (eq close-token :close-paren)
498 ;; the token following <regex> must be the closing
499 ;; parenthesis or this is a syntax error
500 (signal-ppcre-syntax-error*
501 open-paren-pos
502 "Opening paren has no matching closing paren"))
503 (list :register reg-expr)))
505 ;; this is the <legal-token> production; <legal-token> is
506 ;; any token which passes START-OF-SUBEXPR-P (otherwise
507 ;; parsing had already stopped in the SEQ method)
508 open-token))))
510 (defun greedy-quant (lexer)
511 (declare #.*standard-optimize-settings*)
512 "Parses and consumes a <greedy-quant>.
513 The productions are: <greedy-quant> -> <group> | <group><quantifier>
514 where <quantifier> is parsed by the lexer function GET-QUANTIFIER.
515 Will return <parse-tree> or (:GREEDY-REPETITION <min> <max> <parse-tree>)."
516 (let* ((group (group lexer))
517 (token (get-quantifier lexer)))
518 (if token
519 ;; if GET-QUANTIFIER returned a non-NIL value it's the
520 ;; two-element list (<min> <max>)
521 (list :greedy-repetition (first token) (second token) group)
522 group)))
524 (defun quant (lexer)
525 (declare #.*standard-optimize-settings*)
526 (greedy-quant lexer))
528 (defun seq (lexer)
529 (declare #.*standard-optimize-settings*)
530 "Parses and consumes a <seq>.
531 The productions are: <seq> -> <quant> | <quant><seq>.
532 Will return <parse-tree> or (:SEQUENCE <parse-tree> <parse-tree>)."
533 (flet ((make-array-from-two-chars (char1 char2)
534 (let ((string (make-array 2
535 :element-type 'character
536 :fill-pointer t
537 :adjustable t)))
538 (setf (aref string 0) char1)
539 (setf (aref string 1) char2)
540 string)))
541 ;; Note that we're calling START-OF-SUBEXPR-P before we actually try
542 ;; to parse a <seq> or <quant> in order to catch empty regular
543 ;; expressions
544 (if (start-of-subexpr-p lexer)
545 (let ((quant (quant lexer)))
546 (if (start-of-subexpr-p lexer)
547 (let* ((seq (seq lexer))
548 (quant-is-char-p (characterp quant))
549 (seq-is-sequence-p (and (consp seq)
550 (eq (first seq) :sequence))))
551 (cond ((and quant-is-char-p
552 (characterp seq))
553 (make-array-from-two-chars seq quant))
554 ((and quant-is-char-p
555 (stringp seq))
556 (vector-push-extend quant seq)
557 seq)
558 ((and quant-is-char-p
559 seq-is-sequence-p
560 (characterp (second seq)))
561 (cond ((cddr seq)
562 (setf (cdr seq)
563 (cons
564 (make-array-from-two-chars (second seq)
565 quant)
566 (cddr seq)))
567 seq)
568 (t (make-array-from-two-chars (second seq) quant))))
569 ((and quant-is-char-p
570 seq-is-sequence-p
571 (stringp (second seq)))
572 (cond ((cddr seq)
573 (setf (cdr seq)
574 (cons
575 (progn
576 (vector-push-extend quant (second seq))
577 (second seq))
578 (cddr seq)))
579 seq)
581 (vector-push-extend quant (second seq))
582 (second seq))))
583 (seq-is-sequence-p
584 ;; if <seq> is also a :SEQUENCE parse tree we merge
585 ;; both lists into one to avoid unnecessary consing
586 (setf (cdr seq)
587 (cons quant (cdr seq)))
588 seq)
589 (t (list :sequence quant seq))))
590 quant))
591 :void)))
593 (defun reg-expr (lexer)
594 (declare #.*standard-optimize-settings*)
595 "Parses and consumes a <regex>, a complete regular expression.
596 The productions are: <regex> -> <seq> | <seq>\"|\"<regex>.
597 Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)."
598 (let ((pos (lexer-pos lexer)))
599 (case (next-char lexer)
600 ((nil)
601 ;; if we didn't get any token we return :VOID which stands for
602 ;; "empty regular expression"
603 :void)
604 ((#\|)
605 ;; now check whether the expression started with a vertical
606 ;; bar, i.e. <seq> - the left alternation - is empty
607 (list :alternation :void (reg-expr lexer)))
608 (otherwise
609 ;; otherwise un-read the character we just saw and parse a
610 ;; <seq> plus the character following it
611 (setf (lexer-pos lexer) pos)
612 (let* ((seq (seq lexer))
613 (pos (lexer-pos lexer)))
614 (case (next-char lexer)
615 ((nil)
616 ;; no further character, just a <seq>
617 seq)
618 ((#\|)
619 ;; if the character was a vertical bar, this is an
620 ;; alternation and we have the second production
621 (let ((reg-expr (reg-expr lexer)))
622 (cond ((and (consp reg-expr)
623 (eq (first reg-expr) :alternation))
624 ;; again we try to merge as above in SEQ
625 (setf (cdr reg-expr)
626 (cons seq (cdr reg-expr)))
627 reg-expr)
628 (t (list :alternation seq reg-expr)))))
629 (otherwise
630 ;; a character which is not a vertical bar - this is
631 ;; either a syntax error or we're inside of a group and
632 ;; the next character is a closing parenthesis; so we
633 ;; just un-read the character and let another function
634 ;; take care of it
635 (setf (lexer-pos lexer) pos)
636 seq)))))))
638 (defun reverse-strings (parse-tree)
639 (declare #.*standard-optimize-settings*)
640 (cond ((stringp parse-tree)
641 (nreverse parse-tree))
642 ((consp parse-tree)
643 (loop for parse-tree-rest on parse-tree
644 while parse-tree-rest
645 do (setf (car parse-tree-rest)
646 (reverse-strings (car parse-tree-rest))))
647 parse-tree)
648 (t parse-tree)))
650 (defun parse-pattern (string)
651 (declare #.*standard-optimize-settings*)
652 "Translate the regex string STRING into a parse tree."
653 (let* ((*in-pattern-parser-p* t)
654 (lexer (make-lexer string))
655 (parse-tree (reverse-strings (reg-expr lexer))))
656 ;; check whether we've consumed the whole regex string
657 (if (end-of-string-p lexer)
658 `(:sequence :start-anchor ,parse-tree :end-anchor)
659 (signal-ppcre-syntax-error*
660 (lexer-pos lexer)
661 "Expected end of string"))))
663 (defmethod pattern-scanner ((str string))
664 (cl-ppcre:create-scanner (parse-pattern str)))
666 (defmethod pattern-scanner ((scanner function))
667 scanner)