1 ;;; -*- show-trailing-whitespace: t; indent-tabs: nil -*-
3 ;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
5 ;;; hacked for XSD regular expressions by David Lichteblau in 2007:
7 ;;; - no comments and extended stuff
9 ;;; - no greedyness modifier
10 ;;; - fewer and different backslash-escapes: \i \I \c \C \.
11 ;;; - character set substraction: [foo-[bar]]
12 ;;; - no ^ and $, but always wrap those around the complete parse tree
16 ;;; /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.24 2005/04/01 21:29:09 edi Exp
18 ;;; /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.21 2005/08/03 21:11:27 edi Exp
20 ;;; Redistribution and use in source and binary forms, with or without
21 ;;; modification, are permitted provided that the following conditions
24 ;;; * Redistributions of source code must retain the above copyright
25 ;;; notice, this list of conditions and the following disclaimer.
27 ;;; * Redistributions in binary form must reproduce the above
28 ;;; copyright notice, this list of conditions and the following
29 ;;; disclaimer in the documentation and/or other materials
30 ;;; provided with the distribution.
32 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
33 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
34 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35 ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
36 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
38 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
39 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
40 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
42 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44 (in-package :cxml-types
)
46 (eval-when (:compile-toplevel
:load-toplevel
:execute
)
47 (defparameter *standard-optimize-settings
* '(optimize)))
49 (defvar *in-pattern-parser-p
* nil
)
51 (defun signal-ppcre-syntax-error (fmt &rest args
)
52 (error "invalid pattern: ~?" fmt args
))
54 (defun signal-ppcre-syntax-error* (pos fmt
&rest args
)
55 (error "invalid pattern at ~D: ~?" pos fmt args
))
57 (defmacro maybe-coerce-to-simple-string
(string)
58 (let ((=string
= (gensym)))
59 `(let ((,=string
= ,string
))
60 (cond ((simple-string-p ,=string
=)
63 (coerce ,=string
= 'simple-string
))))))
65 (defun map-char-to-special-char-class (chr lexer
)
66 (declare #.
*standard-optimize-settings
*)
67 "Maps escaped characters like \"\\d\" to the tokens which represent
68 their associated character classes."
71 (#\s
'\\s
) (#\i
'\\i
) (#\c
'\\c
) (#\d
'\\d
) (#\w
'\\w
)
72 (#\S
'^s
) (#\I
'^i
) (#\C
'^c
) (#\D
'^d
) (#\W
'^w
)
74 (unless (eql (next-char lexer
) #\
{)
75 (signal-ppcre-syntax-error "Missing open brace after \\p"))
77 for c
= (next-char lexer
)
78 for last
= (eql c
#\
})
79 and done
= nil then last
82 (signal-ppcre-syntax-error
83 "Missing close brace after \\p")
85 (bag (coerce (list* #\p
#\
{ bag
) 'string
)))
86 (or (find-symbol bag
'cxml-types
)
87 (signal-ppcre-syntax-error "Invalid character property: ~A"
91 (declare #.
*standard-optimize-settings
*)
92 (defstruct (lexer (:constructor make-lexer-internal
))
93 "LEXER structures are used to hold the regex string which is
94 currently lexed and to keep track of the lexer's state."
106 (defun make-lexer (string)
107 (declare (inline make-lexer-internal
)
108 #-genera
(type string string
))
109 (make-lexer-internal :str
(maybe-coerce-to-simple-string string
)
110 :len
(length string
)))
112 (declaim (inline end-of-string-p
))
113 (defun end-of-string-p (lexer)
114 (declare #.
*standard-optimize-settings
*)
115 "Tests whether we're at the end of the regex string."
116 (<= (lexer-len lexer
)
119 (declaim (inline looking-at-p
))
120 (defun looking-at-p (lexer chr
)
121 (declare #.
*standard-optimize-settings
*)
122 "Tests whether the next character the lexer would see is CHR.
123 Does not respect extended mode."
124 (and (not (end-of-string-p lexer
))
125 (char= (schar (lexer-str lexer
) (lexer-pos lexer
))
128 (declaim (inline next-char-non-extended
))
129 (defun next-char-non-extended (lexer)
130 (declare #.
*standard-optimize-settings
*)
131 "Returns the next character which is to be examined and updates the
132 POS slot. Does not respect extended mode."
133 (cond ((end-of-string-p lexer
)
137 (schar (lexer-str lexer
) (lexer-pos lexer
))
138 (incf (lexer-pos lexer
))))))
140 (defun next-char (lexer)
141 (declare #.
*standard-optimize-settings
*)
142 (next-char-non-extended lexer
))
144 (declaim (inline fail
))
146 (declare #.
*standard-optimize-settings
*)
147 "Moves (LEXER-POS LEXER) back to the last position stored in
148 \(LEXER-LAST-POS LEXER) and pops the LAST-POS stack."
149 (unless (lexer-last-pos lexer
)
150 (signal-ppcre-syntax-error "LAST-POS stack of LEXER ~A is empty" lexer
))
151 (setf (lexer-pos lexer
) (pop (lexer-last-pos lexer
)))
154 (defun get-number (lexer &key
(radix 10) max-length no-whitespace-p
)
155 (declare #.
*standard-optimize-settings
*)
156 "Read and consume the number the lexer is currently looking at and
157 return it. Returns NIL if no number could be identified.
158 RADIX is used as in PARSE-INTEGER. If MAX-LENGTH is not NIL we'll read
159 at most the next MAX-LENGTH characters. If NO-WHITESPACE-P is not NIL
160 we don't tolerate whitespace in front of the number."
161 (when (or (end-of-string-p lexer
)
163 (not (find (schar (lexer-str lexer
) (lexer-pos lexer
))
165 (return-from get-number nil
))
166 (multiple-value-bind (integer new-pos
)
167 (parse-integer (lexer-str lexer
)
168 :start
(lexer-pos lexer
)
170 (let ((end-pos (+ (lexer-pos lexer
)
171 (the fixnum max-length
)))
172 (lexer-len (lexer-len lexer
)))
173 (if (< end-pos lexer-len
)
179 (cond ((and integer
(>= (the fixnum integer
) 0))
180 (setf (lexer-pos lexer
) new-pos
)
184 (declaim (inline make-char-from-code
))
185 (defun make-char-from-code (number error-pos
)
186 (declare #.
*standard-optimize-settings
*)
187 "Create character from char-code NUMBER. NUMBER can be NIL
188 which is interpreted as 0. ERROR-POS is the position where
189 the corresponding number started within the regex string."
190 ;; only look at rightmost eight bits in compliance with Perl
191 (let ((code (logand #o377
(the fixnum
(or number
0)))))
192 (or (and (< code char-code-limit
)
194 (signal-ppcre-syntax-error*
196 "No character for hex-code ~X"
199 (defun unescape-char (lexer)
200 (declare #.
*standard-optimize-settings
*)
201 "Convert the characters(s) following a backslash into a token
202 which is returned. This function is to be called when the backslash
203 has already been consumed. Special character classes like \\W are
205 (when (end-of-string-p lexer
)
206 (signal-ppcre-syntax-error "String ends with backslash"))
207 (let ((chr (next-char-non-extended lexer
)))
209 ;; the following five character names are 'semi-standard'
210 ;; according to the CLHS but I'm not aware of any implementation
211 ;; that doesn't implement them
219 ;; all other characters aren't affected by a backslash
222 (defun convert-substraction (r s
)
225 (character `((:range
,x
,x
)))
229 (:property
(copy-list (symbol-value (second x
))))))
230 (symbol (copy-list (symbol-value x
))))))
231 (ranges- (mapcan #'rangify r
) (mapcan #'rangify s
))))
233 (defun collect-char-class (lexer)
234 (declare #.
*standard-optimize-settings
*)
235 "Reads and consumes characters from regex string until a right
236 bracket is seen. Assembles them into a list \(which is returned) of
237 characters, character ranges, like \(:RANGE #\\A #\\E) for a-e, and
238 tokens representing special character classes."
239 (let ((start-pos (lexer-pos lexer
)) ; remember start for error message
243 (flet ((handle-char (c)
244 "Do the right thing with character C depending on whether
245 we're inside a range or not."
246 (cond ((and hyphen-seen last-char
)
247 (setf (car list
) (list :range last-char c
)
252 (setq hyphen-seen nil
)))
253 (loop for first
= t then nil
254 for c
= (next-char-non-extended lexer
)
255 ;; leave loop if at end of string
259 ;; we've seen a backslash
260 (let ((next-char (next-char-non-extended lexer
)))
262 ((#\.
#\i
#\I
#\c
#\C
#\d
#\D
#\w
#\W
#\s
#\S
#\p
)
263 ;; a special character class
264 (push (list :property
265 (map-char-to-special-char-class
269 ;; if the last character was a hyphen
270 ;; just collect it literally
273 ;; if the next character is a hyphen do the same
274 (when (looking-at-p lexer
#\-
)
275 (incf (lexer-pos lexer
))
276 (when (looking-at-p lexer
#\
[)
277 (incf (lexer-pos lexer
))
278 (return-from collect-char-class
280 (convert-substraction
282 (collect-char-class lexer
))
284 (eql (next-char-non-extended lexer
) #\
])
285 (signal-ppcre-syntax-error*
287 "Missing right bracket to close character class")))))
289 (setq hyphen-seen nil
))
291 ;; otherwise unescape the following character(s)
292 (decf (lexer-pos lexer
))
293 (handle-char (unescape-char lexer
))))))
295 ;; the first character must not be a right bracket
296 ;; and isn't treated specially if it's a hyphen
299 ;; end of character class
300 ;; make sure we collect a pending hyphen
302 (setq hyphen-seen nil
)
304 ;; reverse the list to preserve the order intended
305 ;; by the author of the regex string
306 (return-from collect-char-class
(nreverse list
)))
307 ((and hyphen-seen
(char= c
#\
[))
308 (return-from collect-char-class
310 (convert-substraction
312 (collect-char-class lexer
))
313 (unless (eql (next-char-non-extended lexer
) #\
])
314 (signal-ppcre-syntax-error*
316 "Missing right bracket to close character class")))))
320 ;; if the last character was 'just a character'
321 ;; we expect to be in the middle of a range
322 (setq hyphen-seen t
))
324 ;; otherwise this is just an ordinary hyphen
327 ;; default case - just collect the character
329 ;; we can only exit the loop normally if we've reached the end
330 ;; of the regex string without seeing a right bracket
331 (signal-ppcre-syntax-error*
333 "Missing right bracket to close character class"))))
335 (defun get-quantifier (lexer)
336 (declare #.
*standard-optimize-settings
*)
337 "Returns a list of two values (min max) if what the lexer is looking
338 at can be interpreted as a quantifier. Otherwise returns NIL and
339 resets the lexer to its old position."
340 ;; remember starting position for FAIL and UNGET-TOKEN functions
341 (push (lexer-pos lexer
) (lexer-last-pos lexer
))
342 (let ((next-char (next-char lexer
)))
345 ;; * (Kleene star): match 0 or more times
348 ;; +: match 1 or more times
351 ;; ?: match 0 or 1 times
355 ;; {n}: match exactly n times
356 ;; {n,}: match at least n times
357 ;; {n,m}: match at least n but not more than m times
358 ;; note that anything not matching one of these patterns will
359 ;; be interpreted literally - even whitespace isn't allowed
360 (let ((num1 (get-number lexer
:no-whitespace-p t
)))
362 (let ((next-char (next-char-non-extended lexer
)))
365 (let* ((num2 (get-number lexer
:no-whitespace-p t
))
366 (next-char (next-char-non-extended lexer
)))
369 ;; this is the case {n,} (NUM2 is NIL) or {n,m}
374 ;; this is the case {n}
378 ;; no number following left curly brace, so we treat it
379 ;; like a normal character
381 ;; cannot be a quantifier
385 (defun get-token (lexer)
386 (declare #.
*standard-optimize-settings
*)
387 "Returns and consumes the next token from the regex string (or NIL)."
388 ;; remember starting position for UNGET-TOKEN function
389 (push (lexer-pos lexer
)
390 (lexer-last-pos lexer
))
391 (let ((next-char (next-char lexer
)))
394 ;; the easy cases first - the following six characters
395 ;; always have a special meaning and get translated
396 ;; into tokens immediately
406 ;; quantifiers will always be consumend by
407 ;; GET-QUANTIFIER, they must not appear here
408 (signal-ppcre-syntax-error*
409 (1- (lexer-pos lexer
))
410 "Quantifier '~A' not allowed"
413 ;; left brace isn't a special character in it's own
414 ;; right but we must check if what follows might
415 ;; look like a quantifier
416 (let ((this-pos (lexer-pos lexer
))
417 (this-last-pos (lexer-last-pos lexer
)))
419 (when (get-quantifier lexer
)
420 (signal-ppcre-syntax-error*
422 "Quantifier '~A' not allowed"
423 (subseq (lexer-str lexer
)
426 (setf (lexer-pos lexer
) this-pos
427 (lexer-last-pos lexer
) this-last-pos
)
430 ;; left bracket always starts a character class
431 (cons (cond ((looking-at-p lexer
#\^
)
432 (incf (lexer-pos lexer
))
433 :inverted-char-class
)
436 (collect-char-class lexer
)))
438 ;; backslash might mean different things so we have
439 ;; to peek one char ahead:
440 (let ((next-char (next-char-non-extended lexer
)))
442 ((#\.
#\i
#\I
#\c
#\C
#\d
#\D
#\w
#\W
#\s
#\S
#\p
)
443 ;; these will be treated like character classes
445 (map-char-to-special-char-class next-char lexer
)))
447 ;; in all other cases just unescape the
449 (decf (lexer-pos lexer
))
450 (unescape-char lexer
)))))
454 ;; all other characters are their own tokens
456 ;; we didn't get a character (this if the "else" branch from
457 ;; the first IF), so we don't return a token but NIL
459 (pop (lexer-last-pos lexer
))
462 (declaim (notinline unget-token
)) ;FIXME: else AVER in GET-TOKEN
463 (defun unget-token (lexer)
464 (declare #.
*standard-optimize-settings
*)
465 "Moves the lexer back to the last position stored in the LAST-POS stack."
466 (if (lexer-last-pos lexer
)
467 (setf (lexer-pos lexer
)
468 (pop (lexer-last-pos lexer
)))
469 (error "No token to unget \(this should not happen)")))
471 (declaim (inline start-of-subexpr-p
))
472 (defun start-of-subexpr-p (lexer)
473 (declare #.
*standard-optimize-settings
*)
474 "Tests whether the next token can start a valid sub-expression, i.e.
475 a stand-alone regex."
476 (let* ((pos (lexer-pos lexer
))
477 (next-char (next-char lexer
)))
478 (not (or (null next-char
)
480 (member (the character next-char
)
483 (setf (lexer-pos lexer
) pos
))))))
486 (declare #.
*standard-optimize-settings
*)
487 "Parses and consumes a <group>.
488 The productions are: <group> -> \"(\"<regex>\")\"
490 Will return <parse-tree> or (<grouping-type> <parse-tree>) where
491 <grouping-type> is one of six keywords - see source for details."
492 (let ((open-token (get-token lexer
)))
493 (cond ((eq open-token
:open-paren
)
494 (let* ((open-paren-pos (car (lexer-last-pos lexer
)))
495 (reg-expr (reg-expr lexer
))
496 (close-token (get-token lexer
)))
497 (unless (eq close-token
:close-paren
)
498 ;; the token following <regex> must be the closing
499 ;; parenthesis or this is a syntax error
500 (signal-ppcre-syntax-error*
502 "Opening paren has no matching closing paren"))
503 (list :register reg-expr
)))
505 ;; this is the <legal-token> production; <legal-token> is
506 ;; any token which passes START-OF-SUBEXPR-P (otherwise
507 ;; parsing had already stopped in the SEQ method)
510 (defun greedy-quant (lexer)
511 (declare #.
*standard-optimize-settings
*)
512 "Parses and consumes a <greedy-quant>.
513 The productions are: <greedy-quant> -> <group> | <group><quantifier>
514 where <quantifier> is parsed by the lexer function GET-QUANTIFIER.
515 Will return <parse-tree> or (:GREEDY-REPETITION <min> <max> <parse-tree>)."
516 (let* ((group (group lexer
))
517 (token (get-quantifier lexer
)))
519 ;; if GET-QUANTIFIER returned a non-NIL value it's the
520 ;; two-element list (<min> <max>)
521 (list :greedy-repetition
(first token
) (second token
) group
)
525 (declare #.
*standard-optimize-settings
*)
526 (greedy-quant lexer
))
529 (declare #.
*standard-optimize-settings
*)
530 "Parses and consumes a <seq>.
531 The productions are: <seq> -> <quant> | <quant><seq>.
532 Will return <parse-tree> or (:SEQUENCE <parse-tree> <parse-tree>)."
533 (flet ((make-array-from-two-chars (char1 char2
)
534 (let ((string (make-array 2
535 :element-type
'character
538 (setf (aref string
0) char1
)
539 (setf (aref string
1) char2
)
541 ;; Note that we're calling START-OF-SUBEXPR-P before we actually try
542 ;; to parse a <seq> or <quant> in order to catch empty regular
544 (if (start-of-subexpr-p lexer
)
545 (let ((quant (quant lexer
)))
546 (if (start-of-subexpr-p lexer
)
547 (let* ((seq (seq lexer
))
548 (quant-is-char-p (characterp quant
))
549 (seq-is-sequence-p (and (consp seq
)
550 (eq (first seq
) :sequence
))))
551 (cond ((and quant-is-char-p
553 (make-array-from-two-chars seq quant
))
554 ((and quant-is-char-p
556 (vector-push-extend quant seq
)
558 ((and quant-is-char-p
560 (characterp (second seq
)))
564 (make-array-from-two-chars (second seq
)
568 (t (make-array-from-two-chars (second seq
) quant
))))
569 ((and quant-is-char-p
571 (stringp (second seq
)))
576 (vector-push-extend quant
(second seq
))
581 (vector-push-extend quant
(second seq
))
584 ;; if <seq> is also a :SEQUENCE parse tree we merge
585 ;; both lists into one to avoid unnecessary consing
587 (cons quant
(cdr seq
)))
589 (t (list :sequence quant seq
))))
593 (defun reg-expr (lexer)
594 (declare #.
*standard-optimize-settings
*)
595 "Parses and consumes a <regex>, a complete regular expression.
596 The productions are: <regex> -> <seq> | <seq>\"|\"<regex>.
597 Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)."
598 (let ((pos (lexer-pos lexer
)))
599 (case (next-char lexer
)
601 ;; if we didn't get any token we return :VOID which stands for
602 ;; "empty regular expression"
605 ;; now check whether the expression started with a vertical
606 ;; bar, i.e. <seq> - the left alternation - is empty
607 (list :alternation
:void
(reg-expr lexer
)))
609 ;; otherwise un-read the character we just saw and parse a
610 ;; <seq> plus the character following it
611 (setf (lexer-pos lexer
) pos
)
612 (let* ((seq (seq lexer
))
613 (pos (lexer-pos lexer
)))
614 (case (next-char lexer
)
616 ;; no further character, just a <seq>
619 ;; if the character was a vertical bar, this is an
620 ;; alternation and we have the second production
621 (let ((reg-expr (reg-expr lexer
)))
622 (cond ((and (consp reg-expr
)
623 (eq (first reg-expr
) :alternation
))
624 ;; again we try to merge as above in SEQ
626 (cons seq
(cdr reg-expr
)))
628 (t (list :alternation seq reg-expr
)))))
630 ;; a character which is not a vertical bar - this is
631 ;; either a syntax error or we're inside of a group and
632 ;; the next character is a closing parenthesis; so we
633 ;; just un-read the character and let another function
635 (setf (lexer-pos lexer
) pos
)
638 (defun reverse-strings (parse-tree)
639 (declare #.
*standard-optimize-settings
*)
640 (cond ((stringp parse-tree
)
641 (nreverse parse-tree
))
643 (loop for parse-tree-rest on parse-tree
644 while parse-tree-rest
645 do
(setf (car parse-tree-rest
)
646 (reverse-strings (car parse-tree-rest
))))
650 (defun parse-pattern (string)
651 (declare #.
*standard-optimize-settings
*)
652 "Translate the regex string STRING into a parse tree."
653 (let* ((*in-pattern-parser-p
* t
)
654 (lexer (make-lexer string
))
655 (parse-tree (reverse-strings (reg-expr lexer
))))
656 ;; check whether we've consumed the whole regex string
657 (if (end-of-string-p lexer
)
658 `(:sequence
:start-anchor
,parse-tree
:end-anchor
)
659 (signal-ppcre-syntax-error*
661 "Expected end of string"))))
663 (defmethod pattern-scanner ((str string
))
664 (cl-ppcre:create-scanner
(parse-pattern str
)))
666 (defmethod pattern-scanner ((scanner function
))