1 ;;; -*- show-trailing-whitespace: t; indent-tabs: nil -*-
3 ;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
5 ;;; hacked for XSD regular expressions by David Lichteblau in 2007:
7 ;;; - no comments and extended stuff
9 ;;; - no greedyness modifier
10 ;;; - fewer and different backslash-escapes: \i \I \c \C \.
11 ;;; - character set substraction: [foo-[bar]]
12 ;;; - no ^ and $, but always wrap those around the complete parse tree
16 ;;; /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.24 2005/04/01 21:29:09 edi Exp
18 ;;; /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.21 2005/08/03 21:11:27 edi Exp
20 ;;; Redistribution and use in source and binary forms, with or without
21 ;;; modification, are permitted provided that the following conditions
24 ;;; * Redistributions of source code must retain the above copyright
25 ;;; notice, this list of conditions and the following disclaimer.
27 ;;; * Redistributions in binary form must reproduce the above
28 ;;; copyright notice, this list of conditions and the following
29 ;;; disclaimer in the documentation and/or other materials
30 ;;; provided with the distribution.
32 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
33 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
34 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35 ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
36 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
38 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
39 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
40 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
42 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44 (in-package :cxml-types
)
46 (eval-when (:compile-toplevel
:load-toplevel
:execute
)
47 (defparameter *standard-optimize-settings
* '(optimize)))
49 (defvar *in-pattern-parser-p
* nil
)
51 (defvar *convert-char-class-to-hash
* #'cl-ppcre
::convert-char-class-to-hash
)
55 (format t
"Patching CL-PPCRE::CONVERT-CHAR-CLASS-TO-HASH~%")
56 (setf (fdefinition 'cl-ppcre
::convert-char-class-to-hash
)
58 (when *in-pattern-parser-p
*
59 (setf list
(mapcan (lambda (x)
64 (funcall *convert-char-class-to-hash
* list
)))
66 (defun signal-ppcre-syntax-error (fmt &rest args
)
67 (error "invalid pattern: ~?" fmt args
))
69 (defun signal-ppcre-syntax-error* (pos fmt
&rest args
)
70 (error "invalid pattern at ~D: ~?" pos fmt args
))
72 (defmacro maybe-coerce-to-simple-string
(string)
73 (let ((=string
= (gensym)))
74 `(let ((,=string
= ,string
))
75 (cond ((simple-string-p ,=string
=)
78 (coerce ,=string
= 'simple-string
))))))
80 (defun map-char-to-special-char-class (chr lexer
)
81 (declare #.
*standard-optimize-settings
*)
82 "Maps escaped characters like \"\\d\" to the tokens which represent
83 their associated character classes."
86 (#\s
'\\s
) (#\i
'\\i
) (#\c
'\\c
) (#\d
'\\d
) (#\w
'\\w
)
87 (#\S
'^s
) (#\I
'^i
) (#\C
'^c
) (#\D
'^d
) (#\W
'^w
)
89 (unless (eql (next-char lexer
) #\
{)
90 (signal-ppcre-syntax-error "Missing open brace after \\p"))
92 for c
= (next-char lexer
)
93 for last
= (eql c
#\
})
94 and done
= nil then last
97 (signal-ppcre-syntax-error
98 "Missing close brace after \\p")
100 (bag (coerce (list* #\p
#\
{ bag
) 'string
)))
101 (or (find-symbol bag
'cxml-types
)
102 (signal-ppcre-syntax-error "Invalid character property: ~A"
106 (declare #.
*standard-optimize-settings
*)
107 (defstruct (lexer (:constructor make-lexer-internal
))
108 "LEXER structures are used to hold the regex string which is
109 currently lexed and to keep track of the lexer's state."
121 (defun make-lexer (string)
122 (declare (inline make-lexer-internal
)
123 #-genera
(type string string
))
124 (make-lexer-internal :str
(maybe-coerce-to-simple-string string
)
125 :len
(length string
)))
127 (declaim (inline end-of-string-p
))
128 (defun end-of-string-p (lexer)
129 (declare #.
*standard-optimize-settings
*)
130 "Tests whether we're at the end of the regex string."
131 (<= (lexer-len lexer
)
134 (declaim (inline looking-at-p
))
135 (defun looking-at-p (lexer chr
)
136 (declare #.
*standard-optimize-settings
*)
137 "Tests whether the next character the lexer would see is CHR.
138 Does not respect extended mode."
139 (and (not (end-of-string-p lexer
))
140 (char= (schar (lexer-str lexer
) (lexer-pos lexer
))
143 (declaim (inline next-char-non-extended
))
144 (defun next-char-non-extended (lexer)
145 (declare #.
*standard-optimize-settings
*)
146 "Returns the next character which is to be examined and updates the
147 POS slot. Does not respect extended mode."
148 (cond ((end-of-string-p lexer
)
152 (schar (lexer-str lexer
) (lexer-pos lexer
))
153 (incf (lexer-pos lexer
))))))
155 (defun next-char (lexer)
156 (declare #.
*standard-optimize-settings
*)
157 (next-char-non-extended lexer
))
159 (declaim (inline fail
))
161 (declare #.
*standard-optimize-settings
*)
162 "Moves (LEXER-POS LEXER) back to the last position stored in
163 \(LEXER-LAST-POS LEXER) and pops the LAST-POS stack."
164 (unless (lexer-last-pos lexer
)
165 (signal-ppcre-syntax-error "LAST-POS stack of LEXER ~A is empty" lexer
))
166 (setf (lexer-pos lexer
) (pop (lexer-last-pos lexer
)))
169 (defun get-number (lexer &key
(radix 10) max-length no-whitespace-p
)
170 (declare #.
*standard-optimize-settings
*)
171 "Read and consume the number the lexer is currently looking at and
172 return it. Returns NIL if no number could be identified.
173 RADIX is used as in PARSE-INTEGER. If MAX-LENGTH is not NIL we'll read
174 at most the next MAX-LENGTH characters. If NO-WHITESPACE-P is not NIL
175 we don't tolerate whitespace in front of the number."
176 (when (or (end-of-string-p lexer
)
178 (not (find (schar (lexer-str lexer
) (lexer-pos lexer
))
180 (return-from get-number nil
))
181 (multiple-value-bind (integer new-pos
)
182 (parse-integer (lexer-str lexer
)
183 :start
(lexer-pos lexer
)
185 (let ((end-pos (+ (lexer-pos lexer
)
186 (the fixnum max-length
)))
187 (lexer-len (lexer-len lexer
)))
188 (if (< end-pos lexer-len
)
194 (cond ((and integer
(>= (the fixnum integer
) 0))
195 (setf (lexer-pos lexer
) new-pos
)
199 (declaim (inline make-char-from-code
))
200 (defun make-char-from-code (number error-pos
)
201 (declare #.
*standard-optimize-settings
*)
202 "Create character from char-code NUMBER. NUMBER can be NIL
203 which is interpreted as 0. ERROR-POS is the position where
204 the corresponding number started within the regex string."
205 ;; only look at rightmost eight bits in compliance with Perl
206 (let ((code (logand #o377
(the fixnum
(or number
0)))))
207 (or (and (< code char-code-limit
)
209 (signal-ppcre-syntax-error*
211 "No character for hex-code ~X"
214 (defun unescape-char (lexer)
215 (declare #.
*standard-optimize-settings
*)
216 "Convert the characters(s) following a backslash into a token
217 which is returned. This function is to be called when the backslash
218 has already been consumed. Special character classes like \\W are
220 (when (end-of-string-p lexer
)
221 (signal-ppcre-syntax-error "String ends with backslash"))
222 (let ((chr (next-char-non-extended lexer
)))
224 ;; the following five character names are 'semi-standard'
225 ;; according to the CLHS but I'm not aware of any implementation
226 ;; that doesn't implement them
234 ;; all other characters aren't affected by a backslash
237 (defun convert-substraction (r s
)
240 (character `((:range
,x
,x
)))
241 (list (assert (eq (car x
) :range
)) (list x
))
242 (symbol (copy-list (symbol-value x
))))))
243 (ranges- (mapcan #'rangify r
) (mapcan #'rangify s
))))
245 (defun collect-char-class (lexer)
246 (declare #.
*standard-optimize-settings
*)
247 "Reads and consumes characters from regex string until a right
248 bracket is seen. Assembles them into a list \(which is returned) of
249 characters, character ranges, like \(:RANGE #\\A #\\E) for a-e, and
250 tokens representing special character classes."
251 (let ((start-pos (lexer-pos lexer
)) ; remember start for error message
255 (flet ((handle-char (c)
256 "Do the right thing with character C depending on whether
257 we're inside a range or not."
258 (cond ((and hyphen-seen last-char
)
259 (setf (car list
) (list :range last-char c
)
264 (setq hyphen-seen nil
)))
265 (loop for first
= t then nil
266 for c
= (next-char-non-extended lexer
)
267 ;; leave loop if at end of string
271 ;; we've seen a backslash
272 (let ((next-char (next-char-non-extended lexer
)))
274 ((#\.
#\i
#\I
#\c
#\C
#\d
#\D
#\w
#\W
#\s
#\S
#\p
)
275 ;; a special character class
276 (push (map-char-to-special-char-class next-char lexer
)
278 ;; if the last character was a hyphen
279 ;; just collect it literally
282 ;; if the next character is a hyphen do the same
283 (when (looking-at-p lexer
#\-
)
284 (incf (lexer-pos lexer
))
285 (when (looking-at-p lexer
#\
[)
286 (incf (lexer-pos lexer
))
287 (return-from collect-char-class
289 (convert-substraction
291 (collect-char-class lexer
))
293 (eql (next-char-non-extended lexer
) #\
])
294 (signal-ppcre-syntax-error*
296 "Missing right bracket to close character class")))))
298 (setq hyphen-seen nil
))
300 ;; otherwise unescape the following character(s)
301 (decf (lexer-pos lexer
))
302 (handle-char (unescape-char lexer
))))))
304 ;; the first character must not be a right bracket
305 ;; and isn't treated specially if it's a hyphen
308 ;; end of character class
309 ;; make sure we collect a pending hyphen
311 (setq hyphen-seen nil
)
313 ;; reverse the list to preserve the order intended
314 ;; by the author of the regex string
315 (return-from collect-char-class
(nreverse list
)))
316 ((and hyphen-seen
(char= c
#\
[))
317 (return-from collect-char-class
319 (convert-substraction
321 (collect-char-class lexer
))
322 (unless (eql (next-char-non-extended lexer
) #\
])
323 (signal-ppcre-syntax-error*
325 "Missing right bracket to close character class")))))
329 ;; if the last character was 'just a character'
330 ;; we expect to be in the middle of a range
331 (setq hyphen-seen t
))
333 ;; otherwise this is just an ordinary hyphen
336 ;; default case - just collect the character
338 ;; we can only exit the loop normally if we've reached the end
339 ;; of the regex string without seeing a right bracket
340 (signal-ppcre-syntax-error*
342 "Missing right bracket to close character class"))))
344 (defun get-quantifier (lexer)
345 (declare #.
*standard-optimize-settings
*)
346 "Returns a list of two values (min max) if what the lexer is looking
347 at can be interpreted as a quantifier. Otherwise returns NIL and
348 resets the lexer to its old position."
349 ;; remember starting position for FAIL and UNGET-TOKEN functions
350 (push (lexer-pos lexer
) (lexer-last-pos lexer
))
351 (let ((next-char (next-char lexer
)))
354 ;; * (Kleene star): match 0 or more times
357 ;; +: match 1 or more times
360 ;; ?: match 0 or 1 times
364 ;; {n}: match exactly n times
365 ;; {n,}: match at least n times
366 ;; {n,m}: match at least n but not more than m times
367 ;; note that anything not matching one of these patterns will
368 ;; be interpreted literally - even whitespace isn't allowed
369 (let ((num1 (get-number lexer
:no-whitespace-p t
)))
371 (let ((next-char (next-char-non-extended lexer
)))
374 (let* ((num2 (get-number lexer
:no-whitespace-p t
))
375 (next-char (next-char-non-extended lexer
)))
378 ;; this is the case {n,} (NUM2 is NIL) or {n,m}
383 ;; this is the case {n}
387 ;; no number following left curly brace, so we treat it
388 ;; like a normal character
390 ;; cannot be a quantifier
394 (defun get-token (lexer)
395 (declare #.
*standard-optimize-settings
*)
396 "Returns and consumes the next token from the regex string (or NIL)."
397 ;; remember starting position for UNGET-TOKEN function
398 (push (lexer-pos lexer
)
399 (lexer-last-pos lexer
))
400 (let ((next-char (next-char lexer
)))
403 ;; the easy cases first - the following six characters
404 ;; always have a special meaning and get translated
405 ;; into tokens immediately
415 ;; quantifiers will always be consumend by
416 ;; GET-QUANTIFIER, they must not appear here
417 (signal-ppcre-syntax-error*
418 (1- (lexer-pos lexer
))
419 "Quantifier '~A' not allowed"
422 ;; left brace isn't a special character in it's own
423 ;; right but we must check if what follows might
424 ;; look like a quantifier
425 (let ((this-pos (lexer-pos lexer
))
426 (this-last-pos (lexer-last-pos lexer
)))
428 (when (get-quantifier lexer
)
429 (signal-ppcre-syntax-error*
431 "Quantifier '~A' not allowed"
432 (subseq (lexer-str lexer
)
435 (setf (lexer-pos lexer
) this-pos
436 (lexer-last-pos lexer
) this-last-pos
)
439 ;; left bracket always starts a character class
440 (cons (cond ((looking-at-p lexer
#\^
)
441 (incf (lexer-pos lexer
))
442 :inverted-char-class
)
445 (collect-char-class lexer
)))
447 ;; backslash might mean different things so we have
448 ;; to peek one char ahead:
449 (let ((next-char (next-char-non-extended lexer
)))
451 ((#\.
#\i
#\I
#\c
#\C
#\d
#\D
#\w
#\W
#\s
#\S
#\p
)
452 ;; these will be treated like character classes
453 (map-char-to-special-char-class next-char lexer
))
455 ;; in all other cases just unescape the
457 (decf (lexer-pos lexer
))
458 (unescape-char lexer
)))))
462 ;; all other characters are their own tokens
464 ;; we didn't get a character (this if the "else" branch from
465 ;; the first IF), so we don't return a token but NIL
467 (pop (lexer-last-pos lexer
))
470 (declaim (notinline unget-token
)) ;FIXME: else AVER in GET-TOKEN
471 (defun unget-token (lexer)
472 (declare #.
*standard-optimize-settings
*)
473 "Moves the lexer back to the last position stored in the LAST-POS stack."
474 (if (lexer-last-pos lexer
)
475 (setf (lexer-pos lexer
)
476 (pop (lexer-last-pos lexer
)))
477 (error "No token to unget \(this should not happen)")))
479 (declaim (inline start-of-subexpr-p
))
480 (defun start-of-subexpr-p (lexer)
481 (declare #.
*standard-optimize-settings
*)
482 "Tests whether the next token can start a valid sub-expression, i.e.
483 a stand-alone regex."
484 (let* ((pos (lexer-pos lexer
))
485 (next-char (next-char lexer
)))
486 (not (or (null next-char
)
488 (member (the character next-char
)
491 (setf (lexer-pos lexer
) pos
))))))
494 (declare #.
*standard-optimize-settings
*)
495 "Parses and consumes a <group>.
496 The productions are: <group> -> \"(\"<regex>\")\"
498 Will return <parse-tree> or (<grouping-type> <parse-tree>) where
499 <grouping-type> is one of six keywords - see source for details."
500 (let ((open-token (get-token lexer
)))
501 (cond ((eq open-token
:open-paren
)
502 (let* ((open-paren-pos (car (lexer-last-pos lexer
)))
503 (reg-expr (reg-expr lexer
))
504 (close-token (get-token lexer
)))
505 (unless (eq close-token
:close-paren
)
506 ;; the token following <regex> must be the closing
507 ;; parenthesis or this is a syntax error
508 (signal-ppcre-syntax-error*
510 "Opening paren has no matching closing paren"))
511 (list :register reg-expr
)))
513 ;; this is the <legal-token> production; <legal-token> is
514 ;; any token which passes START-OF-SUBEXPR-P (otherwise
515 ;; parsing had already stopped in the SEQ method)
518 (defun greedy-quant (lexer)
519 (declare #.
*standard-optimize-settings
*)
520 "Parses and consumes a <greedy-quant>.
521 The productions are: <greedy-quant> -> <group> | <group><quantifier>
522 where <quantifier> is parsed by the lexer function GET-QUANTIFIER.
523 Will return <parse-tree> or (:GREEDY-REPETITION <min> <max> <parse-tree>)."
524 (let* ((group (group lexer
))
525 (token (get-quantifier lexer
)))
527 ;; if GET-QUANTIFIER returned a non-NIL value it's the
528 ;; two-element list (<min> <max>)
529 (list :greedy-repetition
(first token
) (second token
) group
)
533 (declare #.
*standard-optimize-settings
*)
534 (greedy-quant lexer
))
537 (declare #.
*standard-optimize-settings
*)
538 "Parses and consumes a <seq>.
539 The productions are: <seq> -> <quant> | <quant><seq>.
540 Will return <parse-tree> or (:SEQUENCE <parse-tree> <parse-tree>)."
541 (flet ((make-array-from-two-chars (char1 char2
)
542 (let ((string (make-array 2
543 :element-type
'character
546 (setf (aref string
0) char1
)
547 (setf (aref string
1) char2
)
549 ;; Note that we're calling START-OF-SUBEXPR-P before we actually try
550 ;; to parse a <seq> or <quant> in order to catch empty regular
552 (if (start-of-subexpr-p lexer
)
553 (let ((quant (quant lexer
)))
554 (if (start-of-subexpr-p lexer
)
555 (let* ((seq (seq lexer
))
556 (quant-is-char-p (characterp quant
))
557 (seq-is-sequence-p (and (consp seq
)
558 (eq (first seq
) :sequence
))))
559 (cond ((and quant-is-char-p
561 (make-array-from-two-chars seq quant
))
562 ((and quant-is-char-p
564 (vector-push-extend quant seq
)
566 ((and quant-is-char-p
568 (characterp (second seq
)))
572 (make-array-from-two-chars (second seq
)
576 (t (make-array-from-two-chars (second seq
) quant
))))
577 ((and quant-is-char-p
579 (stringp (second seq
)))
584 (vector-push-extend quant
(second seq
))
589 (vector-push-extend quant
(second seq
))
592 ;; if <seq> is also a :SEQUENCE parse tree we merge
593 ;; both lists into one to avoid unnecessary consing
595 (cons quant
(cdr seq
)))
597 (t (list :sequence quant seq
))))
601 (defun reg-expr (lexer)
602 (declare #.
*standard-optimize-settings
*)
603 "Parses and consumes a <regex>, a complete regular expression.
604 The productions are: <regex> -> <seq> | <seq>\"|\"<regex>.
605 Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)."
606 (let ((pos (lexer-pos lexer
)))
607 (case (next-char lexer
)
609 ;; if we didn't get any token we return :VOID which stands for
610 ;; "empty regular expression"
613 ;; now check whether the expression started with a vertical
614 ;; bar, i.e. <seq> - the left alternation - is empty
615 (list :alternation
:void
(reg-expr lexer
)))
617 ;; otherwise un-read the character we just saw and parse a
618 ;; <seq> plus the character following it
619 (setf (lexer-pos lexer
) pos
)
620 (let* ((seq (seq lexer
))
621 (pos (lexer-pos lexer
)))
622 (case (next-char lexer
)
624 ;; no further character, just a <seq>
627 ;; if the character was a vertical bar, this is an
628 ;; alternation and we have the second production
629 (let ((reg-expr (reg-expr lexer
)))
630 (cond ((and (consp reg-expr
)
631 (eq (first reg-expr
) :alternation
))
632 ;; again we try to merge as above in SEQ
634 (cons seq
(cdr reg-expr
)))
636 (t (list :alternation seq reg-expr
)))))
638 ;; a character which is not a vertical bar - this is
639 ;; either a syntax error or we're inside of a group and
640 ;; the next character is a closing parenthesis; so we
641 ;; just un-read the character and let another function
643 (setf (lexer-pos lexer
) pos
)
646 (defun reverse-strings (parse-tree)
647 (declare #.
*standard-optimize-settings
*)
648 (cond ((stringp parse-tree
)
649 (nreverse parse-tree
))
651 (loop for parse-tree-rest on parse-tree
652 while parse-tree-rest
653 do
(setf (car parse-tree-rest
)
654 (reverse-strings (car parse-tree-rest
))))
658 (defun parse-pattern (string)
659 (declare #.
*standard-optimize-settings
*)
660 "Translate the regex string STRING into a parse tree."
661 (let* ((*in-pattern-parser-p
* t
)
662 (lexer (make-lexer string
))
663 (parse-tree (reverse-strings (reg-expr lexer
))))
664 ;; check whether we've consumed the whole regex string
665 (if (end-of-string-p lexer
)
666 `(:sequence
:start-anchor
,parse-tree
:end-anchor
)
667 (signal-ppcre-syntax-error*
669 "Expected end of string"))))
671 (defmethod pattern-scanner ((str string
))
672 (cl-ppcre:create-scanner
(parse-pattern str
)))
674 (defmethod pattern-scanner ((scanner function
))