nppcre.lisp

   1 ;;; -*- show-trailing-whitespace: t; indent-tabs: nil -*-
   2
   3 ;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
   4
   5 ;;; hacked for XSD regular expressions by David Lichteblau in 2007:
   6
   7 ;;;  - no comments and extended stuff
   8 ;;;  - no (?
   9 ;;;  - no greedyness modifier
  10 ;;;  - fewer and different backslash-escapes: \i \I \c \C \.
  11 ;;;  - character set substraction: [foo-[bar]]
  12 ;;;  - no ^ and $, but always wrap those around the complete parse tree
  13 ;;;  - ...
  14
  15 ;;; Derived from:
  16 ;;; /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.24 2005/04/01 21:29:09 edi Exp
  17 ;;; and
  18 ;;; /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.21 2005/08/03 21:11:27 edi Exp
  19
  20 ;;; Redistribution and use in source and binary forms, with or without
  21 ;;; modification, are permitted provided that the following conditions
  22 ;;; are met:
  23
  24 ;;;   * Redistributions of source code must retain the above copyright
  25 ;;;     notice, this list of conditions and the following disclaimer.
  26
  27 ;;;   * Redistributions in binary form must reproduce the above
  28 ;;;     copyright notice, this list of conditions and the following
  29 ;;;     disclaimer in the documentation and/or other materials
  30 ;;;     provided with the distribution.
  31
  32 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
  33 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  34 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  35 ;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
  36 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  37 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
  38 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  39 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  40 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  41 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  42 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  43
  44 (in-package :cxml-types)
  45
  46 (eval-when (:compile-toplevel :load-toplevel :execute)
  47   (defparameter *standard-optimize-settings* '(optimize)))
  48
  49 (defvar *in-pattern-parser-p* nil)
  50
  51 (defvar *convert-char-class-to-hash* #'cl-ppcre::convert-char-class-to-hash)
  52
  53 ;;; zzz Evil hack!
  54
  55 (format t "Patching CL-PPCRE::CONVERT-CHAR-CLASS-TO-HASH~%")
  56 (setf (fdefinition 'cl-ppcre::convert-char-class-to-hash)
  57       (lambda (list)
  58         (when *in-pattern-parser-p*
  59           (setf list (mapcan (lambda (x)
  60                                (if (symbolp x)
  61                                    (symbol-value x)
  62                                    x))
  63                              list)))
  64         (funcall *convert-char-class-to-hash* list)))
  65
  66 (defun signal-ppcre-syntax-error (fmt &rest args)
  67   (error "invalid pattern: ~?" fmt args))
  68
  69 (defun signal-ppcre-syntax-error* (pos fmt &rest args)
  70   (error "invalid pattern at ~D: ~?" pos fmt args))
  71
  72 (defmacro maybe-coerce-to-simple-string (string)
  73   (let ((=string= (gensym)))
  74     `(let ((,=string= ,string))
  75       (cond ((simple-string-p ,=string=)
  76               ,=string=)
  77             (t
  78               (coerce ,=string= 'simple-string))))))
  79
  80 (defun map-char-to-special-char-class (chr lexer)
  81   (declare #.*standard-optimize-settings*)
  82   "Maps escaped characters like \"\\d\" to the tokens which represent
  83 their associated character classes."
  84   (case chr
  85     (#\. '\.)
  86     (#\s '\\s) (#\i '\\i) (#\c '\\c) (#\d '\\d) (#\w '\\w)
  87     (#\S '^s) (#\I '^i) (#\C '^c) (#\D '^d) (#\W '^w)
  88     (#\p
  89      (unless (eql (next-char lexer) #\{)
  90        (signal-ppcre-syntax-error "Missing open brace after \\p"))
  91      (let* ((bag (loop
  92                     for c = (next-char lexer)
  93                     for last = (eql c #\})
  94                     and done = nil then last
  95                     until done
  96                     unless c do
  97                       (signal-ppcre-syntax-error
  98                        "Missing close brace after \\p")
  99                     collect c))
 100             (bag (coerce (list* #\p #\{ bag) 'string)))
 101        (or (find-symbol bag 'cxml-types)
 102            (signal-ppcre-syntax-error "Invalid character property: ~A"
 103                                       bag))))))
 104
 105 (locally
 106   (declare #.*standard-optimize-settings*)
 107   (defstruct (lexer (:constructor make-lexer-internal))
 108     "LEXER structures are used to hold the regex string which is
 109 currently lexed and to keep track of the lexer's state."
 110     (str ""
 111          :type string
 112          :read-only t)
 113     (len 0
 114          :type fixnum
 115          :read-only t)
 116     (pos 0
 117          :type fixnum)
 118     (last-pos nil
 119               :type list)))
 120
 121 (defun make-lexer (string)
 122   (declare (inline make-lexer-internal)
 123            #-genera (type string string))
 124   (make-lexer-internal :str (maybe-coerce-to-simple-string string)
 125                        :len (length string)))
 126
 127 (declaim (inline end-of-string-p))
 128 (defun end-of-string-p (lexer)
 129   (declare #.*standard-optimize-settings*)
 130   "Tests whether we're at the end of the regex string."
 131   (<= (lexer-len lexer)
 132       (lexer-pos lexer)))
 133
 134 (declaim (inline looking-at-p))
 135 (defun looking-at-p (lexer chr)
 136   (declare #.*standard-optimize-settings*)
 137   "Tests whether the next character the lexer would see is CHR.
 138 Does not respect extended mode."
 139   (and (not (end-of-string-p lexer))
 140        (char= (schar (lexer-str lexer) (lexer-pos lexer))
 141               chr)))
 142
 143 (declaim (inline next-char-non-extended))
 144 (defun next-char-non-extended (lexer)
 145   (declare #.*standard-optimize-settings*)
 146   "Returns the next character which is to be examined and updates the
 147 POS slot. Does not respect extended mode."
 148   (cond ((end-of-string-p lexer)
 149           nil)
 150         (t
 151           (prog1
 152             (schar (lexer-str lexer) (lexer-pos lexer))
 153             (incf (lexer-pos lexer))))))
 154
 155 (defun next-char (lexer)
 156   (declare #.*standard-optimize-settings*)
 157   (next-char-non-extended lexer))
 158
 159 (declaim (inline fail))
 160 (defun fail (lexer)
 161   (declare #.*standard-optimize-settings*)
 162   "Moves (LEXER-POS LEXER) back to the last position stored in
 163 \(LEXER-LAST-POS LEXER) and pops the LAST-POS stack."
 164   (unless (lexer-last-pos lexer)
 165     (signal-ppcre-syntax-error "LAST-POS stack of LEXER ~A is empty" lexer))
 166   (setf (lexer-pos lexer) (pop (lexer-last-pos lexer)))
 167   nil)
 168
 169 (defun get-number (lexer &key (radix 10) max-length no-whitespace-p)
 170   (declare #.*standard-optimize-settings*)
 171   "Read and consume the number the lexer is currently looking at and
 172 return it. Returns NIL if no number could be identified.
 173 RADIX is used as in PARSE-INTEGER. If MAX-LENGTH is not NIL we'll read
 174 at most the next MAX-LENGTH characters. If NO-WHITESPACE-P is not NIL
 175 we don't tolerate whitespace in front of the number."
 176   (when (or (end-of-string-p lexer)
 177             (and no-whitespace-p
 178                  (not (find (schar (lexer-str lexer) (lexer-pos lexer))
 179                             "0123456789"))))
 180     (return-from get-number nil))
 181   (multiple-value-bind (integer new-pos)
 182       (parse-integer (lexer-str lexer)
 183                      :start (lexer-pos lexer)
 184                      :end (if max-length
 185                             (let ((end-pos (+ (lexer-pos lexer)
 186                                               (the fixnum max-length)))
 187                                   (lexer-len (lexer-len lexer)))
 188                               (if (< end-pos lexer-len)
 189                                 end-pos
 190                                 lexer-len))
 191                             (lexer-len lexer))
 192                      :radix radix
 193                      :junk-allowed t)
 194     (cond ((and integer (>= (the fixnum integer) 0))
 195             (setf (lexer-pos lexer) new-pos)
 196             integer)
 197           (t nil))))
 198
 199 (declaim (inline make-char-from-code))
 200 (defun make-char-from-code (number error-pos)
 201   (declare #.*standard-optimize-settings*)
 202   "Create character from char-code NUMBER. NUMBER can be NIL
 203 which is interpreted as 0. ERROR-POS is the position where
 204 the corresponding number started within the regex string."
 205   ;; only look at rightmost eight bits in compliance with Perl
 206   (let ((code (logand #o377 (the fixnum (or number 0)))))
 207     (or (and (< code char-code-limit)
 208              (code-char code))
 209         (signal-ppcre-syntax-error*
 210          error-pos
 211          "No character for hex-code ~X"
 212          number))))
 213
 214 (defun unescape-char (lexer)
 215   (declare #.*standard-optimize-settings*)
 216   "Convert the characters(s) following a backslash into a token
 217 which is returned. This function is to be called when the backslash
 218 has already been consumed. Special character classes like \\W are
 219 handled elsewhere."
 220   (when (end-of-string-p lexer)
 221     (signal-ppcre-syntax-error "String ends with backslash"))
 222   (let ((chr (next-char-non-extended lexer)))
 223     (case chr
 224       ;; the following five character names are 'semi-standard'
 225       ;; according to the CLHS but I'm not aware of any implementation
 226       ;; that doesn't implement them
 227       ((#\t)
 228         #\Tab)
 229       ((#\n)
 230         #\Newline)
 231       ((#\r)
 232         #\Return)
 233       (otherwise
 234         ;; all other characters aren't affected by a backslash
 235         chr))))
 236
 237 (defun convert-substraction (r s)
 238   (flet ((rangify (x)
 239            (etypecase x
 240              (character `((:range ,x ,x)))
 241              (list (assert (eq (car x) :range)) (list x))
 242              (symbol (copy-list (symbol-value x))))))
 243     (ranges- (mapcan #'rangify r) (mapcan #'rangify s))))
 244
 245 (defun collect-char-class (lexer)
 246   (declare #.*standard-optimize-settings*)
 247   "Reads and consumes characters from regex string until a right
 248 bracket is seen. Assembles them into a list \(which is returned) of
 249 characters, character ranges, like \(:RANGE #\\A #\\E) for a-e, and
 250 tokens representing special character classes."
 251   (let ((start-pos (lexer-pos lexer))         ; remember start for error message
 252         hyphen-seen
 253         last-char
 254         list)
 255     (flet ((handle-char (c)
 256              "Do the right thing with character C depending on whether
 257 we're inside a range or not."
 258              (cond ((and hyphen-seen last-char)
 259                      (setf (car list) (list :range last-char c)
 260                            last-char nil))
 261                    (t
 262                      (push c list)
 263                      (setq last-char c)))
 264              (setq hyphen-seen nil)))
 265       (loop for first = t then nil
 266             for c = (next-char-non-extended lexer)
 267             ;; leave loop if at end of string
 268             while c
 269             do (cond
 270                  ((char= c #\\)
 271                    ;; we've seen a backslash
 272                    (let ((next-char (next-char-non-extended lexer)))
 273                      (case next-char
 274                        ((#\. #\i #\I #\c #\C #\d #\D #\w #\W #\s #\S #\p)
 275                          ;; a special character class
 276                          (push (map-char-to-special-char-class next-char lexer)
 277                                list)
 278                          ;; if the last character was a hyphen
 279                          ;; just collect it literally
 280                          (when hyphen-seen
 281                            (push #\- list))
 282                          ;; if the next character is a hyphen do the same
 283                          (when (looking-at-p lexer #\-)
 284                            (incf (lexer-pos lexer))
 285                            (when (looking-at-p lexer #\[)
 286                              (incf (lexer-pos lexer))
 287                              (return-from collect-char-class
 288                                (prog1
 289                                    (convert-substraction
 290                                     (nreverse list)
 291                                     (collect-char-class lexer))
 292                                  (unless
 293                                      (eql (next-char-non-extended lexer) #\])
 294                                    (signal-ppcre-syntax-error*
 295                                     start-pos
 296                                     "Missing right bracket to close character class")))))
 297                            (push #\- list))
 298                          (setq hyphen-seen nil))
 299                        (otherwise
 300                          ;; otherwise unescape the following character(s)
 301                          (decf (lexer-pos lexer))
 302                          (handle-char (unescape-char lexer))))))
 303                  (first
 304                    ;; the first character must not be a right bracket
 305                    ;; and isn't treated specially if it's a hyphen
 306                    (handle-char c))
 307                  ((char= c #\])
 308                    ;; end of character class
 309                    ;; make sure we collect a pending hyphen
 310                    (when hyphen-seen
 311                      (setq hyphen-seen nil)
 312                      (handle-char #\-))
 313                    ;; reverse the list to preserve the order intended
 314                    ;; by the author of the regex string
 315                    (return-from collect-char-class (nreverse list)))
 316                  ((and hyphen-seen (char= c #\[))
 317                   (return-from collect-char-class
 318                     (prog1
 319                         (convert-substraction
 320                          (nreverse list)
 321                          (collect-char-class lexer))
 322                       (unless (eql (next-char-non-extended lexer) #\])
 323                         (signal-ppcre-syntax-error*
 324                          start-pos
 325                          "Missing right bracket to close character class")))))
 326                  ((and (char= c #\-)
 327                        last-char
 328                        (not hyphen-seen))
 329                    ;; if the last character was 'just a character'
 330                    ;; we expect to be in the middle of a range
 331                    (setq hyphen-seen t))
 332                  ((char= c #\-)
 333                    ;; otherwise this is just an ordinary hyphen
 334                    (handle-char #\-))
 335                  (t
 336                    ;; default case - just collect the character
 337                    (handle-char c))))
 338       ;; we can only exit the loop normally if we've reached the end
 339       ;; of the regex string without seeing a right bracket
 340       (signal-ppcre-syntax-error*
 341        start-pos
 342        "Missing right bracket to close character class"))))
 343
 344 (defun get-quantifier (lexer)
 345   (declare #.*standard-optimize-settings*)
 346   "Returns a list of two values (min max) if what the lexer is looking
 347 at can be interpreted as a quantifier. Otherwise returns NIL and
 348 resets the lexer to its old position."
 349   ;; remember starting position for FAIL and UNGET-TOKEN functions
 350   (push (lexer-pos lexer) (lexer-last-pos lexer))
 351   (let ((next-char (next-char lexer)))
 352     (case next-char
 353       ((#\*)
 354         ;; * (Kleene star): match 0 or more times
 355         '(0 nil))
 356       ((#\+)
 357         ;; +: match 1 or more times
 358         '(1 nil))
 359       ((#\?)
 360         ;; ?: match 0 or 1 times
 361         '(0 1))
 362       ((#\{)
 363         ;; one of
 364         ;;   {n}:   match exactly n times
 365         ;;   {n,}:  match at least n times
 366         ;;   {n,m}: match at least n but not more than m times
 367         ;; note that anything not matching one of these patterns will
 368         ;; be interpreted literally - even whitespace isn't allowed
 369         (let ((num1 (get-number lexer :no-whitespace-p t)))
 370           (if num1
 371             (let ((next-char (next-char-non-extended lexer)))
 372               (case next-char
 373                 ((#\,)
 374                   (let* ((num2 (get-number lexer :no-whitespace-p t))
 375                          (next-char (next-char-non-extended lexer)))
 376                     (case next-char
 377                       ((#\})
 378                         ;; this is the case {n,} (NUM2 is NIL) or {n,m}
 379                         (list num1 num2))
 380                       (otherwise
 381                         (fail lexer)))))
 382                 ((#\})
 383                   ;; this is the case {n}
 384                   (list num1 num1))
 385                 (otherwise
 386                   (fail lexer))))
 387             ;; no number following left curly brace, so we treat it
 388             ;; like a normal character
 389             (fail lexer))))
 390       ;; cannot be a quantifier
 391       (otherwise
 392         (fail lexer)))))
 393
 394 (defun get-token (lexer)
 395   (declare #.*standard-optimize-settings*)
 396   "Returns and consumes the next token from the regex string (or NIL)."
 397   ;; remember starting position for UNGET-TOKEN function
 398   (push (lexer-pos lexer)
 399         (lexer-last-pos lexer))
 400   (let ((next-char (next-char lexer)))
 401     (cond (next-char
 402             (case next-char
 403               ;; the easy cases first - the following six characters
 404               ;; always have a special meaning and get translated
 405               ;; into tokens immediately
 406               ((#\))
 407                 :close-paren)
 408               ((#\|)
 409                 :vertical-bar)
 410               ((#\?)
 411                 :question-mark)
 412               ((#\.)
 413                 :everything)
 414               ((#\+ #\*)
 415                 ;; quantifiers will always be consumend by
 416                 ;; GET-QUANTIFIER, they must not appear here
 417                 (signal-ppcre-syntax-error*
 418                  (1- (lexer-pos lexer))
 419                  "Quantifier '~A' not allowed"
 420                  next-char))
 421               ((#\{)
 422                 ;; left brace isn't a special character in it's own
 423                 ;; right but we must check if what follows might
 424                 ;; look like a quantifier
 425                 (let ((this-pos (lexer-pos lexer))
 426                       (this-last-pos (lexer-last-pos lexer)))
 427                   (unget-token lexer)
 428                   (when (get-quantifier lexer)
 429                     (signal-ppcre-syntax-error*
 430                      (car this-last-pos)
 431                      "Quantifier '~A' not allowed"
 432                      (subseq (lexer-str lexer)
 433                              (car this-last-pos)
 434                              (lexer-pos lexer))))
 435                   (setf (lexer-pos lexer) this-pos
 436                         (lexer-last-pos lexer) this-last-pos)
 437                   next-char))
 438               ((#\[)
 439                 ;; left bracket always starts a character class
 440                 (cons  (cond ((looking-at-p lexer #\^)
 441                                (incf (lexer-pos lexer))
 442                                :inverted-char-class)
 443                              (t
 444                                :char-class))
 445                        (collect-char-class lexer)))
 446               ((#\\)
 447                 ;; backslash might mean different things so we have
 448                 ;; to peek one char ahead:
 449                 (let ((next-char (next-char-non-extended lexer)))
 450                   (case next-char
 451                     ((#\. #\i #\I #\c #\C #\d #\D #\w #\W #\s #\S #\p)
 452                       ;; these will be treated like character classes
 453                       (map-char-to-special-char-class next-char lexer))
 454                     (otherwise
 455                       ;; in all other cases just unescape the
 456                       ;; character
 457                       (decf (lexer-pos lexer))
 458                       (unescape-char lexer)))))
 459               ((#\()
 460                :open-paren)
 461               (otherwise
 462                 ;; all other characters are their own tokens
 463                 next-char)))
 464           ;; we didn't get a character (this if the "else" branch from
 465           ;; the first IF), so we don't return a token but NIL
 466           (t
 467             (pop (lexer-last-pos lexer))
 468             nil))))
 469
 470 (declaim (notinline unget-token))       ;FIXME: else AVER in GET-TOKEN
 471 (defun unget-token (lexer)
 472   (declare #.*standard-optimize-settings*)
 473   "Moves the lexer back to the last position stored in the LAST-POS stack."
 474   (if (lexer-last-pos lexer)
 475     (setf (lexer-pos lexer)
 476             (pop (lexer-last-pos lexer)))
 477     (error "No token to unget \(this should not happen)")))
 478
 479 (declaim (inline start-of-subexpr-p))
 480 (defun start-of-subexpr-p (lexer)
 481   (declare #.*standard-optimize-settings*)
 482   "Tests whether the next token can start a valid sub-expression, i.e.
 483 a stand-alone regex."
 484   (let* ((pos (lexer-pos lexer))
 485          (next-char (next-char lexer)))
 486     (not (or (null next-char)
 487              (prog1
 488                (member (the character next-char)
 489                        '(#\) #\|)
 490                        :test #'char=)
 491                (setf (lexer-pos lexer) pos))))))
 492
 493 (defun group (lexer)
 494   (declare #.*standard-optimize-settings*)
 495   "Parses and consumes a <group>.
 496 The productions are: <group> -> \"(\"<regex>\")\"
 497                                 <legal-token>
 498 Will return <parse-tree> or (<grouping-type> <parse-tree>) where
 499 <grouping-type> is one of six keywords - see source for details."
 500   (let ((open-token (get-token lexer)))
 501     (cond ((eq open-token :open-paren)
 502            (let* ((open-paren-pos (car (lexer-last-pos lexer)))
 503                   (reg-expr (reg-expr lexer))
 504                   (close-token (get-token lexer)))
 505              (unless (eq close-token :close-paren)
 506                ;; the token following <regex> must be the closing
 507                ;; parenthesis or this is a syntax error
 508                (signal-ppcre-syntax-error*
 509                 open-paren-pos
 510                 "Opening paren has no matching closing paren"))
 511              (list :register reg-expr)))
 512           (t
 513             ;; this is the <legal-token> production; <legal-token> is
 514             ;; any token which passes START-OF-SUBEXPR-P (otherwise
 515             ;; parsing had already stopped in the SEQ method)
 516             open-token))))
 517
 518 (defun greedy-quant (lexer)
 519   (declare #.*standard-optimize-settings*)
 520   "Parses and consumes a <greedy-quant>.
 521 The productions are: <greedy-quant> -> <group> | <group><quantifier>
 522 where <quantifier> is parsed by the lexer function GET-QUANTIFIER.
 523 Will return <parse-tree> or (:GREEDY-REPETITION <min> <max> <parse-tree>)."
 524   (let* ((group (group lexer))
 525          (token (get-quantifier lexer)))
 526     (if token
 527       ;; if GET-QUANTIFIER returned a non-NIL value it's the
 528       ;; two-element list (<min> <max>)
 529       (list :greedy-repetition (first token) (second token) group)
 530       group)))
 531
 532 (defun quant (lexer)
 533   (declare #.*standard-optimize-settings*)
 534   (greedy-quant lexer))
 535
 536 (defun seq (lexer)
 537   (declare #.*standard-optimize-settings*)
 538   "Parses and consumes a <seq>.
 539 The productions are: <seq> -> <quant> | <quant><seq>.
 540 Will return <parse-tree> or (:SEQUENCE <parse-tree> <parse-tree>)."
 541   (flet ((make-array-from-two-chars (char1 char2)
 542            (let ((string (make-array 2
 543                                      :element-type 'character
 544                                      :fill-pointer t
 545                                      :adjustable t)))
 546              (setf (aref string 0) char1)
 547              (setf (aref string 1) char2)
 548              string)))
 549     ;; Note that we're calling START-OF-SUBEXPR-P before we actually try
 550     ;; to parse a <seq> or <quant> in order to catch empty regular
 551     ;; expressions
 552     (if (start-of-subexpr-p lexer)
 553       (let ((quant (quant lexer)))
 554         (if (start-of-subexpr-p lexer)
 555           (let* ((seq (seq lexer))
 556                  (quant-is-char-p (characterp quant))
 557                  (seq-is-sequence-p (and (consp seq)
 558                                          (eq (first seq) :sequence))))
 559             (cond ((and quant-is-char-p
 560                         (characterp seq))
 561                     (make-array-from-two-chars seq quant))
 562                   ((and quant-is-char-p
 563                         (stringp seq))
 564                     (vector-push-extend quant seq)
 565                     seq)
 566                   ((and quant-is-char-p
 567                         seq-is-sequence-p
 568                         (characterp (second seq)))
 569                     (cond ((cddr seq)
 570                             (setf (cdr seq)
 571                                     (cons
 572                                      (make-array-from-two-chars (second seq)
 573                                                                 quant)
 574                                      (cddr seq)))
 575                             seq)
 576                           (t (make-array-from-two-chars (second seq) quant))))
 577                   ((and quant-is-char-p
 578                         seq-is-sequence-p
 579                         (stringp (second seq)))
 580                     (cond ((cddr seq)
 581                             (setf (cdr seq)
 582                                     (cons
 583                                      (progn
 584                                        (vector-push-extend quant (second seq))
 585                                        (second seq))
 586                                      (cddr seq)))
 587                             seq)
 588                           (t
 589                             (vector-push-extend quant (second seq))
 590                             (second seq))))
 591                   (seq-is-sequence-p
 592                     ;; if <seq> is also a :SEQUENCE parse tree we merge
 593                     ;; both lists into one to avoid unnecessary consing
 594                     (setf (cdr seq)
 595                             (cons quant (cdr seq)))
 596                     seq)
 597                   (t (list :sequence quant seq))))
 598           quant))
 599       :void)))
 600
 601 (defun reg-expr (lexer)
 602   (declare #.*standard-optimize-settings*)
 603   "Parses and consumes a <regex>, a complete regular expression.
 604 The productions are: <regex> -> <seq> | <seq>\"|\"<regex>.
 605 Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)."
 606   (let ((pos (lexer-pos lexer)))
 607     (case (next-char lexer)
 608       ((nil)
 609         ;; if we didn't get any token we return :VOID which stands for
 610         ;; "empty regular expression"
 611         :void)
 612       ((#\|)
 613         ;; now check whether the expression started with a vertical
 614         ;; bar, i.e. <seq> - the left alternation - is empty
 615         (list :alternation :void (reg-expr lexer)))
 616       (otherwise
 617         ;; otherwise un-read the character we just saw and parse a
 618         ;; <seq> plus the character following it
 619         (setf (lexer-pos lexer) pos)
 620         (let* ((seq (seq lexer))
 621                (pos (lexer-pos lexer)))
 622           (case (next-char lexer)
 623             ((nil)
 624               ;; no further character, just a <seq>
 625               seq)
 626             ((#\|)
 627               ;; if the character was a vertical bar, this is an
 628               ;; alternation and we have the second production
 629               (let ((reg-expr (reg-expr lexer)))
 630                 (cond ((and (consp reg-expr)
 631                             (eq (first reg-expr) :alternation))
 632                         ;; again we try to merge as above in SEQ
 633                         (setf (cdr reg-expr)
 634                                 (cons seq (cdr reg-expr)))
 635                         reg-expr)
 636                       (t (list :alternation seq reg-expr)))))
 637             (otherwise
 638               ;; a character which is not a vertical bar - this is
 639               ;; either a syntax error or we're inside of a group and
 640               ;; the next character is a closing parenthesis; so we
 641               ;; just un-read the character and let another function
 642               ;; take care of it
 643               (setf (lexer-pos lexer) pos)
 644               seq)))))))
 645
 646 (defun reverse-strings (parse-tree)
 647   (declare #.*standard-optimize-settings*)
 648   (cond ((stringp parse-tree)
 649           (nreverse parse-tree))
 650         ((consp parse-tree)
 651           (loop for parse-tree-rest on parse-tree
 652                 while parse-tree-rest
 653                 do (setf (car parse-tree-rest)
 654                            (reverse-strings (car parse-tree-rest))))
 655           parse-tree)
 656         (t parse-tree)))
 657
 658 (defun parse-pattern (string)
 659   (declare #.*standard-optimize-settings*)
 660   "Translate the regex string STRING into a parse tree."
 661   (let* ((*in-pattern-parser-p* t)
 662          (lexer (make-lexer string))
 663          (parse-tree (reverse-strings (reg-expr lexer))))
 664     ;; check whether we've consumed the whole regex string
 665     (if (end-of-string-p lexer)
 666         `(:sequence :start-anchor ,parse-tree :end-anchor)
 667         (signal-ppcre-syntax-error*
 668          (lexer-pos lexer)
 669          "Expected end of string"))))
 670
 671 (defmethod pattern-scanner ((str string))
 672   (cl-ppcre:create-scanner (parse-pattern str)))
 673
 674 (defmethod pattern-scanner ((scanner function))
 675   scanner)