nppcre.lisp

   1 ;;; -*- show-trailing-whitespace: t; indent-tabs: nil -*-
   2
   3 ;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
   4
   5 ;;; hacked for XSD regular expressions by David Lichteblau in 2007:
   6
   7 ;;;  - no comments and extended stuff
   8 ;;;  - no (?
   9 ;;;  - no greedyness modifier
  10 ;;;  - fewer and different backslash-escapes: \i \I \c \C \.
  11 ;;;  - character set substraction: [foo-[bar]]
  12 ;;;  - no ^ and $, but always wrap those around the complete parse tree
  13 ;;;  - ...
  14
  15 ;;; Derived from:
  16 ;;; /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.24 2005/04/01 21:29:09 edi Exp
  17 ;;; and
  18 ;;; /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.21 2005/08/03 21:11:27 edi Exp
  19
  20 ;;; Redistribution and use in source and binary forms, with or without
  21 ;;; modification, are permitted provided that the following conditions
  22 ;;; are met:
  23
  24 ;;;   * Redistributions of source code must retain the above copyright
  25 ;;;     notice, this list of conditions and the following disclaimer.
  26
  27 ;;;   * Redistributions in binary form must reproduce the above
  28 ;;;     copyright notice, this list of conditions and the following
  29 ;;;     disclaimer in the documentation and/or other materials
  30 ;;;     provided with the distribution.
  31
  32 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
  33 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  34 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  35 ;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
  36 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  37 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
  38 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  39 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  40 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  41 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  42 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  43
  44 (in-package :cxml-types)
  45
  46 (eval-when (:compile-toplevel :load-toplevel :execute)
  47   (defparameter *standard-optimize-settings* '(optimize)))
  48
  49 (defvar *in-pattern-parser-p* nil)
  50
  51 (defun signal-ppcre-syntax-error (fmt &rest args)
  52   (error "invalid pattern: ~?" fmt args))
  53
  54 (defun signal-ppcre-syntax-error* (pos fmt &rest args)
  55   (error "invalid pattern at ~D: ~?" pos fmt args))
  56
  57 (defmacro maybe-coerce-to-simple-string (string)
  58   (let ((=string= (gensym)))
  59     `(let ((,=string= ,string))
  60       (cond ((simple-string-p ,=string=)
  61               ,=string=)
  62             (t
  63               (coerce ,=string= 'simple-string))))))
  64
  65 (defun map-char-to-special-char-class (chr lexer)
  66   (declare #.*standard-optimize-settings*)
  67   "Maps escaped characters like \"\\d\" to the tokens which represent
  68 their associated character classes."
  69   (case chr
  70     (#\. '\.)
  71     (#\s '\\s) (#\i '\\i) (#\c '\\c) (#\d '\\d) (#\w '\\w)
  72     (#\S '^s) (#\I '^i) (#\C '^c) (#\D '^d) (#\W '^w)
  73     (#\p
  74      (unless (eql (next-char lexer) #\{)
  75        (signal-ppcre-syntax-error "Missing open brace after \\p"))
  76      (let* ((bag (loop
  77                     for c = (next-char lexer)
  78                     for last = (eql c #\})
  79                     and done = nil then last
  80                     until done
  81                     unless c do
  82                       (signal-ppcre-syntax-error
  83                        "Missing close brace after \\p")
  84                     collect c))
  85             (bag (coerce (list* #\p #\{ bag) 'string)))
  86        (or (find-symbol bag 'cxml-types)
  87            (signal-ppcre-syntax-error "Invalid character property: ~A"
  88                                       bag))))))
  89
  90 (locally
  91   (declare #.*standard-optimize-settings*)
  92   (defstruct (lexer (:constructor make-lexer-internal))
  93     "LEXER structures are used to hold the regex string which is
  94 currently lexed and to keep track of the lexer's state."
  95     (str ""
  96          :type string
  97          :read-only t)
  98     (len 0
  99          :type fixnum
 100          :read-only t)
 101     (pos 0
 102          :type fixnum)
 103     (last-pos nil
 104               :type list)))
 105
 106 (defun make-lexer (string)
 107   (declare (inline make-lexer-internal)
 108            #-genera (type string string))
 109   (make-lexer-internal :str (maybe-coerce-to-simple-string string)
 110                        :len (length string)))
 111
 112 (declaim (inline end-of-string-p))
 113 (defun end-of-string-p (lexer)
 114   (declare #.*standard-optimize-settings*)
 115   "Tests whether we're at the end of the regex string."
 116   (<= (lexer-len lexer)
 117       (lexer-pos lexer)))
 118
 119 (declaim (inline looking-at-p))
 120 (defun looking-at-p (lexer chr)
 121   (declare #.*standard-optimize-settings*)
 122   "Tests whether the next character the lexer would see is CHR.
 123 Does not respect extended mode."
 124   (and (not (end-of-string-p lexer))
 125        (char= (schar (lexer-str lexer) (lexer-pos lexer))
 126               chr)))
 127
 128 (declaim (inline next-char-non-extended))
 129 (defun next-char-non-extended (lexer)
 130   (declare #.*standard-optimize-settings*)
 131   "Returns the next character which is to be examined and updates the
 132 POS slot. Does not respect extended mode."
 133   (cond ((end-of-string-p lexer)
 134           nil)
 135         (t
 136           (prog1
 137             (schar (lexer-str lexer) (lexer-pos lexer))
 138             (incf (lexer-pos lexer))))))
 139
 140 (defun next-char (lexer)
 141   (declare #.*standard-optimize-settings*)
 142   (next-char-non-extended lexer))
 143
 144 (declaim (inline fail))
 145 (defun fail (lexer)
 146   (declare #.*standard-optimize-settings*)
 147   "Moves (LEXER-POS LEXER) back to the last position stored in
 148 \(LEXER-LAST-POS LEXER) and pops the LAST-POS stack."
 149   (unless (lexer-last-pos lexer)
 150     (signal-ppcre-syntax-error "LAST-POS stack of LEXER ~A is empty" lexer))
 151   (setf (lexer-pos lexer) (pop (lexer-last-pos lexer)))
 152   nil)
 153
 154 (defun get-number (lexer &key (radix 10) max-length no-whitespace-p)
 155   (declare #.*standard-optimize-settings*)
 156   "Read and consume the number the lexer is currently looking at and
 157 return it. Returns NIL if no number could be identified.
 158 RADIX is used as in PARSE-INTEGER. If MAX-LENGTH is not NIL we'll read
 159 at most the next MAX-LENGTH characters. If NO-WHITESPACE-P is not NIL
 160 we don't tolerate whitespace in front of the number."
 161   (when (or (end-of-string-p lexer)
 162             (and no-whitespace-p
 163                  (not (find (schar (lexer-str lexer) (lexer-pos lexer))
 164                             "0123456789"))))
 165     (return-from get-number nil))
 166   (multiple-value-bind (integer new-pos)
 167       (parse-integer (lexer-str lexer)
 168                      :start (lexer-pos lexer)
 169                      :end (if max-length
 170                             (let ((end-pos (+ (lexer-pos lexer)
 171                                               (the fixnum max-length)))
 172                                   (lexer-len (lexer-len lexer)))
 173                               (if (< end-pos lexer-len)
 174                                 end-pos
 175                                 lexer-len))
 176                             (lexer-len lexer))
 177                      :radix radix
 178                      :junk-allowed t)
 179     (cond ((and integer (>= (the fixnum integer) 0))
 180             (setf (lexer-pos lexer) new-pos)
 181             integer)
 182           (t nil))))
 183
 184 (declaim (inline make-char-from-code))
 185 (defun make-char-from-code (number error-pos)
 186   (declare #.*standard-optimize-settings*)
 187   "Create character from char-code NUMBER. NUMBER can be NIL
 188 which is interpreted as 0. ERROR-POS is the position where
 189 the corresponding number started within the regex string."
 190   ;; only look at rightmost eight bits in compliance with Perl
 191   (let ((code (logand #o377 (the fixnum (or number 0)))))
 192     (or (and (< code char-code-limit)
 193              (code-char code))
 194         (signal-ppcre-syntax-error*
 195          error-pos
 196          "No character for hex-code ~X"
 197          number))))
 198
 199 (defun unescape-char (lexer)
 200   (declare #.*standard-optimize-settings*)
 201   "Convert the characters(s) following a backslash into a token
 202 which is returned. This function is to be called when the backslash
 203 has already been consumed. Special character classes like \\W are
 204 handled elsewhere."
 205   (when (end-of-string-p lexer)
 206     (signal-ppcre-syntax-error "String ends with backslash"))
 207   (let ((chr (next-char-non-extended lexer)))
 208     (case chr
 209       ;; the following five character names are 'semi-standard'
 210       ;; according to the CLHS but I'm not aware of any implementation
 211       ;; that doesn't implement them
 212       ((#\t)
 213         #\Tab)
 214       ((#\n)
 215         #\Newline)
 216       ((#\r)
 217         #\Return)
 218       (otherwise
 219         ;; all other characters aren't affected by a backslash
 220         chr))))
 221
 222 (defun convert-substraction (r s)
 223   (flet ((rangify (x)
 224            (etypecase x
 225              (character `((:range ,x ,x)))
 226              (list
 227               (ecase (car x)
 228                 (:range (list x))
 229                 (:property (copy-list (symbol-value (second x))))))
 230              (symbol (copy-list (symbol-value x))))))
 231     (ranges- (mapcan #'rangify r) (mapcan #'rangify s))))
 232
 233 (defun collect-char-class (lexer)
 234   (declare #.*standard-optimize-settings*)
 235   "Reads and consumes characters from regex string until a right
 236 bracket is seen. Assembles them into a list \(which is returned) of
 237 characters, character ranges, like \(:RANGE #\\A #\\E) for a-e, and
 238 tokens representing special character classes."
 239   (let ((start-pos (lexer-pos lexer))         ; remember start for error message
 240         hyphen-seen
 241         last-char
 242         list)
 243     (flet ((handle-char (c)
 244              "Do the right thing with character C depending on whether
 245 we're inside a range or not."
 246              (cond ((and hyphen-seen last-char)
 247                      (setf (car list) (list :range last-char c)
 248                            last-char nil))
 249                    (t
 250                      (push c list)
 251                      (setq last-char c)))
 252              (setq hyphen-seen nil)))
 253       (loop for first = t then nil
 254             for c = (next-char-non-extended lexer)
 255             ;; leave loop if at end of string
 256             while c
 257             do (cond
 258                  ((char= c #\\)
 259                    ;; we've seen a backslash
 260                    (let ((next-char (next-char-non-extended lexer)))
 261                      (case next-char
 262                        ((#\. #\i #\I #\c #\C #\d #\D #\w #\W #\s #\S #\p)
 263                          ;; a special character class
 264                          (push (list :property
 265                                      (map-char-to-special-char-class
 266                                       next-char
 267                                       lexer))
 268                                list)
 269                          ;; if the last character was a hyphen
 270                          ;; just collect it literally
 271                          (when hyphen-seen
 272                            (push #\- list))
 273                          ;; if the next character is a hyphen do the same
 274                          (when (looking-at-p lexer #\-)
 275                            (incf (lexer-pos lexer))
 276                            (when (looking-at-p lexer #\[)
 277                              (incf (lexer-pos lexer))
 278                              (return-from collect-char-class
 279                                (prog1
 280                                    (convert-substraction
 281                                     (nreverse list)
 282                                     (collect-char-class lexer))
 283                                  (unless
 284                                      (eql (next-char-non-extended lexer) #\])
 285                                    (signal-ppcre-syntax-error*
 286                                     start-pos
 287                                     "Missing right bracket to close character class")))))
 288                            (push #\- list))
 289                          (setq hyphen-seen nil))
 290                        (otherwise
 291                          ;; otherwise unescape the following character(s)
 292                          (decf (lexer-pos lexer))
 293                          (handle-char (unescape-char lexer))))))
 294                  (first
 295                    ;; the first character must not be a right bracket
 296                    ;; and isn't treated specially if it's a hyphen
 297                    (handle-char c))
 298                  ((char= c #\])
 299                    ;; end of character class
 300                    ;; make sure we collect a pending hyphen
 301                    (when hyphen-seen
 302                      (setq hyphen-seen nil)
 303                      (handle-char #\-))
 304                    ;; reverse the list to preserve the order intended
 305                    ;; by the author of the regex string
 306                    (return-from collect-char-class (nreverse list)))
 307                  ((and hyphen-seen (char= c #\[))
 308                   (return-from collect-char-class
 309                     (prog1
 310                         (convert-substraction
 311                          (nreverse list)
 312                          (collect-char-class lexer))
 313                       (unless (eql (next-char-non-extended lexer) #\])
 314                         (signal-ppcre-syntax-error*
 315                          start-pos
 316                          "Missing right bracket to close character class")))))
 317                  ((and (char= c #\-)
 318                        last-char
 319                        (not hyphen-seen))
 320                    ;; if the last character was 'just a character'
 321                    ;; we expect to be in the middle of a range
 322                    (setq hyphen-seen t))
 323                  ((char= c #\-)
 324                    ;; otherwise this is just an ordinary hyphen
 325                    (handle-char #\-))
 326                  (t
 327                    ;; default case - just collect the character
 328                    (handle-char c))))
 329       ;; we can only exit the loop normally if we've reached the end
 330       ;; of the regex string without seeing a right bracket
 331       (signal-ppcre-syntax-error*
 332        start-pos
 333        "Missing right bracket to close character class"))))
 334
 335 (defun get-quantifier (lexer)
 336   (declare #.*standard-optimize-settings*)
 337   "Returns a list of two values (min max) if what the lexer is looking
 338 at can be interpreted as a quantifier. Otherwise returns NIL and
 339 resets the lexer to its old position."
 340   ;; remember starting position for FAIL and UNGET-TOKEN functions
 341   (push (lexer-pos lexer) (lexer-last-pos lexer))
 342   (let ((next-char (next-char lexer)))
 343     (case next-char
 344       ((#\*)
 345         ;; * (Kleene star): match 0 or more times
 346         '(0 nil))
 347       ((#\+)
 348         ;; +: match 1 or more times
 349         '(1 nil))
 350       ((#\?)
 351         ;; ?: match 0 or 1 times
 352         '(0 1))
 353       ((#\{)
 354         ;; one of
 355         ;;   {n}:   match exactly n times
 356         ;;   {n,}:  match at least n times
 357         ;;   {n,m}: match at least n but not more than m times
 358         ;; note that anything not matching one of these patterns will
 359         ;; be interpreted literally - even whitespace isn't allowed
 360         (let ((num1 (get-number lexer :no-whitespace-p t)))
 361           (if num1
 362             (let ((next-char (next-char-non-extended lexer)))
 363               (case next-char
 364                 ((#\,)
 365                   (let* ((num2 (get-number lexer :no-whitespace-p t))
 366                          (next-char (next-char-non-extended lexer)))
 367                     (case next-char
 368                       ((#\})
 369                         ;; this is the case {n,} (NUM2 is NIL) or {n,m}
 370                         (list num1 num2))
 371                       (otherwise
 372                         (fail lexer)))))
 373                 ((#\})
 374                   ;; this is the case {n}
 375                   (list num1 num1))
 376                 (otherwise
 377                   (fail lexer))))
 378             ;; no number following left curly brace, so we treat it
 379             ;; like a normal character
 380             (fail lexer))))
 381       ;; cannot be a quantifier
 382       (otherwise
 383         (fail lexer)))))
 384
 385 (defun get-token (lexer)
 386   (declare #.*standard-optimize-settings*)
 387   "Returns and consumes the next token from the regex string (or NIL)."
 388   ;; remember starting position for UNGET-TOKEN function
 389   (push (lexer-pos lexer)
 390         (lexer-last-pos lexer))
 391   (let ((next-char (next-char lexer)))
 392     (cond (next-char
 393             (case next-char
 394               ;; the easy cases first - the following six characters
 395               ;; always have a special meaning and get translated
 396               ;; into tokens immediately
 397               ((#\))
 398                 :close-paren)
 399               ((#\|)
 400                 :vertical-bar)
 401               ((#\?)
 402                 :question-mark)
 403               ((#\.)
 404                 :everything)
 405               ((#\+ #\*)
 406                 ;; quantifiers will always be consumend by
 407                 ;; GET-QUANTIFIER, they must not appear here
 408                 (signal-ppcre-syntax-error*
 409                  (1- (lexer-pos lexer))
 410                  "Quantifier '~A' not allowed"
 411                  next-char))
 412               ((#\{)
 413                 ;; left brace isn't a special character in it's own
 414                 ;; right but we must check if what follows might
 415                 ;; look like a quantifier
 416                 (let ((this-pos (lexer-pos lexer))
 417                       (this-last-pos (lexer-last-pos lexer)))
 418                   (unget-token lexer)
 419                   (when (get-quantifier lexer)
 420                     (signal-ppcre-syntax-error*
 421                      (car this-last-pos)
 422                      "Quantifier '~A' not allowed"
 423                      (subseq (lexer-str lexer)
 424                              (car this-last-pos)
 425                              (lexer-pos lexer))))
 426                   (setf (lexer-pos lexer) this-pos
 427                         (lexer-last-pos lexer) this-last-pos)
 428                   next-char))
 429               ((#\[)
 430                 ;; left bracket always starts a character class
 431                 (cons  (cond ((looking-at-p lexer #\^)
 432                                (incf (lexer-pos lexer))
 433                                :inverted-char-class)
 434                              (t
 435                                :char-class))
 436                        (collect-char-class lexer)))
 437               ((#\\)
 438                 ;; backslash might mean different things so we have
 439                 ;; to peek one char ahead:
 440                 (let ((next-char (next-char-non-extended lexer)))
 441                   (case next-char
 442                     ((#\. #\i #\I #\c #\C #\d #\D #\w #\W #\s #\S #\p)
 443                       ;; these will be treated like character classes
 444                      (list :property
 445                            (map-char-to-special-char-class next-char lexer)))
 446                     (otherwise
 447                       ;; in all other cases just unescape the
 448                       ;; character
 449                       (decf (lexer-pos lexer))
 450                       (unescape-char lexer)))))
 451               ((#\()
 452                :open-paren)
 453               (otherwise
 454                 ;; all other characters are their own tokens
 455                 next-char)))
 456           ;; we didn't get a character (this if the "else" branch from
 457           ;; the first IF), so we don't return a token but NIL
 458           (t
 459             (pop (lexer-last-pos lexer))
 460             nil))))
 461
 462 (declaim (notinline unget-token))       ;FIXME: else AVER in GET-TOKEN
 463 (defun unget-token (lexer)
 464   (declare #.*standard-optimize-settings*)
 465   "Moves the lexer back to the last position stored in the LAST-POS stack."
 466   (if (lexer-last-pos lexer)
 467     (setf (lexer-pos lexer)
 468             (pop (lexer-last-pos lexer)))
 469     (error "No token to unget \(this should not happen)")))
 470
 471 (declaim (inline start-of-subexpr-p))
 472 (defun start-of-subexpr-p (lexer)
 473   (declare #.*standard-optimize-settings*)
 474   "Tests whether the next token can start a valid sub-expression, i.e.
 475 a stand-alone regex."
 476   (let* ((pos (lexer-pos lexer))
 477          (next-char (next-char lexer)))
 478     (not (or (null next-char)
 479              (prog1
 480                (member (the character next-char)
 481                        '(#\) #\|)
 482                        :test #'char=)
 483                (setf (lexer-pos lexer) pos))))))
 484
 485 (defun group (lexer)
 486   (declare #.*standard-optimize-settings*)
 487   "Parses and consumes a <group>.
 488 The productions are: <group> -> \"(\"<regex>\")\"
 489                                 <legal-token>
 490 Will return <parse-tree> or (<grouping-type> <parse-tree>) where
 491 <grouping-type> is one of six keywords - see source for details."
 492   (let ((open-token (get-token lexer)))
 493     (cond ((eq open-token :open-paren)
 494            (let* ((open-paren-pos (car (lexer-last-pos lexer)))
 495                   (reg-expr (reg-expr lexer))
 496                   (close-token (get-token lexer)))
 497              (unless (eq close-token :close-paren)
 498                ;; the token following <regex> must be the closing
 499                ;; parenthesis or this is a syntax error
 500                (signal-ppcre-syntax-error*
 501                 open-paren-pos
 502                 "Opening paren has no matching closing paren"))
 503              (list :register reg-expr)))
 504           (t
 505             ;; this is the <legal-token> production; <legal-token> is
 506             ;; any token which passes START-OF-SUBEXPR-P (otherwise
 507             ;; parsing had already stopped in the SEQ method)
 508             open-token))))
 509
 510 (defun greedy-quant (lexer)
 511   (declare #.*standard-optimize-settings*)
 512   "Parses and consumes a <greedy-quant>.
 513 The productions are: <greedy-quant> -> <group> | <group><quantifier>
 514 where <quantifier> is parsed by the lexer function GET-QUANTIFIER.
 515 Will return <parse-tree> or (:GREEDY-REPETITION <min> <max> <parse-tree>)."
 516   (let* ((group (group lexer))
 517          (token (get-quantifier lexer)))
 518     (if token
 519       ;; if GET-QUANTIFIER returned a non-NIL value it's the
 520       ;; two-element list (<min> <max>)
 521       (list :greedy-repetition (first token) (second token) group)
 522       group)))
 523
 524 (defun quant (lexer)
 525   (declare #.*standard-optimize-settings*)
 526   (greedy-quant lexer))
 527
 528 (defun seq (lexer)
 529   (declare #.*standard-optimize-settings*)
 530   "Parses and consumes a <seq>.
 531 The productions are: <seq> -> <quant> | <quant><seq>.
 532 Will return <parse-tree> or (:SEQUENCE <parse-tree> <parse-tree>)."
 533   (flet ((make-array-from-two-chars (char1 char2)
 534            (let ((string (make-array 2
 535                                      :element-type 'character
 536                                      :fill-pointer t
 537                                      :adjustable t)))
 538              (setf (aref string 0) char1)
 539              (setf (aref string 1) char2)
 540              string)))
 541     ;; Note that we're calling START-OF-SUBEXPR-P before we actually try
 542     ;; to parse a <seq> or <quant> in order to catch empty regular
 543     ;; expressions
 544     (if (start-of-subexpr-p lexer)
 545       (let ((quant (quant lexer)))
 546         (if (start-of-subexpr-p lexer)
 547           (let* ((seq (seq lexer))
 548                  (quant-is-char-p (characterp quant))
 549                  (seq-is-sequence-p (and (consp seq)
 550                                          (eq (first seq) :sequence))))
 551             (cond ((and quant-is-char-p
 552                         (characterp seq))
 553                     (make-array-from-two-chars seq quant))
 554                   ((and quant-is-char-p
 555                         (stringp seq))
 556                     (vector-push-extend quant seq)
 557                     seq)
 558                   ((and quant-is-char-p
 559                         seq-is-sequence-p
 560                         (characterp (second seq)))
 561                     (cond ((cddr seq)
 562                             (setf (cdr seq)
 563                                     (cons
 564                                      (make-array-from-two-chars (second seq)
 565                                                                 quant)
 566                                      (cddr seq)))
 567                             seq)
 568                           (t (make-array-from-two-chars (second seq) quant))))
 569                   ((and quant-is-char-p
 570                         seq-is-sequence-p
 571                         (stringp (second seq)))
 572                     (cond ((cddr seq)
 573                             (setf (cdr seq)
 574                                     (cons
 575                                      (progn
 576                                        (vector-push-extend quant (second seq))
 577                                        (second seq))
 578                                      (cddr seq)))
 579                             seq)
 580                           (t
 581                             (vector-push-extend quant (second seq))
 582                             (second seq))))
 583                   (seq-is-sequence-p
 584                     ;; if <seq> is also a :SEQUENCE parse tree we merge
 585                     ;; both lists into one to avoid unnecessary consing
 586                     (setf (cdr seq)
 587                             (cons quant (cdr seq)))
 588                     seq)
 589                   (t (list :sequence quant seq))))
 590           quant))
 591       :void)))
 592
 593 (defun reg-expr (lexer)
 594   (declare #.*standard-optimize-settings*)
 595   "Parses and consumes a <regex>, a complete regular expression.
 596 The productions are: <regex> -> <seq> | <seq>\"|\"<regex>.
 597 Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)."
 598   (let ((pos (lexer-pos lexer)))
 599     (case (next-char lexer)
 600       ((nil)
 601         ;; if we didn't get any token we return :VOID which stands for
 602         ;; "empty regular expression"
 603         :void)
 604       ((#\|)
 605         ;; now check whether the expression started with a vertical
 606         ;; bar, i.e. <seq> - the left alternation - is empty
 607         (list :alternation :void (reg-expr lexer)))
 608       (otherwise
 609         ;; otherwise un-read the character we just saw and parse a
 610         ;; <seq> plus the character following it
 611         (setf (lexer-pos lexer) pos)
 612         (let* ((seq (seq lexer))
 613                (pos (lexer-pos lexer)))
 614           (case (next-char lexer)
 615             ((nil)
 616               ;; no further character, just a <seq>
 617               seq)
 618             ((#\|)
 619               ;; if the character was a vertical bar, this is an
 620               ;; alternation and we have the second production
 621               (let ((reg-expr (reg-expr lexer)))
 622                 (cond ((and (consp reg-expr)
 623                             (eq (first reg-expr) :alternation))
 624                         ;; again we try to merge as above in SEQ
 625                         (setf (cdr reg-expr)
 626                                 (cons seq (cdr reg-expr)))
 627                         reg-expr)
 628                       (t (list :alternation seq reg-expr)))))
 629             (otherwise
 630               ;; a character which is not a vertical bar - this is
 631               ;; either a syntax error or we're inside of a group and
 632               ;; the next character is a closing parenthesis; so we
 633               ;; just un-read the character and let another function
 634               ;; take care of it
 635               (setf (lexer-pos lexer) pos)
 636               seq)))))))
 637
 638 (defun reverse-strings (parse-tree)
 639   (declare #.*standard-optimize-settings*)
 640   (cond ((stringp parse-tree)
 641           (nreverse parse-tree))
 642         ((consp parse-tree)
 643           (loop for parse-tree-rest on parse-tree
 644                 while parse-tree-rest
 645                 do (setf (car parse-tree-rest)
 646                            (reverse-strings (car parse-tree-rest))))
 647           parse-tree)
 648         (t parse-tree)))
 649
 650 (defun parse-pattern (string)
 651   (declare #.*standard-optimize-settings*)
 652   "Translate the regex string STRING into a parse tree."
 653   (let* ((*in-pattern-parser-p* t)
 654          (lexer (make-lexer string))
 655          (parse-tree (reverse-strings (reg-expr lexer))))
 656     ;; check whether we've consumed the whole regex string
 657     (if (end-of-string-p lexer)
 658         `(:sequence :start-anchor ,parse-tree :end-anchor)
 659         (signal-ppcre-syntax-error*
 660          (lexer-pos lexer)
 661          "Expected end of string"))))
 662
 663 (defmethod pattern-scanner ((str string))
 664   (cl-ppcre:create-scanner (parse-pattern str)))
 665
 666 (defmethod pattern-scanner ((scanner function))
 667   scanner)