1 ;Portable regular expressions for Common Lisp
6 (defparameter *pregexp-version
* 20200129) ;last change
8 (defparameter *pregexp-comment-char
* #\
;)
10 (defparameter *pregexp-space-sensitive-p
* t
)
12 (defmacro pregexp-recur
(name varvals
&rest body
)
13 `(labels ((,name
,(mapcar #'first varvals
) ,@body
))
14 (,name
,@(mapcar #'second varvals
))))
16 (defun pregexp-read-pattern (s i n
)
17 (if (>= i n
) (values `(:or
(:seq
)) i
)
21 (char= (char s i
) #\
)))
22 (return (values (cons :or
(nreverse branches
)) i
)))
23 (multiple-value-bind (branch1 i1
)
25 s
(if (char= (char s i
) #\|
) (+ i
1) i
) n
)
26 (push branch1 branches
)
29 (defun pregexp-read-branch (s i n
)
33 (member (char s i
) '(#\|
#\
))))
34 (return (values (cons :seq
(nreverse pieces
)) i
)))
35 (t (multiple-value-bind (pc i-new
)
36 (pregexp-read-piece s i n
)
40 (defun pregexp-read-piece (s i n
)
41 (let ((c (char s i
))) (incf i
)
45 (#\.
(pregexp-wrap-quantifier-if-any :any s i n
))
46 (#\
[ (let ((negp nil
))
48 (char= (char s i
) #\^
))
52 (multiple-value-bind (chars i1
)
53 (pregexp-read-char-list s i n
)
54 (pregexp-wrap-quantifier-if-any
55 (if negp
(list :neg-char chars
) chars
)
57 (#\
( (multiple-value-bind (re i1
)
58 (pregexp-read-subpattern s i n
)
59 (pregexp-wrap-quantifier-if-any
61 (#\\ (multiple-value-bind (m i1
)
62 (pregexp-read-escaped-number s i n
)
64 (pregexp-wrap-quantifier-if-any
65 (list :backref m
) s i1 n
)
66 (multiple-value-bind (c i1
)
67 (pregexp-read-escaped-char s i
)
69 (pregexp-wrap-quantifier-if-any c s i1 n
)
70 (error "pregexp-read-piece: backslash"))))))
71 (t (if (or *pregexp-space-sensitive-p
*
72 (and (not (pregexp-whitespacep c
))
73 (not (char= c
*pregexp-comment-char
*))))
74 (pregexp-wrap-quantifier-if-any c s i n
)
75 (let ((in-comment-p nil
))
78 (when (char= c
#\newline
)
79 (setq in-comment-p nil
)))
80 ((pregexp-whitespacep c
) t
)
81 ((char= c
*pregexp-comment-char
*)
82 (setq in-comment-p t
))
83 (t (decf i
) (return (values :empty i
))))
84 (when (>= i n
) (return (values :empty i
)))
88 (defun pregexp-read-escaped-number (s i n
)
89 (let ((r '()) (c nil
))
92 (not (digit-char-p (setq c
(char s i
)))))
97 (values (read-from-string (concatenate 'string
(nreverse r
)))
100 (defun pregexp-read-escaped-char (s i
)
101 (let ((c (char s i
)))
107 (#\D
(list :neg-char
:digit
))
111 (#\S
(list :neg-char
:space
))
114 (#\W
(list :neg-char
:word
))
118 (defun pregexp-read-posix-char-class (s i n
)
119 (let ((r '()) (negp nil
))
121 (when (>= i n
) (error "pregexp-read-posix-char-class"))
122 (let ((c (char s i
))) (incf i
)
123 (cond ((char= c
#\^
) (setq negp t
))
124 ((alpha-char-p c
) (push c r
))
127 (not (char= (char s i
) #\
])))
128 (error "pregexp-read-posix-char-class"))
131 (t (error "pregexp-read-posix-char-class")))))
133 (intern (string-upcase (concatenate 'string
(nreverse r
)))
136 (if negp
(list :neg-char posix-class
) posix-class
)
139 (defun pregexp-read-cluster-type (s i
)
140 (let ((c (char s i
))) (incf i
)
143 (setq c
(char s i
)) (incf i
)
146 (#\
= (values `(:lookahead
) i
))
147 (#\
! (values `(:neg-lookahead
) i
))
148 (#\
> (values `(:no-backtrack
) i
))
149 (#\
< (setq c
(char s i
)) (incf i
)
151 (#\
= (values `(:lookbehind
) i
))
152 (#\
! (values `(:neg-lookbehind
) i
))
153 (t (error "pregexp-read-cluster-type"))))
154 (t (let ((r '()) (invp nil
))
158 (#\i
(push (if invp
:case-sensitive
159 :case-insensitive
) r
)
161 (#\x
(setq *pregexp-space-sensitive-p
* invp
)
163 (#\
: (return (values (nreverse r
) i
)))
164 (t (error "pregexp-read-cluster-type")))
167 (t (decf i
) (values '(:sub
) i
)))))
169 (defun pregexp-read-subpattern (s i n
)
170 (let ((remember-space-sensitive-p *pregexp-space-sensitive-p
*))
171 (multiple-value-bind (ctyp i1
)
172 (pregexp-read-cluster-type s i
)
173 (multiple-value-bind (re i2
)
174 (pregexp-read-pattern s i1 n
)
175 (setq *pregexp-space-sensitive-p
* remember-space-sensitive-p
)
177 (char= (char s i2
) #\
)))
180 (setq re
(list ct re
)))
182 (t (error "pregexp-read-subpattern")))))))
184 (defun pregexp-wrap-quantifier-if-any (re s i n
)
186 (when (>= i n
) (return (values re i
)))
187 (let ((c (char s i
))) (incf i
)
188 (if (and (pregexp-whitespacep c
) (not *pregexp-space-sensitive-p
*)) t
191 (let* ((new-re (list :between nil
1 1 re
)))
192 ; (:between non-greedy at-least at-most re)
194 (#\
* (setf (third new-re
) 0
195 (fourth new-re
) nil
))
196 (#\
+ (setf (fourth new-re
) nil
))
197 (#\? (setf (third new-re
) 0))
198 (#\
{ (multiple-value-bind (p q i1
)
199 (pregexp-read-nums s i n
)
200 (setf (third new-re
) p
204 (when (>= i n
) (return))
205 (let ((c (char s i
))) (incf i
)
206 (cond ((and (pregexp-whitespacep c
)
207 (not *pregexp-space-sensitive-p
*)) t
)
209 (setf (second new-re
) t
)
211 (t (decf i
) (return)))))
212 (return (values new-re i
))))
213 (t (decf i
) (return (values re i
))))))))
215 (defun pregexp-whitespacep (c)
216 (or (char= c
#\space
) (char= c
#\tab
)
217 (not (graphic-char-p c
))))
219 (defun pregexp-read-nums (s i n
)
220 (let ((p '()) (q '()) (reading 1))
222 (when (>= i n
) (error "pregexp-read-nums: unmatched left brace"))
223 (let ((c (char s i
)))
224 (cond ((digit-char-p c
)
229 ((and (pregexp-whitespacep c
) (not *pregexp-space-sensitive-p
*))
231 ((and (char= c
#\
,) (= reading
1))
232 (incf i
) (incf reading
))
235 (setq p
(read-from-string (concatenate 'string
(nreverse p
)) nil
)
236 q
(read-from-string (concatenate 'string
(nreverse q
)) nil
))
238 (cond ((and (not p
) (= reading
1)) (values 0 nil i
))
239 ((= reading
1) (values p p i
))
240 (t (values p q i
)))))
241 (t (error "pregexp-read-nums: left brace must be followed by number")))))))
243 (defun pregexp-read-char-list (s i n
)
246 (when (>= i n
) (error "pregexp-read-char-list: char class ended too soon"))
247 (let ((c (char s i
))) (incf i
)
250 (progn (push c r
) (incf i
))
252 (#\\ (multiple-value-bind (c2 i2
)
253 (pregexp-read-escaped-char s i
)
255 (error "pregexp-read-char-list: backslash"))
258 (#\-
(if (or (null r
)
260 (char= (char s i
) #\
])))
266 (push `(:char-range
,c-1
,(char s i
)) r
)
269 (#\
[ (if (char= (char s i
) #\
:)
270 (multiple-value-bind (c i1
)
271 (pregexp-read-posix-char-class s
(1+ i
) n
)
276 (values (cons :one-of-chars
280 (defun pregexp-string-match (s1 s i n sk fk
)
281 (let ((n1 (length s1
)))
282 (if (> n1 n
) (funcall fk
)
283 (let ((j 0) (k i
) (failp nil
))
285 (cond ((>= j n1
) (return))
286 ((>= k n
) (return (setq failp t
)))
287 ((char= (char s1 j
) (char s k
))
289 (t (return (setq failp t
)))))
290 (if failp
(funcall fk
)
293 (defun pregexp-char-word?
(c)
294 (or (alpha-char-p c
) (digit-char-p c
) (char= c
#\_
)))
296 (defun pregexp-at-word-boundary-p (s i n
)
299 (let* ((c-i (char s i
))
300 (c-i-minus-1 (char s
(- i
1)))
301 (c-i-is-word-p (pregexp-check-if-in-char-class-p c-i
:word
))
302 (c-i-minus-1-is-word-p (pregexp-check-if-in-char-class-p c-i-minus-1
:word
)))
303 (or (and c-i-is-word-p
(not c-i-minus-1-is-word-p
))
304 (and (not c-i-is-word-p
) c-i-minus-1-is-word-p
)))))
306 (defun pregexp-check-if-in-char-class-p (c char-class
) ;check thoroughly
308 (:any
(not (char= c
#\newline
)))
310 (:alnum
(or (alpha-char-p c
) (digit-char-p c
)))
311 (:alpha
(alpha-char-p c
))
312 (:ascii
(< (char-code c
) 128))
313 (:blank
(or (char= c
#\space
) (char= c
#\tab
)))
314 (:cntrl
(< (char-code c
) 32))
315 (:digit
(digit-char-p c
))
316 (:graph
(and (pregexp-check-if-in-char-class-p c
:print
)
317 (not (pregexp-whitespacep c
))))
318 (:lower
(lower-case-p c
))
319 (:print
(>= (char-code c
) 32))
320 (:punct
(and (pregexp-check-if-in-char-class-p c
:print
)
321 (not (or (pregexp-whitespacep c
)
324 (:space
(pregexp-whitespacep c
))
325 (:upper
(upper-case-p c
))
326 (:word
(or (alpha-char-p c
) (digit-char-p c
) (char= c
#\_
)))
327 (:xdigit
(or (digit-char-p c
)
328 (member c
'(#\a #\b #\c
#\d
#\e
#\f) :test
#'char-equal
)))
329 (t (error "pregexp-check-if-in-char-class-p"))))
331 (defun pregexp-make-backref-list (re)
334 (rest-backrefs (pregexp-make-backref-list (cdr re
))))
336 (cons (cons re nil
) rest-backrefs
)
337 (append (pregexp-make-backref-list re1
) rest-backrefs
)))
340 (defun pregexp-match-positions-aux (re s sn start n i
)
341 (let* ((backrefs (pregexp-make-backref-list re
))
342 (case-sensitive-p t
))
343 (flet ((char=1 (c1 c2
)
350 (char-not-greaterp c1 c2 c3
))))
352 match-loop
((re re
) (i i
) (sk #'identity
) (fk (lambda () nil
)))
354 (if (= i start
) (funcall sk i
) (funcall fk
)))
356 (if (>= i n
) (funcall sk i
) (funcall fk
)))
357 ((eq re
:empty
) (funcall sk i
))
359 (if (pregexp-at-word-boundary-p s i n
)
363 (if (pregexp-at-word-boundary-p s i n
)
366 ((and (characterp re
) (< i n
))
367 (if (char=1 (char s i
) re
)
370 ((and (not (consp re
)) (< i n
))
371 (if (pregexp-check-if-in-char-class-p (char s i
) re
)
377 (if (>= i n
) (funcall fk
)
378 (if (char<=1 (second re
) (char s i
) (third re
))
382 (if (>= i n
) (funcall fk
)
384 one-of-chars-loop
((chars (rest re
)))
385 (if (null chars
) (funcall fk
)
386 (match-loop (first chars
) i sk
388 (one-of-chars-loop (rest chars
))))))))
390 (if (>= i n
) (funcall fk
)
391 (match-loop (second re
) i
393 (declare (ignore i1
))
395 (lambda () (funcall sk
(1+ i
))))))
398 seq-loop
((res (rest re
)) (i i
))
399 (if (null res
) (funcall sk i
)
400 (match-loop (first res
) i
401 (lambda (i1) (seq-loop (rest res
) i1
))
405 or-loop
((res (rest re
)))
406 (if (null res
) (funcall fk
)
407 (match-loop (first res
) i
410 (or-loop (rest res
))))
412 (or-loop (cdr res
)))))))
414 (let* ((cell (nth (second re
) backrefs
))
416 (cond (cell (cdr cell
))
417 (t (error "pregexp-match-positions-aux: non-existent backref ~s" re
)
421 (pregexp-string-match
422 (subseq s
(car backref
) (cdr backref
))
426 (match-loop (second re
) i
428 (setf (cdr (assoc re backrefs
)) (cons i i1
))
433 (match-loop (second re
) i
436 (if found-it-p
(funcall sk i
) (funcall fk
))))
439 (match-loop (second re
) i
442 (if found-it-p
(funcall fk
) (funcall sk i
))))
444 (let ((n-actual n
) (sn-actual sn
))
447 (match-loop `(:seq
(:between nil
0 nil
:super-any
)
452 (setq n n-actual sn sn-actual
)
453 (if found-it-p
(funcall sk i
) (funcall fk
)))))
455 (let ((n-actual n
) (sn-actual sn
))
458 (match-loop `(:seq
(:between nil
0 nil
:super-any
)
463 (setq n n-actual sn sn-actual
)
464 (if found-it-p
(funcall fk
) (funcall sk i
)))))
467 (match-loop (second re
) i
#'identity
(lambda () nil
))))
469 (funcall sk found-it-p
)
471 ((:case-sensitive
:case-insensitive
)
472 (let ((old-case-sensitive-p case-sensitive-p
))
473 (setq case-sensitive-p
474 (eq (first re
) :case-sensitive
))
475 (match-loop (second re
) i
477 (setq case-sensitive-p old-case-sensitive-p
)
480 (setq case-sensitive-p old-case-sensitive-p
)
483 (let* ((non-greedy-p (second re
))
487 (could-loop-infinitely-p
488 (and (not non-greedy-p
) (not q
)))
495 (if (and could-loop-infinitely-p
497 (error "pregexp-match-positions-aux: greedy quantifier operand could be empty")
502 (let ((fk (lambda () (funcall sk i
))))
503 (if (and q
(>= k q
)) (funcall fk
)
504 (if (not non-greedy-p
)
507 (if (and could-loop-infinitely-p
509 (error "pregexp-match-positions-aux greedy quantifier operand could be empty"))
510 (or (q-loop (1+ k
) i1
)
518 (t (error "pregexp-match-positions-aux"))))
519 ((>= i n
) (funcall fk
))
520 (t (error "pregexp-match-positions-aux")))))
521 (setq backrefs
(mapcar #'cdr backrefs
))
522 (and (car backrefs
) backrefs
)))
524 (defun pregexp-replace-aux (str ins n backrefs
)
527 (when (>= i n
) (return r
))
528 (let ((c (char ins i
)))
531 (multiple-value-bind (m i1
)
532 (pregexp-read-escaped-number ins i n
)
533 (when (and (not i1
) (char= (char str i
) #\
&))
536 (cond (m (let ((backref (nth m backrefs
)))
538 (setq r
(concatenate 'string r
539 (subseq str
(car backref
) (cdr backref
)))))
541 (t (let ((c2 (char ins i
)))
543 (unless (char= c2
#\$
)
544 (setq r
(concatenate 'string r
(string c2
))))))))
545 (setq r
(concatenate 'string r
(string c
))))))))
548 (setq *pregexp-space-sensitive-p
* t
) ;in case it got corrupted
549 (list :sub
(pregexp-read-pattern s
0 (length s
))))
551 (defun pregexp-match-positions (pat str
&optional start end
)
552 (when (stringp pat
) (setq pat
(pregexp pat
)))
554 (error "pregexp-match-positions: pattern ~s must be compiled or string regexp"
556 (let ((str-len (length str
)))
557 (when (not start
) (setq start
0))
558 (when (or (not end
) (> end str-len
)) (setq end str-len
))
561 (unless (<= i end
) (return nil
))
562 (let ((res (pregexp-match-positions-aux
563 pat str str-len start end i
)))
567 (defun pregexp-match (pat str
&optional start end
)
569 (pregexp-match-positions pat str start end
)))
574 (subseq str
(car index-pair
) (cdr index-pair
))))
577 (defun pregexp-split (pat str
)
578 ;split str into substrings, using pat as delim
579 (let ((r '()) (n (length str
)) (i 0) (picked-up-one-undelimited-char-p nil
) it
)
581 (cond ((>= i n
) (return (nreverse r
)))
582 ((setq it
(car (pregexp-match-positions pat str i n
)))
583 (let ((j (car it
)) (k (cdr it
)))
584 (cond ((= j k
) (push (subseq str i
(1+ j
)) r
)
586 (setq picked-up-one-undelimited-char-p t
))
587 ((and (= j i
) picked-up-one-undelimited-char-p
)
589 (setq picked-up-one-undelimited-char-p nil
))
590 (t (push (subseq str i j
) r
)
592 (setq picked-up-one-undelimited-char-p nil
)))))
593 (t (push (subseq str i n
) r
)
596 (defun pregexp-replace (pat str ins
)
597 (let* ((n (length str
))
598 (pp (pregexp-match-positions pat str
0 n
)))
600 (let ((ins-len (length ins
))
605 (pregexp-replace-aux str ins ins-len pp
)
606 (subseq str m-n n
))))))
608 (defun pregexp-replace* (pat str ins
)
609 ;return str with every occurrence of pat replaced by ins
610 (when (stringp pat
) (setq pat
(pregexp pat
)))
611 (let ((n (length str
))
612 (ins-len (length ins
))
616 ;i = index in str to start replacing from
617 ;r = already calculated prefix of answer
618 (when (>= i n
) (return r
))
619 (let ((pp (pregexp-match-positions pat str i n
)))
623 ;this implies pat didn't match str at all,
624 ;so let's return original str
626 (concatenate 'string r
(subseq str i n
)))))
627 (setq r
(concatenate 'string r
628 (subseq str i
(caar pp
))
629 (pregexp-replace-aux str ins ins-len pp
)))
630 (setq i
(cdar pp
))))))
632 (defun pregexp-quote (s)
633 (let ((i (- (length s
) 1)) (r '()))
635 (when (< i
0) (return (concatenate 'string r
)))
636 (let ((c (char s i
)))
639 (when (member c
'(#\\ #\.
#\? #\
* #\
+ #\|
#\^
#\$
640 #\
[ #\
] #\
{ #\
} #\
( #\
)))
646 pregexp-read-subpattern
647 ; pregexp-read-char-list
650 ; pregexp-make-backref-list
651 pregexp-read-cluster-type
652 ; pregexp-wrap-quantifier-if-any
653 ; pregexp-match-positions
654 pregexp-match-positions-aux
656 ; pregexp-read-escaped-char
657 ; pregexp-read-escaped-number
658 ; pregexp-read-posix-char-class
659 ; pregexp-replace pregexp-replace-aux
660 ; pregexp-check-if-in-char-class-p