SRRAT: use MRAT reader functions instead of CADDAR, etc.
[maxima.git] / archive / src / cmulisp-regex.lisp
blob6dc05cdfeb805a754aa98b4bf9b9b19bf1cbd3e9
1 ;;;
2 ;;; Alien interface to GNU regex, for CMUCL
3 ;;;
4 ;;; Copyright 2000, Raymond Toy
5 ;;;
6 ;;; This is a part of Maxima and therefore released under the GPL that
7 ;;; governs GPL.
8 ;;;
9 ;;; It is intended that we support everything that GNU regex does, but
10 ;;; we're not quite there yet.
11 ;;;
13 (eval-when (compile load eval)
14 (defpackage "REGEXP"
15 (:use "COMMON-LISP" "ALIEN" "C-CALL")
16 (:export
17 ;; Constants
18 "+RE-BACKSLASH-ESCAPE-IN-LISTS+"
19 "+RE-BK-PLUS-QM+"
20 "+RE-CHAR-CLASSES+"
21 "+RE-CONTEXT-INDEP-ANCHORS+"
22 "+RE-CONTEXT-INDEP-OPS+"
23 "+RE-CONTEXT-INVALID-OPS+"
24 "+RE-DOT-NEWLINE+"
25 "+RE-DOT-NOT-NULL+"
26 "+RE-HAT-LISTS-NOT-NEWLINE+"
27 "+RE-INTERVALS+"
28 "+RE-LIMITED-OPS+"
29 "+RE-NEWLINE-ALT+"
30 "+RE-NO-BK-BRACES+"
31 "+RE-NO-BK-PARENS+"
32 "+RE-NO-BK-REFS+"
33 "+RE-NO-BK-VBAR+"
34 "+RE-NO-EMPTY-RANGES+"
35 "+RE-UNMATCHED-RIGHT-PAREN-ORD+"
36 ;; Common regexp syntaxes
37 "+RE-SYNTAX-EMACS+"
38 "+RE-SYNTAX-EGREP+"
39 "+RE-SYNTAX-POSIX-COMMON+"
40 "+RE-SYNTAX-POSIX-BASIC+"
41 "+RE-SYNTAX-POSIX-EXTENDED+"
42 "+RE-SYNTAX-SPENCER+"
43 ;; Variables
44 "*MATCH-DATA*"
45 "*CASE-FOLD-SEARCH*"
46 ;; Functions
47 "MATCH-DATA-START"
48 "MATCH-DATA-END"
49 "RE-SET-SYNTAX"
50 "COMPILE-PATTERN"
51 "ALLOCATE-RE-REGS"
52 "FREE-RE-REGS"
53 "RE-NSUB"
54 "LISPIFY-MATCH-DATA"
55 "RE-SEARCH"
56 "RE-REGFREE"
57 "STRING-MATCH"
58 "MATCH-BEGINNING"
59 "MATCH-END"
62 (defpackage "SI"
63 (:use "COMMON-LISP" "REGEXP" "ALIEN"))
64 ) ; end eval-when
66 (in-package "REGEXP")
68 #+nil
69 (export '(
70 ;; Constants
71 +re-backslash-escape-in-lists+
72 +re-bk-plus-qm+
73 +re-char-classes+
74 +re-context-indep-anchors+
75 +re-context-indep-ops+
76 +re-context-invalid-ops+
77 +re-dot-newline+
78 +re-dot-not-null+
79 +re-hat-lists-not-newline+
80 +re-intervals+
81 +re-limited-ops+
82 +re-newline-alt+
83 +re-no-bk-braces+
84 +re-no-bk-parens+
85 +re-no-bk-refs+
86 +re-no-bk-vbar+
87 +re-no-empty-ranges+
88 +re-unmatched-right-paren-ord+
89 ;; Common regexp syntaxes
90 +re-syntax-emacs+
91 +re-syntax-egrep+
92 +re-syntax-posix-common+
93 +re-syntax-posix-basic+
94 +re-syntax-posix-extended+
95 +re-syntax-spencer+
96 ;; Variables
97 *match-data*
98 *case-fold-search*
99 ;; Functions
100 match-data-start
101 match-data-end
102 string-match
103 match-beginning
104 match-end
107 (use-package "ALIEN")
108 (use-package "C-CALL")
110 (eval-when (:load-toplevel :compile-toplevel :execute)
111 (defvar *regex-lib*
112 "/apps/gnu/src/regex-0.12/regex.o"
113 "The full path to GNU regex.o")
115 (eval-when (:compile-toplevel :execute)
116 (ext:load-foreign *regex-lib*)
119 ;;; From regex.h
121 ;; GNU interface
122 (def-alien-type reg-syntax-t unsigned)
123 (def-alien-type re-pattern-buffer
124 (struct re-pattern-buffer
125 (buffer (* unsigned-char))
126 (allocated unsigned)
127 (used unsigned)
128 (syntax unsigned)
129 (fastmap (* unsigned-char))
130 (translate (* unsigned-char))
131 (re-nsub int)
132 (bit-fields int)))
134 (def-alien-type re-registers
135 (struct re-registers
136 (num-regs unsigned)
137 (start (* int))
138 (end (* int))))
141 (declaim (inline re-compile-pattern))
142 (def-alien-routine ("re_compile_pattern" re-compile-pattern) c-string
143 (pattern c-string)
144 (length int)
145 (buffer (* re-pattern-buffer)))
147 (declaim (inline re-match))
148 (def-alien-routine ("re_match" re-match) int
149 (buffer (* re-pattern-buffer))
150 (string c-string)
151 (length int)
152 (start int)
153 (regs (* re-registers)))
155 (declaim (inline re-set-registers))
156 (def-alien-routine ("re_set_registers" re-set-registers) void
157 (buffer (* re-pattern-buffer))
158 (regs (* re-registers))
159 (num-regs unsigned)
160 (starts (* int))
161 (ends (* int)))
163 (declaim (inline re-regfree))
164 (def-alien-routine ("regfree" re-regfree) void
165 (regs (* re-pattern-buffer)))
167 (declaim (inline re-search))
168 (def-alien-routine ("re_search" re-search) int
169 (buffer (* re-pattern-buffer))
170 (string c-string)
171 (length int)
172 (start int)
173 (range int)
174 (regs (* re-registers)))
176 (declaim (inline re-set-syntax))
177 (def-alien-routine ("re_set_syntax" re-set-syntax) reg-syntax-t
178 (syntax reg-syntax-t))
180 ;; Note: for some reason, I can't set this directly to get the desired
181 ;; syntax. I need to use re_set_syntax instead, which works.
182 (def-alien-variable ("re_syntax_options" re-syntax-options) reg-syntax-t)
185 ;;; POSIX interface
186 ;;; Not yet supported, but we really should since it's standardized.
188 (def-alien-type regex-t re-pattern-buffer)
189 (def-alien-type regoff-t int)
190 (def-alien-type regmatch-t
191 (struct regmatch-t
192 (rm-so int)
193 (rm-eo int)))
195 (declaim (inline re-regcomp))
196 (def-alien-routine ("regcomp" re-regcomp) int
197 (preg (* regex-t))
198 (regex c-string)
199 (cflags int))
201 (declaim (inline re-regexec))
202 (def-alien-routine ("regexec" re-regexec) int
203 (preg (* regex-t))
204 (string c-string)
205 (nmatch int)
206 (pmatch (array regmatch-t) :in-out)
207 (eflags int))
209 (declaim (inline re-regerror))
210 (def-alien-routine ("regerror" re-regerror) int
211 (errcode int)
212 (preg (* regex_t))
213 (errbuf c-string)
214 (errbuf-size int))
217 ;; Create all of the necessary constants defined in regex.h to define the syntax.
219 (macrolet ((frob (&rest name-desc-list)
220 `(progn
221 ,@(let ((bit 1))
222 (mapcar #'(lambda (name-desc)
223 (prog1
224 `(defconstant ,(first name-desc) ,bit ,(second name-desc))
225 (setf bit (ash bit 1))))
226 name-desc-list)))))
227 (frob (+re-backslash-escape-in-lists+
228 "If this bit is not set, then \\ inside a bracket expression is
229 literal. If set, then such a \\ quotes the following character. ")
230 (+re-bk-plus-qm+
231 "If this bit is not set, then + and ? are operators, and \\+ and \\?
232 are literals. If set, then \\+ and \\? are operators and + and ? are
233 literals.")
234 (+re-char-classes+
235 "If this bit is set, then character classes are supported. They are:
236 [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:],
237 [:xdigit:],[:space:], [:print:], [:punct:], [:graph:], and
238 [:cntrl:]. If not set, then character classes are not supported.")
239 (+re-context-indep-anchors+
240 "If this bit is set, then ^ and $ are always anchors (outside bracket
241 expressions, of course).
242 If this bit is not set, then it depends:
243 ^ is an anchor if it is at the beginning of a regular
244 expression or after an open-group or an alternation operator;
245 $ is an anchor if it is at the end of a regular expression, or
246 before a close-group or an alternation operator.
248 (+re-context-indep-ops+ "")
249 (+re-context-invalid-ops+ "")
250 (+re-dot-newline+ "")
251 (+re-dot-not-null+ "")
252 (+re-hat-lists-not-newline+ "")
253 (+re-intervals+ "")
254 (+re-limited-ops+ "")
255 (+re-newline-alt+ "")
256 (+re-no-bk-braces+ "")
257 (+re-no-bk-parens+ "")
258 (+re-no-bk-refs+ "")
259 (+re-no-bk-vbar+ "")
260 (+re-no-empty-ranges+ "")
261 (+re-unmatched-right-paren-ord+ "")))
263 ;; Define some common syntaxes.
265 (defconstant +re-syntax-emacs+ 0)
267 (defconstant +re-syntax-awk+
268 (logior +re-backslash-escape-in-lists+ +re-dot-not-null+
269 +re-no-bk-parens+ +re-no-bk-refs+
270 +re-no-bk-vbar+ +re-no-empty-ranges+
271 +re-unmatched-right-paren-ord+))
273 (defconstant +re-syntax-grep+
274 (logior +re-bk-plus-qm+ +re-char-classes+
275 +re-hat-lists-not-newline+ +re-intervals+
276 +re-newline-alt+))
278 (defconstant +re-syntax-egrep+
279 (logior +re-char-classes+ +re-context-indep-anchors+
280 +re-context-indep-ops+ +re-hat-lists-not-newline+
281 +re-newline-alt+ +re-no-bk-parens+
282 +re-no-bk-vbar+))
284 (defconstant +re-syntax-posix-common+
285 (logior +re-char-classes+ +re-dot-newline+ +re-dot-not-null+
286 +re-intervals+ +re-no-empty-ranges+))
288 (defconstant +re-syntax-posix-basic+
289 (logior +re-syntax-posix-common+ +re-bk-plus-qm+))
291 (defconstant +re-syntax-posix-minimal-basic+
292 (logior +re-syntax-posix-common+ +re-limited-ops+))
294 (defconstant +re-syntax-posix-extended+
295 (logior +re-syntax-posix-common+ +re-context-indep-anchors+
296 +re-context-indep-ops+ +re-no-bk-braces+
297 +re-no-bk-parens+ +re-no-bk-vbar+
298 +re-unmatched-right-paren-ord+))
300 (defconstant +re-syntax-posix-awk+
301 (logior +re-syntax-posix-extended+ +re-backslash-escape-in-lists+))
303 ;; This isn't defined regex.h, but GCL uses this syntax in its info
304 ;; reader. (Not 100% sure this is right, but is close enough for
305 ;; GCL's and maxima's use.)
306 (defconstant +re-syntax-spencer+
307 (logior +re-no-bk-parens+ +re-no-bk-vbar+))
308 ;;; This ends the raw GNU regex interface.
311 ;;; A simple slightly higher-level interface to GNU regex that might
312 ;;; be more appropriate for Lisp.
313 #+nil
314 (defun allocate-re-regs (compiled-pattern-buffer)
315 (declare (type (alien (* re-pattern-buffer)) compiled-pattern-buffer))
316 (let* ((nregs (1+ (slot compiled-pattern-buffer 're-nsub)))
317 (re-regs (make-alien re-registers 1))
318 (reg-start (make-alien int nregs))
319 (reg-end (make-alien int nregs)))
320 (re-set-registers compiled-pattern-buffer re-regs
321 nregs reg-start reg-end)
322 (ext:finalize re-regs
323 #'(lambda ()
324 (format t "~&freeing re-regs~%")
325 (free-alien (slot (deref re-regs 0) 'start))
326 (free-alien (slot (deref re-regs 0) 'end))
328 re-regs))
330 (defun allocate-re-regs ()
331 (let ((regs (make-alien re-registers 1)))
332 (setf (slot (deref regs) 'num-regs) 0)
333 regs))
335 ;; Return the number of matches and submatches found in the result
336 ;; pattern buffer after doing a search. Assumes the search was
337 ;; successful.
338 (defun re-nsub (pat-buf)
339 (1+ (slot (deref pat-buf) 're-nsub)))
341 (defun free-re-regs (re-regs)
342 (declare (type (alien (* re-registers)) re-regs))
344 (let ((r (deref re-regs)))
345 ;;(format t "freeing ~A:~%" re-regs)
346 ;;(format t " num-regs: ~A~%" (slot r 'num-regs))
347 (when (plusp (slot r 'num-regs))
348 ;;(format t " free start: ~A~%" (slot r 'start))
349 ;;(format t " free end: ~A~%" (slot r 'end))
350 (free-alien (slot r 'start))
351 (free-alien (slot r 'end))
352 (free-alien re-regs)
355 (defun make-case-fold-table ()
356 "Translation table to fold all uppercase ASCII characters to lower
357 case characters"
358 (let ((tab (make-alien (unsigned 8) 256)))
359 ;; Initialize the table to the 256 ASCII characters
360 (dotimes (k 256)
361 (setf (deref tab k) k))
362 ;; Translate the upper case characters to lower case
363 (loop for k from (char-int #\A) to (char-int #\Z)
364 do (setf (deref tab k) (- k #.(- (char-int #\A) (char-int #\a)))))
365 tab))
368 (defvar *match-data* nil
369 "The match-data from the latest successful string-match")
370 (declaim (type (or null (simple-array t (*))) *match-data*))
372 (defvar *case-fold-search* nil
373 "Non-NIL if the character case should be folded during searchs")
376 (defun allocate-re-pattern-buffer ()
377 (let* ((pat-buf-ptr (make-alien re-pattern-buffer 1))
378 (pat-buf (deref pat-buf-ptr 0)))
379 ;; Set BUFFER to NIL and ALLOCATED to 0 so re_compile_pattern
380 ;; allocates space for us.
381 (setf (slot pat-buf 'buffer) nil)
382 (setf (slot pat-buf 'allocated) 0)
383 ;; We don't support fastmap
384 (setf (slot pat-buf 'fastmap) (make-alien unsigned-char 256))
385 ;; Set case folding appropriately
386 (setf (slot pat-buf 'translate)
387 (if *case-fold-search* (make-case-fold-table) nil))
388 pat-buf-ptr))
391 (defun dump-compiled-pattern (compiled-pattern)
392 (declare (type (alien (* re-pattern-buffer)) compiled-pattern))
393 (let ((pat-buf (deref compiled-pattern)))
394 (format t "buffer = ~S~%" (slot pat-buf 'buffer))
395 (format t "allocated = ~S~%" (slot pat-buf 'allocated))
396 (format t "used = ~S~%" (slot pat-buf 'used))
397 (format t "syntax = ~S~%" (slot pat-buf 'syntax))
398 (format t "fastmap = ~S~%" (slot pat-buf 'fastmap))
399 (format t "re-nsub = ~S~%" (slot pat-buf 're-nsub))
400 (format t "translate = ~S~%" (slot pat-buf 'translate))
401 (format t "bit-fields = ~:42,' ,' ,4B~%" (slot pat-buf 'bit-fields))
404 (defun compile-pattern (pattern-string)
405 (declare (type string pattern-string))
406 (let* ((pat-buf (allocate-re-pattern-buffer))
407 (comp (re-compile-pattern pattern-string
408 (length pattern-string)
409 pat-buf)))
410 (when comp
411 (unwind-protect
412 (error "~A in regexp ~S" comp pattern-string)
413 ;; Free up the pattern buffer
414 (re-regfree pat-buf)
415 (free-alien pat-buf)))
416 pat-buf))
418 (defstruct match-data
419 (start 0 :type (unsigned-byte 32))
420 (end 0 :type (unsigned-byte 32)))
422 ;; Copy the data in the alien re-register to a lisp array
423 (defun lispify-match-data (nsub re-regs)
424 (declare (fixnum nsub)
425 (type (alien (* re-registers)) re-regs))
426 (let* ((regs (deref re-regs))
427 (start (slot regs 'start))
428 (end (slot regs 'end))
429 (matches (make-array nsub)))
430 (dotimes (k nsub)
431 (setf (aref matches k)
432 (make-match-data :start (deref start k) :end (deref end k))))
433 matches))
435 (in-package "SI")
436 ;;; Define the interface needed by cl-info.
437 (defun string-match (pattern string
438 &optional (start 0) end
439 (syntax +re-syntax-posix-basic+))
440 "Search the string STRING for the first pattern that matches the
441 regexp PATTERN. The syntax used for the pattern is specified by
442 SYNTAX. The search may start in the string at START and ends at END,
443 which default to 0 and the end of the string, respectively.
445 If there is a match, returns the index of the start of the match and
446 an array of match-data. If there is no match, -1 is returned and
447 nil."
448 (declare (type string pattern string))
449 (re-set-syntax syntax)
450 (let (comp-result)
451 ;; Make sure we free up the space for the pattern buffer.
452 (unwind-protect
453 (progn
454 (setf comp-result (compile-pattern pattern))
455 (cond (comp-result
456 (let (re-regs)
457 ;; Make sure we free up the space for the registers
458 (unwind-protect
459 (progn
460 (setf re-regs (allocate-re-regs))
461 (let ((search-result
462 (re-search comp-result string (length string)
463 start (or end (length string))
464 re-regs)))
465 (cond ((>= search-result 0)
466 (let ((matches
467 (lispify-match-data
468 (re-nsub comp-result)
469 re-regs)))
470 ;; Save the last match in the global var
471 (setf *match-data* matches)
472 (values search-result matches)))
474 (values search-result nil)))))
475 ;; Free up the re-register since we're done with it now.
476 (free-re-regs re-regs))))
478 (setf *match-data* nil)
479 (values -1 nil))))
480 ;; Free the pattern buffer
481 (re-regfree comp-result)
482 (free-alien comp-result))))
484 ;; Memoized version
485 #+nil
486 (defvar *compiled-pattern-hashtable* (make-hash-table :test 'equal))
488 #+nil
489 (defun string-match (pattern string
490 &optional (start 0) end)
491 (declare (type string pattern string))
492 (setf re-syntax-options +re-syntax-posix-basic+)
493 (multiple-value-bind (comp-pattern foundp)
494 (gethash pattern *compiled-pattern-hashtable*)
495 (unless comp-pattern
496 ;; Compile up the pattern and save it away
497 (setf (gethash pattern *compiled-pattern-hashtable*)
498 (compile-pattern pattern))
499 (setf comp-pattern (gethash pattern *compiled-pattern-hashtable*)))
500 (unwind-protect
501 (progn
502 (cond (comp-pattern
503 (let* ((re-regs (allocate-re-regs)))
504 ;; Make sure we free up the space for the registers
505 (unwind-protect
506 (progn
507 (let ((search-result
508 (re-search comp-pattern string (length string)
509 start (or end (length string))
510 re-regs)))
511 (cond ((>= search-result 0)
512 (let ((matches
513 (lispify-match-data
514 (1+ (slot (deref comp-pattern) 're-nsub))
515 re-regs)))
516 ;; Save the last match in the global var
517 (setf *match-data* matches)
518 (values search-result matches)))
520 (values search-result nil)))))
521 ;; Free up the re-register since we're done with it now.
522 (free-re-regs re-regs))))
524 (setf *match-data* nil)
525 (values -1 nil)))))))
527 (defun match-beginning (index &optional (match-data *match-data*))
528 (if (and match-data (< index (length match-data)))
529 (match-data-start (aref match-data index))
530 -1))
532 (defun match-end (index &optional (match-data *match-data*))
533 (if (and match-data (< index (length match-data)))
534 (match-data-end (aref match-data index))
535 -1))