4 Maxima interface to pregexp.lisp
(a portable regex parser by Dorai Sitaram
)
6 Copyright
: 2008 -
2016 Volker van Nek
8 --------------------------------------------------------------------------------
10 This program is free software
; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation
; either version 2 of the License, or
13 (at your option
) any later version.
15 This program is distributed in the hope that it will be useful
,
16 but WITHOUT ANY WARRANTY
; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program
; if not, write to the Free Software
22 Foundation
, Inc.
, 51 Franklin Street
, Fifth Floor
, Boston
,
25 --------------------------------------------------------------------------------
29 regex_compile - returns a Lisp structure containing a compiled regex.
31 regex_match_pos
, regex_match
, regex_split
, regex_subst_first
, regex_subst -
32 the regular expression used as an argument may be a string or
33 compiled by regex_compile.
35 string_to_regex - masking special characters.
39 str
: "his hay needle stack -- my hay needle stack -- her hay needle stack"$
41 regex
: regex_compile
("ne{2}dle")$
43 regex_match_pos
(regex, str
);
45 regex_match_pos
("ne{2}dle", str
);
47 regex_match_pos
("ne{2}dle", str
, 25, 44);
49 regex_match
("ne{2}dle", "hay needle stack");
51 regex_match
("ne{2}dle", "hay needle stack", 10);
53 regex_split
("[,;]+", "split,pea;;;soup");
54 ["split", "pea", "soup"]
55 regex_subst_first
("ty", "t.", "liberte egalite fraternite");
56 "liberty egalite fraternite"
57 regex_subst
("ty", "t.\\b", "liberte egalite fraternite");
58 "liberty egality fraternity"
59 string_to_regex
(". :");
62 --------------------------------------------------------------------------------
64 The pregexp manual by Dorai Sitaram is at
65 http
://ds26gte.github.io
/pregexp
/index.html.
67 --------------------------------------------------------------------------------
69 Like in stringproc.lisp we use
1-indexed position specifications.
71 When the external format is not utf-8
(unicode) positions are counted in
72 octets at Lisp level and in Maxima characters at Maxima level.
73 See remarks in stringproc.lisp.
74 Without unicode support non-us-ascii Maxima characters are not recognized
75 by regular expressions
, e.g. the regex
"." doesn
't match to an umlaut.
82 (declare-top (special *parse-utf-8-input
*))
86 (defstruct (compiled-regex (:print-function compiled-regex-print
))
87 ;; The compiled parse tree
89 ;; The regex pattern for the parse-tree. Used to print a nice
90 ;; description of what the compiled regex is for.
93 (defun compiled-regex-print (struct stream i
)
95 ;; Include the pattern so the printed object is more meaningful
96 ;; instead of being some opaque structure with no information on
98 (format stream
"Structure [COMPILED-REGEX for ~S]"
99 (compiled-regex-pattern struct
)))
101 (defmfun $regex_compile
(regex)
103 :parse-tree
(pregexp:pregexp regex
)
107 (defun regex-check-and-maybe-coerce (name regex
&rest args
)
109 ((compiled-regex-p regex
)
110 (setq regex
(compiled-regex-parse-tree regex
)) )
111 ((not (stringp regex
))
112 (gf-merror (intl:gettext
"`~m': first arg must be a compiled regex or a string.") name
) ))
113 (unless (every #'stringp args
)
114 (gf-merror (intl:gettext
"Unsuitable arguments to `~m'.") name
) )
118 (defun regex-index-error (name)
119 (gf-merror (intl:gettext
"`~m': improper start or end index.") name
) )
122 ;; When the external format is not utf-8 (unicode) positions are counted in octets.
123 ;; We want them in numbers of characters to find the right position in a string.
124 ;; utf-8-pos-dec returns the decrement we need to adjust.
125 ;; (string position = octet position - decrement)
126 (defun regex-utf-8-pos-dec (ov off pos
) ;; begin to count at a given offset
130 (when (= (logand (aref ov i
) 192.
) 128.
)
134 (defmfun $regex_match_pos
(regex str
&optional
(start 1) (end nil
)) ;; 1-based indexing!
135 (setq regex
(regex-check-and-maybe-coerce "regex_match_pos" regex str
))
137 (when end
(decf end
))
140 (when *parse-utf-8-input
*
141 (setq ov
(string-to-raw-bytes str
))
142 (let ((args (utf-8-fix-start-end ov
(list nil start end
))))
143 (setq start
(cadr args
)
145 (let ((pos-list (pregexp:pregexp-match-positions regex str start end
))
148 (dolist (pos pos-list
(cons '(mlist simp
) (nreverse pos-mlist
)))
149 (when *parse-utf-8-input
*
150 (let ((dec (regex-utf-8-pos-dec ov
0 (car pos
))))
151 (decf (cdr pos
) (+ dec
(regex-utf-8-pos-dec ov
(car pos
) (cdr pos
))))
152 (decf (car pos
) dec
) ))
153 (push `((mlist simp
) ,(1+ (car pos
)) ,(1+ (cdr pos
))) pos-mlist
) )
154 (return-from $regex_match_pos nil
) )))
155 (regex-index-error "regex_match_pos") )))
158 (defmfun $regex_match
(regex str
&optional
(start 1) (end nil
))
159 (setq regex
(regex-check-and-maybe-coerce "regex_match" regex str
))
161 (when *parse-utf-8-input
*
162 (let* ((ov (string-to-raw-bytes str
))
163 (args (utf-8-fix-start-end ov
(list nil start end
))) )
164 (setq start
(cadr args
)
167 (pregexp:pregexp-match regex str
(1- start
) (if end
(1- end
) nil
)) ))
169 (cons '(mlist simp
) match
)
170 (return-from $regex_match nil
) )))
171 (regex-index-error "regex_match") ))
174 (defmfun $regex_split
(regex str
)
175 (setq regex
(regex-check-and-maybe-coerce "regex_split" regex str
))
176 (cons '(mlist simp
) (pregexp:pregexp-split regex str
)) )
179 (defmfun $regex_subst_first
(replacement regex str
)
180 (setq regex
(regex-check-and-maybe-coerce "regex_subst_first" regex str replacement
))
181 (pregexp:pregexp-replace regex str replacement
) )
183 ;; Argument order different to the order of pregexp-replace.
184 ;; Use order like in $ssubst or substitute: new, old, str.
186 (defmfun $regex_subst
(replacement regex str
)
187 (setq regex
(regex-check-and-maybe-coerce "regex_subst" regex str replacement
))
188 (pregexp:pregexp-replace
* regex str replacement
) )
191 (defmfun $string_to_regex
(str)
192 (unless (stringp str
)
193 (gf-merror (intl:gettext
"`string_to_regex': Argument must be a string.")) )
194 (pregexp:pregexp-quote str
) )