Use %%PRETTY-FNAME in more quadpack error messages
[maxima.git] / share / stringproc / sregex.lisp
blob28067da57b4b503856e931c07be6619742d7963d
1 #|
2 ~*~ sregex ~*~
4 Maxima interface to pregexp.lisp (a portable regex parser by Dorai Sitaram)
6 Copyright : 2008 - 2016 Volker van Nek
8 --------------------------------------------------------------------------------
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2 of the License, or
13 (at your option) any later version.
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23 MA 02110-1301, USA.
25 --------------------------------------------------------------------------------
27 Functions:
29 regex_compile - returns a Lisp structure containing a compiled regex.
31 regex_match_pos, regex_match, regex_split, regex_subst_first, regex_subst -
32 the regular expression used as an argument may be a string or
33 compiled by regex_compile.
35 string_to_regex - masking special characters.
37 Examples:
39 str : "his hay needle stack -- my hay needle stack -- her hay needle stack"$
41 regex : regex_compile("ne{2}dle")$
43 regex_match_pos(regex, str);
44 [[9, 15]]
45 regex_match_pos("ne{2}dle", str);
46 [[9, 15]]
47 regex_match_pos("ne{2}dle", str, 25, 44);
48 [[32, 38]]
49 regex_match("ne{2}dle", "hay needle stack");
50 ["needle"]
51 regex_match("ne{2}dle", "hay needle stack", 10);
52 false
53 regex_split("[,;]+", "split,pea;;;soup");
54 ["split", "pea", "soup"]
55 regex_subst_first("ty", "t.", "liberte egalite fraternite");
56 "liberty egalite fraternite"
57 regex_subst("ty", "t.\\b", "liberte egalite fraternite");
58 "liberty egality fraternity"
59 string_to_regex(". :");
60 "\. :"
62 --------------------------------------------------------------------------------
64 The pregexp manual by Dorai Sitaram is at
65 http://ds26gte.github.io/pregexp/index.html.
67 --------------------------------------------------------------------------------
69 Like in stringproc.lisp we use 1-indexed position specifications.
71 When the external format is not utf-8 (unicode) positions are counted in
72 octets at Lisp level and in Maxima characters at Maxima level.
73 See remarks in stringproc.lisp.
74 Without unicode support non-us-ascii Maxima characters are not recognized
75 by regular expressions, e.g. the regex "." doesn't match to an umlaut.
79 (in-package :maxima)
82 (declare-top (special *parse-utf-8-input*))
86 (defstruct (compiled-regex (:print-function compiled-regex-print))
87 ;; The compiled parse tree
88 parse-tree
89 ;; The regex pattern for the parse-tree. Used to print a nice
90 ;; description of what the compiled regex is for.
91 pattern)
93 (defun compiled-regex-print (struct stream i)
94 (declare (ignore i))
95 ;; Include the pattern so the printed object is more meaningful
96 ;; instead of being some opaque structure with no information on
97 ;; what it holds.
98 (format stream "Structure [COMPILED-REGEX for ~S]"
99 (compiled-regex-pattern struct)))
101 (defmfun $regex_compile (regex)
102 (make-compiled-regex
103 :parse-tree (pregexp:pregexp regex)
104 :pattern regex))
107 (defun regex-check-and-maybe-coerce (name regex &rest args)
108 (cond
109 ((compiled-regex-p regex)
110 (setq regex (compiled-regex-parse-tree regex)) )
111 ((not (stringp regex))
112 (gf-merror (intl:gettext "`~m': first arg must be a compiled regex or a string.") name) ))
113 (unless (every #'stringp args)
114 (gf-merror (intl:gettext "Unsuitable arguments to `~m'.") name) )
115 regex )
118 (defun regex-index-error (name)
119 (gf-merror (intl:gettext "`~m': improper start or end index.") name) )
122 ;; When the external format is not utf-8 (unicode) positions are counted in octets.
123 ;; We want them in numbers of characters to find the right position in a string.
124 ;; utf-8-pos-dec returns the decrement we need to adjust.
125 ;; (string position = octet position - decrement)
126 (defun regex-utf-8-pos-dec (ov off pos) ;; begin to count at a given offset
127 (do ((i off (1+ i))
128 (n 0))
129 ((= i pos) n)
130 (when (= (logand (aref ov i) 192.) 128.)
131 (incf n) )))
134 (defmfun $regex_match_pos (regex str &optional (start 1) (end nil)) ;; 1-based indexing!
135 (setq regex (regex-check-and-maybe-coerce "regex_match_pos" regex str))
136 (decf start)
137 (when end (decf end))
138 (let (ov)
139 (or (ignore-errors
140 (when *parse-utf-8-input*
141 (setq ov (string-to-raw-bytes str))
142 (let ((args (utf-8-fix-start-end ov (list nil start end))))
143 (setq start (cadr args)
144 end (caddr args) )))
145 (let ((pos-list (pregexp:pregexp-match-positions regex str start end))
146 (pos-mlist nil) )
147 (if pos-list
148 (dolist (pos pos-list (cons '(mlist simp) (nreverse pos-mlist)))
149 (when *parse-utf-8-input*
150 (let ((dec (regex-utf-8-pos-dec ov 0 (car pos))))
151 (decf (cdr pos) (+ dec (regex-utf-8-pos-dec ov (car pos) (cdr pos))))
152 (decf (car pos) dec) ))
153 (push `((mlist simp) ,(1+ (car pos)) ,(1+ (cdr pos))) pos-mlist) )
154 (return-from $regex_match_pos nil) )))
155 (regex-index-error "regex_match_pos") )))
158 (defmfun $regex_match (regex str &optional (start 1) (end nil))
159 (setq regex (regex-check-and-maybe-coerce "regex_match" regex str))
160 (or (ignore-errors
161 (when *parse-utf-8-input*
162 (let* ((ov (string-to-raw-bytes str))
163 (args (utf-8-fix-start-end ov (list nil start end))) )
164 (setq start (cadr args)
165 end (caddr args) )))
166 (let ((match
167 (pregexp:pregexp-match regex str (1- start) (if end (1- end) nil)) ))
168 (if match
169 (cons '(mlist simp) match)
170 (return-from $regex_match nil) )))
171 (regex-index-error "regex_match") ))
174 (defmfun $regex_split (regex str)
175 (setq regex (regex-check-and-maybe-coerce "regex_split" regex str))
176 (cons '(mlist simp) (pregexp:pregexp-split regex str)) )
179 (defmfun $regex_subst_first (replacement regex str)
180 (setq regex (regex-check-and-maybe-coerce "regex_subst_first" regex str replacement))
181 (pregexp:pregexp-replace regex str replacement) )
183 ;; Argument order different to the order of pregexp-replace.
184 ;; Use order like in $ssubst or substitute: new, old, str.
186 (defmfun $regex_subst (replacement regex str)
187 (setq regex (regex-check-and-maybe-coerce "regex_subst" regex str replacement))
188 (pregexp:pregexp-replace* regex str replacement) )
191 (defmfun $string_to_regex (str)
192 (unless (stringp str)
193 (gf-merror (intl:gettext "`string_to_regex': Argument must be a string.")) )
194 (pregexp:pregexp-quote str) )