Diagnostics: make debug more dynamic, note -> info, add listmsg level
[nasm.git] / asm / quote.c
blob58bb5a108e79d0fc1e9e504e9d1457b097659e7f
1 /* ----------------------------------------------------------------------- *
3 * Copyright 1996-2019 The NASM Authors - All Rights Reserved
4 * See the file AUTHORS included with the NASM distribution for
5 * the specific copyright holders.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following
9 * conditions are met:
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above
14 * copyright notice, this list of conditions and the following
15 * disclaimer in the documentation and/or other materials provided
16 * with the distribution.
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
19 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
20 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
21 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
29 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
30 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 * ----------------------------------------------------------------------- */
35 * quote.c
38 #include "compiler.h"
39 #include "nasmlib.h"
40 #include "quote.h"
41 #include "nctype.h"
42 #include "error.h"
/*
 * Create a NASM quoted string in newly allocated memory. Update the
 * *lenp parameter with the output length (sans final NUL).
 *
 * str   - the bytes to quote; need not be NUL-terminated
 * lenp  - on entry the number of bytes in str, on exit the length of
 *         the quoted result, including the quote characters but not
 *         the terminating NUL
 *
 * The simplest usable quoting style is chosen: '...' if the input
 * contains no single quote, otherwise "..." if it contains no double
 * quote. Any character outside the range ' ' .. '~' forces the
 * `...` style, in which such characters are written as escapes.
 *
 * The returned buffer comes from nasm_malloc().
 */
char *nasm_quote(const char *str, size_t *lenp)
{
    const char *p, *ep;
    char c, c1, *q, *nstr;
    unsigned char uc;
    bool sq_ok, dq_ok;
    size_t qlen;
    size_t len = *lenp;

    /*
     * Pass 1: find out which quote styles are usable, and how long
     * the body would be if `...` quoting turns out to be required.
     */
    sq_ok = dq_ok = true;
    ep = str + len;
    qlen = 0;                   /* Length if we need `...` quotes */
    for (p = str; p < ep; p++) {
        c = *p;
        switch (c) {
        case '\'':
            sq_ok = false;
            qlen++;
            break;
        case '\"':
            dq_ok = false;
            qlen++;
            break;
        case '`':
        case '\\':
            qlen += 2;          /* Backslash + the character itself */
            break;
        default:
            /*
             * This test catches control characters and everything
             * above '~', whether plain char is signed or unsigned.
             */
            if (c < ' ' || c > '~') {
                sq_ok = dq_ok = false;
                switch (c) {
                case '\a':
                case '\b':
                case '\t':
                case '\n':
                case '\v':
                case '\f':
                case '\r':
                case 27:
                    qlen += 2;  /* Two-character named escape */
                    break;
                default:
                    /*
                     * Octal escape. The shortest form is used, unless
                     * the next input character is itself an octal
                     * digit, in which case all three digits are
                     * needed to keep it out of the escape.
                     */
                    c1 = (p + 1 < ep) ? p[1] : 0;
                    if (c1 >= '0' && c1 <= '7')
                        uc = 0377; /* Must use the full form */
                    else
                        uc = c;
                    if (uc > 077)
                        qlen++;
                    if (uc > 07)
                        qlen++;
                    qlen += 2;  /* Backslash + final digit */
                    break;
                }
            } else {
                qlen++;
            }
            break;
        }
    }

    if (sq_ok || dq_ok) {
        /* Use '...' or "..." */
        nstr = nasm_malloc(len + 3);
        nstr[0] = nstr[len + 1] = sq_ok ? '\'' : '\"';
        q = &nstr[len + 2];
        if (len > 0)
            memcpy(nstr + 1, str, len);
    } else {
        /* Need to use `...` quoted syntax */
        nstr = nasm_malloc(qlen + 3);
        q = nstr;
        *q++ = '`';
        /*
         * Pass 2: emit the escaped body. Every case here must produce
         * exactly the number of bytes pass 1 accounted for; the
         * assertion after the loop checks this.
         */
        for (p = str; p < ep; p++) {
            c = *p;
            switch (c) {
            case '`':
            case '\\':
                *q++ = '\\';
                *q++ = c;
                break;
            case '\a':
                *q++ = '\\';
                *q++ = 'a';
                break;
            case '\b':
                *q++ = '\\';
                *q++ = 'b';
                break;
            case '\t':
                *q++ = '\\';
                *q++ = 't';
                break;
            case '\n':
                *q++ = '\\';
                *q++ = 'n';
                break;
            case '\v':
                *q++ = '\\';
                *q++ = 'v';
                break;
            case '\f':
                *q++ = '\\';
                *q++ = 'f';
                break;
            case '\r':
                *q++ = '\\';
                *q++ = 'r';
                break;
            case 27:
                *q++ = '\\';
                *q++ = 'e';
                break;
            default:
                if (c < ' ' || c > '~') {
                    /* Same short-vs-full octal decision as in pass 1 */
                    c1 = (p + 1 < ep) ? p[1] : 0;
                    if (c1 >= '0' && c1 <= '7')
                        uc = 0377; /* Must use the full form */
                    else
                        uc = c;
                    *q++ = '\\';
                    if (uc > 077)
                        *q++ = ((unsigned char)c >> 6) + '0';
                    if (uc > 07)
                        *q++ = (((unsigned char)c >> 3) & 7) + '0';
                    *q++ = ((unsigned char)c & 7) + '0';
                } else {
                    *q++ = c;
                }
                break;
            }
        }
        *q++ = '`';
        nasm_assert((size_t)(q - nstr) == qlen + 2);
    }
    *q = '\0';
    *lenp = q - nstr;
    return nstr;
}
/*
 * Store the value v at q using the UTF-8 encoding scheme and return
 * the pointer just past the last byte written (1 to 6 bytes).
 *
 * Note: this is invalid even for "classic" (pre-UTF16) 31-bit
 * UTF-8 if the value is >= 0x80000000. This at least tries to do
 * something vaguely sensible with it. Caveat programmer.
 * The __utf*__ string transform functions do reject these
 * as invalid input.
 */
static unsigned char *emit_utf8(unsigned char *q, uint32_t v)
{
    unsigned int ncont;         /* Number of extension bytes */
    unsigned int lead;          /* Marker bits of the leading byte */

    if (v <= 0x7f) {
        *q++ = (unsigned char)v;
        return q;
    }

    if (v <= 0x7ff) {
        ncont = 1;
        lead = 0xc0;
    } else if (v <= 0xffff) {
        ncont = 2;
        lead = 0xe0;
    } else if (v <= 0x1fffff) {
        ncont = 3;
        lead = 0xf0;
    } else if (v <= 0x3ffffff) {
        ncont = 4;
        lead = 0xf8;
    } else {
        /*
         * The payload of the leading byte cannot be more than 3 here,
         * as a 32-bit value is shifted right by 5*6 = 30 bits.
         */
        ncont = 5;
        lead = 0xfc;
    }

    /* Leading byte: marker plus the bits above all extension bytes */
    *q++ = (unsigned char)(lead + (v >> (6 * ncont)));

    /* Emit extension bytes as appropriate, most significant first */
    while (ncont--)
        *q++ = (unsigned char)(0x80 + ((v >> (6 * ncont)) & 63));

    return q;
}
/*
 * Return a mask with bit number v set if v is below 32, otherwise 0.
 * The range check also keeps the shift count within the width of the
 * 32-bit operand.
 */
static inline uint32_t ctlbit(uint32_t v)
{
    return unlikely(v < 32) ? UINT32_C(1) << v : 0;
}
/*
 * Output helpers for the dequoting code. They expect the following
 * locals to be in scope at the point of use: badctl (mask of forbidden
 * control characters), ctlmask (mask of control characters seen so
 * far) and q (output pointer).
 *
 * CTL_ERR(c) records c in ctlmask and evaluates to nonzero if any
 * forbidden control character has been seen up to and including c.
 * Since ctlmask only ever accumulates bits, the result stays nonzero
 * for every later character once a forbidden one has been seen.
 */
#define CTL_ERR(c)                              \
    (badctl & (ctlmask |= ctlbit(c)))

/* Append c in UTF-8 encoded form at q, unless CTL_ERR() is raised */
#define EMIT_UTF8(c)                            \
    do {                                        \
        uint32_t ec = (c);                      \
        if (!CTL_ERR(ec))                       \
            q = emit_utf8(q, ec);               \
    } while (0)

/* Append c as a single byte at q, unless CTL_ERR() is raised */
#define EMIT(c)                                 \
    do {                                        \
        unsigned char ec = (c);                 \
        if (!CTL_ERR(ec))                       \
            *q++ = ec;                          \
    } while (0)
/*
 * Same as nasm_quote, but take the length of a C string;
 * the lenp argument is optional.
 *
 * str   - NUL-terminated string to quote
 * lenp  - if not NULL, receives the length of the quoted result
 *
 * Returns the quoted string produced by nasm_quote().
 */
char *nasm_quote_cstr(const char *str, size_t *lenp)
{
    size_t len = strlen(str);
    char *qstr = nasm_quote(str, &len);

    if (lenp)
        *lenp = len;
    return qstr;
}
/*
 * Do an *in-place* dequoting of the specified string, returning the
 * resulting length (which may be containing embedded nulls.)
 *
 * In-place replacement is possible since the unquoted length is always
 * shorter than or equal to the quoted length.
 *
 * *ep points to the final quote, or to the null if improperly quoted.
 * For an empty input string *ep points at str itself, i.e. at the null.
 * ep may be NULL if the caller does not need this information.
 *
 * Issue an error if the string contains control characters
 * corresponding to bits set in badctl; in that case, the output
 * string, but not *ep, is truncated before the first invalid
 * character.
 */
static size_t nasm_unquote_common(char *str, char **ep,
                                  const uint32_t badctl)
{
    unsigned char bq;
    const unsigned char *p;
    const unsigned char *escp = NULL;
    unsigned char *q;
    unsigned char c;
    uint32_t ctlmask = 0;       /* Mask of control characters seen */
    enum unq_state {
        st_start,
        st_backslash,
        st_hex,
        st_oct,
        st_ucs,
        st_done
    } state;
    int ndig = 0;
    uint32_t nval = 0;

    /* p reads ahead of q; both walk the same buffer */
    p = q = (unsigned char *)str;

    bq = *p++;
    if (!bq) {
        /* Empty input: nothing to dequote, but still report the end */
        if (ep)
            *ep = str;
        return 0;
    }

    switch (bq) {
    case '\'':
    case '\"':
        /* '...' or "..." string */
        while ((c = *p++) && (c != bq))
            EMIT(c);
        break;

    case '`':
        /* `...` string */
        state = st_start;

        while (state != st_done) {
            c = *p++;
            switch (state) {
            case st_start:
                switch (c) {
                case '\\':
                    state = st_backslash;
                    break;
                case '`':
                case '\0':
                    state = st_done;
                    break;
                default:
                    EMIT(c);
                    break;
                }
                break;

            case st_backslash:
                state = st_start;
                escp = p;       /* Beginning of argument sequence */
                nval = 0;
                switch (c) {
                case 'a':
                    nval = 7;
                    break;
                case 'b':
                    nval = 8;
                    break;
                case 'e':
                    nval = 27;
                    break;
                case 'f':
                    nval = 12;
                    break;
                case 'n':
                    nval = 10;
                    break;
                case 'r':
                    nval = 13;
                    break;
                case 't':
                    nval = 9;
                    break;
                case 'u':
                    state = st_ucs;
                    ndig = 4;
                    break;
                case 'U':
                    state = st_ucs;
                    ndig = 8;
                    break;
                case 'v':
                    nval = 11;
                    break;
                case 'x':
                case 'X':
                    state = st_hex;
                    ndig = 2;
                    break;
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                    state = st_oct;
                    ndig = 2;   /* Up to two more digits */
                    nval = c - '0';
                    break;
                case '\0':
                    nval = '\\';
                    p--;        /* Reprocess; terminates string */
                    break;
                default:
                    nval = c;
                    break;
                }
                /* Single-character escapes are complete at this point */
                if (state == st_start)
                    EMIT(nval);
                break;

            case st_oct:
                if (c >= '0' && c <= '7') {
                    nval = (nval << 3) + (c - '0');
                    if (--ndig)
                        break;  /* Might have more digits */
                } else {
                    p--;        /* Process this character again */
                }
                EMIT(nval);
                state = st_start;
                break;

            case st_hex:
            case st_ucs:
                if (nasm_isxdigit(c)) {
                    nval = (nval << 4) + numvalue(c);
                    if (--ndig)
                        break;  /* Might have more digits */
                } else {
                    p--;        /* Process this character again */
                }

                /*
                 * If no digit at all followed the escape letter, emit
                 * the letter itself (the character just before escp).
                 */
                if (unlikely(p <= escp))
                    EMIT(escp[-1]);
                else if (state == st_ucs)
                    EMIT_UTF8(nval);
                else
                    EMIT(nval);

                state = st_start;
                break;

            default:
                panic();
            }
        }
        break;

    default:
        /* Not a quoted string, just return the input... */
        while ((c = *p++))
            EMIT(c);
        break;
    }

    /* Zero-terminate the output */
    *q = '\0';

    if (ctlmask & badctl)
        nasm_nonfatal("control character in string not allowed here");

    /* p is one past the terminating quote or null */
    if (ep)
        *ep = (char *)p - 1;
    return (char *)q - str;
}

#undef EMIT
/*
 * Dequote str in place with no restriction on control characters
 * (the badctl mask passed on is 0). See nasm_unquote_common() for
 * the meaning of ep and of the return value.
 */
size_t nasm_unquote(char *str, char **ep)
{
    return nasm_unquote_common(str, ep, 0);
}
/*
 * Dequote str in place, rejecting every control character below 32
 * (including NUL) other than the few listed in the body. See
 * nasm_unquote_common() for the meaning of ep and of the return value.
 */
size_t nasm_unquote_cstr(char *str, char **ep)
{
    /*
     * These are the only control characters permitted: BEL BS TAB ESC
     */
    const uint32_t okctl = (UINT32_C(1) << '\a') | (UINT32_C(1) << '\b') |
                           (UINT32_C(1) << '\t') | (UINT32_C(1) << 27);

    return nasm_unquote_common(str, ep, ~okctl);
}
/*
 * Find the end of a quoted string; returns the pointer to the terminating
 * character (either the ending quote or the null character, if unterminated.)
 * If the input is not a quoted string, return NULL.
 */
char *nasm_skip_string(const char *str)
{
    const char bq = str[0];
    const char *p = str + 1;
    char c;

    switch (bq) {
    case '\'':
    case '\"':
        /* '...' or "..." string: no escapes, just look for the quote */
        while ((c = *p++) != '\0' && c != bq)
            ;
        break;

    case '`':
        /*
         * `...` string.
         *
         * Note: for the purpose of finding the end of the string,
         * it is enough to skip the one character following each
         * backslash: whatever escape it starts, only a backslash or
         * a backquote matters afterwards, and both are handled by
         * the plain scan.
         */
        for (;;) {
            c = *p++;
            if (c == '`' || c == '\0')
                break;
            if (c == '\\' && *p++ == '\0')
                break;          /* Backslash at the very end of input */
        }
        break;

    default:
        /* Not a string at all... */
        return NULL;
    }

    /* p is one past the terminating quote or null */
    return (char *)p - 1;
}