quote.c

   1 /* ----------------------------------------------------------------------- *
   2  *
   3  *   Copyright 1996-2009 The NASM Authors - All Rights Reserved
   4  *   See the file AUTHORS included with the NASM distribution for
   5  *   the specific copyright holders.
   6  *
   7  *   Redistribution and use in source and binary forms, with or without
   8  *   modification, are permitted provided that the following
   9  *   conditions are met:
  10  *
  11  *   * Redistributions of source code must retain the above copyright
  12  *     notice, this list of conditions and the following disclaimer.
  13  *   * Redistributions in binary form must reproduce the above
  14  *     copyright notice, this list of conditions and the following
  15  *     disclaimer in the documentation and/or other materials provided
  16  *     with the distribution.
  17  *
  18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
  19  *     CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
  20  *     INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  21  *     MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  22  *     DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  23  *     CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  24  *     SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  25  *     NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  26  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  *     HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28  *     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  29  *     OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  30  *     EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31  *
  32  * ----------------------------------------------------------------------- */
  33
  34 /*
  35  * quote.c
  36  */
  37
  38 #include "compiler.h"
  39
  40 #include <assert.h>
  41 #include <stdlib.h>
  42
  43 #include "nasmlib.h"
  44 #include "quote.h"
  45
  46 #define numvalue(c)  ((c)>='a' ? (c)-'a'+10 : (c)>='A' ? (c)-'A'+10 : (c)-'0')
  47
  48 char *nasm_quote(char *str, size_t len)
  49 {
  50     char c, c1, *p, *q, *nstr, *ep;
  51     bool sq_ok, dq_ok;
  52     size_t qlen;
  53
  54     sq_ok = dq_ok = true;
  55     ep = str+len;
  56     qlen = 0;                   /* Length if we need `...` quotes */
  57     for (p = str; p < ep; p++) {
  58         c = *p;
  59         switch (c) {
  60         case '\'':
  61             sq_ok = false;
  62             qlen++;
  63             break;
  64         case '\"':
  65             dq_ok = false;
  66             qlen++;
  67             break;
  68         case '`':
  69         case '\\':
  70             qlen += 2;
  71             break;
  72         default:
  73             if (c < ' ' || c > '~') {
  74                 sq_ok = dq_ok = false;
  75                 switch (c) {
  76                 case '\a':
  77                 case '\b':
  78                 case '\t':
  79                 case '\n':
  80                 case '\v':
  81                 case '\f':
  82                 case '\r':
  83                 case 27:
  84                     qlen += 2;
  85                     break;
  86                 default:
  87                     c1 = (p+1 < ep) ? p[1] : 0;
  88                     if (c > 077 || (c1 >= '0' && c1 <= '7'))
  89                         qlen += 4; /* Must use the full form */
  90                     else if (c > 07)
  91                         qlen += 3;
  92                     else
  93                         qlen += 2;
  94                     break;
  95                 }
  96             } else {
  97                 qlen++;
  98             }
  99             break;
 100         }
 101     }
 102
 103     if (sq_ok || dq_ok) {
 104         /* Use '...' or "..." */
 105         nstr = nasm_malloc(len+3);
 106         nstr[0] = nstr[len+1] = sq_ok ? '\'' : '\"';
 107         nstr[len+2] = '\0';
 108         memcpy(nstr+1, str, len);
 109     } else {
 110         /* Need to use `...` quoted syntax */
 111         nstr = nasm_malloc(qlen+3);
 112         q = nstr;
 113         *q++ = '`';
 114         for (p = str; p < ep; p++) {
 115             c = *p;
 116             switch (c) {
 117             case '`':
 118             case '\\':
 119                 *q++ = '\\';
 120                 *q++ = c;
 121                 break;
 122             case 7:
 123                 *q++ = '\\';
 124                 *q++ = 'a';
 125                 break;
 126             case 8:
 127                 *q++ = '\\';
 128                 *q++ = 'b';
 129                 break;
 130             case 9:
 131                 *q++ = '\\';
 132                 *q++ = 't';
 133                 break;
 134             case 10:
 135                 *q++ = '\\';
 136                 *q++ = 'n';
 137                 break;
 138             case 11:
 139                 *q++ = '\\';
 140                 *q++ = 'v';
 141                 break;
 142             case 12:
 143                 *q++ = '\\';
 144                 *q++ = 'f';
 145                 break;
 146             case 13:
 147                 *q++ = '\\';
 148                 *q++ = 'r';
 149                 break;
 150             case 27:
 151                 *q++ = '\\';
 152                 *q++ = 'e';
 153                 break;
 154             default:
 155                 if (c < ' ' || c > '~') {
 156                     c1 = (p+1 < ep) ? p[1] : 0;
 157                     if (c1 >= '0' && c1 <= '7')
 158                         q += sprintf(q, "\\%03o", (unsigned char)c);
 159                     else
 160                         q += sprintf(q, "\\%o", (unsigned char)c);
 161                 } else {
 162                     *q++ = c;
 163                 }
 164                 break;
 165             }
 166         }
 167         *q++ = '`';
 168         *q++ = '\0';
 169         assert((size_t)(q-nstr) == qlen+3);
 170     }
 171     return nstr;
 172 }
 173
 174 static char *emit_utf8(char *q, int32_t v)
 175 {
 176     if (v < 0) {
 177         /* Impossible - do nothing */
 178     } else if (v <= 0x7f) {
 179         *q++ = v;
 180     } else if (v <= 0x000007ff) {
 181         *q++ = 0xc0 | (v >> 6);
 182         *q++ = 0x80 | (v & 63);
 183     } else if (v <= 0x0000ffff) {
 184         *q++ = 0xe0 | (v >> 12);
 185         *q++ = 0x80 | ((v >> 6) & 63);
 186         *q++ = 0x80 | (v & 63);
 187     } else if (v <= 0x001fffff) {
 188         *q++ = 0xf0 | (v >> 18);
 189         *q++ = 0x80 | ((v >> 12) & 63);
 190         *q++ = 0x80 | ((v >> 6) & 63);
 191         *q++ = 0x80 | (v & 63);
 192     } else if (v <= 0x03ffffff) {
 193         *q++ = 0xf8 | (v >> 24);
 194         *q++ = 0x80 | ((v >> 18) & 63);
 195         *q++ = 0x80 | ((v >> 12) & 63);
 196         *q++ = 0x80 | ((v >> 6) & 63);
 197         *q++ = 0x80 | (v & 63);
 198     } else {
 199         *q++ = 0xfc | (v >> 30);
 200         *q++ = 0x80 | ((v >> 24) & 63);
 201         *q++ = 0x80 | ((v >> 18) & 63);
 202         *q++ = 0x80 | ((v >> 12) & 63);
 203         *q++ = 0x80 | ((v >> 6) & 63);
 204         *q++ = 0x80 | (v & 63);
 205     }
 206     return q;
 207 }
 208
 209 /*
 210  * Do an *in-place* dequoting of the specified string, returning the
 211  * resulting length (which may be containing embedded nulls.)
 212  *
 213  * In-place replacement is possible since the unquoted length is always
 214  * shorter than or equal to the quoted length.
 215  *
 216  * *ep points to the final quote, or to the null if improperly quoted.
 217  */
 218 size_t nasm_unquote(char *str, char **ep)
 219 {
 220     char bq;
 221     char *p, *q;
 222     char *escp = NULL;
 223     char c;
 224     enum unq_state {
 225         st_start,
 226         st_backslash,
 227         st_hex,
 228         st_oct,
 229         st_ucs,
 230     } state;
 231     int ndig = 0;
 232     int32_t nval = 0;
 233
 234     p = q = str;
 235
 236     bq = *p++;
 237     if (!bq)
 238         return 0;
 239
 240     switch (bq) {
 241     case '\'':
 242     case '\"':
 243         /* '...' or "..." string */
 244         while ((c = *p) && c != bq) {
 245             p++;
 246             *q++ = c;
 247         }
 248         *q = '\0';
 249         break;
 250
 251     case '`':
 252         /* `...` string */
 253         state = st_start;
 254
 255         while ((c = *p)) {
 256             p++;
 257             switch (state) {
 258             case st_start:
 259                 switch (c) {
 260                 case '\\':
 261                     state = st_backslash;
 262                     break;
 263                 case '`':
 264                     p--;
 265                     goto out;
 266                 default:
 267                     *q++ = c;
 268                     break;
 269                 }
 270                 break;
 271
 272             case st_backslash:
 273                 state = st_start;
 274                 escp = p;       /* Beginning of argument sequence */
 275                 nval = 0;
 276                 switch (c) {
 277                 case 'a':
 278                     *q++ = 7;
 279                     break;
 280                 case 'b':
 281                     *q++ = 8;
 282                     break;
 283                 case 'e':
 284                     *q++ = 27;
 285                     break;
 286                 case 'f':
 287                     *q++ = 12;
 288                     break;
 289                 case 'n':
 290                     *q++ = 10;
 291                     break;
 292                 case 'r':
 293                     *q++ = 13;
 294                     break;
 295                 case 't':
 296                     *q++ = 9;
 297                     break;
 298                 case 'u':
 299                     state = st_ucs;
 300                     ndig = 4;
 301                     break;
 302                 case 'U':
 303                     state = st_ucs;
 304                     ndig = 8;
 305                     break;
 306                 case 'v':
 307                     *q++ = 11;
 308                     break;
 309                 case 'x':
 310                 case 'X':
 311                     state = st_hex;
 312                     ndig = 2;
 313                     break;
 314                 case '0':
 315                 case '1':
 316                 case '2':
 317                 case '3':
 318                 case '4':
 319                 case '5':
 320                 case '6':
 321                 case '7':
 322                     state = st_oct;
 323                     ndig = 2;   /* Up to two more digits */
 324                     nval = c - '0';
 325                     break;
 326                 default:
 327                     *q++ = c;
 328                     break;
 329                 }
 330                 break;
 331
 332             case st_oct:
 333                 if (c >= '0' && c <= '7') {
 334                     nval = (nval << 3) + (c - '0');
 335                     if (!--ndig) {
 336                         *q++ = nval;
 337                         state = st_start;
 338                     }
 339                 } else {
 340                     p--;        /* Process this character again */
 341                     *q++ = nval;
 342                     state = st_start;
 343                 }
 344                 break;
 345
 346             case st_hex:
 347                 if ((c >= '0' && c <= '9') ||
 348                     (c >= 'A' && c <= 'F') ||
 349                     (c >= 'a' && c <= 'f')) {
 350                     nval = (nval << 4) + numvalue(c);
 351                     if (!--ndig) {
 352                         *q++ = nval;
 353                         state = st_start;
 354                     }
 355                 } else {
 356                     p--;        /* Process this character again */
 357                     *q++ = (p > escp) ? nval : escp[-1];
 358                     state = st_start;
 359                 }
 360                 break;
 361
 362             case st_ucs:
 363                 if ((c >= '0' && c <= '9') ||
 364                     (c >= 'A' && c <= 'F') ||
 365                     (c >= 'a' && c <= 'f')) {
 366                     nval = (nval << 4) + numvalue(c);
 367                     if (!--ndig) {
 368                         q = emit_utf8(q, nval);
 369                         state = st_start;
 370                     }
 371                 } else {
 372                     p--;        /* Process this character again */
 373                     if (p > escp)
 374                         q = emit_utf8(q, nval);
 375                     else
 376                         *q++ = escp[-1];
 377                     state = st_start;
 378                 }
 379                 break;
 380             }
 381         }
 382         switch (state) {
 383         case st_start:
 384         case st_backslash:
 385             break;
 386         case st_oct:
 387             *q++ = nval;
 388             break;
 389         case st_hex:
 390             *q++ = (p > escp) ? nval : escp[-1];
 391             break;
 392         case st_ucs:
 393             if (p > escp)
 394                 q = emit_utf8(q, nval);
 395             else
 396                 *q++ = escp[-1];
 397             break;
 398         }
 399     out:
 400         break;
 401
 402     default:
 403         /* Not a quoted string, just return the input... */
 404         p = q = strchr(str, '\0');
 405         break;
 406     }
 407
 408     if (ep)
 409         *ep = p;
 410     return q-str;
 411 }
 412
 413 /*
 414  * Find the end of a quoted string; returns the pointer to the terminating
 415  * character (either the ending quote or the null character, if unterminated.)
 416  */
 417 char *nasm_skip_string(char *str)
 418 {
 419     char bq;
 420     char *p;
 421     char c;
 422     enum unq_state {
 423         st_start,
 424         st_backslash,
 425     } state;
 426
 427     bq = str[0];
 428     if (bq == '\'' || bq == '\"') {
 429         /* '...' or "..." string */
 430         for (p = str+1; *p && *p != bq; p++)
 431             ;
 432         return p;
 433     } else if (bq == '`') {
 434         /* `...` string */
 435         p = str+1;
 436         state = st_start;
 437
 438         while ((c = *p++)) {
 439             switch (state) {
 440             case st_start:
 441                 switch (c) {
 442                 case '\\':
 443                     state = st_backslash;
 444                     break;
 445                 case '`':
 446                     return p-1; /* Found the end */
 447                 default:
 448                     break;
 449                 }
 450                 break;
 451
 452             case st_backslash:
 453                 /*
 454                  * Note: for the purpose of finding the end of the string,
 455                  * all successor states to st_backslash are functionally
 456                  * equivalent to st_start, since either a backslash or
 457                  * a backquote will force a return to the st_start state.
 458                  */
 459                 state = st_start;
 460                 break;
 461             }
 462         }
 463         return p;               /* Unterminated string... */
 464     } else {
 465         return str;             /* Not a string... */
 466     }
 467 }