src/tokenizer.c

   1 /*-
   2  * Copyright (c) 1992, 1993
   3  *  The Regents of the University of California.  All rights reserved.
   4  *
   5  * This code is derived from software contributed to Berkeley by
   6  * Christos Zoulas of Cornell University.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  * 3. Neither the name of the University nor the names of its contributors
  17  *    may be used to endorse or promote products derived from this software
  18  *    without specific prior written permission.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30  * SUCH DAMAGE.
  31  *
  32  *  NetBSD: tokenizer.c,v 1.14 2003/12/05 13:37:48 lukem Exp
  33  */
  34
  35 #include "config.h"
  36 #include <string.h>
  37 #include <stdlib.h>
  38 #include "tokenizer.h"
  39 #include "util.h"
  40
/* Default set of word-separating characters: tab, space and newline. */
#define IFS "\t \n"
/* Bits kept in tokenizer.flags.
 * TOK_KEEP: tok_finish() emits the current word even when it is empty;
 *           set whenever a quote character or backslash is seen.
 * TOK_EAT:  an escaped newline was just consumed; if the input ends while
 *           this is still set, tok_line() returns 3.
 */
#define TOK_KEEP 1
#define TOK_EAT  2
#define WINCR    20  /* how much working space to allocate at one time */
#define AINCR    10  /* how many argument slots to allocate at one time */

/* Memory primitives used by the tokenizer. Allocation goes through the
 * run2_* helpers; releasing memory uses plain free().
 */
#define tok_strdup(a)    run2_strdup (a)
#define tok_malloc(a)    run2_malloc (a)
#define tok_free(a)      free (a)
#define tok_realloc(a,b) run2_realloc (a, b)

/* The functions below are written with the short tok_* names; these
 * function-like macros rename both their definitions and their call
 * sites to the run2_-prefixed names.
 */
#define tok_init(a)        run2_tok_init (a)
#define tok_end(a)         run2_tok_end (a)
#define tok_reset(a)       run2_tok_reset (a)
#define tok_str(a,b,c,d)   run2_tok_str (a,b,c,d)
  56
  57 typedef enum
  58 {
  59   Q_none,
  60   Q_single,
  61   Q_double,
  62   Q_one,
  63   Q_doubleone
  64 } quote_t;
  65
  66 struct tokenizer
  67 {
  68   char *ifs;                    /* In field separator                 */
  69   int argc, amax;               /* Current and maximum number of args */
  70   char **argv;                  /* Argument list                      */
  71   char *wptr, *wmax;            /* Space and limit on the word buffer */
  72   char *wstart;                 /* Beginning of next word             */
  73   char *wspace;                 /* Space of word buffer               */
  74   quote_t quote;                /* Quoting state                      */
  75   int flags;                    /* flags;                             */
  76 };
  77
  78 int
  79 run2_split_string (const char *str, int *argc, const char ***argv)
  80 {
  81   Tokenizer *tok;
  82   int rc, i;
  83   int l_argc = 0;
  84   const char **l_argv = NULL;
  85
  86   if (!argv || !argc)
  87     {
  88       errorMsg ("split_string: Null arguments");
  89       return 1;
  90     }
  91   if (!str)
  92     {
  93       *argc = 0;
  94       *argv = NULL;
  95       warnMsg ("split_string: Null string");
  96       return 0;
  97     }
  98
  99   tok = tok_init (IFS);
 100   if (!tok)
 101     {
 102       errorMsg ("Unable to initialize tokenizer");
 103       return 1;
 104     }
 105
 106   if ((rc = tok_str (tok, str, &l_argc, &l_argv)))
 107     {
 108       tok_end (tok);
 109       tok = NULL;
 110     }
 111
 112   switch (rc)
 113     {
 114     case -1:
 115       errorMsg ("internal error while parsing |%s|", str);
 116       break;
 117     case 3:
 118       errorMsg ("string ended with an escaped newline |%s|", str);
 119       break;
 120     case 2:
 121       errorMsg ("unmatched double quote in |%s|", str);
 122       break;
 123     case 1:
 124       errorMsg ("unmatched single quote in |%s|", str);
 125       break;
 126     case 0:
 127       debugMsg (2, "(%s) Parsed string |%s|", __func__, str);
 128       for (i = 0; i < l_argc; i++)
 129         {
 130           debugMsg (2, "(%s)\targv[%d]: |%s|", __func__, i, (l_argv[i] ? l_argv[i] : ""));
 131         }
 132       break;
 133     default:
 134       errorMsg ("unknown error (%d) parsing |%s|", rc, str);
 135       break;
 136     }
 137   if (rc)
 138     return rc;
 139
 140   /* Success. It's time to clean up. But first we have to
 141    * copy l_argv (which points into internal storage of tok)
 142    * into the output argv.
 143    */
 144   *argv = (const char **) run2_malloc ((l_argc + 1) * sizeof (const char *));
 145   for (i = 0; i < l_argc; i++)
 146     {
 147       (*argv)[i] = run2_strdup (l_argv[i]);
 148     }
 149   (*argv)[l_argc] = NULL;
 150   *argc = l_argc;
 151
 152   tok_end (tok);
 153   tok = NULL;
 154   return rc;
 155 }
 156
 157 /* tok_finish():
 158  * Finish a word in the tokenizer.
 159  */
 160 static void
 161 tok_finish (Tokenizer * tok)
 162 {
 163   *(tok->wptr) = '\0';
 164   if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart)
 165     {
 166       tok->argv[tok->argc++] = tok->wstart;
 167       tok->argv[tok->argc] = NULL;
 168       tok->wstart = ++tok->wptr;
 169     }
 170   tok->flags &= ~TOK_KEEP;
 171 }
 172
 173 /* tok_init():
 174  * Initialize the tokenizer
 175  */
 176 Tokenizer *
 177 tok_init (const char *ifs)
 178 {
 179   Tokenizer *tok = (Tokenizer *) tok_malloc (sizeof (Tokenizer));
 180
 181   if (tok == NULL)
 182     return NULL;
 183   tok->ifs = tok_strdup (ifs ? ifs : IFS);
 184   if (tok->ifs == NULL)
 185     {
 186       tok_free ((void *) tok);
 187       return NULL;
 188     }
 189   tok->argc = 0;
 190   tok->amax = AINCR;
 191   tok->argv = (char **) tok_malloc (sizeof (char *) * tok->amax);
 192   if (tok->argv == NULL)
 193     {
 194       tok_free ((void *) tok->ifs);
 195       tok_free ((void *) tok);
 196       return NULL;
 197     }
 198   tok->argv[0] = NULL;
 199   tok->wspace = (char *) tok_malloc (WINCR);
 200   if (tok->wspace == NULL)
 201     {
 202       tok_free ((void *) tok->argv);
 203       tok_free ((void *) tok->ifs);
 204       tok_free ((void *) tok);
 205       return NULL;
 206     }
 207   tok->wmax = tok->wspace + WINCR;
 208   tok->wstart = tok->wspace;
 209   tok->wptr = tok->wspace;
 210   tok->flags = 0;
 211   tok->quote = Q_none;
 212
 213   return (tok);
 214 }
 215
 216 /* tok_reset():
 217  * Reset the tokenizer
 218  */
 219 void
 220 tok_reset (Tokenizer * tok)
 221 {
 222   tok->argc = 0;
 223   tok->wstart = tok->wspace;
 224   tok->wptr = tok->wspace;
 225   tok->flags = 0;
 226   tok->quote = Q_none;
 227 }
 228
 229 /* tok_end():
 230  * Clean up
 231  */
 232 void
 233 tok_end (Tokenizer * tok)
 234 {
 235   if (tok)
 236     {
 237       tok_free ((void *) tok->ifs);
 238       tok_free ((void *) tok->wspace);
 239       tok_free ((void *) tok->argv);
 240       tok_free ((void *) tok);
 241     }
 242 }
 243
 244 /*
 245  * tok_line():
 246  * Bourne shell (sh(1)) like tokenizing
 247  * Arguments:
 248  *      tok     current tokenizer state (setup with tok_init())
 249  *      line    line to parse
 250  * Returns:
 251  *      -1      Internal error
 252  *       3      Quoted return
 253  *       2      Unmatched double quote
 254  *       1      Unmatched single quote
 255  *       0      Ok
 256  * Modifies (if return value is 0):
 257  *      argc    number of arguments
 258  *      argv    argument array
 259  *      cursorc if !NULL, argv element containing cursor
 260  *      cursorv if !NULL, offset in argv[cursorc] of cursor
 261  * NOTE: Made this function static because it is used only by (a) tok_str,
 262  *       or libeditline functions in the original library from which this
 263  *       code derived.
 264  */
 265 static int
 266 tok_line (Tokenizer * tok,
 267           const LineInfo * line,
 268           int *argc, const char ***argv, int *cursorc, int *cursoro)
 269 {
 270   const char *ptr;
 271   int cc, co;
 272
 273   cc = co = -1;
 274   ptr = line->buffer;
 275   for (ptr = line->buffer;; ptr++)
 276     {
 277       if (ptr >= line->lastchar)
 278         ptr = "";
 279       if (ptr == line->cursor)
 280         {
 281           cc = tok->argc;
 282           /* the offset to a position within a word and
 283            * the offset to the start of that word are both
 284            * guaranteed positive. Therefore, their difference
 285            * will not overflow.
 286            */
 287           co = (int) (tok->wptr - tok->wstart);
 288         }
 289       switch (*ptr)
 290         {
 291         case '\'':
 292           tok->flags |= TOK_KEEP;
 293           tok->flags &= ~TOK_EAT;
 294           switch (tok->quote)
 295             {
 296             case Q_none:       /* Enter single quote mode */
 297               tok->quote = Q_single;
 298               break;
 299
 300             case Q_single:     /* Exit single quote mode */
 301               tok->quote = Q_none;
 302               break;
 303
 304             case Q_one:        /* Quote this ' */
 305               tok->quote = Q_none;
 306               *tok->wptr++ = *ptr;
 307               break;
 308
 309             case Q_double:     /* Stay in double quote mode */
 310               *tok->wptr++ = *ptr;
 311               break;
 312
 313             case Q_doubleone:  /* Quote this ' */
 314               tok->quote = Q_double;
 315               *tok->wptr++ = *ptr;
 316               break;
 317
 318             default:
 319               return (-1);
 320             }
 321           break;
 322
 323         case '"':
 324           tok->flags &= ~TOK_EAT;
 325           tok->flags |= TOK_KEEP;
 326           switch (tok->quote)
 327             {
 328             case Q_none:       /* Enter double quote mode */
 329               tok->quote = Q_double;
 330               break;
 331
 332             case Q_double:     /* Exit double quote mode */
 333               tok->quote = Q_none;
 334               break;
 335
 336             case Q_one:        /* Quote this " */
 337               tok->quote = Q_none;
 338               *tok->wptr++ = *ptr;
 339               break;
 340
 341             case Q_single:     /* Stay in single quote mode */
 342               *tok->wptr++ = *ptr;
 343               break;
 344
 345             case Q_doubleone:  /* Quote this " */
 346               tok->quote = Q_double;
 347               *tok->wptr++ = *ptr;
 348               break;
 349
 350             default:
 351               return (-1);
 352             }
 353           break;
 354
 355         case '\\':
 356           tok->flags |= TOK_KEEP;
 357           tok->flags &= ~TOK_EAT;
 358           switch (tok->quote)
 359             {
 360             case Q_none:       /* Quote next character */
 361               tok->quote = Q_one;
 362               break;
 363
 364             case Q_double:     /* Quote next character */
 365               tok->quote = Q_doubleone;
 366               break;
 367
 368             case Q_one:        /* Quote this, restore state */
 369               *tok->wptr++ = *ptr;
 370               tok->quote = Q_none;
 371               break;
 372
 373             case Q_single:     /* Stay in single quote mode */
 374               *tok->wptr++ = *ptr;
 375               break;
 376
 377             case Q_doubleone:  /* Quote this \ */
 378               tok->quote = Q_double;
 379               *tok->wptr++ = *ptr;
 380               break;
 381
 382             default:
 383               return (-1);
 384             }
 385           break;
 386
 387         case '\n':
 388           tok->flags &= ~TOK_EAT;
 389           switch (tok->quote)
 390             {
 391             case Q_none:
 392               goto tok_line_outok;
 393
 394             case Q_single:
 395             case Q_double:
 396               *tok->wptr++ = *ptr;      /* Add the return */
 397               break;
 398
 399             case Q_doubleone:  /* Back to double, eat the '\n' */
 400               tok->flags |= TOK_EAT;
 401               tok->quote = Q_double;
 402               break;
 403
 404             case Q_one:        /* No quote, more eat the '\n' */
 405               tok->flags |= TOK_EAT;
 406               tok->quote = Q_none;
 407               break;
 408
 409             default:
 410               return (0);
 411             }
 412           break;
 413
 414         case '\0':
 415           switch (tok->quote)
 416             {
 417             case Q_none:
 418               /* Finish word and return */
 419               if (tok->flags & TOK_EAT)
 420                 {
 421                   tok->flags &= ~TOK_EAT;
 422                   return (3);
 423                 }
 424               goto tok_line_outok;
 425
 426             case Q_single:
 427               return (1);
 428
 429             case Q_double:
 430               return (2);
 431
 432             case Q_doubleone:
 433               tok->quote = Q_double;
 434               *tok->wptr++ = *ptr;
 435               break;
 436
 437             case Q_one:
 438               tok->quote = Q_none;
 439               *tok->wptr++ = *ptr;
 440               break;
 441
 442             default:
 443               return (-1);
 444             }
 445           break;
 446
 447         default:
 448           tok->flags &= ~TOK_EAT;
 449           switch (tok->quote)
 450             {
 451             case Q_none:
 452               if (strchr (tok->ifs, *ptr) != NULL)
 453                 tok_finish (tok);
 454               else
 455                 *tok->wptr++ = *ptr;
 456               break;
 457
 458             case Q_single:
 459             case Q_double:
 460               *tok->wptr++ = *ptr;
 461               break;
 462
 463
 464             case Q_doubleone:
 465               *tok->wptr++ = '\\';
 466               tok->quote = Q_double;
 467               *tok->wptr++ = *ptr;
 468               break;
 469
 470             case Q_one:
 471               tok->quote = Q_none;
 472               *tok->wptr++ = *ptr;
 473               break;
 474
 475             default:
 476               return (-1);
 477
 478             }
 479           break;
 480         }
 481
 482       if (tok->wptr >= tok->wmax - 4)
 483         {
 484           size_t size = tok->wmax - tok->wspace + WINCR;
 485           char *s = (char *) tok_realloc (tok->wspace, size);
 486           if (s == NULL)
 487             return (-1);
 488
 489           if (s != tok->wspace)
 490             {
 491               int i;
 492               for (i = 0; i < tok->argc; i++)
 493                 {
 494                   tok->argv[i] = (tok->argv[i] - tok->wspace) + s;
 495                 }
 496               tok->wptr = (tok->wptr - tok->wspace) + s;
 497               tok->wstart = (tok->wstart - tok->wspace) + s;
 498               tok->wspace = s;
 499             }
 500           tok->wmax = s + size;
 501         }
 502       if (tok->argc >= tok->amax - 4)
 503         {
 504           char **p;
 505           tok->amax += AINCR;
 506           p = (char **) tok_realloc (tok->argv, tok->amax * sizeof (char *));
 507           if (p == NULL)
 508             return (-1);
 509           tok->argv = p;
 510         }
 511     }
 512 tok_line_outok:
 513   if (cc == -1 && co == -1)
 514     {
 515       cc = tok->argc;
 516       /* the offset to a position within a word and
 517        * the offset to the start of that word are both
 518        * guaranteed positive. Therefore, their difference
 519        * will not overflow.
 520        */
 521       co = (int) (tok->wptr - tok->wstart);
 522     }
 523   if (cursorc != NULL)
 524     *cursorc = cc;
 525   if (cursoro != NULL)
 526     *cursoro = co;
 527   tok_finish (tok);
 528   *argv = (const char **) tok->argv;
 529   *argc = tok->argc;
 530   return (0);
 531 }
 532
 533 /* tok_str
 534  *  Simpler version of tok_line, taking a NUL terminated line
 535  *  and splitting into words, ignoring cursor state.
 536  */
 537 int
 538 tok_str (Tokenizer * tok, const char *line, int *argc, const char ***argv)
 539 {
 540   LineInfo li;
 541
 542   memset (&li, 0, sizeof (li));
 543   li.buffer = line;
 544   li.cursor = li.lastchar = strchr (line, '\0');
 545   return (tok_line (tok, &li, argc, argv, NULL, NULL));
 546 }