lib/libedit/tokenizer.c

   1 /*      $OpenBSD: tokenizer.c,v 1.12 2010/06/30 00:05:35 nicm Exp $     */
   2 /*      $NetBSD: tokenizer.c,v 1.18 2010/01/03 18:27:10 christos Exp $  */
   3
   4 /*-
   5  * Copyright (c) 1992, 1993
   6  *      The Regents of the University of California.  All rights reserved.
   7  *
   8  * This code is derived from software contributed to Berkeley by
   9  * Christos Zoulas of Cornell University.
  10  *
  11  * Redistribution and use in source and binary forms, with or without
  12  * modification, are permitted provided that the following conditions
  13  * are met:
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  * 2. Redistributions in binary form must reproduce the above copyright
  17  *    notice, this list of conditions and the following disclaimer in the
  18  *    documentation and/or other materials provided with the distribution.
  19  * 3. Neither the name of the University nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  */
  35
  36 #include "config.h"
  37
  38 /* We build this file twice, once as NARROW, once as WIDE. */
/*
 * tokenizer.c: Bourne shell like tokenizer
 */
  42 #include <string.h>
  43 #include <stdlib.h>
  44 #include "histedit.h"
  45 #include "chartype.h"
  46
  47 typedef enum {
  48         Q_none, Q_single, Q_double, Q_one, Q_doubleone
  49 } quote_t;
  50
  51 #define TOK_KEEP        1
  52 #define TOK_EAT         2
  53
  54 #define WINCR           20
  55 #define AINCR           10
  56
  57 #define IFS             STR("\t \n")
  58
  59 #define tok_malloc(a)           malloc(a)
  60 #define tok_free(a)             free(a)
  61 #define tok_realloc(a, b)       realloc(a, b)
  62 #define tok_strdup(a)           Strdup(a)
  63
  64
  65 struct TYPE(tokenizer) {
  66         Char    *ifs;           /* In field separator                    */
  67         int      argc, amax;    /* Current and maximum number of args    */
  68         Char   **argv;          /* Argument list                         */
  69         Char    *wptr, *wmax;   /* Space and limit on the word buffer    */
  70         Char    *wstart;        /* Beginning of next word                */
  71         Char    *wspace;        /* Space of word buffer                  */
  72         quote_t  quote;         /* Quoting state                         */
  73         int      flags;         /* flags;                                */
  74 };
  75
  76
  77 private void FUN(tok,finish)(TYPE(Tokenizer) *);
  78
  79
  80 /* FUN(tok,finish)():
  81  *      Finish a word in the tokenizer.
  82  */
  83 private void
  84 FUN(tok,finish)(TYPE(Tokenizer) *tok)
  85 {
  86
  87         *tok->wptr = '\0';
  88         if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart) {
  89                 tok->argv[tok->argc++] = tok->wstart;
  90                 tok->argv[tok->argc] = NULL;
  91                 tok->wstart = ++tok->wptr;
  92         }
  93         tok->flags &= ~TOK_KEEP;
  94 }
  95
  96
  97 /* FUN(tok,init)():
  98  *      Initialize the tokenizer
  99  */
 100 public TYPE(Tokenizer) *
 101 FUN(tok,init)(const Char *ifs)
 102 {
 103         TYPE(Tokenizer) *tok = tok_malloc(sizeof(TYPE(Tokenizer)));
 104
 105         if (tok == NULL)
 106                 return NULL;
 107         tok->ifs = tok_strdup(ifs ? ifs : IFS);
 108         if (tok->ifs == NULL) {
 109                 tok_free((ptr_t)tok);
 110                 return NULL;
 111         }
 112         tok->argc = 0;
 113         tok->amax = AINCR;
 114         tok->argv = tok_malloc(sizeof(*tok->argv) * tok->amax);
 115         if (tok->argv == NULL) {
 116                 tok_free((ptr_t)tok->ifs);
 117                 tok_free((ptr_t)tok);
 118                 return NULL;
 119         }
 120         tok->argv[0] = NULL;
 121         tok->wspace = tok_malloc(WINCR * sizeof(*tok->wspace));
 122         if (tok->wspace == NULL) {
 123                 tok_free((ptr_t)tok->argv);
 124                 tok_free((ptr_t)tok->ifs);
 125                 tok_free((ptr_t)tok);
 126                 return NULL;
 127         }
 128         tok->wmax = tok->wspace + WINCR;
 129         tok->wstart = tok->wspace;
 130         tok->wptr = tok->wspace;
 131         tok->flags = 0;
 132         tok->quote = Q_none;
 133
 134         return (tok);
 135 }
 136
 137
 138 /* FUN(tok,reset)():
 139  *      Reset the tokenizer
 140  */
 141 public void
 142 FUN(tok,reset)(TYPE(Tokenizer) *tok)
 143 {
 144
 145         tok->argc = 0;
 146         tok->wstart = tok->wspace;
 147         tok->wptr = tok->wspace;
 148         tok->flags = 0;
 149         tok->quote = Q_none;
 150 }
 151
 152
 153 /* FUN(tok,end)():
 154  *      Clean up
 155  */
 156 public void
 157 FUN(tok,end)(TYPE(Tokenizer) *tok)
 158 {
 159
 160         tok_free((ptr_t) tok->ifs);
 161         tok_free((ptr_t) tok->wspace);
 162         tok_free((ptr_t) tok->argv);
 163         tok_free((ptr_t) tok);
 164 }
 165
 166
 167
 168 /* FUN(tok,line)():
 169  *      Bourne shell (sh(1)) like tokenizing
 170  *      Arguments:
 171  *              tok     current tokenizer state (setup with FUN(tok,init)())
 172  *              line    line to parse
 173  *      Returns:
 174  *              -1      Internal error
 175  *               3      Quoted return
 176  *               2      Unmatched double quote
 177  *               1      Unmatched single quote
 178  *               0      Ok
 179  *      Modifies (if return value is 0):
 180  *              argc    number of arguments
 181  *              argv    argument array
 182  *              cursorc if !NULL, argv element containing cursor
 183  *              cursorv if !NULL, offset in argv[cursorc] of cursor
 184  */
 185 public int
 186 FUN(tok,line)(TYPE(Tokenizer) *tok, const TYPE(LineInfo) *line,
 187     int *argc, const Char ***argv, int *cursorc, int *cursoro)
 188 {
 189         const Char *ptr;
 190         int cc, co;
 191
 192         cc = co = -1;
 193         ptr = line->buffer;
 194         for (ptr = line->buffer; ;ptr++) {
 195                 if (ptr >= line->lastchar)
 196                         ptr = STR("");
 197                 if (ptr == line->cursor) {
 198                         cc = tok->argc;
 199                         co = (int)(tok->wptr - tok->wstart);
 200                 }
 201                 switch (*ptr) {
 202                 case '\'':
 203                         tok->flags |= TOK_KEEP;
 204                         tok->flags &= ~TOK_EAT;
 205                         switch (tok->quote) {
 206                         case Q_none:
 207                                 tok->quote = Q_single;  /* Enter single quote
 208                                                          * mode */
 209                                 break;
 210
 211                         case Q_single:  /* Exit single quote mode */
 212                                 tok->quote = Q_none;
 213                                 break;
 214
 215                         case Q_one:     /* Quote this ' */
 216                                 tok->quote = Q_none;
 217                                 *tok->wptr++ = *ptr;
 218                                 break;
 219
 220                         case Q_double:  /* Stay in double quote mode */
 221                                 *tok->wptr++ = *ptr;
 222                                 break;
 223
 224                         case Q_doubleone:       /* Quote this ' */
 225                                 tok->quote = Q_double;
 226                                 *tok->wptr++ = *ptr;
 227                                 break;
 228
 229                         default:
 230                                 return (-1);
 231                         }
 232                         break;
 233
 234                 case '"':
 235                         tok->flags &= ~TOK_EAT;
 236                         tok->flags |= TOK_KEEP;
 237                         switch (tok->quote) {
 238                         case Q_none:    /* Enter double quote mode */
 239                                 tok->quote = Q_double;
 240                                 break;
 241
 242                         case Q_double:  /* Exit double quote mode */
 243                                 tok->quote = Q_none;
 244                                 break;
 245
 246                         case Q_one:     /* Quote this " */
 247                                 tok->quote = Q_none;
 248                                 *tok->wptr++ = *ptr;
 249                                 break;
 250
 251                         case Q_single:  /* Stay in single quote mode */
 252                                 *tok->wptr++ = *ptr;
 253                                 break;
 254
 255                         case Q_doubleone:       /* Quote this " */
 256                                 tok->quote = Q_double;
 257                                 *tok->wptr++ = *ptr;
 258                                 break;
 259
 260                         default:
 261                                 return (-1);
 262                         }
 263                         break;
 264
 265                 case '\\':
 266                         tok->flags |= TOK_KEEP;
 267                         tok->flags &= ~TOK_EAT;
 268                         switch (tok->quote) {
 269                         case Q_none:    /* Quote next character */
 270                                 tok->quote = Q_one;
 271                                 break;
 272
 273                         case Q_double:  /* Quote next character */
 274                                 tok->quote = Q_doubleone;
 275                                 break;
 276
 277                         case Q_one:     /* Quote this, restore state */
 278                                 *tok->wptr++ = *ptr;
 279                                 tok->quote = Q_none;
 280                                 break;
 281
 282                         case Q_single:  /* Stay in single quote mode */
 283                                 *tok->wptr++ = *ptr;
 284                                 break;
 285
 286                         case Q_doubleone:       /* Quote this \ */
 287                                 tok->quote = Q_double;
 288                                 *tok->wptr++ = *ptr;
 289                                 break;
 290
 291                         default:
 292                                 return (-1);
 293                         }
 294                         break;
 295
 296                 case '\n':
 297                         tok->flags &= ~TOK_EAT;
 298                         switch (tok->quote) {
 299                         case Q_none:
 300                                 goto tok_line_outok;
 301
 302                         case Q_single:
 303                         case Q_double:
 304                                 *tok->wptr++ = *ptr;    /* Add the return */
 305                                 break;
 306
 307                         case Q_doubleone:   /* Back to double, eat the '\n' */
 308                                 tok->flags |= TOK_EAT;
 309                                 tok->quote = Q_double;
 310                                 break;
 311
 312                         case Q_one:     /* No quote, more eat the '\n' */
 313                                 tok->flags |= TOK_EAT;
 314                                 tok->quote = Q_none;
 315                                 break;
 316
 317                         default:
 318                                 return (0);
 319                         }
 320                         break;
 321
 322                 case '\0':
 323                         switch (tok->quote) {
 324                         case Q_none:
 325                                 /* Finish word and return */
 326                                 if (tok->flags & TOK_EAT) {
 327                                         tok->flags &= ~TOK_EAT;
 328                                         return (3);
 329                                 }
 330                                 goto tok_line_outok;
 331
 332                         case Q_single:
 333                                 return (1);
 334
 335                         case Q_double:
 336                                 return (2);
 337
 338                         case Q_doubleone:
 339                                 tok->quote = Q_double;
 340                                 *tok->wptr++ = *ptr;
 341                                 break;
 342
 343                         case Q_one:
 344                                 tok->quote = Q_none;
 345                                 *tok->wptr++ = *ptr;
 346                                 break;
 347
 348                         default:
 349                                 return (-1);
 350                         }
 351                         break;
 352
 353                 default:
 354                         tok->flags &= ~TOK_EAT;
 355                         switch (tok->quote) {
 356                         case Q_none:
 357                                 if (Strchr(tok->ifs, *ptr) != NULL)
 358                                         FUN(tok,finish)(tok);
 359                                 else
 360                                         *tok->wptr++ = *ptr;
 361                                 break;
 362
 363                         case Q_single:
 364                         case Q_double:
 365                                 *tok->wptr++ = *ptr;
 366                                 break;
 367
 368
 369                         case Q_doubleone:
 370                                 *tok->wptr++ = '\\';
 371                                 tok->quote = Q_double;
 372                                 *tok->wptr++ = *ptr;
 373                                 break;
 374
 375                         case Q_one:
 376                                 tok->quote = Q_none;
 377                                 *tok->wptr++ = *ptr;
 378                                 break;
 379
 380                         default:
 381                                 return (-1);
 382
 383                         }
 384                         break;
 385                 }
 386
 387                 if (tok->wptr >= tok->wmax - 4) {
 388                         size_t size = tok->wmax - tok->wspace + WINCR;
 389                         Char *s = tok_realloc(tok->wspace,
 390                             size * sizeof(*s));
 391                         if (s == NULL)
 392                                 return (-1);
 393
 394                         if (s != tok->wspace) {
 395                                 int i;
 396                                 for (i = 0; i < tok->argc; i++) {
 397                                     tok->argv[i] =
 398                                         (tok->argv[i] - tok->wspace) + s;
 399                                 }
 400                                 tok->wptr = (tok->wptr - tok->wspace) + s;
 401                                 tok->wstart = (tok->wstart - tok->wspace) + s;
 402                                 tok->wspace = s;
 403                         }
 404                         tok->wmax = s + size;
 405                 }
 406                 if (tok->argc >= tok->amax - 4) {
 407                         Char **p;
 408                         tok->amax += AINCR;
 409                         p = tok_realloc(tok->argv, tok->amax * sizeof(*p));
 410                         if (p == NULL)
 411                                 return (-1);
 412                         tok->argv = p;
 413                 }
 414         }
 415  tok_line_outok:
 416         if (cc == -1 && co == -1) {
 417                 cc = tok->argc;
 418                 co = (int)(tok->wptr - tok->wstart);
 419         }
 420         if (cursorc != NULL)
 421                 *cursorc = cc;
 422         if (cursoro != NULL)
 423                 *cursoro = co;
 424         FUN(tok,finish)(tok);
 425         *argv = (const Char **)tok->argv;
 426         *argc = tok->argc;
 427         return (0);
 428 }
 429
 430 /* FUN(tok,str)():
 431  *      Simpler version of tok_line, taking a NUL terminated line
 432  *      and splitting into words, ignoring cursor state.
 433  */
 434 public int
 435 FUN(tok,str)(TYPE(Tokenizer) *tok, const Char *line, int *argc,
 436     const Char ***argv)
 437 {
 438         TYPE(LineInfo) li;
 439
 440         memset(&li, 0, sizeof(li));
 441         li.buffer = line;
 442         li.cursor = li.lastchar = Strchr(line, '\0');
 443         return (FUN(tok,line)(tok, &li, argc, argv, NULL, NULL));
 444 }