src/libopts/tokenize.c

   1 /*
   2  *  This file defines the string_tokenize interface
   3  * Time-stamp:      "2012-03-04 13:23:50 bkorb"
   4  *
   5  *  This file is part of AutoOpts, a companion to AutoGen.
   6  *  AutoOpts is free software.
   7  *  AutoOpts is Copyright (c) 1992-2012 by Bruce Korb - all rights reserved
   8  *
   9  *  AutoOpts is available under any one of two licenses.  The license
  10  *  in use must be one of these two and the choice is under the control
  11  *  of the user of the license.
  12  *
  13  *   The GNU Lesser General Public License, version 3 or later
  14  *      See the files "COPYING.lgplv3" and "COPYING.gplv3"
  15  *
  16  *   The Modified Berkeley Software Distribution License
  17  *      See the file "COPYING.mbsd"
  18  *
  19  *  These files have the following md5sums:
  20  *
  21  *  43b91e8ca915626ed3818ffb1b71248b pkg/libopts/COPYING.gplv3
  22  *  06a1a2e4760c90ea5e1dad8dfaac4d39 pkg/libopts/COPYING.lgplv3
  23  *  66a5cedaf62c4b2637025f049f9b826f pkg/libopts/COPYING.mbsd
  24  */
  25
  26 #include <errno.h>
  27 #include <stdlib.h>
  28
  29 #define cc_t   const unsigned char
  30 #define ch_t   unsigned char
  31
  32 /* = = = START-STATIC-FORWARD = = = */
  33 static void
  34 copy_cooked(ch_t** ppDest, char const ** ppSrc);
  35
  36 static void
  37 copy_raw(ch_t** ppDest, char const ** ppSrc);
  38
  39 static token_list_t *
  40 alloc_token_list(char const * str);
  41 /* = = = END-STATIC-FORWARD = = = */
  42
  43 static void
  44 copy_cooked(ch_t** ppDest, char const ** ppSrc)
  45 {
  46     ch_t* pDest = (ch_t*)*ppDest;
  47     const ch_t* pSrc  = (const ch_t*)(*ppSrc + 1);
  48
  49     for (;;) {
  50         ch_t ch = *(pSrc++);
  51         switch (ch) {
  52         case NUL:   *ppSrc = NULL; return;
  53         case '"':   goto done;
  54         case '\\':
  55             pSrc += ao_string_cook_escape_char((char*)pSrc, (char*)&ch, 0x7F);
  56             if (ch == 0x7F)
  57                 break;
  58             /* FALLTHROUGH */
  59
  60         default:
  61             *(pDest++) = ch;
  62         }
  63     }
  64
  65  done:
  66     *ppDest = (ch_t*)pDest; /* next spot for storing character */
  67     *ppSrc  = (char const *)pSrc;  /* char following closing quote    */
  68 }
  69
  70
  71 static void
  72 copy_raw(ch_t** ppDest, char const ** ppSrc)
  73 {
  74     ch_t* pDest = *ppDest;
  75     cc_t* pSrc  = (cc_t*) (*ppSrc + 1);
  76
  77     for (;;) {
  78         ch_t ch = *(pSrc++);
  79         switch (ch) {
  80         case NUL:   *ppSrc = NULL; return;
  81         case '\'':  goto done;
  82         case '\\':
  83             /*
  84              *  *Four* escapes are handled:  newline removal, escape char
  85              *  quoting and apostrophe quoting
  86              */
  87             switch (*pSrc) {
  88             case NUL:   *ppSrc = NULL; return;
  89             case '\r':
  90                 if (*(++pSrc) == NL)
  91                     ++pSrc;
  92                 continue;
  93
  94             case NL:
  95                 ++pSrc;
  96                 continue;
  97
  98             case '\'':
  99                 ch = '\'';
 100                 /* FALLTHROUGH */
 101
 102             case '\\':
 103                 ++pSrc;
 104                 break;
 105             }
 106             /* FALLTHROUGH */
 107
 108         default:
 109             *(pDest++) = ch;
 110         }
 111     }
 112
 113  done:
 114     *ppDest = pDest; /* next spot for storing character */
 115     *ppSrc  = (char const *) pSrc;  /* char following closing quote    */
 116 }
 117
 118 static token_list_t *
 119 alloc_token_list(char const * str)
 120 {
 121     token_list_t * res;
 122
 123     int max_token_ct = 2; /* allow for trailing NULL pointer & NUL on string */
 124
 125     if (str == NULL) goto enoent_res;
 126
 127     /*
 128      *  Trim leading white space.  Use "ENOENT" and a NULL return to indicate
 129      *  an empty string was passed.
 130      */
 131     str = SPN_WHITESPACE_CHARS(str);
 132     if (*str == NUL)  goto enoent_res;
 133
 134     /*
 135      *  Take an approximate count of tokens.  If no quoted strings are used,
 136      *  it will be accurate.  If quoted strings are used, it will be a little
 137      *  high and we'll squander the space for a few extra pointers.
 138      */
 139     {
 140         char const * pz = str;
 141
 142         do {
 143             max_token_ct++;
 144             pz = BRK_WHITESPACE_CHARS(pz+1);
 145             pz = SPN_WHITESPACE_CHARS(pz);
 146         } while (*pz != NUL);
 147
 148         res = malloc(sizeof(*res) + (pz - str)
 149                      + (max_token_ct * sizeof(ch_t*)));
 150     }
 151
 152     if (res == NULL)
 153         errno = ENOMEM;
 154     else res->tkn_list[0] = (ch_t*)(res->tkn_list + (max_token_ct - 1));
 155
 156     return res;
 157
 158     enoent_res:
 159
 160     errno = ENOENT;
 161     return NULL;
 162 }
 163
 164 /*=export_func ao_string_tokenize
 165  *
 166  * what: tokenize an input string
 167  *
 168  * arg:  + char const* + string + string to be tokenized +
 169  *
 170  * ret_type:  token_list_t*
 171  * ret_desc:  pointer to a structure that lists each token
 172  *
 173  * doc:
 174  *
 175  * This function will convert one input string into a list of strings.
 176  * The list of strings is derived by separating the input based on
 177  * white space separation.  However, if the input contains either single
 178  * or double quote characters, then the text after that character up to
 179  * a matching quote will become the string in the list.
 180  *
 181  *  The returned pointer should be deallocated with @code{free(3C)} when
 182  *  are done using the data.  The data are placed in a single block of
 183  *  allocated memory.  Do not deallocate individual token/strings.
 184  *
 185  *  The structure pointed to will contain at least these two fields:
 186  *  @table @samp
 187  *  @item tkn_ct
 188  *  The number of tokens found in the input string.
 189  *  @item tok_list
 190  *  An array of @code{tkn_ct + 1} pointers to substring tokens, with
 191  *  the last pointer set to NULL.
 192  *  @end table
 193  *
 194  * There are two types of quoted strings: single quoted (@code{'}) and
 195  * double quoted (@code{"}).  Singly quoted strings are fairly raw in that
 196  * escape characters (@code{\\}) are simply another character, except when
 197  * preceding the following characters:
 198  * @example
 199  * @code{\\}  double backslashes reduce to one
 200  * @code{'}   incorporates the single quote into the string
 201  * @code{\n}  suppresses both the backslash and newline character
 202  * @end example
 203  *
 204  * Double quote strings are formed according to the rules of string
 205  * constants in ANSI-C programs.
 206  *
 207  * example:
 208  * @example
 209  *    #include <stdlib.h>
 210  *    int ix;
 211  *    token_list_t* ptl = ao_string_tokenize(some_string)
 212  *    for (ix = 0; ix < ptl->tkn_ct; ix++)
 213  *       do_something_with_tkn(ptl->tkn_list[ix]);
 214  *    free(ptl);
 215  * @end example
 216  * Note that everything is freed with the one call to @code{free(3C)}.
 217  *
 218  * err:
 219  *  NULL is returned and @code{errno} will be set to indicate the problem:
 220  *  @itemize @bullet
 221  *  @item
 222  *  @code{EINVAL} - There was an unterminated quoted string.
 223  *  @item
 224  *  @code{ENOENT} - The input string was empty.
 225  *  @item
 226  *  @code{ENOMEM} - There is not enough memory.
 227  *  @end itemize
 228 =*/
 229 token_list_t*
 230 ao_string_tokenize(char const* str)
 231 {
 232     token_list_t* res = alloc_token_list(str);
 233     ch_t* pzDest;
 234
 235     /*
 236      *  Now copy each token into the output buffer.
 237      */
 238     if (res == NULL)
 239         return res;
 240
 241     pzDest = (ch_t*)(res->tkn_list[0]);
 242     res->tkn_ct  = 0;
 243
 244     do  {
 245         res->tkn_list[ res->tkn_ct++ ] = pzDest;
 246         for (;;) {
 247             int ch = (ch_t)*str;
 248             if (IS_WHITESPACE_CHAR(ch)) {
 249             found_white_space:
 250                 str = SPN_WHITESPACE_CHARS(str+1);
 251                 break;
 252             }
 253
 254             switch (ch) {
 255             case '"':
 256                 copy_cooked(&pzDest, &str);
 257                 if (str == NULL) {
 258                     free(res);
 259                     errno = EINVAL;
 260                     return NULL;
 261                 }
 262                 if (IS_WHITESPACE_CHAR(*str))
 263                     goto found_white_space;
 264                 break;
 265
 266             case '\'':
 267                 copy_raw(&pzDest, &str);
 268                 if (str == NULL) {
 269                     free(res);
 270                     errno = EINVAL;
 271                     return NULL;
 272                 }
 273                 if (IS_WHITESPACE_CHAR(*str))
 274                     goto found_white_space;
 275                 break;
 276
 277             case NUL:
 278                 goto copy_done;
 279
 280             default:
 281                 str++;
 282                 *(pzDest++) = (unsigned char)ch;
 283             }
 284         } copy_done:;
 285
 286         /*
 287          * NUL terminate the last token and see if we have any more tokens.
 288          */
 289         *(pzDest++) = NUL;
 290     } while (*str != NUL);
 291
 292     res->tkn_list[ res->tkn_ct ] = NULL;
 293
 294     return res;
 295 }
 296
 297 #ifdef TEST
 298 #include <stdio.h>
 299 #include <string.h>
 300
 301 int
 302 main(int argc, char** argv)
 303 {
 304     if (argc == 1) {
 305         printf("USAGE:  %s arg [ ... ]\n", *argv);
 306         return 1;
 307     }
 308     while (--argc > 0) {
 309         char* arg = *(++argv);
 310         token_list_t* p = ao_string_tokenize(arg);
 311         if (p == NULL) {
 312             printf("Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
 313                    arg, errno, strerror(errno));
 314         } else {
 315             int ix = 0;
 316             printf("Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct);
 317             do {
 318                 printf(" %3d:  ``%s''\n", ix+1, p->tkn_list[ix]);
 319             } while (++ix < p->tkn_ct);
 320             free(p);
 321         }
 322     }
 323     return 0;
 324 }
 325 #endif
 326
 327 /*
 328  * Local Variables:
 329  * mode: C
 330  * c-file-style: "stroustrup"
 331  * indent-tabs-mode: nil
 332  * End:
 333  * end of autoopts/tokenize.c */