dist/ntp/sntp/libopts/tokenize.c

   1 /*      $NetBSD$        */
   2
   3 /*
   4  *  This file defines the string_tokenize interface
   5  * Time-stamp:      "2006-06-24 15:27:49 bkorb"
   6  *
   7  *  string_tokenize copyright 2005 Bruce Korb
   8  *
   9  *  string_tokenize is free software; you can redistribute it and/or
  10  *  modify it under the terms of the GNU Lesser General Public
  11  *  License as published by the Free Software Foundation; either
  12  *  version 2.1 of the License, or (at your option) any later version.
  13  *
  14  *  string_tokenize is distributed in the hope that it will be useful,
  15  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  *  Lesser General Public License for more details.
  18  *
  19  *  You should have received a copy of the GNU Lesser General Public
  20  *  License along with string_tokenize; if not, write to:
  21  *             The Free Software Foundation, Inc.,
  22  *             51 Franklin Street, Fifth Floor,
  23  *             Boston, MA  02110-1301, USA.
  24  */
  25 #include <ctype.h>
  26 #include <errno.h>
  27 #include <stdlib.h>
  28
  29 #define cc_t   const unsigned char
  30 #define ch_t   unsigned char
  31
  32 /* = = = START-STATIC-FORWARD = = = */
  33 /* static forward declarations maintained by :mkfwd */
  34 static void
  35 copy_cooked( ch_t** ppDest, char const ** ppSrc );
  36
  37 static void
  38 copy_raw( ch_t** ppDest, char const ** ppSrc );
  39 /* = = = END-STATIC-FORWARD = = = */
  40
  41 static void
  42 copy_cooked( ch_t** ppDest, char const ** ppSrc )
  43 {
  44     ch_t* pDest = (ch_t*)*ppDest;
  45     const ch_t* pSrc  = (const ch_t*)(*ppSrc + 1);
  46
  47     for (;;) {
  48         ch_t ch = *(pSrc++);
  49         switch (ch) {
  50         case NUL:   *ppSrc = NULL; return;
  51         case '"':   goto done;
  52         case '\\':
  53             pSrc += ao_string_cook_escape_char( (char*)pSrc, (char*)&ch, 0x7F );
  54             if (ch == 0x7F)
  55                 break;
  56             /* FALLTHROUGH */
  57
  58         default:
  59             *(pDest++) = ch;
  60         }
  61     }
  62
  63  done:
  64     *ppDest = (ch_t*)pDest; /* next spot for storing character */
  65     *ppSrc  = (char const *)pSrc;  /* char following closing quote    */
  66 }
  67
  68
  69 static void
  70 copy_raw( ch_t** ppDest, char const ** ppSrc )
  71 {
  72     ch_t* pDest = *ppDest;
  73     cc_t* pSrc  = (cc_t*) (*ppSrc + 1);
  74
  75     for (;;) {
  76         ch_t ch = *(pSrc++);
  77         switch (ch) {
  78         case NUL:   *ppSrc = NULL; return;
  79         case '\'':  goto done;
  80         case '\\':
  81             /*
  82              *  *Four* escapes are handled:  newline removal, escape char
  83              *  quoting and apostrophe quoting
  84              */
  85             switch (*pSrc) {
  86             case NUL:   *ppSrc = NULL; return;
  87             case '\r':
  88                 if (*(++pSrc) == '\n')
  89                     ++pSrc;
  90                 continue;
  91
  92             case '\n':
  93                 ++pSrc;
  94                 continue;
  95
  96             case '\'':
  97                 ch = '\'';
  98                 /* FALLTHROUGH */
  99
 100             case '\\':
 101                 ++pSrc;
 102                 break;
 103             }
 104             /* FALLTHROUGH */
 105
 106         default:
 107             *(pDest++) = ch;
 108         }
 109     }
 110
 111  done:
 112     *ppDest = pDest; /* next spot for storing character */
 113     *ppSrc  = (char const *) pSrc;  /* char following closing quote    */
 114 }
 115
 116
 117 /*=export_func ao_string_tokenize
 118  *
 119  * what: tokenize an input string
 120  *
 121  * arg:  + char const* + string + string to be tokenized +
 122  *
 123  * ret_type:  token_list_t*
 124  * ret_desc:  pointer to a structure that lists each token
 125  *
 126  * doc:
 127  *
 128  * This function will convert one input string into a list of strings.
 129  * The list of strings is derived by separating the input based on
 130  * white space separation.  However, if the input contains either single
 131  * or double quote characters, then the text after that character up to
 132  * a matching quote will become the string in the list.
 133  *
 134  *  The returned pointer should be deallocated with @code{free(3C)} when
 135  *  are done using the data.  The data are placed in a single block of
 136  *  allocated memory.  Do not deallocate individual token/strings.
 137  *
 138  *  The structure pointed to will contain at least these two fields:
 139  *  @table @samp
 140  *  @item tkn_ct
 141  *  The number of tokens found in the input string.
 142  *  @item tok_list
 143  *  An array of @code{tkn_ct + 1} pointers to substring tokens, with
 144  *  the last pointer set to NULL.
 145  *  @end table
 146  *
 147  * There are two types of quoted strings: single quoted (@code{'}) and
 148  * double quoted (@code{"}).  Singly quoted strings are fairly raw in that
 149  * escape characters (@code{\\}) are simply another character, except when
 150  * preceding the following characters:
 151  * @example
 152  * @code{\\}  double backslashes reduce to one
 153  * @code{'}   incorporates the single quote into the string
 154  * @code{\n}  suppresses both the backslash and newline character
 155  * @end example
 156  *
 157  * Double quote strings are formed according to the rules of string
 158  * constants in ANSI-C programs.
 159  *
 160  * example:
 161  * @example
 162  *    #include <stdlib.h>
 163  *    int ix;
 164  *    token_list_t* ptl = ao_string_tokenize( some_string )
 165  *    for (ix = 0; ix < ptl->tkn_ct; ix++)
 166  *       do_something_with_tkn( ptl->tkn_list[ix] );
 167  *    free( ptl );
 168  * @end example
 169  * Note that everything is freed with the one call to @code{free(3C)}.
 170  *
 171  * err:
 172  *  NULL is returned and @code{errno} will be set to indicate the problem:
 173  *  @itemize @bullet
 174  *  @item
 175  *  @code{EINVAL} - There was an unterminated quoted string.
 176  *  @item
 177  *  @code{ENOENT} - The input string was empty.
 178  *  @item
 179  *  @code{ENOMEM} - There is not enough memory.
 180  *  @end itemize
 181 =*/
 182 token_list_t*
 183 ao_string_tokenize( char const* str )
 184 {
 185     int max_token_ct = 1; /* allow for trailing NUL on string */
 186     token_list_t* res;
 187
 188     if (str == NULL)  goto bogus_str;
 189
 190     /*
 191      *  Trim leading white space.  Use "ENOENT" and a NULL return to indicate
 192      *  an empty string was passed.
 193      */
 194     while (isspace( (ch_t)*str ))  str++;
 195     if (*str == NUL) {
 196     bogus_str:
 197         errno = ENOENT;
 198         return NULL;
 199     }
 200
 201     /*
 202      *  Take an approximate count of tokens.  If no quoted strings are used,
 203      *  it will be accurate.  If quoted strings are used, it will be a little
 204      *  high and we'll squander the space for a few extra pointers.
 205      */
 206     {
 207         cc_t* pz = (cc_t*)str;
 208
 209         do {
 210             max_token_ct++;
 211             while (! isspace( *++pz ))
 212                 if (*pz == NUL) goto found_nul;
 213             while (isspace( *pz ))  pz++;
 214         } while (*pz != NUL);
 215
 216     found_nul:
 217         ;
 218     }
 219
 220     res = malloc( sizeof(*res) + strlen(str) + (max_token_ct * sizeof(ch_t*)) );
 221     if (res == NULL) {
 222         errno = ENOMEM;
 223         return res;
 224     }
 225
 226     /*
 227      *  Now copy each token into the output buffer.
 228      */
 229     {
 230         ch_t* pzDest = (ch_t*)(res->tkn_list + (max_token_ct + 1));
 231         res->tkn_ct  = 0;
 232
 233         do  {
 234             res->tkn_list[ res->tkn_ct++ ] = pzDest;
 235             for (;;) {
 236                 int ch = (ch_t)*str;
 237                 if (isspace( ch )) {
 238                 found_white_space:
 239                     while (isspace( (ch_t)*++str ))  ;
 240                     break;
 241                 }
 242
 243                 switch (ch) {
 244                 case '"':
 245                     copy_cooked( &pzDest, &str );
 246                     if (str == NULL) {
 247                         free(res);
 248                         errno = EINVAL;
 249                         return NULL;
 250                     }
 251                     if (isspace( (ch_t)*str ))
 252                         goto found_white_space;
 253                     break;
 254
 255                 case '\'':
 256                     copy_raw( &pzDest, &str );
 257                     if (str == NULL) {
 258                         free(res);
 259                         errno = EINVAL;
 260                         return NULL;
 261                     }
 262                     if (isspace( (ch_t)*str ))
 263                         goto found_white_space;
 264                     break;
 265
 266                 case NUL:
 267                     goto copy_done;
 268
 269                 default:
 270                     str++;
 271                     *(pzDest++) = ch;
 272                 }
 273             } copy_done:;
 274
 275             /*
 276              * NUL terminate the last token and see if we have any more tokens.
 277              */
 278             *(pzDest++) = NUL;
 279         } while (*str != NUL);
 280
 281         res->tkn_list[ res->tkn_ct ] = NULL;
 282     }
 283
 284     return res;
 285 }
 286
 287 #ifdef TEST
 288 #include <stdio.h>
 289 #include <string.h>
 290
 291 int
 292 main( int argc, char** argv )
 293 {
 294     if (argc == 1) {
 295         printf("USAGE:  %s arg [ ... ]\n", *argv);
 296         return 1;
 297     }
 298     while (--argc > 0) {
 299         char* arg = *(++argv);
 300         token_list_t* p = ao_string_tokenize( arg );
 301         if (p == NULL) {
 302             printf( "Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
 303                     arg, errno, strerror( errno ));
 304         } else {
 305             int ix = 0;
 306             printf( "Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct );
 307             do {
 308                 printf( " %3d:  ``%s''\n", ix+1, p->tkn_list[ix] );
 309             } while (++ix < p->tkn_ct);
 310             free(p);
 311         }
 312     }
 313     return 0;
 314 }
 315 #endif
 316
 317 /*
 318  * Local Variables:
 319  * mode: C
 320  * c-file-style: "stroustrup"
 321  * indent-tabs-mode: nil
 322  * End:
 323  * end of autoopts/tokenize.c */