4 * This file defines the string_tokenize interface
5 * Time-stamp: "2006-06-24 15:27:49 bkorb"
7 * string_tokenize copyright 2005 Bruce Korb
9 * string_tokenize is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * string_tokenize is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with string_tokenize; if not, write to:
21 * The Free Software Foundation, Inc.,
22 * 51 Franklin Street, Fifth Floor,
23 * Boston, MA 02110-1301, USA.
29 #define cc_t const unsigned char
30 #define ch_t unsigned char
32 /* = = = START-STATIC-FORWARD = = = */
33 /* static forward declarations maintained by :mkfwd */
35 copy_cooked( ch_t
** ppDest
, char const ** ppSrc
);
38 copy_raw( ch_t
** ppDest
, char const ** ppSrc
);
39 /* = = = END-STATIC-FORWARD = = = */
42 copy_cooked( ch_t
** ppDest
, char const ** ppSrc
)
44 ch_t
* pDest
= (ch_t
*)*ppDest
;
45 const ch_t
* pSrc
= (const ch_t
*)(*ppSrc
+ 1);
50 case NUL
: *ppSrc
= NULL
; return;
53 pSrc
+= ao_string_cook_escape_char( (char*)pSrc
, (char*)&ch
, 0x7F );
64 *ppDest
= (ch_t
*)pDest
; /* next spot for storing character */
65 *ppSrc
= (char const *)pSrc
; /* char following closing quote */
70 copy_raw( ch_t
** ppDest
, char const ** ppSrc
)
72 ch_t
* pDest
= *ppDest
;
73 cc_t
* pSrc
= (cc_t
*) (*ppSrc
+ 1);
78 case NUL
: *ppSrc
= NULL
; return;
82 * *Four* escapes are handled: newline removal, escape char
83 * quoting and apostrophe quoting
86 case NUL
: *ppSrc
= NULL
; return;
88 if (*(++pSrc
) == '\n')
112 *ppDest
= pDest
; /* next spot for storing character */
113 *ppSrc
= (char const *) pSrc
; /* char following closing quote */
117 /*=export_func ao_string_tokenize
119 * what: tokenize an input string
121 * arg: + char const* + string + string to be tokenized +
123 * ret_type: token_list_t*
124 * ret_desc: pointer to a structure that lists each token
128 * This function will convert one input string into a list of strings.
129 * The list of strings is derived by splitting the input at white
130 * space. However, if the input contains either single
131 * or double quote characters, then the text after that character up to
132 * a matching quote will become the string in the list.
134 * The returned pointer should be deallocated with @code{free(3C)} when
135 * you are done using the data. The data are placed in a single block of
136 * allocated memory. Do not deallocate individual token/strings.
138 * The structure pointed to will contain at least these two fields:
141 * The number of tokens found in the input string.
143 * An array of @code{tkn_ct + 1} pointers to substring tokens, with
144 * the last pointer set to NULL.
147 * There are two types of quoted strings: single quoted (@code{'}) and
148 * double quoted (@code{"}). Singly quoted strings are fairly raw in that
149 * escape characters (@code{\\}) are simply another character, except when
150 * preceding the following characters:
152 * @code{\\} double backslashes reduce to one
153 * @code{'} incorporates the single quote into the string
154 * @code{\n} suppresses both the backslash and newline character
157 * Double quote strings are formed according to the rules of string
158 * constants in ANSI-C programs.
162 * #include <stdlib.h>
164 * token_list_t* ptl = ao_string_tokenize( some_string )
165 * for (ix = 0; ix < ptl->tkn_ct; ix++)
166 * do_something_with_tkn( ptl->tkn_list[ix] );
169 * Note that everything is freed with the one call to @code{free(3C)}.
172 * NULL is returned and @code{errno} will be set to indicate the problem:
175 * @code{EINVAL} - There was an unterminated quoted string.
177 * @code{ENOENT} - The input string was empty.
179 * @code{ENOMEM} - There is not enough memory.
183 ao_string_tokenize( char const* str
)
185 int max_token_ct
= 1; /* allow for trailing NUL on string */
188 if (str
== NULL
) goto bogus_str
;
191 * Trim leading white space. Use "ENOENT" and a NULL return to indicate
192 * an empty string was passed.
194 while (isspace( (ch_t
)*str
)) str
++;
202 * Take an approximate count of tokens. If no quoted strings are used,
203 * it will be accurate. If quoted strings are used, it will be a little
204 * high and we'll squander the space for a few extra pointers.
207 cc_t
* pz
= (cc_t
*)str
;
211 while (! isspace( *++pz
))
212 if (*pz
== NUL
) goto found_nul
;
213 while (isspace( *pz
)) pz
++;
214 } while (*pz
!= NUL
);
220 res
= malloc( sizeof(*res
) + strlen(str
) + (max_token_ct
* sizeof(ch_t
*)) );
227 * Now copy each token into the output buffer.
230 ch_t
* pzDest
= (ch_t
*)(res
->tkn_list
+ (max_token_ct
+ 1));
234 res
->tkn_list
[ res
->tkn_ct
++ ] = pzDest
;
239 while (isspace( (ch_t
)*++str
)) ;
245 copy_cooked( &pzDest
, &str
);
251 if (isspace( (ch_t
)*str
))
252 goto found_white_space
;
256 copy_raw( &pzDest
, &str
);
262 if (isspace( (ch_t
)*str
))
263 goto found_white_space
;
276 * NUL terminate the last token and see if we have any more tokens.
279 } while (*str
!= NUL
);
281 res
->tkn_list
[ res
->tkn_ct
] = NULL
;
292 main( int argc
, char** argv
)
295 printf("USAGE: %s arg [ ... ]\n", *argv
);
299 char* arg
= *(++argv
);
300 token_list_t
* p
= ao_string_tokenize( arg
);
302 printf( "Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
303 arg
, errno
, strerror( errno
));
306 printf( "Parsed string ``%s''\ninto %d tokens:\n", arg
, p
->tkn_ct
);
308 printf( " %3d: ``%s''\n", ix
+1, p
->tkn_list
[ix
] );
309 } while (++ix
< p
->tkn_ct
);
320 * c-file-style: "stroustrup"
321 * indent-tabs-mode: nil
323 * end of autoopts/tokenize.c */