release 1.3.4
[cygwin-run.git] / src / tokenizer.c
bloba60052c3371918879f133f809989675dbabbe85a
/*-
 * Copyright (c) 1992, 1993
 * The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Christos Zoulas of Cornell University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * NetBSD: tokenizer.c,v 1.14 2003/12/05 13:37:48 lukem Exp
 */
35 #include "config.h"
36 #include <string.h>
37 #include <stdlib.h>
38 #include "tokenizer.h"
39 #include "util.h"
/* Default field separators: tab, space and newline. */
#define IFS "\t \n"

/* Bits kept in the tokenizer's `flags' member. */
#define TOK_KEEP 0x01		/* emit the current word even when it is empty */
#define TOK_EAT 0x02		/* an escaped newline has just been swallowed */

/* Allocation granularity. */
#define WINCR 20		/* how much working space to allocate at one time */
#define AINCR 10		/* how many argument slots to allocate at one time */

/* Memory management is routed through the run2 helpers. */
#define tok_strdup(a) run2_strdup (a)
#define tok_malloc(a) run2_malloc (a)
#define tok_free(a) free (a)
#define tok_realloc(a,b) run2_realloc (a, b)

/* The libedit-style names used in this file map onto the run2_ symbols. */
#define tok_init(a) run2_tok_init (a)
#define tok_end(a) run2_tok_end (a)
#define tok_reset(a) run2_tok_reset (a)
#define tok_str(a,b,c,d) run2_tok_str (a,b,c,d)
/* Quoting state of the tokenizer's state machine. */
typedef enum
{
  Q_none = 0,			/* not inside any quoting */
  Q_single = 1,			/* inside '...' */
  Q_double = 2,			/* inside "..." */
  Q_one = 3,			/* a backslash was seen outside quotes */
  Q_doubleone = 4		/* a backslash was seen inside "..." */
} quote_t;
66 struct tokenizer
68 char *ifs; /* In field separator */
69 int argc, amax; /* Current and maximum number of args */
70 char **argv; /* Argument list */
71 char *wptr, *wmax; /* Space and limit on the word buffer */
72 char *wstart; /* Beginning of next word */
73 char *wspace; /* Space of word buffer */
74 quote_t quote; /* Quoting state */
75 int flags; /* flags; */
78 int
79 run2_split_string (const char *str, int *argc, const char ***argv)
81 Tokenizer *tok;
82 int rc, i;
83 int l_argc = 0;
84 const char **l_argv = NULL;
86 if (!argv || !argc)
88 errorMsg ("split_string: Null arguments");
89 return 1;
91 if (!str)
93 *argc = 0;
94 *argv = NULL;
95 warnMsg ("split_string: Null string");
96 return 0;
99 tok = tok_init (IFS);
100 if (!tok)
102 errorMsg ("Unable to initialize tokenizer");
103 return 1;
106 if ((rc = tok_str (tok, str, &l_argc, &l_argv)))
108 tok_end (tok);
109 tok = NULL;
112 switch (rc)
114 case -1:
115 errorMsg ("internal error while parsing |%s|", str);
116 break;
117 case 3:
118 errorMsg ("string ended with an escaped newline |%s|", str);
119 break;
120 case 2:
121 errorMsg ("unmatched double quote in |%s|", str);
122 break;
123 case 1:
124 errorMsg ("unmatched single quote in |%s|", str);
125 break;
126 case 0:
127 debugMsg (2, "(%s) Parsed string |%s|", __func__, str);
128 for (i = 0; i < l_argc; i++)
130 debugMsg (2, "(%s)\targv[%d]: |%s|", __func__, i, (l_argv[i] ? l_argv[i] : ""));
132 break;
133 default:
134 errorMsg ("unknown error (%d) parsing |%s|", rc, str);
135 break;
137 if (rc)
138 return rc;
140 /* Success. It's time to clean up. But first we have to
141 * copy l_argv (which points into internal storage of tok)
142 * into the output argv.
144 *argv = (const char **) run2_malloc ((l_argc + 1) * sizeof (const char *));
145 for (i = 0; i < l_argc; i++)
147 (*argv)[i] = run2_strdup (l_argv[i]);
149 (*argv)[l_argc] = NULL;
150 *argc = l_argc;
152 tok_end (tok);
153 tok = NULL;
154 return rc;
157 /* tok_finish():
158 * Finish a word in the tokenizer.
160 static void
161 tok_finish (Tokenizer * tok)
163 *(tok->wptr) = '\0';
164 if ((tok->flags & TOK_KEEP) || tok->wptr != tok->wstart)
166 tok->argv[tok->argc++] = tok->wstart;
167 tok->argv[tok->argc] = NULL;
168 tok->wstart = ++tok->wptr;
170 tok->flags &= ~TOK_KEEP;
173 /* tok_init():
174 * Initialize the tokenizer
176 Tokenizer *
177 tok_init (const char *ifs)
179 Tokenizer *tok = (Tokenizer *) tok_malloc (sizeof (Tokenizer));
181 if (tok == NULL)
182 return NULL;
183 tok->ifs = tok_strdup (ifs ? ifs : IFS);
184 if (tok->ifs == NULL)
186 tok_free ((void *) tok);
187 return NULL;
189 tok->argc = 0;
190 tok->amax = AINCR;
191 tok->argv = (char **) tok_malloc (sizeof (char *) * tok->amax);
192 if (tok->argv == NULL)
194 tok_free ((void *) tok->ifs);
195 tok_free ((void *) tok);
196 return NULL;
198 tok->argv[0] = NULL;
199 tok->wspace = (char *) tok_malloc (WINCR);
200 if (tok->wspace == NULL)
202 tok_free ((void *) tok->argv);
203 tok_free ((void *) tok->ifs);
204 tok_free ((void *) tok);
205 return NULL;
207 tok->wmax = tok->wspace + WINCR;
208 tok->wstart = tok->wspace;
209 tok->wptr = tok->wspace;
210 tok->flags = 0;
211 tok->quote = Q_none;
213 return (tok);
216 /* tok_reset():
217 * Reset the tokenizer
219 void
220 tok_reset (Tokenizer * tok)
222 tok->argc = 0;
223 tok->wstart = tok->wspace;
224 tok->wptr = tok->wspace;
225 tok->flags = 0;
226 tok->quote = Q_none;
229 /* tok_end():
230 * Clean up
232 void
233 tok_end (Tokenizer * tok)
235 if (tok)
237 tok_free ((void *) tok->ifs);
238 tok_free ((void *) tok->wspace);
239 tok_free ((void *) tok->argv);
240 tok_free ((void *) tok);
245 * tok_line():
246 * Bourne shell (sh(1)) like tokenizing
247 * Arguments:
248 * tok current tokenizer state (setup with tok_init())
249 * line line to parse
250 * Returns:
251 * -1 Internal error
252 * 3 Quoted return
253 * 2 Unmatched double quote
254 * 1 Unmatched single quote
255 * 0 Ok
256 * Modifies (if return value is 0):
257 * argc number of arguments
258 * argv argument array
259 * cursorc if !NULL, argv element containing cursor
260 * cursorv if !NULL, offset in argv[cursorc] of cursor
261 * NOTE: Made this function static because it is used only by (a) tok_str,
262 * or libeditline functions in the original library from which this
263 * code derived.
265 static int
266 tok_line (Tokenizer * tok,
267 const LineInfo * line,
268 int *argc, const char ***argv, int *cursorc, int *cursoro)
270 const char *ptr;
271 int cc, co;
273 cc = co = -1;
274 ptr = line->buffer;
275 for (ptr = line->buffer;; ptr++)
277 if (ptr >= line->lastchar)
278 ptr = "";
279 if (ptr == line->cursor)
281 cc = tok->argc;
282 /* the offset to a position within a word and
283 * the offset to the start of that word are both
284 * guaranteed positive. Therefore, their difference
285 * will not overflow.
287 co = (int) (tok->wptr - tok->wstart);
289 switch (*ptr)
291 case '\'':
292 tok->flags |= TOK_KEEP;
293 tok->flags &= ~TOK_EAT;
294 switch (tok->quote)
296 case Q_none: /* Enter single quote mode */
297 tok->quote = Q_single;
298 break;
300 case Q_single: /* Exit single quote mode */
301 tok->quote = Q_none;
302 break;
304 case Q_one: /* Quote this ' */
305 tok->quote = Q_none;
306 *tok->wptr++ = *ptr;
307 break;
309 case Q_double: /* Stay in double quote mode */
310 *tok->wptr++ = *ptr;
311 break;
313 case Q_doubleone: /* Quote this ' */
314 tok->quote = Q_double;
315 *tok->wptr++ = *ptr;
316 break;
318 default:
319 return (-1);
321 break;
323 case '"':
324 tok->flags &= ~TOK_EAT;
325 tok->flags |= TOK_KEEP;
326 switch (tok->quote)
328 case Q_none: /* Enter double quote mode */
329 tok->quote = Q_double;
330 break;
332 case Q_double: /* Exit double quote mode */
333 tok->quote = Q_none;
334 break;
336 case Q_one: /* Quote this " */
337 tok->quote = Q_none;
338 *tok->wptr++ = *ptr;
339 break;
341 case Q_single: /* Stay in single quote mode */
342 *tok->wptr++ = *ptr;
343 break;
345 case Q_doubleone: /* Quote this " */
346 tok->quote = Q_double;
347 *tok->wptr++ = *ptr;
348 break;
350 default:
351 return (-1);
353 break;
355 case '\\':
356 tok->flags |= TOK_KEEP;
357 tok->flags &= ~TOK_EAT;
358 switch (tok->quote)
360 case Q_none: /* Quote next character */
361 tok->quote = Q_one;
362 break;
364 case Q_double: /* Quote next character */
365 tok->quote = Q_doubleone;
366 break;
368 case Q_one: /* Quote this, restore state */
369 *tok->wptr++ = *ptr;
370 tok->quote = Q_none;
371 break;
373 case Q_single: /* Stay in single quote mode */
374 *tok->wptr++ = *ptr;
375 break;
377 case Q_doubleone: /* Quote this \ */
378 tok->quote = Q_double;
379 *tok->wptr++ = *ptr;
380 break;
382 default:
383 return (-1);
385 break;
387 case '\n':
388 tok->flags &= ~TOK_EAT;
389 switch (tok->quote)
391 case Q_none:
392 goto tok_line_outok;
394 case Q_single:
395 case Q_double:
396 *tok->wptr++ = *ptr; /* Add the return */
397 break;
399 case Q_doubleone: /* Back to double, eat the '\n' */
400 tok->flags |= TOK_EAT;
401 tok->quote = Q_double;
402 break;
404 case Q_one: /* No quote, more eat the '\n' */
405 tok->flags |= TOK_EAT;
406 tok->quote = Q_none;
407 break;
409 default:
410 return (0);
412 break;
414 case '\0':
415 switch (tok->quote)
417 case Q_none:
418 /* Finish word and return */
419 if (tok->flags & TOK_EAT)
421 tok->flags &= ~TOK_EAT;
422 return (3);
424 goto tok_line_outok;
426 case Q_single:
427 return (1);
429 case Q_double:
430 return (2);
432 case Q_doubleone:
433 tok->quote = Q_double;
434 *tok->wptr++ = *ptr;
435 break;
437 case Q_one:
438 tok->quote = Q_none;
439 *tok->wptr++ = *ptr;
440 break;
442 default:
443 return (-1);
445 break;
447 default:
448 tok->flags &= ~TOK_EAT;
449 switch (tok->quote)
451 case Q_none:
452 if (strchr (tok->ifs, *ptr) != NULL)
453 tok_finish (tok);
454 else
455 *tok->wptr++ = *ptr;
456 break;
458 case Q_single:
459 case Q_double:
460 *tok->wptr++ = *ptr;
461 break;
464 case Q_doubleone:
465 *tok->wptr++ = '\\';
466 tok->quote = Q_double;
467 *tok->wptr++ = *ptr;
468 break;
470 case Q_one:
471 tok->quote = Q_none;
472 *tok->wptr++ = *ptr;
473 break;
475 default:
476 return (-1);
479 break;
482 if (tok->wptr >= tok->wmax - 4)
484 size_t size = tok->wmax - tok->wspace + WINCR;
485 char *s = (char *) tok_realloc (tok->wspace, size);
486 if (s == NULL)
487 return (-1);
489 if (s != tok->wspace)
491 int i;
492 for (i = 0; i < tok->argc; i++)
494 tok->argv[i] = (tok->argv[i] - tok->wspace) + s;
496 tok->wptr = (tok->wptr - tok->wspace) + s;
497 tok->wstart = (tok->wstart - tok->wspace) + s;
498 tok->wspace = s;
500 tok->wmax = s + size;
502 if (tok->argc >= tok->amax - 4)
504 char **p;
505 tok->amax += AINCR;
506 p = (char **) tok_realloc (tok->argv, tok->amax * sizeof (char *));
507 if (p == NULL)
508 return (-1);
509 tok->argv = p;
512 tok_line_outok:
513 if (cc == -1 && co == -1)
515 cc = tok->argc;
516 /* the offset to a position within a word and
517 * the offset to the start of that word are both
518 * guaranteed positive. Therefore, their difference
519 * will not overflow.
521 co = (int) (tok->wptr - tok->wstart);
523 if (cursorc != NULL)
524 *cursorc = cc;
525 if (cursoro != NULL)
526 *cursoro = co;
527 tok_finish (tok);
528 *argv = (const char **) tok->argv;
529 *argc = tok->argc;
530 return (0);
533 /* tok_str
534 * Simpler version of tok_line, taking a NUL terminated line
535 * and splitting into words, ignoring cursor state.
538 tok_str (Tokenizer * tok, const char *line, int *argc, const char ***argv)
540 LineInfo li;
542 memset (&li, 0, sizeof (li));
543 li.buffer = line;
544 li.cursor = li.lastchar = strchr (line, '\0');
545 return (tok_line (tok, &li, argc, argv, NULL, NULL));