2 * Copyright (c) 1992, 1993
3 * The Regents of the University of California. All rights reserved.
5 * This code is derived from software contributed to Berkeley by
6 * Christos Zoulas of Cornell University.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * NetBSD: tokenizer.c,v 1.14 2003/12/05 13:37:48 lukem Exp
38 #include "tokenizer.h"
44 #define WINCR 20 /* how much working space to allocate at one time */
45 #define AINCR 10 /* how many argument slots to allocate at one time */
47 #define tok_strdup(a) run2_strdup (a)
48 #define tok_malloc(a) run2_malloc (a)
49 #define tok_free(a) free (a)
50 #define tok_realloc(a,b) run2_realloc (a, b)
52 #define tok_init(a) run2_tok_init (a)
53 #define tok_end(a) run2_tok_end (a)
54 #define tok_reset(a) run2_tok_reset (a)
55 #define tok_str(a,b,c,d) run2_tok_str (a,b,c,d)
68 char *ifs
; /* In field separator */
69 int argc
, amax
; /* Current and maximum number of args */
70 char **argv
; /* Argument list */
71 char *wptr
, *wmax
; /* Space and limit on the word buffer */
72 char *wstart
; /* Beginning of next word */
73 char *wspace
; /* Space of word buffer */
74 quote_t quote
; /* Quoting state */
75 int flags
; /* flags; */
79 run2_split_string (const char *str
, int *argc
, const char ***argv
)
84 const char **l_argv
= NULL
;
88 errorMsg ("split_string: Null arguments");
95 warnMsg ("split_string: Null string");
102 errorMsg ("Unable to initialize tokenizer");
106 if ((rc
= tok_str (tok
, str
, &l_argc
, &l_argv
)))
115 errorMsg ("internal error while parsing |%s|", str
);
118 errorMsg ("string ended with an escaped newline |%s|", str
);
121 errorMsg ("unmatched double quote in |%s|", str
);
124 errorMsg ("unmatched single quote in |%s|", str
);
127 debugMsg (2, "(%s) Parsed string |%s|", __func__
, str
);
128 for (i
= 0; i
< l_argc
; i
++)
130 debugMsg (2, "(%s)\targv[%d]: |%s|", __func__
, i
, (l_argv
[i
] ? l_argv
[i
] : ""));
134 errorMsg ("unknown error (%d) parsing |%s|", rc
, str
);
140 /* Success. It's time to clean up. But first we have to
141 * copy l_argv (which points into internal storage of tok)
142 * into the output argv.
144 *argv
= (const char **) run2_malloc ((l_argc
+ 1) * sizeof (const char *));
145 for (i
= 0; i
< l_argc
; i
++)
147 (*argv
)[i
] = run2_strdup (l_argv
[i
]);
149 (*argv
)[l_argc
] = NULL
;
158 * Finish a word in the tokenizer.
161 tok_finish (Tokenizer
* tok
)
164 if ((tok
->flags
& TOK_KEEP
) || tok
->wptr
!= tok
->wstart
)
166 tok
->argv
[tok
->argc
++] = tok
->wstart
;
167 tok
->argv
[tok
->argc
] = NULL
;
168 tok
->wstart
= ++tok
->wptr
;
170 tok
->flags
&= ~TOK_KEEP
;
174 * Initialize the tokenizer
177 tok_init (const char *ifs
)
179 Tokenizer
*tok
= (Tokenizer
*) tok_malloc (sizeof (Tokenizer
));
183 tok
->ifs
= tok_strdup (ifs
? ifs
: IFS
);
184 if (tok
->ifs
== NULL
)
186 tok_free ((void *) tok
);
191 tok
->argv
= (char **) tok_malloc (sizeof (char *) * tok
->amax
);
192 if (tok
->argv
== NULL
)
194 tok_free ((void *) tok
->ifs
);
195 tok_free ((void *) tok
);
199 tok
->wspace
= (char *) tok_malloc (WINCR
);
200 if (tok
->wspace
== NULL
)
202 tok_free ((void *) tok
->argv
);
203 tok_free ((void *) tok
->ifs
);
204 tok_free ((void *) tok
);
207 tok
->wmax
= tok
->wspace
+ WINCR
;
208 tok
->wstart
= tok
->wspace
;
209 tok
->wptr
= tok
->wspace
;
217 * Reset the tokenizer
220 tok_reset (Tokenizer
* tok
)
223 tok
->wstart
= tok
->wspace
;
224 tok
->wptr
= tok
->wspace
;
233 tok_end (Tokenizer
* tok
)
237 tok_free ((void *) tok
->ifs
);
238 tok_free ((void *) tok
->wspace
);
239 tok_free ((void *) tok
->argv
);
240 tok_free ((void *) tok
);
246 * Bourne shell (sh(1)) like tokenizing
248 * tok current tokenizer state (setup with tok_init())
253 * 2 Unmatched double quote
254 * 1 Unmatched single quote
256 * Modifies (if return value is 0):
257 * argc number of arguments
258 * argv argument array
259 * cursorc if !NULL, argv element containing cursor
260 * cursorv if !NULL, offset in argv[cursorc] of cursor
261 * NOTE: Made this function static because it is used only by (a) tok_str,
262 * or libeditline functions in the original library from which this
266 tok_line (Tokenizer
* tok
,
267 const LineInfo
* line
,
268 int *argc
, const char ***argv
, int *cursorc
, int *cursoro
)
275 for (ptr
= line
->buffer
;; ptr
++)
277 if (ptr
>= line
->lastchar
)
279 if (ptr
== line
->cursor
)
282 /* the offset to a position within a word and
283 * the offset to the start of that word are both
284 * guaranteed positive. Therefore, their difference
287 co
= (int) (tok
->wptr
- tok
->wstart
);
292 tok
->flags
|= TOK_KEEP
;
293 tok
->flags
&= ~TOK_EAT
;
296 case Q_none
: /* Enter single quote mode */
297 tok
->quote
= Q_single
;
300 case Q_single
: /* Exit single quote mode */
304 case Q_one
: /* Quote this ' */
309 case Q_double
: /* Stay in double quote mode */
313 case Q_doubleone
: /* Quote this ' */
314 tok
->quote
= Q_double
;
324 tok
->flags
&= ~TOK_EAT
;
325 tok
->flags
|= TOK_KEEP
;
328 case Q_none
: /* Enter double quote mode */
329 tok
->quote
= Q_double
;
332 case Q_double
: /* Exit double quote mode */
336 case Q_one
: /* Quote this " */
341 case Q_single
: /* Stay in single quote mode */
345 case Q_doubleone
: /* Quote this " */
346 tok
->quote
= Q_double
;
356 tok
->flags
|= TOK_KEEP
;
357 tok
->flags
&= ~TOK_EAT
;
360 case Q_none
: /* Quote next character */
364 case Q_double
: /* Quote next character */
365 tok
->quote
= Q_doubleone
;
368 case Q_one
: /* Quote this, restore state */
373 case Q_single
: /* Stay in single quote mode */
377 case Q_doubleone
: /* Quote this \ */
378 tok
->quote
= Q_double
;
388 tok
->flags
&= ~TOK_EAT
;
396 *tok
->wptr
++ = *ptr
; /* Add the return */
399 case Q_doubleone
: /* Back to double, eat the '\n' */
400 tok
->flags
|= TOK_EAT
;
401 tok
->quote
= Q_double
;
404 case Q_one
: /* No quote, more eat the '\n' */
405 tok
->flags
|= TOK_EAT
;
418 /* Finish word and return */
419 if (tok
->flags
& TOK_EAT
)
421 tok
->flags
&= ~TOK_EAT
;
433 tok
->quote
= Q_double
;
448 tok
->flags
&= ~TOK_EAT
;
452 if (strchr (tok
->ifs
, *ptr
) != NULL
)
466 tok
->quote
= Q_double
;
482 if (tok
->wptr
>= tok
->wmax
- 4)
484 size_t size
= tok
->wmax
- tok
->wspace
+ WINCR
;
485 char *s
= (char *) tok_realloc (tok
->wspace
, size
);
489 if (s
!= tok
->wspace
)
492 for (i
= 0; i
< tok
->argc
; i
++)
494 tok
->argv
[i
] = (tok
->argv
[i
] - tok
->wspace
) + s
;
496 tok
->wptr
= (tok
->wptr
- tok
->wspace
) + s
;
497 tok
->wstart
= (tok
->wstart
- tok
->wspace
) + s
;
500 tok
->wmax
= s
+ size
;
502 if (tok
->argc
>= tok
->amax
- 4)
506 p
= (char **) tok_realloc (tok
->argv
, tok
->amax
* sizeof (char *));
513 if (cc
== -1 && co
== -1)
516 /* the offset to a position within a word and
517 * the offset to the start of that word are both
518 * guaranteed positive. Therefore, their difference
521 co
= (int) (tok
->wptr
- tok
->wstart
);
528 *argv
= (const char **) tok
->argv
;
534 * Simpler version of tok_line, taking a NUL terminated line
535 * and splitting into words, ignoring cursor state.
538 tok_str (Tokenizer
* tok
, const char *line
, int *argc
, const char ***argv
)
542 memset (&li
, 0, sizeof (li
));
544 li
.cursor
= li
.lastchar
= strchr (line
, '\0');
545 return (tok_line (tok
, &li
, argc
, argv
, NULL
, NULL
));