lexer: Add comments and update style in lex_source_get__().
[pspp.git] / src / language / lexer / lexer.c
blobea81f19559926a37fc47e69efe7c464a82384909
1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 #include <config.h>
19 #include "language/lexer/lexer.h"
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <limits.h>
24 #include <math.h>
25 #include <stdarg.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <unictype.h>
29 #include <unistd.h>
30 #include <unistr.h>
31 #include <uniwidth.h>
33 #include "language/command.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/ll.h"
42 #include "libpspp/message.h"
43 #include "libpspp/misc.h"
44 #include "libpspp/str.h"
45 #include "libpspp/u8-istream.h"
46 #include "output/journal.h"
47 #include "output/text-item.h"
49 #include "gl/c-ctype.h"
50 #include "gl/minmax.h"
51 #include "gl/xalloc.h"
52 #include "gl/xmemdup0.h"
54 #include "gettext.h"
55 #define _(msgid) gettext (msgid)
56 #define N_(msgid) msgid
58 /* A token within a lex_source. */
59 struct lex_token
61 /* The regular token information. */
62 struct token token;
64 /* Location of token in terms of the lex_source's buffer.
65 src->tail <= line_pos <= token_pos <= src->head. */
66 size_t token_pos; /* Start of token. */
67 size_t token_len; /* Length of source for token in bytes. */
68 size_t line_pos; /* Start of line containing token_pos. */
69 int first_line; /* Line number at token_pos. */
72 /* A source of tokens, corresponding to a syntax file.
74 This is conceptually a lex_reader wrapped with everything needed to convert
75 its UTF-8 bytes into tokens. */
76 struct lex_source
78 struct ll ll; /* In lexer's list of sources. */
79 struct lex_reader *reader;
80 struct segmenter segmenter;
81 bool eof; /* True if T_STOP was read from 'reader'. */
83 /* Buffer of UTF-8 bytes. */
84 char *buffer;
85 size_t allocated; /* Number of bytes allocated. */
86 size_t tail; /* &buffer[0] offset into UTF-8 source. */
87 size_t head; /* &buffer[head - tail] offset into source. */
89 /* Positions in source file, tail <= pos <= head for each member here. */
90 size_t journal_pos; /* First byte not yet output to journal. */
91 size_t seg_pos; /* First byte not yet scanned as token. */
92 size_t line_pos; /* First byte of line containing seg_pos. */
94 int n_newlines; /* Number of new-lines up to seg_pos. */
95 bool suppress_next_newline;
97 /* Tokens. */
98 struct deque deque; /* Indexes into 'tokens'. */
99 struct lex_token *tokens; /* Lookahead tokens for parser. */
102 static struct lex_source *lex_source_create (struct lex_reader *);
103 static void lex_source_destroy (struct lex_source *);
105 /* Lexer. */
106 struct lexer
108 struct ll_list sources; /* Contains "struct lex_source"s. */
111 static struct lex_source *lex_source__ (const struct lexer *);
112 static const struct lex_token *lex_next__ (const struct lexer *, int n);
113 static void lex_source_push_endcmd__ (struct lex_source *);
115 static void lex_source_pop__ (struct lex_source *);
116 static bool lex_source_get__ (const struct lex_source *);
117 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
118 const char *format, va_list)
119 PRINTF_FORMAT (4, 0);
120 static const struct lex_token *lex_source_next__ (const struct lex_source *,
121 int n);
123 /* Initializes READER with the specified CLASS and otherwise some reasonable
124 defaults. The caller should fill in the others members as desired. */
125 void
126 lex_reader_init (struct lex_reader *reader,
127 const struct lex_reader_class *class)
129 reader->class = class;
130 reader->syntax = LEX_SYNTAX_AUTO;
131 reader->error = LEX_ERROR_CONTINUE;
132 reader->file_name = NULL;
133 reader->encoding = NULL;
134 reader->line_number = 0;
137 /* Frees any file name already in READER and replaces it by a copy of
138 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
139 void
140 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
142 free (reader->file_name);
143 reader->file_name = file_name != NULL ? xstrdup (file_name) : NULL;
146 /* Creates and returns a new lexer. */
147 struct lexer *
148 lex_create (void)
150 struct lexer *lexer = xzalloc (sizeof *lexer);
151 ll_init (&lexer->sources);
152 return lexer;
155 /* Destroys LEXER. */
156 void
157 lex_destroy (struct lexer *lexer)
159 if (lexer != NULL)
161 struct lex_source *source, *next;
163 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
164 lex_source_destroy (source);
165 free (lexer);
169 /* Inserts READER into LEXER so that the next token read by LEXER comes from
170 READER. Before the caller, LEXER must either be empty or at a T_ENDCMD
171 token. */
172 void
173 lex_include (struct lexer *lexer, struct lex_reader *reader)
175 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
176 ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
179 /* Appends READER to LEXER, so that it will be read after all other current
180 readers have already been read. */
181 void
182 lex_append (struct lexer *lexer, struct lex_reader *reader)
184 ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
187 /* Advancing. */
189 static struct lex_token *
190 lex_push_token__ (struct lex_source *src)
192 struct lex_token *token;
194 if (deque_is_full (&src->deque))
195 src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
197 token = &src->tokens[deque_push_front (&src->deque)];
198 token_init (&token->token);
199 return token;
202 static void
203 lex_source_pop__ (struct lex_source *src)
205 token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
208 static void
209 lex_source_pop_front (struct lex_source *src)
211 token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
214 /* Advances LEXER to the next token, consuming the current token. */
215 void
216 lex_get (struct lexer *lexer)
218 struct lex_source *src;
220 src = lex_source__ (lexer);
221 if (src == NULL)
222 return;
224 if (!deque_is_empty (&src->deque))
225 lex_source_pop__ (src);
227 while (deque_is_empty (&src->deque))
228 if (!lex_source_get__ (src))
230 lex_source_destroy (src);
231 src = lex_source__ (lexer);
232 if (src == NULL)
233 return;
237 /* Issuing errors. */
/* Reports a syntax error at the current token.  FORMAT, if non-null, is a
   printf-style message built from the arguments that follow it. */
void
lex_error (struct lexer *lexer, const char *format, ...)
{
  va_list ap;

  va_start (ap, format);
  lex_next_error_valist (lexer, 0, 0, format, ap);
  va_end (ap);
}
/* Same as lex_error(), except that the format arguments arrive as the
   va_list ARGS.  Forwards to lex_next_error_valist() for the token range
   0...0, that is, the current token. */
void
lex_error_valist (struct lexer *lexer, const char *format, va_list args)
{
  const int current = 0;

  lex_next_error_valist (lexer, current, current, format, args);
}
/* Reports a syntax error for the tokens numbered N0 through N1, counted from
   the current token.  FORMAT, if non-null, is a printf-style message built
   from the arguments that follow it. */
void
lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
{
  va_list ap;

  va_start (ap, format);
  lex_next_error_valist (lexer, n0, n1, format, ap);
  va_end (ap);
}
/* Prints a syntax error message saying that OPTION0 or one of the other
   strings following it, up to the first NULL, is expected.

   At most MAX_OPTIONS (8) options are reported; any beyond that are
   ignored.  If OPTION0 itself is NULL, a bare syntax error is reported. */
void
lex_error_expecting (struct lexer *lexer, const char *option0, ...)
{
  enum { MAX_OPTIONS = 8 };
  const char *options[MAX_OPTIONS + 1];
  va_list args;
  int n;

  /* Collect options until the terminating NULL or until MAX_OPTIONS have
     been gathered.  The array has one spare slot so that the terminator
     (or an ignored extra option) can be stored after the last counted
     option. */
  va_start (args, option0);
  options[0] = option0;
  n = 0;
  while (n < MAX_OPTIONS && options[n] != NULL)
    options[++n] = va_arg (args, const char *);
  va_end (args);

  switch (n)
    {
    case 0:
      lex_error (lexer, NULL);
      break;

    case 1:
      lex_error (lexer, _("expecting %s"), options[0]);
      break;

    case 2:
      lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
      break;

    case 3:
      lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
                 options[2]);
      break;

    case 4:
      lex_error (lexer, _("expecting %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3]);
      break;

    case 5:
      lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4]);
      break;

    case 6:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5]);
      break;

    case 7:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5], options[6]);
      break;

    case 8:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5], options[6], options[7]);
      break;

    default:
      NOT_REACHED ();
    }
}
340 /* Reports an error to the effect that subcommand SBC may only be specified
341 once.
343 This function does not take a lexer as an argument or use lex_error(),
344 because the result would ordinarily just be redundant: "Syntax error at
345 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
346 not help the user find the error. */
347 void
348 lex_sbc_only_once (const char *sbc)
350 msg (SE, _("Subcommand %s may only be specified once."), sbc);
353 /* Reports an error to the effect that subcommand SBC is missing.
355 This function does not take a lexer as an argument or use lex_error(),
356 because a missing subcommand can normally be detected only after the whole
357 command has been parsed, and so lex_error() would always report "Syntax
358 error at end of command", which does not help the user find the error. */
359 void
360 lex_sbc_missing (const char *sbc)
362 msg (SE, _("Required subcommand %s was not specified."), sbc);
/* Issues a syntax error at the current token saying that specification SPEC
   may appear only once within subcommand SBC. */
void
lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
{
  lex_error (lexer,
             _("%s may only be specified once within subcommand %s"),
             spec, sbc);
}
/* Reports an error to the effect that specification SPEC is missing within
   subcommand SBC.

   The message names the specification first and the subcommand second, so
   SPEC must be passed before SBC to the format. */
void
lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
{
  lex_error (lexer, _("Required %s specification missing from %s subcommand"),
             spec, sbc);
}
383 /* Prints a syntax error message containing the current token and
384 given message MESSAGE (if non-null). */
385 void
386 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
387 const char *format, va_list args)
389 struct lex_source *src = lex_source__ (lexer);
391 if (src != NULL)
392 lex_source_error_valist (src, n0, n1, format, args);
393 else
395 struct string s;
397 ds_init_empty (&s);
398 ds_put_format (&s, _("Syntax error at end of input"));
399 if (format != NULL)
401 ds_put_cstr (&s, ": ");
402 ds_put_vformat (&s, format, args);
404 ds_put_byte (&s, '.');
405 msg (SE, "%s", ds_cstr (&s));
406 ds_destroy (&s);
410 /* Checks that we're at end of command.
411 If so, returns a successful command completion code.
412 If not, flags a syntax error and returns an error command
413 completion code. */
415 lex_end_of_command (struct lexer *lexer)
417 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
419 lex_error (lexer, _("expecting end of command"));
420 return CMD_FAILURE;
422 else
423 return CMD_SUCCESS;
426 /* Token testing functions. */
/* Tells whether LEXER's current token is a number. */
bool
lex_is_number (struct lexer *lexer)
{
  bool is_number = lex_next_is_number (lexer, 0);

  return is_number;
}
/* Tells whether LEXER's current token is a string. */
bool
lex_is_string (struct lexer *lexer)
{
  bool is_string = lex_next_is_string (lexer, 0);

  return is_string;
}
/* Yields the numeric value of LEXER's current token.  The current token
   must be a number. */
double
lex_number (struct lexer *lexer)
{
  double value = lex_next_number (lexer, 0);

  return value;
}
/* Tells whether LEXER's current token is an integer. */
bool
lex_is_integer (struct lexer *lexer)
{
  bool is_integer = lex_next_is_integer (lexer, 0);

  return is_integer;
}
/* Yields the integer value of LEXER's current token.  The current token must
   be an integer. */
long
lex_integer (struct lexer *lexer)
{
  long value = lex_next_integer (lexer, 0);

  return value;
}
465 /* Token testing functions with lookahead.
467 A value of 0 for N as an argument to any of these functions refers to the
468 current token. Lookahead is limited to the current command. Any N greater
469 than the number of tokens remaining in the current command will be treated
470 as referring to a T_ENDCMD token. */
472 /* Returns true if the token N ahead of the current token is a number. */
473 bool
474 lex_next_is_number (struct lexer *lexer, int n)
476 enum token_type next_token = lex_next_token (lexer, n);
477 return next_token == T_POS_NUM || next_token == T_NEG_NUM;
480 /* Returns true if the token N ahead of the current token is a string. */
481 bool
482 lex_next_is_string (struct lexer *lexer, int n)
484 return lex_next_token (lexer, n) == T_STRING;
/* Yields the numeric value of the token N ahead of the current one.  That
   token must be a number (this is asserted). */
double
lex_next_number (struct lexer *lexer, int n)
{
  double value;

  assert (lex_next_is_number (lexer, n));
  value = lex_next_tokval (lexer, n);
  return value;
}
/* Returns true if the token N ahead of the current token is an integer, that
   is, a number with no fractional part whose value lies strictly between
   LONG_MIN and LONG_MAX + 1, so that lex_next_integer() can convert it to
   'long' without overflow. */
bool
lex_next_is_integer (struct lexer *lexer, int n)
{
  double value;

  if (!lex_next_is_number (lexer, n))
    return false;

  value = lex_next_tokval (lexer, n);

  /* The upper bound is written as -(double) LONG_MIN rather than LONG_MAX
     because LONG_MAX need not be exactly representable as a double: with a
     64-bit 'long' it rounds up to 2**63, which "value <= LONG_MAX" would
     accept even though it does not fit in a 'long'.  -(double) LONG_MIN is
     an exact power of 2, and since VALUE must also be integral, "value <
     LONG_MAX + 1" accepts exactly the values up to LONG_MAX. */
  return (value > LONG_MIN
          && value < -(double) LONG_MIN
          && floor (value) == value);
}
/* Yields, as a 'long', the value of the token N ahead of the current one.
   That token must be an integer (this is asserted). */
long
lex_next_integer (struct lexer *lexer, int n)
{
  long value;

  assert (lex_next_is_integer (lexer, n));
  value = lex_next_tokval (lexer, n);
  return value;
}
518 /* Token matching functions. */
520 /* If the current token has the specified TYPE, skips it and returns true.
521 Otherwise, returns false. */
522 bool
523 lex_match (struct lexer *lexer, enum token_type type)
525 if (lex_token (lexer) == type)
527 lex_get (lexer);
528 return true;
530 else
531 return false;
/* Consumes the current token and returns true if it matches IDENTIFIER,
   which may be abbreviated to its first three letters.  Otherwise returns
   false.

   IDENTIFIER must be an ASCII string. */
bool
lex_match_id (struct lexer *lexer, const char *identifier)
{
  const size_t min_abbrev = 3;

  return lex_match_id_n (lexer, identifier, min_abbrev);
}
545 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
546 may be abbreviated to its first N letters. Otherwise, returns false.
548 IDENTIFIER must be an ASCII string. */
549 bool
550 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
552 if (lex_token (lexer) == T_ID
553 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
555 lex_get (lexer);
556 return true;
558 else
559 return false;
/* Consumes the current token and returns true if it is an integer equal to
   X.  Otherwise returns false. */
bool
lex_match_int (struct lexer *lexer, int x)
{
  if (!lex_is_integer (lexer) || lex_integer (lexer) != x)
    return false;

  lex_get (lexer);
  return true;
}
576 /* Forced matches. */
578 /* If this token is IDENTIFIER, skips it and returns true. IDENTIFIER may be
579 abbreviated to its first 3 letters. Otherwise, reports an error and returns
580 false.
582 IDENTIFIER must be an ASCII string. */
583 bool
584 lex_force_match_id (struct lexer *lexer, const char *identifier)
586 if (lex_match_id (lexer, identifier))
587 return true;
588 else
590 lex_error_expecting (lexer, identifier, NULL_SENTINEL);
591 return false;
595 /* If the current token has the specified TYPE, skips it and returns true.
596 Otherwise, reports an error and returns false. */
597 bool
598 lex_force_match (struct lexer *lexer, enum token_type type)
600 if (lex_token (lexer) == type)
602 lex_get (lexer);
603 return true;
605 else
607 char *s = xasprintf ("`%s'", token_type_to_string (type));
608 lex_error_expecting (lexer, s, NULL_SENTINEL);
609 free (s);
610 return false;
/* Returns true, without consuming anything, if the current token is a
   string.  Otherwise reports an error and returns false. */
bool
lex_force_string (struct lexer *lexer)
{
  bool ok = lex_is_string (lexer);

  if (!ok)
    lex_error (lexer, _("expecting string"));
  return ok;
}
628 /* If the current token is a string or an identifier, does nothing and returns
629 true. Otherwise, reports an error and returns false.
631 This is meant for use in syntactic situations where we want to encourage the
632 user to supply a quoted string, but for compatibility we also accept
633 identifiers. (One example of such a situation is file names.) Therefore,
634 the error message issued when the current token is wrong only says that a
635 string is expected and doesn't mention that an identifier would also be
636 accepted. */
637 bool
638 lex_force_string_or_id (struct lexer *lexer)
640 return lex_is_integer (lexer) || lex_force_string (lexer);
/* Returns true, without consuming anything, if the current token is an
   integer.  Otherwise reports an error and returns false. */
bool
lex_force_int (struct lexer *lexer)
{
  bool ok = lex_is_integer (lexer);

  if (!ok)
    lex_error (lexer, _("expecting integer"));
  return ok;
}
/* Returns true, without consuming anything, if the current token is a
   number.  Otherwise reports an error and returns false. */
bool
lex_force_num (struct lexer *lexer)
{
  bool ok = lex_is_number (lexer);

  if (!ok)
    lex_error (lexer, _("expecting number"));
  return ok;
}
669 /* If the current token is an identifier, does nothing and returns true.
670 Otherwise, reports an error and returns false. */
671 bool
672 lex_force_id (struct lexer *lexer)
674 if (lex_token (lexer) == T_ID)
675 return true;
677 lex_error (lexer, _("expecting identifier"));
678 return false;
681 /* Token accessors. */
683 /* Returns the type of LEXER's current token. */
684 enum token_type
685 lex_token (const struct lexer *lexer)
687 return lex_next_token (lexer, 0);
/* Yields the number held by LEXER's current token.

   The value is meaningful only for T_NEG_NUM and T_POS_NUM tokens.  For any
   other token this function will always return zero. */
double
lex_tokval (const struct lexer *lexer)
{
  double value = lex_next_tokval (lexer, 0);

  return value;
}
/* Yields the null-terminated, UTF-8 encoded string held by LEXER's current
   token.

   The string is meaningful only for T_ID and T_STRING tokens.  For any other
   token this function will always return NULL.

   The UTF-8 encoding of the result is right for variable names and other
   identifiers.  Use filename_to_utf8() to use it as a filename.  Use
   data_in() to use it in a "union value". */
const char *
lex_tokcstr (const struct lexer *lexer)
{
  const char *cstr = lex_next_tokcstr (lexer, 0);

  return cstr;
}
714 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
715 null-terminated (but the null terminator is not included in the returned
716 substring's 'length').
718 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
719 this functions this function will always return NULL.
721 The UTF-8 encoding of the returned string is correct for variable names and
722 other identifiers. Use filename_to_utf8() to use it as a filename. Use
723 data_in() to use it in a "union value". */
724 struct substring
725 lex_tokss (const struct lexer *lexer)
727 return lex_next_tokss (lexer, 0);
730 /* Looking ahead.
732 A value of 0 for N as an argument to any of these functions refers to the
733 current token. Lookahead is limited to the current command. Any N greater
734 than the number of tokens remaining in the current command will be treated
735 as referring to a T_ENDCMD token. */
737 static const struct lex_token *
738 lex_next__ (const struct lexer *lexer_, int n)
740 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
741 struct lex_source *src = lex_source__ (lexer);
743 if (src != NULL)
744 return lex_source_next__ (src, n);
745 else
747 static const struct lex_token stop_token =
748 { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
750 return &stop_token;
754 static const struct lex_token *
755 lex_source_next__ (const struct lex_source *src, int n)
757 while (deque_count (&src->deque) <= n)
759 if (!deque_is_empty (&src->deque))
761 struct lex_token *front;
763 front = &src->tokens[deque_front (&src->deque, 0)];
764 if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
765 return front;
768 lex_source_get__ (src);
771 return &src->tokens[deque_back (&src->deque, n)];
774 /* Returns the "struct token" of the token N after the current one in LEXER.
775 The returned pointer can be invalidated by pretty much any succeeding call
776 into the lexer, although the string pointer within the returned token is
777 only invalidated by consuming the token (e.g. with lex_get()). */
778 const struct token *
779 lex_next (const struct lexer *lexer, int n)
781 return &lex_next__ (lexer, n)->token;
784 /* Returns the type of the token N after the current one in LEXER. */
785 enum token_type
786 lex_next_token (const struct lexer *lexer, int n)
788 return lex_next (lexer, n)->type;
791 /* Returns the number in the tokn N after the current one in LEXER.
793 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
794 tokens this function will always return zero. */
795 double
796 lex_next_tokval (const struct lexer *lexer, int n)
798 const struct token *token = lex_next (lexer, n);
799 return token->number;
802 /* Returns the null-terminated string in the token N after the current one, in
803 UTF-8 encoding.
805 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
806 this functions this function will always return NULL.
808 The UTF-8 encoding of the returned string is correct for variable names and
809 other identifiers. Use filename_to_utf8() to use it as a filename. Use
810 data_in() to use it in a "union value". */
811 const char *
812 lex_next_tokcstr (const struct lexer *lexer, int n)
814 return lex_next_tokss (lexer, n).string;
817 /* Returns the string in the token N after the current one, in UTF-8 encoding.
818 The string is null-terminated (but the null terminator is not included in
819 the returned substring's 'length').
821 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
822 this functions this function will always return NULL.
824 The UTF-8 encoding of the returned string is correct for variable names and
825 other identifiers. Use filename_to_utf8() to use it as a filename. Use
826 data_in() to use it in a "union value". */
827 struct substring
828 lex_next_tokss (const struct lexer *lexer, int n)
830 return lex_next (lexer, n)->string;
833 static bool
834 lex_tokens_match (const struct token *actual, const struct token *expected)
836 if (actual->type != expected->type)
837 return false;
839 switch (actual->type)
841 case T_POS_NUM:
842 case T_NEG_NUM:
843 return actual->number == expected->number;
845 case T_ID:
846 return lex_id_match (expected->string, actual->string);
848 case T_STRING:
849 return (actual->string.length == expected->string.length
850 && !memcmp (actual->string.string, expected->string.string,
851 actual->string.length));
853 default:
854 return true;
858 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
859 skips it and returns true. Otherwise, returns false.
861 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
862 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
863 first three letters. */
864 bool
865 lex_match_phrase (struct lexer *lexer, const char *s)
867 struct string_lexer slex;
868 struct token token;
869 int i;
871 i = 0;
872 string_lexer_init (&slex, s, SEG_MODE_INTERACTIVE);
873 while (string_lexer_next (&slex, &token))
874 if (token.type != SCAN_SKIP)
876 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
877 token_destroy (&token);
878 if (!match)
879 return false;
882 while (i-- > 0)
883 lex_get (lexer);
884 return true;
887 static int
888 lex_source_get_first_line_number (const struct lex_source *src, int n)
890 return lex_source_next__ (src, n)->first_line;
/* Returns the number of new-line ('\n') bytes among the LENGTH bytes that
   start at S.  S is only read, never modified. */
static int
count_newlines (const char *s, size_t length)
{
  int n_newlines = 0;
  const char *newline;

  while ((newline = memchr (s, '\n', length)) != NULL)
    {
      n_newlines++;

      /* Continue the search just past the new-line that was found. */
      length -= (newline + 1) - s;
      s = newline + 1;
    }

  return n_newlines;
}
909 static int
910 lex_source_get_last_line_number (const struct lex_source *src, int n)
912 const struct lex_token *token = lex_source_next__ (src, n);
914 if (token->first_line == 0)
915 return 0;
916 else
918 char *token_str = &src->buffer[token->token_pos - src->tail];
919 return token->first_line + count_newlines (token_str, token->token_len) + 1;
923 static int
924 count_columns (const char *s_, size_t length)
926 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
927 int columns;
928 size_t ofs;
929 int mblen;
931 columns = 0;
932 for (ofs = 0; ofs < length; ofs += mblen)
934 ucs4_t uc;
936 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
937 if (uc != '\t')
939 int width = uc_width (uc, "UTF-8");
940 if (width > 0)
941 columns += width;
943 else
944 columns = ROUND_UP (columns + 1, 8);
947 return columns + 1;
950 static int
951 lex_source_get_first_column (const struct lex_source *src, int n)
953 const struct lex_token *token = lex_source_next__ (src, n);
954 return count_columns (&src->buffer[token->line_pos - src->tail],
955 token->token_pos - token->line_pos);
958 static int
959 lex_source_get_last_column (const struct lex_source *src, int n)
961 const struct lex_token *token = lex_source_next__ (src, n);
962 char *start, *end, *newline;
964 start = &src->buffer[token->line_pos - src->tail];
965 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
966 newline = memrchr (start, '\n', end - start);
967 if (newline != NULL)
968 start = newline + 1;
969 return count_columns (start, end - start);
/* Yields the 1-based line number at which the syntax for the token N after
   the current one in LEXER starts.  Yields 0 for a T_STOP token or if the
   token comes from a source that has no line numbers. */
int
lex_get_first_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;
  return lex_source_get_first_line_number (src, n);
}
/* Yields the 1-based line number at which the syntax for the token N after
   the current one in LEXER ends, plus 1.  Yields 0 for a T_STOP token or if
   the token comes from a source that has no line numbers.

   A single token usually lies wholly within one line of syntax.  There are
   two exceptions: a T_STRING token can be built from several segments on
   adjacent lines joined by "+" punctuators, and a T_NEG_NUM token can be a
   "-" on one line followed by the number on the next. */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;
  return lex_source_get_last_line_number (src, n);
}
/* Yields the 1-based column number at which the syntax for the token N after
   the current one in LEXER starts.  Yields 0 for a T_STOP token.

   Columns are measured by character width as displayed in a typical
   fixed-width font, where CJK characters have width 2 and combining
   characters have width 0. */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;
  return lex_source_get_first_column (src, n);
}
/* Yields the 1-based column number at which the syntax for the token N after
   the current one in LEXER ends, plus 1.  Yields 0 for a T_STOP token.

   Columns are measured by character width as displayed in a typical
   fixed-width font, where CJK characters have width 2 and combining
   characters have width 0. */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;
  return lex_source_get_last_column (src, n);
}
1027 /* Returns the name of the syntax file from which the current command is drawn.
1028 Returns NULL for a T_STOP token or if the command's source does not have
1029 line numbers.
1031 There is no version of this function that takes an N argument because
1032 lookahead only works to the end of a command and any given command is always
1033 within a single syntax file. */
1034 const char *
1035 lex_get_file_name (const struct lexer *lexer)
1037 struct lex_source *src = lex_source__ (lexer);
1038 return src == NULL ? NULL : src->reader->file_name;
1041 const char *
1042 lex_get_encoding (const struct lexer *lexer)
1044 struct lex_source *src = lex_source__ (lexer);
1045 return src == NULL ? NULL : src->reader->encoding;
1049 /* Returns the syntax mode for the syntax file from which the current drawn is
1050 drawn. Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
1051 source does not have line numbers.
1053 There is no version of this function that takes an N argument because
1054 lookahead only works to the end of a command and any given command is always
1055 within a single syntax file. */
1056 enum lex_syntax_mode
1057 lex_get_syntax_mode (const struct lexer *lexer)
1059 struct lex_source *src = lex_source__ (lexer);
1060 return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
1063 /* Returns the error mode for the syntax file from which the current drawn is
1064 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1065 source does not have line numbers.
1067 There is no version of this function that takes an N argument because
1068 lookahead only works to the end of a command and any given command is always
1069 within a single syntax file. */
1070 enum lex_error_mode
1071 lex_get_error_mode (const struct lexer *lexer)
1073 struct lex_source *src = lex_source__ (lexer);
1074 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1077 /* If the source that LEXER is currently reading has error mode
1078 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1079 token to be read comes directly from whatever is next read from the stream.
1081 It makes sense to call this function after encountering an error in a
1082 command entered on the console, because usually the user would prefer not to
1083 have cascading errors. */
1084 void
1085 lex_interactive_reset (struct lexer *lexer)
1087 struct lex_source *src = lex_source__ (lexer);
1088 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1090 src->head = src->tail = 0;
1091 src->journal_pos = src->seg_pos = src->line_pos = 0;
1092 src->n_newlines = 0;
1093 src->suppress_next_newline = false;
1094 segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1095 while (!deque_is_empty (&src->deque))
1096 lex_source_pop__ (src);
1097 lex_source_push_endcmd__ (src);
1101 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1102 void
1103 lex_discard_rest_of_command (struct lexer *lexer)
1105 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1106 lex_get (lexer);
1109 /* Discards all lookahead tokens in LEXER, then discards all input sources
1110 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1111 runs out of input sources. */
1112 void
1113 lex_discard_noninteractive (struct lexer *lexer)
1115 struct lex_source *src = lex_source__ (lexer);
1117 if (src != NULL)
1119 while (!deque_is_empty (&src->deque))
1120 lex_source_pop__ (src);
1122 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1123 src = lex_source__ (lexer))
1124 lex_source_destroy (src);
1128 static size_t
1129 lex_source_max_tail__ (const struct lex_source *src)
1131 const struct lex_token *token;
1132 size_t max_tail;
1134 assert (src->seg_pos >= src->line_pos);
1135 max_tail = MIN (src->journal_pos, src->line_pos);
1137 /* Use the oldest token also. (We know that src->deque cannot be empty
1138 because we are in the process of adding a new token, which is already
1139 initialized enough to use here.) */
1140 token = &src->tokens[deque_back (&src->deque, 0)];
1141 assert (token->token_pos >= token->line_pos);
1142 max_tail = MIN (max_tail, token->line_pos);
1144 return max_tail;
1147 static void
1148 lex_source_expand__ (struct lex_source *src)
1150 if (src->head - src->tail >= src->allocated)
1152 size_t max_tail = lex_source_max_tail__ (src);
1153 if (max_tail > src->tail)
1155 /* Advance the tail, freeing up room at the head. */
1156 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1157 src->head - max_tail);
1158 src->tail = max_tail;
1160 else
1162 /* Buffer is completely full. Expand it. */
1163 src->buffer = x2realloc (src->buffer, &src->allocated);
1166 else
1168 /* There's space available at the head of the buffer. Nothing to do. */
1172 static void
1173 lex_source_read__ (struct lex_source *src)
1177 size_t head_ofs;
1178 size_t space;
1179 size_t n;
1181 lex_source_expand__ (src);
1183 head_ofs = src->head - src->tail;
1184 space = src->allocated - head_ofs;
1185 n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1186 space,
1187 segmenter_get_prompt (&src->segmenter));
1188 assert (n <= space);
1190 if (n == 0)
1192 /* End of input.
1194 Ensure that the input always ends in a new-line followed by a null
1195 byte, as required by the segmenter library. */
1197 if (src->head == src->tail
1198 || src->buffer[src->head - src->tail - 1] != '\n')
1199 src->buffer[src->head++ - src->tail] = '\n';
1201 lex_source_expand__ (src);
1202 src->buffer[src->head++ - src->tail] = '\0';
1204 return;
1207 src->head += n;
1209 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1210 src->head - src->seg_pos));
1213 static struct lex_source *
1214 lex_source__ (const struct lexer *lexer)
1216 return (ll_is_empty (&lexer->sources) ? NULL
1217 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1220 static struct substring
1221 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1223 const struct lex_token *token0 = lex_source_next__ (src, n0);
1224 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1225 size_t start = token0->token_pos;
1226 size_t end = token1->token_pos + token1->token_len;
1228 return ss_buffer (&src->buffer[start - src->tail], end - start);
1231 static void
1232 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1234 size_t out_maxlen;
1235 size_t out_len;
1236 int mblen;
1238 assert (out_size >= 16);
1239 out_maxlen = out_size - (in.length >= out_size ? 3 : 0) - 1;
1240 for (out_len = 0; out_len < in.length; out_len += mblen)
1242 if (in.string[out_len] == '\n'
1243 || (in.string[out_len] == '\r'
1244 && out_len + 1 < in.length
1245 && in.string[out_len + 1] == '\n'))
1246 break;
1248 mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1249 in.length - out_len);
1250 if (out_len + mblen > out_maxlen)
1251 break;
1254 memcpy (out, in.string, out_len);
1255 strcpy (&out[out_len], out_len < in.length ? "..." : "");
1258 static void
1259 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1260 const char *format, va_list args)
1262 const struct lex_token *token;
1263 struct string s;
1264 struct msg m;
1266 ds_init_empty (&s);
1268 token = lex_source_next__ (src, n0);
1269 if (token->token.type == T_ENDCMD)
1270 ds_put_cstr (&s, _("Syntax error at end of command"));
1271 else
1273 struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1274 if (!ss_is_empty (syntax))
1276 char syntax_cstr[64];
1278 lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1279 ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1281 else
1282 ds_put_cstr (&s, _("Syntax error"));
1285 if (format)
1287 ds_put_cstr (&s, ": ");
1288 ds_put_vformat (&s, format, args);
1290 ds_put_byte (&s, '.');
1292 m.category = MSG_C_SYNTAX;
1293 m.severity = MSG_S_ERROR;
1294 m.file_name = src->reader->file_name;
1295 m.first_line = lex_source_get_first_line_number (src, n0);
1296 m.last_line = lex_source_get_last_line_number (src, n1);
1297 m.first_column = lex_source_get_first_column (src, n0);
1298 m.last_column = lex_source_get_last_column (src, n1);
1299 m.text = ds_steal_cstr (&s);
1300 msg_emit (&m);
1303 static void PRINTF_FORMAT (2, 3)
1304 lex_get_error (struct lex_source *src, const char *format, ...)
1306 va_list args;
1307 int n;
1309 va_start (args, format);
1311 n = deque_count (&src->deque) - 1;
1312 lex_source_error_valist (src, n, n, format, args);
1313 lex_source_pop_front (src);
1315 va_end (args);
1318 /* Attempts to append an additional token into SRC's deque, reading more from
1319 the underlying lex_reader if necessary.. Returns true if successful, false
1320 if the deque already represents (a suffix of) the whole lex_reader's
1321 contents, */
1322 static bool
1323 lex_source_get__ (const struct lex_source *src_)
1325 struct lex_source *src = CONST_CAST (struct lex_source *, src_);
1326 if (src->eof)
1327 return false;
1329 /* State maintained while scanning tokens. Usually we only need a single
1330 state, but scanner_push() can return SCAN_SAVE to indicate that the state
1331 needs to be saved and possibly restored later with SCAN_BACK. */
1332 struct state
1334 struct segmenter segmenter;
1335 enum segment_type last_segment;
1336 int newlines; /* Number of newlines encountered so far. */
1337 /* Maintained here so we can update lex_source's similar members when we
1338 finish. */
1339 size_t line_pos;
1340 size_t seg_pos;
1343 /* Initialize state. */
1344 struct state state =
1346 .segmenter = src->segmenter,
1347 .newlines = 0,
1348 .seg_pos = src->seg_pos,
1349 .line_pos = src->line_pos,
1351 struct state saved = state;
1353 /* Append a new token to SRC and initialize it. */
1354 struct lex_token *token = lex_push_token__ (src);
1355 struct scanner scanner;
1356 scanner_init (&scanner, &token->token);
1357 token->line_pos = src->line_pos;
1358 token->token_pos = src->seg_pos;
1359 if (src->reader->line_number > 0)
1360 token->first_line = src->reader->line_number + src->n_newlines;
1361 else
1362 token->first_line = 0;
1364 /* Extract segments and pass them through the scanner until we obtain a
1365 token. */
1366 for (;;)
1368 /* Extract a segment. */
1369 const char *segment = &src->buffer[state.seg_pos - src->tail];
1370 size_t seg_maxlen = src->head - state.seg_pos;
1371 enum segment_type type;
1372 int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
1373 &type);
1374 if (seg_len < 0)
1376 /* The segmenter needs more input to produce a segment. */
1377 lex_source_read__ (src);
1378 continue;
1381 /* Update state based on the segment. */
1382 state.last_segment = type;
1383 state.seg_pos += seg_len;
1384 if (type == SEG_NEWLINE)
1386 state.newlines++;
1387 state.line_pos = state.seg_pos;
1390 /* Pass the segment into the scanner and try to get a token out. */
1391 enum scan_result result = scanner_push (&scanner, type,
1392 ss_buffer (segment, seg_len),
1393 &token->token);
1394 if (result == SCAN_SAVE)
1395 saved = state;
1396 else if (result == SCAN_BACK)
1398 state = saved;
1399 break;
1401 else if (result == SCAN_DONE)
1402 break;
1405 /* If we've reached the end of a line, or the end of a command, then pass
1406 the line to the output engine as a syntax text item. */
1407 int n_lines = state.newlines;
1408 if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
1410 n_lines++;
1411 src->suppress_next_newline = true;
1413 else if (n_lines > 0 && src->suppress_next_newline)
1415 n_lines--;
1416 src->suppress_next_newline = false;
1418 for (int i = 0; i < n_lines; i++)
1420 const char *line = &src->buffer[src->journal_pos - src->tail];
1421 const char *newline = rawmemchr (line, '\n');
1422 size_t line_len = newline - line;
1423 if (line_len > 0 && line[line_len - 1] == '\r')
1424 line_len--;
1426 char *syntax = malloc (line_len + 2);
1427 memcpy (syntax, line, line_len);
1428 syntax[line_len] = '\n';
1429 syntax[line_len + 1] = '\0';
1431 text_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX, syntax));
1433 src->journal_pos += newline - line + 1;
1436 token->token_len = state.seg_pos - src->seg_pos;
1438 src->segmenter = state.segmenter;
1439 src->seg_pos = state.seg_pos;
1440 src->line_pos = state.line_pos;
1441 src->n_newlines += state.newlines;
1443 switch (token->token.type)
1445 default:
1446 break;
1448 case T_STOP:
1449 token->token.type = T_ENDCMD;
1450 src->eof = true;
1451 break;
1453 case SCAN_BAD_HEX_LENGTH:
1454 lex_get_error (src, _("String of hex digits has %d characters, which "
1455 "is not a multiple of 2"),
1456 (int) token->token.number);
1457 break;
1459 case SCAN_BAD_HEX_DIGIT:
1460 case SCAN_BAD_UNICODE_DIGIT:
1461 lex_get_error (src, _("`%c' is not a valid hex digit"),
1462 (int) token->token.number);
1463 break;
1465 case SCAN_BAD_UNICODE_LENGTH:
1466 lex_get_error (src, _("Unicode string contains %d bytes, which is "
1467 "not in the valid range of 1 to 8 bytes"),
1468 (int) token->token.number);
1469 break;
1471 case SCAN_BAD_UNICODE_CODE_POINT:
1472 lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
1473 (int) token->token.number);
1474 break;
1476 case SCAN_EXPECTED_QUOTE:
1477 lex_get_error (src, _("Unterminated string constant"));
1478 break;
1480 case SCAN_EXPECTED_EXPONENT:
1481 lex_get_error (src, _("Missing exponent following `%s'"),
1482 token->token.string.string);
1483 break;
1485 case SCAN_UNEXPECTED_DOT:
1486 lex_get_error (src, _("Unexpected `.' in middle of command"));
1487 break;
1489 case SCAN_UNEXPECTED_CHAR:
1491 char c_name[16];
1492 lex_get_error (src, _("Bad character %s in input"),
1493 uc_name (token->token.number, c_name));
1495 break;
1497 case SCAN_SKIP:
1498 lex_source_pop_front (src);
1499 break;
1502 return true;
1505 static void
1506 lex_source_push_endcmd__ (struct lex_source *src)
1508 struct lex_token *token = lex_push_token__ (src);
1509 token->token.type = T_ENDCMD;
1510 token->token_pos = 0;
1511 token->token_len = 0;
1512 token->line_pos = 0;
1513 token->first_line = 0;
1516 static struct lex_source *
1517 lex_source_create (struct lex_reader *reader)
1519 struct lex_source *src;
1520 enum segmenter_mode mode;
1522 src = xzalloc (sizeof *src);
1523 src->reader = reader;
1525 if (reader->syntax == LEX_SYNTAX_AUTO)
1526 mode = SEG_MODE_AUTO;
1527 else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1528 mode = SEG_MODE_INTERACTIVE;
1529 else if (reader->syntax == LEX_SYNTAX_BATCH)
1530 mode = SEG_MODE_BATCH;
1531 else
1532 NOT_REACHED ();
1533 segmenter_init (&src->segmenter, mode);
1535 src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1537 lex_source_push_endcmd__ (src);
1539 return src;
1542 static void
1543 lex_source_destroy (struct lex_source *src)
1545 char *file_name = src->reader->file_name;
1546 char *encoding = src->reader->encoding;
1547 if (src->reader->class->destroy != NULL)
1548 src->reader->class->destroy (src->reader);
1549 free (file_name);
1550 free (encoding);
1551 free (src->buffer);
1552 while (!deque_is_empty (&src->deque))
1553 lex_source_pop__ (src);
1554 free (src->tokens);
1555 ll_remove (&src->ll);
1556 free (src);
1559 struct lex_file_reader
1561 struct lex_reader reader;
1562 struct u8_istream *istream;
1565 static struct lex_reader_class lex_file_reader_class;
1567 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1568 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
1569 ENCODING, which should take one of the forms accepted by
1570 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
1571 mode of the new reader, respectively.
1573 Returns a null pointer if FILE_NAME cannot be opened. */
1574 struct lex_reader *
1575 lex_reader_for_file (const char *file_name, const char *encoding,
1576 enum lex_syntax_mode syntax,
1577 enum lex_error_mode error)
1579 struct lex_file_reader *r;
1580 struct u8_istream *istream;
1582 istream = (!strcmp(file_name, "-")
1583 ? u8_istream_for_fd (encoding, STDIN_FILENO)
1584 : u8_istream_for_file (encoding, file_name, O_RDONLY));
1585 if (istream == NULL)
1587 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1588 return NULL;
1591 r = xmalloc (sizeof *r);
1592 lex_reader_init (&r->reader, &lex_file_reader_class);
1593 r->reader.syntax = syntax;
1594 r->reader.error = error;
1595 r->reader.file_name = xstrdup (file_name);
1596 r->reader.encoding = encoding ? xstrdup (encoding) : NULL;
1597 r->reader.line_number = 1;
1598 r->istream = istream;
1600 return &r->reader;
1603 static struct lex_file_reader *
1604 lex_file_reader_cast (struct lex_reader *r)
1606 return UP_CAST (r, struct lex_file_reader, reader);
1609 static size_t
1610 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1611 enum prompt_style prompt_style UNUSED)
1613 struct lex_file_reader *r = lex_file_reader_cast (r_);
1614 ssize_t n_read = u8_istream_read (r->istream, buf, n);
1615 if (n_read < 0)
1617 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
1618 return 0;
1620 return n_read;
1623 static void
1624 lex_file_close (struct lex_reader *r_)
1626 struct lex_file_reader *r = lex_file_reader_cast (r_);
1628 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1630 if (u8_istream_close (r->istream) != 0)
1631 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
1633 else
1634 u8_istream_free (r->istream);
1636 free (r);
1639 static struct lex_reader_class lex_file_reader_class =
1641 lex_file_read,
1642 lex_file_close
1645 struct lex_string_reader
1647 struct lex_reader reader;
1648 struct substring s;
1649 size_t offset;
1652 static struct lex_reader_class lex_string_reader_class;
1654 /* Creates and returns a new lex_reader for the contents of S, which must be
1655 encoded in the given ENCODING. The new reader takes ownership of S and will free it
1656 with ss_dealloc() when it is closed. */
1657 struct lex_reader *
1658 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
1660 struct lex_string_reader *r;
1662 r = xmalloc (sizeof *r);
1663 lex_reader_init (&r->reader, &lex_string_reader_class);
1664 r->reader.syntax = LEX_SYNTAX_AUTO;
1665 r->reader.encoding = encoding ? xstrdup (encoding) : NULL;
1666 r->s = s;
1667 r->offset = 0;
1669 return &r->reader;
1672 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1673 which must be encoded in ENCODING. The caller retains ownership of S. */
1674 struct lex_reader *
1675 lex_reader_for_string (const char *s, const char *encoding)
1677 struct substring ss;
1678 ss_alloc_substring (&ss, ss_cstr (s));
1679 return lex_reader_for_substring_nocopy (ss, encoding);
/* Formats FORMAT as a printf()-like format string, using the arguments that
   follow ENCODING, and creates and returns a new lex_reader for the formatted
   result.  ENCODING is passed along as the encoding of the new reader. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list args;
  va_start (args, encoding);
  char *formatted = xvasprintf (format, args);
  struct lex_reader *reader
    = lex_reader_for_substring_nocopy (ss_cstr (formatted), encoding);
  va_end (args);

  return reader;
}
1697 static struct lex_string_reader *
1698 lex_string_reader_cast (struct lex_reader *r)
1700 return UP_CAST (r, struct lex_string_reader, reader);
1703 static size_t
1704 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1705 enum prompt_style prompt_style UNUSED)
1707 struct lex_string_reader *r = lex_string_reader_cast (r_);
1708 size_t chunk;
1710 chunk = MIN (n, r->s.length - r->offset);
1711 memcpy (buf, r->s.string + r->offset, chunk);
1712 r->offset += chunk;
1714 return chunk;
1717 static void
1718 lex_string_close (struct lex_reader *r_)
1720 struct lex_string_reader *r = lex_string_reader_cast (r_);
1722 ss_dealloc (&r->s);
1723 free (r);
1726 static struct lex_reader_class lex_string_reader_class =
1728 lex_string_read,
1729 lex_string_close