1 //===-- Format string parser for scanf -------------------------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
10 #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H
12 #include "src/__support/arg_list.h"
13 #include "src/__support/common.h"
14 #include "src/__support/ctype_utils.h"
15 #include "src/__support/str_to_integer.h"
16 #include "src/stdio/scanf_core/core_structs.h"
17 #include "src/stdio/scanf_core/scanf_config.h"
21 namespace LIBC_NAMESPACE
{
22 namespace scanf_core
{
// GET_ARG_VAL_SIMPLEST is the macro get_next_section uses to fetch an output
// pointer argument. One definition forwards the requested index to
// get_arg_value<arg_type>(index); the other ignores its second parameter and
// expands to get_next_arg_value<arg_type>().
24 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
25 #define GET_ARG_VAL_SIMPLEST(arg_type, index) get_arg_value<arg_type>(index)
27 #define GET_ARG_VAL_SIMPLEST(arg_type, _) get_next_arg_value<arg_type>()
28 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
// Parser steps through a scanf format string and returns one FormatSection per
// call to get_next_section(). ArgProvider is the type used to hold a position
// in the argument list (it is the type of args_start below).
30 template <typename ArgProvider
> class Parser
{
// The format string being parsed; initialized from the constructor's new_str.
31 const char *__restrict str
;
36 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
37 // args_start stores the start of the va_args, which is used when a previous
38 // argument is needed. In that case, we have to read the arguments from the
39 // beginning since they don't support reading backwards.
40 ArgProvider args_start
;
// args_index is what get_arg_value and args_to_index compare a requested
// index against. Argument indices start at 1, hence the initial value.
41 size_t args_index
= 1;
42 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
// Constructors. Both take the format string (new_str) and the argument list
// (args) that output pointers are read from.
45 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
// This overload initializes args_start from args in addition to args_cur, so
// that args_to_index can reset args_cur back to the first argument.
46 LIBC_INLINE
Parser(const char *__restrict new_str
, internal::ArgList
&args
)
47 : str(new_str
), args_cur(args
), args_start(args
) {}
// This overload initializes only str and args_cur.
49 LIBC_INLINE
Parser(const char *__restrict new_str
, internal::ArgList
&args
)
50 : str(new_str
), args_cur(args
) {}
51 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
53 // get_next_section will parse the format string until it has a fully
54 // specified format section. This can either be a raw format section with no
55 // conversion, or a format section with a conversion that has all of its
56 // variables stored in the format section.
//
// The returned FormatSection has has_conv set to say which kind it is, and its
// raw_string is built from str + starting_pos with length
// cur_pos - starting_pos, where starting_pos is cur_pos on entry.
57 LIBC_INLINE FormatSection
get_next_section() {
58 FormatSection section
;
59 size_t starting_pos
= cur_pos
;
// A '%' at the current position begins a conversion specification.
60 if (str
[cur_pos
] == '%') {
62 section
.has_conv
= true;
// conv_index defaults to 0. When index mode is enabled it is overwritten with
// the result of parse_index, which may also move cur_pos.
65 [[maybe_unused
]] size_t conv_index
= 0;
67 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
68 conv_index
= parse_index(&cur_pos
);
69 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
// A '*' sets the NO_WRITE flag, which causes the output pointer read further
// down to be skipped for this conversion.
71 if (str
[cur_pos
] == '*') {
73 section
.flags
= FormatFlags::NO_WRITE
;
// max_width defaults to -1. If a decimal digit is next, the number parsed by
// strtointeger replaces it and cur_pos moves forward by result.parsed_len.
77 section
.max_width
= -1;
78 if (internal::isdigit(str
[cur_pos
])) {
79 auto result
= internal::strtointeger
<int>(str
+ cur_pos
, 10);
80 section
.max_width
= result
.value
;
81 cur_pos
= cur_pos
+ result
.parsed_len
;
84 // TODO(michaelrj): add posix allocate flag support.
85 // if (str[cur_pos] == 'm') {
87 // section.flags = FormatFlags::ALLOCATE;
// The length modifier comes next; whatever character cur_pos is on afterwards
// is recorded as the conversion name.
90 LengthModifier lm
= parse_length_modifier(&cur_pos
);
91 section
.length_modifier
= lm
;
93 section
.conv_name
= str
[cur_pos
];
95 // If NO_WRITE is not set, then read the next arg as the output pointer.
96 if ((section
.flags
& FormatFlags::NO_WRITE
) == 0) {
97 // Since all outputs are pointers, there's no need to distinguish when
98 // reading from va_args. They're all the same size and stored the same.
99 section
.output_ptr
= GET_ARG_VAL_SIMPLEST(void *, conv_index
);
102 // If the end of the format section is on the '\0', then we must not
103 // advance cur_pos and must not count this as having a conversion.
105 if (str
[cur_pos
] != '\0') {
108 section
.has_conv
= false;
111 // If the format is a bracketed one, then we need to parse out the insides
// of the brackets into scan_set, which is stored on the section if the set
// turns out to be properly closed.
113 if (section
.conv_name
== '[') {
114 constexpr char CLOSING_BRACKET
= ']';
115 constexpr char INVERT_FLAG
= '^';
116 constexpr char RANGE_OPERATOR
= '-';
118 cpp::bitset
<256> scan_set
;
121 // The circumflex in the first position represents the inversion flag,
122 // but it's easier to apply that at the end so we just store it for now.
123 if (str
[cur_pos
] == INVERT_FLAG
) {
128 // This is used to determine if a hyphen is being used as a literal or
129 // as a range operator.
130 size_t set_start_pos
= cur_pos
;
132 // Normally the right bracket closes the set, but if it's the first
133 // character (possibly after the inversion flag) then it's instead
134 // included as a character in the set and the second right bracket
// is the one that ends the loop below.
136 if (str
[cur_pos
] == CLOSING_BRACKET
) {
137 scan_set
.set(CLOSING_BRACKET
);
// Walk the set contents until a closing bracket or the end of the string.
141 while (str
[cur_pos
] != '\0' && str
[cur_pos
] != CLOSING_BRACKET
) {
142 // If a hyphen is being used as a range operator, since it's neither
143 // at the beginning nor end of the set.
144 if (str
[cur_pos
] == RANGE_OPERATOR
&& cur_pos
!= set_start_pos
&&
145 str
[cur_pos
+ 1] != CLOSING_BRACKET
&& str
[cur_pos
+ 1] != '\0') {
146 // Technically there is no requirement to correct the ordering of
147 // the range, but since the range operator is entirely
148 // implementation defined it seems like a good convenience.
// a and b are the characters on either side of the hyphen; start/end are
// those two in ascending order.
// NOTE(review): a, b and str[cur_pos] are plain char. If char is signed,
// bytes >= 0x80 are negative here - confirm how cpp::bitset::set and
// set_range treat such values.
149 char a
= str
[cur_pos
- 1];
150 char b
= str
[cur_pos
+ 1];
151 char start
= (a
< b
? a
: b
);
152 char end
= (a
< b
? b
: a
);
153 scan_set
.set_range(start
, end
);
156 scan_set
.set(str
[cur_pos
]);
// The loop above stops on either ']' or '\0'; only stopping on ']' yields a
// usable scan set.
163 if (str
[cur_pos
] == CLOSING_BRACKET
) {
165 section
.scan_set
= scan_set
;
167 // if the end of the string was encountered, this is not a valid set.
168 section
.has_conv
= false;
// Raw text: has_conv is cleared and the loop condition holds until a '%' or
// the terminating '\0' is reached.
173 section
.has_conv
= false;
174 while (str
[cur_pos
] != '%' && str
[cur_pos
] != '\0')
// raw_string covers everything from starting_pos up to cur_pos.
177 section
.raw_string
= {str
+ starting_pos
, cur_pos
- starting_pos
};
182 // parse_length_modifier parses the length modifier inside a format string. It
183 // assumes that str[*local_pos] is inside a format specifier. It returns a
184 // LengthModifier with the length modifier it found. It will advance local_pos
185 // past the length modifier if one is found.
186 LIBC_INLINE LengthModifier
parse_length_modifier(size_t *local_pos
) {
// Dispatch on the character at *local_pos.
187 switch (str
[*local_pos
]) {
// Look one character ahead: a following 'l' selects ll, otherwise l.
189 if (str
[*local_pos
+ 1] == 'l') {
191 return LengthModifier::ll
;
194 return LengthModifier::l
;
// Look one character ahead: a following 'h' selects hh, otherwise h.
197 if (str
[*local_pos
+ 1] == 'h') {
199 return LengthModifier::hh
;
202 return LengthModifier::h
;
// The remaining modifiers return directly, with no look-ahead.
206 return LengthModifier::L
;
209 return LengthModifier::j
;
212 return LengthModifier::z
;
215 return LengthModifier::t
;
// Result used when no length modifier is recognized.
217 return LengthModifier::NONE
;
221 // get_next_arg_value gets the next value from the arg list as type T.
// It simply forwards to args_cur.next_var<T>() and returns the result.
222 template <class T
> LIBC_INLINE T
get_next_arg_value() {
223 return args_cur
.template next_var
<T
>();
226 //----------------------------------------------------
227 // INDEX MODE ONLY FUNCTIONS AFTER HERE:
228 //----------------------------------------------------
230 #ifndef LIBC_COPT_SCANF_DISABLE_INDEX_MODE
232 // parse_index parses the index of a value inside a format string. It
233 // assumes that str[*local_pos] points to the character after a '%', and
234 // returns 0 if there is no closing $, or if it finds no number. If it finds a
235 // number followed by a $, it will move local_pos past the $; otherwise it
// leaves local_pos where it was.
237 LIBC_INLINE
size_t parse_index(size_t *local_pos
) {
// Only a leading decimal digit can start an index.
238 if (internal::isdigit(str
[*local_pos
])) {
// Parse the digits as a base-10 int. result.value is the number, and
// result.parsed_len is used below as the count of characters it occupied.
239 auto result
= internal::strtointeger
<int>(str
+ *local_pos
, 10);
240 size_t index
= result
.value
;
// The character right after the digits must be '$' for this to be an index.
241 if (str
[*local_pos
+ result
.parsed_len
] != '$')
// Step over the digits plus one more character for the '$' itself.
243 *local_pos
= 1 + result
.parsed_len
+ *local_pos
;
249 // get_arg_value gets the value from the arg list at index (starting at 1).
250 // An index of 0 means "read the next argument" without repositioning.
// For any other index that differs from args_index, args_cur is first moved
// with args_to_index; the value itself is then read via get_next_arg_value.
252 template <class T
> LIBC_INLINE T
get_arg_value(size_t index
) {
// Reposition only when an explicit index was given and it is not already the
// current one.
253 if (!(index
== 0 || index
== args_index
))
254 args_to_index(index
);
257 return get_next_arg_value
<T
>();
260 // The ArgList can only return the next item in the list. This function is
261 // used in index mode when the item that needs to be read is not the next one.
262 // It moves args_cur to the index requested so the appropriate value may
263 // be read. If the requested index is behind args_index, args_cur is first
264 // reset to args_start; arguments are then skipped one at a time.
265 LIBC_INLINE
void args_to_index(size_t index
) {
// The list cannot be read backwards, so going to an earlier index means
// starting over from the saved beginning of the list.
266 if (args_index
> index
) {
268 args_cur
= args_start
;
// Skip forward while the requested index has not been reached.
271 while (args_index
< index
) {
272 // Since all arguments must be pointers, we can just read all of them as
273 // void * and not worry about type issues.
274 args_cur
.template next_var
<void *>();
279 #endif // LIBC_COPT_SCANF_DISABLE_INDEX_MODE
282 } // namespace scanf_core
283 } // namespace LIBC_NAMESPACE
285 #endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H