Break circular dependency between FIR dialect and utilities
[llvm-project.git] / flang / lib / Parser / token-parsers.h
blobff1ba334e73b4dc09d93f9129993f4dc8db92744
1 //===-- lib/Parser/token-parsers.h ------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
9 #ifndef FORTRAN_PARSER_TOKEN_PARSERS_H_
10 #define FORTRAN_PARSER_TOKEN_PARSERS_H_
12 // These parsers are driven by the parsers of the Fortran grammar to consume
13 // the prescanned character stream and recognize context-sensitive tokens.
15 #include "basic-parsers.h"
16 #include "type-parsers.h"
17 #include "flang/Common/idioms.h"
18 #include "flang/Parser/char-set.h"
19 #include "flang/Parser/characters.h"
20 #include "flang/Parser/instrumented-parser.h"
21 #include "flang/Parser/provenance.h"
22 #include <cctype>
23 #include <cstddef>
24 #include <cstring>
25 #include <functional>
26 #include <limits>
27 #include <list>
28 #include <optional>
29 #include <string>
31 namespace Fortran::parser {
33 // "xyz"_ch matches one instance of the characters x, y, or z without skipping
34 // any spaces before or after. The parser returns the location of the character
35 // on success.
36 class AnyOfChars {
37 public:
38 using resultType = const char *;
39 constexpr AnyOfChars(const AnyOfChars &) = default;
40 constexpr AnyOfChars(SetOfChars set) : set_{set} {}
41 std::optional<const char *> Parse(ParseState &state) const {
42 if (std::optional<const char *> at{state.PeekAtNextChar()}) {
43 if (set_.Has(**at)) {
44 state.UncheckedAdvance();
45 state.set_anyTokenMatched();
46 return at;
49 state.Say(MessageExpectedText{set_});
50 return std::nullopt;
53 private:
54 const SetOfChars set_;
57 constexpr AnyOfChars operator""_ch(const char str[], std::size_t n) {
58 return AnyOfChars{SetOfChars(str, n)};
61 constexpr auto letter{"abcdefghijklmnopqrstuvwxyz"_ch};
62 constexpr auto digit{"0123456789"_ch};
64 // Skips over optional spaces. Always succeeds.
65 struct Space {
66 using resultType = Success;
67 constexpr Space() {}
68 static std::optional<Success> Parse(ParseState &state) {
69 while (std::optional<const char *> p{state.PeekAtNextChar()}) {
70 if (**p != ' ') {
71 break;
73 state.UncheckedAdvance();
75 return {Success{}};
78 constexpr Space space;
80 // Skips a space that in free form requires a warning if it precedes a
81 // character that could begin an identifier or keyword. Always succeeds.
82 inline void MissingSpace(ParseState &state) {
83 if (!state.inFixedForm()) {
84 state.Nonstandard(
85 LanguageFeature::OptionalFreeFormSpace, "missing space"_port_en_US);
89 struct SpaceCheck {
90 using resultType = Success;
91 constexpr SpaceCheck() {}
92 static std::optional<Success> Parse(ParseState &state) {
93 if (std::optional<const char *> p{state.PeekAtNextChar()}) {
94 char ch{**p};
95 if (ch == ' ') {
96 state.UncheckedAdvance();
97 return space.Parse(state);
99 if (IsLegalInIdentifier(ch)) {
100 MissingSpace(state);
103 return {Success{}};
106 constexpr SpaceCheck spaceCheck;
108 // Matches a token string. Spaces in the token string denote where
109 // spaces may appear in the source; they can be made mandatory for
110 // some free form keyword sequences. Missing mandatory spaces in free
111 // form elicit a warning; they are not necessary for recognition.
112 // Spaces before and after the token are also skipped.
114 // Token strings appear in the grammar as C++ user-defined literals
115 // like "BIND ( C )"_tok and "SYNC ALL"_sptok. The _tok suffix is implied
116 // when a string literal appears before the sequencing operator >> or
117 // after the sequencing operator /. The literal "..."_id parses a
118 // token that cannot be a prefix of a longer identifier.
119 template <bool MandatoryFreeFormSpace = false, bool MustBeComplete = false>
120 class TokenStringMatch {
121 public:
122 using resultType = Success;
123 constexpr TokenStringMatch(const TokenStringMatch &) = default;
124 constexpr TokenStringMatch(const char *str, std::size_t n)
125 : str_{str}, bytes_{n} {}
126 explicit constexpr TokenStringMatch(const char *str) : str_{str} {}
127 std::optional<Success> Parse(ParseState &state) const {
128 space.Parse(state);
129 const char *start{state.GetLocation()};
130 const char *p{str_};
131 std::optional<const char *> at; // initially empty
132 for (std::size_t j{0}; j < bytes_ && *p != '\0'; ++j, ++p) {
133 bool spaceSkipping{*p == ' '};
134 if (spaceSkipping) {
135 if (j + 1 == bytes_ || p[1] == ' ' || p[1] == '\0') {
136 continue; // redundant; ignore
139 if (!at) {
140 at = nextCh.Parse(state);
141 if (!at) {
142 return std::nullopt;
145 if (spaceSkipping) {
146 if (**at == ' ') {
147 at = nextCh.Parse(state);
148 if (!at) {
149 return std::nullopt;
151 } else if constexpr (MandatoryFreeFormSpace) {
152 MissingSpace(state);
154 // 'at' remains full for next iteration
155 } else if (**at == ToLowerCaseLetter(*p)) {
156 at.reset();
157 } else {
158 state.Say(start, MessageExpectedText{str_, bytes_});
159 return std::nullopt;
162 if constexpr (MustBeComplete) {
163 if (auto after{state.PeekAtNextChar()}) {
164 if (IsLegalInIdentifier(**after)) {
165 state.Say(start, MessageExpectedText{str_, bytes_});
166 return std::nullopt;
170 state.set_anyTokenMatched();
171 if (IsLegalInIdentifier(p[-1])) {
172 return spaceCheck.Parse(state);
173 } else {
174 return space.Parse(state);
178 private:
179 const char *const str_;
180 const std::size_t bytes_{std::string::npos};
183 constexpr TokenStringMatch<> operator""_tok(const char str[], std::size_t n) {
184 return {str, n};
187 constexpr TokenStringMatch<true> operator""_sptok(
188 const char str[], std::size_t n) {
189 return {str, n};
192 constexpr TokenStringMatch<false, true> operator""_id(
193 const char str[], std::size_t n) {
194 return {str, n};
197 template <class PA>
198 inline constexpr std::enable_if_t<std::is_class_v<PA>,
199 SequenceParser<TokenStringMatch<>, PA>>
200 operator>>(const char *str, const PA &p) {
201 return SequenceParser<TokenStringMatch<>, PA>{TokenStringMatch<>{str}, p};
204 template <class PA>
205 inline constexpr std::enable_if_t<std::is_class_v<PA>,
206 FollowParser<PA, TokenStringMatch<>>>
207 operator/(const PA &p, const char *str) {
208 return FollowParser<PA, TokenStringMatch<>>{p, TokenStringMatch<>{str}};
// Wraps p so that it must appear between "(" and ")" tokens.
template <class PA> inline constexpr auto parenthesized(const PA &p) {
  return "(" >> (p / ")");
}
// Wraps p so that it must appear between "[" and "]" tokens.
template <class PA> inline constexpr auto bracketed(const PA &p) {
  return "[" >> (p / "]");
}
219 // Quoted character literal constants.
220 struct CharLiteralChar {
221 using resultType = std::pair<char, bool /* was escaped */>;
222 static std::optional<resultType> Parse(ParseState &state) {
223 auto at{state.GetLocation()};
224 if (std::optional<const char *> cp{nextCh.Parse(state)}) {
225 char ch{**cp};
226 if (ch == '\n') {
227 state.Say(CharBlock{at, state.GetLocation()},
228 "Unclosed character constant"_err_en_US);
229 return std::nullopt;
231 if (ch == '\\') {
232 // Most escape sequences in character literals are processed later,
233 // but we have to look for quotes here so that doubled quotes work.
234 if (std::optional<const char *> next{state.PeekAtNextChar()}) {
235 char escaped{**next};
236 if (escaped == '\'' || escaped == '"' || escaped == '\\') {
237 state.UncheckedAdvance();
238 return std::make_pair(escaped, true);
242 return std::make_pair(ch, false);
244 return std::nullopt;
248 template <char quote> struct CharLiteral {
249 using resultType = std::string;
250 static std::optional<std::string> Parse(ParseState &state) {
251 std::string str;
252 static constexpr auto nextch{attempt(CharLiteralChar{})};
253 while (auto ch{nextch.Parse(state)}) {
254 if (ch->second) {
255 str += '\\';
256 } else if (ch->first == quote) {
257 static constexpr auto doubled{attempt(AnyOfChars{SetOfChars{quote}})};
258 if (!doubled.Parse(state)) {
259 return str;
262 str += ch->first;
264 return std::nullopt;
268 // Parse "BOZ" binary literal quoted constants.
269 // As extensions, support X as an alternate hexadecimal marker, and allow
270 // BOZX markers to appear as suffixes.
271 struct BOZLiteral {
272 using resultType = std::string;
273 static std::optional<resultType> Parse(ParseState &state) {
274 char base{'\0'};
275 auto baseChar{[&base](char ch) -> bool {
276 switch (ch) {
277 case 'b':
278 case 'o':
279 case 'z':
280 base = ch;
281 return true;
282 case 'x':
283 base = 'z';
284 return true;
285 default:
286 return false;
290 space.Parse(state);
291 const char *start{state.GetLocation()};
292 std::optional<const char *> at{nextCh.Parse(state)};
293 if (!at) {
294 return std::nullopt;
296 if (**at == 'x' &&
297 !state.IsNonstandardOk(LanguageFeature::BOZExtensions,
298 "nonstandard BOZ literal"_port_en_US)) {
299 return std::nullopt;
301 if (baseChar(**at)) {
302 at = nextCh.Parse(state);
303 if (!at) {
304 return std::nullopt;
308 char quote = **at;
309 if (quote != '\'' && quote != '"') {
310 return std::nullopt;
313 std::string content;
314 while (true) {
315 at = nextCh.Parse(state);
316 if (!at) {
317 return std::nullopt;
319 if (**at == quote) {
320 break;
322 if (**at == ' ') {
323 continue;
325 if (!IsHexadecimalDigit(**at)) {
326 return std::nullopt;
328 content += ToLowerCaseLetter(**at);
331 if (!base) {
332 // extension: base allowed to appear as suffix, too
333 if (!(at = nextCh.Parse(state)) || !baseChar(**at) ||
334 !state.IsNonstandardOk(LanguageFeature::BOZExtensions,
335 "nonstandard BOZ literal"_port_en_US)) {
336 return std::nullopt;
338 spaceCheck.Parse(state);
341 if (content.empty()) {
342 state.Say(start, "no digit in BOZ literal"_err_en_US);
343 return std::nullopt;
345 return {std::string{base} + '"' + content + '"'};
349 // R711 digit-string -> digit [digit]...
350 // N.B. not a token -- no space is skipped
351 struct DigitString {
352 using resultType = CharBlock;
353 static std::optional<resultType> Parse(ParseState &state) {
354 if (std::optional<const char *> ch1{state.PeekAtNextChar()}) {
355 if (IsDecimalDigit(**ch1)) {
356 state.UncheckedAdvance();
357 while (std::optional<const char *> p{state.PeekAtNextChar()}) {
358 if (!IsDecimalDigit(**p)) {
359 break;
361 state.UncheckedAdvance();
363 return CharBlock{*ch1, state.GetLocation()};
366 return std::nullopt;
369 constexpr DigitString digitString;
371 struct SignedIntLiteralConstantWithoutKind {
372 using resultType = CharBlock;
373 static std::optional<resultType> Parse(ParseState &state) {
374 resultType result{state.GetLocation()};
375 static constexpr auto sign{maybe("+-"_ch / space)};
376 if (sign.Parse(state)) {
377 if (auto digits{digitString.Parse(state)}) {
378 result.ExtendToCover(*digits);
379 return result;
382 return std::nullopt;
386 struct DigitString64 {
387 using resultType = std::uint64_t;
388 static std::optional<std::uint64_t> Parse(ParseState &state) {
389 std::optional<const char *> firstDigit{digit.Parse(state)};
390 if (!firstDigit) {
391 return std::nullopt;
393 std::uint64_t value = **firstDigit - '0';
394 bool overflow{false};
395 static constexpr auto getDigit{attempt(digit)};
396 while (auto nextDigit{getDigit.Parse(state)}) {
397 if (value > std::numeric_limits<std::uint64_t>::max() / 10) {
398 overflow = true;
400 value *= 10;
401 int digitValue = **nextDigit - '0';
402 if (value > std::numeric_limits<std::uint64_t>::max() - digitValue) {
403 overflow = true;
405 value += digitValue;
407 if (overflow) {
408 state.Say(*firstDigit, "overflow in decimal literal"_err_en_US);
410 return {value};
413 constexpr DigitString64 digitString64;
415 // R707 signed-int-literal-constant -> [sign] int-literal-constant
416 // N.B. Spaces are consumed before and after the sign, since the sign
417 // and the int-literal-constant are distinct tokens. Does not
418 // handle a trailing kind parameter.
419 static std::optional<std::int64_t> SignedInteger(
420 const std::optional<std::uint64_t> &x, Location at, bool negate,
421 ParseState &state) {
422 if (!x) {
423 return std::nullopt;
425 std::uint64_t limit{std::numeric_limits<std::int64_t>::max()};
426 if (negate) {
427 limit = -(limit + 1);
429 if (*x > limit) {
430 state.Say(at, "overflow in signed decimal literal"_err_en_US);
432 std::int64_t value = *x;
433 return std::make_optional<std::int64_t>(negate ? -value : value);
436 // R710 signed-digit-string -> [sign] digit-string
437 // N.B. Not a complete token -- no space is skipped.
438 // Used only in the exponent parts of real literal constants.
439 struct SignedDigitString {
440 using resultType = std::int64_t;
441 static std::optional<std::int64_t> Parse(ParseState &state) {
442 std::optional<const char *> sign{state.PeekAtNextChar()};
443 if (!sign) {
444 return std::nullopt;
446 bool negate{**sign == '-'};
447 if (negate || **sign == '+') {
448 state.UncheckedAdvance();
450 return SignedInteger(digitString64.Parse(state), *sign, negate, state);
454 // Variants of the above for use in FORMAT specifications, where spaces
455 // must be ignored.
456 struct DigitStringIgnoreSpaces {
457 using resultType = std::uint64_t;
458 static std::optional<std::uint64_t> Parse(ParseState &state) {
459 static constexpr auto getFirstDigit{space >> digit};
460 std::optional<const char *> firstDigit{getFirstDigit.Parse(state)};
461 if (!firstDigit) {
462 return std::nullopt;
464 std::uint64_t value = **firstDigit - '0';
465 bool overflow{false};
466 static constexpr auto getDigit{space >> attempt(digit)};
467 while (auto nextDigit{getDigit.Parse(state)}) {
468 if (value > std::numeric_limits<std::uint64_t>::max() / 10) {
469 overflow = true;
471 value *= 10;
472 int digitValue = **nextDigit - '0';
473 if (value > std::numeric_limits<std::uint64_t>::max() - digitValue) {
474 overflow = true;
476 value += digitValue;
478 if (overflow) {
479 state.Say(*firstDigit, "overflow in decimal literal"_err_en_US);
481 return value;
485 struct PositiveDigitStringIgnoreSpaces {
486 using resultType = std::int64_t;
487 static std::optional<std::int64_t> Parse(ParseState &state) {
488 Location at{state.GetLocation()};
489 return SignedInteger(
490 DigitStringIgnoreSpaces{}.Parse(state), at, false /*positive*/, state);
494 struct SignedDigitStringIgnoreSpaces {
495 using resultType = std::int64_t;
496 static std::optional<std::int64_t> Parse(ParseState &state) {
497 static constexpr auto getSign{space >> attempt("+-"_ch)};
498 bool negate{false};
499 if (std::optional<const char *> sign{getSign.Parse(state)}) {
500 negate = **sign == '-';
502 Location at{state.GetLocation()};
503 return SignedInteger(
504 DigitStringIgnoreSpaces{}.Parse(state), at, negate, state);
508 // Legacy feature: Hollerith literal constants
509 struct HollerithLiteral {
510 using resultType = std::string;
511 static std::optional<std::string> Parse(ParseState &state) {
512 space.Parse(state);
513 const char *start{state.GetLocation()};
514 std::optional<std::uint64_t> charCount{
515 DigitStringIgnoreSpaces{}.Parse(state)};
516 if (!charCount || *charCount < 1) {
517 return std::nullopt;
519 static constexpr auto letterH{"h"_ch};
520 std::optional<const char *> h{letterH.Parse(state)};
521 if (!h) {
522 return std::nullopt;
524 std::string content;
525 for (auto j{*charCount}; j-- > 0;) {
526 int chBytes{UTF_8CharacterBytes(state.GetLocation())};
527 for (int bytes{chBytes}; bytes > 0; --bytes) {
528 if (std::optional<const char *> at{nextCh.Parse(state)}) {
529 if (chBytes == 1 && !std::isprint(**at)) {
530 state.Say(start, "Bad character in Hollerith"_err_en_US);
531 return std::nullopt;
533 content += **at;
534 } else {
535 state.Say(start, "Insufficient characters in Hollerith"_err_en_US);
536 return std::nullopt;
540 return content;
544 struct ConsumedAllInputParser {
545 using resultType = Success;
546 constexpr ConsumedAllInputParser() {}
547 static inline std::optional<Success> Parse(ParseState &state) {
548 if (state.IsAtEnd()) {
549 return {Success{}};
551 return std::nullopt;
554 constexpr ConsumedAllInputParser consumedAllInput;
556 template <char goal> struct SkipPast {
557 using resultType = Success;
558 constexpr SkipPast() {}
559 constexpr SkipPast(const SkipPast &) {}
560 static std::optional<Success> Parse(ParseState &state) {
561 while (std::optional<const char *> p{state.GetNextChar()}) {
562 if (**p == goal) {
563 return {Success{}};
566 return std::nullopt;
570 template <char goal> struct SkipTo {
571 using resultType = Success;
572 constexpr SkipTo() {}
573 constexpr SkipTo(const SkipTo &) {}
574 static std::optional<Success> Parse(ParseState &state) {
575 while (std::optional<const char *> p{state.PeekAtNextChar()}) {
576 if (**p == goal) {
577 return {Success{}};
579 state.UncheckedAdvance();
581 return std::nullopt;
585 // A common idiom in the Fortran grammar is an optional item (usually
586 // a nonempty comma-separated list) that, if present, must follow a comma
587 // and precede a doubled colon. When the item is absent, the comma must
588 // not appear, and the doubled colons are optional.
589 // [[, xyz] ::] is optionalBeforeColons(xyz)
590 // [[, xyz]... ::] is optionalBeforeColons(nonemptyList(xyz))
591 template <typename PA> inline constexpr auto optionalBeforeColons(const PA &p) {
592 using resultType = std::optional<typename PA::resultType>;
593 return "," >> construct<resultType>(p) / "::" ||
594 ("::"_tok || !","_tok) >> pure<resultType>();
596 template <typename PA>
597 inline constexpr auto optionalListBeforeColons(const PA &p) {
598 using resultType = std::list<typename PA::resultType>;
599 return "," >> nonemptyList(p) / "::" ||
600 ("::"_tok || !","_tok) >> pure<resultType>();
603 // Skip over empty lines, leading spaces, and some compiler directives (viz.,
604 // the ones that specify the source form) that might appear before the
605 // next statement. Skip over empty statements (bare semicolons) when
606 // not in strict standard conformance mode. Always succeeds.
607 struct SkipStuffBeforeStatement {
608 using resultType = Success;
609 static std::optional<Success> Parse(ParseState &state) {
610 if (UserState * ustate{state.userState()}) {
611 if (ParsingLog * log{ustate->log()}) {
612 // Save memory: vacate the parsing log before each statement unless
613 // we're logging the whole parse for debugging.
614 if (!ustate->instrumentedParse()) {
615 log->clear();
619 while (std::optional<const char *> at{state.PeekAtNextChar()}) {
620 if (**at == '\n' || **at == ' ') {
621 state.UncheckedAdvance();
622 } else if (**at == '!') {
623 static const char fixed[] = "!dir$ fixed\n", free[] = "!dir$ free\n";
624 static constexpr std::size_t fixedBytes{sizeof fixed - 1};
625 static constexpr std::size_t freeBytes{sizeof free - 1};
626 std::size_t remain{state.BytesRemaining()};
627 if (remain >= fixedBytes && std::memcmp(*at, fixed, fixedBytes) == 0) {
628 state.set_inFixedForm(true).UncheckedAdvance(fixedBytes);
629 } else if (remain >= freeBytes &&
630 std::memcmp(*at, free, freeBytes) == 0) {
631 state.set_inFixedForm(false).UncheckedAdvance(freeBytes);
632 } else {
633 break;
635 } else if (**at == ';' &&
636 state.IsNonstandardOk(
637 LanguageFeature::EmptyStatement, "empty statement"_port_en_US)) {
638 state.UncheckedAdvance();
639 } else {
640 break;
643 return {Success{}};
646 constexpr SkipStuffBeforeStatement skipStuffBeforeStatement;
648 // R602 underscore -> _
649 constexpr auto underscore{"_"_ch};
651 // Characters besides letters and digits that may appear in names.
652 // N.B. Don't accept an underscore if it is immediately followed by a
653 // quotation mark, so that kindParam_"character literal" is parsed properly.
654 // PGI and ifort accept '$' in identifiers, even as the initial character.
655 // Cray and gfortran accept '$', but not as the first character.
656 // Cray accepts '@' as well.
657 constexpr auto otherIdChar{underscore / !"'\""_ch ||
658 extension<LanguageFeature::PunctuationInNames>(
659 "nonstandard usage: punctuation in name"_port_en_US, "$@"_ch)};
661 constexpr auto logicalTRUE{
662 (".TRUE."_tok ||
663 extension<LanguageFeature::LogicalAbbreviations>(
664 "nonstandard usage: .T. spelling of .TRUE."_port_en_US,
665 ".T."_tok)) >>
666 pure(true)};
667 constexpr auto logicalFALSE{
668 (".FALSE."_tok ||
669 extension<LanguageFeature::LogicalAbbreviations>(
670 "nonstandard usage: .F. spelling of .FALSE."_port_en_US,
671 ".F."_tok)) >>
672 pure(false)};
674 // deprecated: Hollerith literals
675 constexpr auto rawHollerithLiteral{
676 deprecated<LanguageFeature::Hollerith>(HollerithLiteral{})};
678 template <typename A> constexpr decltype(auto) verbatim(A x) {
679 return sourced(construct<Verbatim>(x));
682 } // namespace Fortran::parser
683 #endif // FORTRAN_PARSER_TOKEN_PARSERS_H_