//===-- lib/Parser/characters.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "flang/Parser/characters.h"
#include "flang/Common/idioms.h"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <optional>
#include <string>
#include <type_traits>
namespace Fortran::parser {
// Global switch, initially off.  Its name indicates that it selects
// hexadecimal escape sequences; it is not read anywhere in this file.
bool useHexadecimalEscapeSequences{false};
// Returns the length in bytes (1 through 6) of the UTF-8 sequence whose
// lead byte is *p, judging only from the high-order bits of that byte.
// The continuation bytes are not examined.  Any lead byte that matches
// none of the one- to five-byte patterns is reported as 6.
int UTF_8CharacterBytes(const char *p) {
  if ((*p & 0x80) == 0) {
    return 1; // 0xxxxxxx
  } else if ((*p & 0xe0) == 0xc0) {
    return 2; // 110xxxxx
  } else if ((*p & 0xf0) == 0xe0) {
    return 3; // 1110xxxx
  } else if ((*p & 0xf8) == 0xf0) {
    return 4; // 11110xxx
  } else if ((*p & 0xfc) == 0xf8) {
    return 5; // 111110xx
  } else {
    return 6;
  }
}
36 template <typename STRING
>
37 std::string
QuoteCharacterLiteralHelper(
38 const STRING
&str
, bool backslashEscapes
, Encoding encoding
) {
39 std::string result
{'"'};
40 const auto emit
{[&](char ch
) { result
+= ch
; }};
42 using CharT
= std::decay_t
<decltype(ch
)>;
43 char32_t ch32
{static_cast<std::make_unsigned_t
<CharT
>>(ch
)};
44 if (ch32
== static_cast<unsigned char>('"')) {
45 emit('"'); // double the " when it appears in the text
47 EmitQuotedChar(ch32
, emit
, emit
, backslashEscapes
, encoding
);
53 std::string
QuoteCharacterLiteral(
54 const std::string
&str
, bool backslashEscapes
, Encoding encoding
) {
55 return QuoteCharacterLiteralHelper(str
, backslashEscapes
, encoding
);
58 std::string
QuoteCharacterLiteral(
59 const std::u16string
&str
, bool backslashEscapes
, Encoding encoding
) {
60 return QuoteCharacterLiteralHelper(str
, backslashEscapes
, encoding
);
63 std::string
QuoteCharacterLiteral(
64 const std::u32string
&str
, bool backslashEscapes
, Encoding encoding
) {
65 return QuoteCharacterLiteralHelper(str
, backslashEscapes
, encoding
);
68 template <> EncodedCharacter EncodeCharacter
<Encoding::LATIN_1
>(char32_t ucs
) {
70 EncodedCharacter result
;
71 result
.buffer
[0] = ucs
;
76 template <> EncodedCharacter EncodeCharacter
<Encoding::UTF_8
>(char32_t ucs
) {
77 // N.B. char32_t is unsigned
78 EncodedCharacter result
;
80 result
.buffer
[0] = ucs
;
82 } else if (ucs
<= 0x7ff) {
83 result
.buffer
[0] = 0xc0 | (ucs
>> 6);
84 result
.buffer
[1] = 0x80 | (ucs
& 0x3f);
86 } else if (ucs
<= 0xffff) {
87 result
.buffer
[0] = 0xe0 | (ucs
>> 12);
88 result
.buffer
[1] = 0x80 | ((ucs
>> 6) & 0x3f);
89 result
.buffer
[2] = 0x80 | (ucs
& 0x3f);
91 } else if (ucs
<= 0x1fffff) {
92 // UCS actually only goes up to 0x10ffff, but the
93 // UTF-8 encoding can handle 32 bits.
94 result
.buffer
[0] = 0xf0 | (ucs
>> 18);
95 result
.buffer
[1] = 0x80 | ((ucs
>> 12) & 0x3f);
96 result
.buffer
[2] = 0x80 | ((ucs
>> 6) & 0x3f);
97 result
.buffer
[3] = 0x80 | (ucs
& 0x3f);
99 } else if (ucs
<= 0x3ffffff) {
100 result
.buffer
[0] = 0xf8 | (ucs
>> 24);
101 result
.buffer
[1] = 0x80 | ((ucs
>> 18) & 0x3f);
102 result
.buffer
[2] = 0x80 | ((ucs
>> 12) & 0x3f);
103 result
.buffer
[3] = 0x80 | ((ucs
>> 6) & 0x3f);
104 result
.buffer
[4] = 0x80 | (ucs
& 0x3f);
107 result
.buffer
[0] = 0xfc | (ucs
>> 30);
108 result
.buffer
[1] = 0x80 | ((ucs
>> 24) & 0x3f);
109 result
.buffer
[2] = 0x80 | ((ucs
>> 18) & 0x3f);
110 result
.buffer
[3] = 0x80 | ((ucs
>> 12) & 0x3f);
111 result
.buffer
[4] = 0x80 | ((ucs
>> 6) & 0x3f);
112 result
.buffer
[5] = 0x80 | (ucs
& 0x3f);
118 EncodedCharacter
EncodeCharacter(Encoding encoding
, char32_t ucs
) {
120 SWITCH_COVERS_ALL_CASES
121 case Encoding::LATIN_1
:
122 return EncodeCharacter
<Encoding::LATIN_1
>(ucs
);
123 case Encoding::UTF_8
:
124 return EncodeCharacter
<Encoding::UTF_8
>(ucs
);
128 template <Encoding ENCODING
, typename STRING
>
129 std::string
EncodeString(const STRING
&str
) {
131 for (auto ch
: str
) {
132 char32_t uch
{static_cast<std::make_unsigned_t
<decltype(ch
)>>(ch
)};
133 EncodedCharacter encoded
{EncodeCharacter
<ENCODING
>(uch
)};
134 result
.append(encoded
.buffer
, static_cast<std::size_t>(encoded
.bytes
));
139 template std::string EncodeString
<Encoding::LATIN_1
, std::string
>(
140 const std::string
&);
141 template std::string EncodeString
<Encoding::UTF_8
, std::u16string
>(
142 const std::u16string
&);
143 template std::string EncodeString
<Encoding::UTF_8
, std::u32string
>(
144 const std::u32string
&);
147 DecodedCharacter DecodeRawCharacter
<Encoding::LATIN_1
>(
148 const char *cp
, std::size_t bytes
) {
150 return {*reinterpret_cast<const std::uint8_t *>(cp
), 1};
157 DecodedCharacter DecodeRawCharacter
<Encoding::UTF_8
>(
158 const char *cp
, std::size_t bytes
) {
159 auto p
{reinterpret_cast<const std::uint8_t *>(cp
)};
163 } else if ((ch
& 0xf8) == 0xf0 && bytes
>= 4 && ch
> 0xf0 &&
164 ((p
[1] | p
[2] | p
[3]) & 0xc0) == 0x80) {
165 ch
= ((ch
& 7) << 6) | (p
[1] & 0x3f);
166 ch
= (ch
<< 6) | (p
[2] & 0x3f);
167 ch
= (ch
<< 6) | (p
[3] & 0x3f);
169 } else if ((ch
& 0xf0) == 0xe0 && bytes
>= 3 && ch
> 0xe0 &&
170 ((p
[1] | p
[2]) & 0xc0) == 0x80) {
171 ch
= ((ch
& 0xf) << 6) | (p
[1] & 0x3f);
172 ch
= (ch
<< 6) | (p
[2] & 0x3f);
174 } else if ((ch
& 0xe0) == 0xc0 && bytes
>= 2 && ch
> 0xc0 &&
175 (p
[1] & 0xc0) == 0x80) {
176 ch
= ((ch
& 0x1f) << 6) | (p
[1] & 0x3f);
179 return {}; // not valid UTF-8
183 static DecodedCharacter
DecodeEscapedCharacter(
184 const char *cp
, std::size_t bytes
) {
185 if (cp
[0] == '\\' && bytes
>= 2) {
186 if (std::optional
<char> escChar
{BackslashEscapeValue(cp
[1])}) {
187 return {static_cast<unsigned char>(*escChar
), 2};
188 } else if (IsOctalDigit(cp
[1])) {
189 std::size_t maxLen
{std::min(std::size_t{4}, bytes
)};
190 char32_t code
{static_cast<char32_t
>(DecimalDigitValue(cp
[1]))};
191 std::size_t len
{2}; // so far
192 for (; code
<= 037 && len
< maxLen
&& IsOctalDigit(cp
[len
]); ++len
) {
193 code
= 8 * code
+ DecimalDigitValue(cp
[len
]);
195 return {code
, static_cast<int>(len
)};
196 } else if (bytes
>= 4 && ToLowerCaseLetter(cp
[1]) == 'x' &&
197 IsHexadecimalDigit(cp
[2]) && IsHexadecimalDigit(cp
[3])) {
198 return {static_cast<char32_t
>(16 * HexadecimalDigitValue(cp
[2]) +
199 HexadecimalDigitValue(cp
[3])),
201 } else if (IsLetter(cp
[1])) {
202 // Unknown escape - ignore the '\' (PGI compatibility)
203 return {static_cast<unsigned char>(cp
[1]), 2};
205 // Not an escape character.
209 return {static_cast<unsigned char>(cp
[0]), 1};
212 template <Encoding ENCODING
>
213 static DecodedCharacter
DecodeEscapedCharacters(
214 const char *cp
, std::size_t bytes
) {
215 char buffer
[EncodedCharacter::maxEncodingBytes
];
216 int count
[EncodedCharacter::maxEncodingBytes
];
217 std::size_t at
{0}, len
{0};
218 for (; len
< EncodedCharacter::maxEncodingBytes
&& at
< bytes
; ++len
) {
219 DecodedCharacter code
{DecodeEscapedCharacter(cp
+ at
, bytes
- at
)};
220 buffer
[len
] = code
.codepoint
;
224 DecodedCharacter code
{DecodeCharacter
<ENCODING
>(buffer
, len
, false)};
225 if (code
.bytes
> 0) {
226 code
.bytes
= count
[code
.bytes
- 1];
228 code
.codepoint
= buffer
[0] & 0xff;
229 code
.bytes
= count
[0];
234 template <Encoding ENCODING
>
235 DecodedCharacter
DecodeCharacter(
236 const char *cp
, std::size_t bytes
, bool backslashEscapes
) {
237 if (backslashEscapes
&& bytes
>= 2 && *cp
== '\\') {
238 if (ENCODING
== Encoding::UTF_8
&& bytes
>= 6 &&
239 ToLowerCaseLetter(cp
[1]) == 'u' && IsHexadecimalDigit(cp
[2]) &&
240 IsHexadecimalDigit(cp
[3]) && IsHexadecimalDigit(cp
[4]) &&
241 IsHexadecimalDigit(cp
[5])) {
243 static_cast<char32_t
>(4096 * HexadecimalDigitValue(cp
[2]) +
244 256 * HexadecimalDigitValue(cp
[3]) +
245 16 * HexadecimalDigitValue(cp
[4]) + HexadecimalDigitValue(cp
[5])),
247 if (bytes
>= 10 && IsHexadecimalDigit(cp
[6]) &&
248 IsHexadecimalDigit(cp
[7]) && IsHexadecimalDigit(cp
[8]) &&
249 IsHexadecimalDigit(cp
[9])) {
251 (4096 * HexadecimalDigitValue(cp
[6]) +
252 256 * HexadecimalDigitValue(cp
[7]) +
253 16 * HexadecimalDigitValue(cp
[8]) +
254 HexadecimalDigitValue(cp
[9])),
260 return DecodeEscapedCharacters
<ENCODING
>(cp
, bytes
);
263 return DecodeRawCharacter
<ENCODING
>(cp
, bytes
);
267 template DecodedCharacter DecodeCharacter
<Encoding::LATIN_1
>(
268 const char *, std::size_t, bool);
269 template DecodedCharacter DecodeCharacter
<Encoding::UTF_8
>(
270 const char *, std::size_t, bool);
272 DecodedCharacter
DecodeCharacter(Encoding encoding
, const char *cp
,
273 std::size_t bytes
, bool backslashEscapes
) {
275 SWITCH_COVERS_ALL_CASES
276 case Encoding::LATIN_1
:
277 return DecodeCharacter
<Encoding::LATIN_1
>(cp
, bytes
, backslashEscapes
);
278 case Encoding::UTF_8
:
279 return DecodeCharacter
<Encoding::UTF_8
>(cp
, bytes
, backslashEscapes
);
283 template <typename RESULT
, Encoding ENCODING
>
284 RESULT
DecodeString(const std::string
&s
, bool backslashEscapes
) {
286 const char *p
{s
.c_str()};
287 for (auto bytes
{s
.size()}; bytes
!= 0;) {
288 DecodedCharacter decoded
{
289 DecodeCharacter
<ENCODING
>(p
, bytes
, backslashEscapes
)};
290 if (decoded
.bytes
> 0) {
291 if (static_cast<std::size_t>(decoded
.bytes
) <= bytes
) {
292 result
.append(1, decoded
.codepoint
);
293 bytes
-= decoded
.bytes
;
298 result
.append(1, static_cast<uint8_t>(*p
));
305 template std::string DecodeString
<std::string
, Encoding::LATIN_1
>(
306 const std::string
&, bool);
307 template std::u16string DecodeString
<std::u16string
, Encoding::UTF_8
>(
308 const std::string
&, bool);
309 template std::u32string DecodeString
<std::u32string
, Encoding::UTF_8
>(
310 const std::string
&, bool);
} // namespace Fortran::parser