1 //===-- runtime/utf.cpp ---------------------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
11 namespace Fortran::runtime
{
// Lookup table indexed by the first byte of an encoded sequence.  Each entry
// is the total number of bytes in the sequence that the byte introduces, or
// 0 when the byte cannot begin a sequence.
// clang-format off
const std::uint8_t UTF8FirstByteTable[256]{
  /* 00 - 7F:  7 bit payload in single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 80 - BF:  invalid first byte, valid later byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* C0 - DF:  11 bit payload */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* E0 - EF:  16 bit payload */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* F0 - F7:  21 bit payload */ 4, 4, 4, 4, 4, 4, 4, 4,
  /* F8 - FB:  26 bit payload */ 5, 5, 5, 5,
  /* FC - FD:  31 bit payload */ 6, 6,
  /* FE:       32 bit payload */ 7,
  /* FF:       invalid */ 0
};
// clang-format on
42 // Non-minimal encodings are accepted.
43 std::optional
<char32_t
> DecodeUTF8(const char *p0
) {
44 const std::uint8_t *p
{reinterpret_cast<const std::uint8_t *>(p0
)};
45 std::size_t bytes
{MeasureUTF8Bytes(*p0
)};
48 } else if (bytes
> 1) {
49 std::uint64_t result
{char32_t
{*p
} & (0x7f >> bytes
)};
50 for (std::size_t j
{1}; j
< bytes
; ++j
) {
51 std::uint8_t next
{p
[j
]};
52 if (next
< 0x80 || next
> 0xbf) {
55 result
= (result
<< 6) | (next
& 0x3f);
57 if (result
<= 0xffffffff) {
58 return static_cast<char32_t
>(result
);
// Encodes ucs into the buffer at p0 using the shortest of the one- to
// seven-byte forms that can hold it, and returns the number of bytes written.
// No terminator is appended.  The buffer must have room for up to 7 bytes,
// which is what a value above 0x7fffffff requires.
std::size_t EncodeUTF8(char *p0, char32_t ucs) {
  std::uint8_t *p{reinterpret_cast<std::uint8_t *>(p0)};
  if (ucs <= 0x7f) { // 7 bit payload
    p[0] = ucs;
    return 1;
  } else if (ucs <= 0x7ff) { // 11 bit payload
    p[0] = 0xc0 | (ucs >> 6);
    p[1] = 0x80 | (ucs & 0x3f);
    return 2;
  } else if (ucs <= 0xffff) { // 16 bit payload
    p[0] = 0xe0 | (ucs >> 12);
    p[1] = 0x80 | ((ucs >> 6) & 0x3f);
    p[2] = 0x80 | (ucs & 0x3f);
    return 3;
  } else if (ucs <= 0x1fffff) { // 21 bit payload
    p[0] = 0xf0 | (ucs >> 18);
    p[1] = 0x80 | ((ucs >> 12) & 0x3f);
    p[2] = 0x80 | ((ucs >> 6) & 0x3f);
    p[3] = 0x80 | (ucs & 0x3f);
    return 4;
  } else if (ucs <= 0x3ffffff) { // 26 bit payload
    p[0] = 0xf8 | (ucs >> 24);
    p[1] = 0x80 | ((ucs >> 18) & 0x3f);
    p[2] = 0x80 | ((ucs >> 12) & 0x3f);
    p[3] = 0x80 | ((ucs >> 6) & 0x3f);
    p[4] = 0x80 | (ucs & 0x3f);
    return 5;
  } else if (ucs <= 0x7fffffff) { // 31 bit payload
    // The six-byte form is introduced by 0xFC / 0xFD; the 0xF8 marker
    // belongs to the five-byte form and would make a decoder stop one byte
    // short.
    p[0] = 0xfc | (ucs >> 30);
    p[1] = 0x80 | ((ucs >> 24) & 0x3f);
    p[2] = 0x80 | ((ucs >> 18) & 0x3f);
    p[3] = 0x80 | ((ucs >> 12) & 0x3f);
    p[4] = 0x80 | ((ucs >> 6) & 0x3f);
    p[5] = 0x80 | (ucs & 0x3f);
    return 6;
  } else { // 32 bit payload
    // 0xFE has no payload bits of its own; everything is carried by the six
    // continuation bytes that follow.
    p[0] = 0xfe;
    p[1] = 0x80 | ((ucs >> 30) & 0x3f);
    p[2] = 0x80 | ((ucs >> 24) & 0x3f);
    p[3] = 0x80 | ((ucs >> 18) & 0x3f);
    p[4] = 0x80 | ((ucs >> 12) & 0x3f);
    p[5] = 0x80 | ((ucs >> 6) & 0x3f);
    p[6] = 0x80 | (ucs & 0x3f);
    return 7;
  }
}
111 } // namespace Fortran::runtime