flang/lib/Parser/characters.cpp

   1 //===-- lib/Parser/characters.cpp -----------------------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #include "flang/Parser/characters.h"
  10 #include "flang/Common/idioms.h"
  11 #include <algorithm>
  12 #include <cstddef>
  13 #include <optional>
  14 #include <type_traits>
  15
  16 namespace Fortran::parser {
  17
// Global, mutable option; off by default.  Nothing in this file reads it.
// NOTE(review): presumably consulted by the quoted-character emission code
// declared in flang/Parser/characters.h to choose hexadecimal escape
// sequences -- confirm against that header.
bool useHexadecimalEscapeSequences{false};
  19
  20 int UTF_8CharacterBytes(const char *p) {
  21   if ((*p & 0x80) == 0) {
  22     return 1;
  23   } else if ((*p & 0xe0) == 0xc0) {
  24     return 2;
  25   } else if ((*p & 0xf0) == 0xe0) {
  26     return 3;
  27   } else if ((*p & 0xf8) == 0xf0) {
  28     return 4;
  29   } else if ((*p & 0xfc) == 0xf8) {
  30     return 5;
  31   } else {
  32     return 6;
  33   }
  34 }
  35
  36 template <typename STRING>
  37 std::string QuoteCharacterLiteralHelper(
  38     const STRING &str, bool backslashEscapes, Encoding encoding) {
  39   std::string result{'"'};
  40   const auto emit{[&](char ch) { result += ch; }};
  41   for (auto ch : str) {
  42     using CharT = std::decay_t<decltype(ch)>;
  43     char32_t ch32{static_cast<std::make_unsigned_t<CharT>>(ch)};
  44     if (ch32 == static_cast<unsigned char>('"')) {
  45       emit('"'); // double the " when it appears in the text
  46     }
  47     EmitQuotedChar(ch32, emit, emit, backslashEscapes, encoding);
  48   }
  49   result += '"';
  50   return result;
  51 }
  52
  53 std::string QuoteCharacterLiteral(
  54     const std::string &str, bool backslashEscapes, Encoding encoding) {
  55   return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
  56 }
  57
  58 std::string QuoteCharacterLiteral(
  59     const std::u16string &str, bool backslashEscapes, Encoding encoding) {
  60   return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
  61 }
  62
  63 std::string QuoteCharacterLiteral(
  64     const std::u32string &str, bool backslashEscapes, Encoding encoding) {
  65   return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
  66 }
  67
  68 template <> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t ucs) {
  69   CHECK(ucs <= 0xff);
  70   EncodedCharacter result;
  71   result.buffer[0] = ucs;
  72   result.bytes = 1;
  73   return result;
  74 }
  75
  76 template <> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t ucs) {
  77   // N.B. char32_t is unsigned
  78   EncodedCharacter result;
  79   if (ucs <= 0x7f) {
  80     result.buffer[0] = ucs;
  81     result.bytes = 1;
  82   } else if (ucs <= 0x7ff) {
  83     result.buffer[0] = 0xc0 | (ucs >> 6);
  84     result.buffer[1] = 0x80 | (ucs & 0x3f);
  85     result.bytes = 2;
  86   } else if (ucs <= 0xffff) {
  87     result.buffer[0] = 0xe0 | (ucs >> 12);
  88     result.buffer[1] = 0x80 | ((ucs >> 6) & 0x3f);
  89     result.buffer[2] = 0x80 | (ucs & 0x3f);
  90     result.bytes = 3;
  91   } else if (ucs <= 0x1fffff) {
  92     // UCS actually only goes up to 0x10ffff, but the
  93     // UTF-8 encoding can handle 32 bits.
  94     result.buffer[0] = 0xf0 | (ucs >> 18);
  95     result.buffer[1] = 0x80 | ((ucs >> 12) & 0x3f);
  96     result.buffer[2] = 0x80 | ((ucs >> 6) & 0x3f);
  97     result.buffer[3] = 0x80 | (ucs & 0x3f);
  98     result.bytes = 4;
  99   } else if (ucs <= 0x3ffffff) {
 100     result.buffer[0] = 0xf8 | (ucs >> 24);
 101     result.buffer[1] = 0x80 | ((ucs >> 18) & 0x3f);
 102     result.buffer[2] = 0x80 | ((ucs >> 12) & 0x3f);
 103     result.buffer[3] = 0x80 | ((ucs >> 6) & 0x3f);
 104     result.buffer[4] = 0x80 | (ucs & 0x3f);
 105     result.bytes = 5;
 106   } else {
 107     result.buffer[0] = 0xfc | (ucs >> 30);
 108     result.buffer[1] = 0x80 | ((ucs >> 24) & 0x3f);
 109     result.buffer[2] = 0x80 | ((ucs >> 18) & 0x3f);
 110     result.buffer[3] = 0x80 | ((ucs >> 12) & 0x3f);
 111     result.buffer[4] = 0x80 | ((ucs >> 6) & 0x3f);
 112     result.buffer[5] = 0x80 | (ucs & 0x3f);
 113     result.bytes = 6;
 114   }
 115   return result;
 116 }
 117
 118 EncodedCharacter EncodeCharacter(Encoding encoding, char32_t ucs) {
 119   switch (encoding) {
 120     SWITCH_COVERS_ALL_CASES
 121   case Encoding::LATIN_1:
 122     return EncodeCharacter<Encoding::LATIN_1>(ucs);
 123   case Encoding::UTF_8:
 124     return EncodeCharacter<Encoding::UTF_8>(ucs);
 125   }
 126 }
 127
 128 template <Encoding ENCODING, typename STRING>
 129 std::string EncodeString(const STRING &str) {
 130   std::string result;
 131   for (auto ch : str) {
 132     char32_t uch{static_cast<std::make_unsigned_t<decltype(ch)>>(ch)};
 133     EncodedCharacter encoded{EncodeCharacter<ENCODING>(uch)};
 134     result.append(encoded.buffer, static_cast<std::size_t>(encoded.bytes));
 135   }
 136   return result;
 137 }
 138
 139 template std::string EncodeString<Encoding::LATIN_1, std::string>(
 140     const std::string &);
 141 template std::string EncodeString<Encoding::UTF_8, std::u16string>(
 142     const std::u16string &);
 143 template std::string EncodeString<Encoding::UTF_8, std::u32string>(
 144     const std::u32string &);
 145
 146 template <>
 147 DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>(
 148     const char *cp, std::size_t bytes) {
 149   if (bytes >= 1) {
 150     return {*reinterpret_cast<const std::uint8_t *>(cp), 1};
 151   } else {
 152     return {};
 153   }
 154 }
 155
 156 template <>
 157 DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(
 158     const char *cp, std::size_t bytes) {
 159   auto p{reinterpret_cast<const std::uint8_t *>(cp)};
 160   char32_t ch{*p};
 161   if (ch <= 0x7f) {
 162     return {ch, 1};
 163   } else if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 &&
 164       ((p[1] | p[2] | p[3]) & 0xc0) == 0x80) {
 165     ch = ((ch & 7) << 6) | (p[1] & 0x3f);
 166     ch = (ch << 6) | (p[2] & 0x3f);
 167     ch = (ch << 6) | (p[3] & 0x3f);
 168     return {ch, 4};
 169   } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 &&
 170       ((p[1] | p[2]) & 0xc0) == 0x80) {
 171     ch = ((ch & 0xf) << 6) | (p[1] & 0x3f);
 172     ch = (ch << 6) | (p[2] & 0x3f);
 173     return {ch, 3};
 174   } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 &&
 175       (p[1] & 0xc0) == 0x80) {
 176     ch = ((ch & 0x1f) << 6) | (p[1] & 0x3f);
 177     return {ch, 2};
 178   } else {
 179     return {}; // not valid UTF-8
 180   }
 181 }
 182
 183 static DecodedCharacter DecodeEscapedCharacter(
 184     const char *cp, std::size_t bytes) {
 185   if (cp[0] == '\\' && bytes >= 2) {
 186     if (std::optional<char> escChar{BackslashEscapeValue(cp[1])}) {
 187       return {static_cast<unsigned char>(*escChar), 2};
 188     } else if (IsOctalDigit(cp[1])) {
 189       std::size_t maxLen{std::min(std::size_t{4}, bytes)};
 190       char32_t code{static_cast<char32_t>(DecimalDigitValue(cp[1]))};
 191       std::size_t len{2}; // so far
 192       for (; code <= 037 && len < maxLen && IsOctalDigit(cp[len]); ++len) {
 193         code = 8 * code + DecimalDigitValue(cp[len]);
 194       }
 195       return {code, static_cast<int>(len)};
 196     } else if (bytes >= 4 && ToLowerCaseLetter(cp[1]) == 'x' &&
 197         IsHexadecimalDigit(cp[2]) && IsHexadecimalDigit(cp[3])) {
 198       return {static_cast<char32_t>(16 * HexadecimalDigitValue(cp[2]) +
 199                   HexadecimalDigitValue(cp[3])),
 200           4};
 201     } else if (IsLetter(cp[1])) {
 202       // Unknown escape - ignore the '\' (PGI compatibility)
 203       return {static_cast<unsigned char>(cp[1]), 2};
 204     } else {
 205       // Not an escape character.
 206       return {'\\', 1};
 207     }
 208   }
 209   return {static_cast<unsigned char>(cp[0]), 1};
 210 }
 211
 212 template <Encoding ENCODING>
 213 static DecodedCharacter DecodeEscapedCharacters(
 214     const char *cp, std::size_t bytes) {
 215   char buffer[EncodedCharacter::maxEncodingBytes];
 216   int count[EncodedCharacter::maxEncodingBytes];
 217   std::size_t at{0}, len{0};
 218   for (; len < EncodedCharacter::maxEncodingBytes && at < bytes; ++len) {
 219     DecodedCharacter code{DecodeEscapedCharacter(cp + at, bytes - at)};
 220     buffer[len] = code.codepoint;
 221     at += code.bytes;
 222     count[len] = at;
 223   }
 224   DecodedCharacter code{DecodeCharacter<ENCODING>(buffer, len, false)};
 225   if (code.bytes > 0) {
 226     code.bytes = count[code.bytes - 1];
 227   } else {
 228     code.codepoint = buffer[0] & 0xff;
 229     code.bytes = count[0];
 230   }
 231   return code;
 232 }
 233
 234 template <Encoding ENCODING>
 235 DecodedCharacter DecodeCharacter(
 236     const char *cp, std::size_t bytes, bool backslashEscapes) {
 237   if (backslashEscapes && bytes >= 2 && *cp == '\\') {
 238     if (ENCODING == Encoding::UTF_8 && bytes >= 6 &&
 239         ToLowerCaseLetter(cp[1]) == 'u' && IsHexadecimalDigit(cp[2]) &&
 240         IsHexadecimalDigit(cp[3]) && IsHexadecimalDigit(cp[4]) &&
 241         IsHexadecimalDigit(cp[5])) {
 242       char32_t ch{
 243           static_cast<char32_t>(4096 * HexadecimalDigitValue(cp[2]) +
 244               256 * HexadecimalDigitValue(cp[3]) +
 245               16 * HexadecimalDigitValue(cp[4]) + HexadecimalDigitValue(cp[5])),
 246       };
 247       if (bytes >= 10 && IsHexadecimalDigit(cp[6]) &&
 248           IsHexadecimalDigit(cp[7]) && IsHexadecimalDigit(cp[8]) &&
 249           IsHexadecimalDigit(cp[9])) {
 250         return {(ch << 16) |
 251                 (4096 * HexadecimalDigitValue(cp[6]) +
 252                     256 * HexadecimalDigitValue(cp[7]) +
 253                     16 * HexadecimalDigitValue(cp[8]) +
 254                     HexadecimalDigitValue(cp[9])),
 255             10};
 256       } else {
 257         return {ch, 6};
 258       }
 259     } else {
 260       return DecodeEscapedCharacters<ENCODING>(cp, bytes);
 261     }
 262   } else {
 263     return DecodeRawCharacter<ENCODING>(cp, bytes);
 264   }
 265 }
 266
 267 template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>(
 268     const char *, std::size_t, bool);
 269 template DecodedCharacter DecodeCharacter<Encoding::UTF_8>(
 270     const char *, std::size_t, bool);
 271
 272 DecodedCharacter DecodeCharacter(Encoding encoding, const char *cp,
 273     std::size_t bytes, bool backslashEscapes) {
 274   switch (encoding) {
 275     SWITCH_COVERS_ALL_CASES
 276   case Encoding::LATIN_1:
 277     return DecodeCharacter<Encoding::LATIN_1>(cp, bytes, backslashEscapes);
 278   case Encoding::UTF_8:
 279     return DecodeCharacter<Encoding::UTF_8>(cp, bytes, backslashEscapes);
 280   }
 281 }
 282
 283 template <typename RESULT, Encoding ENCODING>
 284 RESULT DecodeString(const std::string &s, bool backslashEscapes) {
 285   RESULT result;
 286   const char *p{s.c_str()};
 287   for (auto bytes{s.size()}; bytes != 0;) {
 288     DecodedCharacter decoded{
 289         DecodeCharacter<ENCODING>(p, bytes, backslashEscapes)};
 290     if (decoded.bytes > 0) {
 291       if (static_cast<std::size_t>(decoded.bytes) <= bytes) {
 292         result.append(1, decoded.codepoint);
 293         bytes -= decoded.bytes;
 294         p += decoded.bytes;
 295         continue;
 296       }
 297     }
 298     result.append(1, static_cast<uint8_t>(*p));
 299     ++p;
 300     --bytes;
 301   }
 302   return result;
 303 }
 304
 305 template std::string DecodeString<std::string, Encoding::LATIN_1>(
 306     const std::string &, bool);
 307 template std::u16string DecodeString<std::u16string, Encoding::UTF_8>(
 308     const std::string &, bool);
 309 template std::u32string DecodeString<std::u32string, Encoding::UTF_8>(
 310     const std::string &, bool);
 311 } // namespace Fortran::parser