flang/runtime/utf.h

   1 //===-- runtime/utf.h -----------------------------------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 // UTF-8 is the variant-width standard encoding of Unicode (ISO 10646)
  10 // code points.
  11 //
  12 // 7-bit values in [00 .. 7F] represent themselves as single bytes, so true
  13 // 7-bit ASCII is also valid UTF-8.
  14 //
  15 // Larger values are encoded with a start byte in [C0 .. FE] that carries
  16 // the length of the encoding and some of the upper bits of the value, followed
  17 // by one or more bytes in the range [80 .. BF].
  18 //
  19 // Specifically, the first byte holds two or more uppermost set bits,
  20 // a zero bit, and some payload; the second and later bytes each start with
  21 // their uppermost bit set, the next bit clear, and six bits of payload.
  22 // Payload parcels are in big-endian order.  All bytes must be present in a
  23 // valid sequence; i.e., low-order zero bits must be explicit.  UTF-8 is
  24 // self-synchronizing on input, as no byte value can be both a valid
  25 // first byte and a valid trailing byte.
  26 //
  27 // 0xxxxxxx - 7 bit ASCII
  28 // 110xxxxx 10xxxxxx - 11-bit value
  29 // 1110xxxx 10xxxxxx 10xxxxxx - 16-bit value
  30 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21-bit value
  31 // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26-bit value
  32 // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 31-bit value
  33 // 11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 36-bit value
  34 //
  35 // Canonical UTF-8 sequences should be minimal, and our output is so, but
  36 // we do not reject non-minimal sequences on input.  Unicode only defines
  37 // code points up to 0x10FFFF, so 21-bit (4-byte) UTF-8 is the actual
  38 // standard maximum.  However, we support extended forms up to 32 bits so that
  39 // CHARACTER(KIND=4) can be abused to hold arbitrary 32-bit data.
  40
  41 #ifndef FORTRAN_RUNTIME_UTF_H_
  42 #define FORTRAN_RUNTIME_UTF_H_
  43
  44 #include <cstddef>
  45 #include <cstdint>
  46 #include <optional>
  47
  48 namespace Fortran::runtime {
  49
  50 // Derive the length of a UTF-8 character encoding from its first byte.
  51 // A zero result signifies an invalid encoding.
  52 extern const std::uint8_t UTF8FirstByteTable[256];
  53 static inline std::size_t MeasureUTF8Bytes(char first) {
  54   return UTF8FirstByteTable[static_cast<std::uint8_t>(first)];
  55 }
  56
  57 static constexpr std::size_t maxUTF8Bytes{7};
  58
  59 // Ensure that all bytes are present in sequence in the input buffer
  60 // before calling; use MeasureUTF8Bytes(first byte) to count them.
  61 std::optional<char32_t> DecodeUTF8(const char *);
  62
  63 // Ensure that at least maxUTF8Bytes remain in the output
  64 // buffer before calling.
  65 std::size_t EncodeUTF8(char *, char32_t);
  66
  67 } // namespace Fortran::runtime
  68 #endif // FORTRAN_RUNTIME_UTF_H_