headers/private/interface/utf8_functions.h

   1 /*
   2  * Copyright 2004-2010, Haiku, Inc.
   3  * Distributed under the terms of the MIT License.
   4  */
   5 #ifndef _UTF8_FUNCTIONS_H
   6 #define _UTF8_FUNCTIONS_H
   7
   8
   9 #include <SupportDefs.h>
  10
  11
  12 static inline bool
  13 IsInsideGlyph(uchar ch)
  14 {
  15         return (ch & 0xc0) == 0x80;
  16 }
  17
  18
  19 static inline uint32
  20 UTF8NextCharLenUnsafe(const char *text)
  21 {
  22         const char *ptr = text;
  23
  24         do {
  25                 ptr++;
  26         } while (IsInsideGlyph(*ptr));
  27
  28         return ptr - text;
  29 }
  30
  31
  32 static inline uint32
  33 UTF8NextCharLen(const char *text)
  34 {
  35         if (text == NULL || *text == 0)
  36                 return 0;
  37
  38         return UTF8NextCharLenUnsafe(text);
  39 }
  40
  41
  42 static inline uint32
  43 UTF8NextCharLen(const char *bytes, size_t length)
  44 {
  45         if (bytes == NULL || length == 0 || bytes[0] == 0)
  46                 return 0;
  47
  48         if ((bytes[0] & 0x80) == 0) {
  49                 // A single ASCII char - or so...
  50                 return 1;
  51         }
  52
  53         if (IsInsideGlyph(bytes[0])) {
  54                 // Not a proper multibyte start.
  55                 return 0;
  56         }
  57
  58         // We already know that we have the upper two bits set due to the above
  59         // two checks.
  60         uint8 mask = 0x20;
  61         size_t bytesExpected = 2;
  62         while ((bytes[0] & mask) != 0) {
  63                 if (mask == 0x02) {
  64                         // Seven byte char - invalid.
  65                         return 0;
  66                 }
  67
  68                 bytesExpected++;
  69                 mask >>= 1;
  70         }
  71
  72         // There would need to be more bytes to satisfy the char.
  73         if (bytesExpected > length)
  74                 return 0;
  75
  76         // We already know the first byte is fine, check the rest.
  77         for (size_t i = 1; i < bytesExpected; i++) {
  78                 if (!IsInsideGlyph(bytes[i])) {
  79                         // The sequence is incomplete.
  80                         return 0;
  81                 }
  82         }
  83
  84         // Puh, everything's fine.
  85         return bytesExpected;
  86 }
  87
  88
  89 static inline uint32
  90 UTF8PreviousCharLen(const char *text, const char *limit)
  91 {
  92         const char *ptr = text;
  93
  94         if (ptr == NULL || limit == NULL)
  95                 return 0;
  96
  97         do {
  98                 if (ptr == limit)
  99                         break;
 100                 ptr--;
 101         } while (IsInsideGlyph(*ptr));
 102
 103         return text - ptr;
 104 }
 105
 106
 107 /*!     UTF8CountBytes gets the length (in bytes) of a UTF8 string. Up to
 108         numChars characters are read. If numChars is a negative value it is ignored
 109         and the string is read up to the terminating 0.
 110 */
 111 static inline uint32
 112 UTF8CountBytes(const char *bytes, int32 numChars)
 113 {
 114         if (bytes == NULL)
 115                 return 0;
 116
 117         if (numChars < 0)
 118                 numChars = INT_MAX;
 119
 120         const char *base = bytes;
 121         while (bytes[0] != '\0') {
 122                 if ((bytes[0] & 0xc0) != 0x80) {
 123                         if (--numChars < 0)
 124                                 break;
 125                 }
 126                 bytes++;
 127         }
 128
 129         return bytes - base;
 130 }
 131
 132
 133 /*!     UTF8CountChars gets the length (in characters) of a UTF8 string. Up to
 134         numBytes bytes are read. If numBytes is a negative value it is ignored
 135         and the string is read up to the terminating 0.
 136 */
 137 static inline uint32
 138 UTF8CountChars(const char *bytes, int32 numBytes)
 139 {
 140         if (bytes == NULL)
 141                 return 0;
 142
 143         uint32 length = 0;
 144         const char *last;
 145         if (numBytes < 0)
 146                 last = (const char *)SIZE_MAX;
 147         else
 148                 last = bytes + numBytes - 1;
 149
 150         while (bytes[0] && bytes <= last) {
 151                 if ((bytes++[0] & 0xc0) != 0x80)
 152                         length++;
 153         }
 154
 155         return length;
 156 }
 157
 158
 159 /*!     UTF8ToCharCode converts the input that includes potential multibyte chars
 160         to UTF-32 char codes that can be used by FreeType. The string pointer is
 161         then advanced to the next character in the string. In case the terminating
 162         0 is reached, the string pointer is not advanced anymore and nulls are
 163         returned. This makes it safe to overruns and enables streamed processing
 164         of UTF8 strings.
 165 */
 166 static inline uint32
 167 UTF8ToCharCode(const char **bytes)
 168 {
 169         #define UTF8_SUBSTITUTE_CHARACTER       0xfffd
 170
 171         uint32 result;
 172         if (((*bytes)[0] & 0x80) == 0) {
 173                 // a single byte character
 174                 result = (*bytes)[0];
 175                 if (result != '\0') {
 176                         // do not advance beyond the terminating '\0'
 177                         (*bytes)++;
 178                 }
 179
 180                 return result;
 181         }
 182
 183         if (((*bytes)[0] & 0xc0) == 0x80) {
 184                 // not a proper multibyte start
 185                 (*bytes)++;
 186                 return UTF8_SUBSTITUTE_CHARACTER;
 187         }
 188
 189         // start of a multibyte character
 190         uint8 mask = 0x80;
 191         result = (uint32)((*bytes)[0] & 0xff);
 192         (*bytes)++;
 193
 194         while (result & mask) {
 195                 if (mask == 0x02) {
 196                         // seven byte char - invalid
 197                         return UTF8_SUBSTITUTE_CHARACTER;
 198                 }
 199
 200                 result &= ~mask;
 201                 mask >>= 1;
 202         }
 203
 204         while (((*bytes)[0] & 0xc0) == 0x80) {
 205                 result <<= 6;
 206                 result += (*bytes)[0] & 0x3f;
 207                 (*bytes)++;
 208
 209                 mask <<= 1;
 210                 if (mask == 0x40)
 211                         return result;
 212         }
 213
 214         if (mask == 0x40)
 215                 return result;
 216
 217         if ((*bytes)[0] == '\0') {
 218                 // string terminated within multibyte char
 219                 return 0x00;
 220         }
 221
 222         // not enough bytes in multibyte char
 223         return UTF8_SUBSTITUTE_CHARACTER;
 224
 225         #undef UTF8_SUBSTITUTE_CHARACTER
 226 }
 227
 228 #endif  // _UTF8_FUNCTIONS_H