lang/C/web/utf-decoder/utf-8.decoder.txt

   1 Flexible and Economical UTF-8 Decoder
   2
   3 Systems with elaborate Unicode support usually confront programmers with a
   4 multitude of different functions and macros to process UTF-8 encoded strings,
   5 often with different ideas on handling buffer boundaries, state between calls,
   6 error conditions, and performance characteristics, making them difficult to use
   7 correctly and efficiently. Implementations also tend to be very long and
   8 complicated; one popular library has over 500 lines of code just for one
   9 version of the decoder. This page presents one that is very easy to use
  10 correctly, short, small, fast, and free.
  11
  12 Implementation in C (C99)
  13
  14 // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
  15 // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
  16
  17 #define UTF8_ACCEPT 0
  18 #define UTF8_REJECT 1
  19
  20 static const uint8_t utf8d[] = {
  21   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
  22   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
  23   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
  24   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
  25   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
  26   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
  27   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
  28   0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
  29   0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
  30   0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
  31   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
  32   1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
  33   1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
  34   1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
  35 };
  36
  37 uint32_t inline
  38 decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  39   uint32_t type = utf8d[byte];
  40
  41   *codep = (*state != UTF8_ACCEPT) ?
  42     (byte & 0x3fu) | (*codep << 6) :
  43     (0xff >> type) & (byte);
  44
  45   *state = utf8d[256 + *state*16 + type];
  46   return *state;
  47 }
  48
  49 Usage
  50
  51 UTF-8 is a variable length character encoding. To decode a character one or
  52 more bytes have to be read from a string. The decode function implements a
  53 single step in this process. It takes two parameters maintaining state and a
  54 byte, and returns the state achieved after processing the byte. Specifically,
  55 it returns the value UTF8_ACCEPT (0) if enough bytes have been read for a
  56 character, UTF8_REJECT (1) if the byte is not allowed to occur at its position,
  57 and some other positive value if more bytes have to be read.
  58
  59 When decoding the first byte of a string, the caller must set the state
  60 variable to UTF8_ACCEPT. If, after decoding one or more bytes the state
  61 UTF8_ACCEPT is reached again, then the decoded Unicode character value is
  62 available through the codep parameter. If the state UTF8_REJECT is entered,
  63 that state will never be exited unless the caller intervenes. See the examples
  64 below for more information on usage and error handling, and the section on
  65 implementation details for how the decoder is constructed.
  66
  67 Examples
  68
  69 Validating and counting characters
  70
  71 This function checks if a null-terminated string is a well-formed UTF-8
  72 sequence and counts how many code points are in the string.
  73
  74 int
  75 countCodePoints(uint8_t* s, size_t* count) {
  76   uint32_t codepoint;
  77   uint32_t state = 0;
  78
  79   for (*count = 0; *s; ++s)
  80     if (!decode(&state, &codepoint, *s))
  81       *count += 1;
  82
  83   return state != UTF8_ACCEPT;
  84 }
  85
  86 It could be used like so:
  87
  88 if (countCodePoints(s, &count)) {
  89   printf("The string is malformed\n");
  90 } else {
  91   printf("The string is %u characters long\n", count);
  92 }
  93
  94 Printing code point values
  95
  96 This function prints out all code points in the string and an error message if
  97 unexpected bytes are encountered, or if the string ends with an incomplete
  98 sequence.
  99
 100 void
 101 printCodePoints(uint8_t* s) {
 102   uint32_t codepoint;
 103   uint32_t state = 0;
 104
 105   for (; *s; ++s)
 106     if (!decode(&state, &codepoint, *s))
 107       printf("U+%04X\n", codepoint);
 108
 109   if (state != UTF8_ACCEPT)
 110     printf("The string is not well-formed\n");
 111
 112 }
 113
 114 Printing UTF-16 code units
 115
 116 This loop prints out UTF-16 code units for the characters in a null-terminated
 117 UTF-8 encoded string.
 118
 119 for (; *s; ++s) {
 120
 121   if (decode(&state, &codepoint, *s))
 122     continue;
 123
 124   if (codepoint <= 0xFFFF) {
 125     printf("0x%04X\n", codepoint);
 126     continue;
 127   }
 128
 129   // Encode code points above U+FFFF as surrogate pair.
 130   printf("0x%04X\n", (0xD7C0 + (codepoint >> 10)));
 131   printf("0x%04X\n", (0xDC00 + (codepoint & 0x3FF)));
 132 }
 133
 134 Error recovery
 135
 136 It is sometimes desirable to recover from errors when decoding strings that
 137 are supposed to be UTF-8 encoded. Programmers should be aware that this can
 138 negatively affect the security properties of their application. A common
 139 recovery method is to replace malformed sequences with a substitute character
 140 like U+FFFD REPLACEMENT CHARACTER.
 141
 142 Decoder implementations differ in which octets they replace and where they
 143 restart. Consider for instance the sequence 0xED 0xA0 0x80. It encodes a
 144 surrogate code point which is prohibited in UTF-8. A recovering decoder may
 145 replace the whole sequence and restart with the next byte, or it may replace
 146 the first byte and restart with the second byte, replace it, restart with the
 147 third, and replace the third byte as well.
 148
 149 The following code implements one such recovery strategy. When an unexpected
 150 byte is encountered, the sequence up to that point will be replaced and, if the
 151 error occurred in the middle of a sequence, the byte will be retried as if it
 152 occurred at the beginning of a string. Note that the decode function detects
 153 errors as early as possible, so the sequence 0xED 0xA0 0x80 would result in
 154 three replacement characters.
 155
 156 for (prev = 0, current = 0; *s; prev = current, ++s) {
 157
 158   switch (decode(&current, &codepoint, *s)) {
 159   case UTF8_ACCEPT:
 160     // A properly encoded character has been found.
 161     printf("U+%04X\n", codepoint);
 162     break;
 163
 164   case UTF8_REJECT:
 165     // The byte is invalid, replace it and restart.
 166     printf("U+FFFD (Bad UTF-8 sequence)\n");
 167     current = UTF8_ACCEPT;
 168     if (prev != UTF8_ACCEPT)
 169       s--;
 170     break;
 171   ...
 172
 173 For some recovery strategies it may be useful to determine the number of bytes
 174 expected. The states in the automaton are numbered such that, assuming C's
 175 division operator, state / 3 + 1 is that number. Of course, this will only work
 176 for states other than UTF8_ACCEPT and UTF8_REJECT. This number could then be
 177 used, for instance, to skip the continuation octets in the illegal sequence
 178 0xED 0xA0 0x80 so it will be replaced by a single replacement character.
 179
 180 Transcoding to UTF-16 buffer
 181
 182 This is a rough outline of a UTF-16 transcoder. Actual applications would add
 183 code for error reporting, reporting of words written, required buffer size in
 184 the case of a small buffer, and possibly other things. Note that in order to
 185 avoid checking for free space in the inner loop, we determine how many bytes
 186 can be read without running out of space. This is one utf-8 byte per available
 187 utf-16 word, with one exception: if the last byte read was the third byte in a
 188 four byte sequence we would get two words for the next byte; so we read one
 189 byte less than we have words available. This additional word is also needed for
 190 null-termination, so it's never wrong to read one less.
 191
 192 int
 193 toUtf16(uint8_t* src, size_t srcBytes, uint16_t* dst, size_t dstWords, ...) {
 194
 195   uint8_t* src_actual_end = src + srcBytes;
 196   uint8_t* s = src;
 197   uint16_t* d = dst;
 198   uint32_t codepoint;
 199   uint32_t state = 0;
 200
 201   while (s < src_actual_end) {
 202
 203     size_t dst_words_free = dstWords - (d - dst);
 204     uint8_t* src_current_end = s + dst_words_free - 1;
 205
 206     if (src_actual_end < src_current_end)
 207       src_current_end = src_actual_end;
 208
 209     if (src_current_end <= s)
 210       goto toosmall;
 211
 212     while (s < src_current_end) {
 213
 214       if (decode(&state, &codepoint, *s++))
 215         continue;
 216
 217       if (codepoint > 0xffff) {
 218         *d++ = (uint16_t)(0xD7C0 + (codepoint >> 10));
 219         *d++ = (uint16_t)(0xDC00 + (codepoint & 0x3FF));
 220       } else {
 221         *d++ = (uint16_t)codepoint;
 222       }
 223     }
 224   }
 225
 226   if (state != UTF8_ACCEPT) {
 227     ...
 228   }
 229
 230   if ((dstWords - (d - dst)) == 0)
 231     goto toosmall;
 232
 233   *d++ = 0;
 234   ...
 235
 236 toosmall:
 237   ...
 238 }
 239
 240
 241 Implementation details
 242
 243 The utf8d table consists of two parts. The first part maps bytes to character
 244 classes, the second part encodes a deterministic finite automaton using these
 245 character classes as transitions. This section details the composition of the
 246 table.
 247
 248 Canonical UTF-8 automaton
 249
 250 UTF-8 is a variable length character encoding. That means state has to be
 251 maintained while processing a string. The following transition graph
 252 illustrates the process. We start in state zero, and whenever we come back to
 253 it, we've seen a whole Unicode character. Transitions not in the graph are
 254 disallowed; they all lead to state one, which has been omitted for readability.
 255
 256 DFA with range transitions
 257
 258 Automaton with character class transitions
 259
 260 The byte ranges in the transition graph above are not easily encoded in the
 261 automaton in a manner that would allow fast lookup. Instead of encoding the
 262 ranges directly, the ranges are split such that each byte belongs to exactly
 263 one character class. Then the transitions go over these character classes.
 264
 265 DFA with class transitions
 266
 267 Mapping bytes to character classes
 268
 269 Primarily to save space in the transition table, bytes are mapped to character
 270 classes. This is the mapping:
 271
 272 00..7f 0  80..8f 1
 273 90..9f 9  a0..bf 7
 274 c0..c1 8  c2..df 2
 275 e0..e0 10 e1..ec 3
 276 ed..ed 4  ee..ef 3
 277 f0..f0 11 f1..f3 6
 278 f4..f4 5  f5..ff 8
 279
 280 For bytes that may occur at the beginning of a multibyte sequence, the
 281 character class number is also used to remove the most significant bits from
 282 the byte, which do not contribute to the actual code point value. Note that
 283 0xc0, 0xc1, and 0xf5 .. 0xff have all their bits removed. These bytes cannot
 284 occur in well-formed sequences, so it does not matter which bits, if any, are
 285 retained.
 286
 287 c0 8 11000000 d0 2 11010000 e0 10 11100000 f0 11 11110000
 288 c1 8 11000001 d1 2 11010001 e1 3  11100001 f1 6  11110001
 289 c2 2 11000010 d2 2 11010010 e2 3  11100010 f2 6  11110010
 290 c3 2 11000011 d3 2 11010011 e3 3  11100011 f3 6  11110011
 291 c4 2 11000100 d4 2 11010100 e4 3  11100100 f4 5  11110100
 292 c5 2 11000101 d5 2 11010101 e5 3  11100101 f5 8  11110101
 293 c6 2 11000110 d6 2 11010110 e6 3  11100110 f6 8  11110110
 294 c7 2 11000111 d7 2 11010111 e7 3  11100111 f7 8  11110111
 295 c8 2 11001000 d8 2 11011000 e8 3  11101000 f8 8  11111000
 296 c9 2 11001001 d9 2 11011001 e9 3  11101001 f9 8  11111001
 297 ca 2 11001010 da 2 11011010 ea 3  11101010 fa 8  11111010
 298 cb 2 11001011 db 2 11011011 eb 3  11101011 fb 8  11111011
 299 cc 2 11001100 dc 2 11011100 ec 3  11101100 fc 8  11111100
 300 cd 2 11001101 dd 2 11011101 ed 4  11101101 fd 8  11111101
 301 ce 2 11001110 de 2 11011110 ee 3  11101110 fe 8  11111110
 302 cf 2 11001111 df 2 11011111 ef 3  11101111 ff 8  11111111
 303
 304 Notes on Variations
 305
 306 There are several ways to change the implementation of this decoder. For
 307 example, the size of the data table can be reduced, at the cost of a couple
 308 more instructions, so it omits the mapping of bytes in the US-ASCII range, and
 309 since all entries in the table are 4 bit values, two values could be stored in
 310 a single byte.
 311
 312 In some situations it may be beneficial to have a separate start state. This is
 313 easily achieved by copying the s0 state in the array to the end, and using the
 314 new state 9 as start state as needed.
 315
 316 Where callers require the code point values, compilers tend to generate
 317 slightly better code if the state calculation is moved into the branches, for
 318 example
 319
 320 if (*state != UTF8_ACCEPT) {
 321   *state = utf8d[256 + *state*16 + type];
 322   *codep = (*codep << 6) | (byte & 63);
 323 } else {
 324   *state = utf8d[256 + *state*16 + type];
 325   *codep = (byte) & (255 >> type);
 326 }
 327
 328 As the state will be zero in the else branch, this saves a shift and an
 329 addition for each starter. Unfortunately, compilers will then typically
 330 generate worse code if the codepoint value is not needed. Naturally, then, two
 331 functions could be used, one that only calculates the states for validation,
 332 counting, and similar applications, and one for full decoding. For the sample
 333 UTF-16 transcoder a more substantial increase in performance can be achieved by
 334 manually including the decode code in the inner loop; then it is also
 335 worthwhile to make code points in the US-ASCII range a special case:
 336
 337 while (s < src_current_end) {
 338
 339   uint32_t byte = *s++;
 340   uint32_t type = utf8d[byte];
 341
 342   if (state != UTF8_ACCEPT) {
 343     codep = (codep << 6) | (byte & 63);
 344     state = utf8d[256 + state*16 + type];
 345
 346     if (state)
 347       continue;
 348
 349   } else if (byte > 0x7f) {
 350     codep = (byte) & (255 >> type);
 351     state = utf8d[256 + type];
 352     continue;
 353
 354   } else {
 355     *d++ = (uint16_t)byte;
 356     continue;
 357   }
 358   ...
 359
 360 Another variation worthy of note is changing the comparison when setting the
 361 code point value to this:
 362
 363 *codep = (*state >  UTF8_REJECT) ?
 364   (byte & 0x3fu) | (*codep << 6) :
 365   (0xff >> type) & (byte);
 366
 367 This ensures that the code point value does not exceed the value 0xff after
 368 some malformed sequence is encountered.
 369
 370 As written, the decoder disallows encoding of surrogate code points, overlong
 371 2, 3, and 4 byte sequences, and 4 byte sequences outside the Unicode range.
 372 Allowing them can have serious security implications, but can easily be
 373 achieved by changing the character class assignments in the table.
 374
 375 The code samples have generally been written to perform well on my system when
 376 compiled with Visual C++ 7.1 and GCC 3.4.5. Slight changes may improve
 377 performance, for example, Visual C++ 7.1 will produce slightly faster code
 378 when, in the manually inlined version of the transcoder discussed above, the
 379 type assignment is moved into the branches where it is needed, and the state
 380 and codepoint assignments in the non-ASCII starter are swapped (approximately a
 381 5% increase), but GCC 3.4.5 will produce considerably slower code
 382 (approximately 10%).
 383
 384 I have experimented with various rearrangements of states and character
 385 classes. A seemingly promising one is the following:
 386
 387 Re-arranged DFA with class transitions
 388
 389 One of the continuation ranges has been split into two, the other changes are
 390 just renamings. This arrangement allows, when a continuation octet is expected,
 391 to compute the character class with a shift instead of a table lookup, and when
 392 looking at a non-ASCII starter, the next state is simply the character class.
 393 On my system the change in performance is in the area of +/- 1%. This encoding
 394 would have a number of downsides: more rejecting states are required to account
 395 for continuation octets where starters are expected, the table formatting would
 396 use more hex notation making it longer, and calculating the number of expected
 397 continuation octets from a given state is more difficult. One thing I'd still
 398 like to try out is if, perhaps by adding a couple of additional states, for
 399 continuation states the next state can be computed without any table lookup
 400 with a few easily paired instructions.
 401
 402 On 24th June 2010 Rich Felker pointed out that the state values in the
 403 transition table can be pre-multiplied by 16, which would save a shift
 404 instruction for every byte. D'oh! We actually just need a factor of 12 and can
 405 throw away the filler values previously in the table, making the table 36 bytes
 406 shorter and saving the shift in the code.
 407
 408 // Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
 409 // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
 410
 411 #define UTF8_ACCEPT 0
 412 #define UTF8_REJECT 12
 413
 414 static const uint8_t utf8d[] = {
 415   // The first part of the table maps bytes to character classes
 416   // to reduce the size of the transition table and create bitmasks.
 417    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 418    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 419    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 420    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 421    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
 422    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
 423    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 424   10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
 425
 426   // The second part is a transition table that maps a combination
 427   // of a state of the automaton and a character class to a state.
 428    0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
 429   12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
 430   12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
 431   12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
 432   12,36,12,12,12,12,12,12,12,12,12,12,
 433 };
 434
 435 uint32_t inline
 436 decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
 437   uint32_t type = utf8d[byte];
 438
 439   *codep = (*state != UTF8_ACCEPT) ?
 440     (byte & 0x3fu) | (*codep << 6) :
 441     (0xff >> type) & (byte);
 442
 443   *state = utf8d[256 + *state + type];
 444   return *state;
 445 }
 446
 447 Notes on performance
 448
 449 To conduct some ad-hoc performance testing I've used three different UTF-8
 450 encoded buffers and passed them through a couple of UTF-8 to UTF-16
 451 transcoders. The large buffer is an April 2009 Hindi Wikipedia article XML dump,
 452 the medium buffer Markus Kuhn's UTF-8-demo.txt, and the tiny buffer my name,
 453 each repeated as often as required for about 1GB of data. All tests ran on
 454 an Intel Prescott Celeron at 2666 MHz. See Changes for some additional details.
 455
 456                                                         Large  Medium    Tiny
 457 NS_CStringToUTF16() Mozilla 1.9 (includes malloc/free  36924ms 39773ms 107958ms
 458                         time)
 459   iconv() 1.9 compiled with Visual C++ (Cygwin iconv   22740ms 21765ms 32595ms
 460                     1.11 similar)
 461  g_utf8_to_utf16() Cygwin Glib 2.0 (includes malloc/   21599ms 20345ms 98782ms
 462                       free time)
 463 ConvertUTF8toUTF16() Unicode Inc., Visual C++ 7.1 -Ox  11183ms 11251ms 19453ms
 464                        -Ot -G7
 465  MultiByteToWideChar() Windows API (Server 2003 SP2)   9857ms  10779ms 12771ms
 466 u_strFromUTF8 from ICU 4.0.1 (Visual Studio 2008, web  8778ms  5223ms  5419ms
 467                   site distribution)
 468  PyUnicode_DecodeUTF8Stateful (3.1a2), Visual C++ 7.1  4523ms  5686ms  3138ms
 469                      -Ox -Ot -G7
 470 Example section transcoder, Visual C++ 7.1 -Ox -Ot -G7 5397ms  5789ms  6250ms
 471  Manually inlined transcoder (see above), Visual C++   4277ms  4998ms  4640ms
 472                    7.1 -Ox -Ot -G7
 473         Same, Cygwin GCC 3.4.5 -march=prescott         4492ms  5154ms  4432ms
 474                -fomit-frame-pointer -O3
 475         Same, Cygwin GCC 4.3.2 -march=prescott         5439ms  6322ms  5567ms
 476                -fomit-frame-pointer -O3
 477              Same, Visual C++ 6.0 -TP -O2              5398ms  6259ms  6446ms
 478 Same, Visual C++ 7.1 -Ox -Ot -G7 (includes malloc/free 5498ms  5086ms  25852ms
 479                         time)
 480
 481 I have also timed functions that xor all code points in the large buffer. In
 482 Visual Studio 2008 ICU's U8_NEXT macro comes out at ~8000ms, the U8_NEXT_UNSAFE
 483 macro, which requires complete and well-formed input, at ~4000ms, and the
 484 decode function is at ~5900ms. Using the same manual inlining as for the
 485 transcode function, Cygwin GCC 3.4.5 -march=prescott -O3 -fomit-frame-pointer
 486 brings it down to roughly the same times as the transcode function for all
 487 three buffers.
 488
 489 While these results do not model real-world applications well, it seems
 490 reasonable to suggest that the reduced complexity does not come at the price of
 491 reduced performance. Note that instructions that compute the code point values
 492 will generally be optimized away when not needed. For example, checking if a
 493 null-terminated string is properly UTF-8 encoded ...
 494
 495 int
 496 IsUTF8(uint8_t* s) {
 497   uint32_t codepoint, state = 0;
 498
 499   while (*s)
 500     decode(&state, &codepoint, *s++);
 501
 502   return state == UTF8_ACCEPT;
 503 }
 504
 505 ... does not require the individual code point values, and so the loop becomes
 506 something like this:
 507
 508 l1: movzx  eax,al
 509     shl    edx,4
 510     add    ecx,1
 511     movzx  eax,byte ptr [eax+404000h]
 512     movzx  edx,byte ptr [eax+edx+256+404000h]
 513     movzx  eax,byte ptr [ecx]
 514     test   al,al
 515     jne    l1
 516
 517 For comparison, this is a typical strlen loop:
 518
 519 l1: mov    cl,byte ptr [eax]
 520     add    eax,1
 521     test   cl,cl
 522     jne    l1
 523
 524 With the large buffer and the same number of times as above, strlen takes
 525 1507ms while IsUTF8 takes 2514ms.
 526
 527 License
 528
 529 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
 530
 531 Permission is hereby granted, free of charge, to any person obtaining a copy of
 532 this software and associated documentation files (the "Software"), to deal in
 533 the Software without restriction, including without limitation the rights to
 534 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 535 of the Software, and to permit persons to whom the Software is furnished to do
 536 so, subject to the following conditions:
 537
 538 The above copyright notice and this permission notice shall be included in all
 539 copies or substantial portions of the Software.
 540
 541 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 542 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 543 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 544 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 545 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 546 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 547 SOFTWARE.
 548
 549 Changes
 550
 551 25 Jun 2010
 552     Added an improved variation based on an observation from Rich Felker.
 553 30 April 2009
 554     Added some more items to the performance table: the manually inlined
 555     transcoder allocating worst case memory for each run and freeing it before
 556     the next run; and results for Mozilla's NS_CStringToUTF16 (a new
 557     nsAutoString is created for each run, and truncated before the next). This
 558     used the XULRunner SDK 1.9.0.7 binary distribution from the Mozilla
 559     website.
 560 18 April 2009
 561     Added notes to the Variations section on handling malformed sequences and
 562     failed optimization attempts.
 563 14 April 2009
 564     Added PyUnicode_DecodeUTF8Stateful times; the function has been modified
 565     slightly so it works outside Python and so it uses a pre-allocated buffer.
 566     Normally does not check output buffer boundaries but rather allocates a
 567     worst case buffer, then resizes it. Apparently the decoder allows encodings
 568     of surrogate code points.
 569
 570 Author
 571
 572
 573 Björn Höhrmann bjoern@hoehrmann.de (Donate via SourceForge, PayPal)
 574
 575