workbench/libs/codesetslib/src/convertUTF.c

   1 /*
   2  * Copyright 2001-2004 Unicode, Inc.
   3  *
   4  * Disclaimer
   5  *
   6  * This source code is provided as is by Unicode, Inc. No claims are
   7  * made as to fitness for any particular purpose. No warranties of any
   8  * kind are expressed or implied. The recipient agrees to determine
   9  * applicability of information provided. If this file has been
  10  * purchased on magnetic or optical media from Unicode, Inc., the
  11  * sole remedy for any claim will be exchange of defective media
  12  * within 90 days of receipt.
  13  *
  14  * Limitations on Rights to Redistribute This Code
  15  *
  16  * Unicode, Inc. hereby grants the right to freely use the information
  17  * supplied in this file in the creation of products supporting the
  18  * Unicode Standard, and to make copies of this file in any form
  19  * for internal or external distribution as long as this notice
  20  * remains attached.
  21  */
  22
  23 /* ---------------------------------------------------------------------
  24
  25     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
  26     Author: Mark E. Davis, 1994.
  27     Rev History: Rick McGowan, fixes & updates May 2001.
  28     Sept 2001: fixed const & error conditions per
  29         mods suggested by S. Parent & A. Lillich.
  30     June 2002: Tim Dodd added detection and handling of incomplete
  31         source sequences, enhanced error detection, added casts
  32         to eliminate compiler warnings.
  33     July 2003: slight mods to back out aggressive FFFE detection.
  34     Jan 2004: updated switches in from-UTF8 conversions.
  35     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
  36
  37     See the header file "ConvertUTF.h" for complete documentation.
  38
  39 ------------------------------------------------------------------------ */
  40
  41 #include "lib.h"
  42 #include "convertUTF.h"
  43
  44 #include "SDI_lib.h"
  45
  46 #include "debug.h"
  47
  48 /***********************************************************************/
  49
/*
 * Constants for UTF-16 surrogate arithmetic as used by the converters in
 * this file: a code point that needs a surrogate pair has halfBase
 * subtracted, the result is split into an upper part (shifted right by
 * halfShift) and a lower part (masked with halfMask), and these are added
 * to the start of the high and the low surrogate range respectively.
 */
static const int halfShift = 10;    /* used for shifting by 10 bits */

static const UTF32 halfBase = 0x0010000UL;  /* subtracted before splitting, added back after combining */
static const UTF32 halfMask = 0x3FFUL;      /* selects the low 10 bits */

/* Inclusive bounds of the high (first unit) and low (second unit) surrogate ranges. */
#define UNI_SUR_HIGH_START  (UTF32)0xD800
#define UNI_SUR_HIGH_END    (UTF32)0xDBFF
#define UNI_SUR_LOW_START   (UTF32)0xDC00
#define UNI_SUR_LOW_END     (UTF32)0xDFFF
  59
  60 /***********************************************************************/
  61
  62 ULONG LIBFUNC
  63 CodesetsConvertUTF32toUTF16(REG(a0, const UTF32 ** sourceStart),
  64                             REG(a1, const UTF32 * sourceEnd),
  65                             REG(a2, UTF16 ** targetStart),
  66                             REG(a3, UTF16 * targetEnd),
  67                             REG(d0, ULONG flags))
  68 {
  69   ULONG result = CSR_ConversionOK;
  70   const UTF32 *source = *sourceStart;
  71   UTF16 *target = *targetStart;
  72
  73   ENTER();
  74
  75   while(source < sourceEnd)
  76   {
  77     UTF32 ch;
  78
  79     if(target >= targetEnd)
  80     {
  81       result = CSR_TargetExhausted;
  82       break;
  83     }
  84
  85     ch = *source++;
  86     if(ch <= UNI_MAX_BMP)
  87     {
  88       /* Target is a character <= 0xFFFF */
  89       /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
  90       if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
  91       {
  92         if(flags == CSF_StrictConversion)
  93         {
  94           --source;   /* return to the illegal value itself */
  95           result = CSR_SourceIllegal;
  96           break;
  97         }
  98         else
  99         {
 100           *target++ = UNI_REPLACEMENT_CHAR;
 101         }
 102       }
 103       else
 104       {
 105         *target++ = (UTF16)ch; /* normal case */
 106       }
 107     }
 108     else if(ch > UNI_MAX_LEGAL_UTF32)
 109     {
 110       if(flags == CSF_StrictConversion)
 111       {
 112         result = CSR_SourceIllegal;
 113       }
 114       else
 115       {
 116         *target++ = UNI_REPLACEMENT_CHAR;
 117       }
 118     }
 119     else
 120     {
 121       /* target is a character in range 0xFFFF - 0x10FFFF. */
 122       if(target + 1 >= targetEnd)
 123       {
 124         --source;      /* Back up source pointer! */
 125         result = CSR_TargetExhausted;
 126         break;
 127       }
 128       ch -= halfBase;
 129       *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
 130       *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
 131     }
 132   }
 133
 134   *sourceStart = source;
 135   *targetStart = target;
 136
 137   RETURN(result);
 138   return result;
 139 }
 140
 141 /***********************************************************************/
 142
 143 ULONG LIBFUNC
 144 CodesetsConvertUTF16toUTF32(REG(a0, const UTF16 ** sourceStart),
 145                             REG(a1, const UTF16 * sourceEnd),
 146                             REG(a2, UTF32 ** targetStart),
 147                             REG(a3, UTF32 * targetEnd),
 148                             REG(d0, ULONG flags))
 149 {
 150   ULONG result = CSR_ConversionOK;
 151   const UTF16 *source = *sourceStart;
 152   UTF32 *target = *targetStart;
 153   UTF32 ch=0, ch2=0;
 154
 155   ENTER();
 156
 157   while(source < sourceEnd)
 158   {
 159     const UTF16 *oldSource = source;    /*  In case we have to back up because of target overflow. */
 160
 161     ch = *source++;
 162     /* If we have a surrogate pair, convert to UTF32 first. */
 163     if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
 164     {
 165       /* If the 16 bits following the high surrogate are in the source buffer... */
 166       if(source < sourceEnd)
 167       {
 168         ch2 = *source;
 169
 170         /* If it's a low surrogate, convert to UTF32. */
 171         if(ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
 172         {
 173           ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
 174                 + (ch2 - UNI_SUR_LOW_START) + halfBase;
 175
 176           ++source;
 177         }
 178         else if(flags == CSF_StrictConversion)
 179         {
 180           /* it's an unpaired high surrogate */
 181           --source;   /* return to the illegal value itself */
 182           result = CSR_SourceIllegal;
 183
 184           break;
 185         }
 186       }
 187       else
 188       {
 189         /* We don't have the 16 bits following the high surrogate. */
 190         --source;       /* return to the high surrogate */
 191         result = CSR_SourceExhausted;
 192
 193         break;
 194       }
 195     }
 196     else if (flags == CSF_StrictConversion)
 197     {
 198       /* UTF-16 surrogate values are illegal in UTF-32 */
 199       if(ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
 200       {
 201         --source;       /* return to the illegal value itself */
 202         result = CSR_SourceIllegal;
 203
 204         break;
 205       }
 206     }
 207
 208     if(target >= targetEnd)
 209     {
 210       source = oldSource; /* Back up source pointer! */
 211       result = CSR_TargetExhausted;
 212
 213       break;
 214     }
 215     *target++ = ch;
 216   }
 217
 218   *sourceStart = source;
 219   *targetStart = target;
 220
 221   #if defined(DEBUG)
 222   if(result == CSR_SourceIllegal)
 223   {
 224     E(DBF_UTF, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x", ch, ch2);
 225   }
 226   #endif
 227
 228   RETURN(result);
 229   return result;
 230 }
 231
 232 /***********************************************************************/
 233
/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 *
 * Layout: 0x00-0xBF -> 0, 0xC0-0xDF -> 1, 0xE0-0xEF -> 2, 0xF0-0xF7 -> 3,
 * 0xF8-0xFB -> 4, 0xFC-0xFF -> 5.
 *
 * Note that *legal* UTF-8 never has 4 or 5 trailing bytes (a legal
 * sequence is at most 4 bytes in total). The table is left as-is for
 * anyone who may want to do such conversion, which was allowed in earlier
 * algorithms.
 */
const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
 251
/*
 * Magic values subtracted from a buffer value during UTF8 conversion.
 * This table contains as many values as there might be trailing bytes
 * in a UTF-8 sequence; it is indexed by the number of trailing bytes.
 *
 * The decoders add up the raw bytes of a sequence, shifting left by 6
 * between bytes, so the marker bits of the lead byte and the 0x80 marker
 * of each trailing byte end up in the sum. Each entry is exactly that
 * accumulated marker value, e.g. for one trailing byte
 * (0xC0 << 6) + 0x80 = 0x3080.
 */
static const UTF32 offsetsFromUTF8[6] = {
    0x00000000UL, 0x00003080UL, 0x000E2080UL,
    0x03C82080UL, 0xFA082080UL, 0x82082080UL
};
 261
/*
 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
 * into the first byte, depending on how many bytes follow.  There are
 * as many entries in this table as there are UTF-8 sequence types.
 * (I.e., one byte sequence, two byte... etc.). Remember that sequences
 * for *legal* UTF-8 will be 4 or fewer bytes total.
 *
 * The encoders in this file index the table with the total number of
 * bytes of the sequence (1..4).
 */
static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 270
 271 /***********************************************************************/
 272
 273 /* The interface converts a whole buffer to avoid function-call overhead.
 274  * Constants have been gathered. Loops & conditionals have been removed as
 275  * much as possible for efficiency, in favor of drop-through switches.
 276  * (See "Note A" at the bottom of the file for equivalent code.)
 277  * If your compiler supports it, the "isLegalUTF8" call can be turned
 278  * into an inline function.
 279  */
 280
 281 /***********************************************************************/
 282
 283 ULONG LIBFUNC
 284 CodesetsConvertUTF16toUTF8(REG(a0, const UTF16 ** sourceStart),
 285                            REG(a1, const UTF16 * sourceEnd),
 286                            REG(a2, UTF8 ** targetStart),
 287                            REG(a3, UTF8 * targetEnd),
 288                            REG(d0, ULONG flags))
 289 {
 290   ULONG result = CSR_ConversionOK;
 291   const UTF16 *source = *sourceStart;
 292   UTF8 *target = *targetStart;
 293   UTF8 *start = target;
 294
 295   ENTER();
 296
 297   while(source < sourceEnd)
 298   {
 299     UTF32 ch;
 300     unsigned short bytesToWrite = 0;
 301     const UTF32 byteMask = 0xBF;
 302     const UTF32 byteMark = 0x80;
 303     const UTF16 *oldSource = source;    /* In case we have to back up because of target overflow. */
 304
 305     ch = *source++;
 306
 307     /* If we have a surrogate pair, convert to UTF32 first. */
 308     if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
 309     {
 310       /* If the 16 bits following the high surrogate are in the source buffer... */
 311       if(source < sourceEnd)
 312       {
 313         UTF32 ch2 = *source;
 314
 315         /* If it's a low surrogate, convert to UTF32. */
 316         if(ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
 317         {
 318           ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
 319                 + (ch2 - UNI_SUR_LOW_START) + halfBase;
 320
 321           ++source;
 322         }
 323         else if(flags == CSF_StrictConversion)
 324         {
 325           /* it's an unpaired high surrogate */
 326           --source;   /* return to the illegal value itself */
 327           result = CSR_SourceIllegal;
 328           break;
 329         }
 330       }
 331       else
 332       {
 333         /* We don't have the 16 bits following the high surrogate. */
 334         --source;       /* return to the high surrogate */
 335         result = CSR_SourceExhausted;
 336
 337         break;
 338       }
 339     }
 340     else if(flags == CSF_StrictConversion)
 341     {
 342       /* UTF-16 surrogate values are illegal in UTF-32 */
 343       if(ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
 344       {
 345         --source;       /* return to the illegal value itself */
 346         result = CSR_SourceIllegal;
 347
 348         break;
 349       }
 350     }
 351     /* Figure out how many bytes the result will require */
 352     if(ch < (UTF32) 0x80)
 353     {
 354       bytesToWrite = 1;
 355     }
 356     else if (ch < (UTF32) 0x800)
 357     {
 358       bytesToWrite = 2;
 359     }
 360     else if (ch < (UTF32) 0x10000)
 361     {
 362       bytesToWrite = 3;
 363     }
 364     else if (ch < (UTF32) 0x110000)
 365     {
 366       bytesToWrite = 4;
 367     }
 368     else
 369     {
 370       bytesToWrite = 3;
 371       ch = UNI_REPLACEMENT_CHAR;
 372     }
 373
 374     target += bytesToWrite;
 375     if(start)
 376     {
 377       if(target > targetEnd)
 378       {
 379         source = oldSource; /* Back up source pointer! */
 380         target -= bytesToWrite;
 381         result = CSR_TargetExhausted;
 382
 383         break;
 384       }
 385       switch(bytesToWrite)
 386       {
 387         /* note: everything falls through. */
 388         case 4:
 389           *--target = (UTF8) ((ch | byteMark) & byteMask);
 390           ch >>= 6;
 391
 392         case 3:
 393           *--target = (UTF8) ((ch | byteMark) & byteMask);
 394           ch >>= 6;
 395
 396         case 2:
 397           *--target = (UTF8) ((ch | byteMark) & byteMask);
 398           ch >>= 6;
 399
 400         case 1:
 401           *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
 402       }
 403
 404       target += bytesToWrite;
 405     }
 406   }
 407
 408   *sourceStart = source;
 409   *targetStart = target;
 410
 411   RETURN(result);
 412   return result;
 413 }
 414
 415 /***********************************************************************/
 416
 417 /*
 418  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 419  * This must be called with the length pre-determined by the first byte.
 420  * If not calling this from ConvertUTF8to*, then the length can be set by:
 421  *  length = trailingBytesForUTF8[*source]+1;
 422  * and the sequence is illegal right away if there aren't that many bytes
 423  * available.
 424  * If presented with a length > 4, this returns FALSE.  The Unicode
 425  * definition of UTF-8 goes up to 4-byte sequences.
 426  */
 427
 428 BOOL LIBFUNC
 429 CodesetsIsLegalUTF8(REG(a0, const UTF8 * source),
 430                                 REG(d0, ULONG length))
 431 {
 432   UTF8 a;
 433   const UTF8 *srcptr = source + length;
 434
 435   ENTER();
 436
 437   switch(length)
 438   {
 439     default:
 440       RETURN(FALSE);
 441       return FALSE;
 442
 443     /* Everything else falls through when "TRUE"... */
 444     case 4:
 445       if((a = (*--srcptr)) < 0x80 || a > 0xBF)
 446       {
 447         RETURN(FALSE);
 448         return FALSE;
 449       }
 450
 451     case 3:
 452       if((a = (*--srcptr)) < 0x80 || a > 0xBF)
 453       {
 454         RETURN(FALSE);
 455         return FALSE;
 456       }
 457
 458     case 2:
 459       if((a = (*--srcptr)) > 0xBF)
 460       {
 461         RETURN(FALSE);
 462         return FALSE;
 463       }
 464
 465       switch (*source)
 466       {
 467         /* no fall-through in this inner switch */
 468         case 0xE0:
 469           if(a < 0xA0)
 470           {
 471             RETURN(FALSE);
 472             return FALSE;
 473           }
 474         break;
 475
 476         case 0xED:
 477           if(a > 0x9F)
 478           {
 479             RETURN(FALSE);
 480             return FALSE;
 481           }
 482         break;
 483
 484         case 0xF0:
 485           if(a < 0x90)
 486           {
 487             RETURN(FALSE);
 488             return FALSE;
 489           }
 490           break;
 491
 492         case 0xF4:
 493           if(a > 0x8F)
 494           {
 495             RETURN(FALSE);
 496             return FALSE;
 497           }
 498         break;
 499
 500         default:
 501           if(a < 0x80)
 502           {
 503             RETURN(FALSE);
 504             return FALSE;
 505           }
 506       }
 507
 508     case 1:
 509       if(*source >= 0x80 && *source < 0xC2)
 510       {
 511         RETURN(FALSE);
 512         return FALSE;
 513       }
 514   }
 515
 516   if(*source > 0xF4)
 517   {
 518     RETURN(FALSE);
 519     return FALSE;
 520   }
 521
 522   RETURN(TRUE);
 523   return TRUE;
 524 }
 525
 526 /***********************************************************************/
 527
 528 /*
 529  * Exported function to return whether a UTF-8 sequence is legal or not.
 530  * This is not used here; it's just exported.
 531  */
 532
 533 BOOL LIBFUNC
 534 CodesetsIsLegalUTF8Sequence(REG(a0, const UTF8 * source),
 535                             REG(a1, const UTF8 * sourceEnd))
 536 {
 537   int length = trailingBytesForUTF8[*source] + 1;
 538   BOOL res = FALSE;
 539
 540   ENTER();
 541
 542   if(source + length > sourceEnd)
 543   {
 544     RETURN(FALSE);
 545     return FALSE;
 546   }
 547
 548   res = CodesetsIsLegalUTF8(source, length);
 549
 550   RETURN(res);
 551   return res;
 552 }
 553
 554 /***********************************************************************/
 555
 556 ULONG LIBFUNC
 557 CodesetsConvertUTF8toUTF16(REG(a0, const UTF8 ** sourceStart),
 558                            REG(a1, const UTF8 * sourceEnd),
 559                            REG(a2, UTF16 ** targetStart),
 560                            REG(a3, UTF16 * targetEnd),
 561                            REG(d0, ULONG flags))
 562 {
 563   ULONG result = CSR_ConversionOK;
 564   const UTF8 *source = *sourceStart;
 565   UTF16 *target = *targetStart;
 566   UTF16 *start = target;
 567
 568   ENTER();
 569
 570   while(source < sourceEnd)
 571   {
 572     UTF32 ch = 0;
 573     unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
 574
 575     if(source + extraBytesToRead >= sourceEnd)
 576     {
 577       result = CSR_SourceExhausted;
 578       break;
 579     }
 580
 581     /* Do this check whether lenient or strict */
 582     if(!CodesetsIsLegalUTF8 (source, extraBytesToRead + 1))
 583     {
 584       result = CSR_SourceIllegal;
 585       break;
 586     }
 587
 588     /*
 589      * The cases all fall through. See "Note A" below.
 590      */
 591     switch (extraBytesToRead)
 592     {
 593       case 5:
 594         ch += *source++;
 595         ch <<= 6;       /* remember, illegal UTF-8 */
 596
 597       case 4:
 598         ch += *source++;
 599         ch <<= 6;       /* remember, illegal UTF-8 */
 600
 601       case 3:
 602         ch += *source++;
 603         ch <<= 6;
 604
 605       case 2:
 606         ch += *source++;
 607         ch <<= 6;
 608
 609       case 1:
 610         ch += *source++;
 611         ch <<= 6;
 612
 613       case 0:
 614         ch += *source++;
 615     }
 616
 617     ch -= offsetsFromUTF8[extraBytesToRead];
 618
 619     if(start && (target >= targetEnd))
 620     {
 621       source -= (extraBytesToRead + 1);   /* Back up source pointer! */
 622       result = CSR_TargetExhausted;
 623
 624       break;
 625     }
 626
 627     if(ch <= UNI_MAX_BMP)
 628     {
 629       /* Target is a character <= 0xFFFF */
 630       /* UTF-16 surrogate values are illegal in UTF-32 */
 631       if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
 632       {
 633         if(flags == CSF_StrictConversion)
 634         {
 635           source -= (extraBytesToRead + 1);   /* return to the illegal value itself */
 636           result = CSR_SourceIllegal;
 637
 638           break;
 639         }
 640         else
 641                 ch = UNI_REPLACEMENT_CHAR;
 642       }
 643       if(start)
 644               *target = (UTF16) ch; /* normal case */
 645       target++;
 646     }
 647     else if(ch > UNI_MAX_UTF16)
 648     {
 649       if(flags == CSF_StrictConversion)
 650       {
 651         result = CSR_SourceIllegal;
 652         source -= (extraBytesToRead + 1);   /* return to the start */
 653
 654         break;          /* Bail out; shouldn't continue */
 655       }
 656       if(start)
 657               *target = UNI_REPLACEMENT_CHAR;
 658       target++;
 659     }
 660     else
 661     {
 662       /* target is a character in range 0xFFFF - 0x10FFFF. */
 663       if(start)
 664       {
 665         if(target + 1 >= targetEnd)
 666         {
 667                 source -= (extraBytesToRead + 1);   /* Back up source pointer! */
 668                 result = CSR_TargetExhausted;
 669
 670                 break;
 671         }
 672
 673         ch -= halfBase;
 674         target[0] = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
 675         target[1] = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
 676       }
 677       target += 2;
 678     }
 679   }
 680
 681   *sourceStart = source;
 682   *targetStart = target;
 683
 684   RETURN(result);
 685   return result;
 686 }
 687
 688 /***********************************************************************/
 689
 690 ULONG LIBFUNC
 691 CodesetsConvertUTF32toUTF8(REG(a0, const UTF32 ** sourceStart),
 692                            REG(a1, const UTF32 * sourceEnd),
 693                            REG(a2, UTF8 ** targetStart),
 694                            REG(a3, UTF8 * targetEnd),
 695                            REG(d0, ULONG flags))
 696 {
 697   ULONG result = CSR_ConversionOK;
 698   const UTF32 *source = *sourceStart;
 699   UTF8 *target = *targetStart;
 700   UTF8 *start = target;
 701
 702   ENTER();
 703
 704   while(source < sourceEnd)
 705   {
 706     UTF32 ch;
 707     unsigned short bytesToWrite = 0;
 708     const UTF32 byteMask = 0xBF;
 709     const UTF32 byteMark = 0x80;
 710
 711     ch = *source++;
 712
 713     if(flags == CSF_StrictConversion)
 714     {
 715       /* UTF-16 surrogate values are illegal in UTF-32 */
 716       if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
 717       {
 718         --source;       /* return to the illegal value itself */
 719         result = CSR_SourceIllegal;
 720
 721         break;
 722       }
 723     }
 724
 725     /*
 726      * Figure out how many bytes the result will require. Turn any
 727      * illegally large UTF32 things (> Plane 17) into replacement chars.
 728     */
 729     if(ch < (UTF32) 0x80)
 730     {
 731       bytesToWrite = 1;
 732     }
 733     else if(ch < (UTF32) 0x800)
 734     {
 735       bytesToWrite = 2;
 736     }
 737     else if(ch < (UTF32) 0x10000)
 738     {
 739       bytesToWrite = 3;
 740     }
 741     else if(ch <= UNI_MAX_LEGAL_UTF32)
 742     {
 743       bytesToWrite = 4;
 744     }
 745     else
 746     {
 747       bytesToWrite = 3;
 748       ch = UNI_REPLACEMENT_CHAR;
 749       result = CSR_SourceIllegal;
 750     }
 751
 752     target += bytesToWrite;
 753     if(start)
 754     {
 755       if(target > targetEnd)
 756       {
 757         --source;           /* Back up source pointer! */
 758         target -= bytesToWrite;
 759         result = CSR_TargetExhausted;
 760
 761         break;
 762       }
 763       switch(bytesToWrite)
 764       {
 765         /* note: everything falls through. */
 766         case 4:
 767           *--target = (UTF8) ((ch | byteMark) & byteMask);
 768           ch >>= 6;
 769
 770         case 3:
 771           *--target = (UTF8) ((ch | byteMark) & byteMask);
 772           ch >>= 6;
 773
 774         case 2:
 775           *--target = (UTF8) ((ch | byteMark) & byteMask);
 776           ch >>= 6;
 777
 778         case 1:
 779           *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
 780       }
 781
 782       target += bytesToWrite;
 783     }
 784   }
 785
 786   *sourceStart = source;
 787   *targetStart = target;
 788
 789   RETURN(result);
 790   return result;
 791 }
 792
 793 /***********************************************************************/
 794
 795 ULONG LIBFUNC
 796 CodesetsConvertUTF8toUTF32(REG(a0, const UTF8 ** sourceStart),
 797                            REG(a1, const UTF8 * sourceEnd),
 798                            REG(a2, UTF32 ** targetStart),
 799                            REG(a3, UTF32 * targetEnd),
 800                            REG(d0, ULONG flags))
 801 {
 802   ULONG result = CSR_ConversionOK;
 803   const UTF8 *source = *sourceStart;
 804   UTF32 *target = *targetStart;
 805   UTF32 *start = target;
 806
 807   ENTER();
 808
 809   while(source < sourceEnd)
 810   {
 811     UTF32 ch = 0;
 812     unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
 813
 814     if(source + extraBytesToRead >= sourceEnd)
 815     {
 816       result = CSR_SourceExhausted;
 817       break;
 818     }
 819
 820     /* Do this check whether lenient or strict */
 821     if(!CodesetsIsLegalUTF8(source, extraBytesToRead + 1))
 822     {
 823       result = CSR_SourceIllegal;
 824       break;
 825     }
 826
 827     /*
 828      * The cases all fall through. See "Note A" below.
 829     */
 830     switch (extraBytesToRead)
 831     {
 832       case 5:
 833         ch += *source++;
 834         ch <<= 6;
 835
 836       case 4:
 837         ch += *source++;
 838         ch <<= 6;
 839
 840       case 3:
 841         ch += *source++;
 842         ch <<= 6;
 843
 844       case 2:
 845         ch += *source++;
 846         ch <<= 6;
 847
 848       case 1:
 849         ch += *source++;
 850         ch <<= 6;
 851
 852       case 0:
 853         ch += *source++;
 854     }
 855
 856     ch -= offsetsFromUTF8[extraBytesToRead];
 857
 858     if(start)
 859     {
 860       if(target >= targetEnd)
 861       {
 862         source -= (extraBytesToRead + 1);   /* Back up the source pointer! */
 863         result = CSR_TargetExhausted;
 864
 865         break;
 866       }
 867
 868       if(ch <= UNI_MAX_LEGAL_UTF32)
 869       {
 870         /*
 871          * UTF-16 surrogate values are illegal in UTF-32, and anything
 872          * over Plane 17 (> 0x10FFFF) is illegal.
 873         */
 874         if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
 875         {
 876           if(flags == CSF_StrictConversion)
 877           {
 878             source -= (extraBytesToRead + 1);   /* return to the illegal value itself */
 879             result = CSR_SourceIllegal;
 880
 881             break;
 882           }
 883           else
 884           {
 885             *target++ = UNI_REPLACEMENT_CHAR;
 886           }
 887         }
 888         else
 889         {
 890           *target++ = ch;
 891         }
 892       }
 893       else
 894       {
 895         /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
 896         result = CSR_SourceIllegal;
 897         *target++ = UNI_REPLACEMENT_CHAR;
 898       }
 899     }
 900     else
 901       target++;
 902   }
 903
 904   *sourceStart = source;
 905   *targetStart = target;
 906
 907   RETURN(result);
 908   return result;
 909 }
 910
 911 /***********************************************************************
 912
 913     Note A.
 914     The fall-through switches in UTF-8 reading code save a
 915     temp variable, some decrements & conditionals.  The switches
 916     are equivalent to the following loop:
 917     {
 918         int tmpBytesToRead = extraBytesToRead+1;
 919         do {
 920         ch += *source++;
 921         --tmpBytesToRead;
 922         if (tmpBytesToRead) ch <<= 6;
 923         } while (tmpBytesToRead > 0);
 924     }
 925     In UTF-8 writing code, the switches on "bytesToWrite" are
 926     similarly unrolled loops.
 927
 928 ***********************************************************************/