workbench/libs/codesets/src/convertUTF.c

   1 /*
   2  * Copyright 2001-2004 Unicode, Inc.
   3  *
   4  * Disclaimer
   5  *
   6  * This source code is provided as is by Unicode, Inc. No claims are
   7  * made as to fitness for any particular purpose. No warranties of any
   8  * kind are expressed or implied. The recipient agrees to determine
   9  * applicability of information provided. If this file has been
  10  * purchased on magnetic or optical media from Unicode, Inc., the
  11  * sole remedy for any claim will be exchange of defective media
  12  * within 90 days of receipt.
  13  *
  14  * Limitations on Rights to Redistribute This Code
  15  *
  16  * Unicode, Inc. hereby grants the right to freely use the information
  17  * supplied in this file in the creation of products supporting the
  18  * Unicode Standard, and to make copies of this file in any form
  19  * for internal or external distribution as long as this notice
  20  * remains attached.
  21  */
  22
  23 /* ---------------------------------------------------------------------
  24
  25     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
  26     Author: Mark E. Davis, 1994.
  27     Rev History: Rick McGowan, fixes & updates May 2001.
  28     Sept 2001: fixed const & error conditions per
  29         mods suggested by S. Parent & A. Lillich.
  30     June 2002: Tim Dodd added detection and handling of incomplete
  31         source sequences, enhanced error detection, added casts
  32         to eliminate compiler warnings.
  33     July 2003: slight mods to back out aggressive FFFE detection.
  34     Jan 2004: updated switches in from-UTF8 conversions.
  35     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
  36
  37     See the header file "ConvertUTF.h" for complete documentation.
  38
  39 ------------------------------------------------------------------------ */
  40
  41 #include "lib.h"
  42 #include "convertUTF.h"
  43
  44 #include "SDI_lib.h"
  45
  46 #include "debug.h"
  47
  48 #define __NOLIBBASE__
  49 #include <proto/codesets.h>
  50
  51 /***********************************************************************/
  52
  53 static const int halfShift = 10;    /* used for shifting by 10 bits */
  54
  55 static const UTF32 halfBase = 0x0010000UL;
  56 static const UTF32 halfMask = 0x3FFUL;
  57
  58 #define UNI_SUR_HIGH_START  (UTF32)0xD800
  59 #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
  60 #define UNI_SUR_LOW_START   (UTF32)0xDC00
  61 #define UNI_SUR_LOW_END     (UTF32)0xDFFF
  62
  63 /***********************************************************************/
  64
  65 LIBPROTO(CodesetsConvertUTF32toUTF16, ULONG, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, const UTF32 **sourceStart), REG(a1, const UTF32 *sourceEnd), REG(a2, UTF16 **targetStart), REG(a3, UTF16 *targetEnd), REG(d0, ULONG flags))
  66 {
  67   ULONG result = CSR_ConversionOK;
  68   const UTF32 *source = *sourceStart;
  69   UTF16 *target = *targetStart;
  70
  71   ENTER();
  72
  73   while(source < sourceEnd)
  74   {
  75     UTF32 ch;
  76
  77     if(target >= targetEnd)
  78     {
  79       result = CSR_TargetExhausted;
  80       break;
  81     }
  82
  83     ch = *source++;
  84     if(ch <= UNI_MAX_BMP)
  85     {
  86       /* Target is a character <= 0xFFFF */
  87       /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
  88       if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
  89       {
  90         if(flags == CSF_StrictConversion)
  91         {
  92           --source;   /* return to the illegal value itself */
  93           result = CSR_SourceIllegal;
  94           break;
  95         }
  96         else
  97         {
  98           *target++ = UNI_REPLACEMENT_CHAR;
  99         }
 100       }
 101       else
 102       {
 103         *target++ = (UTF16)ch; /* normal case */
 104       }
 105     }
 106     else if(ch > UNI_MAX_LEGAL_UTF32)
 107     {
 108       if(flags == CSF_StrictConversion)
 109       {
 110         result = CSR_SourceIllegal;
 111       }
 112       else
 113       {
 114         *target++ = UNI_REPLACEMENT_CHAR;
 115       }
 116     }
 117     else
 118     {
 119       /* target is a character in range 0xFFFF - 0x10FFFF. */
 120       if(target + 1 >= targetEnd)
 121       {
 122         --source;      /* Back up source pointer! */
 123         result = CSR_TargetExhausted;
 124         break;
 125       }
 126       ch -= halfBase;
 127       *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
 128       *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
 129     }
 130   }
 131
 132   *sourceStart = source;
 133   *targetStart = target;
 134
 135   RETURN(result);
 136   return result;
 137 }
 138
 139 /***********************************************************************/
 140
 141 LIBPROTO(CodesetsConvertUTF16toUTF32, ULONG, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, const UTF16 **sourceStart), REG(a1, const UTF16 *sourceEnd), REG(a2, UTF32 **targetStart), REG(a3, UTF32 *targetEnd), REG(d0, ULONG flags))
 142 {
 143   ULONG result = CSR_ConversionOK;
 144   const UTF16 *source = *sourceStart;
 145   UTF32 *target = *targetStart;
 146   UTF32 ch=0, ch2=0;
 147
 148   ENTER();
 149
 150   while(source < sourceEnd)
 151   {
 152     const UTF16 *oldSource = source;    /*  In case we have to back up because of target overflow. */
 153
 154     ch = *source++;
 155     /* If we have a surrogate pair, convert to UTF32 first. */
 156     if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
 157     {
 158       /* If the 16 bits following the high surrogate are in the source buffer... */
 159       if(source < sourceEnd)
 160       {
 161         ch2 = *source;
 162
 163         /* If it's a low surrogate, convert to UTF32. */
 164         if(ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
 165         {
 166           ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
 167                 + (ch2 - UNI_SUR_LOW_START) + halfBase;
 168
 169           ++source;
 170         }
 171         else if(flags == CSF_StrictConversion)
 172         {
 173           /* it's an unpaired high surrogate */
 174           --source;   /* return to the illegal value itself */
 175           result = CSR_SourceIllegal;
 176
 177           break;
 178         }
 179       }
 180       else
 181       {
 182         /* We don't have the 16 bits following the high surrogate. */
 183         --source;       /* return to the high surrogate */
 184         result = CSR_SourceExhausted;
 185
 186         break;
 187       }
 188     }
 189     else if (flags == CSF_StrictConversion)
 190     {
 191       /* UTF-16 surrogate values are illegal in UTF-32 */
 192       if(ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
 193       {
 194         --source;       /* return to the illegal value itself */
 195         result = CSR_SourceIllegal;
 196
 197         break;
 198       }
 199     }
 200
 201     if(target >= targetEnd)
 202     {
 203       source = oldSource; /* Back up source pointer! */
 204       result = CSR_TargetExhausted;
 205
 206       break;
 207     }
 208     *target++ = ch;
 209   }
 210
 211   *sourceStart = source;
 212   *targetStart = target;
 213
 214   #if defined(DEBUG)
 215   if(result == CSR_SourceIllegal)
 216   {
 217     E(DBF_UTF, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x", ch, ch2);
 218   }
 219   #endif
 220
 221   RETURN(result);
 222   return result;
 223 }
 224
 225 /***********************************************************************/
 226
 227 /*
 228  * Index into the table below with the first byte of a UTF-8 sequence to
 229  * get the number of trailing bytes that are supposed to follow it.
 230  * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
 231  * left as-is for anyone who may want to do such conversion, which was
 232  * allowed in earlier algorithms.
 233  */
 234 const char trailingBytesForUTF8[256] = {
 235     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 236     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 237     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 238     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 239     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 240     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 241     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 242     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
 243 };
 244
 245 /*
 246  * Magic values subtracted from a buffer value during UTF8 conversion.
 247  * This table contains as many values as there might be trailing bytes
 248  * in a UTF-8 sequence.
 249  */
 250 static const UTF32 offsetsFromUTF8[6] = {
 251     0x00000000UL, 0x00003080UL, 0x000E2080UL,
 252     0x03C82080UL, 0xFA082080UL, 0x82082080UL
 253 };
 254
 255 /*
 256  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
 257  * into the first byte, depending on how many bytes follow.  There are
 258  * as many entries in this table as there are UTF-8 sequence types.
 259  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
 260  * for *legal* UTF-8 will be 4 or fewer bytes total.
 261  */
 262 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 263
 264 /***********************************************************************/
 265
 266 /* The interface converts a whole buffer to avoid function-call overhead.
 267  * Constants have been gathered. Loops & conditionals have been removed as
 268  * much as possible for efficiency, in favor of drop-through switches.
 269  * (See "Note A" at the bottom of the file for equivalent code.)
 270  * If your compiler supports it, the "isLegalUTF8" call can be turned
 271  * into an inline function.
 272  */
 273
 274 /***********************************************************************/
 275
 276 LIBPROTO(CodesetsConvertUTF16toUTF8, ULONG, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, const UTF16 **sourceStart), REG(a1, const UTF16 *sourceEnd) , REG(a2, UTF8 **targetStart), REG(a3, UTF8 *targetEnd), REG(d0, ULONG flags))
 277 {
 278   ULONG result = CSR_ConversionOK;
 279   const UTF16 *source = *sourceStart;
 280   UTF8 *target = *targetStart;
 281   UTF8 *start = target;
 282
 283   ENTER();
 284
 285   while(source < sourceEnd)
 286   {
 287     UTF32 ch;
 288     unsigned short bytesToWrite = 0;
 289     const UTF32 byteMask = 0xBF;
 290     const UTF32 byteMark = 0x80;
 291     const UTF16 *oldSource = source;    /* In case we have to back up because of target overflow. */
 292
 293     ch = *source++;
 294
 295     /* If we have a surrogate pair, convert to UTF32 first. */
 296     if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
 297     {
 298       /* If the 16 bits following the high surrogate are in the source buffer... */
 299       if(source < sourceEnd)
 300       {
 301         UTF32 ch2 = *source;
 302
 303         /* If it's a low surrogate, convert to UTF32. */
 304         if(ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
 305         {
 306           ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
 307                 + (ch2 - UNI_SUR_LOW_START) + halfBase;
 308
 309           ++source;
 310         }
 311         else if(flags == CSF_StrictConversion)
 312         {
 313           /* it's an unpaired high surrogate */
 314           --source;   /* return to the illegal value itself */
 315           result = CSR_SourceIllegal;
 316           break;
 317         }
 318       }
 319       else
 320       {
 321         /* We don't have the 16 bits following the high surrogate. */
 322         --source;       /* return to the high surrogate */
 323         result = CSR_SourceExhausted;
 324
 325         break;
 326       }
 327     }
 328     else if(flags == CSF_StrictConversion)
 329     {
 330       /* UTF-16 surrogate values are illegal in UTF-32 */
 331       if(ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
 332       {
 333         --source;       /* return to the illegal value itself */
 334         result = CSR_SourceIllegal;
 335
 336         break;
 337       }
 338     }
 339     /* Figure out how many bytes the result will require */
 340     if(ch < (UTF32) 0x80)
 341     {
 342       bytesToWrite = 1;
 343     }
 344     else if (ch < (UTF32) 0x800)
 345     {
 346       bytesToWrite = 2;
 347     }
 348     else if (ch < (UTF32) 0x10000)
 349     {
 350       bytesToWrite = 3;
 351     }
 352     else if (ch < (UTF32) 0x110000)
 353     {
 354       bytesToWrite = 4;
 355     }
 356     else
 357     {
 358       bytesToWrite = 3;
 359       ch = UNI_REPLACEMENT_CHAR;
 360     }
 361
 362     target += bytesToWrite;
 363     if(start)
 364     {
 365       if(target > targetEnd)
 366       {
 367         source = oldSource; /* Back up source pointer! */
 368         target -= bytesToWrite;
 369         result = CSR_TargetExhausted;
 370
 371         break;
 372       }
 373       switch(bytesToWrite)
 374       {
 375         /* note: everything falls through. */
 376         case 4:
 377           *--target = (UTF8) ((ch | byteMark) & byteMask);
 378           ch >>= 6;
 379
 380         case 3:
 381           *--target = (UTF8) ((ch | byteMark) & byteMask);
 382           ch >>= 6;
 383
 384         case 2:
 385           *--target = (UTF8) ((ch | byteMark) & byteMask);
 386           ch >>= 6;
 387
 388         case 1:
 389           *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
 390       }
 391
 392       target += bytesToWrite;
 393     }
 394   }
 395
 396   *sourceStart = source;
 397   *targetStart = target;
 398
 399   RETURN(result);
 400   return result;
 401 }
 402
 403 /***********************************************************************/
 404
 405 /*
 406  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 407  * This must be called with the length pre-determined by the first byte.
 408  * If not calling this from ConvertUTF8to*, then the length can be set by:
 409  *  length = trailingBytesForUTF8[*source]+1;
 410  * and the sequence is illegal right away if there aren't that many bytes
 411  * available.
 412  * If presented with a length > 4, this returns FALSE.  The Unicode
 413  * definition of UTF-8 goes up to 4-byte sequences.
 414  */
 415
 416 LIBPROTO(CodesetsIsLegalUTF8, BOOL, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, const UTF8 *source), REG(d0, ULONG length))
 417 {
 418   UTF8 a;
 419   const UTF8 *srcptr = source + length;
 420
 421   ENTER();
 422
 423   switch(length)
 424   {
 425     default:
 426       RETURN(FALSE);
 427       return FALSE;
 428
 429     /* Everything else falls through when "TRUE"... */
 430     case 4:
 431       if((a = (*--srcptr)) < 0x80 || a > 0xBF)
 432       {
 433         RETURN(FALSE);
 434         return FALSE;
 435       }
 436
 437     case 3:
 438       if((a = (*--srcptr)) < 0x80 || a > 0xBF)
 439       {
 440         RETURN(FALSE);
 441         return FALSE;
 442       }
 443
 444     case 2:
 445       if((a = (*--srcptr)) > 0xBF)
 446       {
 447         RETURN(FALSE);
 448         return FALSE;
 449       }
 450
 451       switch (*source)
 452       {
 453         /* no fall-through in this inner switch */
 454         case 0xE0:
 455           if(a < 0xA0)
 456           {
 457             RETURN(FALSE);
 458             return FALSE;
 459           }
 460         break;
 461
 462         case 0xED:
 463           if(a > 0x9F)
 464           {
 465             RETURN(FALSE);
 466             return FALSE;
 467           }
 468         break;
 469
 470         case 0xF0:
 471           if(a < 0x90)
 472           {
 473             RETURN(FALSE);
 474             return FALSE;
 475           }
 476           break;
 477
 478         case 0xF4:
 479           if(a > 0x8F)
 480           {
 481             RETURN(FALSE);
 482             return FALSE;
 483           }
 484         break;
 485
 486         default:
 487           if(a < 0x80)
 488           {
 489             RETURN(FALSE);
 490             return FALSE;
 491           }
 492       }
 493
 494     case 1:
 495       if(*source >= 0x80 && *source < 0xC2)
 496       {
 497         RETURN(FALSE);
 498         return FALSE;
 499       }
 500   }
 501
 502   if(*source > 0xF4)
 503   {
 504     RETURN(FALSE);
 505     return FALSE;
 506   }
 507
 508   RETURN(TRUE);
 509   return TRUE;
 510 }
 511
 512 /***********************************************************************/
 513
 514 /*
 515  * Exported function to return whether a UTF-8 sequence is legal or not.
 516  * This is not used here; it's just exported.
 517  */
 518
 519 LIBPROTO(CodesetsIsLegalUTF8Sequence, BOOL, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, const UTF8 *source), REG(a1, const UTF8 *sourceEnd))
 520 {
 521   int length = trailingBytesForUTF8[*source] + 1;
 522   BOOL res = FALSE;
 523
 524   ENTER();
 525
 526   if(source + length > sourceEnd)
 527   {
 528     RETURN(FALSE);
 529     return FALSE;
 530   }
 531
 532   res = CodesetsIsLegalUTF8(source, length);
 533
 534   RETURN(res);
 535   return res;
 536 }
 537
 538 /***********************************************************************/
 539
 540 LIBPROTO(CodesetsConvertUTF8toUTF16, ULONG, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, const UTF8 **sourceStart), REG(a1, const UTF8 *sourceEnd), REG(a2, UTF16 **targetStart), REG(a3, UTF16 *targetEnd), REG(d0, ULONG flags))
 541 {
 542   ULONG result = CSR_ConversionOK;
 543   const UTF8 *source = *sourceStart;
 544   UTF16 *target = *targetStart;
 545   UTF16 *start = target;
 546
 547   ENTER();
 548
 549   while(source < sourceEnd)
 550   {
 551     UTF32 ch = 0;
 552     unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
 553
 554     if(source + extraBytesToRead >= sourceEnd)
 555     {
 556       result = CSR_SourceExhausted;
 557       break;
 558     }
 559
 560     /* Do this check whether lenient or strict */
 561     if(!CodesetsIsLegalUTF8 (source, extraBytesToRead + 1))
 562     {
 563       result = CSR_SourceIllegal;
 564       break;
 565     }
 566
 567     /*
 568      * The cases all fall through. See "Note A" below.
 569      */
 570     switch (extraBytesToRead)
 571     {
 572       case 5:
 573         ch += *source++;
 574         ch <<= 6;       /* remember, illegal UTF-8 */
 575
 576       case 4:
 577         ch += *source++;
 578         ch <<= 6;       /* remember, illegal UTF-8 */
 579
 580       case 3:
 581         ch += *source++;
 582         ch <<= 6;
 583
 584       case 2:
 585         ch += *source++;
 586         ch <<= 6;
 587
 588       case 1:
 589         ch += *source++;
 590         ch <<= 6;
 591
 592       case 0:
 593         ch += *source++;
 594     }
 595
 596     ch -= offsetsFromUTF8[extraBytesToRead];
 597
 598     if(start && (target >= targetEnd))
 599     {
 600       source -= (extraBytesToRead + 1);   /* Back up source pointer! */
 601       result = CSR_TargetExhausted;
 602
 603       break;
 604     }
 605
 606     if(ch <= UNI_MAX_BMP)
 607     {
 608       /* Target is a character <= 0xFFFF */
 609       /* UTF-16 surrogate values are illegal in UTF-32 */
 610       if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
 611       {
 612         if(flags == CSF_StrictConversion)
 613         {
 614           source -= (extraBytesToRead + 1);   /* return to the illegal value itself */
 615           result = CSR_SourceIllegal;
 616
 617           break;
 618         }
 619         else
 620           ch = UNI_REPLACEMENT_CHAR;
 621       }
 622       if(start)
 623         *target = (UTF16) ch; /* normal case */
 624       target++;
 625     }
 626     else if(ch > UNI_MAX_UTF16)
 627     {
 628       if(flags == CSF_StrictConversion)
 629       {
 630         result = CSR_SourceIllegal;
 631         source -= (extraBytesToRead + 1);   /* return to the start */
 632
 633         break;          /* Bail out; shouldn't continue */
 634       }
 635       if(start)
 636         *target = UNI_REPLACEMENT_CHAR;
 637       target++;
 638     }
 639     else
 640     {
 641       /* target is a character in range 0xFFFF - 0x10FFFF. */
 642       if(start)
 643       {
 644         if(target + 1 >= targetEnd)
 645         {
 646           source -= (extraBytesToRead + 1);   /* Back up source pointer! */
 647           result = CSR_TargetExhausted;
 648
 649           break;
 650         }
 651
 652         ch -= halfBase;
 653         target[0] = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
 654         target[1] = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
 655       }
 656       target += 2;
 657     }
 658   }
 659
 660   *sourceStart = source;
 661   *targetStart = target;
 662
 663   RETURN(result);
 664   return result;
 665 }
 666
 667 /***********************************************************************/
 668
 669 LIBPROTO(CodesetsConvertUTF32toUTF8, ULONG, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, const UTF32 **sourceStart), REG(a1, const UTF32 *sourceEnd), REG(a2, UTF8 **targetStart), REG(a3, UTF8 *targetEnd), REG(d0, ULONG flags))
 670 {
 671   ULONG result = CSR_ConversionOK;
 672   const UTF32 *source = *sourceStart;
 673   UTF8 *target = *targetStart;
 674   UTF8 *start = target;
 675
 676   ENTER();
 677
 678   while(source < sourceEnd)
 679   {
 680     UTF32 ch;
 681     unsigned short bytesToWrite = 0;
 682     const UTF32 byteMask = 0xBF;
 683     const UTF32 byteMark = 0x80;
 684
 685     ch = *source++;
 686
 687     if(flags == CSF_StrictConversion)
 688     {
 689       /* UTF-16 surrogate values are illegal in UTF-32 */
 690       if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
 691       {
 692         --source;       /* return to the illegal value itself */
 693         result = CSR_SourceIllegal;
 694
 695         break;
 696       }
 697     }
 698
 699     /*
 700      * Figure out how many bytes the result will require. Turn any
 701      * illegally large UTF32 things (> Plane 17) into replacement chars.
 702     */
 703     if(ch < (UTF32) 0x80)
 704     {
 705       bytesToWrite = 1;
 706     }
 707     else if(ch < (UTF32) 0x800)
 708     {
 709       bytesToWrite = 2;
 710     }
 711     else if(ch < (UTF32) 0x10000)
 712     {
 713       bytesToWrite = 3;
 714     }
 715     else if(ch <= UNI_MAX_LEGAL_UTF32)
 716     {
 717       bytesToWrite = 4;
 718     }
 719     else
 720     {
 721       bytesToWrite = 3;
 722       ch = UNI_REPLACEMENT_CHAR;
 723       result = CSR_SourceIllegal;
 724     }
 725
 726     target += bytesToWrite;
 727     if(start)
 728     {
 729       if(target > targetEnd)
 730       {
 731         --source;           /* Back up source pointer! */
 732         target -= bytesToWrite;
 733         result = CSR_TargetExhausted;
 734
 735         break;
 736       }
 737       switch(bytesToWrite)
 738       {
 739         /* note: everything falls through. */
 740         case 4:
 741           *--target = (UTF8) ((ch | byteMark) & byteMask);
 742           ch >>= 6;
 743
 744         case 3:
 745           *--target = (UTF8) ((ch | byteMark) & byteMask);
 746           ch >>= 6;
 747
 748         case 2:
 749           *--target = (UTF8) ((ch | byteMark) & byteMask);
 750           ch >>= 6;
 751
 752         case 1:
 753           *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
 754       }
 755
 756       target += bytesToWrite;
 757     }
 758   }
 759
 760   *sourceStart = source;
 761   *targetStart = target;
 762
 763   RETURN(result);
 764   return result;
 765 }
 766
 767 /***********************************************************************/
 768
 769 LIBPROTO(CodesetsConvertUTF8toUTF32, ULONG, REG(a6, UNUSED __BASE_OR_IFACE), REG(a0, const UTF8 **sourceStart), REG(a1, const UTF8 *sourceEnd), REG(a2, UTF32 **targetStart), REG(a3, UTF32 *targetEnd), REG(d0, ULONG flags))
 770 {
 771   ULONG result = CSR_ConversionOK;
 772   const UTF8 *source = *sourceStart;
 773   UTF32 *target = *targetStart;
 774   UTF32 *start = target;
 775
 776   ENTER();
 777
 778   while(source < sourceEnd)
 779   {
 780     UTF32 ch = 0;
 781     unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
 782
 783     if(source + extraBytesToRead >= sourceEnd)
 784     {
 785       result = CSR_SourceExhausted;
 786       break;
 787     }
 788
 789     /* Do this check whether lenient or strict */
 790     if(!CodesetsIsLegalUTF8(source, extraBytesToRead + 1))
 791     {
 792       result = CSR_SourceIllegal;
 793       break;
 794     }
 795
 796     /*
 797      * The cases all fall through. See "Note A" below.
 798     */
 799     switch (extraBytesToRead)
 800     {
 801       case 5:
 802         ch += *source++;
 803         ch <<= 6;
 804
 805       case 4:
 806         ch += *source++;
 807         ch <<= 6;
 808
 809       case 3:
 810         ch += *source++;
 811         ch <<= 6;
 812
 813       case 2:
 814         ch += *source++;
 815         ch <<= 6;
 816
 817       case 1:
 818         ch += *source++;
 819         ch <<= 6;
 820
 821       case 0:
 822         ch += *source++;
 823     }
 824
 825     ch -= offsetsFromUTF8[extraBytesToRead];
 826
 827     if(start)
 828     {
 829       if(target >= targetEnd)
 830       {
 831         source -= (extraBytesToRead + 1);   /* Back up the source pointer! */
 832         result = CSR_TargetExhausted;
 833
 834         break;
 835       }
 836
 837       if(ch <= UNI_MAX_LEGAL_UTF32)
 838       {
 839         /*
 840          * UTF-16 surrogate values are illegal in UTF-32, and anything
 841          * over Plane 17 (> 0x10FFFF) is illegal.
 842         */
 843         if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
 844         {
 845           if(flags == CSF_StrictConversion)
 846           {
 847             source -= (extraBytesToRead + 1);   /* return to the illegal value itself */
 848             result = CSR_SourceIllegal;
 849
 850             break;
 851           }
 852           else
 853           {
 854             *target++ = UNI_REPLACEMENT_CHAR;
 855           }
 856         }
 857         else
 858         {
 859           *target++ = ch;
 860         }
 861       }
 862       else
 863       {
 864         /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
 865         result = CSR_SourceIllegal;
 866         *target++ = UNI_REPLACEMENT_CHAR;
 867       }
 868     }
 869     else
 870       target++;
 871   }
 872
 873   *sourceStart = source;
 874   *targetStart = target;
 875
 876   RETURN(result);
 877   return result;
 878 }
 879
 880 /***********************************************************************
 881
 882     Note A.
 883     The fall-through switches in UTF-8 reading code save a
 884     temp variable, some decrements & conditionals.  The switches
 885     are equivalent to the following loop:
 886     {
 887         int tmpBytesToRead = extraBytesToRead+1;
 888         do {
 889         ch += *source++;
 890         --tmpBytesToRead;
 891         if (tmpBytesToRead) ch <<= 6;
 892         } while (tmpBytesToRead > 0);
 893     }
 894     In UTF-8 writing code, the switches on "bytesToWrite" are
 895     similarly unrolled loops.
 896
 897 ***********************************************************************/