2 * Copyright 2001-2004 Unicode, Inc.
6 * This source code is provided as is by Unicode, Inc. No claims are
7 * made as to fitness for any particular purpose. No warranties of any
8 * kind are expressed or implied. The recipient agrees to determine
9 * applicability of information provided. If this file has been
10 * purchased on magnetic or optical media from Unicode, Inc., the
11 * sole remedy for any claim will be exchange of defective media
12 * within 90 days of receipt.
14 * Limitations on Rights to Redistribute This Code
16 * Unicode, Inc. hereby grants the right to freely use the information
17 * supplied in this file in the creation of products supporting the
18 * Unicode Standard, and to make copies of this file in any form
19 * for internal or external distribution as long as this notice
23 /* ---------------------------------------------------------------------
25 Conversions between UTF-32, UTF-16, and UTF-8. Source code file.
26 Author: Mark E. Davis, 1994.
27 Rev History: Rick McGowan, fixes & updates May 2001.
28 Sept 2001: fixed const & error conditions per
29 mods suggested by S. Parent & A. Lillich.
30 June 2002: Tim Dodd added detection and handling of incomplete
31 source sequences, enhanced error detection, added casts
32 to eliminate compiler warnings.
33 July 2003: slight mods to back out aggressive FFFE detection.
34 Jan 2004: updated switches in from-UTF8 conversions.
35 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
37 See the header file "ConvertUTF.h" for complete documentation.
39 ------------------------------------------------------------------------ */
42 #include "convertUTF.h"
48 /***********************************************************************/
50 static const int halfShift
= 10; /* used for shifting by 10 bits */
52 static const UTF32 halfBase
= 0x0010000UL
;
53 static const UTF32 halfMask
= 0x3FFUL
;
55 #define UNI_SUR_HIGH_START (UTF32)0xD800
56 #define UNI_SUR_HIGH_END (UTF32)0xDBFF
57 #define UNI_SUR_LOW_START (UTF32)0xDC00
58 #define UNI_SUR_LOW_END (UTF32)0xDFFF
60 /***********************************************************************/
63 CodesetsConvertUTF32toUTF16(REG(a0
, const UTF32
** sourceStart
),
64 REG(a1
, const UTF32
* sourceEnd
),
65 REG(a2
, UTF16
** targetStart
),
66 REG(a3
, UTF16
* targetEnd
),
69 ULONG result
= CSR_ConversionOK
;
70 const UTF32
*source
= *sourceStart
;
71 UTF16
*target
= *targetStart
;
75 while(source
< sourceEnd
)
79 if(target
>= targetEnd
)
81 result
= CSR_TargetExhausted
;
88 /* Target is a character <= 0xFFFF */
89 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
90 if(ch
>= UNI_SUR_HIGH_START
&& ch
<= UNI_SUR_LOW_END
)
92 if(flags
== CSF_StrictConversion
)
94 --source
; /* return to the illegal value itself */
95 result
= CSR_SourceIllegal
;
100 *target
++ = UNI_REPLACEMENT_CHAR
;
105 *target
++ = (UTF16
)ch
; /* normal case */
108 else if(ch
> UNI_MAX_LEGAL_UTF32
)
110 if(flags
== CSF_StrictConversion
)
112 result
= CSR_SourceIllegal
;
116 *target
++ = UNI_REPLACEMENT_CHAR
;
121 /* target is a character in range 0x10000 - 0x10FFFF. */
122 if(target
+ 1 >= targetEnd
)
124 --source
; /* Back up source pointer! */
125 result
= CSR_TargetExhausted
;
129 *target
++ = (UTF16
) ((ch
>> halfShift
) + UNI_SUR_HIGH_START
);
130 *target
++ = (UTF16
) ((ch
& halfMask
) + UNI_SUR_LOW_START
);
134 *sourceStart
= source
;
135 *targetStart
= target
;
141 /***********************************************************************/
144 CodesetsConvertUTF16toUTF32(REG(a0
, const UTF16
** sourceStart
),
145 REG(a1
, const UTF16
* sourceEnd
),
146 REG(a2
, UTF32
** targetStart
),
147 REG(a3
, UTF32
* targetEnd
),
148 REG(d0
, ULONG flags
))
150 ULONG result
= CSR_ConversionOK
;
151 const UTF16
*source
= *sourceStart
;
152 UTF32
*target
= *targetStart
;
157 while(source
< sourceEnd
)
159 const UTF16
*oldSource
= source
; /* In case we have to back up because of target overflow. */
162 /* If we have a surrogate pair, convert to UTF32 first. */
163 if(ch
>= UNI_SUR_HIGH_START
&& ch
<= UNI_SUR_HIGH_END
)
165 /* If the 16 bits following the high surrogate are in the source buffer... */
166 if(source
< sourceEnd
)
170 /* If it's a low surrogate, convert to UTF32. */
171 if(ch2
>= UNI_SUR_LOW_START
&& ch2
<= UNI_SUR_LOW_END
)
173 ch
= ((ch
- UNI_SUR_HIGH_START
) << halfShift
)
174 + (ch2
- UNI_SUR_LOW_START
) + halfBase
;
178 else if(flags
== CSF_StrictConversion
)
180 /* it's an unpaired high surrogate */
181 --source
; /* return to the illegal value itself */
182 result
= CSR_SourceIllegal
;
189 /* We don't have the 16 bits following the high surrogate. */
190 --source
; /* return to the high surrogate */
191 result
= CSR_SourceExhausted
;
196 else if (flags
== CSF_StrictConversion
)
198 /* UTF-16 surrogate values are illegal in UTF-32 */
199 if(ch
>= UNI_SUR_LOW_START
&& ch
<= UNI_SUR_LOW_END
)
201 --source
; /* return to the illegal value itself */
202 result
= CSR_SourceIllegal
;
208 if(target
>= targetEnd
)
210 source
= oldSource
; /* Back up source pointer! */
211 result
= CSR_TargetExhausted
;
218 *sourceStart
= source
;
219 *targetStart
= target
;
222 if(result
== CSR_SourceIllegal
)
224 E(DBF_UTF
, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x", ch
, ch2
);
232 /***********************************************************************/
235 * Index into the table below with the first byte of a UTF-8 sequence to
236 * get the number of trailing bytes that are supposed to follow it.
237 * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The table is
238 * left as-is for anyone who may want to do such conversion, which was
239 * allowed in earlier algorithms.
241 const char trailingBytesForUTF8
[256] = {
242 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
243 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
244 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
245 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
246 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
247 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
248 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
249 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
253 * Magic values subtracted from a buffer value during UTF8 conversion.
254 * This table contains as many values as there might be trailing bytes
255 * in a UTF-8 sequence.
257 static const UTF32 offsetsFromUTF8
[6] = {
258 0x00000000UL
, 0x00003080UL
, 0x000E2080UL
,
259 0x03C82080UL
, 0xFA082080UL
, 0x82082080UL
263 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
264 * into the first byte, depending on how many bytes follow. There are
265 * as many entries in this table as there are UTF-8 sequence types.
266 * (I.e., one byte sequence, two byte... etc.). Remember that sequences
267 * for *legal* UTF-8 will be 4 or fewer bytes total.
269 static const UTF8 firstByteMark
[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
271 /***********************************************************************/
273 /* The interface converts a whole buffer to avoid function-call overhead.
274 * Constants have been gathered. Loops & conditionals have been removed as
275 * much as possible for efficiency, in favor of drop-through switches.
276 * (See "Note A" at the bottom of the file for equivalent code.)
277 * If your compiler supports it, the "isLegalUTF8" call can be turned
278 * into an inline function.
281 /***********************************************************************/
284 CodesetsConvertUTF16toUTF8(REG(a0
, const UTF16
** sourceStart
),
285 REG(a1
, const UTF16
* sourceEnd
),
286 REG(a2
, UTF8
** targetStart
),
287 REG(a3
, UTF8
* targetEnd
),
288 REG(d0
, ULONG flags
))
290 ULONG result
= CSR_ConversionOK
;
291 const UTF16
*source
= *sourceStart
;
292 UTF8
*target
= *targetStart
;
293 UTF8
*start
= target
;
297 while(source
< sourceEnd
)
300 unsigned short bytesToWrite
= 0;
301 const UTF32 byteMask
= 0xBF;
302 const UTF32 byteMark
= 0x80;
303 const UTF16
*oldSource
= source
; /* In case we have to back up because of target overflow. */
307 /* If we have a surrogate pair, convert to UTF32 first. */
308 if(ch
>= UNI_SUR_HIGH_START
&& ch
<= UNI_SUR_HIGH_END
)
310 /* If the 16 bits following the high surrogate are in the source buffer... */
311 if(source
< sourceEnd
)
315 /* If it's a low surrogate, convert to UTF32. */
316 if(ch2
>= UNI_SUR_LOW_START
&& ch2
<= UNI_SUR_LOW_END
)
318 ch
= ((ch
- UNI_SUR_HIGH_START
) << halfShift
)
319 + (ch2
- UNI_SUR_LOW_START
) + halfBase
;
323 else if(flags
== CSF_StrictConversion
)
325 /* it's an unpaired high surrogate */
326 --source
; /* return to the illegal value itself */
327 result
= CSR_SourceIllegal
;
333 /* We don't have the 16 bits following the high surrogate. */
334 --source
; /* return to the high surrogate */
335 result
= CSR_SourceExhausted
;
340 else if(flags
== CSF_StrictConversion
)
342 /* An unpaired low surrogate is illegal in the UTF-16 source */
343 if(ch
>= UNI_SUR_LOW_START
&& ch
<= UNI_SUR_LOW_END
)
345 --source
; /* return to the illegal value itself */
346 result
= CSR_SourceIllegal
;
351 /* Figure out how many bytes the result will require */
352 if(ch
< (UTF32
) 0x80)
356 else if (ch
< (UTF32
) 0x800)
360 else if (ch
< (UTF32
) 0x10000)
364 else if (ch
< (UTF32
) 0x110000)
371 ch
= UNI_REPLACEMENT_CHAR
;
374 target
+= bytesToWrite
;
377 if(target
> targetEnd
)
379 source
= oldSource
; /* Back up source pointer! */
380 target
-= bytesToWrite
;
381 result
= CSR_TargetExhausted
;
387 /* note: everything falls through. */
389 *--target
= (UTF8
) ((ch
| byteMark
) & byteMask
);
393 *--target
= (UTF8
) ((ch
| byteMark
) & byteMask
);
397 *--target
= (UTF8
) ((ch
| byteMark
) & byteMask
);
401 *--target
= (UTF8
) (ch
| firstByteMark
[bytesToWrite
]);
404 target
+= bytesToWrite
;
408 *sourceStart
= source
;
409 *targetStart
= target
;
415 /***********************************************************************/
418 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
419 * This must be called with the length pre-determined by the first byte.
420 * If not calling this from ConvertUTF8to*, then the length can be set by:
421 * length = trailingBytesForUTF8[*source]+1;
422 * and the sequence is illegal right away if there aren't that many bytes available.
424 * If presented with a length > 4, this returns FALSE. The Unicode
425 * definition of UTF-8 goes up to 4-byte sequences.
429 CodesetsIsLegalUTF8(REG(a0
, const UTF8
* source
),
430 REG(d0
, ULONG length
))
433 const UTF8
*srcptr
= source
+ length
;
443 /* Everything else falls through when "TRUE"... */
445 if((a
= (*--srcptr
)) < 0x80 || a
> 0xBF)
452 if((a
= (*--srcptr
)) < 0x80 || a
> 0xBF)
459 if((a
= (*--srcptr
)) > 0xBF)
467 /* no fall-through in this inner switch */
509 if(*source
>= 0x80 && *source
< 0xC2)
526 /***********************************************************************/
529 * Exported function to return whether a UTF-8 sequence is legal or not.
530 * This is not used here; it's just exported.
534 CodesetsIsLegalUTF8Sequence(REG(a0
, const UTF8
* source
),
535 REG(a1
, const UTF8
* sourceEnd
))
537 int length
= trailingBytesForUTF8
[*source
] + 1;
542 if(source
+ length
> sourceEnd
)
548 res
= CodesetsIsLegalUTF8(source
, length
);
554 /***********************************************************************/
557 CodesetsConvertUTF8toUTF16(REG(a0
, const UTF8
** sourceStart
),
558 REG(a1
, const UTF8
* sourceEnd
),
559 REG(a2
, UTF16
** targetStart
),
560 REG(a3
, UTF16
* targetEnd
),
561 REG(d0
, ULONG flags
))
563 ULONG result
= CSR_ConversionOK
;
564 const UTF8
*source
= *sourceStart
;
565 UTF16
*target
= *targetStart
;
566 UTF16
*start
= target
;
570 while(source
< sourceEnd
)
573 unsigned short extraBytesToRead
= trailingBytesForUTF8
[*source
];
575 if(source
+ extraBytesToRead
>= sourceEnd
)
577 result
= CSR_SourceExhausted
;
581 /* Do this check whether lenient or strict */
582 if(!CodesetsIsLegalUTF8 (source
, extraBytesToRead
+ 1))
584 result
= CSR_SourceIllegal
;
589 * The cases all fall through. See "Note A" below.
591 switch (extraBytesToRead
)
595 ch
<<= 6; /* remember, illegal UTF-8 */
599 ch
<<= 6; /* remember, illegal UTF-8 */
617 ch
-= offsetsFromUTF8
[extraBytesToRead
];
619 if(start
&& (target
>= targetEnd
))
621 source
-= (extraBytesToRead
+ 1); /* Back up source pointer! */
622 result
= CSR_TargetExhausted
;
627 if(ch
<= UNI_MAX_BMP
)
629 /* Target is a character <= 0xFFFF */
630 /* UTF-16 surrogate values are illegal in UTF-32 */
631 if(ch
>= UNI_SUR_HIGH_START
&& ch
<= UNI_SUR_LOW_END
)
633 if(flags
== CSF_StrictConversion
)
635 source
-= (extraBytesToRead
+ 1); /* return to the illegal value itself */
636 result
= CSR_SourceIllegal
;
641 ch
= UNI_REPLACEMENT_CHAR
;
644 *target
= (UTF16
) ch
; /* normal case */
647 else if(ch
> UNI_MAX_UTF16
)
649 if(flags
== CSF_StrictConversion
)
651 result
= CSR_SourceIllegal
;
652 source
-= (extraBytesToRead
+ 1); /* return to the start */
654 break; /* Bail out; shouldn't continue */
657 *target
= UNI_REPLACEMENT_CHAR
;
662 /* target is a character in range 0x10000 - 0x10FFFF. */
665 if(target
+ 1 >= targetEnd
)
667 source
-= (extraBytesToRead
+ 1); /* Back up source pointer! */
668 result
= CSR_TargetExhausted
;
674 target
[0] = (UTF16
) ((ch
>> halfShift
) + UNI_SUR_HIGH_START
);
675 target
[1] = (UTF16
) ((ch
& halfMask
) + UNI_SUR_LOW_START
);
681 *sourceStart
= source
;
682 *targetStart
= target
;
688 /***********************************************************************/
691 CodesetsConvertUTF32toUTF8(REG(a0
, const UTF32
** sourceStart
),
692 REG(a1
, const UTF32
* sourceEnd
),
693 REG(a2
, UTF8
** targetStart
),
694 REG(a3
, UTF8
* targetEnd
),
695 REG(d0
, ULONG flags
))
697 ULONG result
= CSR_ConversionOK
;
698 const UTF32
*source
= *sourceStart
;
699 UTF8
*target
= *targetStart
;
700 UTF8
*start
= target
;
704 while(source
< sourceEnd
)
707 unsigned short bytesToWrite
= 0;
708 const UTF32 byteMask
= 0xBF;
709 const UTF32 byteMark
= 0x80;
713 if(flags
== CSF_StrictConversion
)
715 /* UTF-16 surrogate values are illegal in UTF-32 */
716 if(ch
>= UNI_SUR_HIGH_START
&& ch
<= UNI_SUR_LOW_END
)
718 --source
; /* return to the illegal value itself */
719 result
= CSR_SourceIllegal
;
726 * Figure out how many bytes the result will require. Turn any
727 * illegally large UTF32 things (> Plane 17) into replacement chars.
729 if(ch
< (UTF32
) 0x80)
733 else if(ch
< (UTF32
) 0x800)
737 else if(ch
< (UTF32
) 0x10000)
741 else if(ch
<= UNI_MAX_LEGAL_UTF32
)
748 ch
= UNI_REPLACEMENT_CHAR
;
749 result
= CSR_SourceIllegal
;
752 target
+= bytesToWrite
;
755 if(target
> targetEnd
)
757 --source
; /* Back up source pointer! */
758 target
-= bytesToWrite
;
759 result
= CSR_TargetExhausted
;
765 /* note: everything falls through. */
767 *--target
= (UTF8
) ((ch
| byteMark
) & byteMask
);
771 *--target
= (UTF8
) ((ch
| byteMark
) & byteMask
);
775 *--target
= (UTF8
) ((ch
| byteMark
) & byteMask
);
779 *--target
= (UTF8
) (ch
| firstByteMark
[bytesToWrite
]);
782 target
+= bytesToWrite
;
786 *sourceStart
= source
;
787 *targetStart
= target
;
793 /***********************************************************************/
796 CodesetsConvertUTF8toUTF32(REG(a0
, const UTF8
** sourceStart
),
797 REG(a1
, const UTF8
* sourceEnd
),
798 REG(a2
, UTF32
** targetStart
),
799 REG(a3
, UTF32
* targetEnd
),
800 REG(d0
, ULONG flags
))
802 ULONG result
= CSR_ConversionOK
;
803 const UTF8
*source
= *sourceStart
;
804 UTF32
*target
= *targetStart
;
805 UTF32
*start
= target
;
809 while(source
< sourceEnd
)
812 unsigned short extraBytesToRead
= trailingBytesForUTF8
[*source
];
814 if(source
+ extraBytesToRead
>= sourceEnd
)
816 result
= CSR_SourceExhausted
;
820 /* Do this check whether lenient or strict */
821 if(!CodesetsIsLegalUTF8(source
, extraBytesToRead
+ 1))
823 result
= CSR_SourceIllegal
;
828 * The cases all fall through. See "Note A" below.
830 switch (extraBytesToRead
)
856 ch
-= offsetsFromUTF8
[extraBytesToRead
];
860 if(target
>= targetEnd
)
862 source
-= (extraBytesToRead
+ 1); /* Back up the source pointer! */
863 result
= CSR_TargetExhausted
;
868 if(ch
<= UNI_MAX_LEGAL_UTF32
)
871 * UTF-16 surrogate values are illegal in UTF-32, and anything
872 * over Plane 17 (> 0x10FFFF) is illegal.
874 if(ch
>= UNI_SUR_HIGH_START
&& ch
<= UNI_SUR_LOW_END
)
876 if(flags
== CSF_StrictConversion
)
878 source
-= (extraBytesToRead
+ 1); /* return to the illegal value itself */
879 result
= CSR_SourceIllegal
;
885 *target
++ = UNI_REPLACEMENT_CHAR
;
895 /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
896 result
= CSR_SourceIllegal
;
897 *target
++ = UNI_REPLACEMENT_CHAR
;
904 *sourceStart
= source
;
905 *targetStart
= target
;
911 /***********************************************************************
914 The fall-through switches in UTF-8 reading code save a
915 temp variable, some decrements & conditionals. The switches
916 are equivalent to the following loop:
918 int tmpBytesToRead = extraBytesToRead+1;
922 if (tmpBytesToRead) ch <<= 6;
923 } while (tmpBytesToRead > 0);
925 In UTF-8 writing code, the switches on "bytesToWrite" are
926 similarly unrolled loops.
928 ***********************************************************************/