/*
 * Copyright 2014 Jonathan Schleifer <js@webkeks.org>
 * Copyright 2014 Haiku, Inc. All rights reserved.
 *
 * Distributed under the terms of the MIT License.
 *
 * Authors:
 *		Jonathan Schleifer, js@webkeks.org
 *		John Scipione, jscipione@gmail.com
 */

#include "convertutf.h"


#include <ByteOrder.h>
#include <Errors.h>
#include <StorageDefs.h>

22 glyph_length(uint32 glyph
)
26 else if (glyph
< 0x800)
28 else if (glyph
< 0x10000)
30 else if (glyph
< 0x110000)
38 encode_glyph(uint32 glyph
, size_t glyphLength
, char* buffer
)
40 if (glyphLength
== 1) {
42 } else if (glyphLength
== 2) {
43 *buffer
++ = 0xC0 | (glyph
>> 6);
44 *buffer
= 0x80 | (glyph
& 0x3F);
45 } else if (glyphLength
== 3) {
46 *buffer
++ = 0xE0 | (glyph
>> 12);
47 *buffer
++ = 0x80 | (glyph
>> 6 & 0x3F);
48 *buffer
= 0x80 | (glyph
& 0x3F);
49 } else if (glyphLength
== 4) {
50 *buffer
++ = 0xF0 | (glyph
>> 18);
51 *buffer
++ = 0x80 | (glyph
>> 12 & 0x3F);
52 *buffer
++ = 0x80 | (glyph
>> 6 & 0x3F);
53 *buffer
= 0x80 | (glyph
& 0x3F);
59 utf16_to_utf8(const uint16
* source
, size_t sourceCodeUnitCount
, char* target
,
60 size_t targetLength
, bool isLittleEndian
)
62 if (source
== NULL
|| sourceCodeUnitCount
== 0
63 || target
== NULL
|| targetLength
== 0) {
67 ssize_t outLength
= 0;
69 for (size_t i
= 0; i
< sourceCodeUnitCount
; i
++) {
70 uint32 glyph
= isLittleEndian
71 ? B_LENDIAN_TO_HOST_INT32(source
[i
])
72 : B_BENDIAN_TO_HOST_INT32(source
[i
]);
74 if ((glyph
& 0xFC00) == 0xDC00) {
75 // missing high surrogate
79 if ((glyph
& 0xFC00) == 0xD800) {
80 if (sourceCodeUnitCount
<= i
+ 1) {
81 // high surrogate at end of string
85 uint32 low
= isLittleEndian
86 ? B_LENDIAN_TO_HOST_INT32(source
[i
+ 1])
87 : B_BENDIAN_TO_HOST_INT32(source
[i
+ 1]);
88 if ((low
& 0xFC00) != 0xDC00) {
89 // missing low surrogate
93 glyph
= (((glyph
& 0x3FF) << 10) | (low
& 0x3FF)) + 0x10000;
97 size_t glyphLength
= glyph_length(glyph
);
100 else if (outLength
+ glyphLength
>= targetLength
101 || outLength
+ glyphLength
>= B_FILE_NAME_LENGTH
) {
102 // NUL terminate the string so the caller can use the
103 // abbreviated version in this case. Since the length
104 // isn't returned the caller will need to call strlen()
105 // to get the length of the string.
106 target
[outLength
] = '\0';
107 return B_NAME_TOO_LONG
;
110 encode_glyph(glyph
, glyphLength
, target
+ outLength
);
111 outLength
+= glyphLength
;
114 target
[outLength
] = '\0';
121 utf16le_to_utf8(const uint16
* source
, size_t sourceCodeUnitCount
,
122 char* target
, size_t targetLength
)
124 return utf16_to_utf8(source
, sourceCodeUnitCount
, target
, targetLength
,
130 utf16be_to_utf8(const uint16
* source
, size_t sourceCodeUnitCount
,
131 char* target
, size_t targetLength
)
133 return utf16_to_utf8(source
, sourceCodeUnitCount
, target
, targetLength
,