5 #include <AutoDeleter.h>
11 /*! \brief Converts the given unicode character to utf8.
13 \param c The unicode character.
14 \param out Pointer to a pointer into a C-string buffer with room
15 for at least 4 more characters, into which the output
16 utf8 characters will be written. The pointer that
17 \a out points to will be advanced by the number of
18 characters written, i.e. if \a out initially points to a
19 pointer to the first character in a string named \c str,
20 and the function writes 4 characters to \c str, then
21 upon returning, \a out will point to a pointer to
22 the fifth character in \c str.
25 unicode_to_utf8(uint32 c
, char **out
)
32 *(s
++) = 0xc0 | (c
>>6);
33 *(s
++) = 0x80 | (c
& 0x3f);
34 } else if (c
< 0x10000) {
35 *(s
++) = 0xe0 | (c
>>12);
36 *(s
++) = 0x80 | ((c
>>6) & 0x3f);
37 *(s
++) = 0x80 | (c
& 0x3f);
38 } else if (c
<= 0x10ffff) {
39 *(s
++) = 0xf0 | (c
>>18);
40 *(s
++) = 0x80 | ((c
>>12) & 0x3f);
41 *(s
++) = 0x80 | ((c
>>6) & 0x3f);
42 *(s
++) = 0x80 | (c
& 0x3f);
47 /*! \brief Converts the given utf8 character to 4-byte unicode.
49 \param in Pointer to a pointer into a C-string from which utf8
50 characters will be read. *in will be advanced by
51 the number of characters read, similarly to the
52 \c out parameter for unicode_to_utf8().
54 \return The 4-byte unicode character, or **in if passed an
55 invalid character, or 0 if passed any NULL pointers.
58 utf8_to_unicode(const char **in
)
62 uint8
*bytes
= (uint8
*)*in
;
69 switch (bytes
[0] & 0xf0) {
71 case 0xd0: length
= 2; break;
72 case 0xe0: length
= 3; break;
78 // valid 1-byte character
79 // and invalid characters
83 uint32 c
= bytes
[0] & mask
;
85 for (;i
< length
&& (bytes
[i
] & 0x80) > 0;i
++)
86 c
= (c
<< 6) | (bytes
[i
] & 0x3f);
91 return (uint32
)bytes
[0];
101 /*! \brief Creates an empty string object. */
102 UdfString::UdfString()
110 /*! \brief Creates a new UdfString object from the given Utf8 string. */
111 UdfString::UdfString(const char *utf8
)
120 /*! \brief Creates a new UdfString object from the given Cs0 string. */
121 UdfString::UdfString(const char *cs0
, uint32 length
)
130 UdfString::~UdfString()
136 /*! \brief Assignment from a Utf8 string. */
138 UdfString::SetTo(const char *utf8
)
140 TRACE(("UdfString::SetTo: utf8 = `%s', strlen(utf8) = %ld\n",
141 utf8
, utf8
? strlen(utf8
) : 0));
145 TRACE_ERROR(("UdfString::SetTo: passed NULL utf8 string\n"));
149 uint32 length
= strlen(utf8
);
150 // First copy the utf8 string
151 fUtf8String
= new(nothrow
) char[length
+ 1];
152 if (fUtf8String
== NULL
) {
153 TRACE_ERROR(("UdfString::SetTo: fUtf8String[%" B_PRIu32
154 "] allocation failed\n", length
+ 1));
158 memcpy(fUtf8String
, utf8
, length
+ 1);
159 // Next convert to raw 4-byte unicode. Then we'll do some
160 // analysis to figure out if we have any invalid characters,
161 // and whether we can get away with compressed 8-bit unicode,
162 // or have to use burly 16-bit unicode.
163 uint32
*raw
= new(nothrow
) uint32
[length
];
165 TRACE_ERROR(("UdfString::SetTo: uint32 raw[%" B_PRIu32
"] temporary"
166 " string allocation failed\n", length
));
171 ArrayDeleter
<uint32
> rawDeleter(raw
);
173 const char *in
= utf8
;
174 uint32 rawLength
= 0;
175 for (uint32 i
= 0; i
< length
&& uint32(in
- utf8
) < length
; i
++, rawLength
++)
176 raw
[i
] = utf8_to_unicode(&in
);
178 // Check for invalids.
179 uint32 mask
= 0xffff0000;
180 for (uint32 i
= 0; i
< rawLength
; i
++) {
182 TRACE(("WARNING: utf8 string contained a multi-byte sequence which "
183 "was converted into a unicode character larger than 16-bits; "
184 "character will be converted to an underscore character for "
189 // See if we can get away with 8-bit compressed unicode
191 bool canUse8bit
= true;
192 for (uint32 i
= 0; i
< rawLength
; i
++) {
198 // Build our cs0 string
200 fCs0Length
= rawLength
+ 1;
201 fCs0String
= new(nothrow
) char[fCs0Length
];
202 if (fCs0String
!= NULL
) {
203 fCs0String
[0] = '\x08'; // 8-bit compressed unicode
204 for (uint32 i
= 0; i
< rawLength
; i
++)
205 fCs0String
[i
+ 1] = raw
[i
] % 256;
207 TRACE_ERROR(("UdfString::SetTo: fCs0String[%" B_PRIu32
208 "] allocation failed\n", fCs0Length
));
213 fCs0Length
= rawLength
* 2 + 1;
214 fCs0String
= new(nothrow
) char[fCs0Length
];
215 if (fCs0String
!= NULL
) {
217 fCs0String
[pos
++] = '\x10'; // 16-bit unicode
218 for (uint32 i
= 0; i
< rawLength
; i
++) {
219 // 16-bit unicode chars must be written big endian
220 uint16 value
= uint16(raw
[i
]);
221 uint8 high
= uint8(value
>> 8 & 0xff);
222 uint8 low
= uint8(value
& 0xff);
223 fCs0String
[pos
++] = high
;
224 fCs0String
[pos
++] = low
;
227 TRACE_ERROR(("UdfString::SetTo: fCs0String[%" B_PRIu32
228 "] allocation failed\n", fCs0Length
));
236 /*! \brief Assignment from a Cs0 string. */
238 UdfString::SetTo(const char *cs0
, uint32 length
)
240 DEBUG_INIT_ETC("UdfString", ("cs0: %p, length: %ld", cs0
, length
));
246 PRINT(("passed NULL cs0 string\n"));
250 // First copy the Cs0 string and length
251 fCs0String
= new(nothrow
) char[length
];
253 memcpy(fCs0String
, cs0
, length
);
256 PRINT(("new fCs0String[%ld] allocation failed\n", length
));
260 // Now convert to utf8
262 // The first byte of the CS0 string is the compression ID.
263 // - 8: 1 byte characters
264 // - 16: 2 byte, big endian characters
265 // - 254: "CS0 expansion is empty and unique", 1 byte characters
266 // - 255: "CS0 expansion is empty and unique", 2 byte, big endian characters
267 PRINT(("compression ID: %d\n", cs0
[0]));
268 switch (reinterpret_cast<const uint8
*>(cs0
)[0]) {
272 const uint8
*inputString
= reinterpret_cast<const uint8
*>(&(cs0
[1]));
273 int32 maxLength
= length
-1; // Max length of input string in uint8 characters
274 int32 allocationLength
= maxLength
*2+1; // Need at most 2 utf8 chars per uint8 char
275 fUtf8String
= new(nothrow
) char[allocationLength
];
277 char *outputString
= fUtf8String
;
279 for (int32 i
= 0; i
< maxLength
&& inputString
[i
]; i
++) {
280 unicode_to_utf8(inputString
[i
], &outputString
);
284 PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength
));
293 const uint16
*inputString
= reinterpret_cast<const uint16
*>(&(cs0
[1]));
294 int32 maxLength
= (length
-1) / 2; // Max length of input string in uint16 characters
295 int32 allocationLength
= maxLength
*3+1; // Need at most 3 utf8 chars per uint16 char
296 fUtf8String
= new(nothrow
) char[allocationLength
];
298 char *outputString
= fUtf8String
;
300 for (int32 i
= 0; i
< maxLength
&& inputString
[i
]; i
++) {
301 unicode_to_utf8(B_BENDIAN_TO_HOST_INT16(inputString
[i
]), &outputString
);
305 PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength
));
312 PRINT(("invalid compression id!\n"));
320 DEBUG_INIT("UdfString");
322 delete [] fCs0String
;
324 delete [] fUtf8String
;