btrfs: [] on the end of a struct field is a variable length array.
[haiku.git] / src / add-ons / kernel / file_systems / udf / UdfString.cpp
blobb8fa5918fe0fcb61719929983d0dfa89d643d6da
1 #include "UdfString.h"
3 #include <ByteOrder.h>
5 #include <AutoDeleter.h>
8 using std::nothrow;
/*! \brief Encodes a single unicode character as utf8.

	Characters above 0x10ffff cannot be encoded; for those nothing is
	written and the cursor is left where it was.

	\param c The unicode character to encode.
	\param out Address of a write cursor into a character buffer that has
	       room for at least 4 more bytes. On return the cursor has been
	       moved past the bytes that were written (0 to 4 of them), so that
	       consecutive calls append to the same buffer.
*/
static void
unicode_to_utf8(uint32 c, char **out)
{
	// Pick the sequence length and the marker bits of its first byte.
	int count;
	uint32 lead;
	if (c <= 0x7f) {
		count = 1;
		lead = 0x00;
	} else if (c <= 0x7ff) {
		count = 2;
		lead = 0xc0;
	} else if (c <= 0xffff) {
		count = 3;
		lead = 0xe0;
	} else if (c <= 0x10ffff) {
		count = 4;
		lead = 0xf0;
	} else {
		// Not encodable: write nothing, leave the cursor alone.
		return;
	}

	// Fill the trailing bytes back to front, peeling 6 payload bits off
	// the character each time; whatever is left belongs to the first byte.
	char *first = *out;
	for (int i = count - 1; i > 0; i--) {
		first[i] = 0x80 | (c & 0x3f);
		c >>= 6;
	}
	first[0] = lead | c;

	*out = first + count;
}
/*! \brief Decodes one utf8 sequence into a 4-byte unicode character.

	\param in Address of a read cursor into a NUL-terminated utf8 C-string.
	       The cursor is advanced past the bytes that were consumed: the
	       whole sequence for a well-formed character, exactly one byte
	       otherwise.

	\return The decoded character; the raw value of the first byte if that
	        byte does not start a well-formed multi-byte sequence; or 0 if
	        \a in or \a *in is NULL.
*/
static uint32
utf8_to_unicode(const char **in)
{
	if (in == NULL || *in == NULL)
		return 0;

	const uint8 *bytes = reinterpret_cast<const uint8 *>(*in);

	// The high nibble of the first byte selects the sequence length.
	int32 length;
	uint8 mask = 0x1f;
	switch (bytes[0] & 0xf0) {
		case 0xc0:
		case 0xd0:
			length = 2;
			break;
		case 0xe0:
			length = 3;
			break;
		case 0xf0:
			mask = 0x0f;
			length = 4;
			break;
		default:
			// valid 1-byte character
			// and invalid characters (stray continuation bytes)
			(*in)++;
			return bytes[0];
	}

	uint32 c = bytes[0] & mask;
	int32 i = 1;
	for (; i < length; i++) {
		// A continuation byte is exactly 10xxxxxx. Testing only the top
		// bit would also swallow the lead byte of the next character.
		// The terminating NUL fails this test as well, so the loop never
		// reads past the end of the string.
		if ((bytes[i] & 0xc0) != 0x80)
			break;
		c = (c << 6) | (bytes[i] & 0x3f);
	}

	if (i < length) {
		// Truncated or malformed sequence: hand back the first byte as-is
		// and let the caller resume at the following byte.
		(*in)++;
		return bytes[0];
	}

	*in += length;
	return c;
}
98 // #pragma mark -
101 /*! \brief Creates an empty string object. */
102 UdfString::UdfString()
104 fCs0String(NULL),
105 fUtf8String(NULL)
110 /*! \brief Creates a new UdfString object from the given Utf8 string. */
111 UdfString::UdfString(const char *utf8)
113 fCs0String(NULL),
114 fUtf8String(NULL)
116 SetTo(utf8);
120 /*! \brief Creates a new UdfString object from the given Cs0 string. */
121 UdfString::UdfString(const char *cs0, uint32 length)
123 fCs0String(NULL),
124 fUtf8String(NULL)
126 SetTo(cs0, length);
130 UdfString::~UdfString()
132 _Clear();
136 /*! \brief Assignment from a Utf8 string. */
137 void
138 UdfString::SetTo(const char *utf8)
140 TRACE(("UdfString::SetTo: utf8 = `%s', strlen(utf8) = %ld\n",
141 utf8, utf8 ? strlen(utf8) : 0));
142 _Clear();
144 if (utf8 == NULL) {
145 TRACE_ERROR(("UdfString::SetTo: passed NULL utf8 string\n"));
146 return;
149 uint32 length = strlen(utf8);
150 // First copy the utf8 string
151 fUtf8String = new(nothrow) char[length + 1];
152 if (fUtf8String == NULL) {
153 TRACE_ERROR(("UdfString::SetTo: fUtf8String[%" B_PRIu32
154 "] allocation failed\n", length + 1));
155 return;
158 memcpy(fUtf8String, utf8, length + 1);
159 // Next convert to raw 4-byte unicode. Then we'll do some
160 // analysis to figure out if we have any invalid characters,
161 // and whether we can get away with compressed 8-bit unicode,
162 // or have to use burly 16-bit unicode.
163 uint32 *raw = new(nothrow) uint32[length];
164 if (raw == NULL) {
165 TRACE_ERROR(("UdfString::SetTo: uint32 raw[%" B_PRIu32 "] temporary"
166 " string allocation failed\n", length));
167 _Clear();
168 return;
171 ArrayDeleter<uint32> rawDeleter(raw);
173 const char *in = utf8;
174 uint32 rawLength = 0;
175 for (uint32 i = 0; i < length && uint32(in - utf8) < length; i++, rawLength++)
176 raw[i] = utf8_to_unicode(&in);
178 // Check for invalids.
179 uint32 mask = 0xffff0000;
180 for (uint32 i = 0; i < rawLength; i++) {
181 if (raw[i] & mask) {
182 TRACE(("WARNING: utf8 string contained a multi-byte sequence which "
183 "was converted into a unicode character larger than 16-bits; "
184 "character will be converted to an underscore character for "
185 "safety.\n"));
186 raw[i] = '_';
189 // See if we can get away with 8-bit compressed unicode
190 mask = 0xffffff00;
191 bool canUse8bit = true;
192 for (uint32 i = 0; i < rawLength; i++) {
193 if (raw[i] & mask) {
194 canUse8bit = false;
195 break;
198 // Build our cs0 string
199 if (canUse8bit) {
200 fCs0Length = rawLength + 1;
201 fCs0String = new(nothrow) char[fCs0Length];
202 if (fCs0String != NULL) {
203 fCs0String[0] = '\x08'; // 8-bit compressed unicode
204 for (uint32 i = 0; i < rawLength; i++)
205 fCs0String[i + 1] = raw[i] % 256;
206 } else {
207 TRACE_ERROR(("UdfString::SetTo: fCs0String[%" B_PRIu32
208 "] allocation failed\n", fCs0Length));
209 _Clear();
210 return;
212 } else {
213 fCs0Length = rawLength * 2 + 1;
214 fCs0String = new(nothrow) char[fCs0Length];
215 if (fCs0String != NULL) {
216 uint32 pos = 0;
217 fCs0String[pos++] = '\x10'; // 16-bit unicode
218 for (uint32 i = 0; i < rawLength; i++) {
219 // 16-bit unicode chars must be written big endian
220 uint16 value = uint16(raw[i]);
221 uint8 high = uint8(value >> 8 & 0xff);
222 uint8 low = uint8(value & 0xff);
223 fCs0String[pos++] = high;
224 fCs0String[pos++] = low;
226 } else {
227 TRACE_ERROR(("UdfString::SetTo: fCs0String[%" B_PRIu32
228 "] allocation failed\n", fCs0Length));
229 _Clear();
230 return;
236 /*! \brief Assignment from a Cs0 string. */
237 void
238 UdfString::SetTo(const char *cs0, uint32 length)
240 DEBUG_INIT_ETC("UdfString", ("cs0: %p, length: %ld", cs0, length));
242 _Clear();
243 if (length == 0)
244 return;
245 if (!cs0) {
246 PRINT(("passed NULL cs0 string\n"));
247 return;
250 // First copy the Cs0 string and length
251 fCs0String = new(nothrow) char[length];
252 if (fCs0String) {
253 memcpy(fCs0String, cs0, length);
254 fCs0Length = length;
255 } else {
256 PRINT(("new fCs0String[%ld] allocation failed\n", length));
257 return;
260 // Now convert to utf8
262 // The first byte of the CS0 string is the compression ID.
263 // - 8: 1 byte characters
264 // - 16: 2 byte, big endian characters
265 // - 254: "CS0 expansion is empty and unique", 1 byte characters
266 // - 255: "CS0 expansion is empty and unique", 2 byte, big endian characters
267 PRINT(("compression ID: %d\n", cs0[0]));
268 switch (reinterpret_cast<const uint8*>(cs0)[0]) {
269 case 8:
270 case 254:
272 const uint8 *inputString = reinterpret_cast<const uint8*>(&(cs0[1]));
273 int32 maxLength = length-1; // Max length of input string in uint8 characters
274 int32 allocationLength = maxLength*2+1; // Need at most 2 utf8 chars per uint8 char
275 fUtf8String = new(nothrow) char[allocationLength];
276 if (fUtf8String) {
277 char *outputString = fUtf8String;
279 for (int32 i = 0; i < maxLength && inputString[i]; i++) {
280 unicode_to_utf8(inputString[i], &outputString);
282 outputString[0] = 0;
283 } else {
284 PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength));
287 break;
290 case 16:
291 case 255:
293 const uint16 *inputString = reinterpret_cast<const uint16*>(&(cs0[1]));
294 int32 maxLength = (length-1) / 2; // Max length of input string in uint16 characters
295 int32 allocationLength = maxLength*3+1; // Need at most 3 utf8 chars per uint16 char
296 fUtf8String = new(nothrow) char[allocationLength];
297 if (fUtf8String) {
298 char *outputString = fUtf8String;
300 for (int32 i = 0; i < maxLength && inputString[i]; i++) {
301 unicode_to_utf8(B_BENDIAN_TO_HOST_INT16(inputString[i]), &outputString);
303 outputString[0] = 0;
304 } else {
305 PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength));
308 break;
311 default:
312 PRINT(("invalid compression id!\n"));
313 break;
317 void
318 UdfString::_Clear()
320 DEBUG_INIT("UdfString");
322 delete [] fCs0String;
323 fCs0String = NULL;
324 delete [] fUtf8String;
325 fUtf8String = NULL;