src/add-ons/kernel/file_systems/udf/UdfString.cpp

   1 #include "UdfString.h"
   2
   3 #include <ByteOrder.h>
   4
   5 #include <AutoDeleter.h>
   6
   7
   8 using std::nothrow;
   9
  10
  11 /*! \brief Converts the given unicode character to utf8.
  12
  13         \param c The unicode character.
  14         \param out Pointer to a C-string of at least 4 characters
  15                    long into which the output utf8 characters will
  16                    be written. The string that is pointed to will
  17                    be incremented to reflect the number of characters
  18                    written, i.e. if \a out initially points to a pointer
  19                    to the first character in string named \c str, and
  20                    the function writes 4 characters to \c str, then
  21                    upon returning, out will point to a pointer to
  22                    the fifth character in \c str.
  23 */
  24 static void
  25 unicode_to_utf8(uint32 c, char **out)
  26 {
  27         char *s = *out;
  28
  29         if (c < 0x80)
  30                 *(s++) = c;
  31         else if (c < 0x800) {
  32                 *(s++) = 0xc0 | (c>>6);
  33                 *(s++) = 0x80 | (c & 0x3f);
  34         } else if (c < 0x10000) {
  35                 *(s++) = 0xe0 | (c>>12);
  36                 *(s++) = 0x80 | ((c>>6) & 0x3f);
  37                 *(s++) = 0x80 | (c & 0x3f);
  38         } else if (c <= 0x10ffff) {
  39                 *(s++) = 0xf0 | (c>>18);
  40                 *(s++) = 0x80 | ((c>>12) & 0x3f);
  41                 *(s++) = 0x80 | ((c>>6) & 0x3f);
  42                 *(s++) = 0x80 | (c & 0x3f);
  43         }
  44         *out = s;
  45 }
  46
  47 /*! \brief Converts the given utf8 character to 4-byte unicode.
  48
  49         \param in Pointer to a C-String from which utf8 characters
  50                   will be read. *in will be incremented to reflect
  51                   the number of characters read, similarly to the
  52                   \c out parameter for unicode_to_utf8().
  53
  54         \return The 4-byte unicode character, or **in if passed an
  55                 invalid character, or 0 if passed any NULL pointers.
  56 */
  57 static uint32
  58 utf8_to_unicode(const char **in)
  59 {
  60         if (!in)
  61                 return 0;
  62         uint8 *bytes = (uint8 *)*in;
  63         if (!bytes)
  64                 return 0;
  65
  66         int32 length;
  67         uint8 mask = 0x1f;
  68
  69         switch (bytes[0] & 0xf0) {
  70                 case 0xc0:
  71                 case 0xd0:      length = 2; break;
  72                 case 0xe0:      length = 3; break;
  73                 case 0xf0:
  74                         mask = 0x0f;
  75                         length = 4;
  76                         break;
  77                 default:
  78                         // valid 1-byte character
  79                         // and invalid characters
  80                         (*in)++;
  81                         return bytes[0];
  82         }
  83         uint32 c = bytes[0] & mask;
  84         int32 i = 1;
  85         for (;i < length && (bytes[i] & 0x80) > 0;i++)
  86                 c = (c << 6) | (bytes[i] & 0x3f);
  87
  88         if (i < length) {
  89                 // invalid character
  90                 (*in)++;
  91                 return (uint32)bytes[0];
  92         }
  93         *in += length;
  94         return c;
  95 }
  96
  97
  98 // #pragma mark -
  99
 100
 101 /*! \brief Creates an empty string object. */
 102 UdfString::UdfString()
 103         :
 104         fCs0String(NULL),
 105         fUtf8String(NULL)
 106 {
 107 }
 108
 109
 110 /*! \brief Creates a new UdfString object from the given Utf8 string. */
 111 UdfString::UdfString(const char *utf8)
 112         :
 113         fCs0String(NULL),
 114         fUtf8String(NULL)
 115 {
 116         SetTo(utf8);
 117 }
 118
 119
 120 /*! \brief Creates a new UdfString object from the given Cs0 string. */
 121 UdfString::UdfString(const char *cs0, uint32 length)
 122         :
 123         fCs0String(NULL),
 124         fUtf8String(NULL)
 125 {
 126         SetTo(cs0, length);
 127 }
 128
 129
 130 UdfString::~UdfString()
 131 {
 132         _Clear();
 133 }
 134
 135
 136 /*! \brief Assignment from a Utf8 string. */
 137 void
 138 UdfString::SetTo(const char *utf8)
 139 {
 140         TRACE(("UdfString::SetTo: utf8 = `%s', strlen(utf8) = %ld\n",
 141                 utf8, utf8 ? strlen(utf8) : 0));
 142         _Clear();
 143
 144         if (utf8 == NULL) {
 145                 TRACE_ERROR(("UdfString::SetTo: passed NULL utf8 string\n"));
 146                 return;
 147         }
 148
 149         uint32 length = strlen(utf8);
 150         // First copy the utf8 string
 151         fUtf8String = new(nothrow) char[length + 1];
 152         if (fUtf8String == NULL) {
 153                 TRACE_ERROR(("UdfString::SetTo: fUtf8String[%" B_PRIu32
 154                         "] allocation failed\n", length + 1));
 155                 return;
 156         }
 157
 158         memcpy(fUtf8String, utf8, length + 1);
 159         // Next convert to raw 4-byte unicode. Then we'll do some
 160         // analysis to figure out if we have any invalid characters,
 161         // and whether we can get away with compressed 8-bit unicode,
 162         // or have to use burly 16-bit unicode.
 163         uint32 *raw = new(nothrow) uint32[length];
 164         if (raw == NULL) {
 165                 TRACE_ERROR(("UdfString::SetTo: uint32 raw[%" B_PRIu32 "] temporary"
 166                         " string allocation failed\n", length));
 167                 _Clear();
 168                 return;
 169         }
 170
 171         ArrayDeleter<uint32> rawDeleter(raw);
 172
 173         const char *in = utf8;
 174         uint32 rawLength = 0;
 175         for (uint32 i = 0; i < length && uint32(in - utf8) < length; i++, rawLength++)
 176                 raw[i] = utf8_to_unicode(&in);
 177
 178         // Check for invalids.
 179         uint32 mask = 0xffff0000;
 180         for (uint32 i = 0; i < rawLength; i++) {
 181                 if (raw[i] & mask) {
 182                         TRACE(("WARNING: utf8 string contained a multi-byte sequence which "
 183                                "was converted into a unicode character larger than 16-bits; "
 184                                "character will be converted to an underscore character for "
 185                                "safety.\n"));
 186                         raw[i] = '_';
 187                 }
 188         }
 189         // See if we can get away with 8-bit compressed unicode
 190         mask = 0xffffff00;
 191         bool canUse8bit = true;
 192         for (uint32 i = 0; i < rawLength; i++) {
 193                 if (raw[i] & mask) {
 194                         canUse8bit = false;
 195                         break;
 196                 }
 197         }
 198         // Build our cs0 string
 199         if (canUse8bit) {
 200                 fCs0Length = rawLength + 1;
 201                 fCs0String = new(nothrow) char[fCs0Length];
 202                 if (fCs0String != NULL) {
 203                         fCs0String[0] = '\x08'; // 8-bit compressed unicode
 204                         for (uint32 i = 0; i < rawLength; i++)
 205                                 fCs0String[i + 1] = raw[i] % 256;
 206                 } else {
 207                         TRACE_ERROR(("UdfString::SetTo: fCs0String[%" B_PRIu32
 208                                 "] allocation failed\n", fCs0Length));
 209                         _Clear();
 210                         return;
 211                 }
 212         } else {
 213                 fCs0Length = rawLength * 2 + 1;
 214                 fCs0String = new(nothrow) char[fCs0Length];
 215                 if (fCs0String != NULL) {
 216                         uint32 pos = 0;
 217                         fCs0String[pos++] = '\x10';     // 16-bit unicode
 218                         for (uint32 i = 0; i < rawLength; i++) {
 219                                 // 16-bit unicode chars must be written big endian
 220                                 uint16 value = uint16(raw[i]);
 221                                 uint8 high = uint8(value >> 8 & 0xff);
 222                                 uint8 low = uint8(value & 0xff);
 223                                 fCs0String[pos++] = high;
 224                                 fCs0String[pos++] = low;
 225                         }
 226                 } else {
 227                         TRACE_ERROR(("UdfString::SetTo: fCs0String[%" B_PRIu32
 228                                 "] allocation failed\n", fCs0Length));
 229                         _Clear();
 230                         return;
 231                 }
 232         }
 233 }
 234
 235
 236 /*! \brief Assignment from a Cs0 string. */
 237 void
 238 UdfString::SetTo(const char *cs0, uint32 length)
 239 {
 240         DEBUG_INIT_ETC("UdfString", ("cs0: %p, length: %ld", cs0, length));
 241
 242         _Clear();
 243         if (length == 0)
 244                 return;
 245         if (!cs0) {
 246                 PRINT(("passed NULL cs0 string\n"));
 247                 return;
 248         }
 249
 250         // First copy the Cs0 string and length
 251         fCs0String = new(nothrow) char[length];
 252         if (fCs0String) {
 253                 memcpy(fCs0String, cs0, length);
 254                 fCs0Length = length;
 255         } else {
 256                 PRINT(("new fCs0String[%ld] allocation failed\n", length));
 257                 return;
 258         }
 259
 260         // Now convert to utf8
 261
 262         // The first byte of the CS0 string is the compression ID.
 263         // - 8: 1 byte characters
 264         // - 16: 2 byte, big endian characters
 265         // - 254: "CS0 expansion is empty and unique", 1 byte characters
 266         // - 255: "CS0 expansion is empty and unique", 2 byte, big endian characters
 267         PRINT(("compression ID: %d\n", cs0[0]));
 268         switch (reinterpret_cast<const uint8*>(cs0)[0]) {
 269                 case 8:
 270                 case 254:
 271                 {
 272                         const uint8 *inputString = reinterpret_cast<const uint8*>(&(cs0[1]));
 273                         int32 maxLength = length-1;                             // Max length of input string in uint8 characters
 274                         int32 allocationLength = maxLength*2+1; // Need at most 2 utf8 chars per uint8 char
 275                         fUtf8String = new(nothrow) char[allocationLength];
 276                         if (fUtf8String) {
 277                                 char *outputString = fUtf8String;
 278
 279                                 for (int32 i = 0; i < maxLength && inputString[i]; i++) {
 280                                         unicode_to_utf8(inputString[i], &outputString);
 281                                 }
 282                                 outputString[0] = 0;
 283                         } else {
 284                                 PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength));
 285                         }
 286
 287                         break;
 288                 }
 289
 290                 case 16:
 291                 case 255:
 292                 {
 293                         const uint16 *inputString = reinterpret_cast<const uint16*>(&(cs0[1]));
 294                         int32 maxLength = (length-1) / 2;               // Max length of input string in uint16 characters
 295                         int32 allocationLength = maxLength*3+1; // Need at most 3 utf8 chars per uint16 char
 296                         fUtf8String = new(nothrow) char[allocationLength];
 297                         if (fUtf8String) {
 298                                 char *outputString = fUtf8String;
 299
 300                                 for (int32 i = 0; i < maxLength && inputString[i]; i++) {
 301                                         unicode_to_utf8(B_BENDIAN_TO_HOST_INT16(inputString[i]), &outputString);
 302                                 }
 303                                 outputString[0] = 0;
 304                         } else {
 305                                 PRINT(("new fUtf8String[%ld] allocation failed\n", allocationLength));
 306                         }
 307
 308                         break;
 309                 }
 310
 311                 default:
 312                         PRINT(("invalid compression id!\n"));
 313                         break;
 314         }
 315 }
 316
 317 void
 318 UdfString::_Clear()
 319 {
 320         DEBUG_INIT("UdfString");
 321
 322         delete [] fCs0String;
 323         fCs0String = NULL;
 324         delete [] fUtf8String;
 325         fUtf8String = NULL;
 326 }