nel/src/misc/utf_string_view.cpp

   1 // NeL - MMORPG Framework <https://wiki.ryzom.dev/>
   2 // Copyright (C) 2020  Jan BOON (Kaetemi) <jan.boon@kaetemi.be>
   3 //
   4 // This program is free software: you can redistribute it and/or modify
   5 // it under the terms of the GNU Affero General Public License as
   6 // published by the Free Software Foundation, either version 3 of the
   7 // License, or (at your option) any later version.
   8 //
   9 // This program is distributed in the hope that it will be useful,
  10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 // GNU Affero General Public License for more details.
  13 //
  14 // You should have received a copy of the GNU Affero General Public License
  15 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17 #include "stdmisc.h"
  18
  19 // Project includes
  20 #include <nel/misc/utf_string_view.h>
  21 #include <nel/misc/stream.h>
  22
  23 // References:
  24 // - https://twiserandom.com/unicode/unicode-encoding-utf-8-utf-16-utf-32/
  25 // - https://www.compart.com/en/unicode/U+1F30D
  26 //   - 0xF0 0x9F 0x8C 0x8D
  27 //   - 0xD83C 0xDF0D
  28 //   - 0x0001F30D
  29
  30 namespace NLMISC
  31 {
  32
  33 NL_FORCE_INLINE void appendUtf8(std::string &str, u32char c)
  34 {
  35         if (c < 0x80)
  36         {
  37                 // Encode as 1 byte
  38                 str += (char)c;
  39         }
  40         else if (c < 0x0800)
  41         {
  42                 // Encode as 2 bytes
  43                 str += (char)((c & 0x07C0) >> 6) | 0xC0;
  44                 str += (char)(c & 0x3F) | 0x80;
  45         }
  46         else if (c < 0x010000)
  47         {
  48                 // Encode as 3 bytes
  49                 str += (char)((c & 0xF000) >> 12) | 0xE0;
  50                 str += (char)((c & 0x0FC0) >> 6) | 0x80;
  51                 str += (char)(c & 0x3F) | 0x80;
  52         }
  53         else if (c < 0x110000)
  54         {
  55                 // Encode as 4 bytes
  56                 str += (char)((c & 0x1C0000) >> 18) | 0xF0;
  57                 str += (char)((c & 0x03F000) >> 12) | 0x80;
  58                 str += (char)((c & 0x0FC0) >> 6) | 0x80;
  59                 str += (char)(c & 0x3F) | 0x80;
  60         }
  61         else
  62         {
  63                 // Replacement character �
  64                 str += "\xEF\xB\xBD";
  65         }
  66 }
  67
  68 void CUtfStringView::append(std::string &str, u32char c)
  69 {
  70         appendUtf8(str, c);
  71 }
  72
  73 void CUtfStringView::append(IStream &s, u32char c)
  74 {
  75         nlassert(!s.isReading());
  76         std::string tmp;
  77         tmp.reserve(4);
  78         append(tmp, c);
  79         s.serialBuffer((uint8 *)&tmp[0], tmp.size());
  80 }
  81
  82 u32char CUtfStringView::get(IStream &s)
  83 {
  84         nlassert(s.isReading());
  85
  86         std::string tmp;
  87         tmp.reserve(4);
  88         uint8 c;
  89         s.serial(c);
  90
  91         // Single byte
  92         if (c < 0x80)
  93                 return c;
  94
  95         // Do a fast check of length
  96         tmp += (char)c;
  97         size_t len;
  98         if ((c & 0xF0) == 0xF0) len = 4;
  99         if ((c & 0xE0) == 0xE0) len = 3;
 100         else len = 2;
 101
 102         // Read from stream
 103         tmp.resize(len);
 104         s.serialBuffer((uint8 *)&tmp[1], len - 1);
 105
 106         // Decode
 107         const void *str = tmp.c_str();
 108         return utf8Iterator(&str);
 109 }
 110
 111 std::string CUtfStringView::toUtf8(bool reEncode) const
 112 {
 113         // Decode UTF and encode UTF-8
 114         // This implementation makes no attempt at fixing invalid codepoints
 115         if (m_Iterator == utf8Iterator && !reEncode)
 116                 return std::string((const char *)m_Str, (const char *)((ptrdiff_t)m_Str + m_Size));
 117         std::string res;
 118         res.reserve(m_Size);
 119         for (iterator it(begin()), end(this->end()); it != end; ++it)
 120         {
 121                 appendUtf8(res, *it);
 122         }
 123         return res;
 124 }
 125
 126 ucstring CUtfStringView::toUtf16(bool reEncode) const
 127 {
 128         if (m_Iterator == utf16Iterator && !reEncode)
 129                 return ucstring((const ucchar *)m_Str, (const ucchar *)((ptrdiff_t)m_Str + m_Size));
 130         ucstring res;
 131         res.reserve(m_Size << 1);
 132         for (iterator it(begin()), end(this->end()); it != end; ++it)
 133         {
 134                 u32char c = *it;
 135                 if (c < 0x10000)
 136                 {
 137                         res += c;
 138                 }
 139                 else
 140                 {
 141                         c -= 0x10000;
 142                         res += (c >> 10) | 0xD800;
 143                         res += (c & 0x3FF) | 0xDC00;
 144                 }
 145         }
 146         return res;
 147 }
 148
 149 ::u32string CUtfStringView::toUtf32() const
 150 {
 151         // Decode any UTF
 152         // This implementation makes no attempt at fixing bad encoding
 153         if (m_Iterator == utf32Iterator)
 154                 return ::u32string((const u32char *)m_Str, (const u32char *)((ptrdiff_t)m_Str + m_Size));
 155         ::u32string res;
 156         res.reserve(m_Size << 2);
 157         for (iterator it(begin()), end(this->end()); it != end; ++it)
 158                 res += *it;
 159         return res;
 160 }
 161
 162 std::string CUtfStringView::toAscii() const
 163 {
 164         std::string res;
 165         res.reserve(m_Size);
 166         for (iterator it(begin()), end(this->end()); it != end; ++it)
 167         {
 168                 u32char c = *it;
 169                 if (c < 0x80)
 170                         res += c;
 171                 else
 172                         res += '?';
 173         }
 174         return res;
 175 }
 176
 177 std::string CUtfStringView::fromAscii(const std::string &str)
 178 {
 179         std::string res;
 180         res.reserve(str.size());
 181         for (std::string::const_iterator it(str.begin()), end(str.end()); it != end; ++it)
 182         {
 183                 unsigned char c = *it;
 184                 if (c < 0x80)
 185                         res += (char)c;
 186                 else
 187                         res += '?';
 188         }
 189         return res;
 190 }
 191
 192 std::wstring CUtfStringView::toWide() const
 193 {
 194 #ifdef NL_OS_WINDOWS
 195         if (m_Iterator == utf16Iterator)
 196                 return std::wstring((const wchar_t *)m_Str, (const wchar_t *)((ptrdiff_t)m_Str + m_Size));
 197         std::wstring res;
 198         res.reserve(m_Size << 1);
 199         for (iterator it(begin()), end(this->end()); it != end; ++it)
 200         {
 201                 u32char c = *it;
 202                 if (c < 0x10000)
 203                 {
 204                         res += c;
 205                 }
 206                 else
 207                 {
 208                         c -= 0x10000;
 209                         res += (c >> 10) | 0xD800;
 210                         res += (c & 0x3FF) | 0xDC00;
 211                 }
 212         }
 213         return res;
 214 #else
 215         if (m_Iterator == utf32Iterator)
 216                 return std::wstring((const wchar_t *)m_Str, (const wchar_t *)((ptrdiff_t)m_Str + m_Size));
 217         std::wstring res;
 218         res.reserve(m_Size << 2);
 219         for (iterator it(begin()), end(this->end()); it != end; ++it)
 220                 res += *it;
 221         return res;
 222 #endif
 223 }
 224
 225 size_t CUtfStringView::count() const
 226 {
 227         size_t res = 0;
 228         for (iterator it(begin()), end(this->end()); it != end; ++it)
 229                 ++res;
 230         return res;
 231 }
 232
 233 ptrdiff_t CUtfStringView::offset(ptrdiff_t i)
 234 {
 235         size_t res = 0;
 236         for (iterator it(begin()), end(this->end()); it != end; ++it)
 237         {
 238                 if (res == i)
 239                         return (ptrdiff_t)it.ptr() - (ptrdiff_t)ptr();
 240                 ++res;
 241         }
 242         return res;
 243 }
 244
 245 u32char CUtfStringView::utf8Iterator(const void **addr)
 246 {
 247         // Decode UTF-8
 248         // This implementation makes no attempt at fixing bad encoding, except for bad UTF-16 surrogate pairs
 249         // Invalid characters are replaced with the replacement character
 250         const uint8 **pp = reinterpret_cast<const uint8 **>(addr);
 251         u32char c0 = **pp;
 252         ++(*pp);
 253         if (c0 >= 0x80)
 254         {
 255                 if (c0 < 0xC0)
 256                 {
 257                         // Replacement character �
 258                         return 0xFFFD;
 259                 }
 260                 uint8 cx = **pp;
 261                 if ((cx & 0xC0) == 0x80)
 262                 {
 263                         ++(*pp);
 264                         c0 &= 0x3F; // Drop first two bits
 265                         c0 <<= 6;
 266                         c0 |= (cx & 0x3F); // 12 bits now (6 + 6), 2-byte encoding
 267                         if (c0 & 0x800)
 268                         {
 269                                 cx = **pp;
 270                                 if ((cx & 0xC0) == 0x80)
 271                                 {
 272                                         ++(*pp);
 273                                         c0 &= 0x07FF; // Drop first bit
 274                                         c0 <<= 6;
 275                                         c0 |= (cx & 0x3F); // 17 bits now (12 - 1 + 6), 3-byte encoding
 276                                         if (c0 & 0x10000)
 277                                         {
 278                                                 cx = **pp;
 279                                                 if ((cx & 0xC0) == 0x80)
 280                                                 {
 281                                                         ++(*pp);
 282                                                         c0 &= 0xFFFF; // Drop first bit
 283                                                         c0 <<= 6;
 284                                                         c0 |= (cx & 0x3F); // 22 bits now (17 - 1 + 6), 4-byte encoding
 285                                                         if (c0 >= 0x110000)
 286                                                         {
 287                                                                 // Replacement character �
 288                                                                 return 0xFFFD;
 289                                                         }
 290                                                         else if (c0 < 0x10000)
 291                                                         {
 292                                                                 // Invalid encoding
 293                                                                 // Replacement character �
 294                                                                 return 0xFFFD;
 295                                                         }
 296                                                 }
 297                                                 else
 298                                                 {
 299                                                         // Replacement character �
 300                                                         return 0xFFFD;
 301                                                 }
 302                                         }
 303                                         else if ((c0 & 0xFC00) == 0xD800) // Higher bits of nutcase UTF-16 encoded as UTF-8
 304                                         {
 305                                                 uint8 cy;
 306                                                 if ((*pp)[0] == 0xED && ((cx = (*pp)[1]) & 0xF0) == 0xB0 && ((cy = (*pp)[2]) & 0xC0) == 0x80)
 307                                                 {
 308                                                         // Lower bits of nutcase UTF-16 encoded as UTF-8
 309                                                         (*pp) += 3;
 310                                                         uint16 c1 = (cx & 0x0F);
 311                                                         c1 <<= 6;
 312                                                         c1 |= (cy & 0x3F);
 313                                                         c0 &= 0x03FF;
 314                                                         c0 <<= 10;
 315                                                         c0 |= (c1 & 0x03FF);
 316                                                         c0 += 0x10000;
 317                                                 }
 318                                                 else
 319                                                 {
 320                                                         // Replacement character �
 321                                                         return 0xFFFD;
 322                                                 }
 323                                         }
 324                                         else if ((c0 & 0xFC00) == 0xDC00) // Lower bits of nutcase UTF-16 encoded as UTF-8
 325                                         {
 326                                                 // Replacement character �
 327                                                 return 0xFFFD;
 328                                         }
 329                                         else if (c0 < 0x0800)
 330                                         {
 331                                                 // Invalid encoding
 332                                                 // Replacement character �
 333                                                 return 0xFFFD;
 334                                         }
 335                                 }
 336                                 else
 337                                 {
 338                                         // Replacement character �
 339                                         return 0xFFFD;
 340                                 }
 341                         }
 342                         else if (c0 < 0x80)
 343                         {
 344                                 // Invalid encoding
 345                                 // Replacement character �
 346                                 return 0xFFFD;
 347                         }
 348                 }
 349                 else
 350                 {
 351                         // Replacement character �
 352                         return 0xFFFD;
 353                 }
 354         }
 355         return c0;
 356 }
 357
 358 u32char CUtfStringView::utf16Iterator(const void **addr)
 359 {
 360         // Decode UTF-16
 361         // This implementation makes no attempt at fixing bad encoding
 362         const uint16 **pp = reinterpret_cast<const uint16 **>(addr);
 363         u32char c0 = **pp;
 364         ++(*pp);
 365         if ((c0 & 0xFC00) == 0xD800) // Higher bits
 366         {
 367                 uint16 c1 = **pp;
 368                 if ((c1 & 0xFC00) == 0xDC00) // Lower bits
 369                 {
 370                         ++(*pp);
 371                         c0 &= 0x03FF;
 372                         c0 <<= 10;
 373                         c0 |= (c1 & 0x03FF);
 374                         c0 += 0x10000;
 375                 }
 376         }
 377         return c0;
 378 }
 379
 380 u32char CUtfStringView::utf32Iterator(const void **addr)
 381 {
 382         // UTF-32
 383         // This implementation makes no attempt at fixing bad encoding
 384         const u32char **pp = reinterpret_cast<const u32char **>(addr);
 385         u32char c = **pp;
 386         ++(*pp);
 387         return c;
 388 }
 389
 390 } /* namespace NLMISC */
 391
 392 /* end of file */