sal/textenc/tcvtutf8.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <sal/config.h>
  21
  22 #include <cassert>
  23
  24 #include <sal/types.h>
  25 #include <rtl/character.hxx>
  26 #include <rtl/textcvt.h>
  27
  28 #include "converter.hxx"
  29 #include "tcvtutf8.hxx"
  30 #include "tenchelp.hxx"
  31 #include "unichars.hxx"
  32
  33 namespace {
  34
  35 struct ImplUtf8ToUnicodeContext
  36 {
  37     sal_uInt32 nUtf32;
  38     int nBytes;
  39     int nShift;
  40     bool bCheckBom;
  41 };
  42
  43 struct ImplUnicodeToUtf8Context
  44 {
  45     sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
  46 };
  47
  48 }
  49
  50 void * ImplCreateUtf8ToUnicodeContext()
  51 {
  52     ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
  53     ImplResetUtf8ToUnicodeContext(p);
  54     return p;
  55 }
  56
  57 void ImplResetUtf8ToUnicodeContext(void * pContext)
  58 {
  59     if (pContext != nullptr)
  60     {
  61         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
  62         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
  63     }
  64 }
  65
  66 void ImplDestroyUtf8ToUnicodeContext(void * pContext)
  67 {
  68     delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
  69 }
  70
  71 sal_Size ImplConvertUtf8ToUnicode(
  72     void const * pData, void * pContext, char const * pSrcBuf,
  73     sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
  74     sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
  75 {
  76     bool bJavaUtf8 = pData != nullptr;
  77     sal_uInt32 nUtf32 = 0;
  78     int nBytes = int();
  79     int nShift = -1;
  80     bool bCheckBom = true;
  81     sal_uInt32 nInfo = 0;
  82     unsigned char const * pSrcBufPtr = reinterpret_cast<unsigned char const *>(pSrcBuf);
  83     unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
  84     sal_Unicode * pDestBufPtr = pDestBuf;
  85     sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
  86     unsigned char const * startOfCurrentChar = pSrcBufPtr;
  87
  88     if (pContext != nullptr)
  89     {
  90         nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
  91         nBytes = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes;
  92         nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
  93         bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
  94     }
  95
  96     while (pSrcBufPtr < pSrcBufEnd)
  97     {
  98         bool bConsume = true;
  99         sal_uInt32 nChar = *pSrcBufPtr++;
 100         if (nShift < 0)
 101             // Allow (illegal) 5 and 6 byte sequences, so they are read as a
 102             // single individual bad character:
 103             if (nChar <= 0x7F)
 104             {
 105                 nUtf32 = nChar;
 106                 nBytes = 1;
 107                 goto transform;
 108             }
 109             else if (nChar <= 0xBF)
 110                 goto bad_input;
 111             else if (nChar <= 0xDF)
 112             {
 113                 nUtf32 = (nChar & 0x1F) << 6;
 114                 nBytes = 2;
 115                 nShift = 0;
 116             }
 117             else if (nChar <= 0xEF)
 118             {
 119                 nUtf32 = (nChar & 0x0F) << 12;
 120                 nBytes = 3;
 121                 nShift = 6;
 122             }
 123             else if (nChar <= 0xF7)
 124             {
 125                 nUtf32 = (nChar & 0x07) << 18;
 126                 nBytes = 4;
 127                 nShift = 12;
 128             }
 129             else if (nChar <= 0xFB)
 130             {
 131                 nUtf32 = (nChar & 0x03) << 24;
 132                 nBytes = 5;
 133                 nShift = 18;
 134             }
 135             else if (nChar <= 0xFD)
 136             {
 137                 nUtf32 = (nChar & 0x01) << 30;
 138                 nBytes = 6;
 139                 nShift = 24;
 140             }
 141             else
 142                 goto bad_input;
 143         else if ((nChar & 0xC0) == 0x80)
 144         {
 145             nUtf32 |= (nChar & 0x3F) << nShift;
 146             if (nShift == 0)
 147                 goto transform;
 148             else
 149                 nShift -= 6;
 150         }
 151         else
 152         {
 153             /*
 154              This byte is preceded by a broken UTF-8 sequence; if this byte
 155              is neither in the range [0x80..0xBF] nor in the range
 156              [0xFE..0xFF], assume that this byte does not belong to that
 157              broken sequence, but instead starts a new, legal UTF-8 sequence:
 158              */
 159             bConsume = nChar >= 0xFE;
 160             goto bad_input;
 161         }
 162         continue;
 163
 164     transform:
 165         if (!bCheckBom || nUtf32 != 0xFEFF || nBytes != 3
 166             || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
 167             || bJavaUtf8)
 168         {
 169             switch (nBytes) {
 170             case 1:
 171                 if (bJavaUtf8 && nUtf32 == 0) {
 172                     goto bad_input;
 173                 }
 174                 break;
 175             case 2:
 176                 if (nUtf32 < 0x80 && !(bJavaUtf8 && nUtf32 == 0)) {
 177                     goto bad_input;
 178                 }
 179                 break;
 180             case 3:
 181                 if (nUtf32 < 0x800 || (!bJavaUtf8 && rtl::isSurrogate(nUtf32)))
 182                 {
 183                     goto bad_input;
 184                 }
 185                 break;
 186             case 4:
 187                 if (nUtf32 < 0x10000 || !rtl::isUnicodeCodePoint(nUtf32)
 188                     || bJavaUtf8)
 189                 {
 190                     goto bad_input;
 191                 }
 192                 break;
 193             default:
 194                 goto bad_input;
 195             }
 196             if (nUtf32 <= 0xFFFF)
 197                 if (pDestBufPtr != pDestBufEnd)
 198                     *pDestBufPtr++ = static_cast<sal_Unicode>(nUtf32);
 199                 else
 200                     goto no_output;
 201             else if (pDestBufEnd - pDestBufPtr >= 2)
 202                 pDestBufPtr += rtl::splitSurrogates(nUtf32, pDestBufPtr);
 203             else
 204                 goto no_output;
 205         }
 206         nShift = -1;
 207         bCheckBom = false;
 208         startOfCurrentChar = pSrcBufPtr;
 209         continue;
 210
 211     bad_input:
 212         switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
 213                     false, nBytes != 1, 0, nFlags, &pDestBufPtr, pDestBufEnd,
 214                     &nInfo))
 215         {
 216         case sal::detail::textenc::BAD_INPUT_STOP:
 217             nShift = -1;
 218             bCheckBom = false;
 219             if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
 220                 if (!bConsume)
 221                     --pSrcBufPtr;
 222             } else {
 223                 pSrcBufPtr = startOfCurrentChar;
 224             }
 225             break;
 226
 227         case sal::detail::textenc::BAD_INPUT_CONTINUE:
 228             nShift = -1;
 229             bCheckBom = false;
 230             if (!bConsume)
 231                 --pSrcBufPtr;
 232             startOfCurrentChar = pSrcBufPtr;
 233             continue;
 234
 235         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 236             goto no_output;
 237         }
 238         break;
 239
 240     no_output:
 241         --pSrcBufPtr;
 242         nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
 243         break;
 244     }
 245
 246     if (nShift >= 0
 247         && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
 248                          | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL))
 249                == 0)
 250     {
 251         if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
 252             nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL;
 253         else
 254             switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
 255                         false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
 256                         &nInfo))
 257             {
 258             case sal::detail::textenc::BAD_INPUT_STOP:
 259                 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
 260                     pSrcBufPtr = startOfCurrentChar;
 261                 }
 262                 [[fallthrough]];
 263             case sal::detail::textenc::BAD_INPUT_CONTINUE:
 264                 nShift = -1;
 265                 bCheckBom = false;
 266                 break;
 267
 268             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 269                 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
 270                 break;
 271             }
 272     }
 273
 274     if (pContext != nullptr)
 275     {
 276         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
 277         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes = nBytes;
 278         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
 279         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
 280     }
 281     if (pInfo != nullptr)
 282         *pInfo = nInfo;
 283     if (pSrcCvtBytes != nullptr)
 284         *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
 285     return pDestBufPtr - pDestBuf;
 286 }
 287
 288 void * ImplCreateUnicodeToUtf8Context()
 289 {
 290     ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
 291     ImplResetUnicodeToUtf8Context(p);
 292     return p;
 293 }
 294
 295 void ImplResetUnicodeToUtf8Context(void * pContext)
 296 {
 297     if (pContext != nullptr)
 298         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
 299 }
 300
 301 void ImplDestroyUnicodeToUtf8Context(void * pContext)
 302 {
 303     delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
 304 }
 305
 306 sal_Size ImplConvertUnicodeToUtf8(
 307     void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
 308     sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
 309     sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
 310 {
 311     bool bJavaUtf8 = pData != nullptr;
 312     sal_Unicode nHighSurrogate = 0xFFFF;
 313     sal_uInt32 nInfo = 0;
 314     sal_Unicode const * pSrcBufPtr = pSrcBuf;
 315     sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
 316     char * pDestBufPtr = pDestBuf;
 317     char * pDestBufEnd = pDestBufPtr + nDestBytes;
 318
 319     if (pContext != nullptr)
 320         nHighSurrogate
 321             = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
 322
 323     if (nHighSurrogate == 0xFFFF)
 324     {
 325         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
 326             && !bJavaUtf8)
 327         {
 328             if (pDestBufEnd - pDestBufPtr >= 3)
 329             {
 330                 /* Write BOM (U+FEFF) as UTF-8: */
 331                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
 332                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
 333                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
 334             }
 335             else
 336             {
 337                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
 338                 goto done;
 339             }
 340         }
 341         nHighSurrogate = 0;
 342     }
 343
 344     while (pSrcBufPtr < pSrcBufEnd)
 345     {
 346         sal_uInt32 nChar = *pSrcBufPtr++;
 347         if (nHighSurrogate == 0)
 348         {
 349             if (rtl::isHighSurrogate(nChar) && !bJavaUtf8)
 350             {
 351                 nHighSurrogate = static_cast<sal_Unicode>(nChar);
 352                 continue;
 353             }
 354             else if (rtl::isLowSurrogate(nChar) && !bJavaUtf8)
 355             {
 356                 goto bad_input;
 357             }
 358         }
 359         else if (rtl::isLowSurrogate(nChar) && !bJavaUtf8)
 360             nChar = rtl::combineSurrogates(nHighSurrogate, nChar);
 361         else
 362             goto bad_input;
 363
 364         assert(bJavaUtf8 ? nChar <= 0xFFFF : rtl::isUnicodeScalarValue(nChar));
 365
 366         if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
 367             if (pDestBufPtr != pDestBufEnd)
 368                 *pDestBufPtr++ = static_cast< char >(nChar);
 369             else
 370                 goto no_output;
 371         else if (nChar <= 0x7FF)
 372             if (pDestBufEnd - pDestBufPtr >= 2)
 373             {
 374                 *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
 375                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
 376             }
 377             else
 378                 goto no_output;
 379         else if (nChar <= 0xFFFF)
 380             if (pDestBufEnd - pDestBufPtr >= 3)
 381             {
 382                 *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
 383                 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
 384                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
 385             }
 386             else
 387                 goto no_output;
 388         else if (pDestBufEnd - pDestBufPtr >= 4)
 389         {
 390             *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
 391             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
 392             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
 393             *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
 394         }
 395         else
 396             goto no_output;
 397         nHighSurrogate = 0;
 398         continue;
 399
 400     bad_input:
 401         switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
 402                     false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, nullptr,
 403                     0, nullptr))
 404         {
 405         case sal::detail::textenc::BAD_INPUT_STOP:
 406             nHighSurrogate = 0;
 407             break;
 408
 409         case sal::detail::textenc::BAD_INPUT_CONTINUE:
 410             nHighSurrogate = 0;
 411             continue;
 412
 413         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 414             goto no_output;
 415         }
 416         break;
 417
 418     no_output:
 419         --pSrcBufPtr;
 420         nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
 421         break;
 422     }
 423
 424     if (nHighSurrogate != 0
 425         && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
 426                          | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
 427                == 0)
 428     {
 429         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
 430             nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
 431         else
 432             switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
 433                         false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
 434                         nullptr, 0, nullptr))
 435             {
 436             case sal::detail::textenc::BAD_INPUT_STOP:
 437             case sal::detail::textenc::BAD_INPUT_CONTINUE:
 438                 nHighSurrogate = 0;
 439                 break;
 440
 441             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 442                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
 443                 break;
 444             }
 445     }
 446
 447  done:
 448     if (pContext != nullptr)
 449         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
 450             = nHighSurrogate;
 451     if (pInfo != nullptr)
 452         *pInfo = nInfo;
 453     if (pSrcCvtChars != nullptr)
 454         *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
 455     return pDestBufPtr - pDestBuf;
 456 }
 457
 458 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */