sal/textenc/tcvtutf8.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <sal/config.h>
  21
  22 #include <cassert>
  23
  24 #include <sal/types.h>
  25 #include <rtl/character.hxx>
  26 #include <rtl/textcvt.h>
  27
  28 #include "converter.hxx"
  29 #include "tcvtutf8.hxx"
  30
  31 namespace {
  32
  33 struct ImplUtf8ToUnicodeContext
  34 {
  35     sal_uInt32 nUtf32;
  36     int nBytes;
  37     int nShift;
  38     bool bCheckBom;
  39 };
  40
  41 struct ImplUnicodeToUtf8Context
  42 {
  43     sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
  44 };
  45
  46 }
  47
  48 void * ImplCreateUtf8ToUnicodeContext()
  49 {
  50     ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
  51     ImplResetUtf8ToUnicodeContext(p);
  52     return p;
  53 }
  54
  55 void ImplResetUtf8ToUnicodeContext(void * pContext)
  56 {
  57     if (pContext != nullptr)
  58     {
  59         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes = 1;
  60         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
  61         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
  62     }
  63 }
  64
  65 void ImplDestroyUtf8ToUnicodeContext(void * pContext)
  66 {
  67     delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
  68 }
  69
  70 sal_Size ImplConvertUtf8ToUnicode(
  71     void const * pData, void * pContext, char const * pSrcBuf,
  72     sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
  73     sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
  74 {
  75     bool bJavaUtf8 = pData != nullptr;
  76     sal_uInt32 nUtf32 = 0;
  77     int nBytes = 1;
  78     int nShift = -1;
  79     bool bCheckBom = true;
  80     sal_uInt32 nInfo = 0;
  81     unsigned char const * pSrcBufPtr = reinterpret_cast<unsigned char const *>(pSrcBuf);
  82     unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
  83     sal_Unicode * pDestBufPtr = pDestBuf;
  84     sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
  85     unsigned char const * startOfCurrentChar = pSrcBufPtr;
  86
  87     if (pContext != nullptr)
  88     {
  89         nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
  90         nBytes = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes;
  91         nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
  92         bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
  93     }
  94
  95     while (pSrcBufPtr < pSrcBufEnd)
  96     {
  97         bool bConsume = true;
  98         sal_uInt32 nChar = *pSrcBufPtr++;
  99         if (nShift < 0)
 100             // Allow (illegal) 5 and 6 byte sequences, so they are read as a
 101             // single individual bad character:
 102             if (nChar <= 0x7F)
 103             {
 104                 nUtf32 = nChar;
 105                 nBytes = 1;
 106                 goto transform;
 107             }
 108             else if (nChar <= 0xBF)
 109                 goto bad_input;
 110             else if (nChar <= 0xDF)
 111             {
 112                 nUtf32 = (nChar & 0x1F) << 6;
 113                 nBytes = 2;
 114                 nShift = 0;
 115             }
 116             else if (nChar <= 0xEF)
 117             {
 118                 nUtf32 = (nChar & 0x0F) << 12;
 119                 nBytes = 3;
 120                 nShift = 6;
 121             }
 122             else if (nChar <= 0xF7)
 123             {
 124                 nUtf32 = (nChar & 0x07) << 18;
 125                 nBytes = 4;
 126                 nShift = 12;
 127             }
 128             else if (nChar <= 0xFB)
 129             {
 130                 nUtf32 = (nChar & 0x03) << 24;
 131                 nBytes = 5;
 132                 nShift = 18;
 133             }
 134             else if (nChar <= 0xFD)
 135             {
 136                 nUtf32 = (nChar & 0x01) << 30;
 137                 nBytes = 6;
 138                 nShift = 24;
 139             }
 140             else
 141                 goto bad_input;
 142         else if ((nChar & 0xC0) == 0x80)
 143         {
 144             nUtf32 |= (nChar & 0x3F) << nShift;
 145             if (nShift == 0)
 146                 goto transform;
 147             else
 148                 nShift -= 6;
 149         }
 150         else
 151         {
 152             /*
 153              This byte is preceded by a broken UTF-8 sequence; if this byte
 154              is neither in the range [0x80..0xBF] nor in the range
 155              [0xFE..0xFF], assume that this byte does not belong to that
 156              broken sequence, but instead starts a new, legal UTF-8 sequence:
 157              */
 158             bConsume = nChar >= 0xFE;
 159             goto bad_input;
 160         }
 161         continue;
 162
 163     transform:
 164         if (!bCheckBom || nUtf32 != 0xFEFF || nBytes != 3
 165             || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
 166             || bJavaUtf8)
 167         {
 168             switch (nBytes) {
 169             case 1:
 170                 if (bJavaUtf8 && nUtf32 == 0) {
 171                     goto bad_input;
 172                 }
 173                 break;
 174             case 2:
 175                 if (nUtf32 < 0x80 && !(bJavaUtf8 && nUtf32 == 0)) {
 176                     goto bad_input;
 177                 }
 178                 break;
 179             case 3:
 180                 if (nUtf32 < 0x800 || (!bJavaUtf8 && rtl::isSurrogate(nUtf32)))
 181                 {
 182                     goto bad_input;
 183                 }
 184                 break;
 185             case 4:
 186                 if (nUtf32 < 0x10000 || !rtl::isUnicodeCodePoint(nUtf32)
 187                     || bJavaUtf8)
 188                 {
 189                     goto bad_input;
 190                 }
 191                 break;
 192             default:
 193                 goto bad_input;
 194             }
 195             if (nUtf32 <= 0xFFFF)
 196                 if (pDestBufPtr != pDestBufEnd)
 197                     *pDestBufPtr++ = static_cast<sal_Unicode>(nUtf32);
 198                 else
 199                     goto no_output;
 200             else if (pDestBufEnd - pDestBufPtr >= 2)
 201                 pDestBufPtr += rtl::splitSurrogates(nUtf32, pDestBufPtr);
 202             else
 203                 goto no_output;
 204         }
 205         nShift = -1;
 206         bCheckBom = false;
 207         startOfCurrentChar = pSrcBufPtr;
 208         continue;
 209
 210     bad_input:
 211         switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
 212                     false, nBytes != 1, 0, nFlags, &pDestBufPtr, pDestBufEnd,
 213                     &nInfo))
 214         {
 215         case sal::detail::textenc::BAD_INPUT_STOP:
 216             nShift = -1;
 217             bCheckBom = false;
 218             if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) {
 219                 if (!bConsume)
 220                     --pSrcBufPtr;
 221             } else {
 222                 pSrcBufPtr = startOfCurrentChar;
 223             }
 224             break;
 225
 226         case sal::detail::textenc::BAD_INPUT_CONTINUE:
 227             nShift = -1;
 228             bCheckBom = false;
 229             if (!bConsume)
 230                 --pSrcBufPtr;
 231             startOfCurrentChar = pSrcBufPtr;
 232             continue;
 233
 234         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 235             goto no_output;
 236         }
 237         break;
 238
 239     no_output:
 240         --pSrcBufPtr;
 241         nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
 242         break;
 243     }
 244
 245     if (nShift >= 0
 246         && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
 247                          | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL))
 248                == 0)
 249     {
 250         if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
 251             nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL;
 252         else
 253             switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
 254                         false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
 255                         &nInfo))
 256             {
 257             case sal::detail::textenc::BAD_INPUT_STOP:
 258                 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) != 0) {
 259                     pSrcBufPtr = startOfCurrentChar;
 260                 }
 261                 [[fallthrough]];
 262             case sal::detail::textenc::BAD_INPUT_CONTINUE:
 263                 nShift = -1;
 264                 bCheckBom = false;
 265                 break;
 266
 267             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 268                 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
 269                 break;
 270             }
 271     }
 272
 273     if (pContext != nullptr)
 274     {
 275         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
 276         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nBytes = nBytes;
 277         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
 278         static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
 279     }
 280     if (pInfo != nullptr)
 281         *pInfo = nInfo;
 282     if (pSrcCvtBytes != nullptr)
 283         *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
 284     return pDestBufPtr - pDestBuf;
 285 }
 286
 287 void * ImplCreateUnicodeToUtf8Context()
 288 {
 289     ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
 290     ImplResetUnicodeToUtf8Context(p);
 291     return p;
 292 }
 293
 294 void ImplResetUnicodeToUtf8Context(void * pContext)
 295 {
 296     if (pContext != nullptr)
 297         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
 298 }
 299
 300 void ImplDestroyUnicodeToUtf8Context(void * pContext)
 301 {
 302     delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
 303 }
 304
 305 sal_Size ImplConvertUnicodeToUtf8(
 306     void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
 307     sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
 308     sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
 309 {
 310     bool bJavaUtf8 = pData != nullptr;
 311     sal_Unicode nHighSurrogate = 0xFFFF;
 312     sal_uInt32 nInfo = 0;
 313     sal_Unicode const * pSrcBufPtr = pSrcBuf;
 314     sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
 315     char * pDestBufPtr = pDestBuf;
 316     char * pDestBufEnd = pDestBufPtr + nDestBytes;
 317
 318     if (pContext != nullptr)
 319         nHighSurrogate
 320             = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
 321
 322     if (nHighSurrogate == 0xFFFF)
 323     {
 324         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
 325             && !bJavaUtf8)
 326         {
 327             if (pDestBufEnd - pDestBufPtr >= 3)
 328             {
 329                 /* Write BOM (U+FEFF) as UTF-8: */
 330                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
 331                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
 332                 *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
 333             }
 334             else
 335             {
 336                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
 337                 goto done;
 338             }
 339         }
 340         nHighSurrogate = 0;
 341     }
 342
 343     while (pSrcBufPtr < pSrcBufEnd)
 344     {
 345         sal_uInt32 nChar = *pSrcBufPtr++;
 346         if (nHighSurrogate == 0)
 347         {
 348             if (rtl::isHighSurrogate(nChar) && !bJavaUtf8)
 349             {
 350                 nHighSurrogate = static_cast<sal_Unicode>(nChar);
 351                 continue;
 352             }
 353             else if (rtl::isLowSurrogate(nChar) && !bJavaUtf8)
 354             {
 355                 goto bad_input;
 356             }
 357         }
 358         else if (rtl::isLowSurrogate(nChar) && !bJavaUtf8)
 359             nChar = rtl::combineSurrogates(nHighSurrogate, nChar);
 360         else
 361             goto bad_input;
 362
 363         assert(bJavaUtf8 ? nChar <= 0xFFFF : rtl::isUnicodeScalarValue(nChar));
 364
 365         if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
 366             if (pDestBufPtr != pDestBufEnd)
 367                 *pDestBufPtr++ = static_cast< char >(nChar);
 368             else
 369                 goto no_output;
 370         else if (nChar <= 0x7FF)
 371             if (pDestBufEnd - pDestBufPtr >= 2)
 372             {
 373                 *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
 374                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
 375             }
 376             else
 377                 goto no_output;
 378         else if (nChar <= 0xFFFF)
 379             if (pDestBufEnd - pDestBufPtr >= 3)
 380             {
 381                 *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
 382                 *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
 383                 *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
 384             }
 385             else
 386                 goto no_output;
 387         else if (pDestBufEnd - pDestBufPtr >= 4)
 388         {
 389             *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
 390             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
 391             *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
 392             *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
 393         }
 394         else
 395             goto no_output;
 396         nHighSurrogate = 0;
 397         continue;
 398
 399     bad_input:
 400         switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
 401                     false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, nullptr,
 402                     0, nullptr))
 403         {
 404         case sal::detail::textenc::BAD_INPUT_STOP:
 405             nHighSurrogate = 0;
 406             break;
 407
 408         case sal::detail::textenc::BAD_INPUT_CONTINUE:
 409             nHighSurrogate = 0;
 410             continue;
 411
 412         case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 413             goto no_output;
 414         }
 415         break;
 416
 417     no_output:
 418         --pSrcBufPtr;
 419         nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
 420         break;
 421     }
 422
 423     if (nHighSurrogate != 0
 424         && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
 425                          | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
 426                == 0)
 427     {
 428         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
 429             nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
 430         else
 431             switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
 432                         false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
 433                         nullptr, 0, nullptr))
 434             {
 435             case sal::detail::textenc::BAD_INPUT_STOP:
 436             case sal::detail::textenc::BAD_INPUT_CONTINUE:
 437                 nHighSurrogate = 0;
 438                 break;
 439
 440             case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
 441                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
 442                 break;
 443             }
 444     }
 445
 446  done:
 447     if (pContext != nullptr)
 448         static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
 449             = nHighSurrogate;
 450     if (pInfo != nullptr)
 451         *pInfo = nInfo;
 452     if (pSrcCvtChars != nullptr)
 453         *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
 454     return pDestBufPtr - pDestBuf;
 455 }
 456
 457 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */