xapian-applications/omega/utf8convert.cc

   1 /** @file
   2  * @brief convert a string to UTF-8 encoding.
   3  */
   4 /* Copyright (C) 2006,2007,2008,2010,2013,2017,2019,2021,2023 Olly Betts
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  19  */
  20
  21 #include <config.h>
  22
  23 #include "utf8convert.h"
  24
  25 #include <algorithm>
  26 #include <cerrno>
  27 #include <string>
  28
  29 #ifdef HAVE_ICONV
  30 # include <iconv.h>
  31 #endif
  32 #include <xapian.h>
  33 #include "strcasecmp.h"
  34 #include "stringutils.h"
  35
  36 using namespace std;
  37
  38 bool
  39 convert_to_utf8_(string_view text, const string& charset, string& output)
  40 {
  41     // Shortcut if it's already in utf8!
  42     if (charset.size() == 5 && strcasecmp(charset.c_str(), "utf-8") == 0)
  43         return false;
  44     if (charset.size() == 4 && strcasecmp(charset.c_str(), "utf8") == 0)
  45         return false;
  46     if (charset.size() == 8 && strcasecmp(charset.c_str(), "us-ascii") == 0)
  47         return false;
  48
  49     // Nobody has told us what charset it's in, so do as little work as
  50     // possible!
  51     if (charset.empty())
  52         return false;
  53
  54     char buf[1024];
  55     string tmp;
  56
  57     /* Handle iso-8859-1/iso-8859-15//windows-1252/cp-1252, utf-16/ucs-2,
  58      * utf-16be/ucs-2be, and utf-16le/ucs-2le. */
  59     const char * p = charset.c_str();
  60
  61     bool utf16 = false;
  62     if (strncasecmp(p, "utf", 3) == 0) {
  63         p += 3;
  64         if (*p == '-' || *p == '_' || *p == ' ') ++p;
  65         if (*p != '1' || p[1] != '6') goto try_iconv;
  66         p += 2;
  67         utf16 = true;
  68     } else if (strncasecmp(p, "ucs", 3) == 0) {
  69         p += 3;
  70         if (*p == '-' || *p == '_' || *p == ' ') ++p;
  71         if (*p != '2') goto try_iconv;
  72         ++p;
  73         utf16 = true;
  74     }
  75
  76     if (utf16) {
  77         if (text.size() < 2) return false;
  78
  79         bool big_endian = true;
  80         auto i = text.begin();
  81         if (*p == '\0') {
  82             // GNU iconv doesn't seem to handle BOMs.
  83             if (startswith(text, "\xfe\xff")) {
  84                 i += 2;
  85             } else if (startswith(text, "\xff\xfe")) {
  86                 big_endian = false;
  87                 i += 2;
  88             }
  89             // UTF-16 with no BOM is meant to be assumed to be BE.  Strictly
  90             // speaking, we're not meant to assume anything for UCS-2 with
  91             // no BOM, but we've got to do something, so we might as well
  92             // assume it's UTF-16 mislabelled, which is easy and sane.
  93         } else if (strcasecmp(p, "LE") == 0) {
  94             big_endian = false;
  95         } else if (!(strcasecmp(p, "BE") == 0)) {
  96             goto try_iconv;
  97         }
  98
  99         tmp.reserve(text.size() / 2);
 100
 101         size_t start = 0;
 102         auto text_end = text.end();
 103         if (text.size() & 1) {
 104             // If there's a half-character at the end, nuke it now to make the
 105             // conversion loop below simpler.
 106             --text_end;
 107         }
 108
 109         while (i != text_end) {
 110             unsigned ch = static_cast<unsigned char>(*i++);
 111             unsigned ch2 = static_cast<unsigned char>(*i++);
 112             if (big_endian) {
 113                 ch = (ch << 8) | ch2;
 114             } else {
 115                 ch = (ch2 << 8) | ch;
 116             }
 117             if (ch >> 10 == 0xd800 >> 10) {
 118                 // Surrogate pair.
 119                 if (i == text_end) break;
 120                 unsigned hi = (ch & 0x3ff);
 121                 ch = static_cast<unsigned char>(*i++);
 122                 ch2 = static_cast<unsigned char>(*i++);
 123                 if (big_endian) {
 124                     ch = (ch << 8) | ch2;
 125                 } else {
 126                     ch = (ch2 << 8) | ch;
 127                 }
 128                 if (ch >> 10 == 0xdc00 >> 10) {
 129                     ch &= 0x3ff;
 130                     ch |= (hi << 10);
 131                     ch += 0x10000;
 132                 }
 133             }
 134             start += Xapian::Unicode::to_utf8(ch, buf + start);
 135             if (start >= sizeof(buf) - 4) {
 136                 tmp.append(buf, start);
 137                 start = 0;
 138             }
 139         }
 140         if (start) tmp.append(buf, start);
 141     } else {
 142         // Assume windows-1252 if iso-8859-1 is specified.  The only
 143         // differences are in the range 128-159 which are control characters in
 144         // iso-8859-1, and a lot of content is mislabelled.  We use our own
 145         // conversion code for this case, as GNU iconv fails if it sees one of
 146         // the unassigned code points in windows-1252, whereas it would accept
 147         // the same input as iso-8859-1, and it seems undesirable to be
 148         // rejecting input due to this behind-the-scenes character set
 149         // shenanigans.
 150         const char * q = NULL;
 151         if (strncasecmp(p, "windows", 7) == 0) {
 152             q = p + 7;
 153         } else if (strncasecmp(p, "cp", 2) == 0) {
 154             q = p + 2;
 155         }
 156         if (q) {
 157             if (*q == '-' || *q == '_' || *q == ' ') ++q;
 158             if (strcmp(q, "1252") != 0)
 159                 goto try_iconv;
 160         } else {
 161             if (strncasecmp(p, "iso", 3) == 0) {
 162                 p += 3;
 163                 if (*p == '-' || *p == '_' || *p == ' ') ++p;
 164             }
 165             if (strncmp(p, "8859", 4) != 0) goto try_iconv;
 166             p += 4;
 167             if (*p == '-' || *p == '_' || *p == ' ') ++p;
 168             if (*p != '1') goto try_iconv;
 169             if (strcmp(p + 1, "5") == 0) goto iso8859_15;
 170             if (p[1] != '\0') goto try_iconv;
 171         }
 172
 173         // FIXME: pull this out as a standard "normalise utf-8" function?
 174         tmp.reserve(text.size());
 175
 176         size_t start = 0;
 177         for (unsigned char ch : text) {
 178             static const unsigned cp1252_to_unicode[32] = {
 179                 0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
 180                 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
 181                 0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
 182                 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
 183             };
 184             unsigned code_point = ch;
 185             unsigned i = UNSIGNED_OVERFLOW_OK(code_point - 128);
 186             if (i < std::size(cp1252_to_unicode))
 187                 code_point = cp1252_to_unicode[i];
 188             start += Xapian::Unicode::to_utf8(code_point, buf + start);
 189             if (start >= sizeof(buf) - 4) {
 190                 tmp.append(buf, start);
 191                 start = 0;
 192             }
 193         }
 194         if (start) tmp.append(buf, start);
 195     }
 196
 197     if (false) {
 198 try_iconv:
 199 #ifdef HAVE_ICONV
 200         iconv_t conv = iconv_open("UTF-8", charset.c_str());
 201         if (conv == reinterpret_cast<iconv_t>(-1))
 202             return false;
 203         ICONV_CONST char* in = const_cast<char *>(text.data());
 204         size_t in_len = text.size();
 205         while (in_len) {
 206             char * out = buf;
 207             size_t out_len = sizeof(buf);
 208             if (iconv(conv, &in, &in_len, &out, &out_len) == size_t(-1) &&
 209                 errno != E2BIG) {
 210                 // FIXME: how to handle this?
 211                 break;
 212             }
 213             tmp.append(buf, out - buf);
 214         }
 215
 216         (void)iconv_close(conv);
 217 #else
 218         return false;
 219 #endif
 220     }
 221
 222     if (false) {
 223 iso8859_15:
 224         tmp.reserve(text.size());
 225
 226         size_t start = 0;
 227         for (unsigned char ch : text) {
 228             static const unsigned iso8859_15_to_unicode[] = {
 229                 0x20ac, 0x00a5, 0x0160, 0x00a7, 0x0161, 0x00a9, 0x00aa, 0x00ab,
 230                 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b1, 0x00b2, 0x00b3,
 231                 0x017d, 0x00b5, 0x00b6, 0x00b7, 0x017e, 0x00b9, 0x00ba, 0x00bb,
 232                 0x0152, 0x0153, 0x0178
 233             };
 234             unsigned code_point = ch;
 235             unsigned i = UNSIGNED_OVERFLOW_OK(code_point - 164);
 236             if (i < std::size(iso8859_15_to_unicode))
 237                 code_point = iso8859_15_to_unicode[i];
 238             start += Xapian::Unicode::to_utf8(code_point, buf + start);
 239             if (start >= sizeof(buf) - 4) {
 240                 tmp.append(buf, start);
 241                 start = 0;
 242             }
 243         }
 244         if (start) tmp.append(buf, start);
 245     }
 246
 247     // `output` may be a reference to the same string object as `text` so we
 248     // only switch after we've done converting.
 249     output = std::move(tmp);
 250     return true;
 251 }