xapian-applications/omega/utf8convert.cc

   1 /** @file
   2  * @brief convert a string to UTF-8 encoding.
   3  */
   4 /* Copyright (C) 2006,2007,2008,2010,2013,2017,2019,2021 Olly Betts
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  19  */
  20
  21 #include <config.h>
  22
  23 #include "utf8convert.h"
  24
  25 #include <algorithm>
  26 #include <cerrno>
  27 #include <string>
  28
  29 #ifdef HAVE_ICONV
  30 # include <iconv.h>
  31 #endif
  32 #include <xapian.h>
  33 #include "strcasecmp.h"
  34 #include "stringutils.h"
  35
  36 using namespace std;
  37
  38 bool
  39 convert_to_utf8_(const string& text, const string& charset, string& output)
  40 {
  41     // Shortcut if it's already in utf8!
  42     if (charset.size() == 5 && strcasecmp(charset.c_str(), "utf-8") == 0)
  43         return false;
  44     if (charset.size() == 4 && strcasecmp(charset.c_str(), "utf8") == 0)
  45         return false;
  46     if (charset.size() == 8 && strcasecmp(charset.c_str(), "us-ascii") == 0)
  47         return false;
  48
  49     // Nobody has told us what charset it's in, so do as little work as
  50     // possible!
  51     if (charset.empty())
  52         return false;
  53
  54     char buf[1024];
  55     string tmp;
  56
  57     /* Handle iso-8859-1/iso-8859-15//windows-1252/cp-1252, utf-16/ucs-2,
  58      * utf-16be/ucs-2be, and utf-16le/ucs-2le. */
  59     const char * p = charset.c_str();
  60
  61     bool utf16 = false;
  62     if (strncasecmp(p, "utf", 3) == 0) {
  63         p += 3;
  64         if (*p == '-' || *p == '_' || *p == ' ') ++p;
  65         if (*p != '1' || p[1] != '6') goto try_iconv;
  66         p += 2;
  67         utf16 = true;
  68     } else if (strncasecmp(p, "ucs", 3) == 0) {
  69         p += 3;
  70         if (*p == '-' || *p == '_' || *p == ' ') ++p;
  71         if (*p != '2') goto try_iconv;
  72         ++p;
  73         utf16 = true;
  74     }
  75
  76     if (utf16) {
  77         if (text.size() < 2) return false;
  78
  79         bool big_endian = true;
  80         string::const_iterator i = text.begin();
  81         if (*p == '\0') {
  82             // GNU iconv doesn't seem to handle BOMs.
  83             if (startswith(text, "\xfe\xff")) {
  84                 i += 2;
  85             } else if (startswith(text, "\xff\xfe")) {
  86                 big_endian = false;
  87                 i += 2;
  88             }
  89             // UTF-16 with no BOM is meant to be assumed to be BE.  Strictly
  90             // speaking, we're not meant to assume anything for UCS-2 with
  91             // no BOM, but we've got to do something, so we might as well
  92             // assume it's UTF-16 mislabelled, which is easy and sane.
  93         } else if (strcasecmp(p, "LE") == 0) {
  94             big_endian = false;
  95         } else if (!(strcasecmp(p, "BE") == 0)) {
  96             goto try_iconv;
  97         }
  98
  99         tmp.reserve(text.size() / 2);
 100
 101         size_t start = 0;
 102         auto text_end = text.end();
 103         if (text.size() & 1) {
 104             // If there's a half-character at the end, nuke it now to make the
 105             // conversion loop below simpler.
 106             --text_end;
 107         }
 108
 109         while (i != text_end) {
 110             unsigned ch = static_cast<unsigned char>(*i++);
 111             unsigned ch2 = static_cast<unsigned char>(*i++);
 112             if (big_endian) {
 113                 ch = (ch << 8) | ch2;
 114             } else {
 115                 ch = (ch2 << 8) | ch;
 116             }
 117             if (ch >> 10 == 0xd800 >> 10) {
 118                 // Surrogate pair.
 119                 if (i == text_end) break;
 120                 unsigned hi = (ch & 0x3ff);
 121                 ch = static_cast<unsigned char>(*i++);
 122                 ch2 = static_cast<unsigned char>(*i++);
 123                 if (big_endian) {
 124                     ch = (ch << 8) | ch2;
 125                 } else {
 126                     ch = (ch2 << 8) | ch;
 127                 }
 128                 if (ch >> 10 == 0xdc00 >> 10) {
 129                     ch &= 0x3ff;
 130                     ch |= (hi << 10);
 131                     ch += 0x10000;
 132                 }
 133             }
 134             start += Xapian::Unicode::to_utf8(ch, buf + start);
 135             if (start >= sizeof(buf) - 4) {
 136                 tmp.append(buf, start);
 137                 start = 0;
 138             }
 139         }
 140         if (start) tmp.append(buf, start);
 141     } else {
 142         // Assume windows-1252 if iso-8859-1 is specified.  The only
 143         // differences are in the range 128-159 which are control characters in
 144         // iso-8859-1, and a lot of content is mislabelled.  We use our own
 145         // conversion code for this case, as GNU iconv fails if it sees one of
 146         // the unassigned code points in windows-1252, whereas it would accept
 147         // the same input as iso-8859-1, and it seems undesirable to be
 148         // rejecting input due to this behind-the-scenes character set
 149         // shenanigans.
 150         const char * q = NULL;
 151         if (strncasecmp(p, "windows", 7) == 0) {
 152             q = p + 7;
 153         } else if (strncasecmp(p, "cp", 2) == 0) {
 154             q = p + 2;
 155         }
 156         if (q) {
 157             if (*q == '-' || *q == '_' || *q == ' ') ++q;
 158             if (strcmp(q, "1252") != 0)
 159                 goto try_iconv;
 160         } else {
 161             if (strncasecmp(p, "iso", 3) == 0) {
 162                 p += 3;
 163                 if (*p == '-' || *p == '_' || *p == ' ') ++p;
 164             }
 165             if (strncmp(p, "8859", 4) != 0) goto try_iconv;
 166             p += 4;
 167             if (*p == '-' || *p == '_' || *p == ' ') ++p;
 168             if (*p != '1') goto try_iconv;
 169             if (strcmp(p + 1, "5") == 0) goto iso8859_15;
 170             if (p[1] != '\0') goto try_iconv;
 171         }
 172
 173         // FIXME: pull this out as a standard "normalise utf-8" function?
 174         tmp.reserve(text.size());
 175
 176         size_t start = 0;
 177         for (string::const_iterator i = text.begin(); i != text.end(); ++i) {
 178             static const unsigned cp1252_to_unicode[32] = {
 179                 0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
 180                 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
 181                 0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
 182                 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
 183             };
 184             const size_t CP1252_TO_UNICODE_ENTRIES =
 185                 sizeof(cp1252_to_unicode) / sizeof(*cp1252_to_unicode);
 186             unsigned ch = static_cast<unsigned char>(*i);
 187             if (ch - 128 < CP1252_TO_UNICODE_ENTRIES)
 188                 ch = cp1252_to_unicode[ch - 128];
 189             start += Xapian::Unicode::to_utf8(ch, buf + start);
 190             if (start >= sizeof(buf) - 4) {
 191                 tmp.append(buf, start);
 192                 start = 0;
 193             }
 194         }
 195         if (start) tmp.append(buf, start);
 196     }
 197
 198     if (false) {
 199 try_iconv:
 200 #ifdef HAVE_ICONV
 201         iconv_t conv = iconv_open("UTF-8", charset.c_str());
 202         if (conv == reinterpret_cast<iconv_t>(-1))
 203             return false;
 204         ICONV_CONST char* in = const_cast<char *>(text.c_str());
 205         size_t in_len = text.size();
 206         while (in_len) {
 207             char * out = buf;
 208             size_t out_len = sizeof(buf);
 209             if (iconv(conv, &in, &in_len, &out, &out_len) == size_t(-1) &&
 210                 errno != E2BIG) {
 211                 // FIXME: how to handle this?
 212                 break;
 213             }
 214             tmp.append(buf, out - buf);
 215         }
 216
 217         (void)iconv_close(conv);
 218 #else
 219         return false;
 220 #endif
 221     }
 222
 223     if (false) {
 224 iso8859_15:
 225         tmp.reserve(text.size());
 226
 227         size_t start = 0;
 228         for (string::const_iterator i = text.begin(); i != text.end(); ++i) {
 229             static const unsigned iso8859_15_to_unicode[] = {
 230                 0x20ac, 0x00a5, 0x0160, 0x00a7, 0x0161, 0x00a9, 0x00aa, 0x00ab,
 231                 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b1, 0x00b2, 0x00b3,
 232                 0x017d, 0x00b5, 0x00b6, 0x00b7, 0x017e, 0x00b9, 0x00ba, 0x00bb,
 233                 0x0152, 0x0153, 0x0178
 234             };
 235             const size_t ISO8859_15_TO_UNICODE_ENTRIES =
 236                 sizeof(iso8859_15_to_unicode) / sizeof(*iso8859_15_to_unicode);
 237             unsigned ch = static_cast<unsigned char>(*i);
 238             if (ch - 164 < ISO8859_15_TO_UNICODE_ENTRIES)
 239                 ch = iso8859_15_to_unicode[ch - 164];
 240             start += Xapian::Unicode::to_utf8(ch, buf + start);
 241             if (start >= sizeof(buf) - 4) {
 242                 tmp.append(buf, start);
 243                 start = 0;
 244             }
 245         }
 246         if (start) tmp.append(buf, start);
 247     }
 248
 249     // `output` may be a reference to the same string object as `text` so we
 250     // only switch after we've done converting.
 251     output = std::move(tmp);
 252     return true;
 253 }