lib/iconv_open1.h

   1 /*
   2  * Copyright (C) 1999-2008, 2011, 2018, 2020 Free Software Foundation, Inc.
   3  * This file is part of the GNU LIBICONV Library.
   4  *
   5  * The GNU LIBICONV Library is free software; you can redistribute it
   6  * and/or modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either version 2.1
   8  * of the License, or (at your option) any later version.
   9  *
  10  * The GNU LIBICONV Library is distributed in the hope that it will be
  11  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
  17  * If not, see <https://www.gnu.org/licenses/>.
  18  */
  19
  20 /* Part 1 of iconv_open.
  21    Input: const char* tocode, const char* fromcode.
  22    Output:
  23      unsigned int from_index;
  24      int from_wchar;
  25      unsigned int to_index;
  26      int to_wchar;
  27      int transliterate;
  28      int discard_ilseq;
  29    Jumps to 'invalid' in case of error.
  30  */
  31 {
  32   char buf[MAX_WORD_LENGTH+10+1];
  33   const char* cp;
  34   char* bp;
  35   const struct alias * ap;
  36   unsigned int count;
  37
  38   transliterate = 0;
  39   discard_ilseq = 0;
  40
  41   /* Before calling aliases_lookup, convert the input string to upper case,
  42    * and check whether it's entirely ASCII (we call gperf with option "-7"
  43    * to achieve a smaller table) and non-empty. If it's not entirely ASCII,
  44    * or if it's too long, it is not a valid encoding name.
  45    */
  46   for (to_wchar = 0;;) {
  47     /* Search tocode in the table. */
  48     for (cp = tocode, bp = buf, count = MAX_WORD_LENGTH+10+1; ; cp++, bp++) {
  49       unsigned char c = (unsigned char) *cp;
  50       if (c >= 0x80)
  51         goto invalid;
  52       if (c >= 'a' && c <= 'z')
  53         c -= 'a'-'A';
  54       *bp = c;
  55       if (c == '\0')
  56         break;
  57       if (--count == 0)
  58         goto invalid;
  59     }
  60     for (;;) {
  61       if (bp-buf >= 10 && memcmp(bp-10,"//TRANSLIT",10)==0) {
  62         bp -= 10;
  63         *bp = '\0';
  64         transliterate = 1;
  65         continue;
  66       }
  67       if (bp-buf >= 8 && memcmp(bp-8,"//IGNORE",8)==0) {
  68         bp -= 8;
  69         *bp = '\0';
  70         discard_ilseq = 1;
  71         continue;
  72       }
  73       break;
  74     }
  75     if (buf[0] == '\0') {
  76       tocode = locale_charset();
  77       /* Avoid an endless loop that could occur when using an older version
  78          of localcharset.c. */
  79       if (tocode[0] == '\0')
  80         goto invalid;
  81       continue;
  82     }
  83     ap = aliases_lookup(buf,bp-buf);
  84     if (ap == NULL) {
  85       ap = aliases2_lookup(buf);
  86       if (ap == NULL)
  87         goto invalid;
  88     }
  89     if (ap->encoding_index == ei_local_char) {
  90       tocode = locale_charset();
  91       /* Avoid an endless loop that could occur when using an older version
  92          of localcharset.c. */
  93       if (tocode[0] == '\0')
  94         goto invalid;
  95       continue;
  96     }
  97     if (ap->encoding_index == ei_local_wchar_t) {
  98       /* On systems which define __STDC_ISO_10646__, wchar_t is Unicode.
  99          This is also the case on native Woe32 systems and Cygwin >= 1.7, where
 100          we know that it is UTF-16.  */
 101 #if (defined _WIN32 && !defined __CYGWIN__) || (defined __CYGWIN__ && CYGWIN_VERSION_DLL_MAJOR >= 1007)
 102       if (sizeof(wchar_t) == 4) {
 103         to_index = ei_ucs4internal;
 104         break;
 105       }
 106       if (sizeof(wchar_t) == 2) {
 107 # if WORDS_LITTLEENDIAN
 108         to_index = ei_utf16le;
 109 # else
 110         to_index = ei_utf16be;
 111 # endif
 112         break;
 113       }
 114 #elif __STDC_ISO_10646__
 115       if (sizeof(wchar_t) == 4) {
 116         to_index = ei_ucs4internal;
 117         break;
 118       }
 119       if (sizeof(wchar_t) == 2) {
 120         to_index = ei_ucs2internal;
 121         break;
 122       }
 123       if (sizeof(wchar_t) == 1) {
 124         to_index = ei_iso8859_1;
 125         break;
 126       }
 127 #endif
 128 #if HAVE_MBRTOWC
 129       to_wchar = 1;
 130       tocode = locale_charset();
 131       continue;
 132 #endif
 133       goto invalid;
 134     }
 135     to_index = ap->encoding_index;
 136     break;
 137   }
 138   for (from_wchar = 0;;) {
 139     /* Search fromcode in the table. */
 140     for (cp = fromcode, bp = buf, count = MAX_WORD_LENGTH+10+1; ; cp++, bp++) {
 141       unsigned char c = (unsigned char) *cp;
 142       if (c >= 0x80)
 143         goto invalid;
 144       if (c >= 'a' && c <= 'z')
 145         c -= 'a'-'A';
 146       *bp = c;
 147       if (c == '\0')
 148         break;
 149       if (--count == 0)
 150         goto invalid;
 151     }
 152     for (;;) {
 153       if (bp-buf >= 10 && memcmp(bp-10,"//TRANSLIT",10)==0) {
 154         bp -= 10;
 155         *bp = '\0';
 156         continue;
 157       }
 158       if (bp-buf >= 8 && memcmp(bp-8,"//IGNORE",8)==0) {
 159         bp -= 8;
 160         *bp = '\0';
 161         continue;
 162       }
 163       break;
 164     }
 165     if (buf[0] == '\0') {
 166       fromcode = locale_charset();
 167       /* Avoid an endless loop that could occur when using an older version
 168          of localcharset.c. */
 169       if (fromcode[0] == '\0')
 170         goto invalid;
 171       continue;
 172     }
 173     ap = aliases_lookup(buf,bp-buf);
 174     if (ap == NULL) {
 175       ap = aliases2_lookup(buf);
 176       if (ap == NULL)
 177         goto invalid;
 178     }
 179     if (ap->encoding_index == ei_local_char) {
 180       fromcode = locale_charset();
 181       /* Avoid an endless loop that could occur when using an older version
 182          of localcharset.c. */
 183       if (fromcode[0] == '\0')
 184         goto invalid;
 185       continue;
 186     }
 187     if (ap->encoding_index == ei_local_wchar_t) {
 188       /* On systems which define __STDC_ISO_10646__, wchar_t is Unicode.
 189          This is also the case on native Woe32 systems and Cygwin >= 1.7, where
 190          we know that it is UTF-16.  */
 191 #if (defined _WIN32 && !defined __CYGWIN__) || (defined __CYGWIN__ && CYGWIN_VERSION_DLL_MAJOR >= 1007)
 192       if (sizeof(wchar_t) == 4) {
 193         from_index = ei_ucs4internal;
 194         break;
 195       }
 196       if (sizeof(wchar_t) == 2) {
 197 # if WORDS_LITTLEENDIAN
 198         from_index = ei_utf16le;
 199 # else
 200         from_index = ei_utf16be;
 201 # endif
 202         break;
 203       }
 204 #elif __STDC_ISO_10646__
 205       if (sizeof(wchar_t) == 4) {
 206         from_index = ei_ucs4internal;
 207         break;
 208       }
 209       if (sizeof(wchar_t) == 2) {
 210         from_index = ei_ucs2internal;
 211         break;
 212       }
 213       if (sizeof(wchar_t) == 1) {
 214         from_index = ei_iso8859_1;
 215         break;
 216       }
 217 #endif
 218 #if HAVE_WCRTOMB
 219       from_wchar = 1;
 220       fromcode = locale_charset();
 221       continue;
 222 #endif
 223       goto invalid;
 224     }
 225     from_index = ap->encoding_index;
 226     break;
 227   }
 228 }