usr/src/uts/common/kiconv/kiconv_sc/kiconv_cck_common.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 #pragma ident   "%Z%%M% %I%     %E% SMI"
  27
  28 #include <sys/types.h>
  29 #include <sys/param.h>
  30 #include <sys/sysmacros.h>
  31 #include <sys/systm.h>
  32 #include <sys/debug.h>
  33 #include <sys/kmem.h>
  34 #include <sys/sunddi.h>
  35 #include <sys/byteorder.h>
  36 #include <sys/errno.h>
  37 #include <sys/u8_textprep.h>
  38 #include <sys/kiconv.h>
  39 #include <sys/kiconv_cck_common.h>
  40
  41 /*LINTLIBRARY*/
  42
  43 /*
  44  * Common kiconv_open method for UTF-8 -> CCK conversion.
  45  */
  46 void *
  47 kiconv_open_to_cck()
  48 {
  49         kiconv_state_t st;
  50
  51         st = (kiconv_state_t)kmem_alloc(sizeof (kiconv_state_data_t), KM_SLEEP);
  52
  53         st->bom_processed = 0;
  54
  55         return ((void *)st);
  56 }
  57
  58 /*
  59  * Common kiconv_close method for UTF-8 -> CCK conversion.
  60  */
  61 int
  62 kiconv_close_to_cck(void *kcd)
  63 {
  64         if (! kcd || kcd == (void *)-1)
  65                 return (EBADF);
  66
  67         kmem_free(kcd, sizeof (kiconv_state_data_t));
  68
  69         return (0);
  70 }
  71
  72 /*
  73  * Common routine to convert UTF-8 sequence to CCK legal character sequence.
      *
      * kcd is the conversion descriptor; NULL or (void *)-1 is rejected by
      * storing EBADF through errno and returning (size_t)-1.  Note that errno
      * here is the caller-supplied int * parameter.
      *
      * A NULL inbuf or a NULL *inbuf is a state reset request: bom_processed
      * in the descriptor is cleared and 0 is returned without touching the
      * other arguments.
      *
      * Otherwise input is consumed from *inbuf (*inbytesleft bytes) and output
      * is written to *outbuf (*outbytesleft bytes of room).  Characters whose
      * u8_number_of_bytes entry is 1 are copied unchanged.  For longer
      * characters the bytes are packed into a uint32_t, lead byte in the most
      * significant position, and passed to ptr_utf8tocck together with the
      * input/output cursors and the address of the return value.
      *
      * On every path past the descriptor and reset checks, *inbuf,
      * *inbytesleft, *outbuf and *outbytesleft are updated to the final
      * cursor positions.  When an error stops the conversion, the input
      * cursor is left at the start of the offending character.
      *
      * The return value starts at 0, may be modified by ptr_utf8tocck through
      * the pointer it is given, and is (size_t)-1 on error.
      *
      * NOTE(review): the KICONV_* macros used below are defined outside this
      * file.  Comments about what they do are inferred from their names and
      * call sites and should be confirmed against the header.
  74  */
  75 size_t
  76 kiconv_utf8_to_cck(void *kcd, char **inbuf, size_t *inbytesleft,
  77         char **outbuf, size_t *outbytesleft, int *errno,
  78         kiconv_utf8tocck_t ptr_utf8tocck)
  79 {
  80         uchar_t         *ib;
  81         uchar_t         *ob;
  82         uchar_t         *ibtail;
  83         uchar_t         *obtail;
  84         uchar_t         *oldib;
  85         size_t          ret_val;
  86         size_t          i;              /* temp variable in for loop */
  87         uint32_t        u8;
  88         int8_t          sz;
  89
  90         /* Check on the kiconv code conversion descriptor. */
  91         if (! kcd || kcd == (void *)-1) {
  92                 *errno = EBADF;
  93                 return ((size_t)-1);
  94         }
  95
  96         /* If this is a state reset request, process and return. */
  97         if (! inbuf || !(*inbuf)) {
  98                 ((kiconv_state_t)kcd)->bom_processed = 0;
  99                 return (0);
 100         }
 101
             /*
              * Work on unsigned byte cursors; ibtail/obtail point one past the
              * end of the input and output buffers respectively.
              */
 102         ret_val = 0;
 103         ib = (uchar_t *)*inbuf;
 104         ob = (uchar_t *)*outbuf;
 105         ibtail = ib + *inbytesleft;
 106         obtail = ob + *outbytesleft;
 107
             /*
              * NOTE(review): presumably skips a leading UTF-8 BOM and records
              * that in the descriptor (cf. bom_processed above) -- confirm
              * against the macro definition.
              */
 108         KICONV_CHECK_UTF8_BOM(ib, ibtail);
 109
 110         while (ib < ibtail) {
                     /* Length class of the character, looked up by lead byte. */
 111                 sz = u8_number_of_bytes[*ib];
 112
 113                 /*
 114                  * If it is a 7-bit ASCII character, we don't need to
 115                  * process further and we just copy the character over.
 116                  *
 117                  * If not, we connect the character bytes up to four bytes,
 118                  * validate the bytes, and binary search for the corresponding
 119                  * table. If we find it from the mapping table, we put that
 120                  * into the output buffer; otherwise, we put a replacement
 121                  * character instead as a non-identical conversion.
 122                  */
 123                 if (sz == 1) {
 124                         if (ob >= obtail) {
                                     /*
                                      * NOTE(review): presumably stores E2BIG via
                                      * errno, sets ret_val to (size_t)-1 and
                                      * breaks out of the while loop.
                                      */
 125                                 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
 126                         }
 127
 128                         *ob++ = *ib++;
 129                         continue;
 130                 }
 131
 132                 /*
 133                  * Issue EILSEQ error if the first byte is an
 134                  * invalid UTF-8 character leading byte.
 135                  */
 136                 if (sz <= 0) {
 137                         KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
 138                 }
 139
 140                 /*
 141                  * Issue EINVAL error if input buffer has an incomplete
 142                  * character at the end of the buffer.
 143                  */
 144                 if (ibtail - ib < sz) {
 145                         KICONV_SET_ERRNO_AND_BREAK(EINVAL);
 146                 }
 147
 148                 /*
 149                  * We collect UTF-8 character bytes and also check if this
 150                  * is a valid UTF-8 character without any bogus bytes based
 151                  * on the latest UTF-8 binary representation.
 152                  */
                     /* Remember the character start so errors can rewind to it. */
 153                 oldib = ib;
 154                 u8 = *ib++;
 155
                     /*
                      * sz is at least 2 here and that many bytes are available,
                      * so reading the second byte at *ib is within the input.
                      */
 156                 if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib, u8))
 157                         goto ILLEGAL_CHAR_PROCESS;
 158                 u8 = (u8 << 8) | *ib++;
 159
                     /*
                      * Remaining continuation bytes must lie in 0x80..0xbf.
                      * i is size_t and sz is int8_t; sz is positive on this path.
                      */
 160                 for (i = 2; i < sz; i++) {
 161                         if (*ib < 0x80 || *ib > 0xbf) {
                                     /*
                                      * Also entered by the goto above when the
                                      * second byte is invalid.  Rewind to the
                                      * character start and leave through the
                                      * common exit with EILSEQ.
                                      */
 162 ILLEGAL_CHAR_PROCESS:
 163                                 *errno = EILSEQ;
 164                                 ret_val = (size_t)-1;
 165                                 ib = oldib;
 166                                 goto ILLEGAL_CHAR_ERR;
 167                         }
 168
 169                         u8 = (u8 << 8) | *ib++;
 170                 }
 171
 172                 /* Now we have a valid UTF-8 character. */
                     /*
                      * sz is reused for the callback result: a negative value is
                      * treated as "output does not fit", any other value is the
                      * amount by which the output cursor advances.  The callback
                      * receives &ib and &ret_val and so may update both.
                      */
 173                 sz = ptr_utf8tocck(u8, &ib, ibtail, ob, obtail, &ret_val);
 174                 if (sz < 0) {
 175                         ib = oldib;
 176                         KICONV_SET_ERRNO_AND_BREAK(E2BIG);
 177                 }
 178
 179                 ob += sz;
 180         }
 181
             /*
              * Common exit, reached by normal loop termination, by a break out
              * of the loop, or by the goto on an illegal character: publish the
              * final cursor positions and remaining byte counts to the caller.
              */
 182 ILLEGAL_CHAR_ERR:
 183         *inbuf = (char *)ib;
 184         *inbytesleft = ibtail - ib;
 185         *outbuf = (char *)ob;
 186         *outbytesleft = obtail - ob;
 187
 188         return (ret_val);
 189 }
 190
     /*
      * String variant of the UTF-8 -> CCK conversion: converts the *inlen
      * bytes at ib into ob, which has *outlen bytes of room, using
      * ptr_utf8tocck to emit each multi-byte character.  It takes no
      * conversion descriptor.
      *
      * flag bits examined here:
      *   KICONV_IGNORE_NULL     - when clear, conversion stops at the first
      *                            NUL byte, which is neither consumed nor
      *                            copied; when set, NUL is handled like any
      *                            other byte.
      *   KICONV_REPLACE_INVALID - when set, an incomplete trailing sequence
      *                            or a sequence with a bad second or
      *                            continuation byte is replaced by one
      *                            KICONV_ASCII_REPLACEMENT_CHAR and the
      *                            return value is incremented, instead of
      *                            failing.
      *
      * errno is the caller-supplied int * that receives the error code.
      *
      * On return *inlen holds the number of input bytes not consumed and
      * *outlen the number of output bytes not used.  ib and ob are passed by
      * value, so the caller's pointers are not advanced.
      *
      * The return value starts at 0, is incremented for every replacement
      * made here, may be modified by ptr_utf8tocck through the pointer it is
      * given, and is (size_t)-1 on error.
      *
      * NOTE(review): the KICONV_* macros used below are defined outside this
      * file.  Comments about what they do are inferred from their names and
      * call sites and should be confirmed against the header.
      */
 191 size_t
 192 kiconvstr_utf8_to_cck(uchar_t *ib, size_t *inlen, uchar_t *ob, size_t *outlen,
 193         int flag, int *errno, kiconv_utf8tocck_t ptr_utf8tocck)
 194 {
 195         uchar_t         *ibtail;
 196         uchar_t         *obtail;
 197         uchar_t         *oldib;
 198         size_t          ret_val;
 199         size_t          i;              /* temp variable in for loop */
 200         uint32_t        u8;
 201         int8_t          sz;
 202         boolean_t       do_not_ignore_null;
 203
 204         ret_val = 0;
             /* ibtail/obtail point one past the end of each buffer. */
 205         ibtail = ib + *inlen;
 206         obtail = ob + *outlen;
 207         do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
 208
             /*
              * NOTE(review): presumably skips a leading UTF-8 BOM without
              * keeping any per-descriptor state -- confirm against the macro.
              */
 209         KICONV_CHECK_UTF8_BOM_WITHOUT_STATE(ib, ibtail);
 210
 211         while (ib < ibtail) {
                     /* Treat NUL as end of string unless told to ignore it. */
 212                 if (*ib == '\0' && do_not_ignore_null)
 213                         break;
 214
                     /* Length class of the character, looked up by lead byte. */
 215                 sz = u8_number_of_bytes[*ib];
 216
                     /* Single-byte characters are copied through unchanged. */
 217                 if (sz == 1) {
 218                         if (ob >= obtail) {
                                     /*
                                      * NOTE(review): presumably stores E2BIG via
                                      * errno, sets ret_val to (size_t)-1 and
                                      * breaks out of the while loop.
                                      */
 219                                 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
 220                         }
 221
 222                         *ob++ = *ib++;
 223                         continue;
 224                 }
 225
                     /*
                      * Remember the character start.  It is needed by every path
                      * that rewinds the input, including REPLACE_INVALID below.
                      */
 226                 oldib = ib;
 227
                     /*
                      * Invalid lead byte.
                      * NOTE(review): presumably, with KICONV_REPLACE_INVALID set
                      * the macro skips one input byte and jumps to
                      * REPLACE_INVALID; otherwise it reports EILSEQ and breaks.
                      */
 228                 if (sz <= 0) {
 229                         KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
 230                 }
 231
                     /*
                      * Incomplete character at the end of the input.  In replace
                      * mode the whole remainder is consumed and stands for one
                      * replacement character.
                      */
 232                 if (ibtail - ib < sz) {
 233                         if (flag & KICONV_REPLACE_INVALID) {
 234                                 ib = ibtail;
 235                                 goto REPLACE_INVALID;
 236                         }
 237
 238                         KICONV_SET_ERRNO_AND_BREAK(EINVAL);
 239                 }
 240
 241                 u8 = *ib++;
 242
                     /*
                      * sz is at least 2 here and that many bytes are available,
                      * so reading the second byte at *ib is within the input.
                      */
 243                 if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib, u8))
 244                         goto ILLEGAL_CHAR_PROCESS;
 245                 u8 = (u8 << 8) | *ib++;
 246
                     /*
                      * Remaining continuation bytes must lie in 0x80..0xbf.
                      * i is size_t and sz is int8_t; sz is positive on this path.
                      */
 247                 for (i = 2; i < sz; i++) {
 248                         if (*ib < 0x80 || *ib > 0xbf) {
                                     /*
                                      * Also entered by the goto above when the
                                      * second byte is invalid.  In replace mode
                                      * the input skips the full sz bytes implied
                                      * by the lead byte; otherwise the input is
                                      * rewound and EILSEQ is reported.
                                      */
 249 ILLEGAL_CHAR_PROCESS:
 250                                 if (flag & KICONV_REPLACE_INVALID) {
 251                                         ib = oldib + sz;
 252                                         goto REPLACE_INVALID;
 253                                 }
 254
 255                                 *errno = EILSEQ;
 256                                 ret_val = (size_t)-1;
 257                                 ib = oldib;
 258                                 goto ILLEGAL_CHAR_ERR;
 259                         }
 260
 261                         u8 = (u8 << 8) | *ib++;
 262                 }
 263
 264                 /* Now we get a valid character encoded in UTF-8. */
                     /*
                      * sz is reused for the callback result: a negative value is
                      * treated as "output does not fit", any other value is the
                      * amount by which the output cursor advances.  The callback
                      * receives &ib and &ret_val and so may update both.
                      */
 265                 sz = ptr_utf8tocck(u8, &ib, ibtail, ob, obtail, &ret_val);
 266                 if (sz < 0) {
 267                         ib = oldib;
 268                         KICONV_SET_ERRNO_AND_BREAK(E2BIG);
 269                 }
 270
 271                 ob += sz;
                     /* Skip the replacement code; it is only reached by goto. */
 272                 continue;
 273
                     /*
                      * Emit one replacement character for the invalid input that
                      * has already been skipped.  If there is no room, undo the
                      * skip so the caller sees the offending character as
                      * unconsumed.
                      */
 274 REPLACE_INVALID:
 275                 if (ob >= obtail) {
 276                         ib = oldib;
 277                         KICONV_SET_ERRNO_AND_BREAK(E2BIG);
 278                 }
 279
 280                 *ob++ = KICONV_ASCII_REPLACEMENT_CHAR;
 281                 ret_val++;
 282         }
 283
             /*
              * Common exit, reached by normal loop termination, by a break out
              * of the loop, or by the goto on an illegal character: report the
              * unconsumed input and unused output byte counts.
              */
 284 ILLEGAL_CHAR_ERR:
 285         *inlen = ibtail - ib;
 286         *outlen = obtail - ob;
 287
 288         return (ret_val);
 289 }
 290
 291 /*
 292  * Search key in tbl[0] <= tbl[1] <= ... <= tbl[n-1].  Return 0 if not found.
 293  * tbl[0] is a special element for non-identical conversion.
 294  */
 295 size_t
 296 kiconv_binsearch(uint32_t key, void *tbl, size_t nitems)
 297 {
 298         size_t low, high, mid;
 299         kiconv_table_t *table;
 300
 301         low = 1;
 302         high = nitems - 1;
 303         table = (kiconv_table_t *)tbl;
 304
 305         while (low <= high) {
 306                 mid = (low + high) / 2;
 307
 308                 if (key < table[mid].key)
 309                         high = mid - 1;
 310                 else if (key > table[mid].key)
 311                         low = mid + 1;
 312                 else
 313                         return (mid);
 314         }
 315
 316         return (0);
 317 }