sys/fs/unicode.h

   1 /* $NetBSD: unicode.h,v 1.7 2014/04/06 19:25:22 jakllsch Exp $ */
   2
   3 /*-
   4  * Copyright (c) 2001, 2004 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  26  * POSSIBILITY OF SUCH DAMAGE.
  27  */
  28
  29 /*-
  30  * Copyright (c) 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * This code is derived from software contributed to Berkeley by
  34  * Paul Borman at Krystal Technologies.
  35  *
  36  * Redistribution and use in source and binary forms, with or without
  37  * modification, are permitted provided that the following conditions
  38  * are met:
  39  * 1. Redistributions of source code must retain the above copyright
  40  *    notice, this list of conditions and the following disclaimer.
  41  * 2. Redistributions in binary form must reproduce the above copyright
  42  *    notice, this list of conditions and the following disclaimer in the
  43  *    documentation and/or other materials provided with the distribution.
  44  * 3. All advertising materials mentioning features or use of this software
  45  *    must display the following acknowledgement:
  46  *      This product includes software developed by the University of
  47  *      California, Berkeley and its contributors.
  48  * 4. Neither the name of the University nor the names of its contributors
  49  *    may be used to endorse or promote products derived from this software
  50  *    without specific prior written permission.
  51  *
  52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  62  * SUCH DAMAGE.
  63  */
  64
/*
 * Routines for handling Unicode encoded in UTF-8 form, code derived from
 * src/lib/libc/locale/utf2.c.
 *
 * Both routines work on a single 16-bit value (u_int16_t) at a time:
 * wget_utf8() decodes one from a byte string, wput_utf8() encodes one
 * into a byte buffer.  They are static functions defined in this file;
 * the prototypes carry __unused, presumably so that a file which uses
 * only one of them still builds without unused-function warnings.
 */
static u_int16_t wget_utf8(const char **, size_t *) __unused;
static int wput_utf8(char *, size_t, u_int16_t) __unused;
  71
  72 /*
  73  * Read one UTF8-encoded character off the string, shift the string pointer
  74  * and return the character.
  75  */
  76 static u_int16_t
  77 wget_utf8(const char **str, size_t *sz)
  78 {
  79         size_t c;
  80         u_int16_t rune = 0;
  81         const char *s = *str;
  82         static const int _utf_count[16] = {
  83                 1, 1, 1, 1, 1, 1, 1, 1,
  84                 0, 0, 0, 0, 2, 2, 3, 0,
  85         };
  86
  87         /* must be called with at least one byte remaining */
  88         KASSERT(*sz > 0);
  89
  90         c = _utf_count[(s[0] & 0xf0) >> 4];
  91         if (c == 0 || c > *sz) {
  92     decoding_error:
  93                 /*
  94                  * The first character is in range 128-255 and doesn't
  95                  * mark valid a valid UTF-8 sequence. There is not much
  96                  * we can do with this, so handle by returning
  97                  * the first character as if it would be a correctly
  98                  * encoded ISO-8859-1 character.
  99                  */
 100                 c = 1;
 101         }
 102
 103         switch (c) {
 104         case 1:
 105                 rune = s[0] & 0xff;
 106                 break;
 107         case 2:
 108                 if ((s[1] & 0xc0) != 0x80)
 109                         goto decoding_error;
 110                 rune = ((s[0] & 0x1F) << 6) | (s[1] & 0x3F);
 111                 break;
 112         case 3:
 113                 if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80)
 114                         goto decoding_error;
 115                 rune = ((s[0] & 0x0F) << 12) | ((s[1] & 0x3F) << 6)
 116                     | (s[2] & 0x3F);
 117                 break;
 118         }
 119
 120         *str += c;
 121         *sz -= c;
 122         return rune;
 123 }
 124
 125 /*
 126  * Encode wide character and write it to the string. 'n' specifies
 127  * how much buffer space remains in 's'. Returns number of bytes written
 128  * to the target string 's'.
 129  */
 130 static int
 131 wput_utf8(char *s, size_t n, u_int16_t wc)
 132 {
 133         if (wc & 0xf800) {
 134                 if (n < 3) {
 135                         /* bound check failure */
 136                         return 0;
 137                 }
 138
 139                 s[0] = 0xE0 | (wc >> 12);
 140                 s[1] = 0x80 | ((wc >> 6) & 0x3F);
 141                 s[2] = 0x80 | ((wc) & 0x3F);
 142                 return 3;
 143         } else if (wc & 0x0780) {
 144                 if (n < 2) {
 145                         /* bound check failure */
 146                         return 0;
 147                 }
 148
 149                 s[0] = 0xC0 | (wc >> 6);
 150                 s[1] = 0x80 | ((wc) & 0x3F);
 151                 return 2;
 152         } else {
 153                 if (n < 1) {
 154                         /* bound check failure */
 155                         return 0;
 156                 }
 157
 158                 s[0] = wc;
 159                 return 1;
 160         }
 161 }