4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
25 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
29 * Multibyte/wide-char conversion routines. Wide-char encoding provides
30 * a fixed size character encoding that maps to the Unicode 16-bit
31 * (UCS-2) character set standard. Multibyte or UCS transformation
32 * format (UTF) encoding is a variable length character encoding scheme
33 * that is compatible with existing ASCII characters and guarantees that
34 * the resultant strings do not contain embedded null characters. Both
35 * types of encoding provide a null terminator: single byte for UTF-8
36 * and a wide-char null for Unicode. See RFC 2044.
38 * The table below illustrates the UTF-8 encoding scheme. The letter x
39 * indicates bits available for encoding the character value.
41 * UCS-2            UTF-8 octet sequence (binary)
42 * 0x0000-0x007F    0xxxxxxx
43 * 0x0080-0x07FF    110xxxxx 10xxxxxx
44 * 0x0800-0xFFFF    1110xxxx 10xxxxxx 10xxxxxx
47 * UTF-8, a transformation format of Unicode and ISO 10646
54 #include <sys/types.h>
55 #include <sys/sunddi.h>
62 #include <smbsrv/string.h>
68 * The mbstowcs() function converts a multibyte character string
69 * mbstring into a wide character string wcstring. No more than
70 * nwchars wide characters are stored. A terminating null wide
71 * character is appended if there is room.
73 * Returns the number of wide characters converted, not counting
74 * any terminating null wide character. Returns -1 if an invalid
75 * multibyte character is encountered.
78 smb_mbstowcs(smb_wchar_t
*wcstring
, const char *mbstring
, size_t nwchars
)
81 smb_wchar_t
*start
= wcstring
;
84 len
= smb_mbtowc(wcstring
, mbstring
, MTS_MB_CHAR_MAX
);
97 return (wcstring
- start
);
104 * The mbtowc() function converts a multibyte character mbchar into
105 * a wide character and stores the result in the object pointed to
106 * by wcharp. Up to nbytes bytes are examined.
108 * If mbchar is NULL, mbtowc() returns zero to indicate that shift
109 * states are not supported. Shift states are used to switch between
110 * representation modes using reserved bytes to signal shifting
111 * without them being interpreted as characters. If mbchar is null,
112 * mbtowc should return non-zero if the current locale requires shift
113 * states; otherwise it should return 0.
115 * If mbchar is non-null, returns the number of bytes processed in
116 * mbchar. If mbchar is invalid, returns -1.
119 smb_mbtowc(smb_wchar_t
*wcharp
, const char *mbchar
, size_t nbytes
)
122 smb_wchar_t wide_char
;
127 return (0); /* no shift states */
129 /* 0xxxxxxx -> 1 byte ASCII encoding */
130 if (((mbyte
= *mbchar
++) & 0x80) == 0) {
132 *wcharp
= (smb_wchar_t
)mbyte
;
134 return (mbyte
? 1 : 0);
137 /* 10xxxxxx -> invalid first byte */
138 if ((mbyte
& 0x40) == 0)
142 if ((mbyte
& 0x20) == 0) {
145 } else if ((mbyte
& 0x10) == 0) {
153 while (bytes_left
--) {
154 if (((mbyte
= *mbchar
++) & 0xc0) != 0x80)
158 wide_char
= (wide_char
<< 6) | (mbyte
& 0x3f);
171 * The wctomb() function converts a wide character wchar into a multibyte
172 * character and stores the result in mbchar. The object pointed to by
173 * mbchar must be large enough to accommodate the multibyte character.
175 * Returns the number of bytes written to mbchar.
178 smb_wctomb(char *mbchar
, smb_wchar_t wchar
)
180 if ((wchar
& ~0x7f) == 0) {
181 *mbchar
= (char)wchar
;
185 if ((wchar
& ~0x7ff) == 0) {
186 *mbchar
++ = (wchar
>> 6) | 0xc0;
187 *mbchar
= (wchar
& 0x3f) | 0x80;
191 *mbchar
++ = (wchar
>> 12) | 0xe0;
192 *mbchar
++ = ((wchar
>> 6) & 0x3f) | 0x80;
193 *mbchar
= (wchar
& 0x3f) | 0x80;
201 * The wcstombs() function converts a wide character string wcstring
202 * into a multibyte character string mbstring. Up to nbytes bytes are
203 * stored in mbstring. Partial multibyte characters at the end of the
204 * string are not stored. The multibyte character string is null
205 * terminated if there is room.
207 * Returns the number of bytes converted, not counting the terminating
207 * null byte.
211 smb_wcstombs(char *mbstring
, const smb_wchar_t
*wcstring
, size_t nbytes
)
213 char *start
= mbstring
;
214 const smb_wchar_t
*wcp
= wcstring
;
215 smb_wchar_t wide_char
= 0;
219 if ((mbstring
== NULL
) || (wcstring
== NULL
))
222 while (nbytes
> MTS_MB_CHAR_MAX
) {
224 len
= smb_wctomb(mbstring
, wide_char
);
227 /*LINTED E_PTRDIFF_OVERFLOW*/
228 return (mbstring
- start
);
234 while (wide_char
&& nbytes
) {
236 if ((len
= smb_wctomb(buf
, wide_char
)) > nbytes
) {
241 bcopy(buf
, mbstring
, len
);
246 /*LINTED E_PTRDIFF_OVERFLOW*/
247 return (mbstring
- start
);
252 * Returns the number of bytes that would be written if the multi-
253 * byte string mbs was converted to a wide character string, not
254 * counting the terminating null wide character.
257 smb_wcequiv_strlen(const char *mbs
)
259 smb_wchar_t wide_char
;
264 bytes
= smb_mbtowc(&wide_char
, mbs
, MTS_MB_CHAR_MAX
);
265 if (bytes
== ((size_t)-1))
268 len
+= sizeof (smb_wchar_t
);
277 * Returns the number of bytes that would be written if the multi-
278 * byte string mbs was converted to a single byte character string,
279 * not counting the terminating null character.
282 smb_sbequiv_strlen(const char *mbs
)
284 smb_wchar_t wide_char
;
289 nbytes
= smb_mbtowc(&wide_char
, mbs
, MTS_MB_CHAR_MAX
);
290 if (nbytes
== ((size_t)-1))
293 if (wide_char
& 0xFF00)
294 len
+= sizeof (smb_wchar_t
);
308 * Convert a regular null terminated string 'string' to a UTF-8 encoded
309 * null terminated multi-byte string 'mbstring'. Only fully converted
310 * UTF-8 characters will be written to 'mbstring'. If a character will not
311 * fit within the remaining buffer space or 'mbstring' will overflow
312 * max_mblen, the conversion process will be terminated and 'mbstring'
313 * will be null terminated.
315 * Returns the number of bytes written to 'mbstring', excluding the
316 * terminating null character.
318 * If either mbstring or string is a null pointer, -1 is returned.
321 smb_stombs(char *mbstring
, char *string
, int max_mblen
)
323 char *start
= mbstring
;
324 unsigned char *p
= (unsigned char *)string
;
325 int space_left
= max_mblen
;
327 smb_wchar_t wide_char
;
330 if (!mbstring
|| !string
)
333 while (*p
&& space_left
> 2) {
335 len
= smb_wctomb(mbstring
, wide_char
);
342 if ((len
= smb_wctomb(buf
, wide_char
)) < 2) {
351 /*LINTED E_PTRDIFF_OVERFLOW*/
352 return (mbstring
- start
);
359 * Convert a null terminated multi-byte string 'mbstring' to a regular
360 * null terminated string 'string'. A 1-byte character in 'mbstring'
361 * maps to a 1-byte character in 'string'. A 2-byte character in
362 * 'mbstring' will be mapped to 2 bytes, if the upper byte is non-null.
363 * Otherwise the upper byte null will be discarded to ensure that the
364 * output stream does not contain embedded null characters.
366 * If the input stream contains invalid multi-byte characters, a value
367 * of -1 will be returned. Otherwise the length of 'string', excluding
368 * the terminating null character, is returned.
370 * If either mbstring or string is a null pointer, -1 is returned.
373 smb_mbstos(char *string
, const char *mbstring
)
376 unsigned char *start
= (unsigned char *)string
;
379 if (string
== NULL
|| mbstring
== NULL
)
383 if ((len
= smb_mbtowc(&wc
, mbstring
, MTS_MB_CHAR_MAX
)) < 0) {
389 /*LINTED E_BAD_PTR_CAST_ALIGN*/
390 *((smb_wchar_t
*)string
) = wc
;
391 string
+= sizeof (smb_wchar_t
);
395 *string
= (unsigned char)wc
;
404 /*LINTED E_PTRDIFF_OVERFLOW*/
405 return ((unsigned char *)string
- start
);