Remove product literal strings in "pht()", part 5
[phabricator.git] / externals / figlet / utf8.c
blobb8338c1ea8fd7af4a358825a54c9ac3f88f9f96e
1 #ifdef TLF_FONTS
2 /*
3 * Copyright (c) 2007 Alexey Vatchenko <av@bsdua.org>
5 * Permission to use, copy, modify, and/or distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 #include <sys/types.h>
19 #include <wchar.h>
20 #include <arpa/inet.h> /* for htonl() */
22 #include "utf8.h"
/* Marker carried in the two top bits of every continuation byte (10xxxxxx). */
#define _NXT	0x80
/* Lead-byte prefixes announcing a 2-, 3-, 4-, 5- or 6-byte sequence. */
#define _SEQ2	0xc0
#define _SEQ3	0xe0
#define _SEQ4	0xf0
#define _SEQ5	0xf8
#define _SEQ6	0xfc

/* Byte-order mark; the converters drop it when UTF8_SKIP_BOM is set. */
#define _BOM	0xfeff
/*
 * File-local validity predicates.  Both return 0 for an acceptable value
 * and -1 for a forbidden one.
 */
static int	__wchar_forbitten(wchar_t sym);
static int	__utf8_forbitten(u_char octet);
/*
 * Tell whether a wide character may be converted.
 *
 * Returns -1 when sym lies in the surrogate range 0xd800..0xdfff and
 * 0 for every other value.
 */
static int
__wchar_forbitten(wchar_t sym)
{

	/* Surrogate pairs */
	if (sym >= 0xd800 && sym <= 0xdfff)
		return (-1);

	return (0);
}
/*
 * Tell whether an octet is rejected outright as the first byte of a
 * UTF-8 sequence.
 *
 * Returns -1 for 0xc0, 0xc1, 0xf5 and 0xff, and 0 for every other value.
 */
static int
__utf8_forbitten(u_char octet)
{

	switch (octet) {
	case 0xc0:
	case 0xc1:
	case 0xf5:
	case 0xff:
		return (-1);
	default:
		break;
	}

	return (0);
}
63 * DESCRIPTION
64 * This function translates UTF-8 string into UCS-4 string (all symbols
65 * will be in local machine byte order).
67 * It takes the following arguments:
68 * in - input UTF-8 string. It can be null-terminated.
69 * insize - size of input string in bytes.
70 * out - result buffer for UCS-4 string. If out is NULL,
71 * function returns size of result buffer.
72 * outsize - size of out buffer in wide characters.
74 * RETURN VALUES
75 * The function returns size of result buffer (in wide characters).
76 * Zero is returned in case of error.
78 * CAVEATS
79 * 1. If UTF-8 string contains zero symbols, they will be translated
80 * as regular symbols.
81 * 2. If UTF8_IGNORE_ERROR or UTF8_SKIP_BOM flag is set, sizes may vary
82 * when `out' is NULL and not NULL. It's because of special UTF-8
83 * sequences which may result in forbitten (by RFC3629) UNICODE
84 * characters. So, the caller must check return value every time and
85 * not prepare buffer in advance (\0 terminate) but after calling this
86 * function.
88 size_t
89 utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize,
90 int flags)
92 u_char *p, *lim;
93 wchar_t *wlim, high;
94 size_t n, total, i, n_bits;
96 if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
97 return (0);
99 total = 0;
100 p = (u_char *)in;
101 lim = p + insize;
102 wlim = out + outsize;
104 for (; p < lim; p += n) {
105 if (__utf8_forbitten(*p) != 0 &&
106 (flags & UTF8_IGNORE_ERROR) == 0)
107 return (0);
110 * Get number of bytes for one wide character.
112 n = 1; /* default: 1 byte. Used when skipping bytes. */
113 if ((*p & 0x80) == 0)
114 high = (wchar_t)*p;
115 else if ((*p & 0xe0) == _SEQ2) {
116 n = 2;
117 high = (wchar_t)(*p & 0x1f);
118 } else if ((*p & 0xf0) == _SEQ3) {
119 n = 3;
120 high = (wchar_t)(*p & 0x0f);
121 } else if ((*p & 0xf8) == _SEQ4) {
122 n = 4;
123 high = (wchar_t)(*p & 0x07);
124 } else if ((*p & 0xfc) == _SEQ5) {
125 n = 5;
126 high = (wchar_t)(*p & 0x03);
127 } else if ((*p & 0xfe) == _SEQ6) {
128 n = 6;
129 high = (wchar_t)(*p & 0x01);
130 } else {
131 if ((flags & UTF8_IGNORE_ERROR) == 0)
132 return (0);
133 continue;
136 /* does the sequence header tell us truth about length? */
137 if (lim - p <= n - 1) {
138 if ((flags & UTF8_IGNORE_ERROR) == 0)
139 return (0);
140 n = 1;
141 continue; /* skip */
145 * Validate sequence.
146 * All symbols must have higher bits set to 10xxxxxx
148 if (n > 1) {
149 for (i = 1; i < n; i++) {
150 if ((p[i] & 0xc0) != _NXT)
151 break;
153 if (i != n) {
154 if ((flags & UTF8_IGNORE_ERROR) == 0)
155 return (0);
156 n = 1;
157 continue; /* skip */
161 total++;
163 if (out == NULL)
164 continue;
166 if (out >= wlim)
167 return (0); /* no space left */
169 *out = 0;
170 n_bits = 0;
171 for (i = 1; i < n; i++) {
172 *out |= (wchar_t)(p[n - i] & 0x3f) << n_bits;
173 n_bits += 6; /* 6 low bits in every byte */
175 *out |= high << n_bits;
177 if (*out == 0) /* return at end of string */
178 break;
180 if (__wchar_forbitten(*out) != 0) {
181 if ((flags & UTF8_IGNORE_ERROR) == 0)
182 return (0); /* forbitten character */
183 else {
184 total--;
185 out--;
187 } else if (*out == _BOM && (flags & UTF8_SKIP_BOM) != 0) {
188 total--;
189 out--;
192 out++;
195 return (total);
199 * DESCRIPTION
200 * This function translates UCS-4 symbols (given in local machine
201 * byte order) into UTF-8 string.
203 * It takes the following arguments:
204 * in - input unicode string. It can be null-terminated.
205 * insize - size of input string in wide characters.
206 * out - result buffer for utf8 string. If out is NULL,
207 * function returns size of result buffer.
208 * outsize - size of result buffer.
210 * RETURN VALUES
211 * The function returns size of result buffer (in bytes). Zero is returned
212 * in case of error.
214 * CAVEATS
215 * If UCS-4 string contains zero symbols, they will be translated
216 * as regular symbols.
218 size_t
219 wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize,
220 int flags)
222 wchar_t *w, *wlim, ch;
223 u_char *p, *lim, *oc;
224 size_t total, n;
226 if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
227 return (0);
229 w = (wchar_t *)in;
230 wlim = w + insize;
231 p = (u_char *)out;
232 lim = p + outsize;
233 total = 0;
234 for (; w < wlim; w++) {
235 if (__wchar_forbitten(*w) != 0) {
236 if ((flags & UTF8_IGNORE_ERROR) == 0)
237 return (0);
238 else
239 continue;
242 if (*w == _BOM && (flags & UTF8_SKIP_BOM) != 0)
243 continue;
245 if (*w < 0) {
246 if ((flags & UTF8_IGNORE_ERROR) == 0)
247 return (0);
248 continue;
249 } else if (*w <= 0x0000007f)
250 n = 1;
251 else if (*w <= 0x000007ff)
252 n = 2;
253 else if (*w <= 0x0000ffff)
254 n = 3;
255 else if (*w <= 0x001fffff)
256 n = 4;
257 else if (*w <= 0x03ffffff)
258 n = 5;
259 else /* if (*w <= 0x7fffffff) */
260 n = 6;
262 total += n;
264 if (out == NULL)
265 continue;
267 if (lim - p <= n - 1)
268 return (0); /* no space left */
270 /* make it work under different endians */
271 ch = htonl(*w);
272 oc = (u_char *)&ch;
273 switch (n) {
274 case 1:
275 *p = oc[3];
276 break;
278 case 2:
279 p[1] = _NXT | (oc[3] & 0x3f);
280 p[0] = _SEQ2 | (oc[3] >> 6) | ((oc[2] & 0x07) << 2);
281 break;
283 case 3:
284 p[2] = _NXT | (oc[3] & 0x3f);
285 p[1] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
286 p[0] = _SEQ3 | ((oc[2] & 0xf0) >> 4);
287 break;
289 case 4:
290 p[3] = _NXT | (oc[3] & 0x3f);
291 p[2] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
292 p[1] = _NXT | ((oc[2] & 0xf0) >> 4) |
293 ((oc[1] & 0x03) << 4);
294 p[0] = _SEQ4 | ((oc[1] & 0x1f) >> 2);
295 break;
297 case 5:
298 p[4] = _NXT | (oc[3] & 0x3f);
299 p[3] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
300 p[2] = _NXT | ((oc[2] & 0xf0) >> 4) |
301 ((oc[1] & 0x03) << 4);
302 p[1] = _NXT | (oc[1] >> 2);
303 p[0] = _SEQ5 | (oc[0] & 0x03);
304 break;
306 case 6:
307 p[5] = _NXT | (oc[3] & 0x3f);
308 p[4] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
309 p[3] = _NXT | (oc[2] >> 4) | ((oc[1] & 0x03) << 4);
310 p[2] = _NXT | (oc[1] >> 2);
311 p[1] = _NXT | (oc[0] & 0x3f);
312 p[0] = _SEQ6 | ((oc[0] & 0x40) >> 6);
313 break;
317 * NOTE: do not check here for forbitten UTF-8 characters.
318 * They cannot appear here because we do proper convertion.
321 p += n;
324 return (total);
326 #endif /* TLF_FONTS */