Fix missing pointer dereference and missing assignment.
[gwm.git] / utf8.c
blobd1eb4e03ca1056812d0800da4d9808871679d5bc
/*
 * utf8.c
 *
 * Part of gwm, the Gratuitous Window Manager,
 *     by Gary Wong, <gtw@gnu.org>.
 *
 * Copyright (C) 2009  Gary Wong
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of version 3 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * $Id$
 */
24 #include <config.h>
26 #include <assert.h>
27 #if HAVE_ICONV_H
28 #include <iconv.h>
29 #endif
30 #include <string.h>
31 #include <xcb/xcb.h>
33 #include "gwm.h"
35 #include "utf8.h"
37 #if HAVE_ICONV
38 iconv_t iso2022;
39 int tried_iso2022;
40 #endif
42 extern char *to_utf8( enum gwm_encoding encoding, const char *in,
43 size_t len ) {
45 size_t outlen;
46 char *out, *outp;
48 if( len < 0 )
49 len = strlen( in );
51 #if HAVE_ICONV
52 outlen = len << ( encoding == ENCODING_COMPOUND ? 2 : 1 );
53 #else
54 outlen = len << 1;
55 #endif
57 outp = out = xmalloc( outlen + 1 );
59 #if HAVE_ICONV
60 if( encoding == ENCODING_COMPOUND ) {
61 if( !tried_iso2022 ) {
62 iso2022 = iconv_open( "UTF-8", "ISO-2022-JP-2" );
63 tried_iso2022 = TRUE;
66 if( iso2022 != (iconv_t) -1 ) {
67 static const char resetseq[ 3 ] = "\x1B\x2D\x41";
68 const char *inp;
69 size_t resetlen = 3;
71 /* Reset the decoder to the Compound Text initial state
72 (G1 = ASCII, G3 = ISO 8859-1). */
73 inp = resetseq;
74 iconv( iso2022, NULL, NULL, NULL, NULL );
75 /* Bah. Several old implementations of iconv() declared
76 the inbuf parameter as (const char **), but SUS says
77 it's simply (char **). We cast the thing to (void *),
78 which will keep them both happy. */
79 iconv( iso2022, (void *) &inp, &resetlen, &outp, &outlen );
81 iconv( iso2022, (void *) &in, &len, &outp, &outlen );
83 *outp++ = 0;
85 assert( !utf8_illegal( (unsigned char *) out ) );
87 return xrealloc( out, outp - out );
90 #endif
92 for( ; len; len-- )
93 if( *in & 0x80 ) {
94 *outp++ = 0xC0 | ( (const unsigned char) *in >> 6 );
95 *outp++ = 0x80 | ( *in++ & 0x3F );
96 } else
97 *outp++ = *in++;
99 *outp++ = 0;
101 assert( !utf8_illegal( (unsigned char *) out ) );
103 return xrealloc( out, outp - out );
106 extern PURE unsigned char *utf8_illegal( const unsigned char *str ) {
108 for(;;)
109 if( !*str )
110 /* End of string. Everything was legal. */
111 return NULL;
112 else if( !( str[ 0 ] & 0x80 ) )
113 /* Legal single byte character. */
114 str++;
115 else if( ( str[ 0 ] >= 0x80 && str[ 0 ] <= 0xC1 ) ||
116 ( str[ 0 ] > 0xF4 ) )
117 /* Illegal continuation byte, long representation of single
118 byte character, or overly long sequence. */
119 return (unsigned char *) str;
120 else if( str[ 0 ] >= 0xC2 && str[ 0 ] <= 0xDF ) {
121 /* Two byte sequence... */
122 if( str[ 1 ] < 0x80 || str[ 1 ] > 0xBF )
123 /* ...where byte 2 is illegal. */
124 return (unsigned char *) str + 1;
125 else
126 /* ...which is fully legal. */
127 str += 2;
128 } else if( str[ 0 ] >= 0xE0 && str[ 0 ] <= 0xEF ) {
129 /* Three byte sequence... */
130 if( str[ 1 ] < 0x80 || str[ 1 ] > 0xBF ||
131 ( str[ 0 ] == 0xE0 && str[ 1 ] < 0xA0 ) ||
132 ( str[ 0 ] == 0xED && str[ 1 ] > 0x9F ) )
133 /* ...where byte 2 is illegal. */
134 return (unsigned char *) str + 1;
135 else if( str[ 2 ] < 0x80 || str[ 2 ] > 0xBF )
136 /* ...where byte 3 is illegal. */
137 return (unsigned char *) str + 2;
138 else
139 /* ...which is fully legal. */
140 str += 3;
141 } else {
142 assert( str[ 0 ] >= 0xF0 && str[ 0 ] <= 0xF4 );
143 /* Four byte sequence... */
144 if( str[ 1 ] < 0x80 || str[ 1 ] > 0xBF ||
145 ( str[ 0 ] == 0xF0 && str[ 1 ] < 0x90 ) ||
146 ( str[ 0 ] == 0xF4 && str[ 1 ] > 0x8F ) )
147 /* ...where byte 2 is illegal. */
148 return (unsigned char *) str + 1;
149 else if( str[ 2 ] < 0x80 || str[ 2 ] > 0xBF )
150 /* ...where byte 3 is illegal. */
151 return (unsigned char *) str + 2;
152 else if( str[ 3 ] < 0x80 || str[ 3 ] > 0xBF )
153 /* ...where byte 4 is illegal. */
154 return (unsigned char *) str + 3;
155 else
156 /* ...which is fully legal. */
157 str += 4;
161 extern PURE int utf8_length( const unsigned char *str ) {
163 int len;
165 assert( !utf8_illegal( str ) );
167 for( len = 0; *str; len++ )
168 if( *str < 0x80 )
169 str++;
170 else if( *str < 0xE0 )
171 str += 2;
172 else if( *str < 0xF0 )
173 str += 3;
174 else
175 str += 4;
177 return len;
180 static MALLOC unsigned char *dup_valid_common( const unsigned char *str,
181 int len ) {
183 const unsigned char *p;
184 unsigned char *out, *outp;
186 outp = out = xmalloc( len + 1 );
187 p = str;
188 for(;;)
189 if( outp == out + len ) {
190 /* End of string. */
191 *outp = 0;
192 assert( !utf8_illegal( out ) );
193 return out;
194 } else if( !( p[ 0 ] & 0x80 ) )
195 /* Legal single byte character. */
196 *outp++ = *p++;
197 else if( *p >= 0xC2 && *p <= 0xDF &&
198 p[ 1 ] >= 0x80 && p[ 1 ] <= 0xBF ) {
199 /* Legal two byte character. */
200 *outp++ = *p++;
201 *outp++ = *p++;
202 } else if( *p >= 0xE0 && *p <= 0xEF &&
203 p[ 1 ] >= 0x80 && p[ 1 ] <= 0xBF &&
204 p[ 2 ] >= 0x80 && p[ 2 ] <= 0xBF &&
205 ( *p > 0xE0 || p[ 1 ] > 0x9F ) &&
206 ( *p != 0xED || p[ 1 ] < 0xA0 ) ) {
207 /* Legal three byte character. */
208 *outp++ = *p++;
209 *outp++ = *p++;
210 *outp++ = *p++;
211 } else if( *p >= 0xF0 && *p <= 0xF4 &&
212 p[ 1 ] >= 0x80 && p[ 1 ] <= 0xBF &&
213 p[ 2 ] >= 0x80 && p[ 2 ] <= 0xBF &&
214 p[ 3 ] >= 0x80 && p[ 3 ] <= 0xBF &&
215 ( *p > 0xF0 || p[ 1 ] > 0x8F ) &&
216 ( *p != 0xF4 || p[ 1 ] < 0x90 ) ) {
217 /* Legal four byte character. */
218 *outp++ = *p++;
219 *outp++ = *p++;
220 *outp++ = *p++;
221 *outp++ = *p++;
222 } else
223 /* Illegal character: ignore this byte and continue. */
224 p++;
227 extern MALLOC unsigned char *utf8_dup_valid_len( const unsigned char *str,
228 int num_bytes ) {
230 int len;
231 const unsigned char *p;
233 len = 0;
234 p = str;
235 for(;;)
236 if( !num_bytes )
237 /* End of string. */
238 break;
239 else if( !( p[ 0 ] & 0x80 ) ) {
240 /* Legal single byte character. */
241 len++;
242 p++;
243 num_bytes--;
244 } else if( num_bytes >= 2 &&
245 *p >= 0xC2 && *p <= 0xDF &&
246 p[ 1 ] >= 0x80 && p[ 1 ] <= 0xBF ) {
247 /* Legal two byte character. */
248 len += 2;
249 p += 2;
250 num_bytes -= 2;
251 } else if( num_bytes >= 3 &&
252 *p >= 0xE0 && *p <= 0xEF &&
253 p[ 1 ] >= 0x80 && p[ 1 ] <= 0xBF &&
254 p[ 2 ] >= 0x80 && p[ 2 ] <= 0xBF &&
255 ( *p > 0xE0 || p[ 1 ] > 0x9F ) &&
256 ( *p != 0xED || p[ 1 ] < 0xA0 ) ) {
257 /* Legal three byte character. */
258 len += 3;
259 p += 3;
260 num_bytes -= 3;
261 } else if( num_bytes >= 4 &&
262 *p >= 0xF0 && *p <= 0xF4 &&
263 p[ 1 ] >= 0x80 && p[ 1 ] <= 0xBF &&
264 p[ 2 ] >= 0x80 && p[ 2 ] <= 0xBF &&
265 p[ 3 ] >= 0x80 && p[ 3 ] <= 0xBF &&
266 ( *p > 0xF0 || p[ 1 ] > 0x8F ) &&
267 ( *p != 0xF4 || p[ 1 ] < 0x90 ) ) {
268 /* Legal four byte character. */
269 len += 4;
270 p += 4;
271 num_bytes -= 4;
272 } else {
273 /* Illegal character: ignore this byte and continue. */
274 p++;
275 num_bytes--;
278 return dup_valid_common( str, len );
281 extern MALLOC unsigned char *utf8_dup_valid( const unsigned char *str ) {
283 int len;
284 const unsigned char *p;
286 len = 0;
287 p = str;
288 for(;;)
289 if( !p[ 0 ] )
290 /* End of string. */
291 break;
292 else if( !( p[ 0 ] & 0x80 ) ) {
293 /* Legal single byte character. */
294 len++;
295 p++;
296 } else if( *p >= 0xC2 && *p <= 0xDF &&
297 p[ 1 ] >= 0x80 && p[ 1 ] <= 0xBF ) {
298 /* Legal two byte character. */
299 len += 2;
300 p += 2;
301 } else if( *p >= 0xE0 && *p <= 0xEF &&
302 p[ 1 ] >= 0x80 && p[ 1 ] <= 0xBF &&
303 p[ 2 ] >= 0x80 && p[ 2 ] <= 0xBF &&
304 ( *p > 0xE0 || p[ 1 ] > 0x9F ) &&
305 ( *p != 0xED || p[ 1 ] < 0xA0 ) ) {
306 /* Legal three byte character. */
307 len += 3;
308 p += 3;
309 } else if( *p >= 0xF0 && *p <= 0xF4 &&
310 p[ 1 ] >= 0x80 && p[ 1 ] <= 0xBF &&
311 p[ 2 ] >= 0x80 && p[ 2 ] <= 0xBF &&
312 p[ 3 ] >= 0x80 && p[ 3 ] <= 0xBF &&
313 ( *p > 0xF0 || p[ 1 ] > 0x8F ) &&
314 ( *p != 0xF4 || p[ 1 ] < 0x90 ) ) {
315 /* Legal four byte character. */
316 len += 4;
317 p += 4;
318 } else
319 /* Illegal character: ignore this byte and continue. */
320 p++;
322 return dup_valid_common( str, len );
/* Decode the UTF-8 character at *P and return its code point, moving
   *P past the bytes consumed.  At the terminating NUL, 0 is returned
   and *P is left where it is.  The input must be legal UTF-8; this is
   checked by assertion only. */
extern uint32_t utf8_next( const unsigned char **p ) {

    const unsigned char *s = *p;
    uint32_t code;
    int width;

    assert( s[ 0 ] < 0x80 ||
            ( s[ 0 ] >= 0xC2 && s[ 0 ] <= 0xDF &&
              ( s[ 1 ] & 0xC0 ) == 0x80 ) ||
            ( s[ 0 ] >= 0xE0 && s[ 0 ] <= 0xEF &&
              ( s[ 1 ] & 0xC0 ) == 0x80 &&
              ( s[ 2 ] & 0xC0 ) == 0x80 &&
              ( s[ 0 ] != 0xE0 || s[ 1 ] >= 0xA0 ) &&
              ( s[ 0 ] != 0xED || s[ 1 ] <= 0x9F ) ) ||
            ( s[ 0 ] >= 0xF0 && s[ 0 ] <= 0xF4 &&
              ( s[ 1 ] & 0xC0 ) == 0x80 &&
              ( s[ 2 ] & 0xC0 ) == 0x80 &&
              ( s[ 3 ] & 0xC0 ) == 0x80 &&
              ( s[ 0 ] != 0xF0 || s[ 1 ] >= 0x90 ) &&
              ( s[ 0 ] != 0xF4 || s[ 1 ] <= 0x8F ) ) );

    /* End of string: report 0 without advancing. */
    if( s[ 0 ] == 0 )
        return 0;

    if( s[ 0 ] < 0x80 ) {
        /* One byte: the byte is the code point. */
        code = s[ 0 ];
        width = 1;
    } else if( s[ 0 ] < 0xE0 ) {
        /* Two bytes: 5 + 6 payload bits. */
        code = ( ( s[ 0 ] & 0x1F ) << 6 ) | ( s[ 1 ] & 0x3F );
        width = 2;
    } else if( s[ 0 ] < 0xF0 ) {
        /* Three bytes: 4 + 6 + 6 payload bits. */
        code = ( ( s[ 0 ] & 0x0F ) << 12 ) | ( ( s[ 1 ] & 0x3F ) << 6 ) |
            ( s[ 2 ] & 0x3F );
        width = 3;
    } else {
        /* Four bytes: 3 + 6 + 6 + 6 payload bits. */
        code = ( ( s[ 0 ] & 0x07 ) << 18 ) | ( ( s[ 1 ] & 0x3F ) << 12 ) |
            ( ( s[ 2 ] & 0x3F ) << 6 ) | ( s[ 3 ] & 0x3F );
        width = 4;
    }

    *p = s + width;

    return code;
}
/* Release the resources held by this module.  When built with iconv,
   this closes the ISO 2022 conversion descriptor if to_utf8() managed
   to open one, and then forgets it, so that a repeated call is harmless
   and a later to_utf8() will open a fresh descriptor rather than use
   one which has already been closed. */
extern void cleanup_utf8( void ) {

#if HAVE_ICONV
    if( tried_iso2022 && iso2022 != (iconv_t) -1 )
        iconv_close( iso2022 );

    /* Return to the "never tried" state. */
    iso2022 = (iconv_t) -1;
    tried_iso2022 = 0;
#endif
}