4 * Part of gwm, the Gratuitous Window Manager,
5 * by Gary Wong, <gtw@gnu.org>.
7 * Copyright (C) 2009 Gary Wong
9 * This program is free software: you can redistribute it and/or modify
10 * it under the terms of version 3 of the GNU General Public License as
11 * published by the Free Software Foundation.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program. If not, see <http://www.gnu.org/licenses/>.
/* to_utf8: convert the bytes at `in` (length `len`) from the given
   gwm encoding to UTF-8 in a newly allocated buffer.

   For ENCODING_COMPOUND the conversion is delegated to iconv() using an
   "ISO-2022-JP-2" -> "UTF-8" descriptor kept in the file-level variable
   iso2022.  For the other visible path, an input byte is expanded into
   a two-byte UTF-8 sequence.

   Both return paths assert that the result passes utf8_illegal() and
   return the buffer resized with xrealloc() to the number of bytes
   actually produced (outp - out).

   NOTE(review): only part of this function is visible in this listing
   (the tail of the parameter list, the local declarations and several
   statements are missing), so the description above covers the visible
   statements only.
   NOTE(review): the return values of the iconv() calls visible here are
   not checked -- confirm whether conversion errors are handled on lines
   not shown. */
42 extern char *to_utf8( enum gwm_encoding encoding
, const char *in
,
/* Output budget: four bytes per input byte for Compound Text, two
   bytes per input byte otherwise. */
52 outlen
= len
<< ( encoding
== ENCODING_COMPOUND
? 2 : 1 );
/* One byte more than outlen is allocated -- presumably room for a
   terminating NUL; the store itself is not visible here. */
57 outp
= out
= xmalloc( outlen
+ 1 );
60 if( encoding
== ENCODING_COMPOUND
) {
/* Only open the converter if no attempt has been made yet. */
61 if( !tried_iso2022
) {
62 iso2022
= iconv_open( "UTF-8", "ISO-2022-JP-2" );
/* iconv_open() reports failure as (iconv_t) -1. */
66 if( iso2022
!= (iconv_t
) -1 ) {
67 static const char resetseq
[ 3 ] = "\x1B\x2D\x41";
71 /* Reset the decoder to the Compound Text initial state
72 (G1 = ASCII, G3 = ISO 8859-1). */
74 iconv( iso2022
, NULL
, NULL
, NULL
, NULL
);
75 /* Bah. Several old implementations of iconv() declared
76 the inbuf parameter as (const char **), but SUS says
77 it's simply (char **). We cast the thing to (void *),
78 which will keep them both happy. */
/* First call feeds inp/resetlen -- presumably pointing at resetseq
   above (the assignments are not visible here); the second call
   converts the caller's input. */
79 iconv( iso2022
, (void *) &inp
, &resetlen
, &outp
, &outlen
);
81 iconv( iso2022
, (void *) &in
, &len
, &outp
, &outlen
);
85 assert( !utf8_illegal( (unsigned char *) out
) );
87 return xrealloc( out
, outp
- out
);
/* Two-byte UTF-8 encoding of the current input byte: lead byte
   0xC0 | (top two bits), continuation byte 0x80 | (low six bits).
   Presumably reached only for bytes >= 0x80 -- the guarding condition
   is not visible here. */
94 *outp
++ = 0xC0 | ( (const unsigned char) *in
>> 6 );
95 *outp
++ = 0x80 | ( *in
++ & 0x3F );
101 assert( !utf8_illegal( (unsigned char *) out
) );
103 return xrealloc( out
, outp
- out
);
/* utf8_illegal: validate the UTF-8 string at `str`.

   When an illegal byte is found, the return value points at that byte:
   the lead byte itself, or str + 1 / + 2 / + 3 for a bad second, third
   or fourth byte of a sequence.  The value returned for a fully legal
   string is not visible in this listing; callers in this file use
   assert( !utf8_illegal( ... ) ), so it tests false.

   Rules enforced by the visible conditions:
     - lead bytes 0x80..0xC1 and anything above 0xF4 are rejected;
     - 0xC2..0xDF start a two byte sequence, 0xE0..0xEF three bytes,
       0xF0..0xF4 four bytes;
     - continuation bytes must lie in 0x80..0xBF;
     - 0xE0 needs a second byte >= 0xA0 and 0xF0 a second byte >= 0x90
       (otherwise the encoding is overlong);
     - 0xED needs a second byte <= 0x9F (excludes U+D800..U+DFFF);
     - 0xF4 needs a second byte <= 0x8F (excludes values above
       U+10FFFF).

   NOTE(review): the loop header, the pointer advances and the
   success return are on lines missing from this listing. */
106 extern PURE
unsigned char *utf8_illegal( const unsigned char *str
) {
110 /* End of string. Everything was legal. */
112 else if( !( str
[ 0 ] & 0x80 ) )
113 /* Legal single byte character. */
115 else if( ( str
[ 0 ] >= 0x80 && str
[ 0 ] <= 0xC1 ) ||
116 ( str
[ 0 ] > 0xF4 ) )
117 /* Illegal continuation byte, long representation of single
118 byte character, or overly long sequence. */
119 return (unsigned char *) str
;
120 else if( str
[ 0 ] >= 0xC2 && str
[ 0 ] <= 0xDF ) {
121 /* Two byte sequence... */
122 if( str
[ 1 ] < 0x80 || str
[ 1 ] > 0xBF )
123 /* ...where byte 2 is illegal. */
124 return (unsigned char *) str
+ 1;
126 /* ...which is fully legal. */
128 } else if( str
[ 0 ] >= 0xE0 && str
[ 0 ] <= 0xEF ) {
129 /* Three byte sequence... */
130 if( str
[ 1 ] < 0x80 || str
[ 1 ] > 0xBF ||
131 ( str
[ 0 ] == 0xE0 && str
[ 1 ] < 0xA0 ) ||
132 ( str
[ 0 ] == 0xED && str
[ 1 ] > 0x9F ) )
133 /* ...where byte 2 is illegal. */
134 return (unsigned char *) str
+ 1;
135 else if( str
[ 2 ] < 0x80 || str
[ 2 ] > 0xBF )
136 /* ...where byte 3 is illegal. */
137 return (unsigned char *) str
+ 2;
139 /* ...which is fully legal. */
/* Every other lead byte value was handled above, so only 0xF0..0xF4
   can remain here. */
142 assert( str
[ 0 ] >= 0xF0 && str
[ 0 ] <= 0xF4 );
143 /* Four byte sequence... */
144 if( str
[ 1 ] < 0x80 || str
[ 1 ] > 0xBF ||
145 ( str
[ 0 ] == 0xF0 && str
[ 1 ] < 0x90 ) ||
146 ( str
[ 0 ] == 0xF4 && str
[ 1 ] > 0x8F ) )
147 /* ...where byte 2 is illegal. */
148 return (unsigned char *) str
+ 1;
149 else if( str
[ 2 ] < 0x80 || str
[ 2 ] > 0xBF )
150 /* ...where byte 3 is illegal. */
151 return (unsigned char *) str
+ 2;
152 else if( str
[ 3 ] < 0x80 || str
[ 3 ] > 0xBF )
153 /* ...where byte 4 is illegal. */
154 return (unsigned char *) str
+ 3;
156 /* ...which is fully legal. */
/* utf8_length: compute a length for the UTF-8 string at `str`.

   The input must already be legal UTF-8 (asserted via utf8_illegal()).
   The loop runs until the terminating zero byte and increments `len`
   once per iteration, branching on the lead byte (< 0xE0, < 0xF0).
   Presumably each iteration steps over one whole encoded character, so
   that the result is a character count rather than a byte count --
   the pointer advances and the return statement are on lines missing
   from this listing; TODO confirm. */
161 extern PURE
int utf8_length( const unsigned char *str
) {
165 assert( !utf8_illegal( str
) );
167 for( len
= 0; *str
; len
++ )
170 else if( *str
< 0xE0 )
172 else if( *str
< 0xF0 )
/* dup_valid_common: shared back end of utf8_dup_valid_len() and
   utf8_dup_valid(), which both call it as dup_valid_common( str, len ).

   Allocates len + 1 bytes with xmalloc() and walks the input through
   `p`, classifying what it finds as a legal one, two, three or four
   byte character, or as an illegal byte that is ignored.  Once the
   output pointer reaches out + len the result is asserted to be legal
   UTF-8.

   The multi-byte tests apply the same lead byte ranges, continuation
   byte range (0x80..0xBF) and second byte restrictions for 0xE0, 0xED,
   0xF0 and 0xF4 as utf8_illegal().

   NOTE(review): the copy statements, the loop structure, the rest of
   the parameter list and the return are on lines missing from this
   listing.  Presumably `len` is the number of valid bytes the caller
   has already counted -- TODO confirm against the callers. */
180 static MALLOC
unsigned char *dup_valid_common( const unsigned char *str
,
183 const unsigned char *p
;
184 unsigned char *out
, *outp
;
186 outp
= out
= xmalloc( len
+ 1 );
/* Output is full: every expected byte has been produced. */
189 if( outp
== out
+ len
) {
192 assert( !utf8_illegal( out
) );
194 } else if( !( p
[ 0 ] & 0x80 ) )
195 /* Legal single byte character. */
197 else if( *p
>= 0xC2 && *p
<= 0xDF &&
198 p
[ 1 ] >= 0x80 && p
[ 1 ] <= 0xBF ) {
199 /* Legal two byte character. */
202 } else if( *p
>= 0xE0 && *p
<= 0xEF &&
203 p
[ 1 ] >= 0x80 && p
[ 1 ] <= 0xBF &&
204 p
[ 2 ] >= 0x80 && p
[ 2 ] <= 0xBF &&
205 ( *p
> 0xE0 || p
[ 1 ] > 0x9F ) &&
206 ( *p
!= 0xED || p
[ 1 ] < 0xA0 ) ) {
207 /* Legal three byte character. */
211 } else if( *p
>= 0xF0 && *p
<= 0xF4 &&
212 p
[ 1 ] >= 0x80 && p
[ 1 ] <= 0xBF &&
213 p
[ 2 ] >= 0x80 && p
[ 2 ] <= 0xBF &&
214 p
[ 3 ] >= 0x80 && p
[ 3 ] <= 0xBF &&
215 ( *p
> 0xF0 || p
[ 1 ] > 0x8F ) &&
216 ( *p
!= 0xF4 || p
[ 1 ] < 0x90 ) ) {
217 /* Legal four byte character. */
223 /* Illegal character: ignore this byte and continue. */
/* utf8_dup_valid_len: length-bounded variant of utf8_dup_valid().

   Scans the input through `p`, recognising legal one to four byte
   characters and ignoring illegal bytes, then returns
   dup_valid_common( str, len ).

   Each multi-byte test is guarded by num_bytes >= 2 / 3 / 4, and the
   guard comes first in the && chain, so p[ 1 ]..p[ 3 ] are only read
   when the guard holds.  Presumably num_bytes is the number of input
   bytes still remaining -- its declaration and updates, the second
   parameter and the computation of `len` are on lines missing from
   this listing; TODO confirm. */
227 extern MALLOC
unsigned char *utf8_dup_valid_len( const unsigned char *str
,
231 const unsigned char *p
;
239 else if( !( p
[ 0 ] & 0x80 ) ) {
240 /* Legal single byte character. */
244 } else if( num_bytes
>= 2 &&
245 *p
>= 0xC2 && *p
<= 0xDF &&
246 p
[ 1 ] >= 0x80 && p
[ 1 ] <= 0xBF ) {
247 /* Legal two byte character. */
251 } else if( num_bytes
>= 3 &&
252 *p
>= 0xE0 && *p
<= 0xEF &&
253 p
[ 1 ] >= 0x80 && p
[ 1 ] <= 0xBF &&
254 p
[ 2 ] >= 0x80 && p
[ 2 ] <= 0xBF &&
255 ( *p
> 0xE0 || p
[ 1 ] > 0x9F ) &&
256 ( *p
!= 0xED || p
[ 1 ] < 0xA0 ) ) {
257 /* Legal three byte character. */
261 } else if( num_bytes
>= 4 &&
262 *p
>= 0xF0 && *p
<= 0xF4 &&
263 p
[ 1 ] >= 0x80 && p
[ 1 ] <= 0xBF &&
264 p
[ 2 ] >= 0x80 && p
[ 2 ] <= 0xBF &&
265 p
[ 3 ] >= 0x80 && p
[ 3 ] <= 0xBF &&
266 ( *p
> 0xF0 || p
[ 1 ] > 0x8F ) &&
267 ( *p
!= 0xF4 || p
[ 1 ] < 0x90 ) ) {
268 /* Legal four byte character. */
273 /* Illegal character: ignore this byte and continue. */
278 return dup_valid_common( str
, len
);
/* utf8_dup_valid: scan the string at `str`, recognising legal one to
   four byte UTF-8 characters and ignoring illegal bytes, then return
   dup_valid_common( str, len ).

   There is no explicit length bound on the look-ahead.  Each test
   checks p[ 1 ], p[ 2 ], p[ 3 ] in order inside a short-circuit &&
   chain, and a zero byte fails the ">= 0x80" comparison, so on a
   zero-terminated input the chain stops at the terminator instead of
   reading past it.  Presumably `str` is always zero-terminated --
   TODO confirm against callers.

   NOTE(review): the loop structure and the computation of `len` are on
   lines missing from this listing. */
281 extern MALLOC
unsigned char *utf8_dup_valid( const unsigned char *str
) {
284 const unsigned char *p
;
292 else if( !( p
[ 0 ] & 0x80 ) ) {
293 /* Legal single byte character. */
296 } else if( *p
>= 0xC2 && *p
<= 0xDF &&
297 p
[ 1 ] >= 0x80 && p
[ 1 ] <= 0xBF ) {
298 /* Legal two byte character. */
301 } else if( *p
>= 0xE0 && *p
<= 0xEF &&
302 p
[ 1 ] >= 0x80 && p
[ 1 ] <= 0xBF &&
303 p
[ 2 ] >= 0x80 && p
[ 2 ] <= 0xBF &&
304 ( *p
> 0xE0 || p
[ 1 ] > 0x9F ) &&
305 ( *p
!= 0xED || p
[ 1 ] < 0xA0 ) ) {
306 /* Legal three byte character. */
309 } else if( *p
>= 0xF0 && *p
<= 0xF4 &&
310 p
[ 1 ] >= 0x80 && p
[ 1 ] <= 0xBF &&
311 p
[ 2 ] >= 0x80 && p
[ 2 ] <= 0xBF &&
312 p
[ 3 ] >= 0x80 && p
[ 3 ] <= 0xBF &&
313 ( *p
> 0xF0 || p
[ 1 ] > 0x8F ) &&
314 ( *p
!= 0xF4 || p
[ 1 ] < 0x90 ) ) {
315 /* Legal four byte character. */
319 /* Illegal character: ignore this byte and continue. */
322 return dup_valid_common( str
, len
);
/* utf8_next: decode the UTF-8 sequence found at *p into `n`.

   Precondition (asserted): *p points at a legal one, two, three or
   four byte sequence, using the same lead byte ranges, continuation
   byte range and second byte restrictions as utf8_illegal().

   Decoding keeps the payload bits of the lead byte (0x1F for two
   bytes, 0x0F for three, 0x07 for four) and the low six bits (0x3F) of
   each continuation byte, combining them most significant first.

   NOTE(review): the declaration of `n`, the single byte case, the tail
   of the three byte expression, any update of *p and the return are on
   lines missing from this listing.  Presumably *p is advanced past the
   decoded sequence and `n` is returned -- TODO confirm. */
325 extern uint32_t utf8_next( const unsigned char **p
) {
328 const unsigned char *c
= *p
;
330 assert( c
[ 0 ] < 0x80 ||
331 ( c
[ 0 ] >= 0xC2 && c
[ 0 ] <= 0xDF &&
332 c
[ 1 ] >= 0x80 && c
[ 1 ] <= 0xBF ) ||
333 ( c
[ 0 ] >= 0xE0 && c
[ 0 ] <= 0xEF &&
334 c
[ 1 ] >= 0x80 && c
[ 1 ] <= 0xBF &&
335 c
[ 2 ] >= 0x80 && c
[ 2 ] <= 0xBF &&
336 ( c
[ 0 ] > 0xE0 || c
[ 1 ] > 0x9F ) &&
337 ( c
[ 0 ] != 0xED || c
[ 1 ] < 0xA0 ) ) ||
338 ( c
[ 0 ] >= 0xF0 && c
[ 0 ] <= 0xF4 &&
339 c
[ 1 ] >= 0x80 && c
[ 1 ] <= 0xBF &&
340 c
[ 2 ] >= 0x80 && c
[ 2 ] <= 0xBF &&
341 c
[ 3 ] >= 0x80 && c
[ 3 ] <= 0xBF &&
342 ( c
[ 0 ] > 0xF0 || c
[ 1 ] > 0x8F ) &&
343 ( c
[ 0 ] != 0xF4 || c
[ 1 ] < 0x90 ) ) );
/* The assert above has already excluded 0x80..0xC1, so the simple
   "<" tests below are enough to pick the sequence length. */
348 if( c
[ 0 ] < 0x80 ) {
351 } else if( c
[ 0 ] < 0xE0 ) {
352 n
= ( ( c
[ 0 ] & 0x1F ) << 6 ) | ( c
[ 1 ] & 0x3F );
354 } else if( c
[ 0 ] < 0xF0 ) {
355 n
= ( ( c
[ 0 ] & 0x0F ) << 12 ) | ( ( c
[ 1 ] & 0x3F ) << 6 ) |
359 n
= ( ( c
[ 0 ] & 0x07 ) << 18 ) | ( ( c
[ 1 ] & 0x3F ) << 12 ) |
360 ( ( c
[ 2 ] & 0x3F ) << 6 ) | ( c
[ 3 ] & 0x3F );
/* cleanup_utf8: release the iconv descriptor used by to_utf8().

   iconv_close() is called only when an open was attempted
   (tried_iso2022) and it did not fail (iso2022 != (iconv_t) -1).

   NOTE(review): the lines that close this function, and any
   preprocessor guards around the body, are missing from this
   listing. */
367 extern void cleanup_utf8( void ) {
370 if( tried_iso2022
&& iso2022
!= (iconv_t
) -1 )
371 iconv_close( iso2022
);