Sync usage with man page.
[netbsd-mini2440.git] / gnu / dist / gettext / gettext-tools / lib / linebreak.c
blob375802719f6bd6d63ad379119e22a94078ea06d5
1 /* linebreak.c - line breaking of Unicode strings
2 Copyright (C) 2001-2003 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
19 #ifdef HAVE_CONFIG_H
20 # include <config.h>
21 #endif
23 /* Specification. */
24 #include "linebreak.h"
26 #include <stdlib.h>
27 #include <string.h>
28 #include "c-ctype.h"
29 #include "xsize.h"
31 #include "utf8-ucs4.h"
33 #ifdef unused
34 #include "utf16-ucs4.h"
/* Fetch one character from the UCS-4 string S.
   PUC receives the unit found at S[0], unmodified.
   N is the number of units available at S; it is not examined here.
   Always returns 1, the number of units consumed.  */
static inline int
u32_mbtouc (unsigned int *puc, const unsigned int *s, size_t n)
{
  (void) n;
  *puc = *s;
  return 1;
}
42 #endif
/* Help GCC to generate good code for string comparisons with
   immediate strings.

   STREQ (s1, s2, c0, c1, ..., c8) is nonzero iff the strings S1 and S2
   are equal.  The callers in this file pass a string literal as S2 and
   its first nine characters as C0..C8, padded with 0 after the end of
   the literal.  */
#if defined (__GNUC__) && defined (__OPTIMIZE__)

/* Compare the remainders of S1 and S2 from offset 9 on.  Only reached
   when the characters at offsets 0..8 matched and none was NUL.  */
static inline int
streq9 (const char *s1, const char *s2)
{
  return strcmp (s1 + 9, s2 + 9) == 0;
}

/* streqK (K = 8 down to 0): S1[K] must equal the expected character.
   If that character is NUL both strings end here and are equal;
   otherwise the comparison continues with offset K+1.  */
static inline int
streq8 (const char *s1, const char *s2, char s28)
{
  return s1[8] == s28 && (s28 == 0 || streq9 (s1, s2));
}

static inline int
streq7 (const char *s1, const char *s2, char s27, char s28)
{
  return s1[7] == s27 && (s27 == 0 || streq8 (s1, s2, s28));
}

static inline int
streq6 (const char *s1, const char *s2, char s26, char s27, char s28)
{
  return s1[6] == s26 && (s26 == 0 || streq7 (s1, s2, s27, s28));
}

static inline int
streq5 (const char *s1, const char *s2, char s25, char s26, char s27, char s28)
{
  return s1[5] == s25 && (s25 == 0 || streq6 (s1, s2, s26, s27, s28));
}

static inline int
streq4 (const char *s1, const char *s2, char s24, char s25, char s26, char s27, char s28)
{
  return s1[4] == s24 && (s24 == 0 || streq5 (s1, s2, s25, s26, s27, s28));
}

static inline int
streq3 (const char *s1, const char *s2, char s23, char s24, char s25, char s26, char s27, char s28)
{
  return s1[3] == s23
         && (s23 == 0 || streq4 (s1, s2, s24, s25, s26, s27, s28));
}

static inline int
streq2 (const char *s1, const char *s2, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  return s1[2] == s22
         && (s22 == 0 || streq3 (s1, s2, s23, s24, s25, s26, s27, s28));
}

static inline int
streq1 (const char *s1, const char *s2, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  return s1[1] == s21
         && (s21 == 0 || streq2 (s1, s2, s22, s23, s24, s25, s26, s27, s28));
}

static inline int
streq0 (const char *s1, const char *s2, char s20, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  return s1[0] == s20
         && (s20 == 0
             || streq1 (s1, s2, s21, s22, s23, s24, s25, s26, s27, s28));
}

#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
  streq0 (s1, s2, s20, s21, s22, s23, s24, s25, s26, s27, s28)

#else

/* Without GCC optimization the character arguments are ignored.  */
#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
  (strcmp (s1, s2) == 0)

#endif
/* Return 1 if ENCODING is exactly one of the legacy CJK encoding names
   listed below, 0 otherwise.  The comparison is done with STREQ, i.e. it
   is an exact, case-sensitive string match.  */
static int
is_cjk_encoding (const char *encoding)
{
  return (/* Legacy Japanese encodings */
          STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)
          /* Legacy Chinese encodings */
          || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
          || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
          || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
          || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
          /* Legacy Korean encodings */
          || STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
          || STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0)
          || STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0));
}
/* Return 1 if ENCODING is exactly the string "UTF-8", 0 otherwise.  */
static int
is_utf8_encoding (const char *encoding)
{
  return STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0);
}
220 /* Determine number of column positions required for UC. */
221 int uc_width (unsigned int uc, const char *encoding);
/*
 * Non-spacing attribute table.
 * Consists of:
 * - Non-spacing characters; generated from PropList.txt or
 *   "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
 * - Format control characters; generated from
 *   "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
 * - Zero width characters; generated from
 *   "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
 *
 * Layout: 16 blocks of 64 bytes.  Each block is a bitmap for a range of
 * 512 code points, one bit per code point, least significant bit first
 * (uc_width reads byte ((uc >> 3) & 63) and bit (uc & 7) of a block).
 * Which block belongs to which 512-code-point range is given by
 * nonspacing_table_ind.
 */
static const unsigned char nonspacing_table_data[16*64] = {
  /* 0x0000-0x01ff */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x20, 0x00, 0x00, /* 0x0080-0x00bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */
  /* 0x0200-0x03ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */
  0xff, 0xff, 0xff, 0xe0, 0xff, 0xff, 0x00, 0x00, /* 0x0340-0x037f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */
  /* 0x0400-0x05ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */
  0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */
  0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */
  0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
  /* 0x0600-0x07ff */
  0x0f, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
  0x00, 0xf8, 0xff, 0x01, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
  0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
  0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
  0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
  0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */
  /* 0x0800-0x09ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */
  0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
  0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */
  /* 0x0a00-0x0bff */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */
  0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */
  0xbe, 0x21, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */
  0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */
  0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */
  0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */
  /* 0x0c00-0x0dff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */
  0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0c80-0x0cbf */
  0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */
  0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */
  0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */
  /* 0x0e00-0x0fff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */
  0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */
  0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */
  0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */
  0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */
  /* 0x1000-0x11ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */
  /* 0x1600-0x17ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */
  0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, /* 0x1700-0x173f */
  0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, /* 0x1740-0x177f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x3f, /* 0x1780-0x17bf */
  0x40, 0xfe, 0x0f, 0x20, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */
  /* 0x1800-0x19ff */
  0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */
  0x00, 0x00, 0x00, 0x00, 0x87, 0x0f, 0x04, 0x0e, /* 0x1900-0x193f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */
  /* 0x2000-0x21ff */
  0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */
  0x00, 0x00, 0x00, 0x00, 0x0f, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */
  0x00, 0x00, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00, /* 0x20c0-0x20ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */
  /* 0x3000-0x31ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */
  0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */
  /* 0xfa00-0xfbff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */
  0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */
  /* 0xfe00-0xffff */
  0xff, 0xff, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, /* 0xffc0-0xffff */
  /* 0x1d000-0x1d1ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d000-0x1d03f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d040-0x1d07f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d080-0x1d0bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d0c0-0x1d0ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d100-0x1d13f */
  0x00, 0x00, 0x00, 0x00, 0x80, 0x03, 0x00, 0xf8, /* 0x1d140-0x1d17f */
  0xe7, 0x0f, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, /* 0x1d180-0x1d1bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* 0x1d1c0-0x1d1ff */
};
/* Index into nonspacing_table_data.  Entry (uc >> 9) is the number of the
   64-byte block in nonspacing_table_data that describes the 512 code
   points around UC, or -1 if no code point in that range is flagged.
   The 240 entries cover the code points 0x0000..0x1dfff.  */
static const signed char nonspacing_table_ind[240] = {
   0,  1,  2,  3,  4,  5,  6,  7, /* 0x0000-0x0fff */
   8, -1, -1,  9, 10, -1, -1, -1, /* 0x1000-0x1fff */
  11, -1, -1, -1, -1, -1, -1, -1, /* 0x2000-0x2fff */
  12, -1, -1, -1, -1, -1, -1, -1, /* 0x3000-0x3fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x4000-0x4fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x5000-0x5fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x6000-0x6fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x7000-0x7fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x8000-0x8fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x9000-0x9fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa000-0xafff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb000-0xbfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xc000-0xcfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xd000-0xdfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */
  -1, -1, -1, -1, -1, 13, -1, 14, /* 0xf000-0xffff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10000-0x10fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x11000-0x11fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x12000-0x12fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x13000-0x13fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x14000-0x14fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x15000-0x15fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x16000-0x16fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x17000-0x17fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x18000-0x18fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x19000-0x19fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1a000-0x1afff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1b000-0x1bfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1c000-0x1cfff */
  15, -1, -1, -1, -1, -1, -1, -1  /* 0x1d000-0x1dfff */
};
412 /* Determine number of column positions required for UC. */
414 uc_width (unsigned int uc, const char *encoding)
416 /* Test for non-spacing or control character. */
417 if ((uc >> 9) < 240)
419 int ind = nonspacing_table_ind[uc >> 9];
420 if (ind >= 0)
421 if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1)
423 if (uc > 0 && uc < 0xa0)
424 return -1;
425 else
426 return 0;
429 else if ((uc >> 9) == (0xe0000 >> 9))
431 if (uc < 0xe0100
432 ? (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001)
433 : (uc <= 0xe01ef))
434 return 0;
436 /* Test for double-width character.
437 * Generated from "grep '^....;[WF]' EastAsianWidth.txt"
438 * and "grep '^....;[^WF]' EastAsianWidth.txt"
440 if (uc >= 0x1100
441 && ((uc < 0x1160) /* Hangul Jamo */
442 || (uc >= 0x2e80 && uc < 0x4dc0 /* CJK */
443 && !(uc == 0x303f))
444 || (uc >= 0x4e00 && uc < 0xa4d0) /* CJK ... Yi */
445 || (uc >= 0xac00 && uc < 0xd7a4) /* Hangul Syllables */
446 || (uc >= 0xf900 && uc < 0xfb00) /* CJK Compatibility Ideographs */
447 || (uc >= 0xfe30 && uc < 0xfe70) /* CJK Compatibility Forms */
448 || (uc >= 0xff00 && uc < 0xff61) /* Fullwidth Forms */
449 || (uc >= 0xffe0 && uc < 0xffe7)
450 || (uc >= 0x20000 && uc <= 0x2fffd) /* CJK, CJK Compatibility Ideographs */
451 || (uc >= 0x30000 && uc <= 0x3fffd)
453 return 2;
454 /* In ancient CJK encodings, Cyrillic and most other characters are
455 double-width as well. */
456 if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
457 && is_cjk_encoding (encoding))
458 return 2;
459 return 1;
463 #ifdef unused
/* Determine number of column positions required for first N units
   (or fewer if S ends before this) in S.
   S is decoded with u8_mbtouc; decoding stops at the first NUL character.
   ENCODING is passed through to uc_width.  Characters for which uc_width
   returns a negative value do not contribute to the result.  */
int
u8_width (const unsigned char *s, size_t n, const char *encoding)
{
  const unsigned char *s_end = s + n;
  int total = 0;

  while (s < s_end)
    {
      unsigned int uc;
      int w;

      s += u8_mbtouc (&uc, s, s_end - s);

      if (uc == 0)
        break; /* end of string reached */

      w = uc_width (uc, encoding);
      if (w >= 0) /* ignore control characters in the string */
        total += w;
    }

  return total;
}
/* Determine number of column positions required for the first N 16-bit
   units (or fewer if S ends before this) in S.
   S is decoded with u16_mbtouc; decoding stops at the first NUL character.
   ENCODING is passed through to uc_width.  Characters for which uc_width
   returns a negative value do not contribute to the result.  */
int
u16_width (const unsigned short *s, size_t n, const char *encoding)
{
  const unsigned short *s_end = s + n;
  int total = 0;

  while (s < s_end)
    {
      unsigned int uc;
      int w;

      s += u16_mbtouc (&uc, s, s_end - s);

      if (uc == 0)
        break; /* end of string reached */

      w = uc_width (uc, encoding);
      if (w >= 0) /* ignore control characters in the string */
        total += w;
    }

  return total;
}
/* Determine number of column positions required for the first N 32-bit
   units (or fewer if S ends before this) in S.
   Every unit is taken as one character; scanning stops at the first NUL.
   ENCODING is passed through to uc_width.  Characters for which uc_width
   returns a negative value do not contribute to the result.  */
int
u32_width (const unsigned int *s, size_t n, const char *encoding)
{
  const unsigned int *s_end = s + n;
  int total = 0;

  while (s < s_end)
    {
      unsigned int uc = *s++;
      int w;

      if (uc == 0)
        break; /* end of string reached */

      w = uc_width (uc, encoding);
      if (w >= 0) /* ignore control characters in the string */
        total += w;
    }

  return total;
}
538 #endif
541 /* Determine the line break points in S, and store the result at p[0..n-1]. */
542 /* We don't support line breaking of complex-context dependent characters
543 (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
/* Line breaking classification.
   The values 1..19 double as (1-based) row and column numbers of
   lbrk_table; the classes numbered 20 and above have no table entry and
   are handled separately by the *_possible_linebreaks functions.  */

enum
{
  /* Values >= 20 are resolved at run time. */
  LBP_BK =  0, /* mandatory break */
/*LBP_CR,         carriage return - not used here because it's a DOSism */
/*LBP_LF,         line feed - not used here because it's a DOSism */
  LBP_CM = 20, /* attached characters and combining marks */
/*LBP_SG,         surrogates - not used here because they are not characters */
  LBP_ZW =  1, /* zero width space */
  LBP_IN =  2, /* inseparable */
  LBP_GL =  3, /* non-breaking (glue) */
  LBP_CB = 22, /* contingent break opportunity */
  LBP_SP = 21, /* space */
  LBP_BA =  4, /* break opportunity after */
  LBP_BB =  5, /* break opportunity before */
  LBP_B2 =  6, /* break opportunity before and after */
  LBP_HY =  7, /* hyphen */
  LBP_NS =  8, /* non starter */
  LBP_OP =  9, /* opening punctuation */
  LBP_CL = 10, /* closing punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_EX = 12, /* exclamation/interrogation */
  LBP_ID = 13, /* ideographic */
  LBP_NU = 14, /* numeric */
  LBP_IS = 15, /* infix separator (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_PR = 18, /* prefix (numeric) */
  LBP_PO = 19, /* postfix (numeric) */
  LBP_SA = 23, /* complex context (South East Asian) */
  LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
  LBP_XX = 25  /* unknown */
};
581 #include "lbrkprop.h"
583 static inline unsigned char
584 lbrkprop_lookup (unsigned int uc)
586 unsigned int index1 = uc >> lbrkprop_header_0;
587 if (index1 < lbrkprop_header_1)
589 int lookup1 = lbrkprop.level1[index1];
590 if (lookup1 >= 0)
592 unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
593 int lookup2 = lbrkprop.level2[lookup1 + index2];
594 if (lookup2 >= 0)
596 unsigned int index3 = uc & lbrkprop_header_4;
597 return lbrkprop.level3[lookup2 + index3];
601 return LBP_XX;
/* Table indexed by two line breaking classifications.
   The first index is the class of the character before the potential
   break minus 1, the second index the class of the character after it
   minus 1 (classes LBP_ZW = 1 .. LBP_PO = 19).  */
#define D 1  /* direct break opportunity, empty in table 7.3 of UTR #14 */
#define I 2  /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
#define P 3  /* prohibited break,           '^' in table 7.3 of UTR #14 */
static const unsigned char lbrk_table[19][19] = {
                                /* after */
        /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
/* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
/* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
/* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
/* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
/* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
/* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
/* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
/* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
/* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
/* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
/* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
/* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
/* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* "" */
/* before */
};
/* Note: The (B2,B2) entry should probably be D instead of P.  */
/* Note: The (PR,ID) entry should probably be D instead of I.  */
636 void
637 u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
639 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
640 const unsigned char *s_end = s + n;
641 int last_prop = LBP_BK; /* line break property of last non-space character */
642 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
643 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
645 /* Don't break inside multibyte characters. */
646 memset (p, UC_BREAK_PROHIBITED, n);
648 while (s < s_end)
650 unsigned int uc;
651 int count = u8_mbtouc (&uc, s, s_end - s);
652 int prop = lbrkprop_lookup (uc);
654 if (prop == LBP_BK)
656 /* Mandatory break. */
657 *p = UC_BREAK_MANDATORY;
658 last_prop = LBP_BK;
659 seen_space = NULL;
660 seen_space2 = NULL;
662 else
664 char *q;
666 /* Resolve property values whose behaviour is not fixed. */
667 switch (prop)
669 case LBP_AI:
670 /* Resolve ambiguous. */
671 prop = LBP_AI_REPLACEMENT;
672 break;
673 case LBP_CB:
674 /* This is arbitrary. */
675 prop = LBP_ID;
676 break;
677 case LBP_SA:
678 /* We don't handle complex scripts yet.
679 Treat LBP_SA like LBP_XX. */
680 case LBP_XX:
681 /* This is arbitrary. */
682 prop = LBP_AL;
683 break;
686 /* Deal with combining characters. */
687 q = p;
688 if (prop == LBP_CM)
690 /* Don't break just before a combining character. */
691 *p = UC_BREAK_PROHIBITED;
692 /* A combining character turns a preceding space into LBP_AL. */
693 if (seen_space != NULL)
695 q = seen_space;
696 seen_space = seen_space2;
697 prop = LBP_AL;
698 goto lookup_via_table;
701 else if (prop == LBP_SP)
703 /* Don't break just before a space. */
704 *p = UC_BREAK_PROHIBITED;
705 seen_space2 = seen_space;
706 seen_space = p;
708 else
710 lookup_via_table:
711 /* prop must be usable as an index for table 7.3 of UTR #14. */
712 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
713 abort ();
715 if (last_prop == LBP_BK)
717 /* Don't break at the beginning of a line. */
718 *q = UC_BREAK_PROHIBITED;
720 else
722 switch (lbrk_table [last_prop-1] [prop-1])
724 case D:
725 *q = UC_BREAK_POSSIBLE;
726 break;
727 case I:
728 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
729 break;
730 case P:
731 *q = UC_BREAK_PROHIBITED;
732 break;
733 default:
734 abort ();
737 last_prop = prop;
738 seen_space = NULL;
739 seen_space2 = NULL;
743 s += count;
744 p += count;
748 #ifdef unused
750 void
751 u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
753 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
754 const unsigned short *s_end = s + n;
755 int last_prop = LBP_BK; /* line break property of last non-space character */
756 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
757 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
759 /* Don't break inside multibyte characters. */
760 memset (p, UC_BREAK_PROHIBITED, n);
762 while (s < s_end)
764 unsigned int uc;
765 int count = u16_mbtouc (&uc, s, s_end - s);
766 int prop = lbrkprop_lookup (uc);
768 if (prop == LBP_BK)
770 /* Mandatory break. */
771 *p = UC_BREAK_MANDATORY;
772 last_prop = LBP_BK;
773 seen_space = NULL;
774 seen_space2 = NULL;
776 else
778 char *q;
780 /* Resolve property values whose behaviour is not fixed. */
781 switch (prop)
783 case LBP_AI:
784 /* Resolve ambiguous. */
785 prop = LBP_AI_REPLACEMENT;
786 break;
787 case LBP_CB:
788 /* This is arbitrary. */
789 prop = LBP_ID;
790 break;
791 case LBP_SA:
792 /* We don't handle complex scripts yet.
793 Treat LBP_SA like LBP_XX. */
794 case LBP_XX:
795 /* This is arbitrary. */
796 prop = LBP_AL;
797 break;
800 /* Deal with combining characters. */
801 q = p;
802 if (prop == LBP_CM)
804 /* Don't break just before a combining character. */
805 *p = UC_BREAK_PROHIBITED;
806 /* A combining character turns a preceding space into LBP_AL. */
807 if (seen_space != NULL)
809 q = seen_space;
810 seen_space = seen_space2;
811 prop = LBP_AL;
812 goto lookup_via_table;
815 else if (prop == LBP_SP)
817 /* Don't break just before a space. */
818 *p = UC_BREAK_PROHIBITED;
819 seen_space2 = seen_space;
820 seen_space = p;
822 else
824 lookup_via_table:
825 /* prop must be usable as an index for table 7.3 of UTR #14. */
826 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
827 abort ();
829 if (last_prop == LBP_BK)
831 /* Don't break at the beginning of a line. */
832 *q = UC_BREAK_PROHIBITED;
834 else
836 switch (lbrk_table [last_prop-1] [prop-1])
838 case D:
839 *q = UC_BREAK_POSSIBLE;
840 break;
841 case I:
842 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
843 break;
844 case P:
845 *q = UC_BREAK_PROHIBITED;
846 break;
847 default:
848 abort ();
851 last_prop = prop;
852 seen_space = NULL;
853 seen_space2 = NULL;
857 s += count;
858 p += count;
862 void
863 u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
865 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
866 const unsigned int *s_end = s + n;
867 int last_prop = LBP_BK; /* line break property of last non-space character */
868 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
869 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
871 while (s < s_end)
873 unsigned int uc = *s;
874 int prop = lbrkprop_lookup (uc);
876 if (prop == LBP_BK)
878 /* Mandatory break. */
879 *p = UC_BREAK_MANDATORY;
880 last_prop = LBP_BK;
881 seen_space = NULL;
882 seen_space2 = NULL;
884 else
886 char *q;
888 /* Resolve property values whose behaviour is not fixed. */
889 switch (prop)
891 case LBP_AI:
892 /* Resolve ambiguous. */
893 prop = LBP_AI_REPLACEMENT;
894 break;
895 case LBP_CB:
896 /* This is arbitrary. */
897 prop = LBP_ID;
898 break;
899 case LBP_SA:
900 /* We don't handle complex scripts yet.
901 Treat LBP_SA like LBP_XX. */
902 case LBP_XX:
903 /* This is arbitrary. */
904 prop = LBP_AL;
905 break;
908 /* Deal with combining characters. */
909 q = p;
910 if (prop == LBP_CM)
912 /* Don't break just before a combining character. */
913 *p = UC_BREAK_PROHIBITED;
914 /* A combining character turns a preceding space into LBP_AL. */
915 if (seen_space != NULL)
917 q = seen_space;
918 seen_space = seen_space2;
919 prop = LBP_AL;
920 goto lookup_via_table;
923 else if (prop == LBP_SP)
925 /* Don't break just before a space. */
926 *p = UC_BREAK_PROHIBITED;
927 seen_space2 = seen_space;
928 seen_space = p;
930 else
932 lookup_via_table:
933 /* prop must be usable as an index for table 7.3 of UTR #14. */
934 if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
935 abort ();
937 if (last_prop == LBP_BK)
939 /* Don't break at the beginning of a line. */
940 *q = UC_BREAK_PROHIBITED;
942 else
944 switch (lbrk_table [last_prop-1] [prop-1])
946 case D:
947 *q = UC_BREAK_POSSIBLE;
948 break;
949 case I:
950 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
951 break;
952 case P:
953 *q = UC_BREAK_PROHIBITED;
954 break;
955 default:
956 abort ();
959 last_prop = prop;
960 seen_space = NULL;
961 seen_space2 = NULL;
965 s++;
966 p++;
970 #endif
973 /* Choose the best line breaks, assuming the uc_width function.
974 Return the column after the end of the string. */
977 u8_width_linebreaks (const unsigned char *s, size_t n,
978 int width, int start_column, int at_end_columns,
979 const char *o, const char *encoding,
980 char *p)
982 const unsigned char *s_end;
983 char *last_p;
984 int last_column;
985 int piece_width;
987 u8_possible_linebreaks (s, n, encoding, p);
989 s_end = s + n;
990 last_p = NULL;
991 last_column = start_column;
992 piece_width = 0;
993 while (s < s_end)
995 unsigned int uc;
996 int count = u8_mbtouc (&uc, s, s_end - s);
998 /* Respect the override. */
999 if (o != NULL && *o != UC_BREAK_UNDEFINED)
1000 *p = *o;
1002 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1004 /* An atomic piece of text ends here. */
1005 if (last_p != NULL && last_column + piece_width > width)
1007 /* Insert a line break. */
1008 *last_p = UC_BREAK_POSSIBLE;
1009 last_column = 0;
1013 if (*p == UC_BREAK_MANDATORY)
1015 /* uc is a line break character. */
1016 /* Start a new piece at column 0. */
1017 last_p = NULL;
1018 last_column = 0;
1019 piece_width = 0;
1021 else
1023 /* uc is not a line break character. */
1024 int w;
1026 if (*p == UC_BREAK_POSSIBLE)
1028 /* Start a new piece. */
1029 last_p = p;
1030 last_column += piece_width;
1031 piece_width = 0;
1032 /* No line break for the moment, may be turned into
1033 UC_BREAK_POSSIBLE later, via last_p. */
1036 *p = UC_BREAK_PROHIBITED;
1038 w = uc_width (uc, encoding);
1039 if (w >= 0) /* ignore control characters in the string */
1040 piece_width += w;
1043 s += count;
1044 p += count;
1045 if (o != NULL)
1046 o += count;
1049 /* The last atomic piece of text ends here. */
1050 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1052 /* Insert a line break. */
1053 *last_p = UC_BREAK_POSSIBLE;
1054 last_column = 0;
1057 return last_column + piece_width;
1060 #ifdef unused
1063 u16_width_linebreaks (const unsigned short *s, size_t n,
1064 int width, int start_column, int at_end_columns,
1065 const char *o, const char *encoding,
1066 char *p)
1068 const unsigned short *s_end;
1069 char *last_p;
1070 int last_column;
1071 int piece_width;
1073 u16_possible_linebreaks (s, n, encoding, p);
1075 s_end = s + n;
1076 last_p = NULL;
1077 last_column = start_column;
1078 piece_width = 0;
1079 while (s < s_end)
1081 unsigned int uc;
1082 int count = u16_mbtouc (&uc, s, s_end - s);
1084 /* Respect the override. */
1085 if (o != NULL && *o != UC_BREAK_UNDEFINED)
1086 *p = *o;
1088 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1090 /* An atomic piece of text ends here. */
1091 if (last_p != NULL && last_column + piece_width > width)
1093 /* Insert a line break. */
1094 *last_p = UC_BREAK_POSSIBLE;
1095 last_column = 0;
1099 if (*p == UC_BREAK_MANDATORY)
1101 /* uc is a line break character. */
1102 /* Start a new piece at column 0. */
1103 last_p = NULL;
1104 last_column = 0;
1105 piece_width = 0;
1107 else
1109 /* uc is not a line break character. */
1110 int w;
1112 if (*p == UC_BREAK_POSSIBLE)
1114 /* Start a new piece. */
1115 last_p = p;
1116 last_column += piece_width;
1117 piece_width = 0;
1118 /* No line break for the moment, may be turned into
1119 UC_BREAK_POSSIBLE later, via last_p. */
1122 *p = UC_BREAK_PROHIBITED;
1124 w = uc_width (uc, encoding);
1125 if (w >= 0) /* ignore control characters in the string */
1126 piece_width += w;
1129 s += count;
1130 p += count;
1131 if (o != NULL)
1132 o += count;
1135 /* The last atomic piece of text ends here. */
1136 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1138 /* Insert a line break. */
1139 *last_p = UC_BREAK_POSSIBLE;
1140 last_column = 0;
1143 return last_column + piece_width;
1147 u32_width_linebreaks (const unsigned int *s, size_t n,
1148 int width, int start_column, int at_end_columns,
1149 const char *o, const char *encoding,
1150 char *p)
1152 const unsigned int *s_end;
1153 char *last_p;
1154 int last_column;
1155 int piece_width;
1157 u32_possible_linebreaks (s, n, encoding, p);
1159 s_end = s + n;
1160 last_p = NULL;
1161 last_column = start_column;
1162 piece_width = 0;
1163 while (s < s_end)
1165 unsigned int uc = *s;
1167 /* Respect the override. */
1168 if (o != NULL && *o != UC_BREAK_UNDEFINED)
1169 *p = *o;
1171 if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1173 /* An atomic piece of text ends here. */
1174 if (last_p != NULL && last_column + piece_width > width)
1176 /* Insert a line break. */
1177 *last_p = UC_BREAK_POSSIBLE;
1178 last_column = 0;
1182 if (*p == UC_BREAK_MANDATORY)
1184 /* uc is a line break character. */
1185 /* Start a new piece at column 0. */
1186 last_p = NULL;
1187 last_column = 0;
1188 piece_width = 0;
1190 else
1192 /* uc is not a line break character. */
1193 int w;
1195 if (*p == UC_BREAK_POSSIBLE)
1197 /* Start a new piece. */
1198 last_p = p;
1199 last_column += piece_width;
1200 piece_width = 0;
1201 /* No line break for the moment, may be turned into
1202 UC_BREAK_POSSIBLE later, via last_p. */
1205 *p = UC_BREAK_PROHIBITED;
1207 w = uc_width (uc, encoding);
1208 if (w >= 0) /* ignore control characters in the string */
1209 piece_width += w;
1212 s++;
1213 p++;
1214 if (o != NULL)
1215 o++;
1218 /* The last atomic piece of text ends here. */
1219 if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1221 /* Insert a line break. */
1222 *last_p = UC_BREAK_POSSIBLE;
1223 last_column = 0;
1226 return last_column + piece_width;
1229 #endif
1232 #ifdef TEST1
1234 #include <stdio.h>
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.  The result is allocated with realloc; the caller owns it and
   releases it with free.  On a read error or when memory runs out, a
   message is printed to stderr and the process exits with status 1.  */
char *
read_file (FILE *stream)
{
#define BUFSIZE 4096
  char *buf = NULL;
  /* Sizes are kept in size_t: fread returns size_t, and int counters would
     overflow on inputs of 2 GiB or more.  */
  size_t alloc = 0;
  size_t size = 0;
  char *resized;

  while (! feof (stream))
    {
      size_t count;

      /* Make sure there is room for one more chunk of BUFSIZE bytes.
         size <= alloc always holds, so the subtraction cannot wrap.  */
      if (alloc - size < BUFSIZE)
        {
          size_t wanted = alloc + alloc / 2;

          /* Refuse sizes that cannot be represented (the extra 1 leaves
             room for the terminating NUL).  */
          if (size > (size_t)(-1) - BUFSIZE - 1)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          /* Grow by 50%, but at least enough for the next chunk; also
             covers the case where the 50% growth wrapped around.  */
          if (wanted < alloc || wanted < size + BUFSIZE)
            wanted = size + BUFSIZE;
          /* Keep the old pointer until realloc is known to have
             succeeded.  */
          resized = realloc (buf, wanted);
          if (resized == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = resized;
          alloc = wanted;
        }
      count = fread (buf + size, 1, BUFSIZE, stream);
      if (count == 0)
        {
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
        }
      else
        size += count;
    }
  /* Trim (or create) the buffer so that it holds the data plus the NUL.  */
  resized = realloc (buf, size + 1);
  if (resized == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  buf = resized;
  buf[size] = '\0';
  return buf;
#undef BUFSIZE
}
1285 main (int argc, char * argv[])
1287 if (argc == 1)
1289 /* Display all the break opportunities in the input string. */
1290 char *input = read_file (stdin);
1291 int length = strlen (input);
1292 char *breaks = malloc (length);
1293 int i;
1295 u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
1297 for (i = 0; i < length; i++)
1299 switch (breaks[i])
1301 case UC_BREAK_POSSIBLE:
1302 /* U+2027 in UTF-8 encoding */
1303 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
1304 break;
1305 case UC_BREAK_MANDATORY:
1306 /* U+21B2 (or U+21B5) in UTF-8 encoding */
1307 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
1308 break;
1309 case UC_BREAK_PROHIBITED:
1310 break;
1311 default:
1312 abort ();
1314 putc (input[i], stdout);
1317 free (breaks);
1319 return 0;
1321 else if (argc == 2)
1323 /* Insert line breaks for a given width. */
1324 int width = atoi (argv[1]);
1325 char *input = read_file (stdin);
1326 int length = strlen (input);
1327 char *breaks = malloc (length);
1328 int i;
1330 u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
1332 for (i = 0; i < length; i++)
1334 switch (breaks[i])
1336 case UC_BREAK_POSSIBLE:
1337 putc ('\n', stdout);
1338 break;
1339 case UC_BREAK_MANDATORY:
1340 break;
1341 case UC_BREAK_PROHIBITED:
1342 break;
1343 default:
1344 abort ();
1346 putc (input[i], stdout);
1349 free (breaks);
1351 return 0;
1353 else
1354 return 1;
1357 #endif /* TEST1 */
1360 /* Now the same thing with an arbitrary encoding.
1362 We convert the input string to Unicode.
1364 The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
1365 UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to
1366 \U0000FFFF. UTF-16 and variants support only characters up to
1367 \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1.
1368 UCS-4 specification leaves doubts about endianness and byte order mark.
1369 glibc currently interprets it as big endian without byte order mark,
1370 but this is not backed by an RFC. So we use UTF-8. It supports
1371 characters up to \U7FFFFFFF and is unambiguously defined. */
1373 #if HAVE_ICONV
1375 #include <iconv.h>
1376 #include <errno.h>
1378 /* Luckily, the encoding's name is platform independent. */
1379 #define UTF8_NAME "UTF-8"
1381 /* Return the length of a string after conversion through an iconv_t. */
1382 static size_t
1383 iconv_string_length (iconv_t cd, const char *s, size_t n)
1385 #define TMPBUFSIZE 4096
1386 size_t count = 0;
1387 char tmpbuf[TMPBUFSIZE];
1388 const char *inptr = s;
1389 size_t insize = n;
1390 while (insize > 0)
1392 char *outptr = tmpbuf;
1393 size_t outsize = TMPBUFSIZE;
1394 size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
1395 if (res == (size_t)(-1) && errno != E2BIG)
1396 return (size_t)(-1);
1397 count += outptr - tmpbuf;
1399 /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug. */
1400 #if defined _LIBICONV_VERSION \
1401 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
1403 char *outptr = tmpbuf;
1404 size_t outsize = TMPBUFSIZE;
1405 size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
1406 if (res == (size_t)(-1))
1407 return (size_t)(-1);
1408 count += outptr - tmpbuf;
1410 /* Return to the initial state. */
1411 iconv (cd, NULL, NULL, NULL, NULL);
1412 #endif
1413 return count;
1414 #undef TMPBUFSIZE
1417 static void
1418 iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
1419 size_t *offtable, char *t, size_t m)
1421 size_t i;
1422 const char *s_end;
1423 const char *inptr;
1424 char *outptr;
1425 size_t outsize;
1426 /* Avoid glibc-2.1 bug. */
1427 #if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
1428 const size_t extra = 1;
1429 #else
1430 const size_t extra = 0;
1431 #endif
1433 for (i = 0; i < n; i++)
1434 offtable[i] = (size_t)(-1);
1436 s_end = s + n;
1437 inptr = s;
1438 outptr = t;
1439 outsize = m + extra;
1440 while (inptr < s_end)
1442 const char *saved_inptr;
1443 size_t insize;
1444 size_t res;
1446 offtable[inptr - s] = outptr - t;
1448 saved_inptr = inptr;
1449 res = (size_t)(-1);
1450 for (insize = 1; inptr + insize <= s_end; insize++)
1452 res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
1453 if (!(res == (size_t)(-1) && errno == EINVAL))
1454 break;
1455 /* We expect that no input bytes have been consumed so far. */
1456 if (inptr != saved_inptr)
1457 abort ();
1459 /* After we verified the convertibility and computed the translation's
1460 size m, there shouldn't be any conversion error here. */
1461 if (res == (size_t)(-1))
1462 abort ();
1464 /* Avoid glibc-2.1 bug and Solaris 7 bug. */
1465 #if defined _LIBICONV_VERSION \
1466 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
1467 if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
1468 abort ();
1469 #endif
1470 /* We should have produced exactly m output bytes. */
1471 if (outsize != extra)
1472 abort ();
1475 #endif /* HAVE_ICONV */
1477 #if C_CTYPE_ASCII
/* Tests whether the N bytes at S are all printable or whitespace characters
   according to c_isprint / c_isspace.  Returns 1 if yes.
   Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding,
   i.e. as soon as any other byte is found.  */
static int
is_all_ascii (const char *s, size_t n)
{
  const char *const end = s + n;
  const char *cur;

  for (cur = s; cur < end; cur++)
    {
      unsigned char c = (unsigned char) *cur;

      if (!c_isprint (c) && !c_isspace (c))
        return 0;
    }
  return 1;
}
1494 #endif /* C_CTYPE_ASCII */
1496 #if defined unused || defined TEST2
1498 void
1499 mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
1500 char *p)
1502 if (n == 0)
1503 return;
1504 if (is_utf8_encoding (encoding))
1505 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1506 else
1508 #if HAVE_ICONV
1509 iconv_t to_utf8;
1510 /* Avoid glibc-2.1 bug with EUC-KR. */
1511 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1512 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1513 to_utf8 = (iconv_t)(-1);
1514 else
1515 # endif
1516 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1517 GB18030. */
1518 # if defined __sun && !defined _LIBICONV_VERSION
1519 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1520 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1521 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1522 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1523 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1524 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1525 to_utf8 = (iconv_t)(-1);
1526 else
1527 # endif
1528 to_utf8 = iconv_open (UTF8_NAME, encoding);
1529 if (to_utf8 != (iconv_t)(-1))
1531 /* Determine the length of the resulting UTF-8 string. */
1532 size_t m = iconv_string_length (to_utf8, s, n);
1533 if (m != (size_t)(-1))
1535 /* Convert the string to UTF-8 and build a translation table
1536 from offsets into s to offsets into the translated string. */
1537 size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
1538 char *memory =
1539 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1540 if (memory != NULL)
1542 size_t *offtable = (size_t *) memory;
1543 char *t = (char *) (offtable + n);
1544 char *q = (char *) (t + m);
1545 size_t i;
1547 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1549 /* Determine the possible line breaks of the UTF-8 string. */
1550 u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
1552 /* Translate the result back to the original string. */
1553 memset (p, UC_BREAK_PROHIBITED, n);
1554 for (i = 0; i < n; i++)
1555 if (offtable[i] != (size_t)(-1))
1556 p[i] = q[offtable[i]];
1558 free (memory);
1559 iconv_close (to_utf8);
1560 return;
1563 iconv_close (to_utf8);
1565 #endif
1566 /* Impossible to convert. */
1567 #if C_CTYPE_ASCII
1568 if (is_all_ascii (s, n))
1570 /* ASCII is a subset of UTF-8. */
1571 u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1572 return;
1574 #endif
1575 /* We have a non-ASCII string and cannot convert it.
1576 Don't produce line breaks except those already present in the
1577 input string. All we assume here is that the encoding is
1578 minimally ASCII compatible. */
1580 const char *s_end = s + n;
1581 while (s < s_end)
1583 *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
1584 s++;
1585 p++;
1591 #endif
1594 mbs_width_linebreaks (const char *s, size_t n,
1595 int width, int start_column, int at_end_columns,
1596 const char *o, const char *encoding,
1597 char *p)
1599 if (n == 0)
1600 return start_column;
1601 if (is_utf8_encoding (encoding))
1602 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1603 else
1605 #if HAVE_ICONV
1606 iconv_t to_utf8;
1607 /* Avoid glibc-2.1 bug with EUC-KR. */
1608 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1609 if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1610 to_utf8 = (iconv_t)(-1);
1611 else
1612 # endif
1613 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1614 GB18030. */
1615 # if defined __sun && !defined _LIBICONV_VERSION
1616 if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1617 || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1618 || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1619 || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1620 || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1621 || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1622 to_utf8 = (iconv_t)(-1);
1623 else
1624 # endif
1625 to_utf8 = iconv_open (UTF8_NAME, encoding);
1626 if (to_utf8 != (iconv_t)(-1))
1628 /* Determine the length of the resulting UTF-8 string. */
1629 size_t m = iconv_string_length (to_utf8, s, n);
1630 if (m != (size_t)(-1))
1632 /* Convert the string to UTF-8 and build a translation table
1633 from offsets into s to offsets into the translated string. */
1634 size_t memory_size =
1635 xsum4 (xtimes (n, sizeof (size_t)), m, m,
1636 (o != NULL ? m : 0));
1637 char *memory =
1638 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1639 if (memory != NULL)
1641 size_t *offtable = (size_t *) memory;
1642 char *t = (char *) (offtable + n);
1643 char *q = (char *) (t + m);
1644 char *o8 = (o != NULL ? (char *) (q + m) : NULL);
1645 int res_column;
1646 size_t i;
1648 iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1650 /* Translate the overrides to the UTF-8 string. */
1651 if (o != NULL)
1653 memset (o8, UC_BREAK_UNDEFINED, m);
1654 for (i = 0; i < n; i++)
1655 if (offtable[i] != (size_t)(-1))
1656 o8[offtable[i]] = o[i];
1659 /* Determine the line breaks of the UTF-8 string. */
1660 res_column =
1661 u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
1663 /* Translate the result back to the original string. */
1664 memset (p, UC_BREAK_PROHIBITED, n);
1665 for (i = 0; i < n; i++)
1666 if (offtable[i] != (size_t)(-1))
1667 p[i] = q[offtable[i]];
1669 free (memory);
1670 iconv_close (to_utf8);
1671 return res_column;
1674 iconv_close (to_utf8);
1676 #endif
1677 /* Impossible to convert. */
1678 #if C_CTYPE_ASCII
1679 if (is_all_ascii (s, n))
1681 /* ASCII is a subset of UTF-8. */
1682 return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1684 #endif
1685 /* We have a non-ASCII string and cannot convert it.
1686 Don't produce line breaks except those already present in the
1687 input string. All we assume here is that the encoding is
1688 minimally ASCII compatible. */
1690 const char *s_end = s + n;
1691 while (s < s_end)
1693 *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
1694 ? UC_BREAK_MANDATORY
1695 : UC_BREAK_PROHIBITED);
1696 s++;
1697 p++;
1698 if (o != NULL)
1699 o++;
1701 /* We cannot compute widths in this case. */
1702 return start_column;
1708 #ifdef TEST2
1710 #include <stdio.h>
1711 #include <locale.h>
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.  The result is allocated with realloc; the caller owns it and
   releases it with free.  On a read error or when memory runs out, a
   message is printed to stderr and the process exits with status 1.  */
char *
read_file (FILE *stream)
{
#define BUFSIZE 4096
  char *buf = NULL;
  /* Sizes are kept in size_t: fread returns size_t, and int counters would
     overflow on inputs of 2 GiB or more.  */
  size_t alloc = 0;
  size_t size = 0;
  char *resized;

  while (! feof (stream))
    {
      size_t count;

      /* Make sure there is room for one more chunk of BUFSIZE bytes.
         size <= alloc always holds, so the subtraction cannot wrap.  */
      if (alloc - size < BUFSIZE)
        {
          size_t wanted = alloc + alloc / 2;

          /* Refuse sizes that cannot be represented (the extra 1 leaves
             room for the terminating NUL).  */
          if (size > (size_t)(-1) - BUFSIZE - 1)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          /* Grow by 50%, but at least enough for the next chunk; also
             covers the case where the 50% growth wrapped around.  */
          if (wanted < alloc || wanted < size + BUFSIZE)
            wanted = size + BUFSIZE;
          /* Keep the old pointer until realloc is known to have
             succeeded.  */
          resized = realloc (buf, wanted);
          if (resized == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = resized;
          alloc = wanted;
        }
      count = fread (buf + size, 1, BUFSIZE, stream);
      if (count == 0)
        {
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
        }
      else
        size += count;
    }
  /* Trim (or create) the buffer so that it holds the data plus the NUL.  */
  resized = realloc (buf, size + 1);
  if (resized == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  buf = resized;
  buf[size] = '\0';
  return buf;
#undef BUFSIZE
}
1762 main (int argc, char * argv[])
1764 setlocale (LC_CTYPE, "");
1765 if (argc == 1)
1767 /* Display all the break opportunities in the input string. */
1768 char *input = read_file (stdin);
1769 int length = strlen (input);
1770 char *breaks = malloc (length);
1771 int i;
1773 mbs_possible_linebreaks (input, length, locale_charset (), breaks);
1775 for (i = 0; i < length; i++)
1777 switch (breaks[i])
1779 case UC_BREAK_POSSIBLE:
1780 putc ('|', stdout);
1781 break;
1782 case UC_BREAK_MANDATORY:
1783 break;
1784 case UC_BREAK_PROHIBITED:
1785 break;
1786 default:
1787 abort ();
1789 putc (input[i], stdout);
1792 free (breaks);
1794 return 0;
1796 else if (argc == 2)
1798 /* Insert line breaks for a given width. */
1799 int width = atoi (argv[1]);
1800 char *input = read_file (stdin);
1801 int length = strlen (input);
1802 char *breaks = malloc (length);
1803 int i;
1805 mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
1807 for (i = 0; i < length; i++)
1809 switch (breaks[i])
1811 case UC_BREAK_POSSIBLE:
1812 putc ('\n', stdout);
1813 break;
1814 case UC_BREAK_MANDATORY:
1815 break;
1816 case UC_BREAK_PROHIBITED:
1817 break;
1818 default:
1819 abort ();
1821 putc (input[i], stdout);
1824 free (breaks);
1826 return 0;
1828 else
1829 return 1;
1832 #endif /* TEST2 */