1 /* linebreak.c - line breaking of Unicode strings
2 Copyright (C) 2001-2003 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
24 #include "linebreak.h"
31 #include "utf8-ucs4.h"
34 #include "utf16-ucs4.h"
/* u32_mbtouc: takes an output pointer PUC, an array S of unsigned int units
   and a unit count N.
   NOTE(review): only the parameter list is visible in this excerpt; the
   return type and the body are not shown.  */
37 u32_mbtouc (unsigned int *puc
, const unsigned int *s
, size_t n
)
45 /* Help GCC to generate good code for string comparisons with immediate strings. */
47 #if defined (__GNUC__) && defined (__OPTIMIZE__)
/* Compare the tails of S1 and S2, starting at byte offset 9 of each.
   S1 and S2 must both be valid for at least 9 bytes before their
   terminating NUL, because the offset is applied unconditionally.
   Return 1 if the tails are equal, 0 otherwise.  */
static inline int
streq9 (const char *s1, const char *s2)
{
  return strcmp (s1 + 9, s2 + 9) == 0;
}
/* streq8: link of the STREQ helper chain taking S1, S2 and one character
   argument S28; the visible statement delegates to streq9 (S1, S2).
   NOTE(review): the return type, braces and any use of S28 are not
   visible in this excerpt.  */
56 streq8 (const char *s1
, const char *s2
, char s28
)
63 return streq9 (s1
, s2
);
/* streq7: link of the STREQ helper chain taking S1, S2 and the character
   arguments S27 and S28; the visible statement passes S1, S2 and S28 on
   to streq8.
   NOTE(review): the return type, braces and any use of S27 are not
   visible in this excerpt.  */
70 streq7 (const char *s1
, const char *s2
, char s27
, char s28
)
77 return streq8 (s1
, s2
, s28
);
/* streq6: link of the STREQ helper chain taking S1, S2 and the character
   arguments S26..S28; the visible statement passes S1, S2, S27 and S28 on
   to streq7.
   NOTE(review): the return type, braces and any use of S26 are not
   visible in this excerpt.  */
84 streq6 (const char *s1
, const char *s2
, char s26
, char s27
, char s28
)
91 return streq7 (s1
, s2
, s27
, s28
);
/* streq5: link of the STREQ helper chain taking S1, S2 and the character
   arguments S25..S28; the visible statement passes S1, S2 and S26..S28 on
   to streq6.
   NOTE(review): the return type, braces and any use of S25 are not
   visible in this excerpt.  */
98 streq5 (const char *s1
, const char *s2
, char s25
, char s26
, char s27
, char s28
)
105 return streq6 (s1
, s2
, s26
, s27
, s28
);
/* streq4: link of the STREQ helper chain taking S1, S2 and the character
   arguments S24..S28; the visible statement passes S1, S2 and S25..S28 on
   to streq5.
   NOTE(review): the return type, braces and any use of S24 are not
   visible in this excerpt.  */
112 streq4 (const char *s1
, const char *s2
, char s24
, char s25
, char s26
, char s27
, char s28
)
119 return streq5 (s1
, s2
, s25
, s26
, s27
, s28
);
/* streq3: link of the STREQ helper chain taking S1, S2 and the character
   arguments S23..S28; the visible statement passes S1, S2 and S24..S28 on
   to streq4.
   NOTE(review): the return type, braces and any use of S23 are not
   visible in this excerpt.  */
126 streq3 (const char *s1
, const char *s2
, char s23
, char s24
, char s25
, char s26
, char s27
, char s28
)
133 return streq4 (s1
, s2
, s24
, s25
, s26
, s27
, s28
);
/* streq2: link of the STREQ helper chain taking S1, S2 and the character
   arguments S22..S28; the visible statement passes S1, S2 and S23..S28 on
   to streq3.
   NOTE(review): the return type, braces and any use of S22 are not
   visible in this excerpt.  */
140 streq2 (const char *s1
, const char *s2
, char s22
, char s23
, char s24
, char s25
, char s26
, char s27
, char s28
)
147 return streq3 (s1
, s2
, s23
, s24
, s25
, s26
, s27
, s28
);
/* streq1: link of the STREQ helper chain taking S1, S2 and the character
   arguments S21..S28; the visible statement passes S1, S2 and S22..S28 on
   to streq2.
   NOTE(review): the return type, braces and any use of S21 are not
   visible in this excerpt.  */
154 streq1 (const char *s1
, const char *s2
, char s21
, char s22
, char s23
, char s24
, char s25
, char s26
, char s27
, char s28
)
161 return streq2 (s1
, s2
, s22
, s23
, s24
, s25
, s26
, s27
, s28
);
/* streq0: entry of the STREQ helper chain (the STREQ macro expands to a
   call of it).  Takes S1, S2 and the character arguments S20..S28; the
   visible statement passes S1, S2 and S21..S28 on to streq1.
   NOTE(review): the return type, braces and any use of S20 are not
   visible in this excerpt.  */
168 streq0 (const char *s1
, const char *s2
, char s20
, char s21
, char s22
, char s23
, char s24
, char s25
, char s26
, char s27
, char s28
)
175 return streq1 (s1
, s2
, s21
, s22
, s23
, s24
, s25
, s26
, s27
, s28
);
/* STREQ (S1, S2, S20, ..., S28): string equality test.  This variant
   expands to a call of streq0 and hands all eleven arguments through.  */
181 #define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
182 streq0 (s1, s2, s20, s21, s22, s23, s24, s25, s26, s27, s28)
/* Alternative definition of STREQ: a plain strcmp comparison of S1 and S2
   that does not use the nine character arguments.
   NOTE(review): the preprocessor directives that select between the two
   definitions are not visible in this excerpt.  */
186 #define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
187 (strcmp (s1, s2) == 0)
/* is_cjk_encoding: compares ENCODING, via STREQ, against the names of
   legacy CJK encodings: EUC-JP, GB2312, GBK, EUC-TW, BIG5, EUC-KR, CP949
   and JOHAB.  Each STREQ call spells the name out again as individual
   characters padded with 0.
   NOTE(review): the return type, the opening of the condition and the
   statements that follow it are not visible in this excerpt.  */
193 is_cjk_encoding (const char *encoding
)
196 /* Legacy Japanese encodings */
197 || STREQ (encoding
, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)
198 /* Legacy Chinese encodings */
199 || STREQ (encoding
, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
200 || STREQ (encoding
, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
201 || STREQ (encoding
, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
202 || STREQ (encoding
, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
203 /* Legacy Korean encodings */
204 || STREQ (encoding
, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
205 || STREQ (encoding
, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0)
206 || STREQ (encoding
, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0))
/* is_utf8_encoding: tests ENCODING against the name "UTF-8" via STREQ.
   NOTE(review): the return type and the statements selected by the test
   are not visible in this excerpt.  */
212 is_utf8_encoding (const char *encoding
)
214 if (STREQ (encoding
, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0))
220 /* Determine number of column positions required for UC.
   ENCODING is the name of the character encoding; the definition passes it
   to is_cjk_encoding.  This is a forward declaration only.  */
221 int uc_width (unsigned int uc
, const char *encoding
);
/*
 * Non-spacing attribute table.
 * Consists of:
 * - Non-spacing characters; generated from PropList.txt or
 *   "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
 * - Format control characters; generated from
 *   "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
 * - Zero width characters; generated from
 *   "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
 *
 * Layout: 16 blocks of 64 bytes.  Each block is a bitmap over 512
 * consecutive code points: byte (uc >> 3) & 63 of the block, bit uc & 7.
 * The range covered by each row of 8 bytes is given in its trailing comment.
 */
static const unsigned char nonspacing_table_data[16*64] = {
  /* 0x0000-0x01ff */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x20, 0x00, 0x00, /* 0x0080-0x00bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */
  /* 0x0200-0x03ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */
  0xff, 0xff, 0xff, 0xe0, 0xff, 0xff, 0x00, 0x00, /* 0x0340-0x037f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */
  /* 0x0400-0x05ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */
  0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */
  0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */
  0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
  /* 0x0600-0x07ff */
  0x0f, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
  0x00, 0xf8, 0xff, 0x01, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
  0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
  0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
  0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
  0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */
  /* 0x0800-0x09ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */
  0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
  0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */
  /* 0x0a00-0x0bff */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */
  0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */
  0xbe, 0x21, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */
  0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */
  0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */
  0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */
  /* 0x0c00-0x0dff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */
  0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0c80-0x0cbf */
  0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */
  0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */
  0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */
  /* 0x0e00-0x0fff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */
  0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */
  0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */
  0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */
  0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */
  /* 0x1000-0x11ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */
  /* 0x1600-0x17ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */
  0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, /* 0x1700-0x173f */
  0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, /* 0x1740-0x177f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x3f, /* 0x1780-0x17bf */
  0x40, 0xfe, 0x0f, 0x20, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */
  /* 0x1800-0x19ff */
  0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */
  0x00, 0x00, 0x00, 0x00, 0x87, 0x0f, 0x04, 0x0e, /* 0x1900-0x193f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */
  /* 0x2000-0x21ff */
  0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */
  0x00, 0x00, 0x00, 0x00, 0x0f, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */
  0x00, 0x00, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00, /* 0x20c0-0x20ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */
  /* 0x3000-0x31ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */
  0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */
  /* 0xfa00-0xfbff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */
  0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */
  /* 0xfe00-0xffff */
  0xff, 0xff, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, /* 0xffc0-0xffff */
  /* 0x1d000-0x1d1ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d000-0x1d03f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d040-0x1d07f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d080-0x1d0bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d0c0-0x1d0ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d100-0x1d13f */
  0x00, 0x00, 0x00, 0x00, 0x80, 0x03, 0x00, 0xf8, /* 0x1d140-0x1d17f */
  0xe7, 0x0f, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, /* 0x1d180-0x1d1bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* 0x1d1c0-0x1d1ff */
};
/* Index into nonspacing_table_data, one entry per 512 code points
   (entry number uc >> 9).  A value >= 0 is the number of the 64-byte block
   in nonspacing_table_data that holds the bitmap for that range; the data
   offset is 64 times that value.  -1 marks a range that has no block.
   Each row below covers 8 entries, i.e. 0x1000 code points.  */
static const signed char nonspacing_table_ind[240] = {
   0,  1,  2,  3,  4,  5,  6,  7, /* 0x0000-0x0fff */
   8, -1, -1,  9, 10, -1, -1, -1, /* 0x1000-0x1fff */
  11, -1, -1, -1, -1, -1, -1, -1, /* 0x2000-0x2fff */
  12, -1, -1, -1, -1, -1, -1, -1, /* 0x3000-0x3fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x4000-0x4fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x5000-0x5fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x6000-0x6fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x7000-0x7fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x8000-0x8fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x9000-0x9fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa000-0xafff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb000-0xbfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xc000-0xcfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xd000-0xdfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */
  -1, -1, -1, -1, -1, 13, -1, 14, /* 0xf000-0xffff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10000-0x10fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x11000-0x11fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x12000-0x12fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x13000-0x13fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x14000-0x14fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x15000-0x15fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x16000-0x16fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x17000-0x17fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x18000-0x18fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x19000-0x19fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1a000-0x1afff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1b000-0x1bfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1c000-0x1cfff */
  15, -1, -1, -1, -1, -1, -1, -1  /* 0x1d000-0x1dfff */
};
/* uc_width (UC, ENCODING): column width of the code point UC.
   Visible steps, in order:
   1. non-spacing / control test through nonspacing_table_ind and
      nonspacing_table_data, with a separate sub-test for 0 < UC < 0xa0;
   2. a range test for the 512-code-point block that contains 0xe0000;
   3. a double-width test over a list of East Asian ranges;
   4. a final test that treats 0x00A1 <= UC < 0xFF61 (except 0x20A9)
      specially when is_cjk_encoding (ENCODING) holds.
   NOTE(review): the return type, the range guards, every return statement
   and parts of the conditions are not visible in this excerpt, so the
   value produced by each step cannot be read off here.  */
412 /* Determine number of column positions required for UC. */
414 uc_width (unsigned int uc
, const char *encoding
)
416 /* Test for non-spacing or control character. */
/* uc >> 9 selects the index entry for the 512-code-point range of UC.  */
419 int ind
= nonspacing_table_ind
[uc
>> 9];
/* Within block IND: byte (uc >> 3) & 63, bit uc & 7.  */
421 if ((nonspacing_table_data
[64*ind
+ ((uc
>> 3) & 63)] >> (uc
& 7)) & 1)
423 if (uc
> 0 && uc
< 0xa0)
429 else if ((uc
>> 9) == (0xe0000 >> 9))
432 ? (uc
>= 0xe0020 ? uc
<= 0xe007f : uc
== 0xe0001)
436 /* Test for double-width character.
437 * Generated from "grep '^....;[WF]' EastAsianWidth.txt"
438 * and "grep '^....;[^WF]' EastAsianWidth.txt"
441 && ((uc
< 0x1160) /* Hangul Jamo */
442 || (uc
>= 0x2e80 && uc
< 0x4dc0 /* CJK */
444 || (uc
>= 0x4e00 && uc
< 0xa4d0) /* CJK ... Yi */
445 || (uc
>= 0xac00 && uc
< 0xd7a4) /* Hangul Syllables */
446 || (uc
>= 0xf900 && uc
< 0xfb00) /* CJK Compatibility Ideographs */
447 || (uc
>= 0xfe30 && uc
< 0xfe70) /* CJK Compatibility Forms */
448 || (uc
>= 0xff00 && uc
< 0xff61) /* Fullwidth Forms */
449 || (uc
>= 0xffe0 && uc
< 0xffe7)
450 || (uc
>= 0x20000 && uc
<= 0x2fffd) /* CJK, CJK Compatibility Ideographs */
451 || (uc
>= 0x30000 && uc
<= 0x3fffd)
454 /* In ancient CJK encodings, Cyrillic and most other characters are
455 double-width as well. */
456 if (uc
>= 0x00A1 && uc
< 0xFF61 && uc
!= 0x20A9
457 && is_cjk_encoding (encoding
))
/* u8_width (S, N, ENCODING): width computation over a string of unsigned
   char units.
   Visible steps: S_END is set to S + N; S is advanced by the unit count
   returned from u8_mbtouc, which stores the decoded character in uc; the
   character is measured with uc_width under ENCODING; a test w >= 0 guards
   the statement that follows it.
   NOTE(review): the return type, local declarations, loop header, the
   end-of-string test and the return statement are not visible in this
   excerpt.  */
465 /* Determine number of column positions required for first N units
466 (or fewer if S ends before this) in S. */
469 u8_width (const unsigned char *s
, size_t n
, const char *encoding
)
471 const unsigned char *s_end
= s
+ n
;
479 s
+= u8_mbtouc (&uc
, s
, s_end
- s
);
482 break; /* end of string reached */
484 w
= uc_width (uc
, encoding
);
485 if (w
>= 0) /* ignore control characters in the string */
/* u16_width (S, N, ENCODING): width computation over a string of unsigned
   short units.
   Visible steps: S_END is set to S + N; S is advanced by the unit count
   returned from u16_mbtouc, which stores the decoded character in uc; the
   character is measured with uc_width under ENCODING; a test w >= 0 guards
   the statement that follows it.
   NOTE(review): the return type, local declarations, loop header, the
   end-of-string test and the return statement are not visible in this
   excerpt.  */
493 u16_width (const unsigned short *s
, size_t n
, const char *encoding
)
495 const unsigned short *s_end
= s
+ n
;
503 s
+= u16_mbtouc (&uc
, s
, s_end
- s
);
506 break; /* end of string reached */
508 w
= uc_width (uc
, encoding
);
509 if (w
>= 0) /* ignore control characters in the string */
/* u32_width (S, N, ENCODING): width computation over a string of unsigned
   int units.
   Visible steps: S_END is set to S + N; each unit is read directly with
   *s++ (no decoding call); the character is measured with uc_width under
   ENCODING; a test w >= 0 guards the statement that follows it.
   NOTE(review): the return type, local declarations, loop header, the
   end-of-string test and the return statement are not visible in this
   excerpt.  */
517 u32_width (const unsigned int *s
, size_t n
, const char *encoding
)
519 const unsigned int *s_end
= s
+ n
;
524 unsigned int uc
= *s
++;
528 break; /* end of string reached */
530 w
= uc_width (uc
, encoding
);
531 if (w
>= 0) /* ignore control characters in the string */
541 /* Determine the line break points in S, and store the result at p[0..n-1]. */
542 /* We don't support line breaking of complex-context dependent characters
543 (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
/* Line breaking classification.
   One constant per line breaking class.  The classes numbered 1..19 are
   the ones used (as class - 1) to index lbrk_table; LBP_BK is 0; the
   classes numbered 20 and above are resolved at run time.  */
enum
{
  /* Values >= 20 are resolved at run time. */
  LBP_BK =  0, /* mandatory break */
/*LBP_CR,         carriage return - not used here because it's a DOSism */
/*LBP_LF,         line feed - not used here because it's a DOSism */
  LBP_CM = 20, /* attached characters and combining marks */
/*LBP_SG,         surrogates - not used here because they are not characters */
  LBP_ZW =  1, /* zero width space */
  LBP_IN =  2, /* inseparable */
  LBP_GL =  3, /* non-breaking (glue) */
  LBP_CB = 22, /* contingent break opportunity */
  LBP_SP = 21, /* space */
  LBP_BA =  4, /* break opportunity after */
  LBP_BB =  5, /* break opportunity before */
  LBP_B2 =  6, /* break opportunity before and after */
  LBP_HY =  7, /* hyphen */
  LBP_NS =  8, /* non starter */
  LBP_OP =  9, /* opening punctuation */
  LBP_CL = 10, /* closing punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_EX = 12, /* exclamation/interrogation */
  LBP_ID = 13, /* ideographic */
  LBP_NU = 14, /* numeric */
  LBP_IS = 15, /* infix separator (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_PR = 18, /* prefix (numeric) */
  LBP_PO = 19, /* postfix (numeric) */
  LBP_SA = 23, /* complex context (South East Asian) */
  LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
  LBP_XX = 25  /* unknown */
};
581 #include "lbrkprop.h"
/* lbrkprop_lookup (UC): three-level table lookup in lbrkprop.
   Visible steps:
   - index1 = uc >> lbrkprop_header_0, accepted when below
     lbrkprop_header_1, selects lookup1 from lbrkprop.level1;
   - index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3 selects lookup2
     from lbrkprop.level2 at lookup1 + index2;
   - index3 = uc & lbrkprop_header_4 selects the returned entry of
     lbrkprop.level3 at lookup2 + index3.
   NOTE(review): any validity tests on lookup1 and lookup2 and the value
   returned when a level has no entry are not visible in this excerpt.  */
583 static inline unsigned char
584 lbrkprop_lookup (unsigned int uc
)
586 unsigned int index1
= uc
>> lbrkprop_header_0
;
587 if (index1
< lbrkprop_header_1
)
589 int lookup1
= lbrkprop
.level1
[index1
];
592 unsigned int index2
= (uc
>> lbrkprop_header_2
) & lbrkprop_header_3
;
593 int lookup2
= lbrkprop
.level2
[lookup1
+ index2
];
596 unsigned int index3
= uc
& lbrkprop_header_4
;
597 return lbrkprop
.level3
[lookup2
+ index3
];
/* Table indexed by two line breaking classifications.
   The first index is the class of the preceding character minus 1, the
   second index is the class of the current character minus 1; both cover
   the classes LBP_ZW (1) through LBP_PO (19).  Each entry is D, I or P.  */
#define D 1  /* direct break opportunity, empty in table 7.3 of UTR #14 */
#define I 2  /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
#define P 3  /* prohibited break,           '^' in table 7.3 of UTR #14 */
static const unsigned char lbrk_table[19][19] = {
                                /* after */
        /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
/* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
/* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
/* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
/* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
/* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
/* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
/* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
/* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
/* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
/* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
/* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
/* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
/* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
};
633 /* Note: The (B2,B2) entry should probably be D instead of P. */
634 /* Note: The (PR,ID) entry should probably be D instead of I. */
/* u8_possible_linebreaks (S, N, ENCODING, P): classify the break
   opportunities in the N-unit string S of unsigned char units, writing
   UC_BREAK_* values into P.
   Visible steps:
   - LBP_AI_REPLACEMENT is LBP_ID when is_cjk_encoding (ENCODING) holds,
     otherwise LBP_AL; it replaces the ambiguous class.
   - All N entries of P are first set to UC_BREAK_PROHIBITED.
   - Each character is decoded with u8_mbtouc and classified with
     lbrkprop_lookup.
   - seen_space and seen_space2 track spaces that follow the last
     non-space character.
   - lbrk_table[last_prop-1][prop-1] selects between UC_BREAK_POSSIBLE, a
     choice that depends on seen_space, and UC_BREAK_PROHIBITED, stored
     through q.
   NOTE(review): the return type, loop header, case labels, the
   declaration of q, the updates of last_prop and the pointer advances
   are not visible in this excerpt.  */
637 u8_possible_linebreaks (const unsigned char *s
, size_t n
, const char *encoding
, char *p
)
639 int LBP_AI_REPLACEMENT
= (is_cjk_encoding (encoding
) ? LBP_ID
: LBP_AL
);
640 const unsigned char *s_end
= s
+ n
;
641 int last_prop
= LBP_BK
; /* line break property of last non-space character */
642 char *seen_space
= NULL
; /* Was a space seen after the last non-space character? */
643 char *seen_space2
= NULL
; /* At least two spaces after the last non-space? */
645 /* Don't break inside multibyte characters. */
646 memset (p
, UC_BREAK_PROHIBITED
, n
);
651 int count
= u8_mbtouc (&uc
, s
, s_end
- s
);
652 int prop
= lbrkprop_lookup (uc
);
656 /* Mandatory break. */
657 *p
= UC_BREAK_MANDATORY
;
666 /* Resolve property values whose behaviour is not fixed. */
670 /* Resolve ambiguous. */
671 prop
= LBP_AI_REPLACEMENT
;
674 /* This is arbitrary. */
678 /* We don't handle complex scripts yet.
679 Treat LBP_SA like LBP_XX. */
681 /* This is arbitrary. */
686 /* Deal with combining characters. */
690 /* Don't break just before a combining character. */
691 *p
= UC_BREAK_PROHIBITED
;
692 /* A combining character turns a preceding space into LBP_AL. */
693 if (seen_space
!= NULL
)
696 seen_space
= seen_space2
;
698 goto lookup_via_table
;
701 else if (prop
== LBP_SP
)
703 /* Don't break just before a space. */
704 *p
= UC_BREAK_PROHIBITED
;
705 seen_space2
= seen_space
;
711 /* prop must be usable as an index for table 7.3 of UTR #14. */
712 if (!(prop
>= 1 && prop
<= sizeof(lbrk_table
) / sizeof(lbrk_table
[0])))
715 if (last_prop
== LBP_BK
)
717 /* Don't break at the beginning of a line. */
718 *q
= UC_BREAK_PROHIBITED
;
722 switch (lbrk_table
[last_prop
-1] [prop
-1])
725 *q
= UC_BREAK_POSSIBLE
;
728 *q
= (seen_space
!= NULL
? UC_BREAK_POSSIBLE
: UC_BREAK_PROHIBITED
);
731 *q
= UC_BREAK_PROHIBITED
;
/* u16_possible_linebreaks (S, N, ENCODING, P): classify the break
   opportunities in the N-unit string S of unsigned short units, writing
   UC_BREAK_* values into P.
   Visible steps:
   - LBP_AI_REPLACEMENT is LBP_ID when is_cjk_encoding (ENCODING) holds,
     otherwise LBP_AL; it replaces the ambiguous class.
   - All N entries of P are first set to UC_BREAK_PROHIBITED.
   - Each character is decoded with u16_mbtouc and classified with
     lbrkprop_lookup.
   - seen_space and seen_space2 track spaces that follow the last
     non-space character.
   - lbrk_table[last_prop-1][prop-1] selects between UC_BREAK_POSSIBLE, a
     choice that depends on seen_space, and UC_BREAK_PROHIBITED, stored
     through q.
   NOTE(review): the return type, loop header, case labels, the
   declaration of q, the updates of last_prop and the pointer advances
   are not visible in this excerpt.  */
751 u16_possible_linebreaks (const unsigned short *s
, size_t n
, const char *encoding
, char *p
)
753 int LBP_AI_REPLACEMENT
= (is_cjk_encoding (encoding
) ? LBP_ID
: LBP_AL
);
754 const unsigned short *s_end
= s
+ n
;
755 int last_prop
= LBP_BK
; /* line break property of last non-space character */
756 char *seen_space
= NULL
; /* Was a space seen after the last non-space character? */
757 char *seen_space2
= NULL
; /* At least two spaces after the last non-space? */
759 /* Don't break inside multibyte characters. */
760 memset (p
, UC_BREAK_PROHIBITED
, n
);
765 int count
= u16_mbtouc (&uc
, s
, s_end
- s
);
766 int prop
= lbrkprop_lookup (uc
);
770 /* Mandatory break. */
771 *p
= UC_BREAK_MANDATORY
;
780 /* Resolve property values whose behaviour is not fixed. */
784 /* Resolve ambiguous. */
785 prop
= LBP_AI_REPLACEMENT
;
788 /* This is arbitrary. */
792 /* We don't handle complex scripts yet.
793 Treat LBP_SA like LBP_XX. */
795 /* This is arbitrary. */
800 /* Deal with combining characters. */
804 /* Don't break just before a combining character. */
805 *p
= UC_BREAK_PROHIBITED
;
806 /* A combining character turns a preceding space into LBP_AL. */
807 if (seen_space
!= NULL
)
810 seen_space
= seen_space2
;
812 goto lookup_via_table
;
815 else if (prop
== LBP_SP
)
817 /* Don't break just before a space. */
818 *p
= UC_BREAK_PROHIBITED
;
819 seen_space2
= seen_space
;
825 /* prop must be usable as an index for table 7.3 of UTR #14. */
826 if (!(prop
>= 1 && prop
<= sizeof(lbrk_table
) / sizeof(lbrk_table
[0])))
829 if (last_prop
== LBP_BK
)
831 /* Don't break at the beginning of a line. */
832 *q
= UC_BREAK_PROHIBITED
;
836 switch (lbrk_table
[last_prop
-1] [prop
-1])
839 *q
= UC_BREAK_POSSIBLE
;
842 *q
= (seen_space
!= NULL
? UC_BREAK_POSSIBLE
: UC_BREAK_PROHIBITED
);
845 *q
= UC_BREAK_PROHIBITED
;
/* u32_possible_linebreaks (S, N, ENCODING, P): classify the break
   opportunities in the N-unit string S of unsigned int units, writing
   UC_BREAK_* values into P.
   Visible steps:
   - LBP_AI_REPLACEMENT is LBP_ID when is_cjk_encoding (ENCODING) holds,
     otherwise LBP_AL; it replaces the ambiguous class.
   - Each unit is read directly as *s (no decoding call) and classified
     with lbrkprop_lookup.
   - seen_space and seen_space2 track spaces that follow the last
     non-space character.
   - lbrk_table[last_prop-1][prop-1] selects between UC_BREAK_POSSIBLE, a
     choice that depends on seen_space, and UC_BREAK_PROHIBITED, stored
     through q.
   NOTE(review): the return type, loop header, case labels, the
   declaration of q, the updates of last_prop and the pointer advances
   are not visible in this excerpt.  */
863 u32_possible_linebreaks (const unsigned int *s
, size_t n
, const char *encoding
, char *p
)
865 int LBP_AI_REPLACEMENT
= (is_cjk_encoding (encoding
) ? LBP_ID
: LBP_AL
);
866 const unsigned int *s_end
= s
+ n
;
867 int last_prop
= LBP_BK
; /* line break property of last non-space character */
868 char *seen_space
= NULL
; /* Was a space seen after the last non-space character? */
869 char *seen_space2
= NULL
; /* At least two spaces after the last non-space? */
873 unsigned int uc
= *s
;
874 int prop
= lbrkprop_lookup (uc
);
878 /* Mandatory break. */
879 *p
= UC_BREAK_MANDATORY
;
888 /* Resolve property values whose behaviour is not fixed. */
892 /* Resolve ambiguous. */
893 prop
= LBP_AI_REPLACEMENT
;
896 /* This is arbitrary. */
900 /* We don't handle complex scripts yet.
901 Treat LBP_SA like LBP_XX. */
903 /* This is arbitrary. */
908 /* Deal with combining characters. */
912 /* Don't break just before a combining character. */
913 *p
= UC_BREAK_PROHIBITED
;
914 /* A combining character turns a preceding space into LBP_AL. */
915 if (seen_space
!= NULL
)
918 seen_space
= seen_space2
;
920 goto lookup_via_table
;
923 else if (prop
== LBP_SP
)
925 /* Don't break just before a space. */
926 *p
= UC_BREAK_PROHIBITED
;
927 seen_space2
= seen_space
;
933 /* prop must be usable as an index for table 7.3 of UTR #14. */
934 if (!(prop
>= 1 && prop
<= sizeof(lbrk_table
) / sizeof(lbrk_table
[0])))
937 if (last_prop
== LBP_BK
)
939 /* Don't break at the beginning of a line. */
940 *q
= UC_BREAK_PROHIBITED
;
944 switch (lbrk_table
[last_prop
-1] [prop
-1])
947 *q
= UC_BREAK_POSSIBLE
;
950 *q
= (seen_space
!= NULL
? UC_BREAK_POSSIBLE
: UC_BREAK_PROHIBITED
);
953 *q
= UC_BREAK_PROHIBITED
;
/* u8_width_linebreaks (S, N, WIDTH, START_COLUMN, AT_END_COLUMNS, O,
   ENCODING, ...): choose line breaks for the N-unit string S of unsigned
   char units so that lines fit into WIDTH columns.
   Visible steps:
   - u8_possible_linebreaks fills p with the break opportunities.
   - last_column starts at START_COLUMN.
   - Each character is decoded with u8_mbtouc; an entry of O is consulted
     when O is non-NULL and the entry is not UC_BREAK_UNDEFINED.
   - At each possible or mandatory break, the piece just ended triggers a
     break at last_p when last_column + piece_width exceeds WIDTH.
   - Character widths come from uc_width; a test w >= 0 guards the
     statement that follows it.
   - After the loop the same overflow test is made with AT_END_COLUMNS
     added, and last_column + piece_width is returned.
   NOTE(review): the return type, the last parameter(s) of the signature,
   local declarations, loop header and the bookkeeping of last_p and
   piece_width are not visible in this excerpt.  */
973 /* Choose the best line breaks, assuming the uc_width function.
974 Return the column after the end of the string. */
977 u8_width_linebreaks (const unsigned char *s
, size_t n
,
978 int width
, int start_column
, int at_end_columns
,
979 const char *o
, const char *encoding
,
982 const unsigned char *s_end
;
987 u8_possible_linebreaks (s
, n
, encoding
, p
);
991 last_column
= start_column
;
996 int count
= u8_mbtouc (&uc
, s
, s_end
- s
);
998 /* Respect the override. */
999 if (o
!= NULL
&& *o
!= UC_BREAK_UNDEFINED
)
1002 if (*p
== UC_BREAK_POSSIBLE
|| *p
== UC_BREAK_MANDATORY
)
1004 /* An atomic piece of text ends here. */
1005 if (last_p
!= NULL
&& last_column
+ piece_width
> width
)
1007 /* Insert a line break. */
1008 *last_p
= UC_BREAK_POSSIBLE
;
1013 if (*p
== UC_BREAK_MANDATORY
)
1015 /* uc is a line break character. */
1016 /* Start a new piece at column 0. */
1023 /* uc is not a line break character. */
1026 if (*p
== UC_BREAK_POSSIBLE
)
1028 /* Start a new piece. */
1030 last_column
+= piece_width
;
1032 /* No line break for the moment, may be turned into
1033 UC_BREAK_POSSIBLE later, via last_p. */
1036 *p
= UC_BREAK_PROHIBITED
;
1038 w
= uc_width (uc
, encoding
);
1039 if (w
>= 0) /* ignore control characters in the string */
1049 /* The last atomic piece of text ends here. */
1050 if (last_p
!= NULL
&& last_column
+ piece_width
+ at_end_columns
> width
)
1052 /* Insert a line break. */
1053 *last_p
= UC_BREAK_POSSIBLE
;
1057 return last_column
+ piece_width
;
/* u16_width_linebreaks (S, N, WIDTH, START_COLUMN, AT_END_COLUMNS, O,
   ENCODING, ...): choose line breaks for the N-unit string S of unsigned
   short units so that lines fit into WIDTH columns.
   Visible steps:
   - u16_possible_linebreaks fills p with the break opportunities.
   - last_column starts at START_COLUMN.
   - Each character is decoded with u16_mbtouc; an entry of O is consulted
     when O is non-NULL and the entry is not UC_BREAK_UNDEFINED.
   - At each possible or mandatory break, the piece just ended triggers a
     break at last_p when last_column + piece_width exceeds WIDTH.
   - Character widths come from uc_width; a test w >= 0 guards the
     statement that follows it.
   - After the loop the same overflow test is made with AT_END_COLUMNS
     added, and last_column + piece_width is returned.
   NOTE(review): the return type, the last parameter(s) of the signature,
   local declarations, loop header and the bookkeeping of last_p and
   piece_width are not visible in this excerpt.  */
1063 u16_width_linebreaks (const unsigned short *s
, size_t n
,
1064 int width
, int start_column
, int at_end_columns
,
1065 const char *o
, const char *encoding
,
1068 const unsigned short *s_end
;
1073 u16_possible_linebreaks (s
, n
, encoding
, p
);
1077 last_column
= start_column
;
1082 int count
= u16_mbtouc (&uc
, s
, s_end
- s
);
1084 /* Respect the override. */
1085 if (o
!= NULL
&& *o
!= UC_BREAK_UNDEFINED
)
1088 if (*p
== UC_BREAK_POSSIBLE
|| *p
== UC_BREAK_MANDATORY
)
1090 /* An atomic piece of text ends here. */
1091 if (last_p
!= NULL
&& last_column
+ piece_width
> width
)
1093 /* Insert a line break. */
1094 *last_p
= UC_BREAK_POSSIBLE
;
1099 if (*p
== UC_BREAK_MANDATORY
)
1101 /* uc is a line break character. */
1102 /* Start a new piece at column 0. */
1109 /* uc is not a line break character. */
1112 if (*p
== UC_BREAK_POSSIBLE
)
1114 /* Start a new piece. */
1116 last_column
+= piece_width
;
1118 /* No line break for the moment, may be turned into
1119 UC_BREAK_POSSIBLE later, via last_p. */
1122 *p
= UC_BREAK_PROHIBITED
;
1124 w
= uc_width (uc
, encoding
);
1125 if (w
>= 0) /* ignore control characters in the string */
1135 /* The last atomic piece of text ends here. */
1136 if (last_p
!= NULL
&& last_column
+ piece_width
+ at_end_columns
> width
)
1138 /* Insert a line break. */
1139 *last_p
= UC_BREAK_POSSIBLE
;
1143 return last_column
+ piece_width
;
/* u32_width_linebreaks (S, N, WIDTH, START_COLUMN, AT_END_COLUMNS, O,
   ENCODING, ...): choose line breaks for the N-unit string S of unsigned
   int units so that lines fit into WIDTH columns.
   Visible steps:
   - u32_possible_linebreaks fills p with the break opportunities.
   - last_column starts at START_COLUMN.
   - Each unit is read directly as *s; an entry of O is consulted when O
     is non-NULL and the entry is not UC_BREAK_UNDEFINED.
   - At each possible or mandatory break, the piece just ended triggers a
     break at last_p when last_column + piece_width exceeds WIDTH.
   - Character widths come from uc_width; a test w >= 0 guards the
     statement that follows it.
   - After the loop the same overflow test is made with AT_END_COLUMNS
     added, and last_column + piece_width is returned.
   NOTE(review): the return type, the last parameter(s) of the signature,
   local declarations, loop header and the bookkeeping of last_p and
   piece_width are not visible in this excerpt.  */
1147 u32_width_linebreaks (const unsigned int *s
, size_t n
,
1148 int width
, int start_column
, int at_end_columns
,
1149 const char *o
, const char *encoding
,
1152 const unsigned int *s_end
;
1157 u32_possible_linebreaks (s
, n
, encoding
, p
);
1161 last_column
= start_column
;
1165 unsigned int uc
= *s
;
1167 /* Respect the override. */
1168 if (o
!= NULL
&& *o
!= UC_BREAK_UNDEFINED
)
1171 if (*p
== UC_BREAK_POSSIBLE
|| *p
== UC_BREAK_MANDATORY
)
1173 /* An atomic piece of text ends here. */
1174 if (last_p
!= NULL
&& last_column
+ piece_width
> width
)
1176 /* Insert a line break. */
1177 *last_p
= UC_BREAK_POSSIBLE
;
1182 if (*p
== UC_BREAK_MANDATORY
)
1184 /* uc is a line break character. */
1185 /* Start a new piece at column 0. */
1192 /* uc is not a line break character. */
1195 if (*p
== UC_BREAK_POSSIBLE
)
1197 /* Start a new piece. */
1199 last_column
+= piece_width
;
1201 /* No line break for the moment, may be turned into
1202 UC_BREAK_POSSIBLE later, via last_p. */
1205 *p
= UC_BREAK_PROHIBITED
;
1207 w
= uc_width (uc
, encoding
);
1208 if (w
>= 0) /* ignore control characters in the string */
1218 /* The last atomic piece of text ends here. */
1219 if (last_p
!= NULL
&& last_column
+ piece_width
+ at_end_columns
> width
)
1221 /* Insert a line break. */
1222 *last_p
= UC_BREAK_POSSIBLE
;
1226 return last_column
+ piece_width
;
1236 /* Read the contents of an input stream, and return it, terminated with a NUL byte. */
/* read_file (STREAM): read STREAM in chunks of up to BUFSIZE bytes into a
   buffer that is grown with realloc.
   Visible steps:
   - loop while feof (STREAM) is false;
   - when size + BUFSIZE exceeds alloc, alloc grows by half and is raised
     to at least size + BUFSIZE before the realloc call;
   - fread appends at buf + size; ferror (STREAM) is tested;
   - after the loop the buffer is resized to size + 1 bytes;
   - "out of memory" is printed to stderr on two paths.
   NOTE(review): the return type, local declarations, the conditions that
   guard the error messages, the handling of count and the return
   statement are not visible in this excerpt.  Both realloc calls assign
   straight back to buf, so the old pointer is lost if realloc fails.  */
1239 read_file (FILE *stream
)
1241 #define BUFSIZE 4096
1247 while (! feof (stream
))
1249 if (size
+ BUFSIZE
> alloc
)
1251 alloc
= alloc
+ alloc
/ 2;
1252 if (alloc
< size
+ BUFSIZE
)
1253 alloc
= size
+ BUFSIZE
;
1254 buf
= realloc (buf
, alloc
);
1257 fprintf (stderr
, "out of memory\n");
1261 count
= fread (buf
+ size
, 1, BUFSIZE
, stream
);
1264 if (ferror (stream
))
1273 buf
= realloc (buf
, size
+ 1);
1276 fprintf (stderr
, "out of memory\n");
/* main (ARGC, ARGV): test driver.  Two variants are visible; both read all
   of stdin with read_file, take its strlen as the length and allocate one
   break byte per input byte:
   - the first calls u8_possible_linebreaks with encoding "UTF-8" and, in a
     loop that writes each input byte, emits the UTF-8 bytes of U+2027 for
     UC_BREAK_POSSIBLE and of U+21B2 for UC_BREAK_MANDATORY;
   - the second takes the width from argv[1] with atoi, calls
     u8_width_linebreaks with start column 0, no extra end columns and no
     override array, and emits '\n' for UC_BREAK_POSSIBLE.
   NOTE(review): the return type, the conditions that select a variant,
   the switch headers, any argument validation and the return statements
   are not visible in this excerpt.  atoi reports no conversion errors.  */
1285 main (int argc
, char * argv
[])
1289 /* Display all the break opportunities in the input string. */
1290 char *input
= read_file (stdin
);
1291 int length
= strlen (input
);
1292 char *breaks
= malloc (length
);
1295 u8_possible_linebreaks ((unsigned char *) input
, length
, "UTF-8", breaks
);
1297 for (i
= 0; i
< length
; i
++)
1301 case UC_BREAK_POSSIBLE
:
1302 /* U+2027 in UTF-8 encoding */
1303 putc (0xe2, stdout
); putc (0x80, stdout
); putc (0xa7, stdout
);
1305 case UC_BREAK_MANDATORY
:
1306 /* U+21B2 (or U+21B5) in UTF-8 encoding */
1307 putc (0xe2, stdout
); putc (0x86, stdout
); putc (0xb2, stdout
);
1309 case UC_BREAK_PROHIBITED
:
1314 putc (input
[i
], stdout
);
1323 /* Insert line breaks for a given width. */
1324 int width
= atoi (argv
[1]);
1325 char *input
= read_file (stdin
);
1326 int length
= strlen (input
);
1327 char *breaks
= malloc (length
);
1330 u8_width_linebreaks ((unsigned char *) input
, length
, width
, 0, 0, NULL
, "UTF-8", breaks
);
1332 for (i
= 0; i
< length
; i
++)
1336 case UC_BREAK_POSSIBLE
:
1337 putc ('\n', stdout
);
1339 case UC_BREAK_MANDATORY
:
1341 case UC_BREAK_PROHIBITED
:
1346 putc (input
[i
], stdout
);
1360 /* Now the same thing with an arbitrary encoding.
1362 We convert the input string to Unicode.
1364 The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
1365 UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to
1366 \U0000FFFF. UTF-16 and variants support only characters up to
1367 \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1.
1368 UCS-4 specification leaves doubts about endianness and byte order mark.
1369 glibc currently interprets it as big endian without byte order mark,
1370 but this is not backed by an RFC. So we use UTF-8. It supports
1371 characters up to \U7FFFFFFF and is unambiguously defined. */
1378 /* Luckily, the encoding's name is platform independent.
   UTF8_NAME expands to the string literal naming the UTF-8 encoding.  */
1379 #define UTF8_NAME "UTF-8"
/* iconv_string_length (CD, S, N): count the bytes that iconv produces for
   the input S through the conversion descriptor CD, using a scratch
   buffer of TMPBUFSIZE bytes on the stack.
   Visible steps:
   - iconv is called with the scratch buffer as output; a failure whose
     errno is not E2BIG returns (size_t)(-1); otherwise outptr - tmpbuf is
     added to count;
   - a second iconv call with NULL input and the scratch buffer as output
     is compiled in when _LIBICONV_VERSION is defined, or when the system
     is neither glibc 2.0/2.1 nor __sun; its failure returns (size_t)(-1)
     and its output is added to count as well;
   - a final iconv call with all-NULL arguments returns CD to the initial
     state.
   NOTE(review): the return type, the declarations of count and insize,
   the loop header and the final return are not visible in this
   excerpt.  */
1381 /* Return the length of a string after conversion through an iconv_t. */
1383 iconv_string_length (iconv_t cd
, const char *s
, size_t n
)
1385 #define TMPBUFSIZE 4096
1387 char tmpbuf
[TMPBUFSIZE
];
1388 const char *inptr
= s
;
1392 char *outptr
= tmpbuf
;
1393 size_t outsize
= TMPBUFSIZE
;
1394 size_t res
= iconv (cd
, (ICONV_CONST
char **) &inptr
, &insize
, &outptr
, &outsize
);
1395 if (res
== (size_t)(-1) && errno
!= E2BIG
)
1396 return (size_t)(-1);
1397 count
+= outptr
- tmpbuf
;
1399 /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug. */
1400 #if defined _LIBICONV_VERSION \
1401 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
1403 char *outptr
= tmpbuf
;
1404 size_t outsize
= TMPBUFSIZE
;
1405 size_t res
= iconv (cd
, NULL
, NULL
, &outptr
, &outsize
);
1406 if (res
== (size_t)(-1))
1407 return (size_t)(-1);
1408 count
+= outptr
- tmpbuf
;
1410 /* Return to the initial state. */
1411 iconv (cd
, NULL
, NULL
, NULL
, NULL
);
1418 iconv_string_keeping_offsets (iconv_t cd
, const char *s
, size_t n
,
1419 size_t *offtable
, char *t
, size_t m
)
1426 /* Avoid glibc-2.1 bug. */
1427 #if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
1428 const size_t extra
= 1;
1430 const size_t extra
= 0;
1433 for (i
= 0; i
< n
; i
++)
1434 offtable
[i
] = (size_t)(-1);
1439 outsize
= m
+ extra
;
1440 while (inptr
< s_end
)
1442 const char *saved_inptr
;
1446 offtable
[inptr
- s
] = outptr
- t
;
1448 saved_inptr
= inptr
;
1450 for (insize
= 1; inptr
+ insize
<= s_end
; insize
++)
1452 res
= iconv (cd
, (ICONV_CONST
char **) &inptr
, &insize
, &outptr
, &outsize
);
1453 if (!(res
== (size_t)(-1) && errno
== EINVAL
))
1455 /* We expect that no input bytes have been consumed so far. */
1456 if (inptr
!= saved_inptr
)
1459 /* After we verified the convertibility and computed the translation's
1460 size m, there shouldn't be any conversion error here. */
1461 if (res
== (size_t)(-1))
1464 /* Avoid glibc-2.1 bug and Solaris 7 bug. */
1465 #if defined _LIBICONV_VERSION \
1466 || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
1467 if (iconv (cd
, NULL
, NULL
, &outptr
, &outsize
) == (size_t)(-1))
1470 /* We should have produced exactly m output bytes. */
1471 if (outsize
!= extra
)
1475 #endif /* HAVE_ICONV */
/* Tests whether a string is entirely ASCII.  Returns 1 if yes.
   Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding.
   S points to the N bytes to examine.  A byte is accepted only if
   c_isprint or c_isspace holds for it; any other byte makes the whole
   string count as non-ASCII.  An empty string (N == 0) yields 1.  */
static int
is_all_ascii (const char *s, size_t n)
{
  for (; n > 0; s++, n--)
    {
      /* Go through unsigned char so that bytes >= 0x80 are not passed as
         negative values.  */
      unsigned char c = (unsigned char) *s;

      if (!(c_isprint (c) || c_isspace (c)))
        return 0;
    }
  return 1;
}
1494 #endif /* C_CTYPE_ASCII */
1496 #if defined unused || defined TEST2
1499 mbs_possible_linebreaks (const char *s
, size_t n
, const char *encoding
,
1504 if (is_utf8_encoding (encoding
))
1505 u8_possible_linebreaks ((const unsigned char *) s
, n
, encoding
, p
);
1510 /* Avoid glibc-2.1 bug with EUC-KR. */
1511 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1512 if (STREQ (encoding
, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1513 to_utf8
= (iconv_t
)(-1);
1516 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1518 # if defined __sun && !defined _LIBICONV_VERSION
1519 if ( STREQ (encoding
, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1520 || STREQ (encoding
, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1521 || STREQ (encoding
, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1522 || STREQ (encoding
, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1523 || STREQ (encoding
, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1524 || STREQ (encoding
, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1525 to_utf8
= (iconv_t
)(-1);
1528 to_utf8
= iconv_open (UTF8_NAME
, encoding
);
1529 if (to_utf8
!= (iconv_t
)(-1))
1531 /* Determine the length of the resulting UTF-8 string. */
1532 size_t m
= iconv_string_length (to_utf8
, s
, n
);
1533 if (m
!= (size_t)(-1))
1535 /* Convert the string to UTF-8 and build a translation table
1536 from offsets into s to offsets into the translated string. */
1537 size_t memory_size
= xsum3 (xtimes (n
, sizeof (size_t)), m
, m
);
1539 (size_in_bounds_p (memory_size
) ? malloc (memory_size
) : NULL
);
1542 size_t *offtable
= (size_t *) memory
;
1543 char *t
= (char *) (offtable
+ n
);
1544 char *q
= (char *) (t
+ m
);
1547 iconv_string_keeping_offsets (to_utf8
, s
, n
, offtable
, t
, m
);
1549 /* Determine the possible line breaks of the UTF-8 string. */
1550 u8_possible_linebreaks ((const unsigned char *) t
, m
, encoding
, q
);
1552 /* Translate the result back to the original string. */
1553 memset (p
, UC_BREAK_PROHIBITED
, n
);
1554 for (i
= 0; i
< n
; i
++)
1555 if (offtable
[i
] != (size_t)(-1))
1556 p
[i
] = q
[offtable
[i
]];
1559 iconv_close (to_utf8
);
1563 iconv_close (to_utf8
);
1566 /* Impossible to convert. */
1568 if (is_all_ascii (s
, n
))
1570 /* ASCII is a subset of UTF-8. */
1571 u8_possible_linebreaks ((const unsigned char *) s
, n
, encoding
, p
);
1575 /* We have a non-ASCII string and cannot convert it.
1576 Don't produce line breaks except those already present in the
1577 input string. All we assume here is that the encoding is
1578 minimally ASCII compatible. */
1580 const char *s_end
= s
+ n
;
1583 *p
= (*s
== '\n' ? UC_BREAK_MANDATORY
: UC_BREAK_PROHIBITED
);
1594 mbs_width_linebreaks (const char *s
, size_t n
,
1595 int width
, int start_column
, int at_end_columns
,
1596 const char *o
, const char *encoding
,
1600 return start_column
;
1601 if (is_utf8_encoding (encoding
))
1602 return u8_width_linebreaks ((const unsigned char *) s
, n
, width
, start_column
, at_end_columns
, o
, encoding
, p
);
1607 /* Avoid glibc-2.1 bug with EUC-KR. */
1608 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1609 if (STREQ (encoding
, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1610 to_utf8
= (iconv_t
)(-1);
1613 /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1615 # if defined __sun && !defined _LIBICONV_VERSION
1616 if ( STREQ (encoding
, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1617 || STREQ (encoding
, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1618 || STREQ (encoding
, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1619 || STREQ (encoding
, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1620 || STREQ (encoding
, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1621 || STREQ (encoding
, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1622 to_utf8
= (iconv_t
)(-1);
1625 to_utf8
= iconv_open (UTF8_NAME
, encoding
);
1626 if (to_utf8
!= (iconv_t
)(-1))
1628 /* Determine the length of the resulting UTF-8 string. */
1629 size_t m
= iconv_string_length (to_utf8
, s
, n
);
1630 if (m
!= (size_t)(-1))
1632 /* Convert the string to UTF-8 and build a translation table
1633 from offsets into s to offsets into the translated string. */
1634 size_t memory_size
=
1635 xsum4 (xtimes (n
, sizeof (size_t)), m
, m
,
1636 (o
!= NULL
? m
: 0));
1638 (size_in_bounds_p (memory_size
) ? malloc (memory_size
) : NULL
);
1641 size_t *offtable
= (size_t *) memory
;
1642 char *t
= (char *) (offtable
+ n
);
1643 char *q
= (char *) (t
+ m
);
1644 char *o8
= (o
!= NULL
? (char *) (q
+ m
) : NULL
);
1648 iconv_string_keeping_offsets (to_utf8
, s
, n
, offtable
, t
, m
);
1650 /* Translate the overrides to the UTF-8 string. */
1653 memset (o8
, UC_BREAK_UNDEFINED
, m
);
1654 for (i
= 0; i
< n
; i
++)
1655 if (offtable
[i
] != (size_t)(-1))
1656 o8
[offtable
[i
]] = o
[i
];
1659 /* Determine the line breaks of the UTF-8 string. */
1661 u8_width_linebreaks ((const unsigned char *) t
, m
, width
, start_column
, at_end_columns
, o8
, encoding
, q
);
1663 /* Translate the result back to the original string. */
1664 memset (p
, UC_BREAK_PROHIBITED
, n
);
1665 for (i
= 0; i
< n
; i
++)
1666 if (offtable
[i
] != (size_t)(-1))
1667 p
[i
] = q
[offtable
[i
]];
1670 iconv_close (to_utf8
);
1674 iconv_close (to_utf8
);
1677 /* Impossible to convert. */
1679 if (is_all_ascii (s
, n
))
1681 /* ASCII is a subset of UTF-8. */
1682 return u8_width_linebreaks ((const unsigned char *) s
, n
, width
, start_column
, at_end_columns
, o
, encoding
, p
);
1685 /* We have a non-ASCII string and cannot convert it.
1686 Don't produce line breaks except those already present in the
1687 input string. All we assume here is that the encoding is
1688 minimally ASCII compatible. */
1690 const char *s_end
= s
+ n
;
1693 *p
= ((o
!= NULL
&& *o
== UC_BREAK_MANDATORY
) || *s
== '\n'
1694 ? UC_BREAK_MANDATORY
1695 : UC_BREAK_PROHIBITED
);
1701 /* We cannot compute widths in this case. */
1702 return start_column
;
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.
   STREAM is read until end of file.  The result is a freshly allocated
   buffer that the caller owns.  Bytes are stored as read, so a NUL byte in
   the input makes the result look shorter to string functions.
   On allocation failure or on a read error, a message is printed to stderr
   and the program exits with status 1; the function never returns NULL.  */
char *
read_file (FILE *stream)
{
#define BUFSIZE 4096
  char *buf = NULL;
  size_t alloc = 0;
  size_t size = 0;
  char *result;

  while (! feof (stream))
    {
      size_t count;

      /* Make sure there is room for one more full chunk; grow by 50%, but
         at least enough for that chunk.  */
      if (size + BUFSIZE > alloc)
        {
          char *grown;

          alloc = alloc + alloc / 2;
          if (alloc < size + BUFSIZE)
            alloc = size + BUFSIZE;
          grown = realloc (buf, alloc);
          if (grown == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = grown;
        }
      count = fread (buf + size, 1, BUFSIZE, stream);
      if (count == 0)
        {
          /* Nothing read: either end of file (the loop condition ends the
             loop) or a real error.  */
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
        }
      else
        size += count;
    }
  /* Shrink (or, for empty input, allocate) to exactly the content plus the
     terminating NUL byte.  */
  result = realloc (buf, size + 1);
  if (result == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  result[size] = '\0';
  return result;
#undef BUFSIZE
}
1762 main (int argc
, char * argv
[])
1764 setlocale (LC_CTYPE
, "");
1767 /* Display all the break opportunities in the input string. */
1768 char *input
= read_file (stdin
);
1769 int length
= strlen (input
);
1770 char *breaks
= malloc (length
);
1773 mbs_possible_linebreaks (input
, length
, locale_charset (), breaks
);
1775 for (i
= 0; i
< length
; i
++)
1779 case UC_BREAK_POSSIBLE
:
1782 case UC_BREAK_MANDATORY
:
1784 case UC_BREAK_PROHIBITED
:
1789 putc (input
[i
], stdout
);
1798 /* Insert line breaks for a given width. */
1799 int width
= atoi (argv
[1]);
1800 char *input
= read_file (stdin
);
1801 int length
= strlen (input
);
1802 char *breaks
= malloc (length
);
1805 mbs_width_linebreaks (input
, length
, width
, 0, 0, NULL
, locale_charset (), breaks
);
1807 for (i
= 0; i
< length
; i
++)
1811 case UC_BREAK_POSSIBLE
:
1812 putc ('\n', stdout
);
1814 case UC_BREAK_MANDATORY
:
1816 case UC_BREAK_PROHIBITED
:
1821 putc (input
[i
], stdout
);