gnu/dist/gettext/gettext-tools/lib/linebreak.c

   1 /* linebreak.c - line breaking of Unicode strings
   2    Copyright (C) 2001-2003 Free Software Foundation, Inc.
   3    Written by Bruno Haible <haible@clisp.cons.org>, 2001.
   4
   5 This program is free software; you can redistribute it and/or modify
   6 it under the terms of the GNU General Public License as published by
   7 the Free Software Foundation; either version 2, or (at your option)
   8 any later version.
   9
  10 This program is distributed in the hope that it will be useful,
  11 but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 GNU General Public License for more details.
  14
  15 You should have received a copy of the GNU General Public License
  16 along with this program; if not, write to the Free Software
  17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 # include <config.h>
  21 #endif
  22
  23 /* Specification.  */
  24 #include "linebreak.h"
  25
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include "c-ctype.h"
  29 #include "xsize.h"
  30
  31 #include "utf8-ucs4.h"
  32
  33 #ifdef unused
  34 #include "utf16-ucs4.h"
  35
  36 static inline int
  37 u32_mbtouc (unsigned int *puc, const unsigned int *s, size_t n)
  38 {
  39   *puc = *s;
  40   return 1;
  41 }
  42 #endif
  43
  44
  45 /* Help GCC to generate good code for string comparisons with
  46    immediate strings. */
  47 #if defined (__GNUC__) && defined (__OPTIMIZE__)
  48
  49 static inline int
  50 streq9 (const char *s1, const char *s2)
  51 {
  52   return strcmp (s1 + 9, s2 + 9) == 0;
  53 }
  54
  55 static inline int
  56 streq8 (const char *s1, const char *s2, char s28)
  57 {
  58   if (s1[8] == s28)
  59     {
  60       if (s28 == 0)
  61         return 1;
  62       else
  63         return streq9 (s1, s2);
  64     }
  65   else
  66     return 0;
  67 }
  68
  69 static inline int
  70 streq7 (const char *s1, const char *s2, char s27, char s28)
  71 {
  72   if (s1[7] == s27)
  73     {
  74       if (s27 == 0)
  75         return 1;
  76       else
  77         return streq8 (s1, s2, s28);
  78     }
  79   else
  80     return 0;
  81 }
  82
  83 static inline int
  84 streq6 (const char *s1, const char *s2, char s26, char s27, char s28)
  85 {
  86   if (s1[6] == s26)
  87     {
  88       if (s26 == 0)
  89         return 1;
  90       else
  91         return streq7 (s1, s2, s27, s28);
  92     }
  93   else
  94     return 0;
  95 }
  96
  97 static inline int
  98 streq5 (const char *s1, const char *s2, char s25, char s26, char s27, char s28)
  99 {
 100   if (s1[5] == s25)
 101     {
 102       if (s25 == 0)
 103         return 1;
 104       else
 105         return streq6 (s1, s2, s26, s27, s28);
 106     }
 107   else
 108     return 0;
 109 }
 110
 111 static inline int
 112 streq4 (const char *s1, const char *s2, char s24, char s25, char s26, char s27, char s28)
 113 {
 114   if (s1[4] == s24)
 115     {
 116       if (s24 == 0)
 117         return 1;
 118       else
 119         return streq5 (s1, s2, s25, s26, s27, s28);
 120     }
 121   else
 122     return 0;
 123 }
 124
 125 static inline int
 126 streq3 (const char *s1, const char *s2, char s23, char s24, char s25, char s26, char s27, char s28)
 127 {
 128   if (s1[3] == s23)
 129     {
 130       if (s23 == 0)
 131         return 1;
 132       else
 133         return streq4 (s1, s2, s24, s25, s26, s27, s28);
 134     }
 135   else
 136     return 0;
 137 }
 138
 139 static inline int
 140 streq2 (const char *s1, const char *s2, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
 141 {
 142   if (s1[2] == s22)
 143     {
 144       if (s22 == 0)
 145         return 1;
 146       else
 147         return streq3 (s1, s2, s23, s24, s25, s26, s27, s28);
 148     }
 149   else
 150     return 0;
 151 }
 152
 153 static inline int
 154 streq1 (const char *s1, const char *s2, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
 155 {
 156   if (s1[1] == s21)
 157     {
 158       if (s21 == 0)
 159         return 1;
 160       else
 161         return streq2 (s1, s2, s22, s23, s24, s25, s26, s27, s28);
 162     }
 163   else
 164     return 0;
 165 }
 166
 167 static inline int
 168 streq0 (const char *s1, const char *s2, char s20, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
 169 {
 170   if (s1[0] == s20)
 171     {
 172       if (s20 == 0)
 173         return 1;
 174       else
 175         return streq1 (s1, s2, s21, s22, s23, s24, s25, s26, s27, s28);
 176     }
 177   else
 178     return 0;
 179 }
 180
 181 #define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
 182   streq0 (s1, s2, s20, s21, s22, s23, s24, s25, s26, s27, s28)
 183
 184 #else
 185
 186 #define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
 187   (strcmp (s1, s2) == 0)
 188
 189 #endif
 190
 191
 192 static int
 193 is_cjk_encoding (const char *encoding)
 194 {
 195   if (0
 196       /* Legacy Japanese encodings */
 197       || STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)
 198       /* Legacy Chinese encodings */
 199       || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
 200       || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
 201       || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
 202       || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
 203       /* Legacy Korean encodings */
 204       || STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
 205       || STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0)
 206       || STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0))
 207     return 1;
 208   return 0;
 209 }
 210
 211 static int
 212 is_utf8_encoding (const char *encoding)
 213 {
 214   if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0))
 215     return 1;
 216   return 0;
 217 }
 218
 219
 220 /* Determine number of column positions required for UC. */
 221 int uc_width (unsigned int uc, const char *encoding);
 222
 223 /*
 224  * Non-spacing attribute table.
 225  * Consists of:
 226  * - Non-spacing characters; generated from PropList.txt or
 227  *   "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
 228  * - Format control characters; generated from
 229  *   "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
 230  * - Zero width characters; generated from
 231  *   "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
 232  */
 233 static const unsigned char nonspacing_table_data[16*64] = {
 234   /* 0x0000-0x01ff */
 235   0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */
 236   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */
 237   0xff, 0xff, 0xff, 0xff, 0x00, 0x20, 0x00, 0x00, /* 0x0080-0x00bf */
 238   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */
 239   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */
 240   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */
 241   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */
 242   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */
 243   /* 0x0200-0x03ff */
 244   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */
 245   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */
 246   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */
 247   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */
 248   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */
 249   0xff, 0xff, 0xff, 0xe0, 0xff, 0xff, 0x00, 0x00, /* 0x0340-0x037f */
 250   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */
 251   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */
 252   /* 0x0400-0x05ff */
 253   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */
 254   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */
 255   0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */
 256   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */
 257   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */
 258   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */
 259   0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */
 260   0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
 261   /* 0x0600-0x07ff */
 262   0x0f, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
 263   0x00, 0xf8, 0xff, 0x01, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
 264   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
 265   0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
 266   0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
 267   0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
 268   0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
 269   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */
 270   /* 0x0800-0x09ff */
 271   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */
 272   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
 273   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
 274   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */
 275   0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */
 276   0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
 277   0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
 278   0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */
 279   /* 0x0a00-0x0bff */
 280   0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */
 281   0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */
 282   0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */
 283   0xbe, 0x21, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */
 284   0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */
 285   0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */
 286   0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */
 287   0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */
 288   /* 0x0c00-0x0dff */
 289   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */
 290   0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */
 291   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0c80-0x0cbf */
 292   0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */
 293   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */
 294   0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */
 295   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */
 296   0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */
 297   /* 0x0e00-0x0fff */
 298   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */
 299   0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */
 300   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */
 301   0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */
 302   0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */
 303   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */
 304   0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */
 305   0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */
 306   /* 0x1000-0x11ff */
 307   0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */
 308   0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */
 309   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */
 310   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */
 311   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */
 312   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */
 313   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */
 314   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */
 315   /* 0x1600-0x17ff */
 316   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */
 317   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */
 318   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */
 319   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */
 320   0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, /* 0x1700-0x173f */
 321   0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, /* 0x1740-0x177f */
 322   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x3f, /* 0x1780-0x17bf */
 323   0x40, 0xfe, 0x0f, 0x20, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */
 324   /* 0x1800-0x19ff */
 325   0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */
 326   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */
 327   0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
 328   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */
 329   0x00, 0x00, 0x00, 0x00, 0x87, 0x0f, 0x04, 0x0e, /* 0x1900-0x193f */
 330   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */
 331   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */
 332   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */
 333   /* 0x2000-0x21ff */
 334   0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */
 335   0x00, 0x00, 0x00, 0x00, 0x0f, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */
 336   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */
 337   0x00, 0x00, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00, /* 0x20c0-0x20ff */
 338   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */
 339   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */
 340   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */
 341   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */
 342   /* 0x3000-0x31ff */
 343   0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */
 344   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */
 345   0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */
 346   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */
 347   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */
 348   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
 349   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */
 350   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */
 351   /* 0xfa00-0xfbff */
 352   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */
 353   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */
 354   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */
 355   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */
 356   0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */
 357   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */
 358   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */
 359   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */
 360   /* 0xfe00-0xffff */
 361   0xff, 0xff, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */
 362   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */
 363   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */
 364   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */
 365   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */
 366   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */
 367   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
 368   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, /* 0xffc0-0xffff */
 369   /* 0x1d000-0x1d1ff */
 370   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d000-0x1d03f */
 371   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d040-0x1d07f */
 372   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d080-0x1d0bf */
 373   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d0c0-0x1d0ff */
 374   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d100-0x1d13f */
 375   0x00, 0x00, 0x00, 0x00, 0x80, 0x03, 0x00, 0xf8, /* 0x1d140-0x1d17f */
 376   0xe7, 0x0f, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, /* 0x1d180-0x1d1bf */
 377   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* 0x1d1c0-0x1d1ff */
 378 };
 379 static const signed char nonspacing_table_ind[240] = {
 380    0,  1,  2,  3,  4,  5,  6,  7, /* 0x0000-0x0fff */
 381    8, -1, -1,  9, 10, -1, -1, -1, /* 0x1000-0x1fff */
 382   11, -1, -1, -1, -1, -1, -1, -1, /* 0x2000-0x2fff */
 383   12, -1, -1, -1, -1, -1, -1, -1, /* 0x3000-0x3fff */
 384   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x4000-0x4fff */
 385   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x5000-0x5fff */
 386   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x6000-0x6fff */
 387   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x7000-0x7fff */
 388   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x8000-0x8fff */
 389   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x9000-0x9fff */
 390   -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa000-0xafff */
 391   -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb000-0xbfff */
 392   -1, -1, -1, -1, -1, -1, -1, -1, /* 0xc000-0xcfff */
 393   -1, -1, -1, -1, -1, -1, -1, -1, /* 0xd000-0xdfff */
 394   -1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */
 395   -1, -1, -1, -1, -1, 13, -1, 14, /* 0xf000-0xffff */
 396   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10000-0x10fff */
 397   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x11000-0x11fff */
 398   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x12000-0x12fff */
 399   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x13000-0x13fff */
 400   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x14000-0x14fff */
 401   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x15000-0x15fff */
 402   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x16000-0x16fff */
 403   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x17000-0x17fff */
 404   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x18000-0x18fff */
 405   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x19000-0x19fff */
 406   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1a000-0x1afff */
 407   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1b000-0x1bfff */
 408   -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1c000-0x1cfff */
 409   15, -1, -1, -1, -1, -1, -1, -1  /* 0x1d000-0x1dfff */
 410 };
 411
 412 /* Determine number of column positions required for UC. */
 413 int
 414 uc_width (unsigned int uc, const char *encoding)
 415 {
 416   /* Test for non-spacing or control character.  */
 417   if ((uc >> 9) < 240)
 418     {
 419       int ind = nonspacing_table_ind[uc >> 9];
 420       if (ind >= 0)
 421         if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1)
 422           {
 423             if (uc > 0 && uc < 0xa0)
 424               return -1;
 425             else
 426               return 0;
 427           }
 428     }
 429   else if ((uc >> 9) == (0xe0000 >> 9))
 430     {
 431       if (uc < 0xe0100
 432           ? (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001)
 433           : (uc <= 0xe01ef))
 434         return 0;
 435     }
 436   /* Test for double-width character.
 437    * Generated from "grep '^....;[WF]' EastAsianWidth.txt"
 438    * and            "grep '^....;[^WF]' EastAsianWidth.txt"
 439    */
 440   if (uc >= 0x1100
 441       && ((uc < 0x1160) /* Hangul Jamo */
 442           || (uc >= 0x2e80 && uc < 0x4dc0  /* CJK */
 443               && !(uc == 0x303f))
 444           || (uc >= 0x4e00 && uc < 0xa4d0) /* CJK ... Yi */
 445           || (uc >= 0xac00 && uc < 0xd7a4) /* Hangul Syllables */
 446           || (uc >= 0xf900 && uc < 0xfb00) /* CJK Compatibility Ideographs */
 447           || (uc >= 0xfe30 && uc < 0xfe70) /* CJK Compatibility Forms */
 448           || (uc >= 0xff00 && uc < 0xff61) /* Fullwidth Forms */
 449           || (uc >= 0xffe0 && uc < 0xffe7)
 450           || (uc >= 0x20000 && uc <= 0x2fffd) /* CJK, CJK Compatibility Ideographs */
 451           || (uc >= 0x30000 && uc <= 0x3fffd)
 452      )   )
 453     return 2;
 454   /* In ancient CJK encodings, Cyrillic and most other characters are
 455      double-width as well.  */
 456   if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
 457       && is_cjk_encoding (encoding))
 458     return 2;
 459   return 1;
 460 }
 461
 462
 463 #ifdef unused
 464
 465 /* Determine number of column positions required for first N units
 466    (or fewer if S ends before this) in S.  */
 467
 468 int
 469 u8_width (const unsigned char *s, size_t n, const char *encoding)
 470 {
 471   const unsigned char *s_end = s + n;
 472   int width = 0;
 473
 474   while (s < s_end)
 475     {
 476       unsigned int uc;
 477       int w;
 478
 479       s += u8_mbtouc (&uc, s, s_end - s);
 480
 481       if (uc == 0)
 482         break; /* end of string reached */
 483
 484       w = uc_width (uc, encoding);
 485       if (w >= 0) /* ignore control characters in the string */
 486         width += w;
 487     }
 488
 489   return width;
 490 }
 491
 492 int
 493 u16_width (const unsigned short *s, size_t n, const char *encoding)
 494 {
 495   const unsigned short *s_end = s + n;
 496   int width = 0;
 497
 498   while (s < s_end)
 499     {
 500       unsigned int uc;
 501       int w;
 502
 503       s += u16_mbtouc (&uc, s, s_end - s);
 504
 505       if (uc == 0)
 506         break; /* end of string reached */
 507
 508       w = uc_width (uc, encoding);
 509       if (w >= 0) /* ignore control characters in the string */
 510         width += w;
 511     }
 512
 513   return width;
 514 }
 515
 516 int
 517 u32_width (const unsigned int *s, size_t n, const char *encoding)
 518 {
 519   const unsigned int *s_end = s + n;
 520   int width = 0;
 521
 522   while (s < s_end)
 523     {
 524       unsigned int uc = *s++;
 525       int w;
 526
 527       if (uc == 0)
 528         break; /* end of string reached */
 529
 530       w = uc_width (uc, encoding);
 531       if (w >= 0) /* ignore control characters in the string */
 532         width += w;
 533     }
 534
 535   return width;
 536 }
 537
 538 #endif
 539
 540
 541 /* Determine the line break points in S, and store the result at p[0..n-1].  */
 542 /* We don't support line breaking of complex-context dependent characters
 543    (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
 544
 545 /* Line breaking classification.  */
 546
 547 enum
 548 {
 549   /* Values >= 20 are resolved at run time. */
 550   LBP_BK =  0, /* mandatory break */
 551 /*LBP_CR,         carriage return - not used here because it's a DOSism */
 552 /*LBP_LF,         line feed - not used here because it's a DOSism */
 553   LBP_CM = 20, /* attached characters and combining marks */
 554 /*LBP_SG,         surrogates - not used here because they are not characters */
 555   LBP_ZW =  1, /* zero width space */
 556   LBP_IN =  2, /* inseparable */
 557   LBP_GL =  3, /* non-breaking (glue) */
 558   LBP_CB = 22, /* contingent break opportunity */
 559   LBP_SP = 21, /* space */
 560   LBP_BA =  4, /* break opportunity after */
 561   LBP_BB =  5, /* break opportunity before */
 562   LBP_B2 =  6, /* break opportunity before and after */
 563   LBP_HY =  7, /* hyphen */
 564   LBP_NS =  8, /* non starter */
 565   LBP_OP =  9, /* opening punctuation */
 566   LBP_CL = 10, /* closing punctuation */
 567   LBP_QU = 11, /* ambiguous quotation */
 568   LBP_EX = 12, /* exclamation/interrogation */
 569   LBP_ID = 13, /* ideographic */
 570   LBP_NU = 14, /* numeric */
 571   LBP_IS = 15, /* infix separator (numeric) */
 572   LBP_SY = 16, /* symbols allowing breaks */
 573   LBP_AL = 17, /* ordinary alphabetic and symbol characters */
 574   LBP_PR = 18, /* prefix (numeric) */
 575   LBP_PO = 19, /* postfix (numeric) */
 576   LBP_SA = 23, /* complex context (South East Asian) */
 577   LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
 578   LBP_XX = 25  /* unknown */
 579 };
 580
 581 #include "lbrkprop.h"
 582
 583 static inline unsigned char
 584 lbrkprop_lookup (unsigned int uc)
 585 {
 586   unsigned int index1 = uc >> lbrkprop_header_0;
 587   if (index1 < lbrkprop_header_1)
 588     {
 589       int lookup1 = lbrkprop.level1[index1];
 590       if (lookup1 >= 0)
 591         {
 592           unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
 593           int lookup2 = lbrkprop.level2[lookup1 + index2];
 594           if (lookup2 >= 0)
 595             {
 596               unsigned int index3 = uc & lbrkprop_header_4;
 597               return lbrkprop.level3[lookup2 + index3];
 598             }
 599         }
 600     }
 601   return LBP_XX;
 602 }
 603
 604 /* Table indexed by two line breaking classifications.  */
 605 #define D 1  /* direct break opportunity, empty in table 7.3 of UTR #14 */
 606 #define I 2  /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
 607 #define P 3  /* prohibited break,           '^' in table 7.3 of UTR #14 */
 608 static const unsigned char lbrk_table[19][19] = {
 609                                 /* after */
 610         /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
 611 /* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
 612 /* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 613 /* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
 614 /* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 615 /* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
 616 /* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 617 /* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 618 /* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 619 /* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
 620 /* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
 621 /* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
 622 /* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 623 /* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
 624 /* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
 625 /* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
 626 /* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
 627 /* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
 628 /* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
 629 /* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
 630 /* "" */
 631 /* before */
 632 };
 633 /* Note: The (B2,B2) entry should probably be D instead of P.  */
 634 /* Note: The (PR,ID) entry should probably be D instead of I.  */
 635
 636 void
 637 u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
 638 {
 639   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
 640   const unsigned char *s_end = s + n;
 641   int last_prop = LBP_BK; /* line break property of last non-space character */
 642   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 643   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
 644
 645   /* Don't break inside multibyte characters.  */
 646   memset (p, UC_BREAK_PROHIBITED, n);
 647
 648   while (s < s_end)
 649     {
 650       unsigned int uc;
 651       int count = u8_mbtouc (&uc, s, s_end - s);
 652       int prop = lbrkprop_lookup (uc);
 653
 654       if (prop == LBP_BK)
 655         {
 656           /* Mandatory break.  */
 657           *p = UC_BREAK_MANDATORY;
 658           last_prop = LBP_BK;
 659           seen_space = NULL;
 660           seen_space2 = NULL;
 661         }
 662       else
 663         {
 664           char *q;
 665
 666           /* Resolve property values whose behaviour is not fixed.  */
 667           switch (prop)
 668             {
 669               case LBP_AI:
 670                 /* Resolve ambiguous.  */
 671                 prop = LBP_AI_REPLACEMENT;
 672                 break;
 673               case LBP_CB:
 674                 /* This is arbitrary.  */
 675                 prop = LBP_ID;
 676                 break;
 677               case LBP_SA:
 678                 /* We don't handle complex scripts yet.
 679                    Treat LBP_SA like LBP_XX.  */
 680               case LBP_XX:
 681                 /* This is arbitrary.  */
 682                 prop = LBP_AL;
 683                 break;
 684             }
 685
 686           /* Deal with combining characters.  */
 687           q = p;
 688           if (prop == LBP_CM)
 689             {
 690               /* Don't break just before a combining character.  */
 691               *p = UC_BREAK_PROHIBITED;
 692               /* A combining character turns a preceding space into LBP_AL.  */
 693               if (seen_space != NULL)
 694                 {
 695                   q = seen_space;
 696                   seen_space = seen_space2;
 697                   prop = LBP_AL;
 698                   goto lookup_via_table;
 699                 }
 700             }
 701           else if (prop == LBP_SP)
 702             {
 703               /* Don't break just before a space.  */
 704               *p = UC_BREAK_PROHIBITED;
 705               seen_space2 = seen_space;
 706               seen_space = p;
 707             }
 708           else
 709             {
 710              lookup_via_table:
 711               /* prop must be usable as an index for table 7.3 of UTR #14.  */
 712               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
 713                 abort ();
 714
 715               if (last_prop == LBP_BK)
 716                 {
 717                   /* Don't break at the beginning of a line.  */
 718                   *q = UC_BREAK_PROHIBITED;
 719                 }
 720               else
 721                 {
 722                   switch (lbrk_table [last_prop-1] [prop-1])
 723                     {
 724                       case D:
 725                         *q = UC_BREAK_POSSIBLE;
 726                         break;
 727                       case I:
 728                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
 729                         break;
 730                       case P:
 731                         *q = UC_BREAK_PROHIBITED;
 732                         break;
 733                       default:
 734                         abort ();
 735                     }
 736                 }
 737               last_prop = prop;
 738               seen_space = NULL;
 739               seen_space2 = NULL;
 740             }
 741         }
 742
 743       s += count;
 744       p += count;
 745     }
 746 }
 747
 748 #ifdef unused
 749
 750 void
 751 u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
 752 {
 753   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
 754   const unsigned short *s_end = s + n;
 755   int last_prop = LBP_BK; /* line break property of last non-space character */
 756   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 757   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
 758
 759   /* Don't break inside multibyte characters.  */
 760   memset (p, UC_BREAK_PROHIBITED, n);
 761
 762   while (s < s_end)
 763     {
 764       unsigned int uc;
 765       int count = u16_mbtouc (&uc, s, s_end - s);
 766       int prop = lbrkprop_lookup (uc);
 767
 768       if (prop == LBP_BK)
 769         {
 770           /* Mandatory break.  */
 771           *p = UC_BREAK_MANDATORY;
 772           last_prop = LBP_BK;
 773           seen_space = NULL;
 774           seen_space2 = NULL;
 775         }
 776       else
 777         {
 778           char *q;
 779
 780           /* Resolve property values whose behaviour is not fixed.  */
 781           switch (prop)
 782             {
 783               case LBP_AI:
 784                 /* Resolve ambiguous.  */
 785                 prop = LBP_AI_REPLACEMENT;
 786                 break;
 787               case LBP_CB:
 788                 /* This is arbitrary.  */
 789                 prop = LBP_ID;
 790                 break;
 791               case LBP_SA:
 792                 /* We don't handle complex scripts yet.
 793                    Treat LBP_SA like LBP_XX.  */
 794               case LBP_XX:
 795                 /* This is arbitrary.  */
 796                 prop = LBP_AL;
 797                 break;
 798             }
 799
 800           /* Deal with combining characters.  */
 801           q = p;
 802           if (prop == LBP_CM)
 803             {
 804               /* Don't break just before a combining character.  */
 805               *p = UC_BREAK_PROHIBITED;
 806               /* A combining character turns a preceding space into LBP_AL.  */
 807               if (seen_space != NULL)
 808                 {
 809                   q = seen_space;
 810                   seen_space = seen_space2;
 811                   prop = LBP_AL;
 812                   goto lookup_via_table;
 813                 }
 814             }
 815           else if (prop == LBP_SP)
 816             {
 817               /* Don't break just before a space.  */
 818               *p = UC_BREAK_PROHIBITED;
 819               seen_space2 = seen_space;
 820               seen_space = p;
 821             }
 822           else
 823             {
 824              lookup_via_table:
 825               /* prop must be usable as an index for table 7.3 of UTR #14.  */
 826               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
 827                 abort ();
 828
 829               if (last_prop == LBP_BK)
 830                 {
 831                   /* Don't break at the beginning of a line.  */
 832                   *q = UC_BREAK_PROHIBITED;
 833                 }
 834               else
 835                 {
 836                   switch (lbrk_table [last_prop-1] [prop-1])
 837                     {
 838                       case D:
 839                         *q = UC_BREAK_POSSIBLE;
 840                         break;
 841                       case I:
 842                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
 843                         break;
 844                       case P:
 845                         *q = UC_BREAK_PROHIBITED;
 846                         break;
 847                       default:
 848                         abort ();
 849                     }
 850                 }
 851               last_prop = prop;
 852               seen_space = NULL;
 853               seen_space2 = NULL;
 854             }
 855         }
 856
 857       s += count;
 858       p += count;
 859     }
 860 }
 861
 862 void
 863 u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
 864 {
 865   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
 866   const unsigned int *s_end = s + n;
 867   int last_prop = LBP_BK; /* line break property of last non-space character */
 868   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 869   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
 870
 871   while (s < s_end)
 872     {
 873       unsigned int uc = *s;
 874       int prop = lbrkprop_lookup (uc);
 875
 876       if (prop == LBP_BK)
 877         {
 878           /* Mandatory break.  */
 879           *p = UC_BREAK_MANDATORY;
 880           last_prop = LBP_BK;
 881           seen_space = NULL;
 882           seen_space2 = NULL;
 883         }
 884       else
 885         {
 886           char *q;
 887
 888           /* Resolve property values whose behaviour is not fixed.  */
 889           switch (prop)
 890             {
 891               case LBP_AI:
 892                 /* Resolve ambiguous.  */
 893                 prop = LBP_AI_REPLACEMENT;
 894                 break;
 895               case LBP_CB:
 896                 /* This is arbitrary.  */
 897                 prop = LBP_ID;
 898                 break;
 899               case LBP_SA:
 900                 /* We don't handle complex scripts yet.
 901                    Treat LBP_SA like LBP_XX.  */
 902               case LBP_XX:
 903                 /* This is arbitrary.  */
 904                 prop = LBP_AL;
 905                 break;
 906             }
 907
 908           /* Deal with combining characters.  */
 909           q = p;
 910           if (prop == LBP_CM)
 911             {
 912               /* Don't break just before a combining character.  */
 913               *p = UC_BREAK_PROHIBITED;
 914               /* A combining character turns a preceding space into LBP_AL.  */
 915               if (seen_space != NULL)
 916                 {
 917                   q = seen_space;
 918                   seen_space = seen_space2;
 919                   prop = LBP_AL;
 920                   goto lookup_via_table;
 921                 }
 922             }
 923           else if (prop == LBP_SP)
 924             {
 925               /* Don't break just before a space.  */
 926               *p = UC_BREAK_PROHIBITED;
 927               seen_space2 = seen_space;
 928               seen_space = p;
 929             }
 930           else
 931             {
 932              lookup_via_table:
 933               /* prop must be usable as an index for table 7.3 of UTR #14.  */
 934               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
 935                 abort ();
 936
 937               if (last_prop == LBP_BK)
 938                 {
 939                   /* Don't break at the beginning of a line.  */
 940                   *q = UC_BREAK_PROHIBITED;
 941                 }
 942               else
 943                 {
 944                   switch (lbrk_table [last_prop-1] [prop-1])
 945                     {
 946                       case D:
 947                         *q = UC_BREAK_POSSIBLE;
 948                         break;
 949                       case I:
 950                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
 951                         break;
 952                       case P:
 953                         *q = UC_BREAK_PROHIBITED;
 954                         break;
 955                       default:
 956                         abort ();
 957                     }
 958                 }
 959               last_prop = prop;
 960               seen_space = NULL;
 961               seen_space2 = NULL;
 962             }
 963         }
 964
 965       s++;
 966       p++;
 967     }
 968 }
 969
 970 #endif
 971
 972
 973 /* Choose the best line breaks, assuming the uc_width function.
 974    Return the column after the end of the string.  */
 975
 976 int
 977 u8_width_linebreaks (const unsigned char *s, size_t n,
 978                      int width, int start_column, int at_end_columns,
 979                      const char *o, const char *encoding,
 980                      char *p)
 981 {
 982   const unsigned char *s_end;
 983   char *last_p;
 984   int last_column;
 985   int piece_width;
 986
 987   u8_possible_linebreaks (s, n, encoding, p);
 988
 989   s_end = s + n;
 990   last_p = NULL;
 991   last_column = start_column;
 992   piece_width = 0;
 993   while (s < s_end)
 994     {
 995       unsigned int uc;
 996       int count = u8_mbtouc (&uc, s, s_end - s);
 997
 998       /* Respect the override.  */
 999       if (o != NULL && *o != UC_BREAK_UNDEFINED)
1000         *p = *o;
1001
1002       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1003         {
1004           /* An atomic piece of text ends here.  */
1005           if (last_p != NULL && last_column + piece_width > width)
1006             {
1007               /* Insert a line break.  */
1008               *last_p = UC_BREAK_POSSIBLE;
1009               last_column = 0;
1010             }
1011         }
1012
1013       if (*p == UC_BREAK_MANDATORY)
1014         {
1015           /* uc is a line break character.  */
1016           /* Start a new piece at column 0.  */
1017           last_p = NULL;
1018           last_column = 0;
1019           piece_width = 0;
1020         }
1021       else
1022         {
1023           /* uc is not a line break character.  */
1024           int w;
1025
1026           if (*p == UC_BREAK_POSSIBLE)
1027             {
1028               /* Start a new piece.  */
1029               last_p = p;
1030               last_column += piece_width;
1031               piece_width = 0;
1032               /* No line break for the moment, may be turned into
1033                  UC_BREAK_POSSIBLE later, via last_p. */
1034             }
1035
1036           *p = UC_BREAK_PROHIBITED;
1037
1038           w = uc_width (uc, encoding);
1039           if (w >= 0) /* ignore control characters in the string */
1040             piece_width += w;
1041          }
1042
1043       s += count;
1044       p += count;
1045       if (o != NULL)
1046         o += count;
1047     }
1048
1049   /* The last atomic piece of text ends here.  */
1050   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1051     {
1052       /* Insert a line break.  */
1053       *last_p = UC_BREAK_POSSIBLE;
1054       last_column = 0;
1055     }
1056
1057   return last_column + piece_width;
1058 }
1059
1060 #ifdef unused
1061
1062 int
1063 u16_width_linebreaks (const unsigned short *s, size_t n,
1064                       int width, int start_column, int at_end_columns,
1065                       const char *o, const char *encoding,
1066                       char *p)
1067 {
1068   const unsigned short *s_end;
1069   char *last_p;
1070   int last_column;
1071   int piece_width;
1072
1073   u16_possible_linebreaks (s, n, encoding, p);
1074
1075   s_end = s + n;
1076   last_p = NULL;
1077   last_column = start_column;
1078   piece_width = 0;
1079   while (s < s_end)
1080     {
1081       unsigned int uc;
1082       int count = u16_mbtouc (&uc, s, s_end - s);
1083
1084       /* Respect the override.  */
1085       if (o != NULL && *o != UC_BREAK_UNDEFINED)
1086         *p = *o;
1087
1088       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1089         {
1090           /* An atomic piece of text ends here.  */
1091           if (last_p != NULL && last_column + piece_width > width)
1092             {
1093               /* Insert a line break.  */
1094               *last_p = UC_BREAK_POSSIBLE;
1095               last_column = 0;
1096             }
1097         }
1098
1099       if (*p == UC_BREAK_MANDATORY)
1100         {
1101           /* uc is a line break character.  */
1102           /* Start a new piece at column 0.  */
1103           last_p = NULL;
1104           last_column = 0;
1105           piece_width = 0;
1106         }
1107       else
1108         {
1109           /* uc is not a line break character.  */
1110           int w;
1111
1112           if (*p == UC_BREAK_POSSIBLE)
1113             {
1114               /* Start a new piece.  */
1115               last_p = p;
1116               last_column += piece_width;
1117               piece_width = 0;
1118               /* No line break for the moment, may be turned into
1119                  UC_BREAK_POSSIBLE later, via last_p. */
1120             }
1121
1122           *p = UC_BREAK_PROHIBITED;
1123
1124           w = uc_width (uc, encoding);
1125           if (w >= 0) /* ignore control characters in the string */
1126             piece_width += w;
1127          }
1128
1129       s += count;
1130       p += count;
1131       if (o != NULL)
1132         o += count;
1133     }
1134
1135   /* The last atomic piece of text ends here.  */
1136   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1137     {
1138       /* Insert a line break.  */
1139       *last_p = UC_BREAK_POSSIBLE;
1140       last_column = 0;
1141     }
1142
1143   return last_column + piece_width;
1144 }
1145
1146 int
1147 u32_width_linebreaks (const unsigned int *s, size_t n,
1148                       int width, int start_column, int at_end_columns,
1149                       const char *o, const char *encoding,
1150                       char *p)
1151 {
1152   const unsigned int *s_end;
1153   char *last_p;
1154   int last_column;
1155   int piece_width;
1156
1157   u32_possible_linebreaks (s, n, encoding, p);
1158
1159   s_end = s + n;
1160   last_p = NULL;
1161   last_column = start_column;
1162   piece_width = 0;
1163   while (s < s_end)
1164     {
1165       unsigned int uc = *s;
1166
1167       /* Respect the override.  */
1168       if (o != NULL && *o != UC_BREAK_UNDEFINED)
1169         *p = *o;
1170
1171       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1172         {
1173           /* An atomic piece of text ends here.  */
1174           if (last_p != NULL && last_column + piece_width > width)
1175             {
1176               /* Insert a line break.  */
1177               *last_p = UC_BREAK_POSSIBLE;
1178               last_column = 0;
1179             }
1180         }
1181
1182       if (*p == UC_BREAK_MANDATORY)
1183         {
1184           /* uc is a line break character.  */
1185           /* Start a new piece at column 0.  */
1186           last_p = NULL;
1187           last_column = 0;
1188           piece_width = 0;
1189         }
1190       else
1191         {
1192           /* uc is not a line break character.  */
1193           int w;
1194
1195           if (*p == UC_BREAK_POSSIBLE)
1196             {
1197               /* Start a new piece.  */
1198               last_p = p;
1199               last_column += piece_width;
1200               piece_width = 0;
1201               /* No line break for the moment, may be turned into
1202                  UC_BREAK_POSSIBLE later, via last_p. */
1203             }
1204
1205           *p = UC_BREAK_PROHIBITED;
1206
1207           w = uc_width (uc, encoding);
1208           if (w >= 0) /* ignore control characters in the string */
1209             piece_width += w;
1210          }
1211
1212       s++;
1213       p++;
1214       if (o != NULL)
1215         o++;
1216     }
1217
1218   /* The last atomic piece of text ends here.  */
1219   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1220     {
1221       /* Insert a line break.  */
1222       *last_p = UC_BREAK_POSSIBLE;
1223       last_column = 0;
1224     }
1225
1226   return last_column + piece_width;
1227 }
1228
1229 #endif
1230
1231
1232 #ifdef TEST1
1233
1234 #include <stdio.h>
1235
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.  The result is heap-allocated and owned by the caller.  If memory
   runs out or reading fails, a message is printed to stderr and the
   process exits with status 1.  */
char *
read_file (FILE *stream)
{
  /* Number of bytes requested from fread per iteration.  */
  enum { CHUNK_SIZE = 4096 };
  char *buf = NULL;
  size_t alloc = 0;     /* bytes allocated for buf */
  size_t size = 0;      /* bytes of buf filled so far; always <= alloc */
  char *resized;

  while (! feof (stream))
    {
      size_t count;

      /* Make sure a full chunk fits behind the data read so far.  */
      if (alloc - size < CHUNK_SIZE)
        {
          size_t needed = size + CHUNK_SIZE;
          size_t grown = alloc + alloc / 2;

          /* Grow by 50%, but at least to what is needed.  This also
             covers a wrapped-around value of grown, which is then
             smaller than needed.  */
          if (grown < needed)
            grown = needed;
          resized = (needed < size ? NULL : realloc (buf, grown));
          if (resized == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = resized;
          alloc = grown;
        }

      count = fread (buf + size, 1, CHUNK_SIZE, stream);
      if (count > 0)
        size += count;
      else if (ferror (stream))
        {
          perror ("fread");
          exit (1);
        }
    }

  /* Trim (or create) the buffer so that it holds the data plus the NUL.  */
  resized = realloc (buf, size + 1);
  if (resized == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  resized[size] = '\0';
  return resized;
}
1283
1284 int
1285 main (int argc, char * argv[])
1286 {
1287   if (argc == 1)
1288     {
1289       /* Display all the break opportunities in the input string.  */
1290       char *input = read_file (stdin);
1291       int length = strlen (input);
1292       char *breaks = malloc (length);
1293       int i;
1294
1295       u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
1296
1297       for (i = 0; i < length; i++)
1298         {
1299           switch (breaks[i])
1300             {
1301               case UC_BREAK_POSSIBLE:
1302                 /* U+2027 in UTF-8 encoding */
1303                 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
1304                 break;
1305               case UC_BREAK_MANDATORY:
1306                 /* U+21B2 (or U+21B5) in UTF-8 encoding */
1307                 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
1308                 break;
1309               case UC_BREAK_PROHIBITED:
1310                 break;
1311               default:
1312                 abort ();
1313             }
1314           putc (input[i], stdout);
1315         }
1316
1317       free (breaks);
1318
1319       return 0;
1320     }
1321   else if (argc == 2)
1322     {
1323       /* Insert line breaks for a given width.  */
1324       int width = atoi (argv[1]);
1325       char *input = read_file (stdin);
1326       int length = strlen (input);
1327       char *breaks = malloc (length);
1328       int i;
1329
1330       u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
1331
1332       for (i = 0; i < length; i++)
1333         {
1334           switch (breaks[i])
1335             {
1336               case UC_BREAK_POSSIBLE:
1337                 putc ('\n', stdout);
1338                 break;
1339               case UC_BREAK_MANDATORY:
1340                 break;
1341               case UC_BREAK_PROHIBITED:
1342                 break;
1343               default:
1344                 abort ();
1345             }
1346           putc (input[i], stdout);
1347         }
1348
1349       free (breaks);
1350
1351       return 0;
1352     }
1353   else
1354     return 1;
1355 }
1356
1357 #endif /* TEST1 */
1358
1359
1360 /* Now the same thing with an arbitrary encoding.
1361
1362    We convert the input string to Unicode.
1363
1364    The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
1365    UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
1366    \U0000FFFF.  UTF-16 and variants support only characters up to
1367    \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
1368    UCS-4 specification leaves doubts about endianness and byte order mark.
1369    glibc currently interprets it as big endian without byte order mark,
1370    but this is not backed by an RFC.  So we use UTF-8. It supports
1371    characters up to \U7FFFFFFF and is unambiguously defined.  */
1372
1373 #if HAVE_ICONV
1374
1375 #include <iconv.h>
1376 #include <errno.h>
1377
1378 /* Luckily, the encoding's name is platform independent.  */
1379 #define UTF8_NAME "UTF-8"
1380
1381 /* Return the length of a string after conversion through an iconv_t.  */
1382 static size_t
1383 iconv_string_length (iconv_t cd, const char *s, size_t n)
1384 {
1385 #define TMPBUFSIZE 4096
1386   size_t count = 0;
1387   char tmpbuf[TMPBUFSIZE];
1388   const char *inptr = s;
1389   size_t insize = n;
1390   while (insize > 0)
1391     {
1392       char *outptr = tmpbuf;
1393       size_t outsize = TMPBUFSIZE;
1394       size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
1395       if (res == (size_t)(-1) && errno != E2BIG)
1396         return (size_t)(-1);
1397       count += outptr - tmpbuf;
1398     }
1399   /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug.  */
1400 #if defined _LIBICONV_VERSION \
1401     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
1402   {
1403     char *outptr = tmpbuf;
1404     size_t outsize = TMPBUFSIZE;
1405     size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
1406     if (res == (size_t)(-1))
1407       return (size_t)(-1);
1408     count += outptr - tmpbuf;
1409   }
1410   /* Return to the initial state.  */
1411   iconv (cd, NULL, NULL, NULL, NULL);
1412 #endif
1413   return count;
1414 #undef TMPBUFSIZE
1415 }
1416
/* Convert the N bytes at S through CD into the buffer T, and record in
   OFFTABLE where the output of each conversion step begins.

   OFFTABLE must have N entries.  On return, offtable[i] is the offset in T
   that had been reached when a conversion step started at input byte i,
   and (size_t)(-1) for every input byte at which no step started.

   M is the number of output bytes the conversion is expected to produce
   (presumably the value computed by iconv_string_length -- confirm
   against the callers).  The function calls abort () if a conversion
   step fails or if the output is not exactly M bytes long.  */
static void
iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
                              size_t *offtable, char *t, size_t m)
{
  size_t i;
  const char *s_end;
  const char *inptr;
  char *outptr;
  size_t outsize;
  /* Avoid glibc-2.1 bug.  */
  /* On that platform iconv is offered one byte of output room more than
     M, and exactly that one byte must be left over at the end.  */
#if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
  const size_t extra = 1;
#else
  const size_t extra = 0;
#endif

  /* Mark every input byte as "no conversion step starts here".  */
  for (i = 0; i < n; i++)
    offtable[i] = (size_t)(-1);

  s_end = s + n;
  inptr = s;
  outptr = t;
  outsize = m + extra;
  while (inptr < s_end)
    {
      const char *saved_inptr;
      size_t insize;
      size_t res;

      /* A conversion step starts at this input byte; remember how much
         output exists so far.  */
      offtable[inptr - s] = outptr - t;

      saved_inptr = inptr;
      res = (size_t)(-1);
      /* Offer iconv 1, 2, 3, ... input bytes until it stops reporting
         EINVAL.  iconv updates inptr, insize, outptr and outsize itself
         for whatever it converts.  */
      for (insize = 1; inptr + insize <= s_end; insize++)
        {
          res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
          if (!(res == (size_t)(-1) && errno == EINVAL))
            break;
          /* We expect that no input bytes have been consumed so far.  */
          if (inptr != saved_inptr)
            abort ();
        }
      /* After we verified the convertibility and computed the translation's
         size m, there shouldn't be any conversion error here. */
      /* This also triggers when the input ends while iconv still reports
         EINVAL.  */
      if (res == (size_t)(-1))
        abort ();
    }
  /* Avoid glibc-2.1 bug and Solaris 7 bug.  */
#if defined _LIBICONV_VERSION \
    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
  /* Flush: let the converter write whatever output it still holds.  */
  if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
    abort ();
#endif
  /* We should have produced exactly m output bytes.  */
  if (outsize != extra)
    abort ();
}
1474
1475 #endif /* HAVE_ICONV */
1476
1477 #if C_CTYPE_ASCII
1478
/* Tests whether the N bytes at S consist solely of bytes for which
   c_isprint or c_isspace holds.  Returns 1 if yes.
   Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding.  */
static int
is_all_ascii (const char *s, size_t n)
{
  size_t i;

  for (i = 0; i < n; i++)
    {
      const unsigned char c = (unsigned char) s[i];

      if (!c_isprint (c) && !c_isspace (c))
        return 0;
    }
  return 1;
}
1493
1494 #endif /* C_CTYPE_ASCII */
1495
1496 #if defined unused || defined TEST2
1497
1498 void
1499 mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
1500                          char *p)
1501 {
1502   if (n == 0)
1503     return;
1504   if (is_utf8_encoding (encoding))
1505     u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1506   else
1507     {
1508 #if HAVE_ICONV
1509       iconv_t to_utf8;
1510       /* Avoid glibc-2.1 bug with EUC-KR.  */
1511 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1512       if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1513         to_utf8 = (iconv_t)(-1);
1514       else
1515 # endif
1516       /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1517          GB18030.  */
1518 # if defined __sun && !defined _LIBICONV_VERSION
1519       if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1520           || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1521           || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1522           || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1523           || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1524           || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1525         to_utf8 = (iconv_t)(-1);
1526       else
1527 # endif
1528       to_utf8 = iconv_open (UTF8_NAME, encoding);
1529       if (to_utf8 != (iconv_t)(-1))
1530         {
1531           /* Determine the length of the resulting UTF-8 string.  */
1532           size_t m = iconv_string_length (to_utf8, s, n);
1533           if (m != (size_t)(-1))
1534             {
1535               /* Convert the string to UTF-8 and build a translation table
1536                  from offsets into s to offsets into the translated string.  */
1537               size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
1538               char *memory =
1539                 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1540               if (memory != NULL)
1541                 {
1542                   size_t *offtable = (size_t *) memory;
1543                   char *t = (char *) (offtable + n);
1544                   char *q = (char *) (t + m);
1545                   size_t i;
1546
1547                   iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1548
1549                   /* Determine the possible line breaks of the UTF-8 string.  */
1550                   u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
1551
1552                   /* Translate the result back to the original string.  */
1553                   memset (p, UC_BREAK_PROHIBITED, n);
1554                   for (i = 0; i < n; i++)
1555                     if (offtable[i] != (size_t)(-1))
1556                       p[i] = q[offtable[i]];
1557
1558                   free (memory);
1559                   iconv_close (to_utf8);
1560                   return;
1561                 }
1562             }
1563           iconv_close (to_utf8);
1564         }
1565 #endif
1566       /* Impossible to convert.  */
1567 #if C_CTYPE_ASCII
1568       if (is_all_ascii (s, n))
1569         {
1570           /* ASCII is a subset of UTF-8.  */
1571           u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1572           return;
1573         }
1574 #endif
1575       /* We have a non-ASCII string and cannot convert it.
1576          Don't produce line breaks except those already present in the
1577          input string.  All we assume here is that the encoding is
1578          minimally ASCII compatible.  */
1579       {
1580         const char *s_end = s + n;
1581         while (s < s_end)
1582           {
1583             *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
1584             s++;
1585             p++;
1586           }
1587       }
1588     }
1589 }
1590
1591 #endif
1592
1593 int
1594 mbs_width_linebreaks (const char *s, size_t n,
1595                       int width, int start_column, int at_end_columns,
1596                       const char *o, const char *encoding,
1597                       char *p)
1598 {
1599   if (n == 0)
1600     return start_column;
1601   if (is_utf8_encoding (encoding))
1602     return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1603   else
1604     {
1605 #if HAVE_ICONV
1606       iconv_t to_utf8;
1607       /* Avoid glibc-2.1 bug with EUC-KR.  */
1608 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1609       if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1610         to_utf8 = (iconv_t)(-1);
1611       else
1612 # endif
1613       /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1614          GB18030.  */
1615 # if defined __sun && !defined _LIBICONV_VERSION
1616       if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1617           || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1618           || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1619           || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1620           || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1621           || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1622         to_utf8 = (iconv_t)(-1);
1623       else
1624 # endif
1625       to_utf8 = iconv_open (UTF8_NAME, encoding);
1626       if (to_utf8 != (iconv_t)(-1))
1627         {
1628           /* Determine the length of the resulting UTF-8 string.  */
1629           size_t m = iconv_string_length (to_utf8, s, n);
1630           if (m != (size_t)(-1))
1631             {
1632               /* Convert the string to UTF-8 and build a translation table
1633                  from offsets into s to offsets into the translated string.  */
1634               size_t memory_size =
1635                 xsum4 (xtimes (n, sizeof (size_t)), m, m,
1636                        (o != NULL ? m : 0));
1637               char *memory =
1638                 (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1639               if (memory != NULL)
1640                 {
1641                   size_t *offtable = (size_t *) memory;
1642                   char *t = (char *) (offtable + n);
1643                   char *q = (char *) (t + m);
1644                   char *o8 = (o != NULL ? (char *) (q + m) : NULL);
1645                   int res_column;
1646                   size_t i;
1647
1648                   iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1649
1650                   /* Translate the overrides to the UTF-8 string.  */
1651                   if (o != NULL)
1652                     {
1653                       memset (o8, UC_BREAK_UNDEFINED, m);
1654                       for (i = 0; i < n; i++)
1655                         if (offtable[i] != (size_t)(-1))
1656                           o8[offtable[i]] = o[i];
1657                     }
1658
1659                   /* Determine the line breaks of the UTF-8 string.  */
1660                   res_column =
1661                     u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
1662
1663                   /* Translate the result back to the original string.  */
1664                   memset (p, UC_BREAK_PROHIBITED, n);
1665                   for (i = 0; i < n; i++)
1666                     if (offtable[i] != (size_t)(-1))
1667                       p[i] = q[offtable[i]];
1668
1669                   free (memory);
1670                   iconv_close (to_utf8);
1671                   return res_column;
1672                 }
1673             }
1674           iconv_close (to_utf8);
1675         }
1676 #endif
1677       /* Impossible to convert.  */
1678 #if C_CTYPE_ASCII
1679       if (is_all_ascii (s, n))
1680         {
1681           /* ASCII is a subset of UTF-8.  */
1682           return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1683         }
1684 #endif
1685       /* We have a non-ASCII string and cannot convert it.
1686          Don't produce line breaks except those already present in the
1687          input string.  All we assume here is that the encoding is
1688          minimally ASCII compatible.  */
1689       {
1690         const char *s_end = s + n;
1691         while (s < s_end)
1692           {
1693             *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
1694                   ? UC_BREAK_MANDATORY
1695                   : UC_BREAK_PROHIBITED);
1696             s++;
1697             p++;
1698             if (o != NULL)
1699               o++;
1700           }
1701         /* We cannot compute widths in this case.  */
1702         return start_column;
1703       }
1704     }
1705 }
1706
1707
1708 #ifdef TEST2
1709
1710 #include <stdio.h>
1711 #include <locale.h>
1712
/* Read the contents of an input stream up to end-of-file, and return it
   as a freshly allocated buffer, terminated with a NUL byte.
   The caller owns the returned buffer and releases it with free().
   If a read error occurs, perror() is called and the program exits with
   status 1.  If memory runs out, a message is printed to stderr and the
   program exits with status 1.  */
char *
read_file (FILE *stream)
{
#define BUFSIZE 4096
  char *buf = NULL;
  size_t alloc = 0;   /* number of bytes allocated for buf */
  size_t size = 0;    /* number of bytes read so far; always <= alloc */
  char *trimmed;

  for (;;)
    {
      size_t count;

      /* Make sure there is room for one more full chunk.  */
      if (alloc - size < BUFSIZE)
        {
          /* Grow by 50%, but at least enough for the next chunk.  The
             first comparison catches wrap-around of the size_t sum.  */
          size_t new_alloc = alloc + alloc / 2;
          char *new_buf;

          if (new_alloc < alloc || new_alloc - size < BUFSIZE)
            {
              if (size > (size_t)(-1) - BUFSIZE)
                {
                  /* size + BUFSIZE is not representable.  */
                  fprintf (stderr, "out of memory\n");
                  exit (1);
                }
              new_alloc = size + BUFSIZE;
            }

          /* Keep the old pointer until realloc is known to have
             succeeded.  */
          new_buf = realloc (buf, new_alloc);
          if (new_buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = new_buf;
          alloc = new_alloc;
        }

      count = fread (buf + size, 1, BUFSIZE, stream);
      size += count;
      if (count < BUFSIZE)
        {
          /* A short read means either a read error or end-of-file.  */
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
          break;
        }
    }

  /* The last read was short, hence size < alloc: the buffer already has
     room for the terminating NUL.  Trimming it to the exact size is only
     an optimization, so a failure here is not fatal.  */
  trimmed = realloc (buf, size + 1);
  if (trimmed != NULL)
    buf = trimmed;
  buf[size] = '\0';
  return buf;
#undef BUFSIZE
}
1760
1761 int
1762 main (int argc, char * argv[])
1763 {
1764   setlocale (LC_CTYPE, "");
1765   if (argc == 1)
1766     {
1767       /* Display all the break opportunities in the input string.  */
1768       char *input = read_file (stdin);
1769       int length = strlen (input);
1770       char *breaks = malloc (length);
1771       int i;
1772
1773       mbs_possible_linebreaks (input, length, locale_charset (), breaks);
1774
1775       for (i = 0; i < length; i++)
1776         {
1777           switch (breaks[i])
1778             {
1779               case UC_BREAK_POSSIBLE:
1780                 putc ('|', stdout);
1781                 break;
1782               case UC_BREAK_MANDATORY:
1783                 break;
1784               case UC_BREAK_PROHIBITED:
1785                 break;
1786               default:
1787                 abort ();
1788             }
1789           putc (input[i], stdout);
1790         }
1791
1792       free (breaks);
1793
1794       return 0;
1795     }
1796   else if (argc == 2)
1797     {
1798       /* Insert line breaks for a given width.  */
1799       int width = atoi (argv[1]);
1800       char *input = read_file (stdin);
1801       int length = strlen (input);
1802       char *breaks = malloc (length);
1803       int i;
1804
1805       mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
1806
1807       for (i = 0; i < length; i++)
1808         {
1809           switch (breaks[i])
1810             {
1811               case UC_BREAK_POSSIBLE:
1812                 putc ('\n', stdout);
1813                 break;
1814               case UC_BREAK_MANDATORY:
1815                 break;
1816               case UC_BREAK_PROHIBITED:
1817                 break;
1818               default:
1819                 abort ();
1820             }
1821           putc (input[i], stdout);
1822         }
1823
1824       free (breaks);
1825
1826       return 0;
1827     }
1828   else
1829     return 1;
1830 }
1831
1832 #endif /* TEST2 */