1 /* Generate Unicode conforming Line Break Properties tables from a UnicodeData file.
3 Written by Bruno Haible <bruno@clisp.org>, 2000-2004.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
20 $ gen-lbrkprop /usr/local/share/Unidata/UnicodeData.txt \
22 /usr/local/share/Unidata/EastAsianWidth.txt \
23 /usr/local/share/Unidata/LineBreak.txt \
/* This structure represents one line in the UnicodeData.txt file.
   fill_attribute() stores an empty string field as "" and an empty case
   mapping as NONE.  A NULL name marks an entry that is not present
   (fill_attributes() resets every name to NULL before reading).  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining classes */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  int mirrored;               /* Nonzero when the mirrored field starts with 'Y' */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};
53 /* Missing fields are represented with "" for strings, and NONE for the upper/lower/title case mappings. */
55 #define NONE (~(unsigned int)0)
57 /* The entire contents of the UnicodeData.txt file.
   One entry per code point, 0 .. 0x10FFFF.  fill_attributes() resets every
   name member to NULL before reading; fill_width() and get_lbp() test
   name != NULL to decide whether an entry is present. */
58 struct unicode_attribute unicode_attributes
[0x110000];
60 /* Stores in unicode_attributes[i] the values from the given fields.
   field1 .. field14 are the fields that follow the code point on a
   UnicodeData.txt line, as passed by fill_attributes().
   - field1 (name) is always copied with strdup.
   - field2 .. field8, field10 and field11: an empty field is stored as the
     literal "", a non-empty one as a strdup copy.
   - field9: mirrored becomes 1 when the field starts with 'Y', else 0.
   - field12 .. field14: parsed as hexadecimal; an empty field becomes NONE.
   NOTE(review): in the lines visible here the strdup results are stored
   without a NULL check, and a NULL name is what other code treats as
   "entry not present". */
62 fill_attribute (unsigned int i
,
63 const char *field1
, const char *field2
,
64 const char *field3
, const char *field4
,
65 const char *field5
, const char *field6
,
66 const char *field7
, const char *field8
,
67 const char *field9
, const char *field10
,
68 const char *field11
, const char *field12
,
69 const char *field13
, const char *field14
)
71 struct unicode_attribute
* uni
;
/* Error report.  The condition guarding it is not visible in this listing;
   presumably i >= 0x110000, the size of unicode_attributes[] - TODO confirm. */
75 fprintf (stderr
, "index too large\n");
78 uni
= &unicode_attributes
[i
];
79 /* Copy the strings. */
80 uni
->name
= strdup (field1
);
81 uni
->category
= (field2
[0] == '\0' ? "" : strdup (field2
));
82 uni
->combining
= (field3
[0] == '\0' ? "" : strdup (field3
));
83 uni
->bidi
= (field4
[0] == '\0' ? "" : strdup (field4
));
84 uni
->decomposition
= (field5
[0] == '\0' ? "" : strdup (field5
));
85 uni
->decdigit
= (field6
[0] == '\0' ? "" : strdup (field6
));
86 uni
->digit
= (field7
[0] == '\0' ? "" : strdup (field7
));
87 uni
->numeric
= (field8
[0] == '\0' ? "" : strdup (field8
));
/* Boolean flag: only the first character of field9 is examined. */
88 uni
->mirrored
= (field9
[0] == 'Y');
89 uni
->oldname
= (field10
[0] == '\0' ? "" : strdup (field10
));
90 uni
->comment
= (field11
[0] == '\0' ? "" : strdup (field11
));
/* Case mappings: a hexadecimal code point, or NONE for an empty field.
   strtoul yields unsigned long; the members are unsigned int. */
91 uni
->upper
= (field12
[0] =='\0' ? NONE
: strtoul (field12
, NULL
, 16));
92 uni
->lower
= (field13
[0] =='\0' ? NONE
: strtoul (field13
, NULL
, 16));
93 uni
->title
= (field14
[0] =='\0' ? NONE
: strtoul (field14
, NULL
, 16));
96 /* Maximum length of a field in the UnicodeData.txt file. */
99 /* Reads the next field from STREAM. The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM.
   Returns 1 when a field was successfully read, otherwise 0.
   NOTE(review): the declarations of c and count, the carriage-return
   handling, the store into BUFFER and the return statements are not visible
   in this listing; the notes below cover only the lines that are. */
103 getfield (FILE *stream
, char *buffer
, int delim
)
/* Comma operator: c is assigned first and then tested.  The loop ends at
   end of file or at DELIM; the delimiter has already been consumed from
   STREAM by getc at that point. */
108 for (; (c
= getc (stream
)), (c
!= EOF
&& c
!= delim
); )
110 /* The original unicode.org UnicodeData.txt file happens to have
   CR/LF line terminators. Silently convert to LF. */
115 /* Put c into the buffer. */
/* count is incremented before the comparison; once it reaches FIELDLEN - 1
   the field is reported as too long. */
116 if (++count
>= FIELDLEN
- 1)
118 fprintf (stderr
, "field too long\n");
131 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt file. */
/* Reads the file named UNICODEDATA_FILENAME and records its lines in
   unicode_attributes[] by calling fill_attribute().
   Each line is read as 15 fields with getfield(): field0 .. field13 end at
   a semicolon and field14 ends at the newline.  field0 is the code point in
   hexadecimal.  Two consecutive lines whose names end in ", First>" and
   ", Last>" describe a range.
   Problems are reported on stderr.
   NOTE(review): the declarations of i, j, n, lineno and stream, the
   enclosing loop, the tests on n and whatever follows each error message
   are not visible in this listing. */
134 fill_attributes (const char *unicodedata_filename
)
/* One buffer per field of a line; all have the size getfield() expects. */
138 char field0
[FIELDLEN
];
139 char field1
[FIELDLEN
];
140 char field2
[FIELDLEN
];
141 char field3
[FIELDLEN
];
142 char field4
[FIELDLEN
];
143 char field5
[FIELDLEN
];
144 char field6
[FIELDLEN
];
145 char field7
[FIELDLEN
];
146 char field8
[FIELDLEN
];
147 char field9
[FIELDLEN
];
148 char field10
[FIELDLEN
];
149 char field11
[FIELDLEN
];
150 char field12
[FIELDLEN
];
151 char field13
[FIELDLEN
];
152 char field14
[FIELDLEN
];
/* Mark every code point as not present before reading. */
155 for (i
= 0; i
< 0x110000; i
++)
156 unicode_attributes
[i
].name
= NULL
;
158 stream
= fopen (unicodedata_filename
, "r");
161 fprintf (stderr
, "error during fopen of '%s'\n", unicodedata_filename
);
/* Read the 15 fields of one line.  n sums the getfield() results, so it
   counts the fields that were read successfully. */
170 n
= getfield (stream
, field0
, ';');
171 n
+= getfield (stream
, field1
, ';');
172 n
+= getfield (stream
, field2
, ';');
173 n
+= getfield (stream
, field3
, ';');
174 n
+= getfield (stream
, field4
, ';');
175 n
+= getfield (stream
, field5
, ';');
176 n
+= getfield (stream
, field6
, ';');
177 n
+= getfield (stream
, field7
, ';');
178 n
+= getfield (stream
, field8
, ';');
179 n
+= getfield (stream
, field9
, ';');
180 n
+= getfield (stream
, field10
, ';');
181 n
+= getfield (stream
, field11
, ';');
182 n
+= getfield (stream
, field12
, ';');
183 n
+= getfield (stream
, field13
, ';');
184 n
+= getfield (stream
, field14
, '\n');
/* NOTE(review): unlike the corresponding message in fill_width(), this
   format string has no space between the word in and the opening quote. */
189 fprintf (stderr
, "short line in'%s':%d\n",
190 unicodedata_filename
, lineno
);
/* field0 holds the code point in hexadecimal. */
193 i
= strtoul (field0
, NULL
, 16);
/* Visible part of the range test: the name is at least 9 characters long
   and ends in ", First>" (8 characters).  The start of the condition is not
   visible in this listing. */
195 && strlen (field1
) >= 9
196 && !strcmp (field1
+ strlen(field1
) - 8, ", First>"))
198 /* Deal with a range. */
/* Read the second line of the pair into the same buffers; field0 .. field14
   of the first line are overwritten. */
200 n
= getfield (stream
, field0
, ';');
201 n
+= getfield (stream
, field1
, ';');
202 n
+= getfield (stream
, field2
, ';');
203 n
+= getfield (stream
, field3
, ';');
204 n
+= getfield (stream
, field4
, ';');
205 n
+= getfield (stream
, field5
, ';');
206 n
+= getfield (stream
, field6
, ';');
207 n
+= getfield (stream
, field7
, ';');
208 n
+= getfield (stream
, field8
, ';');
209 n
+= getfield (stream
, field9
, ';');
210 n
+= getfield (stream
, field10
, ';');
211 n
+= getfield (stream
, field11
, ';');
212 n
+= getfield (stream
, field12
, ';');
213 n
+= getfield (stream
, field13
, ';');
214 n
+= getfield (stream
, field14
, '\n');
217 fprintf (stderr
, "missing end range in '%s':%d\n",
218 unicodedata_filename
, lineno
);
/* The second line must be named "<..., Last>": it starts with '<', is at
   least 8 characters long and ends in ", Last>" (7 characters). */
221 if (!(field1
[0] == '<'
222 && strlen (field1
) >= 8
223 && !strcmp (field1
+ strlen (field1
) - 7, ", Last>")))
225 fprintf (stderr
, "missing end range in '%s':%d\n",
226 unicodedata_filename
, lineno
);
/* Cut the trailing ", Last>" off the name. */
229 field1
[strlen (field1
) - 7] = '\0';
/* j: code point of the second line; presumably the inclusive end of the
   range - the loop from i to j is not visible in this listing. */
230 j
= strtoul (field0
, NULL
, 16);
/* field1+1 skips the leading '<' of the range name. */
232 fill_attribute (i
, field1
+1, field2
, field3
, field4
, field5
,
233 field6
, field7
, field8
, field9
, field10
,
234 field11
, field12
, field13
, field14
);
238 /* Single character line */
239 fill_attribute (i
, field1
, field2
, field3
, field4
, field5
,
240 field6
, field7
, field8
, field9
, field10
,
241 field11
, field12
, field13
, field14
);
/* A stream error or a failing fclose is reported as a read error. */
244 if (ferror (stream
) || fclose (stream
))
246 fprintf (stderr
, "error reading from '%s'\n", unicodedata_filename
);
251 /* The combining property from the PropList.txt file.
   One entry per code point, 0 .. 0x10FFFF; fill_combining() stores 0 or 1
   in each entry. */
252 char unicode_combining
[0x110000];
254 /* Stores in unicode_combining[] the Combining property from the
   Unicode 3.0 PropList.txt file.
   All entries are first cleared to 0.  The file named PROPLIST_FILENAME is
   then read line by line until a line containing "(Combining)" is found.
   The following lines are parsed either as a range "XXXX..YYYY" or as a
   single code point "XXXX" (hexadecimal, at most 4 digits each), and the
   matching entries are set to 1.
   Problems are reported on stderr.
   NOTE(review): the declarations of i, i1, i2, buf and stream, the loop
   that walks the property lines and its end condition, and whatever follows
   each error message are not visible in this listing. */
257 fill_combining (const char *proplist_filename
)
263 for (i
= 0; i
< 0x110000; i
++)
264 unicode_combining
[i
] = 0;
266 stream
= fopen (proplist_filename
, "r");
269 fprintf (stderr
, "error during fopen of '%s'\n", proplist_filename
);
273 /* Search for the "Property dump for: 0x20000004 (Combining)" line. */
/* The format reads at most 100 characters up to a newline into buf and then
   skips the following white space; buf therefore needs room for at least
   101 bytes - its declaration is not visible here, TODO confirm. */
276 if (fscanf (stream
, "%100[^\n]\n", buf
) < 1)
278 fprintf (stderr
, "no combining property found in '%s'\n",
283 while (strstr (buf
, "(Combining)") == NULL
);
289 if (fscanf (stream
, "%100[^\n]\n", buf
) < 1)
291 fprintf (stderr
, "premature end of combining property in '%s'\n",
/* Range line: two dots at offsets 4 and 5 and at least 10 characters. */
297 if (strlen (buf
) >= 10 && buf
[4] == '.' && buf
[5] == '.')
299 if (sscanf (buf
, "%4X..%4X", &i1
, &i2
) < 2)
301 fprintf (stderr
, "parse error in combining property in '%s'\n",
/* Single code point line: only i1 is parsed here; presumably i2 is then set
   equal to i1 in a line that is not visible - TODO confirm. */
306 else if (strlen (buf
) >= 4)
308 if (sscanf (buf
, "%4X", &i1
) < 1)
310 fprintf (stderr
, "parse error in combining property in '%s'\n",
318 fprintf (stderr
, "parse error in combining property in '%s'\n",
/* Mark the inclusive range i1 .. i2.  The %4X conversions above accept at
   most four hexadecimal digits, so both values are at most 0xFFFF. */
322 for (i
= i1
; i
<= i2
; i
++)
323 unicode_combining
[i
] = 1;
/* A stream error or a failing fclose is reported as a read error. */
325 if (ferror (stream
) || fclose (stream
))
327 fprintf (stderr
, "error reading from '%s'\n", proplist_filename
);
332 /* The width property from the EastAsianWidth.txt file.
   Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na".
   One entry per code point, 0 .. 0x10FFFF.  fill_width() first sets an
   entry to "N" when unicode_attributes[i].name is non-NULL and to NULL
   otherwise, then stores strdup copies of the width field read from the
   file. */
334 const char * unicode_width
[0x110000];
336 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt file. */
/* Reads the file named WIDTH_FILENAME and fills unicode_width[].
   Every entry starts as "N" when the code point has a name in
   unicode_attributes[], and as NULL otherwise.  Each data line is then read
   as three fields: field0 up to ';' (a code point, or a range written
   "first..last"), field1 up to the next space (the width value) and field2
   up to the newline (not used in the visible lines).
   Problems are reported on stderr.
   NOTE(review): the declarations of i, j, n, c, lineno and stream, the
   enclosing loop, the tests on n, the loop over a range and whatever
   follows each error message are not visible in this listing. */
339 fill_width (const char *width_filename
)
343 char field0
[FIELDLEN
];
344 char field1
[FIELDLEN
];
345 char field2
[FIELDLEN
];
/* Default: "N" for code points that have a name, NULL for the rest. */
348 for (i
= 0; i
< 0x110000; i
++)
349 unicode_width
[i
] = (unicode_attributes
[i
].name
!= NULL
? "N" : NULL
);
351 stream
= fopen (width_filename
, "r");
354 fprintf (stderr
, "error during fopen of '%s'\n", width_filename
);
/* Discard the rest of the current line.  The test that selects the lines to
   skip is not visible in this listing. */
369 do c
= getc (stream
); while (c
!= EOF
&& c
!= '\n');
/* n sums the getfield() results, so it counts the fields that were read
   successfully. */
373 n
= getfield (stream
, field0
, ';');
374 n
+= getfield (stream
, field1
, ' ');
375 n
+= getfield (stream
, field2
, '\n');
380 fprintf (stderr
, "short line in '%s':%d\n", width_filename
, lineno
);
/* strtoul stops at the first non-hexadecimal character, so for a range this
   yields the first code point. */
383 i
= strtoul (field0
, NULL
, 16);
384 if (strstr (field0
, "..") != NULL
)
386 /* Deal with a range. */
/* j: the code point written after the two dots. */
387 j
= strtoul (strstr (field0
, "..") + 2, NULL
, 16);
/* NOTE(review): no check that i or j is below 0x110000 is visible before
   unicode_width[] is indexed, and the strdup result is stored unchecked. */
389 unicode_width
[i
] = strdup (field1
);
393 /* Single character line. */
394 unicode_width
[i
] = strdup (field1
);
/* A stream error or a failing fclose is reported as a read error. */
397 if (ferror (stream
) || fclose (stream
))
399 fprintf (stderr
, "error reading from '%s'\n", width_filename
);
404 /* Line breaking classification. */
/* Line breaking classes.  Each value is used as a bit position: get_lbp()
   and debug_output_lbp() build and test masks of the form (1 << LBP_xx).
   The numbering does not follow the declaration order: CM, SP, CB, SA, AI
   and XX take the values 20 .. 25, all other classes the values 0 .. 19.
   NOTE(review): the header of the enclosing enumeration is not visible in
   this listing. */
408 /* Values >= 20 are resolved at run time. */
409 LBP_BK
= 0, /* mandatory break */
410 /*LBP_CR, carriage return - not used here because it's a DOSism */
411 /*LBP_LF, line feed - not used here because it's a DOSism */
412 LBP_CM
= 20, /* attached characters and combining marks */
413 /*LBP_SG, surrogates - not used here because they are not characters */
414 LBP_ZW
= 1, /* zero width space */
415 LBP_IN
= 2, /* inseparable */
416 LBP_GL
= 3, /* non-breaking (glue) */
417 LBP_CB
= 22, /* contingent break opportunity */
418 LBP_SP
= 21, /* space */
419 LBP_BA
= 4, /* break opportunity after */
420 LBP_BB
= 5, /* break opportunity before */
421 LBP_B2
= 6, /* break opportunity before and after */
422 LBP_HY
= 7, /* hyphen */
423 LBP_NS
= 8, /* non starter */
424 LBP_OP
= 9, /* opening punctuation */
425 LBP_CL
= 10, /* closing punctuation */
426 LBP_QU
= 11, /* ambiguous quotation */
427 LBP_EX
= 12, /* exclamation/interrogation */
428 LBP_ID
= 13, /* ideographic */
429 LBP_NU
= 14, /* numeric */
430 LBP_IS
= 15, /* infix separator (numeric) */
431 LBP_SY
= 16, /* symbols allowing breaks */
432 LBP_AL
= 17, /* ordinary alphabetic and symbol characters */
433 LBP_PR
= 18, /* prefix (numeric) */
434 LBP_PO
= 19, /* postfix (numeric) */
435 LBP_SA
= 23, /* complex context (South East Asian) */
436 LBP_AI
= 24, /* ambiguous (alphabetic or ideograph) */
437 LBP_XX
= 25 /* unknown */
440 /* Returns the line breaking classification for ch, as a bit mask. */
442 get_lbp (unsigned int ch
)
446 if (unicode_attributes
[ch
].name
!= NULL
)
448 /* mandatory break */
449 if (ch
== 0x000A || ch
== 0x000D || ch
== 0x0085 /* newline */
450 || ch
== 0x000C /* form feed */
451 || ch
== 0x2028 /* LINE SEPARATOR */
452 || ch
== 0x2029 /* PARAGRAPH SEPARATOR */)
455 /* zero width space */
456 if (ch
== 0x200B /* ZERO WIDTH SPACE */)
460 if (ch
== 0x2024 /* ONE DOT LEADER */
461 || ch
== 0x2025 /* TWO DOT LEADER */
462 || ch
== 0x2026 /* HORIZONTAL ELLIPSIS */)
465 /* non-breaking (glue) */
466 if (ch
== 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */
467 || ch
== 0x00A0 /* NO-BREAK SPACE */
468 || ch
== 0x202F /* NARROW NO-BREAK SPACE */
469 || ch
== 0x2007 /* FIGURE SPACE */
470 || ch
== 0x2011 /* NON-BREAKING HYPHEN */
471 || ch
== 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */)
474 /* contingent break opportunity */
475 if (ch
== 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
479 if (ch
== 0x0020 /* SPACE */)
482 /* break opportunity after */
483 if (ch
== 0x2000 /* EN QUAD */
484 || ch
== 0x2001 /* EM QUAD */
485 || ch
== 0x2002 /* EN SPACE */
486 || ch
== 0x2003 /* EM SPACE */
487 || ch
== 0x2004 /* THREE-PER-EM SPACE */
488 || ch
== 0x2005 /* FOUR-PER-EM SPACE */
489 || ch
== 0x2006 /* SIX-PER-EM SPACE */
490 || ch
== 0x2008 /* PUNCTUATION SPACE */
491 || ch
== 0x2009 /* THIN SPACE */
492 || ch
== 0x200A /* HAIR SPACE */
493 || ch
== 0x0009 /* tab */
494 || ch
== 0x058A /* ARMENIAN HYPHEN */
495 || ch
== 0x2010 /* HYPHEN */
496 || ch
== 0x2012 /* FIGURE DASH */
497 || ch
== 0x2013 /* EN DASH */
498 || ch
== 0x00AD /* SOFT HYPHEN */
499 || ch
== 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
500 || ch
== 0x1361 /* ETHIOPIC WORDSPACE */
501 || ch
== 0x1680 /* OGHAM SPACE MARK */
502 || ch
== 0x17D5 /* KHMER SIGN BARIYOOSAN */
503 || ch
== 0x2027 /* HYPHENATION POINT */
504 || ch
== 0x007C /* VERTICAL LINE */)
507 /* break opportunity before */
508 if (ch
== 0x00B4 /* ACUTE ACCENT */
509 || ch
== 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
510 || ch
== 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
511 || ch
== 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
514 /* break opportunity before and after */
515 if (ch
== 0x2014 /* EM DASH */)
519 if (ch
== 0x002D /* HYPHEN-MINUS */)
522 /* exclamation/interrogation */
523 if (ch
== 0x0021 /* EXCLAMATION MARK */
524 || ch
== 0x003F /* QUESTION MARK */
525 || ch
== 0xFE56 /* SMALL QUESTION MARK */
526 || ch
== 0xFE57 /* SMALL EXCLAMATION MARK */
527 || ch
== 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
528 || ch
== 0xFF1F /* FULLWIDTH QUESTION MARK */)
531 /* opening punctuation */
532 if (unicode_attributes
[ch
].category
[0] == 'P'
533 && unicode_attributes
[ch
].category
[1] == 's')
536 /* closing punctuation */
537 if (ch
== 0x3001 /* IDEOGRAPHIC COMMA */
538 || ch
== 0x3002 /* IDEOGRAPHIC FULL STOP */
539 || ch
== 0xFE50 /* SMALL COMMA */
540 || ch
== 0xFE52 /* SMALL FULL STOP */
541 || ch
== 0xFF0C /* FULLWIDTH COMMA */
542 || ch
== 0xFF0E /* FULLWIDTH FULL STOP */
543 || ch
== 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
544 || ch
== 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
545 || (unicode_attributes
[ch
].category
[0] == 'P'
546 && unicode_attributes
[ch
].category
[1] == 'e'))
549 /* ambiguous quotation */
550 if (ch
== 0x0022 /* QUOTATION MARK */
551 || ch
== 0x0027 /* APOSTROPHE */
552 || (unicode_attributes
[ch
].category
[0] == 'P'
553 && (unicode_attributes
[ch
].category
[1] == 'f'
554 || unicode_attributes
[ch
].category
[1] == 'i')))
557 /* attached characters and combining marks */
558 if ((unicode_attributes
[ch
].category
[0] == 'M'
559 && (unicode_attributes
[ch
].category
[1] == 'n'
560 || unicode_attributes
[ch
].category
[1] == 'c'
561 || unicode_attributes
[ch
].category
[1] == 'e'))
562 || (ch
>= 0x1160 && ch
<= 0x11F9)
563 || (unicode_attributes
[ch
].category
[0] == 'C'
564 && (unicode_attributes
[ch
].category
[1] == 'c'
565 || unicode_attributes
[ch
].category
[1] == 'f')))
566 if (!(attr
& ((1 << LBP_BK
) | (1 << LBP_BA
) | (1 << LBP_GL
))))
570 if (ch
== 0x0E5A /* THAI CHARACTER ANGKHANKHU */
571 || ch
== 0x0E5B /* THAI CHARACTER KHOMUT */
572 || ch
== 0x17D4 /* KHMER SIGN KHAN */
573 || ch
== 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
574 || ch
== 0x17D7 /* KHMER SIGN LEK TOO */
575 || ch
== 0x17D8 /* KHMER SIGN BEYYAL */
576 || ch
== 0x17D9 /* KHMER SIGN PHNAEK MUAN */
577 || ch
== 0x17DA /* KHMER SIGN KOOMUUT */
578 || ch
== 0x203C /* DOUBLE EXCLAMATION MARK */
579 || ch
== 0x2044 /* FRACTION SLASH */
580 || ch
== 0x3005 /* IDEOGRAPHIC ITERATION MARK */
581 || ch
== 0x301C /* WAVE DASH */
582 || ch
== 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
583 || ch
== 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
584 || ch
== 0x309D /* HIRAGANA ITERATION MARK */
585 || ch
== 0x309E /* HIRAGANA VOICED ITERATION MARK */
586 || ch
== 0x30FB /* KATAKANA MIDDLE DOT */
587 || ch
== 0x30FD /* KATAKANA ITERATION MARK */
588 || ch
== 0xFE54 /* SMALL SEMICOLON */
589 || ch
== 0xFE55 /* SMALL COLON */
590 || ch
== 0xFF1A /* FULLWIDTH COLON */
591 || ch
== 0xFF1B /* FULLWIDTH SEMICOLON */
592 || ch
== 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
593 || ch
== 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
594 || ch
== 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
595 || ch
== 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
596 || (unicode_attributes
[ch
].category
[0] == 'L'
597 && unicode_attributes
[ch
].category
[1] == 'm'
598 && (unicode_width
[ch
][0] == 'W'
599 || unicode_width
[ch
][0] == 'H'))
600 || (unicode_attributes
[ch
].category
[0] == 'S'
601 && unicode_attributes
[ch
].category
[1] == 'k'
602 && unicode_width
[ch
][0] == 'W')
603 || strstr (unicode_attributes
[ch
].name
, "HIRAGANA LETTER SMALL ") != NULL
604 || strstr (unicode_attributes
[ch
].name
, "KATAKANA LETTER SMALL ") != NULL
)
608 if (unicode_attributes
[ch
].category
[0] == 'N'
609 && unicode_attributes
[ch
].category
[1] == 'd'
610 && strstr (unicode_attributes
[ch
].name
, "FULLWIDTH") == NULL
)
613 /* infix separator (numeric) */
614 if (ch
== 0x002C /* COMMA */
615 || ch
== 0x002E /* FULL STOP */
616 || ch
== 0x003A /* COLON */
617 || ch
== 0x003B /* SEMICOLON */
618 || ch
== 0x0589 /* ARMENIAN FULL STOP */)
621 /* symbols allowing breaks */
622 if (ch
== 0x002F /* SOLIDUS */)
625 /* postfix (numeric) */
626 if (ch
== 0x0025 /* PERCENT SIGN */
627 || ch
== 0x00A2 /* CENT SIGN */
628 || ch
== 0x00B0 /* DEGREE SIGN */
629 || ch
== 0x2030 /* PER MILLE SIGN */
630 || ch
== 0x2031 /* PER TEN THOUSAND SIGN */
631 || ch
== 0x2032 /* PRIME */
632 || ch
== 0x2033 /* DOUBLE PRIME */
633 || ch
== 0x2034 /* TRIPLE PRIME */
634 || ch
== 0x2035 /* REVERSED PRIME */
635 || ch
== 0x2036 /* REVERSED DOUBLE PRIME */
636 || ch
== 0x2037 /* REVERSED TRIPLE PRIME */
637 || ch
== 0x20A7 /* PESETA SIGN */
638 || ch
== 0x2103 /* DEGREE CELSIUS */
639 || ch
== 0x2109 /* DEGREE FAHRENHEIT */
640 || ch
== 0x2126 /* OHM SIGN */
641 || ch
== 0xFE6A /* SMALL PERCENT SIGN */
642 || ch
== 0xFF05 /* FULLWIDTH PERCENT SIGN */
643 || ch
== 0xFFE0 /* FULLWIDTH CENT SIGN */)
646 /* prefix (numeric) */
647 if (ch
== 0x002B /* PLUS SIGN */
648 || ch
== 0x005C /* REVERSE SOLIDUS */
649 || ch
== 0x00B1 /* PLUS-MINUS SIGN */
650 || ch
== 0x2116 /* NUMERO SIGN */
651 || ch
== 0x2212 /* MINUS SIGN */
652 || ch
== 0x2213 /* MINUS-OR-PLUS SIGN */
653 || (unicode_attributes
[ch
].category
[0] == 'S'
654 && unicode_attributes
[ch
].category
[1] == 'c'))
655 if (!(attr
& (1 << LBP_PO
)))
658 /* complex context (South East Asian) */
659 if (((ch
>= 0x0E00 && ch
<= 0x0EFF)
660 || (ch
>= 0x1000 && ch
<= 0x109F)
661 || (ch
>= 0x1780 && ch
<= 0x17FF))
662 && unicode_attributes
[ch
].category
[0] == 'L'
663 && (unicode_attributes
[ch
].category
[1] == 'm'
664 || unicode_attributes
[ch
].category
[1] == 'o'))
665 if (!(attr
& ((1 << LBP_CM
) | (1 << LBP_NS
) | (1 << LBP_NU
) | (1 << LBP_BA
) | (1 << LBP_PR
))))
669 if ((ch
>= 0x1100 && ch
<= 0x115F) /* HANGUL CHOSEONG */
670 || (ch
>= 0x2E80 && ch
<= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
671 || ch
== 0x3000 /* IDEOGRAPHIC SPACE */
672 || (ch
>= 0x3130 && ch
<= 0x318F) /* HANGUL LETTER */
673 || (ch
>= 0x3400 && ch
<= 0x4DBF) /* CJK Ideograph Extension A */
674 || (ch
>= 0x4E00 && ch
<= 0x9FAF) /* CJK Ideograph */
675 || (ch
>= 0xF900 && ch
<= 0xFAFF) /* CJK COMPATIBILITY IDEOGRAPH */
676 || (ch
>= 0xAC00 && ch
<= 0xD7AF) /* HANGUL SYLLABLE */
677 || (ch
>= 0xA000 && ch
<= 0xA48C) /* YI SYLLABLE */
678 || (ch
>= 0xA490 && ch
<= 0xA4C6) /* YI RADICAL */
679 || ch
== 0xFE62 /* SMALL PLUS SIGN */
680 || ch
== 0xFE63 /* SMALL HYPHEN-MINUS */
681 || ch
== 0xFE64 /* SMALL LESS-THAN SIGN */
682 || ch
== 0xFE65 /* SMALL GREATER-THAN SIGN */
683 || ch
== 0xFE66 /* SMALL EQUALS SIGN */
684 || (ch
>= 0xFF10 && ch
<= 0xFF19) /* FULLWIDTH DIGIT */
685 || (ch
>= 0x20000 && ch
<= 0x2A6D6) /* CJK Ideograph Extension B */
686 || (ch
>= 0x2F800 && ch
<= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
687 || strstr (unicode_attributes
[ch
].name
, "FULLWIDTH LATIN ") != NULL
688 || (ch
>= 0x3000 && ch
<= 0x33FF
689 && !(attr
& ((1 << LBP_CM
) | (1 << LBP_NS
) | (1 << LBP_OP
) | (1 << LBP_CL
))))
690 /* Extra characters for compatibility with Unicode LineBreak.txt. */
691 || ch
== 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
692 || ch
== 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
693 || ch
== 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
694 || ch
== 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
695 || ch
== 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
696 || ch
== 0xFE49 /* DASHED OVERLINE */
697 || ch
== 0xFE4A /* CENTRELINE OVERLINE */
698 || ch
== 0xFE4B /* WAVY OVERLINE */
699 || ch
== 0xFE4C /* DOUBLE WAVY OVERLINE */
700 || ch
== 0xFE4D /* DASHED LOW LINE */
701 || ch
== 0xFE4E /* CENTRELINE LOW LINE */
702 || ch
== 0xFE4F /* WAVY LOW LINE */
703 || ch
== 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
704 || ch
== 0xFE58 /* SMALL EM DASH */
705 || ch
== 0xFE5F /* SMALL NUMBER SIGN */
706 || ch
== 0xFE60 /* SMALL AMPERSAND */
707 || ch
== 0xFE61 /* SMALL ASTERISK */
708 || ch
== 0xFE68 /* SMALL REVERSE SOLIDUS */
709 || ch
== 0xFE6B /* SMALL COMMERCIAL AT */
710 || ch
== 0xFF02 /* FULLWIDTH QUOTATION MARK */
711 || ch
== 0xFF03 /* FULLWIDTH NUMBER SIGN */
712 || ch
== 0xFF06 /* FULLWIDTH AMPERSAND */
713 || ch
== 0xFF07 /* FULLWIDTH APOSTROPHE */
714 || ch
== 0xFF0A /* FULLWIDTH ASTERISK */
715 || ch
== 0xFF0B /* FULLWIDTH PLUS SIGN */
716 || ch
== 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
717 || ch
== 0xFF0F /* FULLWIDTH SOLIDUS */
718 || ch
== 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
719 || ch
== 0xFF1D /* FULLWIDTH EQUALS SIGN */
720 || ch
== 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
721 || ch
== 0xFF20 /* FULLWIDTH COMMERCIAL AT */
722 || ch
== 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
723 || ch
== 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
724 || ch
== 0xFF3F /* FULLWIDTH LOW LINE */
725 || ch
== 0xFF40 /* FULLWIDTH GRAVE ACCENT */
726 || ch
== 0xFF5C /* FULLWIDTH VERTICAL LINE */
727 || ch
== 0xFF5E /* FULLWIDTH TILDE */
728 || ch
== 0xFFE2 /* FULLWIDTH NOT SIGN */
729 || ch
== 0xFFE3 /* FULLWIDTH MACRON */
730 || ch
== 0xFFE4) /* FULLWIDTH BROKEN BAR */
732 /* ambiguous (ideograph) ? */
733 if (unicode_width
[ch
] != NULL
734 && unicode_width
[ch
][0] == 'A')
740 /* ordinary alphabetic and symbol characters */
741 if ((unicode_attributes
[ch
].category
[0] == 'L'
742 && (unicode_attributes
[ch
].category
[1] == 'u'
743 || unicode_attributes
[ch
].category
[1] == 'l'
744 || unicode_attributes
[ch
].category
[1] == 't'
745 || unicode_attributes
[ch
].category
[1] == 'm'
746 || unicode_attributes
[ch
].category
[1] == 'o'))
747 || (unicode_attributes
[ch
].category
[0] == 'S'
748 && (unicode_attributes
[ch
].category
[1] == 'm'
749 || unicode_attributes
[ch
].category
[1] == 'c'
750 || unicode_attributes
[ch
].category
[1] == 'k'
751 || unicode_attributes
[ch
].category
[1] == 'o'))
752 /* Extra characters for compatibility with Unicode LineBreak.txt. */
753 || ch
== 0x0023 /* NUMBER SIGN */
754 || ch
== 0x0026 /* AMPERSAND */
755 || ch
== 0x002A /* ASTERISK */
756 || ch
== 0x0040 /* COMMERCIAL AT */
757 || ch
== 0x005F /* LOW LINE */
758 || ch
== 0x00A1 /* INVERTED EXCLAMATION MARK */
759 || ch
== 0x00B2 /* SUPERSCRIPT TWO */
760 || ch
== 0x00B3 /* SUPERSCRIPT THREE */
761 || ch
== 0x00B7 /* MIDDLE DOT */
762 || ch
== 0x00B9 /* SUPERSCRIPT ONE */
763 || ch
== 0x00BC /* VULGAR FRACTION ONE QUARTER */
764 || ch
== 0x00BD /* VULGAR FRACTION ONE HALF */
765 || ch
== 0x00BE /* VULGAR FRACTION THREE QUARTERS */
766 || ch
== 0x00BF /* INVERTED QUESTION MARK */
767 || ch
== 0x037E /* GREEK QUESTION MARK */
768 || ch
== 0x0387 /* GREEK ANO TELEIA */
769 || ch
== 0x055A /* ARMENIAN APOSTROPHE */
770 || ch
== 0x055B /* ARMENIAN EMPHASIS MARK */
771 || ch
== 0x055C /* ARMENIAN EXCLAMATION MARK */
772 || ch
== 0x055D /* ARMENIAN COMMA */
773 || ch
== 0x055E /* ARMENIAN QUESTION MARK */
774 || ch
== 0x055F /* ARMENIAN ABBREVIATION MARK */
775 || ch
== 0x05BE /* HEBREW PUNCTUATION MAQAF */
776 || ch
== 0x05C0 /* HEBREW PUNCTUATION PASEQ */
777 || ch
== 0x05C3 /* HEBREW PUNCTUATION SOF PASUQ */
778 || ch
== 0x05F3 /* HEBREW PUNCTUATION GERESH */
779 || ch
== 0x05F4 /* HEBREW PUNCTUATION GERSHAYIM */
780 || ch
== 0x060C /* ARABIC COMMA */
781 || ch
== 0x061B /* ARABIC SEMICOLON */
782 || ch
== 0x061F /* ARABIC QUESTION MARK */
783 || ch
== 0x066A /* ARABIC PERCENT SIGN */
784 || ch
== 0x066B /* ARABIC DECIMAL SEPARATOR */
785 || ch
== 0x066C /* ARABIC THOUSANDS SEPARATOR */
786 || ch
== 0x066D /* ARABIC FIVE POINTED STAR */
787 || ch
== 0x06D4 /* ARABIC FULL STOP */
788 || ch
== 0x0700 /* SYRIAC END OF PARAGRAPH */
789 || ch
== 0x0701 /* SYRIAC SUPRALINEAR FULL STOP */
790 || ch
== 0x0702 /* SYRIAC SUBLINEAR FULL STOP */
791 || ch
== 0x0703 /* SYRIAC SUPRALINEAR COLON */
792 || ch
== 0x0704 /* SYRIAC SUBLINEAR COLON */
793 || ch
== 0x0705 /* SYRIAC HORIZONTAL COLON */
794 || ch
== 0x0706 /* SYRIAC COLON SKEWED LEFT */
795 || ch
== 0x0707 /* SYRIAC COLON SKEWED RIGHT */
796 || ch
== 0x0708 /* SYRIAC SUPRALINEAR COLON SKEWED LEFT */
797 || ch
== 0x0709 /* SYRIAC SUBLINEAR COLON SKEWED RIGHT */
798 || ch
== 0x070A /* SYRIAC CONTRACTION */
799 || ch
== 0x070B /* SYRIAC HARKLEAN OBELUS */
800 || ch
== 0x070C /* SYRIAC HARKLEAN METOBELUS */
801 || ch
== 0x070D /* SYRIAC HARKLEAN ASTERISCUS */
802 || ch
== 0x0964 /* DEVANAGARI DANDA */
803 || ch
== 0x0965 /* DEVANAGARI DOUBLE DANDA */
804 || ch
== 0x0970 /* DEVANAGARI ABBREVIATION SIGN */
805 || ch
== 0x09F4 /* BENGALI CURRENCY NUMERATOR ONE */
806 || ch
== 0x09F5 /* BENGALI CURRENCY NUMERATOR TWO */
807 || ch
== 0x09F6 /* BENGALI CURRENCY NUMERATOR THREE */
808 || ch
== 0x09F7 /* BENGALI CURRENCY NUMERATOR FOUR */
809 || ch
== 0x09F8 /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
810 || ch
== 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
811 || ch
== 0x0BF0 /* TAMIL NUMBER TEN */
812 || ch
== 0x0BF1 /* TAMIL NUMBER ONE HUNDRED */
813 || ch
== 0x0BF2 /* TAMIL NUMBER ONE THOUSAND */
814 || ch
== 0x0DF4 /* SINHALA PUNCTUATION KUNDDALIYA */
815 || ch
== 0x0E4F /* THAI CHARACTER FONGMAN */
816 || ch
== 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
817 || ch
== 0x0F05 /* TIBETAN MARK CLOSING YIG MGO SGAB MA */
818 || ch
== 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
819 || ch
== 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
820 || ch
== 0x0F08 /* TIBETAN MARK SBRUL SHAD */
821 || ch
== 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
822 || ch
== 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
823 || ch
== 0x0F0D /* TIBETAN MARK SHAD */
824 || ch
== 0x0F0E /* TIBETAN MARK NYIS SHAD */
825 || ch
== 0x0F0F /* TIBETAN MARK TSHEG SHAD */
826 || ch
== 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
827 || ch
== 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
828 || ch
== 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
829 || ch
== 0x0F2A /* TIBETAN DIGIT HALF ONE */
830 || ch
== 0x0F2B /* TIBETAN DIGIT HALF TWO */
831 || ch
== 0x0F2C /* TIBETAN DIGIT HALF THREE */
832 || ch
== 0x0F2D /* TIBETAN DIGIT HALF FOUR */
833 || ch
== 0x0F2E /* TIBETAN DIGIT HALF FIVE */
834 || ch
== 0x0F2F /* TIBETAN DIGIT HALF SIX */
835 || ch
== 0x0F30 /* TIBETAN DIGIT HALF SEVEN */
836 || ch
== 0x0F31 /* TIBETAN DIGIT HALF EIGHT */
837 || ch
== 0x0F32 /* TIBETAN DIGIT HALF NINE */
838 || ch
== 0x0F33 /* TIBETAN DIGIT HALF ZERO */
839 || ch
== 0x0F85 /* TIBETAN MARK PALUTA */
840 || ch
== 0x104A /* MYANMAR SIGN LITTLE SECTION */
841 || ch
== 0x104B /* MYANMAR SIGN SECTION */
842 || ch
== 0x104C /* MYANMAR SYMBOL LOCATIVE */
843 || ch
== 0x104D /* MYANMAR SYMBOL COMPLETED */
844 || ch
== 0x104E /* MYANMAR SYMBOL AFOREMENTIONED */
845 || ch
== 0x104F /* MYANMAR SYMBOL GENITIVE */
846 || ch
== 0x10FB /* GEORGIAN PARAGRAPH SEPARATOR */
847 || ch
== 0x1362 /* ETHIOPIC FULL STOP */
848 || ch
== 0x1363 /* ETHIOPIC COMMA */
849 || ch
== 0x1364 /* ETHIOPIC SEMICOLON */
850 || ch
== 0x1365 /* ETHIOPIC COLON */
851 || ch
== 0x1366 /* ETHIOPIC PREFACE COLON */
852 || ch
== 0x1367 /* ETHIOPIC QUESTION MARK */
853 || ch
== 0x1368 /* ETHIOPIC PARAGRAPH SEPARATOR */
854 || ch
== 0x1372 /* ETHIOPIC NUMBER TEN */
855 || ch
== 0x1373 /* ETHIOPIC NUMBER TWENTY */
856 || ch
== 0x1374 /* ETHIOPIC NUMBER THIRTY */
857 || ch
== 0x1375 /* ETHIOPIC NUMBER FORTY */
858 || ch
== 0x1376 /* ETHIOPIC NUMBER FIFTY */
859 || ch
== 0x1377 /* ETHIOPIC NUMBER SIXTY */
860 || ch
== 0x1378 /* ETHIOPIC NUMBER SEVENTY */
861 || ch
== 0x1379 /* ETHIOPIC NUMBER EIGHTY */
862 || ch
== 0x137A /* ETHIOPIC NUMBER NINETY */
863 || ch
== 0x137B /* ETHIOPIC NUMBER HUNDRED */
864 || ch
== 0x137C /* ETHIOPIC NUMBER TEN THOUSAND */
865 || ch
== 0x166D /* CANADIAN SYLLABICS CHI SIGN */
866 || ch
== 0x166E /* CANADIAN SYLLABICS FULL STOP */
867 || ch
== 0x16EB /* RUNIC SINGLE PUNCTUATION */
868 || ch
== 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
869 || ch
== 0x16ED /* RUNIC CROSS PUNCTUATION */
870 || ch
== 0x16EE /* RUNIC ARLAUG SYMBOL */
871 || ch
== 0x16EF /* RUNIC TVIMADUR SYMBOL */
872 || ch
== 0x16F0 /* RUNIC BELGTHOR SYMBOL */
873 || ch
== 0x17DC /* KHMER SIGN AVAKRAHASANYA */
874 || ch
== 0x1800 /* MONGOLIAN BIRGA */
875 || ch
== 0x1801 /* MONGOLIAN ELLIPSIS */
876 || ch
== 0x1802 /* MONGOLIAN COMMA */
877 || ch
== 0x1803 /* MONGOLIAN FULL STOP */
878 || ch
== 0x1804 /* MONGOLIAN COLON */
879 || ch
== 0x1805 /* MONGOLIAN FOUR DOTS */
880 || ch
== 0x1807 /* MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER */
881 || ch
== 0x1808 /* MONGOLIAN MANCHU COMMA */
882 || ch
== 0x1809 /* MONGOLIAN MANCHU FULL STOP */
883 || ch
== 0x180A /* MONGOLIAN NIRUGU */
884 || ch
== 0x2015 /* HORIZONTAL BAR */
885 || ch
== 0x2016 /* DOUBLE VERTICAL LINE */
886 || ch
== 0x2017 /* DOUBLE LOW LINE */
887 || ch
== 0x2020 /* DAGGER */
888 || ch
== 0x2021 /* DOUBLE DAGGER */
889 || ch
== 0x2022 /* BULLET */
890 || ch
== 0x2023 /* TRIANGULAR BULLET */
891 || ch
== 0x2038 /* CARET */
892 || ch
== 0x203B /* REFERENCE MARK */
893 || ch
== 0x203D /* INTERROBANG */
894 || ch
== 0x203E /* OVERLINE */
895 || ch
== 0x203F /* UNDERTIE */
896 || ch
== 0x2040 /* CHARACTER TIE */
897 || ch
== 0x2041 /* CARET INSERTION POINT */
898 || ch
== 0x2042 /* ASTERISM */
899 || ch
== 0x2043 /* HYPHEN BULLET */
900 || ch
== 0x2048 /* QUESTION EXCLAMATION MARK */
901 || ch
== 0x2049 /* EXCLAMATION QUESTION MARK */
902 || ch
== 0x204A /* TIRONIAN SIGN ET */
903 || ch
== 0x204B /* REVERSED PILCROW SIGN */
904 || ch
== 0x204C /* BLACK LEFTWARDS BULLET */
905 || ch
== 0x204D /* BLACK RIGHTWARDS BULLET */
906 || ch
== 0x2070 /* SUPERSCRIPT ZERO */
907 || ch
== 0x2074 /* SUPERSCRIPT FOUR */
908 || ch
== 0x2075 /* SUPERSCRIPT FIVE */
909 || ch
== 0x2076 /* SUPERSCRIPT SIX */
910 || ch
== 0x2077 /* SUPERSCRIPT SEVEN */
911 || ch
== 0x2078 /* SUPERSCRIPT EIGHT */
912 || ch
== 0x2079 /* SUPERSCRIPT NINE */
913 || ch
== 0x2080 /* SUBSCRIPT ZERO */
914 || ch
== 0x2081 /* SUBSCRIPT ONE */
915 || ch
== 0x2082 /* SUBSCRIPT TWO */
916 || ch
== 0x2083 /* SUBSCRIPT THREE */
917 || ch
== 0x2084 /* SUBSCRIPT FOUR */
918 || ch
== 0x2085 /* SUBSCRIPT FIVE */
919 || ch
== 0x2086 /* SUBSCRIPT SIX */
920 || ch
== 0x2087 /* SUBSCRIPT SEVEN */
921 || ch
== 0x2088 /* SUBSCRIPT EIGHT */
922 || ch
== 0x2089 /* SUBSCRIPT NINE */
923 || (ch
>= 0x2153 && ch
<= 0x215E) /* VULGAR FRACTION */
924 || ch
== 0x215F /* FRACTION NUMERATOR ONE */
925 || (ch
>= 0x2160 && ch
<= 0x2183) /* ROMAN NUMERAL */
926 || (ch
>= 0x2460 && ch
<= 0x2473) /* CIRCLED NUMBER */
927 || (ch
>= 0x2474 && ch
<= 0x2487) /* PARENTHESIZED NUMBER */
928 || (ch
>= 0x2488 && ch
<= 0x249B) /* NUMBER FULL STOP */
929 || ch
== 0x24EA /* CIRCLED DIGIT ZERO */
930 || (ch
>= 0x2776 && ch
<= 0x2793) /* DINGBAT CIRCLED DIGIT */
931 || ch
== 0x10320 /* OLD ITALIC NUMERAL ONE */
932 || ch
== 0x10321 /* OLD ITALIC NUMERAL FIVE */
933 || ch
== 0x10322 /* OLD ITALIC NUMERAL TEN */
934 || ch
== 0x10323 /* OLD ITALIC NUMERAL FIFTY */
935 || ch
== 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
936 if (!(attr
& ((1 << LBP_CM
) | (1 << LBP_NS
) | (1 << LBP_ID
) | (1 << LBP_BA
) | (1 << LBP_BB
) | (1 << LBP_PO
) | (1 << LBP_PR
) | (1 << LBP_SA
) | (1 << LBP_CB
))))
938 /* ambiguous (alphabetic) ? */
939 if (unicode_width
[ch
] != NULL
940 && unicode_width
[ch
][0] == 'A')
954 /* Output the line breaking properties in a human readable format. */
956 debug_output_lbp (FILE *stream
)
960 for (i
= 0; i
< 0x110000; i
++)
962 int attr
= get_lbp (i
);
963 if (attr
!= 1 << LBP_XX
)
965 fprintf (stream
, "0x%04X", i
);
966 #define PRINT_BIT(attr,bit) \
967 if (attr & (1 << bit)) fprintf (stream, " " #bit);
968 PRINT_BIT(attr
,LBP_BK
);
969 PRINT_BIT(attr
,LBP_CM
);
970 PRINT_BIT(attr
,LBP_ZW
);
971 PRINT_BIT(attr
,LBP_IN
);
972 PRINT_BIT(attr
,LBP_GL
);
973 PRINT_BIT(attr
,LBP_CB
);
974 PRINT_BIT(attr
,LBP_SP
);
975 PRINT_BIT(attr
,LBP_BA
);
976 PRINT_BIT(attr
,LBP_BB
);
977 PRINT_BIT(attr
,LBP_B2
);
978 PRINT_BIT(attr
,LBP_HY
);
979 PRINT_BIT(attr
,LBP_NS
);
980 PRINT_BIT(attr
,LBP_OP
);
981 PRINT_BIT(attr
,LBP_CL
);
982 PRINT_BIT(attr
,LBP_QU
);
983 PRINT_BIT(attr
,LBP_EX
);
984 PRINT_BIT(attr
,LBP_ID
);
985 PRINT_BIT(attr
,LBP_NU
);
986 PRINT_BIT(attr
,LBP_IS
);
987 PRINT_BIT(attr
,LBP_SY
);
988 PRINT_BIT(attr
,LBP_AL
);
989 PRINT_BIT(attr
,LBP_PR
);
990 PRINT_BIT(attr
,LBP_PO
);
991 PRINT_BIT(attr
,LBP_SA
);
992 PRINT_BIT(attr
,LBP_XX
);
993 PRINT_BIT(attr
,LBP_AI
);
995 fprintf (stream
, "\n");
/* Creates (or truncates) FILENAME and writes the debug_output_lbp()
   listing into it.  Prints a message on stderr and exits with status 1
   if the file cannot be opened or if writing or closing it fails.  */
static void
debug_output_tables (const char *filename)
{
  FILE *stream = fopen (filename, "w");

  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_lbp (stream);

  /* fclose() flushes buffered data, so its result is checked together
     with the stream's error indicator.  */
  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* The line breaking property of each code point, as read from the
   LineBreak.txt file.  Indexed by code point; each element holds a
   single LBP_* value (not a bit mask).  */
int unicode_org_lbp[0x110000];
1024 /* Stores in unicode_org_lbp[] the line breaking property from the
1025 LineBreak.txt file. */
1027 fill_org_lbp (const char *linebreak_filename
)
1031 char field0
[FIELDLEN
];
1032 char field1
[FIELDLEN
];
1033 char field2
[FIELDLEN
];
1036 for (i
= 0; i
< 0x110000; i
++)
1037 unicode_org_lbp
[i
] = LBP_XX
;
1039 stream
= fopen (linebreak_filename
, "r");
1042 fprintf (stderr
, "error during fopen of '%s'\n", linebreak_filename
);
1058 do c
= getc (stream
); while (c
!= EOF
&& c
!= '\n');
1062 n
= getfield (stream
, field0
, ';');
1063 n
+= getfield (stream
, field1
, ' ');
1064 n
+= getfield (stream
, field2
, '\n');
1069 fprintf (stderr
, "short line in '%s':%d\n", linebreak_filename
,
1073 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
1102 else if (strcmp (field1
, "LF") == 0) value
= LBP_BK
;
1103 else if (strcmp (field1
, "CR") == 0) value
= LBP_BK
;
1104 else if (strcmp (field1
, "SG") == 0) value
= LBP_XX
;
1107 fprintf (stderr
, "unknown property value \"%s\" in '%s':%d\n",
1108 field1
, linebreak_filename
, lineno
);
1111 i
= strtoul (field0
, NULL
, 16);
1112 if (strstr (field0
, "..") != NULL
)
1114 /* Deal with a range. */
1115 j
= strtoul (strstr (field0
, "..") + 2, NULL
, 16);
1117 unicode_org_lbp
[i
] = value
;
1121 /* Single character line. */
1122 unicode_org_lbp
[i
] = value
;
1125 if (ferror (stream
) || fclose (stream
))
1127 fprintf (stderr
, "error reading from '%s'\n", linebreak_filename
);
1132 /* Output the line breaking properties in a human readable format. */
1134 debug_output_org_lbp (FILE *stream
)
1138 for (i
= 0; i
< 0x110000; i
++)
1140 int attr
= unicode_org_lbp
[i
];
1143 fprintf (stream
, "0x%04X", i
);
1144 #define PRINT_BIT(attr,bit) \
1145 if (attr == bit) fprintf (stream, " " #bit);
1146 PRINT_BIT(attr
,LBP_BK
);
1147 PRINT_BIT(attr
,LBP_CM
);
1148 PRINT_BIT(attr
,LBP_ZW
);
1149 PRINT_BIT(attr
,LBP_IN
);
1150 PRINT_BIT(attr
,LBP_GL
);
1151 PRINT_BIT(attr
,LBP_CB
);
1152 PRINT_BIT(attr
,LBP_SP
);
1153 PRINT_BIT(attr
,LBP_BA
);
1154 PRINT_BIT(attr
,LBP_BB
);
1155 PRINT_BIT(attr
,LBP_B2
);
1156 PRINT_BIT(attr
,LBP_HY
);
1157 PRINT_BIT(attr
,LBP_NS
);
1158 PRINT_BIT(attr
,LBP_OP
);
1159 PRINT_BIT(attr
,LBP_CL
);
1160 PRINT_BIT(attr
,LBP_QU
);
1161 PRINT_BIT(attr
,LBP_EX
);
1162 PRINT_BIT(attr
,LBP_ID
);
1163 PRINT_BIT(attr
,LBP_NU
);
1164 PRINT_BIT(attr
,LBP_IS
);
1165 PRINT_BIT(attr
,LBP_SY
);
1166 PRINT_BIT(attr
,LBP_AL
);
1167 PRINT_BIT(attr
,LBP_PR
);
1168 PRINT_BIT(attr
,LBP_PO
);
1169 PRINT_BIT(attr
,LBP_SA
);
1170 PRINT_BIT(attr
,LBP_XX
);
1171 PRINT_BIT(attr
,LBP_AI
);
1173 fprintf (stream
, "\n");
/* Creates (or truncates) FILENAME and writes the debug_output_org_lbp()
   listing into it.  Prints a message on stderr and exits with status 1
   if the file cannot be opened or if writing or closing it fails.  */
static void
debug_output_org_tables (const char *filename)
{
  FILE *stream = fopen (filename, "w");

  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_lbp (stream);

  /* fclose() flushes buffered data, so its result is checked together
     with the stream's error indicator.  */
  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Construction of sparse 3-level tables.
   The macros below name the table type (TABLE), its element type
   (ELEMENT), the value of entries that were never added (DEFAULT) and
   the allocation functions to use.
   NOTE(review): they are presumably consumed by a generic table template
   included after this point - confirm.  */
#define TABLE lbp_table
#define ELEMENT unsigned char
#define DEFAULT LBP_XX
/* Plain malloc/realloc are used; no out-of-memory wrapper is provided.  */
#define xmalloc malloc
#define xrealloc realloc
1208 output_lbp (FILE *stream
)
1212 unsigned int level1_offset
, level2_offset
, level3_offset
;
1216 lbp_table_init (&t
);
1218 for (i
= 0; i
< 0x110000; i
++)
1220 int attr
= get_lbp (i
);
1222 /* Now attr should contain exactly one bit. */
1223 if (attr
== 0 || ((attr
& (attr
- 1)) != 0))
1226 if (attr
!= 1 << LBP_XX
)
1228 unsigned int log2_attr
;
1229 for (log2_attr
= 0; attr
> 1; attr
>>= 1, log2_attr
++);
1231 lbp_table_add (&t
, i
, log2_attr
);
1235 lbp_table_finalize (&t
);
1238 5 * sizeof (uint32_t);
1240 5 * sizeof (uint32_t)
1241 + t
.level1_size
* sizeof (uint32_t);
1243 5 * sizeof (uint32_t)
1244 + t
.level1_size
* sizeof (uint32_t)
1245 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
1247 for (i
= 0; i
< 5; i
++)
1248 fprintf (stream
, "#define lbrkprop_header_%d %d\n", i
,
1249 ((uint32_t *) t
.result
)[i
]);
1250 fprintf (stream
, "static const\n");
1251 fprintf (stream
, "struct\n");
1252 fprintf (stream
, " {\n");
1253 fprintf (stream
, " int level1[%d];\n", t
.level1_size
);
1254 fprintf (stream
, " int level2[%d << %d];\n", t
.level2_size
, t
.q
);
1255 fprintf (stream
, " unsigned char level3[%d << %d];\n", t
.level3_size
, t
.p
);
1256 fprintf (stream
, " }\n");
1257 fprintf (stream
, "lbrkprop =\n");
1258 fprintf (stream
, "{\n");
1259 fprintf (stream
, " {");
1260 for (i
= 0; i
< t
.level1_size
; i
++)
1263 if (i
> 0 && (i
% 8) == 0)
1264 fprintf (stream
, "\n ");
1265 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
1266 fprintf (stream
, " %5d%s",
1267 offset
== 0 ? -1 : (offset
- level2_offset
) / sizeof (uint32_t),
1268 (i
+1 < t
.level1_size
? "," : ""));
1270 fprintf (stream
, " },\n");
1271 fprintf (stream
, " {");
1272 if (t
.level2_size
<< t
.q
> 8)
1273 fprintf (stream
, "\n ");
1274 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
1277 if (i
> 0 && (i
% 8) == 0)
1278 fprintf (stream
, "\n ");
1279 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
1280 fprintf (stream
, " %5d%s",
1281 offset
== 0 ? -1 : (offset
- level3_offset
) / sizeof (uint8_t),
1282 (i
+1 < t
.level2_size
<< t
.q
? "," : ""));
1284 if (t
.level2_size
<< t
.q
> 8)
1285 fprintf (stream
, "\n ");
1286 fprintf (stream
, " },\n");
1287 fprintf (stream
, " {");
1288 if (t
.level3_size
<< t
.p
> 8)
1289 fprintf (stream
, "\n ");
1290 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
1292 unsigned char value
= ((unsigned char *) (t
.result
+ level3_offset
))[i
];
1293 const char *value_string
;
1296 #define CASE(x) case x: value_string = #x; break;
1327 if (i
> 0 && (i
% 8) == 0)
1328 fprintf (stream
, "\n ");
1329 fprintf (stream
, " %s%s", value_string
,
1330 (i
+1 < t
.level3_size
<< t
.p
? "," : ""));
1332 if (t
.level3_size
<< t
.p
> 8)
1333 fprintf (stream
, "\n ");
1334 fprintf (stream
, " }\n");
1335 fprintf (stream
, "};\n");
/* Creates (or truncates) FILENAME and writes the generated header into
   it: a title comment mentioning the Unicode VERSION, a license comment,
   and the table emitted by output_lbp().  Prints a message on stderr and
   exits with status 1 if the file cannot be opened or if writing or
   closing it fails.  */
static void
output_tables (const char *filename, const char *version)
{
  /* Put a GPL header on it.  The gnulib module is under LGPL (although it
     still carries the GPL header), and it's gnulib-tool which replaces the
     GPL header with an LGPL header.  */
  static const char *const license_lines[] =
    {
      "/* Copyright (C) 2000-2004 Free Software Foundation, Inc.\n",
      "\n",
      "This program is free software; you can redistribute it and/or modify\n",
      "it under the terms of the GNU General Public License as published by\n",
      "the Free Software Foundation; either version 2, or (at your option)\n",
      "any later version.\n",
      "\n",
      "This program is distributed in the hope that it will be useful,\n",
      "but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n",
      "GNU General Public License for more details.\n",
      "\n",
      "You should have received a copy of the GNU General Public License\n",
      "along with this program; if not, write to the Free Software\n",
      "Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */\n",
      "\n"
    };
  FILE *stream;
  size_t k;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* Line breaking properties of Unicode characters. */\n", stream);
  fprintf (stream, "/* Generated automatically by gen-lbrkprop for Unicode %s. */\n",
           version);
  fputs ("\n", stream);

  for (k = 0; k < sizeof license_lines / sizeof license_lines[0]; k++)
    fputs (license_lines[k], stream);

  output_lbp (stream);

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Program entry point.  Expects five arguments: the UnicodeData.txt,
   Combining.txt, EastAsianWidth.txt and LineBreak.txt file names and the
   Unicode version string.  Loads the four data files, writes the two
   debugging listings "lbrkprop.txt" and "lbrkprop_org.txt", and finally
   generates "lbrkprop.h".  */
int
main (int argc, char * argv[])
{
  const char *unicodedata_filename;
  const char *combining_filename;
  const char *width_filename;
  const char *linebreak_filename;
  const char *version;

  if (argc != 6)
    {
      fprintf (stderr, "Usage: %s UnicodeData.txt Combining.txt EastAsianWidth.txt LineBreak.txt version\n",
               argv[0]);
      exit (1);
    }

  unicodedata_filename = argv[1];
  combining_filename = argv[2];
  width_filename = argv[3];
  linebreak_filename = argv[4];
  version = argv[5];

  /* Load the input data.  */
  fill_attributes (unicodedata_filename);
  fill_combining (combining_filename);
  fill_width (width_filename);
  fill_org_lbp (linebreak_filename);

  /* Human readable listings, for comparing the computed properties with
     the ones from LineBreak.txt.  */
  debug_output_tables ("lbrkprop.txt");
  debug_output_org_tables ("lbrkprop_org.txt");

  /* The generated header.  */
  output_tables ("lbrkprop.h", version);

  return 0;
}