1 /* Generate Unicode conforming character classification tables and
2 line break properties tables and word break property tables and
3 decomposition/composition and case mapping tables from a UnicodeData file.
4 Copyright (C) 2000-2002, 2004, 2007-2024 Free Software Foundation, Inc.
5 Written by Bruno Haible <bruno@clisp.org>, 2000-2002.
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* Usage example:
     $ gen-uni-tables /usr/local/share/www.unicode.org/Public/16.0.0/ucd/UnicodeData.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/PropList.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/DerivedCoreProperties.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/ArabicShaping.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/Scripts.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/Blocks.txt \
                      /usr/local/share/www.unicode.org/Public/3.0-Update1/PropList-3.0.1.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/BidiMirroring.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/EastAsianWidth.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/LineBreak.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/auxiliary/WordBreakProperty.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/CompositionExclusions.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/SpecialCasing.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/CaseFolding.txt \
                      16.0.0
 */
#include <assert.h>
#if __STDC_VERSION__ < 202311L
# include <stdbool.h>
#endif
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Number of elements of the array A.  A must be a real array object, not a
   pointer; the argument is parenthesized so that any array-valued
   expression may be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
52 /* ========================================================================= */
54 /* Reading UnicodeData.txt. */
/* This structure represents one line in the UnicodeData.txt file.
   String members hold "" when the corresponding field is empty.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining class */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  bool mirrored;              /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};
/* Missing fields are represented with "" for strings, and NONE for the
   unsigned int case mappings (upper, lower, title).  */
#define NONE (~(unsigned int)0)
80 /* The entire contents of the UnicodeData.txt file. */
81 struct unicode_attribute unicode_attributes
[0x110000];
83 /* Stores in unicode_attributes[i] the values from the given fields. */
85 fill_attribute (unsigned int i
,
86 const char *field1
, const char *field2
,
87 const char *field3
, const char *field4
,
88 const char *field5
, const char *field6
,
89 const char *field7
, const char *field8
,
90 const char *field9
, const char *field10
,
91 const char *field11
, const char *field12
,
92 const char *field13
, const char *field14
)
94 struct unicode_attribute
* uni
;
98 fprintf (stderr
, "index too large\n");
101 if (strcmp (field2
, "Cs") == 0)
102 /* Surrogates are UTF-16 artifacts, not real characters. Ignore them. */
104 uni
= &unicode_attributes
[i
];
105 /* Copy the strings. */
106 uni
->name
= strdup (field1
);
107 uni
->category
= (field2
[0] == '\0' ? "" : strdup (field2
));
108 uni
->combining
= (field3
[0] == '\0' ? "" : strdup (field3
));
109 uni
->bidi
= (field4
[0] == '\0' ? "" : strdup (field4
));
110 uni
->decomposition
= (field5
[0] == '\0' ? "" : strdup (field5
));
111 uni
->decdigit
= (field6
[0] == '\0' ? "" : strdup (field6
));
112 uni
->digit
= (field7
[0] == '\0' ? "" : strdup (field7
));
113 uni
->numeric
= (field8
[0] == '\0' ? "" : strdup (field8
));
114 uni
->mirrored
= (field9
[0] == 'Y');
115 uni
->oldname
= (field10
[0] == '\0' ? "" : strdup (field10
));
116 uni
->comment
= (field11
[0] == '\0' ? "" : strdup (field11
));
117 uni
->upper
= (field12
[0] =='\0' ? NONE
: strtoul (field12
, NULL
, 16));
118 uni
->lower
= (field13
[0] =='\0' ? NONE
: strtoul (field13
, NULL
, 16));
119 uni
->title
= (field14
[0] =='\0' ? NONE
: strtoul (field14
, NULL
, 16));
/* Maximum length of a field in the UnicodeData.txt file.  */
#define FIELDLEN 120

/* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM; DELIM itself is consumed.
   Carriage returns are dropped.
   Returns 1 when a field was successfully read, otherwise 0 (end of file
   reached before DELIM).  BUFFER is NUL-terminated in both cases.
   Terminates the program if the field does not fit.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  while ((c = getc (stream)) != EOF && c != delim)
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Put c into the buffer.  */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
          exit (1);
        }
      *buffer++ = (char) c;
    }

  /* Terminate unconditionally, so that the caller never sees an
     unterminated buffer, not even after a truncated last line.  */
  *buffer = '\0';

  return (c == EOF ? 0 : 1);
}
157 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
160 fill_attributes (const char *unicodedata_filename
)
164 char field0
[FIELDLEN
];
165 char field1
[FIELDLEN
];
166 char field2
[FIELDLEN
];
167 char field3
[FIELDLEN
];
168 char field4
[FIELDLEN
];
169 char field5
[FIELDLEN
];
170 char field6
[FIELDLEN
];
171 char field7
[FIELDLEN
];
172 char field8
[FIELDLEN
];
173 char field9
[FIELDLEN
];
174 char field10
[FIELDLEN
];
175 char field11
[FIELDLEN
];
176 char field12
[FIELDLEN
];
177 char field13
[FIELDLEN
];
178 char field14
[FIELDLEN
];
181 for (i
= 0; i
< 0x110000; i
++)
182 unicode_attributes
[i
].name
= NULL
;
184 stream
= fopen (unicodedata_filename
, "r");
187 fprintf (stderr
, "error during fopen of '%s'\n", unicodedata_filename
);
196 n
= getfield (stream
, field0
, ';');
197 n
+= getfield (stream
, field1
, ';');
198 n
+= getfield (stream
, field2
, ';');
199 n
+= getfield (stream
, field3
, ';');
200 n
+= getfield (stream
, field4
, ';');
201 n
+= getfield (stream
, field5
, ';');
202 n
+= getfield (stream
, field6
, ';');
203 n
+= getfield (stream
, field7
, ';');
204 n
+= getfield (stream
, field8
, ';');
205 n
+= getfield (stream
, field9
, ';');
206 n
+= getfield (stream
, field10
, ';');
207 n
+= getfield (stream
, field11
, ';');
208 n
+= getfield (stream
, field12
, ';');
209 n
+= getfield (stream
, field13
, ';');
210 n
+= getfield (stream
, field14
, '\n');
215 fprintf (stderr
, "short line in '%s':%d\n",
216 unicodedata_filename
, lineno
);
219 i
= strtoul (field0
, NULL
, 16);
221 && strlen (field1
) >= 9
222 && strcmp (field1
+ strlen (field1
) - 8, ", First>") == 0)
224 /* Deal with a range. */
226 n
= getfield (stream
, field0
, ';');
227 n
+= getfield (stream
, field1
, ';');
228 n
+= getfield (stream
, field2
, ';');
229 n
+= getfield (stream
, field3
, ';');
230 n
+= getfield (stream
, field4
, ';');
231 n
+= getfield (stream
, field5
, ';');
232 n
+= getfield (stream
, field6
, ';');
233 n
+= getfield (stream
, field7
, ';');
234 n
+= getfield (stream
, field8
, ';');
235 n
+= getfield (stream
, field9
, ';');
236 n
+= getfield (stream
, field10
, ';');
237 n
+= getfield (stream
, field11
, ';');
238 n
+= getfield (stream
, field12
, ';');
239 n
+= getfield (stream
, field13
, ';');
240 n
+= getfield (stream
, field14
, '\n');
243 fprintf (stderr
, "missing end range in '%s':%d\n",
244 unicodedata_filename
, lineno
);
247 if (!(field1
[0] == '<'
248 && strlen (field1
) >= 8
249 && strcmp (field1
+ strlen (field1
) - 7, ", Last>") == 0))
251 fprintf (stderr
, "missing end range in '%s':%d\n",
252 unicodedata_filename
, lineno
);
255 field1
[strlen (field1
) - 7] = '\0';
256 j
= strtoul (field0
, NULL
, 16);
258 fill_attribute (i
, field1
+1, field2
, field3
, field4
, field5
,
259 field6
, field7
, field8
, field9
, field10
,
260 field11
, field12
, field13
, field14
);
264 /* Single character line */
265 fill_attribute (i
, field1
, field2
, field3
, field4
, field5
,
266 field6
, field7
, field8
, field9
, field10
,
267 field11
, field12
, field13
, field14
);
271 if (ferror (stream
) || fclose (stream
))
273 fprintf (stderr
, "error reading from '%s'\n", unicodedata_filename
);
278 /* ========================================================================= */
/* Output the license notice for a library file to STREAM.
   LGPLV2PLUS selects the LGPLv2+ notice; otherwise the dual
   'LGPLv3+ or GPLv2+' notice is written.
   This closes an open C syntax comment.  */
static void
output_library_license (FILE *stream, bool lgplv2plus)
{
  /* These Gnulib modules are under the LGPLv2+ license.  */
  static const char lgplv2plus_notice[] =
    " This file is free software: you can redistribute it and/or modify\n"
    " it under the terms of the GNU Lesser General Public License as\n"
    " published by the Free Software Foundation; either version 2.1 of the\n"
    " License, or (at your option) any later version.\n"
    "\n"
    " This file is distributed in the hope that it will be useful,\n"
    " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
    " GNU Lesser General Public License for more details.\n"
    "\n"
    " You should have received a copy of the GNU Lesser General Public License\n"
    " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n";
  /* These Gnulib modules are under the 'LGPLv3+ or GPLv2+' license.  */
  static const char dual_notice[] =
    " This file is free software.\n"
    " It is dual-licensed under \"the GNU LGPLv3+ or the GNU GPLv2+\".\n"
    " You can redistribute it and/or modify it under either\n"
    " - the terms of the GNU Lesser General Public License as published\n"
    " by the Free Software Foundation, either version 3, or (at your\n"
    " option) any later version, or\n"
    " - the terms of the GNU General Public License as published by the\n"
    " Free Software Foundation; either version 2, or (at your option)\n"
    " any later version, or\n"
    " - the same dual license \"the GNU LGPLv3+ or the GNU GPLv2+\".\n"
    "\n"
    " This file is distributed in the hope that it will be useful,\n"
    " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n"
    " Lesser General Public License and the GNU General Public License\n"
    " for more details.\n"
    "\n"
    " You should have received a copy of the GNU Lesser General Public\n"
    " License and of the GNU General Public License along with this\n"
    " program. If not, see <https://www.gnu.org/licenses/>. */\n";

  fputs (lgplv2plus ? lgplv2plus_notice : dual_notice, stream);
}
/* Output the license notice for a tests file to STREAM.
   This closes an open C syntax comment.  */
static void
output_tests_license (FILE *stream)
{
  /* Gnulib tests modules are under the GPLv3+ license.  */
  static const char gplv3plus_notice[] =
    " This file is free software: you can redistribute it and/or modify\n"
    " it under the terms of the GNU General Public License as published\n"
    " by the Free Software Foundation, either version 3 of the License,\n"
    " or (at your option) any later version.\n"
    "\n"
    " This file is distributed in the hope that it will be useful,\n"
    " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
    " GNU General Public License for more details.\n"
    "\n"
    " You should have received a copy of the GNU General Public License\n"
    " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n";

  fputs (gplv3plus_notice, stream);
}
347 /* ========================================================================= */
349 /* General category. */
350 /* See Unicode 3.0 book, section 4.5,
354 is_category_L (unsigned int ch
)
356 return (unicode_attributes
[ch
].name
!= NULL
357 && unicode_attributes
[ch
].category
[0] == 'L');
361 is_category_LC (unsigned int ch
)
363 /* See PropertyValueAliases.txt. */
364 return (unicode_attributes
[ch
].name
!= NULL
365 && unicode_attributes
[ch
].category
[0] == 'L'
366 && (unicode_attributes
[ch
].category
[1] == 'u'
367 || unicode_attributes
[ch
].category
[1] == 'l'
368 || unicode_attributes
[ch
].category
[1] == 't'));
372 is_category_Lu (unsigned int ch
)
374 return (unicode_attributes
[ch
].name
!= NULL
375 && unicode_attributes
[ch
].category
[0] == 'L'
376 && unicode_attributes
[ch
].category
[1] == 'u');
380 is_category_Ll (unsigned int ch
)
382 return (unicode_attributes
[ch
].name
!= NULL
383 && unicode_attributes
[ch
].category
[0] == 'L'
384 && unicode_attributes
[ch
].category
[1] == 'l');
388 is_category_Lt (unsigned int ch
)
390 return (unicode_attributes
[ch
].name
!= NULL
391 && unicode_attributes
[ch
].category
[0] == 'L'
392 && unicode_attributes
[ch
].category
[1] == 't');
396 is_category_Lm (unsigned int ch
)
398 return (unicode_attributes
[ch
].name
!= NULL
399 && unicode_attributes
[ch
].category
[0] == 'L'
400 && unicode_attributes
[ch
].category
[1] == 'm');
404 is_category_Lo (unsigned int ch
)
406 return (unicode_attributes
[ch
].name
!= NULL
407 && unicode_attributes
[ch
].category
[0] == 'L'
408 && unicode_attributes
[ch
].category
[1] == 'o');
412 is_category_M (unsigned int ch
)
414 return (unicode_attributes
[ch
].name
!= NULL
415 && unicode_attributes
[ch
].category
[0] == 'M');
419 is_category_Mn (unsigned int ch
)
421 return (unicode_attributes
[ch
].name
!= NULL
422 && unicode_attributes
[ch
].category
[0] == 'M'
423 && unicode_attributes
[ch
].category
[1] == 'n');
427 is_category_Mc (unsigned int ch
)
429 return (unicode_attributes
[ch
].name
!= NULL
430 && unicode_attributes
[ch
].category
[0] == 'M'
431 && unicode_attributes
[ch
].category
[1] == 'c');
435 is_category_Me (unsigned int ch
)
437 return (unicode_attributes
[ch
].name
!= NULL
438 && unicode_attributes
[ch
].category
[0] == 'M'
439 && unicode_attributes
[ch
].category
[1] == 'e');
443 is_category_N (unsigned int ch
)
445 return (unicode_attributes
[ch
].name
!= NULL
446 && unicode_attributes
[ch
].category
[0] == 'N');
450 is_category_Nd (unsigned int ch
)
452 return (unicode_attributes
[ch
].name
!= NULL
453 && unicode_attributes
[ch
].category
[0] == 'N'
454 && unicode_attributes
[ch
].category
[1] == 'd');
458 is_category_Nl (unsigned int ch
)
460 return (unicode_attributes
[ch
].name
!= NULL
461 && unicode_attributes
[ch
].category
[0] == 'N'
462 && unicode_attributes
[ch
].category
[1] == 'l');
466 is_category_No (unsigned int ch
)
468 return (unicode_attributes
[ch
].name
!= NULL
469 && unicode_attributes
[ch
].category
[0] == 'N'
470 && unicode_attributes
[ch
].category
[1] == 'o');
474 is_category_P (unsigned int ch
)
476 return (unicode_attributes
[ch
].name
!= NULL
477 && unicode_attributes
[ch
].category
[0] == 'P');
481 is_category_Pc (unsigned int ch
)
483 return (unicode_attributes
[ch
].name
!= NULL
484 && unicode_attributes
[ch
].category
[0] == 'P'
485 && unicode_attributes
[ch
].category
[1] == 'c');
489 is_category_Pd (unsigned int ch
)
491 return (unicode_attributes
[ch
].name
!= NULL
492 && unicode_attributes
[ch
].category
[0] == 'P'
493 && unicode_attributes
[ch
].category
[1] == 'd');
497 is_category_Ps (unsigned int ch
)
499 return (unicode_attributes
[ch
].name
!= NULL
500 && unicode_attributes
[ch
].category
[0] == 'P'
501 && unicode_attributes
[ch
].category
[1] == 's');
505 is_category_Pe (unsigned int ch
)
507 return (unicode_attributes
[ch
].name
!= NULL
508 && unicode_attributes
[ch
].category
[0] == 'P'
509 && unicode_attributes
[ch
].category
[1] == 'e');
513 is_category_Pi (unsigned int ch
)
515 return (unicode_attributes
[ch
].name
!= NULL
516 && unicode_attributes
[ch
].category
[0] == 'P'
517 && unicode_attributes
[ch
].category
[1] == 'i');
521 is_category_Pf (unsigned int ch
)
523 return (unicode_attributes
[ch
].name
!= NULL
524 && unicode_attributes
[ch
].category
[0] == 'P'
525 && unicode_attributes
[ch
].category
[1] == 'f');
529 is_category_Po (unsigned int ch
)
531 return (unicode_attributes
[ch
].name
!= NULL
532 && unicode_attributes
[ch
].category
[0] == 'P'
533 && unicode_attributes
[ch
].category
[1] == 'o');
537 is_category_S (unsigned int ch
)
539 return (unicode_attributes
[ch
].name
!= NULL
540 && unicode_attributes
[ch
].category
[0] == 'S');
544 is_category_Sm (unsigned int ch
)
546 return (unicode_attributes
[ch
].name
!= NULL
547 && unicode_attributes
[ch
].category
[0] == 'S'
548 && unicode_attributes
[ch
].category
[1] == 'm');
552 is_category_Sc (unsigned int ch
)
554 return (unicode_attributes
[ch
].name
!= NULL
555 && unicode_attributes
[ch
].category
[0] == 'S'
556 && unicode_attributes
[ch
].category
[1] == 'c');
560 is_category_Sk (unsigned int ch
)
562 return (unicode_attributes
[ch
].name
!= NULL
563 && unicode_attributes
[ch
].category
[0] == 'S'
564 && unicode_attributes
[ch
].category
[1] == 'k');
568 is_category_So (unsigned int ch
)
570 return (unicode_attributes
[ch
].name
!= NULL
571 && unicode_attributes
[ch
].category
[0] == 'S'
572 && unicode_attributes
[ch
].category
[1] == 'o');
576 is_category_Z (unsigned int ch
)
578 return (unicode_attributes
[ch
].name
!= NULL
579 && unicode_attributes
[ch
].category
[0] == 'Z');
583 is_category_Zs (unsigned int ch
)
585 return (unicode_attributes
[ch
].name
!= NULL
586 && unicode_attributes
[ch
].category
[0] == 'Z'
587 && unicode_attributes
[ch
].category
[1] == 's');
591 is_category_Zl (unsigned int ch
)
593 return (unicode_attributes
[ch
].name
!= NULL
594 && unicode_attributes
[ch
].category
[0] == 'Z'
595 && unicode_attributes
[ch
].category
[1] == 'l');
599 is_category_Zp (unsigned int ch
)
601 return (unicode_attributes
[ch
].name
!= NULL
602 && unicode_attributes
[ch
].category
[0] == 'Z'
603 && unicode_attributes
[ch
].category
[1] == 'p');
607 is_category_C (unsigned int ch
)
609 return (unicode_attributes
[ch
].name
== NULL
610 || unicode_attributes
[ch
].category
[0] == 'C');
614 is_category_Cc (unsigned int ch
)
616 return (unicode_attributes
[ch
].name
!= NULL
617 && unicode_attributes
[ch
].category
[0] == 'C'
618 && unicode_attributes
[ch
].category
[1] == 'c');
622 is_category_Cf (unsigned int ch
)
624 return (unicode_attributes
[ch
].name
!= NULL
625 && unicode_attributes
[ch
].category
[0] == 'C'
626 && unicode_attributes
[ch
].category
[1] == 'f');
/* Tests whether CH lies in the range 0xD800..0xDFFF.
   This is decided from the code point alone; unicode_attributes is not
   consulted.  */
static bool
is_category_Cs (unsigned int ch)
{
  return (ch >= 0xd800 && ch < 0xe000);
}
636 is_category_Co (unsigned int ch
)
638 return (unicode_attributes
[ch
].name
!= NULL
639 && unicode_attributes
[ch
].category
[0] == 'C'
640 && unicode_attributes
[ch
].category
[1] == 'o');
644 is_category_Cn (unsigned int ch
)
646 return (unicode_attributes
[ch
].name
== NULL
647 && !(ch
>= 0xd800 && ch
< 0xe000));
/* Output a boolean property in a human readable format.
   Writes to FILENAME one line per maximal run of consecutive code points
   for which PREDICATE returns true: "0xFIRST..0xLAST" for a run of more
   than one code point, "0xCH" for an isolated code point.
   Terminates the program if FILENAME cannot be written.  */
static void
debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate holds.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (first < last)
          fprintf (stream, "0x%04X..0x%04X\n", first, last);
        else
          fprintf (stream, "0x%04X\n", ch);
      }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Output the unit test for a boolean property.
   Writes to FILENAME a C source file that lists, as { first, last } pairs,
   the maximal runs of code points for which PREDICATE returns true, and
   that defines PREDICATE(c) as EXPRESSION.
   Terminates the program if FILENAME cannot be written.  */
static void
output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Test the Unicode character type functions.\n");
  fprintf (stream, " Copyright (C) 2007-2024 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  output_tests_license (stream);
  fprintf (stream, "\n");
  fprintf (stream, "#include \"test-predicate-part1.h\"\n");
  fprintf (stream, "\n");

  /* The separator is written before every pair except the first, so that
     the list carries no trailing comma.  */
  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (need_comma)
          fprintf (stream, ",\n");
        fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
        need_comma = true;
      }
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define PREDICATE(c) %s\n", expression);
  fprintf (stream, "#include \"test-predicate-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
747 /* Construction of sparse 3-level tables. */
748 #define TABLE predicate_table
749 #define xmalloc malloc
750 #define xrealloc realloc
751 #include "3levelbit.h"
753 /* Output a boolean property in a three-level bitmap. */
755 output_predicate (const char *filename
, bool (*predicate
) (unsigned int), const char *name
, const char *comment
, const char *version
)
759 struct predicate_table t
;
760 unsigned int level1_offset
, level2_offset
, level3_offset
;
762 stream
= fopen (filename
, "w");
765 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
769 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
770 fprintf (stream
, "/* %s of Unicode characters. */\n", comment
);
771 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
773 fprintf (stream
, "\n");
775 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
776 fprintf (stream
, "\n");
777 output_library_license (stream
,
778 strcmp (filename
, "unictype/categ_M.h") == 0
779 || strncmp (filename
, "unictype/ctype_", 15) == 0
780 || strcmp (filename
, "uniwidth/width2.h") == 0);
781 fprintf (stream
, "\n");
785 predicate_table_init (&t
);
787 for (ch
= 0; ch
< 0x110000; ch
++)
789 predicate_table_add (&t
, ch
);
791 predicate_table_finalize (&t
);
793 /* Offsets in t.result, in memory of this process. */
795 5 * sizeof (uint32_t);
797 5 * sizeof (uint32_t)
798 + t
.level1_size
* sizeof (uint32_t);
800 5 * sizeof (uint32_t)
801 + t
.level1_size
* sizeof (uint32_t)
802 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
804 for (i
= 0; i
< 5; i
++)
806 fprintf (stream
, "#define header_%d %d\n", i
,
807 ((uint32_t *) t
.result
)[i
]);
809 fprintf (stream
, "static const\n");
810 fprintf (stream
, "struct\n");
811 fprintf (stream
, " {\n");
812 fprintf (stream
, " int header[1];\n");
813 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
814 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
815 fprintf (stream
, " unsigned int level3[%zu << %d];\n", t
.level3_size
, t
.p
);
816 fprintf (stream
, " }\n");
817 fprintf (stream
, "%s =\n", name
);
818 fprintf (stream
, "{\n");
819 fprintf (stream
, " { %d },\n", ((uint32_t *) t
.result
)[1]);
820 fprintf (stream
, " {");
821 if (t
.level1_size
> 1)
822 fprintf (stream
, "\n ");
823 for (i
= 0; i
< t
.level1_size
; i
++)
826 if (i
> 0 && (i
% 1) == 0)
827 fprintf (stream
, "\n ");
828 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
830 fprintf (stream
, " %5d", -1);
832 fprintf (stream
, " %5zu * sizeof (int) / sizeof (short) + %5zu",
833 1 + t
.level1_size
, (offset
- level2_offset
) / sizeof (uint32_t));
834 if (i
+1 < t
.level1_size
)
835 fprintf (stream
, ",");
837 if (t
.level1_size
> 1)
838 fprintf (stream
, "\n ");
839 fprintf (stream
, " },\n");
840 fprintf (stream
, " {");
841 if (t
.level2_size
<< t
.q
> 1)
842 fprintf (stream
, "\n ");
843 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
846 if (i
> 0 && (i
% 1) == 0)
847 fprintf (stream
, "\n ");
848 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
850 fprintf (stream
, " %5d", -1);
852 fprintf (stream
, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
853 1 + t
.level1_size
, t
.level2_size
<< t
.q
, (offset
- level3_offset
) / sizeof (uint32_t));
854 if (i
+1 < t
.level2_size
<< t
.q
)
855 fprintf (stream
, ",");
857 if (t
.level2_size
<< t
.q
> 1)
858 fprintf (stream
, "\n ");
859 fprintf (stream
, " },\n");
860 fprintf (stream
, " {");
861 if (t
.level3_size
<< t
.p
> 4)
862 fprintf (stream
, "\n ");
863 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
865 if (i
> 0 && (i
% 4) == 0)
866 fprintf (stream
, "\n ");
867 fprintf (stream
, " 0x%08XU",
868 ((uint32_t *) (t
.result
+ level3_offset
))[i
]);
869 if (i
+1 < t
.level3_size
<< t
.p
)
870 fprintf (stream
, ",");
872 if (t
.level3_size
<< t
.p
> 4)
873 fprintf (stream
, "\n ");
874 fprintf (stream
, " }\n");
875 fprintf (stream
, "};\n");
877 if (ferror (stream
) || fclose (stream
))
879 fprintf (stderr
, "error writing to '%s'\n", filename
);
884 /* Output all categories. */
886 output_categories (const char *version
)
888 #define CATEGORY(C) \
889 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
890 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
891 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
/* Bit masks of the general categories.  Each two-letter category owns one
   bit; each one-letter category, and LC, is the union of the bits of its
   members.  */
enum
{
  UC_CATEGORY_MASK_L  = 0x0000001f, /* Lu | Ll | Lt | Lm | Lo */
  UC_CATEGORY_MASK_LC = 0x00000007, /* Lu | Ll | Lt */
  UC_CATEGORY_MASK_Lu = 0x00000001,
  UC_CATEGORY_MASK_Ll = 0x00000002,
  UC_CATEGORY_MASK_Lt = 0x00000004,
  UC_CATEGORY_MASK_Lm = 0x00000008,
  UC_CATEGORY_MASK_Lo = 0x00000010,
  UC_CATEGORY_MASK_M  = 0x000000e0, /* Mn | Mc | Me */
  UC_CATEGORY_MASK_Mn = 0x00000020,
  UC_CATEGORY_MASK_Mc = 0x00000040,
  UC_CATEGORY_MASK_Me = 0x00000080,
  UC_CATEGORY_MASK_N  = 0x00000700, /* Nd | Nl | No */
  UC_CATEGORY_MASK_Nd = 0x00000100,
  UC_CATEGORY_MASK_Nl = 0x00000200,
  UC_CATEGORY_MASK_No = 0x00000400,
  UC_CATEGORY_MASK_P  = 0x0003f800, /* Pc | Pd | Ps | Pe | Pi | Pf | Po */
  UC_CATEGORY_MASK_Pc = 0x00000800,
  UC_CATEGORY_MASK_Pd = 0x00001000,
  UC_CATEGORY_MASK_Ps = 0x00002000,
  UC_CATEGORY_MASK_Pe = 0x00004000,
  UC_CATEGORY_MASK_Pi = 0x00008000,
  UC_CATEGORY_MASK_Pf = 0x00010000,
  UC_CATEGORY_MASK_Po = 0x00020000,
  UC_CATEGORY_MASK_S  = 0x003c0000, /* Sm | Sc | Sk | So */
  UC_CATEGORY_MASK_Sm = 0x00040000,
  UC_CATEGORY_MASK_Sc = 0x00080000,
  UC_CATEGORY_MASK_Sk = 0x00100000,
  UC_CATEGORY_MASK_So = 0x00200000,
  UC_CATEGORY_MASK_Z  = 0x01c00000, /* Zs | Zl | Zp */
  UC_CATEGORY_MASK_Zs = 0x00400000,
  UC_CATEGORY_MASK_Zl = 0x00800000,
  UC_CATEGORY_MASK_Zp = 0x01000000,
  UC_CATEGORY_MASK_C  = 0x3e000000, /* Cc | Cf | Cs | Co | Cn */
  UC_CATEGORY_MASK_Cc = 0x02000000,
  UC_CATEGORY_MASK_Cf = 0x04000000,
  UC_CATEGORY_MASK_Cs = 0x08000000,
  UC_CATEGORY_MASK_Co = 0x10000000,
  UC_CATEGORY_MASK_Cn = 0x20000000
};
976 general_category_byname (const char *category_name
)
978 if (category_name
[0] != '\0'
979 && (category_name
[1] == '\0' || category_name
[2] == '\0'))
980 switch (category_name
[0])
983 switch (category_name
[1])
985 case '\0': return UC_CATEGORY_MASK_L
;
986 case 'C': return UC_CATEGORY_MASK_LC
;
987 case 'u': return UC_CATEGORY_MASK_Lu
;
988 case 'l': return UC_CATEGORY_MASK_Ll
;
989 case 't': return UC_CATEGORY_MASK_Lt
;
990 case 'm': return UC_CATEGORY_MASK_Lm
;
991 case 'o': return UC_CATEGORY_MASK_Lo
;
995 switch (category_name
[1])
997 case '\0': return UC_CATEGORY_MASK_M
;
998 case 'n': return UC_CATEGORY_MASK_Mn
;
999 case 'c': return UC_CATEGORY_MASK_Mc
;
1000 case 'e': return UC_CATEGORY_MASK_Me
;
1004 switch (category_name
[1])
1006 case '\0': return UC_CATEGORY_MASK_N
;
1007 case 'd': return UC_CATEGORY_MASK_Nd
;
1008 case 'l': return UC_CATEGORY_MASK_Nl
;
1009 case 'o': return UC_CATEGORY_MASK_No
;
1013 switch (category_name
[1])
1015 case '\0': return UC_CATEGORY_MASK_P
;
1016 case 'c': return UC_CATEGORY_MASK_Pc
;
1017 case 'd': return UC_CATEGORY_MASK_Pd
;
1018 case 's': return UC_CATEGORY_MASK_Ps
;
1019 case 'e': return UC_CATEGORY_MASK_Pe
;
1020 case 'i': return UC_CATEGORY_MASK_Pi
;
1021 case 'f': return UC_CATEGORY_MASK_Pf
;
1022 case 'o': return UC_CATEGORY_MASK_Po
;
1026 switch (category_name
[1])
1028 case '\0': return UC_CATEGORY_MASK_S
;
1029 case 'm': return UC_CATEGORY_MASK_Sm
;
1030 case 'c': return UC_CATEGORY_MASK_Sc
;
1031 case 'k': return UC_CATEGORY_MASK_Sk
;
1032 case 'o': return UC_CATEGORY_MASK_So
;
1036 switch (category_name
[1])
1038 case '\0': return UC_CATEGORY_MASK_Z
;
1039 case 's': return UC_CATEGORY_MASK_Zs
;
1040 case 'l': return UC_CATEGORY_MASK_Zl
;
1041 case 'p': return UC_CATEGORY_MASK_Zp
;
1045 switch (category_name
[1])
1047 case '\0': return UC_CATEGORY_MASK_C
;
1048 case 'c': return UC_CATEGORY_MASK_Cc
;
1049 case 'f': return UC_CATEGORY_MASK_Cf
;
1050 case 's': return UC_CATEGORY_MASK_Cs
;
1051 case 'o': return UC_CATEGORY_MASK_Co
;
1052 case 'n': return UC_CATEGORY_MASK_Cn
;
1056 /* Invalid category name. */
1060 /* Construction of sparse 3-level tables. */
1061 #define TABLE category_table
1062 #define ELEMENT uint8_t
1063 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
1064 #define xmalloc malloc
1065 #define xrealloc realloc
1068 /* Output the per-character category table. */
1070 output_category (const char *filename
, const char *version
)
1074 struct category_table t
;
1075 unsigned int level1_offset
, level2_offset
, level3_offset
;
1076 uint16_t *level3_packed
;
1078 stream
= fopen (filename
, "w");
1081 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
1085 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1086 fprintf (stream
, "/* Categories of Unicode characters. */\n");
1087 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1089 fprintf (stream
, "\n");
1091 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
1092 fprintf (stream
, "\n");
1093 output_library_license (stream
, true);
1094 fprintf (stream
, "\n");
1098 category_table_init (&t
);
1100 for (ch
= 0; ch
< 0x110000; ch
++)
1103 unsigned int log2_value
;
1105 if (is_category_Cs (ch
))
1106 value
= UC_CATEGORY_MASK_Cs
;
1107 else if (unicode_attributes
[ch
].name
!= NULL
)
1108 value
= general_category_byname (unicode_attributes
[ch
].category
);
1112 /* Now value should contain exactly one bit. */
1113 assert (value
!= 0 && (value
& (value
- 1)) == 0);
1115 for (log2_value
= 0; value
> 1; value
>>= 1, log2_value
++);
1117 assert (log2_value
<= 0x1f);
1119 category_table_add (&t
, ch
, log2_value
);
1122 category_table_finalize (&t
);
1124 /* Offsets in t.result, in memory of this process. */
1126 5 * sizeof (uint32_t);
1128 5 * sizeof (uint32_t)
1129 + t
.level1_size
* sizeof (uint32_t);
1131 5 * sizeof (uint32_t)
1132 + t
.level1_size
* sizeof (uint32_t)
1133 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
1135 for (i
= 0; i
< 5; i
++)
1136 fprintf (stream
, "#define category_header_%d %d\n", i
,
1137 ((uint32_t *) t
.result
)[i
]);
1138 fprintf (stream
, "static const\n");
1139 fprintf (stream
, "struct\n");
1140 fprintf (stream
, " {\n");
1141 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
1142 fprintf (stream
, " unsigned short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
1143 fprintf (stream
, " unsigned short level3[%zu * %d + 1];\n", t
.level3_size
,
1144 (1 << t
.p
) * 5 / 16);
1145 fprintf (stream
, " }\n");
1146 fprintf (stream
, "u_category =\n");
1147 fprintf (stream
, "{\n");
1148 fprintf (stream
, " {");
1149 if (t
.level1_size
> 8)
1150 fprintf (stream
, "\n ");
1151 for (i
= 0; i
< t
.level1_size
; i
++)
1154 if (i
> 0 && (i
% 8) == 0)
1155 fprintf (stream
, "\n ");
1156 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
1158 fprintf (stream
, " %5d", -1);
1160 fprintf (stream
, " %5zu",
1161 (offset
- level2_offset
) / sizeof (uint32_t));
1162 if (i
+1 < t
.level1_size
)
1163 fprintf (stream
, ",");
1165 if (t
.level1_size
> 8)
1166 fprintf (stream
, "\n ");
1167 fprintf (stream
, " },\n");
1168 fprintf (stream
, " {");
1169 if (t
.level2_size
<< t
.q
> 8)
1170 fprintf (stream
, "\n ");
1171 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
1174 if (i
> 0 && (i
% 8) == 0)
1175 fprintf (stream
, "\n ");
1176 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
1177 /* To make the level2 values fit in 16 bits, we use 'unsigned short'
1178 instead of 'short' and add 1 to each value. */
1180 fprintf (stream
, " %5d", -1 + 1);
1182 fprintf (stream
, " %5zu",
1183 (offset
- level3_offset
) / sizeof (uint8_t) + 1);
1184 if (i
+1 < t
.level2_size
<< t
.q
)
1185 fprintf (stream
, ",");
1187 if (t
.level2_size
<< t
.q
> 8)
1188 fprintf (stream
, "\n ");
1189 fprintf (stream
, " },\n");
1190 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1191 not 32-bit units, in order to make the lookup function easier. */
1194 calloc ((t
.level3_size
<< t
.p
) * 5 / 16 + 1, sizeof (uint16_t));
1195 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
1197 unsigned int j
= (i
* 5) / 16;
1198 unsigned int k
= (i
* 5) % 16;
1199 uint32_t value
= ((unsigned char *) (t
.result
+ level3_offset
))[i
];
1200 value
= level3_packed
[j
] | (level3_packed
[j
+1] << 16) | (value
<< k
);
1201 level3_packed
[j
] = value
& 0xffff;
1202 level3_packed
[j
+1] = value
>> 16;
1204 fprintf (stream
, " {");
1205 if ((t
.level3_size
<< t
.p
) * 5 / 16 + 1 > 8)
1206 fprintf (stream
, "\n ");
1207 for (i
= 0; i
< (t
.level3_size
<< t
.p
) * 5 / 16 + 1; i
++)
1209 if (i
> 0 && (i
% 8) == 0)
1210 fprintf (stream
, "\n ");
1211 fprintf (stream
, " 0x%04x", level3_packed
[i
]);
1212 if (i
+1 < (t
.level3_size
<< t
.p
) * 5 / 16 + 1)
1213 fprintf (stream
, ",");
1215 if ((t
.level3_size
<< t
.p
) * 5 / 16 + 1 > 8)
1216 fprintf (stream
, "\n ");
1217 fprintf (stream
, " }\n");
1218 free (level3_packed
);
1219 fprintf (stream
, "};\n");
1221 if (ferror (stream
) || fclose (stream
))
1223 fprintf (stderr
, "error writing to '%s'\n", filename
);
1228 /* ========================================================================= */
1230 /* Canonical combining class. */
/* See Unicode 3.0 book, section 4.2.  */
1234 /* Construction of sparse 3-level tables. */
1235 #define TABLE combclass_table
1236 #define ELEMENT uint8_t
1238 #define xmalloc malloc
1239 #define xrealloc realloc
1242 /* Output the per-character combining class table. */
1244 output_combclass (const char *filename
, const char *version
)
1248 struct combclass_table t
;
1249 unsigned int level1_offset
, level2_offset
, level3_offset
;
1251 stream
= fopen (filename
, "w");
1254 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
1258 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1259 fprintf (stream
, "/* Combining class of Unicode characters. */\n");
1260 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1262 fprintf (stream
, "\n");
1264 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
1265 fprintf (stream
, "\n");
1266 output_library_license (stream
, true);
1267 fprintf (stream
, "\n");
1271 combclass_table_init (&t
);
1273 for (ch
= 0; ch
< 0x110000; ch
++)
1274 if (unicode_attributes
[ch
].name
!= NULL
)
1276 int value
= atoi (unicode_attributes
[ch
].combining
);
1277 assert (value
>= 0 && value
<= 255);
1278 combclass_table_add (&t
, ch
, value
);
1281 combclass_table_finalize (&t
);
1283 /* Offsets in t.result, in memory of this process. */
1285 5 * sizeof (uint32_t);
1287 5 * sizeof (uint32_t)
1288 + t
.level1_size
* sizeof (uint32_t);
1290 5 * sizeof (uint32_t)
1291 + t
.level1_size
* sizeof (uint32_t)
1292 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
1294 for (i
= 0; i
< 5; i
++)
1295 fprintf (stream
, "#define combclass_header_%d %d\n", i
,
1296 ((uint32_t *) t
.result
)[i
]);
1297 fprintf (stream
, "static const\n");
1298 fprintf (stream
, "struct\n");
1299 fprintf (stream
, " {\n");
1300 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
1301 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
1302 fprintf (stream
, " unsigned char level3[%zu << %d];\n", t
.level3_size
, t
.p
);
1303 fprintf (stream
, " }\n");
1304 fprintf (stream
, "u_combclass =\n");
1305 fprintf (stream
, "{\n");
1306 fprintf (stream
, " {");
1307 if (t
.level1_size
> 8)
1308 fprintf (stream
, "\n ");
1309 for (i
= 0; i
< t
.level1_size
; i
++)
1312 if (i
> 0 && (i
% 8) == 0)
1313 fprintf (stream
, "\n ");
1314 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
1316 fprintf (stream
, " %5d", -1);
1318 fprintf (stream
, " %5zu",
1319 (offset
- level2_offset
) / sizeof (uint32_t));
1320 if (i
+1 < t
.level1_size
)
1321 fprintf (stream
, ",");
1323 if (t
.level1_size
> 8)
1324 fprintf (stream
, "\n ");
1325 fprintf (stream
, " },\n");
1326 fprintf (stream
, " {");
1327 if (t
.level2_size
<< t
.q
> 8)
1328 fprintf (stream
, "\n ");
1329 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
1332 if (i
> 0 && (i
% 8) == 0)
1333 fprintf (stream
, "\n ");
1334 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
1336 fprintf (stream
, " %5d", -1);
1338 fprintf (stream
, " %5zu",
1339 (offset
- level3_offset
) / sizeof (uint8_t));
1340 if (i
+1 < t
.level2_size
<< t
.q
)
1341 fprintf (stream
, ",");
1343 if (t
.level2_size
<< t
.q
> 8)
1344 fprintf (stream
, "\n ");
1345 fprintf (stream
, " },\n");
1346 fprintf (stream
, " {");
1347 if (t
.level3_size
<< t
.p
> 8)
1348 fprintf (stream
, "\n ");
1349 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
1351 if (i
> 0 && (i
% 8) == 0)
1352 fprintf (stream
, "\n ");
1353 fprintf (stream
, " %3d", ((uint8_t *) (t
.result
+ level3_offset
))[i
]);
1354 if (i
+1 < t
.level3_size
<< t
.p
)
1355 fprintf (stream
, ",");
1357 if (t
.level3_size
<< t
.p
> 8)
1358 fprintf (stream
, "\n ");
1359 fprintf (stream
, " }\n");
1360 fprintf (stream
, "};\n");
1362 if (ferror (stream
) || fclose (stream
))
1364 fprintf (stderr
, "error writing to '%s'\n", filename
);
1369 /* ========================================================================= */
1371 /* Bidirectional category. */
/* See Unicode 3.0 book, section 4.3.  */

/* Bidi categories.  The numeric values are the order of declaration,
   starting at 0.  */
enum
{
  UC_BIDI_L,   /* Left-to-Right */
  UC_BIDI_LRE, /* Left-to-Right Embedding */
  UC_BIDI_LRO, /* Left-to-Right Override */
  UC_BIDI_R,   /* Right-to-Left */
  UC_BIDI_AL,  /* Right-to-Left Arabic */
  UC_BIDI_RLE, /* Right-to-Left Embedding */
  UC_BIDI_RLO, /* Right-to-Left Override */
  UC_BIDI_PDF, /* Pop Directional Format */
  UC_BIDI_EN,  /* European Number */
  UC_BIDI_ES,  /* European Number Separator */
  UC_BIDI_ET,  /* European Number Terminator */
  UC_BIDI_AN,  /* Arabic Number */
  UC_BIDI_CS,  /* Common Number Separator */
  UC_BIDI_NSM, /* Non-Spacing Mark */
  UC_BIDI_BN,  /* Boundary Neutral */
  UC_BIDI_B,   /* Paragraph Separator */
  UC_BIDI_S,   /* Segment Separator */
  UC_BIDI_WS,  /* Whitespace */
  UC_BIDI_ON,  /* Other Neutral */
  UC_BIDI_LRI, /* Left-to-Right Isolate */
  UC_BIDI_RLI, /* Right-to-Left Isolate */
  UC_BIDI_FSI, /* First Strong Isolate */
  UC_BIDI_PDI  /* Pop Directional Isolate */
};

/* Return the UC_BIDI_* value whose abbreviation is exactly CATEGORY_NAME
   (a NUL-terminated string such as "L", "AL" or "PDI").
   Aborts the program if CATEGORY_NAME is not a known abbreviation.  */
static int
bidi_category_byname (const char *category_name)
{
  /* Abbreviations, indexed by the enum value they denote.  */
  static const char * const names[] =
    {
      [UC_BIDI_L] = "L",
      [UC_BIDI_LRE] = "LRE",
      [UC_BIDI_LRO] = "LRO",
      [UC_BIDI_R] = "R",
      [UC_BIDI_AL] = "AL",
      [UC_BIDI_RLE] = "RLE",
      [UC_BIDI_RLO] = "RLO",
      [UC_BIDI_PDF] = "PDF",
      [UC_BIDI_EN] = "EN",
      [UC_BIDI_ES] = "ES",
      [UC_BIDI_ET] = "ET",
      [UC_BIDI_AN] = "AN",
      [UC_BIDI_CS] = "CS",
      [UC_BIDI_NSM] = "NSM",
      [UC_BIDI_BN] = "BN",
      [UC_BIDI_B] = "B",
      [UC_BIDI_S] = "S",
      [UC_BIDI_WS] = "WS",
      [UC_BIDI_ON] = "ON",
      [UC_BIDI_LRI] = "LRI",
      [UC_BIDI_RLI] = "RLI",
      [UC_BIDI_FSI] = "FSI",
      [UC_BIDI_PDI] = "PDI"
    };
  unsigned int i;

  for (i = 0; i < sizeof (names) / sizeof (names[0]); i++)
    if (strcmp (category_name, names[i]) == 0)
      return (int) i;

  /* Invalid bidi category name.  */
  abort ();
}
1578 get_bidi_category (unsigned int ch
)
1580 if (unicode_attributes
[ch
].name
!= NULL
)
1581 return bidi_category_byname (unicode_attributes
[ch
].bidi
);
1584 /* The bidi category of unassigned characters depends on the range.
1585 See UTR #9 and DerivedBidiClass.txt. */
1586 if ((ch
>= 0x0590 && ch
<= 0x05FF)
1587 || (ch
>= 0x07FB && ch
<= 0x08FF)
1588 || (ch
>= 0xFB37 && ch
<= 0xFB45)
1589 || (ch
>= 0x10800 && ch
<= 0x10FFF))
1591 else if ((ch
>= 0x0600 && ch
<= 0x07BF)
1592 || (ch
>= 0x2064 && ch
<= 0x2069)
1593 || (ch
>= 0xFBB2 && ch
<= 0xFDCF)
1594 || (ch
>= 0xFDFE && ch
<= 0xFEFE))
1596 else if ((ch
>= 0xFDD0 && ch
<= 0xFDEF)
1597 || (ch
>= 0xFFF0 && ch
<= 0xFFFF)
1598 || (ch
& 0xFFFF) == 0xFFFE
1599 || (ch
& 0xFFFF) == 0xFFFF
1600 || (ch
>= 0xE0000 && ch
<= 0xE0FFF))
1607 /* Construction of sparse 3-level tables. */
1608 #define TABLE bidi_category_table
1609 #define ELEMENT uint8_t
1610 #define DEFAULT UC_BIDI_L
1611 #define xmalloc malloc
1612 #define xrealloc realloc
1615 /* Output the per-character bidi category table. */
1617 output_bidi_category (const char *filename
, const char *version
)
1621 struct bidi_category_table t
;
1622 unsigned int level1_offset
, level2_offset
, level3_offset
;
1623 uint16_t *level3_packed
;
1625 stream
= fopen (filename
, "w");
1628 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
1632 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1633 fprintf (stream
, "/* Bidi categories of Unicode characters. */\n");
1634 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1636 fprintf (stream
, "\n");
1638 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
1639 fprintf (stream
, "\n");
1640 output_library_license (stream
, true);
1641 fprintf (stream
, "\n");
1645 bidi_category_table_init (&t
);
1647 for (ch
= 0; ch
< 0x110000; ch
++)
1649 int value
= get_bidi_category (ch
);
1651 assert (value
<= 0x1f);
1653 bidi_category_table_add (&t
, ch
, value
);
1656 bidi_category_table_finalize (&t
);
1658 /* Offsets in t.result, in memory of this process. */
1660 5 * sizeof (uint32_t);
1662 5 * sizeof (uint32_t)
1663 + t
.level1_size
* sizeof (uint32_t);
1665 5 * sizeof (uint32_t)
1666 + t
.level1_size
* sizeof (uint32_t)
1667 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
1669 for (i
= 0; i
< 5; i
++)
1670 fprintf (stream
, "#define bidi_category_header_%d %d\n", i
,
1671 ((uint32_t *) t
.result
)[i
]);
1672 fprintf (stream
, "static const\n");
1673 fprintf (stream
, "struct\n");
1674 fprintf (stream
, " {\n");
1675 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
1676 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
1677 fprintf (stream
, " unsigned short level3[%zu * %d + 1];\n", t
.level3_size
,
1678 (1 << t
.p
) * 5 / 16);
1679 fprintf (stream
, " }\n");
1680 fprintf (stream
, "u_bidi_category =\n");
1681 fprintf (stream
, "{\n");
1682 fprintf (stream
, " {");
1683 if (t
.level1_size
> 8)
1684 fprintf (stream
, "\n ");
1685 for (i
= 0; i
< t
.level1_size
; i
++)
1688 if (i
> 0 && (i
% 8) == 0)
1689 fprintf (stream
, "\n ");
1690 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
1692 fprintf (stream
, " %5d", -1);
1694 fprintf (stream
, " %5zu",
1695 (offset
- level2_offset
) / sizeof (uint32_t));
1696 if (i
+1 < t
.level1_size
)
1697 fprintf (stream
, ",");
1699 if (t
.level1_size
> 8)
1700 fprintf (stream
, "\n ");
1701 fprintf (stream
, " },\n");
1702 fprintf (stream
, " {");
1703 if (t
.level2_size
<< t
.q
> 8)
1704 fprintf (stream
, "\n ");
1705 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
1708 if (i
> 0 && (i
% 8) == 0)
1709 fprintf (stream
, "\n ");
1710 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
1712 fprintf (stream
, " %5d", -1);
1714 fprintf (stream
, " %5zu",
1715 (offset
- level3_offset
) / sizeof (uint8_t));
1716 if (i
+1 < t
.level2_size
<< t
.q
)
1717 fprintf (stream
, ",");
1719 if (t
.level2_size
<< t
.q
> 8)
1720 fprintf (stream
, "\n ");
1721 fprintf (stream
, " },\n");
1722 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1723 not 32-bit units, in order to make the lookup function easier. */
1726 calloc ((t
.level3_size
<< t
.p
) * 5 / 16 + 1, sizeof (uint16_t));
1727 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
1729 unsigned int j
= (i
* 5) / 16;
1730 unsigned int k
= (i
* 5) % 16;
1731 uint32_t value
= ((unsigned char *) (t
.result
+ level3_offset
))[i
];
1732 value
= level3_packed
[j
] | (level3_packed
[j
+1] << 16) | (value
<< k
);
1733 level3_packed
[j
] = value
& 0xffff;
1734 level3_packed
[j
+1] = value
>> 16;
1736 fprintf (stream
, " {");
1737 if ((t
.level3_size
<< t
.p
) * 5 / 16 + 1 > 8)
1738 fprintf (stream
, "\n ");
1739 for (i
= 0; i
< (t
.level3_size
<< t
.p
) * 5 / 16 + 1; i
++)
1741 if (i
> 0 && (i
% 8) == 0)
1742 fprintf (stream
, "\n ");
1743 fprintf (stream
, " 0x%04x", level3_packed
[i
]);
1744 if (i
+1 < (t
.level3_size
<< t
.p
) * 5 / 16 + 1)
1745 fprintf (stream
, ",");
1747 if ((t
.level3_size
<< t
.p
) * 5 / 16 + 1 > 8)
1748 fprintf (stream
, "\n ");
1749 fprintf (stream
, " }\n");
1750 free (level3_packed
);
1751 fprintf (stream
, "};\n");
1753 if (ferror (stream
) || fclose (stream
))
1755 fprintf (stderr
, "error writing to '%s'\n", filename
);
1760 /* ========================================================================= */
1762 /* Decimal digit value. */
1763 /* See Unicode 3.0 book, section 4.6. */
1766 get_decdigit_value (unsigned int ch
)
1768 if (unicode_attributes
[ch
].name
!= NULL
1769 && unicode_attributes
[ch
].decdigit
[0] != '\0')
1770 return atoi (unicode_attributes
[ch
].decdigit
);
1774 /* Construction of sparse 3-level tables. */
1775 #define TABLE decdigit_table
1776 #define ELEMENT uint8_t
1778 #define xmalloc malloc
1779 #define xrealloc realloc
/* Output the unit test data for the per-character decimal digit value
   table: one "{ code point, value }" initializer for each character that
   has a decimal digit value, separated by ",\n".
   FILENAME is the file to create.  VERSION is the Unicode version string
   mentioned in the generated header comment.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_decimal_digit_test (const char *filename, const char *version)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");

  fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  output_tests_license (stream);
  fprintf (stream, "\n");

  /* NOTE(review): the need_comma bookkeeping and the "value >= 0" filter
     were dropped from the extract and are restored here -- confirm.  */
  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    {
      int value = get_decdigit_value (ch);

      assert (value >= -1 && value < 10);

      if (value >= 0)
        {
          /* The separator goes before every entry except the first, so
             that the last entry has no trailing comma.  */
          if (need_comma)
            fprintf (stream, ",\n");
          fprintf (stream, " { 0x%04X, %d }", ch, value);
          need_comma = true;
        }
    }
  if (need_comma)
    fprintf (stream, "\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
1833 /* Output the per-character decimal digit value table. */
1835 output_decimal_digit (const char *filename
, const char *version
)
1839 struct decdigit_table t
;
1840 unsigned int level1_offset
, level2_offset
, level3_offset
;
1842 stream
= fopen (filename
, "w");
1845 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
1849 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1850 fprintf (stream
, "/* Decimal digit values of Unicode characters. */\n");
1851 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1853 fprintf (stream
, "\n");
1855 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
1856 fprintf (stream
, "\n");
1857 output_library_license (stream
, false);
1858 fprintf (stream
, "\n");
1862 decdigit_table_init (&t
);
1864 for (ch
= 0; ch
< 0x110000; ch
++)
1866 int value
= 1 + get_decdigit_value (ch
);
1868 assert (value
>= 0 && value
<= 10);
1870 decdigit_table_add (&t
, ch
, value
);
1873 decdigit_table_finalize (&t
);
1875 /* Offsets in t.result, in memory of this process. */
1877 5 * sizeof (uint32_t);
1879 5 * sizeof (uint32_t)
1880 + t
.level1_size
* sizeof (uint32_t);
1882 5 * sizeof (uint32_t)
1883 + t
.level1_size
* sizeof (uint32_t)
1884 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
1886 for (i
= 0; i
< 5; i
++)
1887 fprintf (stream
, "#define decdigit_header_%d %d\n", i
,
1888 ((uint32_t *) t
.result
)[i
]);
1889 fprintf (stream
, "static const\n");
1890 fprintf (stream
, "struct\n");
1891 fprintf (stream
, " {\n");
1892 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
1893 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
1894 fprintf (stream
, " unsigned char level3[%zu << %d];\n", t
.level3_size
,
1896 fprintf (stream
, " }\n");
1897 fprintf (stream
, "u_decdigit =\n");
1898 fprintf (stream
, "{\n");
1899 fprintf (stream
, " {");
1900 if (t
.level1_size
> 8)
1901 fprintf (stream
, "\n ");
1902 for (i
= 0; i
< t
.level1_size
; i
++)
1905 if (i
> 0 && (i
% 8) == 0)
1906 fprintf (stream
, "\n ");
1907 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
1909 fprintf (stream
, " %5d", -1);
1911 fprintf (stream
, " %5zu",
1912 (offset
- level2_offset
) / sizeof (uint32_t));
1913 if (i
+1 < t
.level1_size
)
1914 fprintf (stream
, ",");
1916 if (t
.level1_size
> 8)
1917 fprintf (stream
, "\n ");
1918 fprintf (stream
, " },\n");
1919 fprintf (stream
, " {");
1920 if (t
.level2_size
<< t
.q
> 8)
1921 fprintf (stream
, "\n ");
1922 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
1925 if (i
> 0 && (i
% 8) == 0)
1926 fprintf (stream
, "\n ");
1927 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
1929 fprintf (stream
, " %5d", -1);
1931 fprintf (stream
, " %5zu",
1932 (offset
- level3_offset
) / sizeof (uint8_t));
1933 if (i
+1 < t
.level2_size
<< t
.q
)
1934 fprintf (stream
, ",");
1936 if (t
.level2_size
<< t
.q
> 8)
1937 fprintf (stream
, "\n ");
1938 fprintf (stream
, " },\n");
1939 /* Pack the level3 array. Each entry needs 4 bits only. */
1940 fprintf (stream
, " {");
1941 if (t
.level3_size
<< (t
.p
- 1) > 8)
1942 fprintf (stream
, "\n ");
1943 for (i
= 0; i
< t
.level3_size
<< (t
.p
- 1); i
++)
1945 if (i
> 0 && (i
% 8) == 0)
1946 fprintf (stream
, "\n ");
1947 fprintf (stream
, " 0x%02x",
1948 ((uint8_t *) (t
.result
+ level3_offset
))[2*i
]
1949 + (((uint8_t *) (t
.result
+ level3_offset
))[2*i
+1] << 4));
1950 if (i
+1 < t
.level3_size
<< (t
.p
- 1))
1951 fprintf (stream
, ",");
1953 if (t
.level3_size
<< (t
.p
- 1) > 8)
1954 fprintf (stream
, "\n ");
1955 fprintf (stream
, " }\n");
1956 fprintf (stream
, "};\n");
1958 if (ferror (stream
) || fclose (stream
))
1960 fprintf (stderr
, "error writing to '%s'\n", filename
);
1965 /* ========================================================================= */
1968 /* See Unicode 3.0 book, section 4.6. */
1971 get_digit_value (unsigned int ch
)
1973 if (unicode_attributes
[ch
].name
!= NULL
1974 && unicode_attributes
[ch
].digit
[0] != '\0')
1975 return atoi (unicode_attributes
[ch
].digit
);
/* Output the unit test data for the per-character digit value table: one
   "{ code point, value }" initializer for each character that has a digit
   value, separated by ",\n".
   FILENAME is the file to create.  VERSION is the Unicode version string
   mentioned in the generated header comment.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_digit_test (const char *filename, const char *version)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Digit values of Unicode characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");

  fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  output_tests_license (stream);
  fprintf (stream, "\n");

  /* NOTE(review): the need_comma bookkeeping and the "value >= 0" filter
     were dropped from the extract and are restored here -- confirm.  */
  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    {
      int value = get_digit_value (ch);

      assert (value >= -1 && value < 10);

      if (value >= 0)
        {
          /* The separator goes before every entry except the first, so
             that the last entry has no trailing comma.  */
          if (need_comma)
            fprintf (stream, ",\n");
          fprintf (stream, " { 0x%04X, %d }", ch, value);
          need_comma = true;
        }
    }
  if (need_comma)
    fprintf (stream, "\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
2030 /* Output the per-character digit value table. */
2032 output_digit (const char *filename
, const char *version
)
2036 struct decdigit_table t
;
2037 unsigned int level1_offset
, level2_offset
, level3_offset
;
2039 stream
= fopen (filename
, "w");
2042 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
2046 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2047 fprintf (stream
, "/* Digit values of Unicode characters. */\n");
2048 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2050 fprintf (stream
, "\n");
2052 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
2053 fprintf (stream
, "\n");
2054 output_library_license (stream
, false);
2055 fprintf (stream
, "\n");
2059 decdigit_table_init (&t
);
2061 for (ch
= 0; ch
< 0x110000; ch
++)
2063 int value
= 1 + get_digit_value (ch
);
2065 assert (value
>= 0 && value
<= 10);
2067 decdigit_table_add (&t
, ch
, value
);
2070 decdigit_table_finalize (&t
);
2072 /* Offsets in t.result, in memory of this process. */
2074 5 * sizeof (uint32_t);
2076 5 * sizeof (uint32_t)
2077 + t
.level1_size
* sizeof (uint32_t);
2079 5 * sizeof (uint32_t)
2080 + t
.level1_size
* sizeof (uint32_t)
2081 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
2083 for (i
= 0; i
< 5; i
++)
2084 fprintf (stream
, "#define digit_header_%d %d\n", i
,
2085 ((uint32_t *) t
.result
)[i
]);
2086 fprintf (stream
, "static const\n");
2087 fprintf (stream
, "struct\n");
2088 fprintf (stream
, " {\n");
2089 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
2090 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
2091 fprintf (stream
, " unsigned char level3[%zu << %d];\n", t
.level3_size
,
2093 fprintf (stream
, " }\n");
2094 fprintf (stream
, "u_digit =\n");
2095 fprintf (stream
, "{\n");
2096 fprintf (stream
, " {");
2097 if (t
.level1_size
> 8)
2098 fprintf (stream
, "\n ");
2099 for (i
= 0; i
< t
.level1_size
; i
++)
2102 if (i
> 0 && (i
% 8) == 0)
2103 fprintf (stream
, "\n ");
2104 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
2106 fprintf (stream
, " %5d", -1);
2108 fprintf (stream
, " %5zu",
2109 (offset
- level2_offset
) / sizeof (uint32_t));
2110 if (i
+1 < t
.level1_size
)
2111 fprintf (stream
, ",");
2113 if (t
.level1_size
> 8)
2114 fprintf (stream
, "\n ");
2115 fprintf (stream
, " },\n");
2116 fprintf (stream
, " {");
2117 if (t
.level2_size
<< t
.q
> 8)
2118 fprintf (stream
, "\n ");
2119 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
2122 if (i
> 0 && (i
% 8) == 0)
2123 fprintf (stream
, "\n ");
2124 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
2126 fprintf (stream
, " %5d", -1);
2128 fprintf (stream
, " %5zu",
2129 (offset
- level3_offset
) / sizeof (uint8_t));
2130 if (i
+1 < t
.level2_size
<< t
.q
)
2131 fprintf (stream
, ",");
2133 if (t
.level2_size
<< t
.q
> 8)
2134 fprintf (stream
, "\n ");
2135 fprintf (stream
, " },\n");
2136 /* Pack the level3 array. Each entry needs 4 bits only. */
2137 fprintf (stream
, " {");
2138 if (t
.level3_size
<< (t
.p
- 1) > 8)
2139 fprintf (stream
, "\n ");
2140 for (i
= 0; i
< t
.level3_size
<< (t
.p
- 1); i
++)
2142 if (i
> 0 && (i
% 8) == 0)
2143 fprintf (stream
, "\n ");
2144 fprintf (stream
, " 0x%02x",
2145 ((uint8_t *) (t
.result
+ level3_offset
))[2*i
]
2146 + (((uint8_t *) (t
.result
+ level3_offset
))[2*i
+1] << 4));
2147 if (i
+1 < t
.level3_size
<< (t
.p
- 1))
2148 fprintf (stream
, ",");
2150 if (t
.level3_size
<< (t
.p
- 1) > 8)
2151 fprintf (stream
, "\n ");
2152 fprintf (stream
, " }\n");
2153 fprintf (stream
, "};\n");
2155 if (ferror (stream
) || fclose (stream
))
2157 fprintf (stderr
, "error writing to '%s'\n", filename
);
2162 /* ========================================================================= */
2164 /* Numeric value. */
2165 /* See Unicode 3.0 book, section 4.6. */
/* A numeric value represented as a fraction.  */
typedef struct
{
  int numerator;
  int denominator;
}
uc_fraction_t;
2169 static uc_fraction_t
2170 get_numeric_value (unsigned int ch
)
2172 uc_fraction_t value
;
2174 if (unicode_attributes
[ch
].name
!= NULL
2175 && unicode_attributes
[ch
].numeric
[0] != '\0')
2177 const char *str
= unicode_attributes
[ch
].numeric
;
2178 /* str is of the form "integer" or "integer/posinteger". */
2179 value
.numerator
= atoi (str
);
2180 if (strchr (str
, '/') != NULL
)
2181 value
.denominator
= atoi (strchr (str
, '/') + 1);
2183 value
.denominator
= 1;
2187 value
.numerator
= 0;
2188 value
.denominator
= 0;
2193 /* Output the unit test for the per-character numeric value table. */
2195 output_numeric_test (const char *filename
, const char *version
)
2201 stream
= fopen (filename
, "w");
2204 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
2208 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2209 fprintf (stream
, "/* Numeric values of Unicode characters. */\n");
2210 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2212 fprintf (stream
, "\n");
2214 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
2215 fprintf (stream
, "\n");
2216 output_tests_license (stream
);
2217 fprintf (stream
, "\n");
2220 for (ch
= 0; ch
< 0x110000; ch
++)
2222 uc_fraction_t value
= get_numeric_value (ch
);
2224 if (value
.numerator
!= 0 || value
.denominator
!= 0)
2227 fprintf (stream
, ",\n");
2228 fprintf (stream
, " { 0x%04X, %d, %d }",
2229 ch
, value
.numerator
, value
.denominator
);
2234 fprintf (stream
, "\n");
2236 if (ferror (stream
) || fclose (stream
))
2238 fprintf (stderr
, "error writing to '%s'\n", filename
);
2243 /* Construction of sparse 3-level tables. */
2244 #define TABLE numeric_table
2245 #define ELEMENT uint8_t
2247 #define xmalloc malloc
2248 #define xrealloc realloc
2251 /* Output the per-character numeric value table. */
2253 output_numeric (const char *filename
, const char *version
)
2256 uc_fraction_t fractions
[160];
2257 unsigned int nfractions
;
2258 unsigned int ch
, i
, j
;
2259 struct numeric_table t
;
2260 unsigned int level1_offset
, level2_offset
, level3_offset
;
2261 uint16_t *level3_packed
;
2263 stream
= fopen (filename
, "w");
2266 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
2270 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2271 fprintf (stream
, "/* Numeric values of Unicode characters. */\n");
2272 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2274 fprintf (stream
, "\n");
2276 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
2277 fprintf (stream
, "\n");
2278 output_library_license (stream
, false);
2279 fprintf (stream
, "\n");
2281 /* Create table of occurring fractions. */
2283 for (ch
= 0; ch
< 0x110000; ch
++)
2285 uc_fraction_t value
= get_numeric_value (ch
);
2287 for (i
= 0; i
< nfractions
; i
++)
2288 if (value
.numerator
== fractions
[i
].numerator
2289 && value
.denominator
== fractions
[i
].denominator
)
2291 if (i
== nfractions
)
2293 assert (nfractions
!= SIZEOF (fractions
));
2294 for (i
= 0; i
< nfractions
; i
++)
2295 if (value
.denominator
< fractions
[i
].denominator
2296 || (value
.denominator
== fractions
[i
].denominator
2297 && value
.numerator
< fractions
[i
].numerator
))
2299 for (j
= nfractions
; j
> i
; j
--)
2300 fractions
[j
] = fractions
[j
- 1];
2301 fractions
[i
] = value
;
2306 fprintf (stream
, "static const uc_fraction_t u_numeric_values[%d] =\n",
2308 fprintf (stream
, "{\n");
2309 for (i
= 0; i
< nfractions
; i
++)
2311 fprintf (stream
, " { %d, %d }", fractions
[i
].numerator
,
2312 fractions
[i
].denominator
);
2313 if (i
+1 < nfractions
)
2314 fprintf (stream
, ",");
2315 fprintf (stream
, "\n");
2317 fprintf (stream
, "};\n");
2321 numeric_table_init (&t
);
2323 for (ch
= 0; ch
< 0x110000; ch
++)
2325 uc_fraction_t value
= get_numeric_value (ch
);
2327 for (i
= 0; i
< nfractions
; i
++)
2328 if (value
.numerator
== fractions
[i
].numerator
2329 && value
.denominator
== fractions
[i
].denominator
)
2331 assert (i
!= nfractions
);
2333 numeric_table_add (&t
, ch
, i
);
2336 numeric_table_finalize (&t
);
2338 /* Offsets in t.result, in memory of this process. */
2340 5 * sizeof (uint32_t);
2342 5 * sizeof (uint32_t)
2343 + t
.level1_size
* sizeof (uint32_t);
2345 5 * sizeof (uint32_t)
2346 + t
.level1_size
* sizeof (uint32_t)
2347 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
2349 for (i
= 0; i
< 5; i
++)
2350 fprintf (stream
, "#define numeric_header_%d %d\n", i
,
2351 ((uint32_t *) t
.result
)[i
]);
2352 fprintf (stream
, "static const\n");
2353 fprintf (stream
, "struct\n");
2354 fprintf (stream
, " {\n");
2355 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
2356 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
2357 fprintf (stream
, " unsigned short level3[%zu * %d + 1];\n", t
.level3_size
,
2358 (1 << t
.p
) * 8 / 16);
2359 fprintf (stream
, " }\n");
2360 fprintf (stream
, "u_numeric =\n");
2361 fprintf (stream
, "{\n");
2362 fprintf (stream
, " {");
2363 if (t
.level1_size
> 8)
2364 fprintf (stream
, "\n ");
2365 for (i
= 0; i
< t
.level1_size
; i
++)
2368 if (i
> 0 && (i
% 8) == 0)
2369 fprintf (stream
, "\n ");
2370 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
2372 fprintf (stream
, " %5d", -1);
2374 fprintf (stream
, " %5zu",
2375 (offset
- level2_offset
) / sizeof (uint32_t));
2376 if (i
+1 < t
.level1_size
)
2377 fprintf (stream
, ",");
2379 if (t
.level1_size
> 8)
2380 fprintf (stream
, "\n ");
2381 fprintf (stream
, " },\n");
2382 fprintf (stream
, " {");
2383 if (t
.level2_size
<< t
.q
> 8)
2384 fprintf (stream
, "\n ");
2385 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
2388 if (i
> 0 && (i
% 8) == 0)
2389 fprintf (stream
, "\n ");
2390 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
2392 fprintf (stream
, " %5d", -1);
2394 fprintf (stream
, " %5zu",
2395 (offset
- level3_offset
) / sizeof (uint8_t));
2396 if (i
+1 < t
.level2_size
<< t
.q
)
2397 fprintf (stream
, ",");
2399 if (t
.level2_size
<< t
.q
> 8)
2400 fprintf (stream
, "\n ");
2401 fprintf (stream
, " },\n");
2402 /* Pack the level3 array. Each entry needs 8 bits only. Use 16-bit units,
2403 not 32-bit units, in order to make the lookup function easier. */
2406 calloc ((t
.level3_size
<< t
.p
) * 8 / 16 + 1, sizeof (uint16_t));
2407 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
2409 unsigned int j
= (i
* 8) / 16;
2410 unsigned int k
= (i
* 8) % 16;
2411 uint32_t value
= ((unsigned char *) (t
.result
+ level3_offset
))[i
];
2412 value
= level3_packed
[j
] | (level3_packed
[j
+1] << 16) | (value
<< k
);
2413 level3_packed
[j
] = value
& 0xffff;
2414 level3_packed
[j
+1] = value
>> 16;
2416 fprintf (stream
, " {");
2417 if ((t
.level3_size
<< t
.p
) * 8 / 16 + 1 > 8)
2418 fprintf (stream
, "\n ");
2419 for (i
= 0; i
< (t
.level3_size
<< t
.p
) * 8 / 16 + 1; i
++)
2421 if (i
> 0 && (i
% 8) == 0)
2422 fprintf (stream
, "\n ");
2423 fprintf (stream
, " 0x%04x", level3_packed
[i
]);
2424 if (i
+1 < (t
.level3_size
<< t
.p
) * 8 / 16 + 1)
2425 fprintf (stream
, ",");
2427 if ((t
.level3_size
<< t
.p
) * 8 / 16 + 1 > 8)
2428 fprintf (stream
, "\n ");
2429 fprintf (stream
, " }\n");
2430 free (level3_packed
);
2431 fprintf (stream
, "};\n");
2433 if (ferror (stream
) || fclose (stream
))
2435 fprintf (stderr
, "error writing to '%s'\n", filename
);
2440 /* ========================================================================= */
2443 /* See Unicode 3.0 book, section 4.7,
/* One entry of the mirroring list: two code points, each of which is
   mapped to the other one by get_mirror_value.  */
struct mirror_pair
{
  unsigned int uc[2];
};

/* The mirrored character pairs collected from the BidiMirroring.txt file.
   This is a subset of the characters having the BidiMirrored property.  */
static struct mirror_pair mirror_pairs[1000];

/* Number of entries of mirror_pairs[] that are in use.  */
static unsigned int mirror_pairs_count;
2454 /* Stores in mirror_pairs[] the mirrored character pairs from the
2455 BidiMirroring.txt file. */
2457 fill_mirror (const char *bidimirroring_filename
)
2460 char field0
[FIELDLEN
];
2461 char field1
[FIELDLEN
];
2462 char field2
[FIELDLEN
];
2465 stream
= fopen (bidimirroring_filename
, "r");
2468 fprintf (stderr
, "error during fopen of '%s'\n", bidimirroring_filename
);
2472 mirror_pairs_count
= 0;
2489 do c
= getc (stream
); while (c
!= EOF
&& c
!= '\n');
2493 n
= getfield (stream
, field0
, ';');
2494 do c
= getc (stream
); while (c
== ' ');
2496 n
+= getfield (stream
, field1
, '#');
2497 n
+= getfield (stream
, field2
, '\n');
2502 fprintf (stderr
, "short line in '%s':%d\n",
2503 bidimirroring_filename
, lineno
);
2506 /* Remove trailing spaces from field1. */
2507 while (strlen (field1
) > 0 && field1
[strlen (field1
) - 1] == ' ')
2508 field1
[strlen (field1
) - 1] = '\0';
2509 /* The line should contain two characters. */
2510 uc1
= strtoul (field0
, NULL
, 16);
2511 uc2
= strtoul (field1
, NULL
, 16);
2512 if (uc1
== 0 || uc2
== 0 || uc1
== uc2
)
2514 fprintf (stderr
, "parse error at '%s':%d\n",
2515 bidimirroring_filename
, lineno
);
2518 /* Verify that uc1 and uc2 are in range. */
2519 if (!(uc1
< 0x110000))
2521 fprintf (stderr
, "%s mentions 0x%04X, which is out-of-range.\n",
2522 bidimirroring_filename
, uc1
);
2525 if (!(uc2
< 0x110000))
2527 fprintf (stderr
, "%s mentions 0x%04X, which is out-of-range.\n",
2528 bidimirroring_filename
, uc2
);
2531 /* Have we seen uc1 or uc2 already? */
2532 for (i
= 0; i
< mirror_pairs_count
; i
++)
2534 if (uc1
== mirror_pairs
[i
].uc
[0])
2536 fprintf (stderr
, "%s: mapping conflict for 0x%04X\n",
2537 bidimirroring_filename
, uc1
);
2540 if (uc2
== mirror_pairs
[i
].uc
[1])
2542 fprintf (stderr
, "%s: mapping conflict for 0x%04X\n",
2543 bidimirroring_filename
, uc2
);
2547 for (i
= 0; i
< mirror_pairs_count
; i
++)
2548 if (uc1
== mirror_pairs
[i
].uc
[1] || uc2
== mirror_pairs
[i
].uc
[0])
2550 if (i
< mirror_pairs_count
)
2552 if (uc1
!= mirror_pairs
[i
].uc
[1])
2554 /* uc1 != mirror_pairs[i].uc[1], uc2 == mirror_pairs[i].uc[0] */
2555 fprintf (stderr
, "%s: mapping conflict for 0x%04X\n",
2556 bidimirroring_filename
, uc2
);
2559 if (uc2
!= mirror_pairs
[i
].uc
[0])
2561 /* uc1 == mirror_pairs[i].uc[1], uc2 != mirror_pairs[i].uc[0] */
2562 fprintf (stderr
, "%s: mapping conflict for 0x%04X\n",
2563 bidimirroring_filename
, uc1
);
2566 /* uc1 == mirror_pairs[i].uc[1], uc2 == mirror_pairs[i].uc[0].
2567 (uc1, uc2) is the reverse pair of a pair that we already had
2568 encountered: (uc2, uc1). */
2573 if (mirror_pairs_count
== SIZEOF (mirror_pairs
))
2575 fprintf (stderr
, "%s contains more pairs than expected, "
2576 "increase mirror_pairs' size.\n",
2577 bidimirroring_filename
);
2580 mirror_pairs
[mirror_pairs_count
].uc
[0] = uc1
;
2581 mirror_pairs
[mirror_pairs_count
].uc
[1] = uc2
;
2582 mirror_pairs_count
++;
2584 /* Verify that uc1 and uc2 have the BidiMirrored property. */
2585 if (!(unicode_attributes
[uc1
].name
!= NULL
2586 && unicode_attributes
[uc1
].mirrored
))
2588 fprintf (stderr
, "%s mentions 0x%04X, which is not BidiMirrored\n",
2589 bidimirroring_filename
, uc1
);
2592 if (!(unicode_attributes
[uc2
].name
!= NULL
2593 && unicode_attributes
[uc2
].mirrored
))
2595 fprintf (stderr
, "%s mentions 0x%04X, which is not BidiMirrored\n",
2596 bidimirroring_filename
, uc2
);
2601 if (ferror (stream
) || fclose (stream
))
2603 fprintf (stderr
, "error reading from '%s'\n", bidimirroring_filename
);
2609 get_mirror_value (unsigned int ch
)
2612 unsigned int mirror_char
;
2615 mirrored
= (unicode_attributes
[ch
].name
!= NULL
2616 && unicode_attributes
[ch
].mirrored
);
2617 mirror_char
= 0xfffd;
2618 for (i
= 0; i
< mirror_pairs_count
; i
++)
2619 if (ch
== mirror_pairs
[i
].uc
[0])
2621 mirror_char
= mirror_pairs
[i
].uc
[1];
2624 else if (ch
== mirror_pairs
[i
].uc
[1])
2626 mirror_char
= mirror_pairs
[i
].uc
[0];
2630 return (int) mirror_char
- (int) ch
;
2633 assert (mirror_char
== 0xfffd);
2638 /* Construction of sparse 3-level tables. */
2639 #define TABLE mirror_table
2640 #define ELEMENT int32_t
2642 #define xmalloc malloc
2643 #define xrealloc realloc
2646 /* Output the per-character mirror table. */
2648 output_mirror (const char *filename
, const char *version
)
2652 struct mirror_table t
;
2653 unsigned int level1_offset
, level2_offset
, level3_offset
;
2655 stream
= fopen (filename
, "w");
2658 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
2662 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2663 fprintf (stream
, "/* Mirrored Unicode characters. */\n");
2664 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2666 fprintf (stream
, "\n");
2668 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
2669 fprintf (stream
, "\n");
2670 output_library_license (stream
, false);
2671 fprintf (stream
, "\n");
2675 mirror_table_init (&t
);
2677 for (ch
= 0; ch
< 0x110000; ch
++)
2679 int value
= get_mirror_value (ch
);
2681 mirror_table_add (&t
, ch
, value
);
2684 mirror_table_finalize (&t
);
2686 /* Offsets in t.result, in memory of this process. */
2688 5 * sizeof (uint32_t);
2690 5 * sizeof (uint32_t)
2691 + t
.level1_size
* sizeof (uint32_t);
2693 5 * sizeof (uint32_t)
2694 + t
.level1_size
* sizeof (uint32_t)
2695 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
2697 for (i
= 0; i
< 5; i
++)
2698 fprintf (stream
, "#define mirror_header_%d %d\n", i
,
2699 ((uint32_t *) t
.result
)[i
]);
2700 fprintf (stream
, "static const\n");
2701 fprintf (stream
, "struct\n");
2702 fprintf (stream
, " {\n");
2703 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
2704 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
2705 fprintf (stream
, " int level3[%zu << %d];\n", t
.level3_size
, t
.p
);
2706 fprintf (stream
, " }\n");
2707 fprintf (stream
, "u_mirror =\n");
2708 fprintf (stream
, "{\n");
2709 fprintf (stream
, " {");
2710 if (t
.level1_size
> 8)
2711 fprintf (stream
, "\n ");
2712 for (i
= 0; i
< t
.level1_size
; i
++)
2715 if (i
> 0 && (i
% 8) == 0)
2716 fprintf (stream
, "\n ");
2717 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
2719 fprintf (stream
, " %5d", -1);
2721 fprintf (stream
, " %5zu",
2722 (offset
- level2_offset
) / sizeof (uint32_t));
2723 if (i
+1 < t
.level1_size
)
2724 fprintf (stream
, ",");
2726 if (t
.level1_size
> 8)
2727 fprintf (stream
, "\n ");
2728 fprintf (stream
, " },\n");
2729 fprintf (stream
, " {");
2730 if (t
.level2_size
<< t
.q
> 8)
2731 fprintf (stream
, "\n ");
2732 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
2735 if (i
> 0 && (i
% 8) == 0)
2736 fprintf (stream
, "\n ");
2737 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
2739 fprintf (stream
, " %5d", -1);
2741 fprintf (stream
, " %5zu",
2742 (offset
- level3_offset
) / sizeof (int32_t));
2743 if (i
+1 < t
.level2_size
<< t
.q
)
2744 fprintf (stream
, ",");
2746 if (t
.level2_size
<< t
.q
> 8)
2747 fprintf (stream
, "\n ");
2748 fprintf (stream
, " },\n");
2749 fprintf (stream
, " {");
2750 if (t
.level3_size
<< t
.p
> 8)
2751 fprintf (stream
, "\n ");
2752 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
2754 if (i
> 0 && (i
% 8) == 0)
2755 fprintf (stream
, "\n ");
2756 fprintf (stream
, " %5d", ((int32_t *) (t
.result
+ level3_offset
))[i
]);
2757 if (i
+1 < t
.level3_size
<< t
.p
)
2758 fprintf (stream
, ",");
2760 if (t
.level3_size
<< t
.p
> 8)
2761 fprintf (stream
, "\n ");
2762 fprintf (stream
, " }\n");
2763 fprintf (stream
, "};\n");
2765 if (ferror (stream
) || fclose (stream
))
2767 fprintf (stderr
, "error writing to '%s'\n", filename
);
2772 /* ========================================================================= */
2774 /* Particular values of the word break property. */
/* Returns true if CH is one of the code points listed here for the
   MIDNUMLET value of the word break property.  */
static bool
is_WBP_MIDNUMLET (unsigned int ch)
{
  switch (ch)
    {
    case 0x002E:
    case 0x2018:
    case 0x2019:
    case 0x2024:
    case 0xFE52:
    case 0xFF07:
    case 0xFF0E:
      return true;
    default:
      return false;
    }
}
2784 is_WBP_MIDLETTER (unsigned int ch
)
2786 return (ch
== 0x00B7 || ch
== 0x05F4 || ch
== 0x2027 || ch
== 0x003A
2787 || ch
== 0x0387 || ch
== 0xFE13 || ch
== 0xFE55 || ch
== 0xFF1A
2791 /* ========================================================================= */
2795 /* Reading PropList.txt and DerivedCoreProperties.txt. */
/* Bit numbers of the binary properties within the elements of
   unicode_properties[].  The order matches the list of property names
   in fill_properties.  */
enum
{
  /* PropList.txt */
  PROP_WHITE_SPACE = 0,
  PROP_BIDI_CONTROL,
  PROP_JOIN_CONTROL,
  PROP_PREPENDED_CONCATENATION_MARK,
  PROP_DASH,
  PROP_HYPHEN,
  PROP_QUOTATION_MARK,
  PROP_TERMINAL_PUNCTUATION,
  PROP_OTHER_MATH,
  PROP_HEX_DIGIT,
  PROP_ASCII_HEX_DIGIT,
  PROP_OTHER_ALPHABETIC,
  PROP_IDEOGRAPHIC,
  PROP_DIACRITIC,
  PROP_EXTENDER,
  PROP_OTHER_LOWERCASE,
  PROP_OTHER_UPPERCASE,
  PROP_NONCHARACTER_CODE_POINT,
  PROP_OTHER_GRAPHEME_EXTEND,
  PROP_IDS_BINARY_OPERATOR,
  PROP_IDS_TRINARY_OPERATOR,
  PROP_IDS_UNARY_OPERATOR,
  PROP_RADICAL,
  PROP_UNIFIED_IDEOGRAPH,
  PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
  PROP_DEPRECATED,
  PROP_SOFT_DOTTED,
  PROP_LOGICAL_ORDER_EXCEPTION,
  PROP_OTHER_ID_START,
  PROP_OTHER_ID_CONTINUE,
  PROP_ID_COMPAT_MATH_CONTINUE,
  PROP_ID_COMPAT_MATH_START,
  PROP_SENTENCE_TERMINAL,
  PROP_VARIATION_SELECTOR,
  PROP_PATTERN_WHITE_SPACE,
  PROP_PATTERN_SYNTAX,
  PROP_REGIONAL_INDICATOR,
  PROP_MODIFIER_COMBINING_MARK,
  /* DerivedCoreProperties.txt */
  PROP_MATH,
  PROP_ALPHABETIC,
  PROP_LOWERCASE,
  PROP_UPPERCASE,
  PROP_CASED,
  PROP_CASE_IGNORABLE,
  PROP_CHANGES_WHEN_LOWERCASED,
  PROP_CHANGES_WHEN_UPPERCASED,
  PROP_CHANGES_WHEN_TITLECASED,
  PROP_CHANGES_WHEN_CASEFOLDED,
  PROP_CHANGES_WHEN_CASEMAPPED,
  PROP_ID_START,
  PROP_ID_CONTINUE,
  PROP_XID_START,
  PROP_XID_CONTINUE,
  PROP_DEFAULT_IGNORABLE_CODE_POINT,
  PROP_GRAPHEME_EXTEND,
  PROP_GRAPHEME_BASE,
  PROP_GRAPHEME_LINK,
  /* emoji-data.txt */
  PROP_EMOJI,
  PROP_EMOJI_PRESENTATION,
  PROP_EMOJI_MODIFIER,
  PROP_EMOJI_MODIFIER_BASE,
  PROP_EMOJI_COMPONENT,
  PROP_EXTENDED_PICTOGRAPHIC
};

/* For each code point, the set of its binary properties: bit number
   PROP_xxx is set if the code point has that property.  */
unsigned long long unicode_properties[0x110000];
/* Values of the InCB property, as stored in
   unicode_indic_conjunct_break[].  */
enum
{
  UC_INDIC_CONJUNCT_BREAK_NONE      = 0, /* None */
  UC_INDIC_CONJUNCT_BREAK_CONSONANT = 1, /* Consonant */
  UC_INDIC_CONJUNCT_BREAK_LINKER    = 2, /* Linker */
  UC_INDIC_CONJUNCT_BREAK_EXTEND    = 3  /* Extend */
};

/* For each code point, its InCB value (one of the
   UC_INDIC_CONJUNCT_BREAK_* constants).  */
static uint8_t unicode_indic_conjunct_break[0x110000];
2877 clear_properties (void)
2881 for (i
= 0; i
< 0x110000; i
++)
2882 unicode_properties
[i
] = 0;
2885 /* Stores in unicode_properties[] the properties from the
2886 PropList.txt or DerivedCoreProperties.txt file. */
2888 fill_properties (const char *proplist_filename
)
2893 stream
= fopen (proplist_filename
, "r");
2896 fprintf (stderr
, "error during fopen of '%s'\n", proplist_filename
);
2903 unsigned int i1
, i2
;
2904 char padding
[200+1];
2905 char propname
[200+1];
2906 char rest_of_line
[200+1];
2907 unsigned int propcode
;
2909 if (fscanf (stream
, "%200[^\n]\n", buf
) < 1)
2912 if (buf
[0] == '\0' || buf
[0] == '#')
2915 if (sscanf (buf
, "%X..%X%[ ;]%[^ #]%200s", &i1
, &i2
, padding
, propname
, rest_of_line
) != 5)
2917 if (sscanf (buf
, "%X%[ ;]%[^ #]%200s", &i1
, padding
, propname
, rest_of_line
) != 4)
2919 fprintf (stderr
, "parse error in '%s'\n", proplist_filename
);
2924 #define PROP(name,code) \
2925 if (strcmp (propname, name) == 0) propcode = code; else
2927 PROP ("White_Space", PROP_WHITE_SPACE
)
2928 PROP ("Bidi_Control", PROP_BIDI_CONTROL
)
2929 PROP ("Join_Control", PROP_JOIN_CONTROL
)
2930 PROP ("Prepended_Concatenation_Mark", PROP_PREPENDED_CONCATENATION_MARK
)
2931 PROP ("Dash", PROP_DASH
)
2932 PROP ("Hyphen", PROP_HYPHEN
)
2933 PROP ("Quotation_Mark", PROP_QUOTATION_MARK
)
2934 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION
)
2935 PROP ("Other_Math", PROP_OTHER_MATH
)
2936 PROP ("Hex_Digit", PROP_HEX_DIGIT
)
2937 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT
)
2938 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC
)
2939 PROP ("Ideographic", PROP_IDEOGRAPHIC
)
2940 PROP ("Diacritic", PROP_DIACRITIC
)
2941 PROP ("Extender", PROP_EXTENDER
)
2942 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE
)
2943 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE
)
2944 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT
)
2945 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND
)
2946 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR
)
2947 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR
)
2948 PROP ("IDS_Unary_Operator", PROP_IDS_UNARY_OPERATOR
)
2949 PROP ("Radical", PROP_RADICAL
)
2950 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH
)
2951 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT
)
2952 PROP ("Deprecated", PROP_DEPRECATED
)
2953 PROP ("Soft_Dotted", PROP_SOFT_DOTTED
)
2954 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION
)
2955 PROP ("Other_ID_Start", PROP_OTHER_ID_START
)
2956 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE
)
2957 PROP ("ID_Compat_Math_Continue", PROP_ID_COMPAT_MATH_CONTINUE
)
2958 PROP ("ID_Compat_Math_Start", PROP_ID_COMPAT_MATH_START
)
2959 PROP ("Sentence_Terminal", PROP_SENTENCE_TERMINAL
)
2960 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR
)
2961 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE
)
2962 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX
)
2963 PROP ("Regional_Indicator", PROP_REGIONAL_INDICATOR
)
2964 PROP ("Modifier_Combining_Mark", PROP_MODIFIER_COMBINING_MARK
)
2965 /* DerivedCoreProperties.txt */
2966 PROP ("Math", PROP_MATH
)
2967 PROP ("Alphabetic", PROP_ALPHABETIC
)
2968 PROP ("Lowercase", PROP_LOWERCASE
)
2969 PROP ("Uppercase", PROP_UPPERCASE
)
2970 PROP ("Cased", PROP_CASED
)
2971 PROP ("Case_Ignorable", PROP_CASE_IGNORABLE
)
2972 PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED
)
2973 PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED
)
2974 PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED
)
2975 PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED
)
2976 PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED
)
2977 PROP ("ID_Start", PROP_ID_START
)
2978 PROP ("ID_Continue", PROP_ID_CONTINUE
)
2979 PROP ("XID_Start", PROP_XID_START
)
2980 PROP ("XID_Continue", PROP_XID_CONTINUE
)
2981 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT
)
2982 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND
)
2983 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE
)
2984 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK
)
2985 /* emoji-data.txt */
2986 PROP ("Emoji", PROP_EMOJI
)
2987 PROP ("Emoji_Presentation", PROP_EMOJI_PRESENTATION
)
2988 PROP ("Emoji_Modifier", PROP_EMOJI_MODIFIER
)
2989 PROP ("Emoji_Modifier_Base", PROP_EMOJI_MODIFIER_BASE
)
2990 PROP ("Emoji_Component", PROP_EMOJI_COMPONENT
)
2991 PROP ("Extended_Pictographic", PROP_EXTENDED_PICTOGRAPHIC
)
2993 /* An enum-valued property from DerivedCoreProperties.txt */
2994 if (strcmp (propname
, "InCB;") == 0)
2996 char valuename
[200+1];
2997 unsigned int valuecode
;
2999 if (sscanf (rest_of_line
, "%[^ #]", valuename
) != 1)
3001 fprintf (stderr
, "parse error 2 in '%s'\n", proplist_filename
);
3005 if (strcmp (valuename
, "None") == 0)
3006 valuecode
= UC_INDIC_CONJUNCT_BREAK_NONE
;
3007 else if (strcmp (valuename
, "Consonant") == 0)
3008 valuecode
= UC_INDIC_CONJUNCT_BREAK_CONSONANT
;
3009 else if (strcmp (valuename
, "Linker") == 0)
3010 valuecode
= UC_INDIC_CONJUNCT_BREAK_LINKER
;
3011 else if (strcmp (valuename
, "Extend") == 0)
3012 valuecode
= UC_INDIC_CONJUNCT_BREAK_EXTEND
;
3015 fprintf (stderr
, "unknown InCB value named '%s' in '%s'\n",
3016 valuename
, proplist_filename
);
3020 assert (i1
<= i2
&& i2
< 0x110000);
3021 for (i
= i1
; i
<= i2
; i
++)
3022 unicode_indic_conjunct_break
[i
] = valuecode
;
3028 fprintf (stderr
, "unknown property named '%s' in '%s'\n", propname
,
3033 assert (i1
<= i2
&& i2
< 0x110000);
3034 for (i
= i1
; i
<= i2
; i
++)
3035 unicode_properties
[i
] |= 1ULL << propcode
;
3040 if (ferror (stream
) || fclose (stream
))
3042 fprintf (stderr
, "error reading from '%s'\n", proplist_filename
);
/* Stores in ARRAY the given property from the Unicode 3.0 PropList.txt
   file named PROPLIST_FILENAME.
   The section of the file is found through the first line that contains
   PROPERTY_NAME.  ARRAY[ch] is set to 1 for every code point ch listed
   there (either alone or as part of a range "XXXX..YYYY"), and to 0 for
   all other code points.  The section ends at the end of the file or at a
   line that starts with '*'.
   Terminates the program on a file error, when the section is not found,
   or on a parse error.  */
static void
fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
{
  unsigned int i;
  FILE *stream;
  char buf[100+1];

  memset (array, 0, 0x110000);

  stream = fopen (proplist_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
      exit (1);
    }

  /* Search for the "Property dump for: ..." line.  */
  for (;;)
    {
      if (fscanf (stream, "%100[^\n]\n", buf) < 1)
        {
          fprintf (stderr, "no property found in '%s'\n", proplist_filename);
          exit (1);
        }
      if (strstr (buf, property_name) != NULL)
        break;
    }

  /* Process the lines of the section.  */
  for (;;)
    {
      unsigned int i1, i2;
      size_t len;

      if (fscanf (stream, "%100[^\n]\n", buf) < 1)
        break;
      if (buf[0] == '*')
        break;

      len = strlen (buf);
      if (len >= 10 && buf[4] == '.' && buf[5] == '.')
        {
          /* A range of code points.  */
          if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
            {
              fprintf (stderr, "parse error in property in '%s'\n",
                       proplist_filename);
              exit (1);
            }
        }
      else if (len >= 4)
        {
          /* A single code point.  */
          if (sscanf (buf, "%4X", &i1) < 1)
            {
              fprintf (stderr, "parse error in property in '%s'\n",
                       proplist_filename);
              exit (1);
            }
          i2 = i1;
        }
      else
        {
          fprintf (stderr, "parse error in property in '%s'\n",
                   proplist_filename);
          exit (1);
        }

      assert (i1 <= i2 && i2 < 0x110000);
      for (i = i1; i <= i2; i++)
        array[i] = 1;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", proplist_filename);
      exit (1);
    }
}
/* Properties from Unicode 3.0 PropList.txt file.
   Both arrays are indexed by code point and are filled in by
   fill_properties30.  */

/* The paired punctuation property from the PropList.txt file.  */
char unicode_pairedpunctuation[0x110000];

/* The left of pair property from the PropList.txt file.  */
char unicode_leftofpair[0x110000];
3131 fill_properties30 (const char *proplist30_filename
)
3133 fill_property30 (unicode_pairedpunctuation
, proplist30_filename
, "(Paired Punctuation)");
3134 fill_property30 (unicode_leftofpair
, proplist30_filename
, "(Left of Pair)");
3137 /* ------------------------------------------------------------------------- */
3139 /* See PropList.txt, UCD.html. */
3141 is_property_white_space (unsigned int ch
)
3143 return ((unicode_properties
[ch
] & (1ULL << PROP_WHITE_SPACE
)) != 0);
3146 /* See Unicode 3.0 book, section 4.10,
3147 PropList.txt, UCD.html,
3148 DerivedCoreProperties.txt, UCD.html. */
3150 is_property_alphabetic (unsigned int ch
)
3154 || ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_ALPHABETIC
)) != 0)
3155 /* For some reason, the following are listed as having property
3156 Alphabetic but not as having property Other_Alphabetic. */
3157 || (ch
>= 0x16EE && ch
<= 0x16F0) /* RUNIC SYMBOLS */
3158 || (ch
>= 0x2160 && ch
<= 0x2182) /* ROMAN NUMERALS */
3159 || (ch
>= 0x2185 && ch
<= 0x2188) /* ROMAN NUMERALS */
3160 || (ch
>= 0x24D0 && ch
<= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
3161 || (ch
== 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
3162 || (ch
>= 0x3021 && ch
<= 0x3029) /* HANGZHOU NUMERAL */
3163 || (ch
>= 0x3038 && ch
<= 0x303A) /* HANGZHOU NUMERAL */
3164 || (ch
>= 0xA6E6 && ch
<= 0xA6EF) /* BAMUM LETTERS */
3165 || (ch
>= 0x10140 && ch
<= 0x10174) /* GREEK ACROPHONICS */
3166 || (ch
== 0x10341) /* GOTHIC LETTER NINETY */
3167 || (ch
== 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
3168 || (ch
>= 0x103D1 && ch
<= 0x103D5) /* OLD PERSIAN NUMBERS */
3169 || (ch
>= 0x12400 && ch
<= 0x1246E); /* CUNEIFORM NUMERIC SIGNS */
3171 ((unicode_properties
[ch
] & (1ULL << PROP_ALPHABETIC
)) != 0);
3173 assert (result1
== result2
);
3177 /* See PropList.txt, UCD.html. */
3179 is_property_other_alphabetic (unsigned int ch
)
3181 return ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_ALPHABETIC
)) != 0);
3184 /* See PropList.txt, UCD.html. */
3186 is_property_not_a_character (unsigned int ch
)
3188 return ((unicode_properties
[ch
] & (1ULL << PROP_NONCHARACTER_CODE_POINT
)) != 0);
3191 /* See PropList.txt, UCD.html,
3192 DerivedCoreProperties.txt, UCD.html. */
3194 is_property_default_ignorable_code_point (unsigned int ch
)
3197 (is_category_Cf (ch
)
3198 && !(ch
>= 0xFFF9 && ch
<= 0xFFFB) /* Annotations */
3199 && !(ch
>= 0x13430 && ch
<= 0x1343F) /* Egyptian Hieroglyph */
3200 && ((unicode_properties
[ch
] & (1ULL << PROP_PREPENDED_CONCATENATION_MARK
)) == 0))
3201 || ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT
)) != 0)
3202 || ((unicode_properties
[ch
] & (1ULL << PROP_VARIATION_SELECTOR
)) != 0);
3204 ((unicode_properties
[ch
] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT
)) != 0);
3206 assert (result1
== result2
);
3210 /* See PropList.txt, UCD.html. */
3212 is_property_other_default_ignorable_code_point (unsigned int ch
)
3214 return ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT
)) != 0);
3217 /* See PropList.txt, UCD.html. */
3219 is_property_deprecated (unsigned int ch
)
3221 return ((unicode_properties
[ch
] & (1ULL << PROP_DEPRECATED
)) != 0);
3224 /* See PropList.txt, UCD.html. */
3226 is_property_logical_order_exception (unsigned int ch
)
3228 return ((unicode_properties
[ch
] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION
)) != 0);
3231 /* See PropList.txt, UCD.html. */
3233 is_property_variation_selector (unsigned int ch
)
3235 return ((unicode_properties
[ch
] & (1ULL << PROP_VARIATION_SELECTOR
)) != 0);
/* Returns true if CH lies in one of the three private use ranges.
   See PropList-3.0.1.txt.  */
static bool
is_property_private_use (unsigned int ch)
{
  /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt".  */
  if (ch >= 0xE000 && ch <= 0xF8FF)
    return true;
  if (ch >= 0xF0000 && ch <= 0xFFFFD)
    return true;
  if (ch >= 0x100000 && ch <= 0x10FFFD)
    return true;
  return false;
}
/* Returns true if CH has general category Cn and is not a noncharacter
   code point.
   See PropList-3.0.1.txt.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  if (!is_category_Cn (ch))
    return false;
  return !is_property_not_a_character (ch);
}
3255 /* See PropList.txt, UCD.html,
3256 DerivedCoreProperties.txt, UCD.html. */
3258 is_property_uppercase (unsigned int ch
)
3262 || ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_UPPERCASE
)) != 0);
3264 ((unicode_properties
[ch
] & (1ULL << PROP_UPPERCASE
)) != 0);
3266 assert (result1
== result2
);
3270 /* See PropList.txt, UCD.html. */
3272 is_property_other_uppercase (unsigned int ch
)
3274 return ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_UPPERCASE
)) != 0);
3277 /* See PropList.txt, UCD.html,
3278 DerivedCoreProperties.txt, UCD.html. */
3280 is_property_lowercase (unsigned int ch
)
3284 || ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_LOWERCASE
)) != 0);
3286 ((unicode_properties
[ch
] & (1ULL << PROP_LOWERCASE
)) != 0);
3288 assert (result1
== result2
);
3292 /* See PropList.txt, UCD.html. */
3294 is_property_other_lowercase (unsigned int ch
)
3296 return ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_LOWERCASE
)) != 0);
/* Returns true if CH has general category Lt.
   See PropList-3.0.1.txt.  */
static bool
is_property_titlecase (unsigned int ch)
{
  const bool has_category_Lt = is_category_Lt (ch);

  return has_category_Lt;
}
3306 /* See DerivedCoreProperties.txt. */
3308 is_property_cased (unsigned int ch
)
3310 bool result1
= (is_property_lowercase (ch
)
3311 || is_property_uppercase (ch
)
3312 || is_category_Lt (ch
));
3313 bool result2
= ((unicode_properties
[ch
] & (1ULL << PROP_CASED
)) != 0);
3315 assert (result1
== result2
);
3319 /* See DerivedCoreProperties.txt. */
3321 is_property_case_ignorable (unsigned int ch
)
3323 bool result1
= (is_WBP_MIDLETTER (ch
) || is_WBP_MIDNUMLET (ch
)
3325 || is_category_Mn (ch
)
3326 || is_category_Me (ch
)
3327 || is_category_Cf (ch
)
3328 || is_category_Lm (ch
)
3329 || is_category_Sk (ch
));
3330 bool result2
= ((unicode_properties
[ch
] & (1ULL << PROP_CASE_IGNORABLE
)) != 0);
3332 assert (result1
== result2
);
3336 /* See DerivedCoreProperties.txt. */
3338 is_property_changes_when_lowercased (unsigned int ch
)
3340 bool result1
= ((unicode_properties
[ch
] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED
)) != 0);
3341 bool result2
= (unicode_attributes
[ch
].name
!= NULL
3342 && unicode_attributes
[ch
].lower
!= NONE
3343 && unicode_attributes
[ch
].lower
!= ch
);
3345 assert (result1
== result2
);
3349 /* See DerivedCoreProperties.txt. */
3351 is_property_changes_when_uppercased (unsigned int ch
)
3353 return ((unicode_properties
[ch
] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED
)) != 0);
3356 /* See DerivedCoreProperties.txt. */
3358 is_property_changes_when_titlecased (unsigned int ch
)
3360 return ((unicode_properties
[ch
] & (1ULL << PROP_CHANGES_WHEN_TITLECASED
)) != 0);
3363 /* See DerivedCoreProperties.txt. */
3365 is_property_changes_when_casefolded (unsigned int ch
)
3367 return ((unicode_properties
[ch
] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED
)) != 0);
3370 /* See DerivedCoreProperties.txt. */
3372 is_property_changes_when_casemapped (unsigned int ch
)
3374 return ((unicode_properties
[ch
] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED
)) != 0);
3377 /* See PropList.txt, UCD.html. */
3379 is_property_soft_dotted (unsigned int ch
)
3381 return ((unicode_properties
[ch
] & (1ULL << PROP_SOFT_DOTTED
)) != 0);
3384 /* See DerivedCoreProperties.txt, UCD.html. */
3386 is_property_id_start (unsigned int ch
)
3388 return ((unicode_properties
[ch
] & (1ULL << PROP_ID_START
)) != 0);
3391 /* See PropList.txt, UCD.html. */
3393 is_property_other_id_start (unsigned int ch
)
3395 return ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_ID_START
)) != 0);
3398 /* See DerivedCoreProperties.txt, UCD.html. */
3400 is_property_id_continue (unsigned int ch
)
3402 return ((unicode_properties
[ch
] & (1ULL << PROP_ID_CONTINUE
)) != 0);
3405 /* See PropList.txt, UCD.html. */
3407 is_property_other_id_continue (unsigned int ch
)
3409 return ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_ID_CONTINUE
)) != 0);
3412 /* See DerivedCoreProperties.txt, UCD.html. */
3414 is_property_xid_start (unsigned int ch
)
3416 return ((unicode_properties
[ch
] & (1ULL << PROP_XID_START
)) != 0);
3419 /* See DerivedCoreProperties.txt, UCD.html. */
3421 is_property_xid_continue (unsigned int ch
)
3423 return ((unicode_properties
[ch
] & (1ULL << PROP_XID_CONTINUE
)) != 0);
3426 /* See PropList.txt, UCD.html. */
3428 is_property_id_compat_math_start (unsigned int ch
)
3430 return ((unicode_properties
[ch
] & (1ULL << PROP_ID_COMPAT_MATH_START
)) != 0);
3433 /* See PropList.txt, UCD.html. */
3435 is_property_id_compat_math_continue (unsigned int ch
)
3437 return ((unicode_properties
[ch
] & (1ULL << PROP_ID_COMPAT_MATH_CONTINUE
)) != 0);
3440 /* See PropList.txt, UCD.html. */
3442 is_property_pattern_white_space (unsigned int ch
)
3444 return ((unicode_properties
[ch
] & (1ULL << PROP_PATTERN_WHITE_SPACE
)) != 0);
3447 /* See PropList.txt, UCD.html. */
3449 is_property_pattern_syntax (unsigned int ch
)
3451 return ((unicode_properties
[ch
] & (1ULL << PROP_PATTERN_SYNTAX
)) != 0);
3454 /* See PropList.txt, UCD.html. */
3456 is_property_join_control (unsigned int ch
)
3458 return ((unicode_properties
[ch
] & (1ULL << PROP_JOIN_CONTROL
)) != 0);
3461 /* See DerivedCoreProperties.txt, UCD.html. */
3463 is_property_grapheme_base (unsigned int ch
)
3465 return ((unicode_properties
[ch
] & (1ULL << PROP_GRAPHEME_BASE
)) != 0);
3468 /* See DerivedCoreProperties.txt, UCD.html. */
3470 is_property_grapheme_extend (unsigned int ch
)
3472 return ((unicode_properties
[ch
] & (1ULL << PROP_GRAPHEME_EXTEND
)) != 0);
3475 /* See PropList.txt, UCD.html. */
3477 is_property_other_grapheme_extend (unsigned int ch
)
3479 return ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND
)) != 0);
3482 /* See DerivedCoreProperties.txt, UCD.html. */
3484 is_property_grapheme_link (unsigned int ch
)
3486 return ((unicode_properties
[ch
] & (1ULL << PROP_GRAPHEME_LINK
)) != 0);
3489 /* See PropList.txt, UCD.html. */
3491 is_property_modifier_combining_mark (unsigned int ch
)
3493 return ((unicode_properties
[ch
] & (1ULL << PROP_MODIFIER_COMBINING_MARK
)) != 0);
3496 /* See PropList.txt, UCD.html. */
3498 is_property_bidi_control (unsigned int ch
)
3500 return ((unicode_properties
[ch
] & (1ULL << PROP_BIDI_CONTROL
)) != 0);
3503 /* See PropList-3.0.1.txt. */
3505 is_property_bidi_left_to_right (unsigned int ch
)
3507 return (get_bidi_category (ch
) == UC_BIDI_L
);
3510 /* See PropList-3.0.1.txt. */
3512 is_property_bidi_hebrew_right_to_left (unsigned int ch
)
3514 return (get_bidi_category (ch
) == UC_BIDI_R
);
3517 /* See PropList-3.0.1.txt. */
3519 is_property_bidi_arabic_right_to_left (unsigned int ch
)
3521 return (get_bidi_category (ch
) == UC_BIDI_AL
);
3524 /* See PropList-3.0.1.txt. */
3526 is_property_bidi_european_digit (unsigned int ch
)
3528 return (get_bidi_category (ch
) == UC_BIDI_EN
);
3531 /* See PropList-3.0.1.txt. */
3533 is_property_bidi_eur_num_separator (unsigned int ch
)
3535 return (get_bidi_category (ch
) == UC_BIDI_ES
);
3538 /* See PropList-3.0.1.txt. */
3540 is_property_bidi_eur_num_terminator (unsigned int ch
)
3542 return (get_bidi_category (ch
) == UC_BIDI_ET
);
3545 /* See PropList-3.0.1.txt. */
3547 is_property_bidi_arabic_digit (unsigned int ch
)
3549 return (get_bidi_category (ch
) == UC_BIDI_AN
);
3552 /* See PropList-3.0.1.txt. */
3554 is_property_bidi_common_separator (unsigned int ch
)
3556 return (get_bidi_category (ch
) == UC_BIDI_CS
);
3559 /* See PropList-3.0.1.txt. */
3561 is_property_bidi_block_separator (unsigned int ch
)
3563 return (get_bidi_category (ch
) == UC_BIDI_B
);
3566 /* See PropList-3.0.1.txt. */
3568 is_property_bidi_segment_separator (unsigned int ch
)
3570 return (get_bidi_category (ch
) == UC_BIDI_S
);
3573 /* See PropList-3.0.1.txt. */
3575 is_property_bidi_whitespace (unsigned int ch
)
3577 return (get_bidi_category (ch
) == UC_BIDI_WS
);
3580 /* See PropList-3.0.1.txt. */
3582 is_property_bidi_non_spacing_mark (unsigned int ch
)
3584 return (get_bidi_category (ch
) == UC_BIDI_NSM
);
3587 /* See PropList-3.0.1.txt. */
3589 is_property_bidi_boundary_neutral (unsigned int ch
)
3591 return (get_bidi_category (ch
) == UC_BIDI_BN
);
3594 /* See PropList-3.0.1.txt. */
3596 is_property_bidi_pdf (unsigned int ch
)
3598 return (get_bidi_category (ch
) == UC_BIDI_PDF
);
3601 /* See PropList-3.0.1.txt. */
3603 is_property_bidi_embedding_or_override (unsigned int ch
)
3605 int category
= get_bidi_category (ch
);
3606 return (category
== UC_BIDI_LRE
|| category
== UC_BIDI_LRO
3607 || category
== UC_BIDI_RLE
|| category
== UC_BIDI_RLO
);
3610 /* See PropList-3.0.1.txt. */
3612 is_property_bidi_other_neutral (unsigned int ch
)
3614 return (get_bidi_category (ch
) == UC_BIDI_ON
);
3617 /* See PropList.txt, UCD.html. */
3619 is_property_hex_digit (unsigned int ch
)
3621 return ((unicode_properties
[ch
] & (1ULL << PROP_HEX_DIGIT
)) != 0);
3624 /* See PropList.txt, UCD.html. */
3626 is_property_ascii_hex_digit (unsigned int ch
)
3628 return ((unicode_properties
[ch
] & (1ULL << PROP_ASCII_HEX_DIGIT
)) != 0);
3631 /* See Unicode 3.0 book, section 4.10,
3632 PropList.txt, UCD.html. */
3634 is_property_ideographic (unsigned int ch
)
3636 return ((unicode_properties
[ch
] & (1ULL << PROP_IDEOGRAPHIC
)) != 0);
3639 /* See PropList.txt, UCD.html. */
3641 is_property_unified_ideograph (unsigned int ch
)
3643 return ((unicode_properties
[ch
] & (1ULL << PROP_UNIFIED_IDEOGRAPH
)) != 0);
3646 /* See PropList.txt, UCD.html. */
3648 is_property_radical (unsigned int ch
)
3650 return ((unicode_properties
[ch
] & (1ULL << PROP_RADICAL
)) != 0);
3653 /* See PropList.txt, UCD.html. */
3655 is_property_ids_unary_operator (unsigned int ch
)
3657 return ((unicode_properties
[ch
] & (1ULL << PROP_IDS_UNARY_OPERATOR
)) != 0);
3660 /* See PropList.txt, UCD.html. */
3662 is_property_ids_binary_operator (unsigned int ch
)
3664 return ((unicode_properties
[ch
] & (1ULL << PROP_IDS_BINARY_OPERATOR
)) != 0);
3667 /* See PropList.txt, UCD.html. */
3669 is_property_ids_trinary_operator (unsigned int ch
)
3671 return ((unicode_properties
[ch
] & (1ULL << PROP_IDS_TRINARY_OPERATOR
)) != 0);
3674 /* See PropList-3.0.1.txt. */
3676 is_property_zero_width (unsigned int ch
)
3678 return is_category_Cf (ch
)
3679 || (unicode_attributes
[ch
].name
!= NULL
3680 && strstr (unicode_attributes
[ch
].name
, "ZERO WIDTH") != NULL
);
3683 /* See PropList-3.0.1.txt. */
3685 is_property_space (unsigned int ch
)
3687 return is_category_Zs (ch
);
3690 /* See PropList-3.0.1.txt. */
3692 is_property_non_break (unsigned int ch
)
3694 /* This is exactly the set of characters having line breaking property GL. */
3696 return (ch
== 0x00A0 /* NO-BREAK SPACE */
3697 || ch
== 0x034F /* COMBINING GRAPHEME JOINER */
3698 || ch
== 0x035C /* COMBINING DOUBLE BREVE BELOW */
3699 || ch
== 0x035D /* COMBINING DOUBLE BREVE */
3700 || ch
== 0x035E /* COMBINING DOUBLE MACRON */
3701 || ch
== 0x035F /* COMBINING DOUBLE MACRON BELOW */
3702 || ch
== 0x0360 /* COMBINING DOUBLE TILDE */
3703 || ch
== 0x0361 /* COMBINING DOUBLE INVERTED BREVE */
3704 || ch
== 0x0362 /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
3705 || ch
== 0x0F08 /* TIBETAN MARK SBRUL SHAD */
3706 || ch
== 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
3707 || ch
== 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
3708 || ch
== 0x180E /* MONGOLIAN VOWEL SEPARATOR */
3709 || ch
== 0x2007 /* FIGURE SPACE */
3710 || ch
== 0x2011 /* NON-BREAKING HYPHEN */
3711 || ch
== 0x202F /* NARROW NO-BREAK SPACE */);
3714 /* See PropList-3.0.1.txt. */
3716 is_property_iso_control (unsigned int ch
)
3719 (unicode_attributes
[ch
].name
!= NULL
3720 && strcmp (unicode_attributes
[ch
].name
, "<control>") == 0);
3722 is_category_Cc (ch
);
3724 assert (result1
== result2
);
3728 /* See PropList-3.0.1.txt. */
3730 is_property_format_control (unsigned int ch
)
3732 return (is_category_Cf (ch
)
3733 && get_bidi_category (ch
) == UC_BIDI_BN
3734 && !is_property_join_control (ch
)
3738 /* See PropList.txt, UCD.html. */
3740 is_property_prepended_concatenation_mark (unsigned int ch
)
3742 return ((unicode_properties
[ch
] & (1ULL << PROP_PREPENDED_CONCATENATION_MARK
)) != 0);
3745 /* See PropList.txt, UCD.html. */
3747 is_property_dash (unsigned int ch
)
3749 return ((unicode_properties
[ch
] & (1ULL << PROP_DASH
)) != 0);
3752 /* See PropList.txt, UCD.html. */
3754 is_property_hyphen (unsigned int ch
)
3756 return ((unicode_properties
[ch
] & (1ULL << PROP_HYPHEN
)) != 0);
3759 /* See PropList-3.0.1.txt. */
3761 is_property_punctuation (unsigned int ch
)
3763 return is_category_P (ch
);
3766 /* See PropList-3.0.1.txt. */
3768 is_property_line_separator (unsigned int ch
)
3770 return is_category_Zl (ch
);
3773 /* See PropList-3.0.1.txt. */
3775 is_property_paragraph_separator (unsigned int ch
)
3777 return is_category_Zp (ch
);
3780 /* See PropList.txt, UCD.html. */
3782 is_property_quotation_mark (unsigned int ch
)
3784 return ((unicode_properties
[ch
] & (1ULL << PROP_QUOTATION_MARK
)) != 0);
3787 /* See PropList.txt, UCD.html. */
3789 is_property_sentence_terminal (unsigned int ch
)
3791 return ((unicode_properties
[ch
] & (1ULL << PROP_SENTENCE_TERMINAL
)) != 0);
3794 /* See PropList.txt, UCD.html. */
3796 is_property_terminal_punctuation (unsigned int ch
)
3798 return ((unicode_properties
[ch
] & (1ULL << PROP_TERMINAL_PUNCTUATION
)) != 0);
3801 /* See PropList-3.0.1.txt. */
3803 is_property_currency_symbol (unsigned int ch
)
3805 return is_category_Sc (ch
);
3808 /* See Unicode 3.0 book, section 4.9,
3809 PropList.txt, UCD.html,
3810 DerivedCoreProperties.txt, UCD.html. */
3812 is_property_math (unsigned int ch
)
3816 || ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_MATH
)) != 0);
3818 ((unicode_properties
[ch
] & (1ULL << PROP_MATH
)) != 0);
3820 assert (result1
== result2
);
3824 /* See PropList.txt, UCD.html. */
3826 is_property_other_math (unsigned int ch
)
3828 return ((unicode_properties
[ch
] & (1ULL << PROP_OTHER_MATH
)) != 0);
3831 /* See PropList-3.0.1.txt. */
3833 is_property_paired_punctuation (unsigned int ch
)
3835 return unicode_pairedpunctuation
[ch
];
3838 /* See PropList-3.0.1.txt. */
3840 is_property_left_of_pair (unsigned int ch
)
3842 return unicode_leftofpair
[ch
];
3845 /* See PropList-3.0.1.txt. */
3847 is_property_combining (unsigned int ch
)
3849 return (unicode_attributes
[ch
].name
!= NULL
3850 && (strcmp (unicode_attributes
[ch
].combining
, "0") != 0
3851 || is_category_Mc (ch
)
3852 || is_category_Me (ch
)
3853 || is_category_Mn (ch
)));
3856 #if 0 /* same as is_property_bidi_non_spacing_mark */
3857 /* See PropList-3.0.1.txt. */
3859 is_property_non_spacing (unsigned int ch
)
3861 return (unicode_attributes
[ch
].name
!= NULL
3862 && get_bidi_category (ch
) == UC_BIDI_NSM
);
3866 /* See PropList-3.0.1.txt. */
3868 is_property_composite (unsigned int ch
)
3870 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3871 logical in some sense. */
3872 if (ch
>= 0xAC00 && ch
<= 0xD7A4) /* Hangul Syllables */
3874 if (unicode_attributes
[ch
].name
!= NULL
3875 && unicode_attributes
[ch
].decomposition
!= NULL
)
3877 /* Test whether the decomposition contains more than one character,
3878 and the first is not a space. */
3879 const char *decomp
= unicode_attributes
[ch
].decomposition
;
3880 if (decomp
[0] == '<')
3882 decomp
= strchr (decomp
, '>') + 1;
3883 if (decomp
[0] == ' ')
3886 return strchr (decomp
, ' ') != NULL
&& strncmp (decomp
, "0020 ", 5) != 0;
3891 /* See PropList-3.0.1.txt. */
3893 is_property_decimal_digit (unsigned int ch
)
3895 return is_category_Nd (ch
);
3898 /* See PropList-3.0.1.txt. */
3900 is_property_numeric (unsigned int ch
)
3902 return ((get_numeric_value (ch
)).denominator
> 0)
3903 || (ch
== 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3904 || (ch
== 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3907 /* See PropList.txt, UCD.html. */
3909 is_property_diacritic (unsigned int ch
)
3911 return ((unicode_properties
[ch
] & (1ULL << PROP_DIACRITIC
)) != 0);
3914 /* See PropList.txt, UCD.html. */
3916 is_property_extender (unsigned int ch
)
3918 return ((unicode_properties
[ch
] & (1ULL << PROP_EXTENDER
)) != 0);
3921 /* See PropList-3.0.1.txt. */
3923 is_property_ignorable_control (unsigned int ch
)
3925 return ((is_category_Cc (ch
) && get_bidi_category (ch
) == UC_BIDI_BN
)
3926 || is_category_Cf (ch
))
3930 /* See PropList.txt, UCD.html. */
3932 is_property_regional_indicator (unsigned int ch
)
3934 return ((unicode_properties
[ch
] & (1ULL << PROP_REGIONAL_INDICATOR
)) != 0);
3937 /* See emoji-data.txt, UTS #51. */
3939 is_property_emoji (unsigned int ch
)
3941 return ((unicode_properties
[ch
] & (1ULL << PROP_EMOJI
)) != 0);
3944 /* See emoji-data.txt, UTS #51. */
3946 is_property_emoji_presentation (unsigned int ch
)
3948 return ((unicode_properties
[ch
] & (1ULL << PROP_EMOJI_PRESENTATION
)) != 0);
3951 /* See emoji-data.txt, UTS #51. */
3953 is_property_emoji_modifier (unsigned int ch
)
3955 return ((unicode_properties
[ch
] & (1ULL << PROP_EMOJI_MODIFIER
)) != 0);
3958 /* See emoji-data.txt, UTS #51. */
3960 is_property_emoji_modifier_base (unsigned int ch
)
3962 return ((unicode_properties
[ch
] & (1ULL << PROP_EMOJI_MODIFIER_BASE
)) != 0);
3965 /* See emoji-data.txt, UTS #51. */
3967 is_property_emoji_component (unsigned int ch
)
3969 return ((unicode_properties
[ch
] & (1ULL << PROP_EMOJI_COMPONENT
)) != 0);
3972 /* See emoji-data.txt, UTS #51. */
3974 is_property_extended_pictographic (unsigned int ch
)
3976 return ((unicode_properties
[ch
] & (1ULL << PROP_EXTENDED_PICTOGRAPHIC
)) != 0);
3979 /* ------------------------------------------------------------------------- */
3981 /* Output all properties. */
3983 output_properties (const char *version
)
3985 #define PROPERTY(P) \
3986 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3987 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3988 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3989 PROPERTY(white_space
)
3990 PROPERTY(alphabetic
)
3991 PROPERTY(other_alphabetic
)
3992 PROPERTY(not_a_character
)
3993 PROPERTY(default_ignorable_code_point
)
3994 PROPERTY(other_default_ignorable_code_point
)
3995 PROPERTY(deprecated
)
3996 PROPERTY(logical_order_exception
)
3997 PROPERTY(variation_selector
)
3998 PROPERTY(private_use
)
3999 PROPERTY(unassigned_code_value
)
4001 PROPERTY(other_uppercase
)
4003 PROPERTY(other_lowercase
)
4006 PROPERTY(case_ignorable
)
4007 PROPERTY(changes_when_lowercased
)
4008 PROPERTY(changes_when_uppercased
)
4009 PROPERTY(changes_when_titlecased
)
4010 PROPERTY(changes_when_casefolded
)
4011 PROPERTY(changes_when_casemapped
)
4012 PROPERTY(soft_dotted
)
4014 PROPERTY(other_id_start
)
4015 PROPERTY(id_continue
)
4016 PROPERTY(other_id_continue
)
4018 PROPERTY(xid_continue
)
4019 PROPERTY(id_compat_math_start
)
4020 PROPERTY(id_compat_math_continue
)
4021 PROPERTY(pattern_white_space
)
4022 PROPERTY(pattern_syntax
)
4023 PROPERTY(join_control
)
4024 PROPERTY(grapheme_base
)
4025 PROPERTY(grapheme_extend
)
4026 PROPERTY(other_grapheme_extend
)
4027 PROPERTY(grapheme_link
)
4028 PROPERTY(modifier_combining_mark
)
4029 PROPERTY(bidi_control
)
4030 PROPERTY(bidi_left_to_right
)
4031 PROPERTY(bidi_hebrew_right_to_left
)
4032 PROPERTY(bidi_arabic_right_to_left
)
4033 PROPERTY(bidi_european_digit
)
4034 PROPERTY(bidi_eur_num_separator
)
4035 PROPERTY(bidi_eur_num_terminator
)
4036 PROPERTY(bidi_arabic_digit
)
4037 PROPERTY(bidi_common_separator
)
4038 PROPERTY(bidi_block_separator
)
4039 PROPERTY(bidi_segment_separator
)
4040 PROPERTY(bidi_whitespace
)
4041 PROPERTY(bidi_non_spacing_mark
)
4042 PROPERTY(bidi_boundary_neutral
)
4044 PROPERTY(bidi_embedding_or_override
)
4045 PROPERTY(bidi_other_neutral
)
4047 PROPERTY(ascii_hex_digit
)
4048 PROPERTY(ideographic
)
4049 PROPERTY(unified_ideograph
)
4051 PROPERTY(ids_unary_operator
)
4052 PROPERTY(ids_binary_operator
)
4053 PROPERTY(ids_trinary_operator
)
4054 PROPERTY(zero_width
)
4057 PROPERTY(iso_control
)
4058 PROPERTY(format_control
)
4059 PROPERTY(prepended_concatenation_mark
)
4062 PROPERTY(punctuation
)
4063 PROPERTY(line_separator
)
4064 PROPERTY(paragraph_separator
)
4065 PROPERTY(quotation_mark
)
4066 PROPERTY(sentence_terminal
)
4067 PROPERTY(terminal_punctuation
)
4068 PROPERTY(currency_symbol
)
4070 PROPERTY(other_math
)
4071 PROPERTY(paired_punctuation
)
4072 PROPERTY(left_of_pair
)
4075 PROPERTY(decimal_digit
)
4079 PROPERTY(ignorable_control
)
4080 PROPERTY(regional_indicator
)
4082 PROPERTY(emoji_presentation
)
4083 PROPERTY(emoji_modifier
)
4084 PROPERTY(emoji_modifier_base
)
4085 PROPERTY(emoji_component
)
4086 PROPERTY(extended_pictographic
)
4090 /* ------------------------------------------------------------------------- */
4092 /* Convert an Indic_Conjunct_Break value to a C identifier. */
4094 indic_conjunct_break_as_c_identifier (int indic_conjunct_break
)
4096 #define TRY(value) if (indic_conjunct_break == value) return #value;
4097 TRY(UC_INDIC_CONJUNCT_BREAK_NONE
)
4098 TRY(UC_INDIC_CONJUNCT_BREAK_CONSONANT
)
4099 TRY(UC_INDIC_CONJUNCT_BREAK_LINKER
)
4100 TRY(UC_INDIC_CONJUNCT_BREAK_EXTEND
)
4106 output_indic_conjunct_break_test (const char *filename
, const char *version
)
4112 stream
= fopen (filename
, "w");
4115 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
4119 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4120 fprintf (stream
, "/* Indic_Conjunct_Break attribute of Unicode characters. */\n");
4121 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4123 fprintf (stream
, "\n");
4125 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
4126 fprintf (stream
, "\n");
4127 output_tests_license (stream
);
4128 fprintf (stream
, "\n");
4131 for (ch
= 0; ch
< 0x110000; ch
++)
4133 int value
= unicode_indic_conjunct_break
[ch
];
4135 if (value
!= UC_INDIC_CONJUNCT_BREAK_NONE
)
4138 fprintf (stream
, ",\n");
4139 fprintf (stream
, " { 0x%04X, %s }", ch
, indic_conjunct_break_as_c_identifier (value
));
4144 fprintf (stream
, "\n");
4146 if (ferror (stream
) || fclose (stream
))
4148 fprintf (stderr
, "error writing to '%s'\n", filename
);
4153 /* Construction of sparse 3-level tables. */
4154 #define TABLE indic_conjunct_break_table
4155 #define ELEMENT uint8_t
4156 #define DEFAULT UC_INDIC_CONJUNCT_BREAK_NONE
4157 #define xmalloc malloc
4158 #define xrealloc realloc
4162 output_indic_conjunct_break (const char *filename
, const char *version
)
4166 struct indic_conjunct_break_table t
;
4167 unsigned int level1_offset
, level2_offset
, level3_offset
;
4169 stream
= fopen (filename
, "w");
4172 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
4176 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4177 fprintf (stream
, "/* Indic_Conjunct_Break attribute of Unicode characters. */\n");
4178 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4180 fprintf (stream
, "\n");
4182 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
4183 fprintf (stream
, "\n");
4184 output_library_license (stream
, false);
4185 fprintf (stream
, "\n");
4189 indic_conjunct_break_table_init (&t
);
4191 for (ch
= 0; ch
< 0x110000; ch
++)
4193 uint8_t value
= unicode_indic_conjunct_break
[ch
];
4195 assert (value
<= 0x03);
4197 if (value
!= UC_INDIC_CONJUNCT_BREAK_NONE
)
4198 indic_conjunct_break_table_add (&t
, ch
, value
);
4201 indic_conjunct_break_table_finalize (&t
);
4203 /* Offsets in t.result, in memory of this process. */
4205 5 * sizeof (uint32_t);
4207 5 * sizeof (uint32_t)
4208 + t
.level1_size
* sizeof (uint32_t);
4210 5 * sizeof (uint32_t)
4211 + t
.level1_size
* sizeof (uint32_t)
4212 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
4214 for (i
= 0; i
< 5; i
++)
4215 fprintf (stream
, "#define indic_conjunct_break_header_%d %d\n", i
,
4216 ((uint32_t *) t
.result
)[i
]);
4217 fprintf (stream
, "static const\n");
4218 fprintf (stream
, "struct\n");
4219 fprintf (stream
, " {\n");
4220 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
4221 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
4222 fprintf (stream
, " unsigned short level3[%zu * %d];\n", t
.level3_size
,
4223 (1 << t
.p
) * 2 / 16);
4224 fprintf (stream
, " }\n");
4225 fprintf (stream
, "u_indic_conjunct_break =\n");
4226 fprintf (stream
, "{\n");
4227 fprintf (stream
, " {");
4228 if (t
.level1_size
> 8)
4229 fprintf (stream
, "\n ");
4230 for (i
= 0; i
< t
.level1_size
; i
++)
4233 if (i
> 0 && (i
% 8) == 0)
4234 fprintf (stream
, "\n ");
4235 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
4237 fprintf (stream
, " %5d", -1);
4239 fprintf (stream
, " %5zu",
4240 (offset
- level2_offset
) / sizeof (uint32_t));
4241 if (i
+1 < t
.level1_size
)
4242 fprintf (stream
, ",");
4244 if (t
.level1_size
> 8)
4245 fprintf (stream
, "\n ");
4246 fprintf (stream
, " },\n");
4247 fprintf (stream
, " {");
4248 if (t
.level2_size
<< t
.q
> 8)
4249 fprintf (stream
, "\n ");
4250 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
4253 if (i
> 0 && (i
% 8) == 0)
4254 fprintf (stream
, "\n ");
4255 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
4257 fprintf (stream
, " %5d", -1);
4259 fprintf (stream
, " %5zu",
4260 (offset
- level3_offset
) / sizeof (uint8_t));
4261 if (i
+1 < t
.level2_size
<< t
.q
)
4262 fprintf (stream
, ",");
4264 if (t
.level2_size
<< t
.q
> 8)
4265 fprintf (stream
, "\n ");
4266 fprintf (stream
, " },\n");
4267 /* Pack the level3 array. Each entry needs 2 bits only. */
4268 fprintf (stream
, " {");
4269 if ((t
.level3_size
<< t
.p
) * 2 / 16 > 8)
4270 fprintf (stream
, "\n ");
4271 for (i
= 0; i
< (t
.level3_size
<< t
.p
) * 2 / 16; i
++)
4273 if (i
> 0 && (i
% 8) == 0)
4274 fprintf (stream
, "\n ");
4275 fprintf (stream
, " 0x%04x",
4276 (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
] << 0)
4277 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 1] << 2)
4278 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 2] << 4)
4279 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 3] << 6)
4280 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 4] << 8)
4281 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 5] << 10)
4282 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 6] << 12)
4283 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 7] << 14));
4284 if (i
+1 < (t
.level3_size
<< t
.p
) * 2 / 16)
4285 fprintf (stream
, ",");
4287 if ((t
.level3_size
<< t
.p
) * 2 / 16 > 8)
4288 fprintf (stream
, "\n ");
4289 fprintf (stream
, " }\n");
4290 fprintf (stream
, "};\n");
4292 if (ferror (stream
) || fclose (stream
))
4294 fprintf (stderr
, "error writing to '%s'\n", filename
);
4299 /* ========================================================================= */
4301 /* Arabic Shaping. */
4305 UC_JOINING_TYPE_U
, /* Non_Joining */
4306 UC_JOINING_TYPE_T
, /* Transparent */
4307 UC_JOINING_TYPE_C
, /* Join_Causing */
4308 UC_JOINING_TYPE_L
, /* Left_Joining */
4309 UC_JOINING_TYPE_R
, /* Right_Joining */
4310 UC_JOINING_TYPE_D
/* Dual_Joining */
4313 static uint8_t unicode_joining_type
[0x110000];
4317 UC_JOINING_GROUP_NONE
, /* No_Joining_Group */
4318 UC_JOINING_GROUP_AIN
, /* Ain */
4319 UC_JOINING_GROUP_ALAPH
, /* Alaph */
4320 UC_JOINING_GROUP_ALEF
, /* Alef */
4321 UC_JOINING_GROUP_BEH
, /* Beh */
4322 UC_JOINING_GROUP_BETH
, /* Beth */
4323 UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE
, /* Burushaski_Yeh_Barree */
4324 UC_JOINING_GROUP_DAL
, /* Dal */
4325 UC_JOINING_GROUP_DALATH_RISH
, /* Dalath_Rish */
4326 UC_JOINING_GROUP_E
, /* E */
4327 UC_JOINING_GROUP_FARSI_YEH
, /* Farsi_Yeh */
4328 UC_JOINING_GROUP_FE
, /* Fe */
4329 UC_JOINING_GROUP_FEH
, /* Feh */
4330 UC_JOINING_GROUP_FINAL_SEMKATH
, /* Final_Semkath */
4331 UC_JOINING_GROUP_GAF
, /* Gaf */
4332 UC_JOINING_GROUP_GAMAL
, /* Gamal */
4333 UC_JOINING_GROUP_HAH
, /* Hah */
4334 UC_JOINING_GROUP_HE
, /* He */
4335 UC_JOINING_GROUP_HEH
, /* Heh */
4336 UC_JOINING_GROUP_HEH_GOAL
, /* Heh_Goal */
4337 UC_JOINING_GROUP_HETH
, /* Heth */
4338 UC_JOINING_GROUP_KAF
, /* Kaf */
4339 UC_JOINING_GROUP_KAPH
, /* Kaph */
4340 UC_JOINING_GROUP_KHAPH
, /* Khaph */
4341 UC_JOINING_GROUP_KNOTTED_HEH
, /* Knotted_Heh */
4342 UC_JOINING_GROUP_LAM
, /* Lam */
4343 UC_JOINING_GROUP_LAMADH
, /* Lamadh */
4344 UC_JOINING_GROUP_MEEM
, /* Meem */
4345 UC_JOINING_GROUP_MIM
, /* Mim */
4346 UC_JOINING_GROUP_NOON
, /* Noon */
4347 UC_JOINING_GROUP_NUN
, /* Nun */
4348 UC_JOINING_GROUP_NYA
, /* Nya */
4349 UC_JOINING_GROUP_PE
, /* Pe */
4350 UC_JOINING_GROUP_QAF
, /* Qaf */
4351 UC_JOINING_GROUP_QAPH
, /* Qaph */
4352 UC_JOINING_GROUP_REH
, /* Reh */
4353 UC_JOINING_GROUP_REVERSED_PE
, /* Reversed_Pe */
4354 UC_JOINING_GROUP_SAD
, /* Sad */
4355 UC_JOINING_GROUP_SADHE
, /* Sadhe */
4356 UC_JOINING_GROUP_SEEN
, /* Seen */
4357 UC_JOINING_GROUP_SEMKATH
, /* Semkath */
4358 UC_JOINING_GROUP_SHIN
, /* Shin */
4359 UC_JOINING_GROUP_SWASH_KAF
, /* Swash_Kaf */
4360 UC_JOINING_GROUP_SYRIAC_WAW
, /* Syriac_Waw */
4361 UC_JOINING_GROUP_TAH
, /* Tah */
4362 UC_JOINING_GROUP_TAW
, /* Taw */
4363 UC_JOINING_GROUP_TEH_MARBUTA
, /* Teh_Marbuta */
4364 UC_JOINING_GROUP_TEH_MARBUTA_GOAL
, /* Teh_Marbuta_Goal */
4365 UC_JOINING_GROUP_TETH
, /* Teth */
4366 UC_JOINING_GROUP_WAW
, /* Waw */
4367 UC_JOINING_GROUP_YEH
, /* Yeh */
4368 UC_JOINING_GROUP_YEH_BARREE
, /* Yeh_Barree */
4369 UC_JOINING_GROUP_YEH_WITH_TAIL
, /* Yeh_With_Tail */
4370 UC_JOINING_GROUP_YUDH
, /* Yudh */
4371 UC_JOINING_GROUP_YUDH_HE
, /* Yudh_He */
4372 UC_JOINING_GROUP_ZAIN
, /* Zain */
4373 UC_JOINING_GROUP_ZHAIN
, /* Zhain */
4374 UC_JOINING_GROUP_ROHINGYA_YEH
, /* Rohingya_Yeh */
4375 UC_JOINING_GROUP_STRAIGHT_WAW
, /* Straight_Waw */
4376 UC_JOINING_GROUP_MANICHAEAN_ALEPH
, /* Manichaean_Aleph */
4377 UC_JOINING_GROUP_MANICHAEAN_BETH
, /* Manichaean_Beth */
4378 UC_JOINING_GROUP_MANICHAEAN_GIMEL
, /* Manichaean_Gimel */
4379 UC_JOINING_GROUP_MANICHAEAN_DALETH
, /* Manichaean_Daleth */
4380 UC_JOINING_GROUP_MANICHAEAN_WAW
, /* Manichaean_Waw */
4381 UC_JOINING_GROUP_MANICHAEAN_ZAYIN
, /* Manichaean_Zayin */
4382 UC_JOINING_GROUP_MANICHAEAN_HETH
, /* Manichaean_Heth */
4383 UC_JOINING_GROUP_MANICHAEAN_TETH
, /* Manichaean_Teth */
4384 UC_JOINING_GROUP_MANICHAEAN_YODH
, /* Manichaean_Yodh */
4385 UC_JOINING_GROUP_MANICHAEAN_KAPH
, /* Manichaean_Kaph */
4386 UC_JOINING_GROUP_MANICHAEAN_LAMEDH
, /* Manichaean_Lamedh */
4387 UC_JOINING_GROUP_MANICHAEAN_DHAMEDH
, /* Manichaean_Dhamedh */
4388 UC_JOINING_GROUP_MANICHAEAN_THAMEDH
, /* Manichaean_Thamedh */
4389 UC_JOINING_GROUP_MANICHAEAN_MEM
, /* Manichaean_Mem */
4390 UC_JOINING_GROUP_MANICHAEAN_NUN
, /* Manichaean_Nun */
4391 UC_JOINING_GROUP_MANICHAEAN_SAMEKH
, /* Manichaean_Samekh */
4392 UC_JOINING_GROUP_MANICHAEAN_AYIN
, /* Manichaean_Ayin */
4393 UC_JOINING_GROUP_MANICHAEAN_PE
, /* Manichaean_Pe */
4394 UC_JOINING_GROUP_MANICHAEAN_SADHE
, /* Manichaean_Sadhe */
4395 UC_JOINING_GROUP_MANICHAEAN_QOPH
, /* Manichaean_Qoph */
4396 UC_JOINING_GROUP_MANICHAEAN_RESH
, /* Manichaean_Resh */
4397 UC_JOINING_GROUP_MANICHAEAN_TAW
, /* Manichaean_Taw */
4398 UC_JOINING_GROUP_MANICHAEAN_ONE
, /* Manichaean_One */
4399 UC_JOINING_GROUP_MANICHAEAN_FIVE
, /* Manichaean_Five */
4400 UC_JOINING_GROUP_MANICHAEAN_TEN
, /* Manichaean_Ten */
4401 UC_JOINING_GROUP_MANICHAEAN_TWENTY
, /* Manichaean_Twenty */
4402 UC_JOINING_GROUP_MANICHAEAN_HUNDRED
, /* Manichaean_Hundred */
4403 UC_JOINING_GROUP_AFRICAN_FEH
, /* African_Feh */
4404 UC_JOINING_GROUP_AFRICAN_QAF
, /* African_Qaf */
4405 UC_JOINING_GROUP_AFRICAN_NOON
, /* African_Noon */
4406 UC_JOINING_GROUP_MALAYALAM_NGA
, /* Malayalam_Nga */
4407 UC_JOINING_GROUP_MALAYALAM_JA
, /* Malayalam_Ja */
4408 UC_JOINING_GROUP_MALAYALAM_NYA
, /* Malayalam_Nya */
4409 UC_JOINING_GROUP_MALAYALAM_TTA
, /* Malayalam_Tta */
4410 UC_JOINING_GROUP_MALAYALAM_NNA
, /* Malayalam_Nna */
4411 UC_JOINING_GROUP_MALAYALAM_NNNA
, /* Malayalam_Nnna */
4412 UC_JOINING_GROUP_MALAYALAM_BHA
, /* Malayalam_Bha */
4413 UC_JOINING_GROUP_MALAYALAM_RA
, /* Malayalam_Ra */
4414 UC_JOINING_GROUP_MALAYALAM_LLA
, /* Malayalam_Lla */
4415 UC_JOINING_GROUP_MALAYALAM_LLLA
, /* Malayalam_Llla */
4416 UC_JOINING_GROUP_MALAYALAM_SSA
, /* Malayalam_Ssa */
4417 UC_JOINING_GROUP_HANIFI_ROHINGYA_PA
, /* Hanifi_Rohingya_Pa */
4418 UC_JOINING_GROUP_HANIFI_ROHINGYA_KINNA_YA
, /* Hanifi_Rohingya_Kinna_Ya */
4419 UC_JOINING_GROUP_THIN_YEH
, /* Thin_Yeh */
4420 UC_JOINING_GROUP_VERTICAL_TAIL
, /* Vertical_Tail */
4421 UC_JOINING_GROUP_KASHMIRI_YEH
/* Kashmiri_Yeh */
4424 static uint8_t unicode_joining_group
[0x110000];
4427 fill_arabicshaping (const char *arabicshaping_filename
)
4433 stream
= fopen (arabicshaping_filename
, "r");
4436 fprintf (stderr
, "error during fopen of '%s'\n", arabicshaping_filename
);
4440 for (i
= 0; i
< 0x110000; i
++)
4442 unicode_joining_type
[i
] = (uint8_t)~(uint8_t)0;
4443 unicode_joining_group
[i
] = UC_JOINING_GROUP_NONE
;
4450 char separator1
[200+1];
4451 char schematic_name
[200+1];
4452 char separator2
[200+1];
4453 char joining_type_name
[200+1];
4454 char separator3
[200+1];
4455 char joining_group_name
[200+1];
4460 if (fscanf (stream
, "%200[^\n]\n", buf
) < 1)
4463 if (buf
[0] == '\0' || buf
[0] == '#')
4466 if (sscanf (buf
, "%X%[; ]%[^;]%[; ]%[^;]%[; ]%100[^\n]",
4467 &i
, separator1
, schematic_name
, separator2
, joining_type_name
,
4468 separator3
, joining_group_name
) != 7)
4470 fprintf (stderr
, "parse error in '%s':%d\n",
4471 arabicshaping_filename
, lineno
);
4474 assert (i
< 0x110000);
4476 #define TRY(name) else if (strcmp (joining_type_name, #name + 16) == 0) joining_type = name;
4478 TRY(UC_JOINING_TYPE_U
)
4479 TRY(UC_JOINING_TYPE_T
)
4480 TRY(UC_JOINING_TYPE_C
)
4481 TRY(UC_JOINING_TYPE_L
)
4482 TRY(UC_JOINING_TYPE_R
)
4483 TRY(UC_JOINING_TYPE_D
)
4487 fprintf (stderr
, "unknown joining type value \"%s\" in '%s':%d\n",
4488 joining_type_name
, arabicshaping_filename
, lineno
);
4492 /* Remove trailing spaces. */
4493 while (joining_group_name
[0] != '\0'
4494 && joining_group_name
[strlen (joining_group_name
) - 1] == ' ')
4495 joining_group_name
[strlen (joining_group_name
) - 1] = '\0';
4497 #define TRY(value,name) else if (strcmp (joining_group_name, name) == 0) joining_group = value;
4499 TRY(UC_JOINING_GROUP_NONE
, "No_Joining_Group")
4500 TRY(UC_JOINING_GROUP_AIN
, "AIN")
4501 TRY(UC_JOINING_GROUP_ALAPH
, "ALAPH")
4502 TRY(UC_JOINING_GROUP_ALEF
, "ALEF")
4503 TRY(UC_JOINING_GROUP_BEH
, "BEH")
4504 TRY(UC_JOINING_GROUP_BETH
, "BETH")
4505 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE
, "BURUSHASKI YEH BARREE")
4506 TRY(UC_JOINING_GROUP_DAL
, "DAL")
4507 TRY(UC_JOINING_GROUP_DALATH_RISH
, "DALATH RISH")
4508 TRY(UC_JOINING_GROUP_E
, "E")
4509 TRY(UC_JOINING_GROUP_FARSI_YEH
, "FARSI YEH")
4510 TRY(UC_JOINING_GROUP_FE
, "FE")
4511 TRY(UC_JOINING_GROUP_FEH
, "FEH")
4512 TRY(UC_JOINING_GROUP_FINAL_SEMKATH
, "FINAL SEMKATH")
4513 TRY(UC_JOINING_GROUP_GAF
, "GAF")
4514 TRY(UC_JOINING_GROUP_GAMAL
, "GAMAL")
4515 TRY(UC_JOINING_GROUP_HAH
, "HAH")
4516 TRY(UC_JOINING_GROUP_HE
, "HE")
4517 TRY(UC_JOINING_GROUP_HEH
, "HEH")
4518 TRY(UC_JOINING_GROUP_HEH_GOAL
, "HEH GOAL")
4519 TRY(UC_JOINING_GROUP_HETH
, "HETH")
4520 TRY(UC_JOINING_GROUP_KAF
, "KAF")
4521 TRY(UC_JOINING_GROUP_KAPH
, "KAPH")
4522 TRY(UC_JOINING_GROUP_KHAPH
, "KHAPH")
4523 TRY(UC_JOINING_GROUP_KNOTTED_HEH
, "KNOTTED HEH")
4524 TRY(UC_JOINING_GROUP_LAM
, "LAM")
4525 TRY(UC_JOINING_GROUP_LAMADH
, "LAMADH")
4526 TRY(UC_JOINING_GROUP_MEEM
, "MEEM")
4527 TRY(UC_JOINING_GROUP_MIM
, "MIM")
4528 TRY(UC_JOINING_GROUP_NOON
, "NOON")
4529 TRY(UC_JOINING_GROUP_NUN
, "NUN")
4530 TRY(UC_JOINING_GROUP_NYA
, "NYA")
4531 TRY(UC_JOINING_GROUP_PE
, "PE")
4532 TRY(UC_JOINING_GROUP_QAF
, "QAF")
4533 TRY(UC_JOINING_GROUP_QAPH
, "QAPH")
4534 TRY(UC_JOINING_GROUP_REH
, "REH")
4535 TRY(UC_JOINING_GROUP_REVERSED_PE
, "REVERSED PE")
4536 TRY(UC_JOINING_GROUP_SAD
, "SAD")
4537 TRY(UC_JOINING_GROUP_SADHE
, "SADHE")
4538 TRY(UC_JOINING_GROUP_SEEN
, "SEEN")
4539 TRY(UC_JOINING_GROUP_SEMKATH
, "SEMKATH")
4540 TRY(UC_JOINING_GROUP_SHIN
, "SHIN")
4541 TRY(UC_JOINING_GROUP_SWASH_KAF
, "SWASH KAF")
4542 TRY(UC_JOINING_GROUP_SYRIAC_WAW
, "SYRIAC WAW")
4543 TRY(UC_JOINING_GROUP_TAH
, "TAH")
4544 TRY(UC_JOINING_GROUP_TAW
, "TAW")
4545 TRY(UC_JOINING_GROUP_TEH_MARBUTA
, "TEH MARBUTA")
4546 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL
, "TEH MARBUTA GOAL")
4547 TRY(UC_JOINING_GROUP_TETH
, "TETH")
4548 TRY(UC_JOINING_GROUP_WAW
, "WAW")
4549 TRY(UC_JOINING_GROUP_YEH
, "YEH")
4550 TRY(UC_JOINING_GROUP_YEH_BARREE
, "YEH BARREE")
4551 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL
, "YEH WITH TAIL")
4552 TRY(UC_JOINING_GROUP_YUDH
, "YUDH")
4553 TRY(UC_JOINING_GROUP_YUDH_HE
, "YUDH HE")
4554 TRY(UC_JOINING_GROUP_ZAIN
, "ZAIN")
4555 TRY(UC_JOINING_GROUP_ZHAIN
, "ZHAIN")
4556 TRY(UC_JOINING_GROUP_ROHINGYA_YEH
, "ROHINGYA YEH")
4557 TRY(UC_JOINING_GROUP_STRAIGHT_WAW
, "STRAIGHT WAW")
4558 TRY(UC_JOINING_GROUP_MANICHAEAN_ALEPH
, "MANICHAEAN ALEPH")
4559 TRY(UC_JOINING_GROUP_MANICHAEAN_BETH
, "MANICHAEAN BETH")
4560 TRY(UC_JOINING_GROUP_MANICHAEAN_GIMEL
, "MANICHAEAN GIMEL")
4561 TRY(UC_JOINING_GROUP_MANICHAEAN_DALETH
, "MANICHAEAN DALETH")
4562 TRY(UC_JOINING_GROUP_MANICHAEAN_WAW
, "MANICHAEAN WAW")
4563 TRY(UC_JOINING_GROUP_MANICHAEAN_ZAYIN
, "MANICHAEAN ZAYIN")
4564 TRY(UC_JOINING_GROUP_MANICHAEAN_HETH
, "MANICHAEAN HETH")
4565 TRY(UC_JOINING_GROUP_MANICHAEAN_TETH
, "MANICHAEAN TETH")
4566 TRY(UC_JOINING_GROUP_MANICHAEAN_YODH
, "MANICHAEAN YODH")
4567 TRY(UC_JOINING_GROUP_MANICHAEAN_KAPH
, "MANICHAEAN KAPH")
4568 TRY(UC_JOINING_GROUP_MANICHAEAN_LAMEDH
, "MANICHAEAN LAMEDH")
4569 TRY(UC_JOINING_GROUP_MANICHAEAN_DHAMEDH
, "MANICHAEAN DHAMEDH")
4570 TRY(UC_JOINING_GROUP_MANICHAEAN_THAMEDH
, "MANICHAEAN THAMEDH")
4571 TRY(UC_JOINING_GROUP_MANICHAEAN_MEM
, "MANICHAEAN MEM")
4572 TRY(UC_JOINING_GROUP_MANICHAEAN_NUN
, "MANICHAEAN NUN")
4573 TRY(UC_JOINING_GROUP_MANICHAEAN_SAMEKH
, "MANICHAEAN SAMEKH")
4574 TRY(UC_JOINING_GROUP_MANICHAEAN_AYIN
, "MANICHAEAN AYIN")
4575 TRY(UC_JOINING_GROUP_MANICHAEAN_PE
, "MANICHAEAN PE")
4576 TRY(UC_JOINING_GROUP_MANICHAEAN_SADHE
, "MANICHAEAN SADHE")
4577 TRY(UC_JOINING_GROUP_MANICHAEAN_QOPH
, "MANICHAEAN QOPH")
4578 TRY(UC_JOINING_GROUP_MANICHAEAN_RESH
, "MANICHAEAN RESH")
4579 TRY(UC_JOINING_GROUP_MANICHAEAN_TAW
, "MANICHAEAN TAW")
4580 TRY(UC_JOINING_GROUP_MANICHAEAN_ONE
, "MANICHAEAN ONE")
4581 TRY(UC_JOINING_GROUP_MANICHAEAN_FIVE
, "MANICHAEAN FIVE")
4582 TRY(UC_JOINING_GROUP_MANICHAEAN_TEN
, "MANICHAEAN TEN")
4583 TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY
, "MANICHAEAN TWENTY")
4584 TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED
, "MANICHAEAN HUNDRED")
4585 TRY(UC_JOINING_GROUP_AFRICAN_FEH
, "AFRICAN FEH")
4586 TRY(UC_JOINING_GROUP_AFRICAN_QAF
, "AFRICAN QAF")
4587 TRY(UC_JOINING_GROUP_AFRICAN_NOON
, "AFRICAN NOON")
4588 TRY(UC_JOINING_GROUP_MALAYALAM_NGA
, "MALAYALAM NGA")
4589 TRY(UC_JOINING_GROUP_MALAYALAM_JA
, "MALAYALAM JA")
4590 TRY(UC_JOINING_GROUP_MALAYALAM_NYA
, "MALAYALAM NYA")
4591 TRY(UC_JOINING_GROUP_MALAYALAM_TTA
, "MALAYALAM TTA")
4592 TRY(UC_JOINING_GROUP_MALAYALAM_NNA
, "MALAYALAM NNA")
4593 TRY(UC_JOINING_GROUP_MALAYALAM_NNNA
, "MALAYALAM NNNA")
4594 TRY(UC_JOINING_GROUP_MALAYALAM_BHA
, "MALAYALAM BHA")
4595 TRY(UC_JOINING_GROUP_MALAYALAM_RA
, "MALAYALAM RA")
4596 TRY(UC_JOINING_GROUP_MALAYALAM_LLA
, "MALAYALAM LLA")
4597 TRY(UC_JOINING_GROUP_MALAYALAM_LLLA
, "MALAYALAM LLLA")
4598 TRY(UC_JOINING_GROUP_MALAYALAM_SSA
, "MALAYALAM SSA")
4599 TRY(UC_JOINING_GROUP_HANIFI_ROHINGYA_PA
, "HANIFI ROHINGYA PA")
4600 TRY(UC_JOINING_GROUP_HANIFI_ROHINGYA_KINNA_YA
, "HANIFI ROHINGYA KINNA YA")
4601 TRY(UC_JOINING_GROUP_THIN_YEH
, "THIN YEH")
4602 TRY(UC_JOINING_GROUP_VERTICAL_TAIL
, "VERTICAL TAIL")
4603 TRY(UC_JOINING_GROUP_KASHMIRI_YEH
, "KASHMIRI YEH")
4607 fprintf (stderr
, "unknown joining group value \"%s\" in '%s':%d\n",
4608 joining_group_name
, arabicshaping_filename
, lineno
);
4612 unicode_joining_type
[i
] = joining_type
;
4613 unicode_joining_group
[i
] = joining_group
;
4616 if (ferror (stream
) || fclose (stream
))
4618 fprintf (stderr
, "error reading from '%s'\n", arabicshaping_filename
);
4623 /* Convert a Joining_Type value to a C identifier. */
4625 joining_type_as_c_identifier (int joining_type
)
4627 #define TRY(value) if (joining_type == value) return #value;
4628 TRY(UC_JOINING_TYPE_U
)
4629 TRY(UC_JOINING_TYPE_T
)
4630 TRY(UC_JOINING_TYPE_C
)
4631 TRY(UC_JOINING_TYPE_L
)
4632 TRY(UC_JOINING_TYPE_R
)
4633 TRY(UC_JOINING_TYPE_D
)
4639 output_joining_type_test (const char *filename
, const char *version
)
4645 stream
= fopen (filename
, "w");
4648 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
4652 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4653 fprintf (stream
, "/* Arabic joining type of Unicode characters. */\n");
4654 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4656 fprintf (stream
, "\n");
4658 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
4659 fprintf (stream
, "\n");
4660 output_tests_license (stream
);
4661 fprintf (stream
, "\n");
4664 for (ch
= 0; ch
< 0x110000; ch
++)
4666 int value
= unicode_joining_type
[ch
];
4668 if (value
!= (uint8_t)~(uint8_t)0)
4671 fprintf (stream
, ",\n");
4672 fprintf (stream
, " { 0x%04X, %s }", ch
, joining_type_as_c_identifier (value
));
4677 fprintf (stream
, "\n");
4679 if (ferror (stream
) || fclose (stream
))
4681 fprintf (stderr
, "error writing to '%s'\n", filename
);
4686 /* Construction of sparse 3-level tables. */
4687 #define TABLE joining_type_table
4688 #define ELEMENT uint8_t
4689 #define DEFAULT (uint8_t)~(uint8_t)0
4690 #define xmalloc malloc
4691 #define xrealloc realloc
4695 output_joining_type (const char *filename
, const char *version
)
4699 struct joining_type_table t
;
4700 unsigned int level1_offset
, level2_offset
, level3_offset
;
4701 uint8_t *level3_packed
;
4703 stream
= fopen (filename
, "w");
4706 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
4710 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4711 fprintf (stream
, "/* Arabic joining type of Unicode characters. */\n");
4712 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4714 fprintf (stream
, "\n");
4716 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
4717 fprintf (stream
, "\n");
4718 output_library_license (stream
, true);
4719 fprintf (stream
, "\n");
4723 joining_type_table_init (&t
);
4725 for (ch
= 0; ch
< 0x110000; ch
++)
4727 uint8_t value
= unicode_joining_type
[ch
];
4729 assert (value
== (uint8_t)~(uint8_t)0 || value
<= 0x0f);
4731 joining_type_table_add (&t
, ch
, value
);
4734 joining_type_table_finalize (&t
);
4736 /* Offsets in t.result, in memory of this process. */
4738 5 * sizeof (uint32_t);
4740 5 * sizeof (uint32_t)
4741 + t
.level1_size
* sizeof (uint32_t);
4743 5 * sizeof (uint32_t)
4744 + t
.level1_size
* sizeof (uint32_t)
4745 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
4747 for (i
= 0; i
< 5; i
++)
4748 fprintf (stream
, "#define joining_type_header_%d %d\n", i
,
4749 ((uint32_t *) t
.result
)[i
]);
4750 fprintf (stream
, "static const\n");
4751 fprintf (stream
, "struct\n");
4752 fprintf (stream
, " {\n");
4753 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
4754 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
4755 fprintf (stream
, " unsigned char level3[%zu * %d];\n", t
.level3_size
,
4756 (1 << t
.p
) * 4 / 8);
4757 fprintf (stream
, " }\n");
4758 fprintf (stream
, "u_joining_type =\n");
4759 fprintf (stream
, "{\n");
4760 fprintf (stream
, " {");
4761 if (t
.level1_size
> 8)
4762 fprintf (stream
, "\n ");
4763 for (i
= 0; i
< t
.level1_size
; i
++)
4766 if (i
> 0 && (i
% 8) == 0)
4767 fprintf (stream
, "\n ");
4768 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
4770 fprintf (stream
, " %5d", -1);
4772 fprintf (stream
, " %5zu",
4773 (offset
- level2_offset
) / sizeof (uint32_t));
4774 if (i
+1 < t
.level1_size
)
4775 fprintf (stream
, ",");
4777 if (t
.level1_size
> 8)
4778 fprintf (stream
, "\n ");
4779 fprintf (stream
, " },\n");
4780 fprintf (stream
, " {");
4781 if (t
.level2_size
<< t
.q
> 8)
4782 fprintf (stream
, "\n ");
4783 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
4786 if (i
> 0 && (i
% 8) == 0)
4787 fprintf (stream
, "\n ");
4788 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
4790 fprintf (stream
, " %5d", -1);
4792 fprintf (stream
, " %5zu",
4793 (offset
- level3_offset
) / sizeof (uint8_t));
4794 if (i
+1 < t
.level2_size
<< t
.q
)
4795 fprintf (stream
, ",");
4797 if (t
.level2_size
<< t
.q
> 8)
4798 fprintf (stream
, "\n ");
4799 fprintf (stream
, " },\n");
4800 /* Pack the level3 array. Each entry needs 4 bits only. */
4802 (uint8_t *) calloc ((t
.level3_size
<< t
.p
) * 4 / 8, sizeof (uint8_t));
4803 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
4805 unsigned int j
= (i
* 4) / 8;
4806 unsigned int k
= (i
* 4) % 8;
4807 uint32_t value
= ((unsigned char *) (t
.result
+ level3_offset
))[i
] & 0x0f;
4808 level3_packed
[j
] |= (value
<< k
);
4810 fprintf (stream
, " {");
4811 if ((t
.level3_size
<< t
.p
) * 4 / 8 > 8)
4812 fprintf (stream
, "\n ");
4813 for (i
= 0; i
< (t
.level3_size
<< t
.p
) * 4 / 8; i
++)
4815 if (i
> 0 && (i
% 8) == 0)
4816 fprintf (stream
, "\n ");
4817 fprintf (stream
, " 0x%02x", level3_packed
[i
]);
4818 if (i
+1 < (t
.level3_size
<< t
.p
) * 4 / 8)
4819 fprintf (stream
, ",");
4821 if ((t
.level3_size
<< t
.p
) * 4 / 8 > 8)
4822 fprintf (stream
, "\n ");
4823 fprintf (stream
, " }\n");
4824 free (level3_packed
);
4825 fprintf (stream
, "};\n");
4827 if (ferror (stream
) || fclose (stream
))
4829 fprintf (stderr
, "error writing to '%s'\n", filename
);
4834 /* Convert a Joining_Group value to a C identifier. */
4836 joining_group_as_c_identifier (int joining_group
)
4838 #define TRY(value) if (joining_group == value) return #value;
4839 TRY(UC_JOINING_GROUP_NONE
)
4840 TRY(UC_JOINING_GROUP_AIN
)
4841 TRY(UC_JOINING_GROUP_ALAPH
)
4842 TRY(UC_JOINING_GROUP_ALEF
)
4843 TRY(UC_JOINING_GROUP_BEH
)
4844 TRY(UC_JOINING_GROUP_BETH
)
4845 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE
)
4846 TRY(UC_JOINING_GROUP_DAL
)
4847 TRY(UC_JOINING_GROUP_DALATH_RISH
)
4848 TRY(UC_JOINING_GROUP_E
)
4849 TRY(UC_JOINING_GROUP_FARSI_YEH
)
4850 TRY(UC_JOINING_GROUP_FE
)
4851 TRY(UC_JOINING_GROUP_FEH
)
4852 TRY(UC_JOINING_GROUP_FINAL_SEMKATH
)
4853 TRY(UC_JOINING_GROUP_GAF
)
4854 TRY(UC_JOINING_GROUP_GAMAL
)
4855 TRY(UC_JOINING_GROUP_HAH
)
4856 TRY(UC_JOINING_GROUP_HE
)
4857 TRY(UC_JOINING_GROUP_HEH
)
4858 TRY(UC_JOINING_GROUP_HEH_GOAL
)
4859 TRY(UC_JOINING_GROUP_HETH
)
4860 TRY(UC_JOINING_GROUP_KAF
)
4861 TRY(UC_JOINING_GROUP_KAPH
)
4862 TRY(UC_JOINING_GROUP_KHAPH
)
4863 TRY(UC_JOINING_GROUP_KNOTTED_HEH
)
4864 TRY(UC_JOINING_GROUP_LAM
)
4865 TRY(UC_JOINING_GROUP_LAMADH
)
4866 TRY(UC_JOINING_GROUP_MEEM
)
4867 TRY(UC_JOINING_GROUP_MIM
)
4868 TRY(UC_JOINING_GROUP_NOON
)
4869 TRY(UC_JOINING_GROUP_NUN
)
4870 TRY(UC_JOINING_GROUP_NYA
)
4871 TRY(UC_JOINING_GROUP_PE
)
4872 TRY(UC_JOINING_GROUP_QAF
)
4873 TRY(UC_JOINING_GROUP_QAPH
)
4874 TRY(UC_JOINING_GROUP_REH
)
4875 TRY(UC_JOINING_GROUP_REVERSED_PE
)
4876 TRY(UC_JOINING_GROUP_SAD
)
4877 TRY(UC_JOINING_GROUP_SADHE
)
4878 TRY(UC_JOINING_GROUP_SEEN
)
4879 TRY(UC_JOINING_GROUP_SEMKATH
)
4880 TRY(UC_JOINING_GROUP_SHIN
)
4881 TRY(UC_JOINING_GROUP_SWASH_KAF
)
4882 TRY(UC_JOINING_GROUP_SYRIAC_WAW
)
4883 TRY(UC_JOINING_GROUP_TAH
)
4884 TRY(UC_JOINING_GROUP_TAW
)
4885 TRY(UC_JOINING_GROUP_TEH_MARBUTA
)
4886 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL
)
4887 TRY(UC_JOINING_GROUP_TETH
)
4888 TRY(UC_JOINING_GROUP_WAW
)
4889 TRY(UC_JOINING_GROUP_YEH
)
4890 TRY(UC_JOINING_GROUP_YEH_BARREE
)
4891 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL
)
4892 TRY(UC_JOINING_GROUP_YUDH
)
4893 TRY(UC_JOINING_GROUP_YUDH_HE
)
4894 TRY(UC_JOINING_GROUP_ZAIN
)
4895 TRY(UC_JOINING_GROUP_ZHAIN
)
4896 TRY(UC_JOINING_GROUP_ROHINGYA_YEH
)
4897 TRY(UC_JOINING_GROUP_STRAIGHT_WAW
)
4898 TRY(UC_JOINING_GROUP_MANICHAEAN_ALEPH
)
4899 TRY(UC_JOINING_GROUP_MANICHAEAN_BETH
)
4900 TRY(UC_JOINING_GROUP_MANICHAEAN_GIMEL
)
4901 TRY(UC_JOINING_GROUP_MANICHAEAN_DALETH
)
4902 TRY(UC_JOINING_GROUP_MANICHAEAN_WAW
)
4903 TRY(UC_JOINING_GROUP_MANICHAEAN_ZAYIN
)
4904 TRY(UC_JOINING_GROUP_MANICHAEAN_HETH
)
4905 TRY(UC_JOINING_GROUP_MANICHAEAN_TETH
)
4906 TRY(UC_JOINING_GROUP_MANICHAEAN_YODH
)
4907 TRY(UC_JOINING_GROUP_MANICHAEAN_KAPH
)
4908 TRY(UC_JOINING_GROUP_MANICHAEAN_LAMEDH
)
4909 TRY(UC_JOINING_GROUP_MANICHAEAN_DHAMEDH
)
4910 TRY(UC_JOINING_GROUP_MANICHAEAN_THAMEDH
)
4911 TRY(UC_JOINING_GROUP_MANICHAEAN_MEM
)
4912 TRY(UC_JOINING_GROUP_MANICHAEAN_NUN
)
4913 TRY(UC_JOINING_GROUP_MANICHAEAN_SAMEKH
)
4914 TRY(UC_JOINING_GROUP_MANICHAEAN_AYIN
)
4915 TRY(UC_JOINING_GROUP_MANICHAEAN_PE
)
4916 TRY(UC_JOINING_GROUP_MANICHAEAN_SADHE
)
4917 TRY(UC_JOINING_GROUP_MANICHAEAN_QOPH
)
4918 TRY(UC_JOINING_GROUP_MANICHAEAN_RESH
)
4919 TRY(UC_JOINING_GROUP_MANICHAEAN_TAW
)
4920 TRY(UC_JOINING_GROUP_MANICHAEAN_ONE
)
4921 TRY(UC_JOINING_GROUP_MANICHAEAN_FIVE
)
4922 TRY(UC_JOINING_GROUP_MANICHAEAN_TEN
)
4923 TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY
)
4924 TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED
)
4925 TRY(UC_JOINING_GROUP_AFRICAN_FEH
)
4926 TRY(UC_JOINING_GROUP_AFRICAN_QAF
)
4927 TRY(UC_JOINING_GROUP_AFRICAN_NOON
)
4928 TRY(UC_JOINING_GROUP_MALAYALAM_NGA
)
4929 TRY(UC_JOINING_GROUP_MALAYALAM_JA
)
4930 TRY(UC_JOINING_GROUP_MALAYALAM_NYA
)
4931 TRY(UC_JOINING_GROUP_MALAYALAM_TTA
)
4932 TRY(UC_JOINING_GROUP_MALAYALAM_NNA
)
4933 TRY(UC_JOINING_GROUP_MALAYALAM_NNNA
)
4934 TRY(UC_JOINING_GROUP_MALAYALAM_BHA
)
4935 TRY(UC_JOINING_GROUP_MALAYALAM_RA
)
4936 TRY(UC_JOINING_GROUP_MALAYALAM_LLA
)
4937 TRY(UC_JOINING_GROUP_MALAYALAM_LLLA
)
4938 TRY(UC_JOINING_GROUP_MALAYALAM_SSA
)
4939 TRY(UC_JOINING_GROUP_HANIFI_ROHINGYA_PA
)
4940 TRY(UC_JOINING_GROUP_HANIFI_ROHINGYA_KINNA_YA
)
4941 TRY(UC_JOINING_GROUP_THIN_YEH
)
4942 TRY(UC_JOINING_GROUP_VERTICAL_TAIL
)
4943 TRY(UC_JOINING_GROUP_KASHMIRI_YEH
)
4949 output_joining_group_test (const char *filename
, const char *version
)
4955 stream
= fopen (filename
, "w");
4958 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
4962 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4963 fprintf (stream
, "/* Arabic joining group of Unicode characters. */\n");
4964 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4966 fprintf (stream
, "\n");
4968 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
4969 fprintf (stream
, "\n");
4970 output_tests_license (stream
);
4971 fprintf (stream
, "\n");
4974 for (ch
= 0; ch
< 0x110000; ch
++)
4976 int value
= unicode_joining_group
[ch
];
4978 if (value
!= UC_JOINING_GROUP_NONE
)
4981 fprintf (stream
, ",\n");
4982 fprintf (stream
, " { 0x%04X, %s }", ch
, joining_group_as_c_identifier (value
));
4987 fprintf (stream
, "\n");
4989 if (ferror (stream
) || fclose (stream
))
4991 fprintf (stderr
, "error writing to '%s'\n", filename
);
4996 /* Construction of sparse 3-level tables. */
4997 #define TABLE joining_group_table
4998 #define ELEMENT uint8_t
4999 #define DEFAULT UC_JOINING_GROUP_NONE
5000 #define xmalloc malloc
5001 #define xrealloc realloc
5005 output_joining_group (const char *filename
, const char *version
)
5009 struct joining_group_table t
;
5010 unsigned int level1_offset
, level2_offset
, level3_offset
;
5011 uint16_t *level3_packed
;
5013 stream
= fopen (filename
, "w");
5016 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
5020 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5021 fprintf (stream
, "/* Arabic joining group of Unicode characters. */\n");
5022 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5024 fprintf (stream
, "\n");
5026 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
5027 fprintf (stream
, "\n");
5028 output_library_license (stream
, false);
5029 fprintf (stream
, "\n");
5033 joining_group_table_init (&t
);
5035 for (ch
= 0; ch
< 0x110000; ch
++)
5037 uint8_t value
= unicode_joining_group
[ch
];
5039 assert (value
<= 0x7f);
5041 joining_group_table_add (&t
, ch
, value
);
5044 joining_group_table_finalize (&t
);
5046 /* Offsets in t.result, in memory of this process. */
5048 5 * sizeof (uint32_t);
5050 5 * sizeof (uint32_t)
5051 + t
.level1_size
* sizeof (uint32_t);
5053 5 * sizeof (uint32_t)
5054 + t
.level1_size
* sizeof (uint32_t)
5055 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
5057 for (i
= 0; i
< 5; i
++)
5058 fprintf (stream
, "#define joining_group_header_%d %d\n", i
,
5059 ((uint32_t *) t
.result
)[i
]);
5060 fprintf (stream
, "static const\n");
5061 fprintf (stream
, "struct\n");
5062 fprintf (stream
, " {\n");
5063 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
5064 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
5065 fprintf (stream
, " unsigned short level3[%zu * %d + 1];\n", t
.level3_size
,
5066 (1 << t
.p
) * 7 / 16);
5067 fprintf (stream
, " }\n");
5068 fprintf (stream
, "u_joining_group =\n");
5069 fprintf (stream
, "{\n");
5070 fprintf (stream
, " {");
5071 if (t
.level1_size
> 8)
5072 fprintf (stream
, "\n ");
5073 for (i
= 0; i
< t
.level1_size
; i
++)
5076 if (i
> 0 && (i
% 8) == 0)
5077 fprintf (stream
, "\n ");
5078 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
5080 fprintf (stream
, " %5d", -1);
5082 fprintf (stream
, " %5zu",
5083 (offset
- level2_offset
) / sizeof (uint32_t));
5084 if (i
+1 < t
.level1_size
)
5085 fprintf (stream
, ",");
5087 if (t
.level1_size
> 8)
5088 fprintf (stream
, "\n ");
5089 fprintf (stream
, " },\n");
5090 fprintf (stream
, " {");
5091 if (t
.level2_size
<< t
.q
> 8)
5092 fprintf (stream
, "\n ");
5093 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
5096 if (i
> 0 && (i
% 8) == 0)
5097 fprintf (stream
, "\n ");
5098 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
5100 fprintf (stream
, " %5d", -1);
5102 fprintf (stream
, " %5zu",
5103 (offset
- level3_offset
) / sizeof (uint8_t));
5104 if (i
+1 < t
.level2_size
<< t
.q
)
5105 fprintf (stream
, ",");
5107 if (t
.level2_size
<< t
.q
> 8)
5108 fprintf (stream
, "\n ");
5109 fprintf (stream
, " },\n");
5110 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
5111 not 32-bit units, in order to make the lookup function easier. */
5114 calloc ((t
.level3_size
<< t
.p
) * 7 / 16 + 1, sizeof (uint16_t));
5115 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
5117 unsigned int j
= (i
* 7) / 16;
5118 unsigned int k
= (i
* 7) % 16;
5119 uint32_t value
= ((unsigned char *) (t
.result
+ level3_offset
))[i
];
5120 value
= level3_packed
[j
] | (level3_packed
[j
+1] << 16) | (value
<< k
);
5121 level3_packed
[j
] = value
& 0xffff;
5122 level3_packed
[j
+1] = value
>> 16;
5124 fprintf (stream
, " {");
5125 if ((t
.level3_size
<< t
.p
) * 7 / 16 + 1 > 8)
5126 fprintf (stream
, "\n ");
5127 for (i
= 0; i
< (t
.level3_size
<< t
.p
) * 7 / 16 + 1; i
++)
5129 if (i
> 0 && (i
% 8) == 0)
5130 fprintf (stream
, "\n ");
5131 fprintf (stream
, " 0x%04x", level3_packed
[i
]);
5132 if (i
+1 < (t
.level3_size
<< t
.p
) * 7 / 16 + 1)
5133 fprintf (stream
, ",");
5135 if ((t
.level3_size
<< t
.p
) * 7 / 16 + 1 > 8)
5136 fprintf (stream
, "\n ");
5137 fprintf (stream
, " }\n");
5138 free (level3_packed
);
5139 fprintf (stream
, "};\n");
5141 if (ferror (stream
) || fclose (stream
))
5143 fprintf (stderr
, "error writing to '%s'\n", filename
);
5148 /* ========================================================================= */
/* Names of the scripts seen so far, in order of first appearance.  A script
   is identified elsewhere by its index into this array; since that index is
   stored in a uint8_t and the all-ones value is used as the "no script"
   marker, at most 255 entries are usable.  */
static const char *scripts[256];
/* Number of entries of scripts[] that are in use.  */
static unsigned int numscripts;

/* For each Unicode code point, the index of its script in scripts[], or
   (uint8_t)~(uint8_t)0 if the code point has no script assigned.  */
static uint8_t unicode_scripts[0x110000];
5158 fill_scripts (const char *scripts_filename
)
5163 stream
= fopen (scripts_filename
, "r");
5166 fprintf (stderr
, "error during fopen of '%s'\n", scripts_filename
);
5172 for (i
= 0; i
< 0x110000; i
++)
5173 unicode_scripts
[i
] = (uint8_t)~(uint8_t)0;
5178 unsigned int i1
, i2
;
5179 char padding
[200+1];
5180 char scriptname
[200+1];
5183 if (fscanf (stream
, "%200[^\n]\n", buf
) < 1)
5186 if (buf
[0] == '\0' || buf
[0] == '#')
5189 if (sscanf (buf
, "%X..%X%[ ;]%[^ ]", &i1
, &i2
, padding
, scriptname
) != 4)
5191 if (sscanf (buf
, "%X%[ ;]%[^ ]", &i1
, padding
, scriptname
) != 3)
5193 fprintf (stderr
, "parse error in '%s'\n", scripts_filename
);
5199 assert (i2
< 0x110000);
5201 for (script
= numscripts
- 1; script
>= 0; script
--)
5202 if (strcmp (scripts
[script
], scriptname
) == 0)
5206 scripts
[numscripts
] = strdup (scriptname
);
5207 script
= numscripts
;
5209 assert (numscripts
!= 256);
5212 for (i
= i1
; i
<= i2
; i
++)
5214 if (unicode_scripts
[i
] != (uint8_t)~(uint8_t)0)
5215 fprintf (stderr
, "0x%04X belongs to multiple scripts\n", i
);
5216 unicode_scripts
[i
] = script
;
5220 if (ferror (stream
) || fclose (stream
))
5222 fprintf (stderr
, "error reading from '%s'\n", scripts_filename
);
5227 /* Construction of sparse 3-level tables. */
5228 #define TABLE script_table
5229 #define ELEMENT uint8_t
5230 #define DEFAULT (uint8_t)~(uint8_t)0
5231 #define xmalloc malloc
5232 #define xrealloc realloc
5236 output_scripts (const char *version
)
5238 const char *filename
= "unictype/scripts.h";
5240 unsigned int ch
, s
, i
;
5241 struct script_table t
;
5242 unsigned int level1_offset
, level2_offset
, level3_offset
;
5246 const char *lowercase_name
;
5249 scriptinfo_t scriptinfo
[256];
5251 stream
= fopen (filename
, "w");
5254 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
5258 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5259 fprintf (stream
, "/* Unicode scripts. */\n");
5260 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5262 fprintf (stream
, "\n");
5264 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
5265 fprintf (stream
, "\n");
5266 output_library_license (stream
, true);
5267 fprintf (stream
, "\n");
5269 for (s
= 0; s
< numscripts
; s
++)
5271 char *lcp
= strdup (scripts
[s
]);
5274 for (cp
= lcp
; *cp
!= '\0'; cp
++)
5275 if (*cp
>= 'A' && *cp
<= 'Z')
5278 scriptinfo
[s
].lowercase_name
= lcp
;
5281 for (s
= 0; s
< numscripts
; s
++)
5283 fprintf (stream
, "static const uc_interval_t script_%s_intervals[] =\n",
5284 scriptinfo
[s
].lowercase_name
);
5285 fprintf (stream
, "{\n");
5287 for (ch
= 0; ch
< 0x110000; ch
++)
5288 if (unicode_scripts
[ch
] == s
)
5294 while (ch
+ 1 < 0x110000 && unicode_scripts
[ch
+ 1] == s
)
5299 fprintf (stream
, ",\n");
5301 fprintf (stream
, " { 0x%04X, 1, 1 }", start
);
5303 fprintf (stream
, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
5307 fprintf (stream
, "\n");
5308 fprintf (stream
, "};\n");
5311 fprintf (stream
, "static const uc_script_t scripts[%d] =\n", numscripts
);
5312 fprintf (stream
, "{\n");
5313 for (s
= 0; s
< numscripts
; s
++)
5315 fprintf (stream
, " {\n");
5316 fprintf (stream
, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
5317 scriptinfo
[s
].lowercase_name
);
5318 fprintf (stream
, " script_%s_intervals,\n",
5319 scriptinfo
[s
].lowercase_name
);
5320 fprintf (stream
, " \"%s\"\n", scripts
[s
]);
5321 fprintf (stream
, " }");
5322 if (s
+1 < numscripts
)
5323 fprintf (stream
, ",");
5324 fprintf (stream
, "\n");
5326 fprintf (stream
, "};\n");
5330 script_table_init (&t
);
5332 for (ch
= 0; ch
< 0x110000; ch
++)
5334 unsigned int s
= unicode_scripts
[ch
];
5335 if (s
!= (uint8_t)~(uint8_t)0)
5336 script_table_add (&t
, ch
, s
);
5339 script_table_finalize (&t
);
5341 /* Offsets in t.result, in memory of this process. */
5343 5 * sizeof (uint32_t);
5345 5 * sizeof (uint32_t)
5346 + t
.level1_size
* sizeof (uint32_t);
5348 5 * sizeof (uint32_t)
5349 + t
.level1_size
* sizeof (uint32_t)
5350 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
5352 for (i
= 0; i
< 5; i
++)
5353 fprintf (stream
, "#define script_header_%d %d\n", i
,
5354 ((uint32_t *) t
.result
)[i
]);
5355 fprintf (stream
, "static const\n");
5356 fprintf (stream
, "struct\n");
5357 fprintf (stream
, " {\n");
5358 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
5359 fprintf (stream
, " unsigned short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
5360 fprintf (stream
, " unsigned char level3[%zu << %d];\n", t
.level3_size
, t
.p
);
5361 fprintf (stream
, " }\n");
5362 fprintf (stream
, "u_script =\n");
5363 fprintf (stream
, "{\n");
5364 fprintf (stream
, " {");
5365 if (t
.level1_size
> 8)
5366 fprintf (stream
, "\n ");
5367 for (i
= 0; i
< t
.level1_size
; i
++)
5370 if (i
> 0 && (i
% 8) == 0)
5371 fprintf (stream
, "\n ");
5372 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
5374 fprintf (stream
, " %5d", -1);
5376 fprintf (stream
, " %5zu",
5377 (offset
- level2_offset
) / sizeof (uint32_t));
5378 if (i
+1 < t
.level1_size
)
5379 fprintf (stream
, ",");
5381 if (t
.level1_size
> 8)
5382 fprintf (stream
, "\n ");
5383 fprintf (stream
, " },\n");
5384 fprintf (stream
, " {");
5385 if (t
.level2_size
<< t
.q
> 8)
5386 fprintf (stream
, "\n ");
5387 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
5390 if (i
> 0 && (i
% 8) == 0)
5391 fprintf (stream
, "\n ");
5392 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
5393 /* To make the level2 values fit in 16 bits, we use 'unsigned short'
5394 instead of 'short' and add 1 to each value. */
5396 fprintf (stream
, " %5d", -1 + 1);
5398 fprintf (stream
, " %5zu",
5399 (offset
- level3_offset
) / sizeof (uint8_t) + 1);
5400 if (i
+1 < t
.level2_size
<< t
.q
)
5401 fprintf (stream
, ",");
5403 if (t
.level2_size
<< t
.q
> 8)
5404 fprintf (stream
, "\n ");
5405 fprintf (stream
, " },\n");
5406 fprintf (stream
, " {");
5407 if (t
.level3_size
<< t
.p
> 8)
5408 fprintf (stream
, "\n ");
5409 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
5411 if (i
> 0 && (i
% 8) == 0)
5412 fprintf (stream
, "\n ");
5413 fprintf (stream
, " %3d", ((uint8_t *) (t
.result
+ level3_offset
))[i
]);
5414 if (i
+1 < t
.level3_size
<< t
.p
)
5415 fprintf (stream
, ",");
5417 if (t
.level3_size
<< t
.p
> 8)
5418 fprintf (stream
, "\n ");
5419 fprintf (stream
, " }\n");
5420 fprintf (stream
, "};\n");
5422 if (ferror (stream
) || fclose (stream
))
5424 fprintf (stderr
, "error writing to '%s'\n", filename
);
5430 output_scripts_byname (const char *version
)
5432 const char *filename
= "unictype/scripts_byname.gperf";
5436 stream
= fopen (filename
, "w");
5439 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
5443 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5444 fprintf (stream
, "/* Unicode scripts. */\n");
5445 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5447 fprintf (stream
, "\n");
5449 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
5450 fprintf (stream
, "\n");
5451 output_library_license (stream
, true);
5452 fprintf (stream
, "\n");
5454 fprintf (stream
, "struct named_script { int name; unsigned int index; };\n");
5455 fprintf (stream
, "%%struct-type\n");
5456 fprintf (stream
, "%%language=ANSI-C\n");
5457 fprintf (stream
, "%%define hash-function-name scripts_hash\n");
5458 fprintf (stream
, "%%define lookup-function-name uc_script_lookup\n");
5459 fprintf (stream
, "%%readonly-tables\n");
5460 fprintf (stream
, "%%global-table\n");
5461 fprintf (stream
, "%%define word-array-name script_names\n");
5462 fprintf (stream
, "%%pic\n");
5463 fprintf (stream
, "%%define string-pool-name script_stringpool\n");
5464 fprintf (stream
, "%%%%\n");
5465 for (s
= 0; s
< numscripts
; s
++)
5466 fprintf (stream
, "%s, %u\n", scripts
[s
], s
);
5468 if (ferror (stream
) || fclose (stream
))
5470 fprintf (stderr
, "error writing to '%s'\n", filename
);
5475 /* ========================================================================= */
/* Description of one Unicode block: the inclusive code point range
   [start, end] and the block's name.  */
typedef struct { unsigned int start; unsigned int end; const char *name; }
  block_t;
/* The blocks, in ascending code point order.  */
static block_t blocks[384];
/* Number of entries of blocks[] that are in use.  */
static unsigned int numblocks;
5485 fill_blocks (const char *blocks_filename
)
5489 stream
= fopen (blocks_filename
, "r");
5492 fprintf (stderr
, "error during fopen of '%s'\n", blocks_filename
);
5499 unsigned int i1
, i2
;
5500 char padding
[200+1];
5501 char blockname
[200+1];
5503 if (fscanf (stream
, "%200[^\n]\n", buf
) < 1)
5506 if (buf
[0] == '\0' || buf
[0] == '#')
5509 if (sscanf (buf
, "%X..%X%[ ;]%[^\r]", &i1
, &i2
, padding
, blockname
) != 4)
5511 fprintf (stderr
, "parse error in '%s'\n", blocks_filename
);
5514 blocks
[numblocks
].start
= i1
;
5515 blocks
[numblocks
].end
= i2
;
5516 blocks
[numblocks
].name
= strdup (blockname
);
5517 /* It must be sorted. */
5518 assert (numblocks
== 0 || blocks
[numblocks
-1].end
< blocks
[numblocks
].start
);
5520 assert (numblocks
!= SIZEOF (blocks
));
5523 if (ferror (stream
) || fclose (stream
))
5525 fprintf (stderr
, "error reading from '%s'\n", blocks_filename
);
5530 /* Return the smallest block index among the blocks for characters >= ch. */
5532 block_first_index (unsigned int ch
)
5534 /* Binary search. */
5535 unsigned int lo
= 0;
5536 unsigned int hi
= numblocks
;
5538 All blocks[i], i < lo, have blocks[i].end < ch,
5539 all blocks[i], i >= hi, have blocks[i].end >= ch. */
5542 unsigned int mid
= (lo
+ hi
) / 2; /* >= lo, < hi */
5543 if (blocks
[mid
].end
< ch
)
5551 /* Return the largest block index among the blocks for characters <= ch,
5554 block_last_index (unsigned int ch
)
5556 /* Binary search. */
5557 unsigned int lo
= 0;
5558 unsigned int hi
= numblocks
;
5560 All blocks[i], i < lo, have blocks[i].start <= ch,
5561 all blocks[i], i >= hi, have blocks[i].start > ch. */
5564 unsigned int mid
= (lo
+ hi
) / 2; /* >= lo, < hi */
5565 if (blocks
[mid
].start
<= ch
)
5574 output_blocks (const char *version
)
5576 const char *filename
= "unictype/blocks.h";
5577 const unsigned int shift
= 8; /* bits to shift away for array access */
5578 const unsigned int threshold
= 0x28000; /* cut-off table here to save space */
5583 stream
= fopen (filename
, "w");
5586 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
5590 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5591 fprintf (stream
, "/* Unicode blocks. */\n");
5592 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5594 fprintf (stream
, "\n");
5596 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
5597 fprintf (stream
, "\n");
5598 output_library_license (stream
, false);
5599 fprintf (stream
, "\n");
5601 fprintf (stream
, "static const uc_block_t blocks[] =\n");
5602 fprintf (stream
, "{\n");
5603 for (i
= 0; i
< numblocks
; i
++)
5605 fprintf (stream
, " { 0x%04X, 0x%04X, \"%s\" }", blocks
[i
].start
,
5606 blocks
[i
].end
, blocks
[i
].name
);
5607 if (i
+1 < numblocks
)
5608 fprintf (stream
, ",");
5609 fprintf (stream
, "\n");
5611 fprintf (stream
, "};\n");
5612 fprintf (stream
, "#define blocks_level1_shift %d\n", shift
);
5613 fprintf (stream
, "#define blocks_level1_threshold 0x%04X\n", threshold
);
5614 fprintf (stream
, "static const uint16_t blocks_level1[%d * 2] =\n",
5615 threshold
>> shift
);
5616 fprintf (stream
, "{\n");
5617 for (i1
= 0; i1
< (threshold
>> shift
); i1
++)
5619 unsigned int first_index
= block_first_index (i1
<< shift
);
5620 unsigned int last_index
= block_last_index (((i1
+ 1) << shift
) - 1);
5621 fprintf (stream
, " %3d, %3d", first_index
, last_index
);
5622 if (i1
+1 < (threshold
>> shift
))
5623 fprintf (stream
, ",");
5624 fprintf (stream
, "\n");
5626 fprintf (stream
, "};\n");
5627 fprintf (stream
, "#define blocks_upper_first_index %d\n",
5628 block_first_index (threshold
));
5629 fprintf (stream
, "#define blocks_upper_last_index %d\n",
5630 block_last_index (0x10FFFF));
5632 if (ferror (stream
) || fclose (stream
))
5634 fprintf (stderr
, "error writing to '%s'\n", filename
);
5639 /* ========================================================================= */
/* C and Java syntax.  */

/* Classification of a character with respect to identifier syntax.  */
enum
{
  UC_IDENTIFIER_START,    /* valid as first or subsequent character */
  UC_IDENTIFIER_VALID,    /* valid as subsequent character only */
  UC_IDENTIFIER_INVALID,  /* not valid */
  UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */
};
/* ISO C 99 section 6.4.(3).
   Return true if CH is one of the white-space characters of the C source
   character set: space, horizontal tab, new-line (line feed or carriage
   return), vertical tab, or form-feed.  Every other code point yields
   false.  */
static bool
is_c_whitespace (unsigned int ch)
{
  return (ch == ' ' /* space */
          || ch == '\t' /* horizontal tab */
          || ch == '\n' || ch == '\r' /* new-line */
          || ch == '\v' /* vertical tab */
          || ch == '\f'); /* form-feed */
}
5662 /* ISO C 99 section 6.4.2.1 and appendix D. */
5664 c_ident_category (unsigned int ch
)
5666 /* Section 6.4.2.1. */
5667 if (ch
>= '0' && ch
<= '9')
5668 return UC_IDENTIFIER_VALID
;
5669 if ((ch
>= 'A' && ch
<= 'Z') || (ch
>= 'a' && ch
<= 'z') || ch
== '_')
5670 return UC_IDENTIFIER_START
;
5676 || (ch
>= 0x00C0 && ch
<= 0x00D6)
5677 || (ch
>= 0x00D8 && ch
<= 0x00F6)
5678 || (ch
>= 0x00F8 && ch
<= 0x01F5)
5679 || (ch
>= 0x01FA && ch
<= 0x0217)
5680 || (ch
>= 0x0250 && ch
<= 0x02A8)
5681 || (ch
>= 0x1E00 && ch
<= 0x1E9B)
5682 || (ch
>= 0x1EA0 && ch
<= 0x1EF9)
5686 || (ch
>= 0x0388 && ch
<= 0x038A)
5688 || (ch
>= 0x038E && ch
<= 0x03A1)
5689 || (ch
>= 0x03A3 && ch
<= 0x03CE)
5690 || (ch
>= 0x03D0 && ch
<= 0x03D6)
5695 || (ch
>= 0x03E2 && ch
<= 0x03F3)
5696 || (ch
>= 0x1F00 && ch
<= 0x1F15)
5697 || (ch
>= 0x1F18 && ch
<= 0x1F1D)
5698 || (ch
>= 0x1F20 && ch
<= 0x1F45)
5699 || (ch
>= 0x1F48 && ch
<= 0x1F4D)
5700 || (ch
>= 0x1F50 && ch
<= 0x1F57)
5704 || (ch
>= 0x1F5F && ch
<= 0x1F7D)
5705 || (ch
>= 0x1F80 && ch
<= 0x1FB4)
5706 || (ch
>= 0x1FB6 && ch
<= 0x1FBC)
5707 || (ch
>= 0x1FC2 && ch
<= 0x1FC4)
5708 || (ch
>= 0x1FC6 && ch
<= 0x1FCC)
5709 || (ch
>= 0x1FD0 && ch
<= 0x1FD3)
5710 || (ch
>= 0x1FD6 && ch
<= 0x1FDB)
5711 || (ch
>= 0x1FE0 && ch
<= 0x1FEC)
5712 || (ch
>= 0x1FF2 && ch
<= 0x1FF4)
5713 || (ch
>= 0x1FF6 && ch
<= 0x1FFC)
5715 || (ch
>= 0x0401 && ch
<= 0x040C)
5716 || (ch
>= 0x040E && ch
<= 0x044F)
5717 || (ch
>= 0x0451 && ch
<= 0x045C)
5718 || (ch
>= 0x045E && ch
<= 0x0481)
5719 || (ch
>= 0x0490 && ch
<= 0x04C4)
5720 || (ch
>= 0x04C7 && ch
<= 0x04C8)
5721 || (ch
>= 0x04CB && ch
<= 0x04CC)
5722 || (ch
>= 0x04D0 && ch
<= 0x04EB)
5723 || (ch
>= 0x04EE && ch
<= 0x04F5)
5724 || (ch
>= 0x04F8 && ch
<= 0x04F9)
5726 || (ch
>= 0x0531 && ch
<= 0x0556)
5727 || (ch
>= 0x0561 && ch
<= 0x0587)
5729 || (ch
>= 0x05B0 && ch
<= 0x05B9)
5730 || (ch
>= 0x05BB && ch
<= 0x05BD)
5732 || (ch
>= 0x05C1 && ch
<= 0x05C2)
5733 || (ch
>= 0x05D0 && ch
<= 0x05EA)
5734 || (ch
>= 0x05F0 && ch
<= 0x05F2)
5736 || (ch
>= 0x0621 && ch
<= 0x063A)
5737 || (ch
>= 0x0640 && ch
<= 0x0652)
5738 || (ch
>= 0x0670 && ch
<= 0x06B7)
5739 || (ch
>= 0x06BA && ch
<= 0x06BE)
5740 || (ch
>= 0x06C0 && ch
<= 0x06CE)
5741 || (ch
>= 0x06D0 && ch
<= 0x06DC)
5742 || (ch
>= 0x06E5 && ch
<= 0x06E8)
5743 || (ch
>= 0x06EA && ch
<= 0x06ED)
5745 || (ch
>= 0x0901 && ch
<= 0x0903)
5746 || (ch
>= 0x0905 && ch
<= 0x0939)
5747 || (ch
>= 0x093E && ch
<= 0x094D)
5748 || (ch
>= 0x0950 && ch
<= 0x0952)
5749 || (ch
>= 0x0958 && ch
<= 0x0963)
5751 || (ch
>= 0x0981 && ch
<= 0x0983)
5752 || (ch
>= 0x0985 && ch
<= 0x098C)
5753 || (ch
>= 0x098F && ch
<= 0x0990)
5754 || (ch
>= 0x0993 && ch
<= 0x09A8)
5755 || (ch
>= 0x09AA && ch
<= 0x09B0)
5757 || (ch
>= 0x09B6 && ch
<= 0x09B9)
5758 || (ch
>= 0x09BE && ch
<= 0x09C4)
5759 || (ch
>= 0x09C7 && ch
<= 0x09C8)
5760 || (ch
>= 0x09CB && ch
<= 0x09CD)
5761 || (ch
>= 0x09DC && ch
<= 0x09DD)
5762 || (ch
>= 0x09DF && ch
<= 0x09E3)
5763 || (ch
>= 0x09F0 && ch
<= 0x09F1)
5766 || (ch
>= 0x0A05 && ch
<= 0x0A0A)
5767 || (ch
>= 0x0A0F && ch
<= 0x0A10)
5768 || (ch
>= 0x0A13 && ch
<= 0x0A28)
5769 || (ch
>= 0x0A2A && ch
<= 0x0A30)
5770 || (ch
>= 0x0A32 && ch
<= 0x0A33)
5771 || (ch
>= 0x0A35 && ch
<= 0x0A36)
5772 || (ch
>= 0x0A38 && ch
<= 0x0A39)
5773 || (ch
>= 0x0A3E && ch
<= 0x0A42)
5774 || (ch
>= 0x0A47 && ch
<= 0x0A48)
5775 || (ch
>= 0x0A4B && ch
<= 0x0A4D)
5776 || (ch
>= 0x0A59 && ch
<= 0x0A5C)
5780 || (ch
>= 0x0A81 && ch
<= 0x0A83)
5781 || (ch
>= 0x0A85 && ch
<= 0x0A8B)
5783 || (ch
>= 0x0A8F && ch
<= 0x0A91)
5784 || (ch
>= 0x0A93 && ch
<= 0x0AA8)
5785 || (ch
>= 0x0AAA && ch
<= 0x0AB0)
5786 || (ch
>= 0x0AB2 && ch
<= 0x0AB3)
5787 || (ch
>= 0x0AB5 && ch
<= 0x0AB9)
5788 || (ch
>= 0x0ABD && ch
<= 0x0AC5)
5789 || (ch
>= 0x0AC7 && ch
<= 0x0AC9)
5790 || (ch
>= 0x0ACB && ch
<= 0x0ACD)
5794 || (ch
>= 0x0B01 && ch
<= 0x0B03)
5795 || (ch
>= 0x0B05 && ch
<= 0x0B0C)
5796 || (ch
>= 0x0B0F && ch
<= 0x0B10)
5797 || (ch
>= 0x0B13 && ch
<= 0x0B28)
5798 || (ch
>= 0x0B2A && ch
<= 0x0B30)
5799 || (ch
>= 0x0B32 && ch
<= 0x0B33)
5800 || (ch
>= 0x0B36 && ch
<= 0x0B39)
5801 || (ch
>= 0x0B3E && ch
<= 0x0B43)
5802 || (ch
>= 0x0B47 && ch
<= 0x0B48)
5803 || (ch
>= 0x0B4B && ch
<= 0x0B4D)
5804 || (ch
>= 0x0B5C && ch
<= 0x0B5D)
5805 || (ch
>= 0x0B5F && ch
<= 0x0B61)
5807 || (ch
>= 0x0B82 && ch
<= 0x0B83)
5808 || (ch
>= 0x0B85 && ch
<= 0x0B8A)
5809 || (ch
>= 0x0B8E && ch
<= 0x0B90)
5810 || (ch
>= 0x0B92 && ch
<= 0x0B95)
5811 || (ch
>= 0x0B99 && ch
<= 0x0B9A)
5813 || (ch
>= 0x0B9E && ch
<= 0x0B9F)
5814 || (ch
>= 0x0BA3 && ch
<= 0x0BA4)
5815 || (ch
>= 0x0BA8 && ch
<= 0x0BAA)
5816 || (ch
>= 0x0BAE && ch
<= 0x0BB5)
5817 || (ch
>= 0x0BB7 && ch
<= 0x0BB9)
5818 || (ch
>= 0x0BBE && ch
<= 0x0BC2)
5819 || (ch
>= 0x0BC6 && ch
<= 0x0BC8)
5820 || (ch
>= 0x0BCA && ch
<= 0x0BCD)
5822 || (ch
>= 0x0C01 && ch
<= 0x0C03)
5823 || (ch
>= 0x0C05 && ch
<= 0x0C0C)
5824 || (ch
>= 0x0C0E && ch
<= 0x0C10)
5825 || (ch
>= 0x0C12 && ch
<= 0x0C28)
5826 || (ch
>= 0x0C2A && ch
<= 0x0C33)
5827 || (ch
>= 0x0C35 && ch
<= 0x0C39)
5828 || (ch
>= 0x0C3E && ch
<= 0x0C44)
5829 || (ch
>= 0x0C46 && ch
<= 0x0C48)
5830 || (ch
>= 0x0C4A && ch
<= 0x0C4D)
5831 || (ch
>= 0x0C60 && ch
<= 0x0C61)
5833 || (ch
>= 0x0C82 && ch
<= 0x0C83)
5834 || (ch
>= 0x0C85 && ch
<= 0x0C8C)
5835 || (ch
>= 0x0C8E && ch
<= 0x0C90)
5836 || (ch
>= 0x0C92 && ch
<= 0x0CA8)
5837 || (ch
>= 0x0CAA && ch
<= 0x0CB3)
5838 || (ch
>= 0x0CB5 && ch
<= 0x0CB9)
5839 || (ch
>= 0x0CBE && ch
<= 0x0CC4)
5840 || (ch
>= 0x0CC6 && ch
<= 0x0CC8)
5841 || (ch
>= 0x0CCA && ch
<= 0x0CCD)
5843 || (ch
>= 0x0CE0 && ch
<= 0x0CE1)
5845 || (ch
>= 0x0D02 && ch
<= 0x0D03)
5846 || (ch
>= 0x0D05 && ch
<= 0x0D0C)
5847 || (ch
>= 0x0D0E && ch
<= 0x0D10)
5848 || (ch
>= 0x0D12 && ch
<= 0x0D28)
5849 || (ch
>= 0x0D2A && ch
<= 0x0D39)
5850 || (ch
>= 0x0D3E && ch
<= 0x0D43)
5851 || (ch
>= 0x0D46 && ch
<= 0x0D48)
5852 || (ch
>= 0x0D4A && ch
<= 0x0D4D)
5853 || (ch
>= 0x0D60 && ch
<= 0x0D61)
5855 || (ch
>= 0x0E01 && ch
<= 0x0E3A)
5856 || (ch
>= 0x0E40 && ch
<= 0x0E5B)
5858 || (ch
>= 0x0E81 && ch
<= 0x0E82)
5860 || (ch
>= 0x0E87 && ch
<= 0x0E88)
5863 || (ch
>= 0x0E94 && ch
<= 0x0E97)
5864 || (ch
>= 0x0E99 && ch
<= 0x0E9F)
5865 || (ch
>= 0x0EA1 && ch
<= 0x0EA3)
5868 || (ch
>= 0x0EAA && ch
<= 0x0EAB)
5869 || (ch
>= 0x0EAD && ch
<= 0x0EAE)
5870 || (ch
>= 0x0EB0 && ch
<= 0x0EB9)
5871 || (ch
>= 0x0EBB && ch
<= 0x0EBD)
5872 || (ch
>= 0x0EC0 && ch
<= 0x0EC4)
5874 || (ch
>= 0x0EC8 && ch
<= 0x0ECD)
5875 || (ch
>= 0x0EDC && ch
<= 0x0EDD)
5878 || (ch
>= 0x0F18 && ch
<= 0x0F19)
5882 || (ch
>= 0x0F3E && ch
<= 0x0F47)
5883 || (ch
>= 0x0F49 && ch
<= 0x0F69)
5884 || (ch
>= 0x0F71 && ch
<= 0x0F84)
5885 || (ch
>= 0x0F86 && ch
<= 0x0F8B)
5886 || (ch
>= 0x0F90 && ch
<= 0x0F95)
5888 || (ch
>= 0x0F99 && ch
<= 0x0FAD)
5889 || (ch
>= 0x0FB1 && ch
<= 0x0FB7)
5892 || (ch
>= 0x10A0 && ch
<= 0x10C5)
5893 || (ch
>= 0x10D0 && ch
<= 0x10F6)
5895 || (ch
>= 0x3041 && ch
<= 0x3093)
5896 || (ch
>= 0x309B && ch
<= 0x309C)
5898 || (ch
>= 0x30A1 && ch
<= 0x30F6)
5899 || (ch
>= 0x30FB && ch
<= 0x30FC)
5901 || (ch
>= 0x3105 && ch
<= 0x312C)
5902 /* CJK Unified Ideographs */
5903 || (ch
>= 0x4E00 && ch
<= 0x9FA5)
5905 || (ch
>= 0xAC00 && ch
<= 0xD7A3)
5907 || (ch
>= 0x0660 && ch
<= 0x0669)
5908 || (ch
>= 0x06F0 && ch
<= 0x06F9)
5909 || (ch
>= 0x0966 && ch
<= 0x096F)
5910 || (ch
>= 0x09E6 && ch
<= 0x09EF)
5911 || (ch
>= 0x0A66 && ch
<= 0x0A6F)
5912 || (ch
>= 0x0AE6 && ch
<= 0x0AEF)
5913 || (ch
>= 0x0B66 && ch
<= 0x0B6F)
5914 || (ch
>= 0x0BE7 && ch
<= 0x0BEF)
5915 || (ch
>= 0x0C66 && ch
<= 0x0C6F)
5916 || (ch
>= 0x0CE6 && ch
<= 0x0CEF)
5917 || (ch
>= 0x0D66 && ch
<= 0x0D6F)
5918 || (ch
>= 0x0E50 && ch
<= 0x0E59)
5919 || (ch
>= 0x0ED0 && ch
<= 0x0ED9)
5920 || (ch
>= 0x0F20 && ch
<= 0x0F33)
5921 /* Special characters */
5924 || (ch
>= 0x02B0 && ch
<= 0x02B8)
5926 || (ch
>= 0x02BD && ch
<= 0x02C1)
5927 || (ch
>= 0x02D0 && ch
<= 0x02D1)
5928 || (ch
>= 0x02E0 && ch
<= 0x02E4)
5934 || (ch
>= 0x203F && ch
<= 0x2040)
5937 || (ch
>= 0x210A && ch
<= 0x2113)
5939 || (ch
>= 0x2118 && ch
<= 0x211D)
5943 || (ch
>= 0x212A && ch
<= 0x2131)
5944 || (ch
>= 0x2133 && ch
<= 0x2138)
5945 || (ch
>= 0x2160 && ch
<= 0x2182)
5946 || (ch
>= 0x3005 && ch
<= 0x3007)
5947 || (ch
>= 0x3021 && ch
<= 0x3029)
5949 return UC_IDENTIFIER_START
;
5950 return UC_IDENTIFIER_INVALID
;
/* The Java Language Specification, 3rd edition, §3.6.
   https://docs.oracle.com/javase/specs/jls/se6/html/lexical.html#3.6
   Returns true exactly for space, horizontal tab, form feed, line feed
   and carriage return.  */
static bool
is_java_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':
    case '\t':
    case '\f':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
    }
}
5962 /* The Java Language Specification, 3rd edition, §3.8.
5963 https://docs.oracle.com/javase/specs/jls/se6/html/lexical.html#3.8
5964 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
5966 java_ident_category (unsigned int ch
)
5968 /* FIXME: Check this against Sun's JDK implementation. */
5969 if (is_category_L (ch
) /* = Character.isLetter(ch) */
5970 || is_category_Nl (ch
) /* = Character.getType(ch)==LETTER_NUMBER */
5971 || is_category_Sc (ch
) /* currency symbol */
5972 || is_category_Pc (ch
) /* connector punctuation */
5974 return UC_IDENTIFIER_START
;
5975 if (is_category_Nd (ch
) /* digit */
5976 || is_category_Mc (ch
) /* combining mark */
5977 || is_category_Mn (ch
) /* non-spacing mark */
5979 return UC_IDENTIFIER_VALID
;
5980 if ((ch
>= 0x0000 && ch
<= 0x0008)
5981 || (ch
>= 0x000E && ch
<= 0x001B)
5982 || (ch
>= 0x007F && ch
<= 0x009F)
5983 || is_category_Cf (ch
) /* = Character.getType(ch)==FORMAT */
5985 return UC_IDENTIFIER_IGNORABLE
;
5986 return UC_IDENTIFIER_INVALID
;
5989 /* Construction of sparse 3-level tables. */
5990 #define TABLE identsyntax_table
5991 #define ELEMENT uint8_t
5992 #define DEFAULT UC_IDENTIFIER_INVALID
5993 #define xmalloc malloc
5994 #define xrealloc realloc
5997 /* Output an identifier syntax categorization in a three-level bitmap. */
5999 output_ident_category (const char *filename
, int (*predicate
) (unsigned int), const char *name
, const char *version
)
6003 struct identsyntax_table t
;
6004 unsigned int level1_offset
, level2_offset
, level3_offset
;
6006 stream
= fopen (filename
, "w");
6009 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
6013 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6014 fprintf (stream
, "/* Language syntax properties of Unicode characters. */\n");
6015 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
6017 fprintf (stream
, "\n");
6019 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
6020 fprintf (stream
, "\n");
6021 output_library_license (stream
, false);
6022 fprintf (stream
, "\n");
6026 identsyntax_table_init (&t
);
6028 for (ch
= 0; ch
< 0x110000; ch
++)
6030 int syntaxcode
= predicate (ch
);
6032 assert (syntaxcode
<= 0x03);
6034 if (syntaxcode
!= UC_IDENTIFIER_INVALID
)
6035 identsyntax_table_add (&t
, ch
, syntaxcode
);
6038 identsyntax_table_finalize (&t
);
6040 /* Offsets in t.result, in memory of this process. */
6042 5 * sizeof (uint32_t);
6044 5 * sizeof (uint32_t)
6045 + t
.level1_size
* sizeof (uint32_t);
6047 5 * sizeof (uint32_t)
6048 + t
.level1_size
* sizeof (uint32_t)
6049 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
6051 for (i
= 0; i
< 5; i
++)
6052 fprintf (stream
, "#define identsyntax_header_%d %d\n", i
,
6053 ((uint32_t *) t
.result
)[i
]);
6054 fprintf (stream
, "static const\n");
6055 fprintf (stream
, "struct\n");
6056 fprintf (stream
, " {\n");
6057 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
6058 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
6059 fprintf (stream
, " unsigned short level3[%zu * %d];\n", t
.level3_size
,
6060 (1 << t
.p
) * 2 / 16);
6061 fprintf (stream
, " }\n");
6062 fprintf (stream
, "%s =\n", name
);
6063 fprintf (stream
, "{\n");
6064 fprintf (stream
, " {");
6065 if (t
.level1_size
> 8)
6066 fprintf (stream
, "\n ");
6067 for (i
= 0; i
< t
.level1_size
; i
++)
6070 if (i
> 0 && (i
% 8) == 0)
6071 fprintf (stream
, "\n ");
6072 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
6074 fprintf (stream
, " %5d", -1);
6076 fprintf (stream
, " %5zu",
6077 (offset
- level2_offset
) / sizeof (uint32_t));
6078 if (i
+1 < t
.level1_size
)
6079 fprintf (stream
, ",");
6081 if (t
.level1_size
> 8)
6082 fprintf (stream
, "\n ");
6083 fprintf (stream
, " },\n");
6084 fprintf (stream
, " {");
6085 if (t
.level2_size
<< t
.q
> 8)
6086 fprintf (stream
, "\n ");
6087 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
6090 if (i
> 0 && (i
% 8) == 0)
6091 fprintf (stream
, "\n ");
6092 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
6094 fprintf (stream
, " %5d", -1);
6096 fprintf (stream
, " %5zu",
6097 (offset
- level3_offset
) / sizeof (uint8_t));
6098 if (i
+1 < t
.level2_size
<< t
.q
)
6099 fprintf (stream
, ",");
6101 if (t
.level2_size
<< t
.q
> 8)
6102 fprintf (stream
, "\n ");
6103 fprintf (stream
, " },\n");
6104 /* Pack the level3 array. Each entry needs 2 bits only. */
6105 fprintf (stream
, " {");
6106 if ((t
.level3_size
<< t
.p
) * 2 / 16 > 8)
6107 fprintf (stream
, "\n ");
6108 for (i
= 0; i
< (t
.level3_size
<< t
.p
) * 2 / 16; i
++)
6110 if (i
> 0 && (i
% 8) == 0)
6111 fprintf (stream
, "\n ");
6112 fprintf (stream
, " 0x%04x",
6113 (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
] << 0)
6114 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 1] << 2)
6115 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 2] << 4)
6116 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 3] << 6)
6117 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 4] << 8)
6118 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 5] << 10)
6119 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 6] << 12)
6120 | (((uint8_t *) (t
.result
+ level3_offset
))[8 * i
+ 7] << 14));
6121 if (i
+1 < (t
.level3_size
<< t
.p
) * 2 / 16)
6122 fprintf (stream
, ",");
6124 if ((t
.level3_size
<< t
.p
) * 2 / 16 > 8)
6125 fprintf (stream
, "\n ");
6126 fprintf (stream
, " }\n");
6127 fprintf (stream
, "};\n");
6129 if (ferror (stream
) || fclose (stream
))
6131 fprintf (stderr
, "error writing to '%s'\n", filename
);
6137 output_ident_properties (const char *version
)
6139 #define PROPERTY(P) \
6140 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
6141 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
6142 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
6143 PROPERTY(c_whitespace
)
6144 PROPERTY(java_whitespace
)
6147 output_ident_category ("unictype/sy_c_ident.h", c_ident_category
, "u_c_ident", version
);
6148 output_ident_category ("unictype/sy_java_ident.h", java_ident_category
, "u_java_ident", version
);
6151 /* ========================================================================= */
6153 /* Like ISO C <ctype.h> and <wctype.h>. Compatible to glibc's
6154 glibc/localedata/locales/i18n file, generated by
6155 glibc/localedata/gen-unicode-ctype.c. */
6157 /* Character mappings. */
6160 to_upper (unsigned int ch
)
6162 if (unicode_attributes
[ch
].name
!= NULL
6163 && unicode_attributes
[ch
].upper
!= NONE
)
6164 return unicode_attributes
[ch
].upper
;
6170 to_lower (unsigned int ch
)
6172 if (unicode_attributes
[ch
].name
!= NULL
6173 && unicode_attributes
[ch
].lower
!= NONE
)
6174 return unicode_attributes
[ch
].lower
;
6180 to_title (unsigned int ch
)
6182 if (unicode_attributes
[ch
].name
!= NULL
6183 && unicode_attributes
[ch
].title
!= NONE
)
6184 return unicode_attributes
[ch
].title
;
6189 /* Character class properties. */
/* A character counts as uppercase when to_lower changes it.  */
static bool
is_upper (unsigned int ch)
{
  unsigned int lowered = to_lower (ch);

  return lowered != ch;
}
/* A character counts as lowercase when to_upper changes it, with one
   hard-coded addition.  */
static bool
is_lower (unsigned int ch)
{
  /* <U00DF> is lowercase, but without simple to_upper mapping.  */
  if (ch == 0x00DF)
    return true;

  return to_upper (ch) != ch;
}
6206 is_alpha (unsigned int ch
)
6208 return (unicode_attributes
[ch
].name
!= NULL
6209 && ((unicode_attributes
[ch
].category
[0] == 'L'
6210 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
6211 <U0E2F>, <U0E46> should belong to is_punct. */
6212 && (ch
!= 0x0E2F) && (ch
!= 0x0E46))
6213 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
6214 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
6216 || (ch
>= 0x0E34 && ch
<= 0x0E3A)
6217 || (ch
>= 0x0E47 && ch
<= 0x0E4E)
6218 /* Avoid warning for <U0345>. */
6220 /* Avoid warnings for <U2160>..<U217F>. */
6221 || (unicode_attributes
[ch
].category
[0] == 'N'
6222 && unicode_attributes
[ch
].category
[1] == 'l')
6223 /* Avoid warnings for <U24B6>..<U24E9>. */
6224 || (unicode_attributes
[ch
].category
[0] == 'S'
6225 && unicode_attributes
[ch
].category
[1] == 'o'
6226 && strstr (unicode_attributes
[ch
].name
, " LETTER ")
6228 /* Consider all the non-ASCII digits as alphabetic.
6229 ISO C 99 forbids us to have them in category "digit",
6230 but we want iswalnum to return true on them. */
6231 || (unicode_attributes
[ch
].category
[0] == 'N'
6232 && unicode_attributes
[ch
].category
[1] == 'd'
6233 && !(ch
>= 0x0030 && ch
<= 0x0039))));
/* Tells whether CH belongs to the "digit" class, which is restricted to the
   ten ASCII decimal digits.  The category-based alternative is kept under
   '#if 0' for reference only; it must not be active, because is_alpha
   already claims the non-ASCII Nd characters.  */
static bool
is_digit (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'N'
          && unicode_attributes[ch].category[1] == 'd');
  /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
     a zero.  Must add <0> in front of them by hand.  */
#else
  /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
     takes it away:
        The iswdigit function tests for any wide character that corresponds
        to a decimal-digit character (as defined in 5.2.1).
     5.2.1:
        the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
   */
  return (ch >= 0x0030 && ch <= 0x0039);
#endif
}
/* The "alnum" class is the union of the "alpha" and "digit" classes.  */
static bool
is_alnum (unsigned int ch)
{
  if (is_alpha (ch))
    return true;
  return is_digit (ch);
}
6265 is_blank (unsigned int ch
)
6267 return (ch
== 0x0009 /* '\t' */
6268 /* Category Zs without mention of "<noBreak>" */
6269 || (unicode_attributes
[ch
].name
!= NULL
6270 && unicode_attributes
[ch
].category
[0] == 'Z'
6271 && unicode_attributes
[ch
].category
[1] == 's'
6272 && !strstr (unicode_attributes
[ch
].decomposition
, "<noBreak>")));
6276 is_space (unsigned int ch
)
6278 /* Don't make U+00A0 a space. Non-breaking space means that all programs
6279 should treat it like a punctuation character, not like a space. */
6280 return (ch
== 0x0020 /* ' ' */
6281 || ch
== 0x000C /* '\f' */
6282 || ch
== 0x000A /* '\n' */
6283 || ch
== 0x000D /* '\r' */
6284 || ch
== 0x0009 /* '\t' */
6285 || ch
== 0x000B /* '\v' */
6286 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
6287 || (unicode_attributes
[ch
].name
!= NULL
6288 && unicode_attributes
[ch
].category
[0] == 'Z'
6289 && (unicode_attributes
[ch
].category
[1] == 'l'
6290 || unicode_attributes
[ch
].category
[1] == 'p'
6291 || (unicode_attributes
[ch
].category
[1] == 's'
6292 && !strstr (unicode_attributes
[ch
].decomposition
,
6297 is_cntrl (unsigned int ch
)
6299 return (unicode_attributes
[ch
].name
!= NULL
6300 && (strcmp (unicode_attributes
[ch
].name
, "<control>") == 0
6301 /* Categories Zl and Zp */
6302 || (unicode_attributes
[ch
].category
[0] == 'Z'
6303 && (unicode_attributes
[ch
].category
[1] == 'l'
6304 || unicode_attributes
[ch
].category
[1] == 'p'))));
/* Tells whether CH belongs to the "xdigit" class, which is restricted to
   the ASCII characters 0-9, A-F and a-f.  The alternative built on
   is_digit is kept under '#if 0' for reference only.  */
static bool
is_xdigit (unsigned int ch)
{
#if 0
  return is_digit (ch)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#else
  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     takes it away:
        The iswxdigit function tests for any wide character that corresponds
        to a hexadecimal-digit character (as defined in 6.4.4.1).
     6.4.4.1:
        hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
   */
  return (ch >= 0x0030 && ch <= 0x0039)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#endif
}
6330 is_graph (unsigned int ch
)
6332 return (unicode_attributes
[ch
].name
!= NULL
6333 && strcmp (unicode_attributes
[ch
].name
, "<control>")
6338 is_print (unsigned int ch
)
6340 return (unicode_attributes
[ch
].name
!= NULL
6341 && strcmp (unicode_attributes
[ch
].name
, "<control>")
6342 /* Categories Zl and Zp */
6343 && !(unicode_attributes
[ch
].name
!= NULL
6344 && unicode_attributes
[ch
].category
[0] == 'Z'
6345 && (unicode_attributes
[ch
].category
[1] == 'l'
6346 || unicode_attributes
[ch
].category
[1] == 'p')));
/* Tells whether CH belongs to the "punct" class, defined as the "graph"
   characters that are neither "alpha" nor "digit".  The alternative based
   on general category P is kept under '#if 0' for reference only.  */
static bool
is_punct (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'P');
#else
  /* The traditional POSIX definition of punctuation is every graphic,
     non-alphanumeric character.  */
  return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
#endif
}
6362 /* Output all properties. */
6364 output_old_ctype (const char *version
)
6366 #define PROPERTY(P) \
6367 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
6368 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
6369 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
6388 is_combining (unsigned int ch
)
6390 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
6391 file. In 3.0.1 it was identical to the union of the general categories
6392 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
6393 PropList.txt file, so we take the latter definition. */
6394 return (unicode_attributes
[ch
].name
!= NULL
6395 && unicode_attributes
[ch
].category
[0] == 'M'
6396 && (unicode_attributes
[ch
].category
[1] == 'n'
6397 || unicode_attributes
[ch
].category
[1] == 'c'
6398 || unicode_attributes
[ch
].category
[1] == 'e'));
6402 is_combining_level3 (unsigned int ch
)
6404 return is_combining (ch
)
6405 && !(unicode_attributes
[ch
].combining
[0] != '\0'
6406 && unicode_attributes
[ch
].combining
[0] != '0'
6407 && strtoul (unicode_attributes
[ch
].combining
, NULL
, 10) >= 200);
/* Return the UCS symbol string for a Unicode character.
   The result is "<UXXXX>" (four hex digits) for I below 0x10000 and
   "<UXXXXXXXX>" (eight hex digits) otherwise.  It lives in a static buffer
   that the next call overwrites, so callers must copy it before calling
   this function again.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];

  /* snprintf bounds the write by the buffer size; the formats are kept as
     literals so the compiler can check them against the argument.  */
  if (i < 0x10000)
    snprintf (buf, sizeof (buf), "<U%04X>", i);
  else
    snprintf (buf, sizeof (buf), "<U%08X>", i);
  return buf;
}
/* Return the UCS symbol range string for a Unicode characters interval.
   The result is the symbol of LOW, then "..", then the symbol of HIGH.  It
   lives in a static buffer that the next call overwrites.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  static char buf[24+1];
  int used;

  /* ucs_symbol returns its own static buffer, so the two symbols must be
     copied one after the other; fetching both before copying would yield
     the same text twice.  */
  used = snprintf (buf, sizeof (buf), "%s..", ucs_symbol (low));
  if (used < 0)
    {
      /* Formatting failed; start from an empty string.  */
      buf[0] = '\0';
      used = 0;
    }
  if ((size_t) used < sizeof (buf))
    snprintf (buf + used, sizeof (buf) - (size_t) used, "%s",
              ucs_symbol (high));
  return buf;
}
6432 /* Output a character class (= property) table. */
6435 output_charclass (FILE *stream
, const char *classname
,
6436 bool (*func
) (unsigned int))
6438 char table
[0x110000];
6440 bool need_semicolon
;
6441 const int max_column
= 75;
6444 for (i
= 0; i
< 0x110000; i
++)
6445 table
[i
] = (int) func (i
);
6447 fprintf (stream
, "%s ", classname
);
6448 need_semicolon
= false;
6450 for (i
= 0; i
< 0x110000; )
6456 unsigned int low
, high
;
6462 while (i
< 0x110000 && table
[i
]);
6466 strcpy (buf
, ucs_symbol (low
));
6468 strcpy (buf
, ucs_symbol_range (low
, high
));
6472 fprintf (stream
, ";");
6476 if (column
+ strlen (buf
) > max_column
)
6478 fprintf (stream
, "/\n ");
6482 fprintf (stream
, "%s", buf
);
6483 column
+= strlen (buf
);
6484 need_semicolon
= true;
6487 fprintf (stream
, "\n");
6490 /* Output a character mapping table. */
6493 output_charmap (FILE *stream
, const char *mapname
,
6494 unsigned int (*func
) (unsigned int))
6496 char table
[0x110000];
6498 bool need_semicolon
;
6499 const int max_column
= 75;
6502 for (i
= 0; i
< 0x110000; i
++)
6503 table
[i
] = (func (i
) != i
);
6505 fprintf (stream
, "%s ", mapname
);
6506 need_semicolon
= false;
6508 for (i
= 0; i
< 0x110000; i
++)
6514 strcat (buf
, ucs_symbol (i
));
6516 strcat (buf
, ucs_symbol (func (i
)));
6521 fprintf (stream
, ";");
6525 if (column
+ strlen (buf
) > max_column
)
6527 fprintf (stream
, "/\n ");
6531 fprintf (stream
, "%s", buf
);
6532 column
+= strlen (buf
);
6533 need_semicolon
= true;
6535 fprintf (stream
, "\n");
6538 /* Output the width table. */
6541 output_widthmap (FILE *stream
)
6545 /* Output the tables to the given file. */
6548 output_tables (const char *filename
, const char *version
)
6553 stream
= fopen (filename
, "w");
6556 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
6560 fprintf (stream
, "escape_char /\n");
6561 fprintf (stream
, "comment_char %%\n");
6562 fprintf (stream
, "\n");
6563 fprintf (stream
, "%% Generated automatically by gen-uni-tables.c for Unicode %s.\n",
6565 fprintf (stream
, "\n");
6567 fprintf (stream
, "LC_IDENTIFICATION\n");
6568 fprintf (stream
, "title \"Unicode %s FDCC-set\"\n", version
);
6569 fprintf (stream
, "source \"UnicodeData.txt, PropList.txt\"\n");
6570 fprintf (stream
, "address \"\"\n");
6571 fprintf (stream
, "contact \"\"\n");
6572 fprintf (stream
, "email \"bug-glibc@gnu.org\"\n");
6573 fprintf (stream
, "tel \"\"\n");
6574 fprintf (stream
, "fax \"\"\n");
6575 fprintf (stream
, "language \"\"\n");
6576 fprintf (stream
, "territory \"Earth\"\n");
6577 fprintf (stream
, "revision \"%s\"\n", version
);
6582 strftime (date
, sizeof (date
), "%Y-%m-%d", gmtime (&now
));
6583 fprintf (stream
, "date \"%s\"\n", date
);
6585 fprintf (stream
, "category \"unicode:2001\";LC_CTYPE\n");
6586 fprintf (stream
, "END LC_IDENTIFICATION\n");
6587 fprintf (stream
, "\n");
6590 for (ch
= 0; ch
< 0x110000; ch
++)
6592 /* toupper restriction: "Only characters specified for the keywords
6593 lower and upper shall be specified. */
6594 if (to_upper (ch
) != ch
&& !(is_lower (ch
) || is_upper (ch
)))
6596 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
6597 ucs_symbol (ch
), ch
, to_upper (ch
));
6599 /* tolower restriction: "Only characters specified for the keywords
6600 lower and upper shall be specified. */
6601 if (to_lower (ch
) != ch
&& !(is_lower (ch
) || is_upper (ch
)))
6603 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
6604 ucs_symbol (ch
), ch
, to_lower (ch
));
6606 /* alpha restriction: "Characters classified as either upper or lower
6607 shall automatically belong to this class. */
6608 if ((is_lower (ch
) || is_upper (ch
)) && !is_alpha (ch
))
6609 fprintf (stderr
, "%s is upper|lower but not alpha\n", ucs_symbol (ch
));
6611 /* alpha restriction: "No character specified for the keywords cntrl,
6612 digit, punct or space shall be specified." */
6613 if (is_alpha (ch
) && is_cntrl (ch
))
6614 fprintf (stderr
, "%s is alpha and cntrl\n", ucs_symbol (ch
));
6615 if (is_alpha (ch
) && is_digit (ch
))
6616 fprintf (stderr
, "%s is alpha and digit\n", ucs_symbol (ch
));
6617 if (is_alpha (ch
) && is_punct (ch
))
6618 fprintf (stderr
, "%s is alpha and punct\n", ucs_symbol (ch
));
6619 if (is_alpha (ch
) && is_space (ch
))
6620 fprintf (stderr
, "%s is alpha and space\n", ucs_symbol (ch
));
6622 /* space restriction: "No character specified for the keywords upper,
6623 lower, alpha, digit, graph or xdigit shall be specified."
6624 upper, lower, alpha already checked above. */
6625 if (is_space (ch
) && is_digit (ch
))
6626 fprintf (stderr
, "%s is space and digit\n", ucs_symbol (ch
));
6627 if (is_space (ch
) && is_graph (ch
))
6628 fprintf (stderr
, "%s is space and graph\n", ucs_symbol (ch
));
6629 if (is_space (ch
) && is_xdigit (ch
))
6630 fprintf (stderr
, "%s is space and xdigit\n", ucs_symbol (ch
));
6632 /* cntrl restriction: "No character specified for the keywords upper,
6633 lower, alpha, digit, punct, graph, print or xdigit shall be
6634 specified." upper, lower, alpha already checked above. */
6635 if (is_cntrl (ch
) && is_digit (ch
))
6636 fprintf (stderr
, "%s is cntrl and digit\n", ucs_symbol (ch
));
6637 if (is_cntrl (ch
) && is_punct (ch
))
6638 fprintf (stderr
, "%s is cntrl and punct\n", ucs_symbol (ch
));
6639 if (is_cntrl (ch
) && is_graph (ch
))
6640 fprintf (stderr
, "%s is cntrl and graph\n", ucs_symbol (ch
));
6641 if (is_cntrl (ch
) && is_print (ch
))
6642 fprintf (stderr
, "%s is cntrl and print\n", ucs_symbol (ch
));
6643 if (is_cntrl (ch
) && is_xdigit (ch
))
6644 fprintf (stderr
, "%s is cntrl and xdigit\n", ucs_symbol (ch
));
6646 /* punct restriction: "No character specified for the keywords upper,
6647 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
6648 be specified." upper, lower, alpha, cntrl already checked above. */
6649 if (is_punct (ch
) && is_digit (ch
))
6650 fprintf (stderr
, "%s is punct and digit\n", ucs_symbol (ch
));
6651 if (is_punct (ch
) && is_xdigit (ch
))
6652 fprintf (stderr
, "%s is punct and xdigit\n", ucs_symbol (ch
));
6653 if (is_punct (ch
) && (ch
== 0x0020))
6654 fprintf (stderr
, "%s is punct\n", ucs_symbol (ch
));
6656 /* graph restriction: "No character specified for the keyword cntrl
6657 shall be specified." Already checked above. */
6659 /* print restriction: "No character specified for the keyword cntrl
6660 shall be specified." Already checked above. */
6662 /* graph - print relation: differ only in the <space> character.
6663 How is this possible if there are more than one space character?!
6664 I think susv2/xbd/locale.html should speak of "space characters",
6665 not "space character". */
6666 if (is_print (ch
) && !(is_graph (ch
) || /* ch == 0x0020 */ is_space (ch
)))
6668 "%s is print but not graph|<space>\n", ucs_symbol (ch
));
6669 if (!is_print (ch
) && (is_graph (ch
) || ch
== 0x0020))
6671 "%s is graph|<space> but not print\n", ucs_symbol (ch
));
6674 fprintf (stream
, "LC_CTYPE\n");
6675 output_charclass (stream
, "upper", is_upper
);
6676 output_charclass (stream
, "lower", is_lower
);
6677 output_charclass (stream
, "alpha", is_alpha
);
6678 output_charclass (stream
, "digit", is_digit
);
6679 output_charclass (stream
, "outdigit", is_outdigit
);
6680 output_charclass (stream
, "blank", is_blank
);
6681 output_charclass (stream
, "space", is_space
);
6682 output_charclass (stream
, "cntrl", is_cntrl
);
6683 output_charclass (stream
, "punct", is_punct
);
6684 output_charclass (stream
, "xdigit", is_xdigit
);
6685 output_charclass (stream
, "graph", is_graph
);
6686 output_charclass (stream
, "print", is_print
);
6687 output_charclass (stream
, "class \"combining\";", is_combining
);
6688 output_charclass (stream
, "class \"combining_level3\";", is_combining_level3
);
6689 output_charmap (stream
, "toupper", to_upper
);
6690 output_charmap (stream
, "tolower", to_lower
);
6691 output_charmap (stream
, "map \"totitle\";", to_title
);
6692 output_widthmap (stream
);
6693 fprintf (stream
, "END LC_CTYPE\n");
6695 if (ferror (stream
) || fclose (stream
))
6697 fprintf (stderr
, "error writing to '%s'\n", filename
);
6704 /* ========================================================================= */
6706 /* The width property from the EastAsianWidth.txt file.
6707 Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */
6708 const char * unicode_width
[0x110000];
6710 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
6713 fill_width (const char *width_filename
)
6717 char field0
[FIELDLEN
];
6718 char field1
[FIELDLEN
];
6719 char field2
[FIELDLEN
];
6722 for (i
= 0; i
< 0x110000; i
++)
6723 unicode_width
[i
] = (unicode_attributes
[i
].name
!= NULL
? "N" : NULL
);
6725 stream
= fopen (width_filename
, "r");
6728 fprintf (stderr
, "error during fopen of '%s'\n", width_filename
);
6745 do c
= getc (stream
); while (c
!= EOF
&& c
!= '\n');
6749 n
= getfield (stream
, field0
, ';');
6750 do c
= getc (stream
); while (c
== ' ');
6752 n
+= getfield (stream
, field1
, '#');
6753 n
+= getfield (stream
, field2
, '\n');
6758 fprintf (stderr
, "short line in '%s':%d\n", width_filename
, lineno
);
6761 /* Remove trailing spaces from field0. */
6762 while (strlen (field0
) > 0 && field0
[strlen (field0
) - 1] == ' ')
6763 field0
[strlen (field0
) - 1] = '\0';
6764 /* Remove trailing spaces from field1. */
6765 while (strlen (field1
) > 0 && field1
[strlen (field1
) - 1] == ' ')
6766 field1
[strlen (field1
) - 1] = '\0';
6767 i
= strtoul (field0
, NULL
, 16);
6768 if (strstr (field0
, "..") != NULL
)
6770 /* Deal with a range. */
6771 j
= strtoul (strstr (field0
, "..") + 2, NULL
, 16);
6773 unicode_width
[i
] = strdup (field1
);
6777 /* Single character line. */
6778 unicode_width
[i
] = strdup (field1
);
6782 if (ferror (stream
) || fclose (stream
))
6784 fprintf (stderr
, "error reading from '%s'\n", width_filename
);
6789 /* ========================================================================= */
6791 /* Non-spacing attribute and width. */
6793 /* The non-spacing attribute table consists of:
6794 * Non-spacing characters; generated from PropList.txt or
6795 "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
6796 * Format control characters, except for characters with property
6797 Prepended_Concatenation_Mark; generated from
6798 "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt" and from
6799 "grep Prepended_Concatenation_Mark PropList.txt".
6800 Rationale for the Prepended_Concatenation_Mark exception:
6801 The Unicode standard says "Unlike most other format characters,
6802 however, they should be rendered with a visible glyph".
6803 * Zero width characters; generated from
6804 "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
6805 * Hangul Jamo characters that have conjoining behaviour:
6806 - jungseong = syllable-middle vowels
6807 - jongseong = syllable-final consonants
6809 1) These characters act like combining characters. They have no
6810 equivalent in legacy character sets. Therefore the EastAsianWidth.txt
6811 file does not really matter for them; UAX #11 East Asian Width
6812 <https://www.unicode.org/reports/tr11/> makes it clear that it focus
6813 is on compatibility with traditional Japanese layout.
6814 By contrast, the same glyphs without conjoining behaviour are available
6815 in the U+3130..U+318F block, and these characters are mapped to legacy
6816 character sets, and traditional Japanese layout matters for them.
6817 2) glibc does the same thing, see
6818 <https://sourceware.org/bugzilla/show_bug.cgi?id=21750>
6819 <https://sourceware.org/bugzilla/show_bug.cgi?id=26120>
6823 is_nonspacing (unsigned int ch
)
6825 return (unicode_attributes
[ch
].name
!= NULL
6826 && (get_bidi_category (ch
) == UC_BIDI_NSM
6827 || is_category_Cc (ch
)
6828 || (is_category_Cf (ch
)
6829 && !is_property_prepended_concatenation_mark (ch
))
6830 || strncmp (unicode_attributes
[ch
].name
, "ZERO WIDTH ", 11) == 0
6831 || (ch
>= 0x1160 && ch
<= 0x11A7) || (ch
>= 0xD7B0 && ch
<= 0xD7C6) /* jungseong */
6832 || (ch
>= 0x11A8 && ch
<= 0x11FF) || (ch
>= 0xD7CB && ch
<= 0xD7FB) /* jongseong */
6837 output_nonspacing_property (const char *filename
, const char *version
)
6840 int ind
[0x110000 / 0x200];
6845 stream
= fopen (filename
, "w");
6848 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
6852 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6853 fprintf (stream
, "/* Table of non-spacing or control characters. */\n");
6854 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
6856 fprintf (stream
, "\n");
6858 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
6859 fprintf (stream
, "\n");
6860 output_library_license (stream
, true);
6861 fprintf (stream
, "\n");
6864 for (i
= 0; i
< 0x110000 / 0x200; i
++)
6866 bool nontrivial
= false;
6869 if (i
!= 0xe0000 / 0x200) /* The 0xe0000 block is handled by code. */
6870 for (ch
= i
* 0x200; ch
< (i
+ 1) * 0x200; ch
++)
6871 if (is_nonspacing (ch
))
6877 ind
[i
] = next_ind
++;
6882 fprintf (stream
, "static const unsigned char nonspacing_table_data[%d*64] = {\n",
6885 for (i
= 0; i
< 0x110000 / 0x200; i
++)
6887 bool nontrivial
= (ind
[i
] >= 0);
6893 fprintf (stream
, " /* 0x%04x-0x%04x */\n", i
* 0x200, (i
+ 1) * 0x200 - 1);
6894 for (j
= 0; j
< 8; j
++)
6898 fprintf (stream
, " ");
6899 for (k
= 0; k
< 8; k
++)
6902 unsigned char bits
= 0;
6904 for (l
= 0; l
< 8; l
++)
6906 unsigned int ch
= i
* 0x200 + j
* 0x40 + k
* 8 + l
;
6908 if (is_nonspacing (ch
))
6911 fprintf (stream
, " 0x%02x%c", bits
,
6912 ind
[i
] + 1 == next_ind
&& j
== 8 - 1 && k
== 8 - 1 ? ' ' : ',');
6914 fprintf (stream
, " /* 0x%04x-0x%04x */\n",
6915 i
* 0x200 + j
* 0x40, i
* 0x200 + (j
+ 1) * 0x40 - 1);
6920 fprintf (stream
, "};\n");
6922 i_max
= ((i_max
+ 8 - 1) / 8) * 8;
6923 fprintf (stream
, "static const signed char nonspacing_table_ind[%u] = {\n",
6928 for (j
= 0; j
< i_max
/ 8; j
++)
6932 fprintf (stream
, " ");
6933 for (k
= 0; k
< 8; k
++)
6936 fprintf (stream
, " %2d%c", ind
[i
],
6937 j
== i_max
/ 8 - 1 && k
== 8 - 1 ? ' ' : ',');
6939 fprintf (stream
, " /* 0x%04x-0x%04x */\n",
6940 j
* 8 * 0x200, (j
+ 1) * 8 * 0x200 - 1);
6943 fprintf (stream
, "};\n");
6945 if (ferror (stream
) || fclose (stream
))
6947 fprintf (stderr
, "error writing to '%s'\n", filename
);
/* is_width2 (ch): hard-coded range test on the code point CH.
   The result is the value of one large disjunction of inclusive ranges and
   single code points; a few ranges additionally carve out individual code
   points or sub-ranges with "&& !(...)" / "&& ch != ...".
   output_width2_property passes this function to output_predicate. */
6952 /* Determines whether a character has width 2, regardless of context.
6953 Generated from "grep '^[^;]\+;[WF]' EastAsianWidth.txt"
6954 and "grep '^[^;]\+;[^WF]' EastAsianWidth.txt"
6957 is_width2 (unsigned int ch
)
6959 return ((ch
>= 0x1100 && ch
<= 0x115F) /* Hangul Jamo */
6960 || (ch
>= 0x231A && ch
<= 0x231B) /* Watch, Hourglass */
6961 || (ch
>= 0x2329 && ch
<= 0x232A) /* Angle Brackets */
6962 || (ch
>= 0x23E9 && ch
<= 0x23EC) /* Black double triangles */
6963 || ch
== 0x23F0 /* Alarm clock */
6964 || ch
== 0x23F3 /* Hourglass */
6965 || (ch
>= 0x25FD && ch
<= 0x25FE) /* Medium small squares */
6966 /* Miscellaneous symbols, dingbats */
6967 || (ch
>= 0x2614 && ch
<= 0x2615)
6968 || (ch
>= 0x2648 && ch
<= 0x2653)
6972 || (ch
>= 0x26AA && ch
<= 0x26AB)
6973 || (ch
>= 0x26BD && ch
<= 0x26BE)
6974 || (ch
>= 0x26C4 && ch
<= 0x26C5)
6978 || (ch
>= 0x26F2 && ch
<= 0x26F3)
6983 || (ch
>= 0x270A && ch
<= 0x270B)
6987 || (ch
>= 0x2753 && ch
<= 0x2755)
6989 || (ch
>= 0x2795 && ch
<= 0x2797)
6992 || (ch
>= 0x2B1B && ch
<= 0x2B1C) /* Large squares */
6995 || (ch
>= 0x2E80 && ch
<= 0xA4CF /* CJK ... Yi */
/* The two sub-ranges U+3248..U+324F and U+4DC0..U+4DFF are excluded from
   the CJK ... Yi span above. */
6997 && !(ch
>= 0x3248 && ch
<= 0x324F)
6998 && !(ch
>= 0x4DC0 && ch
<= 0x4DFF))
6999 || (ch
>= 0xA960 && ch
<= 0xA97C) /* Hangul Jamo Extended-A */
7000 || (ch
>= 0xAC00 && ch
<= 0xD7A3) /* Hangul Syllables */
7001 || (ch
>= 0xF900 && ch
<= 0xFAFF) /* CJK Compatibility Ideographs */
7002 || (ch
>= 0xFE10 && ch
<= 0xFE1F) /* Presentation Forms for Vertical */
7003 || (ch
>= 0xFE30 && ch
<= 0xFE6F) /* CJK Compatibility Forms */
7004 || (ch
>= 0xFF00 && ch
<= 0xFF60) /* Fullwidth Forms */
7005 || (ch
>= 0xFFE0 && ch
<= 0xFFE6) /* Fullwidth Signs */
7006 || (ch
>= 0x16FE0 && ch
<= 0x16FE3) /* Tangut mark, Nushu mark */
7007 || (ch
>= 0x16FF0 && ch
<= 0x16FF1) /* Vietnamese alternate reading marks */
7008 || (ch
>= 0x17000 && ch
<= 0x187F7) /* Tangut */
7009 || (ch
>= 0x18800 && ch
<= 0x18CD5) /* Tangut components */
7010 || (ch
>= 0x18D00 && ch
<= 0x18D08) /* Tangut Ideograph Supplement */
7011 || ((ch
>= 0x1AFF0 && ch
<= 0x1AFFE) /* Katakana letter Minnan */
7012 && ch
!= 0x1AFF4 && ch
!= 0x1AFFC)
7013 || (ch
>= 0x1B000 && ch
<= 0x1B122) /* Kana supplement, Kana Extended-A */
7014 || (ch
>= 0x1B150 && ch
<= 0x1B152) /* Small Hiragana */
7015 || (ch
>= 0x1B164 && ch
<= 0x1B167) /* Small Katakana */
7016 || (ch
>= 0x1B170 && ch
<= 0x1B2FB) /* Nushu */
7020 || (ch
>= 0x1F191 && ch
<= 0x1F19A)
7021 /* Miscellaneous symbols and pictographs */
7022 || (ch
>= 0x1F200 && ch
<= 0x1F320)
7023 || (ch
>= 0x1F32D && ch
<= 0x1F335)
7024 || (ch
>= 0x1F337 && ch
<= 0x1F37C)
7025 || (ch
>= 0x1F37E && ch
<= 0x1F393)
7026 || (ch
>= 0x1F3A0 && ch
<= 0x1F3CA)
7027 || (ch
>= 0x1F3CF && ch
<= 0x1F3D3)
7028 || (ch
>= 0x1F3E0 && ch
<= 0x1F3F0)
7030 || (ch
>= 0x1F3F8 && ch
<= 0x1F43E)
7032 || (ch
>= 0x1F442 && ch
<= 0x1F4FC)
7033 || (ch
>= 0x1F4FF && ch
<= 0x1F53D)
7034 || (ch
>= 0x1F54B && ch
<= 0x1F54E)
7035 || (ch
>= 0x1F550 && ch
<= 0x1F567)
7037 || (ch
>= 0x1F595 && ch
<= 0x1F596)
7039 || (ch
>= 0x1F5FB && ch
<= 0x1F64F)
7040 || (ch
>= 0x1F680 && ch
<= 0x1F6C5)
7042 || (ch
>= 0x1F6D0 && ch
<= 0x1F6D2)
7043 || (ch
>= 0x1F6D5 && ch
<= 0x1F6D7)
7044 || (ch
>= 0x1F6DD && ch
<= 0x1F6DF)
7045 || (ch
>= 0x1F6EB && ch
<= 0x1F6EC)
7046 || (ch
>= 0x1F6F4 && ch
<= 0x1F6FC)
7047 || (ch
>= 0x1F7E0 && ch
<= 0x1F7EB)
7049 || ((ch
>= 0x1F90C && ch
<= 0x1F9FF)
7050 && ch
!= 0x1F93B && ch
!= 0x1F946)
7051 || (ch
>= 0x1FA70 && ch
<= 0x1FA74)
7052 || (ch
>= 0x1FA78 && ch
<= 0x1FA7C)
7053 || (ch
>= 0x1FA80 && ch
<= 0x1FA86)
7054 || (ch
>= 0x1FA90 && ch
<= 0x1FAAC)
7055 || (ch
>= 0x1FAB0 && ch
<= 0x1FABA)
7056 || (ch
>= 0x1FAC0 && ch
<= 0x1FAC5)
7057 || (ch
>= 0x1FAD0 && ch
<= 0x1FAD9)
7058 || (ch
>= 0x1FAE0 && ch
<= 0x1FAE7)
7059 || (ch
>= 0x1FAF0 && ch
<= 0x1FAF6)
7060 || (ch
>= 0x20000 && ch
<= 0x2FFFF) /* Supplementary Ideographic Plane */
7061 || (ch
>= 0x30000 && ch
<= 0x3FFFF) /* Tertiary Ideographic Plane */
/* Emits the width-2 table: hands FILENAME, the is_width2 predicate, the
   name "u_width2", the description "Width 2 property" and VERSION to
   output_predicate. */
7066 output_width2_property (const char *filename
, const char *version
)
7068 output_predicate (filename
, is_width2
, "u_width2", "Width 2 property", version
);
7071 /* Returns the width of ch as one of 0, '0', '1', '2', 'A'.
   The tests appear in this order: unassigned code points (with the
   Private Use area U+E000..U+F8FF and the CJK ideograph blocks and planes
   singled out), Cc control characters below U+00A0, nonspacing characters,
   a unicode_width[ch] entry of "W" or "F", a unicode_width[ch] entry of "H",
   and finally the range U+00A1..U+FFFF.
   unicode_width[ch] may be NULL, hence the explicit NULL checks before
   each strcmp. */
7073 symbolic_width (unsigned int ch
)
7075 /* Test for unassigned character. */
7076 if (is_property_unassigned_code_value (ch
))
7078 /* Unicode TR#11 section "Unassigned and Private-Use Characters". */
7079 if (ch
>= 0xE000 && ch
<= 0xF8FF) /* Private Use */
7081 if ((ch
>= 0x4E00 && ch
<= 0x9FFF) /* CJK Unified Ideographs block */
7082 || (ch
>= 0x3400 && ch
<= 0x4DBF) /* CJK Unified Ideographs Extension A block */
7083 || (ch
>= 0xF900 && ch
<= 0xFAFF) /* CJK Compatibility Ideographs block */
7084 || (ch
>= 0x20000 && ch
<= 0x2FFFF) /* Supplementary Ideographic Plane */
7085 || (ch
>= 0x30000 && ch
<= 0x3FFFF) /* Tertiary Ideographic Plane */)
7091 /* Test for non-spacing or control character. */
7092 if (is_category_Cc (ch
) && ch
< 0x00A0)
7094 if (is_nonspacing (ch
))
7096 /* Test for double-width character. */
7097 if (unicode_width
[ch
] != NULL
7098 && (strcmp (unicode_width
[ch
], "W") == 0
7099 || strcmp (unicode_width
[ch
], "F") == 0))
7101 /* Test for half-width character. */
7102 if (unicode_width
[ch
] != NULL
7103 && strcmp (unicode_width
[ch
], "H") == 0)
7106 /* In ancient CJK encodings, Cyrillic and most other characters are
7107 double-width as well. */
7108 if (ch
>= 0x00A1 && ch
< 0x10000)
/* Writes a plain-text listing of character widths to FILENAME.
   Every code point below 0x110000 is classified with symbolic_width; code
   points whose value is 0 are skipped.  Consecutive entries with the same
   value are merged into one interval, which is printed as
   "XXXX<TAB><TAB>c" when it holds a single code point and as
   "XXXX..YYYY<TAB>c" otherwise (hexadecimal, at least four digits).
   Failure to open the file and failure to write or close it are both
   reported on stderr. */
7114 output_width_property_test (const char *filename
)
7117 unsigned int interval_start
, interval_end
, ch
;
7118 char interval_value
;
7120 stream
= fopen (filename
, "w");
7123 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
7128 interval_start
= interval_end
= 0; /* avoid GCC warning */
7129 for (ch
= 0; ch
< 0x110000; ch
++)
7131 char value
= symbolic_width (ch
);
7132 if (value
!= 0) /* skip Cc control characters and unassigned characters */
7134 if (value
== interval_value
)
7135 /* Extend the interval. */
7139 /* Terminate the interval. */
7140 if (interval_value
!= 0)
7142 if (interval_end
== interval_start
)
7143 fprintf (stream
, "%04X\t\t%c\n", interval_start
, interval_value
);
7145 fprintf (stream
, "%04X..%04X\t%c\n", interval_start
, interval_end
, interval_value
);
7147 /* Start a new interval. */
7148 interval_start
= interval_end
= ch
;
7149 interval_value
= value
;
7153 /* Terminate the last interval. */
7154 if (interval_value
!= 0)
7156 if (interval_end
== interval_start
)
7157 fprintf (stream
, "%04X\t\t%c\n", interval_start
, interval_value
);
7159 fprintf (stream
, "%04X..%04X\t%c\n", interval_start
, interval_end
, interval_value
);
7162 if (ferror (stream
) || fclose (stream
))
7164 fprintf (stderr
, "error writing to '%s'\n", filename
);
7169 /* ========================================================================= */
7171 /* Line breaking classification.
7172 Updated for Unicode TR #14 revision 53. */
/* Line breaking property values.
   The enumerators are listed by category, not by numeric value: the
   values 0..40 and 41..50 are interleaved in the list (note LBP_HL = 31
   sitting between LBP_H3 = 25 and LBP_ID1 = 26), and 100..104 are grouped
   at the end.  get_lbp uses these constants as shift counts in
   (int64_t) 1 << LBP_xx, so only values below 64 can serve as bit
   positions.  NOTE(review): the shorthands >= 100 presumably have to be
   expanded into their numbered variants before being shifted - confirm. */
7176 /* Values >= 41 are resolved at run time. */
7177 /* Values >= 100 are shorthands for several values. */
7178 LBP_BK
= 41, /* mandatory break */
7179 LBP_CR
= 42, /* carriage return */
7180 LBP_LF
= 43, /* line feed */
7181 LBP_CM
= 44, /* attached characters and combining marks */
7182 /*LBP_NL, next line - not used here because it's equivalent to LBP_BK */
7183 /*LBP_SG, surrogates - not used here because they are not characters */
7184 LBP_WJ
= 0, /* word joiner */
7185 LBP_ZW
= 45, /* zero width space */
7186 LBP_GL
= 1, /* non-breaking (glue) */
7187 LBP_SP
= 46, /* space */
7188 LBP_B2
= 2, /* break opportunity before and after */
7189 LBP_BA
= 3, /* break opportunity after */
7190 LBP_BB
= 4, /* break opportunity before */
7191 LBP_HY
= 5, /* hyphen */
7192 LBP_CB
= 47, /* contingent break opportunity */
7193 LBP_CL
= 6, /* closing punctuation */
7194 LBP_CP1
= 7, /* closing parenthesis, non-EastAsian character */
7195 LBP_CP2
= 8, /* closing parenthesis, EastAsian character */
7196 LBP_EX
= 9, /* exclamation/interrogation */
7197 LBP_IN
= 10, /* inseparable */
7198 LBP_NS
= 11, /* non starter */
7199 LBP_OP1
= 12, /* opening punctuation, non-EastAsian character */
7200 LBP_OP2
= 13, /* opening punctuation, EastAsian character */
7201 LBP_QU1
= 14, /* ambiguous quotation, neither initial nor final punctuation */
7202 LBP_QU2
= 15, /* ambiguous quotation, initial punctuation */
7203 LBP_QU3
= 16, /* ambiguous quotation, final punctuation */
7204 LBP_IS
= 17, /* infix separator (numeric) */
7205 LBP_NU
= 18, /* numeric */
7206 LBP_PO
= 19, /* postfix (numeric) */
7207 LBP_PR
= 20, /* prefix (numeric) */
7208 LBP_SY
= 21, /* symbols allowing breaks */
7209 LBP_AI
= 48, /* ambiguous (alphabetic or ideograph) */
7210 LBP_AL1
= 22, /* ordinary alphabetic and symbol characters, != U+25CC */
7211 LBP_AL2
= 23, /* ordinary alphabetic and symbol characters, == U+25CC */
7212 /*LBP_CJ, conditional Japanese starter, resolved to NS */
7213 LBP_H2
= 24, /* Hangul LV syllable */
7214 LBP_H3
= 25, /* Hangul LVT syllable */
7215 LBP_HL
= 31, /* Hebrew letter */
7216 LBP_ID1
= 26, /* ideographic */
7217 LBP_ID2
= 27, /* ideographic and potential future emoji */
7218 LBP_JL
= 28, /* Hangul L Jamo */
7219 LBP_JV
= 29, /* Hangul V Jamo */
7220 LBP_JT
= 30, /* Hangul T Jamo */
7221 LBP_AP
= 32, /* Brahmic scripts: pre-base repha */
7222 LBP_AK
= 33, /* Brahmic scripts: consonants */
7223 LBP_AS
= 34, /* Brahmic scripts: independent vowels */
7224 LBP_VI
= 35, /* Brahmic scripts: conjoining viramas */
7225 LBP_VF
= 36, /* Brahmic scripts: viramas for final consonants */
7226 LBP_RI
= 37, /* regional indicator */
7227 LBP_SA
= 49, /* complex context (South East Asian) */
7228 LBP_ZWJ
= 38, /* zero width joiner */
7229 LBP_EB
= 39, /* emoji base */
7230 LBP_EM
= 40, /* emoji modifier */
7231 LBP_XX
= 50, /* unknown */
7232 /* Artificial values that exist only in this file, not in the tables. */
7233 LBP_CP
= 100, /* LBP_CP1 or LBP_CP2 */
7234 LBP_OP
= 101, /* LBP_OP1 or LBP_OP2 */
7235 LBP_QU
= 102, /* LBP_QU1 or LBP_QU2 or LBP_QU3 */
7236 LBP_AL
= 103, /* LBP_AL1 or LBP_AL2 */
7237 LBP_ID
= 104 /* LBP_ID1 or LBP_ID2 */
7240 /* Returns the line breaking EastAsian property for ch, as a bit.
   The result is nonzero exactly when unicode_width[ch] is non-NULL and
   equals "W", "F" or "H"; a NULL entry yields zero. */
7242 get_lbea (unsigned int ch
)
7244 return (unicode_width
[ch
] != NULL
7245 && (strcmp (unicode_width
[ch
], "W") == 0
7246 || strcmp (unicode_width
[ch
], "F") == 0
7247 || strcmp (unicode_width
[ch
], "H") == 0));
7250 /* Returns the line breaking classification for ch, as a bit mask. */
7252 get_lbp (unsigned int ch
)
7256 /* U+20BC..U+20CF are reserved for prefixes. */
7257 if (unicode_attributes
[ch
].name
== NULL
&& (ch
>= 0x20BC && ch
<= 0x20CF))
7258 return (int64_t) 1 << LBP_PR
;
7260 if (unicode_attributes
[ch
].name
!= NULL
)
7262 /* mandatory break */
7264 attr
|= (int64_t) 1 << LBP_LF
;
7266 attr
|= (int64_t) 1 << LBP_CR
;
7267 if (ch
== 0x0085 /* newline */
7268 || ch
== 0x000B /* LINE TABULATION */
7269 || ch
== 0x000C /* FORM FEED */
7270 || ch
== 0x2028 /* LINE SEPARATOR */
7271 || ch
== 0x2029 /* PARAGRAPH SEPARATOR */)
7272 attr
|= (int64_t) 1 << LBP_BK
;
7274 if (ch
== 0x2060 /* WORD JOINER */
7275 || ch
== 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
7276 attr
|= (int64_t) 1 << LBP_WJ
;
7278 /* zero width space */
7279 if (ch
== 0x200B /* ZERO WIDTH SPACE */)
7280 attr
|= (int64_t) 1 << LBP_ZW
;
7282 /* zero width joiner */
7283 if (ch
== 0x200D /* ZERO WIDTH JOINER */)
7284 attr
|= (int64_t) 1 << LBP_ZWJ
;
7287 if (((unicode_properties
[ch
] >> PROP_EMOJI_MODIFIER_BASE
) & 1) != 0) /* EMOJI MODIFIER BASE */
7288 attr
|= (int64_t) 1 << LBP_EB
;
7290 if (((unicode_properties
[ch
] >> PROP_EMOJI_MODIFIER
) & 1) != 0) /* EMOJI MODIFIER */
7291 attr
|= (int64_t) 1 << LBP_EM
;
7293 /* non-breaking (glue) */
7294 if (ch
== 0x00A0 /* NO-BREAK SPACE */
7295 || ch
== 0x202F /* NARROW NO-BREAK SPACE */
7296 || ch
== 0x180E /* MONGOLIAN VOWEL SEPARATOR */
7297 || ch
== 0x1107F /* BRAHMI NUMBER JOINER */
7298 || (ch
>= 0x13430 && ch
<= 0x13436) /* EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH OVERLAY MIDDLE */
7299 || (ch
>= 0x13439 && ch
<= 0x1343B) /* EGYPTIAN HIEROGLYPH INSERT AT MIDDLE..EGYPTIAN HIEROGLYPH INSERT AT BOTTOM */
7300 || ch
== 0x16FE4 /* KHITAN SMALL SCRIPT FILLER */
7301 || ch
== 0x034F /* COMBINING GRAPHEME JOINER */
7302 || ch
== 0x2007 /* FIGURE SPACE */
7303 || ch
== 0x2011 /* NON-BREAKING HYPHEN */
7304 || ch
== 0x0F08 /* TIBETAN MARK SBRUL SHAD */
7305 || ch
== 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
7306 || ch
== 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
7307 || (ch
>= 0x035C && ch
<= 0x0362) /* COMBINING DOUBLE ... */
7308 || ch
== 0xFE20 /* COMBINING LIGATURE LEFT HALF */
7309 || ch
== 0xFE22 /* COMBINING DOUBLE TILDE LEFT HALF */
7310 || ch
== 0xFE24 /* COMBINING MACRON LEFT HALF */
7311 || ch
== 0xFE27 /* COMBINING LIGATURE LEFT HALF BELOW */
7312 || ch
== 0xFE29 /* COMBINING TILDE LEFT HALF BELOW */
7313 || ch
== 0xFE2B /* COMBINING MACRON LEFT HALF BELOW */
7314 || ch
== 0xFE2E /* COMBINING CYRILLIC TITLO LEFT HALF */
7315 || ch
== 0xFE26 /* COMBINING CONJOINING MACRON */
7316 || ch
== 0xFE2D /* COMBINING CONJOINING MACRON BELOW */
7317 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7318 || ch
== 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */
7319 || ch
== 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */
7320 || ch
== 0x1DCD /* COMBINING DOUBLE CIRCUMFLEX ABOVE */
7321 || ch
== 0x1DFC /* COMBINING DOUBLE INVERTED BREVE BELOW */)
7322 attr
|= (int64_t) 1 << LBP_GL
;
7325 if (ch
== 0x0020 /* SPACE */)
7326 attr
|= (int64_t) 1 << LBP_SP
;
7328 /* break opportunity before and after */
7329 if (ch
== 0x2014 /* EM DASH */
7330 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7331 || ch
== 0x2E3A /* TWO-EM DASH */
7332 || ch
== 0x2E3B /* THREE-EM DASH */)
7333 attr
|= (int64_t) 1 << LBP_B2
;
7335 /* break opportunity after */
7336 if (/* Breaking Spaces */
7337 ch
== 0x1680 /* OGHAM SPACE MARK */
7338 || ch
== 0x2000 /* EN QUAD */
7339 || ch
== 0x2001 /* EM QUAD */
7340 || ch
== 0x2002 /* EN SPACE */
7341 || ch
== 0x2003 /* EM SPACE */
7342 || ch
== 0x2004 /* THREE-PER-EM SPACE */
7343 || ch
== 0x2005 /* FOUR-PER-EM SPACE */
7344 || ch
== 0x2006 /* SIX-PER-EM SPACE */
7345 || ch
== 0x2008 /* PUNCTUATION SPACE */
7346 || ch
== 0x2009 /* THIN SPACE */
7347 || ch
== 0x200A /* HAIR SPACE */
7348 || ch
== 0x205F /* MEDIUM MATHEMATICAL SPACE */
7349 || ch
== 0x3000 /* IDEOGRAPHIC SPACE */
7351 || ch
== 0x0009 /* tab */
7352 /* Conditional Hyphens */
7353 || ch
== 0x00AD /* SOFT HYPHEN */
7354 /* Breaking Hyphens */
7355 || ch
== 0x058A /* ARMENIAN HYPHEN */
7356 || ch
== 0x2010 /* HYPHEN */
7357 || ch
== 0x2012 /* FIGURE DASH */
7358 || ch
== 0x2013 /* EN DASH */
7359 /* Visible Word Dividers */
7360 || ch
== 0x05BE /* HEBREW PUNCTUATION MAQAF */
7361 || ch
== 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
7362 || ch
== 0x1361 /* ETHIOPIC WORDSPACE */
7363 || ch
== 0x17D8 /* KHMER SIGN BEYYAL */
7364 || ch
== 0x17DA /* KHMER SIGN KOOMUUT */
7365 || ch
== 0x2027 /* HYPHENATION POINT */
7366 || ch
== 0x007C /* VERTICAL LINE */
7367 /* Historic Word Separators */
7368 || ch
== 0x16EB /* RUNIC SINGLE PUNCTUATION */
7369 || ch
== 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
7370 || ch
== 0x16ED /* RUNIC CROSS PUNCTUATION */
7371 || ch
== 0x2056 /* THREE DOT PUNCTUATION */
7372 || ch
== 0x2058 /* FOUR DOT PUNCTUATION */
7373 || ch
== 0x2059 /* FIVE DOT PUNCTUATION */
7374 || ch
== 0x205A /* TWO DOT PUNCTUATION */
7375 || ch
== 0x205B /* FOUR DOT MARK */
7376 || ch
== 0x205D /* TRICOLON */
7377 || ch
== 0x205E /* VERTICAL FOUR DOTS */
7378 || ch
== 0x2E19 /* PALM BRANCH */
7379 || ch
== 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
7380 || ch
== 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
7381 || ch
== 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
7382 || ch
== 0x2E2D /* FIVE DOT PUNCTUATION */
7383 || ch
== 0x2E30 /* RING POINT */
7384 || ch
== 0x10100 /* AEGEAN WORD SEPARATOR LINE */
7385 || ch
== 0x10101 /* AEGEAN WORD SEPARATOR DOT */
7386 || ch
== 0x10102 /* AEGEAN CHECK MARK */
7387 || ch
== 0x1039F /* UGARITIC WORD DIVIDER */
7388 || ch
== 0x103D0 /* OLD PERSIAN WORD DIVIDER */
7389 || ch
== 0x1091F /* PHOENICIAN WORD SEPARATOR */
7390 || ch
== 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
7392 || ch
== 0x0964 /* DEVANAGARI DANDA */
7393 || ch
== 0x0965 /* DEVANAGARI DOUBLE DANDA */
7394 || ch
== 0x0E5A /* THAI CHARACTER ANGKHANKHU */
7395 || ch
== 0x0E5B /* THAI CHARACTER KHOMUT */
7396 || ch
== 0x104A /* MYANMAR SIGN LITTLE SECTION */
7397 || ch
== 0x104B /* MYANMAR SIGN SECTION */
7398 || ch
== 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
7399 || ch
== 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
7400 || ch
== 0x17D4 /* KHMER SIGN KHAN */
7401 || ch
== 0x17D5 /* KHMER SIGN BARIYOOSAN */
7402 || ch
== 0x1B5E /* BALINESE CARIK SIKI */
7403 || ch
== 0x1B5F /* BALINESE CARIK PAREREN */
7404 || ch
== 0xA8CE /* SAURASHTRA DANDA */
7405 || ch
== 0xA8CF /* SAURASHTRA DOUBLE DANDA */
7406 || ch
== 0xAA5D /* CHAM PUNCTUATION DANDA */
7407 || ch
== 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
7408 || ch
== 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
7409 || ch
== 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
7410 || ch
== 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
7412 || ch
== 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
7413 || ch
== 0x0F7F /* TIBETAN SIGN RNAM BCAD */
7414 || ch
== 0x0F85 /* TIBETAN MARK PALUTA */
7415 || ch
== 0x0FBE /* TIBETAN KU RU KHA */
7416 || ch
== 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
7417 || ch
== 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
7418 /* Other Terminating Punctuation */
7419 || ch
== 0x1804 /* MONGOLIAN COLON */
7420 || ch
== 0x1805 /* MONGOLIAN FOUR DOTS */
7421 || ch
== 0x1B5A /* BALINESE PANTI */
7422 || ch
== 0x1B5B /* BALINESE PAMADA */
7423 || ch
== 0x1B5D /* BALINESE CARIK PAMUNGKAH */
7424 || ch
== 0x1B60 /* BALINESE PAMENENG */
7425 || ch
== 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
7426 || ch
== 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
7427 || ch
== 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
7428 || ch
== 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
7429 || ch
== 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
7430 || ch
== 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
7431 || ch
== 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
7432 || ch
== 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
7433 || ch
== 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
7434 || ch
== 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
7435 || ch
== 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
7436 || (ch
>= 0x2E0E && ch
<= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
7437 || ch
== 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
7438 || ch
== 0xA60D /* VAI COMMA */
7439 || ch
== 0xA60F /* VAI QUESTION MARK */
7440 || ch
== 0xA92E /* KAYAH LI SIGN CWI */
7441 || ch
== 0xA92F /* KAYAH LI SIGN SHYA */
7442 || ch
== 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
7443 || ch
== 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
7444 || ch
== 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
7445 || ch
== 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
7446 || ch
== 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
7447 || ch
== 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
7448 || (ch
>= 0x11EF7 && ch
<= 0x11EF8) /* MAKASAR PASSIMBANG..MAKASAR END OF SECTION */
7449 /* Letters attached to orthographic syllables */
7450 || ch
== 0xA9CF /* JAVANESE PANGRANGKEP */
7451 || (ch
>= 0xAA40 && ch
<= 0xAA42) /* CHAM LETTER FINAL K..CHAM LETTER FINAL NG */
7452 || (ch
>= 0xAA44 && ch
<= 0xAA4B) /* CHAM LETTER FINAL CH..CHAM LETTER FINAL SS */
7453 || ch
== 0x1133D /* GRANTHA SIGN AVAGRAHA */
7454 || ch
== 0x1135D /* GRANTHA SIGN PLUTA */
7455 || ch
== 0x11EF2 /* MAKASAR ANGKA */
7456 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7457 || ch
== 0x1400 /* CANADIAN SYLLABICS HYPHEN */
7458 || ch
== 0x1B4E /* BALINESE INVERTED CARIK SIKI */
7459 || ch
== 0x1B4F /* BALINESE INVERTED CARIK PAREREN */
7460 || ch
== 0x1B7D /* BALINESE PANTI LANTANG */
7461 || ch
== 0x1B7E /* BALINESE PAMADA LANTANG */
7462 || ch
== 0x1B7F /* BALINESE PANTI BAWAK */
7463 || ch
== 0x2D70 /* TIFINAGH SEPARATOR MARK */
7464 || ch
== 0x2E31 /* WORD SEPARATOR MIDDLE DOT */
7465 || ch
== 0x2E33 /* RAISED DOT */
7466 || ch
== 0x2E34 /* RAISED COMMA */
7467 || ch
== 0x2E3C /* STENOGRAPHIC FULL STOP */
7468 || ch
== 0x2E3D /* VERTICAL SIX DOTS */
7469 || ch
== 0x2E3E /* WIGGLY VERTICAL LINE */
7470 || ch
== 0x2E40 /* DOUBLE HYPHEN */
7471 || ch
== 0x2E41 /* REVERSED COMMA */
7472 || ch
== 0x2E43 /* DASH WITH LEFT UPTURN */
7473 || ch
== 0x2E44 /* DOUBLE SUSPENSION MARK */
7474 || ch
== 0x2E45 /* INVERTED LOW KAVYKA */
7475 || ch
== 0x2E46 /* INVERTED LOW KAVYKA WITH KAVYKA ABOVE */
7476 || ch
== 0x2E47 /* LOW KAVYKA */
7477 || ch
== 0x2E48 /* LOW KAVYKA WITH DOT */
7478 || ch
== 0x2E49 /* DOUBLE STACKED COMMA */
7479 || ch
== 0x2E4A /* DOTTED SOLIDUS */
7480 || ch
== 0x2E4C /* MEDIEVAL COMMA */
7481 || ch
== 0x2E4E /* PUNCTUS ELEVATUS MARK */
7482 || ch
== 0x2E4F /* CORNISH VERSE DIVIDER */
7483 || ch
== 0x2E5D /* OBLIQUE HYPHEN */
7484 || ch
== 0xA4FE /* LISU PUNCTUATION COMMA */
7485 || ch
== 0xA4FF /* LISU PUNCTUATION FULL STOP */
7486 || ch
== 0xA6F3 /* BAMUM FULL STOP */
7487 || ch
== 0xA6F4 /* BAMUM COLON */
7488 || ch
== 0xA6F5 /* BAMUM COMMA */
7489 || ch
== 0xA6F6 /* BAMUM SEMICOLON */
7490 || ch
== 0xA6F7 /* BAMUM QUESTION MARK */
7491 || ch
== 0xA9C7 /* JAVANESE PADA PANGKAT */
7492 || ch
== 0xA9C8 /* JAVANESE PADA LINGSA */
7493 || ch
== 0xA9C9 /* JAVANESE PADA LUNGSI */
7494 || ch
== 0xAAF0 /* MEETEI MAYEK CHEIKHAN */
7495 || ch
== 0xAAF1 /* MEETEI MAYEK AHANG KHUDAM */
7496 || ch
== 0xABEB /* MEETEI MAYEK CHEIKHEI */
7497 || ch
== 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */
7498 || (ch
>= 0x10AF0 && ch
<= 0x10AF5) /* MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION TWO DOTS */
7499 || ch
== 0x10B39 /* AVESTAN ABBREVIATION MARK */
7500 || ch
== 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */
7501 || ch
== 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */
7502 || ch
== 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */
7503 || ch
== 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */
7504 || ch
== 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */
7505 || ch
== 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */
7506 || ch
== 0x10D6E /* GARAY HYPHEN */
7507 || ch
== 0x10EAD /* YEZIDI HYPHENATION MARK */
7508 || ch
== 0x11047 /* BRAHMI DANDA */
7509 || ch
== 0x11048 /* BRAHMI DOUBLE DANDA */
7510 || ch
== 0x110BE /* KAITHI SECTION MARK */
7511 || ch
== 0x110BF /* KAITHI DOUBLE SECTION MARK */
7512 || ch
== 0x110C0 /* KAITHI DANDA */
7513 || ch
== 0x110C1 /* KAITHI DOUBLE DANDA */
7514 || ch
== 0x11140 /* CHAKMA SECTION MARK */
7515 || ch
== 0x11141 /* CHAKMA DANDA */
7516 || ch
== 0x11142 /* CHAKMA DOUBLE DANDA */
7517 || ch
== 0x11143 /* CHAKMA QUESTION MARK */
7518 || ch
== 0x111C5 /* SHARADA DANDA */
7519 || ch
== 0x111C6 /* SHARADA DOUBLE DANDA */
7520 || ch
== 0x111C8 /* SHARADA SEPARATOR */
7521 || (ch
>= 0x111DD && ch
<= 0x111DF) /* SHARADA CONTINUATION SIGN..SHARADA SECTION MARK-2 */
7522 || ch
== 0x11238 /* KHOJKI DANDA */
7523 || ch
== 0x11239 /* KHOJKI DOUBLE DANDA */
7524 || ch
== 0x1123B /* KHOJKI SECTION MARK */
7525 || ch
== 0x1123C /* KHOJKI DOUBLE SECTION MARK */
7526 || ch
== 0x112A9 /* MULTANI SECTION MARK */
7527 || (ch
>= 0x1144B && ch
<= 0x1144E) /* NEWA DANDA..NEWA GAP FILLER */
7528 || ch
== 0x1145A /* NEWA DOUBLE COMMA */
7529 || ch
== 0x1145B /* NEWA PLACEHOLDER MARK */
7530 || ch
== 0x115C2 /* SIDDHAM DANDA */
7531 || ch
== 0x115C3 /* SIDDHAM DOUBLE DANDA */
7532 || (ch
>= 0x115C9 && ch
<= 0x115D7) /* SIDDHAM END OF TEXT MARK..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES */
7533 || ch
== 0x11641 /* MODI DANDA */
7534 || ch
== 0x11642 /* MODI DOUBLE DANDA */
7535 || (ch
>= 0x1173C && ch
<= 0x1173E) /* AHOM SIGN SMALL SECTION..AHOM SIGN RULAI */
7536 || (ch
>= 0x11944 && ch
<= 0x11946) /* DIVES AKURU DOUBLE DANDA..DIVES AKURU END OF TEXT MARK */
7537 || ch
== 0x11A41 /* ZANABAZAR SQUARE MARK TSHEG */
7538 || ch
== 0x11A42 /* ZANABAZAR SQUARE MARK SHAD */
7539 || ch
== 0x11A43 /* ZANABAZAR SQUARE MARK DOUBLE SHAD */
7540 || ch
== 0x11A44 /* ZANABAZAR SQUARE MARK LONG TSHEG */
7541 || ch
== 0x11A9A /* SOYOMBO MARK TSHEG */
7542 || ch
== 0x11A9B /* SOYOMBO MARK SHAD */
7543 || ch
== 0x11A9C /* SOYOMBO MARK DOUBLE SHAD */
7544 || ch
== 0x11AA1 /* SOYOMBO TERMINAL MARK-1 */
7545 || ch
== 0x11AA2 /* SOYOMBO TERMINAL MARK-2 */
7546 || (ch
>= 0x11C41 && ch
<= 0x11C45) /* BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2 */
7547 || ch
== 0x11F43 /* KAWI DANDA */
7548 || ch
== 0x11F44 /* KAWI DOUBLE DANDA */
7549 || ch
== 0x11FFF /* TAMIL PUNCTUATION END OF TEXT */
7550 || ch
== 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
7551 || ch
== 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
7552 || ch
== 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */
7553 || ch
== 0x12474 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON */
7554 || ch
== 0x16A6E /* MRO DANDA */
7555 || ch
== 0x16A6F /* MRO DOUBLE DANDA */
7556 || ch
== 0x16AF5 /* BASSA VAH FULL STOP */
7557 || ch
== 0x16B37 /* PAHAWH HMONG SIGN VOS THOM */
7558 || ch
== 0x16B38 /* PAHAWH HMONG SIGN VOS TSHAB CEEB */
7559 || ch
== 0x16B39 /* PAHAWH HMONG SIGN CIM CHEEM */
7560 || ch
== 0x16B44 /* PAHAWH HMONG SIGN XAUS */
7561 || ch
== 0x16D6E /* KIRAT RAI DANDA */
7562 || ch
== 0x16D6F /* KIRAT RAI DOUBLE DANDA */
7563 || ch
== 0x16E97 /* MEDEFAIDRIN COMMA */
7564 || ch
== 0x16E98 /* MEDEFAIDRIN FULL STOP */
7565 || ch
== 0x1BC9F /* DUPLOYAN PUNCTUATION CHINOOK FULL STOP */
7566 || (ch
>= 0x1DA87 && ch
<= 0x1DA8A) /* SIGNWRITING COMMA..SIGNWRITING COLON */)
7567 attr
|= (int64_t) 1 << LBP_BA
;
7569 /* break opportunity before */
7570 if (/* Dictionary Use */
7571 ch
== 0x00B4 /* ACUTE ACCENT */
7572 || ch
== 0x1FFD /* GREEK OXIA */
7573 || ch
== 0x02DF /* MODIFIER LETTER CROSS ACCENT */
7574 || ch
== 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
7575 || ch
== 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
7576 /* Tibetan and Phags-Pa Head Letters */
7577 || ch
== 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
7578 || ch
== 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
7579 || ch
== 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
7580 || ch
== 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
7581 || ch
== 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
7582 || ch
== 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
7583 || ch
== 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
7584 || ch
== 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
7585 || ch
== 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
7586 || ch
== 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
7587 || ch
== 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
7588 || ch
== 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
7589 || ch
== 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
7591 || ch
== 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */
7592 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7593 || ch
== 0x0C77 /* TELUGU SIGN SIDDHAM */
7594 || ch
== 0x0C84 /* KANNADA SIGN SIDDHAM */
7595 || ch
== 0xA8FC /* DEVANAGARI SIGN SIDDHAM */
7596 || ch
== 0x11175 /* MAHAJANI SECTION MARK */
7597 || ch
== 0x111DB /* SHARADA SIGN SIDDHAM */
7598 || ch
== 0x115C1 /* SIDDHAM SIGN SIDDHAM */
7599 || (ch
>= 0x11660 && ch
<= 0x1166C) /* MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT */
7600 || ch
== 0x119E2 /* NANDINAGARI SIGN SIDDHAM */
7601 || ch
== 0x11A3F /* ZANABAZAR SQUARE INITIAL HEAD MARK */
7602 || ch
== 0x11A45 /* ZANABAZAR SQUARE INITIAL DOUBLE-LINED HEAD MARK */
7603 || ch
== 0x11A9E /* SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME */
7604 || ch
== 0x11A9F /* SOYOMBO HEAD MARK WITH MOON AND SUN AND FLAME */
7605 || ch
== 0x11AA0 /* SOYOMBO HEAD MARK WITH MOON AND SUN */
7606 || (ch
>= 0x11B00 && ch
<= 0x11B09) /* DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU */
7607 || ch
== 0x11C70 /* MARCHEN HEAD MARK */)
7608 attr
|= (int64_t) 1 << LBP_BB
;
7611 if (ch
== 0x002D /* HYPHEN-MINUS */)
7612 attr
|= (int64_t) 1 << LBP_HY
;
7614 /* contingent break opportunity */
7615 if (ch
== 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
7616 attr
|= (int64_t) 1 << LBP_CB
;
7618 /* closing parenthesis */
7619 if (ch
== 0x0029 /* RIGHT PARENTHESIS */
7620 || ch
== 0x005D /* RIGHT SQUARE BRACKET */
7621 || ch
== 0x2E56 /* RIGHT SQUARE BRACKET WITH STROKE */
7622 || ch
== 0x2E58 /* RIGHT SQUARE BRACKET WITH DOUBLE STROKE */
7623 || ch
== 0x2E5A /* TOP HALF RIGHT PARENTHESIS */
7624 || ch
== 0x2E5C /* BOTTOM HALF RIGHT PARENTHESIS */)
7627 attr
|= (int64_t) 1 << LBP_CP2
;
7629 attr
|= (int64_t) 1 << LBP_CP1
;
7632 /* closing punctuation */
7633 if ((unicode_attributes
[ch
].category
[0] == 'P'
7634 && unicode_attributes
[ch
].category
[1] == 'e'
7635 && !(attr
& (((int64_t) 1 << LBP_CP1
) | ((int64_t) 1 << LBP_CP2
))))
7636 || ch
== 0x3001 /* IDEOGRAPHIC COMMA */
7637 || ch
== 0x3002 /* IDEOGRAPHIC FULL STOP */
7638 || ch
== 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
7639 || ch
== 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
7640 || ch
== 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
7641 || ch
== 0xFE50 /* SMALL COMMA */
7642 || ch
== 0xFE52 /* SMALL FULL STOP */
7643 || ch
== 0xFF0C /* FULLWIDTH COMMA */
7644 || ch
== 0xFF0E /* FULLWIDTH FULL STOP */
7645 || ch
== 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
7646 || ch
== 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
7647 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7648 || ch
== 0x1325B /* EGYPTIAN HIEROGLYPH O006D */
7649 || ch
== 0x1325C /* EGYPTIAN HIEROGLYPH O006E */
7650 || ch
== 0x1325D /* EGYPTIAN HIEROGLYPH O006F */
7651 || ch
== 0x13282 /* EGYPTIAN HIEROGLYPH O033A */
7652 || ch
== 0x13287 /* EGYPTIAN HIEROGLYPH O036B */
7653 || ch
== 0x13289 /* EGYPTIAN HIEROGLYPH O036D */
7654 || ch
== 0x1337A /* EGYPTIAN HIEROGLYPH V011B */
7655 || ch
== 0x1337B /* EGYPTIAN HIEROGLYPH V011C */
7656 || ch
== 0x13438 /* EGYPTIAN HIEROGLYPH END SEGMENT */
7657 || ch
== 0x1343D /* EGYPTIAN HIEROGLYPH END ENCLOSURE */
7658 || ch
== 0x1343F /* EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE */
7659 || ch
== 0x145CF /* ANATOLIAN HIEROGLYPH A410A END LOGOGRAM MARK */)
7660 attr
|= (int64_t) 1 << LBP_CL
;
7662 /* exclamation/interrogation */
7663 if (ch
== 0x0021 /* EXCLAMATION MARK */
7664 || ch
== 0x003F /* QUESTION MARK */
7665 || ch
== 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
7666 || ch
== 0x061B /* ARABIC SEMICOLON */
7667 || ch
== 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
7668 || ch
== 0x061F /* ARABIC QUESTION MARK */
7669 || ch
== 0x06D4 /* ARABIC FULL STOP */
7670 || ch
== 0x07F9 /* NKO EXCLAMATION MARK */
7671 || ch
== 0x0F0D /* TIBETAN MARK SHAD */
7672 || ch
== 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
7673 || ch
== 0xFF1F /* FULLWIDTH QUESTION MARK */
7674 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7675 || ch
== 0x061D /* ARABIC END OF TEXT MARK */
7676 || ch
== 0x0F0E /* TIBETAN MARK NYIS SHAD */
7677 || ch
== 0x0F0F /* TIBETAN MARK TSHEG SHAD */
7678 || ch
== 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
7679 || ch
== 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
7680 || ch
== 0x0F14 /* TIBETAN MARK GTER TSHEG */
7681 || ch
== 0x1802 /* MONGOLIAN COMMA */
7682 || ch
== 0x1803 /* MONGOLIAN FULL STOP */
7683 || ch
== 0x1808 /* MONGOLIAN MANCHU COMMA */
7684 || ch
== 0x1809 /* MONGOLIAN MANCHU FULL STOP */
7685 || ch
== 0x1944 /* LIMBU EXCLAMATION MARK */
7686 || ch
== 0x1945 /* LIMBU QUESTION MARK */
7687 || ch
== 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
7688 || ch
== 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
7689 || ch
== 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
7690 || ch
== 0x2CFE /* COPTIC FULL STOP */
7691 || ch
== 0x2E2E /* REVERSED QUESTION MARK */
7692 || ch
== 0x2E53 /* MEDIEVAL EXCLAMATION MARK */
7693 || ch
== 0x2E54 /* MEDIEVAL QUESTION MARK */
7694 || ch
== 0xA60E /* VAI FULL STOP */
7695 || ch
== 0xA876 /* PHAGS-PA MARK SHAD */
7696 || ch
== 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
7697 || ch
== 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
7698 || ch
== 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
7699 || ch
== 0xFE56 /* SMALL QUESTION MARK */
7700 || ch
== 0xFE57 /* SMALL EXCLAMATION MARK */
7701 || ch
== 0x115C4 /* SIDDHAM SEPARATOR DOT */
7702 || ch
== 0x115C5 /* SIDDHAM SEPARATOR BAR */
7703 || ch
== 0x11C71 /* MARCHEN MARK SHAD */)
7704 attr
|= (int64_t) 1 << LBP_EX
;
7707 if (ch
== 0x2024 /* ONE DOT LEADER */
7708 || ch
== 0x2025 /* TWO DOT LEADER */
7709 || ch
== 0x2026 /* HORIZONTAL ELLIPSIS */
7710 || ch
== 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */
7711 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7712 || ch
== 0x22EF /* MIDLINE HORIZONTAL ELLIPSIS */
7713 || ch
== 0x10AF6 /* MANICHAEAN PUNCTUATION LINE FILLER */)
7714 attr
|= (int64_t) 1 << LBP_IN
;
7717 if (ch
== 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
7718 || ch
== 0x203C /* DOUBLE EXCLAMATION MARK */
7719 || ch
== 0x203D /* INTERROBANG */
7720 || ch
== 0x2047 /* DOUBLE QUESTION MARK */
7721 || ch
== 0x2048 /* QUESTION EXCLAMATION MARK */
7722 || ch
== 0x2049 /* EXCLAMATION QUESTION MARK */
7723 || ch
== 0x3005 /* IDEOGRAPHIC ITERATION MARK */
7724 || ch
== 0x301C /* WAVE DASH */
7725 || ch
== 0x303C /* MASU MARK */
7726 || ch
== 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
7727 || ch
== 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
7728 || ch
== 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
7729 || ch
== 0x309D /* HIRAGANA ITERATION MARK */
7730 || ch
== 0x309E /* HIRAGANA VOICED ITERATION MARK */
7731 || ch
== 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
7732 || ch
== 0x30FB /* KATAKANA MIDDLE DOT */
7733 || ch
== 0x30FD /* KATAKANA ITERATION MARK */
7734 || ch
== 0x30FE /* KATAKANA VOICED ITERATION MARK */
7735 || ch
== 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
7736 || ch
== 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */
7737 || ch
== 0xFE54 /* SMALL SEMICOLON */
7738 || ch
== 0xFE55 /* SMALL COLON */
7739 || ch
== 0xFF1A /* FULLWIDTH COLON */
7740 || ch
== 0xFF1B /* FULLWIDTH SEMICOLON */
7741 || ch
== 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
7742 || ch
== 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
7743 || ch
== 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
7744 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7745 || strstr (unicode_attributes
[ch
].name
, "HIRAGANA LETTER SMALL ") != NULL
7746 || strstr (unicode_attributes
[ch
].name
, "KATAKANA LETTER SMALL ") != NULL
7747 || ch
== 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
7748 || ch
== 0xA015 /* YI SYLLABLE WU */
7749 || ch
== 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
7750 || ch
== 0x16FE0 /* TANGUT ITERATION MARK */
7751 || ch
== 0x16FE1 /* NUSHU ITERATION MARK */
7752 || ch
== 0x16FE2 /* OLD CHINESE HOOK MARK */
7753 || ch
== 0x16FE3 /* OLD CHINESE ITERATION MARK */
7754 || ch
== 0x1F679 /* HEAVY INTERROBANG ORNAMENT */
7755 || ch
== 0x1F67A /* SANS-SERIF INTERROBANG ORNAMENT */
7756 || ch
== 0x1F67B /* HEAVY SANS-SERIF INTERROBANG ORNAMENT */)
7757 attr
|= (int64_t) 1 << LBP_NS
;
7759 /* opening punctuation */
7760 if ((unicode_attributes
[ch
].category
[0] == 'P'
7761 && unicode_attributes
[ch
].category
[1] == 's')
7762 || ch
== 0x00A1 /* INVERTED EXCLAMATION MARK */
7763 || ch
== 0x00BF /* INVERTED QUESTION MARK */
7764 || ch
== 0x2E18 /* INVERTED INTERROBANG */
7765 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7766 || ch
== 0x13258 /* EGYPTIAN HIEROGLYPH O006A */
7767 || ch
== 0x13259 /* EGYPTIAN HIEROGLYPH O006B */
7768 || ch
== 0x1325A /* EGYPTIAN HIEROGLYPH O006C */
7769 || ch
== 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
7770 || ch
== 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
7771 || ch
== 0x13379 /* EGYPTIAN HIEROGLYPH V011A */
7772 || ch
== 0x1342F /* EGYPTIAN HIEROGLYPH V011D */
7773 || ch
== 0x13437 /* EGYPTIAN HIEROGLYPH BEGIN SEGMENT */
7774 || ch
== 0x1343C /* EGYPTIAN HIEROGLYPH BEGIN ENCLOSURE */
7775 || ch
== 0x1343E /* EGYPTIAN HIEROGLYPH BEGIN WALLED ENCLOSURE */
7776 || ch
== 0x145CE /* ANATOLIAN HIEROGLYPH A410 BEGIN LOGOGRAM MARK */
7777 || (ch
>= 0x1E95E && ch
<= 0x1E95F) /* ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK */)
7780 attr
|= (int64_t) 1 << LBP_OP2
;
7782 attr
|= (int64_t) 1 << LBP_OP1
;
7785 /* ambiguous quotation */
7786 if ((unicode_attributes
[ch
].category
[0] == 'P'
7787 && (unicode_attributes
[ch
].category
[1] == 'f'
7788 || unicode_attributes
[ch
].category
[1] == 'i'))
7789 || ch
== 0x0022 /* QUOTATION MARK */
7790 || ch
== 0x0027 /* APOSTROPHE */
7791 || ch
== 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
7792 || ch
== 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
7793 || ch
== 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
7794 || ch
== 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
7795 || ch
== 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
7796 || ch
== 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
7797 || ch
== 0x2E06 /* RAISED INTERPOLATION MARKER */
7798 || ch
== 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
7799 || ch
== 0x2E08 /* DOTTED TRANSPOSITION MARKER */
7800 || ch
== 0x2E0B /* RAISED SQUARE */
7801 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7802 || ch
== 0x275F /* HEAVY LOW SINGLE COMMA QUOTATION MARK ORNAMENT */
7803 || ch
== 0x2760 /* HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT */
7804 || ch
== 0x1F676 /* SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
7805 || ch
== 0x1F677 /* SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
7806 || ch
== 0x1F678 /* SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT */)
7808 if (unicode_attributes
[ch
].category
[0] == 'P'
7809 && unicode_attributes
[ch
].category
[1] == 'i')
7810 attr
|= (int64_t) 1 << LBP_QU2
;
7811 else if (unicode_attributes
[ch
].category
[0] == 'P'
7812 && unicode_attributes
[ch
].category
[1] == 'f')
7813 attr
|= (int64_t) 1 << LBP_QU3
;
7815 attr
|= (int64_t) 1 << LBP_QU1
;
7818 /* infix separator (numeric) */
7819 if (ch
== 0x002C /* COMMA */
7820 || ch
== 0x002E /* FULL STOP */
7821 || ch
== 0x003A /* COLON */
7822 || ch
== 0x003B /* SEMICOLON */
7823 || ch
== 0x037E /* GREEK QUESTION MARK */
7824 || ch
== 0x0589 /* ARMENIAN FULL STOP */
7825 || ch
== 0x060C /* ARABIC COMMA */
7826 || ch
== 0x060D /* ARABIC DATE SEPARATOR */
7827 || ch
== 0x07F8 /* NKO COMMA */
7828 || ch
== 0x2044 /* FRACTION SLASH */)
7829 attr
|= (int64_t) 1 << LBP_IS
;
7832 if ((unicode_attributes
[ch
].category
[0] == 'N'
7833 && unicode_attributes
[ch
].category
[1] == 'd'
7834 && strstr (unicode_attributes
[ch
].name
, "FULLWIDTH") == NULL
7835 && !(ch
>= 0x1B50 && ch
<= 0x1B59) /* BALINESE DIGIT ZERO..NINE */
7836 && !(ch
>= 0xA9D0 && ch
<= 0xA9D9) /* JAVANESE DIGIT ZERO..NINE */
7837 && !(ch
>= 0xAA50 && ch
<= 0xAA59) /* CHAM DIGIT ZERO..NINE */
7838 && !(ch
>= 0x11066 && ch
<= 0x1106F) /* BRAHMI DIGIT ZERO..NINE */
7839 && !(ch
>= 0x11950 && ch
<= 0x11959) /* DIVES AKURU DIGIT ZERO..NINE */
7840 && !(ch
>= 0x11F50 && ch
<= 0x11F59) /* KAWI DIGIT ZERO..NINE */
7841 && !(ch
>= 0x16130 && ch
<= 0x16139)) /* GURUNG KHEMA DIGIT ZERO..NINE */
7842 || ch
== 0x066B /* ARABIC DECIMAL SEPARATOR */
7843 || ch
== 0x066C /* ARABIC THOUSANDS SEPARATOR */
7844 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7845 || ch
== 0x0600 /* ARABIC NUMBER SIGN */
7846 || ch
== 0x0601 /* ARABIC SIGN SANAH */
7847 || ch
== 0x0602 /* ARABIC FOOTNOTE MARKER */
7848 || ch
== 0x0603 /* ARABIC SIGN SAFHA */
7849 || ch
== 0x0604 /* ARABIC SIGN SAMVAT */
7850 || ch
== 0x0605 /* ARABIC NUMBER MARK ABOVE */
7851 || ch
== 0x06DD /* ARABIC END OF AYAH */
7852 || ch
== 0x0890 /* ARABIC POUND MARK ABOVE */
7853 || ch
== 0x0891 /* ARABIC PIASTRE MARK ABOVE */
7854 || ch
== 0x08E2 /* ARABIC DISPUTED END OF AYAH */
7855 || ch
== 0x19DA /* NEW TAI LUE THAM DIGIT ONE */
7856 || ch
== 0x110BD /* KAITHI NUMBER SIGN */
7857 || ch
== 0x110CD /* KAITHI NUMBER SIGN ABOVE */)
7858 attr
|= (int64_t) 1 << LBP_NU
;
7860 /* postfix numeric */
7861 if (ch
== 0x0025 /* PERCENT SIGN */
7862 || ch
== 0x00A2 /* CENT SIGN */
7863 || ch
== 0x00B0 /* DEGREE SIGN */
7864 || ch
== 0x060B /* AFGHANI SIGN */
7865 || ch
== 0x066A /* ARABIC PERCENT SIGN */
7866 || ch
== 0x2030 /* PER MILLE SIGN */
7867 || ch
== 0x2031 /* PER TEN THOUSAND SIGN */
7868 || ch
== 0x2032 /* PRIME */
7869 || ch
== 0x2033 /* DOUBLE PRIME */
7870 || ch
== 0x2034 /* TRIPLE PRIME */
7871 || ch
== 0x2035 /* REVERSED PRIME */
7872 || ch
== 0x2036 /* REVERSED DOUBLE PRIME */
7873 || ch
== 0x2037 /* REVERSED TRIPLE PRIME */
7874 || ch
== 0x20A7 /* PESETA SIGN */
7875 || ch
== 0x2103 /* DEGREE CELSIUS */
7876 || ch
== 0x2109 /* DEGREE FAHRENHEIT */
7877 || ch
== 0xFDFC /* RIAL SIGN */
7878 || ch
== 0xFE6A /* SMALL PERCENT SIGN */
7879 || ch
== 0xFF05 /* FULLWIDTH PERCENT SIGN */
7880 || ch
== 0xFFE0 /* FULLWIDTH CENT SIGN */
7881 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7882 || ch
== 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
7883 || ch
== 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
7884 || ch
== 0x09F2 /* BENGALI RUPEE MARK */
7885 || ch
== 0x09F3 /* BENGALI RUPEE SIGN */
7886 || ch
== 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
7887 || ch
== 0x0D79 /* MALAYALAM DATE MARK */
7888 || ch
== 0x2057 /* QUADRUPLE PRIME */
7889 || ch
== 0x20B6 /* LIVRE TOURNOIS SIGN */
7890 || ch
== 0x20BB /* NORDIC MARK SIGN */
7891 || ch
== 0x20BE /* LARI SIGN */
7892 || ch
== 0x20C0 /* SOM SIGN */
7893 || ch
== 0xA838 /* NORTH INDIC RUPEE MARK */
7894 || (ch
>= 0x11FDD && ch
<= 0x11FE0) /* TAMIL SIGN KAACU..TAMIL SIGN VARAAKAN */
7895 || ch
== 0x1ECAC /* INDIC SIYAQ PLACEHOLDER */
7896 || ch
== 0x1ECB0 /* INDIC SIYAQ RUPEE MARK */)
7897 attr
|= (int64_t) 1 << LBP_PO
;
7899 /* prefix numeric */
7900 if ((unicode_attributes
[ch
].category
[0] == 'S'
7901 && unicode_attributes
[ch
].category
[1] == 'c')
7902 || ch
== 0x002B /* PLUS SIGN */
7903 || ch
== 0x005C /* REVERSE SOLIDUS */
7904 || ch
== 0x00B1 /* PLUS-MINUS SIGN */
7905 || ch
== 0x2116 /* NUMERO SIGN */
7906 || ch
== 0x2212 /* MINUS SIGN */
7907 || ch
== 0x2213 /* MINUS-OR-PLUS SIGN */)
7908 if (!(attr
& ((int64_t) 1 << LBP_PO
)))
7909 attr
|= (int64_t) 1 << LBP_PR
;
7911 /* symbols allowing breaks */
7912 if (ch
== 0x002F /* SOLIDUS */)
7913 attr
|= (int64_t) 1 << LBP_SY
;
7915 if (ch
>= 0xAC00 && ch
<= 0xD7A3 && ((ch
- 0xAC00) % 28) == 0)
7916 attr
|= (int64_t) 1 << LBP_H2
;
7918 if (ch
>= 0xAC00 && ch
<= 0xD7A3 && ((ch
- 0xAC00) % 28) != 0)
7919 attr
|= (int64_t) 1 << LBP_H3
;
7921 if ((ch
>= 0x05D0 && ch
<= 0x05F2) || ch
== 0xFB1D
7922 || (ch
>= 0xFB1F && ch
<= 0xFB28) || (ch
>= 0xFB2A && ch
<= 0xFB4F))
7923 attr
|= (int64_t) 1 << LBP_HL
;
7925 if ((ch
>= 0x1100 && ch
<= 0x115F) || (ch
>= 0xA960 && ch
<= 0xA97C))
7926 attr
|= (int64_t) 1 << LBP_JL
;
7928 if ((ch
>= 0x1160 && ch
<= 0x11A7) || (ch
>= 0xD7B0 && ch
<= 0xD7C6))
7929 attr
|= (int64_t) 1 << LBP_JV
;
7931 if ((ch
>= 0x11A8 && ch
<= 0x11FF) || (ch
>= 0xD7CB && ch
<= 0xD7FB))
7932 attr
|= (int64_t) 1 << LBP_JT
;
7934 /* Brahmic scripts: pre-base repha */
7935 if ((ch
>= 0x11003 && ch
<= 0x11004)
7937 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7941 attr
|= (int64_t) 1 << LBP_AP
;
7943 /* Brahmic scripts: consonants */
7944 if ((ch
>= 0x1B05 && ch
<= 0x1B33)
7945 || (ch
>= 0x1B45 && ch
<= 0x1B4C)
7946 || (ch
>= 0xA984 && ch
<= 0xA9B2)
7947 || (ch
>= 0x11005 && ch
<= 0x11037)
7948 || (ch
>= 0x11071 && ch
<= 0x11072)
7950 || (ch
>= 0x11305 && ch
<= 0x1130C)
7951 || (ch
>= 0x1130F && ch
<= 0x11310)
7952 || (ch
>= 0x11313 && ch
<= 0x11328)
7953 || (ch
>= 0x1132A && ch
<= 0x11330)
7954 || (ch
>= 0x11332 && ch
<= 0x11333)
7955 || (ch
>= 0x11335 && ch
<= 0x11339)
7956 || (ch
>= 0x11360 && ch
<= 0x11361)
7957 || (ch
>= 0x11F04 && ch
<= 0x11F10)
7958 || (ch
>= 0x11F12 && ch
<= 0x11F33)
7959 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7960 || (ch
>= 0x11392 && ch
<= 0x113B5)
7961 || (ch
>= 0x11900 && ch
<= 0x11906)
7963 || (ch
>= 0x1190C && ch
<= 0x11913)
7964 || (ch
>= 0x11915 && ch
<= 0x11916)
7965 || (ch
>= 0x11918 && ch
<= 0x1192F))
7966 attr
|= (int64_t) 1 << LBP_AK
;
7968 /* Brahmic scripts: independent vowels */
7969 if ((ch
>= 0x1B50 && ch
<= 0x1B59) /* BALINESE DIGIT ZERO..NINE */
7970 || (ch
>= 0x1BC0 && ch
<= 0x1BE5)
7971 || (ch
>= 0xA9D0 && ch
<= 0xA9D9) /* JAVANESE DIGIT ZERO..NINE */
7972 || (ch
>= 0xAA00 && ch
<= 0xAA28)
7973 || (ch
>= 0xAA50 && ch
<= 0xAA59) /* CHAM DIGIT ZERO..NINE */
7974 || (ch
>= 0x11066 && ch
<= 0x1106F)
7976 || (ch
>= 0x1135E && ch
<= 0x1135F)
7977 || (ch
>= 0x11950 && ch
<= 0x11959) /* DIVES AKURU DIGIT ZERO..NINE */
7978 || (ch
>= 0x11EE0 && ch
<= 0x11EF1)
7979 || (ch
>= 0x11F50 && ch
<= 0x11F59)
7980 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7981 || (ch
>= 0x11380 && ch
<= 0x11389)
7986 || (ch
>= 0x16100 && ch
<= 0x1611D)
7987 || (ch
>= 0x16130 && ch
<= 0x16139) /* GURUNG KHEMA DIGIT ZERO..NINE */)
7988 attr
|= (int64_t) 1 << LBP_AS
;
7990 /* Brahmic scripts: conjoining viramas */
7996 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7999 attr
|= (int64_t) 1 << LBP_VI
;
8001 /* Brahmic scripts: viramas for final consonants */
8002 if (ch
== 0x1BF2 || ch
== 0x1BF3)
8003 attr
|= (int64_t) 1 << LBP_VF
;
8005 if (is_property_regional_indicator (ch
))
8006 attr
|= (int64_t) 1 << LBP_RI
;
8008 /* complex context (South East Asian) */
8009 if (((unicode_attributes
[ch
].category
[0] == 'C'
8010 && unicode_attributes
[ch
].category
[1] == 'f')
8011 || (unicode_attributes
[ch
].category
[0] == 'L'
8012 && (unicode_attributes
[ch
].category
[1] == 'm'
8013 || unicode_attributes
[ch
].category
[1] == 'o'))
8014 || (unicode_attributes
[ch
].category
[0] == 'M'
8015 && (unicode_attributes
[ch
].category
[1] == 'c'
8016 || unicode_attributes
[ch
].category
[1] == 'n')
8017 && ch
!= 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */)
8018 /* Extra characters for compatibility with Unicode LineBreak.txt. */
8019 || ch
== 0x109E /* MYANMAR SYMBOL SHAN ONE */
8020 || ch
== 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
8021 || ch
== 0x19DE /* NEW TAI LUE SIGN LAE */
8022 || ch
== 0x19DF /* NEW TAI LUE SIGN LAEV */
8023 || (ch
>= 0x1AA0 && ch
<= 0x1AAD) /* TAI THAM SIGN */
8024 || (ch
>= 0xA9E0 && ch
<= 0xA9EF) /* Myanmar */
8025 || (ch
>= 0xA9FA && ch
<= 0xA9FE) /* Myanmar */
8026 || (ch
>= 0xAA77 && ch
<= 0xAA79) /* MYANMAR SYMBOL AITON */
8027 || (ch
>= 0xAADE && ch
<= 0xAADF) /* TAI VIET SYMBOL */
8028 || (ch
>= 0x1173A && ch
<= 0x1173B) /* Ahom */
8029 || (ch
>= 0x1173F && ch
<= 0x11746) /* Ahom */)
8030 && ((ch
>= 0x0E00 && ch
<= 0x0EFF) /* Thai, Lao */
8031 || (ch
>= 0x1000 && ch
<= 0x109F) /* Myanmar */
8032 || (ch
>= 0x1780 && ch
<= 0x17FF) /* Khmer */
8033 || (ch
>= 0x1950 && ch
<= 0x19DF) /* Tai Le, New Tai Lue */
8034 || (ch
>= 0x1A20 && ch
<= 0x1AAF) /* Tai Tham */
8035 || (ch
>= 0xA9E0 && ch
<= 0xA9EF) /* Myanmar */
8036 || (ch
>= 0xA9FA && ch
<= 0xA9FE) /* Myanmar */
8037 || (ch
>= 0xAA60 && ch
<= 0xAADF) /* Myanmar Extended-A, Tai Viet */
8038 || (ch
>= 0x11700 && ch
<= 0x1171A) /* Ahom */
8039 || (ch
>= 0x1171D && ch
<= 0x1172B) /* Ahom */
8040 || (ch
>= 0x1173A && ch
<= 0x1173B) /* Ahom */
8041 || (ch
>= 0x1173F && ch
<= 0x11746) /* Ahom */))
8042 attr
|= (int64_t) 1 << LBP_SA
;
8044 /* attached characters and combining marks */
8045 if ((unicode_attributes
[ch
].category
[0] == 'M'
8046 && (unicode_attributes
[ch
].category
[1] == 'c'
8047 || unicode_attributes
[ch
].category
[1] == 'e'
8048 || unicode_attributes
[ch
].category
[1] == 'n')
8049 && ch
!= 0x1BF2 /* BATAK PANGOLAT */
8050 && ch
!= 0x1BF3 /* BATAK PANONGONAN */)
8051 || (unicode_attributes
[ch
].category
[0] == 'C'
8052 && (unicode_attributes
[ch
].category
[1] == 'c'
8053 || unicode_attributes
[ch
].category
[1] == 'f')
8054 && ch
!= 0x0600 /* ARABIC NUMBER SIGN */
8055 && ch
!= 0x0601 /* ARABIC SIGN SANAH */
8056 && ch
!= 0x0602 /* ARABIC FOOTNOTE MARKER */
8057 && ch
!= 0x0603 /* ARABIC SIGN SAFHA */
8058 && ch
!= 0x0604 /* ARABIC SIGN SAMVAT */
8059 && ch
!= 0x0605 /* ARABIC NUMBER MARK ABOVE */
8060 && ch
!= 0x06DD /* ARABIC END OF AYAH */
8061 && ch
!= 0x0890 /* ARABIC POUND MARK ABOVE */
8062 && ch
!= 0x0891 /* ARABIC PIASTRE MARK ABOVE */
8063 && ch
!= 0x08E2 /* ARABIC DISPUTED END OF AYAH */
8064 && ch
!= 0x110BD /* KAITHI NUMBER SIGN */
8065 && ch
!= 0x110CD /* KAITHI NUMBER SIGN ABOVE */
8066 && ch
!= 0x13437 /* EGYPTIAN HIEROGLYPH BEGIN SEGMENT */
8067 && ch
!= 0x13438 /* EGYPTIAN HIEROGLYPH END SEGMENT */
8068 && ch
!= 0x1343C /* EGYPTIAN HIEROGLYPH BEGIN ENCLOSURE */
8069 && ch
!= 0x1343D /* EGYPTIAN HIEROGLYPH END ENCLOSURE */
8070 && ch
!= 0x1343E /* EGYPTIAN HIEROGLYPH BEGIN WALLED ENCLOSURE */
8071 && ch
!= 0x1343F /* EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE */)
8072 || ch
== 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */)
8073 if (!(attr
& (((int64_t) 1 << LBP_BK
) | ((int64_t) 1 << LBP_CR
) | ((int64_t) 1 << LBP_LF
) | ((int64_t) 1 << LBP_BA
) | ((int64_t) 1 << LBP_GL
) | ((int64_t) 1 << LBP_VI
) | ((int64_t) 1 << LBP_SA
) | ((int64_t) 1 << LBP_WJ
) | ((int64_t) 1 << LBP_ZW
) | ((int64_t) 1 << LBP_ZWJ
))))
8074 attr
|= (int64_t) 1 << LBP_CM
;
8077 if ((ch
>= 0x2E80 && ch
<= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
8078 || (ch
>= 0x3040 && ch
<= 0x309F) /* HIRAGANA */
8079 || (ch
>= 0x30A0 && ch
<= 0x30FF) /* KATAKANA */
8080 || (ch
>= 0x3400 && ch
<= 0x4DBF) /* CJK Ideograph Extension A */
8081 || (ch
>= 0x4E00 && ch
<= 0x9FFF) /* CJK Ideograph */
8082 || (ch
>= 0xF900 && ch
<= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
8083 /* Extra characters for compatibility with Unicode LineBreak.txt. */
8084 || strstr (unicode_attributes
[ch
].name
, "FULLWIDTH LATIN ") != NULL
8085 || ch
== 0x1B5C /* BALINESE WINDU */
8086 || (ch
>= 0x1B61 && ch
<= 0x1B6A) /* BALINESE MUSICAL SYMBOL DONG..BALINESE MUSICAL SYMBOL DANG GEDE */
8087 || (ch
>= 0x1B74 && ch
<= 0x1B7C) /* BALINESE MUSICAL SYMBOL RIGHT-HAND OPEN DUG..BALINESE MUSICAL SYMBOL LEFT-HAND OPEN PING */
8088 || ch
== 0x231A /* WATCH */
8089 || ch
== 0x231B /* HOURGLASS */
8090 || ch
== 0x23F0 /* ALARM CLOCK */
8091 || ch
== 0x23F1 /* STOPWATCH */
8092 || ch
== 0x23F2 /* TIMER CLOCK */
8093 || ch
== 0x23F3 /* HOURGLASS WITH FLOWING SAND */
8094 || ch
== 0x2600 /* BLACK SUN WITH RAYS */
8095 || ch
== 0x2601 /* CLOUD */
8096 || ch
== 0x2602 /* UMBRELLA */
8097 || ch
== 0x2603 /* SNOWMAN */
8098 || ch
== 0x2614 /* UMBRELLA WITH RAIN DROPS */
8099 || ch
== 0x2615 /* HOT BEVERAGE */
8100 || ch
== 0x2618 /* SHAMROCK */
8101 || ch
== 0x261A /* BLACK LEFT POINTING INDEX */
8102 || ch
== 0x261B /* BLACK RIGHT POINTING INDEX */
8103 || ch
== 0x261C /* WHITE LEFT POINTING INDEX */
8104 || ch
== 0x261D /* WHITE UP POINTING INDEX */
8105 || ch
== 0x261E /* WHITE RIGHT POINTING INDEX */
8106 || ch
== 0x261F /* WHITE DOWN POINTING INDEX */
8107 || ch
== 0x2639 /* WHITE FROWNING FACE */
8108 || ch
== 0x263A /* WHITE SMILING FACE */
8109 || ch
== 0x263B /* BLACK SMILING FACE */
8110 || ch
== 0x2668 /* HOT SPRINGS */
8111 || ch
== 0x267F /* WHEELCHAIR SYMBOL */
8112 || ch
== 0x26BD /* SOCCER BALL */
8113 || ch
== 0x26BE /* BASEBALL */
8114 || ch
== 0x26BF /* SQUARED KEY */
8115 || ch
== 0x26C0 /* WHITE DRAUGHTS MAN */
8116 || ch
== 0x26C1 /* WHITE DRAUGHTS KING */
8117 || ch
== 0x26C2 /* BLACK DRAUGHTS MAN */
8118 || ch
== 0x26C3 /* BLACK DRAUGHTS KING */
8119 || ch
== 0x26C4 /* SNOWMAN WITHOUT SNOW */
8120 || ch
== 0x26C5 /* SUN BEHIND CLOUD */
8121 || ch
== 0x26C6 /* RAIN */
8122 || ch
== 0x26C7 /* BLACK SNOWMAN */
8123 || ch
== 0x26C8 /* THUNDER CLOUD AND RAIN */
8124 || ch
== 0x26CD /* DISABLED CAR */
8125 || ch
== 0x26CF /* PICK */
8126 || ch
== 0x26D0 /* CAR SLIDING */
8127 || ch
== 0x26D1 /* HELMET WITH WHITE CROSS */
8128 || ch
== 0x26D3 /* CHAINS */
8129 || ch
== 0x26D4 /* NO ENTRY */
8130 || ch
== 0x26D8 /* BLACK LEFT LANE MERGE */
8131 || ch
== 0x26D9 /* WHITE LEFT LANE MERGE */
8132 || ch
== 0x26DC /* LEFT CLOSED ENTRY */
8133 || ch
== 0x26DF /* BLACK TRUCK */
8134 || ch
== 0x26E0 /* RESTRICTED LEFT ENTRY-1 */
8135 || ch
== 0x26E1 /* RESTRICTED LEFT ENTRY-2 */
8136 || ch
== 0x26EA /* CHURCH */
8137 || ch
== 0x26F1 /* UMBRELLA ON GROUND */
8138 || ch
== 0x26F2 /* FOUNTAIN */
8139 || ch
== 0x26F3 /* FLAG IN HOLE */
8140 || ch
== 0x26F4 /* FERRY */
8141 || ch
== 0x26F5 /* SAILBOAT */
8142 || ch
== 0x26F7 /* SKIER */
8143 || ch
== 0x26F8 /* ICE SKATE */
8144 || ch
== 0x26F9 /* PERSON WITH BALL */
8145 || ch
== 0x26FA /* TENT */
8146 || ch
== 0x26FD /* FUEL PUMP */
8147 || ch
== 0x26FE /* CUP ON BLACK SQUARE */
8148 || ch
== 0x26FF /* WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE */
8149 || ch
== 0x2700 /* BLACK SAFETY SCISSORS */
8150 || ch
== 0x2701 /* UPPER BLADE SCISSORS */
8151 || ch
== 0x2702 /* BLACK SCISSORS */
8152 || ch
== 0x2703 /* LOWER BLADE SCISSORS */
8153 || ch
== 0x2704 /* WHITE SCISSORS */
8154 || ch
== 0x2708 /* AIRPLANE */
8155 || ch
== 0x2709 /* ENVELOPE */
8156 || ch
== 0x270A /* RAISED FIST */
8157 || ch
== 0x270B /* RAISED HAND */
8158 || ch
== 0x270C /* VICTORY HAND */
8159 || ch
== 0x270D /* WRITING HAND */
8160 || ch
== 0x2764 /* HEAVY BLACK HEART */
8161 || (ch
>= 0x3000 && ch
<= 0x33FF
8162 && !(attr
& (((int64_t) 1 << LBP_BA
) | ((int64_t) 1 << LBP_CM
) | ((int64_t) 1 << LBP_NS
) | ((int64_t) 1 << LBP_OP1
) | ((int64_t) 1 << LBP_OP2
) | ((int64_t) 1 << LBP_CL
) | ((int64_t) 1 << LBP_CP1
) | ((int64_t) 1 << LBP_CP2
))))
8163 || (ch
>= 0xA000 && ch
<= 0xA48F) /* YI SYLLABLE */
8164 || (ch
>= 0xA490 && ch
<= 0xA4CF) /* YI RADICAL */
8165 || (ch
>= 0xA9C1 && ch
<= 0xA9C6) /* JAVANESE LEFT RERENGGAN..JAVANESE PADA WINDU */
8166 || (ch
>= 0xA9CA && ch
<= 0xA9CD) /* JAVANESE PADA ADEG..JAVANESE TURNED PADA PISELEH */
8167 || ch
== 0xA9DE /* JAVANESE PADA TIRTA TUMETES */
8168 || ch
== 0xA9DF /* JAVANESE PADA ISEN-ISEN */
8169 || ch
== 0xAA5C /* CHAM PUNCTUATION SPIRAL */
8170 || ch
== 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
8171 || ch
== 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
8172 || ch
== 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
8173 || ch
== 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
8174 || ch
== 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
8175 || ch
== 0xFE45 /* SESAME DOT */
8176 || ch
== 0xFE46 /* WHITE SESAME DOT */
8177 || ch
== 0xFE49 /* DASHED OVERLINE */
8178 || ch
== 0xFE4A /* CENTRELINE OVERLINE */
8179 || ch
== 0xFE4B /* WAVY OVERLINE */
8180 || ch
== 0xFE4C /* DOUBLE WAVY OVERLINE */
8181 || ch
== 0xFE4D /* DASHED LOW LINE */
8182 || ch
== 0xFE4E /* CENTRELINE LOW LINE */
8183 || ch
== 0xFE4F /* WAVY LOW LINE */
8184 || ch
== 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
8185 || ch
== 0xFE58 /* SMALL EM DASH */
8186 || ch
== 0xFE5F /* SMALL NUMBER SIGN */
8187 || ch
== 0xFE60 /* SMALL AMPERSAND */
8188 || ch
== 0xFE61 /* SMALL ASTERISK */
8189 || ch
== 0xFE62 /* SMALL PLUS SIGN */
8190 || ch
== 0xFE63 /* SMALL HYPHEN-MINUS */
8191 || ch
== 0xFE64 /* SMALL LESS-THAN SIGN */
8192 || ch
== 0xFE65 /* SMALL GREATER-THAN SIGN */
8193 || ch
== 0xFE66 /* SMALL EQUALS SIGN */
8194 || ch
== 0xFE68 /* SMALL REVERSE SOLIDUS */
8195 || ch
== 0xFE6B /* SMALL COMMERCIAL AT */
8196 || ch
== 0xFF02 /* FULLWIDTH QUOTATION MARK */
8197 || ch
== 0xFF03 /* FULLWIDTH NUMBER SIGN */
8198 || ch
== 0xFF06 /* FULLWIDTH AMPERSAND */
8199 || ch
== 0xFF07 /* FULLWIDTH APOSTROPHE */
8200 || ch
== 0xFF0A /* FULLWIDTH ASTERISK */
8201 || ch
== 0xFF0B /* FULLWIDTH PLUS SIGN */
8202 || ch
== 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
8203 || ch
== 0xFF0F /* FULLWIDTH SOLIDUS */
8204 || (ch
>= 0xFF10 && ch
<= 0xFF19) /* FULLWIDTH DIGIT */
8205 || ch
== 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
8206 || ch
== 0xFF1D /* FULLWIDTH EQUALS SIGN */
8207 || ch
== 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
8208 || ch
== 0xFF20 /* FULLWIDTH COMMERCIAL AT */
8209 || ch
== 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
8210 || ch
== 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
8211 || ch
== 0xFF3F /* FULLWIDTH LOW LINE */
8212 || ch
== 0xFF40 /* FULLWIDTH GRAVE ACCENT */
8213 || ch
== 0xFF5C /* FULLWIDTH VERTICAL LINE */
8214 || ch
== 0xFF5E /* FULLWIDTH TILDE */
8215 || ch
== 0xFF66 /* Halfwidth Katakana */
8216 || (ch
>= 0xFF71 && ch
<= 0xFF9D) /* Halfwidth Katakana */
8217 || (ch
>= 0xFFA0 && ch
<= 0xFFBE) /* Halfwidth Hangul */
8218 || (ch
>= 0xFFC2 && ch
<= 0xFFC7) /* Halfwidth Hangul */
8219 || (ch
>= 0xFFCA && ch
<= 0xFFCF) /* Halfwidth Hangul */
8220 || (ch
>= 0xFFD2 && ch
<= 0xFFD7) /* Halfwidth Hangul */
8221 || (ch
>= 0xFFDA && ch
<= 0xFFDC) /* Halfwidth Hangul */
8222 || ch
== 0xFFE2 /* FULLWIDTH NOT SIGN */
8223 || ch
== 0xFFE3 /* FULLWIDTH MACRON */
8224 || ch
== 0xFFE4 /* FULLWIDTH BROKEN BAR */
8225 || (ch
>= 0x11049 && ch
<= 0x1104D) /* BRAHMI PUNCTUATION DOT..BRAHMI PUNCTUATION LOTUS */
8226 || (ch
>= 0x11052 && ch
<= 0x11065) /* BRAHMI NUMBER ONE..BRAHMI NUMBER ONE THOUSAND */
8227 || ch
== 0x113B7 /* TULU-TIGALARI SIGN AVAGRAHA */
8228 || ch
== 0x113D3 /* TULU-TIGALARI SIGN PLUTA */
8229 || ch
== 0x113D4 /* TULU-TIGALARI DANDA */
8230 || ch
== 0x113D5 /* TULU-TIGALARI DOUBLE DANDA */
8231 || ch
== 0x113D7 /* TULU-TIGALARI SIGN OM PUSHPIKA */
8232 || ch
== 0x113D8 /* TULU-TIGALARI SIGN SHRII PUSHPIKA */
8233 || (ch
>= 0x11F45 && ch
<= 0x11F4F) /* Kawi Punctuation */
8234 || (ch
>= 0x17000 && ch
<= 0x187F7) /* Tangut Ideograph */
8235 || (ch
>= 0x18800 && ch
<= 0x18AFF) /* Tangut Ideograph */
8236 || (ch
>= 0x18D00 && ch
<= 0x18D08) /* Tangut Ideograph Supplement */
8237 || (ch
>= 0x1B000 && ch
<= 0x1B001) /* Kana Supplement */
8238 || (ch
>= 0x1B002 && ch
<= 0x1B122) /* Hentaigana, archaic Hiragana/Katakana */
8239 || (ch
>= 0x1B170 && ch
<= 0x1B2FB) /* Nushu */
8240 || (ch
>= 0x1F000 && ch
<= 0x1F02B) /* Mahjong Tiles */
8241 || (ch
>= 0x1F030 && ch
<= 0x1F093) /* Domino Tiles */
8242 || (ch
>= 0x1F0A0 && ch
<= 0x1F0F5) /* Playing Cards */
8243 || (ch
>= 0x1F200 && ch
<= 0x1F248) /* Enclosed Ideographic Supplement */
8244 || (ch
>= 0x1F250 && ch
<= 0x1F251) /* Enclosed Ideographic Supplement */
8245 || (ch
>= 0x1F260 && ch
<= 0x1F265) /* Rounded Symbols */
8246 || (ch
>= 0x1F300 && ch
<= 0x1F5FF /* Miscellaneous Symbols and Pictographs */
8247 && ch
!= 0x1F3B5 && ch
!= 0x1F3B6 && ch
!= 0x1F3BC
8248 && ch
!= 0x1F4A0 && ch
!= 0x1F4A2 && ch
!= 0x1F4A4
8249 && ch
!= 0x1F4AF && ch
!= 0x1F4B1 && ch
!= 0x1F4B2
8250 && !(ch
>= 0x1F39C && ch
<= 0x1F39D)
8251 && !(ch
>= 0x1F3FB && ch
<= 0x1F3FF)
8252 && !(ch
>= 0x1F500 && ch
<= 0x1F506)
8253 && !(ch
>= 0x1F517 && ch
<= 0x1F524)
8254 && !(ch
>= 0x1F532 && ch
<= 0x1F549)
8255 && !(ch
>= 0x1F5D4 && ch
<= 0x1F5DB)
8256 && !(ch
>= 0x1F5F4 && ch
<= 0x1F5F9))
8257 || (ch
>= 0x1F600 && ch
<= 0x1F64F) /* Emoticons */
8258 || (ch
>= 0x1F680 && ch
<= 0x1F6DF) /* Transport and Map Symbols */
8259 || (ch
>= 0x1F6E0 && ch
<= 0x1F6EC) /* Transport and Map Symbols */
8260 || (ch
>= 0x1F6F0 && ch
<= 0x1F6FC) /* Transport and Map Symbols */
8261 || ch
== 0x1F774 /* LOT OF FORTUNE */
8262 || ch
== 0x1F775 /* OCCULTATION */
8263 || ch
== 0x1F776 /* LUNAR ECLIPSE */
8264 || ch
== 0x1F77B /* HAUMEA */
8265 || ch
== 0x1F77C /* MAKEMAKE */
8266 || ch
== 0x1F77D /* GONGGONG */
8267 || ch
== 0x1F77E /* QUAOAR */
8268 || ch
== 0x1F77F /* ORCUS */
8269 || (ch
>= 0x1F7D5 && ch
<= 0x1F7D8) /* Circled polygons */
8270 || ch
== 0x1F7D9 /* NINE POINTED WHITE STAR */
8271 || (ch
>= 0x1F7E0 && ch
<= 0x1F7EB) /* Large circles */
8272 || ch
== 0x1F7F0 /* Heavy equals sign */
8273 || (ch
>= 0x1F90C && ch
<= 0x1F9FF) /* Supplemental Symbols and Pictographs */
8274 || (ch
>= 0x1FA60 && ch
<= 0x1FA6D) /* Xiangqi pieces */
8275 || (ch
>= 0x1FA70 && ch
<= 0x1FA74) /* Emoticons */
8276 || (ch
>= 0x1FA75 && ch
<= 0x1FA77) /* Colored heart symbols */
8277 || (ch
>= 0x1FA78 && ch
<= 0x1FA7C) /* Medical pictographs */
8278 || (ch
>= 0x1FA80 && ch
<= 0x1FA89) /* Pictographs */
8279 || (ch
>= 0x1FA8F && ch
<= 0x1FABD) /* Pictographs */
8280 || (ch
>= 0x1FABE && ch
<= 0x1FAC2) /* Pictographs */
8281 || ch
== 0x1FAC6 /* Pictographs */
8282 || (ch
>= 0x1FACE && ch
<= 0x1FADC) /* Pictographs */
8283 || (ch
>= 0x1FADF && ch
<= 0x1FAE9) /* Pictographs */
8284 || (ch
>= 0x1FAF7 && ch
<= 0x1FAF8) /* Pictographs */
8285 || (ch
>= 0x20000 && ch
<= 0x2A6D6) /* CJK Ideograph Extension B */
8286 || (ch
>= 0x2A6D7 && ch
<= 0x2A6DF) /* CJK Ideograph Extension B */
8287 || (ch
>= 0x2A700 && ch
<= 0x2B739) /* CJK Ideograph Extension C */
8288 || (ch
>= 0x2B740 && ch
<= 0x2B81D) /* CJK Ideograph Extension D */
8289 || (ch
>= 0x2B820 && ch
<= 0x2CEAF) /* CJK Ideograph Extension E */
8290 || (ch
>= 0x2CEB0 && ch
<= 0x2EBE0) /* CJK Ideograph Extension F */
8291 || (ch
>= 0x2EBF0 && ch
<= 0x2EE5D) /* CJK Ideograph Extension I */
8292 || (ch
>= 0x2F800 && ch
<= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
8293 || (ch
>= 0x30000 && ch
<= 0x3134A) /* CJK Ideograph Extension G */
8294 || (ch
>= 0x31350 && ch
<= 0x323AF) /* CJK Ideograph Extension H */)
8295 if (!(attr
& (((int64_t) 1 << LBP_NS
) | ((int64_t) 1 << LBP_CM
) | ((int64_t) 1 << LBP_EB
))))
8297 /* ambiguous (ideograph) ? */
8298 if ((unicode_width
[ch
] != NULL
8299 && unicode_width
[ch
][0] == 'A'
8308 && !(ch
>= 0x26C4 && ch
<= 0x26C8)
8322 && !(ch
>= 0x26F1 && ch
<= 0x26F5)
8323 && !(ch
>= 0x26F7 && ch
<= 0x26FA)
8324 && !(ch
>= 0x26FD && ch
<= 0x26FF))
8325 || ch
== 0x24EA /* CIRCLED DIGIT ZERO */
8326 || (ch
>= 0x2780 && ch
<= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
8327 || (ch
>= 0x3248 && ch
<= 0x324F) /* CIRCLED NUMBER TEN ON BLACK SQUARE..CIRCLED NUMBER EIGHTY ON BLACK SQUARE */)
8328 attr
|= (int64_t) 1 << LBP_AI
;
8330 attr
|= (int64_t) 1 << LBP_ID1
;
8333 /* ordinary alphabetic and symbol characters */
8334 if ((unicode_attributes
[ch
].category
[0] == 'L'
8335 && (unicode_attributes
[ch
].category
[1] == 'u'
8336 || unicode_attributes
[ch
].category
[1] == 'l'
8337 || unicode_attributes
[ch
].category
[1] == 't'
8338 || unicode_attributes
[ch
].category
[1] == 'm'
8339 || unicode_attributes
[ch
].category
[1] == 'o'))
8340 || (unicode_attributes
[ch
].category
[0] == 'S'
8341 && (unicode_attributes
[ch
].category
[1] == 'm'
8342 || unicode_attributes
[ch
].category
[1] == 'k'
8343 || unicode_attributes
[ch
].category
[1] == 'o'))
8344 || (unicode_attributes
[ch
].category
[0] == 'N'
8345 && (unicode_attributes
[ch
].category
[1] == 'l'
8346 || unicode_attributes
[ch
].category
[1] == 'o'))
8347 || (unicode_attributes
[ch
].category
[0] == 'P'
8348 && (unicode_attributes
[ch
].category
[1] == 'c'
8349 || unicode_attributes
[ch
].category
[1] == 'd'
8350 || unicode_attributes
[ch
].category
[1] == 'o'))
8351 || ch
== 0x070F /* SYRIAC ABBREVIATION MARK */
8352 || ch
== 0x2061 /* FUNCTION APPLICATION */
8353 || ch
== 0x2062 /* INVISIBLE TIMES */
8354 || ch
== 0x2063 /* INVISIBLE SEPARATOR */
8355 || ch
== 0x2064 /* INVISIBLE PLUS */
8356 /* Extra characters for compatibility with Unicode LineBreak.txt. */
8357 || ch
== 0x08E2 /* ARABIC DISPUTED END OF AYAH */)
8358 if (!(attr
& (((int64_t) 1 << LBP_GL
) | ((int64_t) 1 << LBP_B2
) | ((int64_t) 1 << LBP_BA
) | ((int64_t) 1 << LBP_BB
) | ((int64_t) 1 << LBP_HY
) | ((int64_t) 1 << LBP_CB
) | ((int64_t) 1 << LBP_CL
) | ((int64_t) 1 << LBP_CP1
) | ((int64_t) 1 << LBP_CP2
) | ((int64_t) 1 << LBP_EX
) | ((int64_t) 1 << LBP_IN
) | ((int64_t) 1 << LBP_NS
) | ((int64_t) 1 << LBP_OP1
) | ((int64_t) 1 << LBP_OP2
) | ((int64_t) 1 << LBP_QU1
) | ((int64_t) 1 << LBP_QU2
) | ((int64_t) 1 << LBP_QU3
) | ((int64_t) 1 << LBP_IS
) | ((int64_t) 1 << LBP_NU
) | ((int64_t) 1 << LBP_PO
) | ((int64_t) 1 << LBP_PR
) | ((int64_t) 1 << LBP_SY
) | ((int64_t) 1 << LBP_H2
) | ((int64_t) 1 << LBP_H3
) | ((int64_t) 1 << LBP_HL
) | ((int64_t) 1 << LBP_JL
) | ((int64_t) 1 << LBP_JV
) | ((int64_t) 1 << LBP_JT
) | ((int64_t) 1 << LBP_AP
) | ((int64_t) 1 << LBP_AK
) | ((int64_t) 1 << LBP_AS
) | ((int64_t) 1 << LBP_VI
) | ((int64_t) 1 << LBP_VF
) | ((int64_t) 1 << LBP_RI
) | ((int64_t) 1 << LBP_SA
) | ((int64_t) 1 << LBP_ID1
) | ((int64_t) 1 << LBP_ID2
) | ((int64_t) 1 << LBP_EB
) | ((int64_t) 1 << LBP_EM
)))
8359 && ch
!= 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */
8360 && !(ch
>= 0x3248 && ch
<= 0x324F) /* CIRCLED NUMBER TEN ON BLACK SQUARE..CIRCLED NUMBER EIGHTY ON BLACK SQUARE */)
8362 /* ambiguous (alphabetic) ? */
8363 if ((unicode_width
[ch
] != NULL
8364 && unicode_width
[ch
][0] == 'A'
8366 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
8367 && ch
!= 0x2022 /* BULLET */
8368 && ch
!= 0x203E /* OVERLINE */
8369 && ch
!= 0x2126 /* OHM SIGN */
8370 && ch
!= 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
8371 && ch
!= 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
8372 && ch
!= 0x21E7 /* UPWARDS WHITE ARROW */
8373 && ch
!= 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
8374 && ch
!= 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
8375 || ch
== 0x00A7 /* SECTION SIGN */
8376 || ch
== 0x00A8 /* DIAERESIS */
8377 || ch
== 0x00AA /* FEMININE ORDINAL INDICATOR */
8378 || ch
== 0x00B2 /* SUPERSCRIPT TWO */
8379 || ch
== 0x00B3 /* SUPERSCRIPT THREE */
8380 || ch
== 0x00B6 /* PILCROW SIGN */
8381 || ch
== 0x00B7 /* MIDDLE DOT */
8382 || ch
== 0x00B8 /* CEDILLA */
8383 || ch
== 0x00B9 /* SUPERSCRIPT ONE */
8384 || ch
== 0x00BA /* MASCULINE ORDINAL INDICATOR */
8385 || ch
== 0x00BC /* VULGAR FRACTION ONE QUARTER */
8386 || ch
== 0x00BD /* VULGAR FRACTION ONE HALF */
8387 || ch
== 0x00BE /* VULGAR FRACTION THREE QUARTERS */
8388 || ch
== 0x00D7 /* MULTIPLICATION SIGN */
8389 || ch
== 0x00F7 /* DIVISION SIGN */
8390 || ch
== 0x02C7 /* CARON */
8391 || ch
== 0x02C9 /* MODIFIER LETTER MACRON */
8392 || ch
== 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
8393 || ch
== 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
8394 || ch
== 0x02CD /* MODIFIER LETTER LOW MACRON */
8395 || ch
== 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
8396 || ch
== 0x02D8 /* BREVE */
8397 || ch
== 0x02D9 /* DOT ABOVE */
8398 || ch
== 0x02DA /* RING ABOVE */
8399 || ch
== 0x02DB /* OGONEK */
8400 || ch
== 0x02DD /* DOUBLE ACUTE ACCENT */
8401 || ch
== 0x24EA /* CIRCLED DIGIT ZERO */
8402 || (ch
>= 0x2780 && ch
<= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
8403 /* Extra characters for compatibility with Unicode LineBreak.txt. */
8404 || ch
== 0x2015 /* HORIZONTAL BAR */
8405 || ch
== 0x2016 /* DOUBLE VERTICAL LINE */
8406 || ch
== 0x2020 /* DAGGER */
8407 || ch
== 0x2021 /* DOUBLE DAGGER */
8408 || ch
== 0x203B /* REFERENCE MARK */
8409 || ch
== 0x2074 /* SUPERSCRIPT FOUR */
8410 || ch
== 0x207F /* SUPERSCRIPT LATIN SMALL LETTER N */
8411 || (ch
>= 0x2081 && ch
<= 0x2084) /* SUBSCRIPT ONE..FOUR */
8412 || ch
== 0x2105 /* CARE OF */
8413 || ch
== 0x2113 /* SCRIPT SMALL L */
8414 || ch
== 0x2121 /* TELEPHONE SIGN */
8415 || ch
== 0x2122 /* TRADE MARK SIGN */
8416 || ch
== 0x212B /* ANGSTROM SIGN */
8417 || ch
== 0x2150 /* VULGAR FRACTION ONE SEVENTH */
8418 || ch
== 0x2151 /* VULGAR FRACTION ONE NINTH */
8419 || ch
== 0x2152 /* VULGAR FRACTION ONE TENTH */
8420 || ch
== 0x2153 /* VULGAR FRACTION ONE THIRD */
8421 || ch
== 0x2154 /* VULGAR FRACTION TWO THIRDS */
8422 || ch
== 0x2155 /* VULGAR FRACTION ONE FIFTH */
8423 || ch
== 0x2156 /* VULGAR FRACTION TWO FIFTHS */
8424 || ch
== 0x2157 /* VULGAR FRACTION THREE FIFTHS */
8425 || ch
== 0x2158 /* VULGAR FRACTION FOUR FIFTHS */
8426 || ch
== 0x2159 /* VULGAR FRACTION ONE SIXTH */
8427 || ch
== 0x215A /* VULGAR FRACTION FIVE SIXTHS */
8428 || ch
== 0x215B /* VULGAR FRACTION ONE EIGHTH */
8429 || ch
== 0x215C /* VULGAR FRACTION THREE EIGHTHS */
8430 || ch
== 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
8431 || ch
== 0x215E /* VULGAR FRACTION SEVEN EIGHTHS */
8432 || (ch
>= 0x2160 && ch
<= 0x216B) /* ROMAN NUMERAL ONE..TWELVE */
8433 || (ch
>= 0x2170 && ch
<= 0x2179) /* SMALL ROMAN NUMERAL ONE..TEN */
8434 || ch
== 0x2189 /* VULGAR FRACTION ZERO THIRDS */
8435 || (ch
>= 0x2190 && ch
<= 0x2199) /* LEFTWARDS ARROW..SOUTH WEST ARROW */
8436 || ch
== 0x21D2 /* RIGHTWARDS DOUBLE ARROW */
8437 || ch
== 0x21D4 /* LEFT RIGHT DOUBLE ARROW */
8438 || ch
== 0x2200 /* FOR ALL */
8439 || ch
== 0x2202 /* PARTIAL DIFFERENTIAL */
8440 || ch
== 0x2203 /* THERE EXISTS */
8441 || ch
== 0x2207 /* NABLA */
8442 || ch
== 0x2208 /* ELEMENT OF */
8443 || ch
== 0x220B /* CONTAINS AS MEMBER */
8444 || ch
== 0x220F /* N-ARY PRODUCT */
8445 || ch
== 0x2211 /* N-ARY SUMMATION */
8446 || ch
== 0x2215 /* DIVISION SLASH */
8447 || ch
== 0x221A /* SQUARE ROOT */
8448 || ch
== 0x221D /* PROPORTIONAL TO */
8449 || ch
== 0x221E /* INFINITY */
8450 || ch
== 0x221F /* RIGHT ANGLE */
8451 || ch
== 0x2220 /* ANGLE */
8452 || ch
== 0x2223 /* DIVIDES */
8453 || ch
== 0x2225 /* PARALLEL TO */
8454 || ch
== 0x2227 /* LOGICAL AND */
8455 || ch
== 0x2228 /* LOGICAL OR */
8456 || ch
== 0x2229 /* INTERSECTION */
8457 || ch
== 0x222A /* UNION */
8458 || ch
== 0x222B /* INTEGRAL */
8459 || ch
== 0x222C /* DOUBLE INTEGRAL */
8460 || ch
== 0x222E /* CONTOUR INTEGRAL */
8461 || ch
== 0x2234 /* THEREFORE */
8462 || ch
== 0x2235 /* BECAUSE */
8463 || ch
== 0x2236 /* RATIO */
8464 || ch
== 0x2237 /* PROPORTION */
8465 || ch
== 0x223C /* TILDE OPERATOR */
8466 || ch
== 0x223D /* REVERSED TILDE */
8467 || ch
== 0x2248 /* ALMOST EQUAL TO */
8468 || ch
== 0x224C /* ALL EQUAL TO */
8469 || ch
== 0x2252 /* APPROXIMATELY EQUAL TO OR THE IMAGE OF */
8470 || ch
== 0x2260 /* NOT EQUAL TO */
8471 || ch
== 0x2261 /* IDENTICAL TO */
8472 || ch
== 0x2264 /* LESS-THAN OR EQUAL TO */
8473 || ch
== 0x2265 /* GREATER-THAN OR EQUAL TO */
8474 || ch
== 0x2266 /* LESS-THAN OVER EQUAL TO */
8475 || ch
== 0x2267 /* GREATER-THAN OVER EQUAL TO */
8476 || ch
== 0x226A /* MUCH LESS-THAN */
8477 || ch
== 0x226B /* MUCH GREATER-THAN */
8478 || ch
== 0x226E /* NOT LESS-THAN */
8479 || ch
== 0x226F /* NOT GREATER-THAN */
8480 || ch
== 0x2282 /* SUBSET OF */
8481 || ch
== 0x2283 /* SUPERSET OF */
8482 || ch
== 0x2286 /* SUBSET OF OR EQUAL TO */
8483 || ch
== 0x2287 /* SUPERSET OF OR EQUAL TO */
8484 || ch
== 0x2295 /* CIRCLED PLUS */
8485 || ch
== 0x2299 /* CIRCLED DOT OPERATOR */
8486 || ch
== 0x22A5 /* UP TACK */
8487 || ch
== 0x22BF /* RIGHT TRIANGLE */
8488 || ch
== 0x2312 /* ARC */
8489 || (ch
>= 0x2460 && ch
<= 0x24E9) /* CIRCLED DIGIT ONE..CIRCLED LATIN SMALL LETTER Z */
8490 || (ch
>= 0x24EB && ch
<= 0x24FE) /* NEGATIVE CIRCLED NUMBER ELEVEN..DOUBLE CIRCLED NUMBER TEN */
8491 || (ch
>= 0x2500 && ch
<= 0x254B) /* BOX DRAWINGS LIGHT HORIZONTAL..BOX DRAWINGS HEAVY VERTICAL AND HORIZONTAL */
8492 || (ch
>= 0x2550 && ch
<= 0x2574) /* BOX DRAWINGS DOUBLE HORIZONTAL..BOX DRAWINGS LIGHT LEFT */
8493 || (ch
>= 0x2580 && ch
<= 0x258F) /* UPPER HALF BLOCK..LEFT ONE EIGHTH BLOCK */
8494 || (ch
>= 0x2592 && ch
<= 0x2595) /* MEDIUM SHADE..RIGHT ONE EIGHTH BLOCK */
8495 || ch
== 0x25A0 /* BLACK SQUARE */
8496 || ch
== 0x25A1 /* WHITE SQUARE */
8497 || (ch
>= 0x25A3 && ch
<= 0x25A9) /* WHITE SQUARE CONTAINING BLACK SMALL SQUARE..SQUARE WITH DIAGONAL CROSSHATCH FILL */
8498 || ch
== 0x25B2 /* BLACK UP-POINTING TRIANGLE */
8499 || ch
== 0x25B3 /* WHITE UP-POINTING TRIANGLE */
8500 || ch
== 0x25B6 /* BLACK RIGHT-POINTING TRIANGLE */
8501 || ch
== 0x25B7 /* WHITE RIGHT-POINTING TRIANGLE */
8502 || ch
== 0x25BC /* BLACK DOWN-POINTING TRIANGLE */
8503 || ch
== 0x25BD /* WHITE DOWN-POINTING TRIANGLE */
8504 || ch
== 0x25C0 /* BLACK LEFT-POINTING TRIANGLE */
8505 || ch
== 0x25C1 /* WHITE LEFT-POINTING TRIANGLE */
8506 || (ch
>= 0x25C6 && ch
<= 0x25C8) /* BLACK DIAMOND..WHITE DIAMOND CONTAINING BLACK SMALL DIAMOND */
8507 || ch
== 0x25CB /* WHITE CIRCLE */
8508 || (ch
>= 0x25CE && ch
<= 0x25D1) /* BULLSEYE..CIRCLE WITH RIGHT HALF BLACK */
8509 || (ch
>= 0x25E2 && ch
<= 0x25E5) /* BLACK LOWER RIGHT TRIANGLE..BLACK UPPER RIGHT TRIANGLE */
8510 || ch
== 0x25EF /* LARGE CIRCLE */
8511 || ch
== 0x2605 /* BLACK STAR */
8512 || ch
== 0x2606 /* WHITE STAR */
8513 || ch
== 0x2609 /* SUN */
8514 || ch
== 0x260E /* BLACK TELEPHONE */
8515 || ch
== 0x260F /* WHITE TELEPHONE */
8516 || ch
== 0x2616 /* WHITE SHOGI PIECE */
8517 || ch
== 0x2617 /* BLACK SHOGI PIECE */
8518 || ch
== 0x2640 /* FEMALE SIGN */
8519 || ch
== 0x2642 /* MALE SIGN */
8520 || ch
== 0x2660 /* BLACK SPADE SUIT */
8521 || ch
== 0x2661 /* WHITE HEART SUIT */
8522 || (ch
>= 0x2663 && ch
<= 0x2665) /* BLACK CLUB SUIT..BLACK HEART SUIT */
8523 || ch
== 0x2667 /* WHITE CLUB SUIT */
8524 || ch
== 0x2669 /* QUARTER NOTE */
8525 || ch
== 0x266A /* EIGHTH NOTE */
8526 || ch
== 0x266C /* BEAMED SIXTEENTH NOTES */
8527 || ch
== 0x266D /* MUSIC FLAT SIGN */
8528 || ch
== 0x266F /* MUSIC SHARP SIGN */
8529 || ch
== 0x269E /* THREE LINES CONVERGING RIGHT */
8530 || ch
== 0x269F /* THREE LINES CONVERGING LEFT */
8531 || (ch
>= 0x26C9 && ch
<= 0x26CC) /* TURNED WHITE SHOGI PIECE..CROSSING LANES */
8532 || ch
== 0x26D2 /* CIRCLED CROSSING LANES */
8533 || (ch
>= 0x26D5 && ch
<= 0x26D7) /* ALTERNATE ONE-WAY LEFT WAY TRAFFIC..WHITE TWO-WAY LEFT WAY TRAFFIC */
8534 || ch
== 0x26DA /* DRIVE SLOW SIGN */
8535 || ch
== 0x26DB /* HEAVY WHITE DOWN-POINTING TRIANGLE */
8536 || ch
== 0x26DD /* SQUARED SALTIRE */
8537 || ch
== 0x26DE /* FALLING DIAGONAL IN WHITE CIRCLE IN BLACK SQUARE */
8538 || ch
== 0x26E3 /* HEAVY CIRCLE WITH STROKE AND TWO DOTS ABOVE */
8539 || ch
== 0x26E8 /* BLACK CROSS ON SHIELD */
8540 || ch
== 0x26E9 /* SHINTO SHRINE */
8541 || (ch
>= 0x26EB && ch
<= 0x26F0) /* CASTLE..MOUNTAIN */
8542 || ch
== 0x26F6 /* SQUARE FOUR CORNERS */
8543 || ch
== 0x26FB /* JAPANESE BANK SYMBOL */
8544 || ch
== 0x26FC /* HEADSTONE GRAVEYARD SYMBOL */
8545 || ch
== 0x2757 /* HEAVY EXCLAMATION MARK SYMBOL */
8546 || (ch
>= 0x2776 && ch
<= 0x277F) /* DINGBAT NEGATIVE CIRCLED DIGIT ONE..DINGBAT NEGATIVE CIRCLED NUMBER TEN */
8547 || (ch
>= 0x2B55 && ch
<= 0x2B59) /* HEAVY LARGE CIRCLE..HEAVY CIRCLED SALTIRE */
8548 || ch
== 0xFFFD /* REPLACEMENT CHARACTER */
8549 || (ch
>= 0x1F100 && ch
<= 0x1F10C) /* DIGIT ZERO FULL STOP..DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO */
8550 || (ch
>= 0x1F110 && ch
<= 0x1F12D) /* PARENTHESIZED LATIN CAPITAL LETTER A..CIRCLED CD */
8551 || (ch
>= 0x1F130 && ch
<= 0x1F169) /* SQUARED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z */
8552 || (ch
>= 0x1F170 && ch
<= 0x1F1AC) /* NEGATIVE SQUARED LATIN CAPITAL LETTER A..SQUARED VOD */)
8553 attr
|= (int64_t) 1 << LBP_AI
;
8557 attr
|= (int64_t) 1 << LBP_AL2
;
8559 attr
|= (int64_t) 1 << LBP_AL1
;
8561 attr
&= ~((int64_t) 1 << LBP_CM
);
8566 /* Unassigned character. */
8567 if ((ch
>= 0x3400 && ch
<= 0x4DBF) /* CJK Unified Ideographs Extension A */
8568 || (ch
>= 0x4E00 && ch
<= 0x9FFF) /* CJK Unified Ideographs */
8569 || (ch
>= 0xF900 && ch
<= 0xFAFF) /* CJK Compatibility Ideographs */
8570 || (ch
>= 0x1F02C && ch
<= 0x1F02F) /* reserved */
8571 || (ch
>= 0x1F094 && ch
<= 0x1F09F) /* reserved */
8572 || (ch
>= 0x1F0AF && ch
<= 0x1F0B0) /* reserved */
8573 || ch
== 0x1F0C0 /* reserved */
8574 || ch
== 0x1F0D0 /* reserved */
8575 || (ch
>= 0x1F0F6 && ch
<= 0x1F0FF) /* reserved */
8576 || (ch
>= 0x1F10D && ch
<= 0x1F10F) /* reserved */
8577 || ch
== 0x1F12F /* reserved */
8578 || (ch
>= 0x1F16C && ch
<= 0x1F16F) /* reserved */
8579 || (ch
>= 0x1F1AD && ch
<= 0x1F1E5) /* reserved */
8580 || (ch
>= 0x1F203 && ch
<= 0x1F20F) /* reserved */
8581 || (ch
>= 0x1F23C && ch
<= 0x1F23F) /* reserved */
8582 || (ch
>= 0x1F249 && ch
<= 0x1F24F) /* reserved */
8583 || (ch
>= 0x1F252 && ch
<= 0x1F2FF) /* reserved */
8584 || (ch
>= 0x1F6D3 && ch
<= 0x1F6DF) /* reserved */
8585 || (ch
>= 0x1F6ED && ch
<= 0x1F6EF) /* reserved */
8586 || (ch
>= 0x1F6F7 && ch
<= 0x1F6FF) /* reserved */
8587 || (ch
>= 0x1F774 && ch
<= 0x1F77F) /* reserved */
8588 || (ch
>= 0x1F7D5 && ch
<= 0x1F7FF) /* reserved */
8589 || (ch
>= 0x1F8B0 && ch
<= 0x1F8BB) /* reserved */
8590 || (ch
>= 0x1F8C0 && ch
<= 0x1F8C1) /* reserved */
8591 || (ch
>= 0x1F900 && ch
<= 0x1F90F) /* reserved */
8592 || ch
== 0x1F91F /* reserved */
8593 || ch
== 0x1F93F /* reserved */
8594 || (ch
>= 0x1F928 && ch
<= 0x1F92F) /* reserved */
8595 || (ch
>= 0x1F931 && ch
<= 0x1F932) /* reserved */
8596 || (ch
>= 0x1F94C && ch
<= 0x1F94F) /* reserved */
8597 || (ch
>= 0x1F95F && ch
<= 0x1F97F) /* reserved */
8598 || (ch
>= 0x1F992 && ch
<= 0x1F9BF) /* reserved */
8599 || (ch
>= 0x1F9C1 && ch
<= 0x1FB92) /* reserved */
8600 || (ch
>= 0x1FB94 && ch
<= 0x1FBCA) /* reserved */
8601 || (ch
>= 0x1FBF0 && ch
<= 0x1FBF9) /* reserved */
8602 || (ch
>= 0x1FC00 && ch
<= 0x1FFFD) /* reserved */
8603 || (ch
>= 0x20000 && ch
<= 0x2A6FF) /* CJK Unified Ideographs Extension B */
8604 || (ch
>= 0x2A700 && ch
<= 0x2F7FF) /* CJK Unified Ideographs Extension C,
8605 Supplementary Ideographic Plane (Plane 2) outside of blocks */
8606 || (ch
>= 0x2F800 && ch
<= 0x2FFFD) /* CJK Compatibility Ideographs Supplement,
8607 Supplementary Ideographic Plane (Plane 2) outside of blocks */
8608 || (ch
>= 0x30000 && ch
<= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */)
8610 if (is_property_extended_pictographic (ch
))
8611 attr
|= (int64_t) 1 << LBP_ID2
;
8613 attr
|= (int64_t) 1 << LBP_ID1
;
8619 attr
|= (int64_t) 1 << LBP_XX
;
/* A table entry packs two values into one integer: PROP in the bits above
   bit 0, and the single-bit value EA in bit 0.  */

/* Combining prop and ea to a table entry.  */
#define PROP_EA(prop,ea) (((prop) << 1) | (ea))

/* Splitting a table entry into prop and ea.  */
#define PROP(entry) ((entry) >> 1)
#define EA(entry) ((entry) & 1)
8631 /* Output the line breaking properties in a human readable format. */
8633 debug_output_lbp (FILE *stream
)
8637 for (i
= 0; i
< 0x110000; i
++)
8639 int64_t attr
= get_lbp (i
);
8640 if (attr
!= (int64_t) 1 << LBP_XX
)
8642 fprintf (stream
, "0x%04X", i
);
8643 #define PRINT_BIT(attr,bit) \
8644 if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit);
8645 #define PRINT_BIT_ALT(attr,bit,name) \
8646 if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #name);
8647 PRINT_BIT(attr
,LBP_BK
);
8648 PRINT_BIT(attr
,LBP_CR
);
8649 PRINT_BIT(attr
,LBP_LF
);
8650 PRINT_BIT(attr
,LBP_CM
);
8651 PRINT_BIT(attr
,LBP_WJ
);
8652 PRINT_BIT(attr
,LBP_ZW
);
8653 PRINT_BIT(attr
,LBP_GL
);
8654 PRINT_BIT(attr
,LBP_SP
);
8655 PRINT_BIT(attr
,LBP_B2
);
8656 PRINT_BIT(attr
,LBP_BA
);
8657 PRINT_BIT(attr
,LBP_BB
);
8658 PRINT_BIT(attr
,LBP_HY
);
8659 PRINT_BIT(attr
,LBP_CB
);
8660 PRINT_BIT(attr
,LBP_CL
);
8661 PRINT_BIT_ALT(attr
,LBP_CP1
,LBP_CP
);
8662 PRINT_BIT_ALT(attr
,LBP_CP2
,LBP_CP
);
8663 PRINT_BIT(attr
,LBP_EX
);
8664 PRINT_BIT(attr
,LBP_IN
);
8665 PRINT_BIT(attr
,LBP_NS
);
8666 PRINT_BIT_ALT(attr
,LBP_OP1
,LBP_OP
);
8667 PRINT_BIT_ALT(attr
,LBP_OP2
,LBP_OP
);
8668 PRINT_BIT_ALT(attr
,LBP_QU1
,LBP_QU
);
8669 PRINT_BIT_ALT(attr
,LBP_QU2
,LBP_QU
);
8670 PRINT_BIT_ALT(attr
,LBP_QU3
,LBP_QU
);
8671 PRINT_BIT(attr
,LBP_IS
);
8672 PRINT_BIT(attr
,LBP_NU
);
8673 PRINT_BIT(attr
,LBP_PO
);
8674 PRINT_BIT(attr
,LBP_PR
);
8675 PRINT_BIT(attr
,LBP_SY
);
8676 PRINT_BIT(attr
,LBP_AI
);
8677 PRINT_BIT_ALT(attr
,LBP_AL1
,LBP_AL
);
8678 PRINT_BIT_ALT(attr
,LBP_AL2
,LBP_AL
);
8679 PRINT_BIT(attr
,LBP_H2
);
8680 PRINT_BIT(attr
,LBP_H3
);
8681 PRINT_BIT(attr
,LBP_HL
);
8682 PRINT_BIT_ALT(attr
,LBP_ID1
,LBP_ID
);
8683 PRINT_BIT_ALT(attr
,LBP_ID2
,LBP_ID
);
8684 PRINT_BIT(attr
,LBP_JL
);
8685 PRINT_BIT(attr
,LBP_JV
);
8686 PRINT_BIT(attr
,LBP_JT
);
8687 PRINT_BIT(attr
,LBP_AP
);
8688 PRINT_BIT(attr
,LBP_AK
);
8689 PRINT_BIT(attr
,LBP_AS
);
8690 PRINT_BIT(attr
,LBP_VI
);
8691 PRINT_BIT(attr
,LBP_VF
);
8692 PRINT_BIT(attr
,LBP_RI
);
8693 PRINT_BIT(attr
,LBP_SA
);
8694 PRINT_BIT(attr
,LBP_ZWJ
);
8695 PRINT_BIT(attr
,LBP_EB
);
8696 PRINT_BIT(attr
,LBP_EM
);
8697 PRINT_BIT(attr
,LBP_XX
);
8698 #undef PRINT_BIT_ALT
8700 fprintf (stream
, "\n");
/* Writes the human readable dump produced by debug_output_lbp() to the file
   named FILENAME.  Prints a message to stderr and exits with status 1 if the
   file cannot be opened or written.  */
static void
debug_output_lbrk_tables (const char *filename)
{
  FILE *stream;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_lbp (stream);

  /* fclose flushes buffered output, so its result must be checked too.  */
  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* The line breaking property from the LineBreak.txt file.
   Indexed by code point (0 .. 0x10FFFF); each element is an LBP_* value.  */
int unicode_org_lbp[0x110000];
8729 /* Stores in unicode_org_lbp[] the line breaking property from the
8730 LineBreak.txt file. */
/* linebreak_filename is the name of the file to read.  All entries are first
   set to LBP_XX; the values parsed from the file are then stored on top.
   Problems are reported on stderr.  */
8732 fill_org_lbp (const char *linebreak_filename
)
/* Buffers for the three fields of a data line: field0 is read up to ';',
   field1 up to '#', field2 up to the end of the line.  */
8736 char field0
[FIELDLEN
];
8737 char field1
[FIELDLEN
];
8738 char field2
[FIELDLEN
];
/* Default for all code points.  */
8741 for (i
= 0; i
< 0x110000; i
++)
8742 unicode_org_lbp
[i
] = LBP_XX
;
8744 stream
= fopen (linebreak_filename
, "r");
8747 fprintf (stderr
, "error during fopen of '%s'\n", linebreak_filename
);
/* Consume characters up to and including the next newline (or EOF).  */
8765 do c
= getc (stream
); while (c
!= EOF
&& c
!= '\n');
8769 n
= getfield (stream
, field0
, ';');
/* Skip the spaces that follow the ';'.  */
8770 do c
= getc (stream
); while (c
== ' ');
8772 n
+= getfield (stream
, field1
, '#');
8773 n
+= getfield (stream
, field2
, '\n');
8778 fprintf (stderr
, "short line in '%s':%d\n", linebreak_filename
,
8782 /* Remove trailing spaces from field0. */
8783 while (strlen (field0
) > 0 && field0
[strlen (field0
) - 1] == ' ')
8784 field0
[strlen (field0
) - 1] = '\0';
8785 /* Remove trailing spaces from field1. */
8786 while (strlen (field1
) > 0 && field1
[strlen (field1
) - 1] == ' ')
8787 field1
[strlen (field1
) - 1] = '\0';
/* TRY(LBP_xx) compares field1 with "xx": #bit is the spelling of the
   argument, and + 4 skips its first four characters (the "LBP_" prefix).  */
8788 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
/* Property names that are stored as a different LBP_* value.  */
8836 else if (strcmp (field1
, "NL") == 0) value
= LBP_BK
;
8837 else if (strcmp (field1
, "SG") == 0) value
= LBP_XX
;
8838 else if (strcmp (field1
, "CJ") == 0) value
= LBP_NS
;
8841 fprintf (stderr
, "unknown property value \"%s\" in '%s':%d\n",
8842 field1
, linebreak_filename
, lineno
);
/* field0 begins with a hexadecimal code point; if it contains "..", a second
   hexadecimal code point follows the "..".  */
8845 i
= strtoul (field0
, NULL
, 16);
8846 if (strstr (field0
, "..") != NULL
)
8848 /* Deal with a range. */
8849 j
= strtoul (strstr (field0
, "..") + 2, NULL
, 16);
8851 unicode_org_lbp
[i
] = value
;
8855 /* Single character line. */
8856 unicode_org_lbp
[i
] = value
;
8860 if (ferror (stream
) || fclose (stream
))
8862 fprintf (stderr
, "error reading from '%s'\n", linebreak_filename
);
8867 /* Output the line breaking properties in a human readable format. */
8869 debug_output_org_lbp (FILE *stream
)
8873 for (i
= 0; i
< 0x110000; i
++)
8875 int attr
= unicode_org_lbp
[i
];
8878 fprintf (stream
, "0x%04X", i
);
8879 #define PRINT_BIT(attr,bit) \
8880 if (attr == bit) fprintf (stream, " " #bit);
8881 PRINT_BIT(attr
,LBP_BK
);
8882 PRINT_BIT(attr
,LBP_CR
);
8883 PRINT_BIT(attr
,LBP_LF
);
8884 PRINT_BIT(attr
,LBP_CM
);
8885 PRINT_BIT(attr
,LBP_WJ
);
8886 PRINT_BIT(attr
,LBP_ZW
);
8887 PRINT_BIT(attr
,LBP_GL
);
8888 PRINT_BIT(attr
,LBP_SP
);
8889 PRINT_BIT(attr
,LBP_B2
);
8890 PRINT_BIT(attr
,LBP_BA
);
8891 PRINT_BIT(attr
,LBP_BB
);
8892 PRINT_BIT(attr
,LBP_HY
);
8893 PRINT_BIT(attr
,LBP_CB
);
8894 PRINT_BIT(attr
,LBP_CL
);
8895 PRINT_BIT(attr
,LBP_CP
);
8896 PRINT_BIT(attr
,LBP_EX
);
8897 PRINT_BIT(attr
,LBP_IN
);
8898 PRINT_BIT(attr
,LBP_NS
);
8899 PRINT_BIT(attr
,LBP_OP
);
8900 PRINT_BIT(attr
,LBP_QU
);
8901 PRINT_BIT(attr
,LBP_IS
);
8902 PRINT_BIT(attr
,LBP_NU
);
8903 PRINT_BIT(attr
,LBP_PO
);
8904 PRINT_BIT(attr
,LBP_PR
);
8905 PRINT_BIT(attr
,LBP_SY
);
8906 PRINT_BIT(attr
,LBP_AI
);
8907 PRINT_BIT(attr
,LBP_AL
);
8908 PRINT_BIT(attr
,LBP_H2
);
8909 PRINT_BIT(attr
,LBP_H3
);
8910 PRINT_BIT(attr
,LBP_HL
);
8911 PRINT_BIT(attr
,LBP_ID
);
8912 PRINT_BIT(attr
,LBP_JL
);
8913 PRINT_BIT(attr
,LBP_JV
);
8914 PRINT_BIT(attr
,LBP_JT
);
8915 PRINT_BIT(attr
,LBP_AP
);
8916 PRINT_BIT(attr
,LBP_AK
);
8917 PRINT_BIT(attr
,LBP_AS
);
8918 PRINT_BIT(attr
,LBP_VI
);
8919 PRINT_BIT(attr
,LBP_VF
);
8920 PRINT_BIT(attr
,LBP_RI
);
8921 PRINT_BIT(attr
,LBP_SA
);
8922 PRINT_BIT(attr
,LBP_ZWJ
);
8923 PRINT_BIT(attr
,LBP_EB
);
8924 PRINT_BIT(attr
,LBP_EM
);
8925 PRINT_BIT(attr
,LBP_XX
);
8927 fprintf (stream
, "\n");
/* Writes the human readable dump produced by debug_output_org_lbp() to the
   file named FILENAME.  Prints a message to stderr and exits with status 1 if
   the file cannot be opened or written.  */
static void
debug_output_org_lbrk_tables (const char *filename)
{
  FILE *stream;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_lbp (stream);

  /* fclose flushes buffered output, so its result must be checked too.  */
  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8953 /* Given an enum value LBP_..., returns its name "LBP_..." as a string. */
8955 lbp_value_to_string (unsigned int value
)
/* The result; assigned by whichever CASE label matches VALUE.  */
8957 const char *value_string
;
/* CASE(x) is a switch label for X that stores the spelling of X, produced by
   the # stringification operator, in value_string.  */
8960 #define CASE(x) case x: value_string = #x; break;
9016 return value_string
;
9019 /* Construction of sparse 3-level tables. */
/* Parameters of the table: TABLE is its name (output_lbpea uses
   struct lbpea_table and lbpea_table_init/_add/_finalize), ELEMENT the type
   of one entry.
   NOTE(review): DEFAULT is presumably the entry for code points that were
   never added, i.e. property LBP_XX with ea 0 - confirm against the table
   implementation.  */
9020 #define TABLE lbpea_table
9021 #define ELEMENT unsigned char
9022 #define DEFAULT PROP_EA (LBP_XX, 0)
/* NOTE(review): xmalloc/xrealloc are mapped to plain malloc/realloc, so no
   out-of-memory check is added by these definitions - confirm that this is
   intended.  */
9023 #define xmalloc malloc
9024 #define xrealloc realloc
/* Writes the table that combines the line breaking property and the ea value
   of every code point.
   To STREAM1: the lbrkprop_header_0..4 macros, the declaration of the type
   lbrkprop_t and the extern declaration of unilbrkprop.
   To STREAM2: the definition of unilbrkprop with its three levels.  */
9028 output_lbpea (FILE *stream1
, FILE *stream2
)
9031 struct lbpea_table t
;
9032 unsigned int level1_offset
, level2_offset
, level3_offset
;
9036 lbpea_table_init (&t
);
/* Enter every code point whose property is not LBP_XX into the table.  */
9038 for (i
= 0; i
< 0x110000; i
++)
9040 int64_t attr
= get_lbp (i
);
9041 int ea
= get_lbea (i
);
9043 /* Now attr should contain exactly one bit. */
9044 assert (attr
!= 0 && (attr
& (attr
- 1)) == 0);
9046 if (attr
!= (int64_t) 1 << LBP_XX
)
/* Convert the single-bit mask into the index of that bit.  */
9048 unsigned int log2_attr
;
9049 for (log2_attr
= 0; attr
> 1; attr
>>= 1, log2_attr
++);
9051 lbpea_table_add (&t
, i
, PROP_EA (log2_attr
, ea
));
9055 lbpea_table_finalize (&t
);
/* NOTE(review): the three expressions below look like the byte offsets of
   level 1, level 2 and level 3 within t.result (5 header words, then the
   level 1 words, then the level 2 words); the assignment targets are not
   visible here - confirm.  */
9058 5 * sizeof (uint32_t);
9060 5 * sizeof (uint32_t)
9061 + t
.level1_size
* sizeof (uint32_t);
9063 5 * sizeof (uint32_t)
9064 + t
.level1_size
* sizeof (uint32_t)
9065 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
/* The first five uint32_t words of t.result become the header macros.  */
9067 for (i
= 0; i
< 5; i
++)
9068 fprintf (stream1
, "#define lbrkprop_header_%d %d\n", i
,
9069 ((uint32_t *) t
.result
)[i
]);
9070 fprintf (stream1
, "\n");
/* Declaration of the table type, sized after the finalized table.  */
9071 fprintf (stream1
, "typedef struct\n");
9072 fprintf (stream1
, " {\n");
9073 fprintf (stream1
, " int level1[%zu];\n", t
.level1_size
);
9074 fprintf (stream1
, " int level2[%zu << %d];\n", t
.level2_size
, t
.q
);
9075 fprintf (stream1
, " unsigned char level3[%zu << %d];\n", t
.level3_size
, t
.p
);
9076 fprintf (stream1
, " }\n");
9077 fprintf (stream1
, "lbrkprop_t;\n");
9078 fprintf (stream1
, "extern const lbrkprop_t unilbrkprop;\n");
/* Definition of the table.  Level 1: eight entries per output line.  */
9080 fprintf (stream2
, "const lbrkprop_t unilbrkprop =\n");
9081 fprintf (stream2
, "{\n");
9082 fprintf (stream2
, " {");
9083 if (t
.level1_size
> 8)
9084 fprintf (stream2
, "\n ");
9085 for (i
= 0; i
< t
.level1_size
; i
++)
9088 if (i
> 0 && (i
% 8) == 0)
9089 fprintf (stream2
, "\n ");
9090 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
/* NOTE(review): either -1 or the index of the level 2 entry (byte offset
   relative to level2_offset, divided by the entry size) is printed; the
   condition that selects between them is not visible here - confirm.  */
9092 fprintf (stream2
, " %5d", -1);
9094 fprintf (stream2
, " %5zu",
9095 (offset
- level2_offset
) / sizeof (uint32_t));
9096 if (i
+1 < t
.level1_size
)
9097 fprintf (stream2
, ",");
9099 if (t
.level1_size
> 8)
9100 fprintf (stream2
, "\n ");
9101 fprintf (stream2
, " },\n");
/* Level 2: same layout, with indices relative to level3_offset.  */
9102 fprintf (stream2
, " {");
9103 if (t
.level2_size
<< t
.q
> 8)
9104 fprintf (stream2
, "\n ");
9105 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
9108 if (i
> 0 && (i
% 8) == 0)
9109 fprintf (stream2
, "\n ");
9110 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
9112 fprintf (stream2
, " %5d", -1);
9114 fprintf (stream2
, " %5zu",
9115 (offset
- level3_offset
) / sizeof (unsigned char));
9116 if (i
+1 < t
.level2_size
<< t
.q
)
9117 fprintf (stream2
, ",");
9119 if (t
.level2_size
<< t
.q
> 8)
9120 fprintf (stream2
, "\n ");
9121 fprintf (stream2
, " },\n");
/* Level 3: four entries per output line, each printed symbolically as
   (LBP_name<<1)|ea.  */
9122 fprintf (stream2
, " {");
9123 if (t
.level3_size
<< t
.p
> 8)
9124 fprintf (stream2
, "\n ");
9125 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
9127 unsigned char value
= ((unsigned char *) (t
.result
+ level3_offset
))[i
];
9128 if (i
> 0 && (i
% 4) == 0)
9129 fprintf (stream2
, "\n ");
9130 fprintf (stream2
, " (%s<<1)|%d%s",
9131 lbp_value_to_string (PROP (value
)), EA (value
),
9132 (i
+1 < t
.level3_size
<< t
.p
? "," : ""));
9134 if (t
.level3_size
<< t
.p
> 8)
9135 fprintf (stream2
, "\n ");
9136 fprintf (stream2
, " }\n");
9137 fprintf (stream2
, "};\n");
/* Writes the line breaking property tables for Unicode version VERSION.
   Both files FILENAME1 and FILENAME2 receive the same header comment block;
   output_lbpea() then writes the declarations to the first file and the
   table definition to the second.  Prints a message to stderr and exits with
   status 1 if a file cannot be opened or written.  */
static void
output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
{
  const char *filenames[2];
  FILE *streams[2];
  unsigned int i;

  filenames[0] = filename1;
  filenames[1] = filename2;

  for (i = 0; i < 2; i++)
    {
      streams[i] = fopen (filenames[i], "w");
      if (streams[i] == NULL)
        {
          fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]);
          exit (1);
        }
    }

  /* Common header of both output files.  */
  for (i = 0; i < 2; i++)
    {
      FILE *stream = streams[i];

      fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
      fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
      fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
               version);
      fprintf (stream, "\n");

      fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
      fprintf (stream, "\n");
      output_library_license (stream, false);
      fprintf (stream, "\n");
    }

  output_lbpea (streams[0], streams[1]);

  /* fclose flushes buffered output, so its result must be checked too.  */
  for (i = 0; i < 2; i++)
    if (ferror (streams[i]) || fclose (streams[i]))
      {
        fprintf (stderr, "error writing to '%s'\n", filenames[i]);
        exit (1);
      }
}
9189 output_lbrk_rules_as_tables (const char *filename
, const char *version
)
9193 stream
= fopen (filename
, "w");
9196 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
9200 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9201 fprintf (stream
, "/* Table that encodes several line breaking rules. */\n");
9202 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
9204 fprintf (stream
, "\n");
9206 fprintf (stream
, "/* Copyright (C) 2001-2024 Free Software Foundation, Inc.\n");
9207 fprintf (stream
, "\n");
9208 output_library_license (stream
, false);
9209 fprintf (stream
, "\n");
9211 fprintf (stream
, "#include <config.h>\n");
9212 fprintf (stream
, "\n");
9213 fprintf (stream
, "/* Specification. */\n");
9214 fprintf (stream
, "#include \"unilbrk/lbrktables.h\"\n");
9215 fprintf (stream
, "\n");
9216 fprintf (stream
, "/* Define unilbrkprop, table of line breaking properties. */\n");
9217 fprintf (stream
, "#include \"unilbrk/lbrkprop2.h\"\n");
9218 fprintf (stream
, "\n");
9220 /* LBP_* values that are entered in the table are in the range 0 .. NLBP-1. */
9221 const unsigned int NLBP
= 41;
9223 unsigned int before
;
9225 /* Describe the table cell (before, after). */
9228 /* Break prohibited when no spaces, i.e. in before ÷ after */
9229 bool prohibited_no_sp
;
9230 /* Break prohibited with spaces, i.e. in before SP+ ÷ after */
9231 bool prohibited_with_sp
;
9233 struct table_cell table
[NLBP
][NLBP
];
9234 /* Sets table[before][after].field to value. */
9235 #define set_table_cell(field,value) \
9236 (before == LBP_CP ? (set_table_cell_1 (LBP_CP1, field, value), set_table_cell_1 (LBP_CP2, field, value)) : \
9237 before == LBP_OP ? (set_table_cell_1 (LBP_OP1, field, value), set_table_cell_1 (LBP_OP2, field, value)) : \
9238 before == LBP_QU ? (set_table_cell_1 (LBP_QU1, field, value), set_table_cell_1 (LBP_QU2, field, value), set_table_cell_1 (LBP_QU3, field, value)) : \
9239 before == LBP_AL ? (set_table_cell_1 (LBP_AL1, field, value), set_table_cell_1 (LBP_AL2, field, value)) : \
9240 before == LBP_ID ? (set_table_cell_1 (LBP_ID1, field, value), set_table_cell_1 (LBP_ID2, field, value)) : \
9241 set_table_cell_1 (before, field, value))
9242 #define set_table_cell_1(row,field,value) \
9243 (after == LBP_CP ? (set_table_cell_2 (row, LBP_CP1, field, value), set_table_cell_2 (row, LBP_CP2, field, value)) : \
9244 after == LBP_OP ? (set_table_cell_2 (row, LBP_OP1, field, value), set_table_cell_2 (row, LBP_OP2, field, value)) : \
9245 after == LBP_QU ? (set_table_cell_2 (row, LBP_QU1, field, value), set_table_cell_2 (row, LBP_QU2, field, value), set_table_cell_2 (row, LBP_QU3, field, value)) : \
9246 after == LBP_AL ? (set_table_cell_2 (row, LBP_AL1, field, value), set_table_cell_2 (row, LBP_AL2, field, value)) : \
9247 after == LBP_ID ? (set_table_cell_2 (row, LBP_ID1, field, value), set_table_cell_2 (row, LBP_ID2, field, value)) : \
9248 set_table_cell_2 (row, after, field, value))
9249 #define set_table_cell_2(row,column,field,value) \
9250 (table[row][column].field = (value))
9253 If we were to apply the rules in top-down order (high precedence rules
9254 first), the table_cell fields have to support values false/true/unknown.
9255 If we apply the rules in the opposite order (high precedence order last),
9256 the table_cell fields need to support only the values false/true.
9257 So, that's what we do here. */
9259 /* (LB31) Break everywhere. */
9260 for (before
= 0; before
< NLBP
; before
++)
9261 for (after
= 0; after
< NLBP
; after
++)
9262 set_table_cell (prohibited_no_sp
, false);
9264 /* (LB30b) Do not break between an emoji base (or potential emoji) and an
9266 before
= LBP_EB
; after
= LBP_EM
; set_table_cell (prohibited_no_sp
, true);
9267 before
= LBP_ID2
; after
= LBP_EM
; set_table_cell (prohibited_no_sp
, true);
9269 /* (LB30) Do not break between letters, numbers, or ordinary symbols and
9270 opening or closing parentheses (except for East Asian parentheses). */
9271 before
= LBP_AL
; after
= LBP_OP1
; set_table_cell (prohibited_no_sp
, true);
9272 before
= LBP_HL
; after
= LBP_OP1
; set_table_cell (prohibited_no_sp
, true);
9273 before
= LBP_NU
; after
= LBP_OP1
; set_table_cell (prohibited_no_sp
, true);
9274 before
= LBP_CP1
; after
= LBP_AL
; set_table_cell (prohibited_no_sp
, true);
9275 before
= LBP_CP1
; after
= LBP_HL
; set_table_cell (prohibited_no_sp
, true);
9276 before
= LBP_CP1
; after
= LBP_NU
; set_table_cell (prohibited_no_sp
, true);
9278 /* (LB29) Do not break between numeric punctuation and alphabetics
9280 before
= LBP_IS
; after
= LBP_AL
; set_table_cell (prohibited_no_sp
, true);
9281 before
= LBP_IS
; after
= LBP_HL
; set_table_cell (prohibited_no_sp
, true);
9283 /* (LB28a) Do not break inside the orthographic syllables of Brahmic
9285 /* (LB28a) line 1. */
9286 before
= LBP_AP
; after
= LBP_AK
; set_table_cell (prohibited_no_sp
, true);
9287 before
= LBP_AP
; after
= LBP_AL2
; set_table_cell (prohibited_no_sp
, true);
9288 before
= LBP_AP
; after
= LBP_AS
; set_table_cell (prohibited_no_sp
, true);
9289 /* (LB28a) line 2. */
9290 before
= LBP_AK
; after
= LBP_VF
; set_table_cell (prohibited_no_sp
, true);
9291 before
= LBP_AK
; after
= LBP_VI
; set_table_cell (prohibited_no_sp
, true);
9292 before
= LBP_AL2
; after
= LBP_VF
; set_table_cell (prohibited_no_sp
, true);
9293 before
= LBP_AL2
; after
= LBP_VI
; set_table_cell (prohibited_no_sp
, true);
9294 before
= LBP_AS
; after
= LBP_VF
; set_table_cell (prohibited_no_sp
, true);
9295 before
= LBP_AS
; after
= LBP_VI
; set_table_cell (prohibited_no_sp
, true);
9297 /* (LB28) Do not break between alphabetics ("at"). */
9298 before
= LBP_AL
; after
= LBP_AL
; set_table_cell (prohibited_no_sp
, true);
9299 before
= LBP_AL
; after
= LBP_HL
; set_table_cell (prohibited_no_sp
, true);
9300 before
= LBP_HL
; after
= LBP_AL
; set_table_cell (prohibited_no_sp
, true);
9301 before
= LBP_HL
; after
= LBP_HL
; set_table_cell (prohibited_no_sp
, true);
9303 /* (LB27) Korean Syllable Block. */
9304 before
= LBP_JL
; after
= LBP_PO
; set_table_cell (prohibited_no_sp
, true);
9305 before
= LBP_JV
; after
= LBP_PO
; set_table_cell (prohibited_no_sp
, true);
9306 before
= LBP_JT
; after
= LBP_PO
; set_table_cell (prohibited_no_sp
, true);
9307 before
= LBP_H2
; after
= LBP_PO
; set_table_cell (prohibited_no_sp
, true);
9308 before
= LBP_H3
; after
= LBP_PO
; set_table_cell (prohibited_no_sp
, true);
9309 before
= LBP_PR
; after
= LBP_JL
; set_table_cell (prohibited_no_sp
, true);
9310 before
= LBP_PR
; after
= LBP_JV
; set_table_cell (prohibited_no_sp
, true);
9311 before
= LBP_PR
; after
= LBP_JT
; set_table_cell (prohibited_no_sp
, true);
9312 before
= LBP_PR
; after
= LBP_H2
; set_table_cell (prohibited_no_sp
, true);
9313 before
= LBP_PR
; after
= LBP_H3
; set_table_cell (prohibited_no_sp
, true);
9315 /* (LB26) Do not break a Korean syllable. */
9316 before
= LBP_JL
; after
= LBP_JL
; set_table_cell (prohibited_no_sp
, true);
9317 before
= LBP_JL
; after
= LBP_JV
; set_table_cell (prohibited_no_sp
, true);
9318 before
= LBP_JL
; after
= LBP_H2
; set_table_cell (prohibited_no_sp
, true);
9319 before
= LBP_JL
; after
= LBP_H3
; set_table_cell (prohibited_no_sp
, true);
9320 before
= LBP_JV
; after
= LBP_JV
; set_table_cell (prohibited_no_sp
, true);
9321 before
= LBP_JV
; after
= LBP_JT
; set_table_cell (prohibited_no_sp
, true);
9322 before
= LBP_H2
; after
= LBP_JV
; set_table_cell (prohibited_no_sp
, true);
9323 before
= LBP_H2
; after
= LBP_JT
; set_table_cell (prohibited_no_sp
, true);
9324 before
= LBP_JT
; after
= LBP_JT
; set_table_cell (prohibited_no_sp
, true);
9325 before
= LBP_H3
; after
= LBP_JT
; set_table_cell (prohibited_no_sp
, true);
9327 /* (LB25) Do not break between the following pairs of classes relevant to
9328 numbers. */
9329 before
= LBP_PO
; after
= LBP_NU
; set_table_cell (prohibited_no_sp
, true);
9330 before
= LBP_PR
; after
= LBP_NU
; set_table_cell (prohibited_no_sp
, true);
9331 before
= LBP_HY
; after
= LBP_NU
; set_table_cell (prohibited_no_sp
, true);
9332 before
= LBP_IS
; after
= LBP_NU
; set_table_cell (prohibited_no_sp
, true);
9334 /* (LB24) Do not break between numeric prefix/postfix and letters, or between
9335 letters and prefix/postfix. */
9336 before
= LBP_PR
; after
= LBP_AL
; set_table_cell (prohibited_no_sp
, true);
9337 before
= LBP_PR
; after
= LBP_HL
; set_table_cell (prohibited_no_sp
, true);
9338 before
= LBP_PO
; after
= LBP_AL
; set_table_cell (prohibited_no_sp
, true);
9339 before
= LBP_PO
; after
= LBP_HL
; set_table_cell (prohibited_no_sp
, true);
9340 before
= LBP_AL
; after
= LBP_PR
; set_table_cell (prohibited_no_sp
, true);
9341 before
= LBP_AL
; after
= LBP_PO
; set_table_cell (prohibited_no_sp
, true);
9342 before
= LBP_HL
; after
= LBP_PR
; set_table_cell (prohibited_no_sp
, true);
9343 before
= LBP_HL
; after
= LBP_PO
; set_table_cell (prohibited_no_sp
, true);
9345 /* (LB23a) Do not break between numeric prefixes and ideographs, or between
9346 ideographs and numeric postfixes. */
9347 before
= LBP_PR
; after
= LBP_ID
; set_table_cell (prohibited_no_sp
, true);
9348 before
= LBP_PR
; after
= LBP_EB
; set_table_cell (prohibited_no_sp
, true);
9349 before
= LBP_PR
; after
= LBP_EM
; set_table_cell (prohibited_no_sp
, true);
9350 before
= LBP_ID
; after
= LBP_PO
; set_table_cell (prohibited_no_sp
, true);
9351 before
= LBP_EB
; after
= LBP_PO
; set_table_cell (prohibited_no_sp
, true);
9352 before
= LBP_EM
; after
= LBP_PO
; set_table_cell (prohibited_no_sp
, true);
9354 /* (LB23) Do not break between digits and letters. */
9355 before
= LBP_AL
; after
= LBP_NU
; set_table_cell (prohibited_no_sp
, true);
9356 before
= LBP_HL
; after
= LBP_NU
; set_table_cell (prohibited_no_sp
, true);
9357 before
= LBP_NU
; after
= LBP_AL
; set_table_cell (prohibited_no_sp
, true);
9358 before
= LBP_NU
; after
= LBP_HL
; set_table_cell (prohibited_no_sp
, true);
9360 /* (LB22) Do not break before ellipses. */
9361 for (before
= 0; before
< NLBP
; before
++)
9363 after
= LBP_IN
; set_table_cell (prohibited_no_sp
, true);
9366 /* (LB21b) Don’t break between Solidus and Hebrew letters. */
9367 before
= LBP_SY
; after
= LBP_HL
; set_table_cell (prohibited_no_sp
, true);
9369 /* (LB21) Do not break before hyphen-minus, other hyphens, fixed-width spaces,
9370 small kana, and other non-starters, or after acute accents. */
9371 for (before
= 0; before
< NLBP
; before
++)
9373 after
= LBP_BA
; set_table_cell (prohibited_no_sp
, true);
9374 after
= LBP_HY
; set_table_cell (prohibited_no_sp
, true);
9375 after
= LBP_NS
; set_table_cell (prohibited_no_sp
, true);
9377 for (after
= 0; after
< NLBP
; after
++)
9379 before
= LBP_BB
; set_table_cell (prohibited_no_sp
, true);
9382 /* (LB19) Don't break before non-initial ambiguous quotation marks,
9383 such as '”' or '"'. Don't break after non-final ambiguous quotation
9384 marks, such as '“' or '"'. */
9385 for (before
= 0; before
< NLBP
; before
++)
9387 after
= LBP_QU1
; set_table_cell (prohibited_no_sp
, true);
9388 after
= LBP_QU3
; set_table_cell (prohibited_no_sp
, true);
9390 for (after
= 0; after
< NLBP
; after
++)
9392 before
= LBP_QU1
; set_table_cell (prohibited_no_sp
, true);
9393 before
= LBP_QU2
; set_table_cell (prohibited_no_sp
, true);
9396 /* (LB18) Break after spaces. */
9397 for (before
= 0; before
< NLBP
; before
++)
9398 for (after
= 0; after
< NLBP
; after
++)
9399 set_table_cell (prohibited_with_sp
, false);
9401 /* (LB17) Do not break within '——', even with intervening spaces. */
9402 before
= LBP_B2
; after
= LBP_B2
; set_table_cell (prohibited_no_sp
, true);
9403 set_table_cell (prohibited_with_sp
, true);
9405 /* (LB16) Do not break between closing punctuation and a nonstarter (lb=NS),
9406 even with intervening spaces. */
9407 before
= LBP_CL
; after
= LBP_NS
; set_table_cell (prohibited_no_sp
, true);
9408 set_table_cell (prohibited_with_sp
, true);
9409 before
= LBP_CP
; after
= LBP_NS
; set_table_cell (prohibited_no_sp
, true);
9410 set_table_cell (prohibited_with_sp
, true);
9412 /* (LB15d) Do not break before ';', ',', '.', even after spaces. */
9413 for (before
= 0; before
< NLBP
; before
++)
9415 after
= LBP_IS
; set_table_cell (prohibited_no_sp
, true);
9416 set_table_cell (prohibited_with_sp
, true);
9419 /* (LB15b) Do not break before an ambiguous quotation that is a final
9420 punctuation, even after spaces. */
9421 for (before
= 0; before
< NLBP
; before
++)
9423 after
= LBP_QU3
; set_table_cell (prohibited_no_sp
, true);
9424 set_table_cell (prohibited_with_sp
, true);
9427 /* (LB15a) Do not break after an ambiguous quotation that is an initial
9428 punctuation, even after spaces. */
9429 for (after
= 0; after
< NLBP
; after
++)
9431 before
= LBP_QU2
; set_table_cell (prohibited_no_sp
, true);
9432 set_table_cell (prohibited_with_sp
, true);
9435 /* (LB14) Do not break after '[', even after spaces. */
9436 for (after
= 0; after
< NLBP
; after
++)
9438 before
= LBP_OP
; set_table_cell (prohibited_no_sp
, true);
9439 set_table_cell (prohibited_with_sp
, true);
9442 /* (LB13) Do not break before ‘]’ or ‘!’ or ‘/’, even after spaces. */
9443 for (before
= 0; before
< NLBP
; before
++)
9445 after
= LBP_CL
; set_table_cell (prohibited_no_sp
, true);
9446 set_table_cell (prohibited_with_sp
, true);
9447 after
= LBP_CP
; set_table_cell (prohibited_no_sp
, true);
9448 set_table_cell (prohibited_with_sp
, true);
9449 after
= LBP_EX
; set_table_cell (prohibited_no_sp
, true);
9450 set_table_cell (prohibited_with_sp
, true);
9451 after
= LBP_SY
; set_table_cell (prohibited_no_sp
, true);
9452 set_table_cell (prohibited_with_sp
, true);
9455 /* (LB12a) Do not break before NBSP and related characters, except after
9456 spaces and hyphens. */
9457 for (before
= 0; before
< NLBP
; before
++)
9458 if (before
!= LBP_BA
&& before
!= LBP_HY
)
9460 after
= LBP_GL
; set_table_cell (prohibited_no_sp
, true);
9463 /* (LB12) Do not break after NBSP and related characters. */
9464 for (after
= 0; after
< NLBP
; after
++)
9466 before
= LBP_GL
; set_table_cell (prohibited_no_sp
, true);
9469 /* (LB11) Do not break before or after Word joiner and related characters. */
9470 for (before
= 0; before
< NLBP
; before
++)
9472 after
= LBP_WJ
; set_table_cell (prohibited_no_sp
, true);
9473 set_table_cell (prohibited_with_sp
, true);
9475 for (after
= 0; after
< NLBP
; after
++)
9477 before
= LBP_WJ
; set_table_cell (prohibited_no_sp
, true);
9480 /* (LB10) Treat any remaining combining mark or ZWJ as AL. */
9481 /* We resolve LBP_CM at runtime, before accessing the table. */
9482 for (before
= 0; before
< NLBP
; before
++)
9483 table
[before
][LBP_ZWJ
] = table
[before
][LBP_AL1
];
9484 for (after
= 0; after
< NLBP
; after
++)
9485 table
[LBP_ZWJ
][after
] = table
[LBP_AL1
][after
];
9486 table
[LBP_ZWJ
][LBP_ZWJ
] = table
[LBP_AL1
][LBP_AL1
];
9488 /* (LB8a) Do not break between a zero width joiner and an ideograph, emoji
9489 base or emoji modifier. */
9490 before
= LBP_ZWJ
; after
= LBP_ID
; set_table_cell (prohibited_no_sp
, true);
9491 before
= LBP_ZWJ
; after
= LBP_EB
; set_table_cell (prohibited_no_sp
, true);
9492 before
= LBP_ZWJ
; after
= LBP_EM
; set_table_cell (prohibited_no_sp
, true);
9494 /* Not reflected in the table:
9495 (LB30a) Break between two regional indicator symbols if and only if there are
9496 an even number of regional indicators preceding the position of the
9497 break.
9498 (LB28a) Don't break inside orthographic syllables of Brahmic scripts, lines
9500 (LB25) Do not break between the following pairs of classes relevant to
9501 numbers, lines with NU (SY|IS)* or OP NU or OP IS NU.
9502 (LB21a) Don't break after Hebrew + Hyphen/Break-After, before non-Hebrew.
9503 (LB20a) Don't break after a word-initial hyphen.
9504 (LB20) Break before and after unresolved CB.
9505 We resolve LBP_CB at runtime, before accessing the table.
9506 (LB19a) Don't break on either side of ambiguous quotation marks, except next
9507 to an EastAsian character.
9508 (LB15c) Break before a decimal mark that follows a space.
9509 Part of (LB15b) Do not break before an ambiguous quotation that is a final
9510 punctuation, even after spaces.
9511 Part of (LB15a) Do not break after an ambiguous quotation that is an initial
9512 punctuation, even after spaces.
9513 (LB9) Do not break a combining character sequence; treat it as if it has the
9514 line breaking class of the base character in all of the following rules.
9515 Treat ZWJ as if it were CM.
9516 Part of (LB8a) Don't break right after a zero-width joiner.
9517 (LB8) Break before any character following a zero-width space, even if one
9518 or more spaces intervene.
9519 We handle LBP_ZW at runtime, before accessing the table.
9520 (LB7) Do not break before spaces or zero width space.
9521 We handle LBP_ZW at runtime, before accessing the table.
9522 (LB6) Do not break before hard line breaks.
9523 We handle LBP_BK at runtime, before accessing the table.
9524 (LB5) Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
9525 (LB4) Always break after hard line breaks.
9526 (LB3) Always break at the end of text.
9527 (LB2) Never break at the start of text.
9528 */
9530 fprintf (stream
, "const unsigned char unilbrk_table[%u][%u] =\n", NLBP
, NLBP
);
9531 fprintf (stream
, "{\n");
9532 fprintf (stream
, " /* after */\n");
9534 fprintf (stream
, " /* ");
9535 for (after
= 0; after
< NLBP
; after
++)
9536 fprintf (stream
, " %-3s", lbp_value_to_string (after
) + 4);
9537 fprintf (stream
, " */\n");
9539 for (before
= 0; before
< NLBP
; before
++)
9541 fprintf (stream
, "/* %3s */ {", lbp_value_to_string (before
) + 4);
9542 for (after
= 0; after
< NLBP
; after
++)
9544 if (table
[before
][after
].prohibited_no_sp
)
9546 if (table
[before
][after
].prohibited_with_sp
)
9547 /* Prohibited break. */
9548 fprintf (stream
, " P,");
9550 /* Indirect break. */
9551 fprintf (stream
, " I,");
9555 if (table
[before
][after
].prohibited_with_sp
)
9559 fprintf (stream
, " D,");
9562 fprintf (stream
, " },\n");
9564 fprintf (stream
, "/* \"\" */\n");
9565 fprintf (stream
, "/* before */\n");
9566 fprintf (stream
, "};\n");
9568 if (ferror (stream
) || fclose (stream
))
9570 fprintf (stderr
, "error writing to '%s'\n", filename
);
9579 /* ========================================================================= */
9581 /* Word break property.
9582 Updated for Unicode TR #29 revision 17. */
9584 /* Possible values of the Word_Break property. */
9599 WBP_EXTENDNUMLET
= 7,
9608 /* Returns the word break property of ch, as a bit mask in which bit
   (1 << WBP_xxx) is set for each Word_Break value that applies to ch. */
9610 get_wbp (unsigned int ch
)
9614 if (unicode_attributes
[ch
].name
!= NULL
)
9617 attr
|= 1 << WBP_CR
;
9620 attr
|= 1 << WBP_LF
;
9622 if (ch
== 0x000B || ch
== 0x000C
9624 || ch
== 0x2028 || ch
== 0x2029)
9625 attr
|= 1 << WBP_NEWLINE
;
9627 if (((unicode_properties
[ch
] >> PROP_GRAPHEME_EXTEND
) & 1) != 0
9628 || ((unicode_properties
[ch
] >> PROP_OTHER_GRAPHEME_EXTEND
) & 1) != 0
9629 || (unicode_attributes
[ch
].category
!= NULL
9630 && strcmp (unicode_attributes
[ch
].category
, "Mc") == 0)
9631 || ((unicode_properties
[ch
] >> PROP_EMOJI_MODIFIER
) & 1) != 0 /* Emoji modifier */)
9632 attr
|= 1 << WBP_EXTEND
;
9634 if (unicode_attributes
[ch
].category
!= NULL
9635 && strcmp (unicode_attributes
[ch
].category
, "Cf") == 0
9636 && !(ch
>= 0x0600 && ch
<= 0x0605)
9639 && ch
!= 0x0890 && ch
!= 0x0891 && ch
!= 0x08E2
9640 && ch
!= 0x200B && ch
!= 0x200C && ch
!= 0x200D
9641 && ch
!= 0x110BD && ch
!= 0x110CD
9642 && !(ch
>= 0xe0020 && ch
<= 0xe007f))
9643 attr
|= 1 << WBP_FORMAT
;
9645 if ((unicode_scripts
[ch
] < numscripts
9646 && strcmp (scripts
[unicode_scripts
[ch
]], "Katakana") == 0)
9647 || (ch
>= 0x3031 && ch
<= 0x3035)
9648 || ch
== 0x309B || ch
== 0x309C || ch
== 0x30A0 || ch
== 0x30FC
9650 attr
|= 1 << WBP_KATAKANA
;
9652 if ((unicode_scripts
[ch
] < numscripts
9653 && strcmp (scripts
[unicode_scripts
[ch
]], "Hebrew") == 0)
9654 && strcmp (unicode_attributes
[ch
].category
, "Lo") == 0)
9655 attr
|= 1 << WBP_HL
;
9657 if ((((unicode_properties
[ch
] >> PROP_ALPHABETIC
) & 1) != 0
9658 || (ch
>= 0x02C2 && ch
<= 0x02C5)
9659 || (ch
>= 0x02D2 && ch
<= 0x02D7)
9660 || (ch
>= 0x02DE && ch
<= 0x02DF)
9661 || (ch
>= 0x02E5 && ch
<= 0x02EB)
9663 || (ch
>= 0x02EF && ch
<= 0x02FF)
9664 || (ch
>= 0x055A && ch
<= 0x055C)
9669 || (ch
>= 0xA708 && ch
<= 0xA716)
9670 || (ch
>= 0xA720 && ch
<= 0xA721)
9671 || (ch
>= 0xA789 && ch
<= 0xA78A)
9673 && ((unicode_properties
[ch
] >> PROP_IDEOGRAPHIC
) & 1) == 0
9674 && (attr
& (1 << WBP_KATAKANA
)) == 0
9675 && ((get_lbp (ch
) >> LBP_SA
) & 1) == 0
9676 && !(unicode_scripts
[ch
] < numscripts
9677 && strcmp (scripts
[unicode_scripts
[ch
]], "Hiragana") == 0)
9678 && (attr
& (1 << WBP_EXTEND
)) == 0
9679 && (attr
& (1 << WBP_HL
)) == 0)
9680 attr
|= 1 << WBP_ALETTER
;
9682 if (is_WBP_MIDNUMLET (ch
))
9683 attr
|= 1 << WBP_MIDNUMLET
;
9685 if (is_WBP_MIDLETTER (ch
) && ch
!= 0x02D7)
9686 attr
|= 1 << WBP_MIDLETTER
;
9688 if ((((get_lbp (ch
) >> LBP_IS
) & 1) != 0
9689 || ch
== 0x066C || ch
== 0xFE50 || ch
== 0xFE54 || ch
== 0xFF0C
9691 && ch
!= 0x003A && ch
!= 0xFE13 && ch
!= 0x002E)
9692 attr
|= 1 << WBP_MIDNUM
;
9694 if ((((get_lbp (ch
) >> LBP_NU
) & 1) != 0
9695 || (ch
>= 0x1B50 && ch
<= 0x1B59) /* BALINESE DIGIT ZERO..NINE */
9696 || (ch
>= 0xA9D0 && ch
<= 0xA9D9) /* JAVANESE DIGIT ZERO..NINE */
9697 || (ch
>= 0xAA50 && ch
<= 0xAA59) /* CHAM DIGIT ZERO..NINE */
9698 || (ch
>= 0xFF10 && ch
<= 0xFF19) /* FULLWIDTH DIGIT ZERO..NINE */
9699 || (ch
>= 0x11066 && ch
<= 0x1106F) /* BRAHMI DIGIT ZERO..NINE */
9700 || (ch
>= 0x11950 && ch
<= 0x11959) /* DIVES AKURU DIGIT ZERO..NINE */
9701 || (ch
>= 0x11F50 && ch
<= 0x11F59) /* KAWI DIGIT ZERO..NINE */
9702 || (ch
>= 0x16130 && ch
<= 0x16139) /* GURUNG KHEMA DIGIT ZERO..NINE */)
9704 attr
|= 1 << WBP_NUMERIC
;
9706 if ((unicode_attributes
[ch
].category
!= NULL
9707 && strcmp (unicode_attributes
[ch
].category
, "Pc") == 0)
9708 || ch
== 0x202F /* NARROW NO-BREAK SPACE */)
9709 attr
|= 1 << WBP_EXTENDNUMLET
;
9711 if (is_property_regional_indicator (ch
))
9712 attr
|= 1 << WBP_RI
;
9715 attr
|= 1 << WBP_DQ
;
9718 attr
|= 1 << WBP_SQ
;
9721 attr
|= 1 << WBP_ZWJ
;
9723 if (is_category_Zs (ch
) && ((get_lbp (ch
) >> LBP_GL
) & 1) == 0)
9724 attr
|= 1 << WBP_WSS
;
9729 attr
|= 1 << WBP_OTHER
;
9734 /* Output the word break property computed by get_wbp, for every character
   whose value is not just WBP_OTHER, in a human readable format. */
9736 debug_output_wbp (FILE *stream
)
9740 for (i
= 0; i
< 0x110000; i
++)
9742 int attr
= get_wbp (i
);
9743 if (attr
!= 1 << WBP_OTHER
)
9745 fprintf (stream
, "0x%04X", i
);
9746 if (attr
& (1 << WBP_CR
))
9747 fprintf (stream
, " CR");
9748 if (attr
& (1 << WBP_LF
))
9749 fprintf (stream
, " LF");
9750 if (attr
& (1 << WBP_NEWLINE
))
9751 fprintf (stream
, " Newline");
9752 if (attr
& (1 << WBP_EXTEND
))
9753 fprintf (stream
, " Extend");
9754 if (attr
& (1 << WBP_FORMAT
))
9755 fprintf (stream
, " Format");
9756 if (attr
& (1 << WBP_KATAKANA
))
9757 fprintf (stream
, " Katakana");
9758 if (attr
& (1 << WBP_ALETTER
))
9759 fprintf (stream
, " ALetter");
9760 if (attr
& (1 << WBP_MIDNUMLET
))
9761 fprintf (stream
, " MidNumLet");
9762 if (attr
& (1 << WBP_MIDLETTER
))
9763 fprintf (stream
, " MidLetter");
9764 if (attr
& (1 << WBP_MIDNUM
))
9765 fprintf (stream
, " MidNum");
9766 if (attr
& (1 << WBP_NUMERIC
))
9767 fprintf (stream
, " Numeric");
9768 if (attr
& (1 << WBP_EXTENDNUMLET
))
9769 fprintf (stream
, " ExtendNumLet");
9770 if (attr
& (1 << WBP_RI
))
9771 fprintf (stream
, " Regional_Indicator");
9772 if (attr
& (1 << WBP_DQ
))
9773 fprintf (stream
, " Double_Quote");
9774 if (attr
& (1 << WBP_SQ
))
9775 fprintf (stream
, " Single_Quote");
9776 if (attr
& (1 << WBP_HL
))
9777 fprintf (stream
, " Hebrew_Letter");
9778 if (attr
& (1 << WBP_ZWJ
))
9779 fprintf (stream
, " ZWJ");
9780 if (attr
& (1 << WBP_WSS
))
9781 fprintf (stream
, " WSegSpace");
9782 fprintf (stream
, "\n");
/* Writes the output of debug_output_wbp to the file named filename, which is
   opened with mode "w".  A message naming the file is printed to stderr when
   it cannot be opened for writing, and when ferror or fclose reports a
   failure after the data has been written.  */
9788 debug_output_wbrk_tables (const char *filename
)
9792 stream
= fopen (filename
, "w");
9795 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
9799 debug_output_wbp (stream
);
9801 if (ferror (stream
) || fclose (stream
))
9803 fprintf (stderr
, "error writing to '%s'\n", filename
);
9808 /* The word break property from the WordBreakProperty.txt file. */
9809 int unicode_org_wbp
[0x110000];
9811 /* Stores in unicode_org_wbp[] the word break property from the
9812 WordBreakProperty.txt file. */
9814 fill_org_wbp (const char *wordbreakproperty_filename
)
9819 for (i
= 0; i
< 0x110000; i
++)
9820 unicode_org_wbp
[i
] = WBP_OTHER
;
9822 stream
= fopen (wordbreakproperty_filename
, "r");
9825 fprintf (stderr
, "error during fopen of '%s'\n", wordbreakproperty_filename
);
9832 unsigned int i1
, i2
;
9833 char padding
[200+1];
9834 char propname
[200+1];
9837 if (fscanf (stream
, "%200[^\n]\n", buf
) < 1)
9840 if (buf
[0] == '\0' || buf
[0] == '#')
9843 if (sscanf (buf
, "%X..%X%[ ;]%[^ ]", &i1
, &i2
, padding
, propname
) != 4)
9845 if (sscanf (buf
, "%X%[ ;]%[^ ]", &i1
, padding
, propname
) != 3)
9847 fprintf (stderr
, "parse error in '%s'\n",
9848 wordbreakproperty_filename
);
9853 #define PROP(name,value) \
9854 if (strcmp (propname, name) == 0) propvalue = value; else
9857 PROP ("Newline", WBP_NEWLINE
)
9858 PROP ("Extend", WBP_EXTEND
)
9859 PROP ("Format", WBP_FORMAT
)
9860 PROP ("Katakana", WBP_KATAKANA
)
9861 PROP ("ALetter", WBP_ALETTER
)
9862 PROP ("MidNumLet", WBP_MIDNUMLET
)
9863 PROP ("MidLetter", WBP_MIDLETTER
)
9864 PROP ("MidNum", WBP_MIDNUM
)
9865 PROP ("Numeric", WBP_NUMERIC
)
9866 PROP ("ExtendNumLet", WBP_EXTENDNUMLET
)
9867 PROP ("Regional_Indicator", WBP_RI
)
9868 PROP ("Double_Quote", WBP_DQ
)
9869 PROP ("Single_Quote", WBP_SQ
)
9870 PROP ("Hebrew_Letter", WBP_HL
)
9871 PROP ("ZWJ", WBP_ZWJ
)
9872 PROP ("WSegSpace", WBP_WSS
)
9875 fprintf (stderr
, "unknown property value '%s' in '%s'\n", propname
,
9876 wordbreakproperty_filename
);
9879 assert (i1
<= i2
&& i2
< 0x110000);
9881 for (i
= i1
; i
<= i2
; i
++)
9882 unicode_org_wbp
[i
] = propvalue
;
9885 if (ferror (stream
) || fclose (stream
))
9887 fprintf (stderr
, "error reading from '%s'\n", wordbreakproperty_filename
);
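9892 /* Output the word break property taken from the WordBreakProperty.txt file
   (unicode_org_wbp[]), for every character whose value is not WBP_OTHER, in a
   human readable format. */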
9894 debug_output_org_wbp (FILE *stream
)
9898 for (i
= 0; i
< 0x110000; i
++)
9900 int propvalue
= unicode_org_wbp
[i
];
9901 if (propvalue
!= WBP_OTHER
)
9903 fprintf (stream
, "0x%04X", i
);
9904 #define PROP(name,value) \
9905 if (propvalue == value) fprintf (stream, " " name); else
9908 PROP ("Newline", WBP_NEWLINE
)
9909 PROP ("Extend", WBP_EXTEND
)
9910 PROP ("Format", WBP_FORMAT
)
9911 PROP ("Katakana", WBP_KATAKANA
)
9912 PROP ("ALetter", WBP_ALETTER
)
9913 PROP ("MidNumLet", WBP_MIDNUMLET
)
9914 PROP ("MidLetter", WBP_MIDLETTER
)
9915 PROP ("MidNum", WBP_MIDNUM
)
9916 PROP ("Numeric", WBP_NUMERIC
)
9917 PROP ("ExtendNumLet", WBP_EXTENDNUMLET
)
9918 PROP ("Regional_Indicator", WBP_RI
)
9919 PROP ("Double_Quote", WBP_DQ
)
9920 PROP ("Single_Quote", WBP_SQ
)
9921 PROP ("Hebrew_Letter", WBP_HL
)
9922 PROP ("ZWJ", WBP_ZWJ
)
9923 PROP ("WSegSpace", WBP_WSS
)
9925 fprintf (stream
, " ??");
9926 fprintf (stream
, "\n");
/* Writes the output of debug_output_org_wbp (the word break property as read
   into unicode_org_wbp[]) to the file named filename, which is opened with
   mode "w".  A message naming the file is printed to stderr when it cannot be
   opened for writing, and when ferror or fclose reports a failure after the
   data has been written.  */
9932 debug_output_org_wbrk_tables (const char *filename
)
9936 stream
= fopen (filename
, "w");
9939 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
9943 debug_output_org_wbp (stream
);
9945 if (ferror (stream
) || fclose (stream
))
9947 fprintf (stderr
, "error writing to '%s'\n", filename
);
9952 /* Construction of sparse 3-level tables. */
9953 #define TABLE wbp_table
9954 #define ELEMENT unsigned char
9955 #define DEFAULT WBP_OTHER
9956 #define xmalloc malloc
9957 #define xrealloc realloc
9961 output_wbp (FILE *stream
)
9965 unsigned int level1_offset
, level2_offset
, level3_offset
;
9969 wbp_table_init (&t
);
9971 for (i
= 0; i
< 0x110000; i
++)
9973 int attr
= get_wbp (i
);
9975 /* Now attr should contain exactly one bit. */
9976 assert (attr
!= 0 && (attr
& (attr
- 1)) == 0);
9978 if (attr
!= 1 << WBP_OTHER
)
9980 unsigned int log2_attr
;
9981 for (log2_attr
= 0; attr
> 1; attr
>>= 1, log2_attr
++);
9983 wbp_table_add (&t
, i
, log2_attr
);
9987 wbp_table_finalize (&t
);
9990 5 * sizeof (uint32_t);
9992 5 * sizeof (uint32_t)
9993 + t
.level1_size
* sizeof (uint32_t);
9995 5 * sizeof (uint32_t)
9996 + t
.level1_size
* sizeof (uint32_t)
9997 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
9999 for (i
= 0; i
< 5; i
++)
10000 fprintf (stream
, "#define wbrkprop_header_%d %d\n", i
,
10001 ((uint32_t *) t
.result
)[i
]);
10002 fprintf (stream
, "\n");
10003 fprintf (stream
, "typedef struct\n");
10004 fprintf (stream
, " {\n");
10005 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
10006 fprintf (stream
, " int level2[%zu << %d];\n", t
.level2_size
, t
.q
);
10007 fprintf (stream
, " unsigned char level3[%zu << %d];\n", t
.level3_size
, t
.p
);
10008 fprintf (stream
, " }\n");
10009 fprintf (stream
, "wbrkprop_t;\n");
10010 fprintf (stream
, "static const wbrkprop_t uniwbrkprop =\n");
10011 fprintf (stream
, "{\n");
10012 fprintf (stream
, " {");
10013 if (t
.level1_size
> 8)
10014 fprintf (stream
, "\n ");
10015 for (i
= 0; i
< t
.level1_size
; i
++)
10018 if (i
> 0 && (i
% 8) == 0)
10019 fprintf (stream
, "\n ");
10020 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
10022 fprintf (stream
, " %5d", -1);
10024 fprintf (stream
, " %5zu",
10025 (offset
- level2_offset
) / sizeof (uint32_t));
10026 if (i
+1 < t
.level1_size
)
10027 fprintf (stream
, ",");
10029 if (t
.level1_size
> 8)
10030 fprintf (stream
, "\n ");
10031 fprintf (stream
, " },\n");
10032 fprintf (stream
, " {");
10033 if (t
.level2_size
<< t
.q
> 8)
10034 fprintf (stream
, "\n ");
10035 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
10038 if (i
> 0 && (i
% 8) == 0)
10039 fprintf (stream
, "\n ");
10040 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
10042 fprintf (stream
, " %5d", -1);
10044 fprintf (stream
, " %5zu",
10045 (offset
- level3_offset
) / sizeof (unsigned char));
10046 if (i
+1 < t
.level2_size
<< t
.q
)
10047 fprintf (stream
, ",");
10049 if (t
.level2_size
<< t
.q
> 8)
10050 fprintf (stream
, "\n ");
10051 fprintf (stream
, " },\n");
10052 fprintf (stream
, " {");
10053 if (t
.level3_size
<< t
.p
> 4)
10054 fprintf (stream
, "\n ");
10055 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
10057 unsigned char value
= ((unsigned char *) (t
.result
+ level3_offset
))[i
];
10058 const char *value_string
;
10061 #define CASE(x) case x: value_string = #x; break;
10068 CASE(WBP_KATAKANA
);
10070 CASE(WBP_MIDNUMLET
);
10071 CASE(WBP_MIDLETTER
);
10074 CASE(WBP_EXTENDNUMLET
);
10085 if (i
> 0 && (i
% 4) == 0)
10086 fprintf (stream
, "\n ");
10087 fprintf (stream
, " %s%s", value_string
,
10088 (i
+1 < t
.level3_size
<< t
.p
? "," : ""));
10090 if (t
.level3_size
<< t
.p
> 4)
10091 fprintf (stream
, "\n ");
10092 fprintf (stream
, " }\n");
10093 fprintf (stream
, "};\n");
/* Writes the generated word break property source file to filename.
   The file consists of, in order: a "DO NOT EDIT" banner, a one-line
   description, a "Generated automatically by gen-uni-tables.c for Unicode %s"
   line (presumably filled in from version; the argument line is not visible
   here, TODO confirm), the copyright line followed by the text emitted by
   output_library_license, and finally the tables emitted by output_wbp.
   A message naming the file is printed to stderr when it cannot be opened
   for writing, and when ferror or fclose reports a failure at the end.  */
10097 output_wbrk_tables (const char *filename
, const char *version
)
10101 stream
= fopen (filename
, "w");
10102 if (stream
== NULL
)
10104 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
10108 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
10109 fprintf (stream
, "/* Word breaking properties of Unicode characters. */\n");
10110 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
10112 fprintf (stream
, "\n");
10114 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
10115 fprintf (stream
, "\n");
10116 output_library_license (stream
, false);
10117 fprintf (stream
, "\n");
10119 output_wbp (stream
);
10121 if (ferror (stream
) || fclose (stream
))
10123 fprintf (stderr
, "error writing to '%s'\n", filename
);
10128 /* ========================================================================= */
10130 /* Grapheme break property.
10131 Updated for Unicode TR #29 revision 29. */
10133 /* Possible values of the Grapheme_Cluster_Break property. */
10142 GBP_SPACINGMARK
= 6,
10156 /* Construction of sparse 3-level tables. */
10157 #define TABLE gbp_table
10158 #define ELEMENT unsigned char
10159 #define DEFAULT GBP_OTHER
10160 #define xmalloc malloc
10161 #define xrealloc realloc
10162 #include "3level.h"
10164 /* The grapheme break property from the GraphemeBreakProperty.txt file. */
10165 int unicode_org_gbp
[0x110000];
10167 /* Output the unit test data for the grapheme break property. */
10169 output_gbp_test (const char *filename
)
10175 stream
= fopen (filename
, "w");
10176 if (stream
== NULL
)
10178 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
10182 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
10183 fprintf (stream
, "/* Test the Unicode grapheme break property functions.\n");
10184 fprintf (stream
, " Copyright (C) 2010-2024 Free Software Foundation, Inc.\n");
10185 fprintf (stream
, "\n");
10186 output_tests_license (stream
);
10187 fprintf (stream
, "\n");
10189 need_comma
= false;
10190 for (ch
= 0; ch
< 0x110000; ch
++)
10192 int gbp
= unicode_org_gbp
[ch
];
10193 const char *gbp_string
;
10195 while (ch
+ 1 < 0x110000 && unicode_org_gbp
[ch
+ 1] == gbp
)
10200 #define CASE(x) case x: gbp_string = #x; break;
10207 CASE (GBP_SPACINGMARK
)
10225 fprintf (stream
, ",\n");
10226 fprintf (stream
, "{ 0x%04X, %s }", ch
+ 1, gbp_string
);
10230 fprintf (stream
, "\n");
10232 if (ferror (stream
) || fclose (stream
))
10234 fprintf (stderr
, "error writing to '%s'\n", filename
);
10239 /* Output the per-character grapheme break property table. */
10241 output_gbp_table (const char *filename
, const char *version
)
10244 unsigned int ch
, i
;
10245 struct gbp_table t
;
10246 unsigned int level1_offset
, level2_offset
, level3_offset
;
10248 stream
= fopen (filename
, "w");
10249 if (stream
== NULL
)
10251 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
10255 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
10256 fprintf (stream
, "/* Grapheme break property of Unicode characters. */\n");
10257 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
10259 fprintf (stream
, "\n");
10261 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
10262 fprintf (stream
, "\n");
10263 output_library_license (stream
, false);
10264 fprintf (stream
, "\n");
10268 gbp_table_init (&t
);
10270 for (ch
= 0; ch
< 0x110000; ch
++)
10271 gbp_table_add (&t
, ch
, unicode_org_gbp
[ch
]);
10273 gbp_table_finalize (&t
);
10275 /* Offsets in t.result, in memory of this process. */
10277 5 * sizeof (uint32_t);
10279 5 * sizeof (uint32_t)
10280 + t
.level1_size
* sizeof (uint32_t);
10282 5 * sizeof (uint32_t)
10283 + t
.level1_size
* sizeof (uint32_t)
10284 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
10286 for (i
= 0; i
< 5; i
++)
10287 fprintf (stream
, "#define gbrkprop_header_%d %d\n", i
,
10288 ((uint32_t *) t
.result
)[i
]);
10289 fprintf (stream
, "static const\n");
10290 fprintf (stream
, "struct\n");
10291 fprintf (stream
, " {\n");
10292 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
10293 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
10294 fprintf (stream
, " unsigned char level3[%zu << %d];\n",
10295 t
.level3_size
, t
.p
);
10296 fprintf (stream
, " }\n");
10297 fprintf (stream
, "unigbrkprop =\n");
10298 fprintf (stream
, "{\n");
10299 fprintf (stream
, " {");
10300 if (t
.level1_size
> 8)
10301 fprintf (stream
, "\n ");
10302 for (i
= 0; i
< t
.level1_size
; i
++)
10305 if (i
> 0 && (i
% 8) == 0)
10306 fprintf (stream
, "\n ");
10307 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
10309 fprintf (stream
, " %5d", -1);
10311 fprintf (stream
, " %5zu",
10312 (offset
- level2_offset
) / sizeof (uint32_t));
10313 if (i
+1 < t
.level1_size
)
10314 fprintf (stream
, ",");
10316 if (t
.level1_size
> 8)
10317 fprintf (stream
, "\n ");
10318 fprintf (stream
, " },\n");
10319 fprintf (stream
, " {");
10320 if (t
.level2_size
<< t
.q
> 8)
10321 fprintf (stream
, "\n ");
10322 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
10325 if (i
> 0 && (i
% 8) == 0)
10326 fprintf (stream
, "\n ");
10327 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
10329 fprintf (stream
, " %5d", -1);
10331 fprintf (stream
, " %5zu",
10332 (offset
- level3_offset
) / sizeof (uint8_t));
10333 if (i
+1 < t
.level2_size
<< t
.q
)
10334 fprintf (stream
, ",");
10336 if (t
.level2_size
<< t
.q
> 8)
10337 fprintf (stream
, "\n ");
10338 fprintf (stream
, " },\n");
10339 fprintf (stream
, " {");
10340 if (t
.level3_size
<< t
.p
> 4)
10341 fprintf (stream
, "\n ");
10342 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
10344 unsigned char value
= ((unsigned char *) (t
.result
+ level3_offset
))[i
];
10345 const char *value_string
;
10348 #define CASE(x) case x: value_string = #x; break;
10355 CASE (GBP_SPACINGMARK
)
10371 if (i
> 0 && (i
% 4) == 0)
10372 fprintf (stream
, "\n ");
10373 fprintf (stream
, " %s%s", value_string
,
10374 (i
+1 < t
.level3_size
<< t
.p
? "," : ""));
10376 if (t
.level3_size
<< t
.p
> 4)
10377 fprintf (stream
, "\n ");
10378 fprintf (stream
, " }\n");
10379 fprintf (stream
, "};\n");
10381 if (ferror (stream
) || fclose (stream
))
10383 fprintf (stderr
, "error writing to '%s'\n", filename
);
10388 /* Stores in unicode_org_gbp[] the grapheme breaking property from the
10389 GraphemeBreakProperty.txt file. */
10391 fill_org_gbp (const char *graphemebreakproperty_filename
)
10397 for (i
= 0; i
< 0x110000; i
++)
10398 unicode_org_gbp
[i
] = GBP_OTHER
;
10400 stream
= fopen (graphemebreakproperty_filename
, "r");
10401 if (stream
== NULL
)
10403 fprintf (stderr
, "error during fopen of '%s'\n",
10404 graphemebreakproperty_filename
);
10411 unsigned int i1
, i2
;
10412 char padding
[200+1];
10413 char propname
[200+1];
10417 if (fscanf (stream
, "%200[^\n]\n", buf
) < 1)
10420 if (buf
[0] == '\0' || buf
[0] == '#')
10423 if (sscanf (buf
, "%X..%X%[ ;]%[^ ]", &i1
, &i2
, padding
, propname
) != 4)
10425 if (sscanf (buf
, "%X%[ ;]%[^ ]", &i1
, padding
, propname
) != 3)
10427 fprintf (stderr
, "parse error in '%s'\n",
10428 graphemebreakproperty_filename
);
10433 #define PROP(name,value) \
10434 if (strcmp (propname, name) == 0) propvalue = value; else
10435 PROP ("CR", GBP_CR
)
10436 PROP ("LF", GBP_LF
)
10437 PROP ("Control", GBP_CONTROL
)
10438 PROP ("Extend", GBP_EXTEND
)
10439 PROP ("Prepend", GBP_PREPEND
)
10440 PROP ("SpacingMark", GBP_SPACINGMARK
)
10444 PROP ("LV", GBP_LV
)
10445 PROP ("LVT", GBP_LVT
)
10446 PROP ("Regional_Indicator", GBP_RI
)
10447 PROP ("ZWJ", GBP_ZWJ
)
10448 PROP ("E_Base", GBP_EB
)
10449 PROP ("E_Modifier", GBP_EM
)
10450 PROP ("Glue_After_Zwj", GBP_GAZ
)
10451 PROP ("E_Base_GAZ", GBP_EBG
)
10454 fprintf (stderr
, "unknown property value '%s' in %s:%d\n", propname
,
10455 graphemebreakproperty_filename
, lineno
);
10458 assert (i1
<= i2
&& i2
< 0x110000);
10460 for (i
= i1
; i
<= i2
; i
++)
10461 unicode_org_gbp
[i
] = propvalue
;
10464 if (ferror (stream
) || fclose (stream
))
10466 fprintf (stderr
, "error reading from '%s'\n", graphemebreakproperty_filename
);
10471 /* ========================================================================= */
10473 /* Composition and decomposition.
10474 Updated for Unicode TR #15 revision 33. */
10476 /* Maximum number of characters into which a single Unicode character can be
10477 decomposed. */
10478 #define MAX_DECOMP_LENGTH 18
10482 UC_DECOMP_CANONICAL
,/* Canonical decomposition. */
10483 UC_DECOMP_FONT
, /* <font> A font variant (e.g. a blackletter form). */
10484 UC_DECOMP_NOBREAK
, /* <noBreak> A no-break version of a space or hyphen. */
10485 UC_DECOMP_INITIAL
, /* <initial> An initial presentation form (Arabic). */
10486 UC_DECOMP_MEDIAL
, /* <medial> A medial presentation form (Arabic). */
10487 UC_DECOMP_FINAL
, /* <final> A final presentation form (Arabic). */
10488 UC_DECOMP_ISOLATED
,/* <isolated> An isolated presentation form (Arabic). */
10489 UC_DECOMP_CIRCLE
, /* <circle> An encircled form. */
10490 UC_DECOMP_SUPER
, /* <super> A superscript form. */
10491 UC_DECOMP_SUB
, /* <sub> A subscript form. */
10492 UC_DECOMP_VERTICAL
,/* <vertical> A vertical layout presentation form. */
10493 UC_DECOMP_WIDE
, /* <wide> A wide (or zenkaku) compatibility character. */
10494 UC_DECOMP_NARROW
, /* <narrow> A narrow (or hankaku) compatibility character. */
10495 UC_DECOMP_SMALL
, /* <small> A small variant form (CNS compatibility). */
10496 UC_DECOMP_SQUARE
, /* <square> A CJK squared font variant. */
10497 UC_DECOMP_FRACTION
,/* <fraction> A vulgar fraction form. */
10498 UC_DECOMP_COMPAT
/* <compat> Otherwise unspecified compatibility character. */
10501 /* Return the decomposition for a Unicode character (ignoring Hangul Jamo
10502 decompositions). Return the type, or -1 for none. */
10504 get_decomposition (unsigned int ch
,
10505 unsigned int *lengthp
, unsigned int decomposed
[MAX_DECOMP_LENGTH
])
10507 const char *decomposition
= unicode_attributes
[ch
].decomposition
;
10509 if (decomposition
!= NULL
&& decomposition
[0] != '\0')
10511 int type
= UC_DECOMP_CANONICAL
;
10512 unsigned int length
;
10515 if (decomposition
[0] == '<')
10517 const char *rangle
;
10520 rangle
= strchr (decomposition
+ 1, '>');
10521 assert (rangle
!= NULL
);
10522 typelen
= rangle
+ 1 - decomposition
;
10523 #define TYPE(t1,t2) \
10524 if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
10527 TYPE ("<font>", UC_DECOMP_FONT
)
10528 TYPE ("<noBreak>", UC_DECOMP_NOBREAK
)
10529 TYPE ("<initial>", UC_DECOMP_INITIAL
)
10530 TYPE ("<medial>", UC_DECOMP_MEDIAL
)
10531 TYPE ("<final>", UC_DECOMP_FINAL
)
10532 TYPE ("<isolated>", UC_DECOMP_ISOLATED
)
10533 TYPE ("<circle>", UC_DECOMP_CIRCLE
)
10534 TYPE ("<super>", UC_DECOMP_SUPER
)
10535 TYPE ("<sub>", UC_DECOMP_SUB
)
10536 TYPE ("<vertical>", UC_DECOMP_VERTICAL
)
10537 TYPE ("<wide>", UC_DECOMP_WIDE
)
10538 TYPE ("<narrow>", UC_DECOMP_NARROW
)
10539 TYPE ("<small>", UC_DECOMP_SMALL
)
10540 TYPE ("<square>", UC_DECOMP_SQUARE
)
10541 TYPE ("<fraction>", UC_DECOMP_FRACTION
)
10542 TYPE ("<compat>", UC_DECOMP_COMPAT
)
10544 fprintf (stderr
, "unknown decomposition type %*s\n", (int)typelen
, decomposition
);
10548 decomposition
= rangle
+ 1;
10549 if (decomposition
[0] == ' ')
10552 for (length
= 0; length
< MAX_DECOMP_LENGTH
; length
++)
10554 decomposed
[length
] = strtoul (decomposition
, &endptr
, 16);
10555 if (endptr
== decomposition
)
10557 decomposition
= endptr
;
10558 if (decomposition
[0] == ' ')
10561 /* Make sure that *DECOMPOSITION is not NULL-terminated.
10562 Otherwise MAX_DECOMP_LENGTH is too small. */
10563 assert (*decomposition
== '\0');
10572 /* Construction of sparse 3-level tables. */
10573 #define TABLE decomp_table
10574 #define ELEMENT uint16_t
10575 #define DEFAULT (uint16_t)(-1)
10576 #define xmalloc malloc
10577 #define xrealloc realloc
10578 #include "3level.h"
10581 output_decomposition (FILE *stream1
, FILE *stream2
)
10583 struct decomp_table t
;
10584 unsigned int level1_offset
, level2_offset
, level3_offset
;
10585 unsigned int offset
;
10591 decomp_table_init (&t
);
10593 fprintf (stream1
, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
10594 fprintf (stream1
, "\n");
10595 fprintf (stream2
, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
10598 for (ch
= 0; ch
< 0x110000; ch
++)
10600 unsigned int length
;
10601 unsigned int decomposed
[MAX_DECOMP_LENGTH
];
10602 int type
= get_decomposition (ch
, &length
, decomposed
);
10606 assert (offset
< (1 << 15));
10607 decomp_table_add (&t
, ch
, ((type
== UC_DECOMP_CANONICAL
? 0 : 1) << 15) | offset
);
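10609 /* Produce LENGTH entries of 3 bytes each.  In each 24-bit entry, bit 23
 is set when another entry follows, bits 22..18 of the first entry hold
 the decomposition type, and bits 17..0 hold the code point. */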
10610 /* We would need a special representation of zero-length entries. */
10611 assert (length
!= 0);
10612 for (i
= 0; i
< length
; i
++)
10615 fprintf (stream2
, ",");
10616 if ((offset
% 4) == 0)
10617 fprintf (stream2
, "\n ");
10618 assert (decomposed
[i
] < (1 << 18));
10619 fprintf (stream2
, " 0x%02X, 0x%02X, 0x%02X",
10620 (((i
+1 < length
? (1 << 23) : 0)
10621 | (i
== 0 ? (type
<< 18) : 0)
10622 | decomposed
[i
]) >> 16) & 0xff,
10623 (decomposed
[i
] >> 8) & 0xff,
10624 decomposed
[i
] & 0xff);
10630 fprintf (stream2
, "\n};\n");
10631 fprintf (stream2
, "\n");
10633 decomp_table_finalize (&t
);
10636 5 * sizeof (uint32_t);
10638 5 * sizeof (uint32_t)
10639 + t
.level1_size
* sizeof (uint32_t);
10641 5 * sizeof (uint32_t)
10642 + t
.level1_size
* sizeof (uint32_t)
10643 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
10645 for (i
= 0; i
< 5; i
++)
10646 fprintf (stream1
, "#define decomp_header_%d %d\n", i
,
10647 ((uint32_t *) t
.result
)[i
]);
10648 fprintf (stream1
, "\n");
10649 fprintf (stream1
, "typedef struct\n");
10650 fprintf (stream1
, " {\n");
10651 fprintf (stream1
, " int level1[%zu];\n", t
.level1_size
);
10652 fprintf (stream1
, " int level2[%zu << %d];\n", t
.level2_size
, t
.q
);
10653 fprintf (stream1
, " unsigned short level3[%zu << %d];\n", t
.level3_size
, t
.p
);
10654 fprintf (stream1
, " }\n");
10655 fprintf (stream1
, "decomp_index_table_t;\n");
10656 fprintf (stream1
, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
10657 fprintf (stream2
, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
10658 fprintf (stream2
, "{\n");
10659 fprintf (stream2
, " {");
10660 if (t
.level1_size
> 8)
10661 fprintf (stream2
, "\n ");
10662 for (i
= 0; i
< t
.level1_size
; i
++)
10665 if (i
> 0 && (i
% 8) == 0)
10666 fprintf (stream2
, "\n ");
10667 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
10669 fprintf (stream2
, " %5d", -1);
10671 fprintf (stream2
, " %5zu",
10672 (offset
- level2_offset
) / sizeof (uint32_t));
10673 if (i
+1 < t
.level1_size
)
10674 fprintf (stream2
, ",");
10676 if (t
.level1_size
> 8)
10677 fprintf (stream2
, "\n ");
10678 fprintf (stream2
, " },\n");
10679 fprintf (stream2
, " {");
10680 if (t
.level2_size
<< t
.q
> 8)
10681 fprintf (stream2
, "\n ");
10682 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
10685 if (i
> 0 && (i
% 8) == 0)
10686 fprintf (stream2
, "\n ");
10687 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
10689 fprintf (stream2
, " %5d", -1);
10691 fprintf (stream2
, " %5zu",
10692 (offset
- level3_offset
) / sizeof (uint16_t));
10693 if (i
+1 < t
.level2_size
<< t
.q
)
10694 fprintf (stream2
, ",");
10696 if (t
.level2_size
<< t
.q
> 8)
10697 fprintf (stream2
, "\n ");
10698 fprintf (stream2
, " },\n");
10699 fprintf (stream2
, " {");
10700 if (t
.level3_size
<< t
.p
> 8)
10701 fprintf (stream2
, "\n ");
10702 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
10704 uint16_t value
= ((uint16_t *) (t
.result
+ level3_offset
))[i
];
10705 if (i
> 0 && (i
% 8) == 0)
10706 fprintf (stream2
, "\n ");
10707 fprintf (stream2
, " %5d", value
== (uint16_t)(-1) ? -1 : value
);
10708 if (i
+1 < t
.level3_size
<< t
.p
)
10709 fprintf (stream2
, ",");
10711 if (t
.level3_size
<< t
.p
> 8)
10712 fprintf (stream2
, "\n ");
10713 fprintf (stream2
, " }\n");
10714 fprintf (stream2
, "};\n");
10718 output_decomposition_tables (const char *filename1
, const char *filename2
, const char *version
)
10720 const char *filenames
[2];
10724 filenames
[0] = filename1
;
10725 filenames
[1] = filename2
;
10727 for (i
= 0; i
< 2; i
++)
10729 streams
[i
] = fopen (filenames
[i
], "w");
10730 if (streams
[i
] == NULL
)
10732 fprintf (stderr
, "cannot open '%s' for writing\n", filenames
[i
]);
10737 for (i
= 0; i
< 2; i
++)
10739 FILE *stream
= streams
[i
];
10741 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
10742 fprintf (stream
, "/* Decomposition of Unicode characters. */\n");
10743 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
10745 fprintf (stream
, "\n");
10747 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
10748 fprintf (stream
, "\n");
10749 output_library_license (stream
, true);
10750 fprintf (stream
, "\n");
10753 output_decomposition (streams
[0], streams
[1]);
10755 for (i
= 0; i
< 2; i
++)
10757 if (ferror (streams
[i
]) || fclose (streams
[i
]))
10759 fprintf (stderr
, "error writing to '%s'\n", filenames
[i
]);
10765 /* The "excluded from composition" property from the CompositionExclusions.txt file. */
10766 char unicode_composition_exclusions
[0x110000];
10769 fill_composition_exclusions (const char *compositionexclusions_filename
)
10774 stream
= fopen (compositionexclusions_filename
, "r");
10775 if (stream
== NULL
)
10777 fprintf (stderr
, "error during fopen of '%s'\n", compositionexclusions_filename
);
10781 for (i
= 0; i
< 0x110000; i
++)
10782 unicode_composition_exclusions
[i
] = 0;
10789 if (fscanf (stream
, "%200[^\n]\n", buf
) < 1)
10792 if (buf
[0] == '\0' || buf
[0] == '#')
10795 if (sscanf (buf
, "%X", &i
) != 1)
10797 fprintf (stderr
, "parse error in '%s'\n", compositionexclusions_filename
);
10800 assert (i
< 0x110000);
10802 unicode_composition_exclusions
[i
] = 1;
10805 if (ferror (stream
) || fclose (stream
))
10807 fprintf (stderr
, "error reading from '%s'\n", compositionexclusions_filename
);
10813 debug_output_composition_tables (const char *filename
)
10818 stream
= fopen (filename
, "w");
10819 if (stream
== NULL
)
10821 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
10825 for (ch
= 0; ch
< 0x110000; ch
++)
10827 unsigned int length
;
10828 unsigned int decomposed
[MAX_DECOMP_LENGTH
];
10829 int type
= get_decomposition (ch
, &length
, decomposed
);
10831 if (type
== UC_DECOMP_CANONICAL
10832 /* Consider only binary decompositions.
10833 Exclude singleton decompositions. */
10836 unsigned int code1
= decomposed
[0];
10837 unsigned int code2
= decomposed
[1];
10838 unsigned int combined
= ch
;
10840 /* Exclude decompositions where the first part is not a starter,
10841 i.e. is not of canonical combining class 0. */
10842 if (strcmp (unicode_attributes
[code1
].combining
, "0") == 0
10843 /* Exclude characters listed in CompositionExclusions.txt. */
10844 && !unicode_composition_exclusions
[combined
])
10846 /* The combined character must now also be a starter.  Verify this.  */
10848 assert (strcmp (unicode_attributes
[combined
].combining
, "0") == 0);
10850 fprintf (stream
, "0x%04X\t0x%04X\t0x%04X\t%s\n",
10854 unicode_attributes
[code2
].combining
);
10859 if (ferror (stream
) || fclose (stream
))
10861 fprintf (stderr
, "error writing to '%s'\n", filename
);
10867 output_composition_tables (const char *filename
, const char *filename2
,
10868 const char *version
)
10870 unsigned int max_code1
;
10871 unsigned int max_code2
;
10878 stream
= fopen (filename
, "w");
10879 if (stream
== NULL
)
10881 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
10885 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
10886 fprintf (stream
, "/* Canonical composition of Unicode characters. */\n");
10887 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
10889 fprintf (stream
, "\n");
10891 fprintf (stream
, "/* Copyright (C) 2009-2024 Free Software Foundation, Inc.\n");
10892 fprintf (stream
, "\n");
10893 output_library_license (stream
, true);
10894 fprintf (stream
, "\n");
10896 /* The composition table is a set of mappings (code1, code2) -> combined,
 with 928 entries,
10898 367 values for code1 (from 0x003C to 0x30FD),
10899 54 values for code2 (from 0x0300 to 0x309A).
10900 For a fixed code1, there are from 1 to 19 possible values for code2.
10901 For a fixed code2, there are from 1 to 117 possible values for code1.
10902 This is a very sparse matrix.
10904 We want an O(1) hash lookup.
10906 We could implement the hash lookup by mapping (code1, code2) to a linear
10907 combination mul1*code1 + mul2*code2, which is then used as an index into
10908 a 3-level table. But this leads to a table of size 37 KB.
10910 We use gperf to implement the hash lookup, giving it the 928 sets of
10911 4 bytes (code1, code2) as input. gperf generates a hash table of size
10912 1527, which is quite good (60% filled). It requires an auxiliary table
10913 lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
10915 fprintf (stream
, "struct composition_rule { char codes[6]; };\n");
10916 fprintf (stream
, "%%struct-type\n");
10917 fprintf (stream
, "%%language=ANSI-C\n");
10918 fprintf (stream
, "%%define slot-name codes\n");
10919 fprintf (stream
, "%%define hash-function-name gl_uninorm_compose_hash\n");
10920 fprintf (stream
, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
10921 fprintf (stream
, "%%compare-lengths\n");
10922 fprintf (stream
, "%%compare-strncmp\n");
10923 fprintf (stream
, "%%readonly-tables\n");
10924 fprintf (stream
, "%%omit-struct-type\n");
10925 fprintf (stream
, "%%%%\n");
10927 for (ch
= 0; ch
< 0x110000; ch
++)
10929 unsigned int length
;
10930 unsigned int decomposed
[MAX_DECOMP_LENGTH
];
10931 int type
= get_decomposition (ch
, &length
, decomposed
);
10933 if (type
== UC_DECOMP_CANONICAL
10934 /* Consider only binary decompositions.
10935 Exclude singleton decompositions. */
10938 unsigned int code1
= decomposed
[0];
10939 unsigned int code2
= decomposed
[1];
10940 unsigned int combined
= ch
;
10942 /* Exclude decompositions where the first part is not a starter,
10943 i.e. is not of canonical combining class 0. */
10944 if (strcmp (unicode_attributes
[code1
].combining
, "0") == 0
10945 /* Exclude characters listed in CompositionExclusions.txt. */
10946 && !unicode_composition_exclusions
[combined
])
10948 /* The combined character must now also be a starter.  Verify this.  */
10950 assert (strcmp (unicode_attributes
[combined
].combining
, "0") == 0);
10952 if (max_code1
< code1
)
10954 if (max_code2
< code2
)
10957 fprintf (stream
, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
10958 (code1
>> 16) & 0xff, (code1
>> 8) & 0xff, code1
& 0xff,
10959 (code2
>> 16) & 0xff, (code2
>> 8) & 0xff, code2
& 0xff,
10965 if (ferror (stream
) || fclose (stream
))
10967 fprintf (stderr
, "error writing to '%s'\n", filename
);
10971 stream
= fopen (filename2
, "w");
10972 if (stream
== NULL
)
10974 fprintf (stderr
, "cannot open '%s' for writing\n", filename2
);
10978 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
10979 fprintf (stream
, "/* Canonical composition of Unicode characters. */\n");
10980 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
10982 fprintf (stream
, "\n");
10984 fprintf (stream
, "/* Copyright (C) 2009-2024 Free Software Foundation, Inc.\n");
10985 fprintf (stream
, "\n");
10986 output_library_license (stream
, true);
10987 fprintf (stream
, "\n");
10989 fprintf (stream
, "/* Maximum value of the first argument for which gl_uninorm_compose_lookup\n"
10990 " can return a non-NULL value. */\n");
10991 fprintf (stream
, "#define UNINORM_COMPOSE_MAX_ARG1 0x%x\n", max_code1
);
10992 fprintf (stream
, "/* Maximum value of the second argument for which gl_uninorm_compose_lookup\n"
10993 " can return a non-NULL value. */\n");
10994 fprintf (stream
, "#define UNINORM_COMPOSE_MAX_ARG2 0x%x\n", max_code2
);
10996 if (ferror (stream
) || fclose (stream
))
10998 fprintf (stderr
, "error writing to '%s'\n", filename2
);
11003 /* ========================================================================= */
11005 /* Output the test for a simple character mapping table to the given file. */
11008 output_simple_mapping_test (const char *filename
,
11009 const char *function_name
,
11010 unsigned int (*func
) (unsigned int),
11011 const char *version
)
11017 stream
= fopen (filename
, "w");
11018 if (stream
== NULL
)
11020 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
11024 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
11025 fprintf (stream
, "/* Test the Unicode character mapping functions.\n");
11026 fprintf (stream
, " Copyright (C) 2009-2024 Free Software Foundation, Inc.\n");
11027 fprintf (stream
, "\n");
11028 output_tests_license (stream
);
11029 fprintf (stream
, "\n");
11030 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
11032 fprintf (stream
, "\n");
11033 fprintf (stream
, "#include \"test-mapping-part1.h\"\n");
11034 fprintf (stream
, "\n");
11036 need_comma
= false;
11037 for (ch
= 0; ch
< 0x110000; ch
++)
11039 unsigned int value
= func (ch
);
11044 fprintf (stream
, ",\n");
11045 fprintf (stream
, " { 0x%04X, 0x%04X }", ch
, value
);
11050 fprintf (stream
, "\n");
11052 fprintf (stream
, "\n");
11053 fprintf (stream
, "#define MAP(c) %s (c)\n", function_name
);
11054 fprintf (stream
, "#include \"test-mapping-part2.h\"\n");
11056 if (ferror (stream
) || fclose (stream
))
11058 fprintf (stderr
, "error writing to '%s'\n", filename
);
11063 /* Construction of sparse 3-level tables. */
11064 #define TABLE mapping_table
11065 #define ELEMENT int32_t
11067 #define xmalloc malloc
11068 #define xrealloc realloc
11069 #include "3level.h"
11071 /* Output a simple character mapping table to the given file. */
11074 output_simple_mapping (const char *filename
,
11075 unsigned int (*func
) (unsigned int),
11076 const char *version
)
11079 unsigned int ch
, i
;
11080 struct mapping_table t
;
11081 unsigned int level1_offset
, level2_offset
, level3_offset
;
11083 stream
= fopen (filename
, "w");
11084 if (stream
== NULL
)
11086 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
11090 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
11091 fprintf (stream
, "/* Simple character mapping of Unicode characters. */\n");
11092 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
11094 fprintf (stream
, "\n");
11096 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
11097 fprintf (stream
, "\n");
11098 output_library_license (stream
,
11099 strcmp (filename
, "unicase/tolower.h") == 0
11100 || strcmp (filename
, "unicase/toupper.h") == 0);
11101 fprintf (stream
, "\n");
11105 mapping_table_init (&t
);
11107 for (ch
= 0; ch
< 0x110000; ch
++)
11109 int value
= (int) func (ch
) - (int) ch
;
11111 mapping_table_add (&t
, ch
, value
);
11114 mapping_table_finalize (&t
);
11116 /* Offsets in t.result, in memory of this process. */
11118 5 * sizeof (uint32_t);
11120 5 * sizeof (uint32_t)
11121 + t
.level1_size
* sizeof (uint32_t);
11123 5 * sizeof (uint32_t)
11124 + t
.level1_size
* sizeof (uint32_t)
11125 + (t
.level2_size
<< t
.q
) * sizeof (uint32_t);
11127 for (i
= 0; i
< 5; i
++)
11128 fprintf (stream
, "#define mapping_header_%d %d\n", i
,
11129 ((uint32_t *) t
.result
)[i
]);
11130 fprintf (stream
, "static const\n");
11131 fprintf (stream
, "struct\n");
11132 fprintf (stream
, " {\n");
11133 fprintf (stream
, " int level1[%zu];\n", t
.level1_size
);
11134 fprintf (stream
, " short level2[%zu << %d];\n", t
.level2_size
, t
.q
);
11135 fprintf (stream
, " int level3[%zu << %d];\n", t
.level3_size
, t
.p
);
11136 fprintf (stream
, " }\n");
11137 fprintf (stream
, "u_mapping =\n");
11138 fprintf (stream
, "{\n");
11139 fprintf (stream
, " {");
11140 if (t
.level1_size
> 8)
11141 fprintf (stream
, "\n ");
11142 for (i
= 0; i
< t
.level1_size
; i
++)
11145 if (i
> 0 && (i
% 8) == 0)
11146 fprintf (stream
, "\n ");
11147 offset
= ((uint32_t *) (t
.result
+ level1_offset
))[i
];
11149 fprintf (stream
, " %5d", -1);
11151 fprintf (stream
, " %5zu",
11152 (offset
- level2_offset
) / sizeof (uint32_t));
11153 if (i
+1 < t
.level1_size
)
11154 fprintf (stream
, ",");
11156 if (t
.level1_size
> 8)
11157 fprintf (stream
, "\n ");
11158 fprintf (stream
, " },\n");
11159 fprintf (stream
, " {");
11160 if (t
.level2_size
<< t
.q
> 8)
11161 fprintf (stream
, "\n ");
11162 for (i
= 0; i
< t
.level2_size
<< t
.q
; i
++)
11165 if (i
> 0 && (i
% 8) == 0)
11166 fprintf (stream
, "\n ");
11167 offset
= ((uint32_t *) (t
.result
+ level2_offset
))[i
];
11169 fprintf (stream
, " %5d", -1);
11171 fprintf (stream
, " %5zu",
11172 (offset
- level3_offset
) / sizeof (int32_t));
11173 if (i
+1 < t
.level2_size
<< t
.q
)
11174 fprintf (stream
, ",");
11176 if (t
.level2_size
<< t
.q
> 8)
11177 fprintf (stream
, "\n ");
11178 fprintf (stream
, " },\n");
11179 fprintf (stream
, " {");
11180 if (t
.level3_size
<< t
.p
> 8)
11181 fprintf (stream
, "\n ");
11182 for (i
= 0; i
< t
.level3_size
<< t
.p
; i
++)
11184 if (i
> 0 && (i
% 8) == 0)
11185 fprintf (stream
, "\n ");
11186 fprintf (stream
, " %5d", ((int32_t *) (t
.result
+ level3_offset
))[i
]);
11187 if (i
+1 < t
.level3_size
<< t
.p
)
11188 fprintf (stream
, ",");
11190 if (t
.level3_size
<< t
.p
> 8)
11191 fprintf (stream
, "\n ");
11192 fprintf (stream
, " }\n");
11193 fprintf (stream
, "};\n");
11195 if (ferror (stream
) || fclose (stream
))
11197 fprintf (stderr
, "error writing to '%s'\n", filename
);
11202 /* ========================================================================= */
11204 /* A special casing context.
11205 A context is negated through x -> -x. */
11210 SCC_AFTER_SOFT_DOTTED
,
/* A special casing rule.  */
struct special_casing_rule
{
  /* The code point the rule applies to.  */
  unsigned int code;
  /* The mappings: up to 3 code points each, padded with zeros.  */
  unsigned int lower_mapping[3];
  unsigned int title_mapping[3];
  unsigned int upper_mapping[3];
  unsigned int casefold_mapping[3];
  /* The language the rule is restricted to, or NULL for no restriction.  */
  const char *language;
  /* The context (an SCC_* value, negated for a "Not_" context).  */
  int context;
};

/* The special casing rules.  */
struct special_casing_rule **casing_rules;
unsigned int num_casing_rules;
unsigned int allocated_casing_rules;

/* Append NEW_RULE to casing_rules[], growing the array when it is full.
   The capacity doubles on each growth, with a minimum of 16 slots.
   Only the pointer is stored; the rule itself is not copied.
   Terminates the program if memory cannot be allocated.  */
static void
add_casing_rule (struct special_casing_rule *new_rule)
{
  if (num_casing_rules == allocated_casing_rules)
    {
      unsigned int new_allocated = 2 * allocated_casing_rules;
      struct special_casing_rule **new_rules;

      if (new_allocated < 16)
        new_allocated = 16;
      /* Keep the old pointer until realloc is known to have succeeded.  */
      new_rules =
        (struct special_casing_rule **)
        realloc (casing_rules, new_allocated * sizeof (struct special_casing_rule *));
      if (new_rules == NULL)
        {
          fprintf (stderr, "out of memory\n");
          exit (1);
        }
      casing_rules = new_rules;
      allocated_casing_rules = new_allocated;
    }
  casing_rules[num_casing_rules++] = new_rule;
}
11248 /* Stores in casing_rules the special casing rules found in
11249 specialcasing_filename. */
11251 fill_casing_rules (const char *specialcasing_filename
)
11255 stream
= fopen (specialcasing_filename
, "r");
11256 if (stream
== NULL
)
11258 fprintf (stderr
, "error during fopen of '%s'\n", specialcasing_filename
);
11262 casing_rules
= NULL
;
11263 num_casing_rules
= 0;
11264 allocated_casing_rules
= 0;
11274 unsigned int lower_mapping
[3];
11275 unsigned int title_mapping
[3];
11276 unsigned int upper_mapping
[3];
11280 if (fscanf (stream
, "%200[^\n]\n", buf
) < 1)
11283 if (buf
[0] == '\0' || buf
[0] == '#')
11288 code
= strtoul (scanptr
, &endptr
, 16);
11289 if (endptr
== scanptr
)
11291 fprintf (stderr
, "parse error in '%s'\n", specialcasing_filename
);
11295 if (*scanptr
!= ';')
11297 fprintf (stderr
, "parse error in '%s'\n", specialcasing_filename
);
11302 /* Scan lower mapping. */
11303 for (i
= 0; i
< 3; i
++)
11304 lower_mapping
[i
] = 0;
11305 for (i
= 0; i
< 3; i
++)
11307 while (*scanptr
== ' ')
11309 if (*scanptr
== ';')
11311 lower_mapping
[i
] = strtoul (scanptr
, &endptr
, 16);
11312 if (endptr
== scanptr
)
11314 fprintf (stderr
, "parse error in '%s'\n", specialcasing_filename
);
11319 if (*scanptr
!= ';')
11321 fprintf (stderr
, "parse error in '%s'\n", specialcasing_filename
);
11326 /* Scan title mapping. */
11327 for (i
= 0; i
< 3; i
++)
11328 title_mapping
[i
] = 0;
11329 for (i
= 0; i
< 3; i
++)
11331 while (*scanptr
== ' ')
11333 if (*scanptr
== ';')
11335 title_mapping
[i
] = strtoul (scanptr
, &endptr
, 16);
11336 if (endptr
== scanptr
)
11338 fprintf (stderr
, "parse error in '%s'\n", specialcasing_filename
);
11343 if (*scanptr
!= ';')
11345 fprintf (stderr
, "parse error in '%s'\n", specialcasing_filename
);
11350 /* Scan upper mapping. */
11351 for (i
= 0; i
< 3; i
++)
11352 upper_mapping
[i
] = 0;
11353 for (i
= 0; i
< 3; i
++)
11355 while (*scanptr
== ' ')
11357 if (*scanptr
== ';')
11359 upper_mapping
[i
] = strtoul (scanptr
, &endptr
, 16);
11360 if (endptr
== scanptr
)
11362 fprintf (stderr
, "parse error in '%s'\n", specialcasing_filename
);
11367 if (*scanptr
!= ';')
11369 fprintf (stderr
, "parse error in '%s'\n", specialcasing_filename
);
11374 /* Scan language and context. */
11376 context
= SCC_ALWAYS
;
11377 while (*scanptr
== ' ')
11379 if (*scanptr
!= '\0' && *scanptr
!= '#')
11381 const char *word_begin
= scanptr
;
11382 const char *word_end
;
11384 while (*scanptr
!= '\0' && *scanptr
!= '#' && *scanptr
!= ';' && *scanptr
!= ' ')
11386 word_end
= scanptr
;
11388 while (*scanptr
== ' ')
11391 if (word_end
- word_begin
== 2)
11393 language
= (char *) malloc ((word_end
- word_begin
) + 1);
11394 memcpy (language
, word_begin
, 2);
11395 language
[word_end
- word_begin
] = '\0';
11396 word_begin
= word_end
= NULL
;
11398 if (*scanptr
!= '\0' && *scanptr
!= '#' && *scanptr
!= ';')
11400 word_begin
= scanptr
;
11401 while (*scanptr
!= '\0' && *scanptr
!= '#' && *scanptr
!= ';' && *scanptr
!= ' ')
11403 word_end
= scanptr
;
11407 if (word_end
> word_begin
)
11409 bool negate
= false;
11411 if (word_end
- word_begin
>= 4 && memcmp (word_begin
, "Not_", 4) == 0)
11416 if (word_end
- word_begin
== 11 && memcmp (word_begin
, "Final_Sigma", 11) == 0)
11417 context
= SCC_FINAL_SIGMA
;
11418 else if (word_end
- word_begin
== 17 && memcmp (word_begin
, "After_Soft_Dotted", 17) == 0)
11419 context
= SCC_AFTER_SOFT_DOTTED
;
11420 else if (word_end
- word_begin
== 10 && memcmp (word_begin
, "More_Above", 10) == 0)
11421 context
= SCC_MORE_ABOVE
;
11422 else if (word_end
- word_begin
== 10 && memcmp (word_begin
, "Before_Dot", 10) == 0)
11423 context
= SCC_BEFORE_DOT
;
11424 else if (word_end
- word_begin
== 7 && memcmp (word_begin
, "After_I", 7) == 0)
11425 context
= SCC_AFTER_I
;
11428 fprintf (stderr
, "unknown context type in '%s'\n", specialcasing_filename
);
11432 context
= - context
;
11435 if (*scanptr
!= '\0' && *scanptr
!= '#' && *scanptr
!= ';')
11437 fprintf (stderr
, "parse error in '%s'\n", specialcasing_filename
);
11442 /* Store the rule. */
11444 struct special_casing_rule
*new_rule
=
11445 (struct special_casing_rule
*) malloc (sizeof (struct special_casing_rule
));
11446 new_rule
->code
= code
;
11447 new_rule
->language
= language
;
11448 new_rule
->context
= context
;
11449 memcpy (new_rule
->lower_mapping
, lower_mapping
, sizeof (new_rule
->lower_mapping
));
11450 memcpy (new_rule
->title_mapping
, title_mapping
, sizeof (new_rule
->title_mapping
));
11451 memcpy (new_rule
->upper_mapping
, upper_mapping
, sizeof (new_rule
->upper_mapping
));
11453 add_casing_rule (new_rule
);
11457 if (ferror (stream
) || fclose (stream
))
11459 fprintf (stderr
, "error reading from '%s'\n", specialcasing_filename
);
11464 /* A casefolding rule. */
11465 struct casefold_rule
11468 unsigned int mapping
[3];
11469 const char *language
;
11472 /* The casefolding rules. */
11473 struct casefold_rule
**casefolding_rules
;
11474 unsigned int num_casefolding_rules
;
11475 unsigned int allocated_casefolding_rules
;
11477 /* Stores in casefolding_rules the case folding rules found in
11478 casefolding_filename. */
11480 fill_casefolding_rules (const char *casefolding_filename
)
11484 stream
= fopen (casefolding_filename
, "r");
11485 if (stream
== NULL
)
11487 fprintf (stderr
, "error during fopen of '%s'\n", casefolding_filename
);
11491 casefolding_rules
= NULL
;
11492 num_casefolding_rules
= 0;
11493 allocated_casefolding_rules
= 0;
11504 unsigned int mapping
[3];
11506 if (fscanf (stream
, "%200[^\n]\n", buf
) < 1)
11509 if (buf
[0] == '\0' || buf
[0] == '#')
11514 code
= strtoul (scanptr
, &endptr
, 16);
11515 if (endptr
== scanptr
)
11517 fprintf (stderr
, "parse error in '%s'\n", casefolding_filename
);
11521 if (*scanptr
!= ';')
11523 fprintf (stderr
, "parse error in '%s'\n", casefolding_filename
);
11529 while (*scanptr
== ' ')
11534 case 'C': case 'F': case 'S': case 'T':
11538 fprintf (stderr
, "parse error in '%s'\n", casefolding_filename
);
11542 if (*scanptr
!= ';')
11544 fprintf (stderr
, "parse error in '%s'\n", casefolding_filename
);
11549 /* Scan casefold mapping. */
11550 for (i
= 0; i
< 3; i
++)
11552 for (i
= 0; i
< 3; i
++)
11554 while (*scanptr
== ' ')
11556 if (*scanptr
== ';')
11558 mapping
[i
] = strtoul (scanptr
, &endptr
, 16);
11559 if (endptr
== scanptr
)
11561 fprintf (stderr
, "parse error in '%s'\n", casefolding_filename
);
11566 if (*scanptr
!= ';')
11568 fprintf (stderr
, "parse error in '%s'\n", casefolding_filename
);
11573 /* Ignore rules of type 'S'; we use the rules of type 'F' instead. */
11576 const char * const *languages
;
11577 unsigned int languages_count
;
11579 /* Type 'T' indicates that the rule is applicable to Turkish
 languages only.  */
11583 static const char * const turkish_languages
[] = { "tr", "az" };
11584 languages
= turkish_languages
;
11585 languages_count
= 2;
11589 static const char * const all_languages
[] = { NULL
};
11590 languages
= all_languages
;
11591 languages_count
= 1;
11594 for (i
= 0; i
< languages_count
; i
++)
11596 /* Store a new rule. */
11597 struct casefold_rule
*new_rule
=
11598 (struct casefold_rule
*) malloc (sizeof (struct casefold_rule
));
11599 new_rule
->code
= code
;
11600 memcpy (new_rule
->mapping
, mapping
, sizeof (new_rule
->mapping
));
11601 new_rule
->language
= languages
[i
];
11603 if (num_casefolding_rules
== allocated_casefolding_rules
)
11605 allocated_casefolding_rules
= 2 * allocated_casefolding_rules
;
11606 if (allocated_casefolding_rules
< 16)
11607 allocated_casefolding_rules
= 16;
11608 casefolding_rules
=
11609 (struct casefold_rule
**)
11610 realloc (casefolding_rules
,
11611 allocated_casefolding_rules
* sizeof (struct casefold_rule
*));
11613 casefolding_rules
[num_casefolding_rules
++] = new_rule
;
11618 if (ferror (stream
) || fclose (stream
))
11620 fprintf (stderr
, "error reading from '%s'\n", casefolding_filename
);
11625 /* Casefold mapping, when it maps to a single character. */
11626 unsigned int unicode_casefold
[0x110000];
11628 static unsigned int
11629 to_casefold (unsigned int ch
)
11631 return unicode_casefold
[ch
];
11634 /* Redistribute the casefolding_rules:
11635 - Rules that map to a single character, language independently, are stored
11636 in unicode_casefold.
11637 - Other rules are merged into casing_rules. */
11639 redistribute_casefolding_rules (void)
11641 unsigned int ch
, i
, j
;
11643 /* Fill unicode_casefold[]. */
11644 for (ch
= 0; ch
< 0x110000; ch
++)
11645 unicode_casefold
[ch
] = ch
;
11646 for (i
= 0; i
< num_casefolding_rules
; i
++)
11648 struct casefold_rule
*cfrule
= casefolding_rules
[i
];
11650 if (cfrule
->language
== NULL
&& cfrule
->mapping
[1] == 0)
11653 assert (ch
< 0x110000);
11654 unicode_casefold
[ch
] = cfrule
->mapping
[0];
11658 /* Extend the special casing rules by filling in their casefold_mapping[]
 field.  */
11660 for (j
= 0; j
< num_casing_rules
; j
++)
11662 struct special_casing_rule
*rule
= casing_rules
[j
];
11665 rule
->casefold_mapping
[0] = to_casefold (rule
->code
);
11666 for (k
= 1; k
< 3; k
++)
11667 rule
->casefold_mapping
[k
] = 0;
11670 /* Now merge the other casefolding rules into casing_rules. */
11671 for (i
= 0; i
< num_casefolding_rules
; i
++)
11673 struct casefold_rule
*cfrule
= casefolding_rules
[i
];
11675 if (!(cfrule
->language
== NULL
&& cfrule
->mapping
[1] == 0))
11677 /* Find a rule that applies to the same code, same language, and it
11678 has context SCC_ALWAYS. At the same time, update all rules that
11679 have the same code and same or more specific language. */
11680 struct special_casing_rule
*found_rule
= NULL
;
11682 for (j
= 0; j
< num_casing_rules
; j
++)
11684 struct special_casing_rule
*rule
= casing_rules
[j
];
11686 if (rule
->code
== cfrule
->code
11687 && (cfrule
->language
== NULL
11688 || (rule
->language
!= NULL
11689 && strcmp (rule
->language
, cfrule
->language
) == 0)))
11691 memcpy (rule
->casefold_mapping
, cfrule
->mapping
,
11692 sizeof (rule
->casefold_mapping
));
11694 if ((cfrule
->language
== NULL
11695 ? rule
->language
== NULL
11696 : rule
->language
!= NULL
11697 && strcmp (rule
->language
, cfrule
->language
) == 0)
11698 && rule
->context
== SCC_ALWAYS
)
11706 if (found_rule
== NULL
)
11708 /* Create a new rule. */
11709 struct special_casing_rule
*new_rule
=
11710 (struct special_casing_rule
*) malloc (sizeof (struct special_casing_rule
));
11712 /* Try to find a rule that applies to the same code, no language
11713 restriction, and with context SCC_ALWAYS. */
11714 for (j
= 0; j
< num_casing_rules
; j
++)
11716 struct special_casing_rule
*rule
= casing_rules
[j
];
11718 if (rule
->code
== cfrule
->code
11719 && rule
->context
== SCC_ALWAYS
11720 && rule
->language
== NULL
)
11728 new_rule
->code
= cfrule
->code
;
11729 new_rule
->language
= cfrule
->language
;
11730 new_rule
->context
= SCC_ALWAYS
;
11731 if (found_rule
!= NULL
)
11733 memcpy (new_rule
->lower_mapping
, found_rule
->lower_mapping
,
11734 sizeof (new_rule
->lower_mapping
));
11735 memcpy (new_rule
->title_mapping
, found_rule
->title_mapping
,
11736 sizeof (new_rule
->title_mapping
));
11737 memcpy (new_rule
->upper_mapping
, found_rule
->upper_mapping
,
11738 sizeof (new_rule
->upper_mapping
));
11744 new_rule
->lower_mapping
[0] = to_lower (cfrule
->code
);
11745 for (k
= 1; k
< 3; k
++)
11746 new_rule
->lower_mapping
[k
] = 0;
11747 new_rule
->title_mapping
[0] = to_title (cfrule
->code
);
11748 for (k
= 1; k
< 3; k
++)
11749 new_rule
->title_mapping
[k
] = 0;
11750 new_rule
->upper_mapping
[0] = to_upper (cfrule
->code
);
11751 for (k
= 1; k
< 3; k
++)
11752 new_rule
->upper_mapping
[k
] = 0;
11754 memcpy (new_rule
->casefold_mapping
, cfrule
->mapping
,
11755 sizeof (new_rule
->casefold_mapping
));
11757 add_casing_rule (new_rule
);
11764 compare_casing_rules (const void *a
, const void *b
)
11766 struct special_casing_rule
*a_rule
= *(struct special_casing_rule
**) a
;
11767 struct special_casing_rule
*b_rule
= *(struct special_casing_rule
**) b
;
11768 unsigned int a_code
= a_rule
->code
;
11769 unsigned int b_code
= b_rule
->code
;
11771 if (a_code
< b_code
)
11773 if (a_code
> b_code
)
11776 /* Sort the more specific rules before the more general ones. */
11777 return (- ((a_rule
->language
!= NULL
? 1 : 0) + (a_rule
->context
!= SCC_ALWAYS
? 1 : 0))
11778 + ((b_rule
->language
!= NULL
? 1 : 0) + (b_rule
->context
!= SCC_ALWAYS
? 1 : 0)));
11782 sort_casing_rules (void)
11784 /* Sort the rules 1. by code, 2. by specificity. */
11785 if (num_casing_rules
> 1)
11786 qsort (casing_rules
, num_casing_rules
, sizeof (struct special_casing_rule
*),
11787 compare_casing_rules
);
11790 /* Output the special casing rules. */
11792 output_casing_rules (const char *filename
, const char *version
)
11796 unsigned int minor
;
11798 stream
= fopen (filename
, "w");
11799 if (stream
== NULL
)
11801 fprintf (stderr
, "cannot open '%s' for writing\n", filename
);
11805 fprintf (stream
, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
11806 fprintf (stream
, "/* Special casing rules of Unicode characters. */\n");
11807 fprintf (stream
, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
11809 fprintf (stream
, "\n");
11811 fprintf (stream
, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
11812 fprintf (stream
, "\n");
11813 output_library_license (stream
, false);
11814 fprintf (stream
, "\n");
11816 fprintf (stream
, "struct special_casing_rule { char code[3]; };\n");
11817 fprintf (stream
, "%%struct-type\n");
11818 fprintf (stream
, "%%language=ANSI-C\n");
11819 fprintf (stream
, "%%define slot-name code\n");
11820 fprintf (stream
, "%%define hash-function-name gl_unicase_special_hash\n");
11821 fprintf (stream
, "%%define lookup-function-name gl_unicase_special_lookup\n");
11822 fprintf (stream
, "%%compare-lengths\n");
11823 fprintf (stream
, "%%compare-strncmp\n");
11824 fprintf (stream
, "%%readonly-tables\n");
11825 fprintf (stream
, "%%omit-struct-type\n");
11826 fprintf (stream
, "%%%%\n");
11829 for (i
= 0; i
< num_casing_rules
; i
++)
11831 struct special_casing_rule
*rule
= casing_rules
[i
];
11834 if (i
> 0 && rule
->code
== casing_rules
[i
- 1]->code
)
11839 if (!(rule
->code
< 0x10000))
11841 fprintf (stderr
, "special rule #%u: code %u out of range\n", i
, rule
->code
);
11845 fprintf (stream
, "\"\\x%02x\\x%02x\\x%02x\", ",
11846 (rule
->code
>> 8) & 0xff, rule
->code
& 0xff, minor
);
11848 fprintf (stream
, "%d, ",
11849 i
+ 1 < num_casing_rules
&& casing_rules
[i
+ 1]->code
== rule
->code
? 1 : 0);
11851 context
= rule
->context
;
11854 fprintf (stream
, "-");
11855 context
= - context
;
11858 fprintf (stream
, " ");
11862 fprintf (stream
, "SCC_ALWAYS ");
11864 case SCC_FINAL_SIGMA
:
11865 fprintf (stream
, "SCC_FINAL_SIGMA ");
11867 case SCC_AFTER_SOFT_DOTTED
:
11868 fprintf (stream
, "SCC_AFTER_SOFT_DOTTED");
11870 case SCC_MORE_ABOVE
:
11871 fprintf (stream
, "SCC_MORE_ABOVE ");
11873 case SCC_BEFORE_DOT
:
11874 fprintf (stream
, "SCC_BEFORE_DOT ");
11877 fprintf (stream
, "SCC_AFTER_I ");
11882 fprintf (stream
, ", ");
11884 if (rule
->language
!= NULL
)
11886 assert (strlen (rule
->language
) == 2);
11887 fprintf (stream
, "{ '%c', '%c' }, ", rule
->language
[0], rule
->language
[1]);
11890 fprintf (stream
, "{ '\\0', '\\0' }, ");
11892 fprintf (stream
, "{ ");
11893 for (j
= 0; j
< 3; j
++)
11896 fprintf (stream
, ", ");
11897 if (!(rule
->upper_mapping
[j
] < 0x10000))
11899 fprintf (stderr
, "special rule #%u: upper mapping of code %u out of range\n", i
, rule
->code
);
11902 if (rule
->upper_mapping
[j
] != 0)
11903 fprintf (stream
, "0x%04X", rule
->upper_mapping
[j
]);
11905 fprintf (stream
, " 0");
11907 fprintf (stream
, " }, { ");
11908 for (j
= 0; j
< 3; j
++)
11911 fprintf (stream
, ", ");
11912 if (!(rule
->lower_mapping
[j
] < 0x10000))
11914 fprintf (stderr
, "special rule #%u: lower mapping of code %u out of range\n", i
, rule
->code
);
11917 if (rule
->lower_mapping
[j
] != 0)
11918 fprintf (stream
, "0x%04X", rule
->lower_mapping
[j
]);
11920 fprintf (stream
, " 0");
11922 fprintf (stream
, " }, { ");
11923 for (j
= 0; j
< 3; j
++)
11926 fprintf (stream
, ", ");
11927 if (!(rule
->title_mapping
[j
] < 0x10000))
11929 fprintf (stderr
, "special rule #%u: title mapping of code %u out of range\n", i
, rule
->code
);
11932 if (rule
->title_mapping
[j
] != 0)
11933 fprintf (stream
, "0x%04X", rule
->title_mapping
[j
]);
11935 fprintf (stream
, " 0");
11937 fprintf (stream
, " }, { ");
11938 for (j
= 0; j
< 3; j
++)
11941 fprintf (stream
, ", ");
11942 if (!(rule
->casefold_mapping
[j
] < 0x10000))
11944 fprintf (stderr
, "special rule #%u: casefold mapping of code %u out of range\n", i
, rule
->code
);
11947 if (rule
->casefold_mapping
[j
] != 0)
11948 fprintf (stream
, "0x%04X", rule
->casefold_mapping
[j
]);
11950 fprintf (stream
, " 0");
11952 fprintf (stream
, " }\n");
11955 if (ferror (stream
) || fclose (stream
))
11957 fprintf (stderr
, "error writing to '%s'\n", filename
);
11962 /* ========================================================================= */
/* Quoting the Unicode standard:
     Definition: A character is defined to be "cased" if it has the Lowercase
     or Uppercase property or has a General_Category value of
     Titlecase_Letter.  */
static bool
is_cased (unsigned int ch)
{
  /* The three criteria are tested in the order of the definition; the first
     one that holds decides.  */
  if (is_property_lowercase (ch))
    return true;
  if (is_property_uppercase (ch))
    return true;
  if (is_category_Lt (ch))
    return true;
  return false;
}
11976 /* Quoting the Unicode standard:
11977 Definition: A character is defined to be "case-ignorable" if it has the
11978 value MidLetter {or the value MidNumLet} for the Word_Break property or
11979 its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
11980 Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
11981 The text marked in braces was added in Unicode 5.1.0, see
11982 <https://www.unicode.org/versions/Unicode5.1.0/> section "Update of
11983 Definition of case-ignorable". */
11984 /* Since this predicate is only used for the "Before C" and "After C"
11985 conditions of FINAL_SIGMA, we exclude the "cased" characters here.
11986 This simplifies the evaluation of the regular expressions
11987 \p{cased} (\p{case-ignorable})* C
11989 C (\p{case-ignorable})* \p{cased}
11992 is_case_ignorable (unsigned int ch
)
11994 return (unicode_org_wbp
[ch
] == WBP_MIDLETTER
11995 || unicode_org_wbp
[ch
] == WBP_MIDNUMLET
11996 || is_category_Mn (ch
)
11997 || is_category_Me (ch
)
11998 || is_category_Cf (ch
)
11999 || is_category_Lm (ch
)
12000 || is_category_Sk (ch
))
12004 /* ------------------------------------------------------------------------- */
12006 /* Output all case related properties. */
12008 output_casing_properties (const char *version
)
12010 #define PROPERTY(FN,P) \
12011 debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \
12012 output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \
12013 output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version);
12014 PROPERTY(cased
, cased
)
12015 PROPERTY(ignorable
, case_ignorable
)
12019 /* ========================================================================= */
/* Output the Unicode version.
   Writes to FILENAME a C source file that defines
   _libunistring_unicode_version as (MAJOR << 8) | MINOR, where MAJOR and
   MINOR are the first two decimal components of VERSION ("MAJOR.MINOR...").
   Exits with status 1 if VERSION does not start with two such components or
   if the file cannot be written.  */
static void
output_version (const char *filename, const char *version)
{
  FILE *stream;
  int major;
  int minor;

  /* Parse before creating the file, so that a malformed VERSION neither
     leaves a partial file behind nor makes us print uninitialized
     values.  */
  if (sscanf (version, "%d.%d", &major, &minor) != 2)
    {
      fprintf (stderr, "cannot parse Unicode version '%s'\n", version);
      exit (1);
    }

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Supported Unicode version. */\n");
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");

  fprintf (stream, "/* Copyright (C) 2024 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  output_library_license (stream, false);
  fprintf (stream, "\n");

  fprintf (stream, "#include <config.h>\n");
  fprintf (stream, "\n");

  fprintf (stream, "/* Specification. */\n");
  fprintf (stream, "#include \"unimetadata.h\"\n");
  fprintf (stream, "\n");

  fprintf (stream, "const int _libunistring_unicode_version = (%d << 8) | %d;\n",
           major, minor);

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
12065 /* ========================================================================= */
12068 main (int argc
, char * argv
[])
12070 const char *unicodedata_filename
;
12071 const char *proplist_filename
;
12072 const char *derivedproplist_filename
;
12073 const char *emojidata_filename
;
12074 const char *arabicshaping_filename
;
12075 const char *scripts_filename
;
12076 const char *blocks_filename
;
12077 const char *proplist30_filename
;
12078 const char *bidimirroring_filename
;
12079 const char *eastasianwidth_filename
;
12080 const char *linebreak_filename
;
12081 const char *wordbreakproperty_filename
;
12082 const char *graphemebreakproperty_filename
;
12083 const char *compositionexclusions_filename
;
12084 const char *specialcasing_filename
;
12085 const char *casefolding_filename
;
12086 const char *version
;
12090 fprintf (stderr
, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt emoji-data.txt ArabicShaping.txt Scripts.txt Blocks.txt PropList-3.0.1.txt BidiMirroring.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
12095 unicodedata_filename
= argv
[1];
12096 proplist_filename
= argv
[2];
12097 derivedproplist_filename
= argv
[3];
12098 emojidata_filename
= argv
[4];
12099 arabicshaping_filename
= argv
[5];
12100 scripts_filename
= argv
[6];
12101 blocks_filename
= argv
[7];
12102 proplist30_filename
= argv
[8];
12103 bidimirroring_filename
= argv
[9];
12104 eastasianwidth_filename
= argv
[10];
12105 linebreak_filename
= argv
[11];
12106 wordbreakproperty_filename
= argv
[12];
12107 graphemebreakproperty_filename
= argv
[13];
12108 compositionexclusions_filename
= argv
[14];
12109 specialcasing_filename
= argv
[15];
12110 casefolding_filename
= argv
[16];
12111 version
= argv
[17];
12113 fill_attributes (unicodedata_filename
);
12114 clear_properties ();
12115 fill_properties (proplist_filename
);
12116 fill_properties (derivedproplist_filename
);
12117 fill_properties (emojidata_filename
);
12118 fill_properties30 (proplist30_filename
);
12119 fill_arabicshaping (arabicshaping_filename
);
12120 fill_scripts (scripts_filename
);
12121 fill_blocks (blocks_filename
);
12122 fill_mirror (bidimirroring_filename
);
12123 fill_width (eastasianwidth_filename
);
12124 fill_org_lbp (linebreak_filename
);
12125 fill_org_wbp (wordbreakproperty_filename
);
12126 fill_org_gbp (graphemebreakproperty_filename
);
12127 fill_composition_exclusions (compositionexclusions_filename
);
12128 fill_casing_rules (specialcasing_filename
);
12129 fill_casefolding_rules (casefolding_filename
);
12130 redistribute_casefolding_rules ();
12131 sort_casing_rules ();
12133 output_categories (version
);
12134 output_category ("unictype/categ_of.h", version
);
12135 output_combclass ("unictype/combiningclass.h", version
);
12136 output_bidi_category ("unictype/bidi_of.h", version
);
12137 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version
);
12138 output_decimal_digit ("unictype/decdigit.h", version
);
12139 output_digit_test ("../tests/unictype/test-digit.h", version
);
12140 output_digit ("unictype/digit.h", version
);
12141 output_numeric_test ("../tests/unictype/test-numeric.h", version
);
12142 output_numeric ("unictype/numeric.h", version
);
12143 output_mirror ("unictype/mirror.h", version
);
12144 output_properties (version
);
12145 output_indic_conjunct_break_test ("../tests/unictype/test-incb_of.h", version
);
12146 output_indic_conjunct_break ("unictype/incb_of.h", version
);
12147 output_joining_type_test ("../tests/unictype/test-joiningtype_of.h", version
);
12148 output_joining_type ("unictype/joiningtype_of.h", version
);
12149 output_joining_group_test ("../tests/unictype/test-joininggroup_of.h", version
);
12150 output_joining_group ("unictype/joininggroup_of.h", version
);
12152 output_scripts (version
);
12153 output_scripts_byname (version
);
12154 output_blocks (version
);
12155 output_ident_properties (version
);
12156 output_nonspacing_property ("uniwidth/width0.h", version
);
12157 output_width2_property ("uniwidth/width2.h", version
);
12158 output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part");
12159 output_old_ctype (version
);
12161 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
12162 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
12163 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version
);
12164 output_lbrk_rules_as_tables ("unilbrk/lbrktables.c", version
);
12166 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
12167 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
12168 output_wbrk_tables ("uniwbrk/wbrkprop.h", version
);
12170 output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
12171 output_gbp_table ("unigbrk/gbrkprop.h", version
);
12173 output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version
);
12174 debug_output_composition_tables ("uninorm/composition.txt");
12175 output_composition_tables ("uninorm/composition-table.gperf", "uninorm/composition-table-bounds.h", version
);
12177 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper
, version
);
12178 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower
, version
);
12179 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title
, version
);
12180 output_simple_mapping ("unicase/toupper.h", to_upper
, version
);
12181 output_simple_mapping ("unicase/tolower.h", to_lower
, version
);
12182 output_simple_mapping ("unicase/totitle.h", to_title
, version
);
12183 output_simple_mapping ("unicase/tocasefold.h", to_casefold
, version
);
12184 output_casing_rules ("unicase/special-casing-table.gperf", version
);
12185 output_casing_properties (version
);
12187 output_version ("unimetadata/u-version.c", version
);
12195 * compile-command: "\
12196 * gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \\
12197 * ./gen-uni-tables \\
12198 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/UnicodeData.txt \\
12199 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/PropList.txt \\
12200 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/DerivedCoreProperties.txt \\
12201 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/emoji/emoji-data.txt \\
12202 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/ArabicShaping.txt \\
12203 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/Scripts.txt \\
12204 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/Blocks.txt \\
12205 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \\
12206 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/BidiMirroring.txt \\
12207 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/EastAsianWidth.txt \\
12208 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/LineBreak.txt \\
12209 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/auxiliary/WordBreakProperty.txt \\
12210 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \\
12211 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/CompositionExclusions.txt \\
12212 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/SpecialCasing.txt \\
12213 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/CaseFolding.txt \\
12215 * && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \\
12216 * && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt \\
12217 * && clisp -C uniname/gen-uninames.lisp \\
12218 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/UnicodeData.txt \\
12219 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/NameAliases.txt \\
12220 * uniname/uninames.h \\
12221 * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
12223 * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/NameAliases.txt; } \\
12224 * > ../tests/uniname/NameAliases.txt \\
12225 * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
12227 * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/UnicodeData.txt; } \\
12228 * > ../tests/uniname/UnicodeData.txt \\
12229 * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
12231 * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/NormalizationTest.txt; } \\
12232 * > ../tests/uninorm/NormalizationTest.txt \\
12233 * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
12235 * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/auxiliary/GraphemeBreakTest.txt; } \\
12236 * > ../tests/unigbrk/GraphemeBreakTest.txt \\
12237 * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
12239 * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/auxiliary/LineBreakTest.txt; } \\
12240 * > ../tests/unilbrk/LineBreakTest.txt \\
12241 * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
12243 * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/auxiliary/WordBreakTest.txt; } \\
12244 * > ../tests/uniwbrk/WordBreakTest.txt"