gettext: Sync with gettext 0.23.
[gnulib.git] / lib / gen-uni-tables.c
blob617af649f1170502c435e88c23a009eecde3d963
/* Generate Unicode conforming character classification tables and
   line break properties tables and word break property tables and
   decomposition/composition and case mapping tables from a UnicodeData file.
   Copyright (C) 2000-2002, 2004, 2007-2024 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2000-2002.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* Usage example:
     $ gen-uni-tables /usr/local/share/www.unicode.org/Public/16.0.0/ucd/UnicodeData.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/PropList.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/DerivedCoreProperties.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/ArabicShaping.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/Scripts.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/Blocks.txt \
                      /usr/local/share/www.unicode.org/Public/3.0-Update1/PropList-3.0.1.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/BidiMirroring.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/EastAsianWidth.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/LineBreak.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/auxiliary/WordBreakProperty.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/CompositionExclusions.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/SpecialCasing.txt \
                      /usr/local/share/www.unicode.org/Public/16.0.0/ucd/CaseFolding.txt \
                      16.0.0
 */
40 #include <assert.h>
41 #if __STDC_VERSION__ < 202311L
42 # include <stdbool.h>
43 #endif
44 #include <stdint.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <time.h>
/* Number of elements of the array A.  A must be a real array, not a
   pointer.  The argument is parenthesized so that any array-valued
   expression expands correctly.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
52 /* ========================================================================= */
54 /* Reading UnicodeData.txt. */
55 /* See UCD.html. */
/* This structure represents one line in the UnicodeData.txt file.
   String members hold "" when the corresponding field is empty; the case
   mapping members hold NONE when the field is empty.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining class */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  bool mirrored;              /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};
/* Missing fields are represented with "" for strings, and NONE for
   characters.  NONE is the all-ones unsigned int.  */
#define NONE (~(unsigned int)0)
80 /* The entire contents of the UnicodeData.txt file. */
81 struct unicode_attribute unicode_attributes [0x110000];
83 /* Stores in unicode_attributes[i] the values from the given fields. */
84 static void
85 fill_attribute (unsigned int i,
86 const char *field1, const char *field2,
87 const char *field3, const char *field4,
88 const char *field5, const char *field6,
89 const char *field7, const char *field8,
90 const char *field9, const char *field10,
91 const char *field11, const char *field12,
92 const char *field13, const char *field14)
94 struct unicode_attribute * uni;
96 if (i >= 0x110000)
98 fprintf (stderr, "index too large\n");
99 exit (1);
101 if (strcmp (field2, "Cs") == 0)
102 /* Surrogates are UTF-16 artifacts, not real characters. Ignore them. */
103 return;
104 uni = &unicode_attributes[i];
105 /* Copy the strings. */
106 uni->name = strdup (field1);
107 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
108 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
109 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
110 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
111 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
112 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
113 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
114 uni->mirrored = (field9[0] == 'Y');
115 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
116 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
117 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
118 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
119 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
/* Maximum length of a field in the UnicodeData.txt file.  */
#define FIELDLEN 160

/* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM.  Carriage returns are dropped.
   Returns 1 when a field was successfully read, otherwise 0 (the end of
   STREAM was reached before DELIM).  BUFFER is NUL-terminated in both
   cases.  Terminates the program when the field does not fit.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  while ((c = getc (stream)) != EOF && c != delim)
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Put c into the buffer.  */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field longer than expected, increase FIELDLEN\n");
          exit (1);
        }
      *buffer++ = c;
    }

  /* Terminate unconditionally, so that the caller never sees a buffer
     without a terminator, not even on the failure path.  */
  *buffer = '\0';

  return (c == EOF ? 0 : 1);
}
157 /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
158 file. */
159 static void
160 fill_attributes (const char *unicodedata_filename)
162 unsigned int i, j;
163 FILE *stream;
164 char field0[FIELDLEN];
165 char field1[FIELDLEN];
166 char field2[FIELDLEN];
167 char field3[FIELDLEN];
168 char field4[FIELDLEN];
169 char field5[FIELDLEN];
170 char field6[FIELDLEN];
171 char field7[FIELDLEN];
172 char field8[FIELDLEN];
173 char field9[FIELDLEN];
174 char field10[FIELDLEN];
175 char field11[FIELDLEN];
176 char field12[FIELDLEN];
177 char field13[FIELDLEN];
178 char field14[FIELDLEN];
179 int lineno = 0;
181 for (i = 0; i < 0x110000; i++)
182 unicode_attributes[i].name = NULL;
184 stream = fopen (unicodedata_filename, "r");
185 if (stream == NULL)
187 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
188 exit (1);
191 for (;;)
193 int n;
195 lineno++;
196 n = getfield (stream, field0, ';');
197 n += getfield (stream, field1, ';');
198 n += getfield (stream, field2, ';');
199 n += getfield (stream, field3, ';');
200 n += getfield (stream, field4, ';');
201 n += getfield (stream, field5, ';');
202 n += getfield (stream, field6, ';');
203 n += getfield (stream, field7, ';');
204 n += getfield (stream, field8, ';');
205 n += getfield (stream, field9, ';');
206 n += getfield (stream, field10, ';');
207 n += getfield (stream, field11, ';');
208 n += getfield (stream, field12, ';');
209 n += getfield (stream, field13, ';');
210 n += getfield (stream, field14, '\n');
211 if (n == 0)
212 break;
213 if (n != 15)
215 fprintf (stderr, "short line in '%s':%d\n",
216 unicodedata_filename, lineno);
217 exit (1);
219 i = strtoul (field0, NULL, 16);
220 if (field1[0] == '<'
221 && strlen (field1) >= 9
222 && strcmp (field1 + strlen (field1) - 8, ", First>") == 0)
224 /* Deal with a range. */
225 lineno++;
226 n = getfield (stream, field0, ';');
227 n += getfield (stream, field1, ';');
228 n += getfield (stream, field2, ';');
229 n += getfield (stream, field3, ';');
230 n += getfield (stream, field4, ';');
231 n += getfield (stream, field5, ';');
232 n += getfield (stream, field6, ';');
233 n += getfield (stream, field7, ';');
234 n += getfield (stream, field8, ';');
235 n += getfield (stream, field9, ';');
236 n += getfield (stream, field10, ';');
237 n += getfield (stream, field11, ';');
238 n += getfield (stream, field12, ';');
239 n += getfield (stream, field13, ';');
240 n += getfield (stream, field14, '\n');
241 if (n != 15)
243 fprintf (stderr, "missing end range in '%s':%d\n",
244 unicodedata_filename, lineno);
245 exit (1);
247 if (!(field1[0] == '<'
248 && strlen (field1) >= 8
249 && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0))
251 fprintf (stderr, "missing end range in '%s':%d\n",
252 unicodedata_filename, lineno);
253 exit (1);
255 field1[strlen (field1) - 7] = '\0';
256 j = strtoul (field0, NULL, 16);
257 for (; i <= j; i++)
258 fill_attribute (i, field1+1, field2, field3, field4, field5,
259 field6, field7, field8, field9, field10,
260 field11, field12, field13, field14);
262 else
264 /* Single character line */
265 fill_attribute (i, field1, field2, field3, field4, field5,
266 field6, field7, field8, field9, field10,
267 field11, field12, field13, field14);
271 if (ferror (stream) || fclose (stream))
273 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
274 exit (1);
278 /* ========================================================================= */
/* Output the license notice for a library file.
   This closes an open C syntax comment.
   If LGPLV2PLUS is true, the LGPLv2+ notice is written to STREAM, otherwise
   the dual 'LGPLv3+ or GPLv2+' notice.
   NOTE(review): the spacing inside the literals is reproduced exactly as
   found; if this copy of the file went through a whitespace-collapsing
   extraction, compare the indentation with the upstream file.  */
static void
output_library_license (FILE *stream, bool lgplv2plus)
{
  /* These Gnulib modules are under the LGPLv2+ license.  */
  static const char *const lgplv2plus_notice[] =
    {
      " This file is free software: you can redistribute it and/or modify\n",
      " it under the terms of the GNU Lesser General Public License as\n",
      " published by the Free Software Foundation; either version 2.1 of the\n",
      " License, or (at your option) any later version.\n",
      "\n",
      " This file is distributed in the hope that it will be useful,\n",
      " but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n",
      " GNU Lesser General Public License for more details.\n",
      "\n",
      " You should have received a copy of the GNU Lesser General Public License\n",
      " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n"
    };
  /* These Gnulib modules are under the 'LGPLv3+ or GPLv2+' license.  */
  static const char *const dual_notice[] =
    {
      " This file is free software.\n",
      " It is dual-licensed under \"the GNU LGPLv3+ or the GNU GPLv2+\".\n",
      " You can redistribute it and/or modify it under either\n",
      " - the terms of the GNU Lesser General Public License as published\n",
      " by the Free Software Foundation, either version 3, or (at your\n",
      " option) any later version, or\n",
      " - the terms of the GNU General Public License as published by the\n",
      " Free Software Foundation; either version 2, or (at your option)\n",
      " any later version, or\n",
      " - the same dual license \"the GNU LGPLv3+ or the GNU GPLv2+\".\n",
      "\n",
      " This file is distributed in the hope that it will be useful,\n",
      " but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n",
      " Lesser General Public License and the GNU General Public License\n",
      " for more details.\n",
      "\n",
      " You should have received a copy of the GNU Lesser General Public\n",
      " License and of the GNU General Public License along with this\n",
      " program. If not, see <https://www.gnu.org/licenses/>. */\n"
    };
  const char *const *notice;
  size_t count;
  size_t k;

  if (lgplv2plus)
    {
      notice = lgplv2plus_notice;
      count = sizeof lgplv2plus_notice / sizeof lgplv2plus_notice[0];
    }
  else
    {
      notice = dual_notice;
      count = sizeof dual_notice / sizeof dual_notice[0];
    }

  /* The text is literal, not a format string, hence fputs.  */
  for (k = 0; k < count; k++)
    fputs (notice[k], stream);
}
/* Output the license notice for a tests file.
   This closes an open C syntax comment.
   NOTE(review): the spacing inside the literals is reproduced exactly as
   found; if this copy of the file went through a whitespace-collapsing
   extraction, compare the indentation with the upstream file.  */
static void
output_tests_license (FILE *stream)
{
  /* Gnulib tests modules are under the GPLv3+ license.  */
  static const char *const gplv3plus_notice[] =
    {
      " This file is free software: you can redistribute it and/or modify\n",
      " it under the terms of the GNU General Public License as published\n",
      " by the Free Software Foundation, either version 3 of the License,\n",
      " or (at your option) any later version.\n",
      "\n",
      " This file is distributed in the hope that it will be useful,\n",
      " but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
      " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n",
      " GNU General Public License for more details.\n",
      "\n",
      " You should have received a copy of the GNU General Public License\n",
      " along with this program. If not, see <https://www.gnu.org/licenses/>. */\n"
    };
  size_t k;

  /* The text is literal, not a format string, hence fputs.  */
  for (k = 0; k < sizeof gplv3plus_notice / sizeof gplv3plus_notice[0]; k++)
    fputs (gplv3plus_notice[k], stream);
}
347 /* ========================================================================= */
349 /* General category. */
350 /* See Unicode 3.0 book, section 4.5,
351 UCD.html. */
353 static bool
354 is_category_L (unsigned int ch)
356 return (unicode_attributes[ch].name != NULL
357 && unicode_attributes[ch].category[0] == 'L');
360 static bool
361 is_category_LC (unsigned int ch)
363 /* See PropertyValueAliases.txt. */
364 return (unicode_attributes[ch].name != NULL
365 && unicode_attributes[ch].category[0] == 'L'
366 && (unicode_attributes[ch].category[1] == 'u'
367 || unicode_attributes[ch].category[1] == 'l'
368 || unicode_attributes[ch].category[1] == 't'));
371 static bool
372 is_category_Lu (unsigned int ch)
374 return (unicode_attributes[ch].name != NULL
375 && unicode_attributes[ch].category[0] == 'L'
376 && unicode_attributes[ch].category[1] == 'u');
379 static bool
380 is_category_Ll (unsigned int ch)
382 return (unicode_attributes[ch].name != NULL
383 && unicode_attributes[ch].category[0] == 'L'
384 && unicode_attributes[ch].category[1] == 'l');
387 static bool
388 is_category_Lt (unsigned int ch)
390 return (unicode_attributes[ch].name != NULL
391 && unicode_attributes[ch].category[0] == 'L'
392 && unicode_attributes[ch].category[1] == 't');
395 static bool
396 is_category_Lm (unsigned int ch)
398 return (unicode_attributes[ch].name != NULL
399 && unicode_attributes[ch].category[0] == 'L'
400 && unicode_attributes[ch].category[1] == 'm');
403 static bool
404 is_category_Lo (unsigned int ch)
406 return (unicode_attributes[ch].name != NULL
407 && unicode_attributes[ch].category[0] == 'L'
408 && unicode_attributes[ch].category[1] == 'o');
411 static bool
412 is_category_M (unsigned int ch)
414 return (unicode_attributes[ch].name != NULL
415 && unicode_attributes[ch].category[0] == 'M');
418 static bool
419 is_category_Mn (unsigned int ch)
421 return (unicode_attributes[ch].name != NULL
422 && unicode_attributes[ch].category[0] == 'M'
423 && unicode_attributes[ch].category[1] == 'n');
426 static bool
427 is_category_Mc (unsigned int ch)
429 return (unicode_attributes[ch].name != NULL
430 && unicode_attributes[ch].category[0] == 'M'
431 && unicode_attributes[ch].category[1] == 'c');
434 static bool
435 is_category_Me (unsigned int ch)
437 return (unicode_attributes[ch].name != NULL
438 && unicode_attributes[ch].category[0] == 'M'
439 && unicode_attributes[ch].category[1] == 'e');
442 static bool
443 is_category_N (unsigned int ch)
445 return (unicode_attributes[ch].name != NULL
446 && unicode_attributes[ch].category[0] == 'N');
449 static bool
450 is_category_Nd (unsigned int ch)
452 return (unicode_attributes[ch].name != NULL
453 && unicode_attributes[ch].category[0] == 'N'
454 && unicode_attributes[ch].category[1] == 'd');
457 static bool
458 is_category_Nl (unsigned int ch)
460 return (unicode_attributes[ch].name != NULL
461 && unicode_attributes[ch].category[0] == 'N'
462 && unicode_attributes[ch].category[1] == 'l');
465 static bool
466 is_category_No (unsigned int ch)
468 return (unicode_attributes[ch].name != NULL
469 && unicode_attributes[ch].category[0] == 'N'
470 && unicode_attributes[ch].category[1] == 'o');
473 static bool
474 is_category_P (unsigned int ch)
476 return (unicode_attributes[ch].name != NULL
477 && unicode_attributes[ch].category[0] == 'P');
480 static bool
481 is_category_Pc (unsigned int ch)
483 return (unicode_attributes[ch].name != NULL
484 && unicode_attributes[ch].category[0] == 'P'
485 && unicode_attributes[ch].category[1] == 'c');
488 static bool
489 is_category_Pd (unsigned int ch)
491 return (unicode_attributes[ch].name != NULL
492 && unicode_attributes[ch].category[0] == 'P'
493 && unicode_attributes[ch].category[1] == 'd');
496 static bool
497 is_category_Ps (unsigned int ch)
499 return (unicode_attributes[ch].name != NULL
500 && unicode_attributes[ch].category[0] == 'P'
501 && unicode_attributes[ch].category[1] == 's');
504 static bool
505 is_category_Pe (unsigned int ch)
507 return (unicode_attributes[ch].name != NULL
508 && unicode_attributes[ch].category[0] == 'P'
509 && unicode_attributes[ch].category[1] == 'e');
512 static bool
513 is_category_Pi (unsigned int ch)
515 return (unicode_attributes[ch].name != NULL
516 && unicode_attributes[ch].category[0] == 'P'
517 && unicode_attributes[ch].category[1] == 'i');
520 static bool
521 is_category_Pf (unsigned int ch)
523 return (unicode_attributes[ch].name != NULL
524 && unicode_attributes[ch].category[0] == 'P'
525 && unicode_attributes[ch].category[1] == 'f');
528 static bool
529 is_category_Po (unsigned int ch)
531 return (unicode_attributes[ch].name != NULL
532 && unicode_attributes[ch].category[0] == 'P'
533 && unicode_attributes[ch].category[1] == 'o');
536 static bool
537 is_category_S (unsigned int ch)
539 return (unicode_attributes[ch].name != NULL
540 && unicode_attributes[ch].category[0] == 'S');
543 static bool
544 is_category_Sm (unsigned int ch)
546 return (unicode_attributes[ch].name != NULL
547 && unicode_attributes[ch].category[0] == 'S'
548 && unicode_attributes[ch].category[1] == 'm');
551 static bool
552 is_category_Sc (unsigned int ch)
554 return (unicode_attributes[ch].name != NULL
555 && unicode_attributes[ch].category[0] == 'S'
556 && unicode_attributes[ch].category[1] == 'c');
559 static bool
560 is_category_Sk (unsigned int ch)
562 return (unicode_attributes[ch].name != NULL
563 && unicode_attributes[ch].category[0] == 'S'
564 && unicode_attributes[ch].category[1] == 'k');
567 static bool
568 is_category_So (unsigned int ch)
570 return (unicode_attributes[ch].name != NULL
571 && unicode_attributes[ch].category[0] == 'S'
572 && unicode_attributes[ch].category[1] == 'o');
575 static bool
576 is_category_Z (unsigned int ch)
578 return (unicode_attributes[ch].name != NULL
579 && unicode_attributes[ch].category[0] == 'Z');
582 static bool
583 is_category_Zs (unsigned int ch)
585 return (unicode_attributes[ch].name != NULL
586 && unicode_attributes[ch].category[0] == 'Z'
587 && unicode_attributes[ch].category[1] == 's');
590 static bool
591 is_category_Zl (unsigned int ch)
593 return (unicode_attributes[ch].name != NULL
594 && unicode_attributes[ch].category[0] == 'Z'
595 && unicode_attributes[ch].category[1] == 'l');
598 static bool
599 is_category_Zp (unsigned int ch)
601 return (unicode_attributes[ch].name != NULL
602 && unicode_attributes[ch].category[0] == 'Z'
603 && unicode_attributes[ch].category[1] == 'p');
606 static bool
607 is_category_C (unsigned int ch)
609 return (unicode_attributes[ch].name == NULL
610 || unicode_attributes[ch].category[0] == 'C');
613 static bool
614 is_category_Cc (unsigned int ch)
616 return (unicode_attributes[ch].name != NULL
617 && unicode_attributes[ch].category[0] == 'C'
618 && unicode_attributes[ch].category[1] == 'c');
621 static bool
622 is_category_Cf (unsigned int ch)
624 return (unicode_attributes[ch].name != NULL
625 && unicode_attributes[ch].category[0] == 'C'
626 && unicode_attributes[ch].category[1] == 'f');
/* Returns true if CH lies in the range 0xD800..0xDFFF.
   This is decided from the code point alone, without consulting
   unicode_attributes[]: fill_attribute stores nothing for lines whose
   category is "Cs".  */
static bool
is_category_Cs (unsigned int ch)
{
  return (ch >= 0xd800 && ch < 0xe000);
}
635 static bool
636 is_category_Co (unsigned int ch)
638 return (unicode_attributes[ch].name != NULL
639 && unicode_attributes[ch].category[0] == 'C'
640 && unicode_attributes[ch].category[1] == 'o');
643 static bool
644 is_category_Cn (unsigned int ch)
646 return (unicode_attributes[ch].name == NULL
647 && !(ch >= 0xd800 && ch < 0xe000));
/* Output a boolean property in a human readable format.
   Writes to FILENAME one line per maximal run of consecutive code points
   (below 0x110000) for which PREDICATE returns true: "0xFIRST..0xLAST" for
   a run of at least two code points, "0xCH" for an isolated one.  Listing
   every code point on its own line would yield huge text output, which is
   why runs are coalesced.
   Terminates the program if FILENAME cannot be opened or written.  */
static void
debug_output_predicate (const char *filename, bool (*predicate) (unsigned int))
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate holds.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        if (first < last)
          fprintf (stream, "0x%04X..0x%04X\n", first, last);
        else
          fprintf (stream, "0x%04X\n", ch);
      }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Output the unit test for a boolean property.
   Writes to FILENAME a C source file that includes "test-predicate-part1.h",
   lists one "{ first, last }" pair per maximal run of code points (below
   0x110000) for which PREDICATE returns true, defines the macro
   PREDICATE(c) as EXPRESSION and includes "test-predicate-part2.h".
   Terminates the program if FILENAME cannot be opened or written.
   NOTE(review): the spacing inside the literals is reproduced exactly as
   found; if this copy of the file went through a whitespace-collapsing
   extraction, compare the indentation with the upstream file.  */
static void
output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Test the Unicode character type functions.\n");
  fprintf (stream, " Copyright (C) 2007-2024 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  /* The license notice closes the comment opened above.  */
  output_tests_license (stream);
  fprintf (stream, "\n");
  fprintf (stream, "#include \"test-predicate-part1.h\"\n");
  fprintf (stream, "\n");

  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    if (predicate (ch))
      {
        unsigned int first = ch;
        unsigned int last;

        /* Extend the run as far as the predicate holds.  */
        while (ch + 1 < 0x110000 && predicate (ch + 1))
          ch++;
        last = ch;
        /* The separator goes in front of every pair but the first, so that
           the last pair is not followed by a comma.  */
        if (need_comma)
          fprintf (stream, ",\n");
        fprintf (stream, " { 0x%04X, 0x%04X }", first, last);
        need_comma = true;
      }
  if (need_comma)
    fprintf (stream, "\n");

  fprintf (stream, "\n");
  fprintf (stream, "#define PREDICATE(c) %s\n", expression);
  fprintf (stream, "#include \"test-predicate-part2.h\"\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
747 /* Construction of sparse 3-level tables. */
748 #define TABLE predicate_table
749 #define xmalloc malloc
750 #define xrealloc realloc
751 #include "3levelbit.h"
753 /* Output a boolean property in a three-level bitmap. */
754 static void
755 output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version)
757 FILE *stream;
758 unsigned int ch, i;
759 struct predicate_table t;
760 unsigned int level1_offset, level2_offset, level3_offset;
762 stream = fopen (filename, "w");
763 if (stream == NULL)
765 fprintf (stderr, "cannot open '%s' for writing\n", filename);
766 exit (1);
769 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
770 fprintf (stream, "/* %s of Unicode characters. */\n", comment);
771 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
772 version);
773 fprintf (stream, "\n");
775 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
776 fprintf (stream, "\n");
777 output_library_license (stream,
778 strcmp (filename, "unictype/categ_M.h") == 0
779 || strncmp (filename, "unictype/ctype_", 15) == 0
780 || strcmp (filename, "uniwidth/width2.h") == 0);
781 fprintf (stream, "\n");
783 t.p = 4; /* or: 5 */
784 t.q = 7; /* or: 6 */
785 predicate_table_init (&t);
787 for (ch = 0; ch < 0x110000; ch++)
788 if (predicate (ch))
789 predicate_table_add (&t, ch);
791 predicate_table_finalize (&t);
793 /* Offsets in t.result, in memory of this process. */
794 level1_offset =
795 5 * sizeof (uint32_t);
796 level2_offset =
797 5 * sizeof (uint32_t)
798 + t.level1_size * sizeof (uint32_t);
799 level3_offset =
800 5 * sizeof (uint32_t)
801 + t.level1_size * sizeof (uint32_t)
802 + (t.level2_size << t.q) * sizeof (uint32_t);
804 for (i = 0; i < 5; i++)
805 if (i != 1)
806 fprintf (stream, "#define header_%d %d\n", i,
807 ((uint32_t *) t.result)[i]);
809 fprintf (stream, "static const\n");
810 fprintf (stream, "struct\n");
811 fprintf (stream, " {\n");
812 fprintf (stream, " int header[1];\n");
813 fprintf (stream, " int level1[%zu];\n", t.level1_size);
814 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
815 fprintf (stream, " unsigned int level3[%zu << %d];\n", t.level3_size, t.p);
816 fprintf (stream, " }\n");
817 fprintf (stream, "%s =\n", name);
818 fprintf (stream, "{\n");
819 fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]);
820 fprintf (stream, " {");
821 if (t.level1_size > 1)
822 fprintf (stream, "\n ");
823 for (i = 0; i < t.level1_size; i++)
825 uint32_t offset;
826 if (i > 0 && (i % 1) == 0)
827 fprintf (stream, "\n ");
828 offset = ((uint32_t *) (t.result + level1_offset))[i];
829 if (offset == 0)
830 fprintf (stream, " %5d", -1);
831 else
832 fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu",
833 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t));
834 if (i+1 < t.level1_size)
835 fprintf (stream, ",");
837 if (t.level1_size > 1)
838 fprintf (stream, "\n ");
839 fprintf (stream, " },\n");
840 fprintf (stream, " {");
841 if (t.level2_size << t.q > 1)
842 fprintf (stream, "\n ");
843 for (i = 0; i < t.level2_size << t.q; i++)
845 uint32_t offset;
846 if (i > 0 && (i % 1) == 0)
847 fprintf (stream, "\n ");
848 offset = ((uint32_t *) (t.result + level2_offset))[i];
849 if (offset == 0)
850 fprintf (stream, " %5d", -1);
851 else
852 fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu",
853 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t));
854 if (i+1 < t.level2_size << t.q)
855 fprintf (stream, ",");
857 if (t.level2_size << t.q > 1)
858 fprintf (stream, "\n ");
859 fprintf (stream, " },\n");
860 fprintf (stream, " {");
861 if (t.level3_size << t.p > 4)
862 fprintf (stream, "\n ");
863 for (i = 0; i < t.level3_size << t.p; i++)
865 if (i > 0 && (i % 4) == 0)
866 fprintf (stream, "\n ");
867 fprintf (stream, " 0x%08XU",
868 ((uint32_t *) (t.result + level3_offset))[i]);
869 if (i+1 < t.level3_size << t.p)
870 fprintf (stream, ",");
872 if (t.level3_size << t.p > 4)
873 fprintf (stream, "\n ");
874 fprintf (stream, " }\n");
875 fprintf (stream, "};\n");
877 if (ferror (stream) || fclose (stream))
879 fprintf (stderr, "error writing to '%s'\n", filename);
880 exit (1);
884 /* Output all categories. */
885 static void
886 output_categories (const char *version)
888 #define CATEGORY(C) \
889 debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \
890 output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \
891 output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version);
892 CATEGORY (L)
893 CATEGORY (LC)
894 CATEGORY (Lu)
895 CATEGORY (Ll)
896 CATEGORY (Lt)
897 CATEGORY (Lm)
898 CATEGORY (Lo)
899 CATEGORY (M)
900 CATEGORY (Mn)
901 CATEGORY (Mc)
902 CATEGORY (Me)
903 CATEGORY (N)
904 CATEGORY (Nd)
905 CATEGORY (Nl)
906 CATEGORY (No)
907 CATEGORY (P)
908 CATEGORY (Pc)
909 CATEGORY (Pd)
910 CATEGORY (Ps)
911 CATEGORY (Pe)
912 CATEGORY (Pi)
913 CATEGORY (Pf)
914 CATEGORY (Po)
915 CATEGORY (S)
916 CATEGORY (Sm)
917 CATEGORY (Sc)
918 CATEGORY (Sk)
919 CATEGORY (So)
920 CATEGORY (Z)
921 CATEGORY (Zs)
922 CATEGORY (Zl)
923 CATEGORY (Zp)
924 CATEGORY (C)
925 CATEGORY (Cc)
926 CATEGORY (Cf)
927 CATEGORY (Cs)
928 CATEGORY (Co)
929 CATEGORY (Cn)
930 #undef CATEGORY
/* Bit masks of the general categories.  Each two-letter category owns one
   bit; the mask of a one-letter category (and of LC) is the union of the
   bits of the two-letter categories it comprises.  */
enum
{
  UC_CATEGORY_MASK_L  = 0x0000001f,
  UC_CATEGORY_MASK_LC = 0x00000007,
  UC_CATEGORY_MASK_Lu = 0x00000001,
  UC_CATEGORY_MASK_Ll = 0x00000002,
  UC_CATEGORY_MASK_Lt = 0x00000004,
  UC_CATEGORY_MASK_Lm = 0x00000008,
  UC_CATEGORY_MASK_Lo = 0x00000010,
  UC_CATEGORY_MASK_M  = 0x000000e0,
  UC_CATEGORY_MASK_Mn = 0x00000020,
  UC_CATEGORY_MASK_Mc = 0x00000040,
  UC_CATEGORY_MASK_Me = 0x00000080,
  UC_CATEGORY_MASK_N  = 0x00000700,
  UC_CATEGORY_MASK_Nd = 0x00000100,
  UC_CATEGORY_MASK_Nl = 0x00000200,
  UC_CATEGORY_MASK_No = 0x00000400,
  UC_CATEGORY_MASK_P  = 0x0003f800,
  UC_CATEGORY_MASK_Pc = 0x00000800,
  UC_CATEGORY_MASK_Pd = 0x00001000,
  UC_CATEGORY_MASK_Ps = 0x00002000,
  UC_CATEGORY_MASK_Pe = 0x00004000,
  UC_CATEGORY_MASK_Pi = 0x00008000,
  UC_CATEGORY_MASK_Pf = 0x00010000,
  UC_CATEGORY_MASK_Po = 0x00020000,
  UC_CATEGORY_MASK_S  = 0x003c0000,
  UC_CATEGORY_MASK_Sm = 0x00040000,
  UC_CATEGORY_MASK_Sc = 0x00080000,
  UC_CATEGORY_MASK_Sk = 0x00100000,
  UC_CATEGORY_MASK_So = 0x00200000,
  UC_CATEGORY_MASK_Z  = 0x01c00000,
  UC_CATEGORY_MASK_Zs = 0x00400000,
  UC_CATEGORY_MASK_Zl = 0x00800000,
  UC_CATEGORY_MASK_Zp = 0x01000000,
  UC_CATEGORY_MASK_C  = 0x3e000000,
  UC_CATEGORY_MASK_Cc = 0x02000000,
  UC_CATEGORY_MASK_Cf = 0x04000000,
  UC_CATEGORY_MASK_Cs = 0x08000000,
  UC_CATEGORY_MASK_Co = 0x10000000,
  UC_CATEGORY_MASK_Cn = 0x20000000
};
975 static int
976 general_category_byname (const char *category_name)
978 if (category_name[0] != '\0'
979 && (category_name[1] == '\0' || category_name[2] == '\0'))
980 switch (category_name[0])
982 case 'L':
983 switch (category_name[1])
985 case '\0': return UC_CATEGORY_MASK_L;
986 case 'C': return UC_CATEGORY_MASK_LC;
987 case 'u': return UC_CATEGORY_MASK_Lu;
988 case 'l': return UC_CATEGORY_MASK_Ll;
989 case 't': return UC_CATEGORY_MASK_Lt;
990 case 'm': return UC_CATEGORY_MASK_Lm;
991 case 'o': return UC_CATEGORY_MASK_Lo;
993 break;
994 case 'M':
995 switch (category_name[1])
997 case '\0': return UC_CATEGORY_MASK_M;
998 case 'n': return UC_CATEGORY_MASK_Mn;
999 case 'c': return UC_CATEGORY_MASK_Mc;
1000 case 'e': return UC_CATEGORY_MASK_Me;
1002 break;
1003 case 'N':
1004 switch (category_name[1])
1006 case '\0': return UC_CATEGORY_MASK_N;
1007 case 'd': return UC_CATEGORY_MASK_Nd;
1008 case 'l': return UC_CATEGORY_MASK_Nl;
1009 case 'o': return UC_CATEGORY_MASK_No;
1011 break;
1012 case 'P':
1013 switch (category_name[1])
1015 case '\0': return UC_CATEGORY_MASK_P;
1016 case 'c': return UC_CATEGORY_MASK_Pc;
1017 case 'd': return UC_CATEGORY_MASK_Pd;
1018 case 's': return UC_CATEGORY_MASK_Ps;
1019 case 'e': return UC_CATEGORY_MASK_Pe;
1020 case 'i': return UC_CATEGORY_MASK_Pi;
1021 case 'f': return UC_CATEGORY_MASK_Pf;
1022 case 'o': return UC_CATEGORY_MASK_Po;
1024 break;
1025 case 'S':
1026 switch (category_name[1])
1028 case '\0': return UC_CATEGORY_MASK_S;
1029 case 'm': return UC_CATEGORY_MASK_Sm;
1030 case 'c': return UC_CATEGORY_MASK_Sc;
1031 case 'k': return UC_CATEGORY_MASK_Sk;
1032 case 'o': return UC_CATEGORY_MASK_So;
1034 break;
1035 case 'Z':
1036 switch (category_name[1])
1038 case '\0': return UC_CATEGORY_MASK_Z;
1039 case 's': return UC_CATEGORY_MASK_Zs;
1040 case 'l': return UC_CATEGORY_MASK_Zl;
1041 case 'p': return UC_CATEGORY_MASK_Zp;
1043 break;
1044 case 'C':
1045 switch (category_name[1])
1047 case '\0': return UC_CATEGORY_MASK_C;
1048 case 'c': return UC_CATEGORY_MASK_Cc;
1049 case 'f': return UC_CATEGORY_MASK_Cf;
1050 case 's': return UC_CATEGORY_MASK_Cs;
1051 case 'o': return UC_CATEGORY_MASK_Co;
1052 case 'n': return UC_CATEGORY_MASK_Cn;
1054 break;
1056 /* Invalid category name. */
1057 abort ();
1060 /* Construction of sparse 3-level tables. */
1061 #define TABLE category_table
1062 #define ELEMENT uint8_t
1063 #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */
1064 #define xmalloc malloc
1065 #define xrealloc realloc
1066 #include "3level.h"
1068 /* Output the per-character category table. */
1069 static void
1070 output_category (const char *filename, const char *version)
1072 FILE *stream;
1073 unsigned int ch, i;
1074 struct category_table t;
1075 unsigned int level1_offset, level2_offset, level3_offset;
1076 uint16_t *level3_packed;
1078 stream = fopen (filename, "w");
1079 if (stream == NULL)
1081 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1082 exit (1);
1085 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1086 fprintf (stream, "/* Categories of Unicode characters. */\n");
1087 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1088 version);
1089 fprintf (stream, "\n");
1091 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
1092 fprintf (stream, "\n");
1093 output_library_license (stream, true);
1094 fprintf (stream, "\n");
1096 t.p = 7;
1097 t.q = 9;
1098 category_table_init (&t);
1100 for (ch = 0; ch < 0x110000; ch++)
1102 int value;
1103 unsigned int log2_value;
1105 if (is_category_Cs (ch))
1106 value = UC_CATEGORY_MASK_Cs;
1107 else if (unicode_attributes[ch].name != NULL)
1108 value = general_category_byname (unicode_attributes[ch].category);
1109 else
1110 continue;
1112 /* Now value should contain exactly one bit. */
1113 assert (value != 0 && (value & (value - 1)) == 0);
1115 for (log2_value = 0; value > 1; value >>= 1, log2_value++);
1117 assert (log2_value <= 0x1f);
1119 category_table_add (&t, ch, log2_value);
1122 category_table_finalize (&t);
1124 /* Offsets in t.result, in memory of this process. */
1125 level1_offset =
1126 5 * sizeof (uint32_t);
1127 level2_offset =
1128 5 * sizeof (uint32_t)
1129 + t.level1_size * sizeof (uint32_t);
1130 level3_offset =
1131 5 * sizeof (uint32_t)
1132 + t.level1_size * sizeof (uint32_t)
1133 + (t.level2_size << t.q) * sizeof (uint32_t);
1135 for (i = 0; i < 5; i++)
1136 fprintf (stream, "#define category_header_%d %d\n", i,
1137 ((uint32_t *) t.result)[i]);
1138 fprintf (stream, "static const\n");
1139 fprintf (stream, "struct\n");
1140 fprintf (stream, " {\n");
1141 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1142 fprintf (stream, " unsigned short level2[%zu << %d];\n", t.level2_size, t.q);
1143 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1144 (1 << t.p) * 5 / 16);
1145 fprintf (stream, " }\n");
1146 fprintf (stream, "u_category =\n");
1147 fprintf (stream, "{\n");
1148 fprintf (stream, " {");
1149 if (t.level1_size > 8)
1150 fprintf (stream, "\n ");
1151 for (i = 0; i < t.level1_size; i++)
1153 uint32_t offset;
1154 if (i > 0 && (i % 8) == 0)
1155 fprintf (stream, "\n ");
1156 offset = ((uint32_t *) (t.result + level1_offset))[i];
1157 if (offset == 0)
1158 fprintf (stream, " %5d", -1);
1159 else
1160 fprintf (stream, " %5zu",
1161 (offset - level2_offset) / sizeof (uint32_t));
1162 if (i+1 < t.level1_size)
1163 fprintf (stream, ",");
1165 if (t.level1_size > 8)
1166 fprintf (stream, "\n ");
1167 fprintf (stream, " },\n");
1168 fprintf (stream, " {");
1169 if (t.level2_size << t.q > 8)
1170 fprintf (stream, "\n ");
1171 for (i = 0; i < t.level2_size << t.q; i++)
1173 uint32_t offset;
1174 if (i > 0 && (i % 8) == 0)
1175 fprintf (stream, "\n ");
1176 offset = ((uint32_t *) (t.result + level2_offset))[i];
1177 /* To make the level2 values fit in 16 bits, we use 'unsigned short'
1178 instead of 'short' and add 1 to each value. */
1179 if (offset == 0)
1180 fprintf (stream, " %5d", -1 + 1);
1181 else
1182 fprintf (stream, " %5zu",
1183 (offset - level3_offset) / sizeof (uint8_t) + 1);
1184 if (i+1 < t.level2_size << t.q)
1185 fprintf (stream, ",");
1187 if (t.level2_size << t.q > 8)
1188 fprintf (stream, "\n ");
1189 fprintf (stream, " },\n");
1190 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1191 not 32-bit units, in order to make the lookup function easier. */
1192 level3_packed =
1193 (uint16_t *)
1194 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1195 for (i = 0; i < t.level3_size << t.p; i++)
1197 unsigned int j = (i * 5) / 16;
1198 unsigned int k = (i * 5) % 16;
1199 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1200 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1201 level3_packed[j] = value & 0xffff;
1202 level3_packed[j+1] = value >> 16;
1204 fprintf (stream, " {");
1205 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1206 fprintf (stream, "\n ");
1207 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1209 if (i > 0 && (i % 8) == 0)
1210 fprintf (stream, "\n ");
1211 fprintf (stream, " 0x%04x", level3_packed[i]);
1212 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1213 fprintf (stream, ",");
1215 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1216 fprintf (stream, "\n ");
1217 fprintf (stream, " }\n");
1218 free (level3_packed);
1219 fprintf (stream, "};\n");
1221 if (ferror (stream) || fclose (stream))
1223 fprintf (stderr, "error writing to '%s'\n", filename);
1224 exit (1);
1228 /* ========================================================================= */
1230 /* Canonical combining class. */
1231 /* See Unicode 3.0 book, section 4.2,
1232 UCD.html. */
1234 /* Construction of sparse 3-level tables. */
1235 #define TABLE combclass_table
1236 #define ELEMENT uint8_t
1237 #define DEFAULT 0
1238 #define xmalloc malloc
1239 #define xrealloc realloc
1240 #include "3level.h"
1242 /* Output the per-character combining class table. */
1243 static void
1244 output_combclass (const char *filename, const char *version)
1246 FILE *stream;
1247 unsigned int ch, i;
1248 struct combclass_table t;
1249 unsigned int level1_offset, level2_offset, level3_offset;
1251 stream = fopen (filename, "w");
1252 if (stream == NULL)
1254 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1255 exit (1);
1258 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1259 fprintf (stream, "/* Combining class of Unicode characters. */\n");
1260 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1261 version);
1262 fprintf (stream, "\n");
1264 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
1265 fprintf (stream, "\n");
1266 output_library_license (stream, true);
1267 fprintf (stream, "\n");
1269 t.p = 7;
1270 t.q = 9;
1271 combclass_table_init (&t);
1273 for (ch = 0; ch < 0x110000; ch++)
1274 if (unicode_attributes[ch].name != NULL)
1276 int value = atoi (unicode_attributes[ch].combining);
1277 assert (value >= 0 && value <= 255);
1278 combclass_table_add (&t, ch, value);
1281 combclass_table_finalize (&t);
1283 /* Offsets in t.result, in memory of this process. */
1284 level1_offset =
1285 5 * sizeof (uint32_t);
1286 level2_offset =
1287 5 * sizeof (uint32_t)
1288 + t.level1_size * sizeof (uint32_t);
1289 level3_offset =
1290 5 * sizeof (uint32_t)
1291 + t.level1_size * sizeof (uint32_t)
1292 + (t.level2_size << t.q) * sizeof (uint32_t);
1294 for (i = 0; i < 5; i++)
1295 fprintf (stream, "#define combclass_header_%d %d\n", i,
1296 ((uint32_t *) t.result)[i]);
1297 fprintf (stream, "static const\n");
1298 fprintf (stream, "struct\n");
1299 fprintf (stream, " {\n");
1300 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1301 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1302 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
1303 fprintf (stream, " }\n");
1304 fprintf (stream, "u_combclass =\n");
1305 fprintf (stream, "{\n");
1306 fprintf (stream, " {");
1307 if (t.level1_size > 8)
1308 fprintf (stream, "\n ");
1309 for (i = 0; i < t.level1_size; i++)
1311 uint32_t offset;
1312 if (i > 0 && (i % 8) == 0)
1313 fprintf (stream, "\n ");
1314 offset = ((uint32_t *) (t.result + level1_offset))[i];
1315 if (offset == 0)
1316 fprintf (stream, " %5d", -1);
1317 else
1318 fprintf (stream, " %5zu",
1319 (offset - level2_offset) / sizeof (uint32_t));
1320 if (i+1 < t.level1_size)
1321 fprintf (stream, ",");
1323 if (t.level1_size > 8)
1324 fprintf (stream, "\n ");
1325 fprintf (stream, " },\n");
1326 fprintf (stream, " {");
1327 if (t.level2_size << t.q > 8)
1328 fprintf (stream, "\n ");
1329 for (i = 0; i < t.level2_size << t.q; i++)
1331 uint32_t offset;
1332 if (i > 0 && (i % 8) == 0)
1333 fprintf (stream, "\n ");
1334 offset = ((uint32_t *) (t.result + level2_offset))[i];
1335 if (offset == 0)
1336 fprintf (stream, " %5d", -1);
1337 else
1338 fprintf (stream, " %5zu",
1339 (offset - level3_offset) / sizeof (uint8_t));
1340 if (i+1 < t.level2_size << t.q)
1341 fprintf (stream, ",");
1343 if (t.level2_size << t.q > 8)
1344 fprintf (stream, "\n ");
1345 fprintf (stream, " },\n");
1346 fprintf (stream, " {");
1347 if (t.level3_size << t.p > 8)
1348 fprintf (stream, "\n ");
1349 for (i = 0; i < t.level3_size << t.p; i++)
1351 if (i > 0 && (i % 8) == 0)
1352 fprintf (stream, "\n ");
1353 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
1354 if (i+1 < t.level3_size << t.p)
1355 fprintf (stream, ",");
1357 if (t.level3_size << t.p > 8)
1358 fprintf (stream, "\n ");
1359 fprintf (stream, " }\n");
1360 fprintf (stream, "};\n");
1362 if (ferror (stream) || fclose (stream))
1364 fprintf (stderr, "error writing to '%s'\n", filename);
1365 exit (1);
1369 /* ========================================================================= */
1371 /* Bidirectional category. */
1372 /* See Unicode 3.0 book, section 4.3,
1373 UCD.html. */
/* Bidi category codes.  The numbering is contiguous, starting at 0.  */
enum
{
  UC_BIDI_L   =  0, /* Left-to-Right */
  UC_BIDI_LRE =  1, /* Left-to-Right Embedding */
  UC_BIDI_LRO =  2, /* Left-to-Right Override */
  UC_BIDI_R   =  3, /* Right-to-Left */
  UC_BIDI_AL  =  4, /* Right-to-Left Arabic */
  UC_BIDI_RLE =  5, /* Right-to-Left Embedding */
  UC_BIDI_RLO =  6, /* Right-to-Left Override */
  UC_BIDI_PDF =  7, /* Pop Directional Format */
  UC_BIDI_EN  =  8, /* European Number */
  UC_BIDI_ES  =  9, /* European Number Separator */
  UC_BIDI_ET  = 10, /* European Number Terminator */
  UC_BIDI_AN  = 11, /* Arabic Number */
  UC_BIDI_CS  = 12, /* Common Number Separator */
  UC_BIDI_NSM = 13, /* Non-Spacing Mark */
  UC_BIDI_BN  = 14, /* Boundary Neutral */
  UC_BIDI_B   = 15, /* Paragraph Separator */
  UC_BIDI_S   = 16, /* Segment Separator */
  UC_BIDI_WS  = 17, /* Whitespace */
  UC_BIDI_ON  = 18, /* Other Neutral */
  UC_BIDI_LRI = 19, /* Left-to-Right Isolate */
  UC_BIDI_RLI = 20, /* Right-to-Left Isolate */
  UC_BIDI_FSI = 21, /* First Strong Isolate */
  UC_BIDI_PDI = 22  /* Pop Directional Isolate */
};
1402 static int
1403 bidi_category_byname (const char *category_name)
1405 switch (category_name[0])
1407 case 'A':
1408 switch (category_name[1])
1410 case 'L':
1411 if (category_name[2] == '\0')
1412 return UC_BIDI_AL;
1413 break;
1414 case 'N':
1415 if (category_name[2] == '\0')
1416 return UC_BIDI_AN;
1417 break;
1419 break;
1420 case 'B':
1421 switch (category_name[1])
1423 case '\0':
1424 return UC_BIDI_B;
1425 case 'N':
1426 if (category_name[2] == '\0')
1427 return UC_BIDI_BN;
1428 break;
1430 break;
1431 case 'C':
1432 switch (category_name[1])
1434 case 'S':
1435 if (category_name[2] == '\0')
1436 return UC_BIDI_CS;
1437 break;
1439 break;
1440 case 'E':
1441 switch (category_name[1])
1443 case 'N':
1444 if (category_name[2] == '\0')
1445 return UC_BIDI_EN;
1446 break;
1447 case 'S':
1448 if (category_name[2] == '\0')
1449 return UC_BIDI_ES;
1450 break;
1451 case 'T':
1452 if (category_name[2] == '\0')
1453 return UC_BIDI_ET;
1454 break;
1456 break;
1457 case 'F':
1458 switch (category_name[1])
1460 case 'S':
1461 switch (category_name[2])
1463 case 'I':
1464 if (category_name[3] == '\0')
1465 return UC_BIDI_FSI;
1466 break;
1469 break;
1470 case 'L':
1471 switch (category_name[1])
1473 case '\0':
1474 return UC_BIDI_L;
1475 case 'R':
1476 switch (category_name[2])
1478 case 'E':
1479 if (category_name[3] == '\0')
1480 return UC_BIDI_LRE;
1481 break;
1482 case 'O':
1483 if (category_name[3] == '\0')
1484 return UC_BIDI_LRO;
1485 break;
1486 case 'I':
1487 if (category_name[3] == '\0')
1488 return UC_BIDI_LRI;
1489 break;
1491 break;
1493 break;
1494 case 'N':
1495 switch (category_name[1])
1497 case 'S':
1498 switch (category_name[2])
1500 case 'M':
1501 if (category_name[3] == '\0')
1502 return UC_BIDI_NSM;
1503 break;
1505 break;
1507 break;
1508 case 'O':
1509 switch (category_name[1])
1511 case 'N':
1512 if (category_name[2] == '\0')
1513 return UC_BIDI_ON;
1514 break;
1516 break;
1517 case 'P':
1518 switch (category_name[1])
1520 case 'D':
1521 switch (category_name[2])
1523 case 'F':
1524 if (category_name[3] == '\0')
1525 return UC_BIDI_PDF;
1526 break;
1527 case 'I':
1528 if (category_name[3] == '\0')
1529 return UC_BIDI_PDI;
1530 break;
1532 break;
1534 break;
1535 case 'R':
1536 switch (category_name[1])
1538 case '\0':
1539 return UC_BIDI_R;
1540 case 'L':
1541 switch (category_name[2])
1543 case 'E':
1544 if (category_name[3] == '\0')
1545 return UC_BIDI_RLE;
1546 break;
1547 case 'O':
1548 if (category_name[3] == '\0')
1549 return UC_BIDI_RLO;
1550 break;
1551 case 'I':
1552 if (category_name[3] == '\0')
1553 return UC_BIDI_RLI;
1554 break;
1556 break;
1558 break;
1559 case 'S':
1560 if (category_name[1] == '\0')
1561 return UC_BIDI_S;
1562 break;
1563 case 'W':
1564 switch (category_name[1])
1566 case 'S':
1567 if (category_name[2] == '\0')
1568 return UC_BIDI_WS;
1569 break;
1571 break;
1573 /* Invalid bidi category name. */
1574 abort ();
1577 static int
1578 get_bidi_category (unsigned int ch)
1580 if (unicode_attributes[ch].name != NULL)
1581 return bidi_category_byname (unicode_attributes[ch].bidi);
1582 else
1584 /* The bidi category of unassigned characters depends on the range.
1585 See UTR #9 and DerivedBidiClass.txt. */
1586 if ((ch >= 0x0590 && ch <= 0x05FF)
1587 || (ch >= 0x07FB && ch <= 0x08FF)
1588 || (ch >= 0xFB37 && ch <= 0xFB45)
1589 || (ch >= 0x10800 && ch <= 0x10FFF))
1590 return UC_BIDI_R;
1591 else if ((ch >= 0x0600 && ch <= 0x07BF)
1592 || (ch >= 0x2064 && ch <= 0x2069)
1593 || (ch >= 0xFBB2 && ch <= 0xFDCF)
1594 || (ch >= 0xFDFE && ch <= 0xFEFE))
1595 return UC_BIDI_AL;
1596 else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
1597 || (ch >= 0xFFF0 && ch <= 0xFFFF)
1598 || (ch & 0xFFFF) == 0xFFFE
1599 || (ch & 0xFFFF) == 0xFFFF
1600 || (ch >= 0xE0000 && ch <= 0xE0FFF))
1601 return UC_BIDI_BN;
1602 else
1603 return UC_BIDI_L;
1607 /* Construction of sparse 3-level tables. */
1608 #define TABLE bidi_category_table
1609 #define ELEMENT uint8_t
1610 #define DEFAULT UC_BIDI_L
1611 #define xmalloc malloc
1612 #define xrealloc realloc
1613 #include "3level.h"
1615 /* Output the per-character bidi category table. */
1616 static void
1617 output_bidi_category (const char *filename, const char *version)
1619 FILE *stream;
1620 unsigned int ch, i;
1621 struct bidi_category_table t;
1622 unsigned int level1_offset, level2_offset, level3_offset;
1623 uint16_t *level3_packed;
1625 stream = fopen (filename, "w");
1626 if (stream == NULL)
1628 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1629 exit (1);
1632 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1633 fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
1634 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1635 version);
1636 fprintf (stream, "\n");
1638 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
1639 fprintf (stream, "\n");
1640 output_library_license (stream, true);
1641 fprintf (stream, "\n");
1643 t.p = 7;
1644 t.q = 9;
1645 bidi_category_table_init (&t);
1647 for (ch = 0; ch < 0x110000; ch++)
1649 int value = get_bidi_category (ch);
1651 assert (value <= 0x1f);
1653 bidi_category_table_add (&t, ch, value);
1656 bidi_category_table_finalize (&t);
1658 /* Offsets in t.result, in memory of this process. */
1659 level1_offset =
1660 5 * sizeof (uint32_t);
1661 level2_offset =
1662 5 * sizeof (uint32_t)
1663 + t.level1_size * sizeof (uint32_t);
1664 level3_offset =
1665 5 * sizeof (uint32_t)
1666 + t.level1_size * sizeof (uint32_t)
1667 + (t.level2_size << t.q) * sizeof (uint32_t);
1669 for (i = 0; i < 5; i++)
1670 fprintf (stream, "#define bidi_category_header_%d %d\n", i,
1671 ((uint32_t *) t.result)[i]);
1672 fprintf (stream, "static const\n");
1673 fprintf (stream, "struct\n");
1674 fprintf (stream, " {\n");
1675 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1676 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1677 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
1678 (1 << t.p) * 5 / 16);
1679 fprintf (stream, " }\n");
1680 fprintf (stream, "u_bidi_category =\n");
1681 fprintf (stream, "{\n");
1682 fprintf (stream, " {");
1683 if (t.level1_size > 8)
1684 fprintf (stream, "\n ");
1685 for (i = 0; i < t.level1_size; i++)
1687 uint32_t offset;
1688 if (i > 0 && (i % 8) == 0)
1689 fprintf (stream, "\n ");
1690 offset = ((uint32_t *) (t.result + level1_offset))[i];
1691 if (offset == 0)
1692 fprintf (stream, " %5d", -1);
1693 else
1694 fprintf (stream, " %5zu",
1695 (offset - level2_offset) / sizeof (uint32_t));
1696 if (i+1 < t.level1_size)
1697 fprintf (stream, ",");
1699 if (t.level1_size > 8)
1700 fprintf (stream, "\n ");
1701 fprintf (stream, " },\n");
1702 fprintf (stream, " {");
1703 if (t.level2_size << t.q > 8)
1704 fprintf (stream, "\n ");
1705 for (i = 0; i < t.level2_size << t.q; i++)
1707 uint32_t offset;
1708 if (i > 0 && (i % 8) == 0)
1709 fprintf (stream, "\n ");
1710 offset = ((uint32_t *) (t.result + level2_offset))[i];
1711 if (offset == 0)
1712 fprintf (stream, " %5d", -1);
1713 else
1714 fprintf (stream, " %5zu",
1715 (offset - level3_offset) / sizeof (uint8_t));
1716 if (i+1 < t.level2_size << t.q)
1717 fprintf (stream, ",");
1719 if (t.level2_size << t.q > 8)
1720 fprintf (stream, "\n ");
1721 fprintf (stream, " },\n");
1722 /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
1723 not 32-bit units, in order to make the lookup function easier. */
1724 level3_packed =
1725 (uint16_t *)
1726 calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
1727 for (i = 0; i < t.level3_size << t.p; i++)
1729 unsigned int j = (i * 5) / 16;
1730 unsigned int k = (i * 5) % 16;
1731 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
1732 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
1733 level3_packed[j] = value & 0xffff;
1734 level3_packed[j+1] = value >> 16;
1736 fprintf (stream, " {");
1737 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1738 fprintf (stream, "\n ");
1739 for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
1741 if (i > 0 && (i % 8) == 0)
1742 fprintf (stream, "\n ");
1743 fprintf (stream, " 0x%04x", level3_packed[i]);
1744 if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
1745 fprintf (stream, ",");
1747 if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
1748 fprintf (stream, "\n ");
1749 fprintf (stream, " }\n");
1750 free (level3_packed);
1751 fprintf (stream, "};\n");
1753 if (ferror (stream) || fclose (stream))
1755 fprintf (stderr, "error writing to '%s'\n", filename);
1756 exit (1);
1760 /* ========================================================================= */
1762 /* Decimal digit value. */
1763 /* See Unicode 3.0 book, section 4.6. */
1765 static int
1766 get_decdigit_value (unsigned int ch)
1768 if (unicode_attributes[ch].name != NULL
1769 && unicode_attributes[ch].decdigit[0] != '\0')
1770 return atoi (unicode_attributes[ch].decdigit);
1771 return -1;
1774 /* Construction of sparse 3-level tables. */
1775 #define TABLE decdigit_table
1776 #define ELEMENT uint8_t
1777 #define DEFAULT 0
1778 #define xmalloc malloc
1779 #define xrealloc realloc
1780 #include "3level.h"
/* Write the unit test data for the decimal digit value table to FILENAME:
   one "{ code point, value }" line for each character that has a decimal
   digit value, the lines being separated by commas.
   VERSION is the Unicode version string mentioned in the header comment.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_decimal_digit_test (const char *filename, const char *version)
{
  bool wrote_entry = false;
  unsigned int ch;
  FILE *stream = fopen (filename, "w");

  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", stream);
  fputs ("/* Decimal digit values of Unicode characters. */\n", stream);
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fputs ("\n", stream);

  fputs ("/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n", stream);
  fputs ("\n", stream);
  output_tests_license (stream);
  fputs ("\n", stream);

  for (ch = 0; ch < 0x110000; ch++)
    {
      int digit = get_decdigit_value (ch);

      assert (digit >= -1 && digit < 10);

      /* -1 means that the character has no decimal digit value.  */
      if (digit < 0)
        continue;

      /* The separator goes before every entry except the first one.  */
      if (wrote_entry)
        fputs (",\n", stream);
      fprintf (stream, " { 0x%04X, %d }", ch, digit);
      wrote_entry = true;
    }
  if (wrote_entry)
    fputs ("\n", stream);

  if (ferror (stream) != 0 || fclose (stream) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
1833 /* Output the per-character decimal digit value table. */
1834 static void
1835 output_decimal_digit (const char *filename, const char *version)
1837 FILE *stream;
1838 unsigned int ch, i;
1839 struct decdigit_table t;
1840 unsigned int level1_offset, level2_offset, level3_offset;
1842 stream = fopen (filename, "w");
1843 if (stream == NULL)
1845 fprintf (stderr, "cannot open '%s' for writing\n", filename);
1846 exit (1);
1849 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
1850 fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
1851 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
1852 version);
1853 fprintf (stream, "\n");
1855 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
1856 fprintf (stream, "\n");
1857 output_library_license (stream, false);
1858 fprintf (stream, "\n");
1860 t.p = 7;
1861 t.q = 9;
1862 decdigit_table_init (&t);
1864 for (ch = 0; ch < 0x110000; ch++)
1866 int value = 1 + get_decdigit_value (ch);
1868 assert (value >= 0 && value <= 10);
1870 decdigit_table_add (&t, ch, value);
1873 decdigit_table_finalize (&t);
1875 /* Offsets in t.result, in memory of this process. */
1876 level1_offset =
1877 5 * sizeof (uint32_t);
1878 level2_offset =
1879 5 * sizeof (uint32_t)
1880 + t.level1_size * sizeof (uint32_t);
1881 level3_offset =
1882 5 * sizeof (uint32_t)
1883 + t.level1_size * sizeof (uint32_t)
1884 + (t.level2_size << t.q) * sizeof (uint32_t);
1886 for (i = 0; i < 5; i++)
1887 fprintf (stream, "#define decdigit_header_%d %d\n", i,
1888 ((uint32_t *) t.result)[i]);
1889 fprintf (stream, "static const\n");
1890 fprintf (stream, "struct\n");
1891 fprintf (stream, " {\n");
1892 fprintf (stream, " int level1[%zu];\n", t.level1_size);
1893 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
1894 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
1895 t.p - 1);
1896 fprintf (stream, " }\n");
1897 fprintf (stream, "u_decdigit =\n");
1898 fprintf (stream, "{\n");
1899 fprintf (stream, " {");
1900 if (t.level1_size > 8)
1901 fprintf (stream, "\n ");
1902 for (i = 0; i < t.level1_size; i++)
1904 uint32_t offset;
1905 if (i > 0 && (i % 8) == 0)
1906 fprintf (stream, "\n ");
1907 offset = ((uint32_t *) (t.result + level1_offset))[i];
1908 if (offset == 0)
1909 fprintf (stream, " %5d", -1);
1910 else
1911 fprintf (stream, " %5zu",
1912 (offset - level2_offset) / sizeof (uint32_t));
1913 if (i+1 < t.level1_size)
1914 fprintf (stream, ",");
1916 if (t.level1_size > 8)
1917 fprintf (stream, "\n ");
1918 fprintf (stream, " },\n");
1919 fprintf (stream, " {");
1920 if (t.level2_size << t.q > 8)
1921 fprintf (stream, "\n ");
1922 for (i = 0; i < t.level2_size << t.q; i++)
1924 uint32_t offset;
1925 if (i > 0 && (i % 8) == 0)
1926 fprintf (stream, "\n ");
1927 offset = ((uint32_t *) (t.result + level2_offset))[i];
1928 if (offset == 0)
1929 fprintf (stream, " %5d", -1);
1930 else
1931 fprintf (stream, " %5zu",
1932 (offset - level3_offset) / sizeof (uint8_t));
1933 if (i+1 < t.level2_size << t.q)
1934 fprintf (stream, ",");
1936 if (t.level2_size << t.q > 8)
1937 fprintf (stream, "\n ");
1938 fprintf (stream, " },\n");
1939 /* Pack the level3 array. Each entry needs 4 bits only. */
1940 fprintf (stream, " {");
1941 if (t.level3_size << (t.p - 1) > 8)
1942 fprintf (stream, "\n ");
1943 for (i = 0; i < t.level3_size << (t.p - 1); i++)
1945 if (i > 0 && (i % 8) == 0)
1946 fprintf (stream, "\n ");
1947 fprintf (stream, " 0x%02x",
1948 ((uint8_t *) (t.result + level3_offset))[2*i]
1949 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
1950 if (i+1 < t.level3_size << (t.p - 1))
1951 fprintf (stream, ",");
1953 if (t.level3_size << (t.p - 1) > 8)
1954 fprintf (stream, "\n ");
1955 fprintf (stream, " }\n");
1956 fprintf (stream, "};\n");
1958 if (ferror (stream) || fclose (stream))
1960 fprintf (stderr, "error writing to '%s'\n", filename);
1961 exit (1);
1965 /* ========================================================================= */
1967 /* Digit value. */
1968 /* See Unicode 3.0 book, section 4.6. */
1970 static int
1971 get_digit_value (unsigned int ch)
1973 if (unicode_attributes[ch].name != NULL
1974 && unicode_attributes[ch].digit[0] != '\0')
1975 return atoi (unicode_attributes[ch].digit);
1976 return -1;
/* Write the unit test data for the digit value table to FILENAME: one
   "{ code point, value }" line for each character that has a digit value,
   the lines being separated by commas.
   VERSION is the Unicode version string mentioned in the header comment.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_digit_test (const char *filename, const char *version)
{
  bool wrote_entry = false;
  unsigned int ch;
  FILE *stream = fopen (filename, "w");

  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", stream);
  fputs ("/* Digit values of Unicode characters. */\n", stream);
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fputs ("\n", stream);

  fputs ("/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n", stream);
  fputs ("\n", stream);
  output_tests_license (stream);
  fputs ("\n", stream);

  for (ch = 0; ch < 0x110000; ch++)
    {
      int digit = get_digit_value (ch);

      assert (digit >= -1 && digit < 10);

      /* -1 means that the character has no digit value.  */
      if (digit < 0)
        continue;

      /* The separator goes before every entry except the first one.  */
      if (wrote_entry)
        fputs (",\n", stream);
      fprintf (stream, " { 0x%04X, %d }", ch, digit);
      wrote_entry = true;
    }
  if (wrote_entry)
    fputs ("\n", stream);

  if (ferror (stream) != 0 || fclose (stream) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
2030 /* Output the per-character digit value table. */
2031 static void
2032 output_digit (const char *filename, const char *version)
2034 FILE *stream;
2035 unsigned int ch, i;
2036 struct decdigit_table t;
2037 unsigned int level1_offset, level2_offset, level3_offset;
2039 stream = fopen (filename, "w");
2040 if (stream == NULL)
2042 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2043 exit (1);
2046 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2047 fprintf (stream, "/* Digit values of Unicode characters. */\n");
2048 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2049 version);
2050 fprintf (stream, "\n");
2052 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
2053 fprintf (stream, "\n");
2054 output_library_license (stream, false);
2055 fprintf (stream, "\n");
2057 t.p = 7;
2058 t.q = 9;
2059 decdigit_table_init (&t);
2061 for (ch = 0; ch < 0x110000; ch++)
2063 int value = 1 + get_digit_value (ch);
2065 assert (value >= 0 && value <= 10);
2067 decdigit_table_add (&t, ch, value);
2070 decdigit_table_finalize (&t);
2072 /* Offsets in t.result, in memory of this process. */
2073 level1_offset =
2074 5 * sizeof (uint32_t);
2075 level2_offset =
2076 5 * sizeof (uint32_t)
2077 + t.level1_size * sizeof (uint32_t);
2078 level3_offset =
2079 5 * sizeof (uint32_t)
2080 + t.level1_size * sizeof (uint32_t)
2081 + (t.level2_size << t.q) * sizeof (uint32_t);
2083 for (i = 0; i < 5; i++)
2084 fprintf (stream, "#define digit_header_%d %d\n", i,
2085 ((uint32_t *) t.result)[i]);
2086 fprintf (stream, "static const\n");
2087 fprintf (stream, "struct\n");
2088 fprintf (stream, " {\n");
2089 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2090 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2091 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size,
2092 t.p - 1);
2093 fprintf (stream, " }\n");
2094 fprintf (stream, "u_digit =\n");
2095 fprintf (stream, "{\n");
2096 fprintf (stream, " {");
2097 if (t.level1_size > 8)
2098 fprintf (stream, "\n ");
2099 for (i = 0; i < t.level1_size; i++)
2101 uint32_t offset;
2102 if (i > 0 && (i % 8) == 0)
2103 fprintf (stream, "\n ");
2104 offset = ((uint32_t *) (t.result + level1_offset))[i];
2105 if (offset == 0)
2106 fprintf (stream, " %5d", -1);
2107 else
2108 fprintf (stream, " %5zu",
2109 (offset - level2_offset) / sizeof (uint32_t));
2110 if (i+1 < t.level1_size)
2111 fprintf (stream, ",");
2113 if (t.level1_size > 8)
2114 fprintf (stream, "\n ");
2115 fprintf (stream, " },\n");
2116 fprintf (stream, " {");
2117 if (t.level2_size << t.q > 8)
2118 fprintf (stream, "\n ");
2119 for (i = 0; i < t.level2_size << t.q; i++)
2121 uint32_t offset;
2122 if (i > 0 && (i % 8) == 0)
2123 fprintf (stream, "\n ");
2124 offset = ((uint32_t *) (t.result + level2_offset))[i];
2125 if (offset == 0)
2126 fprintf (stream, " %5d", -1);
2127 else
2128 fprintf (stream, " %5zu",
2129 (offset - level3_offset) / sizeof (uint8_t));
2130 if (i+1 < t.level2_size << t.q)
2131 fprintf (stream, ",");
2133 if (t.level2_size << t.q > 8)
2134 fprintf (stream, "\n ");
2135 fprintf (stream, " },\n");
2136 /* Pack the level3 array. Each entry needs 4 bits only. */
2137 fprintf (stream, " {");
2138 if (t.level3_size << (t.p - 1) > 8)
2139 fprintf (stream, "\n ");
2140 for (i = 0; i < t.level3_size << (t.p - 1); i++)
2142 if (i > 0 && (i % 8) == 0)
2143 fprintf (stream, "\n ");
2144 fprintf (stream, " 0x%02x",
2145 ((uint8_t *) (t.result + level3_offset))[2*i]
2146 + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
2147 if (i+1 < t.level3_size << (t.p - 1))
2148 fprintf (stream, ",");
2150 if (t.level3_size << (t.p - 1) > 8)
2151 fprintf (stream, "\n ");
2152 fprintf (stream, " }\n");
2153 fprintf (stream, "};\n");
2155 if (ferror (stream) || fclose (stream))
2157 fprintf (stderr, "error writing to '%s'\n", filename);
2158 exit (1);
2162 /* ========================================================================= */
2164 /* Numeric value. */
2165 /* See Unicode 3.0 book, section 4.6. */
/* A numeric value, represented as the fraction numerator/denominator.  */
typedef struct
{
  int numerator;
  int denominator;
}
uc_fraction_t;
2169 static uc_fraction_t
2170 get_numeric_value (unsigned int ch)
2172 uc_fraction_t value;
2174 if (unicode_attributes[ch].name != NULL
2175 && unicode_attributes[ch].numeric[0] != '\0')
2177 const char *str = unicode_attributes[ch].numeric;
2178 /* str is of the form "integer" or "integer/posinteger". */
2179 value.numerator = atoi (str);
2180 if (strchr (str, '/') != NULL)
2181 value.denominator = atoi (strchr (str, '/') + 1);
2182 else
2183 value.denominator = 1;
2185 else
2187 value.numerator = 0;
2188 value.denominator = 0;
2190 return value;
2193 /* Output the unit test for the per-character numeric value table. */
2194 static void
2195 output_numeric_test (const char *filename, const char *version)
2197 FILE *stream;
2198 bool need_comma;
2199 unsigned int ch;
2201 stream = fopen (filename, "w");
2202 if (stream == NULL)
2204 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2205 exit (1);
2208 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2209 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2210 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2211 version);
2212 fprintf (stream, "\n");
2214 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
2215 fprintf (stream, "\n");
2216 output_tests_license (stream);
2217 fprintf (stream, "\n");
2219 need_comma = false;
2220 for (ch = 0; ch < 0x110000; ch++)
2222 uc_fraction_t value = get_numeric_value (ch);
2224 if (value.numerator != 0 || value.denominator != 0)
2226 if (need_comma)
2227 fprintf (stream, ",\n");
2228 fprintf (stream, " { 0x%04X, %d, %d }",
2229 ch, value.numerator, value.denominator);
2230 need_comma = true;
2233 if (need_comma)
2234 fprintf (stream, "\n");
2236 if (ferror (stream) || fclose (stream))
2238 fprintf (stderr, "error writing to '%s'\n", filename);
2239 exit (1);
2243 /* Construction of sparse 3-level tables. */
2244 #define TABLE numeric_table
2245 #define ELEMENT uint8_t
2246 #define DEFAULT 0
2247 #define xmalloc malloc
2248 #define xrealloc realloc
2249 #include "3level.h"
2251 /* Output the per-character numeric value table. */
2252 static void
2253 output_numeric (const char *filename, const char *version)
2255 FILE *stream;
2256 uc_fraction_t fractions[160];
2257 unsigned int nfractions;
2258 unsigned int ch, i, j;
2259 struct numeric_table t;
2260 unsigned int level1_offset, level2_offset, level3_offset;
2261 uint16_t *level3_packed;
2263 stream = fopen (filename, "w");
2264 if (stream == NULL)
2266 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2267 exit (1);
2270 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2271 fprintf (stream, "/* Numeric values of Unicode characters. */\n");
2272 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2273 version);
2274 fprintf (stream, "\n");
2276 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
2277 fprintf (stream, "\n");
2278 output_library_license (stream, false);
2279 fprintf (stream, "\n");
2281 /* Create table of occurring fractions. */
2282 nfractions = 0;
2283 for (ch = 0; ch < 0x110000; ch++)
2285 uc_fraction_t value = get_numeric_value (ch);
2287 for (i = 0; i < nfractions; i++)
2288 if (value.numerator == fractions[i].numerator
2289 && value.denominator == fractions[i].denominator)
2290 break;
2291 if (i == nfractions)
2293 assert (nfractions != SIZEOF (fractions));
2294 for (i = 0; i < nfractions; i++)
2295 if (value.denominator < fractions[i].denominator
2296 || (value.denominator == fractions[i].denominator
2297 && value.numerator < fractions[i].numerator))
2298 break;
2299 for (j = nfractions; j > i; j--)
2300 fractions[j] = fractions[j - 1];
2301 fractions[i] = value;
2302 nfractions++;
2306 fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n",
2307 nfractions);
2308 fprintf (stream, "{\n");
2309 for (i = 0; i < nfractions; i++)
2311 fprintf (stream, " { %d, %d }", fractions[i].numerator,
2312 fractions[i].denominator);
2313 if (i+1 < nfractions)
2314 fprintf (stream, ",");
2315 fprintf (stream, "\n");
2317 fprintf (stream, "};\n");
2319 t.p = 7;
2320 t.q = 9;
2321 numeric_table_init (&t);
2323 for (ch = 0; ch < 0x110000; ch++)
2325 uc_fraction_t value = get_numeric_value (ch);
2327 for (i = 0; i < nfractions; i++)
2328 if (value.numerator == fractions[i].numerator
2329 && value.denominator == fractions[i].denominator)
2330 break;
2331 assert (i != nfractions);
2333 numeric_table_add (&t, ch, i);
2336 numeric_table_finalize (&t);
2338 /* Offsets in t.result, in memory of this process. */
2339 level1_offset =
2340 5 * sizeof (uint32_t);
2341 level2_offset =
2342 5 * sizeof (uint32_t)
2343 + t.level1_size * sizeof (uint32_t);
2344 level3_offset =
2345 5 * sizeof (uint32_t)
2346 + t.level1_size * sizeof (uint32_t)
2347 + (t.level2_size << t.q) * sizeof (uint32_t);
2349 for (i = 0; i < 5; i++)
2350 fprintf (stream, "#define numeric_header_%d %d\n", i,
2351 ((uint32_t *) t.result)[i]);
2352 fprintf (stream, "static const\n");
2353 fprintf (stream, "struct\n");
2354 fprintf (stream, " {\n");
2355 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2356 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2357 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
2358 (1 << t.p) * 8 / 16);
2359 fprintf (stream, " }\n");
2360 fprintf (stream, "u_numeric =\n");
2361 fprintf (stream, "{\n");
2362 fprintf (stream, " {");
2363 if (t.level1_size > 8)
2364 fprintf (stream, "\n ");
2365 for (i = 0; i < t.level1_size; i++)
2367 uint32_t offset;
2368 if (i > 0 && (i % 8) == 0)
2369 fprintf (stream, "\n ");
2370 offset = ((uint32_t *) (t.result + level1_offset))[i];
2371 if (offset == 0)
2372 fprintf (stream, " %5d", -1);
2373 else
2374 fprintf (stream, " %5zu",
2375 (offset - level2_offset) / sizeof (uint32_t));
2376 if (i+1 < t.level1_size)
2377 fprintf (stream, ",");
2379 if (t.level1_size > 8)
2380 fprintf (stream, "\n ");
2381 fprintf (stream, " },\n");
2382 fprintf (stream, " {");
2383 if (t.level2_size << t.q > 8)
2384 fprintf (stream, "\n ");
2385 for (i = 0; i < t.level2_size << t.q; i++)
2387 uint32_t offset;
2388 if (i > 0 && (i % 8) == 0)
2389 fprintf (stream, "\n ");
2390 offset = ((uint32_t *) (t.result + level2_offset))[i];
2391 if (offset == 0)
2392 fprintf (stream, " %5d", -1);
2393 else
2394 fprintf (stream, " %5zu",
2395 (offset - level3_offset) / sizeof (uint8_t));
2396 if (i+1 < t.level2_size << t.q)
2397 fprintf (stream, ",");
2399 if (t.level2_size << t.q > 8)
2400 fprintf (stream, "\n ");
2401 fprintf (stream, " },\n");
2402 /* Pack the level3 array. Each entry needs 8 bits only. Use 16-bit units,
2403 not 32-bit units, in order to make the lookup function easier. */
2404 level3_packed =
2405 (uint16_t *)
2406 calloc ((t.level3_size << t.p) * 8 / 16 + 1, sizeof (uint16_t));
2407 for (i = 0; i < t.level3_size << t.p; i++)
2409 unsigned int j = (i * 8) / 16;
2410 unsigned int k = (i * 8) % 16;
2411 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
2412 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
2413 level3_packed[j] = value & 0xffff;
2414 level3_packed[j+1] = value >> 16;
2416 fprintf (stream, " {");
2417 if ((t.level3_size << t.p) * 8 / 16 + 1 > 8)
2418 fprintf (stream, "\n ");
2419 for (i = 0; i < (t.level3_size << t.p) * 8 / 16 + 1; i++)
2421 if (i > 0 && (i % 8) == 0)
2422 fprintf (stream, "\n ");
2423 fprintf (stream, " 0x%04x", level3_packed[i]);
2424 if (i+1 < (t.level3_size << t.p) * 8 / 16 + 1)
2425 fprintf (stream, ",");
2427 if ((t.level3_size << t.p) * 8 / 16 + 1 > 8)
2428 fprintf (stream, "\n ");
2429 fprintf (stream, " }\n");
2430 free (level3_packed);
2431 fprintf (stream, "};\n");
2433 if (ferror (stream) || fclose (stream))
2435 fprintf (stderr, "error writing to '%s'\n", filename);
2436 exit (1);
2440 /* ========================================================================= */
2442 /* Mirrored. */
2443 /* See Unicode 3.0 book, section 4.7,
2444 UAX #9. */
/* Two characters that are each other's mirror image.  */
struct mirror_pair
{
  unsigned int uc[2];
};

/* The mirrored character pairs read from the BidiMirroring.txt file.
   This is a subset of the characters having the BidiMirrored property.
   Only the first mirror_pairs_count elements are in use.  */
static struct mirror_pair mirror_pairs[1000];
static unsigned int mirror_pairs_count;
2454 /* Stores in mirror_pairs[] the mirrored character pairs from the
2455 BidiMirroring.txt file. */
2456 static void
2457 fill_mirror (const char *bidimirroring_filename)
2459 FILE *stream;
2460 char field0[FIELDLEN];
2461 char field1[FIELDLEN];
2462 char field2[FIELDLEN];
2463 int lineno = 0;
2465 stream = fopen (bidimirroring_filename, "r");
2466 if (stream == NULL)
2468 fprintf (stderr, "error during fopen of '%s'\n", bidimirroring_filename);
2469 exit (1);
2472 mirror_pairs_count = 0;
2473 for (;;)
2475 int n;
2476 int c;
2477 unsigned int uc1;
2478 unsigned int uc2;
2479 unsigned int i;
2481 lineno++;
2482 c = getc (stream);
2483 if (c == EOF)
2484 break;
2485 if (c == '\n')
2486 continue;
2487 if (c == '#')
2489 do c = getc (stream); while (c != EOF && c != '\n');
2490 continue;
2492 ungetc (c, stream);
2493 n = getfield (stream, field0, ';');
2494 do c = getc (stream); while (c == ' ');
2495 ungetc (c, stream);
2496 n += getfield (stream, field1, '#');
2497 n += getfield (stream, field2, '\n');
2498 if (n == 0)
2499 break;
2500 if (n != 3)
2502 fprintf (stderr, "short line in '%s':%d\n",
2503 bidimirroring_filename, lineno);
2504 exit (1);
2506 /* Remove trailing spaces from field1. */
2507 while (strlen (field1) > 0 && field1[strlen (field1) - 1] == ' ')
2508 field1[strlen (field1) - 1] = '\0';
2509 /* The line should contain two characters. */
2510 uc1 = strtoul (field0, NULL, 16);
2511 uc2 = strtoul (field1, NULL, 16);
2512 if (uc1 == 0 || uc2 == 0 || uc1 == uc2)
2514 fprintf (stderr, "parse error at '%s':%d\n",
2515 bidimirroring_filename, lineno);
2516 exit (1);
2518 /* Verify that uc1 and uc2 are in range. */
2519 if (!(uc1 < 0x110000))
2521 fprintf (stderr, "%s mentions 0x%04X, which is out-of-range.\n",
2522 bidimirroring_filename, uc1);
2523 exit (1);
2525 if (!(uc2 < 0x110000))
2527 fprintf (stderr, "%s mentions 0x%04X, which is out-of-range.\n",
2528 bidimirroring_filename, uc2);
2529 exit (1);
2531 /* Have we seen uc1 or uc2 already? */
2532 for (i = 0; i < mirror_pairs_count; i++)
2534 if (uc1 == mirror_pairs[i].uc[0])
2536 fprintf (stderr, "%s: mapping conflict for 0x%04X\n",
2537 bidimirroring_filename, uc1);
2538 exit (1);
2540 if (uc2 == mirror_pairs[i].uc[1])
2542 fprintf (stderr, "%s: mapping conflict for 0x%04X\n",
2543 bidimirroring_filename, uc2);
2544 exit (1);
2547 for (i = 0; i < mirror_pairs_count; i++)
2548 if (uc1 == mirror_pairs[i].uc[1] || uc2 == mirror_pairs[i].uc[0])
2549 break;
2550 if (i < mirror_pairs_count)
2552 if (uc1 != mirror_pairs[i].uc[1])
2554 /* uc1 != mirror_pairs[i].uc[1], uc2 == mirror_pairs[i].uc[0] */
2555 fprintf (stderr, "%s: mapping conflict for 0x%04X\n",
2556 bidimirroring_filename, uc2);
2557 exit (1);
2559 if (uc2 != mirror_pairs[i].uc[0])
2561 /* uc1 == mirror_pairs[i].uc[1], uc2 != mirror_pairs[i].uc[0] */
2562 fprintf (stderr, "%s: mapping conflict for 0x%04X\n",
2563 bidimirroring_filename, uc1);
2564 exit (1);
2566 /* uc1 == mirror_pairs[i].uc[1], uc2 == mirror_pairs[i].uc[0].
2567 (uc1, uc2) is the reverse pair of a pair that we already had
2568 encountered: (uc2, uc1). */
2570 else
2572 /* A new pair. */
2573 if (mirror_pairs_count == SIZEOF (mirror_pairs))
2575 fprintf (stderr, "%s contains more pairs than expected, "
2576 "increase mirror_pairs' size.\n",
2577 bidimirroring_filename);
2578 exit (1);
2580 mirror_pairs[mirror_pairs_count].uc[0] = uc1;
2581 mirror_pairs[mirror_pairs_count].uc[1] = uc2;
2582 mirror_pairs_count++;
2584 /* Verify that uc1 and uc2 have the BidiMirrored property. */
2585 if (!(unicode_attributes[uc1].name != NULL
2586 && unicode_attributes[uc1].mirrored))
2588 fprintf (stderr, "%s mentions 0x%04X, which is not BidiMirrored\n",
2589 bidimirroring_filename, uc1);
2590 exit (1);
2592 if (!(unicode_attributes[uc2].name != NULL
2593 && unicode_attributes[uc2].mirrored))
2595 fprintf (stderr, "%s mentions 0x%04X, which is not BidiMirrored\n",
2596 bidimirroring_filename, uc2);
2597 exit (1);
2601 if (ferror (stream) || fclose (stream))
2603 fprintf (stderr, "error reading from '%s'\n", bidimirroring_filename);
2604 exit (1);
2608 static int
2609 get_mirror_value (unsigned int ch)
2611 bool mirrored;
2612 unsigned int mirror_char;
2613 unsigned int i;
2615 mirrored = (unicode_attributes[ch].name != NULL
2616 && unicode_attributes[ch].mirrored);
2617 mirror_char = 0xfffd;
2618 for (i = 0; i < mirror_pairs_count; i++)
2619 if (ch == mirror_pairs[i].uc[0])
2621 mirror_char = mirror_pairs[i].uc[1];
2622 break;
2624 else if (ch == mirror_pairs[i].uc[1])
2626 mirror_char = mirror_pairs[i].uc[0];
2627 break;
2629 if (mirrored)
2630 return (int) mirror_char - (int) ch;
2631 else
2633 assert (mirror_char == 0xfffd);
2634 return 0;
2638 /* Construction of sparse 3-level tables. */
2639 #define TABLE mirror_table
2640 #define ELEMENT int32_t
2641 #define DEFAULT 0
2642 #define xmalloc malloc
2643 #define xrealloc realloc
2644 #include "3level.h"
2646 /* Output the per-character mirror table. */
2647 static void
2648 output_mirror (const char *filename, const char *version)
2650 FILE *stream;
2651 unsigned int ch, i;
2652 struct mirror_table t;
2653 unsigned int level1_offset, level2_offset, level3_offset;
2655 stream = fopen (filename, "w");
2656 if (stream == NULL)
2658 fprintf (stderr, "cannot open '%s' for writing\n", filename);
2659 exit (1);
2662 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
2663 fprintf (stream, "/* Mirrored Unicode characters. */\n");
2664 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
2665 version);
2666 fprintf (stream, "\n");
2668 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
2669 fprintf (stream, "\n");
2670 output_library_license (stream, false);
2671 fprintf (stream, "\n");
2673 t.p = 7;
2674 t.q = 9;
2675 mirror_table_init (&t);
2677 for (ch = 0; ch < 0x110000; ch++)
2679 int value = get_mirror_value (ch);
2681 mirror_table_add (&t, ch, value);
2684 mirror_table_finalize (&t);
2686 /* Offsets in t.result, in memory of this process. */
2687 level1_offset =
2688 5 * sizeof (uint32_t);
2689 level2_offset =
2690 5 * sizeof (uint32_t)
2691 + t.level1_size * sizeof (uint32_t);
2692 level3_offset =
2693 5 * sizeof (uint32_t)
2694 + t.level1_size * sizeof (uint32_t)
2695 + (t.level2_size << t.q) * sizeof (uint32_t);
2697 for (i = 0; i < 5; i++)
2698 fprintf (stream, "#define mirror_header_%d %d\n", i,
2699 ((uint32_t *) t.result)[i]);
2700 fprintf (stream, "static const\n");
2701 fprintf (stream, "struct\n");
2702 fprintf (stream, " {\n");
2703 fprintf (stream, " int level1[%zu];\n", t.level1_size);
2704 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
2705 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
2706 fprintf (stream, " }\n");
2707 fprintf (stream, "u_mirror =\n");
2708 fprintf (stream, "{\n");
2709 fprintf (stream, " {");
2710 if (t.level1_size > 8)
2711 fprintf (stream, "\n ");
2712 for (i = 0; i < t.level1_size; i++)
2714 uint32_t offset;
2715 if (i > 0 && (i % 8) == 0)
2716 fprintf (stream, "\n ");
2717 offset = ((uint32_t *) (t.result + level1_offset))[i];
2718 if (offset == 0)
2719 fprintf (stream, " %5d", -1);
2720 else
2721 fprintf (stream, " %5zu",
2722 (offset - level2_offset) / sizeof (uint32_t));
2723 if (i+1 < t.level1_size)
2724 fprintf (stream, ",");
2726 if (t.level1_size > 8)
2727 fprintf (stream, "\n ");
2728 fprintf (stream, " },\n");
2729 fprintf (stream, " {");
2730 if (t.level2_size << t.q > 8)
2731 fprintf (stream, "\n ");
2732 for (i = 0; i < t.level2_size << t.q; i++)
2734 uint32_t offset;
2735 if (i > 0 && (i % 8) == 0)
2736 fprintf (stream, "\n ");
2737 offset = ((uint32_t *) (t.result + level2_offset))[i];
2738 if (offset == 0)
2739 fprintf (stream, " %5d", -1);
2740 else
2741 fprintf (stream, " %5zu",
2742 (offset - level3_offset) / sizeof (int32_t));
2743 if (i+1 < t.level2_size << t.q)
2744 fprintf (stream, ",");
2746 if (t.level2_size << t.q > 8)
2747 fprintf (stream, "\n ");
2748 fprintf (stream, " },\n");
2749 fprintf (stream, " {");
2750 if (t.level3_size << t.p > 8)
2751 fprintf (stream, "\n ");
2752 for (i = 0; i < t.level3_size << t.p; i++)
2754 if (i > 0 && (i % 8) == 0)
2755 fprintf (stream, "\n ");
2756 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
2757 if (i+1 < t.level3_size << t.p)
2758 fprintf (stream, ",");
2760 if (t.level3_size << t.p > 8)
2761 fprintf (stream, "\n ");
2762 fprintf (stream, " }\n");
2763 fprintf (stream, "};\n");
2765 if (ferror (stream) || fclose (stream))
2767 fprintf (stderr, "error writing to '%s'\n", filename);
2768 exit (1);
2772 /* ========================================================================= */
2774 /* Particular values of the word break property. */
/* Whether CH is one of the code points listed here as having the
   word break property value MidNumLet.  */
static bool
is_WBP_MIDNUMLET (unsigned int ch)
{
  switch (ch)
    {
    case 0x002E:
    case 0x2018:
    case 0x2019:
    case 0x2024:
    case 0xFE52:
    case 0xFF07:
    case 0xFF0E:
      return true;
    default:
      return false;
    }
}
/* Whether CH is one of the code points listed here as having the
   word break property value MidLetter.  */
static bool
is_WBP_MIDLETTER (unsigned int ch)
{
  switch (ch)
    {
    case 0x003A:
    case 0x00B7:
    case 0x0387:
    case 0x055F:
    case 0x05F4:
    case 0x2027:
    case 0xFE13:
    case 0xFE55:
    case 0xFF1A:
      return true;
    default:
      return false;
    }
}
2791 /* ========================================================================= */
2793 /* Properties. */
/* Reading PropList.txt and DerivedCoreProperties.txt.  */

/* Each enumerator is the bit index of one binary property within an element
   of unicode_properties[].  The enumerators are numbered consecutively from
   0, so their order must not be changed without regenerating everything that
   depends on it.  */
enum
{
  /* PropList.txt */
  PROP_WHITE_SPACE = 0,
  PROP_BIDI_CONTROL,
  PROP_JOIN_CONTROL,
  PROP_PREPENDED_CONCATENATION_MARK,
  PROP_DASH,
  PROP_HYPHEN,
  PROP_QUOTATION_MARK,
  PROP_TERMINAL_PUNCTUATION,
  PROP_OTHER_MATH,
  PROP_HEX_DIGIT,
  PROP_ASCII_HEX_DIGIT,
  PROP_OTHER_ALPHABETIC,
  PROP_IDEOGRAPHIC,
  PROP_DIACRITIC,
  PROP_EXTENDER,
  PROP_OTHER_LOWERCASE,
  PROP_OTHER_UPPERCASE,
  PROP_NONCHARACTER_CODE_POINT,
  PROP_OTHER_GRAPHEME_EXTEND,
  PROP_IDS_BINARY_OPERATOR,
  PROP_IDS_TRINARY_OPERATOR,
  PROP_IDS_UNARY_OPERATOR,
  PROP_RADICAL,
  PROP_UNIFIED_IDEOGRAPH,
  PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
  PROP_DEPRECATED,
  PROP_SOFT_DOTTED,
  PROP_LOGICAL_ORDER_EXCEPTION,
  PROP_OTHER_ID_START,
  PROP_OTHER_ID_CONTINUE,
  PROP_ID_COMPAT_MATH_CONTINUE,
  PROP_ID_COMPAT_MATH_START,
  PROP_SENTENCE_TERMINAL,
  PROP_VARIATION_SELECTOR,
  PROP_PATTERN_WHITE_SPACE,
  PROP_PATTERN_SYNTAX,
  PROP_REGIONAL_INDICATOR,
  PROP_MODIFIER_COMBINING_MARK,

  /* DerivedCoreProperties.txt */
  PROP_MATH,
  PROP_ALPHABETIC,
  PROP_LOWERCASE,
  PROP_UPPERCASE,
  PROP_CASED,
  PROP_CASE_IGNORABLE,
  PROP_CHANGES_WHEN_LOWERCASED,
  PROP_CHANGES_WHEN_UPPERCASED,
  PROP_CHANGES_WHEN_TITLECASED,
  PROP_CHANGES_WHEN_CASEFOLDED,
  PROP_CHANGES_WHEN_CASEMAPPED,
  PROP_ID_START,
  PROP_ID_CONTINUE,
  PROP_XID_START,
  PROP_XID_CONTINUE,
  PROP_DEFAULT_IGNORABLE_CODE_POINT,
  PROP_GRAPHEME_EXTEND,
  PROP_GRAPHEME_BASE,
  PROP_GRAPHEME_LINK,

  /* emoji-data.txt */
  PROP_EMOJI,
  PROP_EMOJI_PRESENTATION,
  PROP_EMOJI_MODIFIER,
  PROP_EMOJI_MODIFIER_BASE,
  PROP_EMOJI_COMPONENT,
  PROP_EXTENDED_PICTOGRAPHIC
};

/* For every code point, the set of its binary properties, as a bit mask:
   bit PROP_xxx is set if the code point has property xxx.  */
unsigned long long unicode_properties[0x110000];
/* Values of the enum-valued InCB property.  */
enum
{
  UC_INDIC_CONJUNCT_BREAK_NONE      = 0, /* None */
  UC_INDIC_CONJUNCT_BREAK_CONSONANT = 1, /* Consonant */
  UC_INDIC_CONJUNCT_BREAK_LINKER    = 2, /* Linker */
  UC_INDIC_CONJUNCT_BREAK_EXTEND    = 3  /* Extend */
};

/* For every code point, one of the UC_INDIC_CONJUNCT_BREAK_* values.  */
static uint8_t unicode_indic_conjunct_break[0x110000];
2876 static void
2877 clear_properties (void)
2879 unsigned int i;
2881 for (i = 0; i < 0x110000; i++)
2882 unicode_properties[i] = 0;
2885 /* Stores in unicode_properties[] the properties from the
2886 PropList.txt or DerivedCoreProperties.txt file. */
2887 static void
2888 fill_properties (const char *proplist_filename)
2890 unsigned int i;
2891 FILE *stream;
2893 stream = fopen (proplist_filename, "r");
2894 if (stream == NULL)
2896 fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
2897 exit (1);
2900 for (;;)
2902 char buf[200+1];
2903 unsigned int i1, i2;
2904 char padding[200+1];
2905 char propname[200+1];
2906 char rest_of_line[200+1];
2907 unsigned int propcode;
2909 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
2910 break;
2912 if (buf[0] == '\0' || buf[0] == '#')
2913 continue;
2915 if (sscanf (buf, "%X..%X%[ ;]%[^ #]%200s", &i1, &i2, padding, propname, rest_of_line) != 5)
2917 if (sscanf (buf, "%X%[ ;]%[^ #]%200s", &i1, padding, propname, rest_of_line) != 4)
2919 fprintf (stderr, "parse error in '%s'\n", proplist_filename);
2920 exit (1);
2922 i2 = i1;
2924 #define PROP(name,code) \
2925 if (strcmp (propname, name) == 0) propcode = code; else
2926 /* PropList.txt */
2927 PROP ("White_Space", PROP_WHITE_SPACE)
2928 PROP ("Bidi_Control", PROP_BIDI_CONTROL)
2929 PROP ("Join_Control", PROP_JOIN_CONTROL)
2930 PROP ("Prepended_Concatenation_Mark", PROP_PREPENDED_CONCATENATION_MARK)
2931 PROP ("Dash", PROP_DASH)
2932 PROP ("Hyphen", PROP_HYPHEN)
2933 PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
2934 PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
2935 PROP ("Other_Math", PROP_OTHER_MATH)
2936 PROP ("Hex_Digit", PROP_HEX_DIGIT)
2937 PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
2938 PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
2939 PROP ("Ideographic", PROP_IDEOGRAPHIC)
2940 PROP ("Diacritic", PROP_DIACRITIC)
2941 PROP ("Extender", PROP_EXTENDER)
2942 PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
2943 PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
2944 PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
2945 PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
2946 PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
2947 PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
2948 PROP ("IDS_Unary_Operator", PROP_IDS_UNARY_OPERATOR)
2949 PROP ("Radical", PROP_RADICAL)
2950 PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
2951 PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
2952 PROP ("Deprecated", PROP_DEPRECATED)
2953 PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
2954 PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
2955 PROP ("Other_ID_Start", PROP_OTHER_ID_START)
2956 PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
2957 PROP ("ID_Compat_Math_Continue", PROP_ID_COMPAT_MATH_CONTINUE)
2958 PROP ("ID_Compat_Math_Start", PROP_ID_COMPAT_MATH_START)
2959 PROP ("Sentence_Terminal", PROP_SENTENCE_TERMINAL)
2960 PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
2961 PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
2962 PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
2963 PROP ("Regional_Indicator", PROP_REGIONAL_INDICATOR)
2964 PROP ("Modifier_Combining_Mark", PROP_MODIFIER_COMBINING_MARK)
2965 /* DerivedCoreProperties.txt */
2966 PROP ("Math", PROP_MATH)
2967 PROP ("Alphabetic", PROP_ALPHABETIC)
2968 PROP ("Lowercase", PROP_LOWERCASE)
2969 PROP ("Uppercase", PROP_UPPERCASE)
2970 PROP ("Cased", PROP_CASED)
2971 PROP ("Case_Ignorable", PROP_CASE_IGNORABLE)
2972 PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED)
2973 PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED)
2974 PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED)
2975 PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED)
2976 PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED)
2977 PROP ("ID_Start", PROP_ID_START)
2978 PROP ("ID_Continue", PROP_ID_CONTINUE)
2979 PROP ("XID_Start", PROP_XID_START)
2980 PROP ("XID_Continue", PROP_XID_CONTINUE)
2981 PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
2982 PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
2983 PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
2984 PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
2985 /* emoji-data.txt */
2986 PROP ("Emoji", PROP_EMOJI)
2987 PROP ("Emoji_Presentation", PROP_EMOJI_PRESENTATION)
2988 PROP ("Emoji_Modifier", PROP_EMOJI_MODIFIER)
2989 PROP ("Emoji_Modifier_Base", PROP_EMOJI_MODIFIER_BASE)
2990 PROP ("Emoji_Component", PROP_EMOJI_COMPONENT)
2991 PROP ("Extended_Pictographic", PROP_EXTENDED_PICTOGRAPHIC)
2992 #undef PROP
2993 /* An enum-valued property from DerivedCoreProperties.txt */
2994 if (strcmp (propname, "InCB;") == 0)
2996 char valuename[200+1];
2997 unsigned int valuecode;
2999 if (sscanf (rest_of_line, "%[^ #]", valuename) != 1)
3001 fprintf (stderr, "parse error 2 in '%s'\n", proplist_filename);
3002 exit (1);
3005 if (strcmp (valuename, "None") == 0)
3006 valuecode = UC_INDIC_CONJUNCT_BREAK_NONE;
3007 else if (strcmp (valuename, "Consonant") == 0)
3008 valuecode = UC_INDIC_CONJUNCT_BREAK_CONSONANT;
3009 else if (strcmp (valuename, "Linker") == 0)
3010 valuecode = UC_INDIC_CONJUNCT_BREAK_LINKER;
3011 else if (strcmp (valuename, "Extend") == 0)
3012 valuecode = UC_INDIC_CONJUNCT_BREAK_EXTEND;
3013 else
3015 fprintf (stderr, "unknown InCB value named '%s' in '%s'\n",
3016 valuename, proplist_filename);
3017 exit (1);
3020 assert (i1 <= i2 && i2 < 0x110000);
3021 for (i = i1; i <= i2; i++)
3022 unicode_indic_conjunct_break[i] = valuecode;
3024 goto done_line;
3026 else
3028 fprintf (stderr, "unknown property named '%s' in '%s'\n", propname,
3029 proplist_filename);
3030 exit (1);
3033 assert (i1 <= i2 && i2 < 0x110000);
3034 for (i = i1; i <= i2; i++)
3035 unicode_properties[i] |= 1ULL << propcode;
3037 done_line: ;
3040 if (ferror (stream) || fclose (stream))
3042 fprintf (stderr, "error reading from '%s'\n", proplist_filename);
3043 exit (1);
/* Stores in array the given property from the Unicode 3.0 PropList.txt
   file.
   ARRAY is cleared first.  Then the file is scanned for the first line that
   contains PROPERTY_NAME.  The lines after it, up to end of file or a line
   starting with '*', must each start with "XXXX" or "XXXX..XXXX" (4-digit
   hexadecimal code points); array[c] is set to 1 for every c listed.
   Exits with status 1 on any I/O error or syntax error, or if no line
   contains PROPERTY_NAME.  */
static void
fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
{
  FILE *stream;
  char buf[100+1];
  unsigned int uc;

  memset (array, 0, 0x110000);

  stream = fopen (proplist_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
      exit (1);
    }

  /* Search for the "Property dump for: ..." line.  */
  for (;;)
    {
      if (fscanf (stream, "%100[^\n]\n", buf) < 1)
        {
          fprintf (stderr, "no property found in '%s'\n", proplist_filename);
          exit (1);
        }
      if (strstr (buf, property_name) != NULL)
        break;
    }

  /* Read the code points and code point ranges that follow.  */
  while (fscanf (stream, "%100[^\n]\n", buf) >= 1 && buf[0] != '*')
    {
      unsigned int first, last;
      size_t len = strlen (buf);
      bool parsed_ok;

      if (len >= 10 && buf[4] == '.' && buf[5] == '.')
        parsed_ok = (sscanf (buf, "%4X..%4X", &first, &last) >= 2);
      else if (len >= 4)
        {
          parsed_ok = (sscanf (buf, "%4X", &first) >= 1);
          if (parsed_ok)
            last = first;
        }
      else
        parsed_ok = false;

      if (!parsed_ok)
        {
          fprintf (stderr, "parse error in property in '%s'\n",
                   proplist_filename);
          exit (1);
        }

      assert (first <= last && last < 0x110000);
      for (uc = first; uc <= last; uc++)
        array[uc] = 1;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", proplist_filename);
      exit (1);
    }
}
/* Properties from Unicode 3.0 PropList.txt file.  */

/* The paired punctuation property from the PropList.txt file.  */
char unicode_pairedpunctuation[0x110000];

/* The left of pair property from the PropList.txt file.  */
char unicode_leftofpair[0x110000];

/* Fill unicode_pairedpunctuation[] and unicode_leftofpair[], in this order,
   from the file PROPLIST30_FILENAME.  */
static void
fill_properties30 (const char *proplist30_filename)
{
  static const struct
  {
    char *table;
    const char *label;
  }
  wanted[] =
    {
      { unicode_pairedpunctuation, "(Paired Punctuation)" },
      { unicode_leftofpair, "(Left of Pair)" }
    };
  size_t k;

  for (k = 0; k < sizeof (wanted) / sizeof (wanted[0]); k++)
    fill_property30 (wanted[k].table, proplist30_filename, wanted[k].label);
}
3137 /* ------------------------------------------------------------------------- */
3139 /* See PropList.txt, UCD.html. */
3140 static bool
3141 is_property_white_space (unsigned int ch)
3143 return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
3146 /* See Unicode 3.0 book, section 4.10,
3147 PropList.txt, UCD.html,
3148 DerivedCoreProperties.txt, UCD.html. */
3149 static bool
3150 is_property_alphabetic (unsigned int ch)
3152 bool result1 =
3153 is_category_L (ch)
3154 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
3155 /* For some reason, the following are listed as having property
3156 Alphabetic but not as having property Other_Alphabetic. */
3157 || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */
3158 || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */
3159 || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */
3160 || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */
3161 || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */
3162 || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */
3163 || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */
3164 || (ch >= 0xA6E6 && ch <= 0xA6EF) /* BAMUM LETTERS */
3165 || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */
3166 || (ch == 0x10341) /* GOTHIC LETTER NINETY */
3167 || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */
3168 || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */
3169 || (ch >= 0x12400 && ch <= 0x1246E); /* CUNEIFORM NUMERIC SIGNS */
3170 bool result2 =
3171 ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0);
3173 assert (result1 == result2);
3174 return result1;
3177 /* See PropList.txt, UCD.html. */
3178 static bool
3179 is_property_other_alphabetic (unsigned int ch)
3181 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0);
3184 /* See PropList.txt, UCD.html. */
3185 static bool
3186 is_property_not_a_character (unsigned int ch)
3188 return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0);
3191 /* See PropList.txt, UCD.html,
3192 DerivedCoreProperties.txt, UCD.html. */
3193 static bool
3194 is_property_default_ignorable_code_point (unsigned int ch)
3196 bool result1 =
3197 (is_category_Cf (ch)
3198 && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
3199 && !(ch >= 0x13430 && ch <= 0x1343F) /* Egyptian Hieroglyph */
3200 && ((unicode_properties[ch] & (1ULL << PROP_PREPENDED_CONCATENATION_MARK)) == 0))
3201 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0)
3202 || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
3203 bool result2 =
3204 ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
3206 assert (result1 == result2);
3207 return result1;
3210 /* See PropList.txt, UCD.html. */
3211 static bool
3212 is_property_other_default_ignorable_code_point (unsigned int ch)
3214 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0);
3217 /* See PropList.txt, UCD.html. */
3218 static bool
3219 is_property_deprecated (unsigned int ch)
3221 return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0);
3224 /* See PropList.txt, UCD.html. */
3225 static bool
3226 is_property_logical_order_exception (unsigned int ch)
3228 return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0);
3231 /* See PropList.txt, UCD.html. */
3232 static bool
3233 is_property_variation_selector (unsigned int ch)
3235 return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0);
/* Whether CH lies in one of the three private use ranges.
   See PropList-3.0.1.txt.  */
static bool
is_property_private_use (unsigned int ch)
{
  /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt".  */
  if (ch >= 0xE000 && ch <= 0xF8FF)
    return true;
  if (ch >= 0xF0000 && ch <= 0xFFFFD)
    return true;
  if (ch >= 0x100000 && ch <= 0x10FFFD)
    return true;
  return false;
}
/* Whether CH is in general category Cn without being a noncharacter.
   See PropList-3.0.1.txt.  */
static bool
is_property_unassigned_code_value (unsigned int ch)
{
  if (!is_category_Cn (ch))
    return false;
  return !is_property_not_a_character (ch);
}
3255 /* See PropList.txt, UCD.html,
3256 DerivedCoreProperties.txt, UCD.html. */
3257 static bool
3258 is_property_uppercase (unsigned int ch)
3260 bool result1 =
3261 is_category_Lu (ch)
3262 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
3263 bool result2 =
3264 ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0);
3266 assert (result1 == result2);
3267 return result1;
3270 /* See PropList.txt, UCD.html. */
3271 static bool
3272 is_property_other_uppercase (unsigned int ch)
3274 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0);
3277 /* See PropList.txt, UCD.html,
3278 DerivedCoreProperties.txt, UCD.html. */
3279 static bool
3280 is_property_lowercase (unsigned int ch)
3282 bool result1 =
3283 is_category_Ll (ch)
3284 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
3285 bool result2 =
3286 ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0);
3288 assert (result1 == result2);
3289 return result1;
3292 /* See PropList.txt, UCD.html. */
3293 static bool
3294 is_property_other_lowercase (unsigned int ch)
3296 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0);
/* Tell whether CH is in general category Lt.  See PropList-3.0.1.txt.  */
static bool
is_property_titlecase (unsigned int ch)
{
  bool in_category = is_category_Lt (ch);

  return in_category;
}
3306 /* See DerivedCoreProperties.txt. */
3307 static bool
3308 is_property_cased (unsigned int ch)
3310 bool result1 = (is_property_lowercase (ch)
3311 || is_property_uppercase (ch)
3312 || is_category_Lt (ch));
3313 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0);
3315 assert (result1 == result2);
3316 return result1;
3319 /* See DerivedCoreProperties.txt. */
3320 static bool
3321 is_property_case_ignorable (unsigned int ch)
3323 bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch)
3324 || ch == 0x0027
3325 || is_category_Mn (ch)
3326 || is_category_Me (ch)
3327 || is_category_Cf (ch)
3328 || is_category_Lm (ch)
3329 || is_category_Sk (ch));
3330 bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0);
3332 assert (result1 == result2);
3333 return result1;
3336 /* See DerivedCoreProperties.txt. */
3337 static bool
3338 is_property_changes_when_lowercased (unsigned int ch)
3340 bool result1 = ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0);
3341 bool result2 = (unicode_attributes[ch].name != NULL
3342 && unicode_attributes[ch].lower != NONE
3343 && unicode_attributes[ch].lower != ch);
3345 assert (result1 == result2);
3346 return result1;
3349 /* See DerivedCoreProperties.txt. */
3350 static bool
3351 is_property_changes_when_uppercased (unsigned int ch)
3353 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED)) != 0);
3356 /* See DerivedCoreProperties.txt. */
3357 static bool
3358 is_property_changes_when_titlecased (unsigned int ch)
3360 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_TITLECASED)) != 0);
3363 /* See DerivedCoreProperties.txt. */
3364 static bool
3365 is_property_changes_when_casefolded (unsigned int ch)
3367 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED)) != 0);
3370 /* See DerivedCoreProperties.txt. */
3371 static bool
3372 is_property_changes_when_casemapped (unsigned int ch)
3374 return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED)) != 0);
3377 /* See PropList.txt, UCD.html. */
3378 static bool
3379 is_property_soft_dotted (unsigned int ch)
3381 return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0);
3384 /* See DerivedCoreProperties.txt, UCD.html. */
3385 static bool
3386 is_property_id_start (unsigned int ch)
3388 return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0);
3391 /* See PropList.txt, UCD.html. */
3392 static bool
3393 is_property_other_id_start (unsigned int ch)
3395 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0);
3398 /* See DerivedCoreProperties.txt, UCD.html. */
3399 static bool
3400 is_property_id_continue (unsigned int ch)
3402 return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0);
3405 /* See PropList.txt, UCD.html. */
3406 static bool
3407 is_property_other_id_continue (unsigned int ch)
3409 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0);
3412 /* See DerivedCoreProperties.txt, UCD.html. */
3413 static bool
3414 is_property_xid_start (unsigned int ch)
3416 return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0);
3419 /* See DerivedCoreProperties.txt, UCD.html. */
3420 static bool
3421 is_property_xid_continue (unsigned int ch)
3423 return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0);
3426 /* See PropList.txt, UCD.html. */
3427 static bool
3428 is_property_id_compat_math_start (unsigned int ch)
3430 return ((unicode_properties[ch] & (1ULL << PROP_ID_COMPAT_MATH_START)) != 0);
3433 /* See PropList.txt, UCD.html. */
3434 static bool
3435 is_property_id_compat_math_continue (unsigned int ch)
3437 return ((unicode_properties[ch] & (1ULL << PROP_ID_COMPAT_MATH_CONTINUE)) != 0);
3440 /* See PropList.txt, UCD.html. */
3441 static bool
3442 is_property_pattern_white_space (unsigned int ch)
3444 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0);
3447 /* See PropList.txt, UCD.html. */
3448 static bool
3449 is_property_pattern_syntax (unsigned int ch)
3451 return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0);
3454 /* See PropList.txt, UCD.html. */
3455 static bool
3456 is_property_join_control (unsigned int ch)
3458 return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0);
3461 /* See DerivedCoreProperties.txt, UCD.html. */
3462 static bool
3463 is_property_grapheme_base (unsigned int ch)
3465 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0);
3468 /* See DerivedCoreProperties.txt, UCD.html. */
3469 static bool
3470 is_property_grapheme_extend (unsigned int ch)
3472 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0);
3475 /* See PropList.txt, UCD.html. */
3476 static bool
3477 is_property_other_grapheme_extend (unsigned int ch)
3479 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0);
3482 /* See DerivedCoreProperties.txt, UCD.html. */
3483 static bool
3484 is_property_grapheme_link (unsigned int ch)
3486 return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0);
3489 /* See PropList.txt, UCD.html. */
3490 static bool
3491 is_property_modifier_combining_mark (unsigned int ch)
3493 return ((unicode_properties[ch] & (1ULL << PROP_MODIFIER_COMBINING_MARK)) != 0);
3496 /* See PropList.txt, UCD.html. */
3497 static bool
3498 is_property_bidi_control (unsigned int ch)
3500 return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0);
3503 /* See PropList-3.0.1.txt. */
3504 static bool
3505 is_property_bidi_left_to_right (unsigned int ch)
3507 return (get_bidi_category (ch) == UC_BIDI_L);
3510 /* See PropList-3.0.1.txt. */
3511 static bool
3512 is_property_bidi_hebrew_right_to_left (unsigned int ch)
3514 return (get_bidi_category (ch) == UC_BIDI_R);
3517 /* See PropList-3.0.1.txt. */
3518 static bool
3519 is_property_bidi_arabic_right_to_left (unsigned int ch)
3521 return (get_bidi_category (ch) == UC_BIDI_AL);
3524 /* See PropList-3.0.1.txt. */
3525 static bool
3526 is_property_bidi_european_digit (unsigned int ch)
3528 return (get_bidi_category (ch) == UC_BIDI_EN);
3531 /* See PropList-3.0.1.txt. */
3532 static bool
3533 is_property_bidi_eur_num_separator (unsigned int ch)
3535 return (get_bidi_category (ch) == UC_BIDI_ES);
3538 /* See PropList-3.0.1.txt. */
3539 static bool
3540 is_property_bidi_eur_num_terminator (unsigned int ch)
3542 return (get_bidi_category (ch) == UC_BIDI_ET);
3545 /* See PropList-3.0.1.txt. */
3546 static bool
3547 is_property_bidi_arabic_digit (unsigned int ch)
3549 return (get_bidi_category (ch) == UC_BIDI_AN);
3552 /* See PropList-3.0.1.txt. */
3553 static bool
3554 is_property_bidi_common_separator (unsigned int ch)
3556 return (get_bidi_category (ch) == UC_BIDI_CS);
3559 /* See PropList-3.0.1.txt. */
3560 static bool
3561 is_property_bidi_block_separator (unsigned int ch)
3563 return (get_bidi_category (ch) == UC_BIDI_B);
3566 /* See PropList-3.0.1.txt. */
3567 static bool
3568 is_property_bidi_segment_separator (unsigned int ch)
3570 return (get_bidi_category (ch) == UC_BIDI_S);
3573 /* See PropList-3.0.1.txt. */
3574 static bool
3575 is_property_bidi_whitespace (unsigned int ch)
3577 return (get_bidi_category (ch) == UC_BIDI_WS);
3580 /* See PropList-3.0.1.txt. */
3581 static bool
3582 is_property_bidi_non_spacing_mark (unsigned int ch)
3584 return (get_bidi_category (ch) == UC_BIDI_NSM);
3587 /* See PropList-3.0.1.txt. */
3588 static bool
3589 is_property_bidi_boundary_neutral (unsigned int ch)
3591 return (get_bidi_category (ch) == UC_BIDI_BN);
3594 /* See PropList-3.0.1.txt. */
3595 static bool
3596 is_property_bidi_pdf (unsigned int ch)
3598 return (get_bidi_category (ch) == UC_BIDI_PDF);
3601 /* See PropList-3.0.1.txt. */
3602 static bool
3603 is_property_bidi_embedding_or_override (unsigned int ch)
3605 int category = get_bidi_category (ch);
3606 return (category == UC_BIDI_LRE || category == UC_BIDI_LRO
3607 || category == UC_BIDI_RLE || category == UC_BIDI_RLO);
3610 /* See PropList-3.0.1.txt. */
3611 static bool
3612 is_property_bidi_other_neutral (unsigned int ch)
3614 return (get_bidi_category (ch) == UC_BIDI_ON);
3617 /* See PropList.txt, UCD.html. */
3618 static bool
3619 is_property_hex_digit (unsigned int ch)
3621 return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0);
3624 /* See PropList.txt, UCD.html. */
3625 static bool
3626 is_property_ascii_hex_digit (unsigned int ch)
3628 return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0);
3631 /* See Unicode 3.0 book, section 4.10,
3632 PropList.txt, UCD.html. */
3633 static bool
3634 is_property_ideographic (unsigned int ch)
3636 return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0);
3639 /* See PropList.txt, UCD.html. */
3640 static bool
3641 is_property_unified_ideograph (unsigned int ch)
3643 return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0);
3646 /* See PropList.txt, UCD.html. */
3647 static bool
3648 is_property_radical (unsigned int ch)
3650 return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0);
3653 /* See PropList.txt, UCD.html. */
3654 static bool
3655 is_property_ids_unary_operator (unsigned int ch)
3657 return ((unicode_properties[ch] & (1ULL << PROP_IDS_UNARY_OPERATOR)) != 0);
3660 /* See PropList.txt, UCD.html. */
3661 static bool
3662 is_property_ids_binary_operator (unsigned int ch)
3664 return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0);
3667 /* See PropList.txt, UCD.html. */
3668 static bool
3669 is_property_ids_trinary_operator (unsigned int ch)
3671 return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0);
3674 /* See PropList-3.0.1.txt. */
3675 static bool
3676 is_property_zero_width (unsigned int ch)
3678 return is_category_Cf (ch)
3679 || (unicode_attributes[ch].name != NULL
3680 && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL);
/* Tell whether CH is in general category Zs.  See PropList-3.0.1.txt.  */
static bool
is_property_space (unsigned int ch)
{
  bool in_category = is_category_Zs (ch);

  return in_category;
}
/* Tell whether CH is one of the hard-coded non-breaking characters.
   See PropList-3.0.1.txt.  */
static bool
is_property_non_break (unsigned int ch)
{
  /* This is exactly the set of characters having line breaking
     property GL.  */
  switch (ch)
    {
    case 0x00A0: /* NO-BREAK SPACE */
    case 0x034F: /* COMBINING GRAPHEME JOINER */
    case 0x035C: /* COMBINING DOUBLE BREVE BELOW */
    case 0x035D: /* COMBINING DOUBLE BREVE */
    case 0x035E: /* COMBINING DOUBLE MACRON */
    case 0x035F: /* COMBINING DOUBLE MACRON BELOW */
    case 0x0360: /* COMBINING DOUBLE TILDE */
    case 0x0361: /* COMBINING DOUBLE INVERTED BREVE */
    case 0x0362: /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */
    case 0x0F08: /* TIBETAN MARK SBRUL SHAD */
    case 0x0F0C: /* TIBETAN MARK DELIMITER TSHEG BSTAR */
    case 0x0F12: /* TIBETAN MARK RGYA GRAM SHAD */
    case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
    case 0x2007: /* FIGURE SPACE */
    case 0x2011: /* NON-BREAKING HYPHEN */
    case 0x202F: /* NARROW NO-BREAK SPACE */
      return true;
    default:
      return false;
    }
}
3714 /* See PropList-3.0.1.txt. */
3715 static bool
3716 is_property_iso_control (unsigned int ch)
3718 bool result1 =
3719 (unicode_attributes[ch].name != NULL
3720 && strcmp (unicode_attributes[ch].name, "<control>") == 0);
3721 bool result2 =
3722 is_category_Cc (ch);
3724 assert (result1 == result2);
3725 return result1;
3728 /* See PropList-3.0.1.txt. */
3729 static bool
3730 is_property_format_control (unsigned int ch)
3732 return (is_category_Cf (ch)
3733 && get_bidi_category (ch) == UC_BIDI_BN
3734 && !is_property_join_control (ch)
3735 && ch != 0xFEFF);
3738 /* See PropList.txt, UCD.html. */
3739 static bool
3740 is_property_prepended_concatenation_mark (unsigned int ch)
3742 return ((unicode_properties[ch] & (1ULL << PROP_PREPENDED_CONCATENATION_MARK)) != 0);
3745 /* See PropList.txt, UCD.html. */
3746 static bool
3747 is_property_dash (unsigned int ch)
3749 return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0);
3752 /* See PropList.txt, UCD.html. */
3753 static bool
3754 is_property_hyphen (unsigned int ch)
3756 return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0);
/* Tell whether CH is in general category P.  See PropList-3.0.1.txt.  */
static bool
is_property_punctuation (unsigned int ch)
{
  bool in_category = is_category_P (ch);

  return in_category;
}
/* Tell whether CH is in general category Zl.  See PropList-3.0.1.txt.  */
static bool
is_property_line_separator (unsigned int ch)
{
  bool in_category = is_category_Zl (ch);

  return in_category;
}
/* Tell whether CH is in general category Zp.  See PropList-3.0.1.txt.  */
static bool
is_property_paragraph_separator (unsigned int ch)
{
  bool in_category = is_category_Zp (ch);

  return in_category;
}
3780 /* See PropList.txt, UCD.html. */
3781 static bool
3782 is_property_quotation_mark (unsigned int ch)
3784 return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0);
3787 /* See PropList.txt, UCD.html. */
3788 static bool
3789 is_property_sentence_terminal (unsigned int ch)
3791 return ((unicode_properties[ch] & (1ULL << PROP_SENTENCE_TERMINAL)) != 0);
3794 /* See PropList.txt, UCD.html. */
3795 static bool
3796 is_property_terminal_punctuation (unsigned int ch)
3798 return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0);
/* Tell whether CH is in general category Sc.  See PropList-3.0.1.txt.  */
static bool
is_property_currency_symbol (unsigned int ch)
{
  bool in_category = is_category_Sc (ch);

  return in_category;
}
3808 /* See Unicode 3.0 book, section 4.9,
3809 PropList.txt, UCD.html,
3810 DerivedCoreProperties.txt, UCD.html. */
3811 static bool
3812 is_property_math (unsigned int ch)
3814 bool result1 =
3815 is_category_Sm (ch)
3816 || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3817 bool result2 =
3818 ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0);
3820 assert (result1 == result2);
3821 return result1;
3824 /* See PropList.txt, UCD.html. */
3825 static bool
3826 is_property_other_math (unsigned int ch)
3828 return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0);
3831 /* See PropList-3.0.1.txt. */
3832 static bool
3833 is_property_paired_punctuation (unsigned int ch)
3835 return unicode_pairedpunctuation[ch];
3838 /* See PropList-3.0.1.txt. */
3839 static bool
3840 is_property_left_of_pair (unsigned int ch)
3842 return unicode_leftofpair[ch];
3845 /* See PropList-3.0.1.txt. */
3846 static bool
3847 is_property_combining (unsigned int ch)
3849 return (unicode_attributes[ch].name != NULL
3850 && (strcmp (unicode_attributes[ch].combining, "0") != 0
3851 || is_category_Mc (ch)
3852 || is_category_Me (ch)
3853 || is_category_Mn (ch)));
#if 0 /* same as is_property_bidi_non_spacing_mark */
/* Tell whether CH is a named character with bidi category UC_BIDI_NSM.
   Compiled out.  See PropList-3.0.1.txt.  */
static bool
is_property_non_spacing (unsigned int ch)
{
  if (unicode_attributes[ch].name == NULL)
    return false;
  return get_bidi_category (ch) == UC_BIDI_NSM;
}
#endif
3866 /* See PropList-3.0.1.txt. */
3867 static bool
3868 is_property_composite (unsigned int ch)
3870 /* This definition differs from the one in PropList-3.0.1.txt, but is more
3871 logical in some sense. */
3872 if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */
3873 return true;
3874 if (unicode_attributes[ch].name != NULL
3875 && unicode_attributes[ch].decomposition != NULL)
3877 /* Test whether the decomposition contains more than one character,
3878 and the first is not a space. */
3879 const char *decomp = unicode_attributes[ch].decomposition;
3880 if (decomp[0] == '<')
3882 decomp = strchr (decomp, '>') + 1;
3883 if (decomp[0] == ' ')
3884 decomp++;
3886 return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0;
3888 return false;
/* Tell whether CH is in general category Nd.  See PropList-3.0.1.txt.  */
static bool
is_property_decimal_digit (unsigned int ch)
{
  bool in_category = is_category_Nd (ch);

  return in_category;
}
3898 /* See PropList-3.0.1.txt. */
3899 static bool
3900 is_property_numeric (unsigned int ch)
3902 return ((get_numeric_value (ch)).denominator > 0)
3903 || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */
3904 || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */
3907 /* See PropList.txt, UCD.html. */
3908 static bool
3909 is_property_diacritic (unsigned int ch)
3911 return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0);
3914 /* See PropList.txt, UCD.html. */
3915 static bool
3916 is_property_extender (unsigned int ch)
3918 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0);
3921 /* See PropList-3.0.1.txt. */
3922 static bool
3923 is_property_ignorable_control (unsigned int ch)
3925 return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN)
3926 || is_category_Cf (ch))
3927 && ch != 0x0000;
3930 /* See PropList.txt, UCD.html. */
3931 static bool
3932 is_property_regional_indicator (unsigned int ch)
3934 return ((unicode_properties[ch] & (1ULL << PROP_REGIONAL_INDICATOR)) != 0);
3937 /* See emoji-data.txt, UTS #51. */
3938 static bool
3939 is_property_emoji (unsigned int ch)
3941 return ((unicode_properties[ch] & (1ULL << PROP_EMOJI)) != 0);
3944 /* See emoji-data.txt, UTS #51. */
3945 static bool
3946 is_property_emoji_presentation (unsigned int ch)
3948 return ((unicode_properties[ch] & (1ULL << PROP_EMOJI_PRESENTATION)) != 0);
3951 /* See emoji-data.txt, UTS #51. */
3952 static bool
3953 is_property_emoji_modifier (unsigned int ch)
3955 return ((unicode_properties[ch] & (1ULL << PROP_EMOJI_MODIFIER)) != 0);
3958 /* See emoji-data.txt, UTS #51. */
3959 static bool
3960 is_property_emoji_modifier_base (unsigned int ch)
3962 return ((unicode_properties[ch] & (1ULL << PROP_EMOJI_MODIFIER_BASE)) != 0);
3965 /* See emoji-data.txt, UTS #51. */
3966 static bool
3967 is_property_emoji_component (unsigned int ch)
3969 return ((unicode_properties[ch] & (1ULL << PROP_EMOJI_COMPONENT)) != 0);
3972 /* See emoji-data.txt, UTS #51. */
3973 static bool
3974 is_property_extended_pictographic (unsigned int ch)
3976 return ((unicode_properties[ch] & (1ULL << PROP_EXTENDED_PICTOGRAPHIC)) != 0);
3979 /* ------------------------------------------------------------------------- */
3981 /* Output all properties. */
3982 static void
3983 output_properties (const char *version)
3985 #define PROPERTY(P) \
3986 debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \
3987 output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \
3988 output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version);
3989 PROPERTY(white_space)
3990 PROPERTY(alphabetic)
3991 PROPERTY(other_alphabetic)
3992 PROPERTY(not_a_character)
3993 PROPERTY(default_ignorable_code_point)
3994 PROPERTY(other_default_ignorable_code_point)
3995 PROPERTY(deprecated)
3996 PROPERTY(logical_order_exception)
3997 PROPERTY(variation_selector)
3998 PROPERTY(private_use)
3999 PROPERTY(unassigned_code_value)
4000 PROPERTY(uppercase)
4001 PROPERTY(other_uppercase)
4002 PROPERTY(lowercase)
4003 PROPERTY(other_lowercase)
4004 PROPERTY(titlecase)
4005 PROPERTY(cased)
4006 PROPERTY(case_ignorable)
4007 PROPERTY(changes_when_lowercased)
4008 PROPERTY(changes_when_uppercased)
4009 PROPERTY(changes_when_titlecased)
4010 PROPERTY(changes_when_casefolded)
4011 PROPERTY(changes_when_casemapped)
4012 PROPERTY(soft_dotted)
4013 PROPERTY(id_start)
4014 PROPERTY(other_id_start)
4015 PROPERTY(id_continue)
4016 PROPERTY(other_id_continue)
4017 PROPERTY(xid_start)
4018 PROPERTY(xid_continue)
4019 PROPERTY(id_compat_math_start)
4020 PROPERTY(id_compat_math_continue)
4021 PROPERTY(pattern_white_space)
4022 PROPERTY(pattern_syntax)
4023 PROPERTY(join_control)
4024 PROPERTY(grapheme_base)
4025 PROPERTY(grapheme_extend)
4026 PROPERTY(other_grapheme_extend)
4027 PROPERTY(grapheme_link)
4028 PROPERTY(modifier_combining_mark)
4029 PROPERTY(bidi_control)
4030 PROPERTY(bidi_left_to_right)
4031 PROPERTY(bidi_hebrew_right_to_left)
4032 PROPERTY(bidi_arabic_right_to_left)
4033 PROPERTY(bidi_european_digit)
4034 PROPERTY(bidi_eur_num_separator)
4035 PROPERTY(bidi_eur_num_terminator)
4036 PROPERTY(bidi_arabic_digit)
4037 PROPERTY(bidi_common_separator)
4038 PROPERTY(bidi_block_separator)
4039 PROPERTY(bidi_segment_separator)
4040 PROPERTY(bidi_whitespace)
4041 PROPERTY(bidi_non_spacing_mark)
4042 PROPERTY(bidi_boundary_neutral)
4043 PROPERTY(bidi_pdf)
4044 PROPERTY(bidi_embedding_or_override)
4045 PROPERTY(bidi_other_neutral)
4046 PROPERTY(hex_digit)
4047 PROPERTY(ascii_hex_digit)
4048 PROPERTY(ideographic)
4049 PROPERTY(unified_ideograph)
4050 PROPERTY(radical)
4051 PROPERTY(ids_unary_operator)
4052 PROPERTY(ids_binary_operator)
4053 PROPERTY(ids_trinary_operator)
4054 PROPERTY(zero_width)
4055 PROPERTY(space)
4056 PROPERTY(non_break)
4057 PROPERTY(iso_control)
4058 PROPERTY(format_control)
4059 PROPERTY(prepended_concatenation_mark)
4060 PROPERTY(dash)
4061 PROPERTY(hyphen)
4062 PROPERTY(punctuation)
4063 PROPERTY(line_separator)
4064 PROPERTY(paragraph_separator)
4065 PROPERTY(quotation_mark)
4066 PROPERTY(sentence_terminal)
4067 PROPERTY(terminal_punctuation)
4068 PROPERTY(currency_symbol)
4069 PROPERTY(math)
4070 PROPERTY(other_math)
4071 PROPERTY(paired_punctuation)
4072 PROPERTY(left_of_pair)
4073 PROPERTY(combining)
4074 PROPERTY(composite)
4075 PROPERTY(decimal_digit)
4076 PROPERTY(numeric)
4077 PROPERTY(diacritic)
4078 PROPERTY(extender)
4079 PROPERTY(ignorable_control)
4080 PROPERTY(regional_indicator)
4081 PROPERTY(emoji)
4082 PROPERTY(emoji_presentation)
4083 PROPERTY(emoji_modifier)
4084 PROPERTY(emoji_modifier_base)
4085 PROPERTY(emoji_component)
4086 PROPERTY(extended_pictographic)
4087 #undef PROPERTY
4090 /* ------------------------------------------------------------------------- */
4092 /* Convert an Indic_Conjunct_Break value to a C identifier. */
4093 static const char *
4094 indic_conjunct_break_as_c_identifier (int indic_conjunct_break)
4096 #define TRY(value) if (indic_conjunct_break == value) return #value;
4097 TRY(UC_INDIC_CONJUNCT_BREAK_NONE)
4098 TRY(UC_INDIC_CONJUNCT_BREAK_CONSONANT)
4099 TRY(UC_INDIC_CONJUNCT_BREAK_LINKER)
4100 TRY(UC_INDIC_CONJUNCT_BREAK_EXTEND)
4101 #undef TRY
4102 abort ();
4105 static void
4106 output_indic_conjunct_break_test (const char *filename, const char *version)
4108 FILE *stream;
4109 bool need_comma;
4110 unsigned int ch;
4112 stream = fopen (filename, "w");
4113 if (stream == NULL)
4115 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4116 exit (1);
4119 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4120 fprintf (stream, "/* Indic_Conjunct_Break attribute of Unicode characters. */\n");
4121 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4122 version);
4123 fprintf (stream, "\n");
4125 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
4126 fprintf (stream, "\n");
4127 output_tests_license (stream);
4128 fprintf (stream, "\n");
4130 need_comma = false;
4131 for (ch = 0; ch < 0x110000; ch++)
4133 int value = unicode_indic_conjunct_break[ch];
4135 if (value != UC_INDIC_CONJUNCT_BREAK_NONE)
4137 if (need_comma)
4138 fprintf (stream, ",\n");
4139 fprintf (stream, " { 0x%04X, %s }", ch, indic_conjunct_break_as_c_identifier (value));
4140 need_comma = true;
4143 if (need_comma)
4144 fprintf (stream, "\n");
4146 if (ferror (stream) || fclose (stream))
4148 fprintf (stderr, "error writing to '%s'\n", filename);
4149 exit (1);
4153 /* Construction of sparse 3-level tables. */
4154 #define TABLE indic_conjunct_break_table
4155 #define ELEMENT uint8_t
4156 #define DEFAULT UC_INDIC_CONJUNCT_BREAK_NONE
4157 #define xmalloc malloc
4158 #define xrealloc realloc
4159 #include "3level.h"
4161 static void
4162 output_indic_conjunct_break (const char *filename, const char *version)
4164 FILE *stream;
4165 unsigned int ch, i;
4166 struct indic_conjunct_break_table t;
4167 unsigned int level1_offset, level2_offset, level3_offset;
4169 stream = fopen (filename, "w");
4170 if (stream == NULL)
4172 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4173 exit (1);
4176 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4177 fprintf (stream, "/* Indic_Conjunct_Break attribute of Unicode characters. */\n");
4178 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4179 version);
4180 fprintf (stream, "\n");
4182 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
4183 fprintf (stream, "\n");
4184 output_library_license (stream, false);
4185 fprintf (stream, "\n");
4187 t.p = 6; /* or 5 */
4188 t.q = 4; /* or 5 */
4189 indic_conjunct_break_table_init (&t);
4191 for (ch = 0; ch < 0x110000; ch++)
4193 uint8_t value = unicode_indic_conjunct_break[ch];
4195 assert (value <= 0x03);
4197 if (value != UC_INDIC_CONJUNCT_BREAK_NONE)
4198 indic_conjunct_break_table_add (&t, ch, value);
4201 indic_conjunct_break_table_finalize (&t);
4203 /* Offsets in t.result, in memory of this process. */
4204 level1_offset =
4205 5 * sizeof (uint32_t);
4206 level2_offset =
4207 5 * sizeof (uint32_t)
4208 + t.level1_size * sizeof (uint32_t);
4209 level3_offset =
4210 5 * sizeof (uint32_t)
4211 + t.level1_size * sizeof (uint32_t)
4212 + (t.level2_size << t.q) * sizeof (uint32_t);
4214 for (i = 0; i < 5; i++)
4215 fprintf (stream, "#define indic_conjunct_break_header_%d %d\n", i,
4216 ((uint32_t *) t.result)[i]);
4217 fprintf (stream, "static const\n");
4218 fprintf (stream, "struct\n");
4219 fprintf (stream, " {\n");
4220 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4221 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4222 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
4223 (1 << t.p) * 2 / 16);
4224 fprintf (stream, " }\n");
4225 fprintf (stream, "u_indic_conjunct_break =\n");
4226 fprintf (stream, "{\n");
4227 fprintf (stream, " {");
4228 if (t.level1_size > 8)
4229 fprintf (stream, "\n ");
4230 for (i = 0; i < t.level1_size; i++)
4232 uint32_t offset;
4233 if (i > 0 && (i % 8) == 0)
4234 fprintf (stream, "\n ");
4235 offset = ((uint32_t *) (t.result + level1_offset))[i];
4236 if (offset == 0)
4237 fprintf (stream, " %5d", -1);
4238 else
4239 fprintf (stream, " %5zu",
4240 (offset - level2_offset) / sizeof (uint32_t));
4241 if (i+1 < t.level1_size)
4242 fprintf (stream, ",");
4244 if (t.level1_size > 8)
4245 fprintf (stream, "\n ");
4246 fprintf (stream, " },\n");
4247 fprintf (stream, " {");
4248 if (t.level2_size << t.q > 8)
4249 fprintf (stream, "\n ");
4250 for (i = 0; i < t.level2_size << t.q; i++)
4252 uint32_t offset;
4253 if (i > 0 && (i % 8) == 0)
4254 fprintf (stream, "\n ");
4255 offset = ((uint32_t *) (t.result + level2_offset))[i];
4256 if (offset == 0)
4257 fprintf (stream, " %5d", -1);
4258 else
4259 fprintf (stream, " %5zu",
4260 (offset - level3_offset) / sizeof (uint8_t));
4261 if (i+1 < t.level2_size << t.q)
4262 fprintf (stream, ",");
4264 if (t.level2_size << t.q > 8)
4265 fprintf (stream, "\n ");
4266 fprintf (stream, " },\n");
4267 /* Pack the level3 array. Each entry needs 2 bits only. */
4268 fprintf (stream, " {");
4269 if ((t.level3_size << t.p) * 2 / 16 > 8)
4270 fprintf (stream, "\n ");
4271 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
4273 if (i > 0 && (i % 8) == 0)
4274 fprintf (stream, "\n ");
4275 fprintf (stream, " 0x%04x",
4276 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
4277 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
4278 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
4279 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
4280 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
4281 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
4282 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
4283 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
4284 if (i+1 < (t.level3_size << t.p) * 2 / 16)
4285 fprintf (stream, ",");
4287 if ((t.level3_size << t.p) * 2 / 16 > 8)
4288 fprintf (stream, "\n ");
4289 fprintf (stream, " }\n");
4290 fprintf (stream, "};\n");
4292 if (ferror (stream) || fclose (stream))
4294 fprintf (stderr, "error writing to '%s'\n", filename);
4295 exit (1);
4299 /* ========================================================================= */
4301 /* Arabic Shaping. */
/* Joining type values.  The numbering starts at 0 and follows the order of
   declaration.  */
enum
{
  UC_JOINING_TYPE_U = 0, /* Non_Joining */
  UC_JOINING_TYPE_T = 1, /* Transparent */
  UC_JOINING_TYPE_C = 2, /* Join_Causing */
  UC_JOINING_TYPE_L = 3, /* Left_Joining */
  UC_JOINING_TYPE_R = 4, /* Right_Joining */
  UC_JOINING_TYPE_D = 5  /* Dual_Joining */
};

/* One byte per index 0..0x10FFFF; presumably holds the UC_JOINING_TYPE_*
   value of each code point (filled in elsewhere in this file).  */
static uint8_t unicode_joining_type[0x110000];
4315 enum
4317 UC_JOINING_GROUP_NONE, /* No_Joining_Group */
4318 UC_JOINING_GROUP_AIN, /* Ain */
4319 UC_JOINING_GROUP_ALAPH, /* Alaph */
4320 UC_JOINING_GROUP_ALEF, /* Alef */
4321 UC_JOINING_GROUP_BEH, /* Beh */
4322 UC_JOINING_GROUP_BETH, /* Beth */
4323 UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */
4324 UC_JOINING_GROUP_DAL, /* Dal */
4325 UC_JOINING_GROUP_DALATH_RISH, /* Dalath_Rish */
4326 UC_JOINING_GROUP_E, /* E */
4327 UC_JOINING_GROUP_FARSI_YEH, /* Farsi_Yeh */
4328 UC_JOINING_GROUP_FE, /* Fe */
4329 UC_JOINING_GROUP_FEH, /* Feh */
4330 UC_JOINING_GROUP_FINAL_SEMKATH, /* Final_Semkath */
4331 UC_JOINING_GROUP_GAF, /* Gaf */
4332 UC_JOINING_GROUP_GAMAL, /* Gamal */
4333 UC_JOINING_GROUP_HAH, /* Hah */
4334 UC_JOINING_GROUP_HE, /* He */
4335 UC_JOINING_GROUP_HEH, /* Heh */
4336 UC_JOINING_GROUP_HEH_GOAL, /* Heh_Goal */
4337 UC_JOINING_GROUP_HETH, /* Heth */
4338 UC_JOINING_GROUP_KAF, /* Kaf */
4339 UC_JOINING_GROUP_KAPH, /* Kaph */
4340 UC_JOINING_GROUP_KHAPH, /* Khaph */
4341 UC_JOINING_GROUP_KNOTTED_HEH, /* Knotted_Heh */
4342 UC_JOINING_GROUP_LAM, /* Lam */
4343 UC_JOINING_GROUP_LAMADH, /* Lamadh */
4344 UC_JOINING_GROUP_MEEM, /* Meem */
4345 UC_JOINING_GROUP_MIM, /* Mim */
4346 UC_JOINING_GROUP_NOON, /* Noon */
4347 UC_JOINING_GROUP_NUN, /* Nun */
4348 UC_JOINING_GROUP_NYA, /* Nya */
4349 UC_JOINING_GROUP_PE, /* Pe */
4350 UC_JOINING_GROUP_QAF, /* Qaf */
4351 UC_JOINING_GROUP_QAPH, /* Qaph */
4352 UC_JOINING_GROUP_REH, /* Reh */
4353 UC_JOINING_GROUP_REVERSED_PE, /* Reversed_Pe */
4354 UC_JOINING_GROUP_SAD, /* Sad */
4355 UC_JOINING_GROUP_SADHE, /* Sadhe */
4356 UC_JOINING_GROUP_SEEN, /* Seen */
4357 UC_JOINING_GROUP_SEMKATH, /* Semkath */
4358 UC_JOINING_GROUP_SHIN, /* Shin */
4359 UC_JOINING_GROUP_SWASH_KAF, /* Swash_Kaf */
4360 UC_JOINING_GROUP_SYRIAC_WAW, /* Syriac_Waw */
4361 UC_JOINING_GROUP_TAH, /* Tah */
4362 UC_JOINING_GROUP_TAW, /* Taw */
4363 UC_JOINING_GROUP_TEH_MARBUTA, /* Teh_Marbuta */
4364 UC_JOINING_GROUP_TEH_MARBUTA_GOAL, /* Teh_Marbuta_Goal */
4365 UC_JOINING_GROUP_TETH, /* Teth */
4366 UC_JOINING_GROUP_WAW, /* Waw */
4367 UC_JOINING_GROUP_YEH, /* Yeh */
4368 UC_JOINING_GROUP_YEH_BARREE, /* Yeh_Barree */
4369 UC_JOINING_GROUP_YEH_WITH_TAIL, /* Yeh_With_Tail */
4370 UC_JOINING_GROUP_YUDH, /* Yudh */
4371 UC_JOINING_GROUP_YUDH_HE, /* Yudh_He */
4372 UC_JOINING_GROUP_ZAIN, /* Zain */
4373 UC_JOINING_GROUP_ZHAIN, /* Zhain */
4374 UC_JOINING_GROUP_ROHINGYA_YEH, /* Rohingya_Yeh */
4375 UC_JOINING_GROUP_STRAIGHT_WAW, /* Straight_Waw */
4376 UC_JOINING_GROUP_MANICHAEAN_ALEPH, /* Manichaean_Aleph */
4377 UC_JOINING_GROUP_MANICHAEAN_BETH, /* Manichaean_Beth */
4378 UC_JOINING_GROUP_MANICHAEAN_GIMEL, /* Manichaean_Gimel */
4379 UC_JOINING_GROUP_MANICHAEAN_DALETH, /* Manichaean_Daleth */
4380 UC_JOINING_GROUP_MANICHAEAN_WAW, /* Manichaean_Waw */
4381 UC_JOINING_GROUP_MANICHAEAN_ZAYIN, /* Manichaean_Zayin */
4382 UC_JOINING_GROUP_MANICHAEAN_HETH, /* Manichaean_Heth */
4383 UC_JOINING_GROUP_MANICHAEAN_TETH, /* Manichaean_Teth */
4384 UC_JOINING_GROUP_MANICHAEAN_YODH, /* Manichaean_Yodh */
4385 UC_JOINING_GROUP_MANICHAEAN_KAPH, /* Manichaean_Kaph */
4386 UC_JOINING_GROUP_MANICHAEAN_LAMEDH, /* Manichaean_Lamedh */
4387 UC_JOINING_GROUP_MANICHAEAN_DHAMEDH, /* Manichaean_Dhamedh */
4388 UC_JOINING_GROUP_MANICHAEAN_THAMEDH, /* Manichaean_Thamedh */
4389 UC_JOINING_GROUP_MANICHAEAN_MEM, /* Manichaean_Mem */
4390 UC_JOINING_GROUP_MANICHAEAN_NUN, /* Manichaean_Nun */
4391 UC_JOINING_GROUP_MANICHAEAN_SAMEKH, /* Manichaean_Samekh */
4392 UC_JOINING_GROUP_MANICHAEAN_AYIN, /* Manichaean_Ayin */
4393 UC_JOINING_GROUP_MANICHAEAN_PE, /* Manichaean_Pe */
4394 UC_JOINING_GROUP_MANICHAEAN_SADHE, /* Manichaean_Sadhe */
4395 UC_JOINING_GROUP_MANICHAEAN_QOPH, /* Manichaean_Qoph */
4396 UC_JOINING_GROUP_MANICHAEAN_RESH, /* Manichaean_Resh */
4397 UC_JOINING_GROUP_MANICHAEAN_TAW, /* Manichaean_Taw */
4398 UC_JOINING_GROUP_MANICHAEAN_ONE, /* Manichaean_One */
4399 UC_JOINING_GROUP_MANICHAEAN_FIVE, /* Manichaean_Five */
4400 UC_JOINING_GROUP_MANICHAEAN_TEN, /* Manichaean_Ten */
4401 UC_JOINING_GROUP_MANICHAEAN_TWENTY, /* Manichaean_Twenty */
4402 UC_JOINING_GROUP_MANICHAEAN_HUNDRED, /* Manichaean_Hundred */
4403 UC_JOINING_GROUP_AFRICAN_FEH, /* African_Feh */
4404 UC_JOINING_GROUP_AFRICAN_QAF, /* African_Qaf */
4405 UC_JOINING_GROUP_AFRICAN_NOON, /* African_Noon */
4406 UC_JOINING_GROUP_MALAYALAM_NGA, /* Malayalam_Nga */
4407 UC_JOINING_GROUP_MALAYALAM_JA, /* Malayalam_Ja */
4408 UC_JOINING_GROUP_MALAYALAM_NYA, /* Malayalam_Nya */
4409 UC_JOINING_GROUP_MALAYALAM_TTA, /* Malayalam_Tta */
4410 UC_JOINING_GROUP_MALAYALAM_NNA, /* Malayalam_Nna */
4411 UC_JOINING_GROUP_MALAYALAM_NNNA, /* Malayalam_Nnna */
4412 UC_JOINING_GROUP_MALAYALAM_BHA, /* Malayalam_Bha */
4413 UC_JOINING_GROUP_MALAYALAM_RA, /* Malayalam_Ra */
4414 UC_JOINING_GROUP_MALAYALAM_LLA, /* Malayalam_Lla */
4415 UC_JOINING_GROUP_MALAYALAM_LLLA, /* Malayalam_Llla */
4416 UC_JOINING_GROUP_MALAYALAM_SSA, /* Malayalam_Ssa */
4417 UC_JOINING_GROUP_HANIFI_ROHINGYA_PA, /* Hanifi_Rohingya_Pa */
4418 UC_JOINING_GROUP_HANIFI_ROHINGYA_KINNA_YA, /* Hanifi_Rohingya_Kinna_Ya */
4419 UC_JOINING_GROUP_THIN_YEH, /* Thin_Yeh */
4420 UC_JOINING_GROUP_VERTICAL_TAIL, /* Vertical_Tail */
4421 UC_JOINING_GROUP_KASHMIRI_YEH /* Kashmiri_Yeh */
/* Joining group of each code point, indexed by code point.
   fill_arabicshaping resets every entry to UC_JOINING_GROUP_NONE and then
   stores the value parsed for each code point listed in its input file.  */
4424 static uint8_t unicode_joining_group[0x110000];
4426 static void
4427 fill_arabicshaping (const char *arabicshaping_filename)
4429 FILE *stream;
4430 unsigned int i;
4431 int lineno;
4433 stream = fopen (arabicshaping_filename, "r");
4434 if (stream == NULL)
4436 fprintf (stderr, "error during fopen of '%s'\n", arabicshaping_filename);
4437 exit (1);
4440 for (i = 0; i < 0x110000; i++)
4442 unicode_joining_type[i] = (uint8_t)~(uint8_t)0;
4443 unicode_joining_group[i] = UC_JOINING_GROUP_NONE;
4446 lineno = 0;
4447 for (;;)
4449 char buf[200+1];
4450 char separator1[200+1];
4451 char schematic_name[200+1];
4452 char separator2[200+1];
4453 char joining_type_name[200+1];
4454 char separator3[200+1];
4455 char joining_group_name[200+1];
4456 int joining_type;
4457 int joining_group;
4459 lineno++;
4460 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
4461 break;
4463 if (buf[0] == '\0' || buf[0] == '#')
4464 continue;
4466 if (sscanf (buf, "%X%[; ]%[^;]%[; ]%[^;]%[; ]%100[^\n]",
4467 &i, separator1, schematic_name, separator2, joining_type_name,
4468 separator3, joining_group_name) != 7)
4470 fprintf (stderr, "parse error in '%s':%d\n",
4471 arabicshaping_filename, lineno);
4472 exit (1);
4474 assert (i < 0x110000);
4476 #define TRY(name) else if (strcmp (joining_type_name, #name + 16) == 0) joining_type = name;
4477 if (false) {}
4478 TRY(UC_JOINING_TYPE_U)
4479 TRY(UC_JOINING_TYPE_T)
4480 TRY(UC_JOINING_TYPE_C)
4481 TRY(UC_JOINING_TYPE_L)
4482 TRY(UC_JOINING_TYPE_R)
4483 TRY(UC_JOINING_TYPE_D)
4484 #undef TRY
4485 else
4487 fprintf (stderr, "unknown joining type value \"%s\" in '%s':%d\n",
4488 joining_type_name, arabicshaping_filename, lineno);
4489 exit (1);
4492 /* Remove trailing spaces. */
4493 while (joining_group_name[0] != '\0'
4494 && joining_group_name[strlen (joining_group_name) - 1] == ' ')
4495 joining_group_name[strlen (joining_group_name) - 1] = '\0';
4497 #define TRY(value,name) else if (strcmp (joining_group_name, name) == 0) joining_group = value;
4498 if (false) {}
4499 TRY(UC_JOINING_GROUP_NONE, "No_Joining_Group")
4500 TRY(UC_JOINING_GROUP_AIN, "AIN")
4501 TRY(UC_JOINING_GROUP_ALAPH, "ALAPH")
4502 TRY(UC_JOINING_GROUP_ALEF, "ALEF")
4503 TRY(UC_JOINING_GROUP_BEH, "BEH")
4504 TRY(UC_JOINING_GROUP_BETH, "BETH")
4505 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, "BURUSHASKI YEH BARREE")
4506 TRY(UC_JOINING_GROUP_DAL, "DAL")
4507 TRY(UC_JOINING_GROUP_DALATH_RISH, "DALATH RISH")
4508 TRY(UC_JOINING_GROUP_E, "E")
4509 TRY(UC_JOINING_GROUP_FARSI_YEH, "FARSI YEH")
4510 TRY(UC_JOINING_GROUP_FE, "FE")
4511 TRY(UC_JOINING_GROUP_FEH, "FEH")
4512 TRY(UC_JOINING_GROUP_FINAL_SEMKATH, "FINAL SEMKATH")
4513 TRY(UC_JOINING_GROUP_GAF, "GAF")
4514 TRY(UC_JOINING_GROUP_GAMAL, "GAMAL")
4515 TRY(UC_JOINING_GROUP_HAH, "HAH")
4516 TRY(UC_JOINING_GROUP_HE, "HE")
4517 TRY(UC_JOINING_GROUP_HEH, "HEH")
4518 TRY(UC_JOINING_GROUP_HEH_GOAL, "HEH GOAL")
4519 TRY(UC_JOINING_GROUP_HETH, "HETH")
4520 TRY(UC_JOINING_GROUP_KAF, "KAF")
4521 TRY(UC_JOINING_GROUP_KAPH, "KAPH")
4522 TRY(UC_JOINING_GROUP_KHAPH, "KHAPH")
4523 TRY(UC_JOINING_GROUP_KNOTTED_HEH, "KNOTTED HEH")
4524 TRY(UC_JOINING_GROUP_LAM, "LAM")
4525 TRY(UC_JOINING_GROUP_LAMADH, "LAMADH")
4526 TRY(UC_JOINING_GROUP_MEEM, "MEEM")
4527 TRY(UC_JOINING_GROUP_MIM, "MIM")
4528 TRY(UC_JOINING_GROUP_NOON, "NOON")
4529 TRY(UC_JOINING_GROUP_NUN, "NUN")
4530 TRY(UC_JOINING_GROUP_NYA, "NYA")
4531 TRY(UC_JOINING_GROUP_PE, "PE")
4532 TRY(UC_JOINING_GROUP_QAF, "QAF")
4533 TRY(UC_JOINING_GROUP_QAPH, "QAPH")
4534 TRY(UC_JOINING_GROUP_REH, "REH")
4535 TRY(UC_JOINING_GROUP_REVERSED_PE, "REVERSED PE")
4536 TRY(UC_JOINING_GROUP_SAD, "SAD")
4537 TRY(UC_JOINING_GROUP_SADHE, "SADHE")
4538 TRY(UC_JOINING_GROUP_SEEN, "SEEN")
4539 TRY(UC_JOINING_GROUP_SEMKATH, "SEMKATH")
4540 TRY(UC_JOINING_GROUP_SHIN, "SHIN")
4541 TRY(UC_JOINING_GROUP_SWASH_KAF, "SWASH KAF")
4542 TRY(UC_JOINING_GROUP_SYRIAC_WAW, "SYRIAC WAW")
4543 TRY(UC_JOINING_GROUP_TAH, "TAH")
4544 TRY(UC_JOINING_GROUP_TAW, "TAW")
4545 TRY(UC_JOINING_GROUP_TEH_MARBUTA, "TEH MARBUTA")
4546 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL, "TEH MARBUTA GOAL")
4547 TRY(UC_JOINING_GROUP_TETH, "TETH")
4548 TRY(UC_JOINING_GROUP_WAW, "WAW")
4549 TRY(UC_JOINING_GROUP_YEH, "YEH")
4550 TRY(UC_JOINING_GROUP_YEH_BARREE, "YEH BARREE")
4551 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL, "YEH WITH TAIL")
4552 TRY(UC_JOINING_GROUP_YUDH, "YUDH")
4553 TRY(UC_JOINING_GROUP_YUDH_HE, "YUDH HE")
4554 TRY(UC_JOINING_GROUP_ZAIN, "ZAIN")
4555 TRY(UC_JOINING_GROUP_ZHAIN, "ZHAIN")
4556 TRY(UC_JOINING_GROUP_ROHINGYA_YEH, "ROHINGYA YEH")
4557 TRY(UC_JOINING_GROUP_STRAIGHT_WAW, "STRAIGHT WAW")
4558 TRY(UC_JOINING_GROUP_MANICHAEAN_ALEPH, "MANICHAEAN ALEPH")
4559 TRY(UC_JOINING_GROUP_MANICHAEAN_BETH, "MANICHAEAN BETH")
4560 TRY(UC_JOINING_GROUP_MANICHAEAN_GIMEL, "MANICHAEAN GIMEL")
4561 TRY(UC_JOINING_GROUP_MANICHAEAN_DALETH, "MANICHAEAN DALETH")
4562 TRY(UC_JOINING_GROUP_MANICHAEAN_WAW, "MANICHAEAN WAW")
4563 TRY(UC_JOINING_GROUP_MANICHAEAN_ZAYIN, "MANICHAEAN ZAYIN")
4564 TRY(UC_JOINING_GROUP_MANICHAEAN_HETH, "MANICHAEAN HETH")
4565 TRY(UC_JOINING_GROUP_MANICHAEAN_TETH, "MANICHAEAN TETH")
4566 TRY(UC_JOINING_GROUP_MANICHAEAN_YODH, "MANICHAEAN YODH")
4567 TRY(UC_JOINING_GROUP_MANICHAEAN_KAPH, "MANICHAEAN KAPH")
4568 TRY(UC_JOINING_GROUP_MANICHAEAN_LAMEDH, "MANICHAEAN LAMEDH")
4569 TRY(UC_JOINING_GROUP_MANICHAEAN_DHAMEDH, "MANICHAEAN DHAMEDH")
4570 TRY(UC_JOINING_GROUP_MANICHAEAN_THAMEDH, "MANICHAEAN THAMEDH")
4571 TRY(UC_JOINING_GROUP_MANICHAEAN_MEM, "MANICHAEAN MEM")
4572 TRY(UC_JOINING_GROUP_MANICHAEAN_NUN, "MANICHAEAN NUN")
4573 TRY(UC_JOINING_GROUP_MANICHAEAN_SAMEKH, "MANICHAEAN SAMEKH")
4574 TRY(UC_JOINING_GROUP_MANICHAEAN_AYIN, "MANICHAEAN AYIN")
4575 TRY(UC_JOINING_GROUP_MANICHAEAN_PE, "MANICHAEAN PE")
4576 TRY(UC_JOINING_GROUP_MANICHAEAN_SADHE, "MANICHAEAN SADHE")
4577 TRY(UC_JOINING_GROUP_MANICHAEAN_QOPH, "MANICHAEAN QOPH")
4578 TRY(UC_JOINING_GROUP_MANICHAEAN_RESH, "MANICHAEAN RESH")
4579 TRY(UC_JOINING_GROUP_MANICHAEAN_TAW, "MANICHAEAN TAW")
4580 TRY(UC_JOINING_GROUP_MANICHAEAN_ONE, "MANICHAEAN ONE")
4581 TRY(UC_JOINING_GROUP_MANICHAEAN_FIVE, "MANICHAEAN FIVE")
4582 TRY(UC_JOINING_GROUP_MANICHAEAN_TEN, "MANICHAEAN TEN")
4583 TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY, "MANICHAEAN TWENTY")
4584 TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED, "MANICHAEAN HUNDRED")
4585 TRY(UC_JOINING_GROUP_AFRICAN_FEH, "AFRICAN FEH")
4586 TRY(UC_JOINING_GROUP_AFRICAN_QAF, "AFRICAN QAF")
4587 TRY(UC_JOINING_GROUP_AFRICAN_NOON, "AFRICAN NOON")
4588 TRY(UC_JOINING_GROUP_MALAYALAM_NGA, "MALAYALAM NGA")
4589 TRY(UC_JOINING_GROUP_MALAYALAM_JA, "MALAYALAM JA")
4590 TRY(UC_JOINING_GROUP_MALAYALAM_NYA, "MALAYALAM NYA")
4591 TRY(UC_JOINING_GROUP_MALAYALAM_TTA, "MALAYALAM TTA")
4592 TRY(UC_JOINING_GROUP_MALAYALAM_NNA, "MALAYALAM NNA")
4593 TRY(UC_JOINING_GROUP_MALAYALAM_NNNA, "MALAYALAM NNNA")
4594 TRY(UC_JOINING_GROUP_MALAYALAM_BHA, "MALAYALAM BHA")
4595 TRY(UC_JOINING_GROUP_MALAYALAM_RA, "MALAYALAM RA")
4596 TRY(UC_JOINING_GROUP_MALAYALAM_LLA, "MALAYALAM LLA")
4597 TRY(UC_JOINING_GROUP_MALAYALAM_LLLA, "MALAYALAM LLLA")
4598 TRY(UC_JOINING_GROUP_MALAYALAM_SSA, "MALAYALAM SSA")
4599 TRY(UC_JOINING_GROUP_HANIFI_ROHINGYA_PA, "HANIFI ROHINGYA PA")
4600 TRY(UC_JOINING_GROUP_HANIFI_ROHINGYA_KINNA_YA, "HANIFI ROHINGYA KINNA YA")
4601 TRY(UC_JOINING_GROUP_THIN_YEH, "THIN YEH")
4602 TRY(UC_JOINING_GROUP_VERTICAL_TAIL, "VERTICAL TAIL")
4603 TRY(UC_JOINING_GROUP_KASHMIRI_YEH, "KASHMIRI YEH")
4604 #undef TRY
4605 else
4607 fprintf (stderr, "unknown joining group value \"%s\" in '%s':%d\n",
4608 joining_group_name, arabicshaping_filename, lineno);
4609 exit (1);
4612 unicode_joining_type[i] = joining_type;
4613 unicode_joining_group[i] = joining_group;
4616 if (ferror (stream) || fclose (stream))
4618 fprintf (stderr, "error reading from '%s'\n", arabicshaping_filename);
4619 exit (1);
4623 /* Convert a Joining_Type value to a C identifier. */
4624 static const char *
4625 joining_type_as_c_identifier (int joining_type)
4627 #define TRY(value) if (joining_type == value) return #value;
4628 TRY(UC_JOINING_TYPE_U)
4629 TRY(UC_JOINING_TYPE_T)
4630 TRY(UC_JOINING_TYPE_C)
4631 TRY(UC_JOINING_TYPE_L)
4632 TRY(UC_JOINING_TYPE_R)
4633 TRY(UC_JOINING_TYPE_D)
4634 #undef TRY
4635 abort ();
4638 static void
4639 output_joining_type_test (const char *filename, const char *version)
4641 FILE *stream;
4642 bool need_comma;
4643 unsigned int ch;
4645 stream = fopen (filename, "w");
4646 if (stream == NULL)
4648 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4649 exit (1);
4652 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4653 fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
4654 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4655 version);
4656 fprintf (stream, "\n");
4658 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
4659 fprintf (stream, "\n");
4660 output_tests_license (stream);
4661 fprintf (stream, "\n");
4663 need_comma = false;
4664 for (ch = 0; ch < 0x110000; ch++)
4666 int value = unicode_joining_type[ch];
4668 if (value != (uint8_t)~(uint8_t)0)
4670 if (need_comma)
4671 fprintf (stream, ",\n");
4672 fprintf (stream, " { 0x%04X, %s }", ch, joining_type_as_c_identifier (value));
4673 need_comma = true;
4676 if (need_comma)
4677 fprintf (stream, "\n");
4679 if (ferror (stream) || fclose (stream))
4681 fprintf (stderr, "error writing to '%s'\n", filename);
4682 exit (1);
4686 /* Construction of sparse 3-level tables.
   Instantiates the "3level.h" template with table name joining_type_table,
   element type uint8_t and default element (uint8_t)~(uint8_t)0.
   xmalloc and xrealloc are mapped to plain malloc and realloc for this
   inclusion.
   NOTE(review): presumably this defines struct joining_type_table and the
   joining_type_table_init/_add/_finalize functions used by
   output_joining_type -- confirm against 3level.h.  */
4687 #define TABLE joining_type_table
4688 #define ELEMENT uint8_t
4689 #define DEFAULT (uint8_t)~(uint8_t)0
4690 #define xmalloc malloc
4691 #define xrealloc realloc
4692 #include "3level.h"
4694 static void
4695 output_joining_type (const char *filename, const char *version)
4697 FILE *stream;
4698 unsigned int ch, i;
4699 struct joining_type_table t;
4700 unsigned int level1_offset, level2_offset, level3_offset;
4701 uint8_t *level3_packed;
4703 stream = fopen (filename, "w");
4704 if (stream == NULL)
4706 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4707 exit (1);
4710 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4711 fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
4712 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4713 version);
4714 fprintf (stream, "\n");
4716 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
4717 fprintf (stream, "\n");
4718 output_library_license (stream, true);
4719 fprintf (stream, "\n");
4721 t.p = 7;
4722 t.q = 9;
4723 joining_type_table_init (&t);
4725 for (ch = 0; ch < 0x110000; ch++)
4727 uint8_t value = unicode_joining_type[ch];
4729 assert (value == (uint8_t)~(uint8_t)0 || value <= 0x0f);
4731 joining_type_table_add (&t, ch, value);
4734 joining_type_table_finalize (&t);
4736 /* Offsets in t.result, in memory of this process. */
4737 level1_offset =
4738 5 * sizeof (uint32_t);
4739 level2_offset =
4740 5 * sizeof (uint32_t)
4741 + t.level1_size * sizeof (uint32_t);
4742 level3_offset =
4743 5 * sizeof (uint32_t)
4744 + t.level1_size * sizeof (uint32_t)
4745 + (t.level2_size << t.q) * sizeof (uint32_t);
4747 for (i = 0; i < 5; i++)
4748 fprintf (stream, "#define joining_type_header_%d %d\n", i,
4749 ((uint32_t *) t.result)[i]);
4750 fprintf (stream, "static const\n");
4751 fprintf (stream, "struct\n");
4752 fprintf (stream, " {\n");
4753 fprintf (stream, " int level1[%zu];\n", t.level1_size);
4754 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
4755 fprintf (stream, " unsigned char level3[%zu * %d];\n", t.level3_size,
4756 (1 << t.p) * 4 / 8);
4757 fprintf (stream, " }\n");
4758 fprintf (stream, "u_joining_type =\n");
4759 fprintf (stream, "{\n");
4760 fprintf (stream, " {");
4761 if (t.level1_size > 8)
4762 fprintf (stream, "\n ");
4763 for (i = 0; i < t.level1_size; i++)
4765 uint32_t offset;
4766 if (i > 0 && (i % 8) == 0)
4767 fprintf (stream, "\n ");
4768 offset = ((uint32_t *) (t.result + level1_offset))[i];
4769 if (offset == 0)
4770 fprintf (stream, " %5d", -1);
4771 else
4772 fprintf (stream, " %5zu",
4773 (offset - level2_offset) / sizeof (uint32_t));
4774 if (i+1 < t.level1_size)
4775 fprintf (stream, ",");
4777 if (t.level1_size > 8)
4778 fprintf (stream, "\n ");
4779 fprintf (stream, " },\n");
4780 fprintf (stream, " {");
4781 if (t.level2_size << t.q > 8)
4782 fprintf (stream, "\n ");
4783 for (i = 0; i < t.level2_size << t.q; i++)
4785 uint32_t offset;
4786 if (i > 0 && (i % 8) == 0)
4787 fprintf (stream, "\n ");
4788 offset = ((uint32_t *) (t.result + level2_offset))[i];
4789 if (offset == 0)
4790 fprintf (stream, " %5d", -1);
4791 else
4792 fprintf (stream, " %5zu",
4793 (offset - level3_offset) / sizeof (uint8_t));
4794 if (i+1 < t.level2_size << t.q)
4795 fprintf (stream, ",");
4797 if (t.level2_size << t.q > 8)
4798 fprintf (stream, "\n ");
4799 fprintf (stream, " },\n");
4800 /* Pack the level3 array. Each entry needs 4 bits only. */
4801 level3_packed =
4802 (uint8_t *) calloc ((t.level3_size << t.p) * 4 / 8, sizeof (uint8_t));
4803 for (i = 0; i < t.level3_size << t.p; i++)
4805 unsigned int j = (i * 4) / 8;
4806 unsigned int k = (i * 4) % 8;
4807 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i] & 0x0f;
4808 level3_packed[j] |= (value << k);
4810 fprintf (stream, " {");
4811 if ((t.level3_size << t.p) * 4 / 8 > 8)
4812 fprintf (stream, "\n ");
4813 for (i = 0; i < (t.level3_size << t.p) * 4 / 8; i++)
4815 if (i > 0 && (i % 8) == 0)
4816 fprintf (stream, "\n ");
4817 fprintf (stream, " 0x%02x", level3_packed[i]);
4818 if (i+1 < (t.level3_size << t.p) * 4 / 8)
4819 fprintf (stream, ",");
4821 if ((t.level3_size << t.p) * 4 / 8 > 8)
4822 fprintf (stream, "\n ");
4823 fprintf (stream, " }\n");
4824 free (level3_packed);
4825 fprintf (stream, "};\n");
4827 if (ferror (stream) || fclose (stream))
4829 fprintf (stderr, "error writing to '%s'\n", filename);
4830 exit (1);
4834 /* Convert a Joining_Group value to a C identifier. */
4835 static const char *
4836 joining_group_as_c_identifier (int joining_group)
4838 #define TRY(value) if (joining_group == value) return #value;
4839 TRY(UC_JOINING_GROUP_NONE)
4840 TRY(UC_JOINING_GROUP_AIN)
4841 TRY(UC_JOINING_GROUP_ALAPH)
4842 TRY(UC_JOINING_GROUP_ALEF)
4843 TRY(UC_JOINING_GROUP_BEH)
4844 TRY(UC_JOINING_GROUP_BETH)
4845 TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE)
4846 TRY(UC_JOINING_GROUP_DAL)
4847 TRY(UC_JOINING_GROUP_DALATH_RISH)
4848 TRY(UC_JOINING_GROUP_E)
4849 TRY(UC_JOINING_GROUP_FARSI_YEH)
4850 TRY(UC_JOINING_GROUP_FE)
4851 TRY(UC_JOINING_GROUP_FEH)
4852 TRY(UC_JOINING_GROUP_FINAL_SEMKATH)
4853 TRY(UC_JOINING_GROUP_GAF)
4854 TRY(UC_JOINING_GROUP_GAMAL)
4855 TRY(UC_JOINING_GROUP_HAH)
4856 TRY(UC_JOINING_GROUP_HE)
4857 TRY(UC_JOINING_GROUP_HEH)
4858 TRY(UC_JOINING_GROUP_HEH_GOAL)
4859 TRY(UC_JOINING_GROUP_HETH)
4860 TRY(UC_JOINING_GROUP_KAF)
4861 TRY(UC_JOINING_GROUP_KAPH)
4862 TRY(UC_JOINING_GROUP_KHAPH)
4863 TRY(UC_JOINING_GROUP_KNOTTED_HEH)
4864 TRY(UC_JOINING_GROUP_LAM)
4865 TRY(UC_JOINING_GROUP_LAMADH)
4866 TRY(UC_JOINING_GROUP_MEEM)
4867 TRY(UC_JOINING_GROUP_MIM)
4868 TRY(UC_JOINING_GROUP_NOON)
4869 TRY(UC_JOINING_GROUP_NUN)
4870 TRY(UC_JOINING_GROUP_NYA)
4871 TRY(UC_JOINING_GROUP_PE)
4872 TRY(UC_JOINING_GROUP_QAF)
4873 TRY(UC_JOINING_GROUP_QAPH)
4874 TRY(UC_JOINING_GROUP_REH)
4875 TRY(UC_JOINING_GROUP_REVERSED_PE)
4876 TRY(UC_JOINING_GROUP_SAD)
4877 TRY(UC_JOINING_GROUP_SADHE)
4878 TRY(UC_JOINING_GROUP_SEEN)
4879 TRY(UC_JOINING_GROUP_SEMKATH)
4880 TRY(UC_JOINING_GROUP_SHIN)
4881 TRY(UC_JOINING_GROUP_SWASH_KAF)
4882 TRY(UC_JOINING_GROUP_SYRIAC_WAW)
4883 TRY(UC_JOINING_GROUP_TAH)
4884 TRY(UC_JOINING_GROUP_TAW)
4885 TRY(UC_JOINING_GROUP_TEH_MARBUTA)
4886 TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL)
4887 TRY(UC_JOINING_GROUP_TETH)
4888 TRY(UC_JOINING_GROUP_WAW)
4889 TRY(UC_JOINING_GROUP_YEH)
4890 TRY(UC_JOINING_GROUP_YEH_BARREE)
4891 TRY(UC_JOINING_GROUP_YEH_WITH_TAIL)
4892 TRY(UC_JOINING_GROUP_YUDH)
4893 TRY(UC_JOINING_GROUP_YUDH_HE)
4894 TRY(UC_JOINING_GROUP_ZAIN)
4895 TRY(UC_JOINING_GROUP_ZHAIN)
4896 TRY(UC_JOINING_GROUP_ROHINGYA_YEH)
4897 TRY(UC_JOINING_GROUP_STRAIGHT_WAW)
4898 TRY(UC_JOINING_GROUP_MANICHAEAN_ALEPH)
4899 TRY(UC_JOINING_GROUP_MANICHAEAN_BETH)
4900 TRY(UC_JOINING_GROUP_MANICHAEAN_GIMEL)
4901 TRY(UC_JOINING_GROUP_MANICHAEAN_DALETH)
4902 TRY(UC_JOINING_GROUP_MANICHAEAN_WAW)
4903 TRY(UC_JOINING_GROUP_MANICHAEAN_ZAYIN)
4904 TRY(UC_JOINING_GROUP_MANICHAEAN_HETH)
4905 TRY(UC_JOINING_GROUP_MANICHAEAN_TETH)
4906 TRY(UC_JOINING_GROUP_MANICHAEAN_YODH)
4907 TRY(UC_JOINING_GROUP_MANICHAEAN_KAPH)
4908 TRY(UC_JOINING_GROUP_MANICHAEAN_LAMEDH)
4909 TRY(UC_JOINING_GROUP_MANICHAEAN_DHAMEDH)
4910 TRY(UC_JOINING_GROUP_MANICHAEAN_THAMEDH)
4911 TRY(UC_JOINING_GROUP_MANICHAEAN_MEM)
4912 TRY(UC_JOINING_GROUP_MANICHAEAN_NUN)
4913 TRY(UC_JOINING_GROUP_MANICHAEAN_SAMEKH)
4914 TRY(UC_JOINING_GROUP_MANICHAEAN_AYIN)
4915 TRY(UC_JOINING_GROUP_MANICHAEAN_PE)
4916 TRY(UC_JOINING_GROUP_MANICHAEAN_SADHE)
4917 TRY(UC_JOINING_GROUP_MANICHAEAN_QOPH)
4918 TRY(UC_JOINING_GROUP_MANICHAEAN_RESH)
4919 TRY(UC_JOINING_GROUP_MANICHAEAN_TAW)
4920 TRY(UC_JOINING_GROUP_MANICHAEAN_ONE)
4921 TRY(UC_JOINING_GROUP_MANICHAEAN_FIVE)
4922 TRY(UC_JOINING_GROUP_MANICHAEAN_TEN)
4923 TRY(UC_JOINING_GROUP_MANICHAEAN_TWENTY)
4924 TRY(UC_JOINING_GROUP_MANICHAEAN_HUNDRED)
4925 TRY(UC_JOINING_GROUP_AFRICAN_FEH)
4926 TRY(UC_JOINING_GROUP_AFRICAN_QAF)
4927 TRY(UC_JOINING_GROUP_AFRICAN_NOON)
4928 TRY(UC_JOINING_GROUP_MALAYALAM_NGA)
4929 TRY(UC_JOINING_GROUP_MALAYALAM_JA)
4930 TRY(UC_JOINING_GROUP_MALAYALAM_NYA)
4931 TRY(UC_JOINING_GROUP_MALAYALAM_TTA)
4932 TRY(UC_JOINING_GROUP_MALAYALAM_NNA)
4933 TRY(UC_JOINING_GROUP_MALAYALAM_NNNA)
4934 TRY(UC_JOINING_GROUP_MALAYALAM_BHA)
4935 TRY(UC_JOINING_GROUP_MALAYALAM_RA)
4936 TRY(UC_JOINING_GROUP_MALAYALAM_LLA)
4937 TRY(UC_JOINING_GROUP_MALAYALAM_LLLA)
4938 TRY(UC_JOINING_GROUP_MALAYALAM_SSA)
4939 TRY(UC_JOINING_GROUP_HANIFI_ROHINGYA_PA)
4940 TRY(UC_JOINING_GROUP_HANIFI_ROHINGYA_KINNA_YA)
4941 TRY(UC_JOINING_GROUP_THIN_YEH)
4942 TRY(UC_JOINING_GROUP_VERTICAL_TAIL)
4943 TRY(UC_JOINING_GROUP_KASHMIRI_YEH)
4944 #undef TRY
4945 abort ();
4948 static void
4949 output_joining_group_test (const char *filename, const char *version)
4951 FILE *stream;
4952 bool need_comma;
4953 unsigned int ch;
4955 stream = fopen (filename, "w");
4956 if (stream == NULL)
4958 fprintf (stderr, "cannot open '%s' for writing\n", filename);
4959 exit (1);
4962 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
4963 fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
4964 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
4965 version);
4966 fprintf (stream, "\n");
4968 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
4969 fprintf (stream, "\n");
4970 output_tests_license (stream);
4971 fprintf (stream, "\n");
4973 need_comma = false;
4974 for (ch = 0; ch < 0x110000; ch++)
4976 int value = unicode_joining_group[ch];
4978 if (value != UC_JOINING_GROUP_NONE)
4980 if (need_comma)
4981 fprintf (stream, ",\n");
4982 fprintf (stream, " { 0x%04X, %s }", ch, joining_group_as_c_identifier (value));
4983 need_comma = true;
4986 if (need_comma)
4987 fprintf (stream, "\n");
4989 if (ferror (stream) || fclose (stream))
4991 fprintf (stderr, "error writing to '%s'\n", filename);
4992 exit (1);
4996 /* Construction of sparse 3-level tables.
   Instantiates the "3level.h" template with table name joining_group_table,
   element type uint8_t and default element UC_JOINING_GROUP_NONE.
   xmalloc and xrealloc are mapped to plain malloc and realloc for this
   inclusion.
   NOTE(review): presumably this defines struct joining_group_table and the
   joining_group_table_init/_add/_finalize functions used by
   output_joining_group -- confirm against 3level.h.  */
4997 #define TABLE joining_group_table
4998 #define ELEMENT uint8_t
4999 #define DEFAULT UC_JOINING_GROUP_NONE
5000 #define xmalloc malloc
5001 #define xrealloc realloc
5002 #include "3level.h"
5004 static void
5005 output_joining_group (const char *filename, const char *version)
5007 FILE *stream;
5008 unsigned int ch, i;
5009 struct joining_group_table t;
5010 unsigned int level1_offset, level2_offset, level3_offset;
5011 uint16_t *level3_packed;
5013 stream = fopen (filename, "w");
5014 if (stream == NULL)
5016 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5017 exit (1);
5020 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5021 fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
5022 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5023 version);
5024 fprintf (stream, "\n");
5026 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
5027 fprintf (stream, "\n");
5028 output_library_license (stream, false);
5029 fprintf (stream, "\n");
5031 t.p = 7;
5032 t.q = 9;
5033 joining_group_table_init (&t);
5035 for (ch = 0; ch < 0x110000; ch++)
5037 uint8_t value = unicode_joining_group[ch];
5039 assert (value <= 0x7f);
5041 joining_group_table_add (&t, ch, value);
5044 joining_group_table_finalize (&t);
5046 /* Offsets in t.result, in memory of this process. */
5047 level1_offset =
5048 5 * sizeof (uint32_t);
5049 level2_offset =
5050 5 * sizeof (uint32_t)
5051 + t.level1_size * sizeof (uint32_t);
5052 level3_offset =
5053 5 * sizeof (uint32_t)
5054 + t.level1_size * sizeof (uint32_t)
5055 + (t.level2_size << t.q) * sizeof (uint32_t);
5057 for (i = 0; i < 5; i++)
5058 fprintf (stream, "#define joining_group_header_%d %d\n", i,
5059 ((uint32_t *) t.result)[i]);
5060 fprintf (stream, "static const\n");
5061 fprintf (stream, "struct\n");
5062 fprintf (stream, " {\n");
5063 fprintf (stream, " int level1[%zu];\n", t.level1_size);
5064 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
5065 fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
5066 (1 << t.p) * 7 / 16);
5067 fprintf (stream, " }\n");
5068 fprintf (stream, "u_joining_group =\n");
5069 fprintf (stream, "{\n");
5070 fprintf (stream, " {");
5071 if (t.level1_size > 8)
5072 fprintf (stream, "\n ");
5073 for (i = 0; i < t.level1_size; i++)
5075 uint32_t offset;
5076 if (i > 0 && (i % 8) == 0)
5077 fprintf (stream, "\n ");
5078 offset = ((uint32_t *) (t.result + level1_offset))[i];
5079 if (offset == 0)
5080 fprintf (stream, " %5d", -1);
5081 else
5082 fprintf (stream, " %5zu",
5083 (offset - level2_offset) / sizeof (uint32_t));
5084 if (i+1 < t.level1_size)
5085 fprintf (stream, ",");
5087 if (t.level1_size > 8)
5088 fprintf (stream, "\n ");
5089 fprintf (stream, " },\n");
5090 fprintf (stream, " {");
5091 if (t.level2_size << t.q > 8)
5092 fprintf (stream, "\n ");
5093 for (i = 0; i < t.level2_size << t.q; i++)
5095 uint32_t offset;
5096 if (i > 0 && (i % 8) == 0)
5097 fprintf (stream, "\n ");
5098 offset = ((uint32_t *) (t.result + level2_offset))[i];
5099 if (offset == 0)
5100 fprintf (stream, " %5d", -1);
5101 else
5102 fprintf (stream, " %5zu",
5103 (offset - level3_offset) / sizeof (uint8_t));
5104 if (i+1 < t.level2_size << t.q)
5105 fprintf (stream, ",");
5107 if (t.level2_size << t.q > 8)
5108 fprintf (stream, "\n ");
5109 fprintf (stream, " },\n");
5110 /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
5111 not 32-bit units, in order to make the lookup function easier. */
5112 level3_packed =
5113 (uint16_t *)
5114 calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
5115 for (i = 0; i < t.level3_size << t.p; i++)
5117 unsigned int j = (i * 7) / 16;
5118 unsigned int k = (i * 7) % 16;
5119 uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
5120 value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
5121 level3_packed[j] = value & 0xffff;
5122 level3_packed[j+1] = value >> 16;
5124 fprintf (stream, " {");
5125 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
5126 fprintf (stream, "\n ");
5127 for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
5129 if (i > 0 && (i % 8) == 0)
5130 fprintf (stream, "\n ");
5131 fprintf (stream, " 0x%04x", level3_packed[i]);
5132 if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
5133 fprintf (stream, ",");
5135 if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
5136 fprintf (stream, "\n ");
5137 fprintf (stream, " }\n");
5138 free (level3_packed);
5139 fprintf (stream, "};\n");
5141 if (ferror (stream) || fclose (stream))
5143 fprintf (stderr, "error writing to '%s'\n", filename);
5144 exit (1);
5148 /* ========================================================================= */
5150 /* Scripts. */
/* Names of the scripts seen so far, in order of first appearance.  Each
   entry is a strdup'ed copy stored by fill_scripts.  */
5152 static const char *scripts[256];
/* Number of valid entries in scripts[].  */
5153 static unsigned int numscripts;
/* For each code point, the index of its script in scripts[], or
   (uint8_t)~(uint8_t)0 if fill_scripts assigned no script to it.  */
5155 static uint8_t unicode_scripts[0x110000];
5157 static void
5158 fill_scripts (const char *scripts_filename)
5160 FILE *stream;
5161 unsigned int i;
5163 stream = fopen (scripts_filename, "r");
5164 if (stream == NULL)
5166 fprintf (stderr, "error during fopen of '%s'\n", scripts_filename);
5167 exit (1);
5170 numscripts = 0;
5172 for (i = 0; i < 0x110000; i++)
5173 unicode_scripts[i] = (uint8_t)~(uint8_t)0;
5175 for (;;)
5177 char buf[200+1];
5178 unsigned int i1, i2;
5179 char padding[200+1];
5180 char scriptname[200+1];
5181 int script;
5183 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
5184 break;
5186 if (buf[0] == '\0' || buf[0] == '#')
5187 continue;
5189 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4)
5191 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3)
5193 fprintf (stderr, "parse error in '%s'\n", scripts_filename);
5194 exit (1);
5196 i2 = i1;
5198 assert (i2 >= i1);
5199 assert (i2 < 0x110000);
5201 for (script = numscripts - 1; script >= 0; script--)
5202 if (strcmp (scripts[script], scriptname) == 0)
5203 break;
5204 if (script < 0)
5206 scripts[numscripts] = strdup (scriptname);
5207 script = numscripts;
5208 numscripts++;
5209 assert (numscripts != 256);
5212 for (i = i1; i <= i2; i++)
5214 if (unicode_scripts[i] != (uint8_t)~(uint8_t)0)
5215 fprintf (stderr, "0x%04X belongs to multiple scripts\n", i);
5216 unicode_scripts[i] = script;
5220 if (ferror (stream) || fclose (stream))
5222 fprintf (stderr, "error reading from '%s'\n", scripts_filename);
5223 exit (1);
5227 /* Construction of sparse 3-level tables.
   Instantiates the "3level.h" template with table name script_table,
   element type uint8_t and default element (uint8_t)~(uint8_t)0.
   xmalloc and xrealloc are mapped to plain malloc and realloc for this
   inclusion.
   NOTE(review): presumably this defines struct script_table and the
   script_table_init/_add/_finalize functions used by output_scripts --
   confirm against 3level.h.  */
5228 #define TABLE script_table
5229 #define ELEMENT uint8_t
5230 #define DEFAULT (uint8_t)~(uint8_t)0
5231 #define xmalloc malloc
5232 #define xrealloc realloc
5233 #include "3level.h"
5235 static void
5236 output_scripts (const char *version)
5238 const char *filename = "unictype/scripts.h";
5239 FILE *stream;
5240 unsigned int ch, s, i;
5241 struct script_table t;
5242 unsigned int level1_offset, level2_offset, level3_offset;
5244 typedef struct
5246 const char *lowercase_name;
5248 scriptinfo_t;
5249 scriptinfo_t scriptinfo[256];
5251 stream = fopen (filename, "w");
5252 if (stream == NULL)
5254 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5255 exit (1);
5258 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5259 fprintf (stream, "/* Unicode scripts. */\n");
5260 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5261 version);
5262 fprintf (stream, "\n");
5264 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
5265 fprintf (stream, "\n");
5266 output_library_license (stream, true);
5267 fprintf (stream, "\n");
5269 for (s = 0; s < numscripts; s++)
5271 char *lcp = strdup (scripts[s]);
5272 char *cp;
5274 for (cp = lcp; *cp != '\0'; cp++)
5275 if (*cp >= 'A' && *cp <= 'Z')
5276 *cp += 'a' - 'A';
5278 scriptinfo[s].lowercase_name = lcp;
5281 for (s = 0; s < numscripts; s++)
5283 fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n",
5284 scriptinfo[s].lowercase_name);
5285 fprintf (stream, "{\n");
5286 i = 0;
5287 for (ch = 0; ch < 0x110000; ch++)
5288 if (unicode_scripts[ch] == s)
5290 unsigned int start;
5291 unsigned int end;
5293 start = ch;
5294 while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s)
5295 ch++;
5296 end = ch;
5298 if (i > 0)
5299 fprintf (stream, ",\n");
5300 if (start == end)
5301 fprintf (stream, " { 0x%04X, 1, 1 }", start);
5302 else
5303 fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }",
5304 start, end);
5305 i++;
5307 fprintf (stream, "\n");
5308 fprintf (stream, "};\n");
5311 fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts);
5312 fprintf (stream, "{\n");
5313 for (s = 0; s < numscripts; s++)
5315 fprintf (stream, " {\n");
5316 fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n",
5317 scriptinfo[s].lowercase_name);
5318 fprintf (stream, " script_%s_intervals,\n",
5319 scriptinfo[s].lowercase_name);
5320 fprintf (stream, " \"%s\"\n", scripts[s]);
5321 fprintf (stream, " }");
5322 if (s+1 < numscripts)
5323 fprintf (stream, ",");
5324 fprintf (stream, "\n");
5326 fprintf (stream, "};\n");
5328 t.p = 7;
5329 t.q = 9;
5330 script_table_init (&t);
5332 for (ch = 0; ch < 0x110000; ch++)
5334 unsigned int s = unicode_scripts[ch];
5335 if (s != (uint8_t)~(uint8_t)0)
5336 script_table_add (&t, ch, s);
5339 script_table_finalize (&t);
5341 /* Offsets in t.result, in memory of this process. */
5342 level1_offset =
5343 5 * sizeof (uint32_t);
5344 level2_offset =
5345 5 * sizeof (uint32_t)
5346 + t.level1_size * sizeof (uint32_t);
5347 level3_offset =
5348 5 * sizeof (uint32_t)
5349 + t.level1_size * sizeof (uint32_t)
5350 + (t.level2_size << t.q) * sizeof (uint32_t);
5352 for (i = 0; i < 5; i++)
5353 fprintf (stream, "#define script_header_%d %d\n", i,
5354 ((uint32_t *) t.result)[i]);
5355 fprintf (stream, "static const\n");
5356 fprintf (stream, "struct\n");
5357 fprintf (stream, " {\n");
5358 fprintf (stream, " int level1[%zu];\n", t.level1_size);
5359 fprintf (stream, " unsigned short level2[%zu << %d];\n", t.level2_size, t.q);
5360 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
5361 fprintf (stream, " }\n");
5362 fprintf (stream, "u_script =\n");
5363 fprintf (stream, "{\n");
5364 fprintf (stream, " {");
5365 if (t.level1_size > 8)
5366 fprintf (stream, "\n ");
5367 for (i = 0; i < t.level1_size; i++)
5369 uint32_t offset;
5370 if (i > 0 && (i % 8) == 0)
5371 fprintf (stream, "\n ");
5372 offset = ((uint32_t *) (t.result + level1_offset))[i];
5373 if (offset == 0)
5374 fprintf (stream, " %5d", -1);
5375 else
5376 fprintf (stream, " %5zu",
5377 (offset - level2_offset) / sizeof (uint32_t));
5378 if (i+1 < t.level1_size)
5379 fprintf (stream, ",");
5381 if (t.level1_size > 8)
5382 fprintf (stream, "\n ");
5383 fprintf (stream, " },\n");
5384 fprintf (stream, " {");
5385 if (t.level2_size << t.q > 8)
5386 fprintf (stream, "\n ");
5387 for (i = 0; i < t.level2_size << t.q; i++)
5389 uint32_t offset;
5390 if (i > 0 && (i % 8) == 0)
5391 fprintf (stream, "\n ");
5392 offset = ((uint32_t *) (t.result + level2_offset))[i];
5393 /* To make the level2 values fit in 16 bits, we use 'unsigned short'
5394 instead of 'short' and add 1 to each value. */
5395 if (offset == 0)
5396 fprintf (stream, " %5d", -1 + 1);
5397 else
5398 fprintf (stream, " %5zu",
5399 (offset - level3_offset) / sizeof (uint8_t) + 1);
5400 if (i+1 < t.level2_size << t.q)
5401 fprintf (stream, ",");
5403 if (t.level2_size << t.q > 8)
5404 fprintf (stream, "\n ");
5405 fprintf (stream, " },\n");
5406 fprintf (stream, " {");
5407 if (t.level3_size << t.p > 8)
5408 fprintf (stream, "\n ");
5409 for (i = 0; i < t.level3_size << t.p; i++)
5411 if (i > 0 && (i % 8) == 0)
5412 fprintf (stream, "\n ");
5413 fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
5414 if (i+1 < t.level3_size << t.p)
5415 fprintf (stream, ",");
5417 if (t.level3_size << t.p > 8)
5418 fprintf (stream, "\n ");
5419 fprintf (stream, " }\n");
5420 fprintf (stream, "};\n");
5422 if (ferror (stream) || fclose (stream))
5424 fprintf (stderr, "error writing to '%s'\n", filename);
5425 exit (1);
5429 static void
5430 output_scripts_byname (const char *version)
5432 const char *filename = "unictype/scripts_byname.gperf";
5433 FILE *stream;
5434 unsigned int s;
5436 stream = fopen (filename, "w");
5437 if (stream == NULL)
5439 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5440 exit (1);
5443 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5444 fprintf (stream, "/* Unicode scripts. */\n");
5445 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5446 version);
5447 fprintf (stream, "\n");
5449 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
5450 fprintf (stream, "\n");
5451 output_library_license (stream, true);
5452 fprintf (stream, "\n");
5454 fprintf (stream, "struct named_script { int name; unsigned int index; };\n");
5455 fprintf (stream, "%%struct-type\n");
5456 fprintf (stream, "%%language=ANSI-C\n");
5457 fprintf (stream, "%%define hash-function-name scripts_hash\n");
5458 fprintf (stream, "%%define lookup-function-name uc_script_lookup\n");
5459 fprintf (stream, "%%readonly-tables\n");
5460 fprintf (stream, "%%global-table\n");
5461 fprintf (stream, "%%define word-array-name script_names\n");
5462 fprintf (stream, "%%pic\n");
5463 fprintf (stream, "%%define string-pool-name script_stringpool\n");
5464 fprintf (stream, "%%%%\n");
5465 for (s = 0; s < numscripts; s++)
5466 fprintf (stream, "%s, %u\n", scripts[s], s);
5468 if (ferror (stream) || fclose (stream))
5470 fprintf (stderr, "error writing to '%s'\n", filename);
5471 exit (1);
5475 /* ========================================================================= */
5477 /* Blocks. */
/* One block: an inclusive code point range and its name.  */
typedef struct
{
  unsigned int start;   /* first code point of the block */
  unsigned int end;     /* last code point of the block (inclusive) */
  const char *name;     /* block name */
}
block_t;

/* The blocks read by fill_blocks, in ascending order, and their count.  */
static block_t blocks[384];
static unsigned int numblocks;
5484 static void
5485 fill_blocks (const char *blocks_filename)
5487 FILE *stream;
5489 stream = fopen (blocks_filename, "r");
5490 if (stream == NULL)
5492 fprintf (stderr, "error during fopen of '%s'\n", blocks_filename);
5493 exit (1);
5496 for (;;)
5498 char buf[200+1];
5499 unsigned int i1, i2;
5500 char padding[200+1];
5501 char blockname[200+1];
5503 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
5504 break;
5506 if (buf[0] == '\0' || buf[0] == '#')
5507 continue;
5509 if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4)
5511 fprintf (stderr, "parse error in '%s'\n", blocks_filename);
5512 exit (1);
5514 blocks[numblocks].start = i1;
5515 blocks[numblocks].end = i2;
5516 blocks[numblocks].name = strdup (blockname);
5517 /* It must be sorted. */
5518 assert (numblocks == 0 || blocks[numblocks-1].end < blocks[numblocks].start);
5519 numblocks++;
5520 assert (numblocks != SIZEOF (blocks));
5523 if (ferror (stream) || fclose (stream))
5525 fprintf (stderr, "error reading from '%s'\n", blocks_filename);
5526 exit (1);
5530 /* Return the smallest block index among the blocks for characters >= ch. */
5531 static unsigned int
5532 block_first_index (unsigned int ch)
5534 /* Binary search. */
5535 unsigned int lo = 0;
5536 unsigned int hi = numblocks;
5537 /* Invariants:
5538 All blocks[i], i < lo, have blocks[i].end < ch,
5539 all blocks[i], i >= hi, have blocks[i].end >= ch. */
5540 while (lo < hi)
5542 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
5543 if (blocks[mid].end < ch)
5544 lo = mid + 1;
5545 else
5546 hi = mid;
5548 return hi;
5551 /* Return the largest block index among the blocks for characters <= ch,
5552 plus 1. */
5553 static unsigned int
5554 block_last_index (unsigned int ch)
5556 /* Binary search. */
5557 unsigned int lo = 0;
5558 unsigned int hi = numblocks;
5559 /* Invariants:
5560 All blocks[i], i < lo, have blocks[i].start <= ch,
5561 all blocks[i], i >= hi, have blocks[i].start > ch. */
5562 while (lo < hi)
5564 unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */
5565 if (blocks[mid].start <= ch)
5566 lo = mid + 1;
5567 else
5568 hi = mid;
5570 return hi;
5573 static void
5574 output_blocks (const char *version)
5576 const char *filename = "unictype/blocks.h";
5577 const unsigned int shift = 8; /* bits to shift away for array access */
5578 const unsigned int threshold = 0x28000; /* cut-off table here to save space */
5579 FILE *stream;
5580 unsigned int i;
5581 unsigned int i1;
5583 stream = fopen (filename, "w");
5584 if (stream == NULL)
5586 fprintf (stderr, "cannot open '%s' for writing\n", filename);
5587 exit (1);
5590 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
5591 fprintf (stream, "/* Unicode blocks. */\n");
5592 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
5593 version);
5594 fprintf (stream, "\n");
5596 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
5597 fprintf (stream, "\n");
5598 output_library_license (stream, false);
5599 fprintf (stream, "\n");
5601 fprintf (stream, "static const uc_block_t blocks[] =\n");
5602 fprintf (stream, "{\n");
5603 for (i = 0; i < numblocks; i++)
5605 fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start,
5606 blocks[i].end, blocks[i].name);
5607 if (i+1 < numblocks)
5608 fprintf (stream, ",");
5609 fprintf (stream, "\n");
5611 fprintf (stream, "};\n");
5612 fprintf (stream, "#define blocks_level1_shift %d\n", shift);
5613 fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold);
5614 fprintf (stream, "static const uint16_t blocks_level1[%d * 2] =\n",
5615 threshold >> shift);
5616 fprintf (stream, "{\n");
5617 for (i1 = 0; i1 < (threshold >> shift); i1++)
5619 unsigned int first_index = block_first_index (i1 << shift);
5620 unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1);
5621 fprintf (stream, " %3d, %3d", first_index, last_index);
5622 if (i1+1 < (threshold >> shift))
5623 fprintf (stream, ",");
5624 fprintf (stream, "\n");
5626 fprintf (stream, "};\n");
5627 fprintf (stream, "#define blocks_upper_first_index %d\n",
5628 block_first_index (threshold));
5629 fprintf (stream, "#define blocks_upper_last_index %d\n",
5630 block_last_index (0x10FFFF));
5632 if (ferror (stream) || fclose (stream))
5634 fprintf (stderr, "error writing to '%s'\n", filename);
5635 exit (1);
5639 /* ========================================================================= */
5641 /* C and Java syntax. */
/* Identifier syntax category of a character.  The values fit in 2 bits.  */
enum
{
  UC_IDENTIFIER_START = 0,    /* valid as first or subsequent character */
  UC_IDENTIFIER_VALID = 1,    /* valid as subsequent character only */
  UC_IDENTIFIER_INVALID = 2,  /* not valid */
  UC_IDENTIFIER_IGNORABLE = 3 /* ignorable (Java only) */
};
/* ISO C 99 section 6.4.(3).
   Return true if CH is one of the C white-space characters: space,
   horizontal tab, new-line ('\n' or '\r'), vertical tab, form-feed.  */
static bool
is_c_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':   /* space */
    case '\t':  /* horizontal tab */
    case '\n':  /* new-line */
    case '\r':  /* new-line */
    case '\v':  /* vertical tab */
    case '\f':  /* form-feed */
      return true;
    default:
      return false;
    }
}
5662 /* ISO C 99 section 6.4.2.1 and appendix D. */
5663 static int
5664 c_ident_category (unsigned int ch)
5666 /* Section 6.4.2.1. */
5667 if (ch >= '0' && ch <= '9')
5668 return UC_IDENTIFIER_VALID;
5669 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_')
5670 return UC_IDENTIFIER_START;
5671 /* Appendix D. */
5672 if (0
5673 /* Latin */
5674 || (ch == 0x00AA)
5675 || (ch == 0x00BA)
5676 || (ch >= 0x00C0 && ch <= 0x00D6)
5677 || (ch >= 0x00D8 && ch <= 0x00F6)
5678 || (ch >= 0x00F8 && ch <= 0x01F5)
5679 || (ch >= 0x01FA && ch <= 0x0217)
5680 || (ch >= 0x0250 && ch <= 0x02A8)
5681 || (ch >= 0x1E00 && ch <= 0x1E9B)
5682 || (ch >= 0x1EA0 && ch <= 0x1EF9)
5683 || (ch == 0x207F)
5684 /* Greek */
5685 || (ch == 0x0386)
5686 || (ch >= 0x0388 && ch <= 0x038A)
5687 || (ch == 0x038C)
5688 || (ch >= 0x038E && ch <= 0x03A1)
5689 || (ch >= 0x03A3 && ch <= 0x03CE)
5690 || (ch >= 0x03D0 && ch <= 0x03D6)
5691 || (ch == 0x03DA)
5692 || (ch == 0x03DC)
5693 || (ch == 0x03DE)
5694 || (ch == 0x03E0)
5695 || (ch >= 0x03E2 && ch <= 0x03F3)
5696 || (ch >= 0x1F00 && ch <= 0x1F15)
5697 || (ch >= 0x1F18 && ch <= 0x1F1D)
5698 || (ch >= 0x1F20 && ch <= 0x1F45)
5699 || (ch >= 0x1F48 && ch <= 0x1F4D)
5700 || (ch >= 0x1F50 && ch <= 0x1F57)
5701 || (ch == 0x1F59)
5702 || (ch == 0x1F5B)
5703 || (ch == 0x1F5D)
5704 || (ch >= 0x1F5F && ch <= 0x1F7D)
5705 || (ch >= 0x1F80 && ch <= 0x1FB4)
5706 || (ch >= 0x1FB6 && ch <= 0x1FBC)
5707 || (ch >= 0x1FC2 && ch <= 0x1FC4)
5708 || (ch >= 0x1FC6 && ch <= 0x1FCC)
5709 || (ch >= 0x1FD0 && ch <= 0x1FD3)
5710 || (ch >= 0x1FD6 && ch <= 0x1FDB)
5711 || (ch >= 0x1FE0 && ch <= 0x1FEC)
5712 || (ch >= 0x1FF2 && ch <= 0x1FF4)
5713 || (ch >= 0x1FF6 && ch <= 0x1FFC)
5714 /* Cyrillic */
5715 || (ch >= 0x0401 && ch <= 0x040C)
5716 || (ch >= 0x040E && ch <= 0x044F)
5717 || (ch >= 0x0451 && ch <= 0x045C)
5718 || (ch >= 0x045E && ch <= 0x0481)
5719 || (ch >= 0x0490 && ch <= 0x04C4)
5720 || (ch >= 0x04C7 && ch <= 0x04C8)
5721 || (ch >= 0x04CB && ch <= 0x04CC)
5722 || (ch >= 0x04D0 && ch <= 0x04EB)
5723 || (ch >= 0x04EE && ch <= 0x04F5)
5724 || (ch >= 0x04F8 && ch <= 0x04F9)
5725 /* Armenian */
5726 || (ch >= 0x0531 && ch <= 0x0556)
5727 || (ch >= 0x0561 && ch <= 0x0587)
5728 /* Hebrew */
5729 || (ch >= 0x05B0 && ch <= 0x05B9)
5730 || (ch >= 0x05BB && ch <= 0x05BD)
5731 || (ch == 0x05BF)
5732 || (ch >= 0x05C1 && ch <= 0x05C2)
5733 || (ch >= 0x05D0 && ch <= 0x05EA)
5734 || (ch >= 0x05F0 && ch <= 0x05F2)
5735 /* Arabic */
5736 || (ch >= 0x0621 && ch <= 0x063A)
5737 || (ch >= 0x0640 && ch <= 0x0652)
5738 || (ch >= 0x0670 && ch <= 0x06B7)
5739 || (ch >= 0x06BA && ch <= 0x06BE)
5740 || (ch >= 0x06C0 && ch <= 0x06CE)
5741 || (ch >= 0x06D0 && ch <= 0x06DC)
5742 || (ch >= 0x06E5 && ch <= 0x06E8)
5743 || (ch >= 0x06EA && ch <= 0x06ED)
5744 /* Devanagari */
5745 || (ch >= 0x0901 && ch <= 0x0903)
5746 || (ch >= 0x0905 && ch <= 0x0939)
5747 || (ch >= 0x093E && ch <= 0x094D)
5748 || (ch >= 0x0950 && ch <= 0x0952)
5749 || (ch >= 0x0958 && ch <= 0x0963)
5750 /* Bengali */
5751 || (ch >= 0x0981 && ch <= 0x0983)
5752 || (ch >= 0x0985 && ch <= 0x098C)
5753 || (ch >= 0x098F && ch <= 0x0990)
5754 || (ch >= 0x0993 && ch <= 0x09A8)
5755 || (ch >= 0x09AA && ch <= 0x09B0)
5756 || (ch == 0x09B2)
5757 || (ch >= 0x09B6 && ch <= 0x09B9)
5758 || (ch >= 0x09BE && ch <= 0x09C4)
5759 || (ch >= 0x09C7 && ch <= 0x09C8)
5760 || (ch >= 0x09CB && ch <= 0x09CD)
5761 || (ch >= 0x09DC && ch <= 0x09DD)
5762 || (ch >= 0x09DF && ch <= 0x09E3)
5763 || (ch >= 0x09F0 && ch <= 0x09F1)
5764 /* Gurmukhi */
5765 || (ch == 0x0A02)
5766 || (ch >= 0x0A05 && ch <= 0x0A0A)
5767 || (ch >= 0x0A0F && ch <= 0x0A10)
5768 || (ch >= 0x0A13 && ch <= 0x0A28)
5769 || (ch >= 0x0A2A && ch <= 0x0A30)
5770 || (ch >= 0x0A32 && ch <= 0x0A33)
5771 || (ch >= 0x0A35 && ch <= 0x0A36)
5772 || (ch >= 0x0A38 && ch <= 0x0A39)
5773 || (ch >= 0x0A3E && ch <= 0x0A42)
5774 || (ch >= 0x0A47 && ch <= 0x0A48)
5775 || (ch >= 0x0A4B && ch <= 0x0A4D)
5776 || (ch >= 0x0A59 && ch <= 0x0A5C)
5777 || (ch == 0x0A5E)
5778 || (ch == 0x0A74)
5779 /* Gujarati */
5780 || (ch >= 0x0A81 && ch <= 0x0A83)
5781 || (ch >= 0x0A85 && ch <= 0x0A8B)
5782 || (ch == 0x0A8D)
5783 || (ch >= 0x0A8F && ch <= 0x0A91)
5784 || (ch >= 0x0A93 && ch <= 0x0AA8)
5785 || (ch >= 0x0AAA && ch <= 0x0AB0)
5786 || (ch >= 0x0AB2 && ch <= 0x0AB3)
5787 || (ch >= 0x0AB5 && ch <= 0x0AB9)
5788 || (ch >= 0x0ABD && ch <= 0x0AC5)
5789 || (ch >= 0x0AC7 && ch <= 0x0AC9)
5790 || (ch >= 0x0ACB && ch <= 0x0ACD)
5791 || (ch == 0x0AD0)
5792 || (ch == 0x0AE0)
5793 /* Oriya */
5794 || (ch >= 0x0B01 && ch <= 0x0B03)
5795 || (ch >= 0x0B05 && ch <= 0x0B0C)
5796 || (ch >= 0x0B0F && ch <= 0x0B10)
5797 || (ch >= 0x0B13 && ch <= 0x0B28)
5798 || (ch >= 0x0B2A && ch <= 0x0B30)
5799 || (ch >= 0x0B32 && ch <= 0x0B33)
5800 || (ch >= 0x0B36 && ch <= 0x0B39)
5801 || (ch >= 0x0B3E && ch <= 0x0B43)
5802 || (ch >= 0x0B47 && ch <= 0x0B48)
5803 || (ch >= 0x0B4B && ch <= 0x0B4D)
5804 || (ch >= 0x0B5C && ch <= 0x0B5D)
5805 || (ch >= 0x0B5F && ch <= 0x0B61)
5806 /* Tamil */
5807 || (ch >= 0x0B82 && ch <= 0x0B83)
5808 || (ch >= 0x0B85 && ch <= 0x0B8A)
5809 || (ch >= 0x0B8E && ch <= 0x0B90)
5810 || (ch >= 0x0B92 && ch <= 0x0B95)
5811 || (ch >= 0x0B99 && ch <= 0x0B9A)
5812 || (ch == 0x0B9C)
5813 || (ch >= 0x0B9E && ch <= 0x0B9F)
5814 || (ch >= 0x0BA3 && ch <= 0x0BA4)
5815 || (ch >= 0x0BA8 && ch <= 0x0BAA)
5816 || (ch >= 0x0BAE && ch <= 0x0BB5)
5817 || (ch >= 0x0BB7 && ch <= 0x0BB9)
5818 || (ch >= 0x0BBE && ch <= 0x0BC2)
5819 || (ch >= 0x0BC6 && ch <= 0x0BC8)
5820 || (ch >= 0x0BCA && ch <= 0x0BCD)
5821 /* Telugu */
5822 || (ch >= 0x0C01 && ch <= 0x0C03)
5823 || (ch >= 0x0C05 && ch <= 0x0C0C)
5824 || (ch >= 0x0C0E && ch <= 0x0C10)
5825 || (ch >= 0x0C12 && ch <= 0x0C28)
5826 || (ch >= 0x0C2A && ch <= 0x0C33)
5827 || (ch >= 0x0C35 && ch <= 0x0C39)
5828 || (ch >= 0x0C3E && ch <= 0x0C44)
5829 || (ch >= 0x0C46 && ch <= 0x0C48)
5830 || (ch >= 0x0C4A && ch <= 0x0C4D)
5831 || (ch >= 0x0C60 && ch <= 0x0C61)
5832 /* Kannada */
5833 || (ch >= 0x0C82 && ch <= 0x0C83)
5834 || (ch >= 0x0C85 && ch <= 0x0C8C)
5835 || (ch >= 0x0C8E && ch <= 0x0C90)
5836 || (ch >= 0x0C92 && ch <= 0x0CA8)
5837 || (ch >= 0x0CAA && ch <= 0x0CB3)
5838 || (ch >= 0x0CB5 && ch <= 0x0CB9)
5839 || (ch >= 0x0CBE && ch <= 0x0CC4)
5840 || (ch >= 0x0CC6 && ch <= 0x0CC8)
5841 || (ch >= 0x0CCA && ch <= 0x0CCD)
5842 || (ch == 0x0CDE)
5843 || (ch >= 0x0CE0 && ch <= 0x0CE1)
5844 /* Malayalam */
5845 || (ch >= 0x0D02 && ch <= 0x0D03)
5846 || (ch >= 0x0D05 && ch <= 0x0D0C)
5847 || (ch >= 0x0D0E && ch <= 0x0D10)
5848 || (ch >= 0x0D12 && ch <= 0x0D28)
5849 || (ch >= 0x0D2A && ch <= 0x0D39)
5850 || (ch >= 0x0D3E && ch <= 0x0D43)
5851 || (ch >= 0x0D46 && ch <= 0x0D48)
5852 || (ch >= 0x0D4A && ch <= 0x0D4D)
5853 || (ch >= 0x0D60 && ch <= 0x0D61)
5854 /* Thai */
5855 || (ch >= 0x0E01 && ch <= 0x0E3A)
5856 || (ch >= 0x0E40 && ch <= 0x0E5B)
5857 /* Lao */
5858 || (ch >= 0x0E81 && ch <= 0x0E82)
5859 || (ch == 0x0E84)
5860 || (ch >= 0x0E87 && ch <= 0x0E88)
5861 || (ch == 0x0E8A)
5862 || (ch == 0x0E8D)
5863 || (ch >= 0x0E94 && ch <= 0x0E97)
5864 || (ch >= 0x0E99 && ch <= 0x0E9F)
5865 || (ch >= 0x0EA1 && ch <= 0x0EA3)
5866 || (ch == 0x0EA5)
5867 || (ch == 0x0EA7)
5868 || (ch >= 0x0EAA && ch <= 0x0EAB)
5869 || (ch >= 0x0EAD && ch <= 0x0EAE)
5870 || (ch >= 0x0EB0 && ch <= 0x0EB9)
5871 || (ch >= 0x0EBB && ch <= 0x0EBD)
5872 || (ch >= 0x0EC0 && ch <= 0x0EC4)
5873 || (ch == 0x0EC6)
5874 || (ch >= 0x0EC8 && ch <= 0x0ECD)
5875 || (ch >= 0x0EDC && ch <= 0x0EDD)
5876 /* Tibetan */
5877 || (ch == 0x0F00)
5878 || (ch >= 0x0F18 && ch <= 0x0F19)
5879 || (ch == 0x0F35)
5880 || (ch == 0x0F37)
5881 || (ch == 0x0F39)
5882 || (ch >= 0x0F3E && ch <= 0x0F47)
5883 || (ch >= 0x0F49 && ch <= 0x0F69)
5884 || (ch >= 0x0F71 && ch <= 0x0F84)
5885 || (ch >= 0x0F86 && ch <= 0x0F8B)
5886 || (ch >= 0x0F90 && ch <= 0x0F95)
5887 || (ch == 0x0F97)
5888 || (ch >= 0x0F99 && ch <= 0x0FAD)
5889 || (ch >= 0x0FB1 && ch <= 0x0FB7)
5890 || (ch == 0x0FB9)
5891 /* Georgian */
5892 || (ch >= 0x10A0 && ch <= 0x10C5)
5893 || (ch >= 0x10D0 && ch <= 0x10F6)
5894 /* Hiragana */
5895 || (ch >= 0x3041 && ch <= 0x3093)
5896 || (ch >= 0x309B && ch <= 0x309C)
5897 /* Katakana */
5898 || (ch >= 0x30A1 && ch <= 0x30F6)
5899 || (ch >= 0x30FB && ch <= 0x30FC)
5900 /* Bopomofo */
5901 || (ch >= 0x3105 && ch <= 0x312C)
5902 /* CJK Unified Ideographs */
5903 || (ch >= 0x4E00 && ch <= 0x9FA5)
5904 /* Hangul */
5905 || (ch >= 0xAC00 && ch <= 0xD7A3)
5906 /* Digits */
5907 || (ch >= 0x0660 && ch <= 0x0669)
5908 || (ch >= 0x06F0 && ch <= 0x06F9)
5909 || (ch >= 0x0966 && ch <= 0x096F)
5910 || (ch >= 0x09E6 && ch <= 0x09EF)
5911 || (ch >= 0x0A66 && ch <= 0x0A6F)
5912 || (ch >= 0x0AE6 && ch <= 0x0AEF)
5913 || (ch >= 0x0B66 && ch <= 0x0B6F)
5914 || (ch >= 0x0BE7 && ch <= 0x0BEF)
5915 || (ch >= 0x0C66 && ch <= 0x0C6F)
5916 || (ch >= 0x0CE6 && ch <= 0x0CEF)
5917 || (ch >= 0x0D66 && ch <= 0x0D6F)
5918 || (ch >= 0x0E50 && ch <= 0x0E59)
5919 || (ch >= 0x0ED0 && ch <= 0x0ED9)
5920 || (ch >= 0x0F20 && ch <= 0x0F33)
5921 /* Special characters */
5922 || (ch == 0x00B5)
5923 || (ch == 0x00B7)
5924 || (ch >= 0x02B0 && ch <= 0x02B8)
5925 || (ch == 0x02BB)
5926 || (ch >= 0x02BD && ch <= 0x02C1)
5927 || (ch >= 0x02D0 && ch <= 0x02D1)
5928 || (ch >= 0x02E0 && ch <= 0x02E4)
5929 || (ch == 0x037A)
5930 || (ch == 0x0559)
5931 || (ch == 0x093D)
5932 || (ch == 0x0B3D)
5933 || (ch == 0x1FBE)
5934 || (ch >= 0x203F && ch <= 0x2040)
5935 || (ch == 0x2102)
5936 || (ch == 0x2107)
5937 || (ch >= 0x210A && ch <= 0x2113)
5938 || (ch == 0x2115)
5939 || (ch >= 0x2118 && ch <= 0x211D)
5940 || (ch == 0x2124)
5941 || (ch == 0x2126)
5942 || (ch == 0x2128)
5943 || (ch >= 0x212A && ch <= 0x2131)
5944 || (ch >= 0x2133 && ch <= 0x2138)
5945 || (ch >= 0x2160 && ch <= 0x2182)
5946 || (ch >= 0x3005 && ch <= 0x3007)
5947 || (ch >= 0x3021 && ch <= 0x3029)
5949 return UC_IDENTIFIER_START;
5950 return UC_IDENTIFIER_INVALID;
/* The Java Language Specification, 3rd edition, §3.6.
   https://docs.oracle.com/javase/specs/jls/se6/html/lexical.html#3.6
   Return true if CH is space, horizontal tab, form feed, line feed or
   carriage return.  */
static bool
is_java_whitespace (unsigned int ch)
{
  switch (ch)
    {
    case ' ':
    case '\t':
    case '\f':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
    }
}
5962 /* The Java Language Specification, 3rd edition, §3.8.
5963 https://docs.oracle.com/javase/specs/jls/se6/html/lexical.html#3.8
5964 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */
5965 static int
5966 java_ident_category (unsigned int ch)
5968 /* FIXME: Check this against Sun's JDK implementation. */
5969 if (is_category_L (ch) /* = Character.isLetter(ch) */
5970 || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */
5971 || is_category_Sc (ch) /* currency symbol */
5972 || is_category_Pc (ch) /* connector punctuation */
5974 return UC_IDENTIFIER_START;
5975 if (is_category_Nd (ch) /* digit */
5976 || is_category_Mc (ch) /* combining mark */
5977 || is_category_Mn (ch) /* non-spacing mark */
5979 return UC_IDENTIFIER_VALID;
5980 if ((ch >= 0x0000 && ch <= 0x0008)
5981 || (ch >= 0x000E && ch <= 0x001B)
5982 || (ch >= 0x007F && ch <= 0x009F)
5983 || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */
5985 return UC_IDENTIFIER_IGNORABLE;
5986 return UC_IDENTIFIER_INVALID;
5989 /* Construction of sparse 3-level tables. */
5990 #define TABLE identsyntax_table
5991 #define ELEMENT uint8_t
5992 #define DEFAULT UC_IDENTIFIER_INVALID
5993 #define xmalloc malloc
5994 #define xrealloc realloc
5995 #include "3level.h"
5997 /* Output an identifier syntax categorization in a three-level bitmap. */
5998 static void
5999 output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version)
6001 FILE *stream;
6002 unsigned int ch, i;
6003 struct identsyntax_table t;
6004 unsigned int level1_offset, level2_offset, level3_offset;
6006 stream = fopen (filename, "w");
6007 if (stream == NULL)
6009 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6010 exit (1);
6013 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
6014 fprintf (stream, "/* Language syntax properties of Unicode characters. */\n");
6015 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
6016 version);
6017 fprintf (stream, "\n");
6019 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
6020 fprintf (stream, "\n");
6021 output_library_license (stream, false);
6022 fprintf (stream, "\n");
6024 t.p = 7; /* or 8 */
6025 t.q = 5; /* or 4 */
6026 identsyntax_table_init (&t);
6028 for (ch = 0; ch < 0x110000; ch++)
6030 int syntaxcode = predicate (ch);
6032 assert (syntaxcode <= 0x03);
6034 if (syntaxcode != UC_IDENTIFIER_INVALID)
6035 identsyntax_table_add (&t, ch, syntaxcode);
6038 identsyntax_table_finalize (&t);
6040 /* Offsets in t.result, in memory of this process. */
6041 level1_offset =
6042 5 * sizeof (uint32_t);
6043 level2_offset =
6044 5 * sizeof (uint32_t)
6045 + t.level1_size * sizeof (uint32_t);
6046 level3_offset =
6047 5 * sizeof (uint32_t)
6048 + t.level1_size * sizeof (uint32_t)
6049 + (t.level2_size << t.q) * sizeof (uint32_t);
6051 for (i = 0; i < 5; i++)
6052 fprintf (stream, "#define identsyntax_header_%d %d\n", i,
6053 ((uint32_t *) t.result)[i]);
6054 fprintf (stream, "static const\n");
6055 fprintf (stream, "struct\n");
6056 fprintf (stream, " {\n");
6057 fprintf (stream, " int level1[%zu];\n", t.level1_size);
6058 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
6059 fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size,
6060 (1 << t.p) * 2 / 16);
6061 fprintf (stream, " }\n");
6062 fprintf (stream, "%s =\n", name);
6063 fprintf (stream, "{\n");
6064 fprintf (stream, " {");
6065 if (t.level1_size > 8)
6066 fprintf (stream, "\n ");
6067 for (i = 0; i < t.level1_size; i++)
6069 uint32_t offset;
6070 if (i > 0 && (i % 8) == 0)
6071 fprintf (stream, "\n ");
6072 offset = ((uint32_t *) (t.result + level1_offset))[i];
6073 if (offset == 0)
6074 fprintf (stream, " %5d", -1);
6075 else
6076 fprintf (stream, " %5zu",
6077 (offset - level2_offset) / sizeof (uint32_t));
6078 if (i+1 < t.level1_size)
6079 fprintf (stream, ",");
6081 if (t.level1_size > 8)
6082 fprintf (stream, "\n ");
6083 fprintf (stream, " },\n");
6084 fprintf (stream, " {");
6085 if (t.level2_size << t.q > 8)
6086 fprintf (stream, "\n ");
6087 for (i = 0; i < t.level2_size << t.q; i++)
6089 uint32_t offset;
6090 if (i > 0 && (i % 8) == 0)
6091 fprintf (stream, "\n ");
6092 offset = ((uint32_t *) (t.result + level2_offset))[i];
6093 if (offset == 0)
6094 fprintf (stream, " %5d", -1);
6095 else
6096 fprintf (stream, " %5zu",
6097 (offset - level3_offset) / sizeof (uint8_t));
6098 if (i+1 < t.level2_size << t.q)
6099 fprintf (stream, ",");
6101 if (t.level2_size << t.q > 8)
6102 fprintf (stream, "\n ");
6103 fprintf (stream, " },\n");
6104 /* Pack the level3 array. Each entry needs 2 bits only. */
6105 fprintf (stream, " {");
6106 if ((t.level3_size << t.p) * 2 / 16 > 8)
6107 fprintf (stream, "\n ");
6108 for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++)
6110 if (i > 0 && (i % 8) == 0)
6111 fprintf (stream, "\n ");
6112 fprintf (stream, " 0x%04x",
6113 (((uint8_t *) (t.result + level3_offset))[8 * i] << 0)
6114 | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2)
6115 | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4)
6116 | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6)
6117 | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8)
6118 | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10)
6119 | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12)
6120 | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14));
6121 if (i+1 < (t.level3_size << t.p) * 2 / 16)
6122 fprintf (stream, ",");
6124 if ((t.level3_size << t.p) * 2 / 16 > 8)
6125 fprintf (stream, "\n ");
6126 fprintf (stream, " }\n");
6127 fprintf (stream, "};\n");
6129 if (ferror (stream) || fclose (stream))
6131 fprintf (stderr, "error writing to '%s'\n", filename);
6132 exit (1);
6136 static void
6137 output_ident_properties (const char *version)
6139 #define PROPERTY(P) \
6140 debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \
6141 output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
6142 output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version);
6143 PROPERTY(c_whitespace)
6144 PROPERTY(java_whitespace)
6145 #undef PROPERTY
6147 output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version);
6148 output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version);
6151 /* ========================================================================= */
/* Like ISO C <ctype.h> and <wctype.h>.  Compatible with glibc's
   glibc/localedata/locales/i18n file, generated by
   glibc/localedata/gen-unicode-ctype.c.  */
6157 /* Character mappings. */
6159 static unsigned int
6160 to_upper (unsigned int ch)
6162 if (unicode_attributes[ch].name != NULL
6163 && unicode_attributes[ch].upper != NONE)
6164 return unicode_attributes[ch].upper;
6165 else
6166 return ch;
6169 static unsigned int
6170 to_lower (unsigned int ch)
6172 if (unicode_attributes[ch].name != NULL
6173 && unicode_attributes[ch].lower != NONE)
6174 return unicode_attributes[ch].lower;
6175 else
6176 return ch;
6179 static unsigned int
6180 to_title (unsigned int ch)
6182 if (unicode_attributes[ch].name != NULL
6183 && unicode_attributes[ch].title != NONE)
6184 return unicode_attributes[ch].title;
6185 else
6186 return ch;
6189 /* Character class properties. */
/* Return true if CH differs from its to_lower mapping.  */
static bool
is_upper (unsigned int ch)
{
  unsigned int lowered = to_lower (ch);

  return lowered != ch;
}
/* Return true if CH differs from its to_upper mapping, or if CH is
   U+00DF.  */
static bool
is_lower (unsigned int ch)
{
  if (to_upper (ch) != ch)
    return true;

  /* <U00DF> is lowercase, but without simple to_upper mapping.  */
  return ch == 0x00DF;
}
6205 static bool
6206 is_alpha (unsigned int ch)
6208 return (unicode_attributes[ch].name != NULL
6209 && ((unicode_attributes[ch].category[0] == 'L'
6210 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
6211 <U0E2F>, <U0E46> should belong to is_punct. */
6212 && (ch != 0x0E2F) && (ch != 0x0E46))
6213 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
6214 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
6215 || (ch == 0x0E31)
6216 || (ch >= 0x0E34 && ch <= 0x0E3A)
6217 || (ch >= 0x0E47 && ch <= 0x0E4E)
6218 /* Avoid warning for <U0345>. */
6219 || (ch == 0x0345)
6220 /* Avoid warnings for <U2160>..<U217F>. */
6221 || (unicode_attributes[ch].category[0] == 'N'
6222 && unicode_attributes[ch].category[1] == 'l')
6223 /* Avoid warnings for <U24B6>..<U24E9>. */
6224 || (unicode_attributes[ch].category[0] == 'S'
6225 && unicode_attributes[ch].category[1] == 'o'
6226 && strstr (unicode_attributes[ch].name, " LETTER ")
6227 != NULL)
6228 /* Consider all the non-ASCII digits as alphabetic.
6229 ISO C 99 forbids us to have them in category "digit",
6230 but we want iswalnum to return true on them. */
6231 || (unicode_attributes[ch].category[0] == 'N'
6232 && unicode_attributes[ch].category[1] == 'd'
6233 && !(ch >= 0x0030 && ch <= 0x0039))));
/* Return true if CH is one of the ten ASCII decimal digits.  */
static bool
is_digit (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'N'
          && unicode_attributes[ch].category[1] == 'd');
  /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
     a zero.  Must add <0> in front of them by hand.  */
#else
  /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
     takes it away:
     7.25.2.1.5:
        The iswdigit function tests for any wide character that corresponds
        to a decimal-digit character (as defined in 5.2.1).
     5.2.1:
        the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
   */
  if (ch < 0x0030)
    return false;
  return ch <= 0x0039;
#endif
}
/* Return true if CH satisfies is_alpha or is_digit.  */
static bool
is_alnum (unsigned int ch)
{
  if (is_alpha (ch))
    return true;

  return is_digit (ch);
}
6264 static bool
6265 is_blank (unsigned int ch)
6267 return (ch == 0x0009 /* '\t' */
6268 /* Category Zs without mention of "<noBreak>" */
6269 || (unicode_attributes[ch].name != NULL
6270 && unicode_attributes[ch].category[0] == 'Z'
6271 && unicode_attributes[ch].category[1] == 's'
6272 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
6275 static bool
6276 is_space (unsigned int ch)
6278 /* Don't make U+00A0 a space. Non-breaking space means that all programs
6279 should treat it like a punctuation character, not like a space. */
6280 return (ch == 0x0020 /* ' ' */
6281 || ch == 0x000C /* '\f' */
6282 || ch == 0x000A /* '\n' */
6283 || ch == 0x000D /* '\r' */
6284 || ch == 0x0009 /* '\t' */
6285 || ch == 0x000B /* '\v' */
6286 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
6287 || (unicode_attributes[ch].name != NULL
6288 && unicode_attributes[ch].category[0] == 'Z'
6289 && (unicode_attributes[ch].category[1] == 'l'
6290 || unicode_attributes[ch].category[1] == 'p'
6291 || (unicode_attributes[ch].category[1] == 's'
6292 && !strstr (unicode_attributes[ch].decomposition,
6293 "<noBreak>")))));
6296 static bool
6297 is_cntrl (unsigned int ch)
6299 return (unicode_attributes[ch].name != NULL
6300 && (strcmp (unicode_attributes[ch].name, "<control>") == 0
6301 /* Categories Zl and Zp */
6302 || (unicode_attributes[ch].category[0] == 'Z'
6303 && (unicode_attributes[ch].category[1] == 'l'
6304 || unicode_attributes[ch].category[1] == 'p'))));
/* Return true if CH is an ASCII hexadecimal digit: 0-9, A-F or a-f.  */
static bool
is_xdigit (unsigned int ch)
{
#if 0
  return is_digit (ch)
         || (ch >= 0x0041 && ch <= 0x0046)
         || (ch >= 0x0061 && ch <= 0x0066);
#else
  /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     takes it away:
     7.25.2.1.12:
        The iswxdigit function tests for any wide character that corresponds
        to a hexadecimal-digit character (as defined in 6.4.4.1).
     6.4.4.1:
        hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
   */
  if (ch >= 0x0030 && ch <= 0x0039) /* 0..9 */
    return true;
  if (ch >= 0x0041 && ch <= 0x0046) /* A..F */
    return true;
  if (ch >= 0x0061 && ch <= 0x0066) /* a..f */
    return true;
  return false;
#endif
}
6329 static bool
6330 is_graph (unsigned int ch)
6332 return (unicode_attributes[ch].name != NULL
6333 && strcmp (unicode_attributes[ch].name, "<control>")
6334 && !is_space (ch));
6337 static bool
6338 is_print (unsigned int ch)
6340 return (unicode_attributes[ch].name != NULL
6341 && strcmp (unicode_attributes[ch].name, "<control>")
6342 /* Categories Zl and Zp */
6343 && !(unicode_attributes[ch].name != NULL
6344 && unicode_attributes[ch].category[0] == 'Z'
6345 && (unicode_attributes[ch].category[1] == 'l'
6346 || unicode_attributes[ch].category[1] == 'p')));
/* Return true if CH satisfies is_graph but neither is_alpha nor
   is_digit.  */
static bool
is_punct (unsigned int ch)
{
#if 0
  return (unicode_attributes[ch].name != NULL
          && unicode_attributes[ch].category[0] == 'P');
#else
  /* The traditional POSIX definition of punctuation is every graphic,
     non-alphanumeric character.  */
  if (!is_graph (ch))
    return false;
  if (is_alpha (ch))
    return false;
  return !is_digit (ch);
#endif
}
6362 /* Output all properties. */
6363 static void
6364 output_old_ctype (const char *version)
6366 #define PROPERTY(P) \
6367 debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \
6368 output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \
6369 output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version);
6370 PROPERTY(alnum)
6371 PROPERTY(alpha)
6372 PROPERTY(cntrl)
6373 PROPERTY(digit)
6374 PROPERTY(graph)
6375 PROPERTY(lower)
6376 PROPERTY(print)
6377 PROPERTY(punct)
6378 PROPERTY(space)
6379 PROPERTY(upper)
6380 PROPERTY(xdigit)
6381 PROPERTY(blank)
6382 #undef PROPERTY
6385 #if 0
6387 static bool
6388 is_combining (unsigned int ch)
6390 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
6391 file. In 3.0.1 it was identical to the union of the general categories
6392 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
6393 PropList.txt file, so we take the latter definition. */
6394 return (unicode_attributes[ch].name != NULL
6395 && unicode_attributes[ch].category[0] == 'M'
6396 && (unicode_attributes[ch].category[1] == 'n'
6397 || unicode_attributes[ch].category[1] == 'c'
6398 || unicode_attributes[ch].category[1] == 'e'));
6401 static bool
6402 is_combining_level3 (unsigned int ch)
6404 return is_combining (ch)
6405 && !(unicode_attributes[ch].combining[0] != '\0'
6406 && unicode_attributes[ch].combining[0] != '0'
6407 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
/* Return the UCS symbol string for a Unicode character: "<Uxxxx>" with four
   hex digits below 0x10000, "<Uxxxxxxxx>" with eight hex digits otherwise.
   The result lives in a static buffer that the next call overwrites.  */
static const char *
ucs_symbol (unsigned int i)
{
  /* Longest result is "<U" + 8 hex digits + ">" = 11 characters.  */
  static char buf[11+1];

  if (i < 0x10000)
    sprintf (buf, "<U%04X>", i);
  else
    sprintf (buf, "<U%08X>", i);

  return buf;
}
/* Return the UCS symbol range string for a Unicode characters interval,
   i.e. the symbol of low, "..", and the symbol of high.
   The result lives in a static buffer that the next call overwrites.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  /* Two symbols of at most 11 characters each, plus "..".  */
  static char buf[24+1];
  size_t len;

  /* ucs_symbol() reuses one static buffer, so each result must be copied
     out before the next call.  */
  strcpy (buf, ucs_symbol (low));
  len = strlen (buf);
  buf[len++] = '.';
  buf[len++] = '.';
  strcpy (buf + len, ucs_symbol (high));

  return buf;
}
/* Output a character class (= property) table.
   Writes to STREAM a line starting with CLASSNAME, followed by the
   ';'-separated list of maximal intervals of code points for which FUNC
   returns true.  Lines are wrapped with a "/" continuation once they would
   exceed 75 columns.  */

static void
output_charclass (FILE *stream, const char *classname,
                  bool (*func) (unsigned int))
{
  /* One flag per code point.  This is 0x110000 bytes (more than 1 MiB), so
     it is static rather than automatic, to avoid overflowing the stack on
     platforms with a small default stack.  */
  static char table[0x110000];
  unsigned int i;
  bool need_semicolon;
  const int max_column = 75;
  int column;

  for (i = 0; i < 0x110000; i++)
    table[i] = (int) func (i);

  fprintf (stream, "%s ", classname);
  need_semicolon = false;
  /* Start beyond max_column, so that the first entry goes on a continuation
     line.  */
  column = 1000;
  for (i = 0; i < 0x110000; )
    {
      if (!table[i])
        i++;
      else
        {
          unsigned int low, high;
          char buf[25];
          int len;

          /* Collect the maximal interval [low, high] of members.  */
          low = i;
          do
            i++;
          while (i < 0x110000 && table[i]);
          high = i - 1;

          if (low == high)
            strcpy (buf, ucs_symbol (low));
          else
            strcpy (buf, ucs_symbol_range (low, high));
          len = (int) strlen (buf);

          if (need_semicolon)
            {
              fprintf (stream, ";");
              column++;
            }

          if (column + len > max_column)
            {
              /* The continuation indent is three columns wide, matching the
                 column counter set on the next line.  */
              fprintf (stream, "/\n   ");
              column = 3;
            }

          fprintf (stream, "%s", buf);
          column += len;
          need_semicolon = true;
        }
    }
  fprintf (stream, "\n");
}
/* Output a character mapping table.
   Writes to STREAM a line starting with MAPNAME, followed by the
   ';'-separated list of "(from,to)" pairs for every code point that FUNC
   does not map to itself.  Lines are wrapped with a "/" continuation once
   they would exceed 75 columns.  */

static void
output_charmap (FILE *stream, const char *mapname,
                unsigned int (*func) (unsigned int))
{
  unsigned int i;
  bool need_semicolon;
  const int max_column = 75;
  int column;

  fprintf (stream, "%s ", mapname);
  need_semicolon = false;
  /* Start beyond max_column, so that the first entry goes on a continuation
     line.  */
  column = 1000;
  /* A single pass suffices: FUNC is evaluated once per code point, and no
     0x110000-byte scratch table is needed on the stack.  */
  for (i = 0; i < 0x110000; i++)
    {
      unsigned int mapped = func (i);

      if (mapped != i)
        {
          /* "(" + symbol + "," + symbol + ")" with symbols of at most
             11 characters each.  */
          char buf[25+1];
          int len;

          strcpy (buf, "(");
          strcat (buf, ucs_symbol (i));
          strcat (buf, ",");
          strcat (buf, ucs_symbol (mapped));
          strcat (buf, ")");
          len = (int) strlen (buf);

          if (need_semicolon)
            {
              fprintf (stream, ";");
              column++;
            }

          if (column + len > max_column)
            {
              /* The continuation indent is three columns wide, matching the
                 column counter set on the next line.  */
              fprintf (stream, "/\n   ");
              column = 3;
            }

          fprintf (stream, "%s", buf);
          column += len;
          need_semicolon = true;
        }
    }
  fprintf (stream, "\n");
}
/* Output the width table.
   Currently a stub: nothing is written to STREAM.  */

static void
output_widthmap (FILE *stream)
{
  /* Intentionally empty.  */
}
6545 /* Output the tables to the given file. */
6547 static void
6548 output_tables (const char *filename, const char *version)
6550 FILE *stream;
6551 unsigned int ch;
6553 stream = fopen (filename, "w");
6554 if (stream == NULL)
6556 fprintf (stderr, "cannot open '%s' for writing\n", filename);
6557 exit (1);
6560 fprintf (stream, "escape_char /\n");
6561 fprintf (stream, "comment_char %%\n");
6562 fprintf (stream, "\n");
6563 fprintf (stream, "%% Generated automatically by gen-uni-tables.c for Unicode %s.\n",
6564 version);
6565 fprintf (stream, "\n");
6567 fprintf (stream, "LC_IDENTIFICATION\n");
6568 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
6569 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
6570 fprintf (stream, "address \"\"\n");
6571 fprintf (stream, "contact \"\"\n");
6572 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
6573 fprintf (stream, "tel \"\"\n");
6574 fprintf (stream, "fax \"\"\n");
6575 fprintf (stream, "language \"\"\n");
6576 fprintf (stream, "territory \"Earth\"\n");
6577 fprintf (stream, "revision \"%s\"\n", version);
6579 time_t now;
6580 char date[11];
6581 now = time (NULL);
6582 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
6583 fprintf (stream, "date \"%s\"\n", date);
6585 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
6586 fprintf (stream, "END LC_IDENTIFICATION\n");
6587 fprintf (stream, "\n");
6589 /* Verification. */
6590 for (ch = 0; ch < 0x110000; ch++)
6592 /* toupper restriction: "Only characters specified for the keywords
6593 lower and upper shall be specified. */
6594 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
6595 fprintf (stderr,
6596 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
6597 ucs_symbol (ch), ch, to_upper (ch));
6599 /* tolower restriction: "Only characters specified for the keywords
6600 lower and upper shall be specified. */
6601 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
6602 fprintf (stderr,
6603 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
6604 ucs_symbol (ch), ch, to_lower (ch));
6606 /* alpha restriction: "Characters classified as either upper or lower
6607 shall automatically belong to this class. */
6608 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
6609 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
6611 /* alpha restriction: "No character specified for the keywords cntrl,
6612 digit, punct or space shall be specified." */
6613 if (is_alpha (ch) && is_cntrl (ch))
6614 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
6615 if (is_alpha (ch) && is_digit (ch))
6616 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
6617 if (is_alpha (ch) && is_punct (ch))
6618 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
6619 if (is_alpha (ch) && is_space (ch))
6620 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
6622 /* space restriction: "No character specified for the keywords upper,
6623 lower, alpha, digit, graph or xdigit shall be specified."
6624 upper, lower, alpha already checked above. */
6625 if (is_space (ch) && is_digit (ch))
6626 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
6627 if (is_space (ch) && is_graph (ch))
6628 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
6629 if (is_space (ch) && is_xdigit (ch))
6630 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
6632 /* cntrl restriction: "No character specified for the keywords upper,
6633 lower, alpha, digit, punct, graph, print or xdigit shall be
6634 specified." upper, lower, alpha already checked above. */
6635 if (is_cntrl (ch) && is_digit (ch))
6636 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
6637 if (is_cntrl (ch) && is_punct (ch))
6638 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
6639 if (is_cntrl (ch) && is_graph (ch))
6640 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
6641 if (is_cntrl (ch) && is_print (ch))
6642 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
6643 if (is_cntrl (ch) && is_xdigit (ch))
6644 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
6646 /* punct restriction: "No character specified for the keywords upper,
6647 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
6648 be specified." upper, lower, alpha, cntrl already checked above. */
6649 if (is_punct (ch) && is_digit (ch))
6650 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
6651 if (is_punct (ch) && is_xdigit (ch))
6652 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
6653 if (is_punct (ch) && (ch == 0x0020))
6654 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
6656 /* graph restriction: "No character specified for the keyword cntrl
6657 shall be specified." Already checked above. */
6659 /* print restriction: "No character specified for the keyword cntrl
6660 shall be specified." Already checked above. */
6662 /* graph - print relation: differ only in the <space> character.
6663 How is this possible if there are more than one space character?!
6664 I think susv2/xbd/locale.html should speak of "space characters",
6665 not "space character". */
6666 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
6667 fprintf (stderr,
6668 "%s is print but not graph|<space>\n", ucs_symbol (ch));
6669 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
6670 fprintf (stderr,
6671 "%s is graph|<space> but not print\n", ucs_symbol (ch));
6674 fprintf (stream, "LC_CTYPE\n");
6675 output_charclass (stream, "upper", is_upper);
6676 output_charclass (stream, "lower", is_lower);
6677 output_charclass (stream, "alpha", is_alpha);
6678 output_charclass (stream, "digit", is_digit);
6679 output_charclass (stream, "outdigit", is_outdigit);
6680 output_charclass (stream, "blank", is_blank);
6681 output_charclass (stream, "space", is_space);
6682 output_charclass (stream, "cntrl", is_cntrl);
6683 output_charclass (stream, "punct", is_punct);
6684 output_charclass (stream, "xdigit", is_xdigit);
6685 output_charclass (stream, "graph", is_graph);
6686 output_charclass (stream, "print", is_print);
6687 output_charclass (stream, "class \"combining\";", is_combining);
6688 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
6689 output_charmap (stream, "toupper", to_upper);
6690 output_charmap (stream, "tolower", to_lower);
6691 output_charmap (stream, "map \"totitle\";", to_title);
6692 output_widthmap (stream);
6693 fprintf (stream, "END LC_CTYPE\n");
6695 if (ferror (stream) || fclose (stream))
6697 fprintf (stderr, "error writing to '%s'\n", filename);
6698 exit (1);
6702 #endif
6704 /* ========================================================================= */
6706 /* The width property from the EastAsianWidth.txt file.
6707 Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */
6708 const char * unicode_width[0x110000];
6710 /* Stores in unicode_width[] the width property from the EastAsianWidth.txt
6711 file. */
6712 static void
6713 fill_width (const char *width_filename)
6715 unsigned int i, j;
6716 FILE *stream;
6717 char field0[FIELDLEN];
6718 char field1[FIELDLEN];
6719 char field2[FIELDLEN];
6720 int lineno = 0;
6722 for (i = 0; i < 0x110000; i++)
6723 unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
6725 stream = fopen (width_filename, "r");
6726 if (stream == NULL)
6728 fprintf (stderr, "error during fopen of '%s'\n", width_filename);
6729 exit (1);
6732 for (;;)
6734 int n;
6735 int c;
6737 lineno++;
6738 c = getc (stream);
6739 if (c == EOF)
6740 break;
6741 if (c == '\n')
6742 continue;
6743 if (c == '#')
6745 do c = getc (stream); while (c != EOF && c != '\n');
6746 continue;
6748 ungetc (c, stream);
6749 n = getfield (stream, field0, ';');
6750 do c = getc (stream); while (c == ' ');
6751 ungetc (c, stream);
6752 n += getfield (stream, field1, '#');
6753 n += getfield (stream, field2, '\n');
6754 if (n == 0)
6755 break;
6756 if (n != 3)
6758 fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
6759 exit (1);
6761 /* Remove trailing spaces from field0. */
6762 while (strlen (field0) > 0 && field0[strlen (field0) - 1] == ' ')
6763 field0[strlen (field0) - 1] = '\0';
6764 /* Remove trailing spaces from field1. */
6765 while (strlen (field1) > 0 && field1[strlen (field1) - 1] == ' ')
6766 field1[strlen (field1) - 1] = '\0';
6767 i = strtoul (field0, NULL, 16);
6768 if (strstr (field0, "..") != NULL)
6770 /* Deal with a range. */
6771 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
6772 for (; i <= j; i++)
6773 unicode_width[i] = strdup (field1);
6775 else
6777 /* Single character line. */
6778 unicode_width[i] = strdup (field1);
6782 if (ferror (stream) || fclose (stream))
6784 fprintf (stderr, "error reading from '%s'\n", width_filename);
6785 exit (1);
6789 /* ========================================================================= */
6791 /* Non-spacing attribute and width. */
6793 /* The non-spacing attribute table consists of:
6794 * Non-spacing characters; generated from PropList.txt or
6795 "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
6796 * Format control characters, except for characters with property
6797 Prepended_Concatenation_Mark; generated from
6798 "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt" and from
6799 "grep Prepended_Concatenation_Mark PropList.txt".
6800 Rationale for the Prepended_Concatenation_Mark exception:
6801 The Unicode standard says "Unlike most other format characters,
6802 however, they should be rendered with a visible glyph".
6803 * Zero width characters; generated from
6804 "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
6805 * Hangul Jamo characters that have conjoining behaviour:
6806 - jungseong = syllable-middle vowels
6807 - jongseong = syllable-final consonants
6808 Rationale:
6809 1) These characters act like combining characters. They have no
6810 equivalent in legacy character sets. Therefore the EastAsianWidth.txt
6811 file does not really matter for them; UAX #11 East Asian Width
6812 <https://www.unicode.org/reports/tr11/> makes it clear that it focus
6813 is on compatibility with traditional Japanese layout.
6814 By contrast, the same glyphs without conjoining behaviour are available
6815 in the U+3130..U+318F block, and these characters are mapped to legacy
6816 character sets, and traditional Japanese layout matters for them.
6817 2) glibc does the same thing, see
6818 <https://sourceware.org/bugzilla/show_bug.cgi?id=21750>
6819 <https://sourceware.org/bugzilla/show_bug.cgi?id=26120>
6822 static bool
6823 is_nonspacing (unsigned int ch)
6825 return (unicode_attributes[ch].name != NULL
6826 && (get_bidi_category (ch) == UC_BIDI_NSM
6827 || is_category_Cc (ch)
6828 || (is_category_Cf (ch)
6829 && !is_property_prepended_concatenation_mark (ch))
6830 || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0
6831 || (ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6) /* jungseong */
6832 || (ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB) /* jongseong */
6833 ) );
/* Writes to FILENAME a two-level table of the is_nonspacing() property:
   - nonspacing_table_data holds, for each block of 0x200 code points that
     contains at least one non-spacing character, 64 bytes of bit flags
     (bit l of a byte stands for the code point at offset l within its group
     of 8);
   - nonspacing_table_ind maps a block number (code point / 0x200) to the
     position of the block in nonspacing_table_data, or -1 if the block
     contains no non-spacing character.
   The block starting at 0xe0000 is deliberately left out of the tables.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_nonspacing_property (const char *filename, const char *version)
{
  FILE *stream;
  int ind[0x110000 / 0x200];
  unsigned int i;
  unsigned int ind_size;
  int next_ind;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Table of non-spacing or control characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");

  fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  output_library_license (stream, true);
  fprintf (stream, "\n");

  /* First pass: assign consecutive data positions to the populated
     blocks.  */
  next_ind = 0;
  for (i = 0; i < 0x110000 / 0x200; i++)
    {
      bool nontrivial = false;
      unsigned int ch;

      if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code.  */
        for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++)
          if (is_nonspacing (ch))
            {
              nontrivial = true;
              break;
            }
      if (nontrivial)
        ind[i] = next_ind++;
      else
        ind[i] = -1;
    }

  /* Second pass: emit the bit flags of the populated blocks.  */
  fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n",
           next_ind);
  ind_size = 0;
  for (i = 0; i < 0x110000 / 0x200; i++)
    {
      bool nontrivial = (ind[i] >= 0);

      if (nontrivial)
        {
          unsigned int j;

          fprintf (stream, " /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1);
          for (j = 0; j < 8; j++)
            {
              unsigned int k;

              fprintf (stream, " ");
              for (k = 0; k < 8; k++)
                {
                  unsigned int l;
                  unsigned char bits = 0;

                  for (l = 0; l < 8; l++)
                    {
                      unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l;

                      if (is_nonspacing (ch))
                        bits |= 1 << l;
                    }
                  /* No comma after the very last byte of the table.  */
                  fprintf (stream, " 0x%02x%c", bits,
                           ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ',');
                }
              fprintf (stream, " /* 0x%04x-0x%04x */\n",
                       i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1);
            }
          /* The index table must reach up to and including this block.  */
          ind_size = i + 1;
        }
    }
  fprintf (stream, "};\n");

  /* Round the number of index entries up to a multiple of 8.  Rounding the
     count, not the last populated index, keeps the last populated block in
     the table even when its index is itself a multiple of 8.  */
  ind_size = ((ind_size + 8 - 1) / 8) * 8;
  fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n",
           ind_size);
  {
    unsigned int j;

    for (j = 0; j < ind_size / 8; j++)
      {
        unsigned int k;

        fprintf (stream, " ");
        for (k = 0; k < 8; k++)
          {
            i = j * 8 + k;
            /* No comma after the very last entry of the table.  */
            fprintf (stream, " %2d%c", ind[i],
                     j == ind_size / 8 - 1 && k == 8 - 1 ? ' ' : ',');
          }
        fprintf (stream, " /* 0x%04x-0x%04x */\n",
                 j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1);
      }
  }
  fprintf (stream, "};\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* Determines whether a character has width 2, regardless of context.
   Generated from "grep '^[^;]\+;[WF]' EastAsianWidth.txt"
   and            "grep '^[^;]\+;[^WF]' EastAsianWidth.txt"
 */
static bool
is_width2 (unsigned int ch)
{
  /* Inclusive code point intervals of width 2, in ascending order.
     Holes inside a larger block are expressed by splitting the block.  */
  static const struct { unsigned int first; unsigned int last; } wide[] =
    {
      { 0x1100, 0x115F }, /* Hangul Jamo */
      { 0x231A, 0x231B }, /* Watch, Hourglass */
      { 0x2329, 0x232A }, /* Angle Brackets */
      { 0x23E9, 0x23EC }, /* Black double triangles */
      { 0x23F0, 0x23F0 }, /* Alarm clock */
      { 0x23F3, 0x23F3 }, /* Hourglass */
      { 0x25FD, 0x25FE }, /* Medium small squares */
      /* Miscellaneous symbols, dingbats */
      { 0x2614, 0x2615 },
      { 0x2648, 0x2653 },
      { 0x267F, 0x267F },
      { 0x2693, 0x2693 },
      { 0x26A1, 0x26A1 },
      { 0x26AA, 0x26AB },
      { 0x26BD, 0x26BE },
      { 0x26C4, 0x26C5 },
      { 0x26CE, 0x26CE },
      { 0x26D4, 0x26D4 },
      { 0x26EA, 0x26EA },
      { 0x26F2, 0x26F3 },
      { 0x26F5, 0x26F5 },
      { 0x26FA, 0x26FA },
      { 0x26FD, 0x26FD },
      { 0x2705, 0x2705 },
      { 0x270A, 0x270B },
      { 0x2728, 0x2728 },
      { 0x274C, 0x274C },
      { 0x274E, 0x274E },
      { 0x2753, 0x2755 },
      { 0x2757, 0x2757 },
      { 0x2795, 0x2797 },
      { 0x27B0, 0x27B0 },
      { 0x27BF, 0x27BF },
      { 0x2B1B, 0x2B1C }, /* Large squares */
      { 0x2B50, 0x2B50 },
      { 0x2B55, 0x2B55 },
      /* CJK ... Yi: 0x2E80..0xA4CF without 0x303F, 0x3248..0x324F and
         0x4DC0..0x4DFF */
      { 0x2E80, 0x303E },
      { 0x3040, 0x3247 },
      { 0x3250, 0x4DBF },
      { 0x4E00, 0xA4CF },
      { 0xA960, 0xA97C }, /* Hangul Jamo Extended-A */
      { 0xAC00, 0xD7A3 }, /* Hangul Syllables */
      { 0xF900, 0xFAFF }, /* CJK Compatibility Ideographs */
      { 0xFE10, 0xFE1F }, /* Presentation Forms for Vertical */
      { 0xFE30, 0xFE6F }, /* CJK Compatibility Forms */
      { 0xFF00, 0xFF60 }, /* Fullwidth Forms */
      { 0xFFE0, 0xFFE6 }, /* Fullwidth Signs */
      { 0x16FE0, 0x16FE3 }, /* Tangut mark, Nushu mark */
      { 0x16FF0, 0x16FF1 }, /* Vietnamese alternate reading marks */
      { 0x17000, 0x187F7 }, /* Tangut */
      { 0x18800, 0x18CD5 }, /* Tangut components */
      { 0x18D00, 0x18D08 }, /* Tangut Ideograph Supplement */
      /* Katakana letter Minnan: 0x1AFF0..0x1AFFE without 0x1AFF4 and
         0x1AFFC */
      { 0x1AFF0, 0x1AFF3 },
      { 0x1AFF5, 0x1AFFB },
      { 0x1AFFD, 0x1AFFE },
      { 0x1B000, 0x1B122 }, /* Kana supplement, Kana Extended-A */
      { 0x1B150, 0x1B152 }, /* Small Hiragana */
      { 0x1B164, 0x1B167 }, /* Small Katakana */
      { 0x1B170, 0x1B2FB }, /* Nushu */
      { 0x1F004, 0x1F004 },
      { 0x1F0CF, 0x1F0CF },
      { 0x1F18E, 0x1F18E },
      { 0x1F191, 0x1F19A },
      /* Miscellaneous symbols and pictographs */
      { 0x1F200, 0x1F320 },
      { 0x1F32D, 0x1F335 },
      { 0x1F337, 0x1F37C },
      { 0x1F37E, 0x1F393 },
      { 0x1F3A0, 0x1F3CA },
      { 0x1F3CF, 0x1F3D3 },
      { 0x1F3E0, 0x1F3F0 },
      { 0x1F3F4, 0x1F3F4 },
      { 0x1F3F8, 0x1F43E },
      { 0x1F440, 0x1F440 },
      { 0x1F442, 0x1F4FC },
      { 0x1F4FF, 0x1F53D },
      { 0x1F54B, 0x1F54E },
      { 0x1F550, 0x1F567 },
      { 0x1F57A, 0x1F57A },
      { 0x1F595, 0x1F596 },
      { 0x1F5A4, 0x1F5A4 },
      { 0x1F5FB, 0x1F64F },
      { 0x1F680, 0x1F6C5 },
      { 0x1F6CC, 0x1F6CC },
      { 0x1F6D0, 0x1F6D2 },
      { 0x1F6D5, 0x1F6D7 },
      { 0x1F6DD, 0x1F6DF },
      { 0x1F6EB, 0x1F6EC },
      { 0x1F6F4, 0x1F6FC },
      { 0x1F7E0, 0x1F7EB },
      { 0x1F7F0, 0x1F7F0 },
      /* 0x1F90C..0x1F9FF without 0x1F93B and 0x1F946 */
      { 0x1F90C, 0x1F93A },
      { 0x1F93C, 0x1F945 },
      { 0x1F947, 0x1F9FF },
      { 0x1FA70, 0x1FA74 },
      { 0x1FA78, 0x1FA7C },
      { 0x1FA80, 0x1FA86 },
      { 0x1FA90, 0x1FAAC },
      { 0x1FAB0, 0x1FABA },
      { 0x1FAC0, 0x1FAC5 },
      { 0x1FAD0, 0x1FAD9 },
      { 0x1FAE0, 0x1FAE7 },
      { 0x1FAF0, 0x1FAF6 },
      { 0x20000, 0x2FFFF }, /* Supplementary Ideographic Plane */
      { 0x30000, 0x3FFFF }  /* Tertiary Ideographic Plane */
    };
  unsigned int k;

  for (k = 0; k < sizeof (wide) / sizeof (wide[0]); k++)
    if (ch >= wide[k].first && ch <= wide[k].last)
      return true;

  return false;
}
7065 static void
7066 output_width2_property (const char *filename, const char *version)
7068 output_predicate (filename, is_width2, "u_width2", "Width 2 property", version);
7071 /* Returns the width of ch as one of 0, '0', '1', '2', 'A'. */
7072 static char
7073 symbolic_width (unsigned int ch)
7075 /* Test for unassigned character. */
7076 if (is_property_unassigned_code_value (ch))
7078 /* Unicode TR#11 section "Unassigned and Private-Use Characters". */
7079 if (ch >= 0xE000 && ch <= 0xF8FF) /* Private Use */
7080 return 'A';
7081 if ((ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */
7082 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */
7083 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */
7084 || (ch >= 0x20000 && ch <= 0x2FFFF) /* Supplementary Ideographic Plane */
7085 || (ch >= 0x30000 && ch <= 0x3FFFF) /* Tertiary Ideographic Plane */)
7086 return '2';
7087 return 0;
7089 else
7091 /* Test for non-spacing or control character. */
7092 if (is_category_Cc (ch) && ch < 0x00A0)
7093 return 0;
7094 if (is_nonspacing (ch))
7095 return '0';
7096 /* Test for double-width character. */
7097 if (unicode_width[ch] != NULL
7098 && (strcmp (unicode_width[ch], "W") == 0
7099 || strcmp (unicode_width[ch], "F") == 0))
7100 return '2';
7101 /* Test for half-width character. */
7102 if (unicode_width[ch] != NULL
7103 && strcmp (unicode_width[ch], "H") == 0)
7104 return '1';
7106 /* In ancient CJK encodings, Cyrillic and most other characters are
7107 double-width as well. */
7108 if (ch >= 0x00A1 && ch < 0x10000)
7109 return 'A';
7110 return '1';
/* Writes one line of the width test data: either "XXXX<TAB><TAB>v" for a
   single code point or "XXXX..YYYY<TAB>v" for an interval.  */
static void
write_width_test_interval (FILE *stream, unsigned int start, unsigned int end,
                           char value)
{
  if (end == start)
    fprintf (stream, "%04X\t\t%c\n", start, value);
  else
    fprintf (stream, "%04X..%04X\t%c\n", start, end, value);
}

/* Writes to FILENAME the symbolic_width() of all code points, as a list of
   intervals of equal value.  Code points for which symbolic_width() returns
   0 are skipped and do not interrupt an interval.
   Exits with status 1 if the file cannot be opened or written.  */
static void
output_width_property_test (const char *filename)
{
  FILE *stream;
  unsigned int interval_start, interval_end, ch;
  char interval_value;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  /* interval_value == 0 means that no interval is open yet.  */
  interval_value = 0;
  interval_start = interval_end = 0; /* avoid GCC warning */
  for (ch = 0; ch < 0x110000; ch++)
    {
      char value = symbolic_width (ch);

      /* skip Cc control characters and unassigned characters */
      if (value == 0)
        continue;

      if (value == interval_value)
        {
          /* Extend the interval.  */
          interval_end = ch;
          continue;
        }

      /* Terminate the interval.  */
      if (interval_value != 0)
        write_width_test_interval (stream, interval_start, interval_end,
                                   interval_value);
      /* Start a new interval.  */
      interval_start = interval_end = ch;
      interval_value = value;
    }
  /* Terminate the last interval.  */
  if (interval_value != 0)
    write_width_test_interval (stream, interval_start, interval_end,
                               interval_value);

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
7169 /* ========================================================================= */
/* Line breaking classification.
   Updated for Unicode TR #14 revision 53.  */

/* Line breaking classes.  The numeric values fall into three groups:
   0..40 are used directly, 41..50 are resolved at run time, and 100..104
   are shorthands that exist only in this file.  */
enum
{
  /* Values >= 41 are resolved at run time. */
  /* Values >= 100 are shorthands for several values. */
  LBP_BK = 41, /* mandatory break */
  LBP_CR = 42, /* carriage return */
  LBP_LF = 43, /* line feed */
  LBP_CM = 44, /* attached characters and combining marks */
/*LBP_NL,         next line - not used here because it's equivalent to LBP_BK */
/*LBP_SG,         surrogates - not used here because they are not characters */
  LBP_WJ =  0, /* word joiner */
  LBP_ZW = 45, /* zero width space */
  LBP_GL =  1, /* non-breaking (glue) */
  LBP_SP = 46, /* space */
  LBP_B2 =  2, /* break opportunity before and after */
  LBP_BA =  3, /* break opportunity after */
  LBP_BB =  4, /* break opportunity before */
  LBP_HY =  5, /* hyphen */
  LBP_CB = 47, /* contingent break opportunity */
  LBP_CL =  6, /* closing punctuation */
  LBP_CP1 = 7, /* closing parenthesis, non-EastAsian character */
  LBP_CP2 = 8, /* closing parenthesis, EastAsian character */
  LBP_EX =  9, /* exclamation/interrogation */
  LBP_IN = 10, /* inseparable */
  LBP_NS = 11, /* non starter */
  LBP_OP1 = 12, /* opening punctuation, non-EastAsian character */
  LBP_OP2 = 13, /* opening punctuation, EastAsian character */
  LBP_QU1 = 14, /* ambiguous quotation, neither initial nor final punctuation */
  LBP_QU2 = 15, /* ambiguous quotation, initial punctuation */
  LBP_QU3 = 16, /* ambiguous quotation, final punctuation */
  LBP_IS = 17, /* infix separator (numeric) */
  LBP_NU = 18, /* numeric */
  LBP_PO = 19, /* postfix (numeric) */
  LBP_PR = 20, /* prefix (numeric) */
  LBP_SY = 21, /* symbols allowing breaks */
  LBP_AI = 48, /* ambiguous (alphabetic or ideograph) */
  LBP_AL1 = 22, /* ordinary alphabetic and symbol characters, != U+25CC */
  LBP_AL2 = 23, /* ordinary alphabetic and symbol characters, == U+25CC */
/*LBP_CJ,         conditional Japanese starter, resolved to NS */
  LBP_H2 = 24, /* Hangul LV syllable */
  LBP_H3 = 25, /* Hangul LVT syllable */
  LBP_HL = 31, /* Hebrew letter */
  LBP_ID1 = 26, /* ideographic */
  LBP_ID2 = 27, /* ideographic and potential future emoji */
  LBP_JL = 28, /* Hangul L Jamo */
  LBP_JV = 29, /* Hangul V Jamo */
  LBP_JT = 30, /* Hangul T Jamo */
  LBP_AP = 32, /* Brahmic scripts: pre-base repha */
  LBP_AK = 33, /* Brahmic scripts: consonants */
  LBP_AS = 34, /* Brahmic scripts: independent vowels */
  LBP_VI = 35, /* Brahmic scripts: conjoining viramas */
  LBP_VF = 36, /* Brahmic scripts: viramas for final consonants */
  LBP_RI = 37, /* regional indicator */
  LBP_SA = 49, /* complex context (South East Asian) */
  LBP_ZWJ = 38, /* zero width joiner */
  LBP_EB = 39, /* emoji base */
  LBP_EM = 40, /* emoji modifier */
  LBP_XX = 50, /* unknown */
  /* Artificial values that exist only in this file, not in the tables. */
  LBP_CP = 100, /* LBP_CP1 or LBP_CP2 */
  LBP_OP = 101, /* LBP_OP1 or LBP_OP2 */
  LBP_QU = 102, /* LBP_QU1 or LBP_QU2 or LBP_QU3 */
  LBP_AL = 103, /* LBP_AL1 or LBP_AL2 */
  LBP_ID = 104  /* LBP_ID1 or LBP_ID2 */
};
7240 /* Returns the line breaking EastAsian property for ch, as a bit. */
7241 static int
7242 get_lbea (unsigned int ch)
7244 return (unicode_width[ch] != NULL
7245 && (strcmp (unicode_width[ch], "W") == 0
7246 || strcmp (unicode_width[ch], "F") == 0
7247 || strcmp (unicode_width[ch], "H") == 0));
7250 /* Returns the line breaking classification for ch, as a bit mask. */
7251 static int64_t
7252 get_lbp (unsigned int ch)
7254 int64_t attr = 0;
7256 /* U+20BC..U+20CF are reserved for prefixes. */
7257 if (unicode_attributes[ch].name == NULL && (ch >= 0x20BC && ch <= 0x20CF))
7258 return (int64_t) 1 << LBP_PR;
7260 if (unicode_attributes[ch].name != NULL)
7262 /* mandatory break */
7263 if (ch == 0x000A)
7264 attr |= (int64_t) 1 << LBP_LF;
7265 if (ch == 0x000D)
7266 attr |= (int64_t) 1 << LBP_CR;
7267 if (ch == 0x0085 /* newline */
7268 || ch == 0x000B /* LINE TABULATION */
7269 || ch == 0x000C /* FORM FEED */
7270 || ch == 0x2028 /* LINE SEPARATOR */
7271 || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
7272 attr |= (int64_t) 1 << LBP_BK;
7274 if (ch == 0x2060 /* WORD JOINER */
7275 || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */)
7276 attr |= (int64_t) 1 << LBP_WJ;
7278 /* zero width space */
7279 if (ch == 0x200B /* ZERO WIDTH SPACE */)
7280 attr |= (int64_t) 1 << LBP_ZW;
7282 /* zero width joiner */
7283 if (ch == 0x200D /* ZERO WIDTH JOINER */)
7284 attr |= (int64_t) 1 << LBP_ZWJ;
7286 /* emoji base */
7287 if (((unicode_properties[ch] >> PROP_EMOJI_MODIFIER_BASE) & 1) != 0) /* EMOJI MODIFIER BASE */
7288 attr |= (int64_t) 1 << LBP_EB;
7290 if (((unicode_properties[ch] >> PROP_EMOJI_MODIFIER) & 1) != 0) /* EMOJI MODIFIER */
7291 attr |= (int64_t) 1 << LBP_EM;
7293 /* non-breaking (glue) */
7294 if (ch == 0x00A0 /* NO-BREAK SPACE */
7295 || ch == 0x202F /* NARROW NO-BREAK SPACE */
7296 || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */
7297 || ch == 0x1107F /* BRAHMI NUMBER JOINER */
7298 || (ch >= 0x13430 && ch <= 0x13436) /* EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH OVERLAY MIDDLE */
7299 || (ch >= 0x13439 && ch <= 0x1343B) /* EGYPTIAN HIEROGLYPH INSERT AT MIDDLE..EGYPTIAN HIEROGLYPH INSERT AT BOTTOM */
7300 || ch == 0x16FE4 /* KHITAN SMALL SCRIPT FILLER */
7301 || ch == 0x034F /* COMBINING GRAPHEME JOINER */
7302 || ch == 0x2007 /* FIGURE SPACE */
7303 || ch == 0x2011 /* NON-BREAKING HYPHEN */
7304 || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */
7305 || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */
7306 || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */
7307 || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */
7308 || ch == 0xFE20 /* COMBINING LIGATURE LEFT HALF */
7309 || ch == 0xFE22 /* COMBINING DOUBLE TILDE LEFT HALF */
7310 || ch == 0xFE24 /* COMBINING MACRON LEFT HALF */
7311 || ch == 0xFE27 /* COMBINING LIGATURE LEFT HALF BELOW */
7312 || ch == 0xFE29 /* COMBINING TILDE LEFT HALF BELOW */
7313 || ch == 0xFE2B /* COMBINING MACRON LEFT HALF BELOW */
7314 || ch == 0xFE2E /* COMBINING CYRILLIC TITLO LEFT HALF */
7315 || ch == 0xFE26 /* COMBINING CONJOINING MACRON */
7316 || ch == 0xFE2D /* COMBINING CONJOINING MACRON BELOW */
7317 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7318 || ch == 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */
7319 || ch == 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */
7320 || ch == 0x1DCD /* COMBINING DOUBLE CIRCUMFLEX ABOVE */
7321 || ch == 0x1DFC /* COMBINING DOUBLE INVERTED BREVE BELOW */)
7322 attr |= (int64_t) 1 << LBP_GL;
7324 /* space */
7325 if (ch == 0x0020 /* SPACE */)
7326 attr |= (int64_t) 1 << LBP_SP;
7328 /* break opportunity before and after */
7329 if (ch == 0x2014 /* EM DASH */
7330 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7331 || ch == 0x2E3A /* TWO-EM DASH */
7332 || ch == 0x2E3B /* THREE-EM DASH */)
7333 attr |= (int64_t) 1 << LBP_B2;
7335 /* break opportunity after */
7336 if (/* Breaking Spaces */
7337 ch == 0x1680 /* OGHAM SPACE MARK */
7338 || ch == 0x2000 /* EN QUAD */
7339 || ch == 0x2001 /* EM QUAD */
7340 || ch == 0x2002 /* EN SPACE */
7341 || ch == 0x2003 /* EM SPACE */
7342 || ch == 0x2004 /* THREE-PER-EM SPACE */
7343 || ch == 0x2005 /* FOUR-PER-EM SPACE */
7344 || ch == 0x2006 /* SIX-PER-EM SPACE */
7345 || ch == 0x2008 /* PUNCTUATION SPACE */
7346 || ch == 0x2009 /* THIN SPACE */
7347 || ch == 0x200A /* HAIR SPACE */
7348 || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */
7349 || ch == 0x3000 /* IDEOGRAPHIC SPACE */
7350 /* Tabs */
7351 || ch == 0x0009 /* tab */
7352 /* Conditional Hyphens */
7353 || ch == 0x00AD /* SOFT HYPHEN */
7354 /* Breaking Hyphens */
7355 || ch == 0x058A /* ARMENIAN HYPHEN */
7356 || ch == 0x2010 /* HYPHEN */
7357 || ch == 0x2012 /* FIGURE DASH */
7358 || ch == 0x2013 /* EN DASH */
7359 /* Visible Word Dividers */
7360 || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */
7361 || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
7362 || ch == 0x1361 /* ETHIOPIC WORDSPACE */
7363 || ch == 0x17D8 /* KHMER SIGN BEYYAL */
7364 || ch == 0x17DA /* KHMER SIGN KOOMUUT */
7365 || ch == 0x2027 /* HYPHENATION POINT */
7366 || ch == 0x007C /* VERTICAL LINE */
7367 /* Historic Word Separators */
7368 || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */
7369 || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */
7370 || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */
7371 || ch == 0x2056 /* THREE DOT PUNCTUATION */
7372 || ch == 0x2058 /* FOUR DOT PUNCTUATION */
7373 || ch == 0x2059 /* FIVE DOT PUNCTUATION */
7374 || ch == 0x205A /* TWO DOT PUNCTUATION */
7375 || ch == 0x205B /* FOUR DOT MARK */
7376 || ch == 0x205D /* TRICOLON */
7377 || ch == 0x205E /* VERTICAL FOUR DOTS */
7378 || ch == 0x2E19 /* PALM BRANCH */
7379 || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */
7380 || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */
7381 || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */
7382 || ch == 0x2E2D /* FIVE DOT MARK */
7383 || ch == 0x2E30 /* RING POINT */
7384 || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
7385 || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
7386 || ch == 0x10102 /* AEGEAN CHECK MARK */
7387 || ch == 0x1039F /* UGARITIC WORD DIVIDER */
7388 || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */
7389 || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */
7390 || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */
7391 /* Dandas */
7392 || ch == 0x0964 /* DEVANAGARI DANDA */
7393 || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */
7394 || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
7395 || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
7396 || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */
7397 || ch == 0x104B /* MYANMAR SIGN SECTION */
7398 || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */
7399 || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */
7400 || ch == 0x17D4 /* KHMER SIGN KHAN */
7401 || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
7402 || ch == 0x1B5E /* BALINESE CARIK SIKI */
7403 || ch == 0x1B5F /* BALINESE CARIK PAREREN */
7404 || ch == 0xA8CE /* SAURASHTRA DANDA */
7405 || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */
7406 || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */
7407 || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */
7408 || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */
7409 || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */
7410 || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */
7411 /* Tibetan */
7412 || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */
7413 || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */
7414 || ch == 0x0F85 /* TIBETAN MARK PALUTA */
7415 || ch == 0x0FBE /* TIBETAN KU RU KHA */
7416 || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */
7417 || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */
7418 /* Other Terminating Punctuation */
7419 || ch == 0x1804 /* MONGOLIAN COLON */
7420 || ch == 0x1805 /* MONGOLIAN FOUR DOTS */
7421 || ch == 0x1B5A /* BALINESE PANTI */
7422 || ch == 0x1B5B /* BALINESE PAMADA */
7423 || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */
7424 || ch == 0x1B60 /* BALINESE PAMENENG */
7425 || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */
7426 || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */
7427 || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */
7428 || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */
7429 || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */
7430 || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */
7431 || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */
7432 || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */
7433 || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */
7434 || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */
7435 || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */
7436 || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */
7437 || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */
7438 || ch == 0xA60D /* VAI COMMA */
7439 || ch == 0xA60F /* VAI QUESTION MARK */
7440 || ch == 0xA92E /* KAYAH LI SIGN CWI */
7441 || ch == 0xA92F /* KAYAH LI SIGN SHYA */
7442 || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */
7443 || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */
7444 || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */
7445 || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */
7446 || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */
7447 || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */
7448 || (ch >= 0x11EF7 && ch <= 0x11EF8) /* MAKASAR PASSIMBANG..MAKASAR END OF SECTION */
7449 /* Letters attached to orthographic syllables */
7450 || ch == 0xA9CF /* JAVANESE PANGRANGKEP */
7451 || (ch >= 0xAA40 && ch <= 0xAA42) /* CHAM LETTER FINAL K..CHAM LETTER FINAL NG */
7452 || (ch >= 0xAA44 && ch <= 0xAA4B) /* CHAM LETTER FINAL CH..CHAM LETTER FINAL SS */
7453 || ch == 0x1133D /* GRANTHA SIGN AVAGRAHA */
7454 || ch == 0x1135D /* GRANTHA SIGN PLUTA */
7455 || ch == 0x11EF2 /* MAKASAR ANGKA */
7456 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7457 || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */
7458 || ch == 0x1B4E /* BALINESE INVERTED CARIK SIKI */
7459 || ch == 0x1B4F /* BALINESE INVERTED CARIK PAREREN */
7460 || ch == 0x1B7D /* BALINESE PANTI LANTANG */
7461 || ch == 0x1B7E /* BALINESE PAMADA LANTANG */
7462 || ch == 0x1B7F /* BALINESE PANTI BAWAK */
7463 || ch == 0x2D70 /* TIFINAGH SEPARATOR MARK */
7464 || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */
7465 || ch == 0x2E33 /* RAISED DOT */
7466 || ch == 0x2E34 /* RAISED COMMA */
7467 || ch == 0x2E3C /* STENOGRAPHIC FULL STOP */
7468 || ch == 0x2E3D /* VERTICAL SIX DOTS */
7469 || ch == 0x2E3E /* WIGGLY VERTICAL LINE */
7470 || ch == 0x2E40 /* DOUBLE HYPHEN */
7471 || ch == 0x2E41 /* REVERSED COMMA */
7472 || ch == 0x2E43 /* DASH WITH LEFT UPTURN */
7473 || ch == 0x2E44 /* DOUBLE SUSPENSION MARK */
7474 || ch == 0x2E45 /* INVERTED LOW KAVYKA */
7475 || ch == 0x2E46 /* INVERTED LOW KAVYKA WITH KAVYKA ABOVE */
7476 || ch == 0x2E47 /* LOW KAVYKA */
7477 || ch == 0x2E48 /* LOW KAVYKA WITH DOT */
7478 || ch == 0x2E49 /* DOUBLE STACKED COMMA */
7479 || ch == 0x2E4A /* DOTTED SOLIDUS */
7480 || ch == 0x2E4C /* MEDIEVAL COMMA */
7481 || ch == 0x2E4E /* PUNCTUS ELEVATUS MARK */
7482 || ch == 0x2E4F /* CORNISH VERSE DIVIDER */
7483 || ch == 0x2E5D /* OBLIQUE HYPHEN */
7484 || ch == 0xA4FE /* LISU PUNCTUATION COMMA */
7485 || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */
7486 || ch == 0xA6F3 /* BAMUM FULL STOP */
7487 || ch == 0xA6F4 /* BAMUM COLON */
7488 || ch == 0xA6F5 /* BAMUM COMMA */
7489 || ch == 0xA6F6 /* BAMUM SEMICOLON */
7490 || ch == 0xA6F7 /* BAMUM QUESTION MARK */
7491 || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */
7492 || ch == 0xA9C8 /* JAVANESE PADA LINGSA */
7493 || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */
7494 || ch == 0xAAF0 /* MEETEI MAYEK CHEIKHAN */
7495 || ch == 0xAAF1 /* MEETEI MAYEK AHANG KHUDAM */
7496 || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */
7497 || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */
7498 || (ch >= 0x10AF0 && ch <= 0x10AF5) /* MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION TWO DOTS */
7499 || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */
7500 || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */
7501 || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */
7502 || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */
7503 || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */
7504 || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */
7505 || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */
7506 || ch == 0x10D6E /* GARAY HYPHEN */
7507 || ch == 0x10EAD /* YEZIDI HYPHENATION MARK */
7508 || ch == 0x11047 /* BRAHMI DANDA */
7509 || ch == 0x11048 /* BRAHMI DOUBLE DANDA */
7510 || ch == 0x110BE /* KAITHI SECTION MARK */
7511 || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */
7512 || ch == 0x110C0 /* KAITHI DANDA */
7513 || ch == 0x110C1 /* KAITHI DOUBLE DANDA */
7514 || ch == 0x11140 /* CHAKMA SECTION MARK */
7515 || ch == 0x11141 /* CHAKMA DANDA */
7516 || ch == 0x11142 /* CHAKMA DOUBLE DANDA */
7517 || ch == 0x11143 /* CHAKMA QUESTION MARK */
7518 || ch == 0x111C5 /* SHARADA DANDA */
7519 || ch == 0x111C6 /* SHARADA DOUBLE DANDA */
7520 || ch == 0x111C8 /* SHARADA SEPARATOR */
7521 || (ch >= 0x111DD && ch <= 0x111DF) /* SHARADA CONTINUATION SIGN..SHARADA SECTION MARK-2 */
7522 || ch == 0x11238 /* KHOJKI DANDA */
7523 || ch == 0x11239 /* KHOJKI DOUBLE DANDA */
7524 || ch == 0x1123B /* KHOJKI SECTION MARK */
7525 || ch == 0x1123C /* KHOJKI DOUBLE SECTION MARK */
7526 || ch == 0x112A9 /* MULTANI SECTION MARK */
7527 || (ch >= 0x1144B && ch <= 0x1144E) /* NEWA DANDA..NEWA GAP FILLER */
7528 || ch == 0x1145A /* NEWA DOUBLE COMMA */
7529 || ch == 0x1145B /* NEWA PLACEHOLDER MARK */
7530 || ch == 0x115C2 /* SIDDHAM DANDA */
7531 || ch == 0x115C3 /* SIDDHAM DOUBLE DANDA */
7532 || (ch >= 0x115C9 && ch <= 0x115D7) /* SIDDHAM END OF TEXT MARK..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES */
7533 || ch == 0x11641 /* MODI DANDA */
7534 || ch == 0x11642 /* MODI DOUBLE DANDA */
7535 || (ch >= 0x1173C && ch <= 0x1173E) /* AHOM SIGN SMALL SECTION..AHOM SIGN RULAI */
7536 || (ch >= 0x11944 && ch <= 0x11946) /* DIVES AKURU DOUBLE DANDA..DIVES AKURU END OF TEXT MARK */
7537 || ch == 0x11A41 /* ZANABAZAR SQUARE MARK TSHEG */
7538 || ch == 0x11A42 /* ZANABAZAR SQUARE MARK SHAD */
7539 || ch == 0x11A43 /* ZANABAZAR SQUARE MARK DOUBLE SHAD */
7540 || ch == 0x11A44 /* ZANABAZAR SQUARE MARK LONG TSHEG */
7541 || ch == 0x11A9A /* SOYOMBO MARK TSHEG */
7542 || ch == 0x11A9B /* SOYOMBO MARK SHAD */
7543 || ch == 0x11A9C /* SOYOMBO MARK DOUBLE SHAD */
7544 || ch == 0x11AA1 /* SOYOMBO TERMINAL MARK-1 */
7545 || ch == 0x11AA2 /* SOYOMBO TERMINAL MARK-2 */
7546 || (ch >= 0x11C41 && ch <= 0x11C45) /* BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2 */
7547 || ch == 0x11F43 /* KAWI DANDA */
7548 || ch == 0x11F44 /* KAWI DOUBLE DANDA */
7549 || ch == 0x11FFF /* TAMIL PUNCTUATION END OF TEXT */
7550 || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
7551 || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
7552 || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */
7553 || ch == 0x12474 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON */
7554 || ch == 0x16A6E /* MRO DANDA */
7555 || ch == 0x16A6F /* MRO DOUBLE DANDA */
7556 || ch == 0x16AF5 /* BASSA VAH FULL STOP */
7557 || ch == 0x16B37 /* PAHAWH HMONG SIGN VOS THOM */
7558 || ch == 0x16B38 /* PAHAWH HMONG SIGN VOS TSHAB CEEB */
7559 || ch == 0x16B39 /* PAHAWH HMONG SIGN CIM CHEEM */
7560 || ch == 0x16B44 /* PAHAWH HMONG SIGN XAUS */
7561 || ch == 0x16D6E /* KIRAT RAI DANDA */
7562 || ch == 0x16D6F /* KIRAT RAI DOUBLE DANDA */
7563 || ch == 0x16E97 /* MEDEFAIDRIN COMMA */
7564 || ch == 0x16E98 /* MEDEFAIDRIN FULL STOP */
7565 || ch == 0x1BC9F /* DUPLOYAN PUNCTUATION CHINOOK FULL STOP */
7566 || (ch >= 0x1DA87 && ch <= 0x1DA8A) /* SIGNWRITING COMMA..SIGNWRITING COLON */)
7567 attr |= (int64_t) 1 << LBP_BA;
7569 /* break opportunity before */
7570 if (/* Dictionary Use */
7571 ch == 0x00B4 /* ACUTE ACCENT */
7572 || ch == 0x1FFD /* GREEK OXIA */
7573 || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */
7574 || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
7575 || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
7576 /* Tibetan and Phags-Pa Head Letters */
7577 || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */
7578 || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */
7579 || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */
7580 || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */
7581 || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */
7582 || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */
7583 || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */
7584 || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */
7585 || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */
7586 || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */
7587 || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */
7588 || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */
7589 || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */
7590 /* Mongolian */
7591 || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */
7592 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7593 || ch == 0x0C77 /* TELUGU SIGN SIDDHAM */
7594 || ch == 0x0C84 /* KANNADA SIGN SIDDHAM */
7595 || ch == 0xA8FC /* DEVANAGARI SIGN SIDDHAM */
7596 || ch == 0x11175 /* MAHAJANI SECTION MARK */
7597 || ch == 0x111DB /* SHARADA SIGN SIDDHAM */
7598 || ch == 0x115C1 /* SIDDHAM SIGN SIDDHAM */
7599 || (ch >= 0x11660 && ch <= 0x1166C) /* MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT */
7600 || ch == 0x119E2 /* NANDINAGARI SIGN SIDDHAM */
7601 || ch == 0x11A3F /* ZANABAZAR SQUARE INITIAL HEAD MARK */
7602 || ch == 0x11A45 /* ZANABAZAR SQUARE INITIAL DOUBLE-LINED HEAD MARK */
7603 || ch == 0x11A9E /* SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME */
7604 || ch == 0x11A9F /* SOYOMBO HEAD MARK WITH MOON AND SUN AND FLAME */
7605 || ch == 0x11AA0 /* SOYOMBO HEAD MARK WITH MOON AND SUN */
7606 || (ch >= 0x11B00 && ch <= 0x11B09) /* DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU */
7607 || ch == 0x11C70 /* MARCHEN HEAD MARK */)
7608 attr |= (int64_t) 1 << LBP_BB;
7610 /* hyphen */
7611 if (ch == 0x002D /* HYPHEN-MINUS */)
7612 attr |= (int64_t) 1 << LBP_HY;
7614 /* contingent break opportunity */
7615 if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
7616 attr |= (int64_t) 1 << LBP_CB;
7618 /* closing parenthesis */
7619 if (ch == 0x0029 /* RIGHT PARENTHESIS */
7620 || ch == 0x005D /* RIGHT SQUARE BRACKET */
7621 || ch == 0x2E56 /* RIGHT SQUARE BRACKET WITH STROKE */
7622 || ch == 0x2E58 /* RIGHT SQUARE BRACKET WITH DOUBLE STROKE */
7623 || ch == 0x2E5A /* TOP HALF RIGHT PARENTHESIS */
7624 || ch == 0x2E5C /* BOTTOM HALF RIGHT PARENTHESIS */)
7626 if (get_lbea (ch))
7627 attr |= (int64_t) 1 << LBP_CP2;
7628 else
7629 attr |= (int64_t) 1 << LBP_CP1;
7632 /* closing punctuation */
7633 if ((unicode_attributes[ch].category[0] == 'P'
7634 && unicode_attributes[ch].category[1] == 'e'
7635 && !(attr & (((int64_t) 1 << LBP_CP1) | ((int64_t) 1 << LBP_CP2))))
7636 || ch == 0x3001 /* IDEOGRAPHIC COMMA */
7637 || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
7638 || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */
7639 || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */
7640 || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */
7641 || ch == 0xFE50 /* SMALL COMMA */
7642 || ch == 0xFE52 /* SMALL FULL STOP */
7643 || ch == 0xFF0C /* FULLWIDTH COMMA */
7644 || ch == 0xFF0E /* FULLWIDTH FULL STOP */
7645 || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
7646 || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
7647 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7648 || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */
7649 || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */
7650 || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */
7651 || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */
7652 || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */
7653 || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */
7654 || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */
7655 || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */
7656 || ch == 0x13438 /* EGYPTIAN HIEROGLYPH END SEGMENT */
7657 || ch == 0x1343D /* EGYPTIAN HIEROGLYPH END ENCLOSURE */
7658 || ch == 0x1343F /* EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE */
7659 || ch == 0x145CF /* ANATOLIAN HIEROGLYPH A410A END LOGOGRAM MARK */)
7660 attr |= (int64_t) 1 << LBP_CL;
7662 /* exclamation/interrogation */
7663 if (ch == 0x0021 /* EXCLAMATION MARK */
7664 || ch == 0x003F /* QUESTION MARK */
7665 || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */
7666 || ch == 0x061B /* ARABIC SEMICOLON */
7667 || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */
7668 || ch == 0x061F /* ARABIC QUESTION MARK */
7669 || ch == 0x06D4 /* ARABIC FULL STOP */
7670 || ch == 0x07F9 /* NKO EXCLAMATION MARK */
7671 || ch == 0x0F0D /* TIBETAN MARK SHAD */
7672 || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
7673 || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */
7674 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7675 || ch == 0x061D /* ARABIC END OF TEXT MARK */
7676 || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */
7677 || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */
7678 || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */
7679 || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */
7680 || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */
7681 || ch == 0x1802 /* MONGOLIAN COMMA */
7682 || ch == 0x1803 /* MONGOLIAN FULL STOP */
7683 || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */
7684 || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */
7685 || ch == 0x1944 /* LIMBU EXCLAMATION MARK */
7686 || ch == 0x1945 /* LIMBU QUESTION MARK */
7687 || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */
7688 || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */
7689 || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */
7690 || ch == 0x2CFE /* COPTIC FULL STOP */
7691 || ch == 0x2E2E /* REVERSED QUESTION MARK */
7692 || ch == 0x2E53 /* MEDIEVAL EXCLAMATION MARK */
7693 || ch == 0x2E54 /* MEDIEVAL QUESTION MARK */
7694 || ch == 0xA60E /* VAI FULL STOP */
7695 || ch == 0xA876 /* PHAGS-PA MARK SHAD */
7696 || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */
7697 || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
7698 || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */
7699 || ch == 0xFE56 /* SMALL QUESTION MARK */
7700 || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
7701 || ch == 0x115C4 /* SIDDHAM SEPARATOR DOT */
7702 || ch == 0x115C5 /* SIDDHAM SEPARATOR BAR */
7703 || ch == 0x11C71 /* MARCHEN MARK SHAD */)
7704 attr |= (int64_t) 1 << LBP_EX;
7706 /* inseparable */
7707 if (ch == 0x2024 /* ONE DOT LEADER */
7708 || ch == 0x2025 /* TWO DOT LEADER */
7709 || ch == 0x2026 /* HORIZONTAL ELLIPSIS */
7710 || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */
7711 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7712 || ch == 0x22EF /* MIDLINE HORIZONTAL ELLIPSIS */
7713 || ch == 0x10AF6 /* MANICHAEAN PUNCTUATION LINE FILLER */)
7714 attr |= (int64_t) 1 << LBP_IN;
7716 /* non starter */
7717 if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
7718 || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
7719 || ch == 0x203D /* INTERROBANG */
7720 || ch == 0x2047 /* DOUBLE QUESTION MARK */
7721 || ch == 0x2048 /* QUESTION EXCLAMATION MARK */
7722 || ch == 0x2049 /* EXCLAMATION QUESTION MARK */
7723 || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
7724 || ch == 0x301C /* WAVE DASH */
7725 || ch == 0x303C /* MASU MARK */
7726 || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */
7727 || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
7728 || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
7729 || ch == 0x309D /* HIRAGANA ITERATION MARK */
7730 || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
7731 || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */
7732 || ch == 0x30FB /* KATAKANA MIDDLE DOT */
7733 || ch == 0x30FD /* KATAKANA ITERATION MARK */
7734 || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */
7735 || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */
7736 || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */
7737 || ch == 0xFE54 /* SMALL SEMICOLON */
7738 || ch == 0xFE55 /* SMALL COLON */
7739 || ch == 0xFF1A /* FULLWIDTH COLON */
7740 || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
7741 || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
7742 || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */
7743 || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
7744 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7745 || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
7746 || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL
7747 || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
7748 || ch == 0xA015 /* YI SYLLABLE WU */
7749 || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
7750 || ch == 0x16FE0 /* TANGUT ITERATION MARK */
7751 || ch == 0x16FE1 /* NUSHU ITERATION MARK */
7752 || ch == 0x16FE2 /* OLD CHINESE HOOK MARK */
7753 || ch == 0x16FE3 /* OLD CHINESE ITERATION MARK */
7754 || ch == 0x1F679 /* HEAVY INTERROBANG ORNAMENT */
7755 || ch == 0x1F67A /* SANS-SERIF INTERROBANG ORNAMENT */
7756 || ch == 0x1F67B /* HEAVY SANS-SERIF INTERROBANG ORNAMENT */)
7757 attr |= (int64_t) 1 << LBP_NS;
7759 /* opening punctuation */
7760 if ((unicode_attributes[ch].category[0] == 'P'
7761 && unicode_attributes[ch].category[1] == 's')
7762 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */
7763 || ch == 0x00BF /* INVERTED QUESTION MARK */
7764 || ch == 0x2E18 /* INVERTED INTERROBANG */
7765 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7766 || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */
7767 || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */
7768 || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */
7769 || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */
7770 || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */
7771 || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */
7772 || ch == 0x1342F /* EGYPTIAN HIEROGLYPH V011D */
7773 || ch == 0x13437 /* EGYPTIAN HIEROGLYPH BEGIN SEGMENT */
7774 || ch == 0x1343C /* EGYPTIAN HIEROGLYPH BEGIN ENCLOSURE */
7775 || ch == 0x1343E /* EGYPTIAN HIEROGLYPH BEGIN WALLED ENCLOSURE */
7776 || ch == 0x145CE /* ANATOLIAN HIEROGLYPH A410 BEGIN LOGOGRAM MARK */
7777 || (ch >= 0x1E95E && ch <= 0x1E95F) /* ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK */)
7779 if (get_lbea (ch))
7780 attr |= (int64_t) 1 << LBP_OP2;
7781 else
7782 attr |= (int64_t) 1 << LBP_OP1;
7785 /* ambiguous quotation */
7786 if ((unicode_attributes[ch].category[0] == 'P'
7787 && (unicode_attributes[ch].category[1] == 'f'
7788 || unicode_attributes[ch].category[1] == 'i'))
7789 || ch == 0x0022 /* QUOTATION MARK */
7790 || ch == 0x0027 /* APOSTROPHE */
7791 || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */
7792 || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */
7793 || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
7794 || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
7795 || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */
7796 || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */
7797 || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */
7798 || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */
7799 || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */
7800 || ch == 0x2E0B /* RAISED SQUARE */
7801 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7802 || ch == 0x275F /* HEAVY LOW SINGLE COMMA QUOTATION MARK ORNAMENT */
7803 || ch == 0x2760 /* HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT */
7804 || ch == 0x1F676 /* SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */
7805 || ch == 0x1F677 /* SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */
7806 || ch == 0x1F678 /* SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT */)
7808 if (unicode_attributes[ch].category[0] == 'P'
7809 && unicode_attributes[ch].category[1] == 'i')
7810 attr |= (int64_t) 1 << LBP_QU2;
7811 else if (unicode_attributes[ch].category[0] == 'P'
7812 && unicode_attributes[ch].category[1] == 'f')
7813 attr |= (int64_t) 1 << LBP_QU3;
7814 else
7815 attr |= (int64_t) 1 << LBP_QU1;
7818 /* infix separator (numeric) */
7819 if (ch == 0x002C /* COMMA */
7820 || ch == 0x002E /* FULL STOP */
7821 || ch == 0x003A /* COLON */
7822 || ch == 0x003B /* SEMICOLON */
7823 || ch == 0x037E /* GREEK QUESTION MARK */
7824 || ch == 0x0589 /* ARMENIAN FULL STOP */
7825 || ch == 0x060C /* ARABIC COMMA */
7826 || ch == 0x060D /* ARABIC DATE SEPARATOR */
7827 || ch == 0x07F8 /* NKO COMMA */
7828 || ch == 0x2044 /* FRACTION SLASH */)
7829 attr |= (int64_t) 1 << LBP_IS;
7831 /* numeric */
7832 if ((unicode_attributes[ch].category[0] == 'N'
7833 && unicode_attributes[ch].category[1] == 'd'
7834 && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL
7835 && !(ch >= 0x1B50 && ch <= 0x1B59) /* BALINESE DIGIT ZERO..NINE */
7836 && !(ch >= 0xA9D0 && ch <= 0xA9D9) /* JAVANESE DIGIT ZERO..NINE */
7837 && !(ch >= 0xAA50 && ch <= 0xAA59) /* CHAM DIGIT ZERO..NINE */
7838 && !(ch >= 0x11066 && ch <= 0x1106F) /* BRAHMI DIGIT ZERO..NINE */
7839 && !(ch >= 0x11950 && ch <= 0x11959) /* DIVES AKURU DIGIT ZERO..NINE */
7840 && !(ch >= 0x11F50 && ch <= 0x11F59) /* KAWI DIGIT ZERO..NINE */
7841 && !(ch >= 0x16130 && ch <= 0x16139)) /* GURUNG KHEMA DIGIT ZERO..NINE */
7842 || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */
7843 || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */
7844 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7845 || ch == 0x0600 /* ARABIC NUMBER SIGN */
7846 || ch == 0x0601 /* ARABIC SIGN SANAH */
7847 || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
7848 || ch == 0x0603 /* ARABIC SIGN SAFHA */
7849 || ch == 0x0604 /* ARABIC SIGN SAMVAT */
7850 || ch == 0x0605 /* ARABIC NUMBER MARK ABOVE */
7851 || ch == 0x06DD /* ARABIC END OF AYAH */
7852 || ch == 0x0890 /* ARABIC POUND MARK ABOVE */
7853 || ch == 0x0891 /* ARABIC PIASTRE MARK ABOVE */
7854 || ch == 0x08E2 /* ARABIC DISPUTED END OF AYAH */
7855 || ch == 0x19DA /* NEW TAI LUE THAM DIGIT ONE */
7856 || ch == 0x110BD /* KAITHI NUMBER SIGN */
7857 || ch == 0x110CD /* KAITHI NUMBER SIGN ABOVE */)
7858 attr |= (int64_t) 1 << LBP_NU;
7860 /* postfix numeric */
7861 if (ch == 0x0025 /* PERCENT SIGN */
7862 || ch == 0x00A2 /* CENT SIGN */
7863 || ch == 0x00B0 /* DEGREE SIGN */
7864 || ch == 0x060B /* AFGHANI SIGN */
7865 || ch == 0x066A /* ARABIC PERCENT SIGN */
7866 || ch == 0x2030 /* PER MILLE SIGN */
7867 || ch == 0x2031 /* PER TEN THOUSAND SIGN */
7868 || ch == 0x2032 /* PRIME */
7869 || ch == 0x2033 /* DOUBLE PRIME */
7870 || ch == 0x2034 /* TRIPLE PRIME */
7871 || ch == 0x2035 /* REVERSED PRIME */
7872 || ch == 0x2036 /* REVERSED DOUBLE PRIME */
7873 || ch == 0x2037 /* REVERSED TRIPLE PRIME */
7874 || ch == 0x20A7 /* PESETA SIGN */
7875 || ch == 0x2103 /* DEGREE CELSIUS */
7876 || ch == 0x2109 /* DEGREE FAHRENHEIT */
7877 || ch == 0xFDFC /* RIAL SIGN */
7878 || ch == 0xFE6A /* SMALL PERCENT SIGN */
7879 || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
7880 || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */
7881 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7882 || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */
7883 || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */
7884 || ch == 0x09F2 /* BENGALI RUPEE MARK */
7885 || ch == 0x09F3 /* BENGALI RUPEE SIGN */
7886 || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */
7887 || ch == 0x0D79 /* MALAYALAM DATE MARK */
7888 || ch == 0x2057 /* QUADRUPLE PRIME */
7889 || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */
7890 || ch == 0x20BB /* NORDIC MARK SIGN */
7891 || ch == 0x20BE /* LARI SIGN */
7892 || ch == 0x20C0 /* SOM SIGN */
7893 || ch == 0xA838 /* NORTH INDIC RUPEE MARK */
7894 || (ch >= 0x11FDD && ch <= 0x11FE0) /* TAMIL SIGN KAACU..TAMIL SIGN VARAAKAN */
7895 || ch == 0x1ECAC /* INDIC SIYAQ PLACEHOLDER */
7896 || ch == 0x1ECB0 /* INDIC SIYAQ RUPEE MARK */)
7897 attr |= (int64_t) 1 << LBP_PO;
7899 /* prefix numeric */
7900 if ((unicode_attributes[ch].category[0] == 'S'
7901 && unicode_attributes[ch].category[1] == 'c')
7902 || ch == 0x002B /* PLUS SIGN */
7903 || ch == 0x005C /* REVERSE SOLIDUS */
7904 || ch == 0x00B1 /* PLUS-MINUS SIGN */
7905 || ch == 0x2116 /* NUMERO SIGN */
7906 || ch == 0x2212 /* MINUS SIGN */
7907 || ch == 0x2213 /* MINUS-OR-PLUS SIGN */)
7908 if (!(attr & ((int64_t) 1 << LBP_PO)))
7909 attr |= (int64_t) 1 << LBP_PR;
7911 /* symbols allowing breaks */
7912 if (ch == 0x002F /* SOLIDUS */)
7913 attr |= (int64_t) 1 << LBP_SY;
7915 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0)
7916 attr |= (int64_t) 1 << LBP_H2;
7918 if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
7919 attr |= (int64_t) 1 << LBP_H3;
7921 if ((ch >= 0x05D0 && ch <= 0x05F2) || ch == 0xFB1D
7922 || (ch >= 0xFB1F && ch <= 0xFB28) || (ch >= 0xFB2A && ch <= 0xFB4F))
7923 attr |= (int64_t) 1 << LBP_HL;
7925 if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C))
7926 attr |= (int64_t) 1 << LBP_JL;
7928 if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6))
7929 attr |= (int64_t) 1 << LBP_JV;
7931 if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
7932 attr |= (int64_t) 1 << LBP_JT;
7934 /* Brahmic scripts: pre-base repha */
7935 if ((ch >= 0x11003 && ch <= 0x11004)
7936 || ch == 0x11F02
7937 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7938 || ch == 0x113D1
7939 || ch == 0x1193F
7940 || ch == 0x11941)
7941 attr |= (int64_t) 1 << LBP_AP;
7943 /* Brahmic scripts: consonants */
7944 if ((ch >= 0x1B05 && ch <= 0x1B33)
7945 || (ch >= 0x1B45 && ch <= 0x1B4C)
7946 || (ch >= 0xA984 && ch <= 0xA9B2)
7947 || (ch >= 0x11005 && ch <= 0x11037)
7948 || (ch >= 0x11071 && ch <= 0x11072)
7949 || ch == 0x11075
7950 || (ch >= 0x11305 && ch <= 0x1130C)
7951 || (ch >= 0x1130F && ch <= 0x11310)
7952 || (ch >= 0x11313 && ch <= 0x11328)
7953 || (ch >= 0x1132A && ch <= 0x11330)
7954 || (ch >= 0x11332 && ch <= 0x11333)
7955 || (ch >= 0x11335 && ch <= 0x11339)
7956 || (ch >= 0x11360 && ch <= 0x11361)
7957 || (ch >= 0x11F04 && ch <= 0x11F10)
7958 || (ch >= 0x11F12 && ch <= 0x11F33)
7959 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7960 || (ch >= 0x11392 && ch <= 0x113B5)
7961 || (ch >= 0x11900 && ch <= 0x11906)
7962 || ch == 0x11909
7963 || (ch >= 0x1190C && ch <= 0x11913)
7964 || (ch >= 0x11915 && ch <= 0x11916)
7965 || (ch >= 0x11918 && ch <= 0x1192F))
7966 attr |= (int64_t) 1 << LBP_AK;
7968 /* Brahmic scripts: independent vowels */
7969 if ((ch >= 0x1B50 && ch <= 0x1B59) /* BALINESE DIGIT ZERO..NINE */
7970 || (ch >= 0x1BC0 && ch <= 0x1BE5)
7971 || (ch >= 0xA9D0 && ch <= 0xA9D9) /* JAVANESE DIGIT ZERO..NINE */
7972 || (ch >= 0xAA00 && ch <= 0xAA28)
7973 || (ch >= 0xAA50 && ch <= 0xAA59) /* CHAM DIGIT ZERO..NINE */
7974 || (ch >= 0x11066 && ch <= 0x1106F)
7975 || ch == 0x11350
7976 || (ch >= 0x1135E && ch <= 0x1135F)
7977 || (ch >= 0x11950 && ch <= 0x11959) /* DIVES AKURU DIGIT ZERO..NINE */
7978 || (ch >= 0x11EE0 && ch <= 0x11EF1)
7979 || (ch >= 0x11F50 && ch <= 0x11F59)
7980 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7981 || (ch >= 0x11380 && ch <= 0x11389)
7982 || ch == 0x1138B
7983 || ch == 0x1138E
7984 || ch == 0x11390
7985 || ch == 0x11391
7986 || (ch >= 0x16100 && ch <= 0x1611D)
7987 || (ch >= 0x16130 && ch <= 0x16139) /* GURUNG KHEMA DIGIT ZERO..NINE */)
7988 attr |= (int64_t) 1 << LBP_AS;
7990 /* Brahmic scripts: conjoining viramas */
7991 if (ch == 0x1B44
7992 || ch == 0xA9C0
7993 || ch == 0x11046
7994 || ch == 0x1134D
7995 || ch == 0x11F42
7996 /* Extra characters for compatibility with Unicode LineBreak.txt. */
7997 || ch == 0x113D0
7998 || ch == 0x1193E)
7999 attr |= (int64_t) 1 << LBP_VI;
8001 /* Brahmic scripts: viramas for final consonants */
8002 if (ch == 0x1BF2 || ch == 0x1BF3)
8003 attr |= (int64_t) 1 << LBP_VF;
8005 if (is_property_regional_indicator (ch))
8006 attr |= (int64_t) 1 << LBP_RI;
8008 /* complex context (South East Asian) */
8009 if (((unicode_attributes[ch].category[0] == 'C'
8010 && unicode_attributes[ch].category[1] == 'f')
8011 || (unicode_attributes[ch].category[0] == 'L'
8012 && (unicode_attributes[ch].category[1] == 'm'
8013 || unicode_attributes[ch].category[1] == 'o'))
8014 || (unicode_attributes[ch].category[0] == 'M'
8015 && (unicode_attributes[ch].category[1] == 'c'
8016 || unicode_attributes[ch].category[1] == 'n')
8017 && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */)
8018 /* Extra characters for compatibility with Unicode LineBreak.txt. */
8019 || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */
8020 || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */
8021 || ch == 0x19DE /* NEW TAI LUE SIGN LAE */
8022 || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */
8023 || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */
8024 || (ch >= 0xA9E0 && ch <= 0xA9EF) /* Myanmar */
8025 || (ch >= 0xA9FA && ch <= 0xA9FE) /* Myanmar */
8026 || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */
8027 || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */
8028 || (ch >= 0x1173A && ch <= 0x1173B) /* Ahom */
8029 || (ch >= 0x1173F && ch <= 0x11746) /* Ahom */)
8030 && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */
8031 || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */
8032 || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */
8033 || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */
8034 || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */
8035 || (ch >= 0xA9E0 && ch <= 0xA9EF) /* Myanmar */
8036 || (ch >= 0xA9FA && ch <= 0xA9FE) /* Myanmar */
8037 || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */
8038 || (ch >= 0x11700 && ch <= 0x1171A) /* Ahom */
8039 || (ch >= 0x1171D && ch <= 0x1172B) /* Ahom */
8040 || (ch >= 0x1173A && ch <= 0x1173B) /* Ahom */
8041 || (ch >= 0x1173F && ch <= 0x11746) /* Ahom */))
8042 attr |= (int64_t) 1 << LBP_SA;
8044 /* attached characters and combining marks */
8045 if ((unicode_attributes[ch].category[0] == 'M'
8046 && (unicode_attributes[ch].category[1] == 'c'
8047 || unicode_attributes[ch].category[1] == 'e'
8048 || unicode_attributes[ch].category[1] == 'n')
8049 && ch != 0x1BF2 /* BATAK PANGOLAT */
8050 && ch != 0x1BF3 /* BATAK PANONGONAN */)
8051 || (unicode_attributes[ch].category[0] == 'C'
8052 && (unicode_attributes[ch].category[1] == 'c'
8053 || unicode_attributes[ch].category[1] == 'f')
8054 && ch != 0x0600 /* ARABIC NUMBER SIGN */
8055 && ch != 0x0601 /* ARABIC SIGN SANAH */
8056 && ch != 0x0602 /* ARABIC FOOTNOTE MARKER */
8057 && ch != 0x0603 /* ARABIC SIGN SAFHA */
8058 && ch != 0x0604 /* ARABIC SIGN SAMVAT */
8059 && ch != 0x0605 /* ARABIC NUMBER MARK ABOVE */
8060 && ch != 0x06DD /* ARABIC END OF AYAH */
8061 && ch != 0x0890 /* ARABIC POUND MARK ABOVE */
8062 && ch != 0x0891 /* ARABIC PIASTRE MARK ABOVE */
8063 && ch != 0x08E2 /* ARABIC DISPUTED END OF AYAH */
8064 && ch != 0x110BD /* KAITHI NUMBER SIGN */
8065 && ch != 0x110CD /* KAITHI NUMBER SIGN ABOVE */
8066 && ch != 0x13437 /* EGYPTIAN HIEROGLYPH BEGIN SEGMENT */
8067 && ch != 0x13438 /* EGYPTIAN HIEROGLYPH END SEGMENT */
8068 && ch != 0x1343C /* EGYPTIAN HIEROGLYPH BEGIN ENCLOSURE */
8069 && ch != 0x1343D /* EGYPTIAN HIEROGLYPH END ENCLOSURE */
8070 && ch != 0x1343E /* EGYPTIAN HIEROGLYPH BEGIN WALLED ENCLOSURE */
8071 && ch != 0x1343F /* EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE */)
8072 || ch == 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */)
8073 if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_CR) | ((int64_t) 1 << LBP_LF) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_VI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW) | ((int64_t) 1 << LBP_ZWJ))))
8074 attr |= (int64_t) 1 << LBP_CM;
8076 /* ideographic */
8077 if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
8078 || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */
8079 || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */
8080 || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
8081 || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */
8082 || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */
8083 /* Extra characters for compatibility with Unicode LineBreak.txt. */
8084 || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
8085 || ch == 0x1B5C /* BALINESE WINDU */
8086 || (ch >= 0x1B61 && ch <= 0x1B6A) /* BALINESE MUSICAL SYMBOL DONG..BALINESE MUSICAL SYMBOL DANG GEDE */
8087 || (ch >= 0x1B74 && ch <= 0x1B7C) /* BALINESE MUSICAL SYMBOL RIGHT-HAND OPEN DUG..BALINESE MUSICAL SYMBOL LEFT-HAND OPEN PING */
8088 || ch == 0x231A /* WATCH */
8089 || ch == 0x231B /* HOURGLASS */
8090 || ch == 0x23F0 /* ALARM CLOCK */
8091 || ch == 0x23F1 /* STOPWATCH */
8092 || ch == 0x23F2 /* TIMER CLOCK */
8093 || ch == 0x23F3 /* HOURGLASS WITH FLOWING SAND */
8094 || ch == 0x2600 /* BLACK SUN WITH RAYS */
8095 || ch == 0x2601 /* CLOUD */
8096 || ch == 0x2602 /* UMBRELLA */
8097 || ch == 0x2603 /* SNOWMAN */
8098 || ch == 0x2614 /* UMBRELLA WITH RAIN DROPS */
8099 || ch == 0x2615 /* HOT BEVERAGE */
8100 || ch == 0x2618 /* SHAMROCK */
8101 || ch == 0x261A /* BLACK LEFT POINTING INDEX */
8102 || ch == 0x261B /* BLACK RIGHT POINTING INDEX */
8103 || ch == 0x261C /* WHITE LEFT POINTING INDEX */
8104 || ch == 0x261D /* WHITE UP POINTING INDEX */
8105 || ch == 0x261E /* WHITE RIGHT POINTING INDEX */
8106 || ch == 0x261F /* WHITE DOWN POINTING INDEX */
8107 || ch == 0x2639 /* WHITE FROWNING FACE */
8108 || ch == 0x263A /* WHITE SMILING FACE */
8109 || ch == 0x263B /* BLACK SMILING FACE */
8110 || ch == 0x2668 /* HOT SPRINGS */
8111 || ch == 0x267F /* WHEELCHAIR SYMBOL */
8112 || ch == 0x26BD /* SOCCER BALL */
8113 || ch == 0x26BE /* BASEBALL */
8114 || ch == 0x26BF /* SQUARED KEY */
8115 || ch == 0x26C0 /* WHITE DRAUGHTS MAN */
8116 || ch == 0x26C1 /* WHITE DRAUGHTS KING */
8117 || ch == 0x26C2 /* BLACK DRAUGHTS MAN */
8118 || ch == 0x26C3 /* BLACK DRAUGHTS KING */
8119 || ch == 0x26C4 /* SNOWMAN WITHOUT SNOW */
8120 || ch == 0x26C5 /* SUN BEHIND CLOUD */
8121 || ch == 0x26C6 /* RAIN */
8122 || ch == 0x26C7 /* BLACK SNOWMAN */
8123 || ch == 0x26C8 /* THUNDER CLOUD AND RAIN */
8124 || ch == 0x26CD /* DISABLED CAR */
8125 || ch == 0x26CF /* PICK */
8126 || ch == 0x26D0 /* CAR SLIDING */
8127 || ch == 0x26D1 /* HELMET WITH WHITE CROSS */
8128 || ch == 0x26D3 /* CHAINS */
8129 || ch == 0x26D4 /* NO ENTRY */
8130 || ch == 0x26D8 /* BLACK LEFT LANE MERGE */
8131 || ch == 0x26D9 /* WHITE LEFT LANE MERGE */
8132 || ch == 0x26DC /* LEFT CLOSED ENTRY */
8133 || ch == 0x26DF /* BLACK TRUCK */
8134 || ch == 0x26E0 /* RESTRICTED LEFT ENTRY-1 */
8135 || ch == 0x26E1 /* RESTRICTED LEFT ENTRY-2 */
8136 || ch == 0x26EA /* CHURCH */
8137 || ch == 0x26F1 /* UMBRELLA ON GROUND */
8138 || ch == 0x26F2 /* FOUNTAIN */
8139 || ch == 0x26F3 /* FLAG IN HOLE */
8140 || ch == 0x26F4 /* FERRY */
8141 || ch == 0x26F5 /* SAILBOAT */
8142 || ch == 0x26F7 /* SKIER */
8143 || ch == 0x26F8 /* ICE SKATE */
8144 || ch == 0x26F9 /* PERSON WITH BALL */
8145 || ch == 0x26FA /* TENT */
8146 || ch == 0x26FD /* FUEL PUMP */
8147 || ch == 0x26FE /* CUP ON BLACK SQUARE */
8148 || ch == 0x26FF /* WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE */
8149 || ch == 0x2700 /* BLACK SAFETY SCISSORS */
8150 || ch == 0x2701 /* UPPER BLADE SCISSORS */
8151 || ch == 0x2702 /* BLACK SCISSORS */
8152 || ch == 0x2703 /* LOWER BLADE SCISSORS */
8153 || ch == 0x2704 /* WHITE SCISSORS */
8154 || ch == 0x2708 /* AIRPLANE */
8155 || ch == 0x2709 /* ENVELOPE */
8156 || ch == 0x270A /* RAISED FIST */
8157 || ch == 0x270B /* RAISED HAND */
8158 || ch == 0x270C /* VICTORY HAND */
8159 || ch == 0x270D /* WRITING HAND */
8160 || ch == 0x2764 /* HEAVY BLACK HEART */
8161 || (ch >= 0x3000 && ch <= 0x33FF
8162 && !(attr & (((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP1) | ((int64_t) 1 << LBP_OP2) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP1) | ((int64_t) 1 << LBP_CP2))))
8163 || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */
8164 || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */
8165 || (ch >= 0xA9C1 && ch <= 0xA9C6) /* JAVANESE LEFT RERENGGAN..JAVANESE PADA WINDU */
8166 || (ch >= 0xA9CA && ch <= 0xA9CD) /* JAVANESE PADA ADEG..JAVANESE TURNED PADA PISELEH */
8167 || ch == 0xA9DE /* JAVANESE PADA TIRTA TUMETES */
8168 || ch == 0xA9DF /* JAVANESE PADA ISEN-ISEN */
8169 || ch == 0xAA5C /* CHAM PUNCTUATION SPIRAL */
8170 || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */
8171 || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */
8172 || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */
8173 || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */
8174 || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */
8175 || ch == 0xFE45 /* SESAME DOT */
8176 || ch == 0xFE46 /* WHITE SESAME DOT */
8177 || ch == 0xFE49 /* DASHED OVERLINE */
8178 || ch == 0xFE4A /* CENTRELINE OVERLINE */
8179 || ch == 0xFE4B /* WAVY OVERLINE */
8180 || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */
8181 || ch == 0xFE4D /* DASHED LOW LINE */
8182 || ch == 0xFE4E /* CENTRELINE LOW LINE */
8183 || ch == 0xFE4F /* WAVY LOW LINE */
8184 || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */
8185 || ch == 0xFE58 /* SMALL EM DASH */
8186 || ch == 0xFE5F /* SMALL NUMBER SIGN */
8187 || ch == 0xFE60 /* SMALL AMPERSAND */
8188 || ch == 0xFE61 /* SMALL ASTERISK */
8189 || ch == 0xFE62 /* SMALL PLUS SIGN */
8190 || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
8191 || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
8192 || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
8193 || ch == 0xFE66 /* SMALL EQUALS SIGN */
8194 || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */
8195 || ch == 0xFE6B /* SMALL COMMERCIAL AT */
8196 || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */
8197 || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */
8198 || ch == 0xFF06 /* FULLWIDTH AMPERSAND */
8199 || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */
8200 || ch == 0xFF0A /* FULLWIDTH ASTERISK */
8201 || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */
8202 || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */
8203 || ch == 0xFF0F /* FULLWIDTH SOLIDUS */
8204 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
8205 || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */
8206 || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */
8207 || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */
8208 || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */
8209 || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */
8210 || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */
8211 || ch == 0xFF3F /* FULLWIDTH LOW LINE */
8212 || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */
8213 || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */
8214 || ch == 0xFF5E /* FULLWIDTH TILDE */
8215 || ch == 0xFF66 /* Halfwidth Katakana */
8216 || (ch >= 0xFF71 && ch <= 0xFF9D) /* Halfwidth Katakana */
8217 || (ch >= 0xFFA0 && ch <= 0xFFBE) /* Halfwidth Hangul */
8218 || (ch >= 0xFFC2 && ch <= 0xFFC7) /* Halfwidth Hangul */
8219 || (ch >= 0xFFCA && ch <= 0xFFCF) /* Halfwidth Hangul */
8220 || (ch >= 0xFFD2 && ch <= 0xFFD7) /* Halfwidth Hangul */
8221 || (ch >= 0xFFDA && ch <= 0xFFDC) /* Halfwidth Hangul */
8222 || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */
8223 || ch == 0xFFE3 /* FULLWIDTH MACRON */
8224 || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */
8225 || (ch >= 0x11049 && ch <= 0x1104D) /* BRAHMI PUNCTUATION DOT..BRAHMI PUNCTUATION LOTUS */
8226 || (ch >= 0x11052 && ch <= 0x11065) /* BRAHMI NUMBER ONE..BRAHMI NUMBER ONE THOUSAND */
8227 || ch == 0x113B7 /* TULU-TIGALARI SIGN AVAGRAHA */
8228 || ch == 0x113D3 /* TULU-TIGALARI SIGN PLUTA */
8229 || ch == 0x113D4 /* TULU-TIGALARI DANDA */
8230 || ch == 0x113D5 /* TULU-TIGALARI DOUBLE DANDA */
8231 || ch == 0x113D7 /* TULU-TIGALARI SIGN OM PUSHPIKA */
8232 || ch == 0x113D8 /* TULU-TIGALARI SIGN SHRII PUSHPIKA */
8233 || (ch >= 0x11F45 && ch <= 0x11F4F) /* Kawi Punctuation */
8234 || (ch >= 0x17000 && ch <= 0x187F7) /* Tangut Ideograph */
8235 || (ch >= 0x18800 && ch <= 0x18AFF) /* Tangut Ideograph */
8236 || (ch >= 0x18D00 && ch <= 0x18D08) /* Tangut Ideograph Supplement */
8237 || (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */
8238 || (ch >= 0x1B002 && ch <= 0x1B122) /* Hentaigana, archaic Hiragana/Katakana */
8239 || (ch >= 0x1B170 && ch <= 0x1B2FB) /* Nushu */
8240 || (ch >= 0x1F000 && ch <= 0x1F02B) /* Mahjong Tiles */
8241 || (ch >= 0x1F030 && ch <= 0x1F093) /* Domino Tiles */
8242 || (ch >= 0x1F0A0 && ch <= 0x1F0F5) /* Playing Cards */
8243 || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */
8244 || (ch >= 0x1F250 && ch <= 0x1F251) /* Enclosed Ideographic Supplement */
8245 || (ch >= 0x1F260 && ch <= 0x1F265) /* Rounded Symbols */
8246 || (ch >= 0x1F300 && ch <= 0x1F5FF /* Miscellaneous Symbols and Pictographs */
8247 && ch != 0x1F3B5 && ch != 0x1F3B6 && ch != 0x1F3BC
8248 && ch != 0x1F4A0 && ch != 0x1F4A2 && ch != 0x1F4A4
8249 && ch != 0x1F4AF && ch != 0x1F4B1 && ch != 0x1F4B2
8250 && !(ch >= 0x1F39C && ch <= 0x1F39D)
8251 && !(ch >= 0x1F3FB && ch <= 0x1F3FF)
8252 && !(ch >= 0x1F500 && ch <= 0x1F506)
8253 && !(ch >= 0x1F517 && ch <= 0x1F524)
8254 && !(ch >= 0x1F532 && ch <= 0x1F549)
8255 && !(ch >= 0x1F5D4 && ch <= 0x1F5DB)
8256 && !(ch >= 0x1F5F4 && ch <= 0x1F5F9))
8257 || (ch >= 0x1F600 && ch <= 0x1F64F) /* Emoticons */
8258 || (ch >= 0x1F680 && ch <= 0x1F6DF) /* Transport and Map Symbols */
8259 || (ch >= 0x1F6E0 && ch <= 0x1F6EC) /* Transport and Map Symbols */
8260 || (ch >= 0x1F6F0 && ch <= 0x1F6FC) /* Transport and Map Symbols */
8261 || ch == 0x1F774 /* LOT OF FORTUNE */
8262 || ch == 0x1F775 /* OCCULTATION */
8263 || ch == 0x1F776 /* LUNAR ECLIPSE */
8264 || ch == 0x1F77B /* HAUMEA */
8265 || ch == 0x1F77C /* MAKEMAKE */
8266 || ch == 0x1F77D /* GONGGONG */
8267 || ch == 0x1F77E /* QUAOAR */
8268 || ch == 0x1F77F /* ORCUS */
8269 || (ch >= 0x1F7D5 && ch <= 0x1F7D8) /* Circled polygons */
8270 || ch == 0x1F7D9 /* NINE POINTED WHITE STAR */
8271 || (ch >= 0x1F7E0 && ch <= 0x1F7EB) /* Large circles */
8272 || ch == 0x1F7F0 /* Heavy equals sign */
8273 || (ch >= 0x1F90C && ch <= 0x1F9FF) /* Supplemental Symbols and Pictographs */
8274 || (ch >= 0x1FA60 && ch <= 0x1FA6D) /* Xiangqi pieces */
8275 || (ch >= 0x1FA70 && ch <= 0x1FA74) /* Emoticons */
8276 || (ch >= 0x1FA75 && ch <= 0x1FA77) /* Colored heart symbols */
8277 || (ch >= 0x1FA78 && ch <= 0x1FA7C) /* Medical pictographs */
8278 || (ch >= 0x1FA80 && ch <= 0x1FA89) /* Pictographs */
8279 || (ch >= 0x1FA8F && ch <= 0x1FABD) /* Pictographs */
8280 || (ch >= 0x1FABE && ch <= 0x1FAC2) /* Pictographs */
8281 || ch == 0x1FAC6 /* Pictographs */
8282 || (ch >= 0x1FACE && ch <= 0x1FADC) /* Pictographs */
8283 || (ch >= 0x1FADF && ch <= 0x1FAE9) /* Pictographs */
8284 || (ch >= 0x1FAF7 && ch <= 0x1FAF8) /* Pictographs */
8285 || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */
8286 || (ch >= 0x2A6D7 && ch <= 0x2A6DF) /* CJK Ideograph Extension B */
8287 || (ch >= 0x2A700 && ch <= 0x2B739) /* CJK Ideograph Extension C */
8288 || (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */
8289 || (ch >= 0x2B820 && ch <= 0x2CEAF) /* CJK Ideograph Extension E */
8290 || (ch >= 0x2CEB0 && ch <= 0x2EBE0) /* CJK Ideograph Extension F */
8291 || (ch >= 0x2EBF0 && ch <= 0x2EE5D) /* CJK Ideograph Extension I */
8292 || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */
8293 || (ch >= 0x30000 && ch <= 0x3134A) /* CJK Ideograph Extension G */
8294 || (ch >= 0x31350 && ch <= 0x323AF) /* CJK Ideograph Extension H */)
8295 if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_EB))))
8297 /* ambiguous (ideograph) ? */
8298 if ((unicode_width[ch] != NULL
8299 && unicode_width[ch][0] == 'A'
8300 && ch >= 0x2000
8301 && ch != 0x2614
8302 && ch != 0x2615
8303 && ch != 0x261C
8304 && ch != 0x261E
8305 && ch != 0x2668
8306 && ch != 0x26BE
8307 && ch != 0x26BF
8308 && !(ch >= 0x26C4 && ch <= 0x26C8)
8309 && ch != 0x26CD
8310 && ch != 0x26CF
8311 && ch != 0x26D0
8312 && ch != 0x26D1
8313 && ch != 0x26D3
8314 && ch != 0x26D4
8315 && ch != 0x26D8
8316 && ch != 0x26D9
8317 && ch != 0x26DC
8318 && ch != 0x26DF
8319 && ch != 0x26E0
8320 && ch != 0x26E1
8321 && ch != 0x26EA
8322 && !(ch >= 0x26F1 && ch <= 0x26F5)
8323 && !(ch >= 0x26F7 && ch <= 0x26FA)
8324 && !(ch >= 0x26FD && ch <= 0x26FF))
8325 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
8326 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
8327 || (ch >= 0x3248 && ch <= 0x324F) /* CIRCLED NUMBER TEN ON BLACK SQUARE..CIRCLED NUMBER EIGHTY ON BLACK SQUARE */)
8328 attr |= (int64_t) 1 << LBP_AI;
8329 else
8330 attr |= (int64_t) 1 << LBP_ID1;
8333 /* ordinary alphabetic and symbol characters */
8334 if ((unicode_attributes[ch].category[0] == 'L'
8335 && (unicode_attributes[ch].category[1] == 'u'
8336 || unicode_attributes[ch].category[1] == 'l'
8337 || unicode_attributes[ch].category[1] == 't'
8338 || unicode_attributes[ch].category[1] == 'm'
8339 || unicode_attributes[ch].category[1] == 'o'))
8340 || (unicode_attributes[ch].category[0] == 'S'
8341 && (unicode_attributes[ch].category[1] == 'm'
8342 || unicode_attributes[ch].category[1] == 'k'
8343 || unicode_attributes[ch].category[1] == 'o'))
8344 || (unicode_attributes[ch].category[0] == 'N'
8345 && (unicode_attributes[ch].category[1] == 'l'
8346 || unicode_attributes[ch].category[1] == 'o'))
8347 || (unicode_attributes[ch].category[0] == 'P'
8348 && (unicode_attributes[ch].category[1] == 'c'
8349 || unicode_attributes[ch].category[1] == 'd'
8350 || unicode_attributes[ch].category[1] == 'o'))
8351 || ch == 0x070F /* SYRIAC ABBREVIATION MARK */
8352 || ch == 0x2061 /* FUNCTION APPLICATION */
8353 || ch == 0x2062 /* INVISIBLE TIMES */
8354 || ch == 0x2063 /* INVISIBLE SEPARATOR */
8355 || ch == 0x2064 /* INVISIBLE PLUS */
8356 /* Extra characters for compatibility with Unicode LineBreak.txt. */
8357 || ch == 0x08E2 /* ARABIC DISPUTED END OF AYAH */)
8358 if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP1) | ((int64_t) 1 << LBP_CP2) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP1) | ((int64_t) 1 << LBP_OP2) | ((int64_t) 1 << LBP_QU1) | ((int64_t) 1 << LBP_QU2) | ((int64_t) 1 << LBP_QU3) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_AP) | ((int64_t) 1 << LBP_AK) | ((int64_t) 1 << LBP_AS) | ((int64_t) 1 << LBP_VI) | ((int64_t) 1 << LBP_VF) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID1) | ((int64_t) 1 << LBP_ID2) | ((int64_t) 1 << LBP_EB) | ((int64_t) 1 << LBP_EM)))
8359 && ch != 0x3035 /* VERTICAL KANA REPEAT MARK LOWER HALF */
8360 && !(ch >= 0x3248 && ch <= 0x324F) /* CIRCLED NUMBER TEN ON BLACK SQUARE..CIRCLED NUMBER EIGHTY ON BLACK SQUARE */)
8362 /* ambiguous (alphabetic) ? */
8363 if ((unicode_width[ch] != NULL
8364 && unicode_width[ch][0] == 'A'
8365 && ch >= 0x2000
8366 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */
8367 && ch != 0x2022 /* BULLET */
8368 && ch != 0x203E /* OVERLINE */
8369 && ch != 0x2126 /* OHM SIGN */
8370 && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */
8371 && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */
8372 && ch != 0x21E7 /* UPWARDS WHITE ARROW */
8373 && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */
8374 && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */)
8375 || ch == 0x00A7 /* SECTION SIGN */
8376 || ch == 0x00A8 /* DIAERESIS */
8377 || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */
8378 || ch == 0x00B2 /* SUPERSCRIPT TWO */
8379 || ch == 0x00B3 /* SUPERSCRIPT THREE */
8380 || ch == 0x00B6 /* PILCROW SIGN */
8381 || ch == 0x00B7 /* MIDDLE DOT */
8382 || ch == 0x00B8 /* CEDILLA */
8383 || ch == 0x00B9 /* SUPERSCRIPT ONE */
8384 || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */
8385 || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */
8386 || ch == 0x00BD /* VULGAR FRACTION ONE HALF */
8387 || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */
8388 || ch == 0x00D7 /* MULTIPLICATION SIGN */
8389 || ch == 0x00F7 /* DIVISION SIGN */
8390 || ch == 0x02C7 /* CARON */
8391 || ch == 0x02C9 /* MODIFIER LETTER MACRON */
8392 || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */
8393 || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */
8394 || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */
8395 || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */
8396 || ch == 0x02D8 /* BREVE */
8397 || ch == 0x02D9 /* DOT ABOVE */
8398 || ch == 0x02DA /* RING ABOVE */
8399 || ch == 0x02DB /* OGONEK */
8400 || ch == 0x02DD /* DOUBLE ACUTE ACCENT */
8401 || ch == 0x24EA /* CIRCLED DIGIT ZERO */
8402 || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */
8403 /* Extra characters for compatibility with Unicode LineBreak.txt. */
8404 || ch == 0x2015 /* HORIZONTAL BAR */
8405 || ch == 0x2016 /* DOUBLE VERTICAL LINE */
8406 || ch == 0x2020 /* DAGGER */
8407 || ch == 0x2021 /* DOUBLE DAGGER */
8408 || ch == 0x203B /* REFERENCE MARK */
8409 || ch == 0x2074 /* SUPERSCRIPT FOUR */
8410 || ch == 0x207F /* SUPERSCRIPT LATIN SMALL LETTER N */
8411 || (ch >= 0x2081 && ch <= 0x2084) /* SUBSCRIPT ONE..FOUR */
8412 || ch == 0x2105 /* CARE OF */
8413 || ch == 0x2113 /* SCRIPT SMALL L */
8414 || ch == 0x2121 /* TELEPHONE SIGN */
8415 || ch == 0x2122 /* TRADE MARK SIGN */
8416 || ch == 0x212B /* ANGSTROM SIGN */
8417 || ch == 0x2150 /* VULGAR FRACTION ONE SEVENTH */
8418 || ch == 0x2151 /* VULGAR FRACTION ONE NINTH */
8419 || ch == 0x2152 /* VULGAR FRACTION ONE TENTH */
8420 || ch == 0x2153 /* VULGAR FRACTION ONE THIRD */
8421 || ch == 0x2154 /* VULGAR FRACTION TWO THIRDS */
8422 || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */
8423 || ch == 0x2156 /* VULGAR FRACTION TWO FIFTHS */
8424 || ch == 0x2157 /* VULGAR FRACTION THREE FIFTHS */
8425 || ch == 0x2158 /* VULGAR FRACTION FOUR FIFTHS */
8426 || ch == 0x2159 /* VULGAR FRACTION ONE SIXTH */
8427 || ch == 0x215A /* VULGAR FRACTION FIVE SIXTHS */
8428 || ch == 0x215B /* VULGAR FRACTION ONE EIGHTH */
8429 || ch == 0x215C /* VULGAR FRACTION THREE EIGHTHS */
8430 || ch == 0x215D /* VULGAR FRACTION FIVE EIGHTHS */
8431 || ch == 0x215E /* VULGAR FRACTION SEVEN EIGHTHS */
8432 || (ch >= 0x2160 && ch <= 0x216B) /* ROMAN NUMERAL ONE..TWELVE */
8433 || (ch >= 0x2170 && ch <= 0x2179) /* SMALL ROMAN NUMERAL ONE..TEN */
8434 || ch == 0x2189 /* VULGAR FRACTION ZERO THIRDS */
8435 || (ch >= 0x2190 && ch <= 0x2199) /* LEFTWARDS ARROW..SOUTH WEST ARROW */
8436 || ch == 0x21D2 /* RIGHTWARDS DOUBLE ARROW */
8437 || ch == 0x21D4 /* LEFT RIGHT DOUBLE ARROW */
8438 || ch == 0x2200 /* FOR ALL */
8439 || ch == 0x2202 /* PARTIAL DIFFERENTIAL */
8440 || ch == 0x2203 /* THERE EXISTS */
8441 || ch == 0x2207 /* NABLA */
8442 || ch == 0x2208 /* ELEMENT OF */
8443 || ch == 0x220B /* CONTAINS AS MEMBER */
8444 || ch == 0x220F /* N-ARY PRODUCT */
8445 || ch == 0x2211 /* N-ARY SUMMATION */
8446 || ch == 0x2215 /* DIVISION SLASH */
8447 || ch == 0x221A /* SQUARE ROOT */
8448 || ch == 0x221D /* PROPORTIONAL TO */
8449 || ch == 0x221E /* INFINITY */
8450 || ch == 0x221F /* RIGHT ANGLE */
8451 || ch == 0x2220 /* ANGLE */
8452 || ch == 0x2223 /* DIVIDES */
8453 || ch == 0x2225 /* PARALLEL TO */
8454 || ch == 0x2227 /* LOGICAL AND */
8455 || ch == 0x2228 /* LOGICAL OR */
8456 || ch == 0x2229 /* INTERSECTION */
8457 || ch == 0x222A /* UNION */
8458 || ch == 0x222B /* INTEGRAL */
8459 || ch == 0x222C /* DOUBLE INTEGRAL */
8460 || ch == 0x222E /* CONTOUR INTEGRAL */
8461 || ch == 0x2234 /* THEREFORE */
8462 || ch == 0x2235 /* BECAUSE */
8463 || ch == 0x2236 /* RATIO */
8464 || ch == 0x2237 /* PROPORTION */
8465 || ch == 0x223C /* TILDE OPERATOR */
8466 || ch == 0x223D /* REVERSED TILDE */
8467 || ch == 0x2248 /* ALMOST EQUAL TO */
8468 || ch == 0x224C /* ALL EQUAL TO */
8469 || ch == 0x2252 /* APPROXIMATELY EQUAL TO OR THE IMAGE OF */
8470 || ch == 0x2260 /* NOT EQUAL TO */
8471 || ch == 0x2261 /* IDENTICAL TO */
8472 || ch == 0x2264 /* LESS-THAN OR EQUAL TO */
8473 || ch == 0x2265 /* GREATER-THAN OR EQUAL TO */
8474 || ch == 0x2266 /* LESS-THAN OVER EQUAL TO */
8475 || ch == 0x2267 /* GREATER-THAN OVER EQUAL TO */
8476 || ch == 0x226A /* MUCH LESS-THAN */
8477 || ch == 0x226B /* MUCH GREATER-THAN */
8478 || ch == 0x226E /* NOT LESS-THAN */
8479 || ch == 0x226F /* NOT GREATER-THAN */
8480 || ch == 0x2282 /* SUBSET OF */
8481 || ch == 0x2283 /* SUPERSET OF */
8482 || ch == 0x2286 /* SUBSET OF OR EQUAL TO */
8483 || ch == 0x2287 /* SUPERSET OF OR EQUAL TO */
8484 || ch == 0x2295 /* CIRCLED PLUS */
8485 || ch == 0x2299 /* CIRCLED DOT OPERATOR */
8486 || ch == 0x22A5 /* UP TACK */
8487 || ch == 0x22BF /* RIGHT TRIANGLE */
8488 || ch == 0x2312 /* ARC */
8489 || (ch >= 0x2460 && ch <= 0x24E9) /* CIRCLED DIGIT ONE..CIRCLED LATIN SMALL LETTER Z */
8490 || (ch >= 0x24EB && ch <= 0x24FE) /* NEGATIVE CIRCLED NUMBER ELEVEN..DOUBLE CIRCLED NUMBER TEN */
8491 || (ch >= 0x2500 && ch <= 0x254B) /* BOX DRAWINGS LIGHT HORIZONTAL..BOX DRAWINGS HEAVY VERTICAL AND HORIZONTAL */
8492 || (ch >= 0x2550 && ch <= 0x2574) /* BOX DRAWINGS DOUBLE HORIZONTAL..BOX DRAWINGS LIGHT LEFT */
8493 || (ch >= 0x2580 && ch <= 0x258F) /* UPPER HALF BLOCK..LEFT ONE EIGHTH BLOCK */
8494 || (ch >= 0x2592 && ch <= 0x2595) /* MEDIUM SHADE..RIGHT ONE EIGHTH BLOCK */
8495 || ch == 0x25A0 /* BLACK SQUARE */
8496 || ch == 0x25A1 /* WHITE SQUARE */
8497 || (ch >= 0x25A3 && ch <= 0x25A9) /* WHITE SQUARE CONTAINING BLACK SMALL SQUARE..SQUARE WITH DIAGONAL CROSSHATCH FILL */
8498 || ch == 0x25B2 /* BLACK UP-POINTING TRIANGLE */
8499 || ch == 0x25B3 /* WHITE UP-POINTING TRIANGLE */
8500 || ch == 0x25B6 /* BLACK RIGHT-POINTING TRIANGLE */
8501 || ch == 0x25B7 /* WHITE RIGHT-POINTING TRIANGLE */
8502 || ch == 0x25BC /* BLACK DOWN-POINTING TRIANGLE */
8503 || ch == 0x25BD /* WHITE DOWN-POINTING TRIANGLE */
8504 || ch == 0x25C0 /* BLACK LEFT-POINTING TRIANGLE */
8505 || ch == 0x25C1 /* WHITE LEFT-POINTING TRIANGLE */
8506 || (ch >= 0x25C6 && ch <= 0x25C8) /* BLACK DIAMOND..WHITE DIAMOND CONTAINING BLACK SMALL DIAMOND */
8507 || ch == 0x25CB /* WHITE CIRCLE */
8508 || (ch >= 0x25CE && ch <= 0x25D1) /* BULLSEYE..CIRCLE WITH RIGHT HALF BLACK */
8509 || (ch >= 0x25E2 && ch <= 0x25E5) /* BLACK LOWER RIGHT TRIANGLE..BLACK UPPER RIGHT TRIANGLE */
8510 || ch == 0x25EF /* LARGE CIRCLE */
8511 || ch == 0x2605 /* BLACK STAR */
8512 || ch == 0x2606 /* WHITE STAR */
8513 || ch == 0x2609 /* SUN */
8514 || ch == 0x260E /* BLACK TELEPHONE */
8515 || ch == 0x260F /* WHITE TELEPHONE */
8516 || ch == 0x2616 /* WHITE SHOGI PIECE */
8517 || ch == 0x2617 /* BLACK SHOGI PIECE */
8518 || ch == 0x2640 /* FEMALE SIGN */
8519 || ch == 0x2642 /* MALE SIGN */
8520 || ch == 0x2660 /* BLACK SPADE SUIT */
8521 || ch == 0x2661 /* WHITE HEART SUIT */
8522 || (ch >= 0x2663 && ch <= 0x2665) /* BLACK CLUB SUIT..BLACK HEART SUIT */
8523 || ch == 0x2667 /* WHITE CLUB SUIT */
8524 || ch == 0x2669 /* QUARTER NOTE */
8525 || ch == 0x266A /* EIGHTH NOTE */
8526 || ch == 0x266C /* BEAMED SIXTEENTH NOTES */
8527 || ch == 0x266D /* MUSIC FLAT SIGN */
8528 || ch == 0x266F /* MUSIC SHARP SIGN */
8529 || ch == 0x269E /* THREE LINES CONVERGING RIGHT */
8530 || ch == 0x269F /* THREE LINES CONVERGING LEFT */
8531 || (ch >= 0x26C9 && ch <= 0x26CC) /* TURNED WHITE SHOGI PIECE..CROSSING LANES */
8532 || ch == 0x26D2 /* CIRCLED CROSSING LANES */
8533 || (ch >= 0x26D5 && ch <= 0x26D7) /* ALTERNATE ONE-WAY LEFT WAY TRAFFIC..WHITE TWO-WAY LEFT WAY TRAFFIC */
8534 || ch == 0x26DA /* DRIVE SLOW SIGN */
8535 || ch == 0x26DB /* HEAVY WHITE DOWN-POINTING TRIANGLE */
8536 || ch == 0x26DD /* SQUARED SALTIRE */
8537 || ch == 0x26DE /* FALLING DIAGONAL IN WHITE CIRCLE IN BLACK SQUARE */
8538 || ch == 0x26E3 /* HEAVY CIRCLE WITH STROKE AND TWO DOTS ABOVE */
8539 || ch == 0x26E8 /* BLACK CROSS ON SHIELD */
8540 || ch == 0x26E9 /* SHINTO SHRINE */
8541 || (ch >= 0x26EB && ch <= 0x26F0) /* CASTLE..MOUNTAIN */
8542 || ch == 0x26F6 /* SQUARE FOUR CORNERS */
8543 || ch == 0x26FB /* JAPANESE BANK SYMBOL */
8544 || ch == 0x26FC /* HEADSTONE GRAVEYARD SYMBOL */
8545 || ch == 0x2757 /* HEAVY EXCLAMATION MARK SYMBOL */
8546 || (ch >= 0x2776 && ch <= 0x277F) /* DINGBAT NEGATIVE CIRCLED DIGIT ONE..DINGBAT NEGATIVE CIRCLED NUMBER TEN */
8547 || (ch >= 0x2B55 && ch <= 0x2B59) /* HEAVY LARGE CIRCLE..HEAVY CIRCLED SALTIRE */
8548 || ch == 0xFFFD /* REPLACEMENT CHARACTER */
8549 || (ch >= 0x1F100 && ch <= 0x1F10C) /* DIGIT ZERO FULL STOP..DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO */
8550 || (ch >= 0x1F110 && ch <= 0x1F12D) /* PARENTHESIZED LATIN CAPITAL LETTER A..CIRCLED CD */
8551 || (ch >= 0x1F130 && ch <= 0x1F169) /* SQUARED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z */
8552 || (ch >= 0x1F170 && ch <= 0x1F1AC) /* NEGATIVE SQUARED LATIN CAPITAL LETTER A..SQUARED VOD */)
8553 attr |= (int64_t) 1 << LBP_AI;
8554 else
8556 if (ch == 0x25CC)
8557 attr |= (int64_t) 1 << LBP_AL2;
8558 else
8559 attr |= (int64_t) 1 << LBP_AL1;
8561 attr &= ~((int64_t) 1 << LBP_CM);
8564 else
8566 /* Unassigned character. */
8567 if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */
8568 || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */
8569 || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */
8570 || (ch >= 0x1F02C && ch <= 0x1F02F) /* reserved */
8571 || (ch >= 0x1F094 && ch <= 0x1F09F) /* reserved */
8572 || (ch >= 0x1F0AF && ch <= 0x1F0B0) /* reserved */
8573 || ch == 0x1F0C0 /* reserved */
8574 || ch == 0x1F0D0 /* reserved */
8575 || (ch >= 0x1F0F6 && ch <= 0x1F0FF) /* reserved */
8576 || (ch >= 0x1F10D && ch <= 0x1F10F) /* reserved */
8577 || ch == 0x1F12F /* reserved */
8578 || (ch >= 0x1F16C && ch <= 0x1F16F) /* reserved */
8579 || (ch >= 0x1F1AD && ch <= 0x1F1E5) /* reserved */
8580 || (ch >= 0x1F203 && ch <= 0x1F20F) /* reserved */
8581 || (ch >= 0x1F23C && ch <= 0x1F23F) /* reserved */
8582 || (ch >= 0x1F249 && ch <= 0x1F24F) /* reserved */
8583 || (ch >= 0x1F252 && ch <= 0x1F2FF) /* reserved */
8584 || (ch >= 0x1F6D3 && ch <= 0x1F6DF) /* reserved */
8585 || (ch >= 0x1F6ED && ch <= 0x1F6EF) /* reserved */
8586 || (ch >= 0x1F6F7 && ch <= 0x1F6FF) /* reserved */
8587 || (ch >= 0x1F774 && ch <= 0x1F77F) /* reserved */
8588 || (ch >= 0x1F7D5 && ch <= 0x1F7FF) /* reserved */
8589 || (ch >= 0x1F8B0 && ch <= 0x1F8BB) /* reserved */
8590 || (ch >= 0x1F8C0 && ch <= 0x1F8C1) /* reserved */
8591 || (ch >= 0x1F900 && ch <= 0x1F90F) /* reserved */
8592 || ch == 0x1F91F /* reserved */
8593 || ch == 0x1F93F /* reserved */
8594 || (ch >= 0x1F928 && ch <= 0x1F92F) /* reserved */
8595 || (ch >= 0x1F931 && ch <= 0x1F932) /* reserved */
8596 || (ch >= 0x1F94C && ch <= 0x1F94F) /* reserved */
8597 || (ch >= 0x1F95F && ch <= 0x1F97F) /* reserved */
8598 || (ch >= 0x1F992 && ch <= 0x1F9BF) /* reserved */
8599 || (ch >= 0x1F9C1 && ch <= 0x1FB92) /* reserved */
8600 || (ch >= 0x1FB94 && ch <= 0x1FBCA) /* reserved */
8601 || (ch >= 0x1FBF0 && ch <= 0x1FBF9) /* reserved */
8602 || (ch >= 0x1FC00 && ch <= 0x1FFFD) /* reserved */
8603 || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */
8604 || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C,
8605 Supplementary Ideographic Plane (Plane 2) outside of blocks */
8606 || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement,
8607 Supplementary Ideographic Plane (Plane 2) outside of blocks */
8608 || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */)
8610 if (is_property_extended_pictographic (ch))
8611 attr |= (int64_t) 1 << LBP_ID2;
8612 else
8613 attr |= (int64_t) 1 << LBP_ID1;
8617 if (attr == 0)
8618 /* unknown */
8619 attr |= (int64_t) 1 << LBP_XX;
8621 return attr;
/* A table entry packs a line breaking property and an 'ea' flag into one
   integer: the property is stored in the upper bits, the flag in the lowest
   bit.  */
#define PROP_EA(lbp,flag) (((lbp) << 1) | (flag))

/* Inverse operations: extract the property, respectively the lowest-bit
   flag, from a packed table entry.  */
#define PROP(packed) ((packed) >> 1)
#define EA(packed) ((packed) & 1)
8631 /* Output the line breaking properties in a human readable format. */
8632 static void
8633 debug_output_lbp (FILE *stream)
8635 unsigned int i;
8637 for (i = 0; i < 0x110000; i++)
8639 int64_t attr = get_lbp (i);
8640 if (attr != (int64_t) 1 << LBP_XX)
8642 fprintf (stream, "0x%04X", i);
8643 #define PRINT_BIT(attr,bit) \
8644 if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit);
8645 #define PRINT_BIT_ALT(attr,bit,name) \
8646 if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #name);
8647 PRINT_BIT(attr,LBP_BK);
8648 PRINT_BIT(attr,LBP_CR);
8649 PRINT_BIT(attr,LBP_LF);
8650 PRINT_BIT(attr,LBP_CM);
8651 PRINT_BIT(attr,LBP_WJ);
8652 PRINT_BIT(attr,LBP_ZW);
8653 PRINT_BIT(attr,LBP_GL);
8654 PRINT_BIT(attr,LBP_SP);
8655 PRINT_BIT(attr,LBP_B2);
8656 PRINT_BIT(attr,LBP_BA);
8657 PRINT_BIT(attr,LBP_BB);
8658 PRINT_BIT(attr,LBP_HY);
8659 PRINT_BIT(attr,LBP_CB);
8660 PRINT_BIT(attr,LBP_CL);
8661 PRINT_BIT_ALT(attr,LBP_CP1,LBP_CP);
8662 PRINT_BIT_ALT(attr,LBP_CP2,LBP_CP);
8663 PRINT_BIT(attr,LBP_EX);
8664 PRINT_BIT(attr,LBP_IN);
8665 PRINT_BIT(attr,LBP_NS);
8666 PRINT_BIT_ALT(attr,LBP_OP1,LBP_OP);
8667 PRINT_BIT_ALT(attr,LBP_OP2,LBP_OP);
8668 PRINT_BIT_ALT(attr,LBP_QU1,LBP_QU);
8669 PRINT_BIT_ALT(attr,LBP_QU2,LBP_QU);
8670 PRINT_BIT_ALT(attr,LBP_QU3,LBP_QU);
8671 PRINT_BIT(attr,LBP_IS);
8672 PRINT_BIT(attr,LBP_NU);
8673 PRINT_BIT(attr,LBP_PO);
8674 PRINT_BIT(attr,LBP_PR);
8675 PRINT_BIT(attr,LBP_SY);
8676 PRINT_BIT(attr,LBP_AI);
8677 PRINT_BIT_ALT(attr,LBP_AL1,LBP_AL);
8678 PRINT_BIT_ALT(attr,LBP_AL2,LBP_AL);
8679 PRINT_BIT(attr,LBP_H2);
8680 PRINT_BIT(attr,LBP_H3);
8681 PRINT_BIT(attr,LBP_HL);
8682 PRINT_BIT_ALT(attr,LBP_ID1,LBP_ID);
8683 PRINT_BIT_ALT(attr,LBP_ID2,LBP_ID);
8684 PRINT_BIT(attr,LBP_JL);
8685 PRINT_BIT(attr,LBP_JV);
8686 PRINT_BIT(attr,LBP_JT);
8687 PRINT_BIT(attr,LBP_AP);
8688 PRINT_BIT(attr,LBP_AK);
8689 PRINT_BIT(attr,LBP_AS);
8690 PRINT_BIT(attr,LBP_VI);
8691 PRINT_BIT(attr,LBP_VF);
8692 PRINT_BIT(attr,LBP_RI);
8693 PRINT_BIT(attr,LBP_SA);
8694 PRINT_BIT(attr,LBP_ZWJ);
8695 PRINT_BIT(attr,LBP_EB);
8696 PRINT_BIT(attr,LBP_EM);
8697 PRINT_BIT(attr,LBP_XX);
8698 #undef PRINT_BIT_ALT
8699 #undef PRINT_BIT
8700 fprintf (stream, "\n");
/* Writes the line breaking properties computed by this program, in human
   readable form, to the file FILENAME.
   Exits with status 1 if the file cannot be opened or written.  */
static void
debug_output_lbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");
  bool failed;

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_lbp (out);

  /* As before, the stream is only closed when no write error is pending.  */
  failed = (ferror (out) != 0);
  if (!failed)
    failed = (fclose (out) != 0);
  if (failed)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8726 /* The line breaking property from the LineBreak.txt file. */
8727 int unicode_org_lbp[0x110000];
8729 /* Stores in unicode_org_lbp[] the line breaking property from the
8730 LineBreak.txt file. */
8731 static void
8732 fill_org_lbp (const char *linebreak_filename)
8734 unsigned int i, j;
8735 FILE *stream;
8736 char field0[FIELDLEN];
8737 char field1[FIELDLEN];
8738 char field2[FIELDLEN];
8739 int lineno = 0;
8741 for (i = 0; i < 0x110000; i++)
8742 unicode_org_lbp[i] = LBP_XX;
8744 stream = fopen (linebreak_filename, "r");
8745 if (stream == NULL)
8747 fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename);
8748 exit (1);
8751 for (;;)
8753 int n;
8754 int c;
8755 int value;
8757 lineno++;
8758 c = getc (stream);
8759 if (c == EOF)
8760 break;
8761 if (c == '\n')
8762 continue;
8763 if (c == '#')
8765 do c = getc (stream); while (c != EOF && c != '\n');
8766 continue;
8768 ungetc (c, stream);
8769 n = getfield (stream, field0, ';');
8770 do c = getc (stream); while (c == ' ');
8771 ungetc (c, stream);
8772 n += getfield (stream, field1, '#');
8773 n += getfield (stream, field2, '\n');
8774 if (n == 0)
8775 break;
8776 if (n != 3)
8778 fprintf (stderr, "short line in '%s':%d\n", linebreak_filename,
8779 lineno);
8780 exit (1);
8782 /* Remove trailing spaces from field0. */
8783 while (strlen (field0) > 0 && field0[strlen (field0) - 1] == ' ')
8784 field0[strlen (field0) - 1] = '\0';
8785 /* Remove trailing spaces from field1. */
8786 while (strlen (field1) > 0 && field1[strlen (field1) - 1] == ' ')
8787 field1[strlen (field1) - 1] = '\0';
8788 #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit;
8789 if (false) {}
8790 TRY(LBP_BK)
8791 TRY(LBP_CR)
8792 TRY(LBP_LF)
8793 TRY(LBP_CM)
8794 TRY(LBP_WJ)
8795 TRY(LBP_ZW)
8796 TRY(LBP_GL)
8797 TRY(LBP_SP)
8798 TRY(LBP_B2)
8799 TRY(LBP_BA)
8800 TRY(LBP_BB)
8801 TRY(LBP_HY)
8802 TRY(LBP_CB)
8803 TRY(LBP_CL)
8804 TRY(LBP_CP)
8805 TRY(LBP_EX)
8806 TRY(LBP_IN)
8807 TRY(LBP_NS)
8808 TRY(LBP_OP)
8809 TRY(LBP_QU)
8810 TRY(LBP_IS)
8811 TRY(LBP_NU)
8812 TRY(LBP_PO)
8813 TRY(LBP_PR)
8814 TRY(LBP_SY)
8815 TRY(LBP_AI)
8816 TRY(LBP_AL)
8817 TRY(LBP_H2)
8818 TRY(LBP_H3)
8819 TRY(LBP_HL)
8820 TRY(LBP_ID)
8821 TRY(LBP_JL)
8822 TRY(LBP_JV)
8823 TRY(LBP_JT)
8824 TRY(LBP_AP)
8825 TRY(LBP_AK)
8826 TRY(LBP_AS)
8827 TRY(LBP_VI)
8828 TRY(LBP_VF)
8829 TRY(LBP_RI)
8830 TRY(LBP_SA)
8831 TRY(LBP_ZWJ)
8832 TRY(LBP_EB)
8833 TRY(LBP_EM)
8834 TRY(LBP_XX)
8835 #undef TRY
8836 else if (strcmp (field1, "NL") == 0) value = LBP_BK;
8837 else if (strcmp (field1, "SG") == 0) value = LBP_XX;
8838 else if (strcmp (field1, "CJ") == 0) value = LBP_NS;
8839 else
8841 fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
8842 field1, linebreak_filename, lineno);
8843 exit (1);
8845 i = strtoul (field0, NULL, 16);
8846 if (strstr (field0, "..") != NULL)
8848 /* Deal with a range. */
8849 j = strtoul (strstr (field0, "..") + 2, NULL, 16);
8850 for (; i <= j; i++)
8851 unicode_org_lbp[i] = value;
8853 else
8855 /* Single character line. */
8856 unicode_org_lbp[i] = value;
8860 if (ferror (stream) || fclose (stream))
8862 fprintf (stderr, "error reading from '%s'\n", linebreak_filename);
8863 exit (1);
8867 /* Output the line breaking properties in a human readable format. */
8868 static void
8869 debug_output_org_lbp (FILE *stream)
8871 unsigned int i;
8873 for (i = 0; i < 0x110000; i++)
8875 int attr = unicode_org_lbp[i];
8876 if (attr != LBP_XX)
8878 fprintf (stream, "0x%04X", i);
8879 #define PRINT_BIT(attr,bit) \
8880 if (attr == bit) fprintf (stream, " " #bit);
8881 PRINT_BIT(attr,LBP_BK);
8882 PRINT_BIT(attr,LBP_CR);
8883 PRINT_BIT(attr,LBP_LF);
8884 PRINT_BIT(attr,LBP_CM);
8885 PRINT_BIT(attr,LBP_WJ);
8886 PRINT_BIT(attr,LBP_ZW);
8887 PRINT_BIT(attr,LBP_GL);
8888 PRINT_BIT(attr,LBP_SP);
8889 PRINT_BIT(attr,LBP_B2);
8890 PRINT_BIT(attr,LBP_BA);
8891 PRINT_BIT(attr,LBP_BB);
8892 PRINT_BIT(attr,LBP_HY);
8893 PRINT_BIT(attr,LBP_CB);
8894 PRINT_BIT(attr,LBP_CL);
8895 PRINT_BIT(attr,LBP_CP);
8896 PRINT_BIT(attr,LBP_EX);
8897 PRINT_BIT(attr,LBP_IN);
8898 PRINT_BIT(attr,LBP_NS);
8899 PRINT_BIT(attr,LBP_OP);
8900 PRINT_BIT(attr,LBP_QU);
8901 PRINT_BIT(attr,LBP_IS);
8902 PRINT_BIT(attr,LBP_NU);
8903 PRINT_BIT(attr,LBP_PO);
8904 PRINT_BIT(attr,LBP_PR);
8905 PRINT_BIT(attr,LBP_SY);
8906 PRINT_BIT(attr,LBP_AI);
8907 PRINT_BIT(attr,LBP_AL);
8908 PRINT_BIT(attr,LBP_H2);
8909 PRINT_BIT(attr,LBP_H3);
8910 PRINT_BIT(attr,LBP_HL);
8911 PRINT_BIT(attr,LBP_ID);
8912 PRINT_BIT(attr,LBP_JL);
8913 PRINT_BIT(attr,LBP_JV);
8914 PRINT_BIT(attr,LBP_JT);
8915 PRINT_BIT(attr,LBP_AP);
8916 PRINT_BIT(attr,LBP_AK);
8917 PRINT_BIT(attr,LBP_AS);
8918 PRINT_BIT(attr,LBP_VI);
8919 PRINT_BIT(attr,LBP_VF);
8920 PRINT_BIT(attr,LBP_RI);
8921 PRINT_BIT(attr,LBP_SA);
8922 PRINT_BIT(attr,LBP_ZWJ);
8923 PRINT_BIT(attr,LBP_EB);
8924 PRINT_BIT(attr,LBP_EM);
8925 PRINT_BIT(attr,LBP_XX);
8926 #undef PRINT_BIT
8927 fprintf (stream, "\n");
/* Writes the line breaking properties stored in unicode_org_lbp[], in human
   readable form, to the file FILENAME.
   Exits with status 1 if the file cannot be opened or written.  */
static void
debug_output_org_lbrk_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");
  bool failed;

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_lbp (out);

  /* As before, the stream is only closed when no write error is pending.  */
  failed = (ferror (out) != 0);
  if (!failed)
    failed = (fclose (out) != 0);
  if (failed)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
8953 /* Given an enum value LBP_..., returns its name "LBP_..." as a string. */
8954 static const char *
8955 lbp_value_to_string (unsigned int value)
8957 const char *value_string;
8958 switch (value)
8960 #define CASE(x) case x: value_string = #x; break;
8961 CASE(LBP_BK);
8962 CASE(LBP_CR);
8963 CASE(LBP_LF);
8964 CASE(LBP_CM);
8965 CASE(LBP_WJ);
8966 CASE(LBP_ZW);
8967 CASE(LBP_GL);
8968 CASE(LBP_SP);
8969 CASE(LBP_B2);
8970 CASE(LBP_BA);
8971 CASE(LBP_BB);
8972 CASE(LBP_HY);
8973 CASE(LBP_CB);
8974 CASE(LBP_CL);
8975 CASE(LBP_CP1);
8976 CASE(LBP_CP2);
8977 CASE(LBP_EX);
8978 CASE(LBP_IN);
8979 CASE(LBP_NS);
8980 CASE(LBP_OP1);
8981 CASE(LBP_OP2);
8982 CASE(LBP_QU1);
8983 CASE(LBP_QU2);
8984 CASE(LBP_QU3);
8985 CASE(LBP_IS);
8986 CASE(LBP_NU);
8987 CASE(LBP_PO);
8988 CASE(LBP_PR);
8989 CASE(LBP_SY);
8990 CASE(LBP_AI);
8991 CASE(LBP_AL1);
8992 CASE(LBP_AL2);
8993 CASE(LBP_H2);
8994 CASE(LBP_H3);
8995 CASE(LBP_HL);
8996 CASE(LBP_ID1);
8997 CASE(LBP_ID2);
8998 CASE(LBP_JL);
8999 CASE(LBP_JV);
9000 CASE(LBP_JT);
9001 CASE(LBP_AP);
9002 CASE(LBP_AK);
9003 CASE(LBP_AS);
9004 CASE(LBP_VI);
9005 CASE(LBP_VF);
9006 CASE(LBP_RI);
9007 CASE(LBP_SA);
9008 CASE(LBP_ZWJ);
9009 CASE(LBP_EB);
9010 CASE(LBP_EM);
9011 CASE(LBP_XX);
9012 #undef CASE
9013 default:
9014 abort ();
9016 return value_string;
9019 /* Construction of sparse 3-level tables. */
/* The macros below configure the inclusion of "3level.h".
   TABLE is the name under which the table type and its functions are used
   in this file (struct lbpea_table, lbpea_table_init, lbpea_table_add,
   lbpea_table_finalize).
   ELEMENT is the type of one table element.
   DEFAULT is presumably the value of elements that are never added
   explicitly - TODO confirm against 3level.h.
   NOTE(review): xmalloc and xrealloc are mapped to plain malloc and realloc;
   whether allocation failures are detected is not visible from here.  */
9020 #define TABLE lbpea_table
9021 #define ELEMENT unsigned char
9022 #define DEFAULT PROP_EA (LBP_XX, 0)
9023 #define xmalloc malloc
9024 #define xrealloc realloc
9025 #include "3level.h"
9027 static void
9028 output_lbpea (FILE *stream1, FILE *stream2)
9030 unsigned int i;
9031 struct lbpea_table t;
9032 unsigned int level1_offset, level2_offset, level3_offset;
9034 t.p = 7;
9035 t.q = 9;
9036 lbpea_table_init (&t);
9038 for (i = 0; i < 0x110000; i++)
9040 int64_t attr = get_lbp (i);
9041 int ea = get_lbea (i);
9043 /* Now attr should contain exactly one bit. */
9044 assert (attr != 0 && (attr & (attr - 1)) == 0);
9046 if (attr != (int64_t) 1 << LBP_XX)
9048 unsigned int log2_attr;
9049 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
9051 lbpea_table_add (&t, i, PROP_EA (log2_attr, ea));
9055 lbpea_table_finalize (&t);
9057 level1_offset =
9058 5 * sizeof (uint32_t);
9059 level2_offset =
9060 5 * sizeof (uint32_t)
9061 + t.level1_size * sizeof (uint32_t);
9062 level3_offset =
9063 5 * sizeof (uint32_t)
9064 + t.level1_size * sizeof (uint32_t)
9065 + (t.level2_size << t.q) * sizeof (uint32_t);
9067 for (i = 0; i < 5; i++)
9068 fprintf (stream1, "#define lbrkprop_header_%d %d\n", i,
9069 ((uint32_t *) t.result)[i]);
9070 fprintf (stream1, "\n");
9071 fprintf (stream1, "typedef struct\n");
9072 fprintf (stream1, " {\n");
9073 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
9074 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
9075 fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
9076 fprintf (stream1, " }\n");
9077 fprintf (stream1, "lbrkprop_t;\n");
9078 fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n");
9080 fprintf (stream2, "const lbrkprop_t unilbrkprop =\n");
9081 fprintf (stream2, "{\n");
9082 fprintf (stream2, " {");
9083 if (t.level1_size > 8)
9084 fprintf (stream2, "\n ");
9085 for (i = 0; i < t.level1_size; i++)
9087 uint32_t offset;
9088 if (i > 0 && (i % 8) == 0)
9089 fprintf (stream2, "\n ");
9090 offset = ((uint32_t *) (t.result + level1_offset))[i];
9091 if (offset == 0)
9092 fprintf (stream2, " %5d", -1);
9093 else
9094 fprintf (stream2, " %5zu",
9095 (offset - level2_offset) / sizeof (uint32_t));
9096 if (i+1 < t.level1_size)
9097 fprintf (stream2, ",");
9099 if (t.level1_size > 8)
9100 fprintf (stream2, "\n ");
9101 fprintf (stream2, " },\n");
9102 fprintf (stream2, " {");
9103 if (t.level2_size << t.q > 8)
9104 fprintf (stream2, "\n ");
9105 for (i = 0; i < t.level2_size << t.q; i++)
9107 uint32_t offset;
9108 if (i > 0 && (i % 8) == 0)
9109 fprintf (stream2, "\n ");
9110 offset = ((uint32_t *) (t.result + level2_offset))[i];
9111 if (offset == 0)
9112 fprintf (stream2, " %5d", -1);
9113 else
9114 fprintf (stream2, " %5zu",
9115 (offset - level3_offset) / sizeof (unsigned char));
9116 if (i+1 < t.level2_size << t.q)
9117 fprintf (stream2, ",");
9119 if (t.level2_size << t.q > 8)
9120 fprintf (stream2, "\n ");
9121 fprintf (stream2, " },\n");
9122 fprintf (stream2, " {");
9123 if (t.level3_size << t.p > 8)
9124 fprintf (stream2, "\n ");
9125 for (i = 0; i < t.level3_size << t.p; i++)
9127 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
9128 if (i > 0 && (i % 4) == 0)
9129 fprintf (stream2, "\n ");
9130 fprintf (stream2, " (%s<<1)|%d%s",
9131 lbp_value_to_string (PROP (value)), EA (value),
9132 (i+1 < t.level3_size << t.p ? "," : ""));
9134 if (t.level3_size << t.p > 8)
9135 fprintf (stream2, "\n ");
9136 fprintf (stream2, " }\n");
9137 fprintf (stream2, "};\n");
/* Writes the line breaking property tables for Unicode VERSION:
   both FILENAME1 and FILENAME2 receive the same header comment, then
   output_lbpea writes its first part to FILENAME1 and its second part to
   FILENAME2.
   Exits with status 1 if a file cannot be opened or written.  */
static void
output_lbrk_tables (const char *filename1, const char *filename2, const char *version)
{
  const char *filenames[2] = { filename1, filename2 };
  FILE *streams[2];
  size_t k;

  /* Open both output files before writing anything.  */
  for (k = 0; k < 2; k++)
    {
      FILE *fp = fopen (filenames[k], "w");

      if (fp == NULL)
        {
          fprintf (stderr, "cannot open '%s' for writing\n", filenames[k]);
          exit (1);
        }
      streams[k] = fp;
    }

  /* Common header of both files.  */
  for (k = 0; k < 2; k++)
    {
      FILE *out = streams[k];

      fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
      fputs ("/* Line breaking properties of Unicode characters. */\n", out);
      fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
               version);
      fputs ("\n", out);

      fputs ("/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n", out);
      fputs ("\n", out);
      output_library_license (out, false);
      fputs ("\n", out);
    }

  output_lbpea (streams[0], streams[1]);

  /* A stream is only closed when no write error is pending on it.  */
  for (k = 0; k < 2; k++)
    {
      bool failed = (ferror (streams[k]) != 0);

      if (!failed)
        failed = (fclose (streams[k]) != 0);
      if (failed)
        {
          fprintf (stderr, "error writing to '%s'\n", filenames[k]);
          exit (1);
        }
    }
}
/* Writes to the file FILENAME a C source file that defines
   unilbrk_table[NLBP][NLBP], a table encoding those line breaking rules
   that depend only on the pair (class before, class after) of a potential
   break position.
   Each cell is written as one of
     D  the break is not prohibited when no spaces intervene,
     I  the break is prohibited when no spaces intervene, but not prohibited
        when spaces intervene,
     P  the break is prohibited in both cases.
   VERSION is the Unicode version string that is inserted into the header
   comment of the generated file.
   Exits with status 1 if FILENAME cannot be opened or written.  Calls
   abort () if a cell ends up prohibited with spaces but not without.  */
9188 static void
9189 output_lbrk_rules_as_tables (const char *filename, const char *version)
9191 FILE *stream;
9193 stream = fopen (filename, "w");
9194 if (stream == NULL)
9196 fprintf (stderr, "cannot open '%s' for writing\n", filename);
9197 exit (1);
9200 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
9201 fprintf (stream, "/* Table that encodes several line breaking rules. */\n");
9202 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
9203 version);
9204 fprintf (stream, "\n");
9206 fprintf (stream, "/* Copyright (C) 2001-2024 Free Software Foundation, Inc.\n");
9207 fprintf (stream, "\n");
9208 output_library_license (stream, false);
9209 fprintf (stream, "\n");
9211 fprintf (stream, "#include <config.h>\n");
9212 fprintf (stream, "\n");
9213 fprintf (stream, "/* Specification. */\n");
9214 fprintf (stream, "#include \"unilbrk/lbrktables.h\"\n");
9215 fprintf (stream, "\n");
9216 fprintf (stream, "/* Define unilbrkprop, table of line breaking properties. */\n");
9217 fprintf (stream, "#include \"unilbrk/lbrkprop2.h\"\n");
9218 fprintf (stream, "\n");
9220 /* LBP_* values that are entered in the table are in the range 0 .. NLBP-1. */
9221 const unsigned int NLBP = 41;
9223 unsigned int before;
9224 unsigned int after;
9225 /* Describe the table cell (before, after). */
9226 struct table_cell
9228 /* Break prohibited when no spaces, i.e. in before ÷ after */
9229 bool prohibited_no_sp;
9230 /* Break prohibited with spaces, i.e. in before SP+ ÷ after */
9231 bool prohibited_with_sp;
9233 struct table_cell table[NLBP][NLBP];
9234 /* Sets table[before][after].field to value.
   The row is taken from the variable 'before', the column from the variable
   'after'.  When 'before' or 'after' is one of LBP_CP, LBP_OP, LBP_QU,
   LBP_AL, LBP_ID, the assignment is applied to each of the corresponding
   numbered values (LBP_CP1 and LBP_CP2, LBP_OP1 and LBP_OP2, LBP_QU1 to
   LBP_QU3, LBP_AL1 and LBP_AL2, LBP_ID1 and LBP_ID2) instead. */
9235 #define set_table_cell(field,value) \
9236 (before == LBP_CP ? (set_table_cell_1 (LBP_CP1, field, value), set_table_cell_1 (LBP_CP2, field, value)) : \
9237 before == LBP_OP ? (set_table_cell_1 (LBP_OP1, field, value), set_table_cell_1 (LBP_OP2, field, value)) : \
9238 before == LBP_QU ? (set_table_cell_1 (LBP_QU1, field, value), set_table_cell_1 (LBP_QU2, field, value), set_table_cell_1 (LBP_QU3, field, value)) : \
9239 before == LBP_AL ? (set_table_cell_1 (LBP_AL1, field, value), set_table_cell_1 (LBP_AL2, field, value)) : \
9240 before == LBP_ID ? (set_table_cell_1 (LBP_ID1, field, value), set_table_cell_1 (LBP_ID2, field, value)) : \
9241 set_table_cell_1 (before, field, value))
9242 #define set_table_cell_1(row,field,value) \
9243 (after == LBP_CP ? (set_table_cell_2 (row, LBP_CP1, field, value), set_table_cell_2 (row, LBP_CP2, field, value)) : \
9244 after == LBP_OP ? (set_table_cell_2 (row, LBP_OP1, field, value), set_table_cell_2 (row, LBP_OP2, field, value)) : \
9245 after == LBP_QU ? (set_table_cell_2 (row, LBP_QU1, field, value), set_table_cell_2 (row, LBP_QU2, field, value), set_table_cell_2 (row, LBP_QU3, field, value)) : \
9246 after == LBP_AL ? (set_table_cell_2 (row, LBP_AL1, field, value), set_table_cell_2 (row, LBP_AL2, field, value)) : \
9247 after == LBP_ID ? (set_table_cell_2 (row, LBP_ID1, field, value), set_table_cell_2 (row, LBP_ID2, field, value)) : \
9248 set_table_cell_2 (row, after, field, value))
9249 #define set_table_cell_2(row,column,field,value) \
9250 (table[row][column].field = (value))
9252 /* Fill the table.
9253 If we were to apply the rules in top-down order (high precedence rules
9254 first), the table_cell fields have to support values false/true/unknown.
9255 If we apply the rules in the opposite order (high precedence order last),
9256 the table_cell fields need to support only the values false/true.
9257 So, that's what we do here. */
9259 /* (LB31) Break everywhere. */
9260 for (before = 0; before < NLBP; before++)
9261 for (after = 0; after < NLBP; after++)
9262 set_table_cell (prohibited_no_sp, false);
9264 /* (LB30b) Do not break between an emoji base (or potential emoji) and an
9265 emoji modifier. */
9266 before = LBP_EB; after = LBP_EM; set_table_cell (prohibited_no_sp, true);
9267 before = LBP_ID2; after = LBP_EM; set_table_cell (prohibited_no_sp, true);
9269 /* (LB30) Do not break between letters, numbers, or ordinary symbols and
9270 opening or closing parentheses (except for East Asian parentheses). */
9271 before = LBP_AL; after = LBP_OP1; set_table_cell (prohibited_no_sp, true);
9272 before = LBP_HL; after = LBP_OP1; set_table_cell (prohibited_no_sp, true);
9273 before = LBP_NU; after = LBP_OP1; set_table_cell (prohibited_no_sp, true);
9274 before = LBP_CP1; after = LBP_AL; set_table_cell (prohibited_no_sp, true);
9275 before = LBP_CP1; after = LBP_HL; set_table_cell (prohibited_no_sp, true);
9276 before = LBP_CP1; after = LBP_NU; set_table_cell (prohibited_no_sp, true);
9278 /* (LB29) Do not break between numeric punctuation and alphabetics
9279 ("e.g."). */
9280 before = LBP_IS; after = LBP_AL; set_table_cell (prohibited_no_sp, true);
9281 before = LBP_IS; after = LBP_HL; set_table_cell (prohibited_no_sp, true);
9283 /* (LB28a) Do not break inside the orthographic syllables of Brahmic
9284 scripts. */
9285 /* (LB28a) line 1. */
9286 before = LBP_AP; after = LBP_AK; set_table_cell (prohibited_no_sp, true);
9287 before = LBP_AP; after = LBP_AL2; set_table_cell (prohibited_no_sp, true);
9288 before = LBP_AP; after = LBP_AS; set_table_cell (prohibited_no_sp, true);
9289 /* (LB28a) line 2. */
9290 before = LBP_AK; after = LBP_VF; set_table_cell (prohibited_no_sp, true);
9291 before = LBP_AK; after = LBP_VI; set_table_cell (prohibited_no_sp, true);
9292 before = LBP_AL2; after = LBP_VF; set_table_cell (prohibited_no_sp, true);
9293 before = LBP_AL2; after = LBP_VI; set_table_cell (prohibited_no_sp, true);
9294 before = LBP_AS; after = LBP_VF; set_table_cell (prohibited_no_sp, true);
9295 before = LBP_AS; after = LBP_VI; set_table_cell (prohibited_no_sp, true);
9297 /* (LB28) Do not break between alphabetics ("at"). */
9298 before = LBP_AL; after = LBP_AL; set_table_cell (prohibited_no_sp, true);
9299 before = LBP_AL; after = LBP_HL; set_table_cell (prohibited_no_sp, true);
9300 before = LBP_HL; after = LBP_AL; set_table_cell (prohibited_no_sp, true);
9301 before = LBP_HL; after = LBP_HL; set_table_cell (prohibited_no_sp, true);
9303 /* (LB27) Korean Syllable Block. */
9304 before = LBP_JL; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9305 before = LBP_JV; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9306 before = LBP_JT; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9307 before = LBP_H2; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9308 before = LBP_H3; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9309 before = LBP_PR; after = LBP_JL; set_table_cell (prohibited_no_sp, true);
9310 before = LBP_PR; after = LBP_JV; set_table_cell (prohibited_no_sp, true);
9311 before = LBP_PR; after = LBP_JT; set_table_cell (prohibited_no_sp, true);
9312 before = LBP_PR; after = LBP_H2; set_table_cell (prohibited_no_sp, true);
9313 before = LBP_PR; after = LBP_H3; set_table_cell (prohibited_no_sp, true);
9315 /* (LB26) Do not break a Korean syllable. */
9316 before = LBP_JL; after = LBP_JL; set_table_cell (prohibited_no_sp, true);
9317 before = LBP_JL; after = LBP_JV; set_table_cell (prohibited_no_sp, true);
9318 before = LBP_JL; after = LBP_H2; set_table_cell (prohibited_no_sp, true);
9319 before = LBP_JL; after = LBP_H3; set_table_cell (prohibited_no_sp, true);
9320 before = LBP_JV; after = LBP_JV; set_table_cell (prohibited_no_sp, true);
9321 before = LBP_JV; after = LBP_JT; set_table_cell (prohibited_no_sp, true);
9322 before = LBP_H2; after = LBP_JV; set_table_cell (prohibited_no_sp, true);
9323 before = LBP_H2; after = LBP_JT; set_table_cell (prohibited_no_sp, true);
9324 before = LBP_JT; after = LBP_JT; set_table_cell (prohibited_no_sp, true);
9325 before = LBP_H3; after = LBP_JT; set_table_cell (prohibited_no_sp, true);
9327 /* (LB25) Do not break between the following pairs of classes relevant to
9328 numbers. */
9329 before = LBP_PO; after = LBP_NU; set_table_cell (prohibited_no_sp, true);
9330 before = LBP_PR; after = LBP_NU; set_table_cell (prohibited_no_sp, true);
9331 before = LBP_HY; after = LBP_NU; set_table_cell (prohibited_no_sp, true);
9332 before = LBP_IS; after = LBP_NU; set_table_cell (prohibited_no_sp, true);
9334 /* (LB24) Do not break between numeric prefix/postfix and letters, or between
9335 letters and prefix/postfix. */
9336 before = LBP_PR; after = LBP_AL; set_table_cell (prohibited_no_sp, true);
9337 before = LBP_PR; after = LBP_HL; set_table_cell (prohibited_no_sp, true);
9338 before = LBP_PO; after = LBP_AL; set_table_cell (prohibited_no_sp, true);
9339 before = LBP_PO; after = LBP_HL; set_table_cell (prohibited_no_sp, true);
9340 before = LBP_AL; after = LBP_PR; set_table_cell (prohibited_no_sp, true);
9341 before = LBP_AL; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9342 before = LBP_HL; after = LBP_PR; set_table_cell (prohibited_no_sp, true);
9343 before = LBP_HL; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9345 /* (LB23a) Do not break between numeric prefixes and ideographs, or between
9346 ideographs and numeric postfixes. */
9347 before = LBP_PR; after = LBP_ID; set_table_cell (prohibited_no_sp, true);
9348 before = LBP_PR; after = LBP_EB; set_table_cell (prohibited_no_sp, true);
9349 before = LBP_PR; after = LBP_EM; set_table_cell (prohibited_no_sp, true);
9350 before = LBP_ID; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9351 before = LBP_EB; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9352 before = LBP_EM; after = LBP_PO; set_table_cell (prohibited_no_sp, true);
9354 /* (LB23) Do not break between digits and letters. */
9355 before = LBP_AL; after = LBP_NU; set_table_cell (prohibited_no_sp, true);
9356 before = LBP_HL; after = LBP_NU; set_table_cell (prohibited_no_sp, true);
9357 before = LBP_NU; after = LBP_AL; set_table_cell (prohibited_no_sp, true);
9358 before = LBP_NU; after = LBP_HL; set_table_cell (prohibited_no_sp, true);
9360 /* (LB22) Do not break before ellipses. */
9361 for (before = 0; before < NLBP; before++)
9363 after = LBP_IN; set_table_cell (prohibited_no_sp, true);
9366 /* (LB21b) Don’t break between Solidus and Hebrew letters. */
9367 before = LBP_SY; after = LBP_HL; set_table_cell (prohibited_no_sp, true);
9369 /* (LB21) Do not break before hyphen-minus, other hyphens, fixed-width spaces,
9370 small kana, and other non-starters, or after acute accents. */
9371 for (before = 0; before < NLBP; before++)
9373 after = LBP_BA; set_table_cell (prohibited_no_sp, true);
9374 after = LBP_HY; set_table_cell (prohibited_no_sp, true);
9375 after = LBP_NS; set_table_cell (prohibited_no_sp, true);
9377 for (after = 0; after < NLBP; after++)
9379 before = LBP_BB; set_table_cell (prohibited_no_sp, true);
9382 /* (LB19) Don't break before non-initial ambiguous quotation marks,
9383 such as '”' or '"'. Don't break after non-final ambiguous quotation
9384 marks, such as '“' or '"'. */
9385 for (before = 0; before < NLBP; before++)
9387 after = LBP_QU1; set_table_cell (prohibited_no_sp, true);
9388 after = LBP_QU3; set_table_cell (prohibited_no_sp, true);
9390 for (after = 0; after < NLBP; after++)
9392 before = LBP_QU1; set_table_cell (prohibited_no_sp, true);
9393 before = LBP_QU2; set_table_cell (prohibited_no_sp, true);
9396 /* (LB18) Break after spaces. */
9397 for (before = 0; before < NLBP; before++)
9398 for (after = 0; after < NLBP; after++)
9399 set_table_cell (prohibited_with_sp, false);
9401 /* (LB17) Do not break within '——', even with intervening spaces. */
9402 before = LBP_B2; after = LBP_B2; set_table_cell (prohibited_no_sp, true);
9403 set_table_cell (prohibited_with_sp, true);
9405 /* (LB16) Do not break between closing punctuation and a nonstarter (lb=NS),
9406 even with intervening spaces. */
9407 before = LBP_CL; after = LBP_NS; set_table_cell (prohibited_no_sp, true);
9408 set_table_cell (prohibited_with_sp, true);
9409 before = LBP_CP; after = LBP_NS; set_table_cell (prohibited_no_sp, true);
9410 set_table_cell (prohibited_with_sp, true);
9412 /* (LB15d) Do not break before ';', ',', '.', even after spaces. */
9413 for (before = 0; before < NLBP; before++)
9415 after = LBP_IS; set_table_cell (prohibited_no_sp, true);
9416 set_table_cell (prohibited_with_sp, true);
9419 /* (LB15b) Do not break before an ambiguous quotation that is a final
9420 punctuation, even after spaces. */
9421 for (before = 0; before < NLBP; before++)
9423 after = LBP_QU3; set_table_cell (prohibited_no_sp, true);
9424 set_table_cell (prohibited_with_sp, true);
9427 /* (LB15a) Do not break after an ambiguous quotation that is an initial
9428 punctuation, even after spaces. */
9429 for (after = 0; after < NLBP; after++)
9431 before = LBP_QU2; set_table_cell (prohibited_no_sp, true);
9432 set_table_cell (prohibited_with_sp, true);
9435 /* (LB14) Do not break after '[', even after spaces. */
9436 for (after = 0; after < NLBP; after++)
9438 before = LBP_OP; set_table_cell (prohibited_no_sp, true);
9439 set_table_cell (prohibited_with_sp, true);
9442 /* (LB13) Do not break before ‘]’ or ‘!’ or ‘/’, even after spaces. */
9443 for (before = 0; before < NLBP; before++)
9445 after = LBP_CL; set_table_cell (prohibited_no_sp, true);
9446 set_table_cell (prohibited_with_sp, true);
9447 after = LBP_CP; set_table_cell (prohibited_no_sp, true);
9448 set_table_cell (prohibited_with_sp, true);
9449 after = LBP_EX; set_table_cell (prohibited_no_sp, true);
9450 set_table_cell (prohibited_with_sp, true);
9451 after = LBP_SY; set_table_cell (prohibited_no_sp, true);
9452 set_table_cell (prohibited_with_sp, true);
9455 /* (LB12a) Do not break before NBSP and related characters, except after
9456 spaces and hyphens. */
9457 for (before = 0; before < NLBP; before++)
9458 if (before != LBP_BA && before != LBP_HY)
9460 after = LBP_GL; set_table_cell (prohibited_no_sp, true);
9463 /* (LB12) Do not break after NBSP and related characters. */
9464 for (after = 0; after < NLBP; after++)
9466 before = LBP_GL; set_table_cell (prohibited_no_sp, true);
9469 /* (LB11) Do not break before or after Word joiner and related characters. */
9470 for (before = 0; before < NLBP; before++)
9472 after = LBP_WJ; set_table_cell (prohibited_no_sp, true);
9473 set_table_cell (prohibited_with_sp, true);
9475 for (after = 0; after < NLBP; after++)
9477 before = LBP_WJ; set_table_cell (prohibited_no_sp, true);
9480 /* (LB10) Treat any remaining combining mark or ZWJ as AL. */
9481 /* We resolve LBP_CM at runtime, before accessing the table. */
9482 for (before = 0; before < NLBP; before++)
9483 table[before][LBP_ZWJ] = table[before][LBP_AL1];
9484 for (after = 0; after < NLBP; after++)
9485 table[LBP_ZWJ][after] = table[LBP_AL1][after];
9486 table[LBP_ZWJ][LBP_ZWJ] = table[LBP_AL1][LBP_AL1];
9488 /* (LB8a) Do not break between a zero width joiner and an ideograph, emoji
9489 base or emoji modifier. */
9490 before = LBP_ZWJ; after = LBP_ID; set_table_cell (prohibited_no_sp, true);
9491 before = LBP_ZWJ; after = LBP_EB; set_table_cell (prohibited_no_sp, true);
9492 before = LBP_ZWJ; after = LBP_EM; set_table_cell (prohibited_no_sp, true);
9494 /* Not reflected in the table:
9495 (LB30a) Break between two regional indicator symbols if and only if there are
9496 an even number of regional indicators preceding the position of the
9497 break.
9498 (LB28a) Don't break inside orthographic syllables of Brahmic scripts, lines
9499 3 and 4.
9500 (LB25) Do not break between the following pairs of classes relevant to
9501 numbers, lines with NU (SY|IS)* or OP NU or OP IS NU.
9502 (LB21a) Don't break after Hebrew + Hyphen/Break-After, before non-Hebrew.
9503 (LB20a) Don't break after a word-initial hyphen.
9504 (LB20) Break before and after unresolved CB.
9505 We resolve LBP_CB at runtime, before accessing the table.
9506 (LB19a) Don't break on either side of ambiguous quotation marks, except next
9507 to an EastAsian character.
9508 (LB15c) Break before a decimal mark that follows a space.
9509 Part of (LB15b) Do not break before an ambiguous quotation that is a final
9510 punctuation, even after spaces.
9511 Part of (LB15a) Do not break after an ambiguous quotation that is an initial
9512 punctuation, even after spaces.
9513 (LB9) Do not break a combining character sequence; treat it as if it has the
9514 line breaking class of the base character in all of the following rules.
9515 Treat ZWJ as if it were CM.
9516 Part of (LB8a) Don't break right after a zero-width joiner.
9517 (LB8) Break before any character following a zero-width space, even if one
9518 or more spaces intervene.
9519 We handle LBP_ZW at runtime, before accessing the table.
9520 (LB7) Do not break before spaces or zero width space.
9521 We handle LBP_ZW at runtime, before accessing the table.
9522 (LB6) Do not break before hard line breaks.
9523 We handle LBP_BK at runtime, before accessing the table.
9524 (LB5) Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
9525 (LB4) Always break after hard line breaks.
9526 (LB3) Always break at the end of text.
9527 (LB2) Never break at the start of text.
9530 fprintf (stream, "const unsigned char unilbrk_table[%u][%u] =\n", NLBP, NLBP);
9531 fprintf (stream, "{\n");
9532 fprintf (stream, " /* after */\n");
9534 fprintf (stream, " /* ");
9535 for (after = 0; after < NLBP; after++)
9536 fprintf (stream, " %-3s", lbp_value_to_string (after) + 4);
9537 fprintf (stream, " */\n");
9539 for (before = 0; before < NLBP; before++)
9541 fprintf (stream, "/* %3s */ {", lbp_value_to_string (before) + 4);
9542 for (after = 0; after < NLBP; after++)
9544 if (table[before][after].prohibited_no_sp)
9546 if (table[before][after].prohibited_with_sp)
9547 /* Prohibited break. */
9548 fprintf (stream, " P,");
9549 else
9550 /* Indirect break. */
9551 fprintf (stream, " I,");
9553 else
9555 if (table[before][after].prohibited_with_sp)
9556 abort ();
9557 else
9558 /* Direct break. */
9559 fprintf (stream, " D,");
9562 fprintf (stream, " },\n");
9564 fprintf (stream, "/* \"\" */\n");
9565 fprintf (stream, "/* before */\n");
9566 fprintf (stream, "};\n");
9568 if (ferror (stream) || fclose (stream))
9570 fprintf (stderr, "error writing to '%s'\n", filename);
9571 exit (1);
9575 #undef EA
9576 #undef PROP
9577 #undef PROP_EA
9579 /* ========================================================================= */
9581 /* Word break property.
9582 Updated for Unicode TR #29 revision 17. */
/* Possible values of the Word_Break property, listed by increasing code.
   get_wbp () uses each code as a bit position in the mask it returns.  */
enum
{
  WBP_OTHER        =  0,
  WBP_KATAKANA     =  1,
  WBP_ALETTER      =  2,
  WBP_MIDNUMLET    =  3,
  WBP_MIDLETTER    =  4,
  WBP_MIDNUM       =  5,
  WBP_NUMERIC      =  6,
  WBP_EXTENDNUMLET =  7,
  WBP_EXTEND       =  8,
  WBP_FORMAT       =  9,
  WBP_NEWLINE      = 10,
  WBP_CR           = 11,
  WBP_LF           = 12,
  WBP_RI           = 13,
  WBP_DQ           = 14,
  WBP_SQ           = 15,
  WBP_HL           = 16,
  WBP_ZWJ          = 17,
  /* The codes 18..21 are not assigned in this enumeration.  */
  WBP_WSS          = 22
};
9608 /* Returns the word breaking property for ch, as a bit mask. */
9609 static int
9610 get_wbp (unsigned int ch)
9612 int attr = 0;
9614 if (unicode_attributes[ch].name != NULL)
9616 if (ch == 0x000D)
9617 attr |= 1 << WBP_CR;
9619 if (ch == 0x000A)
9620 attr |= 1 << WBP_LF;
9622 if (ch == 0x000B || ch == 0x000C
9623 || ch == 0x0085
9624 || ch == 0x2028 || ch == 0x2029)
9625 attr |= 1 << WBP_NEWLINE;
9627 if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0
9628 || ((unicode_properties[ch] >> PROP_OTHER_GRAPHEME_EXTEND) & 1) != 0
9629 || (unicode_attributes[ch].category != NULL
9630 && strcmp (unicode_attributes[ch].category, "Mc") == 0)
9631 || ((unicode_properties[ch] >> PROP_EMOJI_MODIFIER) & 1) != 0 /* Emoji modifier */)
9632 attr |= 1 << WBP_EXTEND;
9634 if (unicode_attributes[ch].category != NULL
9635 && strcmp (unicode_attributes[ch].category, "Cf") == 0
9636 && !(ch >= 0x0600 && ch <= 0x0605)
9637 && ch != 0x06DD
9638 && ch != 0x070F
9639 && ch != 0x0890 && ch != 0x0891 && ch != 0x08E2
9640 && ch != 0x200B && ch != 0x200C && ch != 0x200D
9641 && ch != 0x110BD && ch != 0x110CD
9642 && !(ch >= 0xe0020 && ch <= 0xe007f))
9643 attr |= 1 << WBP_FORMAT;
9645 if ((unicode_scripts[ch] < numscripts
9646 && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0)
9647 || (ch >= 0x3031 && ch <= 0x3035)
9648 || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC
9649 || ch == 0xFF70)
9650 attr |= 1 << WBP_KATAKANA;
9652 if ((unicode_scripts[ch] < numscripts
9653 && strcmp (scripts[unicode_scripts[ch]], "Hebrew") == 0)
9654 && strcmp (unicode_attributes[ch].category, "Lo") == 0)
9655 attr |= 1 << WBP_HL;
9657 if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0
9658 || (ch >= 0x02C2 && ch <= 0x02C5)
9659 || (ch >= 0x02D2 && ch <= 0x02D7)
9660 || (ch >= 0x02DE && ch <= 0x02DF)
9661 || (ch >= 0x02E5 && ch <= 0x02EB)
9662 || ch == 0x02ED
9663 || (ch >= 0x02EF && ch <= 0x02FF)
9664 || (ch >= 0x055A && ch <= 0x055C)
9665 || ch == 0x055E
9666 || ch == 0x058A
9667 || ch == 0x05F3
9668 || ch == 0x070F
9669 || (ch >= 0xA708 && ch <= 0xA716)
9670 || (ch >= 0xA720 && ch <= 0xA721)
9671 || (ch >= 0xA789 && ch <= 0xA78A)
9672 || ch == 0xAB5B)
9673 && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0
9674 && (attr & (1 << WBP_KATAKANA)) == 0
9675 && ((get_lbp (ch) >> LBP_SA) & 1) == 0
9676 && !(unicode_scripts[ch] < numscripts
9677 && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0)
9678 && (attr & (1 << WBP_EXTEND)) == 0
9679 && (attr & (1 << WBP_HL)) == 0)
9680 attr |= 1 << WBP_ALETTER;
9682 if (is_WBP_MIDNUMLET (ch))
9683 attr |= 1 << WBP_MIDNUMLET;
9685 if (is_WBP_MIDLETTER (ch) && ch != 0x02D7)
9686 attr |= 1 << WBP_MIDLETTER;
9688 if ((((get_lbp (ch) >> LBP_IS) & 1) != 0
9689 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C
9690 || ch == 0xFF1B)
9691 && ch != 0x003A && ch != 0xFE13 && ch != 0x002E)
9692 attr |= 1 << WBP_MIDNUM;
9694 if ((((get_lbp (ch) >> LBP_NU) & 1) != 0
9695 || (ch >= 0x1B50 && ch <= 0x1B59) /* BALINESE DIGIT ZERO..NINE */
9696 || (ch >= 0xA9D0 && ch <= 0xA9D9) /* JAVANESE DIGIT ZERO..NINE */
9697 || (ch >= 0xAA50 && ch <= 0xAA59) /* CHAM DIGIT ZERO..NINE */
9698 || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT ZERO..NINE */
9699 || (ch >= 0x11066 && ch <= 0x1106F) /* BRAHMI DIGIT ZERO..NINE */
9700 || (ch >= 0x11950 && ch <= 0x11959) /* DIVES AKURU DIGIT ZERO..NINE */
9701 || (ch >= 0x11F50 && ch <= 0x11F59) /* KAWI DIGIT ZERO..NINE */
9702 || (ch >= 0x16130 && ch <= 0x16139) /* GURUNG KHEMA DIGIT ZERO..NINE */)
9703 && ch != 0x066C)
9704 attr |= 1 << WBP_NUMERIC;
9706 if ((unicode_attributes[ch].category != NULL
9707 && strcmp (unicode_attributes[ch].category, "Pc") == 0)
9708 || ch == 0x202F /* NARROW NO-BREAK SPACE */)
9709 attr |= 1 << WBP_EXTENDNUMLET;
9711 if (is_property_regional_indicator (ch))
9712 attr |= 1 << WBP_RI;
9714 if (ch == 0x0022)
9715 attr |= 1 << WBP_DQ;
9717 if (ch == 0x0027)
9718 attr |= 1 << WBP_SQ;
9720 if (ch == 0x200D)
9721 attr |= 1 << WBP_ZWJ;
9723 if (is_category_Zs (ch) && ((get_lbp (ch) >> LBP_GL) & 1) == 0)
9724 attr |= 1 << WBP_WSS;
9727 if (attr == 0)
9728 /* other */
9729 attr |= 1 << WBP_OTHER;
9731 return attr;
9734 /* Output the word break property in a human readable format. */
9735 static void
9736 debug_output_wbp (FILE *stream)
9738 unsigned int i;
9740 for (i = 0; i < 0x110000; i++)
9742 int attr = get_wbp (i);
9743 if (attr != 1 << WBP_OTHER)
9745 fprintf (stream, "0x%04X", i);
9746 if (attr & (1 << WBP_CR))
9747 fprintf (stream, " CR");
9748 if (attr & (1 << WBP_LF))
9749 fprintf (stream, " LF");
9750 if (attr & (1 << WBP_NEWLINE))
9751 fprintf (stream, " Newline");
9752 if (attr & (1 << WBP_EXTEND))
9753 fprintf (stream, " Extend");
9754 if (attr & (1 << WBP_FORMAT))
9755 fprintf (stream, " Format");
9756 if (attr & (1 << WBP_KATAKANA))
9757 fprintf (stream, " Katakana");
9758 if (attr & (1 << WBP_ALETTER))
9759 fprintf (stream, " ALetter");
9760 if (attr & (1 << WBP_MIDNUMLET))
9761 fprintf (stream, " MidNumLet");
9762 if (attr & (1 << WBP_MIDLETTER))
9763 fprintf (stream, " MidLetter");
9764 if (attr & (1 << WBP_MIDNUM))
9765 fprintf (stream, " MidNum");
9766 if (attr & (1 << WBP_NUMERIC))
9767 fprintf (stream, " Numeric");
9768 if (attr & (1 << WBP_EXTENDNUMLET))
9769 fprintf (stream, " ExtendNumLet");
9770 if (attr & (1 << WBP_RI))
9771 fprintf (stream, " Regional_Indicator");
9772 if (attr & (1 << WBP_DQ))
9773 fprintf (stream, " Double_Quote");
9774 if (attr & (1 << WBP_SQ))
9775 fprintf (stream, " Single_Quote");
9776 if (attr & (1 << WBP_HL))
9777 fprintf (stream, " Hebrew_Letter");
9778 if (attr & (1 << WBP_ZWJ))
9779 fprintf (stream, " ZWJ");
9780 if (attr & (1 << WBP_WSS))
9781 fprintf (stream, " WSegSpace");
9782 fprintf (stream, "\n");
/* Creates the file FILENAME and writes into it the listing produced by
   debug_output_wbp ().  Prints a message to stderr and exits with status 1
   if the file cannot be opened or written.  */
static void
debug_output_wbrk_tables (const char *filename)
{
  FILE *stream = fopen (filename, "w");

  if (!stream)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_wbp (stream);

  /* fclose is attempted only when no write error is pending.  */
  if (ferror (stream) != 0 || fclose (stream) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
/* The word break property from the WordBreakProperty.txt file: one WBP_*
   code per code point, filled in by fill_org_wbp ().  */
int unicode_org_wbp[0x110000];
9811 /* Stores in unicode_org_wbp[] the word break property from the
9812 WordBreakProperty.txt file. */
9813 static void
9814 fill_org_wbp (const char *wordbreakproperty_filename)
9816 unsigned int i;
9817 FILE *stream;
9819 for (i = 0; i < 0x110000; i++)
9820 unicode_org_wbp[i] = WBP_OTHER;
9822 stream = fopen (wordbreakproperty_filename, "r");
9823 if (stream == NULL)
9825 fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename);
9826 exit (1);
9829 for (;;)
9831 char buf[200+1];
9832 unsigned int i1, i2;
9833 char padding[200+1];
9834 char propname[200+1];
9835 int propvalue;
9837 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
9838 break;
9840 if (buf[0] == '\0' || buf[0] == '#')
9841 continue;
9843 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
9845 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
9847 fprintf (stderr, "parse error in '%s'\n",
9848 wordbreakproperty_filename);
9849 exit (1);
9851 i2 = i1;
9853 #define PROP(name,value) \
9854 if (strcmp (propname, name) == 0) propvalue = value; else
9855 PROP ("CR", WBP_CR)
9856 PROP ("LF", WBP_LF)
9857 PROP ("Newline", WBP_NEWLINE)
9858 PROP ("Extend", WBP_EXTEND)
9859 PROP ("Format", WBP_FORMAT)
9860 PROP ("Katakana", WBP_KATAKANA)
9861 PROP ("ALetter", WBP_ALETTER)
9862 PROP ("MidNumLet", WBP_MIDNUMLET)
9863 PROP ("MidLetter", WBP_MIDLETTER)
9864 PROP ("MidNum", WBP_MIDNUM)
9865 PROP ("Numeric", WBP_NUMERIC)
9866 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
9867 PROP ("Regional_Indicator", WBP_RI)
9868 PROP ("Double_Quote", WBP_DQ)
9869 PROP ("Single_Quote", WBP_SQ)
9870 PROP ("Hebrew_Letter", WBP_HL)
9871 PROP ("ZWJ", WBP_ZWJ)
9872 PROP ("WSegSpace", WBP_WSS)
9873 #undef PROP
9875 fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
9876 wordbreakproperty_filename);
9877 exit (1);
9879 assert (i1 <= i2 && i2 < 0x110000);
9881 for (i = i1; i <= i2; i++)
9882 unicode_org_wbp[i] = propvalue;
9885 if (ferror (stream) || fclose (stream))
9887 fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename);
9888 exit (1);
9892 /* Output the word break property in a human readable format. */
9893 static void
9894 debug_output_org_wbp (FILE *stream)
9896 unsigned int i;
9898 for (i = 0; i < 0x110000; i++)
9900 int propvalue = unicode_org_wbp[i];
9901 if (propvalue != WBP_OTHER)
9903 fprintf (stream, "0x%04X", i);
9904 #define PROP(name,value) \
9905 if (propvalue == value) fprintf (stream, " " name); else
9906 PROP ("CR", WBP_CR)
9907 PROP ("LF", WBP_LF)
9908 PROP ("Newline", WBP_NEWLINE)
9909 PROP ("Extend", WBP_EXTEND)
9910 PROP ("Format", WBP_FORMAT)
9911 PROP ("Katakana", WBP_KATAKANA)
9912 PROP ("ALetter", WBP_ALETTER)
9913 PROP ("MidNumLet", WBP_MIDNUMLET)
9914 PROP ("MidLetter", WBP_MIDLETTER)
9915 PROP ("MidNum", WBP_MIDNUM)
9916 PROP ("Numeric", WBP_NUMERIC)
9917 PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
9918 PROP ("Regional_Indicator", WBP_RI)
9919 PROP ("Double_Quote", WBP_DQ)
9920 PROP ("Single_Quote", WBP_SQ)
9921 PROP ("Hebrew_Letter", WBP_HL)
9922 PROP ("ZWJ", WBP_ZWJ)
9923 PROP ("WSegSpace", WBP_WSS)
9924 #undef PROP
9925 fprintf (stream, " ??");
9926 fprintf (stream, "\n");
/* Creates the file FILENAME and writes into it the listing produced by
   debug_output_org_wbp ().  Prints a message to stderr and exits with
   status 1 if the file cannot be opened or written.  */
static void
debug_output_org_wbrk_tables (const char *filename)
{
  FILE *stream = fopen (filename, "w");

  if (!stream)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_org_wbp (stream);

  /* fclose is attempted only when no write error is pending.  */
  if (ferror (stream) != 0 || fclose (stream) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
9952 /* Construction of sparse 3-level tables. */
9953 #define TABLE wbp_table
9954 #define ELEMENT unsigned char
9955 #define DEFAULT WBP_OTHER
9956 #define xmalloc malloc
9957 #define xrealloc realloc
9958 #include "3level.h"
9960 static void
9961 output_wbp (FILE *stream)
9963 unsigned int i;
9964 struct wbp_table t;
9965 unsigned int level1_offset, level2_offset, level3_offset;
9967 t.p = 7;
9968 t.q = 9;
9969 wbp_table_init (&t);
9971 for (i = 0; i < 0x110000; i++)
9973 int attr = get_wbp (i);
9975 /* Now attr should contain exactly one bit. */
9976 assert (attr != 0 && (attr & (attr - 1)) == 0);
9978 if (attr != 1 << WBP_OTHER)
9980 unsigned int log2_attr;
9981 for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
9983 wbp_table_add (&t, i, log2_attr);
9987 wbp_table_finalize (&t);
9989 level1_offset =
9990 5 * sizeof (uint32_t);
9991 level2_offset =
9992 5 * sizeof (uint32_t)
9993 + t.level1_size * sizeof (uint32_t);
9994 level3_offset =
9995 5 * sizeof (uint32_t)
9996 + t.level1_size * sizeof (uint32_t)
9997 + (t.level2_size << t.q) * sizeof (uint32_t);
9999 for (i = 0; i < 5; i++)
10000 fprintf (stream, "#define wbrkprop_header_%d %d\n", i,
10001 ((uint32_t *) t.result)[i]);
10002 fprintf (stream, "\n");
10003 fprintf (stream, "typedef struct\n");
10004 fprintf (stream, " {\n");
10005 fprintf (stream, " int level1[%zu];\n", t.level1_size);
10006 fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q);
10007 fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p);
10008 fprintf (stream, " }\n");
10009 fprintf (stream, "wbrkprop_t;\n");
10010 fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n");
10011 fprintf (stream, "{\n");
10012 fprintf (stream, " {");
10013 if (t.level1_size > 8)
10014 fprintf (stream, "\n ");
10015 for (i = 0; i < t.level1_size; i++)
10017 uint32_t offset;
10018 if (i > 0 && (i % 8) == 0)
10019 fprintf (stream, "\n ");
10020 offset = ((uint32_t *) (t.result + level1_offset))[i];
10021 if (offset == 0)
10022 fprintf (stream, " %5d", -1);
10023 else
10024 fprintf (stream, " %5zu",
10025 (offset - level2_offset) / sizeof (uint32_t));
10026 if (i+1 < t.level1_size)
10027 fprintf (stream, ",");
10029 if (t.level1_size > 8)
10030 fprintf (stream, "\n ");
10031 fprintf (stream, " },\n");
10032 fprintf (stream, " {");
10033 if (t.level2_size << t.q > 8)
10034 fprintf (stream, "\n ");
10035 for (i = 0; i < t.level2_size << t.q; i++)
10037 uint32_t offset;
10038 if (i > 0 && (i % 8) == 0)
10039 fprintf (stream, "\n ");
10040 offset = ((uint32_t *) (t.result + level2_offset))[i];
10041 if (offset == 0)
10042 fprintf (stream, " %5d", -1);
10043 else
10044 fprintf (stream, " %5zu",
10045 (offset - level3_offset) / sizeof (unsigned char));
10046 if (i+1 < t.level2_size << t.q)
10047 fprintf (stream, ",");
10049 if (t.level2_size << t.q > 8)
10050 fprintf (stream, "\n ");
10051 fprintf (stream, " },\n");
10052 fprintf (stream, " {");
10053 if (t.level3_size << t.p > 4)
10054 fprintf (stream, "\n ");
10055 for (i = 0; i < t.level3_size << t.p; i++)
10057 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
10058 const char *value_string;
10059 switch (value)
10061 #define CASE(x) case x: value_string = #x; break;
10062 CASE(WBP_OTHER);
10063 CASE(WBP_CR);
10064 CASE(WBP_LF);
10065 CASE(WBP_NEWLINE);
10066 CASE(WBP_EXTEND);
10067 CASE(WBP_FORMAT);
10068 CASE(WBP_KATAKANA);
10069 CASE(WBP_ALETTER);
10070 CASE(WBP_MIDNUMLET);
10071 CASE(WBP_MIDLETTER);
10072 CASE(WBP_MIDNUM);
10073 CASE(WBP_NUMERIC);
10074 CASE(WBP_EXTENDNUMLET);
10075 CASE(WBP_RI);
10076 CASE(WBP_DQ);
10077 CASE(WBP_SQ);
10078 CASE(WBP_HL);
10079 CASE(WBP_ZWJ);
10080 CASE(WBP_WSS);
10081 #undef CASE
10082 default:
10083 abort ();
10085 if (i > 0 && (i % 4) == 0)
10086 fprintf (stream, "\n ");
10087 fprintf (stream, " %s%s", value_string,
10088 (i+1 < t.level3_size << t.p ? "," : ""));
10090 if (t.level3_size << t.p > 4)
10091 fprintf (stream, "\n ");
10092 fprintf (stream, " }\n");
10093 fprintf (stream, "};\n");
/* Creates the file FILENAME with a "generated automatically" banner that
   mentions the Unicode VERSION, a copyright and license comment, and the
   table written by output_wbp ().  Prints a message to stderr and exits with
   status 1 if the file cannot be opened or written.  */
static void
output_wbrk_tables (const char *filename, const char *version)
{
  FILE *stream = fopen (filename, "w");

  if (!stream)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  /* Banner.  */
  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Word breaking properties of Unicode characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");

  /* Copyright and license.  */
  fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  output_library_license (stream, false);
  fprintf (stream, "\n");

  /* The table itself.  */
  output_wbp (stream);

  /* fclose is attempted only when no write error is pending.  */
  if (ferror (stream) != 0 || fclose (stream) != 0)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
10128 /* ========================================================================= */
10130 /* Grapheme break property.
10131 Updated for Unicode TR #29 revision 29. */
/* Possible values of the Grapheme_Cluster_Break property.
   The comments give the value names under which fill_org_gbp () recognizes
   them in the GraphemeBreakProperty.txt file.  */
enum
{
  GBP_OTHER       =  0, /* value of characters not listed in the file */
  GBP_CR          =  1, /* CR */
  GBP_LF          =  2, /* LF */
  GBP_CONTROL     =  3, /* Control */
  GBP_EXTEND      =  4, /* Extend */
  GBP_PREPEND     =  5, /* Prepend */
  GBP_SPACINGMARK =  6, /* SpacingMark */
  GBP_L           =  7, /* L */
  GBP_V           =  8, /* V */
  GBP_T           =  9, /* T */
  GBP_LV          = 10, /* LV */
  GBP_LVT         = 11, /* LVT */
  GBP_RI          = 12, /* Regional_Indicator */
  GBP_ZWJ         = 13, /* ZWJ */
  GBP_EB          = 14, /* E_Base */
  GBP_EM          = 15, /* E_Modifier */
  GBP_GAZ         = 16, /* Glue_After_Zwj */
  GBP_EBG         = 17  /* E_Base_GAZ */
};
10156 /* Construction of sparse 3-level tables. */
10157 #define TABLE gbp_table
10158 #define ELEMENT unsigned char
10159 #define DEFAULT GBP_OTHER
10160 #define xmalloc malloc
10161 #define xrealloc realloc
10162 #include "3level.h"
/* The grapheme break property from the GraphemeBreakProperty.txt file: one
   GBP_* code per code point, filled in by fill_org_gbp ().  */
int unicode_org_gbp[0x110000];
10167 /* Output the unit test data for the grapheme break property. */
10168 static void
10169 output_gbp_test (const char *filename)
10171 FILE *stream;
10172 bool need_comma;
10173 unsigned int ch;
10175 stream = fopen (filename, "w");
10176 if (stream == NULL)
10178 fprintf (stderr, "cannot open '%s' for writing\n", filename);
10179 exit (1);
10182 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
10183 fprintf (stream, "/* Test the Unicode grapheme break property functions.\n");
10184 fprintf (stream, " Copyright (C) 2010-2024 Free Software Foundation, Inc.\n");
10185 fprintf (stream, "\n");
10186 output_tests_license (stream);
10187 fprintf (stream, "\n");
10189 need_comma = false;
10190 for (ch = 0; ch < 0x110000; ch++)
10192 int gbp = unicode_org_gbp[ch];
10193 const char *gbp_string;
10195 while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp)
10196 ch++;
10198 switch (gbp)
10200 #define CASE(x) case x: gbp_string = #x; break;
10201 CASE (GBP_OTHER)
10202 CASE (GBP_CR)
10203 CASE (GBP_LF)
10204 CASE (GBP_CONTROL)
10205 CASE (GBP_EXTEND)
10206 CASE (GBP_PREPEND)
10207 CASE (GBP_SPACINGMARK)
10208 CASE (GBP_L)
10209 CASE (GBP_V)
10210 CASE (GBP_T)
10211 CASE (GBP_LV)
10212 CASE (GBP_LVT)
10213 CASE (GBP_RI)
10214 CASE (GBP_ZWJ)
10215 CASE (GBP_EB)
10216 CASE (GBP_EM)
10217 CASE (GBP_GAZ)
10218 CASE (GBP_EBG)
10219 #undef CASE
10220 default:
10221 abort ();
10224 if (need_comma)
10225 fprintf (stream, ",\n");
10226 fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string);
10228 need_comma = true;
10230 fprintf (stream, "\n");
10232 if (ferror (stream) || fclose (stream))
10234 fprintf (stderr, "error writing to '%s'\n", filename);
10235 exit (1);
10239 /* Output the per-character grapheme break property table. */
10240 static void
10241 output_gbp_table (const char *filename, const char *version)
10243 FILE *stream;
10244 unsigned int ch, i;
10245 struct gbp_table t;
10246 unsigned int level1_offset, level2_offset, level3_offset;
10248 stream = fopen (filename, "w");
10249 if (stream == NULL)
10251 fprintf (stderr, "cannot open '%s' for writing\n", filename);
10252 exit (1);
10255 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
10256 fprintf (stream, "/* Grapheme break property of Unicode characters. */\n");
10257 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
10258 version);
10259 fprintf (stream, "\n");
10261 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
10262 fprintf (stream, "\n");
10263 output_library_license (stream, false);
10264 fprintf (stream, "\n");
10266 t.p = 7;
10267 t.q = 9;
10268 gbp_table_init (&t);
10270 for (ch = 0; ch < 0x110000; ch++)
10271 gbp_table_add (&t, ch, unicode_org_gbp[ch]);
10273 gbp_table_finalize (&t);
10275 /* Offsets in t.result, in memory of this process. */
10276 level1_offset =
10277 5 * sizeof (uint32_t);
10278 level2_offset =
10279 5 * sizeof (uint32_t)
10280 + t.level1_size * sizeof (uint32_t);
10281 level3_offset =
10282 5 * sizeof (uint32_t)
10283 + t.level1_size * sizeof (uint32_t)
10284 + (t.level2_size << t.q) * sizeof (uint32_t);
10286 for (i = 0; i < 5; i++)
10287 fprintf (stream, "#define gbrkprop_header_%d %d\n", i,
10288 ((uint32_t *) t.result)[i]);
10289 fprintf (stream, "static const\n");
10290 fprintf (stream, "struct\n");
10291 fprintf (stream, " {\n");
10292 fprintf (stream, " int level1[%zu];\n", t.level1_size);
10293 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
10294 fprintf (stream, " unsigned char level3[%zu << %d];\n",
10295 t.level3_size, t.p);
10296 fprintf (stream, " }\n");
10297 fprintf (stream, "unigbrkprop =\n");
10298 fprintf (stream, "{\n");
10299 fprintf (stream, " {");
10300 if (t.level1_size > 8)
10301 fprintf (stream, "\n ");
10302 for (i = 0; i < t.level1_size; i++)
10304 uint32_t offset;
10305 if (i > 0 && (i % 8) == 0)
10306 fprintf (stream, "\n ");
10307 offset = ((uint32_t *) (t.result + level1_offset))[i];
10308 if (offset == 0)
10309 fprintf (stream, " %5d", -1);
10310 else
10311 fprintf (stream, " %5zu",
10312 (offset - level2_offset) / sizeof (uint32_t));
10313 if (i+1 < t.level1_size)
10314 fprintf (stream, ",");
10316 if (t.level1_size > 8)
10317 fprintf (stream, "\n ");
10318 fprintf (stream, " },\n");
10319 fprintf (stream, " {");
10320 if (t.level2_size << t.q > 8)
10321 fprintf (stream, "\n ");
10322 for (i = 0; i < t.level2_size << t.q; i++)
10324 uint32_t offset;
10325 if (i > 0 && (i % 8) == 0)
10326 fprintf (stream, "\n ");
10327 offset = ((uint32_t *) (t.result + level2_offset))[i];
10328 if (offset == 0)
10329 fprintf (stream, " %5d", -1);
10330 else
10331 fprintf (stream, " %5zu",
10332 (offset - level3_offset) / sizeof (uint8_t));
10333 if (i+1 < t.level2_size << t.q)
10334 fprintf (stream, ",");
10336 if (t.level2_size << t.q > 8)
10337 fprintf (stream, "\n ");
10338 fprintf (stream, " },\n");
10339 fprintf (stream, " {");
10340 if (t.level3_size << t.p > 4)
10341 fprintf (stream, "\n ");
10342 for (i = 0; i < t.level3_size << t.p; i++)
10344 unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
10345 const char *value_string;
10346 switch (value)
10348 #define CASE(x) case x: value_string = #x; break;
10349 CASE (GBP_OTHER)
10350 CASE (GBP_CR)
10351 CASE (GBP_LF)
10352 CASE (GBP_CONTROL)
10353 CASE (GBP_EXTEND)
10354 CASE (GBP_PREPEND)
10355 CASE (GBP_SPACINGMARK)
10356 CASE (GBP_L)
10357 CASE (GBP_V)
10358 CASE (GBP_T)
10359 CASE (GBP_LV)
10360 CASE (GBP_LVT)
10361 CASE (GBP_RI)
10362 CASE (GBP_ZWJ)
10363 CASE (GBP_EB)
10364 CASE (GBP_EM)
10365 CASE (GBP_GAZ)
10366 CASE (GBP_EBG)
10367 #undef CASE
10368 default:
10369 abort ();
10371 if (i > 0 && (i % 4) == 0)
10372 fprintf (stream, "\n ");
10373 fprintf (stream, " %s%s", value_string,
10374 (i+1 < t.level3_size << t.p ? "," : ""));
10376 if (t.level3_size << t.p > 4)
10377 fprintf (stream, "\n ");
10378 fprintf (stream, " }\n");
10379 fprintf (stream, "};\n");
10381 if (ferror (stream) || fclose (stream))
10383 fprintf (stderr, "error writing to '%s'\n", filename);
10384 exit (1);
10388 /* Stores in unicode_org_gbp[] the grapheme breaking property from the
10389 GraphemeBreakProperty.txt file. */
10390 static void
10391 fill_org_gbp (const char *graphemebreakproperty_filename)
10393 unsigned int i;
10394 FILE *stream;
10395 int lineno = 0;
10397 for (i = 0; i < 0x110000; i++)
10398 unicode_org_gbp[i] = GBP_OTHER;
10400 stream = fopen (graphemebreakproperty_filename, "r");
10401 if (stream == NULL)
10403 fprintf (stderr, "error during fopen of '%s'\n",
10404 graphemebreakproperty_filename);
10405 exit (1);
10408 for (;;)
10410 char buf[200+1];
10411 unsigned int i1, i2;
10412 char padding[200+1];
10413 char propname[200+1];
10414 int propvalue;
10416 lineno++;
10417 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
10418 break;
10420 if (buf[0] == '\0' || buf[0] == '#')
10421 continue;
10423 if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
10425 if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
10427 fprintf (stderr, "parse error in '%s'\n",
10428 graphemebreakproperty_filename);
10429 exit (1);
10431 i2 = i1;
10433 #define PROP(name,value) \
10434 if (strcmp (propname, name) == 0) propvalue = value; else
10435 PROP ("CR", GBP_CR)
10436 PROP ("LF", GBP_LF)
10437 PROP ("Control", GBP_CONTROL)
10438 PROP ("Extend", GBP_EXTEND)
10439 PROP ("Prepend", GBP_PREPEND)
10440 PROP ("SpacingMark", GBP_SPACINGMARK)
10441 PROP ("L", GBP_L)
10442 PROP ("V", GBP_V)
10443 PROP ("T", GBP_T)
10444 PROP ("LV", GBP_LV)
10445 PROP ("LVT", GBP_LVT)
10446 PROP ("Regional_Indicator", GBP_RI)
10447 PROP ("ZWJ", GBP_ZWJ)
10448 PROP ("E_Base", GBP_EB)
10449 PROP ("E_Modifier", GBP_EM)
10450 PROP ("Glue_After_Zwj", GBP_GAZ)
10451 PROP ("E_Base_GAZ", GBP_EBG)
10452 #undef PROP
10454 fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
10455 graphemebreakproperty_filename, lineno);
10456 exit (1);
10458 assert (i1 <= i2 && i2 < 0x110000);
10460 for (i = i1; i <= i2; i++)
10461 unicode_org_gbp[i] = propvalue;
10464 if (ferror (stream) || fclose (stream))
10466 fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename);
10467 exit (1);
10471 /* ========================================================================= */
10473 /* Composition and decomposition.
10474 Updated for Unicode TR #15 revision 33. */
/* Upper bound for the number of characters that the decomposition of a
   single Unicode character can consist of.  */
#define MAX_DECOMP_LENGTH 18
/* Decomposition types.  UC_DECOMP_CANONICAL is the type of a decomposition
   that has no <tag> prefix; every other type corresponds to the tag shown in
   its comment.  */
enum
{
  UC_DECOMP_CANONICAL =  0, /*            Canonical decomposition.  */
  UC_DECOMP_FONT      =  1, /* <font>     A font variant (e.g. a blackletter form).  */
  UC_DECOMP_NOBREAK   =  2, /* <noBreak>  A no-break version of a space or hyphen.  */
  UC_DECOMP_INITIAL   =  3, /* <initial>  An initial presentation form (Arabic).  */
  UC_DECOMP_MEDIAL    =  4, /* <medial>   A medial presentation form (Arabic).  */
  UC_DECOMP_FINAL     =  5, /* <final>    A final presentation form (Arabic).  */
  UC_DECOMP_ISOLATED  =  6, /* <isolated> An isolated presentation form (Arabic).  */
  UC_DECOMP_CIRCLE    =  7, /* <circle>   An encircled form.  */
  UC_DECOMP_SUPER     =  8, /* <super>    A superscript form.  */
  UC_DECOMP_SUB       =  9, /* <sub>      A subscript form.  */
  UC_DECOMP_VERTICAL  = 10, /* <vertical> A vertical layout presentation form.  */
  UC_DECOMP_WIDE      = 11, /* <wide>     A wide (or zenkaku) compatibility character.  */
  UC_DECOMP_NARROW    = 12, /* <narrow>   A narrow (or hankaku) compatibility character.  */
  UC_DECOMP_SMALL     = 13, /* <small>    A small variant form (CNS compatibility).  */
  UC_DECOMP_SQUARE    = 14, /* <square>   A CJK squared font variant.  */
  UC_DECOMP_FRACTION  = 15, /* <fraction> A vulgar fraction form.  */
  UC_DECOMP_COMPAT    = 16  /* <compat>   Otherwise unspecified compatibility character.  */
};
10501 /* Return the decomposition for a Unicode character (ignoring Hangul Jamo
10502 decompositions). Return the type, or -1 for none. */
10503 static int
10504 get_decomposition (unsigned int ch,
10505 unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH])
10507 const char *decomposition = unicode_attributes[ch].decomposition;
10509 if (decomposition != NULL && decomposition[0] != '\0')
10511 int type = UC_DECOMP_CANONICAL;
10512 unsigned int length;
10513 char *endptr;
10515 if (decomposition[0] == '<')
10517 const char *rangle;
10518 size_t typelen;
10520 rangle = strchr (decomposition + 1, '>');
10521 assert (rangle != NULL);
10522 typelen = rangle + 1 - decomposition;
10523 #define TYPE(t1,t2) \
10524 if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \
10525 type = t2; \
10526 else
10527 TYPE ("<font>", UC_DECOMP_FONT)
10528 TYPE ("<noBreak>", UC_DECOMP_NOBREAK)
10529 TYPE ("<initial>", UC_DECOMP_INITIAL)
10530 TYPE ("<medial>", UC_DECOMP_MEDIAL)
10531 TYPE ("<final>", UC_DECOMP_FINAL)
10532 TYPE ("<isolated>", UC_DECOMP_ISOLATED)
10533 TYPE ("<circle>", UC_DECOMP_CIRCLE)
10534 TYPE ("<super>", UC_DECOMP_SUPER)
10535 TYPE ("<sub>", UC_DECOMP_SUB)
10536 TYPE ("<vertical>", UC_DECOMP_VERTICAL)
10537 TYPE ("<wide>", UC_DECOMP_WIDE)
10538 TYPE ("<narrow>", UC_DECOMP_NARROW)
10539 TYPE ("<small>", UC_DECOMP_SMALL)
10540 TYPE ("<square>", UC_DECOMP_SQUARE)
10541 TYPE ("<fraction>", UC_DECOMP_FRACTION)
10542 TYPE ("<compat>", UC_DECOMP_COMPAT)
10544 fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition);
10545 exit (1);
10547 #undef TYPE
10548 decomposition = rangle + 1;
10549 if (decomposition[0] == ' ')
10550 decomposition++;
10552 for (length = 0; length < MAX_DECOMP_LENGTH; length++)
10554 decomposed[length] = strtoul (decomposition, &endptr, 16);
10555 if (endptr == decomposition)
10556 break;
10557 decomposition = endptr;
10558 if (decomposition[0] == ' ')
10559 decomposition++;
10561 /* Make sure that *DECOMPOSITION is not NULL-terminated.
10562 Otherwise MAX_DECOMP_LENGTH is too small. */
10563 assert (*decomposition == '\0');
10565 *lengthp = length;
10566 return type;
10568 else
10569 return -1;
10572 /* Construction of sparse 3-level tables. */
10573 #define TABLE decomp_table
10574 #define ELEMENT uint16_t
10575 #define DEFAULT (uint16_t)(-1)
10576 #define xmalloc malloc
10577 #define xrealloc realloc
10578 #include "3level.h"
10580 static void
10581 output_decomposition (FILE *stream1, FILE *stream2)
10583 struct decomp_table t;
10584 unsigned int level1_offset, level2_offset, level3_offset;
10585 unsigned int offset;
10586 unsigned int ch;
10587 unsigned int i;
10589 t.p = 5;
10590 t.q = 5;
10591 decomp_table_init (&t);
10593 fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n");
10594 fprintf (stream1, "\n");
10595 fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{");
10596 offset = 0;
10598 for (ch = 0; ch < 0x110000; ch++)
10600 unsigned int length;
10601 unsigned int decomposed[MAX_DECOMP_LENGTH];
10602 int type = get_decomposition (ch, &length, decomposed);
10604 if (type >= 0)
10606 assert (offset < (1 << 15));
10607 decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset);
10609 /* Produce length 3-bytes entries. */
10610 /* We would need a special representation of zero-length entries. */
10611 assert (length != 0);
10612 for (i = 0; i < length; i++)
10614 if (offset > 0)
10615 fprintf (stream2, ",");
10616 if ((offset % 4) == 0)
10617 fprintf (stream2, "\n ");
10618 assert (decomposed[i] < (1 << 18));
10619 fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X",
10620 (((i+1 < length ? (1 << 23) : 0)
10621 | (i == 0 ? (type << 18) : 0)
10622 | decomposed[i]) >> 16) & 0xff,
10623 (decomposed[i] >> 8) & 0xff,
10624 decomposed[i] & 0xff);
10625 offset++;
10630 fprintf (stream2, "\n};\n");
10631 fprintf (stream2, "\n");
10633 decomp_table_finalize (&t);
10635 level1_offset =
10636 5 * sizeof (uint32_t);
10637 level2_offset =
10638 5 * sizeof (uint32_t)
10639 + t.level1_size * sizeof (uint32_t);
10640 level3_offset =
10641 5 * sizeof (uint32_t)
10642 + t.level1_size * sizeof (uint32_t)
10643 + (t.level2_size << t.q) * sizeof (uint32_t);
10645 for (i = 0; i < 5; i++)
10646 fprintf (stream1, "#define decomp_header_%d %d\n", i,
10647 ((uint32_t *) t.result)[i]);
10648 fprintf (stream1, "\n");
10649 fprintf (stream1, "typedef struct\n");
10650 fprintf (stream1, " {\n");
10651 fprintf (stream1, " int level1[%zu];\n", t.level1_size);
10652 fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q);
10653 fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p);
10654 fprintf (stream1, " }\n");
10655 fprintf (stream1, "decomp_index_table_t;\n");
10656 fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n");
10657 fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n");
10658 fprintf (stream2, "{\n");
10659 fprintf (stream2, " {");
10660 if (t.level1_size > 8)
10661 fprintf (stream2, "\n ");
10662 for (i = 0; i < t.level1_size; i++)
10664 uint32_t offset;
10665 if (i > 0 && (i % 8) == 0)
10666 fprintf (stream2, "\n ");
10667 offset = ((uint32_t *) (t.result + level1_offset))[i];
10668 if (offset == 0)
10669 fprintf (stream2, " %5d", -1);
10670 else
10671 fprintf (stream2, " %5zu",
10672 (offset - level2_offset) / sizeof (uint32_t));
10673 if (i+1 < t.level1_size)
10674 fprintf (stream2, ",");
10676 if (t.level1_size > 8)
10677 fprintf (stream2, "\n ");
10678 fprintf (stream2, " },\n");
10679 fprintf (stream2, " {");
10680 if (t.level2_size << t.q > 8)
10681 fprintf (stream2, "\n ");
10682 for (i = 0; i < t.level2_size << t.q; i++)
10684 uint32_t offset;
10685 if (i > 0 && (i % 8) == 0)
10686 fprintf (stream2, "\n ");
10687 offset = ((uint32_t *) (t.result + level2_offset))[i];
10688 if (offset == 0)
10689 fprintf (stream2, " %5d", -1);
10690 else
10691 fprintf (stream2, " %5zu",
10692 (offset - level3_offset) / sizeof (uint16_t));
10693 if (i+1 < t.level2_size << t.q)
10694 fprintf (stream2, ",");
10696 if (t.level2_size << t.q > 8)
10697 fprintf (stream2, "\n ");
10698 fprintf (stream2, " },\n");
10699 fprintf (stream2, " {");
10700 if (t.level3_size << t.p > 8)
10701 fprintf (stream2, "\n ");
10702 for (i = 0; i < t.level3_size << t.p; i++)
10704 uint16_t value = ((uint16_t *) (t.result + level3_offset))[i];
10705 if (i > 0 && (i % 8) == 0)
10706 fprintf (stream2, "\n ");
10707 fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value);
10708 if (i+1 < t.level3_size << t.p)
10709 fprintf (stream2, ",");
10711 if (t.level3_size << t.p > 8)
10712 fprintf (stream2, "\n ");
10713 fprintf (stream2, " }\n");
10714 fprintf (stream2, "};\n");
/* Create the two decomposition table files FILENAME1 and FILENAME2,
   write the common banner (naming the Unicode VERSION) into each of them,
   and let output_decomposition fill in the tables.
   Exits with status 1 if a file cannot be opened or written.  */
static void
output_decomposition_tables (const char *filename1, const char *filename2, const char *version)
{
  const char *names[2] = { filename1, filename2 };
  FILE *outputs[2];
  size_t k;

  /* Open both files before anything is written.  */
  for (k = 0; k < 2; k++)
    {
      outputs[k] = fopen (names[k], "w");
      if (outputs[k] == NULL)
        {
          fprintf (stderr, "cannot open '%s' for writing\n", names[k]);
          exit (1);
        }
    }

  /* Same banner in both files.  */
  for (k = 0; k < 2; k++)
    {
      FILE *out = outputs[k];

      fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
      fputs ("/* Decomposition of Unicode characters. */\n", out);
      fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
               version);
      fputs ("\n", out);

      fputs ("/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n", out);
      fputs ("\n", out);
      output_library_license (out, true);
      fputs ("\n", out);
    }

  output_decomposition (outputs[0], outputs[1]);

  for (k = 0; k < 2; k++)
    {
      FILE *out = outputs[k];

      if (ferror (out) || fclose (out))
        {
          fprintf (stderr, "error writing to '%s'\n", names[k]);
          exit (1);
        }
    }
}
/* The "excluded from composition" property from the CompositionExclusions.txt file. */
char unicode_composition_exclusions[0x110000];

/* Reads COMPOSITIONEXCLUSIONS_FILENAME and sets
   unicode_composition_exclusions[c] to 1 for every code point c that is
   listed in it, and to 0 for all other code points.
   Lines that start with '#' are skipped; every other line must start with
   a hexadecimal code point.
   Exits with status 1 if the file cannot be opened, parsed or read.  */
static void
fill_composition_exclusions (const char *compositionexclusions_filename)
{
  FILE *input = fopen (compositionexclusions_filename, "r");
  char line[200+1];

  if (input == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename);
      exit (1);
    }

  memset (unicode_composition_exclusions, 0,
          sizeof (unicode_composition_exclusions));

  /* The trailing "\n" in the format also consumes the whitespace that
     follows the line.  */
  while (fscanf (input, "%200[^\n]\n", line) >= 1)
    {
      unsigned int code;

      if (line[0] == '\0' || line[0] == '#')
        continue;

      if (sscanf (line, "%X", &code) != 1)
        {
          fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename);
          exit (1);
        }
      assert (code < 0x110000);

      unicode_composition_exclusions[code] = 1;
    }

  if (ferror (input) || fclose (input))
    {
      fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename);
      exit (1);
    }
}
10812 static void
10813 debug_output_composition_tables (const char *filename)
10815 FILE *stream;
10816 unsigned int ch;
10818 stream = fopen (filename, "w");
10819 if (stream == NULL)
10821 fprintf (stderr, "cannot open '%s' for writing\n", filename);
10822 exit (1);
10825 for (ch = 0; ch < 0x110000; ch++)
10827 unsigned int length;
10828 unsigned int decomposed[MAX_DECOMP_LENGTH];
10829 int type = get_decomposition (ch, &length, decomposed);
10831 if (type == UC_DECOMP_CANONICAL
10832 /* Consider only binary decompositions.
10833 Exclude singleton decompositions. */
10834 && length == 2)
10836 unsigned int code1 = decomposed[0];
10837 unsigned int code2 = decomposed[1];
10838 unsigned int combined = ch;
10840 /* Exclude decompositions where the first part is not a starter,
10841 i.e. is not of canonical combining class 0. */
10842 if (strcmp (unicode_attributes[code1].combining, "0") == 0
10843 /* Exclude characters listed in CompositionExclusions.txt. */
10844 && !unicode_composition_exclusions[combined])
10846 /* The combined character must now also be a starter.
10847 Verify this. */
10848 assert (strcmp (unicode_attributes[combined].combining, "0") == 0);
10850 fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n",
10851 code1,
10852 code2,
10853 combined,
10854 unicode_attributes[code2].combining);
10859 if (ferror (stream) || fclose (stream))
10861 fprintf (stderr, "error writing to '%s'\n", filename);
10862 exit (1);
10866 static void
10867 output_composition_tables (const char *filename, const char *filename2,
10868 const char *version)
10870 unsigned int max_code1;
10871 unsigned int max_code2;
10872 FILE *stream;
10873 unsigned int ch;
10875 max_code1 = 0;
10876 max_code2 = 0;
10878 stream = fopen (filename, "w");
10879 if (stream == NULL)
10881 fprintf (stderr, "cannot open '%s' for writing\n", filename);
10882 exit (1);
10885 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
10886 fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
10887 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
10888 version);
10889 fprintf (stream, "\n");
10891 fprintf (stream, "/* Copyright (C) 2009-2024 Free Software Foundation, Inc.\n");
10892 fprintf (stream, "\n");
10893 output_library_license (stream, true);
10894 fprintf (stream, "\n");
10896 /* The composition table is a set of mappings (code1, code2) -> combined,
10897 with 928 entries,
10898 367 values for code1 (from 0x003C to 0x30FD),
10899 54 values for code2 (from 0x0300 to 0x309A).
10900 For a fixed code1, there are from 1 to 19 possible values for code2.
10901 For a fixed code2, there are from 1 to 117 possible values for code1.
10902 This is a very sparse matrix.
10904 We want an O(1) hash lookup.
10906 We could implement the hash lookup by mapping (code1, code2) to a linear
10907 combination mul1*code1 + mul2*code2, which is then used as an index into
10908 a 3-level table. But this leads to a table of size 37 KB.
10910 We use gperf to implement the hash lookup, giving it the 928 sets of
10911 4 bytes (code1, code2) as input. gperf generates a hash table of size
10912 1527, which is quite good (60% filled). It requires an auxiliary table
10913 lookup in a table of size 0.5 KB. The total tables size is 11 KB. */
10915 fprintf (stream, "struct composition_rule { char codes[6]; };\n");
10916 fprintf (stream, "%%struct-type\n");
10917 fprintf (stream, "%%language=ANSI-C\n");
10918 fprintf (stream, "%%define slot-name codes\n");
10919 fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n");
10920 fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n");
10921 fprintf (stream, "%%compare-lengths\n");
10922 fprintf (stream, "%%compare-strncmp\n");
10923 fprintf (stream, "%%readonly-tables\n");
10924 fprintf (stream, "%%omit-struct-type\n");
10925 fprintf (stream, "%%%%\n");
10927 for (ch = 0; ch < 0x110000; ch++)
10929 unsigned int length;
10930 unsigned int decomposed[MAX_DECOMP_LENGTH];
10931 int type = get_decomposition (ch, &length, decomposed);
10933 if (type == UC_DECOMP_CANONICAL
10934 /* Consider only binary decompositions.
10935 Exclude singleton decompositions. */
10936 && length == 2)
10938 unsigned int code1 = decomposed[0];
10939 unsigned int code2 = decomposed[1];
10940 unsigned int combined = ch;
10942 /* Exclude decompositions where the first part is not a starter,
10943 i.e. is not of canonical combining class 0. */
10944 if (strcmp (unicode_attributes[code1].combining, "0") == 0
10945 /* Exclude characters listed in CompositionExclusions.txt. */
10946 && !unicode_composition_exclusions[combined])
10948 /* The combined character must now also be a starter.
10949 Verify this. */
10950 assert (strcmp (unicode_attributes[combined].combining, "0") == 0);
10952 if (max_code1 < code1)
10953 max_code1 = code1;
10954 if (max_code2 < code2)
10955 max_code2 = code2;
10957 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n",
10958 (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff,
10959 (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff,
10960 combined);
10965 if (ferror (stream) || fclose (stream))
10967 fprintf (stderr, "error writing to '%s'\n", filename);
10968 exit (1);
10971 stream = fopen (filename2, "w");
10972 if (stream == NULL)
10974 fprintf (stderr, "cannot open '%s' for writing\n", filename2);
10975 exit (1);
10978 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
10979 fprintf (stream, "/* Canonical composition of Unicode characters. */\n");
10980 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
10981 version);
10982 fprintf (stream, "\n");
10984 fprintf (stream, "/* Copyright (C) 2009-2024 Free Software Foundation, Inc.\n");
10985 fprintf (stream, "\n");
10986 output_library_license (stream, true);
10987 fprintf (stream, "\n");
10989 fprintf (stream, "/* Maximum value of the first argument for which gl_uninorm_compose_lookup\n"
10990 " can return a non-NULL value. */\n");
10991 fprintf (stream, "#define UNINORM_COMPOSE_MAX_ARG1 0x%x\n", max_code1);
10992 fprintf (stream, "/* Maximum value of the second argument for which gl_uninorm_compose_lookup\n"
10993 " can return a non-NULL value. */\n");
10994 fprintf (stream, "#define UNINORM_COMPOSE_MAX_ARG2 0x%x\n", max_code2);
10996 if (ferror (stream) || fclose (stream))
10998 fprintf (stderr, "error writing to '%s'\n", filename2);
10999 exit (1);
11003 /* ========================================================================= */
/* Output the test for a simple character mapping table to the given file.
   FILENAME is the file to create.  FUNCTION_NAME is the name of the
   function under test, as it shall appear in the generated MAP macro.
   FUNC is the mapping itself; one { ch, FUNC (ch) } pair is emitted for
   every code point that FUNC does not map to itself.  VERSION is the
   Unicode version named in the banner.
   Exits with status 1 if the file cannot be opened or written.  */

static void
output_simple_mapping_test (const char *filename,
                            const char *function_name,
                            unsigned int (*func) (unsigned int),
                            const char *version)
{
  /* What to print in front of the next pair: nothing before the first
     pair, a comma and a newline before every further one.  */
  const char *separator = "";
  unsigned int ch;
  FILE *out;

  out = fopen (filename, "w");
  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fputs ("/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n", out);
  fputs ("/* Test the Unicode character mapping functions.\n", out);
  fputs (" Copyright (C) 2009-2024 Free Software Foundation, Inc.\n", out);
  fputs ("\n", out);
  output_tests_license (out);
  fputs ("\n", out);
  fprintf (out, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fputs ("\n", out);
  fputs ("#include \"test-mapping-part1.h\"\n", out);
  fputs ("\n", out);

  for (ch = 0; ch < 0x110000; ch++)
    {
      unsigned int mapped = func (ch);

      if (mapped == ch)
        continue;

      fprintf (out, "%s { 0x%04X, 0x%04X }", separator, ch, mapped);
      separator = ",\n";
    }
  /* Terminate the last pair, if there was one.  */
  if (separator[0] != '\0')
    fputs ("\n", out);

  fputs ("\n", out);
  fprintf (out, "#define MAP(c) %s (c)\n", function_name);
  fputs ("#include \"test-mapping-part2.h\"\n", out);

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
11063 /* Construction of sparse 3-level tables. */
11064 #define TABLE mapping_table
11065 #define ELEMENT int32_t
11066 #define DEFAULT 0
11067 #define xmalloc malloc
11068 #define xrealloc realloc
11069 #include "3level.h"
11071 /* Output a simple character mapping table to the given file. */
11073 static void
11074 output_simple_mapping (const char *filename,
11075 unsigned int (*func) (unsigned int),
11076 const char *version)
11078 FILE *stream;
11079 unsigned int ch, i;
11080 struct mapping_table t;
11081 unsigned int level1_offset, level2_offset, level3_offset;
11083 stream = fopen (filename, "w");
11084 if (stream == NULL)
11086 fprintf (stderr, "cannot open '%s' for writing\n", filename);
11087 exit (1);
11090 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
11091 fprintf (stream, "/* Simple character mapping of Unicode characters. */\n");
11092 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
11093 version);
11094 fprintf (stream, "\n");
11096 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
11097 fprintf (stream, "\n");
11098 output_library_license (stream,
11099 strcmp (filename, "unicase/tolower.h") == 0
11100 || strcmp (filename, "unicase/toupper.h") == 0);
11101 fprintf (stream, "\n");
11103 t.p = 7;
11104 t.q = 9;
11105 mapping_table_init (&t);
11107 for (ch = 0; ch < 0x110000; ch++)
11109 int value = (int) func (ch) - (int) ch;
11111 mapping_table_add (&t, ch, value);
11114 mapping_table_finalize (&t);
11116 /* Offsets in t.result, in memory of this process. */
11117 level1_offset =
11118 5 * sizeof (uint32_t);
11119 level2_offset =
11120 5 * sizeof (uint32_t)
11121 + t.level1_size * sizeof (uint32_t);
11122 level3_offset =
11123 5 * sizeof (uint32_t)
11124 + t.level1_size * sizeof (uint32_t)
11125 + (t.level2_size << t.q) * sizeof (uint32_t);
11127 for (i = 0; i < 5; i++)
11128 fprintf (stream, "#define mapping_header_%d %d\n", i,
11129 ((uint32_t *) t.result)[i]);
11130 fprintf (stream, "static const\n");
11131 fprintf (stream, "struct\n");
11132 fprintf (stream, " {\n");
11133 fprintf (stream, " int level1[%zu];\n", t.level1_size);
11134 fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
11135 fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p);
11136 fprintf (stream, " }\n");
11137 fprintf (stream, "u_mapping =\n");
11138 fprintf (stream, "{\n");
11139 fprintf (stream, " {");
11140 if (t.level1_size > 8)
11141 fprintf (stream, "\n ");
11142 for (i = 0; i < t.level1_size; i++)
11144 uint32_t offset;
11145 if (i > 0 && (i % 8) == 0)
11146 fprintf (stream, "\n ");
11147 offset = ((uint32_t *) (t.result + level1_offset))[i];
11148 if (offset == 0)
11149 fprintf (stream, " %5d", -1);
11150 else
11151 fprintf (stream, " %5zu",
11152 (offset - level2_offset) / sizeof (uint32_t));
11153 if (i+1 < t.level1_size)
11154 fprintf (stream, ",");
11156 if (t.level1_size > 8)
11157 fprintf (stream, "\n ");
11158 fprintf (stream, " },\n");
11159 fprintf (stream, " {");
11160 if (t.level2_size << t.q > 8)
11161 fprintf (stream, "\n ");
11162 for (i = 0; i < t.level2_size << t.q; i++)
11164 uint32_t offset;
11165 if (i > 0 && (i % 8) == 0)
11166 fprintf (stream, "\n ");
11167 offset = ((uint32_t *) (t.result + level2_offset))[i];
11168 if (offset == 0)
11169 fprintf (stream, " %5d", -1);
11170 else
11171 fprintf (stream, " %5zu",
11172 (offset - level3_offset) / sizeof (int32_t));
11173 if (i+1 < t.level2_size << t.q)
11174 fprintf (stream, ",");
11176 if (t.level2_size << t.q > 8)
11177 fprintf (stream, "\n ");
11178 fprintf (stream, " },\n");
11179 fprintf (stream, " {");
11180 if (t.level3_size << t.p > 8)
11181 fprintf (stream, "\n ");
11182 for (i = 0; i < t.level3_size << t.p; i++)
11184 if (i > 0 && (i % 8) == 0)
11185 fprintf (stream, "\n ");
11186 fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]);
11187 if (i+1 < t.level3_size << t.p)
11188 fprintf (stream, ",");
11190 if (t.level3_size << t.p > 8)
11191 fprintf (stream, "\n ");
11192 fprintf (stream, " }\n");
11193 fprintf (stream, "};\n");
11195 if (ferror (stream) || fclose (stream))
11197 fprintf (stderr, "error writing to '%s'\n", filename);
11198 exit (1);
11202 /* ========================================================================= */
/* The context in which a special casing rule applies.
   The negation of a context x is represented by -x.  */
enum
{
  SCC_ALWAYS            = 0,    /* no context condition */
  SCC_FINAL_SIGMA       = 1,    /* "Final_Sigma" */
  SCC_AFTER_SOFT_DOTTED = 2,    /* "After_Soft_Dotted" */
  SCC_MORE_ABOVE        = 3,    /* "More_Above" */
  SCC_BEFORE_DOT        = 4,    /* "Before_Dot" */
  SCC_AFTER_I           = 5     /* "After_I" */
};
/* One special casing rule.
   Each mapping holds up to 3 code points; unused trailing slots are 0.  */
struct special_casing_rule
{
  /* The code point the rule applies to.  */
  unsigned int code;
  /* Its lowercase, titlecase, uppercase and casefolded forms.  */
  unsigned int lower_mapping[3];
  unsigned int title_mapping[3];
  unsigned int upper_mapping[3];
  unsigned int casefold_mapping[3];
  /* Two-letter language code, or NULL if there is no language
     restriction.  */
  const char *language;
  /* One of the SCC_* values, or the negative of one.  */
  int context;
};
/* The special casing rules. */
struct special_casing_rule **casing_rules;
unsigned int num_casing_rules;
unsigned int allocated_casing_rules;

/* Appends NEW_RULE to casing_rules.
   When the array is full, its capacity is doubled (but is at least 16).
   Only the pointer is stored; the rule itself is not copied.
   Exits with status 1 if the array cannot be enlarged.  */
static void
add_casing_rule (struct special_casing_rule *new_rule)
{
  if (num_casing_rules == allocated_casing_rules)
    {
      unsigned int new_allocated = 2 * allocated_casing_rules;
      struct special_casing_rule **new_rules;

      if (new_allocated < 16)
        new_allocated = 16;
      /* Keep the old pointer until the reallocation is known to have
         succeeded.  */
      new_rules =
        (struct special_casing_rule **)
        realloc (casing_rules, new_allocated * sizeof (struct special_casing_rule *));
      if (new_rules == NULL)
        {
          fprintf (stderr, "out of memory\n");
          exit (1);
        }
      casing_rules = new_rules;
      allocated_casing_rules = new_allocated;
    }
  casing_rules[num_casing_rules++] = new_rule;
}
11248 /* Stores in casing_rules the special casing rules found in
11249 specialcasing_filename. */
11250 static void
11251 fill_casing_rules (const char *specialcasing_filename)
11253 FILE *stream;
11255 stream = fopen (specialcasing_filename, "r");
11256 if (stream == NULL)
11258 fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
11259 exit (1);
11262 casing_rules = NULL;
11263 num_casing_rules = 0;
11264 allocated_casing_rules = 0;
11266 for (;;)
11268 char buf[200+1];
11269 char *scanptr;
11270 char *endptr;
11271 int i;
11273 unsigned int code;
11274 unsigned int lower_mapping[3];
11275 unsigned int title_mapping[3];
11276 unsigned int upper_mapping[3];
11277 char *language;
11278 int context;
11280 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
11281 break;
11283 if (buf[0] == '\0' || buf[0] == '#')
11284 continue;
11286 /* Scan code. */
11287 scanptr = buf;
11288 code = strtoul (scanptr, &endptr, 16);
11289 if (endptr == scanptr)
11291 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
11292 exit (1);
11294 scanptr = endptr;
11295 if (*scanptr != ';')
11297 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
11298 exit (1);
11300 scanptr++;
11302 /* Scan lower mapping. */
11303 for (i = 0; i < 3; i++)
11304 lower_mapping[i] = 0;
11305 for (i = 0; i < 3; i++)
11307 while (*scanptr == ' ')
11308 scanptr++;
11309 if (*scanptr == ';')
11310 break;
11311 lower_mapping[i] = strtoul (scanptr, &endptr, 16);
11312 if (endptr == scanptr)
11314 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
11315 exit (1);
11317 scanptr = endptr;
11319 if (*scanptr != ';')
11321 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
11322 exit (1);
11324 scanptr++;
11326 /* Scan title mapping. */
11327 for (i = 0; i < 3; i++)
11328 title_mapping[i] = 0;
11329 for (i = 0; i < 3; i++)
11331 while (*scanptr == ' ')
11332 scanptr++;
11333 if (*scanptr == ';')
11334 break;
11335 title_mapping[i] = strtoul (scanptr, &endptr, 16);
11336 if (endptr == scanptr)
11338 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
11339 exit (1);
11341 scanptr = endptr;
11343 if (*scanptr != ';')
11345 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
11346 exit (1);
11348 scanptr++;
11350 /* Scan upper mapping. */
11351 for (i = 0; i < 3; i++)
11352 upper_mapping[i] = 0;
11353 for (i = 0; i < 3; i++)
11355 while (*scanptr == ' ')
11356 scanptr++;
11357 if (*scanptr == ';')
11358 break;
11359 upper_mapping[i] = strtoul (scanptr, &endptr, 16);
11360 if (endptr == scanptr)
11362 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
11363 exit (1);
11365 scanptr = endptr;
11367 if (*scanptr != ';')
11369 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
11370 exit (1);
11372 scanptr++;
11374 /* Scan language and context. */
11375 language = NULL;
11376 context = SCC_ALWAYS;
11377 while (*scanptr == ' ')
11378 scanptr++;
11379 if (*scanptr != '\0' && *scanptr != '#')
11381 const char *word_begin = scanptr;
11382 const char *word_end;
11384 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
11385 scanptr++;
11386 word_end = scanptr;
11388 while (*scanptr == ' ')
11389 scanptr++;
11391 if (word_end - word_begin == 2)
11393 language = (char *) malloc ((word_end - word_begin) + 1);
11394 memcpy (language, word_begin, 2);
11395 language[word_end - word_begin] = '\0';
11396 word_begin = word_end = NULL;
11398 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
11400 word_begin = scanptr;
11401 while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
11402 scanptr++;
11403 word_end = scanptr;
11407 if (word_end > word_begin)
11409 bool negate = false;
11411 if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
11413 word_begin += 4;
11414 negate = true;
11416 if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
11417 context = SCC_FINAL_SIGMA;
11418 else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
11419 context = SCC_AFTER_SOFT_DOTTED;
11420 else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
11421 context = SCC_MORE_ABOVE;
11422 else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
11423 context = SCC_BEFORE_DOT;
11424 else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
11425 context = SCC_AFTER_I;
11426 else
11428 fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
11429 exit (1);
11431 if (negate)
11432 context = - context;
11435 if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';')
11437 fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
11438 exit (1);
11442 /* Store the rule. */
11444 struct special_casing_rule *new_rule =
11445 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
11446 new_rule->code = code;
11447 new_rule->language = language;
11448 new_rule->context = context;
11449 memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
11450 memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
11451 memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
11453 add_casing_rule (new_rule);
11457 if (ferror (stream) || fclose (stream))
11459 fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
11460 exit (1);
/* One casefolding rule.  */
struct casefold_rule
{
  /* The code point the rule applies to.  */
  unsigned int code;
  /* Its casefolded form: up to 3 code points; unused trailing slots
     are 0.  */
  unsigned int mapping[3];
  /* Language code, or NULL if there is no language restriction.  */
  const char *language;
};
/* The casefolding rules: an array of pointers to rules, the number of
   rules it currently holds, and the number of slots allocated for it.  */
struct casefold_rule **casefolding_rules;
unsigned int num_casefolding_rules;
unsigned int allocated_casefolding_rules;
11477 /* Stores in casefolding_rules the case folding rules found in
11478 casefolding_filename. */
11479 static void
11480 fill_casefolding_rules (const char *casefolding_filename)
11482 FILE *stream;
11484 stream = fopen (casefolding_filename, "r");
11485 if (stream == NULL)
11487 fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
11488 exit (1);
11491 casefolding_rules = NULL;
11492 num_casefolding_rules = 0;
11493 allocated_casefolding_rules = 0;
11495 for (;;)
11497 char buf[200+1];
11498 char *scanptr;
11499 char *endptr;
11500 int i;
11502 unsigned int code;
11503 char type;
11504 unsigned int mapping[3];
11506 if (fscanf (stream, "%200[^\n]\n", buf) < 1)
11507 break;
11509 if (buf[0] == '\0' || buf[0] == '#')
11510 continue;
11512 /* Scan code. */
11513 scanptr = buf;
11514 code = strtoul (scanptr, &endptr, 16);
11515 if (endptr == scanptr)
11517 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
11518 exit (1);
11520 scanptr = endptr;
11521 if (*scanptr != ';')
11523 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
11524 exit (1);
11526 scanptr++;
11528 /* Scan type. */
11529 while (*scanptr == ' ')
11530 scanptr++;
11532 switch (*scanptr)
11534 case 'C': case 'F': case 'S': case 'T':
11535 type = *scanptr;
11536 break;
11537 default:
11538 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
11539 exit (1);
11541 scanptr++;
11542 if (*scanptr != ';')
11544 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
11545 exit (1);
11547 scanptr++;
11549 /* Scan casefold mapping. */
11550 for (i = 0; i < 3; i++)
11551 mapping[i] = 0;
11552 for (i = 0; i < 3; i++)
11554 while (*scanptr == ' ')
11555 scanptr++;
11556 if (*scanptr == ';')
11557 break;
11558 mapping[i] = strtoul (scanptr, &endptr, 16);
11559 if (endptr == scanptr)
11561 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
11562 exit (1);
11564 scanptr = endptr;
11566 if (*scanptr != ';')
11568 fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
11569 exit (1);
11571 scanptr++;
11573 /* Ignore rules of type 'S'; we use the rules of type 'F' instead. */
11574 if (type != 'S')
11576 const char * const *languages;
11577 unsigned int languages_count;
11579 /* Type 'T' indicates that the rule is applicable to Turkish
11580 languages only. */
11581 if (type == 'T')
11583 static const char * const turkish_languages[] = { "tr", "az" };
11584 languages = turkish_languages;
11585 languages_count = 2;
11587 else
11589 static const char * const all_languages[] = { NULL };
11590 languages = all_languages;
11591 languages_count = 1;
11594 for (i = 0; i < languages_count; i++)
11596 /* Store a new rule. */
11597 struct casefold_rule *new_rule =
11598 (struct casefold_rule *) malloc (sizeof (struct casefold_rule));
11599 new_rule->code = code;
11600 memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
11601 new_rule->language = languages[i];
11603 if (num_casefolding_rules == allocated_casefolding_rules)
11605 allocated_casefolding_rules = 2 * allocated_casefolding_rules;
11606 if (allocated_casefolding_rules < 16)
11607 allocated_casefolding_rules = 16;
11608 casefolding_rules =
11609 (struct casefold_rule **)
11610 realloc (casefolding_rules,
11611 allocated_casefolding_rules * sizeof (struct casefold_rule *));
11613 casefolding_rules[num_casefolding_rules++] = new_rule;
11618 if (ferror (stream) || fclose (stream))
11620 fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
11621 exit (1);
/* Casefold mapping, when it maps to a single character. */
unsigned int unicode_casefold[0x110000];

/* Returns the entry of unicode_casefold for CH.
   CH is used as an array index without a range check.  */
static unsigned int
to_casefold (unsigned int ch)
{
  const unsigned int *entry = &unicode_casefold[ch];

  return *entry;
}
11634 /* Redistribute the casefolding_rules:
11635 - Rules that map to a single character, language independently, are stored
11636 in unicode_casefold.
11637 - Other rules are merged into casing_rules. */
11638 static void
11639 redistribute_casefolding_rules (void)
11641 unsigned int ch, i, j;
11643 /* Fill unicode_casefold[]. */
11644 for (ch = 0; ch < 0x110000; ch++)
11645 unicode_casefold[ch] = ch;
11646 for (i = 0; i < num_casefolding_rules; i++)
11648 struct casefold_rule *cfrule = casefolding_rules[i];
11650 if (cfrule->language == NULL && cfrule->mapping[1] == 0)
11652 ch = cfrule->code;
11653 assert (ch < 0x110000);
11654 unicode_casefold[ch] = cfrule->mapping[0];
11658 /* Extend the special casing rules by filling in their casefold_mapping[]
11659 field. */
11660 for (j = 0; j < num_casing_rules; j++)
11662 struct special_casing_rule *rule = casing_rules[j];
11663 unsigned int k;
11665 rule->casefold_mapping[0] = to_casefold (rule->code);
11666 for (k = 1; k < 3; k++)
11667 rule->casefold_mapping[k] = 0;
11670 /* Now merge the other casefolding rules into casing_rules. */
11671 for (i = 0; i < num_casefolding_rules; i++)
11673 struct casefold_rule *cfrule = casefolding_rules[i];
11675 if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
11677 /* Find a rule that applies to the same code, same language, and it
11678 has context SCC_ALWAYS. At the same time, update all rules that
11679 have the same code and same or more specific language. */
11680 struct special_casing_rule *found_rule = NULL;
11682 for (j = 0; j < num_casing_rules; j++)
11684 struct special_casing_rule *rule = casing_rules[j];
11686 if (rule->code == cfrule->code
11687 && (cfrule->language == NULL
11688 || (rule->language != NULL
11689 && strcmp (rule->language, cfrule->language) == 0)))
11691 memcpy (rule->casefold_mapping, cfrule->mapping,
11692 sizeof (rule->casefold_mapping));
11694 if ((cfrule->language == NULL
11695 ? rule->language == NULL
11696 : rule->language != NULL
11697 && strcmp (rule->language, cfrule->language) == 0)
11698 && rule->context == SCC_ALWAYS)
11700 /* Found it. */
11701 found_rule = rule;
11706 if (found_rule == NULL)
11708 /* Create a new rule. */
11709 struct special_casing_rule *new_rule =
11710 (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
11712 /* Try to find a rule that applies to the same code, no language
11713 restriction, and with context SCC_ALWAYS. */
11714 for (j = 0; j < num_casing_rules; j++)
11716 struct special_casing_rule *rule = casing_rules[j];
11718 if (rule->code == cfrule->code
11719 && rule->context == SCC_ALWAYS
11720 && rule->language == NULL)
11722 /* Found it. */
11723 found_rule = rule;
11724 break;
11728 new_rule->code = cfrule->code;
11729 new_rule->language = cfrule->language;
11730 new_rule->context = SCC_ALWAYS;
11731 if (found_rule != NULL)
11733 memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
11734 sizeof (new_rule->lower_mapping));
11735 memcpy (new_rule->title_mapping, found_rule->title_mapping,
11736 sizeof (new_rule->title_mapping));
11737 memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
11738 sizeof (new_rule->upper_mapping));
11740 else
11742 unsigned int k;
11744 new_rule->lower_mapping[0] = to_lower (cfrule->code);
11745 for (k = 1; k < 3; k++)
11746 new_rule->lower_mapping[k] = 0;
11747 new_rule->title_mapping[0] = to_title (cfrule->code);
11748 for (k = 1; k < 3; k++)
11749 new_rule->title_mapping[k] = 0;
11750 new_rule->upper_mapping[0] = to_upper (cfrule->code);
11751 for (k = 1; k < 3; k++)
11752 new_rule->upper_mapping[k] = 0;
11754 memcpy (new_rule->casefold_mapping, cfrule->mapping,
11755 sizeof (new_rule->casefold_mapping));
11757 add_casing_rule (new_rule);
11763 static int
11764 compare_casing_rules (const void *a, const void *b)
11766 struct special_casing_rule *a_rule = *(struct special_casing_rule **) a;
11767 struct special_casing_rule *b_rule = *(struct special_casing_rule **) b;
11768 unsigned int a_code = a_rule->code;
11769 unsigned int b_code = b_rule->code;
11771 if (a_code < b_code)
11772 return -1;
11773 if (a_code > b_code)
11774 return 1;
11776 /* Sort the more specific rules before the more general ones. */
11777 return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0))
11778 + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0)));
11781 static void
11782 sort_casing_rules (void)
11784 /* Sort the rules 1. by code, 2. by specificity. */
11785 if (num_casing_rules > 1)
11786 qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *),
11787 compare_casing_rules);
11790 /* Output the special casing rules. */
11791 static void
11792 output_casing_rules (const char *filename, const char *version)
11794 FILE *stream;
11795 unsigned int i, j;
11796 unsigned int minor;
11798 stream = fopen (filename, "w");
11799 if (stream == NULL)
11801 fprintf (stderr, "cannot open '%s' for writing\n", filename);
11802 exit (1);
11805 fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
11806 fprintf (stream, "/* Special casing rules of Unicode characters. */\n");
11807 fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
11808 version);
11809 fprintf (stream, "\n");
11811 fprintf (stream, "/* Copyright (C) 2000-2024 Free Software Foundation, Inc.\n");
11812 fprintf (stream, "\n");
11813 output_library_license (stream, false);
11814 fprintf (stream, "\n");
11816 fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
11817 fprintf (stream, "%%struct-type\n");
11818 fprintf (stream, "%%language=ANSI-C\n");
11819 fprintf (stream, "%%define slot-name code\n");
11820 fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
11821 fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
11822 fprintf (stream, "%%compare-lengths\n");
11823 fprintf (stream, "%%compare-strncmp\n");
11824 fprintf (stream, "%%readonly-tables\n");
11825 fprintf (stream, "%%omit-struct-type\n");
11826 fprintf (stream, "%%%%\n");
11828 minor = 0;
11829 for (i = 0; i < num_casing_rules; i++)
11831 struct special_casing_rule *rule = casing_rules[i];
11832 int context;
11834 if (i > 0 && rule->code == casing_rules[i - 1]->code)
11835 minor += 1;
11836 else
11837 minor = 0;
11839 if (!(rule->code < 0x10000))
11841 fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
11842 exit (1);
11845 fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ",
11846 (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
11848 fprintf (stream, "%d, ",
11849 i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
11851 context = rule->context;
11852 if (context < 0)
11854 fprintf (stream, "-");
11855 context = - context;
11857 else
11858 fprintf (stream, " ");
11859 switch (context)
11861 case SCC_ALWAYS:
11862 fprintf (stream, "SCC_ALWAYS ");
11863 break;
11864 case SCC_FINAL_SIGMA:
11865 fprintf (stream, "SCC_FINAL_SIGMA ");
11866 break;
11867 case SCC_AFTER_SOFT_DOTTED:
11868 fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
11869 break;
11870 case SCC_MORE_ABOVE:
11871 fprintf (stream, "SCC_MORE_ABOVE ");
11872 break;
11873 case SCC_BEFORE_DOT:
11874 fprintf (stream, "SCC_BEFORE_DOT ");
11875 break;
11876 case SCC_AFTER_I:
11877 fprintf (stream, "SCC_AFTER_I ");
11878 break;
11879 default:
11880 abort ();
11882 fprintf (stream, ", ");
11884 if (rule->language != NULL)
11886 assert (strlen (rule->language) == 2);
11887 fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]);
11889 else
11890 fprintf (stream, "{ '\\0', '\\0' }, ");
11892 fprintf (stream, "{ ");
11893 for (j = 0; j < 3; j++)
11895 if (j > 0)
11896 fprintf (stream, ", ");
11897 if (!(rule->upper_mapping[j] < 0x10000))
11899 fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code);
11900 exit (1);
11902 if (rule->upper_mapping[j] != 0)
11903 fprintf (stream, "0x%04X", rule->upper_mapping[j]);
11904 else
11905 fprintf (stream, " 0");
11907 fprintf (stream, " }, { ");
11908 for (j = 0; j < 3; j++)
11910 if (j > 0)
11911 fprintf (stream, ", ");
11912 if (!(rule->lower_mapping[j] < 0x10000))
11914 fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code);
11915 exit (1);
11917 if (rule->lower_mapping[j] != 0)
11918 fprintf (stream, "0x%04X", rule->lower_mapping[j]);
11919 else
11920 fprintf (stream, " 0");
11922 fprintf (stream, " }, { ");
11923 for (j = 0; j < 3; j++)
11925 if (j > 0)
11926 fprintf (stream, ", ");
11927 if (!(rule->title_mapping[j] < 0x10000))
11929 fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code);
11930 exit (1);
11932 if (rule->title_mapping[j] != 0)
11933 fprintf (stream, "0x%04X", rule->title_mapping[j]);
11934 else
11935 fprintf (stream, " 0");
11937 fprintf (stream, " }, { ");
11938 for (j = 0; j < 3; j++)
11940 if (j > 0)
11941 fprintf (stream, ", ");
11942 if (!(rule->casefold_mapping[j] < 0x10000))
11944 fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code);
11945 exit (1);
11947 if (rule->casefold_mapping[j] != 0)
11948 fprintf (stream, "0x%04X", rule->casefold_mapping[j]);
11949 else
11950 fprintf (stream, " 0");
11952 fprintf (stream, " }\n");
11955 if (ferror (stream) || fclose (stream))
11957 fprintf (stderr, "error writing to '%s'\n", filename);
11958 exit (1);
11962 /* ========================================================================= */
/* Quoting the Unicode standard:
     Definition: A character is defined to be "cased" if it has the Lowercase
     or Uppercase property or has a General_Category value of
     Titlecase_Letter.
   Return true if CH is "cased" in this sense.  */
static bool
is_cased (unsigned int ch)
{
  /* Lowercase property.  */
  if (is_property_lowercase (ch))
    return true;

  /* Uppercase property.  */
  if (is_property_uppercase (ch))
    return true;

  /* General_Category Titlecase_Letter.  */
  if (is_category_Lt (ch))
    return true;

  return false;
}
11976 /* Quoting the Unicode standard:
11977 Definition: A character is defined to be "case-ignorable" if it has the
11978 value MidLetter {or the value MidNumLet} for the Word_Break property or
11979 its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
11980 Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
11981 The text marked in braces was added in Unicode 5.1.0, see
11982 <https://www.unicode.org/versions/Unicode5.1.0/> section "Update of
11983 Definition of case-ignorable". */
11984 /* Since this predicate is only used for the "Before C" and "After C"
11985 conditions of FINAL_SIGMA, we exclude the "cased" characters here.
11986 This simplifies the evaluation of the regular expressions
11987 \p{cased} (\p{case-ignorable})* C
11989 C (\p{case-ignorable})* \p{cased}
11991 static bool
11992 is_case_ignorable (unsigned int ch)
11994 return (unicode_org_wbp[ch] == WBP_MIDLETTER
11995 || unicode_org_wbp[ch] == WBP_MIDNUMLET
11996 || is_category_Mn (ch)
11997 || is_category_Me (ch)
11998 || is_category_Cf (ch)
11999 || is_category_Lm (ch)
12000 || is_category_Sk (ch))
12001 && !is_cased (ch);
12004 /* ------------------------------------------------------------------------- */
12006 /* Output all case related properties. */
12007 static void
12008 output_casing_properties (const char *version)
12010 #define PROPERTY(FN,P) \
12011 debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \
12012 output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \
12013 output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version);
12014 PROPERTY(cased, cased)
12015 PROPERTY(ignorable, case_ignorable)
12016 #undef PROPERTY
12019 /* ========================================================================= */
/* Output the Unicode version.
   FILENAME is the C source file to create.  VERSION is the Unicode version
   string; it must start with "MAJOR.MINOR" in decimal (further components
   such as a trailing ".0" are ignored).
   The generated file defines _libunistring_unicode_version as
   (MAJOR << 8) | MINOR.
   If VERSION cannot be parsed or is out of range, if the file cannot be
   opened, or if writing fails, a message is printed to stderr and the
   program exits with status 1.  */
static void
output_version (const char *filename, const char *version)
{
  FILE *stream;
  int major;
  int minor;

  /* Validate the version first.  Without this check, a malformed string
     would leave major and minor uninitialized, and garbage would be written
     into the generated file.  MINOR must fit into the low 8 bits of the
     combined value.  */
  if (sscanf (version, "%d.%d", &major, &minor) != 2
      || major < 0 || minor < 0 || minor > 0xff)
    {
      fprintf (stderr, "invalid Unicode version '%s'\n", version);
      exit (1);
    }

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Supported Unicode version. */\n");
  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
           version);
  fprintf (stream, "\n");

  fprintf (stream, "/* Copyright (C) 2024 Free Software Foundation, Inc.\n");
  fprintf (stream, "\n");
  output_library_license (stream, false);
  fprintf (stream, "\n");

  fprintf (stream, "#include <config.h>\n");
  fprintf (stream, "\n");

  fprintf (stream, "/* Specification. */\n");
  fprintf (stream, "#include \"unimetadata.h\"\n");
  fprintf (stream, "\n");

  fprintf (stream, "const int _libunistring_unicode_version = (%d << 8) | %d;\n",
           major, minor);

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
12065 /* ========================================================================= */
12068 main (int argc, char * argv[])
12070 const char *unicodedata_filename;
12071 const char *proplist_filename;
12072 const char *derivedproplist_filename;
12073 const char *emojidata_filename;
12074 const char *arabicshaping_filename;
12075 const char *scripts_filename;
12076 const char *blocks_filename;
12077 const char *proplist30_filename;
12078 const char *bidimirroring_filename;
12079 const char *eastasianwidth_filename;
12080 const char *linebreak_filename;
12081 const char *wordbreakproperty_filename;
12082 const char *graphemebreakproperty_filename;
12083 const char *compositionexclusions_filename;
12084 const char *specialcasing_filename;
12085 const char *casefolding_filename;
12086 const char *version;
12088 if (argc != 18)
12090 fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt emoji-data.txt ArabicShaping.txt Scripts.txt Blocks.txt PropList-3.0.1.txt BidiMirroring.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
12091 argv[0]);
12092 exit (1);
12095 unicodedata_filename = argv[1];
12096 proplist_filename = argv[2];
12097 derivedproplist_filename = argv[3];
12098 emojidata_filename = argv[4];
12099 arabicshaping_filename = argv[5];
12100 scripts_filename = argv[6];
12101 blocks_filename = argv[7];
12102 proplist30_filename = argv[8];
12103 bidimirroring_filename = argv[9];
12104 eastasianwidth_filename = argv[10];
12105 linebreak_filename = argv[11];
12106 wordbreakproperty_filename = argv[12];
12107 graphemebreakproperty_filename = argv[13];
12108 compositionexclusions_filename = argv[14];
12109 specialcasing_filename = argv[15];
12110 casefolding_filename = argv[16];
12111 version = argv[17];
12113 fill_attributes (unicodedata_filename);
12114 clear_properties ();
12115 fill_properties (proplist_filename);
12116 fill_properties (derivedproplist_filename);
12117 fill_properties (emojidata_filename);
12118 fill_properties30 (proplist30_filename);
12119 fill_arabicshaping (arabicshaping_filename);
12120 fill_scripts (scripts_filename);
12121 fill_blocks (blocks_filename);
12122 fill_mirror (bidimirroring_filename);
12123 fill_width (eastasianwidth_filename);
12124 fill_org_lbp (linebreak_filename);
12125 fill_org_wbp (wordbreakproperty_filename);
12126 fill_org_gbp (graphemebreakproperty_filename);
12127 fill_composition_exclusions (compositionexclusions_filename);
12128 fill_casing_rules (specialcasing_filename);
12129 fill_casefolding_rules (casefolding_filename);
12130 redistribute_casefolding_rules ();
12131 sort_casing_rules ();
12133 output_categories (version);
12134 output_category ("unictype/categ_of.h", version);
12135 output_combclass ("unictype/combiningclass.h", version);
12136 output_bidi_category ("unictype/bidi_of.h", version);
12137 output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
12138 output_decimal_digit ("unictype/decdigit.h", version);
12139 output_digit_test ("../tests/unictype/test-digit.h", version);
12140 output_digit ("unictype/digit.h", version);
12141 output_numeric_test ("../tests/unictype/test-numeric.h", version);
12142 output_numeric ("unictype/numeric.h", version);
12143 output_mirror ("unictype/mirror.h", version);
12144 output_properties (version);
12145 output_indic_conjunct_break_test ("../tests/unictype/test-incb_of.h", version);
12146 output_indic_conjunct_break ("unictype/incb_of.h", version);
12147 output_joining_type_test ("../tests/unictype/test-joiningtype_of.h", version);
12148 output_joining_type ("unictype/joiningtype_of.h", version);
12149 output_joining_group_test ("../tests/unictype/test-joininggroup_of.h", version);
12150 output_joining_group ("unictype/joininggroup_of.h", version);
12152 output_scripts (version);
12153 output_scripts_byname (version);
12154 output_blocks (version);
12155 output_ident_properties (version);
12156 output_nonspacing_property ("uniwidth/width0.h", version);
12157 output_width2_property ("uniwidth/width2.h", version);
12158 output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part");
12159 output_old_ctype (version);
12161 debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
12162 debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
12163 output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);
12164 output_lbrk_rules_as_tables ("unilbrk/lbrktables.c", version);
12166 debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
12167 debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
12168 output_wbrk_tables ("uniwbrk/wbrkprop.h", version);
12170 output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h");
12171 output_gbp_table ("unigbrk/gbrkprop.h", version);
12173 output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version);
12174 debug_output_composition_tables ("uninorm/composition.txt");
12175 output_composition_tables ("uninorm/composition-table.gperf", "uninorm/composition-table-bounds.h", version);
12177 output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version);
12178 output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version);
12179 output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version);
12180 output_simple_mapping ("unicase/toupper.h", to_upper, version);
12181 output_simple_mapping ("unicase/tolower.h", to_lower, version);
12182 output_simple_mapping ("unicase/totitle.h", to_title, version);
12183 output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
12184 output_casing_rules ("unicase/special-casing-table.gperf", version);
12185 output_casing_properties (version);
12187 output_version ("unimetadata/u-version.c", version);
12189 return 0;
12193 * Local Variables:
12194 * coding: utf-8
12195 * compile-command: "\
12196 * gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \\
12197 * ./gen-uni-tables \\
12198 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/UnicodeData.txt \\
12199 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/PropList.txt \\
12200 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/DerivedCoreProperties.txt \\
12201 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/emoji/emoji-data.txt \\
12202 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/ArabicShaping.txt \\
12203 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/Scripts.txt \\
12204 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/Blocks.txt \\
12205 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \\
12206 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/BidiMirroring.txt \\
12207 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/EastAsianWidth.txt \\
12208 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/LineBreak.txt \\
12209 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/auxiliary/WordBreakProperty.txt \\
12210 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \\
12211 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/CompositionExclusions.txt \\
12212 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/SpecialCasing.txt \\
12213 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/CaseFolding.txt \\
12214 * 16.0.0 \\
12215 * && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \\
12216 * && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt \\
12217 * && clisp -C uniname/gen-uninames.lisp \\
12218 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/UnicodeData.txt \\
12219 * /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/NameAliases.txt \\
12220 * uniname/uninames.h \\
12221 * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
12222 * echo; \\
12223 * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/NameAliases.txt; } \\
12224 * > ../tests/uniname/NameAliases.txt \\
12225 * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
12226 * echo; \\
12227 * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/UnicodeData.txt; } \\
12228 * > ../tests/uniname/UnicodeData.txt \\
12229 * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
12230 * echo; \\
12231 * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/NormalizationTest.txt; } \\
12232 * > ../tests/uninorm/NormalizationTest.txt \\
12233 * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
12234 * echo; \\
12235 * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/auxiliary/GraphemeBreakTest.txt; } \\
12236 * > ../tests/unigbrk/GraphemeBreakTest.txt \\
12237 * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
12238 * echo; \\
12239 * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/auxiliary/LineBreakTest.txt; } \\
12240 * > ../tests/unilbrk/LineBreakTest.txt \\
12241 * && { sed -e 's/^/# /' -e 's/ $//' < /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/license.txt; \\
12242 * echo; \\
12243 * cat /media/nas/bruno/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/16.0.0/ucd/auxiliary/WordBreakTest.txt; } \\
12244 * > ../tests/uniwbrk/WordBreakTest.txt"
12245 * End: