announce-gen: add comments
[gnulib.git] / lib / unilbrk / u32-possible-linebreaks.c
blob606c4b608443fe3ad879b3c7e89e45a9a73698f3
/* Line breaking of UTF-32 strings.
   Copyright (C) 2001-2003, 2006-2025 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2001.

   This file is free software.
   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   You can redistribute it and/or modify it under either
     - the terms of the GNU Lesser General Public License as published
       by the Free Software Foundation, either version 3, or (at your
       option) any later version, or
     - the terms of the GNU General Public License as published by the
       Free Software Foundation; either version 2, or (at your option)
       any later version, or
     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".

   This file is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License and the GNU General Public License
   for more details.

   You should have received a copy of the GNU Lesser General Public
   License and of the GNU General Public License along with this
   program.  If not, see <https://www.gnu.org/licenses/>.  */
26 #include <config.h>
28 /* Specification. */
29 #include "unilbrk.h"
30 #include "unilbrk/internal.h"
32 #include <stdlib.h>
34 #include "unilbrk/lbrktables.h"
35 #include "uniwidth/cjk.h"
/* This file implements
   Unicode Standard Annex #14 <https://www.unicode.org/reports/tr14/>.  */
40 void
41 u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
42 int cr, char *p)
44 if (n > 0)
46 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL1);
47 const uint32_t *s_end = s + n;
49 /* We need 2 characters of lookahead:
50 - 1 character of lookahead for (LB15c,LB19a,LB28a),
51 - 2 characters of lookahead for (LB25). */
52 const uint32_t *lookahead1_end;
53 ucs4_t lookahead1_uc;
54 int lookahead1_prop_ea;
55 const uint32_t *lookahead2_end;
56 ucs4_t lookahead2_uc;
57 int lookahead2_prop_ea;
58 /* Get the first lookahead character. */
59 lookahead1_end = s;
60 lookahead1_uc = *lookahead1_end++;
61 lookahead1_prop_ea = unilbrkprop_lookup (lookahead1_uc);
62 /* Get the second lookahead character. */
63 lookahead2_end = lookahead1_end;
64 if (lookahead2_end < s_end)
66 lookahead2_uc = *lookahead2_end++;
67 lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc);
69 else
71 lookahead2_uc = 0xFFFD;
72 lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
75 int preceding_prop = LBP_BK; /* line break property of preceding character */
76 int prev_prop = LBP_BK; /* line break property of previous character
77 (= last character, ignoring intervening characters of class CM or ZWJ) */
78 int prev_ea = 0; /* EastAsian property of previous character
79 (= last character, ignoring intervening characters of class CM or ZWJ) */
80 int prev2_ea = 0; /* EastAsian property of character before the previous character */
81 bool prev_initial_hyphen = false; /* the previous character was a
82 word-initial hyphen or U+2010 */
83 bool prev_nus = false; /* before the previous character, there was a character
84 with line break property LBP_NU and since then
85 only characters with line break property LBP_SY
86 or LBP_IS */
87 int last_prop = LBP_BK; /* line break property of last non-space character
88 (= last character, ignoring intervening characters of class SP or CM or ZWJ) */
89 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
91 /* Number of consecutive regional indicator (RI) characters seen
92 immediately before the current point. */
93 size_t ri_count = 0;
97 /* Read the next character. */
98 s = lookahead1_end;
99 ucs4_t uc = lookahead1_uc;
100 int prop_ea = lookahead1_prop_ea; /* = unilbrkprop_lookup (uc); */
101 int prop = PROP (prop_ea); /* line break property of uc */
102 int ea = EA (prop_ea); /* EastAsian property of uc */
103 /* Refill the pipeline of 2 lookahead characters. */
104 lookahead1_end = lookahead2_end;
105 lookahead1_uc = lookahead2_uc;
106 lookahead1_prop_ea = lookahead2_prop_ea;
107 if (lookahead2_end < s_end)
109 lookahead2_uc = *lookahead2_end++;
110 lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc);
112 else
114 lookahead2_uc = 0xFFFD;
115 lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
118 bool nus = /* ending at the previous character, there was a character
119 with line break property LBP_NU and since then only
120 characters with line break property LBP_SY or LBP_IS */
121 (prev_prop == LBP_NU
122 || (prev_nus && (prev_prop == LBP_SY || prev_prop == LBP_IS)));
124 if (prop == LBP_BK || prop == LBP_LF || prop == LBP_CR)
126 /* (LB4,LB5,LB6) Mandatory break. */
127 *p = UC_BREAK_MANDATORY;
128 /* cr is either LBP_CR or -1. In the first case, recognize
129 a CR-LF sequence. */
130 if (prev_prop == cr && prop == LBP_LF)
131 p[-1] = UC_BREAK_CR_BEFORE_LF;
132 last_prop = LBP_BK;
133 seen_space = NULL;
135 else
137 /* Resolve property values whose behaviour is not fixed. */
138 switch (prop)
140 case LBP_AI:
141 /* Resolve ambiguous. */
142 prop = LBP_AI_REPLACEMENT;
143 break;
144 case LBP_CB:
145 /* This is arbitrary. */
146 prop = LBP_ID1;
147 break;
148 case LBP_SA:
149 /* We don't handle complex scripts yet.
150 Treat LBP_SA like LBP_XX. */
151 case LBP_XX:
152 /* This is arbitrary. */
153 prop = LBP_AL1;
154 break;
157 /* Deal with spaces and combining characters. */
158 if (prop == LBP_SP)
160 /* (LB7) Don't break just before a space. */
161 *p = UC_BREAK_PROHIBITED;
162 seen_space = p;
164 else if (prop == LBP_ZW)
166 /* (LB7) Don't break just before a zero-width space. */
167 *p = UC_BREAK_PROHIBITED;
168 last_prop = LBP_ZW;
169 seen_space = NULL;
171 else if (prop == LBP_CM || prop == LBP_ZWJ)
173 /* (LB9) Don't break just before a combining character or
174 zero-width joiner, except immediately after a mandatory
175 break character, space, or zero-width space. */
176 if (last_prop == LBP_BK)
178 /* (LB4,LB5,LB6) Don't break at the beginning of a line. */
179 *p = UC_BREAK_PROHIBITED;
180 /* (LB10) Treat CM or ZWJ as AL. */
181 last_prop = LBP_AL1;
182 seen_space = NULL;
184 else if (last_prop == LBP_ZW
185 || (seen_space != NULL
186 /* (LB14) has higher priority than (LB18). */
187 && !(last_prop == LBP_OP1 || last_prop == LBP_OP2)
188 /* (LB15a) has higher priority than (LB18). */
189 && !(last_prop == LBP_QU2)))
191 /* (LB8) Break after zero-width space. */
192 /* (LB18) Break after spaces.
193 We do *not* implement the "legacy support for space
194 character as base for combining marks" because now the
195 NBSP CM sequence is recommended instead of SP CM. */
196 *p = UC_BREAK_POSSIBLE;
197 /* (LB10) Treat CM or ZWJ as AL. */
198 last_prop = LBP_AL1;
199 seen_space = NULL;
201 else
203 /* Treat X CM as if it were X. */
204 *p = UC_BREAK_PROHIBITED;
207 else
209 /* prop must be usable as an index for table 7.3 of UTR #14. */
210 if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
211 abort ();
213 if (last_prop == LBP_BK)
215 /* (LB4,LB5,LB6) Don't break at the beginning of a line. */
216 *p = UC_BREAK_PROHIBITED;
218 else if (last_prop == LBP_ZW)
220 /* (LB8) Break after zero-width space. */
221 *p = UC_BREAK_POSSIBLE;
223 else if (preceding_prop == LBP_ZWJ)
225 /* (LB8a) Don't break right after a zero-width joiner. */
226 *p = UC_BREAK_PROHIBITED;
228 else if (prop == LBP_IS && prev_prop == LBP_SP
229 && PROP (lookahead1_prop_ea) == LBP_NU)
231 /* (LB15c) Break before a decimal mark that follows a space. */
232 *p = UC_BREAK_POSSIBLE;
234 else if (((prop == LBP_QU1 || prop == LBP_QU2 || prop == LBP_QU3)
235 && (! prev_ea || ! EA (lookahead1_prop_ea))
236 /* (LB18) has higher priority than (LB19a). */
237 && prev_prop != LBP_SP)
238 || ((prev_prop == LBP_QU1 || prev_prop == LBP_QU2 || prev_prop == LBP_QU3)
239 && (! prev2_ea || ! ea)))
241 /* (LB19a) Don't break on either side of ambiguous
242 quotation marks, except next to an EastAsian character. */
243 *p = UC_BREAK_PROHIBITED;
245 else if (prev_initial_hyphen
246 && (prop == LBP_AL1 || prop == LBP_AL2))
248 /* (LB20a) Don't break after a word-initial hyphen. */
249 *p = UC_BREAK_PROHIBITED;
251 else if (prev_prop == LBP_HL_BA && prop != LBP_HL)
253 /* (LB21a) Don't break after Hebrew + Hyphen/Break-After,
254 before non-Hebrew. */
255 *p = UC_BREAK_PROHIBITED;
257 else if ((prev_nus
258 && (prev_prop == LBP_CL
259 || prev_prop == LBP_CP1 || prev_prop == LBP_CP2)
260 && (prop == LBP_PO || prop == LBP_PR))
261 || (nus && (prop == LBP_PO || prop == LBP_PR
262 || prop == LBP_NU)))
264 /* (LB25) Don't break numbers. */
265 *p = UC_BREAK_PROHIBITED;
267 else if ((prev_prop == LBP_PO || prev_prop == LBP_PR)
268 && (prop == LBP_OP1 || prop == LBP_OP2)
269 && (PROP (lookahead1_prop_ea) == LBP_NU
270 || (PROP (lookahead1_prop_ea) == LBP_IS
271 && PROP (lookahead2_prop_ea) == LBP_NU)))
273 /* (LB25) Don't break numbers. */
274 *p = UC_BREAK_PROHIBITED;
276 else if (prev_prop == LBP_AKLS_VI
277 && (prop == LBP_AK || prop == LBP_AL2))
279 /* (LB28a) Don't break inside orthographic syllables of
280 Brahmic scripts, line 3. */
281 *p = UC_BREAK_PROHIBITED;
283 else if (PROP (lookahead1_prop_ea) == LBP_VF
284 && (prop == LBP_AK || prop == LBP_AL2 || prop == LBP_AS)
285 && (prev_prop == LBP_AK || prev_prop == LBP_AL2 || prev_prop == LBP_AS))
287 /* (LB28a) Don't break inside orthographic syllables of
288 Brahmic scripts, line 4. */
289 *p = UC_BREAK_PROHIBITED;
291 else if (last_prop == LBP_IS && uc == 0x003C)
293 /* Partially disable (LB29) Do not break between numeric
294 punctuation and alphabetics ("e.g."). We find it
295 desirable to break before the HTML tag "</P>" in
296 strings like "<P>Some sentence.</P>". */
297 *p = UC_BREAK_POSSIBLE;
299 else if (last_prop == LBP_RI && prop == LBP_RI)
301 /* (LB30a) Break between two regional indicator symbols
302 if and only if there are an even number of regional
303 indicators preceding the position of the break. */
304 *p = (seen_space != NULL || (ri_count % 2) == 0
305 ? UC_BREAK_POSSIBLE
306 : UC_BREAK_PROHIBITED);
308 else
310 int this_prop = prop;
311 if (prop == LBP_QU3)
313 /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the
314 next character's line break property is not one of
315 BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW. */
316 switch (PROP (lookahead1_prop_ea))
318 case LBP_BK:
319 case LBP_CR:
320 case LBP_LF:
321 case LBP_SP:
322 case LBP_GL:
323 case LBP_WJ:
324 case LBP_CL:
325 case LBP_QU1: case LBP_QU2: case LBP_QU3:
326 case LBP_CP1: case LBP_CP2:
327 case LBP_EX:
328 case LBP_IS:
329 case LBP_SY:
330 case LBP_ZW:
331 break;
332 default:
333 this_prop = LBP_QU1;
334 break;
338 switch (unilbrk_table [last_prop] [this_prop])
340 case D:
341 *p = UC_BREAK_POSSIBLE;
342 break;
343 case I:
344 *p = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
345 break;
346 case P:
347 *p = UC_BREAK_PROHIBITED;
348 break;
349 default:
350 abort ();
354 if (prop == LBP_QU2)
356 /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the
357 previous character's line break property was not one of
358 BK, CR, LF, OP, QU, GL, SP, ZW. */
359 switch (prev_prop)
361 case LBP_BK:
362 case LBP_CR:
363 case LBP_LF:
364 case LBP_OP1: case LBP_OP2:
365 case LBP_QU1: case LBP_QU2: case LBP_QU3:
366 case LBP_GL:
367 case LBP_SP:
368 case LBP_ZW:
369 break;
370 default:
371 prop = LBP_QU1;
372 break;
376 last_prop = prop;
377 seen_space = NULL;
381 /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line
382 break class except BK, CR, LF, NL, SP, or ZW. */
383 if (!((prop == LBP_CM || prop == LBP_ZWJ)
384 && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR
385 || prev_prop == LBP_SP || prev_prop == LBP_ZW)))
387 prev_initial_hyphen =
388 (prop == LBP_HY || uc == 0x2010)
389 && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF
390 || prev_prop == LBP_SP || prev_prop == LBP_ZW
391 || prev_prop == LBP_CB || prev_prop == LBP_GL);
392 prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
393 || prev_prop == LBP_AL2
394 || prev_prop == LBP_AS)
395 ? LBP_AKLS_VI :
396 prev_prop == LBP_HL && (prop == LBP_HY
397 || (prop == LBP_BA && !ea))
398 ? LBP_HL_BA :
399 prop);
400 prev2_ea = prev_ea;
401 prev_ea = ea;
402 prev_nus = nus;
405 preceding_prop = prop;
407 if (prop == LBP_RI)
408 ri_count++;
409 else
410 ri_count = 0;
412 p++;
414 while (s < s_end);
#if defined IN_LIBUNISTRING
/* For backward compatibility with older versions of libunistring.  */

# undef u32_possible_linebreaks

/* Determine the line break opportunities in the UTF-32 string S of
   N units and store one result byte per unit in P.
   This variant passes -1 as the CR argument of
   u32_possible_linebreaks_loop, so a CR-LF sequence is never marked
   with UC_BREAK_CR_BEFORE_LF.  */
void
u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding,
                         char *p)
{
  u32_possible_linebreaks_loop (s, n, encoding, -1, p);
}

#endif
432 void
433 u32_possible_linebreaks_v2 (const uint32_t *s, size_t n, const char *encoding,
434 char *p)
436 u32_possible_linebreaks_loop (s, n, encoding, LBP_CR, p);