xstrtol-error: pacify -Wswitch-enum
[gnulib.git] / lib / unilbrk / u16-possible-linebreaks.c
blob3f0145812247c3553b65906ef2a120e90d60b541
1 /* Line breaking of UTF-16 strings.
2 Copyright (C) 2001-2003, 2006-2025 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2001.
5 This file is free software.
6 It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
7 You can redistribute it and/or modify it under either
8 - the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation, either version 3, or (at your
10 option) any later version, or
11 - the terms of the GNU General Public License as published by the
12 Free Software Foundation; either version 2, or (at your option)
13 any later version, or
14 - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
16 This file is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 Lesser General Public License and the GNU General Public License
20 for more details.
22 You should have received a copy of the GNU Lesser General Public
23 License and of the GNU General Public License along with this
24 program. If not, see <https://www.gnu.org/licenses/>. */
26 #include <config.h>
28 /* Specification. */
29 #include "unilbrk.h"
30 #include "unilbrk/internal.h"
32 #include <stdlib.h>
33 #include <string.h>
35 #include "unilbrk/lbrktables.h"
36 #include "uniwidth/cjk.h"
37 #include "unistr.h"
39 /* This file implements
40 Unicode Standard Annex #14 <https://www.unicode.org/reports/tr14/>. */
/* Compute line break opportunities for the UTF-16 string S of N units,
   following the UAX #14 rules cited in the comments below, and store the
   result in P.
   P is first filled with N entries of UC_BREAK_PROHIBITED.  Then, for each
   character, the entry at the index of its first unit is set to
   UC_BREAK_MANDATORY, UC_BREAK_POSSIBLE or UC_BREAK_PROHIBITED; entries of
   the remaining units of a character are not written again.
   ENCODING is only passed to is_cjk_encoding, to choose the replacement for
   the ambiguous class LBP_AI (LBP_ID1 for CJK encodings, LBP_AL1 otherwise).
   CR is either LBP_CR or -1.  When the previous character's property equals
   CR and the current character is LBP_LF, the entry just before the current
   one is overwritten with UC_BREAK_CR_BEFORE_LF.
   All work is guarded by the test N > 0.
   NOTE(review): in this listing the brace-only lines and the 'do' keyword
   that pairs with the final 'while (s < s_end);' appear to be missing --
   confirm against the upstream file.  */
42 void
43 u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding,
44 int cr, char *p)
46 if (n > 0)
48 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL1);
50 /* Don't break inside multibyte characters. */
51 memset (p, UC_BREAK_PROHIBITED, n);
53 const uint16_t *s_end = s + n;
55 /* We need 2 characters of lookahead:
56 - 1 character of lookahead for (LB15c,LB19a,LB28a),
57 - 2 characters of lookahead for (LB25). */
58 const uint16_t *lookahead1_end;
59 ucs4_t lookahead1_uc;
60 int lookahead1_prop_ea;
61 const uint16_t *lookahead2_end;
62 ucs4_t lookahead2_uc;
63 int lookahead2_prop_ea;
64 /* Get the first lookahead character. */
65 lookahead1_end = s;
66 lookahead1_end += u16_mbtouc_unsafe (&lookahead1_uc, lookahead1_end, s_end - lookahead1_end);
67 lookahead1_prop_ea = unilbrkprop_lookup (lookahead1_uc);
68 /* Get the second lookahead character. */
69 lookahead2_end = lookahead1_end;
70 if (lookahead2_end < s_end)
72 lookahead2_end += u16_mbtouc_unsafe (&lookahead2_uc, lookahead2_end, s_end - lookahead2_end);
73 lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc);
75 else
/* No second character available: use a placeholder whose line break
   property is LBP_BK. */
77 lookahead2_uc = 0xFFFD;
78 lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
81 int preceding_prop = LBP_BK; /* line break property of preceding character */
82 int prev_prop = LBP_BK; /* line break property of previous character
83 (= last character, ignoring intervening characters of class CM or ZWJ) */
84 int prev_ea = 0; /* EastAsian property of previous character
85 (= last character, ignoring intervening characters of class CM or ZWJ) */
86 int prev2_ea = 0; /* EastAsian property of character before the previous character */
87 bool prev_initial_hyphen = false; /* the previous character was a
88 word-initial hyphen or U+2010 */
89 bool prev_nus = false; /* before the previous character, there was a character
90 with line break property LBP_NU and since then
91 only characters with line break property LBP_SY
92 or LBP_IS */
93 int last_prop = LBP_BK; /* line break property of last non-space character
94 (= last character, ignoring intervening characters of class SP or CM or ZWJ) */
95 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
97 /* Number of consecutive regional indicator (RI) characters seen
98 immediately before the current point. */
99 size_t ri_count = 0;
103 /* Read the next character. */
/* count is the number of UTF-16 units occupied by the current character;
   it is used at the end of the iteration to advance p. */
104 size_t count = lookahead1_end - s;
105 s = lookahead1_end;
106 ucs4_t uc = lookahead1_uc;
107 int prop_ea = lookahead1_prop_ea; /* = unilbrkprop_lookup (uc); */
108 int prop = PROP (prop_ea); /* line break property of uc */
109 int ea = EA (prop_ea); /* EastAsian property of uc */
110 /* Refill the pipeline of 2 lookahead characters. */
111 lookahead1_end = lookahead2_end;
112 lookahead1_uc = lookahead2_uc;
113 lookahead1_prop_ea = lookahead2_prop_ea;
114 if (lookahead2_end < s_end)
116 lookahead2_end += u16_mbtouc_unsafe (&lookahead2_uc, lookahead2_end, s_end - lookahead2_end);
117 lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc);
119 else
/* Past the end of the string: same LBP_BK placeholder as in the initial
   fill above. */
121 lookahead2_uc = 0xFFFD;
122 lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
125 bool nus = /* ending at the previous character, there was a character
126 with line break property LBP_NU and since then only
127 characters with line break property LBP_SY or LBP_IS */
128 (prev_prop == LBP_NU
129 || (prev_nus && (prev_prop == LBP_SY || prev_prop == LBP_IS)));
131 if (prop == LBP_BK || prop == LBP_LF || prop == LBP_CR)
133 /* (LB4,LB5,LB6) Mandatory break. */
134 *p = UC_BREAK_MANDATORY;
135 /* cr is either LBP_CR or -1. In the first case, recognize
136 a CR-LF sequence. */
137 if (prev_prop == cr && prop == LBP_LF)
138 p[-1] = UC_BREAK_CR_BEFORE_LF;
139 last_prop = LBP_BK;
140 seen_space = NULL;
142 else
144 /* Resolve property values whose behaviour is not fixed. */
145 switch (prop)
147 case LBP_AI:
148 /* Resolve ambiguous. */
149 prop = LBP_AI_REPLACEMENT;
150 break;
151 case LBP_CB:
152 /* This is arbitrary. */
153 prop = LBP_ID1;
154 break;
155 case LBP_SA:
156 /* We don't handle complex scripts yet.
157 Treat LBP_SA like LBP_XX. */
158 case LBP_XX:
159 /* This is arbitrary. */
160 prop = LBP_AL1;
161 break;
164 /* Deal with spaces and combining characters. */
165 if (prop == LBP_SP)
167 /* (LB7) Don't break just before a space. */
168 *p = UC_BREAK_PROHIBITED;
/* Remember the position of this space; last_prop is left unchanged. */
169 seen_space = p;
171 else if (prop == LBP_ZW)
173 /* (LB7) Don't break just before a zero-width space. */
174 *p = UC_BREAK_PROHIBITED;
175 last_prop = LBP_ZW;
176 seen_space = NULL;
178 else if (prop == LBP_CM || prop == LBP_ZWJ)
180 /* (LB9) Don't break just before a combining character or
181 zero-width joiner, except immediately after a mandatory
182 break character, space, or zero-width space. */
183 if (last_prop == LBP_BK)
185 /* (LB4,LB5,LB6) Don't break at the beginning of a line. */
186 *p = UC_BREAK_PROHIBITED;
187 /* (LB10) Treat CM or ZWJ as AL. */
188 last_prop = LBP_AL1;
189 seen_space = NULL;
191 else if (last_prop == LBP_ZW
192 || (seen_space != NULL
193 /* (LB14) has higher priority than (LB18). */
194 && !(last_prop == LBP_OP1 || last_prop == LBP_OP2)
195 /* (LB15a) has higher priority than (LB18). */
196 && !(last_prop == LBP_QU2)))
198 /* (LB8) Break after zero-width space. */
199 /* (LB18) Break after spaces.
200 We do *not* implement the "legacy support for space
201 character as base for combining marks" because now the
202 NBSP CM sequence is recommended instead of SP CM. */
203 *p = UC_BREAK_POSSIBLE;
204 /* (LB10) Treat CM or ZWJ as AL. */
205 last_prop = LBP_AL1;
206 seen_space = NULL;
208 else
210 /* Treat X CM as if it were X. */
211 *p = UC_BREAK_PROHIBITED;
214 else
216 /* prop must be usable as an index for table 7.3 of UTR #14. */
217 if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
218 abort ();
/* The tests below are tried in order; the first one that matches decides
   the value of *p, and the pair table is consulted only when none of the
   special cases applies. */
220 if (last_prop == LBP_BK)
222 /* (LB4,LB5,LB6) Don't break at the beginning of a line. */
223 *p = UC_BREAK_PROHIBITED;
225 else if (last_prop == LBP_ZW)
227 /* (LB8) Break after zero-width space. */
228 *p = UC_BREAK_POSSIBLE;
230 else if (preceding_prop == LBP_ZWJ)
232 /* (LB8a) Don't break right after a zero-width joiner. */
233 *p = UC_BREAK_PROHIBITED;
235 else if (prop == LBP_IS && prev_prop == LBP_SP
236 && PROP (lookahead1_prop_ea) == LBP_NU)
238 /* (LB15c) Break before a decimal mark that follows a space. */
239 *p = UC_BREAK_POSSIBLE;
241 else if (((prop == LBP_QU1 || prop == LBP_QU2 || prop == LBP_QU3)
242 && (! prev_ea || ! EA (lookahead1_prop_ea))
243 /* (LB18) has higher priority than (LB19a). */
244 && prev_prop != LBP_SP)
245 || ((prev_prop == LBP_QU1 || prev_prop == LBP_QU2 || prev_prop == LBP_QU3)
246 && (! prev2_ea || ! ea)))
248 /* (LB19a) Don't break on either side of ambiguous
249 quotation marks, except next to an EastAsian character. */
250 *p = UC_BREAK_PROHIBITED;
252 else if (prev_initial_hyphen
253 && (prop == LBP_AL1 || prop == LBP_AL2))
255 /* (LB20a) Don't break after a word-initial hyphen. */
256 *p = UC_BREAK_PROHIBITED;
258 else if (prev_prop == LBP_HL_BA && prop != LBP_HL)
260 /* (LB21a) Don't break after Hebrew + Hyphen/Break-After,
261 before non-Hebrew. */
262 *p = UC_BREAK_PROHIBITED;
264 else if ((prev_nus
265 && (prev_prop == LBP_CL
266 || prev_prop == LBP_CP1 || prev_prop == LBP_CP2)
267 && (prop == LBP_PO || prop == LBP_PR))
268 || (nus && (prop == LBP_PO || prop == LBP_PR
269 || prop == LBP_NU)))
271 /* (LB25) Don't break numbers. */
272 *p = UC_BREAK_PROHIBITED;
274 else if ((prev_prop == LBP_PO || prev_prop == LBP_PR)
275 && (prop == LBP_OP1 || prop == LBP_OP2)
276 && (PROP (lookahead1_prop_ea) == LBP_NU
277 || (PROP (lookahead1_prop_ea) == LBP_IS
278 && PROP (lookahead2_prop_ea) == LBP_NU)))
280 /* (LB25) Don't break numbers. */
281 *p = UC_BREAK_PROHIBITED;
283 else if (prev_prop == LBP_AKLS_VI
284 && (prop == LBP_AK || prop == LBP_AL2))
286 /* (LB28a) Don't break inside orthographic syllables of
287 Brahmic scripts, line 3. */
288 *p = UC_BREAK_PROHIBITED;
290 else if (PROP (lookahead1_prop_ea) == LBP_VF
291 && (prop == LBP_AK || prop == LBP_AL2 || prop == LBP_AS)
292 && (prev_prop == LBP_AK || prev_prop == LBP_AL2 || prev_prop == LBP_AS))
294 /* (LB28a) Don't break inside orthographic syllables of
295 Brahmic scripts, line 4. */
296 *p = UC_BREAK_PROHIBITED;
298 else if (last_prop == LBP_IS && uc == 0x003C)
300 /* Partially disable (LB29) Do not break between numeric
301 punctuation and alphabetics ("e.g."). We find it
302 desirable to break before the HTML tag "</P>" in
303 strings like "<P>Some sentence.</P>". */
304 *p = UC_BREAK_POSSIBLE;
306 else if (last_prop == LBP_RI && prop == LBP_RI)
308 /* (LB30a) Break between two regional indicator symbols
309 if and only if there are an even number of regional
310 indicators preceding the position of the break. */
311 *p = (seen_space != NULL || (ri_count % 2) == 0
312 ? UC_BREAK_POSSIBLE
313 : UC_BREAK_PROHIBITED);
315 else
/* this_prop is the column index used for the table lookup only; prop
   itself is not changed by the LB15b adjustment. */
317 int this_prop = prop;
318 if (prop == LBP_QU3)
320 /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the
321 next character's line break property is not one of
322 BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW. */
323 switch (PROP (lookahead1_prop_ea))
325 case LBP_BK:
326 case LBP_CR:
327 case LBP_LF:
328 case LBP_SP:
329 case LBP_GL:
330 case LBP_WJ:
331 case LBP_CL:
332 case LBP_QU1: case LBP_QU2: case LBP_QU3:
333 case LBP_CP1: case LBP_CP2:
334 case LBP_EX:
335 case LBP_IS:
336 case LBP_SY:
337 case LBP_ZW:
338 break;
339 default:
340 this_prop = LBP_QU1;
341 break;
/* Table entry D yields a possible break, P a prohibited break, and I a
   possible break only if a space was seen since the last non-space
   character. */
345 switch (unilbrk_table [last_prop] [this_prop])
347 case D:
348 *p = UC_BREAK_POSSIBLE;
349 break;
350 case I:
351 *p = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
352 break;
353 case P:
354 *p = UC_BREAK_PROHIBITED;
355 break;
356 default:
357 abort ();
361 if (prop == LBP_QU2)
363 /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the
364 previous character's line break property was not one of
365 BK, CR, LF, OP, QU, GL, SP, ZW. */
366 switch (prev_prop)
368 case LBP_BK:
369 case LBP_CR:
370 case LBP_LF:
371 case LBP_OP1: case LBP_OP2:
372 case LBP_QU1: case LBP_QU2: case LBP_QU3:
373 case LBP_GL:
374 case LBP_SP:
375 case LBP_ZW:
376 break;
377 default:
378 prop = LBP_QU1;
379 break;
383 last_prop = prop;
384 seen_space = NULL;
388 /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line
389 break class except BK, CR, LF, NL, SP, or ZW. */
390 if (!((prop == LBP_CM || prop == LBP_ZWJ)
391 && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR
392 || prev_prop == LBP_SP || prev_prop == LBP_ZW)))
394 prev_initial_hyphen =
395 (prop == LBP_HY || uc == 0x2010)
396 && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF
397 || prev_prop == LBP_SP || prev_prop == LBP_ZW
398 || prev_prop == LBP_CB || prev_prop == LBP_GL);
/* prev_prop may become one of the combined states LBP_AKLS_VI or LBP_HL_BA,
   which are tested by the (LB28a) and (LB21a) checks above. */
399 prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
400 || prev_prop == LBP_AL2
401 || prev_prop == LBP_AS)
402 ? LBP_AKLS_VI :
403 prev_prop == LBP_HL && (prop == LBP_HY
404 || (prop == LBP_BA && !ea))
405 ? LBP_HL_BA :
406 prop);
407 prev2_ea = prev_ea;
408 prev_ea = ea;
409 prev_nus = nus;
/* preceding_prop feeds the (LB8a) test, which needs to see LBP_ZWJ;
   presumably it is therefore updated for every character, unlike
   prev_prop -- the enclosing braces are not visible in this listing. */
412 preceding_prop = prop;
414 if (prop == LBP_RI)
415 ri_count++;
416 else
417 ri_count = 0;
/* Move to the entry of the next character's first unit. */
419 p += count;
421 while (s < s_end);
425 #if defined IN_LIBUNISTRING
426 /* For backward compatibility with older versions of libunistring. */
428 # undef u16_possible_linebreaks
/* Entry point under the original name.  Forwards S, N, ENCODING and P
   unchanged to u16_possible_linebreaks_loop and passes -1 as CR.  Per the
   comment in that function, CR-LF sequences are recognized only when CR is
   LBP_CR, so this variant is not expected to store UC_BREAK_CR_BEFORE_LF
   (assuming no line break property has the value -1).  */
430 void
431 u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding,
432 char *p)
434 u16_possible_linebreaks_loop (s, n, encoding, -1, p);
437 #endif
/* Entry point with CR-LF recognition.  Forwards S, N, ENCODING and P
   unchanged to u16_possible_linebreaks_loop and passes LBP_CR as CR, so
   that an LF following a CR causes the entry just before the LF's entry
   to be set to UC_BREAK_CR_BEFORE_LF.  */
439 void
440 u16_possible_linebreaks_v2 (const uint16_t *s, size_t n, const char *encoding,
441 char *p)
443 u16_possible_linebreaks_loop (s, n, encoding, LBP_CR, p);