lib/unilbrk/u16-possible-linebreaks.c

   1 /* Line breaking of UTF-16 strings.
   2    Copyright (C) 2001-2003, 2006-2025 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2001.
   4
   5    This file is free software.
   6    It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   7    You can redistribute it and/or modify it under either
   8      - the terms of the GNU Lesser General Public License as published
   9        by the Free Software Foundation, either version 3, or (at your
  10        option) any later version, or
  11      - the terms of the GNU General Public License as published by the
  12        Free Software Foundation; either version 2, or (at your option)
  13        any later version, or
  14      - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
  15
  16    This file is distributed in the hope that it will be useful,
  17    but WITHOUT ANY WARRANTY; without even the implied warranty of
  18    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19    Lesser General Public License and the GNU General Public License
  20    for more details.
  21
  22    You should have received a copy of the GNU Lesser General Public
  23    License and of the GNU General Public License along with this
  24    program.  If not, see <https://www.gnu.org/licenses/>.  */
  25
  26 #include <config.h>
  27
  28 /* Specification.  */
  29 #include "unilbrk.h"
  30 #include "unilbrk/internal.h"
  31
  32 #include <stdlib.h>
  33 #include <string.h>
  34
  35 #include "unilbrk/lbrktables.h"
  36 #include "uniwidth/cjk.h"
  37 #include "unistr.h"
  38
  39 /* This file implements
  40    Unicode Standard Annex #14 <https://www.unicode.org/reports/tr14/>.  */
  41
  42 void
  43 u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding,
  44                               int cr, char *p)
  45 {
       /* Fills P[0..N-1] with one UC_BREAK_* value per UTF-16 unit of
          S[0..N-1].  The value computed for a character is stored at the
          index of that character's first unit; every other unit keeps the
          UC_BREAK_PROHIBITED written by the initial memset.
          ENCODING is only passed to is_cjk_encoding, to choose how characters
          of class LBP_AI are resolved.
          CR is compared with the class of the previous character when an LF
          is seen; the wrappers in this file pass either LBP_CR or -1.
          Nothing is read or written when N is 0.  */
  46   if (n > 0)
  47     {
  48       int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL1);
  49
  50       /* Don't break inside multibyte characters.  */
  51       memset (p, UC_BREAK_PROHIBITED, n);
  52
  53       const uint16_t *s_end = s + n;
  54
  55       /* We need 2 characters of lookahead:
  56            - 1 character of lookahead for (LB15c,LB19a,LB28a),
  57            - 2 characters of lookahead for (LB25).  */
  58       const uint16_t *lookahead1_end;
  59       ucs4_t lookahead1_uc;
  60       int lookahead1_prop_ea;
  61       const uint16_t *lookahead2_end;
  62       ucs4_t lookahead2_uc;
  63       int lookahead2_prop_ea;
  64       /* Get the first lookahead character.  */
  65       lookahead1_end = s;
  66       lookahead1_end += u16_mbtouc_unsafe (&lookahead1_uc, lookahead1_end, s_end - lookahead1_end);
  67       lookahead1_prop_ea = unilbrkprop_lookup (lookahead1_uc);
  68       /* Get the second lookahead character.  */
  69       lookahead2_end = lookahead1_end;
  70       if (lookahead2_end < s_end)
  71         {
  72           lookahead2_end += u16_mbtouc_unsafe (&lookahead2_uc, lookahead2_end, s_end - lookahead2_end);
  73           lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc);
  74         }
  75       else
  76         {
               /* The string has only one character: put a placeholder
                  (U+FFFD with PROP_EA (LBP_BK, 0)) into the second slot.  */
  77           lookahead2_uc = 0xFFFD;
  78           lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
  79         }
  80
  81       int preceding_prop = LBP_BK; /* line break property of preceding character */
  82       int prev_prop = LBP_BK; /* line break property of previous character
  83                                  (= last character, ignoring intervening characters of class CM or ZWJ) */
  84       int prev_ea = 0;        /* EastAsian property of previous character
  85                                  (= last character, ignoring intervening characters of class CM or ZWJ) */
  86       int prev2_ea = 0;       /* EastAsian property of character before the previous character */
  87       bool prev_initial_hyphen = false; /* the previous character was a
  88                                            word-initial hyphen or U+2010 */
  89       bool prev_nus = false; /* before the previous character, there was a character
  90                                 with line break property LBP_NU and since then
  91                                 only characters with line break property LBP_SY
  92                                 or LBP_IS */
  93       int last_prop = LBP_BK; /* line break property of last non-space character
  94                                  (= last character, ignoring intervening characters of class SP or CM or ZWJ) */
  95       char *seen_space = NULL; /* Was a space seen after the last non-space character? */
  96
  97       /* Number of consecutive regional indicator (RI) characters seen
  98          immediately before the current point.  */
  99       size_t ri_count = 0;
 100
 101       do
 102         {
 103           /* Read the next character.  */
               /* count is the number of UTF-16 units of this character; p is
                  advanced by it at the end of the iteration, so that *p is
                  always the entry of the character's first unit.  */
 104           size_t count = lookahead1_end - s;
 105           s = lookahead1_end;
 106           ucs4_t uc = lookahead1_uc;
 107           int prop_ea = lookahead1_prop_ea; /* = unilbrkprop_lookup (uc); */
 108           int prop = PROP (prop_ea); /* line break property of uc */
 109           int ea = EA (prop_ea);     /* EastAsian property of uc */
 110           /* Refill the pipeline of 2 lookahead characters.  */
 111           lookahead1_end = lookahead2_end;
 112           lookahead1_uc = lookahead2_uc;
 113           lookahead1_prop_ea = lookahead2_prop_ea;
 114           if (lookahead2_end < s_end)
 115             {
 116               lookahead2_end += u16_mbtouc_unsafe (&lookahead2_uc, lookahead2_end, s_end - lookahead2_end);
 117               lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc);
 118             }
 119           else
 120             {
                   /* Input exhausted: refill with the same placeholder that
                      the setup before the loop uses.  */
 121               lookahead2_uc = 0xFFFD;
 122               lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
 123             }
 124
 125           bool nus = /* ending at the previous character, there was a character
 126                         with line break property LBP_NU and since then only
 127                         characters with line break property LBP_SY or LBP_IS */
 128             (prev_prop == LBP_NU
 129              || (prev_nus && (prev_prop == LBP_SY || prev_prop == LBP_IS)));
 130
 131           if (prop == LBP_BK || prop == LBP_LF || prop == LBP_CR)
 132             {
 133               /* (LB4,LB5,LB6) Mandatory break.  */
 134               *p = UC_BREAK_MANDATORY;
 135               /* cr is either LBP_CR or -1.  In the first case, recognize
 136                  a CR-LF sequence.  */
 137               if (prev_prop == cr && prop == LBP_LF)
                     /* p[-1] is the entry of the unit right before this LF,
                        i.e. the entry written for the preceding character.  */
 138                 p[-1] = UC_BREAK_CR_BEFORE_LF;
 139               last_prop = LBP_BK;
 140               seen_space = NULL;
 141             }
 142           else
 143             {
 144               /* Resolve property values whose behaviour is not fixed.  */
 145               switch (prop)
 146                 {
 147                 case LBP_AI:
 148                   /* Resolve ambiguous.  */
 149                   prop = LBP_AI_REPLACEMENT;
 150                   break;
 151                 case LBP_CB:
 152                   /* This is arbitrary.  */
 153                   prop = LBP_ID1;
 154                   break;
 155                 case LBP_SA:
 156                   /* We don't handle complex scripts yet.
 157                      Treat LBP_SA like LBP_XX.  */
 158                 case LBP_XX:
 159                   /* This is arbitrary.  */
 160                   prop = LBP_AL1;
 161                   break;
 162                 }
 163
 164               /* Deal with spaces and combining characters.  */
 165               if (prop == LBP_SP)
 166                 {
 167                   /* (LB7) Don't break just before a space.  */
 168                   *p = UC_BREAK_PROHIBITED;
                       /* Only the non-NULL-ness of seen_space is tested in
                          this function; the stored address is not
                          dereferenced.  */
 169                   seen_space = p;
 170                 }
 171               else if (prop == LBP_ZW)
 172                 {
 173                   /* (LB7) Don't break just before a zero-width space.  */
 174                   *p = UC_BREAK_PROHIBITED;
 175                   last_prop = LBP_ZW;
 176                   seen_space = NULL;
 177                 }
 178               else if (prop == LBP_CM || prop == LBP_ZWJ)
 179                 {
 180                   /* (LB9) Don't break just before a combining character or
 181                      zero-width joiner, except immediately after a mandatory
 182                      break character, space, or zero-width space.  */
 183                   if (last_prop == LBP_BK)
 184                     {
 185                       /* (LB4,LB5,LB6) Don't break at the beginning of a line.  */
 186                       *p = UC_BREAK_PROHIBITED;
 187                       /* (LB10) Treat CM or ZWJ as AL.  */
 188                       last_prop = LBP_AL1;
 189                       seen_space = NULL;
 190                     }
 191                   else if (last_prop == LBP_ZW
 192                            || (seen_space != NULL
 193                                /* (LB14) has higher priority than (LB18).  */
 194                                && !(last_prop == LBP_OP1 || last_prop == LBP_OP2)
 195                                /* (LB15a) has higher priority than (LB18).  */
 196                                && !(last_prop == LBP_QU2)))
 197                     {
 198                       /* (LB8) Break after zero-width space.  */
 199                       /* (LB18) Break after spaces.
 200                          We do *not* implement the "legacy support for space
 201                          character as base for combining marks" because now the
 202                          NBSP CM sequence is recommended instead of SP CM.  */
 203                       *p = UC_BREAK_POSSIBLE;
 204                       /* (LB10) Treat CM or ZWJ as AL.  */
 205                       last_prop = LBP_AL1;
 206                       seen_space = NULL;
 207                     }
 208                   else
 209                     {
 210                       /* Treat X CM as if it were X.  */
                           /* last_prop and seen_space are deliberately left
                              unchanged in this branch.  */
 211                       *p = UC_BREAK_PROHIBITED;
 212                     }
 213                 }
 214               else
 215                 {
 216                   /* prop must be usable as an index for table 7.3 of UTR #14.  */
 217                   if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
 218                     abort ();
 219
                       /* The tests below are tried in order; the first one
                          that matches decides *p, and the table lookup in the
                          final else branch is the fallback.  */
 220                   if (last_prop == LBP_BK)
 221                     {
 222                       /* (LB4,LB5,LB6) Don't break at the beginning of a line.  */
 223                       *p = UC_BREAK_PROHIBITED;
 224                     }
 225                   else if (last_prop == LBP_ZW)
 226                     {
 227                       /* (LB8) Break after zero-width space.  */
 228                       *p = UC_BREAK_POSSIBLE;
 229                     }
 230                   else if (preceding_prop == LBP_ZWJ)
 231                     {
 232                       /* (LB8a) Don't break right after a zero-width joiner.  */
 233                       *p = UC_BREAK_PROHIBITED;
 234                     }
 235                   else if (prop == LBP_IS && prev_prop == LBP_SP
 236                            && PROP (lookahead1_prop_ea) == LBP_NU)
 237                     {
 238                       /* (LB15c) Break before a decimal mark that follows a space.  */
 239                       *p = UC_BREAK_POSSIBLE;
 240                     }
 241                   else if (((prop == LBP_QU1 || prop == LBP_QU2 || prop == LBP_QU3)
 242                             && (! prev_ea || ! EA (lookahead1_prop_ea))
 243                             /* (LB18) has higher priority than (LB19a).  */
 244                             && prev_prop != LBP_SP)
 245                            || ((prev_prop == LBP_QU1 || prev_prop == LBP_QU2 || prev_prop == LBP_QU3)
 246                                && (! prev2_ea || ! ea)))
 247                     {
 248                       /* (LB19a) Don't break on either side of ambiguous
 249                          quotation marks, except next to an EastAsian character.  */
 250                       *p = UC_BREAK_PROHIBITED;
 251                     }
 252                   else if (prev_initial_hyphen
 253                            && (prop == LBP_AL1 || prop == LBP_AL2))
 254                     {
 255                       /* (LB20a) Don't break after a word-initial hyphen.  */
 256                       *p = UC_BREAK_PROHIBITED;
 257                     }
 258                   else if (prev_prop == LBP_HL_BA && prop != LBP_HL)
 259                     {
 260                       /* (LB21a) Don't break after Hebrew + Hyphen/Break-After,
 261                          before non-Hebrew.  */
 262                       *p = UC_BREAK_PROHIBITED;
 263                     }
 264                   else if ((prev_nus
 265                             && (prev_prop == LBP_CL
 266                                 || prev_prop == LBP_CP1 || prev_prop == LBP_CP2)
 267                             && (prop == LBP_PO || prop == LBP_PR))
 268                            || (nus && (prop == LBP_PO || prop == LBP_PR
 269                                        || prop == LBP_NU)))
 270                     {
 271                       /* (LB25) Don't break numbers.  */
 272                       *p = UC_BREAK_PROHIBITED;
 273                     }
 274                   else if ((prev_prop == LBP_PO || prev_prop == LBP_PR)
 275                            && (prop == LBP_OP1 || prop == LBP_OP2)
 276                            && (PROP (lookahead1_prop_ea) == LBP_NU
 277                                || (PROP (lookahead1_prop_ea) == LBP_IS
 278                                    && PROP (lookahead2_prop_ea) == LBP_NU)))
 279                     {
 280                       /* (LB25) Don't break numbers.  */
 281                       *p = UC_BREAK_PROHIBITED;
 282                     }
 283                   else if (prev_prop == LBP_AKLS_VI
 284                            && (prop == LBP_AK || prop == LBP_AL2))
 285                     {
 286                       /* (LB28a) Don't break inside orthographic syllables of
 287                          Brahmic scripts, line 3.  */
 288                       *p = UC_BREAK_PROHIBITED;
 289                     }
 290                   else if (PROP (lookahead1_prop_ea) == LBP_VF
 291                            && (prop == LBP_AK || prop == LBP_AL2 || prop == LBP_AS)
 292                            && (prev_prop == LBP_AK || prev_prop == LBP_AL2 || prev_prop == LBP_AS))
 293                     {
 294                       /* (LB28a) Don't break inside orthographic syllables of
 295                          Brahmic scripts, line 4.  */
 296                       *p = UC_BREAK_PROHIBITED;
 297                     }
 298                   else if (last_prop == LBP_IS && uc == 0x003C)
 299                     {
 300                       /* Partially disable (LB29) Do not break between numeric
 301                          punctuation and alphabetics ("e.g.").  We find it
 302                          desirable to break before the HTML tag "</P>" in
 303                          strings like "<P>Some sentence.</P>".  */
 304                       *p = UC_BREAK_POSSIBLE;
 305                     }
 306                   else if (last_prop == LBP_RI && prop == LBP_RI)
 307                     {
 308                       /* (LB30a) Break between two regional indicator symbols
 309                          if and only if there are an even number of regional
 310                          indicators preceding the position of the break.  */
                           /* A space seen since the last non-space character
                              allows the break regardless of ri_count.  */
 311                       *p = (seen_space != NULL || (ri_count % 2) == 0
 312                             ? UC_BREAK_POSSIBLE
 313                             : UC_BREAK_PROHIBITED);
 314                     }
 315                   else
 316                     {
 317                       int this_prop = prop;
 318                       if (prop == LBP_QU3)
 319                         {
 320                           /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the
 321                              next character's line break property is not one of
 322                              BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW.  */
 323                           switch (PROP (lookahead1_prop_ea))
 324                             {
 325                             case LBP_BK:
 326                             case LBP_CR:
 327                             case LBP_LF:
 328                             case LBP_SP:
 329                             case LBP_GL:
 330                             case LBP_WJ:
 331                             case LBP_CL:
 332                             case LBP_QU1: case LBP_QU2: case LBP_QU3:
 333                             case LBP_CP1: case LBP_CP2:
 334                             case LBP_EX:
 335                             case LBP_IS:
 336                             case LBP_SY:
 337                             case LBP_ZW:
 338                               break;
 339                             default:
 340                               this_prop = LBP_QU1;
 341                               break;
 342                             }
 343                         }
 344
                           /* this_prop is used only for this lookup; prop
                              itself is not modified by the LB15b mapping.
                              For a table entry I, the break is allowed only
                              if a space was seen in between.  */
 345                       switch (unilbrk_table [last_prop] [this_prop])
 346                         {
 347                         case D:
 348                           *p = UC_BREAK_POSSIBLE;
 349                           break;
 350                         case I:
 351                           *p = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
 352                           break;
 353                         case P:
 354                           *p = UC_BREAK_PROHIBITED;
 355                           break;
 356                         default:
 357                           abort ();
 358                         }
 359                     }
 360
 361                   if (prop == LBP_QU2)
 362                     {
 363                       /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the
 364                          previous character's line break property was not one of
 365                          BK, CR, LF, OP, QU, GL, SP, ZW.  */
 366                       switch (prev_prop)
 367                         {
 368                         case LBP_BK:
 369                         case LBP_CR:
 370                         case LBP_LF:
 371                         case LBP_OP1: case LBP_OP2:
 372                         case LBP_QU1: case LBP_QU2: case LBP_QU3:
 373                         case LBP_GL:
 374                         case LBP_SP:
 375                         case LBP_ZW:
 376                           break;
 377                         default:
 378                           prop = LBP_QU1;
 379                           break;
 380                         }
 381                     }
 382
                       /* prop may just have been changed from LBP_QU2 to
                          LBP_QU1; that changed value is what gets recorded in
                          last_prop here and in prev_prop and preceding_prop
                          further down.  */
 383                   last_prop = prop;
 384                   seen_space = NULL;
 385                 }
 386             }
 387
 388           /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line
 389              break class except BK, CR, LF, NL, SP, or ZW.  */
 390           if (!((prop == LBP_CM || prop == LBP_ZWJ)
 391                 && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR
 392                      || prev_prop == LBP_SP || prev_prop == LBP_ZW)))
 393             {
                   /* Both assignments below read the old prev_prop on their
                      right-hand side, so prev_initial_hyphen must be computed
                      before prev_prop is overwritten.  */
 394               prev_initial_hyphen =
 395                 (prop == LBP_HY || uc == 0x2010)
 396                 && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF
 397                     || prev_prop == LBP_SP || prev_prop == LBP_ZW
 398                     || prev_prop == LBP_CB || prev_prop == LBP_GL);
 399               prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
 400                                               || prev_prop == LBP_AL2
 401                                               || prev_prop == LBP_AS)
 402                            ? LBP_AKLS_VI :
 403                            prev_prop == LBP_HL && (prop == LBP_HY
 404                                                    || (prop == LBP_BA && !ea))
 405                            ? LBP_HL_BA :
 406                            prop);
 407               prev2_ea = prev_ea;
 408               prev_ea = ea;
 409               prev_nus = nus;
 410             }
 411
               /* Unlike prev_prop, preceding_prop is updated for every
                  character, including CM and ZWJ.  */
 412           preceding_prop = prop;
 413
               /* Any character whose class is not LBP_RI resets the run
                  length.  */
 414           if (prop == LBP_RI)
 415             ri_count++;
 416           else
 417             ri_count = 0;
 418
 419           p += count;
 420         }
 421       while (s < s_end);
 422     }
 423 }
 424
 425 #if defined IN_LIBUNISTRING
 426 /* For backward compatibility with older versions of libunistring.  */
 427
 428 # undef u16_possible_linebreaks
 429
 /* Legacy entry point.  Computes the line break information for the UTF-16
    string S of N units into P by delegating to u16_possible_linebreaks_loop,
    with CR-LF sequence recognition turned off.  */
 void
 u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding,
                          char *p)
 {
   /* The loop accepts either LBP_CR or -1 for its CR argument; only LBP_CR
      makes it recognize a CR-LF sequence.  */
   const int cr_lf_disabled = -1;

   u16_possible_linebreaks_loop (s, n, encoding, cr_lf_disabled, p);
 }
 436
 437 #endif
 438
 439 void
 440 u16_possible_linebreaks_v2 (const uint16_t *s, size_t n, const char *encoding,
 441                             char *p)
 442 {
 443   u16_possible_linebreaks_loop (s, n, encoding, LBP_CR, p);
 444 }