lib/unilbrk/u32-possible-linebreaks.c

   1 /* Line breaking of UTF-32 strings.
   2    Copyright (C) 2001-2003, 2006-2025 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2001.
   4
   5    This file is free software.
   6    It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   7    You can redistribute it and/or modify it under either
   8      - the terms of the GNU Lesser General Public License as published
   9        by the Free Software Foundation, either version 3, or (at your
  10        option) any later version, or
  11      - the terms of the GNU General Public License as published by the
  12        Free Software Foundation; either version 2, or (at your option)
  13        any later version, or
  14      - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
  15
  16    This file is distributed in the hope that it will be useful,
  17    but WITHOUT ANY WARRANTY; without even the implied warranty of
  18    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19    Lesser General Public License and the GNU General Public License
  20    for more details.
  21
  22    You should have received a copy of the GNU Lesser General Public
  23    License and of the GNU General Public License along with this
  24    program.  If not, see <https://www.gnu.org/licenses/>.  */
  25
  26 #include <config.h>
  27
  28 /* Specification.  */
  29 #include "unilbrk.h"
  30 #include "unilbrk/internal.h"
  31
  32 #include <stdlib.h>
  33
  34 #include "unilbrk/lbrktables.h"
  35 #include "uniwidth/cjk.h"
  36
  37 /* This file implements
  38    Unicode Standard Annex #14 <https://www.unicode.org/reports/tr14/>.  */
  39
  /* Compute the line break status of every character of the UTF-32 string
     S[0..N-1] and store one UC_BREAK_* value per character into P[0..N-1].
     ENCODING is handed to is_cjk_encoding() to choose how characters of
     class AI are resolved (LBP_ID1 if it returns true, LBP_AL1 otherwise).
     CR is either LBP_CR or -1; with LBP_CR, the slot of a CR that is directly
     followed by LF is overwritten with UC_BREAK_CR_BEFORE_LF.
     Nothing is read or written when N is 0.  */
  40 void
  41 u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
  42                               int cr, char *p)
  43 {
  44   if (n > 0)
  45     {
  46       int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL1);
  47       const uint32_t *s_end = s + n;
  48
  49       /* We need 2 characters of lookahead:
  50            - 1 character of lookahead for (LB15c,LB19a,LB28a),
  51            - 2 characters of lookahead for (LB25).  */
  52       const uint32_t *lookahead1_end;
  53       ucs4_t lookahead1_uc;
  54       int lookahead1_prop_ea;
  55       const uint32_t *lookahead2_end;
  56       ucs4_t lookahead2_uc;
  57       int lookahead2_prop_ea;
  58       /* Get the first lookahead character.  */
  59       lookahead1_end = s;
  60       lookahead1_uc = *lookahead1_end++;
  61       lookahead1_prop_ea = unilbrkprop_lookup (lookahead1_uc);
  62       /* Get the second lookahead character.  */
  63       lookahead2_end = lookahead1_end;
  64       if (lookahead2_end < s_end)
  65         {
  66           lookahead2_uc = *lookahead2_end++;
  67           lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc);
  68         }
  69       else
  70         {
               /* No second character is available: use U+FFFD with class BK
                  as a placeholder.  */
  71           lookahead2_uc = 0xFFFD;
  72           lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
  73         }
  74
  75       int preceding_prop = LBP_BK; /* line break property of preceding character */
  76       int prev_prop = LBP_BK; /* line break property of previous character
  77                                  (= last character, ignoring intervening characters of class CM or ZWJ) */
  78       int prev_ea = 0;        /* EastAsian property of previous character
  79                                  (= last character, ignoring intervening characters of class CM or ZWJ) */
  80       int prev2_ea = 0;       /* EastAsian property of character before the previous character */
  81       bool prev_initial_hyphen = false; /* the previous character was a
  82                                            word-initial hyphen or U+2010 */
  83       bool prev_nus = false; /* before the previous character, there was a character
  84                                 with line break property LBP_NU and since then
  85                                 only characters with line break property LBP_SY
  86                                 or LBP_IS */
  87       int last_prop = LBP_BK; /* line break property of last non-space character
  88                                  (= last character, ignoring intervening characters of class SP or CM or ZWJ) */
  89       char *seen_space = NULL; /* Was a space seen after the last non-space character? */
           /* seen_space is set to the P slot of the most recent space, but in
              this function it is only ever compared against NULL.  */
  90
  91       /* Number of consecutive regional indicator (RI) characters seen
  92          immediately before the current point.  */
  93       size_t ri_count = 0;
  94
  95       do
  96         {
  97           /* Read the next character.  */
               /* After this assignment s points just past the current
                  character; the loop ends once it reaches s_end.  */
  98           s = lookahead1_end;
  99           ucs4_t uc = lookahead1_uc;
 100           int prop_ea = lookahead1_prop_ea; /* = unilbrkprop_lookup (uc); */
 101           int prop = PROP (prop_ea); /* line break property of uc */
 102           int ea = EA (prop_ea);     /* EastAsian property of uc */
 103           /* Refill the pipeline of 2 lookahead characters.  */
 104           lookahead1_end = lookahead2_end;
 105           lookahead1_uc = lookahead2_uc;
 106           lookahead1_prop_ea = lookahead2_prop_ea;
 107           if (lookahead2_end < s_end)
 108             {
 109               lookahead2_uc = *lookahead2_end++;
 110               lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc);
 111             }
 112           else
 113             {
                   /* Past the end of the input: same U+FFFD / BK placeholder
                      as in the initial fill above.  */
 114               lookahead2_uc = 0xFFFD;
 115               lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
 116             }
 117
 118           bool nus = /* ending at the previous character, there was a character
 119                         with line break property LBP_NU and since then only
 120                         characters with line break property LBP_SY or LBP_IS */
 121             (prev_prop == LBP_NU
 122              || (prev_nus && (prev_prop == LBP_SY || prev_prop == LBP_IS)));
 123
 124           if (prop == LBP_BK || prop == LBP_LF || prop == LBP_CR)
 125             {
 126               /* (LB4,LB5,LB6) Mandatory break.  */
 127               *p = UC_BREAK_MANDATORY;
 128               /* cr is either LBP_CR or -1.  In the first case, recognize
 129                  a CR-LF sequence.  */
                   /* prev_prop starts out as LBP_BK, so with cr being LBP_CR
                      or -1 this test cannot succeed on the first character
                      and p[-1] stays inside the output array.  */
 130               if (prev_prop == cr && prop == LBP_LF)
 131                 p[-1] = UC_BREAK_CR_BEFORE_LF;
 132               last_prop = LBP_BK;
 133               seen_space = NULL;
 134             }
 135           else
 136             {
 137               /* Resolve property values whose behaviour is not fixed.  */
 138               switch (prop)
 139                 {
 140                 case LBP_AI:
 141                   /* Resolve ambiguous.  */
 142                   prop = LBP_AI_REPLACEMENT;
 143                   break;
 144                 case LBP_CB:
 145                   /* This is arbitrary.  */
 146                   prop = LBP_ID1;
 147                   break;
 148                 case LBP_SA:
 149                   /* We don't handle complex scripts yet.
 150                      Treat LBP_SA like LBP_XX.  */
 151                 case LBP_XX:
 152                   /* This is arbitrary.  */
 153                   prop = LBP_AL1;
 154                   break;
 155                 }
 156
 157               /* Deal with spaces and combining characters.  */
 158               if (prop == LBP_SP)
 159                 {
 160                   /* (LB7) Don't break just before a space.  */
 161                   *p = UC_BREAK_PROHIBITED;
 162                   seen_space = p;
 163                 }
 164               else if (prop == LBP_ZW)
 165                 {
 166                   /* (LB7) Don't break just before a zero-width space.  */
 167                   *p = UC_BREAK_PROHIBITED;
 168                   last_prop = LBP_ZW;
 169                   seen_space = NULL;
 170                 }
 171               else if (prop == LBP_CM || prop == LBP_ZWJ)
 172                 {
 173                   /* (LB9) Don't break just before a combining character or
 174                      zero-width joiner, except immediately after a mandatory
 175                      break character, space, or zero-width space.  */
 176                   if (last_prop == LBP_BK)
 177                     {
 178                       /* (LB4,LB5,LB6) Don't break at the beginning of a line.  */
 179                       *p = UC_BREAK_PROHIBITED;
 180                       /* (LB10) Treat CM or ZWJ as AL.  */
 181                       last_prop = LBP_AL1;
 182                       seen_space = NULL;
 183                     }
 184                   else if (last_prop == LBP_ZW
 185                            || (seen_space != NULL
 186                                /* (LB14) has higher priority than (LB18).  */
 187                                && !(last_prop == LBP_OP1 || last_prop == LBP_OP2)
 188                                /* (LB15a) has higher priority than (LB18).  */
 189                                && !(last_prop == LBP_QU2)))
 190                     {
 191                       /* (LB8) Break after zero-width space.  */
 192                       /* (LB18) Break after spaces.
 193                          We do *not* implement the "legacy support for space
 194                          character as base for combining marks" because now the
 195                          NBSP CM sequence is recommended instead of SP CM.  */
 196                       *p = UC_BREAK_POSSIBLE;
 197                       /* (LB10) Treat CM or ZWJ as AL.  */
 198                       last_prop = LBP_AL1;
 199                       seen_space = NULL;
 200                     }
 201                   else
 202                     {
 203                       /* Treat X CM as if it were X.  */
                           /* last_prop and seen_space are left unchanged in
                              this branch, so the state recorded for the
                              preceding characters stays in effect.  */
 204                       *p = UC_BREAK_PROHIBITED;
 205                     }
 206                 }
 207               else
 208                 {
 209                   /* prop must be usable as an index for table 7.3 of UTR #14.  */
 210                   if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
 211                     abort ();
 212
                       /* The rules below are tested in priority order; the
                          first matching branch decides *p.  */
 213                   if (last_prop == LBP_BK)
 214                     {
 215                       /* (LB4,LB5,LB6) Don't break at the beginning of a line.  */
 216                       *p = UC_BREAK_PROHIBITED;
 217                     }
 218                   else if (last_prop == LBP_ZW)
 219                     {
 220                       /* (LB8) Break after zero-width space.  */
 221                       *p = UC_BREAK_POSSIBLE;
 222                     }
 223                   else if (preceding_prop == LBP_ZWJ)
 224                     {
 225                       /* (LB8a) Don't break right after a zero-width joiner.  */
 226                       *p = UC_BREAK_PROHIBITED;
 227                     }
 228                   else if (prop == LBP_IS && prev_prop == LBP_SP
 229                            && PROP (lookahead1_prop_ea) == LBP_NU)
 230                     {
 231                       /* (LB15c) Break before a decimal mark that follows a space.  */
 232                       *p = UC_BREAK_POSSIBLE;
 233                     }
 234                   else if (((prop == LBP_QU1 || prop == LBP_QU2 || prop == LBP_QU3)
 235                             && (! prev_ea || ! EA (lookahead1_prop_ea))
 236                             /* (LB18) has higher priority than (LB19a).  */
 237                             && prev_prop != LBP_SP)
 238                            || ((prev_prop == LBP_QU1 || prev_prop == LBP_QU2 || prev_prop == LBP_QU3)
 239                                && (! prev2_ea || ! ea)))
 240                     {
 241                       /* (LB19a) Don't break on either side of ambiguous
 242                          quotation marks, except next to an EastAsian character.  */
 243                       *p = UC_BREAK_PROHIBITED;
 244                     }
 245                   else if (prev_initial_hyphen
 246                            && (prop == LBP_AL1 || prop == LBP_AL2))
 247                     {
 248                       /* (LB20a) Don't break after a word-initial hyphen.  */
 249                       *p = UC_BREAK_PROHIBITED;
 250                     }
 251                   else if (prev_prop == LBP_HL_BA && prop != LBP_HL)
 252                     {
 253                       /* (LB21a) Don't break after Hebrew + Hyphen/Break-After,
 254                          before non-Hebrew.  */
 255                       *p = UC_BREAK_PROHIBITED;
 256                     }
 257                   else if ((prev_nus
 258                             && (prev_prop == LBP_CL
 259                                 || prev_prop == LBP_CP1 || prev_prop == LBP_CP2)
 260                             && (prop == LBP_PO || prop == LBP_PR))
 261                            || (nus && (prop == LBP_PO || prop == LBP_PR
 262                                        || prop == LBP_NU)))
 263                     {
 264                       /* (LB25) Don't break numbers.  */
 265                       *p = UC_BREAK_PROHIBITED;
 266                     }
 267                   else if ((prev_prop == LBP_PO || prev_prop == LBP_PR)
 268                            && (prop == LBP_OP1 || prop == LBP_OP2)
 269                            && (PROP (lookahead1_prop_ea) == LBP_NU
 270                                || (PROP (lookahead1_prop_ea) == LBP_IS
 271                                    && PROP (lookahead2_prop_ea) == LBP_NU)))
 272                     {
 273                       /* (LB25) Don't break numbers.  */
 274                       *p = UC_BREAK_PROHIBITED;
 275                     }
 276                   else if (prev_prop == LBP_AKLS_VI
 277                            && (prop == LBP_AK || prop == LBP_AL2))
 278                     {
 279                       /* (LB28a) Don't break inside orthographic syllables of
 280                          Brahmic scripts, line 3.  */
 281                       *p = UC_BREAK_PROHIBITED;
 282                     }
 283                   else if (PROP (lookahead1_prop_ea) == LBP_VF
 284                            && (prop == LBP_AK || prop == LBP_AL2 || prop == LBP_AS)
 285                            && (prev_prop == LBP_AK || prev_prop == LBP_AL2 || prev_prop == LBP_AS))
 286                     {
 287                       /* (LB28a) Don't break inside orthographic syllables of
 288                          Brahmic scripts, line 4.  */
 289                       *p = UC_BREAK_PROHIBITED;
 290                     }
 291                   else if (last_prop == LBP_IS && uc == 0x003C)
 292                     {
 293                       /* Partially disable (LB29) Do not break between numeric
 294                          punctuation and alphabetics ("e.g.").  We find it
 295                          desirable to break before the HTML tag "</P>" in
 296                          strings like "<P>Some sentence.</P>".  */
 297                       *p = UC_BREAK_POSSIBLE;
 298                     }
 299                   else if (last_prop == LBP_RI && prop == LBP_RI)
 300                     {
 301                       /* (LB30a) Break between two regional indicator symbols
 302                          if and only if there are an even number of regional
 303                          indicators preceding the position of the break.  */
                           /* ri_count has not yet been incremented for the
                              current character, so it counts only the RIs
                              before this position.  */
 304                       *p = (seen_space != NULL || (ri_count % 2) == 0
 305                             ? UC_BREAK_POSSIBLE
 306                             : UC_BREAK_PROHIBITED);
 307                     }
 308                   else
 309                     {
                           /* this_prop is the column index used for the table
                              lookup only; prop itself is not changed here.  */
 310                       int this_prop = prop;
 311                       if (prop == LBP_QU3)
 312                         {
 313                           /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the
 314                              next character's line break property is not one of
 315                              BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW.  */
 316                           switch (PROP (lookahead1_prop_ea))
 317                             {
 318                             case LBP_BK:
 319                             case LBP_CR:
 320                             case LBP_LF:
 321                             case LBP_SP:
 322                             case LBP_GL:
 323                             case LBP_WJ:
 324                             case LBP_CL:
 325                             case LBP_QU1: case LBP_QU2: case LBP_QU3:
 326                             case LBP_CP1: case LBP_CP2:
 327                             case LBP_EX:
 328                             case LBP_IS:
 329                             case LBP_SY:
 330                             case LBP_ZW:
 331                               break;
 332                             default:
 333                               this_prop = LBP_QU1;
 334                               break;
 335                             }
 336                         }
 337
                           /* Table entry D yields a possible break, I yields
                              a possible break only if a space was seen since
                              the last non-space character, and P prohibits
                              the break.  Any other entry aborts.  */
 338                       switch (unilbrk_table [last_prop] [this_prop])
 339                         {
 340                         case D:
 341                           *p = UC_BREAK_POSSIBLE;
 342                           break;
 343                         case I:
 344                           *p = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
 345                           break;
 346                         case P:
 347                           *p = UC_BREAK_PROHIBITED;
 348                           break;
 349                         default:
 350                           abort ();
 351                         }
 352                     }
 353
 354                   if (prop == LBP_QU2)
 355                     {
 356                       /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the
 357                          previous character's line break property was not one of
 358                          BK, CR, LF, OP, QU, GL, SP, ZW.  */
                           /* *p for this character has already been stored
                              above; the rewritten prop is what later
                              iterations see through last_prop, prev_prop and
                              preceding_prop.  */
 359                       switch (prev_prop)
 360                         {
 361                         case LBP_BK:
 362                         case LBP_CR:
 363                         case LBP_LF:
 364                         case LBP_OP1: case LBP_OP2:
 365                         case LBP_QU1: case LBP_QU2: case LBP_QU3:
 366                         case LBP_GL:
 367                         case LBP_SP:
 368                         case LBP_ZW:
 369                           break;
 370                         default:
 371                           prop = LBP_QU1;
 372                           break;
 373                         }
 374                     }
 375
 376                   last_prop = prop;
 377                   seen_space = NULL;
 378                 }
 379             }
 380
 381           /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line
 382              break class except BK, CR, LF, NL, SP, or ZW.  */
               /* That is: the prev_* state is left untouched when the current
                  character is a CM or ZWJ that follows anything other than
                  BK, LF, CR, SP or ZW.  */
 383           if (!((prop == LBP_CM || prop == LBP_ZWJ)
 384                 && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR
 385                      || prev_prop == LBP_SP || prev_prop == LBP_ZW)))
 386             {
                   /* NOTE(review): class CB is rewritten to LBP_ID1 in the
                      switch near the top of the loop before prev_prop is
                      assigned from prop, so the prev_prop == LBP_CB test
                      below looks like it can never be true -- confirm.  */
 387               prev_initial_hyphen =
 388                 (prop == LBP_HY || uc == 0x2010)
 389                 && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF
 390                     || prev_prop == LBP_SP || prev_prop == LBP_ZW
 391                     || prev_prop == LBP_CB || prev_prop == LBP_GL);
                   /* prev_prop normally becomes prop.  Two two-character
                      contexts are recorded instead: LBP_AKLS_VI for a VI that
                      follows AK, AL2 or AS (tested by LB28a above), and
                      LBP_HL_BA for a HY, or a BA with zero EastAsian
                      property, that follows HL (tested by LB21a above).  */
 392               prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
 393                                               || prev_prop == LBP_AL2
 394                                               || prev_prop == LBP_AS)
 395                            ? LBP_AKLS_VI :
 396                            prev_prop == LBP_HL && (prop == LBP_HY
 397                                                    || (prop == LBP_BA && !ea))
 398                            ? LBP_HL_BA :
 399                            prop);
 400               prev2_ea = prev_ea;
 401               prev_ea = ea;
 402               prev_nus = nus;
 403             }
 404
               /* Unlike prev_prop, preceding_prop is updated for every
                  character, including CM and ZWJ.  */
 405           preceding_prop = prop;
 406
               /* NOTE(review): any non-RI character, including CM and ZWJ,
                  resets ri_count, whereas last_prop keeps LBP_RI across a
                  CM or ZWJ that follows an RI -- confirm this is the
                  intended counting for (LB30a).  */
 407           if (prop == LBP_RI)
 408             ri_count++;
 409           else
 410             ri_count = 0;
 411
 412           p++;
 413         }
 414       while (s < s_end);
 415     }
 416 }
 417
 418 #if defined IN_LIBUNISTRING
 419 /* For backward compatibility with older versions of libunistring.  */
 420
     /* NOTE(review): the #undef presumably removes a macro of the same name
        defined by an included header, so that a real function with this name
        can be defined below -- confirm against unilbrk.h.  */
 421 # undef u32_possible_linebreaks
 422
     /* Same as u32_possible_linebreaks_loop with CR = -1 instead of LBP_CR.
        According to the comment in that function, a CR-LF sequence is
        recognized only when CR is LBP_CR, so this entry point does not
        request that handling.  Only compiled when IN_LIBUNISTRING is
        defined.  */
 423 void
 424 u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding,
 425                          char *p)
 426 {
 427   u32_possible_linebreaks_loop (s, n, encoding, -1, p);
 428 }
 429
 430 #endif
 431
 /* Determine the line break status of the N characters of the UTF-32 string S
    and store the results through P, by calling u32_possible_linebreaks_loop
    with CR = LBP_CR.  According to the comment in that function, this is the
    setting in which a CR-LF sequence is recognized.  ENCODING is passed
    through unchanged.  */
 432 void
 433 u32_possible_linebreaks_v2 (const uint32_t *s, size_t n, const char *encoding,
 434                             char *p)
 435 {
 436   u32_possible_linebreaks_loop (s, n, encoding, LBP_CR, p);
 437 }