external/icu/icu4c-khmerbreakengine.patch.1

   1 diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp
   2 --- icu.org/source/common/dictbe.cpp    2023-06-14 06:23:55.000000000 +0900
   3 +++ icu/source/common/dictbe.cpp        2023-06-26 17:43:53.034173100 +0900
   4 @@ -35,7 +35,19 @@
   5   ******************************************************************
   6   */
   7
   8 -DictionaryBreakEngine::DictionaryBreakEngine() {
   9 +DictionaryBreakEngine::DictionaryBreakEngine()
  10 +    : fTypes(0), clusterLimit(0) {
  11 +}
  12 +
  13 +DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes)
  14 +    : fTypes(breakTypes), clusterLimit(3) {
  15 +    UErrorCode status = U_ZERO_ERROR;
  16 +    fViramaSet.applyPattern(UnicodeString(u"[[:ccc=VR:]]"), status);
  17 +
  18 +    // note Skip Sets contain fIgnoreSet characters too.
  19 +    fSkipStartSet.applyPattern(UnicodeString(u"[[:lb=OP:][:lb=QU:]\\u200C\\u200D\\u2060]"), status);
  20 +    fSkipEndSet.applyPattern(UnicodeString(u"[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]\\u200C\\u200D\\u2060]"), status);
  21 +    fNBeforeSet.applyPattern(UnicodeString(u"[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status);
  22  }
  23
  24  DictionaryBreakEngine::~DictionaryBreakEngine() {
  25 @@ -85,6 +97,169 @@
  26      fSet.compact();
  27  }
  28
  29 +bool
  30 +DictionaryBreakEngine::scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const {
  31 +    UErrorCode status = U_ZERO_ERROR;
  32 +    UText* ut = utext_clone(NULL, text, false, true, &status);
  33 +    utext_setNativeIndex(ut, start);
  34 +    UChar32 c = utext_current32(ut);
  35 +    bool res = false;
  36 +    doBreak = true;
  37 +    while (start >= 0) {
  38 +        if (!fSkipStartSet.contains(c)) {
  39 +            res = (c == ZWSP);
  40 +            break;
  41 +        }
  42 +        --start;
  43 +        c = utext_previous32(ut);
  44 +        doBreak = false;
  45 +    }
  46 +    utext_close(ut);
  47 +    return res;
  48 +}
  49 +
  50 +bool
  51 +DictionaryBreakEngine::scanAfterEnd(UText *text, int32_t textEnd, int32_t& end, bool &doBreak) const {
  52 +    UErrorCode status = U_ZERO_ERROR;
  53 +    UText* ut = utext_clone(NULL, text, false, true, &status);
  54 +    utext_setNativeIndex(ut, end);
  55 +    UChar32 c = utext_current32(ut);
  56 +    bool res = false;
  57 +    doBreak = !fNBeforeSet.contains(c);
  58 +    while (end < textEnd) {
  59 +        if (!fSkipEndSet.contains(c)) {
  60 +            res = (c == ZWSP);
  61 +            break;
  62 +        }
  63 +        ++end;
  64 +        c = utext_next32(ut);
  65 +        doBreak = false;
  66 +    }
  67 +    utext_close(ut);
  68 +    return res;
  69 +}
  70 +
  71 +void
  72 +DictionaryBreakEngine::scanBackClusters(UText *text, int32_t textStart, int32_t& start) const {
  73 +    UChar32 c = 0;
  74 +    start = utext_getNativeIndex(text);
  75 +    while (start > textStart) {
  76 +        c = utext_previous32(text);
  77 +        --start;
  78 +        if (!fSkipEndSet.contains(c))
  79 +            break;
  80 +    }
  81 +    for (int i = 0; i < clusterLimit; ++i) { // scan backwards clusterLimit clusters
  82 +        while (start > textStart) {
  83 +            while (fIgnoreSet.contains(c))
  84 +                c = utext_previous32(text);
  85 +            if (!fMarkSet.contains(c)) {
  86 +                if (fBaseSet.contains(c)) {
  87 +                    c = utext_previous32(text);
  88 +                    if (!fViramaSet.contains(c)) { // Virama (e.g. coeng) preceding base. Treat sequence as a mark
  89 +                        utext_next32(text);
  90 +                        c = utext_current32(text);
  91 +                        break;
  92 +                    } else {
  93 +                        --start;
  94 +                    }
  95 +                } else {
  96 +                    break;
  97 +                }
  98 +            }
  99 +            c = utext_previous32(text);
 100 +            --start;
 101 +        }
 102 +        if (!fBaseSet.contains(c) || start < textStart) {  // not a cluster start so finish
 103 +            break;
 104 +        }
 105 +        c = utext_previous32(text);
 106 +        --start;        // go round again
 107 +    }                   // ignore hitting previous inhibitor since scanning for it should have found us!
 108 +    ++start;            // counteract --before
 109 +}
 110 +
 111 +void
 112 +DictionaryBreakEngine::scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const {
 113 +    UChar32 c = utext_current32(text);
 114 +    end = utext_getNativeIndex(text);
 115 +    while (end < textEnd) {
 116 +        if (!fSkipStartSet.contains(c))
 117 +            break;
 118 +        utext_next32(text);
 119 +        c = utext_current32(text);
 120 +        ++end;
 121 +    }
 122 +    for (int i = 0; i < clusterLimit; ++i) { // scan forwards clusterLimit clusters
 123 +        while (fIgnoreSet.contains(c)) {
 124 +            utext_next32(text);
 125 +            c = utext_current32(text);
 126 +        }
 127 +        if (fBaseSet.contains(c)) {
 128 +            while (end < textEnd) {
 129 +                utext_next32(text);
 130 +                c = utext_current32(text);
 131 +                ++end;
 132 +                if (!fMarkSet.contains(c))
 133 +                    break;
 134 +                else if (fViramaSet.contains(c)) {  // handle coeng + base as mark
 135 +                    utext_next32(text);
 136 +                    c = utext_current32(text);
 137 +                    ++end;
 138 +                    if (!fBaseSet.contains(c))
 139 +                        break;
 140 +                }
 141 +            }
 142 +        } else {
 143 +            --end;    // bad char so break after char before it
 144 +            break;
 145 +        }
 146 +    }
 147 +}
 148 +
 149 +bool
 150 +DictionaryBreakEngine::scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const {
 151 +    UErrorCode status = U_ZERO_ERROR;
 152 +    UText* ut = utext_clone(NULL, text, false, true, &status);
 153 +    int32_t nat = start;
 154 +    utext_setNativeIndex(ut, nat);
 155 +    bool foundFirst = true;
 156 +    int32_t curr = start;
 157 +    while (nat < end) {
 158 +        UChar32 c = utext_current32(ut);
 159 +        if (c == ZWSP || c == WJ) {
 160 +            curr = nat + 1;
 161 +            if (foundFirst)     // only scan backwards for first inhibitor
 162 +                scanBackClusters(ut, start, before);
 163 +            foundFirst = false; // don't scan backwards if we go around again. Also marks found something
 164 +
 165 +            utext_next32(ut);
 166 +            scanFwdClusters(ut, end, after);
 167 +            nat = after + 1;
 168 +
 169 +            if (c == ZWSP || c == WJ) {  // did we hit another one?
 170 +                continue;
 171 +            } else {
 172 +                break;
 173 +            }
 174 +        }
 175 +
 176 +        ++nat;                  // keep hunting
 177 +        utext_next32(ut);
 178 +    }
 179 +
 180 +    utext_close(ut);
 181 +
 182 +    if (nat >= end && foundFirst) {
 183 +        start = before = after = nat;
 184 +        return false;           // failed to find anything
 185 +    }
 186 +    else {
 187 +        start = curr;
 188 +    }
 189 +    return true;                // yup hit one
 190 +}
 191 +
 192  /*
 193   ******************************************************************
 194   * PossibleWord
 195 @@ -114,7 +289,7 @@
 196      ~PossibleWord() {}
 197
 198      // Fill the list of candidates if needed, select the longest, and return the number found
 199 -    int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
 200 +    int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet = NULL, int32_t minLength = 0 );
 201
 202      // Select the currently marked candidate, point after it in the text, and invalidate self
 203      int32_t   acceptMarked( UText *text );
 204 @@ -135,12 +310,12 @@
 205  };
 206
 207
 208 -int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
 209 +int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet, int32_t minLength) {
 210      // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
 211      int32_t start = (int32_t)utext_getNativeIndex(text);
 212      if (start != offset) {
 213          offset = start;
 214 -        count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, nullptr, &prefix);
 215 +        count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, nullptr, &prefix, ignoreSet, minLength);
 216          // Dictionary leaves text after longest prefix, not longest word. Back up.
 217          if (count <= 0) {
 218              utext_setNativeIndex(text, start);
 219 @@ -814,53 +989,30 @@
 220   * KhmerBreakEngine
 221   */
 222
 223 -// How many words in a row are "good enough"?
 224 -static const int32_t KHMER_LOOKAHEAD = 3;
 225 -
 226 -// Will not combine a non-word with a preceding dictionary word longer than this
 227 -static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
 228 -
 229 -// Will not combine a non-word that shares at least this much prefix with a
 230 -// dictionary word, with a preceding word
 231 -static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
 232 -
 233 -// Minimum word size
 234 -static const int32_t KHMER_MIN_WORD = 2;
 235 -
 236 -// Minimum number of characters for two words
 237 -static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
 238 -
 239  KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
 240 -    : DictionaryBreakEngine(),
 241 +    : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
 242        fDictionary(adoptDictionary)
 243  {
 244      UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
 245      UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
 246 -    UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status);
 247 +
 248 +    clusterLimit = 3;
 249 +
 250 +    UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]\\u2060\\u200C\\u200D]"), status);
 251      if (U_SUCCESS(status)) {
 252          setCharacters(khmerWordSet);
 253      }
 254      fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
 255 -    fMarkSet.add(0x0020);
 256 -    fEndWordSet = khmerWordSet;
 257 -    fBeginWordSet.add(0x1780, 0x17B3);
 258 -    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
 259 -    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
 260 -    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
 261 -    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
 262 -    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
 263 -//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
 264 -//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
 265 -//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
 266 -//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
 267 -//    fSuffixSet.add(THAI_PAIYANNOI);
 268 -//    fSuffixSet.add(THAI_MAIYAMOK);
 269 +    fIgnoreSet.add(0x2060);         // WJ
 270 +    fIgnoreSet.add(0x200C, 0x200D); // ZWJ, ZWNJ
 271 +    fBaseSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:lb=SA:]&[:^M:]]"), status);
 272 +    fPuncSet.applyPattern(UnicodeString(u"[\\u17D4\\u17D5\\u17D6\\u17D7\\u17D9:]"), status);
 273
 274      // Compact for caching.
 275      fMarkSet.compact();
 276 -    fEndWordSet.compact();
 277 -    fBeginWordSet.compact();
 278 -//    fSuffixSet.compact();
 279 +    fIgnoreSet.compact();
 280 +    fBaseSet.compact();
 281 +    fPuncSet.compact();
 282      UTRACE_EXIT_STATUS(status);
 283  }
 284
 285 @@ -876,175 +1028,205 @@
 286                                                  UBool /* isPhraseBreaking */,
 287                                                  UErrorCode& status ) const {
 288      if (U_FAILURE(status)) return 0;
 289 -    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
 290 -        return 0;       // Not enough characters for two words
 291 +    uint32_t wordsFound = foundBreaks.size();
 292 +    int32_t before = 0;
 293 +    int32_t after = 0;
 294 +    int32_t finalBefore = 0;
 295 +    int32_t initAfter = 0;
 296 +    int32_t scanStart = rangeStart;
 297 +    int32_t scanEnd = rangeEnd;
 298 +
 299 +    bool startZwsp = false;
 300 +    bool breakStart = false;
 301 +    bool breakEnd = false;
 302 +
 303 +    if (rangeStart > 0) {
 304 +        --scanStart;
 305 +        startZwsp = scanBeforeStart(text, scanStart, breakStart);
 306      }
 307
 308 -    uint32_t wordsFound = 0;
 309 -    int32_t cpWordLength = 0;
 310 -    int32_t cuWordLength = 0;
 311 -    int32_t current;
 312 -    PossibleWord words[KHMER_LOOKAHEAD];
 313 -
 314      utext_setNativeIndex(text, rangeStart);
 315 +    scanFwdClusters(text, rangeEnd, initAfter);
 316 +    bool endZwsp = scanAfterEnd(text, utext_nativeLength(text), scanEnd, breakEnd);
 317 +    utext_setNativeIndex(text, rangeEnd - 1);
 318 +    scanBackClusters(text, rangeStart, finalBefore);
 319 +    if (finalBefore < initAfter) {   // the whole run is tented so no breaks
 320 +        if (breakStart || fTypes < UBRK_LINE)
 321 +            foundBreaks.push(rangeStart, status);
 322 +        if (breakEnd || fTypes < UBRK_LINE)
 323 +            foundBreaks.push(rangeEnd, status);
 324 +        return foundBreaks.size() - wordsFound;
 325 +    }
 326
 327 -    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 328 -        cuWordLength = 0;
 329 -        cpWordLength = 0;
 330 -
 331 -        // Look for candidate words at the current position
 332 -        int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 333 -
 334 -        // If we found exactly one, use that
 335 -        if (candidates == 1) {
 336 -            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
 337 -            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
 338 -            wordsFound += 1;
 339 -        }
 340 +    scanStart = rangeStart;
 341 +    scanWJ(text, scanStart, rangeEnd, before, after);
 342 +    if (startZwsp || initAfter >= before) {
 343 +        after = initAfter;
 344 +        before = 0;
 345 +    }
 346 +    if (!endZwsp && after > finalBefore && after < rangeEnd)
 347 +        endZwsp = true;
 348 +    if (endZwsp && before > finalBefore)
 349 +        before = finalBefore;
 350
 351 -        // If there was more than one, see which one can take us forward the most words
 352 -        else if (candidates > 1) {
 353 -            // If we're already at the end of the range, we're done
 354 -            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 355 -                goto foundBest;
 356 -            }
 357 -            do {
 358 -                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 359 -                    // Followed by another dictionary word; mark first word as a good candidate
 360 -                    words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
 361 +    utext_setNativeIndex(text, rangeStart);
 362 +    int32_t numCodePts = rangeEnd - rangeStart;
 363 +    // bestSnlp[i] is the snlp of the best segmentation of the first i
 364 +    // code points in the range to be matched.
 365 +    UVector32 bestSnlp(numCodePts + 1, status);
 366 +    bestSnlp.addElement(0, status);
 367 +    for(int32_t i = 1; i <= numCodePts; i++) {
 368 +        bestSnlp.addElement(kuint32max, status);
 369 +    }
 370
 371 -                    // If we're already at the end of the range, we're done
 372 -                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 373 -                        goto foundBest;
 374 -                    }
 375 +    // prev[i] is the index of the last code point in the previous word in
 376 +    // the best segmentation of the first i characters. Note negative implies
 377 +       // that the code point is part of an unknown word.
 378 +    UVector32 prev(numCodePts + 1, status);
 379 +    for(int32_t i = 0; i <= numCodePts; i++) {
 380 +        prev.addElement(kuint32max, status);
 381 +    }
 382
 383 -                    // See if any of the possible second words is followed by a third word
 384 -                    do {
 385 -                        // If we find a third word, stop right away
 386 -                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 387 -                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
 388 -                            goto foundBest;
 389 -                        }
 390 -                    }
 391 -                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
 392 -                }
 393 +    const int32_t maxWordSize = 20;
 394 +    UVector32 values(maxWordSize, status);
 395 +    values.setSize(maxWordSize);
 396 +    UVector32 lengths(maxWordSize, status);
 397 +    lengths.setSize(maxWordSize);
 398 +
 399 +    // Dynamic programming to find the best segmentation.
 400 +
 401 +    // In outer loop, i  is the code point index,
 402 +    //                ix is the corresponding string (code unit) index.
 403 +    //    They differ when the string contains supplementary characters.
 404 +    int32_t ix = rangeStart;
 405 +    for (int32_t i = 0;  i < numCodePts;  ++i, utext_setNativeIndex(text, ++ix)) {
 406 +        if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
 407 +            continue;
 408 +        }
 409 +
 410 +        int32_t count;
 411 +        count = fDictionary->matches(text, numCodePts - i, maxWordSize,
 412 +                             NULL, lengths.getBuffer(), values.getBuffer(), NULL, &fIgnoreSet, 2);
 413 +                             // Note: lengths is filled with code point lengths
 414 +                             //       The NULL parameter is the ignored code unit lengths.
 415 +
 416 +        for (int32_t j = 0; j < count; j++) {
 417 +            int32_t ln = lengths.elementAti(j);
 418 +            if (ln + i >= numCodePts)
 419 +                continue;
 420 +            utext_setNativeIndex(text, ln+ix);
 421 +            int32_t c = utext_current32(text);
 422 +            if (fMarkSet.contains(c) || c == 0x17D2) { // Coeng
 423 +                lengths.removeElementAt(j);
 424 +                values.removeElementAt(j);
 425 +                --j;
 426 +                --count;
 427              }
 428 -            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
 429 -foundBest:
 430 -            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
 431 -            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
 432 -            wordsFound += 1;
 433          }
 434 -
 435 -        // We come here after having either found a word or not. We look ahead to the
 436 -        // next word. If it's not a dictionary word, we will combine it with the word we
 437 -        // just found (if there is one), but only if the preceding word does not exceed
 438 -        // the threshold.
 439 -        // The text iterator should now be positioned at the end of the word we found.
 440 -        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
 441 -            // if it is a dictionary word, do nothing. If it isn't, then if there is
 442 -            // no preceding word, or the non-word shares less than the minimum threshold
 443 -            // of characters with a dictionary word, then scan to resynchronize
 444 -            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 445 -                  && (cuWordLength == 0
 446 -                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
 447 -                // Look for a plausible word boundary
 448 -                int32_t remaining = rangeEnd - (current+cuWordLength);
 449 -                UChar32 pc;
 450 -                UChar32 uc;
 451 -                int32_t chars = 0;
 452 -                for (;;) {
 453 -                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
 454 -                    pc = utext_next32(text);
 455 -                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
 456 -                    chars += pcSize;
 457 -                    remaining -= pcSize;
 458 -                    if (remaining <= 0) {
 459 +        if (count == 0) {
 460 +            utext_setNativeIndex(text, ix);
 461 +            int32_t c = utext_current32(text);
 462 +            if (fPuncSet.contains(c) || fIgnoreSet.contains(c) || c == ZWSP) {
 463 +                values.setElementAt(0, count);
 464 +                lengths.setElementAt(1, count++);
 465 +            } else if (fBaseSet.contains(c)) {
 466 +                int32_t currix = utext_getNativeIndex(text);
 467 +                do {
 468 +                    utext_next32(text);
 469 +                    c = utext_current32(text);
 470 +                    if (utext_getNativeIndex(text) >= rangeEnd)
 471                          break;
 472 -                    }
 473 -                    uc = utext_current32(text);
 474 -                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 475 -                        // Maybe. See if it's in the dictionary.
 476 -                        int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 477 -                        utext_setNativeIndex(text, current+cuWordLength+chars);
 478 -                        if (num_candidates > 0) {
 479 +                    if (c == 0x17D2) { // Coeng
 480 +                        utext_next32(text);
 481 +                        c = utext_current32(text);
 482 +                        if (!fBaseSet.contains(c) || utext_getNativeIndex(text) >= rangeEnd) {
 483                              break;
 484 +                        } else {
 485 +                            utext_next32(text);
 486 +                            c = utext_current32(text);
 487 +                            if (utext_getNativeIndex(text) >= rangeEnd)
 488 +                                break;
 489                          }
 490                      }
 491 -                }
 492 -
 493 -                // Bump the word count if there wasn't already one
 494 -                if (cuWordLength <= 0) {
 495 -                    wordsFound += 1;
 496 -                }
 497 +                } while (fMarkSet.contains(c) || fIgnoreSet.contains(c));
 498 +                values.setElementAt(BADSNLP, count);
 499 +                lengths.setElementAt(utext_getNativeIndex(text) - currix, count++);
 500 +            } else {
 501 +                values.setElementAt(BADSNLP, count);
 502 +                lengths.setElementAt(1, count++);
 503 +            }
 504 +        }
 505
 506 -                // Update the length with the passed-over characters
 507 -                cuWordLength += chars;
 508 +        for (int32_t j = 0; j < count; j++) {
 509 +            uint32_t v = values.elementAti(j);
 510 +            int32_t newSnlp = bestSnlp.elementAti(i) + v;
 511 +            int32_t ln = lengths.elementAti(j);
 512 +            utext_setNativeIndex(text, ln+ix);
 513 +            int32_t c = utext_current32(text);
 514 +            while ((fPuncSet.contains(c) || fIgnoreSet.contains(c)) && ln + i < numCodePts) {
 515 +                ++ln;
 516 +                utext_next32(text);
 517 +                c = utext_current32(text);
 518              }
 519 -            else {
 520 -                // Back up to where we were for next iteration
 521 -                utext_setNativeIndex(text, current+cuWordLength);
 522 +            int32_t ln_j_i = ln + i;   // yes really i!
 523 +            if (newSnlp < bestSnlp.elementAti(ln_j_i)) {
 524 +                if (v == BADSNLP) {
 525 +                    int32_t p = prev.elementAti(i);
 526 +                    if (p < 0)
 527 +                        prev.setElementAt(p, ln_j_i);
 528 +                    else
 529 +                        prev.setElementAt(-i, ln_j_i);
 530 +                }
 531 +                else
 532 +                    prev.setElementAt(i, ln_j_i);
 533 +                bestSnlp.setElementAt(newSnlp, ln_j_i);
 534              }
 535          }
 536 -
 537 -        // Never stop before a combining mark.
 538 -        int32_t currPos;
 539 -        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
 540 -            utext_next32(text);
 541 -            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
 542 +    }
 543 +    // Start pushing the optimal offset index into t_boundary (t for tentative).
 544 +    // prev[numCodePts] is guaranteed to be meaningful.
 545 +    // We'll first push in the reverse order, i.e.,
 546 +    // t_boundary[0] = numCodePts, and afterwards do a swap.
 547 +    UVector32 t_boundary(numCodePts+1, status);
 548 +
 549 +    int32_t numBreaks = 0;
 550 +    // No segmentation found, set boundary to end of range
 551 +    while (numCodePts >= 0 && (uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
 552 +        --numCodePts;
 553 +    }
 554 +    if (numCodePts < 0) {
 555 +        t_boundary.addElement(numCodePts, status);
 556 +        numBreaks++;
 557 +    } else {
 558 +        for (int32_t i = numCodePts; (uint32_t)i != kuint32max; i = prev.elementAti(i)) {
 559 +            if (i < 0) i = -i;
 560 +            t_boundary.addElement(i, status);
 561 +            numBreaks++;
 562          }
 563 +        // U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
 564 +    }
 565
 566 -        // Look ahead for possible suffixes if a dictionary word does not follow.
 567 -        // We do this in code rather than using a rule so that the heuristic
 568 -        // resynch continues to function. For example, one of the suffix characters
 569 -        // could be a typo in the middle of a word.
 570 -//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
 571 -//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 572 -//                && fSuffixSet.contains(uc = utext_current32(text))) {
 573 -//                if (uc == KHMER_PAIYANNOI) {
 574 -//                    if (!fSuffixSet.contains(utext_previous32(text))) {
 575 -//                        // Skip over previous end and PAIYANNOI
 576 -//                        utext_next32(text);
 577 -//                        utext_next32(text);
 578 -//                        wordLength += 1;            // Add PAIYANNOI to word
 579 -//                        uc = utext_current32(text);     // Fetch next character
 580 -//                    }
 581 -//                    else {
 582 -//                        // Restore prior position
 583 -//                        utext_next32(text);
 584 -//                    }
 585 -//                }
 586 -//                if (uc == KHMER_MAIYAMOK) {
 587 -//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
 588 -//                        // Skip over previous end and MAIYAMOK
 589 -//                        utext_next32(text);
 590 -//                        utext_next32(text);
 591 -//                        wordLength += 1;            // Add MAIYAMOK to word
 592 -//                    }
 593 -//                    else {
 594 -//                        // Restore prior position
 595 -//                        utext_next32(text);
 596 -//                    }
 597 -//                }
 598 -//            }
 599 -//            else {
 600 -//                utext_setNativeIndex(text, current+wordLength);
 601 -//            }
 602 -//        }
 603 -
 604 -        // Did we find a word on this iteration? If so, push it on the break stack
 605 -        if (cuWordLength > 0) {
 606 -            foundBreaks.push((current+cuWordLength), status);
 607 +    // Now that we're done, convert positions in t_boundary[] (indices in
 608 +    // the normalized input string) back to indices in the original input UText
 609 +    // while reversing t_boundary and pushing values to foundBreaks.
 610 +    for (int32_t i = numBreaks-1; i >= 0; i--) {
 611 +        int32_t cpPos = t_boundary.elementAti(i);
 612 +        if (cpPos == 0 && !breakStart && fTypes >= UBRK_LINE) continue;
 613 +        int32_t utextPos = cpPos + rangeStart;
 614 +        while (utextPos > after && scanWJ(text, utextPos, scanEnd, before, after));
 615 +        if (utextPos < before) {
 616 +        // Boundaries are added to foundBreaks output in ascending order.
 617 +            U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos);
 618 +            foundBreaks.push(utextPos, status);
 619          }
 620      }
 621 -
 622 +
 623      // Don't return a break for the end of the dictionary range if there is one there.
 624 -    if (foundBreaks.peeki() >= rangeEnd) {
 625 +    if (!breakEnd && fTypes >= UBRK_LINE && foundBreaks.peeki() >= rangeEnd) {
 626          (void) foundBreaks.popi();
 627 -        wordsFound -= 1;
 628      }
 629
 630 -    return wordsFound;
 631 +    return foundBreaks.size() - wordsFound;
 632  }
 633
 634  #if !UCONFIG_NO_NORMALIZATION
 635 diff -ur icu.org/source/common/dictbe.h icu/source/common/dictbe.h
 636 --- icu.org/source/common/dictbe.h      2022-04-08 00:41:55.000000000 +0200
 637 +++ icu/source/common/dictbe.h  2022-05-16 13:49:33.820459894 +0200
 638 @@ -35,7 +35,8 @@
 639   * threads without synchronization.</p>
 640   */
 641  class DictionaryBreakEngine : public LanguageBreakEngine {
 642 - private:
 643 + protected:
 644 +
 645      /**
 646       * The set of characters handled by this engine
 647       * @internal
 648 @@ -43,14 +44,84 @@
 649
 650    UnicodeSet    fSet;
 651
 652 +  const int32_t WJ   = 0x2060;
 653 +  const int32_t ZWSP = 0x200B;
 654 +
 655 +  /**
 656 +   * The break types it was constructed with
 657 +   * @internal
 658 +   */
 659 +  uint32_t      fTypes;
 660 +
 661 +  /**
 662 +   * A Unicode set of all viramas
 663 +   * @internal
 664 +   */
 665 +  UnicodeSet    fViramaSet;
 666 +
 667 +  /**
 668 +   * A Unicode set of all base characters
 669 +   * @internal
 670 +   */
 671 +  UnicodeSet    fBaseSet;
 672 +
 673 +  /**
 674 +   * A Unicode set of all marks
 675 +   * @internal
 676 +   */
 677 +  UnicodeSet    fMarkSet;
 678 +
 679 +  /**
 680 +   * A Unicode set of all characters ignored ignored in dictionary matching
 681 +   * @internal
 682 +   */
 683 +  UnicodeSet    fIgnoreSet;
 684 +
 685 +  /**
 686 +   * A Unicode set of all characters ignored ignored in dictionary matching
 687 +   * @internal
 688 +   */
 689 +  UnicodeSet    fSkipStartSet;
 690 +
 691 +  /**
 692 +   * A Unicode set of all characters ignored ignored in dictionary matching
 693 +   * @internal
 694 +   */
 695 +  UnicodeSet    fSkipEndSet;
 696 +
 697 +  /**
 698 +   * A Unicode set of all characters that should not be broken before
 699 +   * @internal
 700 +   */
 701 +  UnicodeSet    fNBeforeSet;
 702 +
 703 +  /**
 704 +   * The number of clusters within which breaks are inhibited
 705 +   * @internal
 706 +   */
 707 +  int32_t clusterLimit;
 708 +
 709 +  bool scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const;
 710 +
 711 +  bool scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const;
 712 +  bool scanAfterEnd(UText *text, int32_t rangeEnd, int32_t& end, bool &doBreak) const;
 713 +  void scanBackClusters(UText *text, int32_t textStart, int32_t& start) const;
 714 +  void scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const;
 715 +
 716   public:
 717
 718    /**
 719 -   * <p>Constructor </p>
 720 +   * <p>Default constructor.</p>
 721 +   *
 722     */
 723    DictionaryBreakEngine();
 724
 725    /**
 726 +   * <p>Constructor with break types.</p>
 727 +   */
 728 +  explicit DictionaryBreakEngine(uint32_t breakTypes);
 729 +
 730 +  /**
 731     * <p>Virtual destructor.</p>
 732     */
 733    virtual ~DictionaryBreakEngine();
 734 @@ -305,10 +376,12 @@
 735       * @internal
 736       */
 737
 738 -  UnicodeSet                fEndWordSet;
 739    UnicodeSet                fBeginWordSet;
 740 -  UnicodeSet                fMarkSet;
 741 -  DictionaryMatcher  *fDictionary;
 742 +  UnicodeSet                fPuncSet;
 743 +  DictionaryMatcher        *fDictionary;
 744 +
 745 +  const uint32_t BADSNLP = 256 * 20;
 746 +  const uint32_t kuint32max = 0x7FFFFFFF;
 747
 748   public:
 749
 750 diff -ur icu.org/source/common/dictionarydata.cpp icu/source/common/dictionarydata.cpp
 751 --- icu.org/source/common/dictionarydata.cpp    2023-06-14 06:23:55.000000000 +0900
 752 +++ icu/source/common/dictionarydata.cpp        2023-06-26 02:18:05.709454400 +0900
 753 @@ -44,7 +44,7 @@
 754
 755  int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
 756                              int32_t *lengths, int32_t *cpLengths, int32_t *values,
 757 -                            int32_t *prefix) const {
 758 +                            int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {
 759
 760      UCharsTrie uct(characters);
 761      int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
 762 @@ -55,7 +55,13 @@
 763          UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
 764          int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
 765          codePointsMatched += 1;
 766 +        if (ignoreSet != NULL && ignoreSet->contains(c)) {
 767 +            continue;
 768 +        }
 769          if (USTRINGTRIE_HAS_VALUE(result)) {
 770 +            if (codePointsMatched < minLength) {
 771 +                continue;
 772 +            }
 773              if (wordCount < limit) {
 774                  if (values != nullptr) {
 775                      values[wordCount] = uct.getValue();
 776 @@ -112,7 +118,7 @@
 777
 778  int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
 779                              int32_t *lengths, int32_t *cpLengths, int32_t *values,
 780 -                            int32_t *prefix) const {
 781 +                            int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {
 782      BytesTrie bt(characters);
 783      int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
 784      int32_t wordCount = 0;
 785 @@ -122,7 +128,13 @@
 786          UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
 787          int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
 788          codePointsMatched += 1;
 789 +        if (ignoreSet != NULL && ignoreSet->contains(c)) {
 790 +            continue;
 791 +        }
 792          if (USTRINGTRIE_HAS_VALUE(result)) {
 793 +            if (codePointsMatched < minLength) {
 794 +                continue;
 795 +            }
 796              if (wordCount < limit) {
 797                  if (values != nullptr) {
 798                      values[wordCount] = bt.getValue();
 799 diff -ur icu.org/source/common/dictionarydata.h icu/source/common/dictionarydata.h
 800 --- icu.org/source/common/dictionarydata.h      2023-06-14 06:23:55.000000000 +0900
 801 +++ icu/source/common/dictionarydata.h  2023-06-26 17:43:53.097724900 +0900
 802 @@ -21,6 +21,7 @@
 803  #include "unicode/utext.h"
 804  #include "unicode/udata.h"
 805  #include "udataswp.h"
 806 +#include "unicode/uniset.h"
 807  #include "unicode/uobject.h"
 808  #include "unicode/ustringtrie.h"
 809
 810 @@ -92,7 +93,7 @@
 811       */
 812      virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
 813                              int32_t *lengths, int32_t *cpLengths, int32_t *values,
 814 -                            int32_t *prefix) const = 0;
 815 +                            int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const = 0;
 816
 817      /** @return DictionaryData::TRIE_TYPE_XYZ */
 818      virtual int32_t getType() const = 0;
 819 @@ -107,7 +108,7 @@
 820      virtual ~UCharsDictionaryMatcher();
 821      virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
 822                              int32_t *lengths, int32_t *cpLengths, int32_t *values,
 823 -                            int32_t *prefix) const override;
 824 +                            int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const override;
 825      virtual int32_t getType() const override;
 826  private:
 827      const char16_t *characters;
 828 @@ -125,7 +126,7 @@
 829      virtual ~BytesDictionaryMatcher();
 830      virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
 831                              int32_t *lengths, int32_t *cpLengths, int32_t *values,
 832 -                            int32_t *prefix) const override;
 833 +                            int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const override;
 834      virtual int32_t getType() const override;
 835  private:
 836      UChar32 transform(UChar32 c) const;