Avoid potential negative array index access to cached text.
[LibreOffice.git] / external / icu / icu4c-khmerbreakengine.patch.1
blob605914014e96c640067fb2716bf4aca55ce36f69
1 diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp
2 --- icu.org/source/common/dictbe.cpp    2023-06-14 06:23:55.000000000 +0900
3 +++ icu/source/common/dictbe.cpp        2023-06-26 17:43:53.034173100 +0900
4 @@ -35,7 +35,19 @@
5   ******************************************************************
6   */
7  
8 -DictionaryBreakEngine::DictionaryBreakEngine() {
9 +DictionaryBreakEngine::DictionaryBreakEngine()
10 +    : fTypes(0), clusterLimit(0) {
13 +DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes)
14 +    : fTypes(breakTypes), clusterLimit(3) {
15 +    UErrorCode status = U_ZERO_ERROR;
16 +    fViramaSet.applyPattern(UnicodeString(u"[[:ccc=VR:]]"), status);
18 +    // note Skip Sets contain fIgnoreSet characters too.
19 +    fSkipStartSet.applyPattern(UnicodeString(u"[[:lb=OP:][:lb=QU:]\\u200C\\u200D\\u2060]"), status);
20 +    fSkipEndSet.applyPattern(UnicodeString(u"[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]\\u200C\\u200D\\u2060]"), status);
21 +    fNBeforeSet.applyPattern(UnicodeString(u"[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status);
22  }
24  DictionaryBreakEngine::~DictionaryBreakEngine() {
25 @@ -85,6 +97,169 @@
26      fSet.compact();
27  }
29 +bool
30 +DictionaryBreakEngine::scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const {
31 +    UErrorCode status = U_ZERO_ERROR;
32 +    UText* ut = utext_clone(NULL, text, false, true, &status);
33 +    utext_setNativeIndex(ut, start);
34 +    UChar32 c = utext_current32(ut);
35 +    bool res = false;
36 +    doBreak = true;
37 +    while (start >= 0) {
38 +        if (!fSkipStartSet.contains(c)) {
39 +            res = (c == ZWSP);
40 +            break;
41 +        }
42 +        --start;
43 +        c = utext_previous32(ut);
44 +        doBreak = false;
45 +    }
46 +    utext_close(ut);
47 +    return res;
50 +bool
51 +DictionaryBreakEngine::scanAfterEnd(UText *text, int32_t textEnd, int32_t& end, bool &doBreak) const {
52 +    UErrorCode status = U_ZERO_ERROR;
53 +    UText* ut = utext_clone(NULL, text, false, true, &status);
54 +    utext_setNativeIndex(ut, end);
55 +    UChar32 c = utext_current32(ut);
56 +    bool res = false;
57 +    doBreak = !fNBeforeSet.contains(c);
58 +    while (end < textEnd) {
59 +        if (!fSkipEndSet.contains(c)) {
60 +            res = (c == ZWSP);
61 +            break;
62 +        }
63 +        ++end;
64 +        c = utext_next32(ut);
65 +        doBreak = false;
66 +    }
67 +    utext_close(ut);
68 +    return res;
71 +void
72 +DictionaryBreakEngine::scanBackClusters(UText *text, int32_t textStart, int32_t& start) const {
73 +    UChar32 c = 0;
74 +    start = utext_getNativeIndex(text);
75 +    while (start > textStart) {
76 +        c = utext_previous32(text);
77 +        --start;
78 +        if (!fSkipEndSet.contains(c))
79 +            break;
80 +    }
81 +    for (int i = 0; i < clusterLimit; ++i) { // scan backwards clusterLimit clusters
82 +        while (start > textStart) {
83 +            while (fIgnoreSet.contains(c))
84 +                c = utext_previous32(text);
85 +            if (!fMarkSet.contains(c)) {
86 +                if (fBaseSet.contains(c)) {
87 +                    c = utext_previous32(text);
88 +                    if (!fViramaSet.contains(c)) { // Virama (e.g. coeng) preceding base. Treat sequence as a mark
89 +                        utext_next32(text);
90 +                        c = utext_current32(text);
91 +                        break;
92 +                    } else {
93 +                        --start;
94 +                    }
95 +                } else {
96 +                    break;
97 +                }
98 +            }
99 +            c = utext_previous32(text);
100 +            --start;
101 +        }
102 +        if (!fBaseSet.contains(c) || start < textStart) {  // not a cluster start so finish
103 +            break;
104 +        }
105 +        c = utext_previous32(text);
106 +        --start;        // go round again
107 +    }                   // ignore hitting previous inhibitor since scanning for it should have found us!
108 +    ++start;            // counteract --before
111 +void
112 +DictionaryBreakEngine::scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const {
113 +    UChar32 c = utext_current32(text);
114 +    end = utext_getNativeIndex(text);
115 +    while (end < textEnd) {
116 +        if (!fSkipStartSet.contains(c))
117 +            break;
118 +        utext_next32(text);
119 +        c = utext_current32(text);
120 +        ++end;
121 +    }
122 +    for (int i = 0; i < clusterLimit; ++i) { // scan forwards clusterLimit clusters
123 +        while (fIgnoreSet.contains(c)) {
124 +            utext_next32(text);
125 +            c = utext_current32(text);
126 +        }
127 +        if (fBaseSet.contains(c)) {
128 +            while (end < textEnd) {
129 +                utext_next32(text);
130 +                c = utext_current32(text);
131 +                ++end;
132 +                if (!fMarkSet.contains(c))
133 +                    break;
134 +                else if (fViramaSet.contains(c)) {  // handle coeng + base as mark
135 +                    utext_next32(text);
136 +                    c = utext_current32(text);
137 +                    ++end;
138 +                    if (!fBaseSet.contains(c))
139 +                        break;
140 +                }
141 +            }
142 +        } else {
143 +            --end;    // bad char so break after char before it
144 +            break;
145 +        }
146 +    }
149 +bool
150 +DictionaryBreakEngine::scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const {
151 +    UErrorCode status = U_ZERO_ERROR;
152 +    UText* ut = utext_clone(NULL, text, false, true, &status);
153 +    int32_t nat = start;
154 +    utext_setNativeIndex(ut, nat);
155 +    bool foundFirst = true;
156 +    int32_t curr = start;
157 +    while (nat < end) {
158 +        UChar32 c = utext_current32(ut);
159 +        if (c == ZWSP || c == WJ) {
160 +            curr = nat + 1;
161 +            if (foundFirst)     // only scan backwards for first inhibitor
162 +                scanBackClusters(ut, start, before);
163 +            foundFirst = false; // don't scan backwards if we go around again. Also marks found something
165 +            utext_next32(ut);
166 +            scanFwdClusters(ut, end, after);
167 +            nat = after + 1;
169 +            if (c == ZWSP || c == WJ) {  // did we hit another one?
170 +                continue;
171 +            } else {
172 +                break;
173 +            }
174 +        }
176 +        ++nat;                  // keep hunting
177 +        utext_next32(ut);
178 +    }
180 +    utext_close(ut);
182 +    if (nat >= end && foundFirst) {
183 +        start = before = after = nat;
184 +        return false;           // failed to find anything
185 +    }
186 +    else {
187 +        start = curr;
188 +    }
189 +    return true;                // yup hit one
192  /*
193   ******************************************************************
194   * PossibleWord
195 @@ -114,7 +289,7 @@
196      ~PossibleWord() {}
197    
198      // Fill the list of candidates if needed, select the longest, and return the number found
199 -    int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
200 +    int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet = NULL, int32_t minLength = 0 );
201    
202      // Select the currently marked candidate, point after it in the text, and invalidate self
203      int32_t   acceptMarked( UText *text );
204 @@ -135,12 +310,12 @@
205  };
208 -int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
209 +int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet, int32_t minLength) {
210      // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
211      int32_t start = (int32_t)utext_getNativeIndex(text);
212      if (start != offset) {
213          offset = start;
214 -        count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, nullptr, &prefix);
215 +        count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, nullptr, &prefix, ignoreSet, minLength);
216          // Dictionary leaves text after longest prefix, not longest word. Back up.
217          if (count <= 0) {
218              utext_setNativeIndex(text, start);
219 @@ -814,53 +989,30 @@
220   * KhmerBreakEngine
221   */
223 -// How many words in a row are "good enough"?
224 -static const int32_t KHMER_LOOKAHEAD = 3;
226 -// Will not combine a non-word with a preceding dictionary word longer than this
227 -static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
229 -// Will not combine a non-word that shares at least this much prefix with a
230 -// dictionary word, with a preceding word
231 -static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
233 -// Minimum word size
234 -static const int32_t KHMER_MIN_WORD = 2;
236 -// Minimum number of characters for two words
237 -static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
239  KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
240 -    : DictionaryBreakEngine(),
241 +    : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
242        fDictionary(adoptDictionary)
244      UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
245      UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
246 -    UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status);
248 +    clusterLimit = 3;
250 +    UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]\\u2060\\u200C\\u200D]"), status);
251      if (U_SUCCESS(status)) {
252          setCharacters(khmerWordSet);
253      }
254      fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
255 -    fMarkSet.add(0x0020);
256 -    fEndWordSet = khmerWordSet;
257 -    fBeginWordSet.add(0x1780, 0x17B3);
258 -    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
259 -    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
260 -    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
261 -    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
262 -    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
263 -//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
264 -//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
265 -//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
266 -//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
267 -//    fSuffixSet.add(THAI_PAIYANNOI);
268 -//    fSuffixSet.add(THAI_MAIYAMOK);
269 +    fIgnoreSet.add(0x2060);         // WJ
270 +    fIgnoreSet.add(0x200C, 0x200D); // ZWJ, ZWNJ
271 +    fBaseSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:lb=SA:]&[:^M:]]"), status);
272 +    fPuncSet.applyPattern(UnicodeString(u"[\\u17D4\\u17D5\\u17D6\\u17D7\\u17D9:]"), status);
274      // Compact for caching.
275      fMarkSet.compact();
276 -    fEndWordSet.compact();
277 -    fBeginWordSet.compact();
278 -//    fSuffixSet.compact();
279 +    fIgnoreSet.compact();
280 +    fBaseSet.compact();
281 +    fPuncSet.compact();
282      UTRACE_EXIT_STATUS(status);
285 @@ -876,175 +1028,205 @@
286                                                  UBool /* isPhraseBreaking */,
287                                                  UErrorCode& status ) const {
288      if (U_FAILURE(status)) return 0;
289 -    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
290 -        return 0;       // Not enough characters for two words
291 +    uint32_t wordsFound = foundBreaks.size();
292 +    int32_t before = 0;
293 +    int32_t after = 0;
294 +    int32_t finalBefore = 0;
295 +    int32_t initAfter = 0;
296 +    int32_t scanStart = rangeStart;
297 +    int32_t scanEnd = rangeEnd;
299 +    bool startZwsp = false;
300 +    bool breakStart = false;
301 +    bool breakEnd = false;
303 +    if (rangeStart > 0) {
304 +        --scanStart;
305 +        startZwsp = scanBeforeStart(text, scanStart, breakStart);
306      }
308 -    uint32_t wordsFound = 0;
309 -    int32_t cpWordLength = 0;
310 -    int32_t cuWordLength = 0;
311 -    int32_t current;
312 -    PossibleWord words[KHMER_LOOKAHEAD];
314      utext_setNativeIndex(text, rangeStart);
315 +    scanFwdClusters(text, rangeEnd, initAfter);
316 +    bool endZwsp = scanAfterEnd(text, utext_nativeLength(text), scanEnd, breakEnd);
317 +    utext_setNativeIndex(text, rangeEnd - 1);
318 +    scanBackClusters(text, rangeStart, finalBefore);
319 +    if (finalBefore < initAfter) {   // the whole run is tented so no breaks
320 +        if (breakStart || fTypes < UBRK_LINE)
321 +            foundBreaks.push(rangeStart, status);
322 +        if (breakEnd || fTypes < UBRK_LINE)
323 +            foundBreaks.push(rangeEnd, status);
324 +        return foundBreaks.size() - wordsFound;
325 +    }
327 -    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
328 -        cuWordLength = 0;
329 -        cpWordLength = 0;
331 -        // Look for candidate words at the current position
332 -        int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
334 -        // If we found exactly one, use that
335 -        if (candidates == 1) {
336 -            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
337 -            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
338 -            wordsFound += 1;
339 -        }
340 +    scanStart = rangeStart;
341 +    scanWJ(text, scanStart, rangeEnd, before, after);
342 +    if (startZwsp || initAfter >= before) {
343 +        after = initAfter;
344 +        before = 0;
345 +    }
346 +    if (!endZwsp && after > finalBefore && after < rangeEnd)
347 +        endZwsp = true;
348 +    if (endZwsp && before > finalBefore)
349 +        before = finalBefore;
351 -        // If there was more than one, see which one can take us forward the most words
352 -        else if (candidates > 1) {
353 -            // If we're already at the end of the range, we're done
354 -            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
355 -                goto foundBest;
356 -            }
357 -            do {
358 -                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
359 -                    // Followed by another dictionary word; mark first word as a good candidate
360 -                    words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
361 +    utext_setNativeIndex(text, rangeStart);
362 +    int32_t numCodePts = rangeEnd - rangeStart;
363 +    // bestSnlp[i] is the snlp of the best segmentation of the first i
364 +    // code points in the range to be matched.
365 +    UVector32 bestSnlp(numCodePts + 1, status);
366 +    bestSnlp.addElement(0, status);
367 +    for(int32_t i = 1; i <= numCodePts; i++) {
368 +        bestSnlp.addElement(kuint32max, status);
369 +    }
371 -                    // If we're already at the end of the range, we're done
372 -                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
373 -                        goto foundBest;
374 -                    }
375 +    // prev[i] is the index of the last code point in the previous word in
376 +    // the best segmentation of the first i characters. Note negative implies
377 +       // that the code point is part of an unknown word.
378 +    UVector32 prev(numCodePts + 1, status);
379 +    for(int32_t i = 0; i <= numCodePts; i++) {
380 +        prev.addElement(kuint32max, status);
381 +    }
383 -                    // See if any of the possible second words is followed by a third word
384 -                    do {
385 -                        // If we find a third word, stop right away
386 -                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
387 -                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
388 -                            goto foundBest;
389 -                        }
390 -                    }
391 -                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
392 -                }
393 +    const int32_t maxWordSize = 20;
394 +    UVector32 values(maxWordSize, status);
395 +    values.setSize(maxWordSize);
396 +    UVector32 lengths(maxWordSize, status);
397 +    lengths.setSize(maxWordSize);
399 +    // Dynamic programming to find the best segmentation.
401 +    // In outer loop, i  is the code point index,
402 +    //                ix is the corresponding string (code unit) index.
403 +    //    They differ when the string contains supplementary characters.
404 +    int32_t ix = rangeStart;
405 +    for (int32_t i = 0;  i < numCodePts;  ++i, utext_setNativeIndex(text, ++ix)) {
406 +        if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
407 +            continue;
408 +        }
410 +        int32_t count;
411 +        count = fDictionary->matches(text, numCodePts - i, maxWordSize,
412 +                             NULL, lengths.getBuffer(), values.getBuffer(), NULL, &fIgnoreSet, 2);
413 +                             // Note: lengths is filled with code point lengths
414 +                             //       The NULL parameter is the ignored code unit lengths.
416 +        for (int32_t j = 0; j < count; j++) {
417 +            int32_t ln = lengths.elementAti(j);
418 +            if (ln + i >= numCodePts)
419 +                continue;
420 +            utext_setNativeIndex(text, ln+ix);
421 +            int32_t c = utext_current32(text);
422 +            if (fMarkSet.contains(c) || c == 0x17D2) { // Coeng
423 +                lengths.removeElementAt(j);
424 +                values.removeElementAt(j);
425 +                --j;
426 +                --count;
427              }
428 -            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
429 -foundBest:
430 -            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
431 -            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
432 -            wordsFound += 1;
433          }
435 -        // We come here after having either found a word or not. We look ahead to the
436 -        // next word. If it's not a dictionary word, we will combine it with the word we
437 -        // just found (if there is one), but only if the preceding word does not exceed
438 -        // the threshold.
439 -        // The text iterator should now be positioned at the end of the word we found.
440 -        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
441 -            // if it is a dictionary word, do nothing. If it isn't, then if there is
442 -            // no preceding word, or the non-word shares less than the minimum threshold
443 -            // of characters with a dictionary word, then scan to resynchronize
444 -            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
445 -                  && (cuWordLength == 0
446 -                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
447 -                // Look for a plausible word boundary
448 -                int32_t remaining = rangeEnd - (current+cuWordLength);
449 -                UChar32 pc;
450 -                UChar32 uc;
451 -                int32_t chars = 0;
452 -                for (;;) {
453 -                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
454 -                    pc = utext_next32(text);
455 -                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
456 -                    chars += pcSize;
457 -                    remaining -= pcSize;
458 -                    if (remaining <= 0) {
459 +        if (count == 0) {
460 +            utext_setNativeIndex(text, ix);
461 +            int32_t c = utext_current32(text);
462 +            if (fPuncSet.contains(c) || fIgnoreSet.contains(c) || c == ZWSP) {
463 +                values.setElementAt(0, count);
464 +                lengths.setElementAt(1, count++);
465 +            } else if (fBaseSet.contains(c)) {
466 +                int32_t currix = utext_getNativeIndex(text);
467 +                do {
468 +                    utext_next32(text);
469 +                    c = utext_current32(text);
470 +                    if (utext_getNativeIndex(text) >= rangeEnd)
471                          break;
472 -                    }
473 -                    uc = utext_current32(text);
474 -                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
475 -                        // Maybe. See if it's in the dictionary.
476 -                        int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
477 -                        utext_setNativeIndex(text, current+cuWordLength+chars);
478 -                        if (num_candidates > 0) {
479 +                    if (c == 0x17D2) { // Coeng
480 +                        utext_next32(text);
481 +                        c = utext_current32(text);
482 +                        if (!fBaseSet.contains(c) || utext_getNativeIndex(text) >= rangeEnd) {
483                              break;
484 +                        } else {
485 +                            utext_next32(text);
486 +                            c = utext_current32(text);
487 +                            if (utext_getNativeIndex(text) >= rangeEnd)
488 +                                break;
489                          }
490                      }
491 -                }
493 -                // Bump the word count if there wasn't already one
494 -                if (cuWordLength <= 0) {
495 -                    wordsFound += 1;
496 -                }
497 +                } while (fMarkSet.contains(c) || fIgnoreSet.contains(c));
498 +                values.setElementAt(BADSNLP, count);
499 +                lengths.setElementAt(utext_getNativeIndex(text) - currix, count++);
500 +            } else {
501 +                values.setElementAt(BADSNLP, count);
502 +                lengths.setElementAt(1, count++);
503 +            }
504 +        }
506 -                // Update the length with the passed-over characters
507 -                cuWordLength += chars;
508 +        for (int32_t j = 0; j < count; j++) {
509 +            uint32_t v = values.elementAti(j);
510 +            int32_t newSnlp = bestSnlp.elementAti(i) + v;
511 +            int32_t ln = lengths.elementAti(j);
512 +            utext_setNativeIndex(text, ln+ix);
513 +            int32_t c = utext_current32(text);
514 +            while ((fPuncSet.contains(c) || fIgnoreSet.contains(c)) && ln + i < numCodePts) {
515 +                ++ln;
516 +                utext_next32(text);
517 +                c = utext_current32(text);
518              }
519 -            else {
520 -                // Back up to where we were for next iteration
521 -                utext_setNativeIndex(text, current+cuWordLength);
522 +            int32_t ln_j_i = ln + i;   // yes really i!
523 +            if (newSnlp < bestSnlp.elementAti(ln_j_i)) {
524 +                if (v == BADSNLP) {
525 +                    int32_t p = prev.elementAti(i);
526 +                    if (p < 0)
527 +                        prev.setElementAt(p, ln_j_i);
528 +                    else
529 +                        prev.setElementAt(-i, ln_j_i);
530 +                }
531 +                else
532 +                    prev.setElementAt(i, ln_j_i);
533 +                bestSnlp.setElementAt(newSnlp, ln_j_i);
534              }
535          }
537 -        // Never stop before a combining mark.
538 -        int32_t currPos;
539 -        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
540 -            utext_next32(text);
541 -            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
542 +    }
543 +    // Start pushing the optimal offset index into t_boundary (t for tentative).
544 +    // prev[numCodePts] is guaranteed to be meaningful.
545 +    // We'll first push in the reverse order, i.e.,
546 +    // t_boundary[0] = numCodePts, and afterwards do a swap.
547 +    UVector32 t_boundary(numCodePts+1, status);
549 +    int32_t numBreaks = 0;
550 +    // No segmentation found, set boundary to end of range
551 +    while (numCodePts >= 0 && (uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
552 +        --numCodePts;
553 +    }
554 +    if (numCodePts < 0) {
555 +        t_boundary.addElement(numCodePts, status);
556 +        numBreaks++;
557 +    } else {
558 +        for (int32_t i = numCodePts; (uint32_t)i != kuint32max; i = prev.elementAti(i)) {
559 +            if (i < 0) i = -i;
560 +            t_boundary.addElement(i, status);
561 +            numBreaks++;
562          }
563 +        // U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
564 +    }
566 -        // Look ahead for possible suffixes if a dictionary word does not follow.
567 -        // We do this in code rather than using a rule so that the heuristic
568 -        // resynch continues to function. For example, one of the suffix characters
569 -        // could be a typo in the middle of a word.
570 -//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
571 -//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
572 -//                && fSuffixSet.contains(uc = utext_current32(text))) {
573 -//                if (uc == KHMER_PAIYANNOI) {
574 -//                    if (!fSuffixSet.contains(utext_previous32(text))) {
575 -//                        // Skip over previous end and PAIYANNOI
576 -//                        utext_next32(text);
577 -//                        utext_next32(text);
578 -//                        wordLength += 1;            // Add PAIYANNOI to word
579 -//                        uc = utext_current32(text);     // Fetch next character
580 -//                    }
581 -//                    else {
582 -//                        // Restore prior position
583 -//                        utext_next32(text);
584 -//                    }
585 -//                }
586 -//                if (uc == KHMER_MAIYAMOK) {
587 -//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
588 -//                        // Skip over previous end and MAIYAMOK
589 -//                        utext_next32(text);
590 -//                        utext_next32(text);
591 -//                        wordLength += 1;            // Add MAIYAMOK to word
592 -//                    }
593 -//                    else {
594 -//                        // Restore prior position
595 -//                        utext_next32(text);
596 -//                    }
597 -//                }
598 -//            }
599 -//            else {
600 -//                utext_setNativeIndex(text, current+wordLength);
601 -//            }
602 -//        }
604 -        // Did we find a word on this iteration? If so, push it on the break stack
605 -        if (cuWordLength > 0) {
606 -            foundBreaks.push((current+cuWordLength), status);
607 +    // Now that we're done, convert positions in t_boundary[] (indices in
608 +    // the normalized input string) back to indices in the original input UText
609 +    // while reversing t_boundary and pushing values to foundBreaks.
610 +    for (int32_t i = numBreaks-1; i >= 0; i--) {
611 +        int32_t cpPos = t_boundary.elementAti(i);
612 +        if (cpPos == 0 && !breakStart && fTypes >= UBRK_LINE) continue;
613 +        int32_t utextPos = cpPos + rangeStart;
614 +        while (utextPos > after && scanWJ(text, utextPos, scanEnd, before, after));
615 +        if (utextPos < before) {
616 +        // Boundaries are added to foundBreaks output in ascending order.
617 +            U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos);
618 +            foundBreaks.push(utextPos, status);
619          }
620      }
621 -    
623      // Don't return a break for the end of the dictionary range if there is one there.
624 -    if (foundBreaks.peeki() >= rangeEnd) {
625 +    if (!breakEnd && fTypes >= UBRK_LINE && foundBreaks.peeki() >= rangeEnd) {
626          (void) foundBreaks.popi();
627 -        wordsFound -= 1;
628      }
630 -    return wordsFound;
631 +    return foundBreaks.size() - wordsFound;
634  #if !UCONFIG_NO_NORMALIZATION
635 diff -ur icu.org/source/common/dictbe.h icu/source/common/dictbe.h
636 --- icu.org/source/common/dictbe.h      2022-04-08 00:41:55.000000000 +0200
637 +++ icu/source/common/dictbe.h  2022-05-16 13:49:33.820459894 +0200
638 @@ -35,7 +35,8 @@
639   * threads without synchronization.</p>
640   */
641  class DictionaryBreakEngine : public LanguageBreakEngine {
642 - private:
643 + protected:
645      /**
646       * The set of characters handled by this engine
647       * @internal
648 @@ -43,14 +44,84 @@
650    UnicodeSet    fSet;
652 +  const int32_t WJ   = 0x2060;
653 +  const int32_t ZWSP = 0x200B;
655 +  /**
656 +   * The break types it was constructed with
657 +   * @internal
658 +   */
659 +  uint32_t      fTypes;
661 +  /**
662 +   * A Unicode set of all viramas
663 +   * @internal
664 +   */
665 +  UnicodeSet    fViramaSet;
667 +  /**
668 +   * A Unicode set of all base characters
669 +   * @internal
670 +   */
671 +  UnicodeSet    fBaseSet;
673 +  /**
674 +   * A Unicode set of all marks
675 +   * @internal
676 +   */
677 +  UnicodeSet    fMarkSet;
679 +  /**
680 +   * A Unicode set of all characters ignored ignored in dictionary matching
681 +   * @internal
682 +   */
683 +  UnicodeSet    fIgnoreSet;
685 +  /**
686 +   * A Unicode set of all characters ignored ignored in dictionary matching
687 +   * @internal
688 +   */
689 +  UnicodeSet    fSkipStartSet;
691 +  /**
692 +   * A Unicode set of all characters ignored ignored in dictionary matching
693 +   * @internal
694 +   */
695 +  UnicodeSet    fSkipEndSet;
697 +  /**
698 +   * A Unicode set of all characters that should not be broken before
699 +   * @internal
700 +   */
701 +  UnicodeSet    fNBeforeSet;
703 +  /**
704 +   * The number of clusters within which breaks are inhibited
705 +   * @internal
706 +   */
707 +  int32_t clusterLimit;
709 +  bool scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const;
711 +  bool scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const;
712 +  bool scanAfterEnd(UText *text, int32_t rangeEnd, int32_t& end, bool &doBreak) const;
713 +  void scanBackClusters(UText *text, int32_t textStart, int32_t& start) const;
714 +  void scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const;
716   public:
718    /**
719 -   * <p>Constructor </p>
720 +   * <p>Default constructor.</p>
721 +   *
722     */
723    DictionaryBreakEngine();
725    /**
726 +   * <p>Constructor with break types.</p>
727 +   */
728 +  explicit DictionaryBreakEngine(uint32_t breakTypes);
730 +  /**
731     * <p>Virtual destructor.</p>
732     */
733    virtual ~DictionaryBreakEngine();
734 @@ -305,10 +376,12 @@
735       * @internal
736       */
738 -  UnicodeSet                fEndWordSet;
739    UnicodeSet                fBeginWordSet;
740 -  UnicodeSet                fMarkSet;
741 -  DictionaryMatcher  *fDictionary;
742 +  UnicodeSet                fPuncSet;
743 +  DictionaryMatcher        *fDictionary;
745 +  const uint32_t BADSNLP = 256 * 20;
746 +  const uint32_t kuint32max = 0x7FFFFFFF;
748   public:
750 diff -ur icu.org/source/common/dictionarydata.cpp icu/source/common/dictionarydata.cpp
751 --- icu.org/source/common/dictionarydata.cpp    2023-06-14 06:23:55.000000000 +0900
752 +++ icu/source/common/dictionarydata.cpp        2023-06-26 02:18:05.709454400 +0900
753 @@ -44,7 +44,7 @@
755  int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
756                              int32_t *lengths, int32_t *cpLengths, int32_t *values,
757 -                            int32_t *prefix) const {
758 +                            int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {
760      UCharsTrie uct(characters);
761      int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
762 @@ -55,7 +55,13 @@
763          UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
764          int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
765          codePointsMatched += 1;
766 +        if (ignoreSet != NULL && ignoreSet->contains(c)) {
767 +            continue;
768 +        }
769          if (USTRINGTRIE_HAS_VALUE(result)) {
770 +            if (codePointsMatched < minLength) {
771 +                continue;
772 +            }
773              if (wordCount < limit) {
774                  if (values != nullptr) {
775                      values[wordCount] = uct.getValue();
776 @@ -112,7 +118,7 @@
778  int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
779                              int32_t *lengths, int32_t *cpLengths, int32_t *values,
780 -                            int32_t *prefix) const {
781 +                            int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {
782      BytesTrie bt(characters);
783      int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
784      int32_t wordCount = 0;
785 @@ -122,7 +128,13 @@
786          UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
787          int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
788          codePointsMatched += 1;
789 +        if (ignoreSet != NULL && ignoreSet->contains(c)) {
790 +            continue;
791 +        }
792          if (USTRINGTRIE_HAS_VALUE(result)) {
793 +            if (codePointsMatched < minLength) {
794 +                continue;
795 +            }
796              if (wordCount < limit) {
797                  if (values != nullptr) {
798                      values[wordCount] = bt.getValue();
800 diff -ur icu.org/source/common/dictionarydata.h icu/source/common/dictionarydata.h
801 --- icu.org/source/common/dictionarydata.h      2023-06-14 06:23:55.000000000 +0900
802 +++ icu/source/common/dictionarydata.h  2023-06-26 17:43:53.097724900 +0900
803 @@ -21,6 +21,7 @@
804  #include "unicode/utext.h"
805  #include "unicode/udata.h"
806  #include "udataswp.h"
807 +#include "unicode/uniset.h"
808  #include "unicode/uobject.h"
809  #include "unicode/ustringtrie.h"
811 @@ -92,7 +93,7 @@
812       */
813      virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
814                              int32_t *lengths, int32_t *cpLengths, int32_t *values,
815 -                            int32_t *prefix) const = 0;
816 +                            int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const = 0;
818      /** @return DictionaryData::TRIE_TYPE_XYZ */
819      virtual int32_t getType() const = 0;
820 @@ -107,7 +108,7 @@
821      virtual ~UCharsDictionaryMatcher();
822      virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
823                              int32_t *lengths, int32_t *cpLengths, int32_t *values,
824 -                            int32_t *prefix) const override;
825 +                            int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const override;
826      virtual int32_t getType() const override;
827  private:
828      const char16_t *characters;
829 @@ -125,7 +126,7 @@
830      virtual ~BytesDictionaryMatcher();
831      virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
832                              int32_t *lengths, int32_t *cpLengths, int32_t *values,
833 -                            int32_t *prefix) const override;
834 +                            int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const override;
835      virtual int32_t getType() const override;
836  private:
837      UChar32 transform(UChar32 c) const;