Version 5.4.3.2, tag libreoffice-5.4.3.2
[LibreOffice.git] / external / icu / khmerbreakengine.patch
blob8f81f315da3e912c507bcf9a572da71043a82d33
1 diff --git a/source/common/dictbe.cpp b/source/common/dictbe.cpp
2 index f1c874d..3ad1b3f 100644
3 --- misc/icu/source/common/dictbe.cpp
4 +++ build/icu/source/common/dictbe.cpp
5 @@ -27,8 +27,17 @@ U_NAMESPACE_BEGIN
6 ******************************************************************
7 */
9 -DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
10 +DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) :
11 + clusterLimit(3)
13 + UErrorCode status = U_ZERO_ERROR;
14 fTypes = breakTypes;
15 + fViramaSet.applyPattern(UNICODE_STRING_SIMPLE("[[:ccc=VR:]]"), status);
17 + // note Skip Sets contain fIgnoreSet characters too.
18 + fSkipStartSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=OP:][:lb=QU:]\\u200C\\u200D\\u2060]"), status);
19 + fSkipEndSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]\\u200C\\u200D\\u2060]"), status);
20 + fNBeforeSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status);
23 DictionaryBreakEngine::~DictionaryBreakEngine() {
24 @@ -90,7 +99,7 @@ DictionaryBreakEngine::findBreaks( UText *text,
25 result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
26 utext_setNativeIndex(text, current);
30 return result;
33 @@ -101,6 +110,169 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
34 fSet.compact();
37 +bool
38 +DictionaryBreakEngine::scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const {
39 + UErrorCode status = U_ZERO_ERROR;
40 + UText* ut = utext_clone(NULL, text, false, true, &status);
41 + utext_setNativeIndex(ut, start);
42 + UChar32 c = utext_current32(ut);
43 + bool res = false;
44 + doBreak = true;
45 + while (start >= 0) {
46 + if (!fSkipStartSet.contains(c)) {
47 + res = (c == ZWSP);
48 + break;
49 + }
50 + --start;
51 + c = utext_previous32(ut);
52 + doBreak = false;
53 + }
54 + utext_close(ut);
55 + return res;
58 +bool
59 +DictionaryBreakEngine::scanAfterEnd(UText *text, int32_t textEnd, int32_t& end, bool &doBreak) const {
60 + UErrorCode status = U_ZERO_ERROR;
61 + UText* ut = utext_clone(NULL, text, false, true, &status);
62 + utext_setNativeIndex(ut, end);
63 + UChar32 c = utext_current32(ut);
64 + bool res = false;
65 + doBreak = !fNBeforeSet.contains(c);
66 + while (end < textEnd) {
67 + if (!fSkipEndSet.contains(c)) {
68 + res = (c == ZWSP);
69 + break;
70 + }
71 + ++end;
72 + c = utext_next32(ut);
73 + doBreak = false;
74 + }
75 + utext_close(ut);
76 + return res;
79 +void
80 +DictionaryBreakEngine::scanBackClusters(UText *text, int32_t textStart, int32_t& start) const {
81 + UChar32 c = 0;
82 + start = utext_getNativeIndex(text);
83 + while (start > textStart) {
84 + c = utext_previous32(text);
85 + --start;
86 + if (!fSkipEndSet.contains(c))
87 + break;
88 + }
89 + for (int i = 0; i < clusterLimit; ++i) { // scan backwards clusterLimit clusters
90 + while (start > textStart) {
91 + while (fIgnoreSet.contains(c))
92 + c = utext_previous32(text);
93 + if (!fMarkSet.contains(c)) {
94 + if (fBaseSet.contains(c)) {
95 + c = utext_previous32(text);
96 + if (!fViramaSet.contains(c)) { // Virama (e.g. coeng) preceding base. Treat sequence as a mark
97 + utext_next32(text);
98 + c = utext_current32(text);
99 + break;
100 + } else {
101 + --start;
103 + } else {
104 + break;
107 + c = utext_previous32(text);
108 + --start;
110 + if (!fBaseSet.contains(c) || start < textStart) { // not a cluster start so finish
111 + break;
113 + c = utext_previous32(text);
114 + --start; // go round again
115 + } // ignore hitting previous inhibitor since scanning for it should have found us!
116 + ++start; // counteract --before
119 +void
120 +DictionaryBreakEngine::scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const {
121 + UChar32 c = utext_current32(text);
122 + end = utext_getNativeIndex(text);
123 + while (end < textEnd) {
124 + if (!fSkipStartSet.contains(c))
125 + break;
126 + utext_next32(text);
127 + c = utext_current32(text);
128 + ++end;
130 + for (int i = 0; i < clusterLimit; ++i) { // scan forwards clusterLimit clusters
131 + while (fIgnoreSet.contains(c)) {
132 + utext_next32(text);
133 + c = utext_current32(text);
135 + if (fBaseSet.contains(c)) {
136 + while (end < textEnd) {
137 + utext_next32(text);
138 + c = utext_current32(text);
139 + ++end;
140 + if (!fMarkSet.contains(c))
141 + break;
142 + else if (fViramaSet.contains(c)) { // handle coeng + base as mark
143 + utext_next32(text);
144 + c = utext_current32(text);
145 + ++end;
146 + if (!fBaseSet.contains(c))
147 + break;
150 + } else {
151 + --end; // bad char so break after char before it
152 + break;
157 +bool
158 +DictionaryBreakEngine::scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const {
159 + UErrorCode status = U_ZERO_ERROR;
160 + UText* ut = utext_clone(NULL, text, false, true, &status);
161 + int32_t nat = start;
162 + utext_setNativeIndex(ut, nat);
163 + bool foundFirst = true;
164 + int32_t curr = start;
165 + while (nat < end) {
166 + UChar32 c = utext_current32(ut);
167 + if (c == ZWSP || c == WJ) {
168 + curr = nat + 1;
169 + if (foundFirst) // only scan backwards for first inhibitor
170 + scanBackClusters(ut, start, before);
171 + foundFirst = false; // don't scan backwards if we go around again. Also marks found something
173 + utext_next32(ut);
174 + scanFwdClusters(ut, end, after);
175 + nat = after + 1;
177 + if (c == ZWSP || c == WJ) { // did we hit another one?
178 + continue;
179 + } else {
180 + break;
184 + ++nat; // keep hunting
185 + utext_next32(ut);
188 + utext_close(ut);
190 + if (nat >= end && foundFirst) {
191 + start = before = after = nat;
192 + return false; // failed to find anything
194 + else {
195 + start = curr;
197 + return true; // yup hit one
201 ******************************************************************
202 * PossibleWord
203 @@ -128,35 +302,35 @@ private:
204 public:
205 PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
206 ~PossibleWord() {};
209 // Fill the list of candidates if needed, select the longest, and return the number found
210 - int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
212 + int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet = NULL, int32_t minLength = 0 );
214 // Select the currently marked candidate, point after it in the text, and invalidate self
215 int32_t acceptMarked( UText *text );
218 // Back up from the current candidate to the next shorter one; return TRUE if that exists
219 // and point the text after it
220 UBool backUp( UText *text );
223 // Return the longest prefix this candidate location shares with a dictionary word
224 // Return value is in code points.
225 int32_t longestPrefix() { return prefix; };
228 // Mark the current candidate as the one we like
229 void markCurrent() { mark = current; };
232 // Get length in code points of the marked word.
233 int32_t markedCPLength() { return cpLengths[mark]; };
237 -int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
238 +int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet, int32_t minLength) {
239 // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
240 int32_t start = (int32_t)utext_getNativeIndex(text);
241 if (start != offset) {
242 offset = start;
243 - count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
244 + count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix, ignoreSet, minLength);
245 // Dictionary leaves text after longest prefix, not longest word. Back up.
246 if (count <= 0) {
247 utext_setNativeIndex(text, start);
248 @@ -828,51 +1002,28 @@ foundBest:
249 * KhmerBreakEngine
252 -// How many words in a row are "good enough"?
253 -static const int32_t KHMER_LOOKAHEAD = 3;
255 -// Will not combine a non-word with a preceding dictionary word longer than this
256 -static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
258 -// Will not combine a non-word that shares at least this much prefix with a
259 -// dictionary word, with a preceding word
260 -static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
262 -// Minimum word size
263 -static const int32_t KHMER_MIN_WORD = 2;
265 -// Minimum number of characters for two words
266 -static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
268 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
269 : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
270 fDictionary(adoptDictionary)
272 - fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
274 + clusterLimit = 3;
276 + fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]\\u2060\\u200C\\u200D]"), status);
277 if (U_SUCCESS(status)) {
278 setCharacters(fKhmerWordSet);
280 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
281 - fMarkSet.add(0x0020);
282 - fEndWordSet = fKhmerWordSet;
283 - fBeginWordSet.add(0x1780, 0x17B3);
284 - //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels
285 - //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word
286 - //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word
287 - fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters
288 - //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels
289 -// fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
290 -// fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
291 -// fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
292 -// fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
293 -// fSuffixSet.add(THAI_PAIYANNOI);
294 -// fSuffixSet.add(THAI_MAIYAMOK);
295 + fIgnoreSet.add(0x2060); // WJ
296 + fIgnoreSet.add(0x200C, 0x200D); // ZWNJ, ZWJ
297 + fBaseSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:lb=SA:]&[:^M:]]"), status);
298 + fPuncSet.applyPattern(UNICODE_STRING_SIMPLE("[\\u17D4\\u17D5\\u17D6\\u17D7\\u17D9:]"), status);
300 // Compact for caching.
301 fMarkSet.compact();
302 - fEndWordSet.compact();
303 - fBeginWordSet.compact();
304 -// fSuffixSet.compact();
305 + fIgnoreSet.compact();
306 + fBaseSet.compact();
307 + fPuncSet.compact();
310 KhmerBreakEngine::~KhmerBreakEngine() {
311 @@ -884,180 +1036,204 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
312 int32_t rangeStart,
313 int32_t rangeEnd,
314 UStack &foundBreaks ) const {
315 - if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
316 - return 0; // Not enough characters for two words
317 + uint32_t wordsFound = foundBreaks.size();
318 + UErrorCode status = U_ZERO_ERROR;
319 + int32_t before = 0;
320 + int32_t after = 0;
321 + int32_t finalBefore = 0;
322 + int32_t initAfter = 0;
323 + int32_t scanStart = rangeStart;
324 + int32_t scanEnd = rangeEnd;
326 + bool startZwsp = false;
327 + bool breakStart = false;
328 + bool breakEnd = false;
330 + if (rangeStart > 0) {
331 + --scanStart;
332 + startZwsp = scanBeforeStart(text, scanStart, breakStart);
334 + utext_setNativeIndex(text, rangeStart);
335 + scanFwdClusters(text, rangeEnd, initAfter);
336 + bool endZwsp = scanAfterEnd(text, utext_nativeLength(text), scanEnd, breakEnd);
337 + utext_setNativeIndex(text, rangeEnd - 1);
338 + scanBackClusters(text, rangeStart, finalBefore);
339 + if (finalBefore < initAfter) { // the whole run is tented so no breaks
340 + if (breakStart || fTypes < UBRK_LINE)
341 + foundBreaks.push(rangeStart, status);
342 + if (breakEnd || fTypes < UBRK_LINE)
343 + foundBreaks.push(rangeEnd, status);
344 + return foundBreaks.size() - wordsFound;
347 - uint32_t wordsFound = 0;
348 - int32_t cpWordLength = 0;
349 - int32_t cuWordLength = 0;
350 - int32_t current;
351 - UErrorCode status = U_ZERO_ERROR;
352 - PossibleWord words[KHMER_LOOKAHEAD];
353 + scanStart = rangeStart;
354 + scanWJ(text, scanStart, rangeEnd, before, after);
355 + if (startZwsp || initAfter >= before) {
356 + after = initAfter;
357 + before = 0;
359 + if (!endZwsp && after > finalBefore && after < rangeEnd)
360 + endZwsp = true;
361 + if (endZwsp && before > finalBefore)
362 + before = finalBefore;
364 utext_setNativeIndex(text, rangeStart);
365 + int32_t numCodePts = rangeEnd - rangeStart;
366 + // bestSnlp[i] is the snlp of the best segmentation of the first i
367 + // code points in the range to be matched.
368 + UVector32 bestSnlp(numCodePts + 1, status);
369 + bestSnlp.addElement(0, status);
370 + for(int32_t i = 1; i <= numCodePts; i++) {
371 + bestSnlp.addElement(kuint32max, status);
374 - while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
375 - cuWordLength = 0;
376 - cpWordLength = 0;
377 + // prev[i] is the index of the last code point in the previous word in
378 + // the best segmentation of the first i characters. Note negative implies
379 + // that the code point is part of an unknown word.
380 + UVector32 prev(numCodePts + 1, status);
381 + for(int32_t i = 0; i <= numCodePts; i++) {
382 + prev.addElement(kuint32max, status);
385 - // Look for candidate words at the current position
386 - int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
387 + const int32_t maxWordSize = 20;
388 + UVector32 values(maxWordSize, status);
389 + values.setSize(maxWordSize);
390 + UVector32 lengths(maxWordSize, status);
391 + lengths.setSize(maxWordSize);
393 - // If we found exactly one, use that
394 - if (candidates == 1) {
395 - cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
396 - cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
397 - wordsFound += 1;
399 + // Dynamic programming to find the best segmentation.
401 - // If there was more than one, see which one can take us forward the most words
402 - else if (candidates > 1) {
403 - // If we're already at the end of the range, we're done
404 - if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
405 - goto foundBest;
407 - do {
408 - int32_t wordsMatched = 1;
409 - if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
410 - if (wordsMatched < 2) {
411 - // Followed by another dictionary word; mark first word as a good candidate
412 - words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
413 - wordsMatched = 2;
415 + // In outer loop, i is the code point index,
416 + // ix is the corresponding string (code unit) index.
417 + // They differ when the string contains supplementary characters.
418 + int32_t ix = rangeStart;
419 + for (int32_t i = 0; i < numCodePts; ++i, utext_setNativeIndex(text, ++ix)) {
420 + if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
421 + continue;
424 - // If we're already at the end of the range, we're done
425 - if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
426 - goto foundBest;
428 + int32_t count;
429 + count = fDictionary->matches(text, numCodePts - i, maxWordSize,
430 + NULL, lengths.getBuffer(), values.getBuffer(), NULL, &fIgnoreSet, 2);
431 + // Note: lengths is filled with code point lengths
432 + // The NULL parameter is the ignored code unit lengths.
434 - // See if any of the possible second words is followed by a third word
435 - do {
436 - // If we find a third word, stop right away
437 - if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
438 - words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
439 - goto foundBest;
442 - while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
444 + for (int32_t j = 0; j < count; j++) {
445 + int32_t ln = lengths.elementAti(j);
446 + if (ln + i >= numCodePts)
447 + continue;
448 + utext_setNativeIndex(text, ln+ix);
449 + int32_t c = utext_current32(text);
450 + if (fMarkSet.contains(c) || c == 0x17D2) { // Coeng
451 + lengths.removeElementAt(j);
452 + values.removeElementAt(j);
453 + --j;
454 + --count;
456 - while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
457 -foundBest:
458 - cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
459 - cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
460 - wordsFound += 1;
463 - // We come here after having either found a word or not. We look ahead to the
464 - // next word. If it's not a dictionary word, we will combine it with the word we
465 - // just found (if there is one), but only if the preceding word does not exceed
466 - // the threshold.
467 - // The text iterator should now be positioned at the end of the word we found.
468 - if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
469 - // if it is a dictionary word, do nothing. If it isn't, then if there is
470 - // no preceding word, or the non-word shares less than the minimum threshold
471 - // of characters with a dictionary word, then scan to resynchronize
472 - if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
473 - && (cuWordLength == 0
474 - || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
475 - // Look for a plausible word boundary
476 - int32_t remaining = rangeEnd - (current+cuWordLength);
477 - UChar32 pc;
478 - UChar32 uc;
479 - int32_t chars = 0;
480 - for (;;) {
481 - int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
482 - pc = utext_next32(text);
483 - int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
484 - chars += pcSize;
485 - remaining -= pcSize;
486 - if (remaining <= 0) {
487 + if (count == 0) {
488 + utext_setNativeIndex(text, ix);
489 + int32_t c = utext_current32(text);
490 + if (fPuncSet.contains(c) || fIgnoreSet.contains(c) || c == ZWSP) {
491 + values.setElementAt(0, count);
492 + lengths.setElementAt(1, count++);
493 + } else if (fBaseSet.contains(c)) {
494 + int32_t currix = utext_getNativeIndex(text);
495 + do {
496 + utext_next32(text);
497 + c = utext_current32(text);
498 + if (utext_getNativeIndex(text) >= rangeEnd)
499 break;
501 - uc = utext_current32(text);
502 - if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
503 - // Maybe. See if it's in the dictionary.
504 - int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
505 - utext_setNativeIndex(text, current+cuWordLength+chars);
506 - if (candidates > 0) {
507 + if (c == 0x17D2) { // Coeng
508 + utext_next32(text);
509 + c = utext_current32(text);
510 + if (!fBaseSet.contains(c) || utext_getNativeIndex(text) >= rangeEnd) {
511 break;
512 + } else {
513 + utext_next32(text);
514 + c = utext_current32(text);
515 + if (utext_getNativeIndex(text) >= rangeEnd)
516 + break;
521 - // Bump the word count if there wasn't already one
522 - if (cuWordLength <= 0) {
523 - wordsFound += 1;
525 + } while (fMarkSet.contains(c) || fIgnoreSet.contains(c));
526 + values.setElementAt(BADSNLP, count);
527 + lengths.setElementAt(utext_getNativeIndex(text) - currix, count++);
528 + } else {
529 + values.setElementAt(BADSNLP, count);
530 + lengths.setElementAt(1, count++);
534 - // Update the length with the passed-over characters
535 - cuWordLength += chars;
536 + for (int32_t j = 0; j < count; j++) {
537 + uint32_t v = values.elementAti(j);
538 + int32_t newSnlp = bestSnlp.elementAti(i) + v;
539 + int32_t ln = lengths.elementAti(j);
540 + utext_setNativeIndex(text, ln+ix);
541 + int32_t c = utext_current32(text);
542 + while ((fPuncSet.contains(c) || fIgnoreSet.contains(c)) && ln + i < numCodePts) {
543 + ++ln;
544 + utext_next32(text);
545 + c = utext_current32(text);
547 - else {
548 - // Back up to where we were for next iteration
549 - utext_setNativeIndex(text, current+cuWordLength);
550 + int32_t ln_j_i = ln + i; // yes really i!
551 + if (newSnlp < bestSnlp.elementAti(ln_j_i)) {
552 + if (v == BADSNLP) {
553 + int32_t p = prev.elementAti(i);
554 + if (p < 0)
555 + prev.setElementAt(p, ln_j_i);
556 + else
557 + prev.setElementAt(-i, ln_j_i);
559 + else
560 + prev.setElementAt(i, ln_j_i);
561 + bestSnlp.setElementAt(newSnlp, ln_j_i);
565 + // Start pushing the optimal offset index into t_boundary (t for tentative).
566 + // prev[numCodePts] is guaranteed to be meaningful.
567 + // We'll first push in the reverse order, i.e.,
568 + // t_boundary[0] = numCodePts, and afterwards do a swap.
569 + UVector32 t_boundary(numCodePts+1, status);
571 - // Never stop before a combining mark.
572 - int32_t currPos;
573 - while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
574 - utext_next32(text);
575 - cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
576 + int32_t numBreaks = 0;
577 + // No segmentation found, set boundary to end of range
578 + while (numCodePts >= 0 && (uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
579 + --numCodePts;
581 + if (numCodePts < 0) {
582 + t_boundary.addElement(numCodePts, status);
583 + numBreaks++;
584 + } else {
585 + for (int32_t i = numCodePts; (uint32_t)i != kuint32max; i = prev.elementAti(i)) {
586 + if (i < 0) i = -i;
587 + t_boundary.addElement(i, status);
588 + numBreaks++;
590 + U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
593 - // Look ahead for possible suffixes if a dictionary word does not follow.
594 - // We do this in code rather than using a rule so that the heuristic
595 - // resynch continues to function. For example, one of the suffix characters
596 - // could be a typo in the middle of a word.
597 -// if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
598 -// if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
599 -// && fSuffixSet.contains(uc = utext_current32(text))) {
600 -// if (uc == KHMER_PAIYANNOI) {
601 -// if (!fSuffixSet.contains(utext_previous32(text))) {
602 -// // Skip over previous end and PAIYANNOI
603 -// utext_next32(text);
604 -// utext_next32(text);
605 -// wordLength += 1; // Add PAIYANNOI to word
606 -// uc = utext_current32(text); // Fetch next character
607 -// }
608 -// else {
609 -// // Restore prior position
610 -// utext_next32(text);
611 -// }
612 -// }
613 -// if (uc == KHMER_MAIYAMOK) {
614 -// if (utext_previous32(text) != KHMER_MAIYAMOK) {
615 -// // Skip over previous end and MAIYAMOK
616 -// utext_next32(text);
617 -// utext_next32(text);
618 -// wordLength += 1; // Add MAIYAMOK to word
619 -// }
620 -// else {
621 -// // Restore prior position
622 -// utext_next32(text);
623 -// }
624 -// }
625 -// }
626 -// else {
627 -// utext_setNativeIndex(text, current+wordLength);
628 -// }
629 -// }
631 - // Did we find a word on this iteration? If so, push it on the break stack
632 - if (cuWordLength > 0) {
633 - foundBreaks.push((current+cuWordLength), status);
634 + // Now that we're done, convert positions in t_boundary[] (indices in
635 + // the normalized input string) back to indices in the original input UText
636 + // while reversing t_boundary and pushing values to foundBreaks.
637 + for (int32_t i = numBreaks-1; i >= 0; i--) {
638 + int32_t cpPos = t_boundary.elementAti(i);
639 + if (cpPos == 0 && !breakStart && fTypes >= UBRK_LINE) continue;
640 + int32_t utextPos = cpPos + rangeStart;
641 + while (utextPos > after && scanWJ(text, utextPos, scanEnd, before, after));
642 + if (utextPos < before) {
643 + // Boundaries are added to foundBreaks output in ascending order.
644 + U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos);
645 + foundBreaks.push(utextPos, status);
650 // Don't return a break for the end of the dictionary range if there is one there.
651 - if (foundBreaks.peeki() >= rangeEnd) {
652 + if (!breakEnd && fTypes >= UBRK_LINE && foundBreaks.peeki() >= rangeEnd) {
653 (void) foundBreaks.popi();
654 - wordsFound -= 1;
657 - return wordsFound;
658 + return foundBreaks.size() - wordsFound;
661 #if !UCONFIG_NO_NORMALIZATION
662 diff --git a/source/common/dictbe.h b/source/common/dictbe.h
663 index d3488cd..26caa75 100644
664 --- misc/icu/source/common/dictbe.h
665 +++ build/icu/source/common/dictbe.h
666 @@ -32,6 +32,15 @@ class Normalizer2;
668 class DictionaryBreakEngine : public LanguageBreakEngine {
669 private:
671 + /**
672 + * <p>Default constructor.</p>
674 + */
675 + DictionaryBreakEngine();
677 + protected:
680 * The set of characters handled by this engine
681 * @internal
682 @@ -46,11 +55,63 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
684 uint32_t fTypes;
686 + const int32_t WJ = 0x2060;
687 + const int32_t ZWSP = 0x200B;
690 - * <p>Default constructor.</p>
692 + * A Unicode set of all viramas
693 + * @internal
695 - DictionaryBreakEngine();
696 + UnicodeSet fViramaSet;
698 + /**
699 + * A Unicode set of all base characters
700 + * @internal
701 + */
702 + UnicodeSet fBaseSet;
704 + /**
705 + * A Unicode set of all marks
706 + * @internal
707 + */
708 + UnicodeSet fMarkSet;
710 + /**
711 + * A Unicode set of all characters ignored in dictionary matching
712 + * @internal
713 + */
714 + UnicodeSet fIgnoreSet;
716 + /**
717 + * A Unicode set of characters skipped over at the start of a run (lb=OP, lb=QU, ZWNJ, ZWJ, WJ)
718 + * @internal
719 + */
720 + UnicodeSet fSkipStartSet;
722 + /**
723 + * A Unicode set of characters skipped over at the end of a run (lb=CP, lb=QU, lb=EX, lb=CL, ZWNJ, ZWJ, WJ)
724 + * @internal
725 + */
726 + UnicodeSet fSkipEndSet;
728 + /**
729 + * A Unicode set of all characters that should not be broken before
730 + * @internal
731 + */
732 + UnicodeSet fNBeforeSet;
734 + /**
735 + * The number of clusters within which breaks are inhibited
736 + * @internal
737 + */
738 + int32_t clusterLimit;
740 + bool scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const;
742 + bool scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const;
743 + bool scanAfterEnd(UText *text, int32_t rangeEnd, int32_t& end, bool &doBreak) const;
744 + void scanBackClusters(UText *text, int32_t textStart, int32_t& start) const;
745 + void scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const;
747 public:
749 @@ -81,7 +142,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
750 * <p>Find any breaks within a run in the supplied text.</p>
752 * @param text A UText representing the text. The iterator is left at
753 - * the end of the run of characters which the engine is capable of handling
754 + * the end of the run of characters which the engine is capable of handling
755 * that starts from the first (or last) character in the range.
756 * @param startPos The start of the run within the supplied text.
757 * @param endPos The end of the run within the supplied text.
758 @@ -243,118 +304,120 @@ class LaoBreakEngine : public DictionaryBreakEngine {
762 -/*******************************************************************
763 - * BurmeseBreakEngine
764 - */
766 -/**
767 - * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
768 - * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
769 - *
770 - * <p>After it is constructed a BurmeseBreakEngine may be shared between
771 - * threads without synchronization.</p>
772 - */
773 -class BurmeseBreakEngine : public DictionaryBreakEngine {
774 - private:
775 - /**
776 - * The set of characters handled by this engine
777 - * @internal
778 - */
780 - UnicodeSet fBurmeseWordSet;
781 - UnicodeSet fEndWordSet;
782 - UnicodeSet fBeginWordSet;
783 - UnicodeSet fMarkSet;
784 - DictionaryMatcher *fDictionary;
786 - public:
788 - /**
789 - * <p>Default constructor.</p>
790 - *
791 - * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
792 - * engine is deleted.
793 - */
794 - BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
796 - /**
797 - * <p>Virtual destructor.</p>
798 - */
799 - virtual ~BurmeseBreakEngine();
801 - protected:
802 - /**
803 - * <p>Divide up a range of known dictionary characters.</p>
804 - *
805 - * @param text A UText representing the text
806 - * @param rangeStart The start of the range of dictionary characters
807 - * @param rangeEnd The end of the range of dictionary characters
808 - * @param foundBreaks Output of C array of int32_t break positions, or 0
809 - * @return The number of breaks found
810 - */
811 - virtual int32_t divideUpDictionaryRange( UText *text,
812 - int32_t rangeStart,
813 - int32_t rangeEnd,
814 - UStack &foundBreaks ) const;
816 -};
818 -/*******************************************************************
819 - * KhmerBreakEngine
820 - */
822 -/**
823 - * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
824 - * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
825 - *
826 - * <p>After it is constructed a KhmerBreakEngine may be shared between
827 - * threads without synchronization.</p>
828 - */
829 -class KhmerBreakEngine : public DictionaryBreakEngine {
830 - private:
831 - /**
832 - * The set of characters handled by this engine
833 - * @internal
834 - */
836 - UnicodeSet fKhmerWordSet;
837 - UnicodeSet fEndWordSet;
838 - UnicodeSet fBeginWordSet;
839 - UnicodeSet fMarkSet;
840 - DictionaryMatcher *fDictionary;
842 - public:
844 - /**
845 - * <p>Default constructor.</p>
846 - *
847 - * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
848 - * engine is deleted.
849 - */
850 - KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
852 - /**
853 - * <p>Virtual destructor.</p>
854 - */
855 - virtual ~KhmerBreakEngine();
857 - protected:
858 - /**
859 - * <p>Divide up a range of known dictionary characters.</p>
860 - *
861 - * @param text A UText representing the text
862 - * @param rangeStart The start of the range of dictionary characters
863 - * @param rangeEnd The end of the range of dictionary characters
864 - * @param foundBreaks Output of C array of int32_t break positions, or 0
865 - * @return The number of breaks found
866 - */
867 - virtual int32_t divideUpDictionaryRange( UText *text,
868 - int32_t rangeStart,
869 - int32_t rangeEnd,
870 - UStack &foundBreaks ) const;
872 -};
874 +/*******************************************************************
875 + * BurmeseBreakEngine
876 + */
878 +/**
879 + * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
880 + * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
882 + * <p>After it is constructed a BurmeseBreakEngine may be shared between
883 + * threads without synchronization.</p>
884 + */
885 +class BurmeseBreakEngine : public DictionaryBreakEngine {
886 + private:
887 + /**
888 + * The set of characters handled by this engine
889 + * @internal
890 + */
892 + UnicodeSet fBurmeseWordSet;
893 + UnicodeSet fEndWordSet;
894 + UnicodeSet fBeginWordSet;
895 + UnicodeSet fMarkSet;
896 + DictionaryMatcher *fDictionary;
898 + public:
900 + /**
901 + * <p>Default constructor.</p>
903 + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
904 + * engine is deleted.
905 + */
906 + BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
908 + /**
909 + * <p>Virtual destructor.</p>
910 + */
911 + virtual ~BurmeseBreakEngine();
913 + protected:
914 + /**
915 + * <p>Divide up a range of known dictionary characters.</p>
917 + * @param text A UText representing the text
918 + * @param rangeStart The start of the range of dictionary characters
919 + * @param rangeEnd The end of the range of dictionary characters
920 + * @param foundBreaks Output of C array of int32_t break positions, or 0
921 + * @return The number of breaks found
922 + */
923 + virtual int32_t divideUpDictionaryRange( UText *text,
924 + int32_t rangeStart,
925 + int32_t rangeEnd,
926 + UStack &foundBreaks ) const;
930 +/*******************************************************************
931 + * KhmerBreakEngine
932 + */
934 +/**
935 + * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
936 + * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
938 + * <p>After it is constructed a KhmerBreakEngine may be shared between
939 + * threads without synchronization.</p>
940 + */
941 +class KhmerBreakEngine : public DictionaryBreakEngine {
942 + private:
943 + /**
944 + * The character sets and the dictionary matcher held by this engine
945 + * @internal
946 + */
948 + UnicodeSet fKhmerWordSet;
949 + UnicodeSet fBeginWordSet;
950 + UnicodeSet fPuncSet;
951 + DictionaryMatcher *fDictionary;
953 + const uint32_t BADSNLP = 256 * 20; // = 5120; NOTE(review): meaning not documented here - presumably a penalty score; confirm
954 + const uint32_t kuint32max = 0x7FFFFFFF; // NOTE(review): despite the name this is the signed 32-bit maximum, not 0xFFFFFFFF - confirm intended
956 + public:
958 + /**
959 + * <p>Constructs an engine that adopts the given dictionary.</p>
961 + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
962 + * @param status ICU error code, passed by reference - presumably set on failure; TODO confirm.
963 + */
964 + KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
966 + /**
967 + * <p>Virtual destructor.</p>
968 + */
969 + virtual ~KhmerBreakEngine();
971 + protected:
972 + /**
973 + * <p>Divide up a range of known dictionary characters.</p>
975 + * @param text A UText representing the text
976 + * @param rangeStart The start of the range of dictionary characters
977 + * @param rangeEnd The end of the range of dictionary characters
978 + * @param foundBreaks Output: UStack of int32_t break positions (a reference, so never 0)
979 + * @return The number of breaks found
980 + */
981 + virtual int32_t divideUpDictionaryRange( UText *text,
982 + int32_t rangeStart,
983 + int32_t rangeEnd,
984 + UStack &foundBreaks ) const;
988 #if !UCONFIG_NO_NORMALIZATION
990 /*******************************************************************
991 diff --git a/source/common/dictionarydata.cpp b/source/common/dictionarydata.cpp
992 index cb594c6..82f2e77 100644
993 --- misc/icu/source/common/dictionarydata.cpp
994 +++ build/icu/source/common/dictionarydata.cpp
995 @@ -42,7 +42,7 @@ int32_t UCharsDictionaryMatcher::getType() const {
997 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
998 int32_t *lengths, int32_t *cpLengths, int32_t *values,
999 - int32_t *prefix) const {
1000 + int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {
1002 UCharsTrie uct(characters);
1003 int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
1004 @@ -53,7 +53,13 @@ int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t
1005 UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
1006 int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
1007 codePointsMatched += 1;
1008 + if (ignoreSet != NULL && ignoreSet->contains(c)) { // NOTE(review): c was already passed to uct.first()/uct.next() and counted in codePointsMatched above - confirm an ignored character is meant to advance the trie
1009 + continue;
1011 if (USTRINGTRIE_HAS_VALUE(result)) {
1012 + if (codePointsMatched < minLength) { // do not record a word that ends before minLength code points have been consumed
1013 + continue;
1015 if (wordCount < limit) {
1016 if (values != NULL) {
1017 values[wordCount] = uct.getValue();
1018 @@ -110,7 +116,7 @@ int32_t BytesDictionaryMatcher::getType() const {
1020 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
1021 int32_t *lengths, int32_t *cpLengths, int32_t *values,
1022 - int32_t *prefix) const {
1023 + int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {
1024 BytesTrie bt(characters);
1025 int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
1026 int32_t wordCount = 0;
1027 @@ -120,7 +126,13 @@ int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t
1028 UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
1029 int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
1030 codePointsMatched += 1;
1031 + if (ignoreSet != NULL && ignoreSet->contains(c)) { // NOTE(review): transform(c) was already passed to bt.first()/bt.next() and counted in codePointsMatched above - confirm an ignored character is meant to advance the trie
1032 + continue;
1034 if (USTRINGTRIE_HAS_VALUE(result)) {
1035 + if (codePointsMatched < minLength) { // do not record a word that ends before minLength code points have been consumed
1036 + continue;
1038 if (wordCount < limit) {
1039 if (values != NULL) {
1040 values[wordCount] = bt.getValue();
1041 diff --git a/source/common/dictionarydata.h b/source/common/dictionarydata.h
1042 index 0216ab0..ee9e571 100644
1043 --- misc/icu/source/common/dictionarydata.h
1044 +++ build/icu/source/common/dictionarydata.h
1045 @@ -19,6 +19,7 @@
1046 #include "unicode/utext.h"
1047 #include "unicode/udata.h"
1048 #include "udataswp.h"
1049 +#include "unicode/uniset.h"
1050 #include "unicode/uobject.h"
1051 #include "unicode/ustringtrie.h"
1053 @@ -90,7 +91,7 @@ public:
1055 virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
1056 int32_t *lengths, int32_t *cpLengths, int32_t *values,
1057 - int32_t *prefix) const = 0;
1058 + int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const = 0;
1060 /** @return DictionaryData::TRIE_TYPE_XYZ */
1061 virtual int32_t getType() const = 0;
1062 @@ -105,7 +106,7 @@ public:
1063 virtual ~UCharsDictionaryMatcher();
1064 virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
1065 int32_t *lengths, int32_t *cpLengths, int32_t *values,
1066 - int32_t *prefix) const;
1067 + int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const;
1068 virtual int32_t getType() const;
1069 private:
1070 const UChar *characters;
1071 @@ -123,7 +124,7 @@ public:
1072 virtual ~BytesDictionaryMatcher();
1073 virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
1074 int32_t *lengths, int32_t *cpLengths, int32_t *values,
1075 - int32_t *prefix) const;
1076 + int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const;
1077 virtual int32_t getType() const;
1078 private:
1079 UChar32 transform(UChar32 c) const;
1080 diff --git a/source/data/Makefile.in b/source/data/Makefile.in
1081 index 816c82d..c637d70 100644
1082 --- misc/icu/source/data/Makefile.in
1083 +++ build/icu/source/data/Makefile.in
1084 @@ -181,7 +181,7 @@ endif
1085 endif
1086 endif
1088 -packagedata: icupkg.inc $(PKGDATA_LIST) build-local
1089 +packagedata: icupkg.inc $(PKGDATA_LIST) build-local $(MAINBUILDDIR)/khmerdict.stamp
1090 ifneq ($(ENABLE_STATIC),)
1091 ifeq ($(PKGDATA_MODE),dll)
1092 $(PKGDATA_INVOKE) $(PKGDATA) -e $(ICUDATA_ENTRY_POINT) -T $(OUTTMPDIR) -p $(ICUDATA_NAME) $(PKGDATA_LIBSTATICNAME) -m static $(PKGDATA_VERSIONING) $(PKGDATA_LIST)
1093 @@ -564,8 +564,14 @@ $(BRKBLDDIR)/burmesedict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
1094 $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1000 -c -i $(BUILDDIR) $(DICTSRCDIR)/burmesedict.txt $(BRKBLDDIR)/burmesedict.dict
1096 # TODO: figure out why combining characters are here?
1097 -$(BRKBLDDIR)/khmerdict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
1098 - $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(DICTSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict
1099 +#$(BRKBLDDIR)/khmerdict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
1100 +# $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(DICTSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict
1102 +#$(MAINBUILDDIR)/khmerdict.stamp: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(BRKSRCDIR)/khmerdict.txt build-local
1103 +# $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(DICTSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict
1104 +$(MAINBUILDDIR)/khmerdict.stamp: $(BRKSRCDIR)/khmerdict.dict build-local
1105 + cp $< $(BRKBLDDIR)
1106 + echo "timestamp" > $@
1108 #################################################### CFU
1109 # CFU FILES