1 diff --git a/source/common/dictbe.cpp b/source/common/dictbe.cpp
2 index f1c874d..3ad1b3f 100644
3 --- misc/icu/source/common/dictbe.cpp
4 +++ build/icu/source/common/dictbe.cpp
5 @@ -27,8 +27,17 @@ U_NAMESPACE_BEGIN
6 ******************************************************************
9 -DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
10 +DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) :
13 + UErrorCode status = U_ZERO_ERROR;
15 + fViramaSet.applyPattern(UNICODE_STRING_SIMPLE("[[:ccc=VR:]]"), status);
17 + // Note: the skip sets also contain the fIgnoreSet characters.
18 + fSkipStartSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=OP:][:lb=QU:]\\u200C\\u200D\\u2060]"), status);
19 + fSkipEndSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]\\u200C\\u200D\\u2060]"), status);
20 + fNBeforeSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status);
23 DictionaryBreakEngine::~DictionaryBreakEngine() {
24 @@ -90,7 +99,7 @@ DictionaryBreakEngine::findBreaks( UText *text,
25 result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
26 utext_setNativeIndex(text, current);
33 @@ -101,6 +110,169 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
38 +DictionaryBreakEngine::scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const {
39 + UErrorCode status = U_ZERO_ERROR;
40 + UText* ut = utext_clone(NULL, text, false, true, &status);
41 + utext_setNativeIndex(ut, start);
42 + UChar32 c = utext_current32(ut);
45 + while (start >= 0) {
46 + if (!fSkipStartSet.contains(c)) {
51 + c = utext_previous32(ut);
59 +DictionaryBreakEngine::scanAfterEnd(UText *text, int32_t textEnd, int32_t& end, bool &doBreak) const {
60 + UErrorCode status = U_ZERO_ERROR;
61 + UText* ut = utext_clone(NULL, text, false, true, &status);
62 + utext_setNativeIndex(ut, end);
63 + UChar32 c = utext_current32(ut);
65 + doBreak = !fNBeforeSet.contains(c);
66 + while (end < textEnd) {
67 + if (!fSkipEndSet.contains(c)) {
72 + c = utext_next32(ut);
80 +DictionaryBreakEngine::scanBackClusters(UText *text, int32_t textStart, int32_t& start) const {
82 + start = utext_getNativeIndex(text);
83 + while (start > textStart) {
84 + c = utext_previous32(text);
86 + if (!fSkipEndSet.contains(c))
89 + for (int i = 0; i < clusterLimit; ++i) { // scan backwards clusterLimit clusters
90 + while (start > textStart) {
91 + while (fIgnoreSet.contains(c))
92 + c = utext_previous32(text);
93 + if (!fMarkSet.contains(c)) {
94 + if (fBaseSet.contains(c)) {
95 + c = utext_previous32(text);
96 + if (!fViramaSet.contains(c)) { // Virama (e.g. coeng) preceding base. Treat sequence as a mark
98 + c = utext_current32(text);
107 + c = utext_previous32(text);
110 + if (!fBaseSet.contains(c) || start < textStart) { // not a cluster start so finish
113 + c = utext_previous32(text);
114 + --start; // go round again
115 + } // ignore hitting a previous inhibitor; scanning for it should already have found us
116 + ++start; // counteract the final --start
120 +DictionaryBreakEngine::scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const {
121 + UChar32 c = utext_current32(text);
122 + end = utext_getNativeIndex(text);
123 + while (end < textEnd) {
124 + if (!fSkipStartSet.contains(c))
126 + utext_next32(text);
127 + c = utext_current32(text);
130 + for (int i = 0; i < clusterLimit; ++i) { // scan forwards clusterLimit clusters
131 + while (fIgnoreSet.contains(c)) {
132 + utext_next32(text);
133 + c = utext_current32(text);
135 + if (fBaseSet.contains(c)) {
136 + while (end < textEnd) {
137 + utext_next32(text);
138 + c = utext_current32(text);
140 + if (!fMarkSet.contains(c))
142 + else if (fViramaSet.contains(c)) { // handle coeng + base as mark
143 + utext_next32(text);
144 + c = utext_current32(text);
146 + if (!fBaseSet.contains(c))
151 + --end; // bad char, so break after the preceding char
158 +DictionaryBreakEngine::scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const {
159 + UErrorCode status = U_ZERO_ERROR;
160 + UText* ut = utext_clone(NULL, text, false, true, &status);
161 + int32_t nat = start;
162 + utext_setNativeIndex(ut, nat);
163 + bool foundFirst = true;
164 + int32_t curr = start;
165 + while (nat < end) {
166 + UChar32 c = utext_current32(ut);
167 + if (c == ZWSP || c == WJ) {
169 + if (foundFirst) // only scan backwards for first inhibitor
170 + scanBackClusters(ut, start, before);
171 + foundFirst = false; // don't scan backwards if we go around again. Also marks found something
174 + scanFwdClusters(ut, end, after);
177 + if (c == ZWSP || c == WJ) { // did we hit another one?
184 + ++nat; // keep hunting
190 + if (nat >= end && foundFirst) {
191 + start = before = after = nat;
192 + return false; // failed to find anything
197 + return true; // yup hit one
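
For context (an aside, not part of the patch): the scan helpers added above inhibit breaks around U+2060 WORD JOINER and U+200B ZERO WIDTH SPACE by widening the inhibited span out to the surrounding clusters. A minimal standalone sketch of that idea follows, using a hypothetical findInhibitedSpan() and a crude cluster model (one base plus trailing marks); the real scanWJ()/scanBackClusters()/scanFwdClusters() also honour fIgnoreSet and clusterLimit.

#include <unicode/uniset.h>
#include <unicode/unistr.h>
#include <unicode/utypes.h>

using namespace icu;

// Illustration only: find the first WJ/ZWSP in s and report [before, after)
// as the span in which line breaks should be suppressed.
static bool findInhibitedSpan(const UnicodeString &s, int32_t &before, int32_t &after) {
    UErrorCode status = U_ZERO_ERROR;
    UnicodeSet marks(UnicodeString("[[:M:]]"), status);         // crude cluster model
    if (U_FAILURE(status)) return false;
    for (int32_t i = 0; i < s.length(); i = s.moveIndex32(i, 1)) {
        UChar32 c = s.char32At(i);
        if (c != 0x2060 && c != 0x200B) continue;                // WJ or ZWSP
        before = i;
        while (before > 0) {                                     // back up over marks, then the base
            int32_t p = s.moveIndex32(before, -1);
            before = p;
            if (!marks.contains(s.char32At(p))) break;
        }
        after = s.moveIndex32(i, 1);                             // step over the inhibitor
        if (after < s.length()) after = s.moveIndex32(after, 1); // include the next base
        while (after < s.length() && marks.contains(s.char32At(after)))
            after = s.moveIndex32(after, 1);                     // and its trailing marks
        return true;
    }
    return false;
}
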
201 ******************************************************************
203 @@ -128,35 +302,35 @@ private:
205 PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
209 // Fill the list of candidates if needed, select the longest, and return the number found
210 - int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
212 + int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet = NULL, int32_t minLength = 0 );
214 // Select the currently marked candidate, point after it in the text, and invalidate self
215 int32_t acceptMarked( UText *text );
218 // Back up from the current candidate to the next shorter one; return TRUE if that exists
219 // and point the text after it
220 UBool backUp( UText *text );
223 // Return the longest prefix this candidate location shares with a dictionary word
224 // Return value is in code points.
225 int32_t longestPrefix() { return prefix; };
228 // Mark the current candidate as the one we like
229 void markCurrent() { mark = current; };
232 // Get length in code points of the marked word.
233 int32_t markedCPLength() { return cpLengths[mark]; };
237 -int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
238 +int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet, int32_t minLength) {
239 // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
240 int32_t start = (int32_t)utext_getNativeIndex(text);
241 if (start != offset) {
243 - count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
244 + count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix, ignoreSet, minLength);
245 // Dictionary leaves text after longest prefix, not longest word. Back up.
247 utext_setNativeIndex(text, start);
248 @@ -828,51 +1002,28 @@ foundBest:
252 -// How many words in a row are "good enough"?
253 -static const int32_t KHMER_LOOKAHEAD = 3;
255 -// Will not combine a non-word with a preceding dictionary word longer than this
256 -static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
258 -// Will not combine a non-word that shares at least this much prefix with a
259 -// dictionary word, with a preceding word
260 -static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
262 -// Minimum word size
263 -static const int32_t KHMER_MIN_WORD = 2;
265 -// Minimum number of characters for two words
266 -static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
268 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
269 : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
270 fDictionary(adoptDictionary)
272 - fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
276 + fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]\\u2060\\u200C\\u200D]"), status);
277 if (U_SUCCESS(status)) {
278 setCharacters(fKhmerWordSet);
280 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
281 - fMarkSet.add(0x0020);
282 - fEndWordSet = fKhmerWordSet;
283 - fBeginWordSet.add(0x1780, 0x17B3);
284 - //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels
285 - //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word
286 - //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word
287 - fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters
288 - //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels
289 -// fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
290 -// fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
291 -// fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
292 -// fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
293 -// fSuffixSet.add(THAI_PAIYANNOI);
294 -// fSuffixSet.add(THAI_MAIYAMOK);
295 + fIgnoreSet.add(0x2060); // WJ
296 + fIgnoreSet.add(0x200C, 0x200D); // ZWNJ, ZWJ
297 + fBaseSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:lb=SA:]&[:^M:]]"), status);
298 + fPuncSet.applyPattern(UNICODE_STRING_SIMPLE("[\\u17D4\\u17D5\\u17D6\\u17D7\\u17D9]"), status);
300 // Compact for caching.
302 - fEndWordSet.compact();
303 - fBeginWordSet.compact();
304 -// fSuffixSet.compact();
305 + fIgnoreSet.compact();
306 + fBaseSet.compact();
307 + fPuncSet.compact();
310 KhmerBreakEngine::~KhmerBreakEngine() {
311 @@ -884,180 +1036,204 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
314 UStack &foundBreaks ) const {
315 - if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
316 - return 0; // Not enough characters for two words
317 + uint32_t wordsFound = foundBreaks.size();
318 + UErrorCode status = U_ZERO_ERROR;
319 + int32_t before = 0;
321 + int32_t finalBefore = 0;
322 + int32_t initAfter = 0;
323 + int32_t scanStart = rangeStart;
324 + int32_t scanEnd = rangeEnd;
326 + bool startZwsp = false;
327 + bool breakStart = false;
328 + bool breakEnd = false;
330 + if (rangeStart > 0) {
332 + startZwsp = scanBeforeStart(text, scanStart, breakStart);
334 + utext_setNativeIndex(text, rangeStart);
335 + scanFwdClusters(text, rangeEnd, initAfter);
336 + bool endZwsp = scanAfterEnd(text, utext_nativeLength(text), scanEnd, breakEnd);
337 + utext_setNativeIndex(text, rangeEnd - 1);
338 + scanBackClusters(text, rangeStart, finalBefore);
339 + if (finalBefore < initAfter) { // clusters from both ends overlap: the whole run is covered, so no breaks inside
340 + if (breakStart || fTypes < UBRK_LINE)
341 + foundBreaks.push(rangeStart, status);
342 + if (breakEnd || fTypes < UBRK_LINE)
343 + foundBreaks.push(rangeEnd, status);
344 + return foundBreaks.size() - wordsFound;
347 - uint32_t wordsFound = 0;
348 - int32_t cpWordLength = 0;
349 - int32_t cuWordLength = 0;
351 - UErrorCode status = U_ZERO_ERROR;
352 - PossibleWord words[KHMER_LOOKAHEAD];
353 + scanStart = rangeStart;
354 + scanWJ(text, scanStart, rangeEnd, before, after);
355 + if (startZwsp || initAfter >= before) {
359 + if (!endZwsp && after > finalBefore && after < rangeEnd)
361 + if (endZwsp && before > finalBefore)
362 + before = finalBefore;
364 utext_setNativeIndex(text, rangeStart);
365 + int32_t numCodePts = rangeEnd - rangeStart;
366 + // bestSnlp[i] is the snlp of the best segmentation of the first i
367 + // code points in the range to be matched.
368 + UVector32 bestSnlp(numCodePts + 1, status);
369 + bestSnlp.addElement(0, status);
370 + for(int32_t i = 1; i <= numCodePts; i++) {
371 + bestSnlp.addElement(kuint32max, status);
374 - while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
377 + // prev[i] is the index of the last code point in the previous word in
378 + // the best segmentation of the first i characters. Note negative implies
379 + // that the code point is part of an unknown word.
380 + UVector32 prev(numCodePts + 1, status);
381 + for(int32_t i = 0; i <= numCodePts; i++) {
382 + prev.addElement(kuint32max, status);
385 - // Look for candidate words at the current position
386 - int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
387 + const int32_t maxWordSize = 20;
388 + UVector32 values(maxWordSize, status);
389 + values.setSize(maxWordSize);
390 + UVector32 lengths(maxWordSize, status);
391 + lengths.setSize(maxWordSize);
393 - // If we found exactly one, use that
394 - if (candidates == 1) {
395 - cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
396 - cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
399 + // Dynamic programming to find the best segmentation.
401 - // If there was more than one, see which one can take us forward the most words
402 - else if (candidates > 1) {
403 - // If we're already at the end of the range, we're done
404 - if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
408 - int32_t wordsMatched = 1;
409 - if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
410 - if (wordsMatched < 2) {
411 - // Followed by another dictionary word; mark first word as a good candidate
412 - words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
415 + // In outer loop, i is the code point index,
416 + // ix is the corresponding string (code unit) index.
417 + // They differ when the string contains supplementary characters.
418 + int32_t ix = rangeStart;
419 + for (int32_t i = 0; i < numCodePts; ++i, utext_setNativeIndex(text, ++ix)) {
420 + if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
424 - // If we're already at the end of the range, we're done
425 - if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
429 + count = fDictionary->matches(text, numCodePts - i, maxWordSize,
430 + NULL, lengths.getBuffer(), values.getBuffer(), NULL, &fIgnoreSet, 2);
431 + // Note: lengths is filled with code point lengths
432 + // The NULL parameter is for code unit lengths, which are not needed here.
434 - // See if any of the possible second words is followed by a third word
436 - // If we find a third word, stop right away
437 - if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
438 - words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
442 - while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
444 + for (int32_t j = 0; j < count; j++) {
445 + int32_t ln = lengths.elementAti(j);
446 + if (ln + i >= numCodePts)
448 + utext_setNativeIndex(text, ln+ix);
449 + int32_t c = utext_current32(text);
450 + if (fMarkSet.contains(c) || c == 0x17D2) { // Coeng
451 + lengths.removeElementAt(j);
452 + values.removeElementAt(j);
456 - while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
458 - cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
459 - cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
463 - // We come here after having either found a word or not. We look ahead to the
464 - // next word. If it's not a dictionary word, we will combine it with the word we
465 - // just found (if there is one), but only if the preceding word does not exceed
467 - // The text iterator should now be positioned at the end of the word we found.
468 - if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
469 - // if it is a dictionary word, do nothing. If it isn't, then if there is
470 - // no preceding word, or the non-word shares less than the minimum threshold
471 - // of characters with a dictionary word, then scan to resynchronize
472 - if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
473 - && (cuWordLength == 0
474 - || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
475 - // Look for a plausible word boundary
476 - int32_t remaining = rangeEnd - (current+cuWordLength);
481 - int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
482 - pc = utext_next32(text);
483 - int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
485 - remaining -= pcSize;
486 - if (remaining <= 0) {
488 + utext_setNativeIndex(text, ix);
489 + int32_t c = utext_current32(text);
490 + if (fPuncSet.contains(c) || fIgnoreSet.contains(c) || c == ZWSP) {
491 + values.setElementAt(0, count);
492 + lengths.setElementAt(1, count++);
493 + } else if (fBaseSet.contains(c)) {
494 + int32_t currix = utext_getNativeIndex(text);
496 + utext_next32(text);
497 + c = utext_current32(text);
498 + if (utext_getNativeIndex(text) >= rangeEnd)
501 - uc = utext_current32(text);
502 - if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
503 - // Maybe. See if it's in the dictionary.
504 - int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
505 - utext_setNativeIndex(text, current+cuWordLength+chars);
506 - if (candidates > 0) {
507 + if (c == 0x17D2) { // Coeng
508 + utext_next32(text);
509 + c = utext_current32(text);
510 + if (!fBaseSet.contains(c) || utext_getNativeIndex(text) >= rangeEnd) {
513 + utext_next32(text);
514 + c = utext_current32(text);
515 + if (utext_getNativeIndex(text) >= rangeEnd)
521 - // Bump the word count if there wasn't already one
522 - if (cuWordLength <= 0) {
525 + } while (fMarkSet.contains(c) || fIgnoreSet.contains(c));
526 + values.setElementAt(BADSNLP, count);
527 + lengths.setElementAt(utext_getNativeIndex(text) - currix, count++);
529 + values.setElementAt(BADSNLP, count);
530 + lengths.setElementAt(1, count++);
534 - // Update the length with the passed-over characters
535 - cuWordLength += chars;
536 + for (int32_t j = 0; j < count; j++) {
537 + uint32_t v = values.elementAti(j);
538 + int32_t newSnlp = bestSnlp.elementAti(i) + v;
539 + int32_t ln = lengths.elementAti(j);
540 + utext_setNativeIndex(text, ln+ix);
541 + int32_t c = utext_current32(text);
542 + while ((fPuncSet.contains(c) || fIgnoreSet.contains(c)) && ln + i < numCodePts) {
544 + utext_next32(text);
545 + c = utext_current32(text);
548 - // Back up to where we were for next iteration
549 - utext_setNativeIndex(text, current+cuWordLength);
550 + int32_t ln_j_i = ln + i; // yes really i!
551 + if (newSnlp < bestSnlp.elementAti(ln_j_i)) {
552 + if (v == BADSNLP) {
553 + int32_t p = prev.elementAti(i);
555 + prev.setElementAt(p, ln_j_i);
557 + prev.setElementAt(-i, ln_j_i);
560 + prev.setElementAt(i, ln_j_i);
561 + bestSnlp.setElementAt(newSnlp, ln_j_i);
565 + // Start pushing the optimal offset index into t_boundary (t for tentative).
566 + // prev[numCodePts] is guaranteed to be meaningful.
567 + // We'll first push in the reverse order, i.e.,
568 + // t_boundary[0] = numCodePts, and afterwards do a swap.
569 + UVector32 t_boundary(numCodePts+1, status);
571 - // Never stop before a combining mark.
573 - while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
574 - utext_next32(text);
575 - cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
576 + int32_t numBreaks = 0;
577 + // If no segmentation reached the end of the range, back up to the last reachable position
578 + while (numCodePts >= 0 && (uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
581 + if (numCodePts < 0) {
582 + t_boundary.addElement(numCodePts, status);
585 + for (int32_t i = numCodePts; (uint32_t)i != kuint32max; i = prev.elementAti(i)) {
587 + t_boundary.addElement(i, status);
590 + U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
593 - // Look ahead for possible suffixes if a dictionary word does not follow.
594 - // We do this in code rather than using a rule so that the heuristic
595 - // resynch continues to function. For example, one of the suffix characters
596 - // could be a typo in the middle of a word.
597 -// if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
598 -// if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
599 -// && fSuffixSet.contains(uc = utext_current32(text))) {
600 -// if (uc == KHMER_PAIYANNOI) {
601 -// if (!fSuffixSet.contains(utext_previous32(text))) {
602 -// // Skip over previous end and PAIYANNOI
603 -// utext_next32(text);
604 -// utext_next32(text);
605 -// wordLength += 1; // Add PAIYANNOI to word
606 -// uc = utext_current32(text); // Fetch next character
609 -// // Restore prior position
610 -// utext_next32(text);
613 -// if (uc == KHMER_MAIYAMOK) {
614 -// if (utext_previous32(text) != KHMER_MAIYAMOK) {
615 -// // Skip over previous end and MAIYAMOK
616 -// utext_next32(text);
617 -// utext_next32(text);
618 -// wordLength += 1; // Add MAIYAMOK to word
621 -// // Restore prior position
622 -// utext_next32(text);
627 -// utext_setNativeIndex(text, current+wordLength);
631 - // Did we find a word on this iteration? If so, push it on the break stack
632 - if (cuWordLength > 0) {
633 - foundBreaks.push((current+cuWordLength), status);
634 + // Now that we're done, convert positions in t_boundary[] (code point
635 + // indices within the range) back to indices in the original input UText
636 + // while reversing t_boundary and pushing values to foundBreaks.
637 + for (int32_t i = numBreaks-1; i >= 0; i--) {
638 + int32_t cpPos = t_boundary.elementAti(i);
639 + if (cpPos == 0 && !breakStart && fTypes >= UBRK_LINE) continue;
640 + int32_t utextPos = cpPos + rangeStart;
641 + while (utextPos > after && scanWJ(text, utextPos, scanEnd, before, after));
642 + if (utextPos < before) {
643 + // Boundaries are added to foundBreaks output in ascending order.
644 + U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
645 + foundBreaks.push(utextPos, status);
650 // Don't return a break for the end of the dictionary range if there is one there.
651 - if (foundBreaks.peeki() >= rangeEnd) {
652 + if (!breakEnd && fTypes >= UBRK_LINE && foundBreaks.peeki() >= rangeEnd) {
653 (void) foundBreaks.popi();
658 + return foundBreaks.size() - wordsFound;
661 #if !UCONFIG_NO_NORMALIZATION
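
An aside on the algorithm (not part of the patch): the rewritten divideUpDictionaryRange() above drops the old three-word lookahead heuristic and instead runs a Viterbi-style dynamic program over snlp costs ("sum of negative log probabilities"), much like ICU's CJK engine: bestSnlp[i] is the cheapest segmentation of the first i code points and prev[i] records where the last word of that segmentation starts. A minimal sketch of just that recurrence, with a hypothetical candidatesAt() callback standing in for fDictionary->matches():

#include <cstdint>
#include <limits>
#include <vector>

struct Candidate { int32_t length; uint32_t cost; };            // cost ~ snlp value

// Returns the code point indices of word ends for the cheapest segmentation.
// candidatesAt(i) must yield the dictionary matches starting at code point i.
template <typename CandidatesAt>
std::vector<int32_t> segment(int32_t numCodePts, CandidatesAt candidatesAt) {
    const uint32_t kMax = std::numeric_limits<uint32_t>::max();
    std::vector<uint32_t> bestSnlp(numCodePts + 1, kMax);
    std::vector<int32_t>  prev(numCodePts + 1, -1);
    bestSnlp[0] = 0;
    for (int32_t i = 0; i < numCodePts; ++i) {
        if (bestSnlp[i] == kMax) continue;                      // position unreachable
        for (const Candidate &c : candidatesAt(i)) {
            int32_t j = i + c.length;
            if (j > numCodePts) continue;
            uint32_t snlp = bestSnlp[i] + c.cost;
            if (snlp < bestSnlp[j]) { bestSnlp[j] = snlp; prev[j] = i; }
        }
    }
    std::vector<int32_t> boundaries;                            // backtrack, like t_boundary
    for (int32_t i = numCodePts; i > 0 && prev[i] >= 0; i = prev[i])
        boundaries.insert(boundaries.begin(), i);
    return boundaries;
}

In the patch, positions with no dictionary match are covered by single-cluster or single-character candidates carrying the BADSNLP cost, and the backtracked boundaries are then filtered against the WJ/ZWSP spans found by scanWJ() before being pushed to foundBreaks.
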
662 diff --git a/source/common/dictbe.h b/source/common/dictbe.h
663 index d3488cd..26caa75 100644
664 --- misc/icu/source/common/dictbe.h
665 +++ build/icu/source/common/dictbe.h
666 @@ -32,6 +32,15 @@ class Normalizer2;
668 class DictionaryBreakEngine : public LanguageBreakEngine {
672 + * <p>Default constructor.</p>
675 + DictionaryBreakEngine();
680 * The set of characters handled by this engine
682 @@ -46,11 +55,63 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
686 + const int32_t WJ = 0x2060;
687 + const int32_t ZWSP = 0x200B;
690 - * <p>Default constructor.</p>
692 + * A Unicode set of all viramas
695 - DictionaryBreakEngine();
696 + UnicodeSet fViramaSet;
699 + * A Unicode set of all base characters
702 + UnicodeSet fBaseSet;
705 + * A Unicode set of all marks
708 + UnicodeSet fMarkSet;
711 + * A Unicode set of all characters ignored in dictionary matching
714 + UnicodeSet fIgnoreSet;
717 + * A Unicode set of all characters to skip over at the start of a range
720 + UnicodeSet fSkipStartSet;
723 + * A Unicode set of all characters to skip over at the end of a range
726 + UnicodeSet fSkipEndSet;
729 + * A Unicode set of all characters that should not be broken before
732 + UnicodeSet fNBeforeSet;
735 + * The number of clusters within which breaks are inhibited
738 + int32_t clusterLimit;
740 + bool scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const;
742 + bool scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const;
743 + bool scanAfterEnd(UText *text, int32_t rangeEnd, int32_t& end, bool &doBreak) const;
744 + void scanBackClusters(UText *text, int32_t textStart, int32_t& start) const;
745 + void scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const;
749 @@ -81,7 +142,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
750 * <p>Find any breaks within a run in the supplied text.</p>
752 * @param text A UText representing the text. The iterator is left at
753 - * the end of the run of characters which the engine is capable of handling
754 + * the end of the run of characters which the engine is capable of handling
755 * that starts from the first (or last) character in the range.
756 * @param startPos The start of the run within the supplied text.
757 * @param endPos The end of the run within the supplied text.
758 @@ -243,118 +304,120 @@ class LaoBreakEngine : public DictionaryBreakEngine {
762 -/*******************************************************************
763 - * BurmeseBreakEngine
767 - * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
768 - * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
770 - * <p>After it is constructed a BurmeseBreakEngine may be shared between
771 - * threads without synchronization.</p>
773 -class BurmeseBreakEngine : public DictionaryBreakEngine {
776 - * The set of characters handled by this engine
780 - UnicodeSet fBurmeseWordSet;
781 - UnicodeSet fEndWordSet;
782 - UnicodeSet fBeginWordSet;
783 - UnicodeSet fMarkSet;
784 - DictionaryMatcher *fDictionary;
789 - * <p>Default constructor.</p>
791 - * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
792 - * engine is deleted.
794 - BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
797 - * <p>Virtual destructor.</p>
799 - virtual ~BurmeseBreakEngine();
803 - * <p>Divide up a range of known dictionary characters.</p>
805 - * @param text A UText representing the text
806 - * @param rangeStart The start of the range of dictionary characters
807 - * @param rangeEnd The end of the range of dictionary characters
808 - * @param foundBreaks Output of C array of int32_t break positions, or 0
809 - * @return The number of breaks found
811 - virtual int32_t divideUpDictionaryRange( UText *text,
812 - int32_t rangeStart,
814 - UStack &foundBreaks ) const;
818 -/*******************************************************************
823 - * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
824 - * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
826 - * <p>After it is constructed a KhmerBreakEngine may be shared between
827 - * threads without synchronization.</p>
829 -class KhmerBreakEngine : public DictionaryBreakEngine {
832 - * The set of characters handled by this engine
836 - UnicodeSet fKhmerWordSet;
837 - UnicodeSet fEndWordSet;
838 - UnicodeSet fBeginWordSet;
839 - UnicodeSet fMarkSet;
840 - DictionaryMatcher *fDictionary;
845 - * <p>Default constructor.</p>
847 - * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
848 - * engine is deleted.
850 - KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
853 - * <p>Virtual destructor.</p>
855 - virtual ~KhmerBreakEngine();
859 - * <p>Divide up a range of known dictionary characters.</p>
861 - * @param text A UText representing the text
862 - * @param rangeStart The start of the range of dictionary characters
863 - * @param rangeEnd The end of the range of dictionary characters
864 - * @param foundBreaks Output of C array of int32_t break positions, or 0
865 - * @return The number of breaks found
867 - virtual int32_t divideUpDictionaryRange( UText *text,
868 - int32_t rangeStart,
870 - UStack &foundBreaks ) const;
874 +/*******************************************************************
875 + * BurmeseBreakEngine
879 + * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
880 + * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
882 + * <p>After it is constructed a BurmeseBreakEngine may be shared between
883 + * threads without synchronization.</p>
885 +class BurmeseBreakEngine : public DictionaryBreakEngine {
888 + * The set of characters handled by this engine
892 + UnicodeSet fBurmeseWordSet;
893 + UnicodeSet fEndWordSet;
894 + UnicodeSet fBeginWordSet;
895 + UnicodeSet fMarkSet;
896 + DictionaryMatcher *fDictionary;
901 + * <p>Default constructor.</p>
903 + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
904 + * engine is deleted.
906 + BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
909 + * <p>Virtual destructor.</p>
911 + virtual ~BurmeseBreakEngine();
915 + * <p>Divide up a range of known dictionary characters.</p>
917 + * @param text A UText representing the text
918 + * @param rangeStart The start of the range of dictionary characters
919 + * @param rangeEnd The end of the range of dictionary characters
920 + * @param foundBreaks Output of C array of int32_t break positions, or 0
921 + * @return The number of breaks found
923 + virtual int32_t divideUpDictionaryRange( UText *text,
924 + int32_t rangeStart,
926 + UStack &foundBreaks ) const;
930 +/*******************************************************************
935 + * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
936 + * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
938 + * <p>After it is constructed a KhmerBreakEngine may be shared between
939 + * threads without synchronization.</p>
941 +class KhmerBreakEngine : public DictionaryBreakEngine {
944 + * The set of characters handled by this engine
948 + UnicodeSet fKhmerWordSet;
949 + UnicodeSet fBeginWordSet;
950 + UnicodeSet fPuncSet;
951 + DictionaryMatcher *fDictionary;
953 + const uint32_t BADSNLP = 256 * 20;
954 + const uint32_t kuint32max = 0x7FFFFFFF;
959 + * <p>Default constructor.</p>
961 + * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
962 + * engine is deleted.
964 + KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
967 + * <p>Virtual destructor.</p>
969 + virtual ~KhmerBreakEngine();
973 + * <p>Divide up a range of known dictionary characters.</p>
975 + * @param text A UText representing the text
976 + * @param rangeStart The start of the range of dictionary characters
977 + * @param rangeEnd The end of the range of dictionary characters
978 + * @param foundBreaks Output of C array of int32_t break positions, or 0
979 + * @return The number of breaks found
981 + virtual int32_t divideUpDictionaryRange( UText *text,
982 + int32_t rangeStart,
984 + UStack &foundBreaks ) const;
988 #if !UCONFIG_NO_NORMALIZATION
990 /*******************************************************************
991 diff --git a/source/common/dictionarydata.cpp b/source/common/dictionarydata.cpp
992 index cb594c6..82f2e77 100644
993 --- misc/icu/source/common/dictionarydata.cpp
994 +++ build/icu/source/common/dictionarydata.cpp
995 @@ -42,7 +42,7 @@ int32_t UCharsDictionaryMatcher::getType() const {
997 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
998 int32_t *lengths, int32_t *cpLengths, int32_t *values,
999 - int32_t *prefix) const {
1000 + int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {
1002 UCharsTrie uct(characters);
1003 int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
1004 @@ -53,7 +53,13 @@ int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t
1005 UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
1006 int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
1007 codePointsMatched += 1;
1008 + if (ignoreSet != NULL && ignoreSet->contains(c)) {
1011 if (USTRINGTRIE_HAS_VALUE(result)) {
1012 + if (codePointsMatched < minLength) {
1015 if (wordCount < limit) {
1016 if (values != NULL) {
1017 values[wordCount] = uct.getValue();
1018 @@ -110,7 +116,7 @@ int32_t BytesDictionaryMatcher::getType() const {
1020 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
1021 int32_t *lengths, int32_t *cpLengths, int32_t *values,
1022 - int32_t *prefix) const {
1023 + int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {
1024 BytesTrie bt(characters);
1025 int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
1026 int32_t wordCount = 0;
1027 @@ -120,7 +126,13 @@ int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t
1028 UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
1029 int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
1030 codePointsMatched += 1;
1031 + if (ignoreSet != NULL && ignoreSet->contains(c)) {
1034 if (USTRINGTRIE_HAS_VALUE(result)) {
1035 + if (codePointsMatched < minLength) {
1038 if (wordCount < limit) {
1039 if (values != NULL) {
1040 values[wordCount] = bt.getValue();
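
An aside on the extended matcher API (not part of the patch): the new ignoreSet and minLength parameters appear to let the caller have characters such as ZWNJ/ZWJ/WJ consumed without taking part in the trie lookup, and to suppress matches shorter than minLength code points; the hunk bodies implementing this are elided above, so the exact behaviour is assumed here. A small self-contained sketch of that assumed contract, with a std::set of words standing in for the BytesTrie/UCharsTrie:

#include <cstdint>
#include <set>
#include <string>
#include <vector>

// Hypothetical stand-in for DictionaryMatcher::matches(): returns the code
// point lengths (including ignored characters) of every dictionary word that
// starts at 'start', skipping ignoreSet characters and dropping short matches.
static std::vector<int32_t> matchesAt(const std::u32string &text, size_t start,
                                      const std::set<std::u32string> &dict,
                                      const std::set<char32_t> &ignoreSet,
                                      int32_t minLength) {
    std::vector<int32_t> cpLengths;
    std::u32string key;                       // what the "trie" actually sees
    int32_t codePointsMatched = 0;
    for (size_t i = start; i < text.size(); ++i) {
        char32_t c = text[i];
        ++codePointsMatched;
        if (ignoreSet.count(c)) continue;     // consume, but do not feed the trie
        key.push_back(c);
        if (dict.count(key) && codePointsMatched >= minLength)
            cpLengths.push_back(codePointsMatched);
    }
    return cpLengths;
}

KhmerBreakEngine above calls the real API with &fIgnoreSet (WJ, ZWNJ, ZWJ) and a minLength of 2, so zero-width joiners inside a word no longer defeat dictionary matching.
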
1041 diff --git a/source/common/dictionarydata.h b/source/common/dictionarydata.h
1042 index 0216ab0..ee9e571 100644
1043 --- misc/icu/source/common/dictionarydata.h
1044 +++ build/icu/source/common/dictionarydata.h
1046 #include "unicode/utext.h"
1047 #include "unicode/udata.h"
1048 #include "udataswp.h"
1049 +#include "unicode/uniset.h"
1050 #include "unicode/uobject.h"
1051 #include "unicode/ustringtrie.h"
1053 @@ -90,7 +91,7 @@ public:
1055 virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
1056 int32_t *lengths, int32_t *cpLengths, int32_t *values,
1057 - int32_t *prefix) const = 0;
1058 + int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const = 0;
1060 /** @return DictionaryData::TRIE_TYPE_XYZ */
1061 virtual int32_t getType() const = 0;
1062 @@ -105,7 +106,7 @@ public:
1063 virtual ~UCharsDictionaryMatcher();
1064 virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
1065 int32_t *lengths, int32_t *cpLengths, int32_t *values,
1066 - int32_t *prefix) const;
1067 + int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const;
1068 virtual int32_t getType() const;
1070 const UChar *characters;
1071 @@ -123,7 +124,7 @@ public:
1072 virtual ~BytesDictionaryMatcher();
1073 virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
1074 int32_t *lengths, int32_t *cpLengths, int32_t *values,
1075 - int32_t *prefix) const;
1076 + int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const;
1077 virtual int32_t getType() const;
1079 UChar32 transform(UChar32 c) const;
1080 diff --git a/source/data/Makefile.in b/source/data/Makefile.in
1081 index 816c82d..c637d70 100644
1082 --- misc/icu/source/data/Makefile.in
1083 +++ build/icu/source/data/Makefile.in
1084 @@ -181,7 +181,7 @@ endif
1088 -packagedata: icupkg.inc $(PKGDATA_LIST) build-local
1089 +packagedata: icupkg.inc $(PKGDATA_LIST) build-local $(MAINBUILDDIR)/khmerdict.stamp
1090 ifneq ($(ENABLE_STATIC),)
1091 ifeq ($(PKGDATA_MODE),dll)
1092 $(PKGDATA_INVOKE) $(PKGDATA) -e $(ICUDATA_ENTRY_POINT) -T $(OUTTMPDIR) -p $(ICUDATA_NAME) $(PKGDATA_LIBSTATICNAME) -m static $(PKGDATA_VERSIONING) $(PKGDATA_LIST)
1093 @@ -564,8 +564,14 @@ $(BRKBLDDIR)/burmesedict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
1094 $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1000 -c -i $(BUILDDIR) $(DICTSRCDIR)/burmesedict.txt $(BRKBLDDIR)/burmesedict.dict
1096 # TODO: figure out why combining characters are here?
1097 -$(BRKBLDDIR)/khmerdict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
1098 - $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(DICTSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict
1099 +#$(BRKBLDDIR)/khmerdict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
1100 +# $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(DICTSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict
1102 +#$(MAINBUILDDIR)/khmerdict.stamp: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(BRKSRCDIR)/khmerdict.txt build-local
1103 +# $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(DICTSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict
1104 +$(MAINBUILDDIR)/khmerdict.stamp: $(BRKSRCDIR)/khmerdict.dict build-local
1105 + cp $< $(BRKBLDDIR)
1106 + echo "timestamp" > $@
1108 #################################################### CFU