external/icu/khmerbreakengine.patch

   1 diff --git a/source/common/dictbe.cpp b/source/common/dictbe.cpp
   2 index f1c874d..3ad1b3f 100644
   3 --- misc/icu/source/common/dictbe.cpp
   4 +++ build/icu/source/common/dictbe.cpp
   5 @@ -27,8 +27,17 @@ U_NAMESPACE_BEGIN
   6   ******************************************************************
   7   */
   8
   9 -DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
  10 +DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) :   // also builds the character sets used by the scan* helpers below
  11 +    clusterLimit(3)   // cluster count used by scanBackClusters/scanFwdClusters
  12 +{
  13 +    UErrorCode status = U_ZERO_ERROR;   // local only: a failing applyPattern below is not reported to the caller
  14      fTypes = breakTypes;
  15 +    fViramaSet.applyPattern(UNICODE_STRING_SIMPLE("[[:ccc=VR:]]"), status);   // characters whose canonical combining class is Virama
  16 +
  17 +    // Note: both skip sets also list U+200C, U+200D and U+2060, the characters KhmerBreakEngine adds to fIgnoreSet.
  18 +    fSkipStartSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=OP:][:lb=QU:]\\u200C\\u200D\\u2060]"), status);   // skipped by scanBeforeStart and scanFwdClusters
  19 +    fSkipEndSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]\\u200C\\u200D\\u2060]"), status);   // skipped by scanAfterEnd and scanBackClusters
  20 +    fNBeforeSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status);   // scanAfterEnd reports doBreak=false when the character at the range end is in this set
  21  }
  22
  23  DictionaryBreakEngine::~DictionaryBreakEngine() {
  24 @@ -90,7 +99,7 @@ DictionaryBreakEngine::findBreaks( UText *text,
  25          result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
  26          utext_setNativeIndex(text, current);
  27      }
  28 -
  29 +
  30      return result;
  31  }
  32
  33 @@ -101,6 +110,169 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
  34      fSet.compact();
  35  }
  36
  37 +bool
  38 +DictionaryBreakEngine::scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const {   // Walks backwards from start over fSkipStartSet characters; returns true if the first character not skipped is ZWSP.
  39 +    UErrorCode status = U_ZERO_ERROR;
  40 +    UText* ut = utext_clone(NULL, text, false, true, &status);   // shallow read-only clone, so text itself is not repositioned here; status is not checked
  41 +    utext_setNativeIndex(ut, start);
  42 +    UChar32 c = utext_current32(ut);   // character at start (iterator not advanced)
  43 +    bool res = false;
  44 +    doBreak = true;   // stays true only when no character had to be skipped
  45 +    while (start >= 0) {
  46 +        if (!fSkipStartSet.contains(c)) {
  47 +            res = (c == ZWSP);   // first non-skippable character decides the result
  48 +            break;
  49 +        }
  50 +        --start;   // start is updated for the caller: one step back per skipped character
  51 +        c = utext_previous32(ut);
  52 +        doBreak = false;
  53 +    }
  54 +    utext_close(ut);
  55 +    return res;
  56 +}
  57 +
  58 +bool
  59 +DictionaryBreakEngine::scanAfterEnd(UText *text, int32_t textEnd, int32_t& end, bool &doBreak) const {   // Walks forwards from end (bounded by textEnd) over fSkipEndSet characters; returns true if the first character not skipped is ZWSP.
  60 +    UErrorCode status = U_ZERO_ERROR;
  61 +    UText* ut = utext_clone(NULL, text, false, true, &status);   // shallow read-only clone, so text itself is not repositioned here; status is not checked
  62 +    utext_setNativeIndex(ut, end);
  63 +    UChar32 c = utext_current32(ut);   // character at end (iterator not advanced)
  64 +    bool res = false;
  65 +    doBreak = !fNBeforeSet.contains(c);   // no break if the character at end is in fNBeforeSet
  66 +    while (end < textEnd) {
  67 +        if (!fSkipEndSet.contains(c)) {
  68 +            res = (c == ZWSP);   // first non-skippable character decides the result
  69 +            break;
  70 +        }
  71 +        ++end;   // end is updated for the caller
  72 +        c = utext_next32(ut);   // NOTE(review): utext_next32 returns the character at the current position and then advances, so the first call re-reads the character already fetched above while end has moved on - confirm the resulting end is intended
  73 +        doBreak = false;   // anything skipped suppresses the break
  74 +    }
  75 +    utext_close(ut);
  76 +    return res;
  77 +}
  78 +
  79 +void
  80 +DictionaryBreakEngine::scanBackClusters(UText *text, int32_t textStart, int32_t& start) const {   // Moves text backwards by up to clusterLimit clusters (not past textStart) and reports the resulting index in start.
  81 +    UChar32 c = 0;
  82 +    start = utext_getNativeIndex(text);   // begin at the iterator's current position; text itself is moved by this function
  83 +    while (start > textStart) {   // first step back over fSkipEndSet characters
  84 +        c = utext_previous32(text);
  85 +        --start;
  86 +        if (!fSkipEndSet.contains(c))
  87 +            break;
  88 +    }
  89 +    for (int i = 0; i < clusterLimit; ++i) { // scan backwards clusterLimit clusters
  90 +        while (start > textStart) {
  91 +            while (fIgnoreSet.contains(c))   // ignored characters are stepped over without changing start
  92 +                c = utext_previous32(text);
  93 +            if (!fMarkSet.contains(c)) {
  94 +                if (fBaseSet.contains(c)) {
  95 +                    c = utext_previous32(text);   // peek at the character before the base
  96 +                    if (!fViramaSet.contains(c)) { // No virama before this base, so the base starts the cluster: undo the peek and stop
  97 +                        utext_next32(text);
  98 +                        c = utext_current32(text);
  99 +                        break;
 100 +                    } else {   // Virama (e.g. coeng) precedes the base: treat the pair as a mark and keep scanning
 101 +                        --start;
 102 +                    }
 103 +                } else {   // neither mark nor base: stop scanning this cluster
 104 +                    break;
 105 +                }
 106 +            }
 107 +            c = utext_previous32(text);
 108 +            --start;
 109 +        }
 110 +        if (!fBaseSet.contains(c) || start < textStart) {  // not a cluster start so finish
 111 +            break;
 112 +        }
 113 +        c = utext_previous32(text);
 114 +        --start;        // go round again
 115 +    }                   // ignore hitting previous inhibitor since scanning for it should have found us!
 116 +    ++start;            // compensate for the last --start above
 117 +}
 118 +
 119 +void
 120 +DictionaryBreakEngine::scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const {   // Moves text forwards by up to clusterLimit clusters (bounded by textEnd) and reports the resulting index in end.
 121 +    UChar32 c = utext_current32(text);
 122 +    end = utext_getNativeIndex(text);   // begin at the iterator's current position; text itself is moved by this function
 123 +    while (end < textEnd) {   // first step over fSkipStartSet characters
 124 +        if (!fSkipStartSet.contains(c))
 125 +            break;
 126 +        utext_next32(text);
 127 +        c = utext_current32(text);
 128 +        ++end;
 129 +    }
 130 +    for (int i = 0; i < clusterLimit; ++i) { // scan forwards clusterLimit clusters
 131 +        while (fIgnoreSet.contains(c)) {   // ignored characters are stepped over without changing end
 132 +            utext_next32(text);
 133 +            c = utext_current32(text);
 134 +        }
 135 +        if (fBaseSet.contains(c)) {   // a cluster is a base followed by marks
 136 +            while (end < textEnd) {
 137 +                utext_next32(text);
 138 +                c = utext_current32(text);
 139 +                ++end;
 140 +                if (!fMarkSet.contains(c))   // first non-mark ends this cluster
 141 +                    break;
 142 +                else if (fViramaSet.contains(c)) {  // handle coeng + base as mark
 143 +                    utext_next32(text);
 144 +                    c = utext_current32(text);
 145 +                    ++end;
 146 +                    if (!fBaseSet.contains(c))   // virama not followed by a base: stop consuming marks
 147 +                        break;
 148 +                }
 149 +            }
 150 +        } else {
 151 +            --end;    // bad char so break after char before it
 152 +            break;
 153 +        }
 154 +    }
 155 +}
 156 +
 157 +bool
 158 +DictionaryBreakEngine::scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const {   // Looks for ZWSP/WJ in [start, end); on a hit fills before/after from the cluster scans and returns true.
 159 +    UErrorCode status = U_ZERO_ERROR;
 160 +    UText* ut = utext_clone(NULL, text, false, true, &status);   // shallow read-only clone, so text itself is not repositioned here; status is not checked
 161 +    int32_t nat = start;   // index currently being examined
 162 +    utext_setNativeIndex(ut, nat);
 163 +    bool foundFirst = true;   // true until the first ZWSP/WJ has been seen
 164 +    int32_t curr = start;
 165 +    while (nat < end) {
 166 +        UChar32 c = utext_current32(ut);
 167 +        if (c == ZWSP || c == WJ) {
 168 +            curr = nat + 1;   // index just past this inhibitor; written back to start on success
 169 +            if (foundFirst)     // only scan backwards for first inhibitor
 170 +                scanBackClusters(ut, start, before);   // NOTE(review): this moves ut backwards, so the forward scan below starts from where it stopped - confirm that is intended
 171 +            foundFirst = false; // don't scan backwards if we go around again. Also marks found something
 172 +
 173 +            utext_next32(ut);
 174 +            scanFwdClusters(ut, end, after);
 175 +            nat = after + 1;   // resume the search just past the forward window
 176 +
 177 +            if (c == ZWSP || c == WJ) {  // did we hit another one? NOTE(review): c is not re-read after the scans, so this repeats the test above and always continues - confirm
 178 +                continue;
 179 +            } else {
 180 +                break;
 181 +            }
 182 +        }
 183 +
 184 +        ++nat;                  // keep hunting
 185 +        utext_next32(ut);
 186 +    }
 187 +
 188 +    utext_close(ut);
 189 +
 190 +    if (nat >= end && foundFirst) {
 191 +        start = before = after = nat;   // collapse all three outputs onto the index where the search stopped
 192 +        return false;           // failed to find anything
 193 +    }
 194 +    else {
 195 +        start = curr;   // index just past the last inhibitor that was found
 196 +    }
 197 +    return true;                // yup hit one
 198 +}
 199 +
 200  /*
 201   ******************************************************************
 202   * PossibleWord
 203 @@ -128,35 +302,35 @@ private:
 204  public:
 205      PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
 206      ~PossibleWord() {};
 207 -
 208 +
 209      // Fill the list of candidates if needed, select the longest, and return the number found
 210 -    int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
 211 -
 212 +    int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet = NULL, int32_t minLength = 0 );
 213 +
 214      // Select the currently marked candidate, point after it in the text, and invalidate self
 215      int32_t   acceptMarked( UText *text );
 216 -
 217 +
 218      // Back up from the current candidate to the next shorter one; return TRUE if that exists
 219      // and point the text after it
 220      UBool     backUp( UText *text );
 221 -
 222 +
 223      // Return the longest prefix this candidate location shares with a dictionary word
 224      // Return value is in code points.
 225      int32_t   longestPrefix() { return prefix; };
 226 -
 227 +
 228      // Mark the current candidate as the one we like
 229      void      markCurrent() { mark = current; };
 230 -
 231 +
 232      // Get length in code points of the marked word.
 233      int32_t   markedCPLength() { return cpLengths[mark]; };
 234  };
 235
 236
 237 -int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
 238 +int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet, int32_t minLength) {
 239      // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
 240      int32_t start = (int32_t)utext_getNativeIndex(text);
 241      if (start != offset) {
 242          offset = start;
 243 -        count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
 244 +        count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix, ignoreSet, minLength);
 245          // Dictionary leaves text after longest prefix, not longest word. Back up.
 246          if (count <= 0) {
 247              utext_setNativeIndex(text, start);
 248 @@ -828,51 +1002,28 @@ foundBest:
 249   * KhmerBreakEngine
 250   */
 251
 252 -// How many words in a row are "good enough"?
 253 -static const int32_t KHMER_LOOKAHEAD = 3;
 254 -
 255 -// Will not combine a non-word with a preceding dictionary word longer than this
 256 -static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
 257 -
 258 -// Will not combine a non-word that shares at least this much prefix with a
 259 -// dictionary word, with a preceding word
 260 -static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
 261 -
 262 -// Minimum word size
 263 -static const int32_t KHMER_MIN_WORD = 2;
 264 -
 265 -// Minimum number of characters for two words
 266 -static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
 267 -
 268  KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
 269      : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
 270        fDictionary(adoptDictionary)
 271  {
 272 -    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
 273 +
 274 +    clusterLimit = 3;   // same value the base-class constructor already sets
 275 +
 276 +    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]\\u2060\\u200C\\u200D]"), status);   // all Khmer-script characters plus WJ, ZWNJ and ZWJ
 277      if (U_SUCCESS(status)) {
 278          setCharacters(fKhmerWordSet);
 279      }
 280      fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
 281 -    fMarkSet.add(0x0020);
 282 -    fEndWordSet = fKhmerWordSet;
 283 -    fBeginWordSet.add(0x1780, 0x17B3);
 284 -    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
 285 -    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
 286 -    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
 287 -    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
 288 -    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
 289 -//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
 290 -//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
 291 -//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
 292 -//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
 293 -//    fSuffixSet.add(THAI_PAIYANNOI);
 294 -//    fSuffixSet.add(THAI_MAIYAMOK);
 295 +    fIgnoreSet.add(0x2060);         // WJ
 296 +    fIgnoreSet.add(0x200C, 0x200D); // ZWNJ (U+200C) and ZWJ (U+200D)
 297 +    fBaseSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:lb=SA:]&[:^M:]]"), status);   // Khmer lb=SA characters that are not marks (the complement of fMarkSet within that class)
 298 +    fPuncSet.applyPattern(UNICODE_STRING_SIMPLE("[\\u17D4\\u17D5\\u17D6\\u17D7\\u17D9:]"), status);   // Khmer signs U+17D4..U+17D7 and U+17D9. NOTE(review): the ':' before ']' also adds U+003A COLON - confirm that is intended
 299
 300      // Compact for caching.
 301      fMarkSet.compact();
 302 -    fEndWordSet.compact();
 303 -    fBeginWordSet.compact();
 304 -//    fSuffixSet.compact();
 305 +       fIgnoreSet.compact();
 306 +       fBaseSet.compact();
 307 +       fPuncSet.compact();
 308  }
 309
 310  KhmerBreakEngine::~KhmerBreakEngine() {
 311 @@ -884,180 +1036,204 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
 312                                                  int32_t rangeStart,
 313                                                  int32_t rangeEnd,
 314                                                  UStack &foundBreaks ) const {
 315 -    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
 316 -        return 0;       // Not enough characters for two words
 317 +    uint32_t wordsFound = foundBreaks.size();
 318 +    UErrorCode status = U_ZERO_ERROR;
 319 +    int32_t before = 0;
 320 +    int32_t after = 0;
 321 +    int32_t finalBefore = 0;
 322 +    int32_t initAfter = 0;
 323 +    int32_t scanStart = rangeStart;
 324 +    int32_t scanEnd = rangeEnd;
 325 +
 326 +    bool startZwsp = false;
 327 +    bool breakStart = false;
 328 +    bool breakEnd = false;
 329 +
 330 +    if (rangeStart > 0) {
 331 +        --scanStart;
 332 +        startZwsp = scanBeforeStart(text, scanStart, breakStart);
 333 +    }
 334 +    utext_setNativeIndex(text, rangeStart);
 335 +    scanFwdClusters(text, rangeEnd, initAfter);
 336 +    bool endZwsp = scanAfterEnd(text, utext_nativeLength(text), scanEnd, breakEnd);
 337 +    utext_setNativeIndex(text, rangeEnd - 1);
 338 +    scanBackClusters(text, rangeStart, finalBefore);
 339 +    if (finalBefore < initAfter) {   // the whole run is tented so no breaks
 340 +        if (breakStart || fTypes < UBRK_LINE)
 341 +            foundBreaks.push(rangeStart, status);
 342 +        if (breakEnd || fTypes < UBRK_LINE)
 343 +            foundBreaks.push(rangeEnd, status);
 344 +        return foundBreaks.size() - wordsFound;
 345      }
 346
 347 -    uint32_t wordsFound = 0;
 348 -    int32_t cpWordLength = 0;
 349 -    int32_t cuWordLength = 0;
 350 -    int32_t current;
 351 -    UErrorCode status = U_ZERO_ERROR;
 352 -    PossibleWord words[KHMER_LOOKAHEAD];
 353 +    scanStart = rangeStart;
 354 +    scanWJ(text, scanStart, rangeEnd, before, after);
 355 +    if (startZwsp || initAfter >= before) {
 356 +        after = initAfter;
 357 +        before = 0;
 358 +    }
 359 +    if (!endZwsp && after > finalBefore && after < rangeEnd)
 360 +        endZwsp = true;
 361 +    if (endZwsp && before > finalBefore)
 362 +        before = finalBefore;
 363
 364      utext_setNativeIndex(text, rangeStart);
 365 +    int32_t numCodePts = rangeEnd - rangeStart;
 366 +    // bestSnlp[i] is the snlp of the best segmentation of the first i
 367 +    // code points in the range to be matched.
 368 +    UVector32 bestSnlp(numCodePts + 1, status);
 369 +    bestSnlp.addElement(0, status);
 370 +    for(int32_t i = 1; i <= numCodePts; i++) {
 371 +        bestSnlp.addElement(kuint32max, status);
 372 +    }
 373
 374 -    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 375 -        cuWordLength = 0;
 376 -        cpWordLength = 0;
 377 +    // prev[i] is the index of the last code point in the previous word in
 378 +    // the best segmentation of the first i characters. Note negative implies
 379 +       // that the code point is part of an unknown word.
 380 +    UVector32 prev(numCodePts + 1, status);
 381 +    for(int32_t i = 0; i <= numCodePts; i++) {
 382 +        prev.addElement(kuint32max, status);
 383 +    }
 384
 385 -        // Look for candidate words at the current position
 386 -        int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 387 +    const int32_t maxWordSize = 20;
 388 +    UVector32 values(maxWordSize, status);
 389 +    values.setSize(maxWordSize);
 390 +    UVector32 lengths(maxWordSize, status);
 391 +    lengths.setSize(maxWordSize);
 392
 393 -        // If we found exactly one, use that
 394 -        if (candidates == 1) {
 395 -            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
 396 -            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
 397 -            wordsFound += 1;
 398 -        }
 399 +    // Dynamic programming to find the best segmentation.
 400
 401 -        // If there was more than one, see which one can take us forward the most words
 402 -        else if (candidates > 1) {
 403 -            // If we're already at the end of the range, we're done
 404 -            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 405 -                goto foundBest;
 406 -            }
 407 -            do {
 408 -                int32_t wordsMatched = 1;
 409 -                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 410 -                    if (wordsMatched < 2) {
 411 -                        // Followed by another dictionary word; mark first word as a good candidate
 412 -                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
 413 -                        wordsMatched = 2;
 414 -                    }
 415 +    // In outer loop, i  is the code point index,
 416 +    //                ix is the corresponding string (code unit) index.
 417 +    //    They differ when the string contains supplementary characters.
 418 +    int32_t ix = rangeStart;
 419 +    for (int32_t i = 0;  i < numCodePts;  ++i, utext_setNativeIndex(text, ++ix)) {
 420 +        if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
 421 +            continue;
 422 +        }
 423
 424 -                    // If we're already at the end of the range, we're done
 425 -                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 426 -                        goto foundBest;
 427 -                    }
 428 +        int32_t count;
 429 +        count = fDictionary->matches(text, numCodePts - i, maxWordSize,
 430 +                             NULL, lengths.getBuffer(), values.getBuffer(), NULL, &fIgnoreSet, 2);
 431 +                             // Note: lengths is filled with code point lengths
 432 +                             //       The NULL parameter is the ignored code unit lengths.
 433
 434 -                    // See if any of the possible second words is followed by a third word
 435 -                    do {
 436 -                        // If we find a third word, stop right away
 437 -                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 438 -                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
 439 -                            goto foundBest;
 440 -                        }
 441 -                    }
 442 -                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
 443 -                }
 444 +        for (int32_t j = 0; j < count; j++) {
 445 +            int32_t ln = lengths.elementAti(j);
 446 +            if (ln + i >= numCodePts)
 447 +                continue;
 448 +            utext_setNativeIndex(text, ln+ix);
 449 +            int32_t c = utext_current32(text);
 450 +            if (fMarkSet.contains(c) || c == 0x17D2) { // Coeng
 451 +                lengths.removeElementAt(j);
 452 +                values.removeElementAt(j);
 453 +                --j;
 454 +                --count;
 455              }
 456 -            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
 457 -foundBest:
 458 -            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
 459 -            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
 460 -            wordsFound += 1;
 461          }
 462 -
 463 -        // We come here after having either found a word or not. We look ahead to the
 464 -        // next word. If it's not a dictionary word, we will combine it with the word we
 465 -        // just found (if there is one), but only if the preceding word does not exceed
 466 -        // the threshold.
 467 -        // The text iterator should now be positioned at the end of the word we found.
 468 -        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
 469 -            // if it is a dictionary word, do nothing. If it isn't, then if there is
 470 -            // no preceding word, or the non-word shares less than the minimum threshold
 471 -            // of characters with a dictionary word, then scan to resynchronize
 472 -            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 473 -                  && (cuWordLength == 0
 474 -                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
 475 -                // Look for a plausible word boundary
 476 -                int32_t remaining = rangeEnd - (current+cuWordLength);
 477 -                UChar32 pc;
 478 -                UChar32 uc;
 479 -                int32_t chars = 0;
 480 -                for (;;) {
 481 -                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
 482 -                    pc = utext_next32(text);
 483 -                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
 484 -                    chars += pcSize;
 485 -                    remaining -= pcSize;
 486 -                    if (remaining <= 0) {
 487 +        if (count == 0) {
 488 +            utext_setNativeIndex(text, ix);
 489 +            int32_t c = utext_current32(text);
 490 +            if (fPuncSet.contains(c) || fIgnoreSet.contains(c) || c == ZWSP) {
 491 +                values.setElementAt(0, count);
 492 +                lengths.setElementAt(1, count++);
 493 +            } else if (fBaseSet.contains(c)) {
 494 +                int32_t currix = utext_getNativeIndex(text);
 495 +                do {
 496 +                    utext_next32(text);
 497 +                    c = utext_current32(text);
 498 +                    if (utext_getNativeIndex(text) >= rangeEnd)
 499                          break;
 500 -                    }
 501 -                    uc = utext_current32(text);
 502 -                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 503 -                        // Maybe. See if it's in the dictionary.
 504 -                        int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 505 -                        utext_setNativeIndex(text, current+cuWordLength+chars);
 506 -                        if (candidates > 0) {
 507 +                    if (c == 0x17D2) { // Coeng
 508 +                        utext_next32(text);
 509 +                        c = utext_current32(text);
 510 +                        if (!fBaseSet.contains(c) || utext_getNativeIndex(text) >= rangeEnd) {
 511                              break;
 512 +                        } else {
 513 +                            utext_next32(text);
 514 +                            c = utext_current32(text);
 515 +                            if (utext_getNativeIndex(text) >= rangeEnd)
 516 +                                break;
 517                          }
 518                      }
 519 -                }
 520 -
 521 -                // Bump the word count if there wasn't already one
 522 -                if (cuWordLength <= 0) {
 523 -                    wordsFound += 1;
 524 -                }
 525 +                } while (fMarkSet.contains(c) || fIgnoreSet.contains(c));
 526 +                values.setElementAt(BADSNLP, count);
 527 +                lengths.setElementAt(utext_getNativeIndex(text) - currix, count++);
 528 +            } else {
 529 +                values.setElementAt(BADSNLP, count);
 530 +                lengths.setElementAt(1, count++);
 531 +            }
 532 +        }
 533
 534 -                // Update the length with the passed-over characters
 535 -                cuWordLength += chars;
 536 +        for (int32_t j = 0; j < count; j++) {
 537 +            uint32_t v = values.elementAti(j);
 538 +            int32_t newSnlp = bestSnlp.elementAti(i) + v;
 539 +            int32_t ln = lengths.elementAti(j);
 540 +            utext_setNativeIndex(text, ln+ix);
 541 +            int32_t c = utext_current32(text);
 542 +            while ((fPuncSet.contains(c) || fIgnoreSet.contains(c)) && ln + i < numCodePts) {
 543 +                ++ln;
 544 +                utext_next32(text);
 545 +                c = utext_current32(text);
 546              }
 547 -            else {
 548 -                // Back up to where we were for next iteration
 549 -                utext_setNativeIndex(text, current+cuWordLength);
 550 +            int32_t ln_j_i = ln + i;   // yes really i!
 551 +            if (newSnlp < bestSnlp.elementAti(ln_j_i)) {
 552 +                if (v == BADSNLP) {
 553 +                    int32_t p = prev.elementAti(i);
 554 +                    if (p < 0)
 555 +                        prev.setElementAt(p, ln_j_i);
 556 +                    else
 557 +                        prev.setElementAt(-i, ln_j_i);
 558 +                }
 559 +                else
 560 +                    prev.setElementAt(i, ln_j_i);
 561 +                bestSnlp.setElementAt(newSnlp, ln_j_i);
 562              }
 563          }
 564 +    }
 565 +    // Start pushing the optimal offset index into t_boundary (t for tentative).
 566 +    // prev[numCodePts] is guaranteed to be meaningful.
 567 +    // We'll first push in the reverse order, i.e.,
 568 +    // t_boundary[0] = numCodePts, and afterwards do a swap.
 569 +    UVector32 t_boundary(numCodePts+1, status);
 570
 571 -        // Never stop before a combining mark.
 572 -        int32_t currPos;
 573 -        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
 574 -            utext_next32(text);
 575 -            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
 576 +    int32_t numBreaks = 0;
 577 +    // No segmentation found, set boundary to end of range
 578 +    while (numCodePts >= 0 && (uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
 579 +        --numCodePts;
 580 +    }
 581 +    if (numCodePts < 0) {
 582 +        t_boundary.addElement(numCodePts, status);
 583 +        numBreaks++;
 584 +    } else {
 585 +        for (int32_t i = numCodePts; (uint32_t)i != kuint32max; i = prev.elementAti(i)) {
 586 +            if (i < 0) i = -i;
 587 +            t_boundary.addElement(i, status);
 588 +            numBreaks++;
 589          }
 590 +        U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
 591 +    }
 592
 593 -        // Look ahead for possible suffixes if a dictionary word does not follow.
 594 -        // We do this in code rather than using a rule so that the heuristic
 595 -        // resynch continues to function. For example, one of the suffix characters
 596 -        // could be a typo in the middle of a word.
 597 -//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
 598 -//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 599 -//                && fSuffixSet.contains(uc = utext_current32(text))) {
 600 -//                if (uc == KHMER_PAIYANNOI) {
 601 -//                    if (!fSuffixSet.contains(utext_previous32(text))) {
 602 -//                        // Skip over previous end and PAIYANNOI
 603 -//                        utext_next32(text);
 604 -//                        utext_next32(text);
 605 -//                        wordLength += 1;            // Add PAIYANNOI to word
 606 -//                        uc = utext_current32(text);     // Fetch next character
 607 -//                    }
 608 -//                    else {
 609 -//                        // Restore prior position
 610 -//                        utext_next32(text);
 611 -//                    }
 612 -//                }
 613 -//                if (uc == KHMER_MAIYAMOK) {
 614 -//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
 615 -//                        // Skip over previous end and MAIYAMOK
 616 -//                        utext_next32(text);
 617 -//                        utext_next32(text);
 618 -//                        wordLength += 1;            // Add MAIYAMOK to word
 619 -//                    }
 620 -//                    else {
 621 -//                        // Restore prior position
 622 -//                        utext_next32(text);
 623 -//                    }
 624 -//                }
 625 -//            }
 626 -//            else {
 627 -//                utext_setNativeIndex(text, current+wordLength);
 628 -//            }
 629 -//        }
 630 -
 631 -        // Did we find a word on this iteration? If so, push it on the break stack
 632 -        if (cuWordLength > 0) {
 633 -            foundBreaks.push((current+cuWordLength), status);
 634 +    // Now that we're done, convert positions in t_boundary[] (offsets from
 635 +    // rangeStart) back to indices in the original input UText
 636 +    // while reversing t_boundary and pushing values to foundBreaks.
 637 +    for (int32_t i = numBreaks-1; i >= 0; i--) {
 638 +        int32_t cpPos = t_boundary.elementAti(i);
 639 +        if (cpPos == 0 && !breakStart && fTypes >= UBRK_LINE) continue;
 640 +        int32_t utextPos = cpPos + rangeStart;
 641 +        while (utextPos > after && scanWJ(text, utextPos, scanEnd, before, after));
 642 +        if (utextPos < before) {
 643 +        // Boundaries are added to foundBreaks output in ascending order.
 644 +            U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos);
 645 +            foundBreaks.push(utextPos, status);
 646          }
 647      }
 648 -
 649 +
 650      // Don't return a break for the end of the dictionary range if there is one there.
 651 -    if (foundBreaks.peeki() >= rangeEnd) {
 652 +    if (!breakEnd && fTypes >= UBRK_LINE && foundBreaks.peeki() >= rangeEnd) {
 653          (void) foundBreaks.popi();
 654 -        wordsFound -= 1;
 655      }
 656 -
 657 -    return wordsFound;
 658 +    return foundBreaks.size() - wordsFound;
 659  }
 660
 661  #if !UCONFIG_NO_NORMALIZATION
 662 diff --git a/source/common/dictbe.h b/source/common/dictbe.h
 663 index d3488cd..26caa75 100644
 664 --- misc/icu/source/common/dictbe.h
 665 +++ build/icu/source/common/dictbe.h
 666 @@ -32,6 +32,15 @@ class Normalizer2;
 667   */
 668  class DictionaryBreakEngine : public LanguageBreakEngine {
 669   private:
 670 +
 671 +  /**
 672 +   * <p>Default constructor.</p>
 673 +   *
 674 +   */
 675 +  DictionaryBreakEngine();
 676 +
 677 + protected:
 678 +
 679      /**
 680       * The set of characters handled by this engine
 681       * @internal
 682 @@ -46,11 +55,63 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
 683
 684    uint32_t      fTypes;
 685
 686 +  const int32_t WJ   = 0x2060;
 687 +  const int32_t ZWSP = 0x200B;
 688 +
 689    /**
 690 -   * <p>Default constructor.</p>
 691 -   *
 692 +   * A Unicode set of all viramas
 693 +   * @internal
 694     */
 695 -  DictionaryBreakEngine();
 696 +  UnicodeSet    fViramaSet;
 697 +
 698 +  /**
 699 +   * A Unicode set of all base characters
 700 +   * @internal
 701 +   */
 702 +  UnicodeSet    fBaseSet;
 703 +
 704 +  /**
 705 +   * A Unicode set of all marks
 706 +   * @internal
 707 +   */
 708 +  UnicodeSet    fMarkSet;
 709 +
 710 +  /**
 711 +   * A Unicode set of all characters ignored in dictionary matching
 712 +   * @internal
 713 +   */
 714 +  UnicodeSet    fIgnoreSet;
 715 +
 716 +  /**
 717 +   * A Unicode set of opening punctuation, quotation marks and ZWNJ/ZWJ/WJ (the start skip set)
 718 +   * @internal
 719 +   */
 720 +  UnicodeSet    fSkipStartSet;
 721 +
 722 +  /**
 723 +   * A Unicode set of closing punctuation, quotation and exclamation marks and ZWNJ/ZWJ/WJ (the end skip set)
 724 +   * @internal
 725 +   */
 726 +  UnicodeSet    fSkipEndSet;
 727 +
 728 +  /**
 729 +   * A Unicode set of all characters that should not be broken before
 730 +   * @internal
 731 +   */
 732 +  UnicodeSet    fNBeforeSet;
 733 +
 734 +  /**
 735 +   * The number of clusters within which breaks are inhibited
 736 +   * @internal
 737 +   */
 738 +  int32_t clusterLimit;
 739 +
 740 +  bool scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const;
 741 +
 742 +  bool scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const;
 743 +  bool scanAfterEnd(UText *text, int32_t rangeEnd, int32_t& end, bool &doBreak) const;
 744 +  void scanBackClusters(UText *text, int32_t textStart, int32_t& start) const;
 745 +  void scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const;
 746
 747   public:
 748
 749 @@ -81,7 +142,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
 750     * <p>Find any breaks within a run in the supplied text.</p>
 751     *
 752     * @param text A UText representing the text. The iterator is left at
 753 -   * the end of the run of characters which the engine is capable of handling
 754 +   * the end of the run of characters which the engine is capable of handling
 755     * that starts from the first (or last) character in the range.
 756     * @param startPos The start of the run within the supplied text.
 757     * @param endPos The end of the run within the supplied text.
 758 @@ -243,118 +304,120 @@ class LaoBreakEngine : public DictionaryBreakEngine {
 759
 760  };
 761
 762 -/*******************************************************************
 763 - * BurmeseBreakEngine
 764 - */
 765 -
 766 -/**
 767 - * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
 768 - * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
 769 - *
 770 - * <p>After it is constructed a BurmeseBreakEngine may be shared between
 771 - * threads without synchronization.</p>
 772 - */
 773 -class BurmeseBreakEngine : public DictionaryBreakEngine {
 774 - private:
 775 -    /**
 776 -     * The set of characters handled by this engine
 777 -     * @internal
 778 -     */
 779 -
 780 -  UnicodeSet                fBurmeseWordSet;
 781 -  UnicodeSet                fEndWordSet;
 782 -  UnicodeSet                fBeginWordSet;
 783 -  UnicodeSet                fMarkSet;
 784 -  DictionaryMatcher  *fDictionary;
 785 -
 786 - public:
 787 -
 788 -  /**
 789 -   * <p>Default constructor.</p>
 790 -   *
 791 -   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
 792 -   * engine is deleted.
 793 -   */
 794 -  BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
 795 -
 796 -  /**
 797 -   * <p>Virtual destructor.</p>
 798 -   */
 799 -  virtual ~BurmeseBreakEngine();
 800 -
 801 - protected:
 802 - /**
 803 -  * <p>Divide up a range of known dictionary characters.</p>
 804 -  *
 805 -  * @param text A UText representing the text
 806 -  * @param rangeStart The start of the range of dictionary characters
 807 -  * @param rangeEnd The end of the range of dictionary characters
 808 -  * @param foundBreaks Output of C array of int32_t break positions, or 0
 809 -  * @return The number of breaks found
 810 -  */
 811 -  virtual int32_t divideUpDictionaryRange( UText *text,
 812 -                                           int32_t rangeStart,
 813 -                                           int32_t rangeEnd,
 814 -                                           UStack &foundBreaks ) const;
 815 -
 816 -};
 817 -
 818 -/*******************************************************************
 819 - * KhmerBreakEngine
 820 - */
 821 -
 822 -/**
 823 - * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
 824 - * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
 825 - *
 826 - * <p>After it is constructed a KhmerBreakEngine may be shared between
 827 - * threads without synchronization.</p>
 828 - */
 829 -class KhmerBreakEngine : public DictionaryBreakEngine {
 830 - private:
 831 -    /**
 832 -     * The set of characters handled by this engine
 833 -     * @internal
 834 -     */
 835 -
 836 -  UnicodeSet                fKhmerWordSet;
 837 -  UnicodeSet                fEndWordSet;
 838 -  UnicodeSet                fBeginWordSet;
 839 -  UnicodeSet                fMarkSet;
 840 -  DictionaryMatcher  *fDictionary;
 841 -
 842 - public:
 843 -
 844 -  /**
 845 -   * <p>Default constructor.</p>
 846 -   *
 847 -   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
 848 -   * engine is deleted.
 849 -   */
 850 -  KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
 851 -
 852 -  /**
 853 -   * <p>Virtual destructor.</p>
 854 -   */
 855 -  virtual ~KhmerBreakEngine();
 856 -
 857 - protected:
 858 - /**
 859 -  * <p>Divide up a range of known dictionary characters.</p>
 860 -  *
 861 -  * @param text A UText representing the text
 862 -  * @param rangeStart The start of the range of dictionary characters
 863 -  * @param rangeEnd The end of the range of dictionary characters
 864 -  * @param foundBreaks Output of C array of int32_t break positions, or 0
 865 -  * @return The number of breaks found
 866 -  */
 867 -  virtual int32_t divideUpDictionaryRange( UText *text,
 868 -                                           int32_t rangeStart,
 869 -                                           int32_t rangeEnd,
 870 -                                           UStack &foundBreaks ) const;
 871 -
 872 -};
 873 -
 874 +/*******************************************************************
 875 + * BurmeseBreakEngine
 876 + */
 877 +
 878 +/**
 879 + * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
 880 + * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
 881 + *
 882 + * <p>After it is constructed a BurmeseBreakEngine may be shared between
 883 + * threads without synchronization.</p>
 884 + */
 885 +class BurmeseBreakEngine : public DictionaryBreakEngine {
 886 + private:
 887 +    /**
 888 +     * The set of characters handled by this engine
 889 +     * @internal
 890 +     */
 891 +
 892 +  UnicodeSet                fBurmeseWordSet;
 893 +  UnicodeSet                fEndWordSet;
 894 +  UnicodeSet                fBeginWordSet;
 895 +  UnicodeSet                fMarkSet;
 896 +  DictionaryMatcher  *fDictionary;
 897 +
 898 + public:
 899 +
 900 +  /**
 901 +   * <p>Default constructor.</p>
 902 +   *
 903 +   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
 904 +   * engine is deleted.
 905 +   */
 906 +  BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
 907 +
 908 +  /**
 909 +   * <p>Virtual destructor.</p>
 910 +   */
 911 +  virtual ~BurmeseBreakEngine();
 912 +
 913 + protected:
 914 + /**
 915 +  * <p>Divide up a range of known dictionary characters.</p>
 916 +  *
 917 +  * @param text A UText representing the text
 918 +  * @param rangeStart The start of the range of dictionary characters
 919 +  * @param rangeEnd The end of the range of dictionary characters
 920 +  * @param foundBreaks Output of C array of int32_t break positions, or 0
 921 +  * @return The number of breaks found
 922 +  */
 923 +  virtual int32_t divideUpDictionaryRange( UText *text,
 924 +                                           int32_t rangeStart,
 925 +                                           int32_t rangeEnd,
 926 +                                           UStack &foundBreaks ) const;
 927 +
 928 +};
 929 +
 930 +/*******************************************************************
 931 + * KhmerBreakEngine
 932 + */
 933 +
 934 +/**
 935 + * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
 936 + * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
 937 + *
 938 + * <p>After it is constructed a KhmerBreakEngine may be shared between
 939 + * threads without synchronization.</p>
 940 + */
 941 +class KhmerBreakEngine : public DictionaryBreakEngine {
 942 + private:
 943 +    /**
 944 +     * The set of characters handled by this engine
 945 +     * @internal
 946 +     */
 947 +
 948 +  UnicodeSet                fKhmerWordSet;
 949 +  UnicodeSet                fBeginWordSet;
 950 +  UnicodeSet                fPuncSet;
 951 +  DictionaryMatcher        *fDictionary;
 952 +
 953 +  const uint32_t BADSNLP = 256 * 20;
 954 +  const uint32_t kuint32max = 0x7FFFFFFF;
 955 +
 956 + public:
 957 +
 958 +  /**
 959 +   * <p>Default constructor.</p>
 960 +   *
 961 +   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
 962 +   * engine is deleted.
 963 +   */
 964 +  KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
 965 +
 966 +  /**
 967 +   * <p>Virtual destructor.</p>
 968 +   */
 969 +  virtual ~KhmerBreakEngine();
 970 +
 971 + protected:
 972 + /**
 973 +  * <p>Divide up a range of known dictionary characters.</p>
 974 +  *
 975 +  * @param text A UText representing the text
 976 +  * @param rangeStart The start of the range of dictionary characters
 977 +  * @param rangeEnd The end of the range of dictionary characters
 978 +  * @param foundBreaks Output of C array of int32_t break positions, or 0
 979 +  * @return The number of breaks found
 980 +  */
 981 +  virtual int32_t divideUpDictionaryRange( UText *text,
 982 +                                           int32_t rangeStart,
 983 +                                           int32_t rangeEnd,
 984 +                                           UStack &foundBreaks ) const;
 985 +
 986 +};
 987 +
 988  #if !UCONFIG_NO_NORMALIZATION
 989
 990  /*******************************************************************
 991 diff --git a/source/common/dictionarydata.cpp b/source/common/dictionarydata.cpp
 992 index cb594c6..82f2e77 100644
 993 --- misc/icu/source/common/dictionarydata.cpp
 994 +++ build/icu/source/common/dictionarydata.cpp
 995 @@ -42,7 +42,7 @@ int32_t UCharsDictionaryMatcher::getType() const {
 996
 997  int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
 998                              int32_t *lengths, int32_t *cpLengths, int32_t *values,
 999 -                            int32_t *prefix) const {
1000 +                            int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {
1001
1002      UCharsTrie uct(characters);
1003      int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
1004 @@ -53,7 +53,13 @@ int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t
1005          UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
1006          int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
1007          codePointsMatched += 1;
1008 +        if (ignoreSet != NULL && ignoreSet->contains(c)) {
1009 +            continue;
1010 +        }
1011          if (USTRINGTRIE_HAS_VALUE(result)) {
1012 +            if (codePointsMatched < minLength) {
1013 +                continue;
1014 +            }
1015              if (wordCount < limit) {
1016                  if (values != NULL) {
1017                      values[wordCount] = uct.getValue();
1018 @@ -110,7 +116,7 @@ int32_t BytesDictionaryMatcher::getType() const {
1019
1020  int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
1021                              int32_t *lengths, int32_t *cpLengths, int32_t *values,
1022 -                            int32_t *prefix) const {
1023 +                            int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {
1024      BytesTrie bt(characters);
1025      int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
1026      int32_t wordCount = 0;
1027 @@ -120,7 +126,13 @@ int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t
1028          UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
1029          int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
1030          codePointsMatched += 1;
1031 +        if (ignoreSet != NULL && ignoreSet->contains(c)) {
1032 +            continue;
1033 +        }
1034          if (USTRINGTRIE_HAS_VALUE(result)) {
1035 +            if (codePointsMatched < minLength) {
1036 +                continue;
1037 +            }
1038              if (wordCount < limit) {
1039                  if (values != NULL) {
1040                      values[wordCount] = bt.getValue();
1041 diff --git a/source/common/dictionarydata.h b/source/common/dictionarydata.h
1042 index 0216ab0..ee9e571 100644
1043 --- misc/icu/source/common/dictionarydata.h
1044 +++ build/icu/source/common/dictionarydata.h
1045 @@ -19,6 +19,7 @@
1046  #include "unicode/utext.h"
1047  #include "unicode/udata.h"
1048  #include "udataswp.h"
1049 +#include "unicode/uniset.h"
1050  #include "unicode/uobject.h"
1051  #include "unicode/ustringtrie.h"
1052
1053 @@ -90,7 +91,7 @@ public:
1054       */
1055      virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
1056                              int32_t *lengths, int32_t *cpLengths, int32_t *values,
1057 -                            int32_t *prefix) const = 0;
1058 +                            int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const = 0;
1059
1060      /** @return DictionaryData::TRIE_TYPE_XYZ */
1061      virtual int32_t getType() const = 0;
1062 @@ -105,7 +106,7 @@ public:
1063      virtual ~UCharsDictionaryMatcher();
1064      virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
1065                              int32_t *lengths, int32_t *cpLengths, int32_t *values,
1066 -                            int32_t *prefix) const;
1067 +                            int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const;
1068      virtual int32_t getType() const;
1069  private:
1070      const UChar *characters;
1071 @@ -123,7 +124,7 @@ public:
1072      virtual ~BytesDictionaryMatcher();
1073      virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
1074                              int32_t *lengths, int32_t *cpLengths, int32_t *values,
1075 -                            int32_t *prefix) const;
1076 +                            int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const;
1077      virtual int32_t getType() const;
1078  private:
1079      UChar32 transform(UChar32 c) const;
1080 diff --git a/source/data/Makefile.in b/source/data/Makefile.in
1081 index 816c82d..c637d70 100644
1082 --- misc/icu/source/data/Makefile.in
1083 +++ build/icu/source/data/Makefile.in
1084 @@ -181,7 +181,7 @@ endif
1085  endif
1086  endif
1087
1088 -packagedata: icupkg.inc $(PKGDATA_LIST) build-local
1089 +packagedata: icupkg.inc $(PKGDATA_LIST) build-local $(MAINBUILDDIR)/khmerdict.stamp
1090  ifneq ($(ENABLE_STATIC),)
1091  ifeq ($(PKGDATA_MODE),dll)
1092         $(PKGDATA_INVOKE) $(PKGDATA) -e $(ICUDATA_ENTRY_POINT) -T $(OUTTMPDIR) -p $(ICUDATA_NAME) $(PKGDATA_LIBSTATICNAME) -m static $(PKGDATA_VERSIONING) $(PKGDATA_LIST)
1093 @@ -564,8 +564,14 @@ $(BRKBLDDIR)/burmesedict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
1094         $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1000 -c -i $(BUILDDIR) $(DICTSRCDIR)/burmesedict.txt $(BRKBLDDIR)/burmesedict.dict
1095
1096  # TODO: figure out why combining characters are here?
1097 -$(BRKBLDDIR)/khmerdict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
1098 -       $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(DICTSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict
1099 +#$(BRKBLDDIR)/khmerdict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
1100 +#      $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(DICTSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict
1101 +
1102 +#$(MAINBUILDDIR)/khmerdict.stamp: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(BRKSRCDIR)/khmerdict.txt build-local
1103 +#      $(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(DICTSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict
1104 +$(MAINBUILDDIR)/khmerdict.stamp: $(BRKSRCDIR)/khmerdict.dict build-local
1105 +       cp $< $(BRKBLDDIR)
1106 +       echo "timestamp" > $@
1107
1108  ####################################################    CFU
1109  # CFU FILES
1110