i18npool/qa/cppunit/test_breakiterator.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  */
   9
  10 #include <com/sun/star/i18n/XBreakIterator.hpp>
  11 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
  12 #include <com/sun/star/i18n/ScriptType.hpp>
  13 #include <com/sun/star/i18n/WordType.hpp>
  14 #include <unotest/bootstrapfixturebase.hxx>
  15
  16 #include <unicode/uvernum.h>
  17
  18 #include <rtl/strbuf.hxx>
  19
  20 #include <string.h>
  21
  22 #include <stack>
  23
  24 using namespace ::com::sun::star;
  25
  26 class TestBreakIterator : public test::BootstrapFixtureBase
  27 {
     // CppUnit fixture, derived from test::BootstrapFixtureBase, whose test
     // methods all call an i18n::XBreakIterator through the m_xBreak member.
  28 public:
         // Per-test hooks overriding the base fixture; their definitions are
         // not part of this declaration.
  29     virtual void setUp() override;
  30     virtual void tearDown() override;
  31
         // One method per tested area; each is registered with CppUnit below.
  32     void testLineBreaking();
  33     void testWordBoundaries();
  34     void testGraphemeIteration();
  35     void testWeak();
  36     void testAsian();
  37     void testThai();
         // testLao is only declared when the ICU major version is above 51;
         // testNorthernThai and testKhmer additionally need TODO to be defined.
  38 #if (U_ICU_VERSION_MAJOR_NUM > 51)
  39     void testLao();
  40 #ifdef TODO
  41     void testNorthernThai();
  42     void testKhmer();
  43 #endif
  44 #endif
  45     void testJapanese();
  46     void testChinese();
  47
         // Suite registration. The preprocessor guards repeat the ones used for
         // the declarations above so that only declared tests are registered.
  48     CPPUNIT_TEST_SUITE(TestBreakIterator);
  49     CPPUNIT_TEST(testLineBreaking);
  50     CPPUNIT_TEST(testWordBoundaries);
  51     CPPUNIT_TEST(testGraphemeIteration);
  52     CPPUNIT_TEST(testWeak);
  53     CPPUNIT_TEST(testAsian);
  54     CPPUNIT_TEST(testThai);
  55 #if (U_ICU_VERSION_MAJOR_NUM > 51)
  56     CPPUNIT_TEST(testLao);
  57 #ifdef TODO
  58     CPPUNIT_TEST(testKhmer);
  59     CPPUNIT_TEST(testNorthernThai);
  60 #endif
  61 #endif
  62     CPPUNIT_TEST(testJapanese);
  63     CPPUNIT_TEST(testChinese);
  64     CPPUNIT_TEST_SUITE_END();
  65
  66 private:
         // Break iterator used by the tests.
         // NOTE(review): presumably created in setUp() and released in
         // tearDown() - their definitions are outside this view; confirm.
  67     uno::Reference<i18n::XBreakIterator> m_xBreak;
         // Helper that receives the break iterator to exercise as a parameter.
         // NOTE(review): presumably lets the Japanese checks run against more
         // than one iterator instance - definition not visible here; confirm.
  68     void doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak);
  69 };
  70
  71 void TestBreakIterator::testLineBreaking()
  72 {
         // Regression checks for XBreakIterator::getLineBreak. Every call passes
         // 0 as the fourth argument together with default-constructed
         // hyphenation and user options. The single aLocale object is re-pointed
         // at the relevant language/country before each case, and each case
         // asserts on the breakIndex field of the returned LineBreakResults.
  73     i18n::LineBreakHyphenationOptions aHyphOptions;
  74     i18n::LineBreakUserOptions aUserOptions;
  75     lang::Locale aLocale;
  76
  77     //See https://bugs.libreoffice.org/show_bug.cgi?id=31271
         // An opening parenthesis at the start of the text must not attract the
         // break: the expected break is always the start of the word containing
         // the start position.
  78     {
  79         OUString aTest("(some text here)");
  80
  81         aLocale.Language = "en";
  82         aLocale.Country = "US";
  83
  84         {
  85             //Here we want the line break to leave "text here)" on the next line
                 // The start position is 9 (inside "text"); index 6 is the 't' of "text".
  86             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions);
  87             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(6), aResult.breakIndex);
  88         }
  89
  90         {
  91             //Here we want the line break to leave "here)" on the next line
                 // The start position is 15 (the closing parenthesis); index 11 is the
                 // 'h' of "here".
  92             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions);
  93             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(11), aResult.breakIndex);
  94         }
  95     }
  96
  97     //See https://bugs.libreoffice.org/show_bug.cgi?id=49849
         // The same five-letter Hebrew word twice, separated by one space.
  98     {
  99         const sal_Unicode HEBREW1[] = { 0x05DE, 0x05D9, 0x05DC, 0x05D9, 0x5DD };
 100         OUString aWord(HEBREW1, SAL_N_ELEMENTS(HEBREW1));
 101         OUString aTest(aWord + " " + aWord);
 102
 103         aLocale.Language = "he";
 104         aLocale.Country = "IL";
 105
 106         {
 107             //Here we want the line break to happen at the whitespace
                 // The start position is the last character of the second word. The
                 // expected breakIndex is the index just past the space, i.e. the
                 // first character of the second word.
 108             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions);
 109             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", aWord.getLength()+1, aResult.breakIndex);
 110         }
 111     }
 112
 113     //See https://bz.apache.org/ooo/show_bug.cgi?id=17155
         // Slashes inside a path-like token must not offer a break opportunity.
 114     {
 115         OUString const aTest("foo /bar/baz");
 116
 117         aLocale.Language = "en";
 118         aLocale.Country = "US";
 119
 120         {
 121             //Here we want the line break to leave /bar/ba clumped together on the next line
                 // The start position is 11 (the final 'z'); index 4 is the first '/'.
 122             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("foo /bar/ba"), aLocale, 0,
 123                 aHyphOptions, aUserOptions);
 124             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first slash", static_cast<sal_Int32>(4), aResult.breakIndex);
 125         }
 126     }
 127
 128     //See https://bz.apache.org/ooo/show_bug.cgi?id=19716
         // A closing bracket embedded in a run of letters must not split the run.
 129     {
 130         OUString aTest("aaa]aaa");
 131
 132         aLocale.Language = "en";
 133         aLocale.Country = "US";
 134
 135         {
 136             //Here we want the line break to move the whole lot to the next line
                 // The start position is 5, after the ']'; the only accepted break is
                 // index 0.
 137             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
 138                 aHyphOptions, aUserOptions);
 139             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the line, not at ]", static_cast<sal_Int32>(0), aResult.breakIndex);
 140         }
 141     }
 142
 143     //this is an example sequence from tdf92993-1.docx caught by the load crashtesting
         // The text consists of UTF-16 surrogate pairs plus U+2668 followed by
         // the variation selector U+FE0F.
 144     {
 145         const sal_Unicode WEIRD1[] = { 0xd83c, 0xdf56, 0xd83c, 0xdf57, 0xd83c, 0xdf46,
 146                                        0xd83c, 0xdf64, 0x2668, 0xfe0f, 0xd83c, 0xdfc6};
 147
 148         OUString aTest(WEIRD1, SAL_N_ELEMENTS(WEIRD1));
 149
 150         aLocale.Language = "en";
 151         aLocale.Country = "US";
 152
 153         {
 154             //This must not assert/crash
                 // The result is deliberately discarded; only surviving the call matters.
 155             (void)m_xBreak->getLineBreak(aTest, 0, aLocale, 0, aHyphOptions, aUserOptions);
 156         }
 157     }
 158
 159     //See https://bugs.documentfoundation.org/show_bug.cgi?id=96197
         // Two runs of Hangul syllables separated by a space at index 4.
 160     {
 161         const sal_Unicode HANGUL[] = { 0xc560, 0xad6D, 0xac00, 0xc758, 0x0020, 0xac00,
 162                                        0xc0ac, 0xb294};
 163         OUString aTest(HANGUL, SAL_N_ELEMENTS(HANGUL));
 164
 165         aLocale.Language = "ko";
 166         aLocale.Country = "KR";
 167
 168         {
                 // The start position is 6, inside the second run. The expected break
                 // is index 5, the first syllable after the space, so the second run
                 // stays in one piece.
 169             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
 170                 aHyphOptions, aUserOptions);
 171             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex);
 172         }
 173     }
 174 }
 175
 176 //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
     // Regression checks for the word-level XBreakIterator calls used here:
     // isBeginWord, isEndWord, getWordBoundary, nextWord and previousWord.
     // The function starts in en-US and re-points aLocale as individual cases
     // require; aBounds is reused as scratch storage for returned boundaries.
     // All positions are UTF-16 code unit indices; endPos is exclusive.
 177 void TestBreakIterator::testWordBoundaries()
 178 {
 179     lang::Locale aLocale;
 180     aLocale.Language = "en";
 181     aLocale.Country = "US";
 182
 183     i18n::Boundary aBounds;
 184
 185     //See https://bz.apache.org/ooo/show_bug.cgi?id=11993
         // Layout of the test string: "abcd" 0-3, space 4, "ef" 5-6, spaces 7-8,
         // "ghi" 9-11, "???" 12-14, space 15, "KLM" 16-18.
 186     {
 187         OUString aTest("abcd ef  ghi??? KLM");
 188
             // Position 4 (the first space) ends "abcd" but begins nothing.
 189         CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
 190         CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
 191         aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
 192         CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
 193
             // Position 8 is the second of two spaces: neither a word start nor a word end.
 194         CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
 195         CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
 196
 197         //next word
             // With the last argument true the boundary of the following word, "ghi", is returned.
 198         aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, true);
 199         CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
 200
 201         //previous word
             // With the last argument false the boundary of the preceding word, "ef", is returned.
 202         aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 203         CPPUNIT_ASSERT(aBounds.startPos == 5 && aBounds.endPos == 7);
 204
             // Position 12 (the first '?') ends "ghi"; the question marks are not part of the word.
 205         CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
 206         CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
 207         aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, true);
 208         CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
 209
             // Position 16 begins "KLM", which runs to the end of the string.
 210         CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
 211         CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
 212         aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD, true);
 213         CPPUNIT_ASSERT(aBounds.startPos == 16 && aBounds.endPos == 19);
 214     }
 215
 216     //See https://bz.apache.org/ooo/show_bug.cgi?id=21907
         // In "b a?" ('b' 0, space 1, 'a' 2, '?' 3) positions 1, 2 and 3 are each
         // expected to be both a word begin and a word end for ANY_WORD;
         // position 3 is also checked for ANYWORD_IGNOREWHITESPACES.
 217     {
 218         OUString aTest("b a?");
 219
 220         CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
 221         CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
 222         CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
 223
 224         CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
 225
 226         CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
 227         CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
 228         CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
 229
 230         CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
 231     }
 232
 233     //See https://bz.apache.org/ooo/show_bug.cgi?id=14904
         // Words wrapped in curly quotes (U+201C/U+201D, U+2018/U+2019), plain
         // '?' and the inverted question mark U+00BF. The expected dictionary
         // word bounds leave out the adjacent quotes and question marks, while
         // the U+2019 apostrophe inside "doesn't" (indices 64-70) stays in the word.
 234     {
 235         const sal_Unicode TEST[] =
 236         {
 237             'W', 'o', 'r', 'k', 'i', 'n', 'g', ' ', 0x201C, 'W', 'o', 'r', 'd', 's',
 238             ' ', 's', 't', 'a', 'r', 't', 'i', 'n', 'g', ' ', 'w', 'i', 't',
 239             'h', ' ', 'q', 'u', 'o', 't', 'e', 's', 0x201D, ' ', 'W', 'o', 'r', 'k',
 240             'i', 'n', 'g', ' ', 0x2018, 'B', 'r', 'o', 'k', 'e', 'n', 0x2019, ' ',
 241             '?', 'S', 'p', 'a', 'n', 'i', 's', 'h', '?', ' ', 'd', 'o', 'e',
 242             's', 'n', 0x2019, 't', ' ', 'w', 'o', 'r', 'k', '.', ' ', 'N', 'o',
 243             't', ' ', 'e', 'v', 'e', 'n', ' ' , 0x00BF, 'r', 'e', 'a', 'l', '?', ' ',
 244             'S', 'p', 'a', 'n', 'i', 's', 'h'
 245         };
 246         OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
 247
             // First "Working".
 248         aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 249         CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
 250
             // "Words", preceded by U+201C at index 8.
 251         aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 252         CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 14);
 253
             // Second "Working".
 254         aBounds = m_xBreak->getWordBoundary(aTest, 40, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 255         CPPUNIT_ASSERT(aBounds.startPos == 37 && aBounds.endPos == 44);
 256
             // "Broken", enclosed by U+2018 (index 45) and U+2019 (index 52).
 257         aBounds = m_xBreak->getWordBoundary(aTest, 49, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 258         CPPUNIT_ASSERT(aBounds.startPos == 46 && aBounds.endPos == 52);
 259
             // "Spanish", enclosed by '?' at indices 54 and 62.
 260         aBounds = m_xBreak->getWordBoundary(aTest, 58, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 261         CPPUNIT_ASSERT(aBounds.startPos == 55 && aBounds.endPos == 62);
 262
             // "doesn't" with U+2019 as the apostrophe: one word of seven code units.
 263         aBounds = m_xBreak->getWordBoundary(aTest, 67, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 264         CPPUNIT_ASSERT(aBounds.startPos == 64 && aBounds.endPos == 71);
 265
             // "real", preceded by U+00BF (index 87) and followed by '?' (index 92).
 266         aBounds = m_xBreak->getWordBoundary(aTest, 90, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 267         CPPUNIT_ASSERT(aBounds.startPos == 88 && aBounds.endPos == 92);
 268     }
 269
 270     //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
         // Characters expected to separate words: placed between two copies of
         // "Word", the first word must be [0,4) for every word type from
         // ANY_WORD through WORD_COUNT.
 271     sal_Unicode aBreakTests[] = { ' ', 1, 2, 3, 4, 5, 6, 7, 0x91, 0x92, 0x200B, 0xE8FF, 0xF8FF };
 272     for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
 273     {
 274         //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
 275         for (size_t i = 0; i < SAL_N_ELEMENTS(aBreakTests); ++i)
 276         {
 277 #if (U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM <= 2)
 278             //Note the breakiterator test is known to fail on older icu
 279             //versions (4.2.1) for the 200B (ZWSP) Zero Width Space testcase.
 280             if (aBreakTests[i] == 0x200B)
 281                 continue;
 282 #endif
 283             OUString aTest = "Word" + OUStringChar(aBreakTests[i]) + "Word";
 284             aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
                 // All four named word types currently share the same expectation.
 285             switch (mode)
 286             {
 287                 case i18n::WordType::ANY_WORD:
 288                     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
 289                     break;
 290                 case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
 291                     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
 292                     break;
 293                 case i18n::WordType::DICTIONARY_WORD:
 294                     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
 295                     break;
 296                 case i18n::WordType::WORD_COUNT:
 297                     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
 298                     break;
 299             }
 300
                 // The predicates must agree with the boundary that was just returned.
 301             CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
 302             CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
 303         }
 304     }
 305
         // Characters expected NOT to separate words: "Word" + char + "Word" must
         // come back as a single word [0,9) for every word type.
 306     sal_Unicode aJoinTests[] = { 'X', 0x200C, 0x200D, 0x2060, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB };
 307     for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
 308     {
 309         //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
 310         for (size_t i = 0; i < SAL_N_ELEMENTS(aJoinTests); ++i)
 311         {
 312             OUString aTest = "Word" + OUStringChar(aJoinTests[i]) + "Word";
 313             aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
                 // All four named word types currently share the same expectation.
 314             switch (mode)
 315             {
 316                 case i18n::WordType::ANY_WORD:
 317                     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
 318                     break;
 319                 case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
 320                     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
 321                     break;
 322                 case i18n::WordType::DICTIONARY_WORD:
 323                     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
 324                     break;
 325                 case i18n::WordType::WORD_COUNT:
 326                     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
 327                     break;
 328             }
 329
 330             CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
 331             CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
 332         }
 333     }
 334
 335     //See https://bz.apache.org/ooo/show_bug.cgi?id=13494
         // Walks nextWord/previousWord over "xxAAxxBBxxCCxx" with the 'x'
         // placeholders swapped for each punctuation character, first doubled
         // and then single, and checks every visited start position.
 336     {
 337         const OUString aBase("xxAAxxBBxxCCxx");
 338         const sal_Unicode aTests[] =
 339         {
 340             '\'', ';', ',', '.', '!', '@', '#', '%', '&', '*',
 341             '(', ')', '_', '-', '{', '}', '[', ']', '\"', '/',
 342             '\\', '?', '~', '$', '+', '^', '=', '<', '>', '|'
 343         };
 344
             // Doubled punctuation: each 'x' becomes the character, so the string
             // keeps its length of 14 and a word is expected to start at every
             // even index. The last entry, 14, is the end of the string.
 345         const sal_Int32 aDoublePositions[] = {0, 2, 4, 6, 8, 10, 12, 14};
 346         for (size_t j = 0; j < SAL_N_ELEMENTS(aTests); ++j)
 347         {
 348             OUString aTest = aBase.replace('x', aTests[j]);
 349             sal_Int32 nPos = -1;
 350             size_t i = 0;
                 // Forward pass, starting from -1 so that position 0 is the first hit.
 351             do
 352             {
                     // Guards the table lookup below against walking past its end.
 353                 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aDoublePositions));
 354                 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 355                 CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
 356                 ++i;
 357             }
 358             while (nPos < aTest.getLength());
                 // Backward pass from the end of the string. i starts on the last
                 // table entry and is decremented before each comparison, so the
                 // first expected position is the second-to-last entry.
 359             nPos = aTest.getLength();
 360             i = SAL_N_ELEMENTS(aDoublePositions)-1;
 361             do
 362             {
 363                 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 364                 --i;
 365                 CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
 366             }
 367             while (nPos > 0);
 368         }
 369
             // Single punctuation: every "xx" collapses to one character, giving a
             // string of length 10. j starts at 1 because the apostrophe
             // (aTests[0]) has different expectations and is handled separately below.
 370         const sal_Int32 aSinglePositions[] = {0, 1, 3, 4, 6, 7, 9, 10};
 371         for (size_t j = 1; j < SAL_N_ELEMENTS(aTests); ++j)
 372         {
 373             OUString aTest = aBase.replaceAll("xx", OUStringChar(aTests[j]));
 374             sal_Int32 nPos = -1;
 375             size_t i = 0;
                 // Forward pass over the expected start positions.
 376             do
 377             {
 378                 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSinglePositions));
 379                 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 380                 CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
 381                 ++i;
 382             }
 383             while (nPos < aTest.getLength());
                 // Backward pass; same decrement-before-compare scheme as above.
 384             nPos = aTest.getLength();
 385             i = SAL_N_ELEMENTS(aSinglePositions)-1;
 386             do
 387             {
 388                 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 389                 --i;
 390                 CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
 391             }
 392             while (nPos > 0);
 393         }
 394
             // Apostrophe case: only positions 0, 1, 9 and 10 are expected, i.e.
             // the apostrophes between AA, BB and CC do not start new words.
 395         const sal_Int32 aSingleQuotePositions[] = {0, 1, 9, 10};
             // Sanity check that aTests[0] really is the apostrophe skipped above.
 396         CPPUNIT_ASSERT_EQUAL(u'\'', aTests[0]);
 397         {
 398             OUString aTest = aBase.replaceAll("xx", OUStringChar(aTests[0]));
 399             sal_Int32 nPos = -1;
 400             size_t i = 0;
 401             do
 402             {
 403                 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSingleQuotePositions));
 404                 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 405                 CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
 406                 ++i;
 407             }
 408             while (nPos < aTest.getLength());
 409             nPos = aTest.getLength();
 410             i = SAL_N_ELEMENTS(aSingleQuotePositions)-1;
 411             do
 412             {
 413                 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 414                 --i;
 415                 CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
 416             }
 417             while (nPos > 0);
 418         }
 419     }
 420
 421     //See https://bz.apache.org/ooo/show_bug.cgi?id=13451
         // Catalan (ca-ES): the expected word ends show hyphenated forms such as
         // "mirar-se" and "aneu-vos-en" treated as single dictionary words, with
         // the trailing '!' as its own final unit.
 422     {
 423         aLocale.Language = "ca";
 424         aLocale.Country = "ES";
 425
 426         OUString aTest("mirar-se comprar-vos donem-nos les mans aneu-vos-en!");
 427
 428         sal_Int32 nPos = 0;
 429         sal_Int32 aExpected[] = {8, 20, 30, 34, 39, 51, 52};
 430         size_t i = 0;
 431         do
 432         {
 433             CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
 434             nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
 435                 i18n::WordType::DICTIONARY_WORD, true).endPos;
 436             CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
 437             ++i;
 438         }
             // The post-increment steps one past the returned end so the next query
             // falls into the following token; the loop stops once the returned end
             // equals the text length.
 439         while (nPos++ < aTest.getLength());
             // Every expected boundary must have been consumed.
 440         CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
 441     }
 442
 443     //See https://bz.apache.org/ooo/show_bug.cgi?id=85411
         // U+200B (zero width space) must separate dictionary words in each of
         // the three locales en-US, ca-ES and fi-FI.
 444     for (int j = 0; j < 3; ++j)
 445     {
 446         switch (j)
 447         {
 448             case 0:
 449                 aLocale.Language = "en";
 450                 aLocale.Country = "US";
 451                 break;
 452             case 1:
 453                 aLocale.Language = "ca";
 454                 aLocale.Country = "ES";
 455                 break;
 456             case 2:
 457                 aLocale.Language = "fi";
 458                 aLocale.Country = "FI";
 459                 break;
 460             default:
                     // Unreachable for the loop bounds above; fails loudly if they change.
 461                 CPPUNIT_ASSERT(false);
 462                 break;
 463         }
 464
             // "I", "want", "to", "go" separated by U+200B at indices 1, 6 and 9.
 465         const sal_Unicode TEST[] =
 466         {
 467             'I', 0x200B, 'w', 'a', 'n', 't', 0x200B, 't', 'o', 0x200B, 'g', 'o'
 468         };
 469         OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
 470
 471         sal_Int32 nPos = 0;
 472         sal_Int32 aExpected[] = {1, 6, 9, 12};
 473         size_t i = 0;
 474         do
 475         {
 476             CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
 477             nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
 478                 i18n::WordType::DICTIONARY_WORD, true).endPos;
 479             CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
 480             ++i;
 481         }
             // Step past the separator that follows each returned word end.
 482         while (nPos++ < aTest.getLength());
 483         CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
 484     }
 485
 486     //https://bz.apache.org/ooo/show_bug.cgi?id=21290
         // Greek text containing characters from the U+1Fxx range, checked with
         // en-US and with language "grc" and an empty country.
 487     for (int j = 0; j < 2; ++j)
 488     {
 489         switch (j)
 490         {
 491             case 0:
 492                 aLocale.Language = "en";
 493                 aLocale.Country = "US";
 494                 break;
 495             case 1:
 496                 aLocale.Language = "grc";
 497                 aLocale.Country.clear();
 498                 break;
 499             default:
                     // Unreachable for the loop bounds above; fails loudly if they change.
 500                 CPPUNIT_ASSERT(false);
 501                 break;
 502         }
 503
             // Four words separated by U+0020 at indices 5, 15 and 19.
 504         const sal_Unicode TEST[] =
 505         {
 506             0x1F0C, 0x03BD, 0x03B4, 0x03C1, 0x03B1, 0x0020, 0x1F00,
 507             0x03C1, 0x03BD, 0x1F7B, 0x03BC, 0x03B5, 0x03BD, 0x03BF,
 508             0x03C2, 0x0020, 0x1F00, 0x03BB, 0x03BB, 0x0020, 0x1F24,
 509             0x03C3, 0x03B8, 0x03B9, 0x03BF, 0x03BD
 510         };
 511         OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
 512
 513         sal_Int32 nPos = 0;
 514         sal_Int32 aExpected[] = {5, 15, 19, 26};
 515         size_t i = 0;
 516         do
 517         {
 518             CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
 519             nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
 520                 i18n::WordType::DICTIONARY_WORD, true).endPos;
 521             CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
 522             ++i;
 523         }
             // Step past the space that follows each returned word end.
 524         while (nPos++ < aTest.getLength());
 525         CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
 526     }
 527
 528     //See https://bz.apache.org/ooo/show_bug.cgi?id=58513
 529     //See https://bugs.libreoffice.org/show_bug.cgi?id=55707
         // Finnish (fi-FI): the same sentence is segmented with WORD_COUNT and
         // with DICTIONARY_WORD, and the two modes are expected to differ.
 530     {
 531         aLocale.Language = "fi";
 532         aLocale.Country = "FI";
 533
 534         OUString aTest("Kuorma-auto kaakkois- ja Keski-Suomi USA:n 90:n %:n");
 535
             // WORD_COUNT: the expected ends coincide with the spaces and the end
             // of the text, so "kaakkois-", "USA:n", "90:n" and "%:n" each count
             // as one word.
 536         {
 537             sal_Int32 nPos = 0;
 538             sal_Int32 aExpected[] = {11, 21, 24, 36, 42, 47, 51};
 539             size_t i = 0;
 540             do
 541             {
 542                 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
 543                 nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
 544                     i18n::WordType::WORD_COUNT, true).endPos;
 545                 CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
 546                 ++i;
 547             }
 548             while (nPos++ < aTest.getLength());
 549             CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
 550         }
 551
             // DICTIONARY_WORD: aExpected holds (startPos, endPos) pairs. Here the
             // trailing hyphen of "kaakkois-" is excluded (end 20) and the
             // colon-suffixed tokens are split around the ':'.
 552         {
 553             sal_Int32 nPos = 0;
 554             sal_Int32 aExpected[] = {0, 11, 12, 20, 22, 24, 25, 36, 37,
 555                                     40, 41, 42, 43, 45, 46, 47, 50, 51};
 556             size_t i = 0;
 557             do
 558             {
                     // i advances by two per iteration and the table has an even number
                     // of entries, so this guard also covers the endPos lookup below.
 559                 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
 560                 aBounds = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
 561                     i18n::WordType::DICTIONARY_WORD, true);
 562                 CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.startPos);
 563                 ++i;
 564                 CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.endPos);
 565                 ++i;
 566                 nPos = aBounds.endPos;
 567             }
 568             while (nPos++ < aTest.getLength());
 569             CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
 570         }
 571     }
 572
 573     //See https://bz.apache.org/ooo/show_bug.cgi?id=107843
         // The ligature characters U+FB00 and U+FB01 must be treated as part of
         // the surrounding word rather than as word separators.
 574     {
 575         aLocale.Language = "en";
 576         aLocale.Country = "US";
 577
 578         const sal_Unicode TEST[] =
 579         {
 580             'r', 'u', 0xFB00, 'l', 'e', ' ', 0xFB01, 's', 'h'
 581         };
 582         OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
 583
             // First word: 'r', 'u', U+FB00, 'l', 'e'.
 584         aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 585         CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 5);
 586
             // Second word: U+FB01, 's', 'h'.
 587         aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 588         CPPUNIT_ASSERT(aBounds.startPos == 6 && aBounds.endPos == 9);
 589     }
 590
 591     //See https://bz.apache.org/ooo/show_bug.cgi?id=113785
         // The dashes U+2013 and U+2014 must separate the single-letter words
         // 'a', 'b' and 'c'.
 592     {
 593         aLocale.Language = "en";
 594         aLocale.Country = "US";
 595
 596         const sal_Unicode TEST[] =
 597         {
 598             'a', 0x2013, 'b', 0x2014, 'c'
 599         };
 600         OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
 601
 602         aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
 603         CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 1);
 604
             // nextWord from position 0 must skip the dash and land on 'b'.
 605         aBounds = m_xBreak->nextWord(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD);
 606         CPPUNIT_ASSERT(aBounds.startPos == 2 && aBounds.endPos == 3);
 607
             // Continuing from the end of 'b' must skip the next dash and land on 'c'.
 608         aBounds = m_xBreak->nextWord(aTest, aBounds.endPos, aLocale, i18n::WordType::DICTIONARY_WORD);
 609         CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 5);
 610     }
 611 }
 612
 613 //See https://bugs.libreoffice.org/show_bug.cgi?id=40292
 614 //See https://bz.apache.org/ooo/show_bug.cgi?id=80412
 615 //See https://bz.apache.org/ooo/show_bug.cgi?id=111152
 616 //See https://bz.apache.org/ooo/show_bug.cgi?id=50172
 617 void TestBreakIterator::testGraphemeIteration()
 618 {
 619     lang::Locale aLocale;
 620     aLocale.Language = "bn";
 621     aLocale.Country = "IN";
 622
 623     {
 624         const sal_Unicode BA_HALANT_LA[] = { 0x09AC, 0x09CD, 0x09AF };
 625         OUString aTest(BA_HALANT_LA, SAL_N_ELEMENTS(BA_HALANT_LA));
 626
 627         sal_Int32 nDone=0;
 628         sal_Int32 nPos;
 629         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
 630             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 631         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(BA_HALANT_LA)), nPos);
 632         nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(BA_HALANT_LA), aLocale,
 633             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 634         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
 635     }
 636
 637     {
 638         const sal_Unicode HA_HALANT_NA_VOWELSIGNI[] = { 0x09B9, 0x09CD, 0x09A3, 0x09BF };
 639         OUString aTest(HA_HALANT_NA_VOWELSIGNI, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
 640
 641         sal_Int32 nDone=0;
 642         sal_Int32 nPos;
 643         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
 644             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 645         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI)), nPos);
 646         nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI), aLocale,
 647             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 648         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
 649     }
 650
 651     {
 652         const sal_Unicode TA_HALANT_MA_HALANT_YA  [] = { 0x09A4, 0x09CD, 0x09AE, 0x09CD, 0x09AF };
 653         OUString aTest(TA_HALANT_MA_HALANT_YA, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
 654
 655         sal_Int32 nDone=0;
 656         sal_Int32 nPos;
 657         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
 658             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 659         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA)), nPos);
 660         nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA), aLocale,
 661             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 662         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
 663     }
 664
 665     aLocale.Language = "ta";
 666     aLocale.Country = "IN";
 667
 668     {
 669         const sal_Unicode KA_VIRAMA_SSA[] = { 0x0B95, 0x0BCD, 0x0BB7 };
 670         OUString aTest(KA_VIRAMA_SSA, SAL_N_ELEMENTS(KA_VIRAMA_SSA));
 671
 672         sal_Int32 nDone=0;
 673         sal_Int32 nPos = 0;
 674
 675         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
 676             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 677         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(KA_VIRAMA_SSA)), nPos);
 678         nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VIRAMA_SSA), aLocale,
 679             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 680         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
 681     }
 682
 683     {
 684         const sal_Unicode KA_VOWELSIGNU[] = { 0x0B95, 0x0BC1 };
 685         OUString aTest(KA_VOWELSIGNU, SAL_N_ELEMENTS(KA_VOWELSIGNU));
 686
 687         sal_Int32 nDone=0;
 688         sal_Int32 nPos = 0;
 689
 690         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
 691             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 692         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(KA_VOWELSIGNU)), nPos);
 693         nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VOWELSIGNU), aLocale,
 694             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 695         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
 696     }
 697
 698     {
 699         const sal_Unicode CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI[] =
 700             { 0x0B9A, 0x0BBF, 0x0BA4, 0x0BCD, 0x0BA4, 0x0BBF, 0x0BB0, 0x0BC8 };
 701         OUString aTest(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI,
 702             SAL_N_ELEMENTS(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI));
 703
 704         sal_Int32 nDone=0;
 705         sal_Int32 nPos=0;
 706
 707         for (sal_Int32 i = 0; i < 4; ++i)
 708         {
 709             sal_Int32 nOldPos = nPos;
 710             nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale,
 711                 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 712             CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos+2, nPos);
 713         }
 714
 715         for (sal_Int32 i = 0; i < 4; ++i)
 716         {
 717             sal_Int32 nOldPos = nPos;
 718             nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale,
 719                 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 720             CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos-2, nPos);
 721         }
 722     }
 723
 724     {
 725         const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 };
 726         OUString aText(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS));
 727
 728         sal_Int32 nGraphemeCount = 0;
 729
 730         sal_Int32 nCurPos = 0;
 731         while (nCurPos < aText.getLength())
 732         {
 733             sal_Int32 nCount2 = 1;
 734             nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(),
 735                 i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
 736             ++nGraphemeCount;
 737         }
 738
 739         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be considered 1 grapheme", static_cast<sal_Int32>(1), nGraphemeCount);
 740     }
 741
 742     aLocale.Language = "hi";
 743     aLocale.Country = "IN";
 744
 745     {
 746         const sal_Unicode SHA_VOWELSIGNII[] = { 0x936, 0x940 };
 747         OUString aTest(SHA_VOWELSIGNII, SAL_N_ELEMENTS(SHA_VOWELSIGNII));
 748
 749         sal_Int32 nDone=0;
 750         sal_Int32 nPos = 0;
 751
 752         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
 753             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 754         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(SHA_VOWELSIGNII)), nPos);
 755         nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(SHA_VOWELSIGNII), aLocale,
 756             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 757         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
 758     }
 759 }
 760
 761 //A test to ensure that certain ranges and codepoints that are categorized as
 762 //weak remain as weak, so that existing docs that depend on this don't silently
 763 //change font for those weak chars
 764 void TestBreakIterator::testWeak()
 765 {
 766     lang::Locale aLocale;
 767     aLocale.Language = "en";
 768     aLocale.Country = "US";
 769
 770     {
 771         const sal_Unicode WEAKS[] =
 772         {
 773             0x0001, 0x0002,
 774             0x0020, 0x00A0,
 775             0x0300, 0x036F, //Combining Diacritical Marks
 776             0x1AB0, 0x1AFF, //Combining Diacritical Marks Extended
 777             0x1DC0, 0x1DFF, //Combining Diacritical Marks Supplement
 778             0x20D0, 0x20FF, //Combining Diacritical Marks for Symbols
 779             0x2150, 0x215F, //Number Forms, fractions
 780             0x2160, 0x2180, //Number Forms, roman numerals
 781             0x2200, 0x22FF, //Mathematical Operators
 782             0x27C0, 0x27EF, //Miscellaneous Mathematical Symbols-A
 783             0x2980, 0x29FF, //Miscellaneous Mathematical Symbols-B
 784             0x2A00, 0x2AFF, //Supplemental Mathematical Operators
 785             0x2100, 0x214F, //Letterlike Symbols
 786             0x2308, 0x230B, //Miscellaneous technical
 787             0x25A0, 0x25FF, //Geometric Shapes
 788             0x2B30, 0x2B4C  //Miscellaneous Symbols and Arrows
 789         };
 790         OUString aWeaks(WEAKS, SAL_N_ELEMENTS(WEAKS));
 791
 792         for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i)
 793         {
 794             sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i);
 795             OString aMsg =
 796                 "Char 0x" +
 797                 OString::number(static_cast<sal_Int32>(aWeaks[i]), 16) +
 798                 " should have been weak";
 799             CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
 800                 i18n::ScriptType::WEAK, nScript);
 801         }
 802     }
 803 }
 804
 805 //A test to ensure that certain ranges and codepoints that are categorized as
 806 //asian remain as asian, so that existing docs that depend on this don't silently
 807 //change font for those asian chars.
 808 //See https://bugs.libreoffice.org/show_bug.cgi?id=38095
 809 void TestBreakIterator::testAsian()
 810 {
 811     lang::Locale aLocale;
 812     aLocale.Language = "en";
 813     aLocale.Country = "US";
 814
 815     {
 816         const sal_Unicode ASIANS[] =
 817         {
 818             //some typical CJK chars
 819             0x4E00, 0x62FF,
 820             //The full HalfWidth and FullWidth block has historically been
 821             //designated as taking the CJK font :-(
 822             //HalfWidth and FullWidth forms of ASCII 0-9, categorized under
 823             //UAX24 as "Common" i.e. by that logic WEAK
 824             0xFF10, 0xFF19,
 825             //HalfWidth and FullWidth forms of ASCII A-z, categorized under
 826             //UAX25 as "Latin", i.e. by that logic LATIN
 827             0xFF21, 0xFF5A
 828         };
 829         OUString aAsians(ASIANS, SAL_N_ELEMENTS(ASIANS));
 830
 831         for (sal_Int32 i = 0; i < aAsians.getLength(); ++i)
 832         {
 833             sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i);
 834             OString aMsg =
 835                 "Char 0x" +
 836                 OString::number(static_cast<sal_Int32>(aAsians[i]), 16) +
 837                 " should have been asian";
 838             CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
 839                 i18n::ScriptType::ASIAN, nScript);
 840         }
 841     }
 842 }
 843
 844 #if (U_ICU_VERSION_MAJOR_NUM > 51)
 845 //A test to ensure that our Lao word boundary detection is useful
 846 void TestBreakIterator::testLao()
 847 {
 848     lang::Locale aLocale;
 849     aLocale.Language = "lo";
 850     aLocale.Country = "LA";
 851
 852     const sal_Unicode LAO[] = { 0x0e8d, 0x0eb4, 0x0e99, 0x0e94, 0x0eb5, 0x0e95, 0x0ec9, 0x0ead, 0x0e99, 0x0eae, 0x0eb1, 0x0e9a };
 853     OUString aTest(LAO, SAL_N_ELEMENTS(LAO));
 854     i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
 855         i18n::WordType::DICTIONARY_WORD, true);
 856
 857     CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
 858     CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
 859
 860     aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
 861         i18n::WordType::DICTIONARY_WORD, true);
 862
 863     CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
 864     CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
 865 }
 866 #endif
 867
 868 //A test to ensure that our thai word boundary detection is useful
 869 void TestBreakIterator::testThai()
 870 {
 871     lang::Locale aLocale;
 872     aLocale.Language = "th";
 873     aLocale.Country = "TH";
 874
 875     //See http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html
 876     {
 877         const sal_Unicode THAI[] = { 0x0E01, 0x0E38, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
 878         OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
 879         i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
 880             i18n::WordType::DICTIONARY_WORD, true);
 881         CPPUNIT_ASSERT_MESSAGE("Should skip full word",
 882             aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
 883     }
 884
 885     //See https://bz.apache.org/ooo/show_bug.cgi?id=29548
 886     //make sure forwards and back are consistent
 887     {
 888         const sal_Unicode THAI[] =
 889         {
 890             0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
 891             0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
 892             0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27,
 893             0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
 894             0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
 895             0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27
 896         };
 897         OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
 898
 899         std::stack<sal_Int32> aPositions;
 900         sal_Int32 nPos = -1;
 901         do
 902         {
 903             nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 904             aPositions.push(nPos);
 905         }
 906         while (nPos < aTest.getLength());
 907         nPos = aTest.getLength();
 908         CPPUNIT_ASSERT(!aPositions.empty());
 909         aPositions.pop();
 910         do
 911         {
 912             CPPUNIT_ASSERT(!aPositions.empty());
 913             nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 914             CPPUNIT_ASSERT_EQUAL(aPositions.top(), nPos);
 915             aPositions.pop();
 916         }
 917         while (nPos > 0);
 918     }
 919
 920     // tdf#113694
 921     {
 922         const sal_Unicode NON_BMP[] = { 0xD800, 0xDC00 };
 923         OUString aTest(NON_BMP, SAL_N_ELEMENTS(NON_BMP));
 924
 925         sal_Int32 nDone=0;
 926         sal_Int32 nPos;
 927
 928         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
 929             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 930         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(SAL_N_ELEMENTS(NON_BMP)), nPos);
 931         nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(NON_BMP), aLocale,
 932             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 933         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos);
 934
 935         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
 936             i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
 937         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(SAL_N_ELEMENTS(NON_BMP)), nPos);
 938         nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(NON_BMP), aLocale,
 939             i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
 940         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos);
 941     }
 942 }
 943
 944 #ifdef TODO
 945 void TestBreakIterator::testNorthernThai()
 946 {
 947     lang::Locale aLocale;
 948     aLocale.Language = "nod";
 949     aLocale.Country = "TH";
 950
 951     const sal_Unicode NORTHERN_THAI1[] = { 0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
 952     OUString aTest(NORTHERN_THAI1, SAL_N_ELEMENTS(NORTHERN_THAI1));
 953     i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
 954         i18n::WordType::DICTIONARY_WORD, true);
 955     CPPUNIT_ASSERT_MESSAGE("Should skip full word",
 956         aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
 957 }
 958
 959 // Not sure if any version earlier than 49 did have Khmer word boundary
 960 // dictionaries, 4.6 does not.
 961
 962 // As of icu 54, word boundary detection for Khmer is still considered
 963 // insufficient, so icu khmer stuff is disabled
 964
 965 //A test to ensure that our khmer word boundary detection is useful
 966 //https://bugs.libreoffice.org/show_bug.cgi?id=52020
 967 void TestBreakIterator::testKhmer()
 968 {
 969     lang::Locale aLocale;
 970     aLocale.Language = "km";
 971     aLocale.Country = "KH";
 972
 973     const sal_Unicode KHMER[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 };
 974
 975     OUString aTest(KHMER, SAL_N_ELEMENTS(KHMER));
 976     i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
 977         i18n::WordType::DICTIONARY_WORD, true);
 978
 979     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
 980
 981     aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
 982         i18n::WordType::DICTIONARY_WORD, true);
 983
 984     CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 5);
 985 }
 986 #endif
 987
 988 void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak)
 989 {
 990     lang::Locale aLocale;
 991     aLocale.Language = "ja";
 992     aLocale.Country = "JP";
 993     i18n::Boundary aBounds;
 994
 995     {
 996         const sal_Unicode JAPANESE[] = { 0x30B7, 0x30E3, 0x30C3, 0x30C8, 0x30C0, 0x30A6, 0x30F3 };
 997
 998         OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
 999         aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
1000             i18n::WordType::DICTIONARY_WORD, true);
1001
1002         CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
1003     }
1004
1005     {
1006         const sal_Unicode JAPANESE[] = { 0x9EBB, 0x306E, 0x8449, 0x9EBB, 0x306E, 0x8449 };
1007
1008         OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
1009         aBounds = xBreak->getWordBoundary(aTest, 1, aLocale,
1010             i18n::WordType::DICTIONARY_WORD, true);
1011
1012         CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
1013
1014         aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
1015             i18n::WordType::DICTIONARY_WORD, true);
1016
1017         CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 6);
1018     }
1019 }
1020
1021 void TestBreakIterator::testJapanese()
1022 {
1023     doTestJapanese(m_xBreak);
1024
1025     // fdo#78479 - test second / cached instantiation of xdictionary
1026     uno::Reference< i18n::XBreakIterator > xTmpBreak(m_xSFactory->createInstance(
1027         "com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
1028
1029     doTestJapanese(xTmpBreak);
1030 }
1031
1032 void TestBreakIterator::testChinese()
1033 {
1034     lang::Locale aLocale;
1035     aLocale.Language = "zh";
1036     aLocale.Country = "CN";
1037     i18n::Boundary aBounds;
1038
1039     {
1040         const sal_Unicode CHINESE[] = { 0x6A35, 0x6A30, 0x69FE, 0x8919, 0xD867, 0xDEDB  };
1041
1042         OUString aTest(CHINESE, SAL_N_ELEMENTS(CHINESE));
1043         aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale,
1044             i18n::WordType::DICTIONARY_WORD, true);
1045         CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 6);
1046     }
1047 }
1048 void TestBreakIterator::setUp()
1049 {
1050     BootstrapFixtureBase::setUp();
1051     m_xBreak.set(m_xSFactory->createInstance("com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
1052 }
1053
1054 void TestBreakIterator::tearDown()
1055 {
1056     m_xBreak.clear();
1057     BootstrapFixtureBase::tearDown();
1058 }
1059
1060 CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator);
1061
1062 CPPUNIT_PLUGIN_IMPLEMENT();
1063
1064 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */