i18npool/qa/cppunit/test_breakiterator.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  */
   9
  10 #include <cppuhelper/bootstrap.hxx>
  11 #include <cppuhelper/basemutex.hxx>
  12 #include <com/sun/star/i18n/XBreakIterator.hpp>
  13 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
  14 #include <com/sun/star/i18n/ScriptType.hpp>
  15 #include <com/sun/star/i18n/WordType.hpp>
  16 #include <unotest/bootstrapfixturebase.hxx>
  17
  18 #include <unicode/uversion.h>
  19
  20 #include <rtl/strbuf.hxx>
  21 #include <rtl/ustrbuf.hxx>
  22
  23 #include <string.h>
  24
  25 #include <stack>
  26
  27 using namespace ::com::sun::star;
  28
  29 class TestBreakIterator : public test::BootstrapFixtureBase
  30 {
  31 public:
  32     virtual void setUp() override;
  33     virtual void tearDown() override;
  34
  35     void testLineBreaking();
  36     void testWordBoundaries();
  37     void testGraphemeIteration();
  38     void testWeak();
  39     void testAsian();
  40     void testThai();
  41 #if (U_ICU_VERSION_MAJOR_NUM > 51)
  42     void testLao();
  43 #ifdef TODO
  44     void testNorthernThai();
  45     void testKhmer();
  46 #endif
  47 #endif
  48     void testJapanese();
  49     void testChinese();
  50
  51     CPPUNIT_TEST_SUITE(TestBreakIterator);
  52     CPPUNIT_TEST(testLineBreaking);
  53     CPPUNIT_TEST(testWordBoundaries);
  54     CPPUNIT_TEST(testGraphemeIteration);
  55     CPPUNIT_TEST(testWeak);
  56     CPPUNIT_TEST(testAsian);
  57     CPPUNIT_TEST(testThai);
  58 #if (U_ICU_VERSION_MAJOR_NUM > 51)
  59     CPPUNIT_TEST(testLao);
  60 #ifdef TODO
  61     CPPUNIT_TEST(testKhmer);
  62     CPPUNIT_TEST(testNorthernThai);
  63 #endif
  64 #endif
  65     CPPUNIT_TEST(testJapanese);
  66     CPPUNIT_TEST(testChinese);
  67     CPPUNIT_TEST_SUITE_END();
  68
  69 private:
  70     uno::Reference<i18n::XBreakIterator> m_xBreak;
  71     void doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak);
  72 };
  73
  74 void TestBreakIterator::testLineBreaking()
  75 {
  76     i18n::LineBreakHyphenationOptions aHyphOptions;
  77     i18n::LineBreakUserOptions aUserOptions;
  78     lang::Locale aLocale;
  79
  80     //See https://bugs.libreoffice.org/show_bug.cgi?id=31271
  81     {
  82         OUString aTest("(some text here)");
  83
  84         aLocale.Language = "en";
  85         aLocale.Country = "US";
  86
  87         {
  88             //Here we want the line break to leave text here) on the next line
  89             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions);
  90             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(6), aResult.breakIndex);
  91         }
  92
  93         {
  94             //Here we want the line break to leave "here)" on the next line
  95             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions);
  96             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(11), aResult.breakIndex);
  97         }
  98     }
  99
 100     //See https://bugs.libreoffice.org/show_bug.cgi?id=49849
 101     {
 102         const sal_Unicode HEBREW1[] = { 0x05DE, 0x05D9, 0x05DC, 0x05D9, 0x5DD };
 103         OUString aWord(HEBREW1, SAL_N_ELEMENTS(HEBREW1));
 104         OUString aTest(aWord + " " + aWord);
 105
 106         aLocale.Language = "he";
 107         aLocale.Country = "IL";
 108
 109         {
 110             //Here we want the line break to happen at the whitespace
 111             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions);
 112             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", aWord.getLength()+1, aResult.breakIndex);
 113         }
 114     }
 115
 116     //See https://bz.apache.org/ooo/show_bug.cgi?id=17155
 117     {
 118         OUString const aTest("foo /bar/baz");
 119
 120         aLocale.Language = "en";
 121         aLocale.Country = "US";
 122
 123         {
 124             //Here we want the line break to leave /bar/ba clumped together on the next line
 125             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("foo /bar/ba"), aLocale, 0,
 126                 aHyphOptions, aUserOptions);
 127             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first slash", static_cast<sal_Int32>(4), aResult.breakIndex);
 128         }
 129     }
 130
 131     //See https://bz.apache.org/ooo/show_bug.cgi?id=19716
 132     {
 133         OUString aTest("aaa]aaa");
 134
 135         aLocale.Language = "en";
 136         aLocale.Country = "US";
 137
 138         {
 139             //Here we want the line break to move the whole lot to the next line
 140             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
 141                 aHyphOptions, aUserOptions);
 142             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the line, not at ]", static_cast<sal_Int32>(0), aResult.breakIndex);
 143         }
 144     }
 145
 146     //this is an example sequence from tdf92993-1.docx caught by the load crashtesting
 147     {
 148         const sal_Unicode WEIRD1[] = { 0xd83c, 0xdf56, 0xd83c, 0xdf57, 0xd83c, 0xdf46,
 149                                        0xd83c, 0xdf64, 0x2668, 0xfe0f, 0xd83c, 0xdfc6};
 150
 151         OUString aTest(WEIRD1, SAL_N_ELEMENTS(WEIRD1));
 152
 153         aLocale.Language = "en";
 154         aLocale.Country = "US";
 155
 156         {
 157             //This must not assert/crash
 158             (void)m_xBreak->getLineBreak(aTest, 0, aLocale, 0, aHyphOptions, aUserOptions);
 159         }
 160     }
 161
 162     //See https://bugs.documentfoundation.org/show_bug.cgi?id=96197
 163     {
 164         const sal_Unicode HANGUL[] = { 0xc560, 0xad6D, 0xac00, 0xc758, 0x0020, 0xac00,
 165                                        0xc0ac, 0xb294};
 166         OUString aTest(HANGUL, SAL_N_ELEMENTS(HANGUL));
 167
 168         aLocale.Language = "ko";
 169         aLocale.Country = "KR";
 170
 171         {
 172             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
 173                 aHyphOptions, aUserOptions);
 174             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex);
 175         }
 176     }
 177 }
 178
 179 //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
 180 void TestBreakIterator::testWordBoundaries()
 181 {
 182     lang::Locale aLocale;
 183     aLocale.Language = "en";
 184     aLocale.Country = "US";
 185
 186     i18n::Boundary aBounds;
 187
 188     //See https://bz.apache.org/ooo/show_bug.cgi?id=11993
 189     {
 190         OUString aTest("abcd ef  ghi??? KLM");
 191
 192         CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
 193         CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
 194         aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
 195         CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
 196
 197         CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
 198         CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
 199
 200         //next word
 201         aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, true);
 202         CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
 203
 204         //previous word
 205         aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 206         CPPUNIT_ASSERT(aBounds.startPos == 5 && aBounds.endPos == 7);
 207
 208         CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
 209         CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
 210         aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, true);
 211         CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
 212
 213         CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
 214         CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
 215         aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD, true);
 216         CPPUNIT_ASSERT(aBounds.startPos == 16 && aBounds.endPos == 19);
 217     }
 218
 219     //See https://bz.apache.org/ooo/show_bug.cgi?id=21907
 220     {
 221         OUString aTest("b a?");
 222
 223         CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
 224         CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
 225         CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
 226
 227         CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
 228
 229         CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
 230         CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
 231         CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
 232
 233         CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
 234     }
 235
 236     //See https://bz.apache.org/ooo/show_bug.cgi?id=14904
 237     {
 238         const sal_Unicode TEST[] =
 239         {
 240             'W', 'o', 'r', 'k', 'i', 'n', 'g', ' ', 0x201C, 'W', 'o', 'r', 'd', 's',
 241             ' ', 's', 't', 'a', 'r', 't', 'i', 'n', 'g', ' ', 'w', 'i', 't',
 242             'h', ' ', 'q', 'u', 'o', 't', 'e', 's', 0x201D, ' ', 'W', 'o', 'r', 'k',
 243             'i', 'n', 'g', ' ', 0x2018, 'B', 'r', 'o', 'k', 'e', 'n', 0x2019, ' ',
 244             '?', 'S', 'p', 'a', 'n', 'i', 's', 'h', '?', ' ', 'd', 'o', 'e',
 245             's', 'n', 0x2019, 't', ' ', 'w', 'o', 'r', 'k', '.', ' ', 'N', 'o',
 246             't', ' ', 'e', 'v', 'e', 'n', ' ' , 0x00BF, 'r', 'e', 'a', 'l', '?', ' ',
 247             'S', 'p', 'a', 'n', 'i', 's', 'h'
 248         };
 249         OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
 250
 251         aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 252         CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
 253
 254         aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 255         CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 14);
 256
 257         aBounds = m_xBreak->getWordBoundary(aTest, 40, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 258         CPPUNIT_ASSERT(aBounds.startPos == 37 && aBounds.endPos == 44);
 259
 260         aBounds = m_xBreak->getWordBoundary(aTest, 49, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 261         CPPUNIT_ASSERT(aBounds.startPos == 46 && aBounds.endPos == 52);
 262
 263         aBounds = m_xBreak->getWordBoundary(aTest, 58, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 264         CPPUNIT_ASSERT(aBounds.startPos == 55 && aBounds.endPos == 62);
 265
 266         aBounds = m_xBreak->getWordBoundary(aTest, 67, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 267         CPPUNIT_ASSERT(aBounds.startPos == 64 && aBounds.endPos == 71);
 268
 269         aBounds = m_xBreak->getWordBoundary(aTest, 90, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 270         CPPUNIT_ASSERT(aBounds.startPos == 88 && aBounds.endPos == 92);
 271     }
 272
 273     //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
 274     sal_Unicode aBreakTests[] = { ' ', 1, 2, 3, 4, 5, 6, 7, 0x91, 0x92, 0x200B, 0xE8FF, 0xF8FF };
 275     for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
 276     {
 277         //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
 278         for (size_t i = 0; i < SAL_N_ELEMENTS(aBreakTests); ++i)
 279         {
 280 #if (U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM <= 2)
 281             //Note the breakiterator test is known to fail on older icu
 282             //versions (4.2.1) for the 200B (ZWSP) Zero Width Space testcase.
 283             if (aBreakTests[i] == 0x200B)
 284                 continue;
 285 #endif
 286             OUString aTest = "Word" + OUStringLiteral1(aBreakTests[i]) + "Word";
 287             aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
 288             switch (mode)
 289             {
 290                 case i18n::WordType::ANY_WORD:
 291                     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
 292                     break;
 293                 case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
 294                     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
 295                     break;
 296                 case i18n::WordType::DICTIONARY_WORD:
 297                     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
 298                     break;
 299                 case i18n::WordType::WORD_COUNT:
 300                     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
 301                     break;
 302             }
 303
 304             CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
 305             CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
 306         }
 307     }
 308
 309     sal_Unicode aJoinTests[] = { 'X', 0x200C, 0x200D, 0x2060, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB };
 310     for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
 311     {
 312         //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
 313         for (size_t i = 0; i < SAL_N_ELEMENTS(aJoinTests); ++i)
 314         {
 315             OUString aTest = "Word" + OUStringLiteral1(aJoinTests[i]) + "Word";
 316             aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
 317             switch (mode)
 318             {
 319                 case i18n::WordType::ANY_WORD:
 320                     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
 321                     break;
 322                 case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
 323                     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
 324                     break;
 325                 case i18n::WordType::DICTIONARY_WORD:
 326                     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
 327                     break;
 328                 case i18n::WordType::WORD_COUNT:
 329                     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
 330                     break;
 331             }
 332
 333             CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
 334             CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
 335         }
 336     }
 337
 338     //See https://bz.apache.org/ooo/show_bug.cgi?id=13494
 339     {
 340         const OUString aBase("xxAAxxBBxxCCxx");
 341         const sal_Unicode aTests[] =
 342         {
 343             '\'', ';', ',', '.', '!', '@', '#', '%', '&', '*',
 344             '(', ')', '_', '-', '{', '}', '[', ']', '\"', '/',
 345             '\\', '?', '~', '$', '+', '^', '=', '<', '>', '|'
 346         };
 347
 348         const sal_Int32 aDoublePositions[] = {0, 2, 4, 6, 8, 10, 12, 14};
 349         for (size_t j = 0; j < SAL_N_ELEMENTS(aTests); ++j)
 350         {
 351             OUString aTest = aBase.replace('x', aTests[j]);
 352             sal_Int32 nPos = -1;
 353             size_t i = 0;
 354             do
 355             {
 356                 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aDoublePositions));
 357                 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 358                 CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
 359                 ++i;
 360             }
 361             while (nPos < aTest.getLength());
 362             nPos = aTest.getLength();
 363             i = SAL_N_ELEMENTS(aDoublePositions)-1;
 364             do
 365             {
 366                 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 367                 --i;
 368                 CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
 369             }
 370             while (nPos > 0);
 371         }
 372
 373         const sal_Int32 aSinglePositions[] = {0, 1, 3, 4, 6, 7, 9, 10};
 374         for (size_t j = 1; j < SAL_N_ELEMENTS(aTests); ++j)
 375         {
 376             OUString aTest = aBase.replaceAll("xx", OUStringLiteral1(aTests[j]));
 377             sal_Int32 nPos = -1;
 378             size_t i = 0;
 379             do
 380             {
 381                 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSinglePositions));
 382                 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 383                 CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
 384                 ++i;
 385             }
 386             while (nPos < aTest.getLength());
 387             nPos = aTest.getLength();
 388             i = SAL_N_ELEMENTS(aSinglePositions)-1;
 389             do
 390             {
 391                 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 392                 --i;
 393                 CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
 394             }
 395             while (nPos > 0);
 396         }
 397
 398         const sal_Int32 aSingleQuotePositions[] = {0, 1, 9, 10};
 399         CPPUNIT_ASSERT_EQUAL(u'\'', aTests[0]);
 400         {
 401             OUString aTest = aBase.replaceAll("xx", OUStringLiteral1(aTests[0]));
 402             sal_Int32 nPos = -1;
 403             size_t i = 0;
 404             do
 405             {
 406                 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSingleQuotePositions));
 407                 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 408                 CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
 409                 ++i;
 410             }
 411             while (nPos < aTest.getLength());
 412             nPos = aTest.getLength();
 413             i = SAL_N_ELEMENTS(aSingleQuotePositions)-1;
 414             do
 415             {
 416                 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 417                 --i;
 418                 CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
 419             }
 420             while (nPos > 0);
 421         }
 422     }
 423
 424     //See https://bz.apache.org/ooo/show_bug.cgi?id=13451
 425     {
 426         aLocale.Language = "ca";
 427         aLocale.Country = "ES";
 428
 429         OUString aTest("mirar-se comprar-vos donem-nos les mans aneu-vos-en!");
 430
 431         sal_Int32 nPos = 0;
 432         sal_Int32 aExpected[] = {8, 20, 30, 34, 39, 51, 52};
 433         size_t i = 0;
 434         do
 435         {
 436             CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
 437             nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
 438                 i18n::WordType::DICTIONARY_WORD, true).endPos;
 439             CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
 440             ++i;
 441         }
 442         while (nPos++ < aTest.getLength());
 443         CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
 444     }
 445
 446     //See https://bz.apache.org/ooo/show_bug.cgi?id=85411
 447     for (int j = 0; j < 3; ++j)
 448     {
 449         switch (j)
 450         {
 451             case 0:
 452                 aLocale.Language = "en";
 453                 aLocale.Country = "US";
 454                 break;
 455             case 1:
 456                 aLocale.Language = "ca";
 457                 aLocale.Country = "ES";
 458                 break;
 459             case 2:
 460                 aLocale.Language = "fi";
 461                 aLocale.Country = "FI";
 462                 break;
 463             default:
 464                 CPPUNIT_ASSERT(false);
 465                 break;
 466         }
 467
 468         const sal_Unicode TEST[] =
 469         {
 470             'I', 0x200B, 'w', 'a', 'n', 't', 0x200B, 't', 'o', 0x200B, 'g', 'o'
 471         };
 472         OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
 473
 474         sal_Int32 nPos = 0;
 475         sal_Int32 aExpected[] = {1, 6, 9, 12};
 476         size_t i = 0;
 477         do
 478         {
 479             CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
 480             nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
 481                 i18n::WordType::DICTIONARY_WORD, true).endPos;
 482             CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
 483             ++i;
 484         }
 485         while (nPos++ < aTest.getLength());
 486         CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
 487     }
 488
 489     //https://bz.apache.org/ooo/show_bug.cgi?id=21290
 490     for (int j = 0; j < 2; ++j)
 491     {
 492         switch (j)
 493         {
 494             case 0:
 495                 aLocale.Language = "en";
 496                 aLocale.Country = "US";
 497                 break;
 498             case 1:
 499                 aLocale.Language = "grc";
 500                 aLocale.Country.clear();
 501                 break;
 502             default:
 503                 CPPUNIT_ASSERT(false);
 504                 break;
 505         }
 506
 507         const sal_Unicode TEST[] =
 508         {
 509             0x1F0C, 0x03BD, 0x03B4, 0x03C1, 0x03B1, 0x0020, 0x1F00,
 510             0x03C1, 0x03BD, 0x1F7B, 0x03BC, 0x03B5, 0x03BD, 0x03BF,
 511             0x03C2, 0x0020, 0x1F00, 0x03BB, 0x03BB, 0x0020, 0x1F24,
 512             0x03C3, 0x03B8, 0x03B9, 0x03BF, 0x03BD
 513         };
 514         OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
 515
 516         sal_Int32 nPos = 0;
 517         sal_Int32 aExpected[] = {5, 15, 19, 26};
 518         size_t i = 0;
 519         do
 520         {
 521             CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
 522             nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
 523                 i18n::WordType::DICTIONARY_WORD, true).endPos;
 524             CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
 525             ++i;
 526         }
 527         while (nPos++ < aTest.getLength());
 528         CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
 529     }
 530
 531     //See https://bz.apache.org/ooo/show_bug.cgi?id=58513
 532     //See https://bugs.libreoffice.org/show_bug.cgi?id=55707
 533     {
 534         aLocale.Language = "fi";
 535         aLocale.Country = "FI";
 536
 537         OUString aTest("Kuorma-auto kaakkois- ja Keski-Suomi USA:n 90:n %:n");
 538
 539         {
 540             sal_Int32 nPos = 0;
 541             sal_Int32 aExpected[] = {11, 21, 24, 36, 42, 47, 51};
 542             size_t i = 0;
 543             do
 544             {
 545                 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
 546                 nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
 547                     i18n::WordType::WORD_COUNT, true).endPos;
 548                 CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
 549                 ++i;
 550             }
 551             while (nPos++ < aTest.getLength());
 552             CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
 553         }
 554
 555         {
 556             sal_Int32 nPos = 0;
 557             sal_Int32 aExpected[] = {0, 11, 12, 20, 22, 24, 25, 36, 37,
 558                                     40, 41, 42, 43, 45, 46, 47, 50, 51};
 559             size_t i = 0;
 560             do
 561             {
 562                 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
 563                 aBounds = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
 564                     i18n::WordType::DICTIONARY_WORD, true);
 565                 CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.startPos);
 566                 ++i;
 567                 CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.endPos);
 568                 ++i;
 569                 nPos = aBounds.endPos;
 570             }
 571             while (nPos++ < aTest.getLength());
 572             CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
 573         }
 574     }
 575
 576     //See https://bz.apache.org/ooo/show_bug.cgi?id=107843
 577     {
 578         aLocale.Language = "en";
 579         aLocale.Country = "US";
 580
 581         const sal_Unicode TEST[] =
 582         {
 583             'r', 'u', 0xFB00, 'l', 'e', ' ', 0xFB01, 's', 'h'
 584         };
 585         OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
 586
 587         aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 588         CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 5);
 589
 590         aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 591         CPPUNIT_ASSERT(aBounds.startPos == 6 && aBounds.endPos == 9);
 592     }
 593
 594     //See https://bz.apache.org/ooo/show_bug.cgi?id=113785
 595     {
 596         aLocale.Language = "en";
 597         aLocale.Country = "US";
 598
 599         const sal_Unicode TEST[] =
 600         {
 601             'a', 0x2013, 'b', 0x2014, 'c'
 602         };
 603         OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
 604
 605         aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
 606         CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 1);
 607
 608         aBounds = m_xBreak->nextWord(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD);
 609         CPPUNIT_ASSERT(aBounds.startPos == 2 && aBounds.endPos == 3);
 610
 611         aBounds = m_xBreak->nextWord(aTest, aBounds.endPos, aLocale, i18n::WordType::DICTIONARY_WORD);
 612         CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 5);
 613     }
 614 }
 615
 616 //See https://bugs.libreoffice.org/show_bug.cgi?id=40292
 617 //See https://bz.apache.org/ooo/show_bug.cgi?id=80412
 618 //See https://bz.apache.org/ooo/show_bug.cgi?id=111152
 619 //See https://bz.apache.org/ooo/show_bug.cgi?id=50172
 620 void TestBreakIterator::testGraphemeIteration()
 621 {
 622     lang::Locale aLocale;
 623     aLocale.Language = "bn";
 624     aLocale.Country = "IN";
 625
 626     {
 627         const sal_Unicode BA_HALANT_LA[] = { 0x09AC, 0x09CD, 0x09AF };
 628         OUString aTest(BA_HALANT_LA, SAL_N_ELEMENTS(BA_HALANT_LA));
 629
 630         sal_Int32 nDone=0;
 631         sal_Int32 nPos;
 632         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
 633             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 634         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(BA_HALANT_LA)), nPos);
 635         nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(BA_HALANT_LA), aLocale,
 636             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 637         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
 638     }
 639
 640     {
 641         const sal_Unicode HA_HALANT_NA_VOWELSIGNI[] = { 0x09B9, 0x09CD, 0x09A3, 0x09BF };
 642         OUString aTest(HA_HALANT_NA_VOWELSIGNI, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
 643
 644         sal_Int32 nDone=0;
 645         sal_Int32 nPos;
 646         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
 647             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 648         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI)), nPos);
 649         nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI), aLocale,
 650             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 651         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
 652     }
 653
 654     {
 655         const sal_Unicode TA_HALANT_MA_HALANT_YA  [] = { 0x09A4, 0x09CD, 0x09AE, 0x09CD, 0x09AF };
 656         OUString aTest(TA_HALANT_MA_HALANT_YA, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
 657
 658         sal_Int32 nDone=0;
 659         sal_Int32 nPos;
 660         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
 661             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 662         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA)), nPos);
 663         nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA), aLocale,
 664             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 665         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
 666     }
 667
 668     aLocale.Language = "ta";
 669     aLocale.Country = "IN";
 670
 671     {
 672         const sal_Unicode KA_VIRAMA_SSA[] = { 0x0B95, 0x0BCD, 0x0BB7 };
 673         OUString aTest(KA_VIRAMA_SSA, SAL_N_ELEMENTS(KA_VIRAMA_SSA));
 674
 675         sal_Int32 nDone=0;
 676         sal_Int32 nPos = 0;
 677
 678         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
 679             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 680         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(KA_VIRAMA_SSA)), nPos);
 681         nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VIRAMA_SSA), aLocale,
 682             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 683         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
 684     }
 685
 686     {
 687         const sal_Unicode KA_VOWELSIGNU[] = { 0x0B95, 0x0BC1 };
 688         OUString aTest(KA_VOWELSIGNU, SAL_N_ELEMENTS(KA_VOWELSIGNU));
 689
 690         sal_Int32 nDone=0;
 691         sal_Int32 nPos = 0;
 692
 693         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
 694             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 695         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(KA_VOWELSIGNU)), nPos);
 696         nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VOWELSIGNU), aLocale,
 697             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 698         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
 699     }
 700
 701     {
 702         const sal_Unicode CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI[] =
 703             { 0x0B9A, 0x0BBF, 0x0BA4, 0x0BCD, 0x0BA4, 0x0BBF, 0x0BB0, 0x0BC8 };
 704         OUString aTest(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI,
 705             SAL_N_ELEMENTS(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI));
 706
 707         sal_Int32 nDone=0;
 708         sal_Int32 nPos=0;
 709
 710         for (sal_Int32 i = 0; i < 4; ++i)
 711         {
 712             sal_Int32 nOldPos = nPos;
 713             nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale,
 714                 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 715             CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos+2, nPos);
 716         }
 717
 718         for (sal_Int32 i = 0; i < 4; ++i)
 719         {
 720             sal_Int32 nOldPos = nPos;
 721             nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale,
 722                 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 723             CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos-2, nPos);
 724         }
 725     }
 726
 727     {
 728         const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 };
 729         OUString aText(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS));
 730
 731         sal_Int32 nGraphemeCount = 0;
 732
 733         sal_Int32 nCurPos = 0;
 734         while (nCurPos < aText.getLength())
 735         {
 736             sal_Int32 nCount2 = 1;
 737             nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(),
 738                 i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
 739             ++nGraphemeCount;
 740         }
 741
 742         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be considered 1 grapheme", static_cast<sal_Int32>(1), nGraphemeCount);
 743     }
 744
 745     aLocale.Language = "hi";
 746     aLocale.Country = "IN";
 747
 748     {
 749         const sal_Unicode SHA_VOWELSIGNII[] = { 0x936, 0x940 };
 750         OUString aTest(SHA_VOWELSIGNII, SAL_N_ELEMENTS(SHA_VOWELSIGNII));
 751
 752         sal_Int32 nDone=0;
 753         sal_Int32 nPos = 0;
 754
 755         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
 756             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 757         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(SHA_VOWELSIGNII)), nPos);
 758         nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(SHA_VOWELSIGNII), aLocale,
 759             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 760         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
 761     }
 762 }
 763
 764 //A test to ensure that certain ranges and codepoints that are categorized as
 765 //weak remain as weak, so that existing docs that depend on this don't silently
 766 //change font for those weak chars
 767 void TestBreakIterator::testWeak()
 768 {
 769     lang::Locale aLocale;
 770     aLocale.Language = "en";
 771     aLocale.Country = "US";
 772
 773     {
 774         const sal_Unicode WEAKS[] =
 775         {
 776             0x0001, 0x0002,
 777             0x0020, 0x00A0,
 778             0x0300, 0x036F, //Combining Diacritical Marks
 779             0x1AB0, 0x1AFF, //Combining Diacritical Marks Extended
 780             0x1DC0, 0x1DFF, //Combining Diacritical Marks Supplement
 781             0x20D0, 0x20FF, //Combining Diacritical Marks for Symbols
 782             0x2150, 0x215F, //Number Forms, fractions
 783             0x2160, 0x2180, //Number Forms, roman numerals
 784             0x2200, 0x22FF, //Mathematical Operators
 785             0x27C0, 0x27EF, //Miscellaneous Mathematical Symbols-A
 786             0x2980, 0x29FF, //Miscellaneous Mathematical Symbols-B
 787             0x2A00, 0x2AFF, //Supplemental Mathematical Operators
 788             0x2100, 0x214F, //Letterlike Symbols
 789             0x2308, 0x230B, //Miscellaneous technical
 790             0x25A0, 0x25FF, //Geometric Shapes
 791             0x2B30, 0x2B4C  //Miscellaneous Symbols and Arrows
 792         };
 793         OUString aWeaks(WEAKS, SAL_N_ELEMENTS(WEAKS));
 794
 795         for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i)
 796         {
 797             sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i);
 798             OStringBuffer aMsg;
 799             aMsg.append("Char 0x");
 800             aMsg.append(static_cast<sal_Int32>(aWeaks[i]), 16);
 801             aMsg.append(" should have been weak");
 802             CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
 803                 i18n::ScriptType::WEAK, nScript);
 804         }
 805     }
 806 }
 807
 808 //A test to ensure that certain ranges and codepoints that are categorized as
 809 //asian remain as asian, so that existing docs that depend on this don't silently
 810 //change font for those asian chars.
 811 //See https://bugs.libreoffice.org/show_bug.cgi?id=38095
 812 void TestBreakIterator::testAsian()
 813 {
 814     lang::Locale aLocale;
 815     aLocale.Language = "en";
 816     aLocale.Country = "US";
 817
 818     {
 819         const sal_Unicode ASIANS[] =
 820         {
 821             //some typical CJK chars
 822             0x4E00, 0x62FF,
 823             //The full HalfWidth and FullWidth block has historically been
 824             //designated as taking the CJK font :-(
 825             //HalfWidth and FullWidth forms of ASCII 0-9, categorized under
 826             //UAX24 as "Common" i.e. by that logic WEAK
 827             0xFF10, 0xFF19,
 828             //HalfWidth and FullWidth forms of ASCII A-z, categorized under
 829             //UAX25 as "Latin", i.e. by that logic LATIN
 830             0xFF21, 0xFF5A
 831         };
 832         OUString aAsians(ASIANS, SAL_N_ELEMENTS(ASIANS));
 833
 834         for (sal_Int32 i = 0; i < aAsians.getLength(); ++i)
 835         {
 836             sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i);
 837             OStringBuffer aMsg;
 838             aMsg.append("Char 0x");
 839             aMsg.append(static_cast<sal_Int32>(aAsians[i]), 16);
 840             aMsg.append(" should have been asian");
 841             CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
 842                 i18n::ScriptType::ASIAN, nScript);
 843         }
 844     }
 845 }
 846
 847 #if (U_ICU_VERSION_MAJOR_NUM > 51)
 848 //A test to ensure that our Lao word boundary detection is useful
 849 void TestBreakIterator::testLao()
 850 {
 851     lang::Locale aLocale;
 852     aLocale.Language = "lo";
 853     aLocale.Country = "LA";
 854
 855     const sal_Unicode LAO[] = { 0x0e8d, 0x0eb4, 0x0e99, 0x0e94, 0x0eb5, 0x0e95, 0x0ec9, 0x0ead, 0x0e99, 0x0eae, 0x0eb1, 0x0e9a };
 856     OUString aTest(LAO, SAL_N_ELEMENTS(LAO));
 857     i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
 858         i18n::WordType::DICTIONARY_WORD, true);
 859
 860     CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
 861     CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
 862
 863     aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
 864         i18n::WordType::DICTIONARY_WORD, true);
 865
 866     CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
 867     CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
 868 }
 869 #endif
 870
 871 //A test to ensure that our thai word boundary detection is useful
 872 void TestBreakIterator::testThai()
 873 {
 874     lang::Locale aLocale;
 875     aLocale.Language = "th";
 876     aLocale.Country = "TH";
 877
 878     //See http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html
 879     {
 880         const sal_Unicode THAI[] = { 0x0E01, 0x0E38, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
 881         OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
 882         i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
 883             i18n::WordType::DICTIONARY_WORD, true);
 884         CPPUNIT_ASSERT_MESSAGE("Should skip full word",
 885             aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
 886     }
 887
 888     //See https://bz.apache.org/ooo/show_bug.cgi?id=29548
 889     //make sure forwards and back are consistent
 890     {
 891         const sal_Unicode THAI[] =
 892         {
 893             0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
 894             0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
 895             0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27,
 896             0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
 897             0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
 898             0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27
 899         };
 900         OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
 901
 902         std::stack<sal_Int32> aPositions;
 903         sal_Int32 nPos = -1;
 904         do
 905         {
 906             nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 907             aPositions.push(nPos);
 908         }
 909         while (nPos < aTest.getLength());
 910         nPos = aTest.getLength();
 911         CPPUNIT_ASSERT(!aPositions.empty());
 912         aPositions.pop();
 913         do
 914         {
 915             CPPUNIT_ASSERT(!aPositions.empty());
 916             nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 917             CPPUNIT_ASSERT_EQUAL(aPositions.top(), nPos);
 918             aPositions.pop();
 919         }
 920         while (nPos > 0);
 921     }
 922
 923     // tdf#113694
 924     {
 925         const sal_Unicode NON_BMP[] = { 0xD800, 0xDC00 };
 926         OUString aTest(NON_BMP, SAL_N_ELEMENTS(NON_BMP));
 927
 928         sal_Int32 nDone=0;
 929         sal_Int32 nPos;
 930
 931         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
 932             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 933         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(SAL_N_ELEMENTS(NON_BMP)), nPos);
 934         nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(NON_BMP), aLocale,
 935             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
 936         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos);
 937
 938         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
 939             i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
 940         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(SAL_N_ELEMENTS(NON_BMP)), nPos);
 941         nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(NON_BMP), aLocale,
 942             i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
 943         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos);
 944     }
 945 }
 946
 947 #ifdef TODO
 948 void TestBreakIterator::testNorthernThai()
 949 {
 950     lang::Locale aLocale;
 951     aLocale.Language = "nod";
 952     aLocale.Country = "TH";
 953
 954     const sal_Unicode NORTHERN_THAI1[] = { 0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
 955     OUString aTest(NORTHERN_THAI1, SAL_N_ELEMENTS(NORTHERN_THAI1));
 956     i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
 957         i18n::WordType::DICTIONARY_WORD, true);
 958     CPPUNIT_ASSERT_MESSAGE("Should skip full word",
 959         aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
 960 }
 961
 962 // Not sure if any version earlier than 49 did have Khmer word boundary
 963 // dictionaries, 4.6 does not.
 964
 965 // As of icu 54, word boundary detection for Khmer is still considered
 966 // insufficient, so icu khmer stuff is disabled
 967
 968 //A test to ensure that our khmer word boundary detection is useful
 969 //https://bugs.libreoffice.org/show_bug.cgi?id=52020
 970 void TestBreakIterator::testKhmer()
 971 {
 972     lang::Locale aLocale;
 973     aLocale.Language = "km";
 974     aLocale.Country = "KH";
 975
 976     const sal_Unicode KHMER[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 };
 977
 978     OUString aTest(KHMER, SAL_N_ELEMENTS(KHMER));
 979     i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
 980         i18n::WordType::DICTIONARY_WORD, true);
 981
 982     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
 983
 984     aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
 985         i18n::WordType::DICTIONARY_WORD, true);
 986
 987     CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 5);
 988 }
 989 #endif
 990
 991 void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak)
 992 {
 993     lang::Locale aLocale;
 994     aLocale.Language = "ja";
 995     aLocale.Country = "JP";
 996     i18n::Boundary aBounds;
 997
 998     {
 999         const sal_Unicode JAPANESE[] = { 0x30B7, 0x30E3, 0x30C3, 0x30C8, 0x30C0, 0x30A6, 0x30F3 };
1000
1001         OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
1002         aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
1003             i18n::WordType::DICTIONARY_WORD, true);
1004
1005         CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
1006     }
1007
1008     {
1009         const sal_Unicode JAPANESE[] = { 0x9EBB, 0x306E, 0x8449, 0x9EBB, 0x306E, 0x8449 };
1010
1011         OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
1012         aBounds = xBreak->getWordBoundary(aTest, 1, aLocale,
1013             i18n::WordType::DICTIONARY_WORD, true);
1014
1015         CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
1016
1017         aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
1018             i18n::WordType::DICTIONARY_WORD, true);
1019
1020         CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 6);
1021     }
1022 }
1023
1024 void TestBreakIterator::testJapanese()
1025 {
1026     doTestJapanese(m_xBreak);
1027
1028     // fdo#78479 - test second / cached instantiation of xdictionary
1029     uno::Reference< i18n::XBreakIterator > xTmpBreak(m_xSFactory->createInstance(
1030         "com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
1031
1032     doTestJapanese(xTmpBreak);
1033 }
1034
1035 void TestBreakIterator::testChinese()
1036 {
1037     lang::Locale aLocale;
1038     aLocale.Language = "zh";
1039     aLocale.Country = "CN";
1040     i18n::Boundary aBounds;
1041
1042     {
1043         const sal_Unicode CHINESE[] = { 0x6A35, 0x6A30, 0x69FE, 0x8919, 0xD867, 0xDEDB  };
1044
1045         OUString aTest(CHINESE, SAL_N_ELEMENTS(CHINESE));
1046         aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale,
1047             i18n::WordType::DICTIONARY_WORD, true);
1048         CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 6);
1049     }
1050 }
1051 void TestBreakIterator::setUp()
1052 {
1053     BootstrapFixtureBase::setUp();
1054     m_xBreak.set(m_xSFactory->createInstance("com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
1055 }
1056
1057 void TestBreakIterator::tearDown()
1058 {
1059     m_xBreak.clear();
1060     BootstrapFixtureBase::tearDown();
1061 }
1062
//Register the TestBreakIterator suite with CppUnit so the runner picks it up
CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator);

//CppUnit plugin boilerplate; presumably required because this test is built
//as a plugin library loaded by the test runner -- confirm against the build
CPPUNIT_PLUGIN_IMPLEMENT();
1066
1067 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */