i18npool/qa/cppunit/test_breakiterator.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  */
   9
  10 #include <com/sun/star/i18n/XBreakIterator.hpp>
  11 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
  12 #include <com/sun/star/i18n/ScriptType.hpp>
  13 #include <com/sun/star/i18n/WordType.hpp>
  14 #include <o3tl/cppunittraitshelper.hxx>
  15 #include <unotest/bootstrapfixturebase.hxx>
  16
  17 #include <unicode/uvernum.h>
  18
  19 #include <string.h>
  20
  21 #include <stack>
  22 #include <string_view>
  23
  24 using namespace ::com::sun::star;
  25
  26 class TestBreakIterator : public test::BootstrapFixtureBase
  27 {
     // CppUnit fixture for the XBreakIterator tests. Every test* method declared below is
     // registered in the CPPUNIT_TEST_SUITE block further down; the tests call the iterator
     // through the m_xBreak member.
  28 public:
         // Fixture hooks overriding test::BootstrapFixtureBase (defined out of line).
         // NOTE(review): presumably these create and release m_xBreak - confirm.
  29     virtual void setUp() override;
  30     virtual void tearDown() override;
  31
         // Test cases, one method per area under test.
  32     void testLineBreaking();
  33     void testWordBoundaries();
  34     void testSentenceBoundaries();
  35     void testGraphemeIteration();
  36     void testWeak();
  37     void testAsian();
  38     void testThai();
  39     void testLao();
         // Only compiled when the TODO macro is defined.
  40 #ifdef TODO
  41     void testNorthernThai();
  42     void testKhmer();
  43 #endif
  44     void testJapanese();
  45     void testChinese();
  46
  47     void testDictWordAbbreviation();
  48     void testDictWordPrepostDash();
  49     void testHebrewGereshGershaim();
  50     void testLegacySurrogatePairs();
  51     void testWordCount();
  52     void testDictionaryIteratorLanguages();
  53
         // Suite registration. The entries appear in the same order as the declarations
         // above, except inside the TODO block where testKhmer precedes testNorthernThai.
  54     CPPUNIT_TEST_SUITE(TestBreakIterator);
  55     CPPUNIT_TEST(testLineBreaking);
  56     CPPUNIT_TEST(testWordBoundaries);
  57     CPPUNIT_TEST(testSentenceBoundaries);
  58     CPPUNIT_TEST(testGraphemeIteration);
  59     CPPUNIT_TEST(testWeak);
  60     CPPUNIT_TEST(testAsian);
  61     CPPUNIT_TEST(testThai);
  62     CPPUNIT_TEST(testLao);
  63 #ifdef TODO
  64     CPPUNIT_TEST(testKhmer);
  65     CPPUNIT_TEST(testNorthernThai);
  66 #endif
  67     CPPUNIT_TEST(testJapanese);
  68     CPPUNIT_TEST(testChinese);
  69     CPPUNIT_TEST(testDictWordAbbreviation);
  70     CPPUNIT_TEST(testDictWordPrepostDash);
  71     CPPUNIT_TEST(testHebrewGereshGershaim);
  72     CPPUNIT_TEST(testLegacySurrogatePairs);
  73     CPPUNIT_TEST(testWordCount);
  74     CPPUNIT_TEST(testDictionaryIteratorLanguages);
  75     CPPUNIT_TEST_SUITE_END();
  76
  77 private:
         // Break iterator instance the test methods call into.
  78     uno::Reference<i18n::XBreakIterator> m_xBreak;
         // Helper taking the iterator to exercise (defined out of line).
         // NOTE(review): judging by its name it holds the Japanese checks - confirm.
  79     void doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak);
  80 };
  81
  82 void TestBreakIterator::testLineBreaking()
  83 {
         // Regression cases for XBreakIterator::getLineBreak(). Each case sets aLocale, requests
         // a line break for a text and a position argument, and asserts on the returned
         // breakIndex. The position is usually written as strlen() of an ASCII prefix of the
         // text so that the intended spot can be read directly from the source.
         //
         // The hyphenation and user options stay default-constructed for the whole test; only
         // aLocale is changed from case to case.
  84     i18n::LineBreakHyphenationOptions aHyphOptions;
  85     i18n::LineBreakUserOptions aUserOptions;
  86     lang::Locale aLocale;
  87
  88     //See https://bugs.libreoffice.org/show_bug.cgi?id=31271
  89     {
  90         OUString aTest(u"(some text here)"_ustr);
  91
  92         aLocale.Language = "en";
  93         aLocale.Country = "US";
  94
  95         {
  96             //Here we want the line break to leave "text here)" on the next line
  97             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions);
  98             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(6), aResult.breakIndex);
  99         }
 100
 101         {
 102             //Here we want the line break to leave "here)" on the next line
 103             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions);
 104             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(11), aResult.breakIndex);
 105         }
 106     }
 107
 108     //See https://bugs.libreoffice.org/show_bug.cgi?id=49849
 109     {
 110         static constexpr OUString aWord = u"\u05DE\u05D9\u05DC\u05D9\u05DD"_ustr;
 111         OUString aTest(aWord + " " + aWord);
 112
 113         aLocale.Language = "he";
 114         aLocale.Country = "IL";
 115
 116         {
 117             //Here we want the line break to happen at the whitespace, i.e. the reported index
 118             //is the start of the second word (length of the first word plus the space)
 118             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions);
 119             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", aWord.getLength()+1, aResult.breakIndex);
 120         }
 121     }
 122
 123     //See https://bz.apache.org/ooo/show_bug.cgi?id=17155
 124     {
 125         aLocale.Language = "en";
 126         aLocale.Country = "US";
 127
 128         {
 129             //Here we want the line break to leave /bar/baz clumped together on the next line
 130             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(u"foo /bar/baz"_ustr, strlen("foo /bar/ba"), aLocale, 0,
 131                 aHyphOptions, aUserOptions);
 132             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first slash", static_cast<sal_Int32>(4), aResult.breakIndex);
 133         }
 134     }
 135
 136     // i#22602: writer breaks word after dot immediately followed by a letter
 137     {
 138         aLocale.Language = "en";
 139         aLocale.Country = "US";
 140
 141         {
 142             //Here we want the line break to leave ./bar/baz clumped together on the next line
 143             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
 144                 u"foo ./bar/baz"_ustr, strlen("foo ./bar/ba"), aLocale, 0, aHyphOptions, aUserOptions);
 145             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first period",
 146                                          static_cast<sal_Int32>(4), aResult.breakIndex);
 147         }
 148     }
 149
 150     // i#81448: slash and backslash make non-breaking spaces of preceding spaces
 151     {
 152         aLocale.Language = "en";
 153         aLocale.Country = "US";
 154
 155         {
 156             // Per the bug, the line break should leave ...BE clumped together on the next line.
 157             // However, the current behavior does not wrap the string at all. This test asserts the
 158             // current behavior as a point of reference.
 159             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
 160                 u"THIS... ...BE"_ustr, strlen("THIS... ...B"), aLocale, 0, aHyphOptions, aUserOptions);
 161             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aResult.breakIndex);
 162         }
 163     }
 164
 165     // i#81448: slash and backslash make non-breaking spaces of preceding spaces
 166     {
 167         aLocale.Language = "en";
 168         aLocale.Country = "US";
 169
 170         {
 171             // The line break should leave /BE clumped together on the next line.
 172             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
 173                 u"THIS... /BE"_ustr, strlen("THIS... /B"), aLocale, 0, aHyphOptions, aUserOptions);
 174             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(8), aResult.breakIndex);
 175         }
 176     }
 177
 178     // i#80548: Bad word wrap between dash and word
 179     {
 180         aLocale.Language = "fi";
 181         aLocale.Country = "FI";
 182
 183         {
 184             // Per the bug, the line break should leave -bar clumped together on the next line.
 185             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
 186                 u"foo -bar"_ustr, strlen("foo -ba"), aLocale, 0, aHyphOptions, aUserOptions);
 187             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first dash",
 188                                          static_cast<sal_Int32>(4), aResult.breakIndex);
 189         }
 190     }
 191
 192     // i#80645: Line erroneously breaks at backslash
 193     {
 194         aLocale.Language = "en";
 195         aLocale.Country = "US";
 196
 197         {
 198             // Note that the current behavior deviates from the original fix for this bug.
 199             //
 200             // The original report was filed due to wrapping all of "\Program Files\aaaa" to the
 201             // next line, even though only "aaaa" overflowed. The original fix was to simply make
 202             // U+005C reverse solidus (backslash) a breaking character.
 203             //
 204             // However, the root cause for this bug was not the behavior of '\', but rather some
 205             // other bug making all of "\Program Files\" behave like a single token, despite it
 206             // even containing whitespace.
 207             //
 208             // Reverting to the ICU line rules fixes this root issue. Now, in the following,
 209             // "C:\Program" and "Files\LibreOffice" are treated as separate tokens. This is also
 210             // consistent with the behavior of other office programs.
 211             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
 212                 u"C:\\Program Files\\LibreOffice"_ustr, strlen("C:\\Program Files\\Libre"), aLocale, 0,
 213                 aHyphOptions, aUserOptions);
 214             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(11), aResult.breakIndex);
 215
 216             // An identical result should be generated for solidus.
 217             aResult = m_xBreak->getLineBreak(
 218                 u"C:/Program Files/LibreOffice"_ustr, strlen("C:/Program Files/Libre"), aLocale, 0,
 219                 aHyphOptions, aUserOptions);
 220             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(11), aResult.breakIndex);
 221         }
 222     }
 223
 224     // i#80841: Words separated by hyphens will always break to next line
 225     {
 226         aLocale.Language = "en";
 227         aLocale.Country = "US";
 228
 229         {
 230             // Here we want the line break to leave toll- on the first line
 231             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
 232                 u"toll-free"_ustr, strlen("toll-fr"), aLocale, 0, aHyphOptions, aUserOptions);
 233             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex);
 234         }
 235     }
 236
 237     // i#83464: Line break between letter and $
 238     {
 239         aLocale.Language = "en";
 240         aLocale.Country = "US";
 241
 242         {
 243             // Here we want the line break to leave US$ clumped on the next line.
 244             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
 245                 u"word US$ 123"_ustr, strlen("word U"), aLocale, 0, aHyphOptions, aUserOptions);
 246             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex);
 247         }
 248     }
 249
 250     // Unknown bug number: "fix line break problem of dot after letter and before number"
 251     {
 252         aLocale.Language = "en";
 253         aLocale.Country = "US";
 254
 255         {
 256             // Here we want the line break to leave L.5 clumped on the next line.
 257             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
 258                 u"word L.5 word"_ustr, strlen("word L"), aLocale, 0, aHyphOptions, aUserOptions);
 259             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex);
 260         }
 261     }
 262
 263     // i#83229: Wrong line break when word contains a hyphen
 264     {
 265         aLocale.Language = "en";
 266         aLocale.Country = "US";
 267
 268         {
 269             // The root cause for this bug was the Unicode standard introducing special treatment
 270             // for '-' in a number range context. This change makes number ranges (e.g. "100-199")
 271             // behave as if they are single tokens for the purposes of line breaking. Unfortunately,
 272             // this caused a significant appearance change to existing documents.
 273             //
 274             // Despite being a user-visible layout change, this isn't exactly a bug. Wrapping
 275             // number ranges as a single token is consistent with other applications, including web
 276             // browsers, and other office suites as mentioned in the bug discussion. Removing this
 277             // customization seems like it would be a major change, however.
 278             //
 279             // Here we want the line break to leave 100- clumped on the first line.
 280
 281             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
 282                 u"word 100-199 word"_ustr, strlen("word 100-1"), aLocale, 0, aHyphOptions, aUserOptions);
 283             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(9), aResult.breakIndex);
 284         }
 285
 286         {
 287             // From the same bug: "the leading minus must stay with numbers and strings"
 288
 289             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
 290                     u"range of -100.000 to 100.000"_ustr, strlen("range of -1"), aLocale, 0,
 291                     aHyphOptions, aUserOptions);
 292             CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex);
 293
                 // Same text with U+2212 MINUS SIGN; the ASCII '-' inside strlen() only stands in
                 // for that character (one UTF-16 code unit) when computing the position.
 294             static constexpr OUString str = u"range of \u2212100.000 to 100.000"_ustr;
 295             aResult = m_xBreak->getLineBreak(
 296                     str, strlen("range of -"), aLocale, 0, aHyphOptions, aUserOptions);
 297             CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex);
 298         }
 299
             // The remaining cases of this bug run with a German locale.
 300         aLocale.Language = "de";
 301         aLocale.Country = "DE";
 302
 303         {
 304             // From the same bug: "the leading minus must stay with numbers and strings"
 305
 306             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
 307                     u"EURO is -10,50"_ustr, strlen("EURO is -1"), aLocale, 0, aHyphOptions, aUserOptions);
 308             CPPUNIT_ASSERT_EQUAL(sal_Int32{8}, aResult.breakIndex);
 309
 310             // Also the mathematical minus sign (the '-' in strlen() stands in for U+2212):
 311
 312             static constexpr OUString str = u"EURO is \u221210,50"_ustr;
 313             aResult = m_xBreak->getLineBreak(
 314                     str, strlen("EURO is -"), aLocale, 0, aHyphOptions, aUserOptions);
 315             CPPUNIT_ASSERT_EQUAL(sal_Int32{8}, aResult.breakIndex);
 316         }
 317
 318         {
 319             // From the same bug: "the leading minus must stay with numbers and strings"
 320
 321             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
 322                     u"und -kosten"_ustr, strlen("und -ko"), aLocale, 0,
 323                     aHyphOptions, aUserOptions);
 324             CPPUNIT_ASSERT_EQUAL(sal_Int32{4}, aResult.breakIndex);
 325
 326             // But not the non-breaking hyphen:
 327
                 // NOTE(review): this literal is only 5 UTF-16 units long, yet the position passed
                 // below is strlen("und -ko") == 7. Confirm that probing past the end of the text
                 // (rather than using a string that ends in "kosten") is intended.
 328             static constexpr OUString str = u"und \u2011"_ustr;
 329             aResult = m_xBreak->getLineBreak(
 330                     str, strlen("und -ko"), aLocale, 0, aHyphOptions, aUserOptions);
 331             CPPUNIT_ASSERT_EQUAL(sal_Int32{5}, aResult.breakIndex);
 332         }
 333     }
 334
 335     // i#83649: "Line break should be between typographical quote and left bracket"
 336     // - Actually: Spaces between quotation mark and opening punctuation not treated as a break.
 337     // - Note that per the Unicode standard, prohibiting breaks in this context is intentional
 338     // because it may cause issues in certain languages due to the various ways quotation
 339     // characters are used.
 340     // - We do it anyway by customizing the ICU line breaking rules.
 341     {
 342         {
 343             // This uses the sample text provided in the bug report. Based on usage, it is assumed
 344             // they were in the de_DE locale.
 345
 346             aLocale.Language = "de";
 347             aLocale.Country = "DE";
 348
 349             // Per the bug report, it is expected that »angetan werden« remains on the first line.
 350             const OUString str = u"»angetan werden« [Passiv]"_ustr;
 351             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
 352                 str, str.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
 353             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
 354
 355             // The same result should be returned for this and the first case.
 356             const OUString str2 = u"»angetan werden« Passiv"_ustr;
 357             aResult = m_xBreak->getLineBreak(
 358                 str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
 359             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
 360
 361             // Under ICU rules, no amount of spaces would cause this to wrap.
                 // The assertion below expects the break after the run of four spaces (index 20).
 362             const OUString str3 = u"»angetan werden«    [Passiv]"_ustr;
 363             aResult = m_xBreak->getLineBreak(
 364                 str3, str3.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
 365             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(20), aResult.breakIndex);
 366
 367             // However, tabs will
 368             const OUString str4 = u"»angetan werden«\t[Passiv]"_ustr;
 369             aResult = m_xBreak->getLineBreak(
 370                 str4, str4.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
 371             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
 372         }
 373
 374         {
 375             // The same behavior is seen in English
 376
 377             aLocale.Language = "en";
 378             aLocale.Country = "US";
 379
 380             const OUString str = u"\"angetan werden\" [Passiv]"_ustr;
 381             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
 382                 str, str.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
 383             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
 384
 385             const OUString str2 = u"\"angetan werden\" Passiv"_ustr;
 386             aResult = m_xBreak->getLineBreak(
 387                 str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
 388             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
 389         }
 390     }
 391
 392     // i#72868: Writer/Impress line does not break after Chinese punctuation and Latin letters
 393     {
 394         aLocale.Language = "zh";
 395         aLocale.Country = "HK";
 396
 397         {
 398             // Per the bug, this should break at the ideographic comma. However, this change has
 399             // been reverted at some point. This test only verifies current behavior.
                 // The 'X' inside strlen() stands in for the ideographic comma (one UTF-16 unit).
 400             const OUString str = u"word word、word word"_ustr;
 401             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
 402                 str, strlen("word wordXwor"), aLocale, 0, aHyphOptions, aUserOptions);
 403             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(13), aResult.breakIndex);
 404         }
 405     }
 406
 407     // i#80891: Character in the forbidden list sometimes appears at the start of line
 408     {
 409         aLocale.Language = "zh";
 410         aLocale.Country = "HK";
 411
 412         {
 413             // Per the bug, the ideographic two-dot leader should be a forbidden character. However,
 414             // this change seems to have been reverted or broken at some point.
 415             const OUString str = u"電話︰電話"_ustr;
 416             i18n::LineBreakResults aResult
 417                 = m_xBreak->getLineBreak(str, 2, aLocale, 0, aHyphOptions, aUserOptions);
 418             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(2), aResult.breakIndex);
 419         }
 420     }
 421
 422     //See https://bz.apache.org/ooo/show_bug.cgi?id=19716
 423     {
 424         aLocale.Language = "en";
 425         aLocale.Country = "US";
 426
 427         {
 428             OUString aTest(u"aaa]aaa"_ustr);
 429             //Here we want the line break to move the whole lot to the next line
 430             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
 431                 aHyphOptions, aUserOptions);
 432             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the line, not at ]", static_cast<sal_Int32>(0), aResult.breakIndex);
 433         }
 434     }
 435
 436     //this is an example sequence from tdf92993-1.docx caught by the load crashtesting
 437     {
 438         static constexpr OUStringLiteral aTest = u"\U0001f356\U0001f357\U0001f346"
 439                                        "\U0001f364\u2668\ufe0f\U0001f3c6";
 440
 441         aLocale.Language = "en";
 442         aLocale.Country = "US";
 443
 444         {
 445             //This must not assert/crash; the result itself is deliberately discarded
 446             (void)m_xBreak->getLineBreak(aTest, 0, aLocale, 0, aHyphOptions, aUserOptions);
 447         }
 448     }
 449
 450     //See https://bugs.documentfoundation.org/show_bug.cgi?id=96197
 451     {
 452         static constexpr OUString aTest = u"\uc560\uad6D\uac00\uc758 \uac00"
 453                                        "\uc0ac\ub294"_ustr;
 454
 455         aLocale.Language = "ko";
 456         aLocale.Country = "KR";
 457
 458         {
                 // Expected index 5 is the start of the second word, directly after the space.
 459             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
 460                 aHyphOptions, aUserOptions);
 461             CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex);
 462         }
 463     }
 464
 465     // i#65267: Comma is badly broken at end of line
 466     // - The word should be wrapped along with the comma
 467     {
 468         aLocale.Language = "de";
 469         aLocale.Country = "DE";
 470
 471         {
 472             auto res = m_xBreak->getLineBreak(u"Wort -prinzessinnen, wort"_ustr,
 473                                               strlen("Wort -prinzessinnen,"), aLocale, 0,
 474                                               aHyphOptions, aUserOptions);
 475             CPPUNIT_ASSERT_EQUAL(sal_Int32{ 5 }, res.breakIndex);
 476         }
 477     }
 478
 479     // tdf#114160: ZWJ shouldn't be treated as a breaking character
 480     {
 481         aLocale.Language = "mn";
 482         aLocale.Country = "MN";
 483
 484         {
 485             auto res = m_xBreak->getLineBreak(u"\u1828\u1820\u200d\u00a0\u200d\u1873\u1873"_ustr, 6,
 486                                               aLocale, 0, aHyphOptions, aUserOptions);
 487             CPPUNIT_ASSERT_EQUAL(sal_Int32(0), res.breakIndex);
 488         }
 489
             // Same ZWJ / no-break space / ZWJ sequence between Latin letters, English locale.
 490         aLocale.Language = "en";
 491         aLocale.Country = "US";
 492
 493         {
 494             auto res = m_xBreak->getLineBreak(u"AB\u200d\u00a0\u200dCD"_ustr, 6, aLocale, 0,
 495                                               aHyphOptions, aUserOptions);
 496             CPPUNIT_ASSERT_EQUAL(sal_Int32(0), res.breakIndex);
 497         }
 498     }
 499 }
 500
 501 //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
 502 void TestBreakIterator::testWordBoundaries()
 503 {
 504     lang::Locale aLocale;
 505     aLocale.Language = "en";
 506     aLocale.Country = "US";
 507
 508     i18n::Boundary aBounds;
 509
 510     //See https://bz.apache.org/ooo/show_bug.cgi?id=11993
 511     {
 512         OUString aTest(u"abcd ef  ghi??? KLM"_ustr);
 513
 514         CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
 515         CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
 516         aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
 517         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
 518         CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
 519
 520         CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
 521         CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
 522
 523         //next word
 524         aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, true);
 525         CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
 526         CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
 527
 528         //previous word
 529         aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 530         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
 531         CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
 532
 533         CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
 534         CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
 535         aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, true);
 536         CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
 537         CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
 538
 539         CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
 540         CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
 541         aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD, true);
 542         CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.startPos);
 543         CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
 544     }
 545
 546     //See https://bz.apache.org/ooo/show_bug.cgi?id=21907
 547     {
 548         OUString aTest(u"b a?"_ustr);
 549
 550         CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
 551         CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
 552         CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
 553
 554         CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
 555
 556         CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
 557         CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
 558         CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
 559
 560         CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
 561     }
 562
 563     //See https://bz.apache.org/ooo/show_bug.cgi?id=14904
 564     {
 565         static constexpr OUString aTest =
 566             u"Working \u201CWords"
 567             " starting wit"
 568             "h quotes\u201D Work"
 569             "ing \u2018Broken\u2019 "
 570             "?Spanish? doe"
 571             "sn\u2019t work. No"
 572             "t even \u00BFreal? "
 573             "Spanish"_ustr;
 574
 575         aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 576         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
 577         CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
 578
 579         aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 580         CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
 581         CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
 582
 583         aBounds = m_xBreak->getWordBoundary(aTest, 40, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 584         CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.startPos);
 585         CPPUNIT_ASSERT_EQUAL(sal_Int32(44), aBounds.endPos);
 586
 587         aBounds = m_xBreak->getWordBoundary(aTest, 49, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 588         CPPUNIT_ASSERT_EQUAL(sal_Int32(46), aBounds.startPos);
 589         CPPUNIT_ASSERT_EQUAL(sal_Int32(52), aBounds.endPos);
 590
 591         aBounds = m_xBreak->getWordBoundary(aTest, 58, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 592         CPPUNIT_ASSERT_EQUAL(sal_Int32(55), aBounds.startPos);
 593         CPPUNIT_ASSERT_EQUAL(sal_Int32(62), aBounds.endPos);
 594
 595         aBounds = m_xBreak->getWordBoundary(aTest, 67, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 596         CPPUNIT_ASSERT_EQUAL(sal_Int32(64), aBounds.startPos);
 597         CPPUNIT_ASSERT_EQUAL(sal_Int32(71), aBounds.endPos);
 598
 599         aBounds = m_xBreak->getWordBoundary(aTest, 90, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 600         CPPUNIT_ASSERT_EQUAL(sal_Int32(88), aBounds.startPos);
 601         CPPUNIT_ASSERT_EQUAL(sal_Int32(92), aBounds.endPos);
 602     }
 603
 604     //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
 605     sal_Unicode aBreakTests[] = { ' ', 1, 2, 3, 4, 5, 6, 7, 0x91, 0x92, 0x200B, 0xE8FF, 0xF8FF };
 606     for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
 607     {
 608         //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
 609         for (auto const& i: aBreakTests)
 610         {
 611             OUString aTest = "Word" + OUStringChar(i) + "Word";
 612             aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
 613             switch (mode)
 614             {
 615                 case i18n::WordType::ANY_WORD:
 616                     CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
 617                     CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
 618                     break;
 619                 case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
 620                     CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
 621                     CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
 622                     break;
 623                 case i18n::WordType::DICTIONARY_WORD:
 624                     CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
 625                     CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
 626                     break;
 627                 case i18n::WordType::WORD_COUNT:
 628                     CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
 629                     CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
 630                     break;
 631             }
 632
 633             CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
 634             CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
 635         }
 636     }
 637
 638     sal_Unicode aJoinTests[] = { 'X', 0x200C, 0x200D, 0x2060, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB };
 639     for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
 640     {
 641         //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
 642         for (auto const& p: aJoinTests)
 643         {
 644             OUString aTest = "Word" + OUStringChar(p) + "Word";
 645             aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
 646             switch (mode)
 647             {
 648                 case i18n::WordType::ANY_WORD:
 649                     CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
 650                     CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
 651                     break;
 652                 case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
 653                     CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
 654                     CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
 655                     break;
 656                 case i18n::WordType::DICTIONARY_WORD:
 657                     CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
 658                     CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
 659                     break;
 660                 case i18n::WordType::WORD_COUNT:
 661                     CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
 662                     CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
 663                     break;
 664             }
 665
 666             CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
 667             CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
 668         }
 669     }
 670
 671     //See https://bz.apache.org/ooo/show_bug.cgi?id=13494
 672     {
 673         static constexpr OUString aBase(u"xxAAxxBBxxCCxx"_ustr);
 674         const sal_Unicode aTests[] =
 675         {
 676             '\'', ';', ',', '.', '!', '@', '#', '%', '&', '*',
 677             '(', ')', '_', '-', '{', '}', '[', ']', '\"', '/',
 678             '\\', '?', '~', '$', '+', '^', '=', '<', '>', '|'
 679         };
 680
 681         const sal_Int32 aDoublePositions[] = {0, 2, 4, 6, 8, 10, 12, 14};
 682         for (auto const& r: aTests)
 683         {
 684             OUString aTest = aBase.replace('x', r);
 685             sal_Int32 nPos = -1;
 686             size_t i = 0;
 687             do
 688             {
 689                 CPPUNIT_ASSERT(i < std::size(aDoublePositions));
 690                 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 691                 CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
 692                 ++i;
 693             }
 694             while (nPos < aTest.getLength());
 695             nPos = aTest.getLength();
 696             i = std::size(aDoublePositions)-1;
 697             do
 698             {
 699                 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 700                 --i;
 701                 CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
 702             }
 703             while (nPos > 0);
 704         }
 705
 706         const sal_Int32 aSinglePositions[] = {0, 1, 3, 4, 6, 7, 9, 10};
 707         for (size_t j = 1; j < std::size(aTests); ++j)
 708         {
 709             OUString aTest = aBase.replaceAll("xx", OUStringChar(aTests[j]));
 710             sal_Int32 nPos = -1;
 711             size_t i = 0;
 712             do
 713             {
 714                 CPPUNIT_ASSERT(i < std::size(aSinglePositions));
 715                 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 716                 CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
 717                 ++i;
 718             }
 719             while (nPos < aTest.getLength());
 720             nPos = aTest.getLength();
 721             i = std::size(aSinglePositions)-1;
 722             do
 723             {
 724                 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 725                 --i;
 726                 CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
 727             }
 728             while (nPos > 0);
 729         }
 730
 731         const sal_Int32 aSingleQuotePositions[] = {0, 1, 9, 10};
 732         CPPUNIT_ASSERT_EQUAL(u'\'', aTests[0]);
 733         {
 734             OUString aTest = aBase.replaceAll("xx", OUStringChar(aTests[0]));
 735             sal_Int32 nPos = -1;
 736             size_t i = 0;
 737             do
 738             {
 739                 CPPUNIT_ASSERT(i < std::size(aSingleQuotePositions));
 740                 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 741                 CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
 742                 ++i;
 743             }
 744             while (nPos < aTest.getLength());
 745             nPos = aTest.getLength();
 746             i = std::size(aSingleQuotePositions)-1;
 747             do
 748             {
 749                 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
 750                 --i;
 751                 CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
 752             }
 753             while (nPos > 0);
 754         }
 755     }
 756
 757     //See https://bz.apache.org/ooo/show_bug.cgi?id=13451
 758     {
 759         aLocale.Language = "ca";
 760         aLocale.Country = "ES";
 761
 762         OUString aTest(u"mirar-se comprar-vos donem-nos les mans aneu-vos-en!"_ustr);
 763
 764         sal_Int32 nPos = 0;
 765         sal_Int32 aExpected[] = {8, 20, 30, 34, 39, 51, 52};
 766         size_t i = 0;
 767         do
 768         {
 769             CPPUNIT_ASSERT(i < std::size(aExpected));
 770             nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
 771                 i18n::WordType::DICTIONARY_WORD, true).endPos;
 772             CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
 773             ++i;
 774         }
 775         while (nPos++ < aTest.getLength());
 776         CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
 777     }
 778
 779     // i#85411: ZWSP should be a word separator for spellchecking
 780     // - This fix was applied to both dict and edit customizations
 781     for (int j = 0; j < 3; ++j)
 782     {
 783         switch (j)
 784         {
 785             case 0:
 786                 aLocale.Language = "en";
 787                 aLocale.Country = "US";
 788                 break;
 789             case 1:
 790                 aLocale.Language = "ca";
 791                 aLocale.Country = "ES";
 792                 break;
 793             case 2:
 794                 aLocale.Language = "fi";
 795                 aLocale.Country = "FI";
 796                 break;
 797             default:
 798                 CPPUNIT_ASSERT(false);
 799                 break;
 800         }
 801
 802         static constexpr OUString aTest = u"I\u200Bwant\u200Bto\u200Bgo"_ustr;
 803
 804         sal_Int32 nPos = 0;
 805         sal_Int32 aExpected[] = { 1, 6, 9, 12 };
 806         size_t i = 0;
 807         do
 808         {
 809             CPPUNIT_ASSERT(i < std::size(aExpected));
 810             auto dwPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
 811                                                    i18n::WordType::DICTIONARY_WORD, true);
 812             CPPUNIT_ASSERT_EQUAL(aExpected[i], dwPos.endPos);
 813             auto ewPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
 814                                                    i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
 815             CPPUNIT_ASSERT_EQUAL(aExpected[i], ewPos.endPos);
 816             nPos = dwPos.endPos;
 817             ++i;
 818         } while (nPos++ < aTest.getLength());
 819         CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
 820     }
 821
 822     //https://bz.apache.org/ooo/show_bug.cgi?id=21290
 823     for (int j = 0; j < 2; ++j)
 824     {
 825         switch (j)
 826         {
 827             case 0:
 828                 aLocale.Language = "en";
 829                 aLocale.Country = "US";
 830                 break;
 831             case 1:
 832                 aLocale.Language = "grc";
 833                 aLocale.Country.clear();
 834                 break;
 835             default:
 836                 CPPUNIT_ASSERT(false);
 837                 break;
 838         }
 839
 840         static constexpr OUString aTest =
 841             u"\u1F0C\u03BD\u03B4\u03C1\u03B1 \u1F00"
 842             "\u03C1\u03BD\u1F7B\u03BC\u03B5\u03BD\u03BF"
 843             "\u03C2 \u1F00\u03BB\u03BB \u1F24"
 844             "\u03C3\u03B8\u03B9\u03BF\u03BD"_ustr;
 845
 846         sal_Int32 nPos = 0;
 847         sal_Int32 aExpected[] = {5, 15, 19, 26};
 848         size_t i = 0;
 849         do
 850         {
 851             CPPUNIT_ASSERT(i < std::size(aExpected));
 852             nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
 853                 i18n::WordType::DICTIONARY_WORD, true).endPos;
 854             CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
 855             ++i;
 856         }
 857         while (nPos++ < aTest.getLength());
 858         CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
 859     }
 860
 861     //See https://bz.apache.org/ooo/show_bug.cgi?id=58513
 862     //See https://bugs.libreoffice.org/show_bug.cgi?id=55707
 863     {
 864         aLocale.Language = "fi";
 865         aLocale.Country = "FI";
 866
 867         OUString aTest(u"Kuorma-auto kaakkois- ja Keski-Suomi USA:n 90:n %:n"_ustr);
 868
 869         {
 870             sal_Int32 nPos = 0;
 871             sal_Int32 aExpected[] = {11, 21, 24, 36, 42, 47, 51};
 872             size_t i = 0;
 873             do
 874             {
 875                 CPPUNIT_ASSERT(i < std::size(aExpected));
 876                 nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
 877                     i18n::WordType::WORD_COUNT, true).endPos;
 878                 CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
 879                 ++i;
 880             }
 881             while (nPos++ < aTest.getLength());
 882             CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
 883         }
 884
 885         {
 886             sal_Int32 nPos = 0;
 887             sal_Int32 aExpected[] = {0, 11, 12, 20, 22, 24, 25, 36, 37,
 888                                     40, 41, 42, 43, 45, 46, 47, 50, 51};
 889             size_t i = 0;
 890             do
 891             {
 892                 CPPUNIT_ASSERT(i < std::size(aExpected));
 893                 aBounds = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
 894                     i18n::WordType::DICTIONARY_WORD, true);
 895                 CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.startPos);
 896                 ++i;
 897                 CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.endPos);
 898                 ++i;
 899                 nPos = aBounds.endPos;
 900             }
 901             while (nPos++ < aTest.getLength());
 902             CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
 903         }
 904     }
 905
 906     //See https://bz.apache.org/ooo/show_bug.cgi?id=107843
 907     {
 908         aLocale.Language = "en";
 909         aLocale.Country = "US";
 910
 911         static constexpr OUString aTest =
 912             u"ru\uFB00le \uFB01sh"_ustr;
 913
 914         aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 915         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
 916         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
 917
 918         aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 919         CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.startPos);
 920         CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
 921     }
 922
 923     //See https://bz.apache.org/ooo/show_bug.cgi?id=113785
 924     {
 925         aLocale.Language = "en";
 926         aLocale.Country = "US";
 927
 928         static constexpr OUString aTest =
 929             u"a\u2013b\u2014c"_ustr;
 930
 931         aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
 932         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
 933         CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
 934
 935         aBounds = m_xBreak->nextWord(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD);
 936         CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
 937         CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
 938
 939         aBounds = m_xBreak->nextWord(aTest, aBounds.endPos, aLocale, i18n::WordType::DICTIONARY_WORD);
 940         CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos);
 941         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
 942     }
 943
 944     // i#55778: Words containing numbers get broken up
 945     {
 946         aLocale.Language = "en";
 947         aLocale.Country = "US";
 948
 949         static constexpr OUString aTest = u"first i18n third"_ustr;
 950
 951         aBounds
 952             = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
 953         CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.startPos);
 954         CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.endPos);
 955     }
 956
 957     // i#56347: "BreakIterator patch for Hungarian"
 958     // i#56348: Special chars in first pos not handled by spell checking in Writer (Hungarian)
 959     // Rules for Hungarian affixes after numbers and certain symbols
 960     {
 961         aLocale.Language = "hu";
 962         aLocale.Country = "HU";
 963
 964         OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr;
 965
 966         for (auto mode :
 967              { i18n::WordType::DICTIONARY_WORD, i18n::WordType::ANYWORD_IGNOREWHITESPACES })
 968         {
 969             aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true);
 970             CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
 971             CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
 972
 973             aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true);
 974             CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
 975             CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
 976
 977             aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true);
 978             CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
 979             CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
 980
 981             aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, true);
 982             CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos);
 983             CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos);
 984
 985             aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true);
 986             CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
 987             CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
 988
 989             aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true);
 990             CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
 991             CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
 992
 993             aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, true);
 994             CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos);
 995             CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos);
 996         }
 997     }
 998
 999     // tdf#49885: Upgrade CJ word boundary analysis to ICU frequency-based analysis
1000     {
1001         aLocale.Language = "ja";
1002         aLocale.Country = "JP";
1003
1004         static constexpr OUString aTest = u"通産省工業技術院北海道工業開発試験所"_ustr;
1005
1006         aBounds
1007             = m_xBreak->getWordBoundary(aTest, 9, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1008
1009         // When using the old LO custom dictionaries, this will select the entire phrase.
1010         // When using ICU, it will select only 北海道.
1011         CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
1012         CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
1013     }
1014
1015     //  tdf#161737: narrow no-break space at the end of words resulted spelling mistakes
1016     {
1017         aLocale.Language = "en";
1018         aLocale.Country = "US";
1019
1020         OUString aTest(u"L’espace fine insécable\u202F!"_ustr);
1021         aBounds
1022             = m_xBreak->getWordBoundary(aTest, 14, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1023         CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.startPos);
1024         // This was 24 (word + NNBSP)
1025         CPPUNIT_ASSERT_EQUAL(sal_Int32(23), aBounds.endPos);
1026     }
1027
1028     //  tdf#161737: narrow no-break space between digits resulted spelling mistakes
1029     //  as a quick fix, limit NBSP as word-part character only for editing, and not for spell checking
1030     //  TODO: remove NBSP by the linguistic module or by the spell checking dictionaries to allow
1031     //  to check numbers with thousand separators and with correct suffix
1032     {
1033         aLocale.Language = "en";
1034         aLocale.Country = "US";
1035
1036         OUString aTest(u"1\u202F000\u202F000"_ustr);
1037         aBounds
1038             = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1039         // This was 0 (word + NNBSP)
1040         CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
1041         // This was 8 (word + NNBSP)
1042         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
1043     }
1044
1045     //  tdf#161737: narrow no-break space at the end of words resulted spelling mistakes
1046     {
1047         aLocale.Language = "hu";
1048         aLocale.Country = "HU";
1049
1050         OUString aTest(u"L’espace fine insécable\u202F!"_ustr);
1051         aBounds
1052             = m_xBreak->getWordBoundary(aTest, 14, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1053         CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.startPos);
1054         // This was 24 (word + NNBSP)
1055         CPPUNIT_ASSERT_EQUAL(sal_Int32(23), aBounds.endPos);
1056     }
1057
1058     //  tdf#161737: narrow no-break space between digits resulted spelling mistakes
1059     //  as a quick fix, limit NBSP as word-part character only for editing, and not for spell checking
1060     //  TODO: remove NBSP by the linguistic module or by the spell checking dictionaries to allow
1061     //  to check numbers with thousand separators and with correct suffix
1062     {
1063         aLocale.Language = "hu";
1064         aLocale.Country = "HU";
1065
1066         OUString aTest(u"1\u202F000\u202F000"_ustr);
1067         aBounds
1068             = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1069         // This was 0 (word + NNBSP)
1070         CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
1071         // This was 8 (word + NNBSP)
1072         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
1073     }
1074 }
1075
1076 void TestBreakIterator::testSentenceBoundaries()
1077 {
1078     lang::Locale aLocale;
1079     aLocale.Language = "en";
1080     aLocale.Country = "US";
1081
1082     // Trivial characteristic test for sentence boundary detection
1083     {
1084         OUString aTest(u"This is a sentence. This is a different sentence."_ustr);
1085
1086         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 5, aLocale));
1087         CPPUNIT_ASSERT_EQUAL(sal_Int32(19), m_xBreak->endOfSentence(aTest, 5, aLocale));
1088         CPPUNIT_ASSERT_EQUAL(sal_Int32(20), m_xBreak->beginOfSentence(aTest, 31, aLocale));
1089         CPPUNIT_ASSERT_EQUAL(sal_Int32(49), m_xBreak->endOfSentence(aTest, 31, aLocale));
1090     }
1091
1092     // i#24098: i18n API beginOfSentence/endOfSentence
1093     // fix beginOfSentence, ... when cursor is on the beginning of the sentence
1094     {
1095         OUString aTest(u"This is a sentence. This is a different sentence."_ustr);
1096
1097         CPPUNIT_ASSERT_EQUAL(sal_Int32(20), m_xBreak->beginOfSentence(aTest, 20, aLocale));
1098         CPPUNIT_ASSERT_EQUAL(sal_Int32(49), m_xBreak->endOfSentence(aTest, 20, aLocale));
1099     }
1100
1101     // i#24098: i18n API beginOfSentence/endOfSentence
1102     // "skip preceding space for beginOfSentence"
1103     {
1104         OUString aTest(u"This is a sentence.     This is a different sentence."_ustr);
1105
1106         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 20, aLocale));
1107         CPPUNIT_ASSERT_EQUAL(sal_Int32(19), m_xBreak->endOfSentence(aTest, 20, aLocale));
1108         CPPUNIT_ASSERT_EQUAL(sal_Int32(24), m_xBreak->beginOfSentence(aTest, 26, aLocale));
1109         CPPUNIT_ASSERT_EQUAL(sal_Int32(53), m_xBreak->endOfSentence(aTest, 26, aLocale));
1110     }
1111
1112     // i#55063: Sentence selection in Thai should select a space-delimited phrase.
1113     // - This customization broke at some point. It works in an English locale in a synthetic test
1114     // like this one, but does not work in the Thai locale, nor on Thai text in practice.
1115     {
1116         static constexpr OUString aTest = u"ว้อย โหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอก โปรโมเตอร์"_ustr;
1117
1118         aLocale.Language = "en";
1119         aLocale.Country = "US";
1120         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
1121         CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak->endOfSentence(aTest, 23, aLocale));
1122
1123         aLocale.Language = "th";
1124         aLocale.Country = "TH";
1125         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
1126         CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak->endOfSentence(aTest, 23, aLocale));
1127     }
1128
1129     // i#55063: Thai phrases should delimit English sentence selection.
1130     // - This customization broke at some point. It works in an English locale in a synthetic test
1131     // like this one, but does not work in the Thai locale, nor on Thai text in practice.
1132     {
1133         static constexpr OUString aTest = u"ว้อย English usually ends with a period โปรโมเตอร์."_ustr;
1134
1135         aLocale.Language = "en";
1136         aLocale.Country = "US";
1137         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
1138         CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak->endOfSentence(aTest, 23, aLocale));
1139
1140         aLocale.Language = "th";
1141         aLocale.Country = "TH";
1142         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
1143         CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak->endOfSentence(aTest, 23, aLocale));
1144     }
1145
1146     // i#55063: Characteristic test for English text delimiting Thai phrases (sentences)
1147     // - English text should not delimit Thai phrases.
1148     {
1149         static constexpr OUString aTest = u"Englishโหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอกEnglish"_ustr;
1150
1151         aLocale.Language = "en";
1152         aLocale.Country = "US";
1153         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
1154         CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak->endOfSentence(aTest, 23, aLocale));
1155
1156         aLocale.Language = "th";
1157         aLocale.Country = "TH";
1158         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
1159         CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak->endOfSentence(aTest, 23, aLocale));
1160     }
1161 }
1162
1163 //See https://bugs.libreoffice.org/show_bug.cgi?id=40292
1164 //See https://bz.apache.org/ooo/show_bug.cgi?id=80412
1165 //See https://bz.apache.org/ooo/show_bug.cgi?id=111152
1166 //See https://bz.apache.org/ooo/show_bug.cgi?id=50172
1167 void TestBreakIterator::testGraphemeIteration()
1168 {
1169     lang::Locale aLocale;
1170     aLocale.Language = "bn";
1171     aLocale.Country = "IN";
1172
1173     {
1174         static constexpr OUString aTest = u"\u09AC\u09CD\u09AF"_ustr; // BA HALANT LA
1175
1176         sal_Int32 nDone=0;
1177         sal_Int32 nPos;
1178         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
1179             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1180         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
1181         nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
1182             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1183         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
1184     }
1185
1186     {
1187         static constexpr OUString aTest = u"\u09B9\u09CD\u09A3\u09BF"_ustr;
1188             // HA HALANT NA VOWELSIGNI
1189
1190         sal_Int32 nDone=0;
1191         sal_Int32 nPos;
1192         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
1193             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1194         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
1195         nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
1196             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1197         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
1198     }
1199
1200     {
1201         static constexpr OUString aTest = u"\u09A4\u09CD\u09AE\u09CD\u09AF"_ustr;
1202             // TA HALANT MA HALANT YA
1203
1204         sal_Int32 nDone=0;
1205         sal_Int32 nPos;
1206         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
1207             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1208         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
1209         nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
1210             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1211         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
1212     }
1213
1214     aLocale.Language = "ta";
1215     aLocale.Country = "IN";
1216
1217     {
1218         static constexpr OUString aTest = u"\u0B9A\u0BBF\u0BA4\u0BCD\u0BA4\u0BBF\u0BB0\u0BC8"_ustr; // CA VOWELSIGNI TA VIRAMA TA VOWELSIGNI RA VOWELSIGNAI
1219
1220         sal_Int32 nDone=0;
1221         sal_Int32 nPos = 0;
1222
1223         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1224         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(2), nPos);
1225         nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1226         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(4), nPos);
1227         nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1228         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(6), nPos);
1229         nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1230         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
1231         nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
1232             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1233         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(6), nPos);
1234         nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1235         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(4), nPos);
1236         nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1237         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(2), nPos);
1238         nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1239         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
1240     }
1241
1242     {
1243         static constexpr OUString aTest = u"\u0B95\u0BC1"_ustr; // KA VOWELSIGNU
1244
1245         sal_Int32 nDone=0;
1246         sal_Int32 nPos = 0;
1247
1248         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
1249             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1250         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
1251         nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
1252             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1253         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
1254     }
1255
1256     {
1257         static constexpr OUString aTest =
1258             u"\u0B9A\u0BBF\u0BA4\u0BCD\u0BA4\u0BBF\u0BB0\u0BC8"_ustr;
1259             // CA VOWELSIGNI TA VIRAMA TA VOWELSIGNI RA VOWELSIGNAI
1260
1261         sal_Int32 nDone=0;
1262         sal_Int32 nPos=0;
1263
1264         for (sal_Int32 i = 0; i < 4; ++i)
1265         {
1266             sal_Int32 nOldPos = nPos;
1267             nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale,
1268                 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1269             CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos+2, nPos);
1270         }
1271
1272         for (sal_Int32 i = 0; i < 4; ++i)
1273         {
1274             sal_Int32 nOldPos = nPos;
1275             nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale,
1276                 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1277             CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos-2, nPos);
1278         }
1279     }
1280
1281     {
1282         static constexpr OUString aText = u"\u05D0\u05B8"_ustr; // ALEF QAMATS
1283
1284         sal_Int32 nGraphemeCount = 0;
1285
1286         sal_Int32 nCurPos = 0;
1287         while (nCurPos < aText.getLength())
1288         {
1289             sal_Int32 nCount2 = 1;
1290             nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(),
1291                 i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
1292             ++nGraphemeCount;
1293         }
1294
1295         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be considered 1 grapheme", static_cast<sal_Int32>(1), nGraphemeCount);
1296     }
1297
1298     aLocale.Language = "hi";
1299     aLocale.Country = "IN";
1300
1301     {
1302         static constexpr OUString aTest = u"\u0936\u0940"_ustr; // SHA VOWELSIGNII
1303
1304         sal_Int32 nDone=0;
1305         sal_Int32 nPos = 0;
1306
1307         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
1308             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1309         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
1310         nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
1311             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1312         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
1313     }
1314
1315     // tdf#49885: Replace custom Thai implementation with ICU
1316     {
1317         aLocale.Language = "th";
1318         aLocale.Country = "TH";
1319
1320         static constexpr OUString aTest = u"กำ"_ustr;
1321
1322         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 2 }, aTest.getLength());
1323
1324         sal_Int32 nDone = 0;
1325         sal_Int32 nPos = 0;
1326
1327         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1,
1328                                         nDone);
1329         CPPUNIT_ASSERT_EQUAL(aTest.getLength(), nPos);
1330
1331         nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
1332                                             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1333         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, nPos);
1334     }
1335
1336     // Korean may also use grapheme clusters for character composition
1337     {
1338         aLocale.Language = "ko";
1339         aLocale.Country = "KR";
1340
1341         static constexpr OUString aTest = u"각"_ustr;
1342
1343         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 3 }, aTest.getLength());
1344
1345         sal_Int32 nDone = 0;
1346         sal_Int32 nPos = 0;
1347
1348         nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1,
1349                                         nDone);
1350         CPPUNIT_ASSERT_EQUAL(aTest.getLength(), nPos);
1351
1352         nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
1353                                             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1354         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, nPos);
1355     }
1356 }
1357
1358 // Regression guard for script classification: every code unit sampled below
1359 // must be reported as i18n::ScriptType::WEAK by getScriptType(), so that existing
1360 // documents depending on this don't silently change font for those characters.
1361 void TestBreakIterator::testWeak()
1362 {
1363     // getScriptType() is given only the text and an index, so no locale is
1364     // set up in this test.
1365     //
1366     // Most samples come in pairs taken from the range named beside them.
1367     {
1368         static constexpr OUString aWeaks =
1369             u"\u0001\u0002"
1370             " \u00A0"
1371             "\u0300\u036F"  //Combining Diacritical Marks
1372             "\u1AB0\u1AFF"  //Combining Diacritical Marks Extended
1373             "\u1DC0\u1DFF"  //Combining Diacritical Marks Supplement
1374             "\u20D0\u20FF"  //Combining Diacritical Marks for Symbols
1375             "\u2150\u215F"  //Number Forms, fractions
1376             "\u2160\u2180"  //Number Forms, roman numerals
1377             "\u2200\u22FF"  //Mathematical Operators
1378             "\u27C0\u27EF"  //Miscellaneous Mathematical Symbols-A
1379             "\u2980\u29FF"  //Miscellaneous Mathematical Symbols-B
1380             "\u2A00\u2AFF"  //Supplemental Mathematical Operators
1381             "\u2100\u214F"  //Letterlike Symbols
1382             "\u2308\u230B"  //Miscellaneous technical
1383             "\u25A0\u25FF"  //Geometric Shapes
1384             "\u2B30\u2B4C"_ustr; //Miscellaneous Symbols and Arrows
1385
1386         for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i)
1387         {
1388             const sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i);
1389             // Put the offending code unit (in hex) into the failure message.
1390             const OString aMsg = "Char 0x" +
1391                 OString::number(static_cast<sal_Int32>(std::u16string_view(aWeaks)[i]), 16) +
1392                 " should have been weak";
1393             CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
1394                 i18n::ScriptType::WEAK, nScript);
1395         }
1396     }
1397 }
1398
1399 // Regression guard for script classification: every code unit sampled below
1400 // must be reported as i18n::ScriptType::ASIAN by getScriptType(), so that existing
1401 // documents depending on this don't silently change font for those characters.
1402 // See https://bugs.libreoffice.org/show_bug.cgi?id=38095
1403 void TestBreakIterator::testAsian()
1404 {
1405     // getScriptType() is given only the text and an index, so no locale is
1406     // set up in this test.
1407     //
1408     // Samples come in pairs taken from the range described above each literal.
1409     {
1410         static constexpr OUString aAsians =
1411             //some typical CJK chars
1412             u"\u4E00\u62FF"
1413             //The full HalfWidth and FullWidth block has historically been
1414             //designated as taking the CJK font :-(
1415             //HalfWidth and FullWidth forms of ASCII 0-9, categorized under
1416             //UAX24 as "Common" i.e. by that logic WEAK
1417             "\uFF10\uFF19"
1418             //HalfWidth and FullWidth forms of ASCII A-z, categorized under
1419             //UAX24 as "Latin", i.e. by that logic LATIN
1420             "\uFF21\uFF5A"_ustr;
1421
1422         for (sal_Int32 i = 0; i < aAsians.getLength(); ++i)
1423         {
1424             const sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i);
1425             // Put the offending code unit (in hex) into the failure message.
1426             const OString aMsg = "Char 0x" +
1427                 OString::number(static_cast<sal_Int32>(std::u16string_view(aAsians)[i]), 16) +
1428                 " should have been asian";
1429             CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
1430                 i18n::ScriptType::ASIAN, nScript);
1431         }
1432     }
1433 }
1434
1435 // Checks that dictionary word boundaries are found inside a run of Lao text.
1436 void TestBreakIterator::testLao()
1437 {
1438     const lang::Locale aLao{ "lo", "LA", "" };
1439
1440     // Twelve UTF-16 code units of Lao text.
1441     static constexpr OUString aText = u"\u0e8d\u0eb4\u0e99\u0e94\u0eb5\u0e95\u0ec9\u0ead\u0e99\u0eae\u0eb1\u0e9a"_ustr;
1442
1443     // First word: ask for the word at the very start of the text.
1444     const i18n::Boundary aFirst = m_xBreak->getWordBoundary(aText, 0, aLao,
1445         i18n::WordType::DICTIONARY_WORD, true);
1446     CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, aFirst.startPos);
1447     CPPUNIT_ASSERT_EQUAL(sal_Int32{ 5 }, aFirst.endPos);
1448
1449     // Second word: ask again from where the first one ended.
1450     const i18n::Boundary aSecond = m_xBreak->getWordBoundary(aText, aFirst.endPos, aLao,
1451         i18n::WordType::DICTIONARY_WORD, true);
1452     CPPUNIT_ASSERT_EQUAL(sal_Int32{ 5 }, aSecond.startPos);
1453 #if (U_ICU_VERSION_MAJOR_NUM < 70)
1454     CPPUNIT_ASSERT_EQUAL(sal_Int32{ 9 }, aSecond.endPos);
1455 #else
1456     // FIXME:
1457     // With ICU 70/71 the word boundary at 9 is, for a reason not yet understood,
1458     // not detected; the text length 12 is returned as endPos instead.
1459     // This happens deep in
1460     // icu_70::RuleBasedBreakIterator::BreakCache::next()
1461     // icu_70::RuleBasedBreakIterator::BreakCache::following()
1462     // icu_70::RuleBasedBreakIterator::following()
1463     // i18npool::BreakIterator_Unicode::getWordBoundary()
1464     CPPUNIT_ASSERT_EQUAL(sal_Int32{ 12 }, aSecond.endPos);
1465 #endif
1466 }
1467
1468 // Checks Thai word boundary detection and stepping over a surrogate pair.
1469 void TestBreakIterator::testThai()
1470 {
1471     const lang::Locale aThai{ "th", "TH", "" };
1472
1473     // A six-unit sample must come back as one dictionary word.
1474     // See http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html
1475     {
1476         static constexpr OUString aText = u"\u0E01\u0E38\u0E2B\u0E25\u0E32\u0E1A"_ustr;
1477         const i18n::Boundary aWord = m_xBreak->getWordBoundary(aText, 0, aThai,
1478             i18n::WordType::DICTIONARY_WORD, true);
1479         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full word", sal_Int32{ 0 }, aWord.startPos);
1480         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full word", aText.getLength(), aWord.endPos);
1481     }
1482
1483     // Walking forwards and then backwards must visit the same word starts.
1484     // See https://bz.apache.org/ooo/show_bug.cgi?id=29548
1485     {
1486         static constexpr OUString aText =
1487             u"\u0E2D\u0E38\u0E17\u0E22\u0E32\u0E19\u0E41"
1488             "\u0E2B\u0E48\u0E07\u0E0A\u0E32\u0E15\u0E34"
1489             "\u0E19\u0E49\u0E33\u0E2B\u0E19\u0E32\u0E27"
1490             "\u0E2D\u0E38\u0E17\u0E22\u0E32\u0E19\u0E41"
1491             "\u0E2B\u0E48\u0E07\u0E0A\u0E32\u0E15\u0E34"
1492             "\u0E19\u0E49\u0E33\u0E2B\u0E19\u0E32\u0E27"_ustr;
1493         const sal_Int32 nLength = aText.getLength();
1494         // Forward pass: record each word start, beginning before the text.
1495         std::stack<sal_Int32> aPositions;
1496         sal_Int32 nCursor = -1;
1497         for (;;)
1498         {
1499             nCursor = m_xBreak->nextWord(aText, nCursor, aThai, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
1500             aPositions.push(nCursor);
1501             if (nCursor >= nLength)
1502                 break;
1503         }
1504         // Discard the position that ended the forward pass.
1505         CPPUNIT_ASSERT(!aPositions.empty());
1506         aPositions.pop();
1507
1508         // Backward pass from the end: each word start must match the stack top.
1509         nCursor = nLength;
1510         for (;;)
1511         {
1512             CPPUNIT_ASSERT(!aPositions.empty());
1513             nCursor = m_xBreak->previousWord(aText, nCursor, aThai, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
1514             CPPUNIT_ASSERT_EQUAL(aPositions.top(), nCursor);
1515             aPositions.pop();
1516             if (nCursor <= 0)
1517                 break;
1518         }
1519     }
1520
1521     // tdf#113694: U+10000 takes two UTF-16 code units and must be stepped over whole.
1522     {
1523         static constexpr OUString aText = u"\U00010000"_ustr;
1524         sal_Int32 nDone = 0;
1525         sal_Int32 nAt = m_xBreak->nextCharacters(aText, 0, aThai,
1526             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1527         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", aText.getLength(), nAt);
1528         nAt = m_xBreak->previousCharacters(aText, aText.getLength(), aThai,
1529             i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1530         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nAt);
1531
1532         nAt = m_xBreak->nextCharacters(aText, 0, aThai,
1533             i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
1534         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", aText.getLength(), nAt);
1535         nAt = m_xBreak->previousCharacters(aText, aText.getLength(), aThai,
1536             i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
1537         CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nAt);
1538     }
1539 }
1540
1541 #ifdef TODO
1542 void TestBreakIterator::testNorthernThai()
1543 {
1544     // Compiled only when TODO is defined: a Northern Thai (nod-TH) sample is
1545     // expected to come back as a single dictionary word.
1546     const lang::Locale aNorthernThai{ "nod", "TH", "" };
1547
1548     const sal_Unicode aCodeUnits[] = { 0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
1549     const OUString aTest(aCodeUnits, SAL_N_ELEMENTS(aCodeUnits));
1550     const i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aNorthernThai,
1551         i18n::WordType::DICTIONARY_WORD, true);
1552     CPPUNIT_ASSERT_MESSAGE("Should skip full word",
1553         aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
1554 }
1555
1556 // Not sure if any version earlier than 49 did have Khmer word boundary
1557 // dictionaries, 4.6 does not.
1558
1559 // As of ICU 54, word boundary detection for Khmer is still considered
1560 // insufficient, so the ICU Khmer support is disabled.
1561
1562 // Checks that a Khmer sample is split into two dictionary words
1563 // (compiled only when TODO is defined).
1564 // See https://bugs.libreoffice.org/show_bug.cgi?id=52020
1565 void TestBreakIterator::testKhmer()
1566 {
1567     const lang::Locale aKhmer{ "km", "KH", "" };
1568
1569     // Five code units; the expected word boundaries are 0..3 and 3..5.
1570     const sal_Unicode aCodeUnits[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 };
1571     const OUString aTest(aCodeUnits, SAL_N_ELEMENTS(aCodeUnits));
1572
1573     // First word, queried at the start of the text.
1574     i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aKhmer,
1575         i18n::WordType::DICTIONARY_WORD, true);
1576     CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
1577
1578     // Second word, queried from where the first one ended.
1579     aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aKhmer,
1580         i18n::WordType::DICTIONARY_WORD, true);
1581     CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 5);
1582 }
1583 #endif
1584
1585 void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak)
1586 {
1587     // Shared body of testJapanese(): runs the same word boundary checks against
1588     // whichever break iterator instance is passed in as xBreak.
1589     const lang::Locale aJapanese{ "ja", "JP", "" };
1590
1591     // Seven-character sample: the word around index 5 spans 4..7.
1592     {
1593         static constexpr OUString aText = u"シャットダウン"_ustr;
1594         const i18n::Boundary aWord = xBreak->getWordBoundary(aText, 5, aJapanese,
1595             i18n::WordType::DICTIONARY_WORD, true);
1596         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 4 }, aWord.startPos);
1597         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 7 }, aWord.endPos);
1598     }
1599
1600     // The same three-unit sequence twice: each copy is reported as its own word.
1601     {
1602         static constexpr OUString aText = u"\u9EBB\u306E\u8449\u9EBB\u306E\u8449"_ustr;
1603
1604         const i18n::Boundary aFirst = xBreak->getWordBoundary(aText, 1, aJapanese,
1605             i18n::WordType::DICTIONARY_WORD, true);
1606         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, aFirst.startPos);
1607         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 3 }, aFirst.endPos);
1608
1609         const i18n::Boundary aSecond = xBreak->getWordBoundary(aText, 5, aJapanese,
1610             i18n::WordType::DICTIONARY_WORD, true);
1611         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 3 }, aSecond.startPos);
1612         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 6 }, aSecond.endPos);
1613     }
1614
1615     // tdf#162912: Double-clicking should only select one Basic identifier
1616     {
1617         static constexpr OUString aText = u"ThisComponent.CurrentSelection"_ustr;
1618
1619         // As a dictionary word the dotted expression stays in one piece ...
1620         const i18n::Boundary aDict
1621             = xBreak->getWordBoundary(aText, 5, aJapanese, i18n::WordType::DICTIONARY_WORD, true);
1622         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, aDict.startPos);
1623         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 30 }, aDict.endPos);
1624
1625         // ... while ANYWORD_IGNOREWHITESPACES yields each identifier on its own.
1626         const i18n::Boundary aLeft = xBreak->getWordBoundary(aText, 5, aJapanese,
1627             i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
1628         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, aLeft.startPos);
1629         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 13 }, aLeft.endPos);
1630
1631         const i18n::Boundary aRight = xBreak->getWordBoundary(aText, 15, aJapanese,
1632             i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
1633         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 14 }, aRight.startPos);
1634         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 30 }, aRight.endPos);
1635     }
1636 }
1637
1638 void TestBreakIterator::testJapanese()
1639 {
1640     // Run the Japanese checks on the fixture's own iterator first ...
1641     doTestJapanese(m_xBreak);
1642
1643     // ... then on a newly created one.
1644     // fdo#78479 - test second / cached instantiation of xdictionary
1645     const uno::Reference< i18n::XBreakIterator > xSecond(m_xSFactory->createInstance(
1646         u"com.sun.star.i18n.BreakIterator"_ustr), uno::UNO_QUERY_THROW);
1647     doTestJapanese(xSecond);
1648 }
1648
1649 void TestBreakIterator::testChinese()
1650 {
1651     // Checks the dictionary word found at the end of a Chinese (zh-CN) sample.
1652     lang::Locale aLocale{ "zh", "CN", "" };
1653
1654     {
1655         // Four BMP ideographs followed by U+29EDB, which takes two UTF-16 code
1656         // units (indices 4 and 5), so the text is six units long.
1657         static constexpr OUString aTest = u"\u6A35\u6A30\u69FE\u8919\U00029EDB"_ustr;
1658         i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale,
1659             i18n::WordType::DICTIONARY_WORD, true);
1660         CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos);
1661         CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
1662     }
1663 }
1664
1665 void TestBreakIterator::testDictWordPrepostDash()
1666 {
1667     // For each locale below, a hyphen at the end or at the start of a word is
1668     // expected to be part of that dictionary word.
1669     const lang::Locale aLocales[] = { { "de", "DE", "" }, { "nds", "DE", "" }, { "nl", "NL", "" },
1670                                       { "sv", "SE", "" }, { "da", "DK", "" } };
1671
1672     static constexpr OUString aText = u"Arbeits- -nehmer"_ustr;
1673     for (const lang::Locale& rLocale : aLocales)
1674     {
1675         // "Arbeits-" is expected at 0..8 ...
1676         const i18n::Boundary aLeading
1677             = m_xBreak->getWordBoundary(aText, 3, rLocale, i18n::WordType::DICTIONARY_WORD, false);
1678         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, aLeading.startPos);
1679         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 8 }, aLeading.endPos);
1680
1681         // ... and "-nehmer" at 9..16.
1682         const i18n::Boundary aTrailing
1683             = m_xBreak->getWordBoundary(aText, 13, rLocale, i18n::WordType::DICTIONARY_WORD, false);
1684         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 9 }, aTrailing.startPos);
1685         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 16 }, aTrailing.endPos);
1686     }
1687 }
1688
1689 void TestBreakIterator::testDictWordAbbreviation()
1690 {
1691     // Abbreviations written with dots ("e.g.", "i.e.", "etc.") are expected to
1692     // be reported as whole dictionary words, trailing dot included, and the
1693     // plain words around them must keep their usual boundaries.
1694     const lang::Locale aLocales[] = {
1695         { "en", "US", "" }, // dict_word locale
1696         { "de", "DE", "" } // dict_word_prepostdash locale
1697     };
1698
1699     // The same sentence is probed for every locale.
1700     static constexpr OUString aText = u"Examples: e.g. i.e. etc. and such"_ustr;
1701
1702     // Index to query, followed by the word boundaries expected around it.
1703     struct Probe
1704     {
1705         sal_Int32 nQuery;
1706         sal_Int32 nStart;
1707         sal_Int32 nEnd;
1708     };
1709     const Probe aProbes[] = {
1710         { 3, 0, 8 }, // Examples
1711         { 10, 10, 14 }, // e.g.
1712         { 15, 15, 19 }, // i.e.
1713         { 20, 20, 24 }, // etc.
1714         { 26, 25, 28 }, // and
1715         { 30, 29, 33 } // such
1716     };
1717
1718     // Probes run in table order for each locale; the first mismatch fails the
1719     // test, and the expected value in the report identifies the probe.
1720     for (const lang::Locale& rLocale : aLocales)
1721     {
1722         for (const Probe& rProbe : aProbes)
1723         {
1724             const i18n::Boundary aWord = m_xBreak->getWordBoundary(
1725                 aText, rProbe.nQuery, rLocale, i18n::WordType::DICTIONARY_WORD, false);
1726             CPPUNIT_ASSERT_EQUAL(rProbe.nStart, aWord.startPos);
1727             CPPUNIT_ASSERT_EQUAL(rProbe.nEnd, aWord.endPos);
1728         }
1729     }
1730 }
1731
1732 void TestBreakIterator::testHebrewGereshGershaim()
1733 {
1734     // Hebrew documents can represent the geresh and gershayim intra-word
1735     // punctuation marks in several valid ways. Every spelling below must stay
1736     // inside the surrounding word, both for DICTIONARY_WORD and for
1737     // ANYWORD_IGNOREWHITESPACES. Each spelling has its own scope so that a
1738     // failing assertion points at the variant concerned.
1739     //
1740     // See the following bugs:
1741     // i#51661: Add quotation mark as middle letter for Hebrew
1742     // tdf#46950: Spell-checking breaks Hebrew words at intra-word single and double quotes
1743
1744     const lang::Locale aHebrew{ "he", "IL", "" };
1745
1746     // Unicode U+05F3 HEBREW PUNCTUATION GERESH
1747     {
1748         const OUString aText = u"ג׳ירפה"_ustr;
1749
1750         const i18n::Boundary aDict = m_xBreak->getWordBoundary(
1751             aText, 3, aHebrew, i18n::WordType::DICTIONARY_WORD, false);
1752         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, aDict.startPos);
1753         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 6 }, aDict.endPos);
1754
1755         const i18n::Boundary aAny = m_xBreak->getWordBoundary(
1756             aText, 3, aHebrew, i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
1757         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, aAny.startPos);
1758         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 6 }, aAny.endPos);
1759     }
1760
1761     // Apostrophe as geresh
1762     {
1763         const OUString aText = u"ג'ירפה"_ustr;
1764
1765         const i18n::Boundary aDict = m_xBreak->getWordBoundary(
1766             aText, 3, aHebrew, i18n::WordType::DICTIONARY_WORD, false);
1767         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, aDict.startPos);
1768         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 6 }, aDict.endPos);
1769
1770         const i18n::Boundary aAny = m_xBreak->getWordBoundary(
1771             aText, 3, aHebrew, i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
1772         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, aAny.startPos);
1773         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 6 }, aAny.endPos);
1774     }
1775
1776     // Right single quote as geresh
1777     {
1778         const OUString aText = u"ג’ירפה"_ustr;
1779
1780         const i18n::Boundary aDict = m_xBreak->getWordBoundary(
1781             aText, 3, aHebrew, i18n::WordType::DICTIONARY_WORD, false);
1782         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, aDict.startPos);
1783         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 6 }, aDict.endPos);
1784
1785         const i18n::Boundary aAny = m_xBreak->getWordBoundary(
1786             aText, 3, aHebrew, i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
1787         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, aAny.startPos);
1788         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 6 }, aAny.endPos);
1789     }
1790
1791     // Unicode U+05F4 HEBREW PUNCTUATION GERSHAYIM
1792     {
1793         const OUString aText = u"דו״ח"_ustr;
1794
1795         const i18n::Boundary aDict = m_xBreak->getWordBoundary(
1796             aText, 2, aHebrew, i18n::WordType::DICTIONARY_WORD, false);
1797         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, aDict.startPos);
1798         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 4 }, aDict.endPos);
1799
1800         const i18n::Boundary aAny = m_xBreak->getWordBoundary(
1801             aText, 2, aHebrew, i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
1802         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, aAny.startPos);
1803         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 4 }, aAny.endPos);
1804     }
1805
1806     // Double quote as gershayim
1807     {
1808         const OUString aText = u"דו\"ח"_ustr;
1809
1810         const i18n::Boundary aDict = m_xBreak->getWordBoundary(
1811             aText, 2, aHebrew, i18n::WordType::DICTIONARY_WORD, false);
1812         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, aDict.startPos);
1813         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 4 }, aDict.endPos);
1814
1815         const i18n::Boundary aAny = m_xBreak->getWordBoundary(
1816             aText, 2, aHebrew, i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
1817         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, aAny.startPos);
1818         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 4 }, aAny.endPos);
1819     }
1820
1821     // Right double quote as gershayim
1822     {
1823         const OUString aText = u"דו”ח"_ustr;
1824
1825         const i18n::Boundary aDict = m_xBreak->getWordBoundary(
1826             aText, 2, aHebrew, i18n::WordType::DICTIONARY_WORD, false);
1827         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, aDict.startPos);
1828         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 4 }, aDict.endPos);
1829
1830         const i18n::Boundary aAny = m_xBreak->getWordBoundary(
1831             aText, 2, aHebrew, i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
1832         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, aAny.startPos);
1833         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 4 }, aAny.endPos);
1834     }
1835 }
1836
1837 void TestBreakIterator::testLegacySurrogatePairs()
1838 {
1839     // i#75632: [surrogate pair] Japanese word break does not work properly for surrogate pairs.
1840     // and many others to address bugs: i#75631 i#75633 i#75412 etc.
1841     //
1842     // BreakIterator supports surrogate pairs (UTF-16). This is a simple characteristic test.
1843     const lang::Locale aJapanese{ "ja", "JP", "" };
1844
1845     {
1846         // In UTF-16 code units: 'X' at 0, space at 1, the surrogate pair at 2..3,
1847         // space at 4 and 'X' at 5.
1848         static constexpr OUString aText = u"X 𠮟 X"_ustr;
1849
1850         const i18n::Boundary aLeading
1851             = m_xBreak->getWordBoundary(aText, 1, aJapanese, i18n::WordType::DICTIONARY_WORD, false);
1852         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, aLeading.startPos);
1853         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 1 }, aLeading.endPos);
1854
1855         // The pair must be returned whole, never split between its two units.
1856         const i18n::Boundary aPair
1857             = m_xBreak->getWordBoundary(aText, 2, aJapanese, i18n::WordType::DICTIONARY_WORD, false);
1858         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 2 }, aPair.startPos);
1859         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 4 }, aPair.endPos);
1860
1861         const i18n::Boundary aTrailing
1862             = m_xBreak->getWordBoundary(aText, 5, aJapanese, i18n::WordType::DICTIONARY_WORD, false);
1863         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 5 }, aTrailing.startPos);
1864         CPPUNIT_ASSERT_EQUAL(sal_Int32{ 6 }, aTrailing.endPos);
1865     }
1866 }
1867
1868 void TestBreakIterator::testWordCount()
1869 {
1870     // Counts the words of rText for rLocale using WordType::WORD_COUNT: one
1871     // for a word beginning at index 0, plus one for every nextWord() result
1872     // that is not empty. Each search resumes at the end of the previous word.
1873     auto fnCountWords = [&](const OUString& rText, const lang::Locale& rLocale) -> int
1874     {
1875         sal_Int32 nNextPos = 0;
1876         int nIterGuard = 0;
1877
1878         const bool bStartsWithWord
1879             = m_xBreak->isBeginWord(rText, nNextPos, rLocale, i18n::WordType::WORD_COUNT);
1880         int nWords = bStartsWithWord ? 1 : 0;
1881
1882         for (;;)
1883         {
1884             // Fail the test instead of spinning if the iterator stops advancing.
1885             CPPUNIT_ASSERT_MESSAGE("Tripped infinite loop check", ++nIterGuard < 100);
1886
1887             const auto aBounds = m_xBreak->nextWord(rText, nNextPos, rLocale, i18n::WordType::WORD_COUNT);
1888             // An empty boundary means there are no further words.
1889             if (aBounds.endPos == aBounds.startPos)
1890                 break;
1891
1892             nNextPos = aBounds.endPos;
1893             ++nWords;
1894         }
1895
1896         return nWords;
1897     };
1898
1899     // i#80815: "Word count differs from MS Word"
1900     // This is a characteristic test for word count using test data from the linked bug.
1901     {
1902         const lang::Locale aEnglish{ "en", "US", "" };
1903
1904         // One sample per line; the expected total below is for the whole text.
1905         const OUString aSample = u""
1906                                  "test data for word count issue #80815\n"
1907                                  "fo\\\'sforos\n"
1908                                  "archipi\\\'elago\n"
1909                                  "do\\^me\n"
1910                                  "f**k\n"
1911                                  "\n"
1912                                  "battery-driven\n"
1913                                  "and/or\n"
1914                                  "apple(s)\n"
1915                                  "money+opportunity\n"
1916                                  "Micro$oft\n"
1917                                  "\n"
1918                                  "300$\n"
1919                                  "I(not you)\n"
1920                                  "a****n\n"
1921                                  "1+3=4\n"
1922                                  "\n"
1923                                  "aaaaaaa.aaaaaaa\n"
1924                                  "aaaaaaa,aaaaaaa\n"
1925                                  "aaaaaaa;aaaaaaa\n"_ustr;
1926
1927         CPPUNIT_ASSERT_EQUAL(24, fnCountWords(aSample, aEnglish));
1928     }
1929
1930     // Test that the switch to upstream ICU for CJ word boundary analysis doesn't change word count.
1931     {
1932         const lang::Locale aJapanese{ "ja", "JP", "" };
1933
1934         // Mixed Latin and Japanese text, including a line break.
1935         const OUString aSample = u"Wordの様にワード数をするのにTest\n植松町"_ustr;
1936
1937         CPPUNIT_ASSERT_EQUAL(8, fnCountWords(aSample, aJapanese));
1938     }
1939
1940     // tdf#150621 Korean words should be counted individually, rather than by syllable.
1941     //
1942     // Per i#80815, the intention for the word count feature is to emulate the behavior of MS Word.
1943     {
1944         const lang::Locale aKorean{ "ko", "KR", "" };
1945
1946         // Basic case: Korean words are counted as space-delimited. In particular, grammatical
1947         // particles are treated as part of the previous word.
1948         CPPUNIT_ASSERT_EQUAL(3, fnCountWords(u"저는 영화를 봤어요"_ustr, aKorean));
1949
1950         // Mixed script: Korean is mostly written in hangul, but hanja are still used in certain
1951         // situations (e.g. abbreviations in newspaper articles). For Chinese and Japanese, such
1952         // ideographs would be counted individually as words. In Korean, however, they are treated
1953         // no differently than hangul characters.
1954         // The variants below differ only in where spaces are placed.
1955         CPPUNIT_ASSERT_EQUAL(1, fnCountWords(u"불렀다...與"_ustr, aKorean));
1956         CPPUNIT_ASSERT_EQUAL(2, fnCountWords(u"불렀다 ...與"_ustr, aKorean));
1957         CPPUNIT_ASSERT_EQUAL(3, fnCountWords(u"불렀다 ... 與"_ustr, aKorean));
1958
1959         // A hanja written next to hangul, without and with a separating space.
1960         CPPUNIT_ASSERT_EQUAL(1, fnCountWords(u"尹탄핵"_ustr, aKorean));
1961         CPPUNIT_ASSERT_EQUAL(2, fnCountWords(u"尹 탄핵"_ustr, aKorean));
1962     }
1963 }
1964
1965 void TestBreakIterator::testDictionaryIteratorLanguages()
1966 {
1967     // Thai
1968     {
1969         lang::Locale aLocale{ "th", "TH", "" };
1970
1971         const OUString aStr = u"รอนานหรือเปล่า"_ustr;
1972
1973         i18n::Boundary aBounds;
1974
1975         aBounds
1976             = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true);
1977         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1978         CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
1979
1980         aBounds
1981             = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::DICTIONARY_WORD, true);
1982         CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
1983         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
1984
1985         aBounds
1986             = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::DICTIONARY_WORD, true);
1987         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
1988         CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
1989
1990         aBounds
1991             = m_xBreak->getWordBoundary(aStr, 10, aLocale, i18n::WordType::DICTIONARY_WORD, true);
1992         CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
1993         CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
1994
1995         aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true);
1996         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1997         CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
1998
1999         aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::ANY_WORD, true);
2000         CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
2001         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
2002
2003         aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::ANY_WORD, true);
2004         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
2005         CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
2006
2007         aBounds = m_xBreak->getWordBoundary(aStr, 10, aLocale, i18n::WordType::ANY_WORD, true);
2008         CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
2009         CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
2010
2011         aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale,
2012                                             i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2013         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
2014         CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
2015
2016         aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale,
2017                                             i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2018         CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
2019         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
2020
2021         aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale,
2022                                             i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2023         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
2024         CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
2025
2026         aBounds = m_xBreak->getWordBoundary(aStr, 10, aLocale,
2027                                             i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2028         CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
2029         CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
2030     }
2031
2032     // Japanese
2033     {
2034         lang::Locale aLocale{ "ja", "JP", "" };
2035
2036         const OUString aStr = u"通産省工業技術院北海道"_ustr;
2037
2038         i18n::Boundary aBounds;
2039
2040         aBounds
2041             = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2042         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
2043         CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
2044
2045         aBounds
2046             = m_xBreak->getWordBoundary(aStr, 2, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2047         CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
2048         CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
2049
2050         aBounds
2051             = m_xBreak->getWordBoundary(aStr, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2052         CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
2053         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
2054
2055         aBounds
2056             = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2057         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
2058         CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
2059
2060         aBounds
2061             = m_xBreak->getWordBoundary(aStr, 7, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2062         CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
2063         CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
2064
2065         aBounds
2066             = m_xBreak->getWordBoundary(aStr, 9, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2067         CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
2068         CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
2069
2070         aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true);
2071         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
2072         CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
2073
2074         aBounds = m_xBreak->getWordBoundary(aStr, 2, aLocale, i18n::WordType::ANY_WORD, true);
2075         CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
2076         CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
2077
2078         aBounds = m_xBreak->getWordBoundary(aStr, 4, aLocale, i18n::WordType::ANY_WORD, true);
2079         CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
2080         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
2081
2082         aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::ANY_WORD, true);
2083         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
2084         CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
2085
2086         aBounds = m_xBreak->getWordBoundary(aStr, 7, aLocale, i18n::WordType::ANY_WORD, true);
2087         CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
2088         CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
2089
2090         aBounds = m_xBreak->getWordBoundary(aStr, 9, aLocale, i18n::WordType::ANY_WORD, true);
2091         CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
2092         CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
2093
2094         aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale,
2095                                             i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2096         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
2097         CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
2098
2099         aBounds = m_xBreak->getWordBoundary(aStr, 2, aLocale,
2100                                             i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2101         CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
2102         CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
2103
2104         aBounds = m_xBreak->getWordBoundary(aStr, 4, aLocale,
2105                                             i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2106         CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
2107         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
2108
2109         aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale,
2110                                             i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2111         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
2112         CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
2113
2114         aBounds = m_xBreak->getWordBoundary(aStr, 7, aLocale,
2115                                             i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2116         CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
2117         CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
2118
2119         aBounds = m_xBreak->getWordBoundary(aStr, 9, aLocale,
2120                                             i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2121         CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
2122         CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
2123     }
2124
2125     // Chinese
2126     {
2127         lang::Locale aLocale{ "zh", "CN", "" };
2128
2129         const OUString aStr = u"很高兴认识你"_ustr;
2130
2131         i18n::Boundary aBounds;
2132
2133         aBounds
2134             = m_xBreak->getWordBoundary(aStr, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2135         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
2136         CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
2137
2138         aBounds
2139             = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2140         CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos);
2141         CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
2142
2143         aBounds
2144             = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2145         CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
2146         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
2147
2148         aBounds
2149             = m_xBreak->getWordBoundary(aStr, 5, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2150         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
2151         CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
2152
2153         aBounds = m_xBreak->getWordBoundary(aStr, 0, aLocale, i18n::WordType::ANY_WORD, true);
2154         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
2155         CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
2156
2157         aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true);
2158         CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos);
2159         CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
2160
2161         aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::ANY_WORD, true);
2162         CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
2163         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
2164
2165         aBounds = m_xBreak->getWordBoundary(aStr, 5, aLocale, i18n::WordType::ANY_WORD, true);
2166         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
2167         CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
2168
2169         aBounds = m_xBreak->getWordBoundary(aStr, 0, aLocale,
2170                                             i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2171         CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
2172         CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
2173
2174         aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale,
2175                                             i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2176         CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos);
2177         CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
2178
2179         aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale,
2180                                             i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2181         CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
2182         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
2183
2184         aBounds = m_xBreak->getWordBoundary(aStr, 5, aLocale,
2185                                             i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2186         CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
2187         CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
2188     }
2189 }
2190
2191 void TestBreakIterator::setUp()
2192 {
2193     BootstrapFixtureBase::setUp();
2194     m_xBreak.set(m_xSFactory->createInstance(u"com.sun.star.i18n.BreakIterator"_ustr), uno::UNO_QUERY_THROW);
2195 }
2196
2197 void TestBreakIterator::tearDown()
2198 {
2199     m_xBreak.clear();
2200     BootstrapFixtureBase::tearDown();
2201 }
2202
// Register the TestBreakIterator suite with CppUnit's test factory registry.
CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator);

// Provide the CppUnit test plug-in implementation for this test library.
CPPUNIT_PLUGIN_IMPLEMENT();
2206
2207 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */