tdf#121119 master docs at-page anchor: clean-up renaming
[LibreOffice.git] / i18npool / qa / cppunit / test_breakiterator.cxx
blob80bdeb15c7be10390e47362ae4be0d6ec88f5b6e
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 */
10 #include <com/sun/star/i18n/XBreakIterator.hpp>
11 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
12 #include <com/sun/star/i18n/ScriptType.hpp>
13 #include <com/sun/star/i18n/WordType.hpp>
14 #include <o3tl/cppunittraitshelper.hxx>
15 #include <unotest/bootstrapfixturebase.hxx>
17 #include <unicode/uvernum.h>
19 #include <string.h>
21 #include <stack>
22 #include <string_view>
24 using namespace ::com::sun::star;
// CppUnit fixture for the i18n break iterator. Every test method declared here is
// registered in the CPPUNIT_TEST_SUITE block below and talks to the shared
// XBreakIterator reference held in m_xBreak.
26 class TestBreakIterator : public test::BootstrapFixtureBase
28 public:
// Fixture hooks overriding the base class; their definitions are not part of this
// declaration (presumably they create/release m_xBreak -- confirm against the definitions).
29 virtual void setUp() override;
30 virtual void tearDown() override;
// One method per area of break-iterator behaviour under test.
32 void testLineBreaking();
33 void testWordBoundaries();
34 void testSentenceBoundaries();
35 void testGraphemeIteration();
36 void testWeak();
37 void testAsian();
38 void testThai();
39 void testLao();
// Only declared when the TODO macro is defined; the matching registrations below
// are guarded by the same macro.
40 #ifdef TODO
41 void testNorthernThai();
42 void testKhmer();
43 #endif
44 void testJapanese();
45 void testChinese();
47 void testDictWordAbbreviation();
48 void testDictWordPrepostDash();
49 void testHebrewGereshGershaim();
50 void testLegacySurrogatePairs();
51 void testWordCount();
52 void testDictionaryIteratorLanguages();
// Suite registration: each CPPUNIT_TEST entry adds the named method to the suite,
// in the order listed here.
54 CPPUNIT_TEST_SUITE(TestBreakIterator);
55 CPPUNIT_TEST(testLineBreaking);
56 CPPUNIT_TEST(testWordBoundaries);
57 CPPUNIT_TEST(testSentenceBoundaries);
58 CPPUNIT_TEST(testGraphemeIteration);
59 CPPUNIT_TEST(testWeak);
60 CPPUNIT_TEST(testAsian);
61 CPPUNIT_TEST(testThai);
62 CPPUNIT_TEST(testLao);
63 #ifdef TODO
64 CPPUNIT_TEST(testKhmer);
65 CPPUNIT_TEST(testNorthernThai);
66 #endif
67 CPPUNIT_TEST(testJapanese);
68 CPPUNIT_TEST(testChinese);
69 CPPUNIT_TEST(testDictWordAbbreviation);
70 CPPUNIT_TEST(testDictWordPrepostDash);
71 CPPUNIT_TEST(testHebrewGereshGershaim);
72 CPPUNIT_TEST(testLegacySurrogatePairs);
73 CPPUNIT_TEST(testWordCount);
74 CPPUNIT_TEST(testDictionaryIteratorLanguages);
75 CPPUNIT_TEST_SUITE_END();
77 private:
// Break iterator instance shared by all test methods.
78 uno::Reference<i18n::XBreakIterator> m_xBreak;
// Helper that takes the break iterator to exercise as a parameter (presumably so the
// Japanese checks can be run against more than one iterator -- confirm at call sites).
79 void doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak);
// Regression tests for XBreakIterator::getLineBreak().
//
// Each case below reproduces one historical bug report: it sets aLocale, calls
// m_xBreak->getLineBreak(text, position, aLocale, 0, aHyphOptions, aUserOptions)
// with default-constructed hyphenation/user options, and asserts the returned
// LineBreakResults::breakIndex. The position argument is given as strlen() of the
// text prefix (or a length-relative offset) at which the line is taken to overflow.
// Several cases deliberately pin the *current* behaviour rather than the behaviour
// requested in the bug report; those are called out in the per-case comments.
82 void TestBreakIterator::testLineBreaking()
84 i18n::LineBreakHyphenationOptions aHyphOptions;
85 i18n::LineBreakUserOptions aUserOptions;
86 lang::Locale aLocale;
88 //See https://bugs.libreoffice.org/show_bug.cgi?id=31271
90 OUString aTest(u"(some text here)"_ustr);
92 aLocale.Language = "en";
93 aLocale.Country = "US";
96 //Here we want the line break to leave text here) on the next line
97 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions);
98 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(6), aResult.breakIndex);
102 //Here we want the line break to leave "here)" on the next line
103 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions);
104 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(11), aResult.breakIndex);
108 //See https://bugs.libreoffice.org/show_bug.cgi?id=49849
110 static constexpr OUString aWord = u"\u05DE\u05D9\u05DC\u05D9\u05DD"_ustr;
111 OUString aTest(aWord + " " + aWord);
113 aLocale.Language = "he";
114 aLocale.Country = "IL";
117 //Here we want the line break to happen at the whitespace
118 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions);
119 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", aWord.getLength()+1, aResult.breakIndex);
123 //See https://bz.apache.org/ooo/show_bug.cgi?id=17155
125 aLocale.Language = "en";
126 aLocale.Country = "US";
129 //Here we want the line break to leave /bar/ba clumped together on the next line
130 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(u"foo /bar/baz"_ustr, strlen("foo /bar/ba"), aLocale, 0,
131 aHyphOptions, aUserOptions);
132 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first slash", static_cast<sal_Int32>(4), aResult.breakIndex);
136 // i#22602: writer breaks word after dot immediately followed by a letter
138 aLocale.Language = "en";
139 aLocale.Country = "US";
142 //Here we want the line break to leave ./bar/baz clumped together on the next line
143 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
144 u"foo ./bar/baz"_ustr, strlen("foo ./bar/ba"), aLocale, 0, aHyphOptions, aUserOptions);
145 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first period",
146 static_cast<sal_Int32>(4), aResult.breakIndex);
150 // i#81448: slash and backslash make non-breaking spaces of preceding spaces
152 aLocale.Language = "en";
153 aLocale.Country = "US";
156 // Per the bug, the line break should leave ...BE clumped together on the next line.
157 // However, the current behavior does not wrap the string at all. This test asserts the
158 // current behavior as a point of reference.
159 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
160 u"THIS... ...BE"_ustr, strlen("THIS... ...B"), aLocale, 0, aHyphOptions, aUserOptions);
161 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aResult.breakIndex);
165 // i#81448: slash and backslash make non-breaking spaces of preceding spaces
167 aLocale.Language = "en";
168 aLocale.Country = "US";
171 // The line break should leave /BE clumped together on the next line.
172 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
173 u"THIS... /BE"_ustr, strlen("THIS... /B"), aLocale, 0, aHyphOptions, aUserOptions);
174 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(8), aResult.breakIndex);
178 // i#80548: Bad word wrap between dash and word
180 aLocale.Language = "fi";
181 aLocale.Country = "FI";
184 // Per the bug, the line break should leave -bar clumped together on the next line.
185 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
186 u"foo -bar"_ustr, strlen("foo -ba"), aLocale, 0, aHyphOptions, aUserOptions);
187 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first dash",
188 static_cast<sal_Int32>(4), aResult.breakIndex);
192 // i#80645: Line erroneously breaks at backslash
194 aLocale.Language = "en";
195 aLocale.Country = "US";
198 // Note that the current behavior deviates from the original fix for this bug.
200 // The original report was filed due to wrapping all of "\Program Files\aaaa" to the
201 // next line, even though only "aaaa" overflowed. The original fix was to simply make
202 // U+005C reverse solidus (backslash) a breaking character.
204 // However, the root cause for this bug was not the behavior of '\', but rather some
205 // other bug making all of "\Program Files\" behave like a single token, despite it
206 // even containing whitespace.
208 // Reverting to the ICU line rules fixes this root issue. Now, in the following,
209 // "C:\Program" and "Files\LibreOffice" are treated as separate tokens. This is also
210 // consistent with the behavior of other office programs.
211 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
212 u"C:\\Program Files\\LibreOffice"_ustr, strlen("C:\\Program Files\\Libre"), aLocale, 0,
213 aHyphOptions, aUserOptions);
214 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(11), aResult.breakIndex);
216 // An identical result should be generated for solidus.
217 aResult = m_xBreak->getLineBreak(
218 u"C:/Program Files/LibreOffice"_ustr, strlen("C:/Program Files/Libre"), aLocale, 0,
219 aHyphOptions, aUserOptions);
220 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(11), aResult.breakIndex);
224 // i#80841: Words separated by hyphens will always break to next line
226 aLocale.Language = "en";
227 aLocale.Country = "US";
230 // Here we want the line break to leave toll- on the first line
231 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
232 u"toll-free"_ustr, strlen("toll-fr"), aLocale, 0, aHyphOptions, aUserOptions);
233 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex);
237 // i#83464: Line break between letter and $
239 aLocale.Language = "en";
240 aLocale.Country = "US";
243 // Here we want the line break to leave US$ clumped on the next line.
244 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
245 u"word US$ 123"_ustr, strlen("word U"), aLocale, 0, aHyphOptions, aUserOptions);
246 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex);
250 // Unknown bug number: "fix line break problem of dot after letter and before number"
252 aLocale.Language = "en";
253 aLocale.Country = "US";
256 // Here we want the line break to leave L.5 clumped on the next line.
257 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
258 u"word L.5 word"_ustr, strlen("word L"), aLocale, 0, aHyphOptions, aUserOptions);
259 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex);
263 // i#83229: Wrong line break when word contains a hyphen
265 aLocale.Language = "en";
266 aLocale.Country = "US";
269 // The root cause for this bug was the Unicode standard introducing special treatment
270 // for '-' in a number range context. This change makes number ranges (e.g. "100-199")
271 // behave as if they are single tokens for the purposes of line breaking. Unfortunately,
272 // this caused a significant appearance change to existing documents.
274 // Despite being a user-visible layout change, this isn't exactly a bug. Wrapping
275 // number ranges as a single token is consistent with other applications, including web
276 // browsers, and other office suites as mentioned in the bug discussion. Removing this
277 // customization seems like it would be a major change, however.
279 // Here we want the line break to leave 100- clumped on the first line.
281 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
282 u"word 100-199 word"_ustr, strlen("word 100-1"), aLocale, 0, aHyphOptions, aUserOptions);
283 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(9), aResult.breakIndex);
287 // From the same bug: "the leading minus must stay with numbers and strings"
289 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
290 u"range of -100.000 to 100.000"_ustr, strlen("range of -1"), aLocale, 0,
291 aHyphOptions, aUserOptions);
292 CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex);
294 static constexpr OUString str = u"range of \u2212100.000 to 100.000"_ustr;
295 aResult = m_xBreak->getLineBreak(
296 str, strlen("range of -"), aLocale, 0, aHyphOptions, aUserOptions);
297 CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex);
300 aLocale.Language = "de";
301 aLocale.Country = "DE";
304 // From the same bug: "the leading minus must stay with numbers and strings"
306 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
307 u"EURO is -10,50"_ustr, strlen("EURO is -1"), aLocale, 0, aHyphOptions, aUserOptions);
308 CPPUNIT_ASSERT_EQUAL(sal_Int32{8}, aResult.breakIndex);
310 // Also the mathematical minus sign:
312 static constexpr OUString str = u"EURO is \u221210,50"_ustr;
313 aResult = m_xBreak->getLineBreak(
314 str, strlen("EURO is -"), aLocale, 0, aHyphOptions, aUserOptions);
315 CPPUNIT_ASSERT_EQUAL(sal_Int32{8}, aResult.breakIndex);
319 // From the same bug: "the leading minus must stay with numbers and strings"
321 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
322 u"und -kosten"_ustr, strlen("und -ko"), aLocale, 0,
323 aHyphOptions, aUserOptions);
324 CPPUNIT_ASSERT_EQUAL(sal_Int32{4}, aResult.breakIndex);
326 // But not the non-breaking hyphen:
328 static constexpr OUString str = u"und \u2011"_ustr;
329 aResult = m_xBreak->getLineBreak(
330 str, strlen("und -ko"), aLocale, 0, aHyphOptions, aUserOptions);
331 CPPUNIT_ASSERT_EQUAL(sal_Int32{5}, aResult.breakIndex);
335 // i#83649: "Line break should be between typographical quote and left bracket"
336 // - Actually: Spaces between quotation mark and opening punctuation not treated as a break.
337 // - Note that per the Unicode standard, prohibiting breaks in this context is intentional
338 // because it may cause issues in certain languages due to the various ways quotation
339 // characters are used.
340 // - We do it anyway by customizing the ICU line breaking rules.
343 // This uses the sample text provided in the bug report. Based on usage, it is assumed
344 // they were in the de_DE locale.
346 aLocale.Language = "de";
347 aLocale.Country = "DE";
349 // Per the bug report, it is expected that »angetan werden« remains on the first line.
350 const OUString str = u"»angetan werden« [Passiv]"_ustr;
351 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
352 str, str.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
353 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
355 // The same result should be returned for this and the first case.
356 const OUString str2 = u"»angetan werden« Passiv"_ustr;
357 aResult = m_xBreak->getLineBreak(
358 str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
359 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
361 // Under ICU rules, no amount of spaces would cause this to wrap.
// The run of FOUR spaces is significant: it places '[' at index 20, the expected break.
// With a single space this input would be identical to the first case (expected 17).
362 const OUString str3 = u"»angetan werden«    [Passiv]"_ustr;
363 aResult = m_xBreak->getLineBreak(
364 str3, str3.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
365 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(20), aResult.breakIndex);
367 // However, tabs will
368 const OUString str4 = u"»angetan werden«\t[Passiv]"_ustr;
369 aResult = m_xBreak->getLineBreak(
370 str4, str4.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
371 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
375 // The same behavior is seen in English
377 aLocale.Language = "en";
378 aLocale.Country = "US";
380 const OUString str = u"\"angetan werden\" [Passiv]"_ustr;
381 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
382 str, str.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
383 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
385 const OUString str2 = u"\"angetan werden\" Passiv"_ustr;
386 aResult = m_xBreak->getLineBreak(
387 str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
388 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
392 // i#72868: Writer/Impress line does not break after Chinese punctuation and Latin letters
394 aLocale.Language = "zh";
395 aLocale.Country = "HK";
398 // Per the bug, this should break at the ideographic comma. However, this change has
399 // been reverted at some point. This test only verifies current behavior.
400 const OUString str = u"word word、word word"_ustr;
401 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
402 str, strlen("word wordXwor"), aLocale, 0, aHyphOptions, aUserOptions);
403 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(13), aResult.breakIndex);
407 // i#80891: Character in the forbidden list sometimes appears at the start of line
409 aLocale.Language = "zh";
410 aLocale.Country = "HK";
413 // Per the bug, the ideographic two-dot leader should be a forbidden character. However,
414 // this change seems to have been reverted or broken at some point.
415 const OUString str = u"電話︰電話"_ustr;
416 i18n::LineBreakResults aResult
417 = m_xBreak->getLineBreak(str, 2, aLocale, 0, aHyphOptions, aUserOptions);
418 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(2), aResult.breakIndex);
422 //See https://bz.apache.org/ooo/show_bug.cgi?id=19716
424 aLocale.Language = "en";
425 aLocale.Country = "US";
428 OUString aTest(u"aaa]aaa"_ustr);
429 //Here we want the line break to move the whole lot to the next line
430 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
431 aHyphOptions, aUserOptions);
432 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the line, not at ]", static_cast<sal_Int32>(0), aResult.breakIndex);
436 //this is an example sequence from tdf92993-1.docx caught by the load crashtesting
438 static constexpr OUStringLiteral aTest = u"\U0001f356\U0001f357\U0001f346"
439 "\U0001f364\u2668\ufe0f\U0001f3c6";
441 aLocale.Language = "en";
442 aLocale.Country = "US";
445 //This must not assert/crash
446 (void)m_xBreak->getLineBreak(aTest, 0, aLocale, 0, aHyphOptions, aUserOptions);
450 //See https://bugs.documentfoundation.org/show_bug.cgi?id=96197
452 static constexpr OUString aTest = u"\uc560\uad6D\uac00\uc758 \uac00"
453 "\uc0ac\ub294"_ustr;
455 aLocale.Language = "ko";
456 aLocale.Country = "KR";
459 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
460 aHyphOptions, aUserOptions);
461 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex);
465 // i#65267: Comma is badly broken at end of line
466 // - The word should be wrapped along with the comma
468 aLocale.Language = "de";
469 aLocale.Country = "DE";
472 auto res = m_xBreak->getLineBreak(u"Wort -prinzessinnen, wort"_ustr,
473 strlen("Wort -prinzessinnen,"), aLocale, 0,
474 aHyphOptions, aUserOptions);
475 CPPUNIT_ASSERT_EQUAL(sal_Int32{ 5 }, res.breakIndex);
479 // tdf#114160: ZWJ shouldn't be treated as a breaking character
481 aLocale.Language = "mn";
482 aLocale.Country = "MN";
485 auto res = m_xBreak->getLineBreak(u"\u1828\u1820\u200d\u00a0\u200d\u1873\u1873"_ustr, 6,
486 aLocale, 0, aHyphOptions, aUserOptions);
487 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), res.breakIndex);
490 aLocale.Language = "en";
491 aLocale.Country = "US";
494 auto res = m_xBreak->getLineBreak(u"AB\u200d\u00a0\u200dCD"_ustr, 6, aLocale, 0,
495 aHyphOptions, aUserOptions);
496 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), res.breakIndex);
501 //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
502 void TestBreakIterator::testWordBoundaries()
504 lang::Locale aLocale;
505 aLocale.Language = "en";
506 aLocale.Country = "US";
508 i18n::Boundary aBounds;
510 //See https://bz.apache.org/ooo/show_bug.cgi?id=11993
512 OUString aTest(u"abcd ef ghi??? KLM"_ustr);
514 CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
515 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
516 aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
517 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
518 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
520 CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
521 CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
523 //next word
524 aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, true);
525 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
526 CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
528 //previous word
529 aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
530 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
531 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
533 CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
534 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
535 aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, true);
536 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
537 CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
539 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
540 CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
541 aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD, true);
542 CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.startPos);
543 CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
546 //See https://bz.apache.org/ooo/show_bug.cgi?id=21907
548 OUString aTest(u"b a?"_ustr);
550 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
551 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
552 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
554 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
556 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
557 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
558 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
560 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
563 //See https://bz.apache.org/ooo/show_bug.cgi?id=14904
565 static constexpr OUString aTest =
566 u"Working \u201CWords"
567 " starting wit"
568 "h quotes\u201D Work"
569 "ing \u2018Broken\u2019 "
570 "?Spanish? doe"
571 "sn\u2019t work. No"
572 "t even \u00BFreal? "
573 "Spanish"_ustr;
575 aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, false);
576 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
577 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
579 aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, false);
580 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
581 CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
583 aBounds = m_xBreak->getWordBoundary(aTest, 40, aLocale, i18n::WordType::DICTIONARY_WORD, false);
584 CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.startPos);
585 CPPUNIT_ASSERT_EQUAL(sal_Int32(44), aBounds.endPos);
587 aBounds = m_xBreak->getWordBoundary(aTest, 49, aLocale, i18n::WordType::DICTIONARY_WORD, false);
588 CPPUNIT_ASSERT_EQUAL(sal_Int32(46), aBounds.startPos);
589 CPPUNIT_ASSERT_EQUAL(sal_Int32(52), aBounds.endPos);
591 aBounds = m_xBreak->getWordBoundary(aTest, 58, aLocale, i18n::WordType::DICTIONARY_WORD, false);
592 CPPUNIT_ASSERT_EQUAL(sal_Int32(55), aBounds.startPos);
593 CPPUNIT_ASSERT_EQUAL(sal_Int32(62), aBounds.endPos);
595 aBounds = m_xBreak->getWordBoundary(aTest, 67, aLocale, i18n::WordType::DICTIONARY_WORD, false);
596 CPPUNIT_ASSERT_EQUAL(sal_Int32(64), aBounds.startPos);
597 CPPUNIT_ASSERT_EQUAL(sal_Int32(71), aBounds.endPos);
599 aBounds = m_xBreak->getWordBoundary(aTest, 90, aLocale, i18n::WordType::DICTIONARY_WORD, false);
600 CPPUNIT_ASSERT_EQUAL(sal_Int32(88), aBounds.startPos);
601 CPPUNIT_ASSERT_EQUAL(sal_Int32(92), aBounds.endPos);
604 //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
605 sal_Unicode aBreakTests[] = { ' ', 1, 2, 3, 4, 5, 6, 7, 0x91, 0x92, 0x200B, 0xE8FF, 0xF8FF };
606 for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
608 //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
609 for (auto const& i: aBreakTests)
611 OUString aTest = "Word" + OUStringChar(i) + "Word";
612 aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
613 switch (mode)
615 case i18n::WordType::ANY_WORD:
616 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
617 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
618 break;
619 case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
620 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
621 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
622 break;
623 case i18n::WordType::DICTIONARY_WORD:
624 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
625 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
626 break;
627 case i18n::WordType::WORD_COUNT:
628 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
629 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
630 break;
633 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
634 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
638 sal_Unicode aJoinTests[] = { 'X', 0x200C, 0x200D, 0x2060, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB };
639 for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
641 //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
642 for (auto const& p: aJoinTests)
644 OUString aTest = "Word" + OUStringChar(p) + "Word";
645 aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
646 switch (mode)
648 case i18n::WordType::ANY_WORD:
649 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
650 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
651 break;
652 case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
653 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
654 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
655 break;
656 case i18n::WordType::DICTIONARY_WORD:
657 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
658 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
659 break;
660 case i18n::WordType::WORD_COUNT:
661 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
662 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
663 break;
666 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
667 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
671 //See https://bz.apache.org/ooo/show_bug.cgi?id=13494
673 static constexpr OUString aBase(u"xxAAxxBBxxCCxx"_ustr);
674 const sal_Unicode aTests[] =
676 '\'', ';', ',', '.', '!', '@', '#', '%', '&', '*',
677 '(', ')', '_', '-', '{', '}', '[', ']', '\"', '/',
678 '\\', '?', '~', '$', '+', '^', '=', '<', '>', '|'
681 const sal_Int32 aDoublePositions[] = {0, 2, 4, 6, 8, 10, 12, 14};
682 for (auto const& r: aTests)
684 OUString aTest = aBase.replace('x', r);
685 sal_Int32 nPos = -1;
686 size_t i = 0;
689 CPPUNIT_ASSERT(i < std::size(aDoublePositions));
690 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
691 CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
692 ++i;
694 while (nPos < aTest.getLength());
695 nPos = aTest.getLength();
696 i = std::size(aDoublePositions)-1;
699 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
700 --i;
701 CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
703 while (nPos > 0);
706 const sal_Int32 aSinglePositions[] = {0, 1, 3, 4, 6, 7, 9, 10};
707 for (size_t j = 1; j < std::size(aTests); ++j)
709 OUString aTest = aBase.replaceAll("xx", OUStringChar(aTests[j]));
710 sal_Int32 nPos = -1;
711 size_t i = 0;
714 CPPUNIT_ASSERT(i < std::size(aSinglePositions));
715 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
716 CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
717 ++i;
719 while (nPos < aTest.getLength());
720 nPos = aTest.getLength();
721 i = std::size(aSinglePositions)-1;
724 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
725 --i;
726 CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
728 while (nPos > 0);
731 const sal_Int32 aSingleQuotePositions[] = {0, 1, 9, 10};
732 CPPUNIT_ASSERT_EQUAL(u'\'', aTests[0]);
734 OUString aTest = aBase.replaceAll("xx", OUStringChar(aTests[0]));
735 sal_Int32 nPos = -1;
736 size_t i = 0;
739 CPPUNIT_ASSERT(i < std::size(aSingleQuotePositions));
740 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
741 CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
742 ++i;
744 while (nPos < aTest.getLength());
745 nPos = aTest.getLength();
746 i = std::size(aSingleQuotePositions)-1;
749 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
750 --i;
751 CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
753 while (nPos > 0);
757 //See https://bz.apache.org/ooo/show_bug.cgi?id=13451
759 aLocale.Language = "ca";
760 aLocale.Country = "ES";
762 OUString aTest(u"mirar-se comprar-vos donem-nos les mans aneu-vos-en!"_ustr);
764 sal_Int32 nPos = 0;
765 sal_Int32 aExpected[] = {8, 20, 30, 34, 39, 51, 52};
766 size_t i = 0;
769 CPPUNIT_ASSERT(i < std::size(aExpected));
770 nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
771 i18n::WordType::DICTIONARY_WORD, true).endPos;
772 CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
773 ++i;
775 while (nPos++ < aTest.getLength());
776 CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
779 // i#85411: ZWSP should be a word separator for spellchecking
780 // - This fix was applied to both dict and edit customizations
781 for (int j = 0; j < 3; ++j)
783 switch (j)
785 case 0:
786 aLocale.Language = "en";
787 aLocale.Country = "US";
788 break;
789 case 1:
790 aLocale.Language = "ca";
791 aLocale.Country = "ES";
792 break;
793 case 2:
794 aLocale.Language = "fi";
795 aLocale.Country = "FI";
796 break;
797 default:
798 CPPUNIT_ASSERT(false);
799 break;
802 static constexpr OUString aTest = u"I\u200Bwant\u200Bto\u200Bgo"_ustr;
804 sal_Int32 nPos = 0;
805 sal_Int32 aExpected[] = { 1, 6, 9, 12 };
806 size_t i = 0;
809 CPPUNIT_ASSERT(i < std::size(aExpected));
810 auto dwPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
811 i18n::WordType::DICTIONARY_WORD, true);
812 CPPUNIT_ASSERT_EQUAL(aExpected[i], dwPos.endPos);
813 auto ewPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
814 i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
815 CPPUNIT_ASSERT_EQUAL(aExpected[i], ewPos.endPos);
816 nPos = dwPos.endPos;
817 ++i;
818 } while (nPos++ < aTest.getLength());
819 CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
822 //https://bz.apache.org/ooo/show_bug.cgi?id=21290
823 for (int j = 0; j < 2; ++j)
825 switch (j)
827 case 0:
828 aLocale.Language = "en";
829 aLocale.Country = "US";
830 break;
831 case 1:
832 aLocale.Language = "grc";
833 aLocale.Country.clear();
834 break;
835 default:
836 CPPUNIT_ASSERT(false);
837 break;
840 static constexpr OUString aTest =
841 u"\u1F0C\u03BD\u03B4\u03C1\u03B1 \u1F00"
842 "\u03C1\u03BD\u1F7B\u03BC\u03B5\u03BD\u03BF"
843 "\u03C2 \u1F00\u03BB\u03BB \u1F24"
844 "\u03C3\u03B8\u03B9\u03BF\u03BD"_ustr;
846 sal_Int32 nPos = 0;
847 sal_Int32 aExpected[] = {5, 15, 19, 26};
848 size_t i = 0;
851 CPPUNIT_ASSERT(i < std::size(aExpected));
852 nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
853 i18n::WordType::DICTIONARY_WORD, true).endPos;
854 CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
855 ++i;
857 while (nPos++ < aTest.getLength());
858 CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
861 //See https://bz.apache.org/ooo/show_bug.cgi?id=58513
862 //See https://bugs.libreoffice.org/show_bug.cgi?id=55707
864 aLocale.Language = "fi";
865 aLocale.Country = "FI";
867 OUString aTest(u"Kuorma-auto kaakkois- ja Keski-Suomi USA:n 90:n %:n"_ustr);
870 sal_Int32 nPos = 0;
871 sal_Int32 aExpected[] = {11, 21, 24, 36, 42, 47, 51};
872 size_t i = 0;
875 CPPUNIT_ASSERT(i < std::size(aExpected));
876 nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
877 i18n::WordType::WORD_COUNT, true).endPos;
878 CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
879 ++i;
881 while (nPos++ < aTest.getLength());
882 CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
886 sal_Int32 nPos = 0;
887 sal_Int32 aExpected[] = {0, 11, 12, 20, 22, 24, 25, 36, 37,
888 40, 41, 42, 43, 45, 46, 47, 50, 51};
889 size_t i = 0;
892 CPPUNIT_ASSERT(i < std::size(aExpected));
893 aBounds = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
894 i18n::WordType::DICTIONARY_WORD, true);
895 CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.startPos);
896 ++i;
897 CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.endPos);
898 ++i;
899 nPos = aBounds.endPos;
901 while (nPos++ < aTest.getLength());
902 CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
906 //See https://bz.apache.org/ooo/show_bug.cgi?id=107843
908 aLocale.Language = "en";
909 aLocale.Country = "US";
911 static constexpr OUString aTest =
912 u"ru\uFB00le \uFB01sh"_ustr;
914 aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
915 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
916 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
918 aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, i18n::WordType::DICTIONARY_WORD, false);
919 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.startPos);
920 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
923 //See https://bz.apache.org/ooo/show_bug.cgi?id=113785
925 aLocale.Language = "en";
926 aLocale.Country = "US";
928 static constexpr OUString aTest =
929 u"a\u2013b\u2014c"_ustr;
931 aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
932 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
933 CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
935 aBounds = m_xBreak->nextWord(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD);
936 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
937 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
939 aBounds = m_xBreak->nextWord(aTest, aBounds.endPos, aLocale, i18n::WordType::DICTIONARY_WORD);
940 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos);
941 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
944 // i#55778: Words containing numbers get broken up
946 aLocale.Language = "en";
947 aLocale.Country = "US";
949 static constexpr OUString aTest = u"first i18n third"_ustr;
951 aBounds
952 = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
953 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.startPos);
954 CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.endPos);
957 // i#56347: "BreakIterator patch for Hungarian"
958 // i#56348: Special chars in first pos not handled by spell checking in Writer (Hungarian)
959 // Rules for Hungarian affixes after numbers and certain symbols
961 aLocale.Language = "hu";
962 aLocale.Country = "HU";
964 OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr;
966 for (auto mode :
967 { i18n::WordType::DICTIONARY_WORD, i18n::WordType::ANYWORD_IGNOREWHITESPACES })
969 aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true);
970 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
971 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
973 aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true);
974 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
975 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
977 aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true);
978 CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
979 CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
981 aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, true);
982 CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos);
983 CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos);
985 aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true);
986 CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
987 CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
989 aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true);
990 CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
991 CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
993 aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, true);
994 CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos);
995 CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos);
999 // tdf#49885: Upgrade CJ word boundary analysis to ICU frequency-based analysis
1001 aLocale.Language = "ja";
1002 aLocale.Country = "JP";
1004 static constexpr OUString aTest = u"通産省工業技術院北海道工業開発試験所"_ustr;
1006 aBounds
1007 = m_xBreak->getWordBoundary(aTest, 9, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1009 // When using the old LO custom dictionaries, this will select the entire phrase.
1010 // When using ICU, it will select only 北海道.
1011 CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
1012 CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
1015 // tdf#161737: narrow no-break space at the end of words resulted spelling mistakes
1017 aLocale.Language = "en";
1018 aLocale.Country = "US";
1020 OUString aTest(u"L’espace fine insécable\u202F!"_ustr);
1021 aBounds
1022 = m_xBreak->getWordBoundary(aTest, 14, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1023 CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.startPos);
1024 // This was 24 (word + NNBSP)
1025 CPPUNIT_ASSERT_EQUAL(sal_Int32(23), aBounds.endPos);
1028 // tdf#161737: narrow no-break space between digits resulted spelling mistakes
1029 // as a quick fix, limit NBSP as word-part character only for editing, and not for spell checking
1030 // TODO: remove NBSP by the linguistic module or by the spell checking dictionaries to allow
1031 // to check numbers with thousand separators and with correct suffix
1033 aLocale.Language = "en";
1034 aLocale.Country = "US";
1036 OUString aTest(u"1\u202F000\u202F000"_ustr);
1037 aBounds
1038 = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1039 // This was 0 (word + NNBSP)
1040 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
1041 // This was 8 (word + NNBSP)
1042 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
1045 // tdf#161737: narrow no-break space at the end of words resulted spelling mistakes
1047 aLocale.Language = "hu";
1048 aLocale.Country = "HU";
1050 OUString aTest(u"L’espace fine insécable\u202F!"_ustr);
1051 aBounds
1052 = m_xBreak->getWordBoundary(aTest, 14, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1053 CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.startPos);
1054 // This was 24 (word + NNBSP)
1055 CPPUNIT_ASSERT_EQUAL(sal_Int32(23), aBounds.endPos);
1058 // tdf#161737: narrow no-break space between digits resulted spelling mistakes
1059 // as a quick fix, limit NBSP as word-part character only for editing, and not for spell checking
1060 // TODO: remove NBSP by the linguistic module or by the spell checking dictionaries to allow
1061 // to check numbers with thousand separators and with correct suffix
1063 aLocale.Language = "hu";
1064 aLocale.Country = "HU";
1066 OUString aTest(u"1\u202F000\u202F000"_ustr);
1067 aBounds
1068 = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1069 // This was 0 (word + NNBSP)
1070 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
1071 // This was 8 (word + NNBSP)
1072 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
1076 void TestBreakIterator::testSentenceBoundaries()
1078 lang::Locale aLocale;
1079 aLocale.Language = "en";
1080 aLocale.Country = "US";
1082 // Trivial characteristic test for sentence boundary detection
1084 OUString aTest(u"This is a sentence. This is a different sentence."_ustr);
1086 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 5, aLocale));
1087 CPPUNIT_ASSERT_EQUAL(sal_Int32(19), m_xBreak->endOfSentence(aTest, 5, aLocale));
1088 CPPUNIT_ASSERT_EQUAL(sal_Int32(20), m_xBreak->beginOfSentence(aTest, 31, aLocale));
1089 CPPUNIT_ASSERT_EQUAL(sal_Int32(49), m_xBreak->endOfSentence(aTest, 31, aLocale));
1092 // i#24098: i18n API beginOfSentence/endOfSentence
1093 // fix beginOfSentence, ... when cursor is on the beginning of the sentence
1095 OUString aTest(u"This is a sentence. This is a different sentence."_ustr);
1097 CPPUNIT_ASSERT_EQUAL(sal_Int32(20), m_xBreak->beginOfSentence(aTest, 20, aLocale));
1098 CPPUNIT_ASSERT_EQUAL(sal_Int32(49), m_xBreak->endOfSentence(aTest, 20, aLocale));
1101 // i#24098: i18n API beginOfSentence/endOfSentence
1102 // "skip preceding space for beginOfSentence"
1104 OUString aTest(u"This is a sentence. This is a different sentence."_ustr);
1106 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 20, aLocale));
1107 CPPUNIT_ASSERT_EQUAL(sal_Int32(19), m_xBreak->endOfSentence(aTest, 20, aLocale));
1108 CPPUNIT_ASSERT_EQUAL(sal_Int32(24), m_xBreak->beginOfSentence(aTest, 26, aLocale));
1109 CPPUNIT_ASSERT_EQUAL(sal_Int32(53), m_xBreak->endOfSentence(aTest, 26, aLocale));
1112 // i#55063: Sentence selection in Thai should select a space-delimited phrase.
1113 // - This customization broke at some point. It works in an English locale in a synthetic test
1114 // like this one, but does not work in the Thai locale, nor on Thai text in practice.
1116 static constexpr OUString aTest = u"ว้อย โหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอก โปรโมเตอร์"_ustr;
1118 aLocale.Language = "en";
1119 aLocale.Country = "US";
1120 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
1121 CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak->endOfSentence(aTest, 23, aLocale));
1123 aLocale.Language = "th";
1124 aLocale.Country = "TH";
1125 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
1126 CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak->endOfSentence(aTest, 23, aLocale));
1129 // i#55063: Thai phrases should delimit English sentence selection.
1130 // - This customization broke at some point. It works in an English locale in a synthetic test
1131 // like this one, but does not work in the Thai locale, nor on Thai text in practice.
1133 static constexpr OUString aTest = u"ว้อย English usually ends with a period โปรโมเตอร์."_ustr;
1135 aLocale.Language = "en";
1136 aLocale.Country = "US";
1137 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
1138 CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak->endOfSentence(aTest, 23, aLocale));
1140 aLocale.Language = "th";
1141 aLocale.Country = "TH";
1142 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
1143 CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak->endOfSentence(aTest, 23, aLocale));
1146 // i#55063: Characteristic test for English text delimiting Thai phrases (sentences)
1147 // - English text should not delimit Thai phrases.
1149 static constexpr OUString aTest = u"Englishโหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอกEnglish"_ustr;
1151 aLocale.Language = "en";
1152 aLocale.Country = "US";
1153 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
1154 CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak->endOfSentence(aTest, 23, aLocale));
1156 aLocale.Language = "th";
1157 aLocale.Country = "TH";
1158 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
1159 CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak->endOfSentence(aTest, 23, aLocale));
1163 //See https://bugs.libreoffice.org/show_bug.cgi?id=40292
1164 //See https://bz.apache.org/ooo/show_bug.cgi?id=80412
1165 //See https://bz.apache.org/ooo/show_bug.cgi?id=111152
1166 //See https://bz.apache.org/ooo/show_bug.cgi?id=50172
1167 void TestBreakIterator::testGraphemeIteration()
1169 lang::Locale aLocale;
1170 aLocale.Language = "bn";
1171 aLocale.Country = "IN";
1174 static constexpr OUString aTest = u"\u09AC\u09CD\u09AF"_ustr; // BA HALANT LA
1176 sal_Int32 nDone=0;
1177 sal_Int32 nPos;
1178 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
1179 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1180 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
1181 nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
1182 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1183 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
1187 static constexpr OUString aTest = u"\u09B9\u09CD\u09A3\u09BF"_ustr;
1188 // HA HALANT NA VOWELSIGNI
1190 sal_Int32 nDone=0;
1191 sal_Int32 nPos;
1192 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
1193 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1194 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
1195 nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
1196 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1197 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
1201 static constexpr OUString aTest = u"\u09A4\u09CD\u09AE\u09CD\u09AF"_ustr;
1202 // TA HALANT MA HALANT YA
1204 sal_Int32 nDone=0;
1205 sal_Int32 nPos;
1206 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
1207 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1208 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
1209 nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
1210 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1211 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
1214 aLocale.Language = "ta";
1215 aLocale.Country = "IN";
1218 static constexpr OUString aTest = u"\u0B9A\u0BBF\u0BA4\u0BCD\u0BA4\u0BBF\u0BB0\u0BC8"_ustr; // CA VOWELSIGNI TA VIRAMA TA VOWELSIGNI RA VOWELSIGNAI
1220 sal_Int32 nDone=0;
1221 sal_Int32 nPos = 0;
1223 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1224 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(2), nPos);
1225 nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1226 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(4), nPos);
1227 nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1228 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(6), nPos);
1229 nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1230 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
1231 nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
1232 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1233 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(6), nPos);
1234 nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1235 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(4), nPos);
1236 nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1237 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(2), nPos);
1238 nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1239 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
1243 static constexpr OUString aTest = u"\u0B95\u0BC1"_ustr; // KA VOWELSIGNU
1245 sal_Int32 nDone=0;
1246 sal_Int32 nPos = 0;
1248 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
1249 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1250 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
1251 nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
1252 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1253 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
1257 static constexpr OUString aTest =
1258 u"\u0B9A\u0BBF\u0BA4\u0BCD\u0BA4\u0BBF\u0BB0\u0BC8"_ustr;
1259 // CA VOWELSIGNI TA VIRAMA TA VOWELSIGNI RA VOWELSIGNAI
1261 sal_Int32 nDone=0;
1262 sal_Int32 nPos=0;
1264 for (sal_Int32 i = 0; i < 4; ++i)
1266 sal_Int32 nOldPos = nPos;
1267 nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale,
1268 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1269 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos+2, nPos);
1272 for (sal_Int32 i = 0; i < 4; ++i)
1274 sal_Int32 nOldPos = nPos;
1275 nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale,
1276 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1277 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos-2, nPos);
1282 static constexpr OUString aText = u"\u05D0\u05B8"_ustr; // ALEF QAMATS
1284 sal_Int32 nGraphemeCount = 0;
1286 sal_Int32 nCurPos = 0;
1287 while (nCurPos < aText.getLength())
1289 sal_Int32 nCount2 = 1;
1290 nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(),
1291 i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
1292 ++nGraphemeCount;
1295 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be considered 1 grapheme", static_cast<sal_Int32>(1), nGraphemeCount);
1298 aLocale.Language = "hi";
1299 aLocale.Country = "IN";
1302 static constexpr OUString aTest = u"\u0936\u0940"_ustr; // SHA VOWELSIGNII
1304 sal_Int32 nDone=0;
1305 sal_Int32 nPos = 0;
1307 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
1308 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1309 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
1310 nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
1311 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1312 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
1315 // tdf#49885: Replace custom Thai implementation with ICU
1317 aLocale.Language = "th";
1318 aLocale.Country = "TH";
1320 static constexpr OUString aTest = u"กำ"_ustr;
1322 CPPUNIT_ASSERT_EQUAL(sal_Int32{ 2 }, aTest.getLength());
1324 sal_Int32 nDone = 0;
1325 sal_Int32 nPos = 0;
1327 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1,
1328 nDone);
1329 CPPUNIT_ASSERT_EQUAL(aTest.getLength(), nPos);
1331 nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
1332 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1333 CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, nPos);
1336 // Korean may also use grapheme clusters for character composition
1338 aLocale.Language = "ko";
1339 aLocale.Country = "KR";
1341 static constexpr OUString aTest = u"각"_ustr;
1343 CPPUNIT_ASSERT_EQUAL(sal_Int32{ 3 }, aTest.getLength());
1345 sal_Int32 nDone = 0;
1346 sal_Int32 nPos = 0;
1348 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1,
1349 nDone);
1350 CPPUNIT_ASSERT_EQUAL(aTest.getLength(), nPos);
1352 nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
1353 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1354 CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, nPos);
1358 //A test to ensure that certain ranges and codepoints that are categorized as
1359 //weak remain as weak, so that existing docs that depend on this don't silently
1360 //change font for those weak chars
1361 void TestBreakIterator::testWeak()
1363 lang::Locale aLocale;
1364 aLocale.Language = "en";
1365 aLocale.Country = "US";
1368 static constexpr OUString aWeaks =
1369 u"\u0001\u0002"
1370 " \u00A0"
1371 "\u0300\u036F" //Combining Diacritical Marks
1372 "\u1AB0\u1AFF" //Combining Diacritical Marks Extended
1373 "\u1DC0\u1DFF" //Combining Diacritical Marks Supplement
1374 "\u20D0\u20FF" //Combining Diacritical Marks for Symbols
1375 "\u2150\u215F" //Number Forms, fractions
1376 "\u2160\u2180" //Number Forms, roman numerals
1377 "\u2200\u22FF" //Mathematical Operators
1378 "\u27C0\u27EF" //Miscellaneous Mathematical Symbols-A
1379 "\u2980\u29FF" //Miscellaneous Mathematical Symbols-B
1380 "\u2A00\u2AFF" //Supplemental Mathematical Operators
1381 "\u2100\u214F" //Letterlike Symbols
1382 "\u2308\u230B" //Miscellaneous technical
1383 "\u25A0\u25FF" //Geometric Shapes
1384 "\u2B30\u2B4C"_ustr; //Miscellaneous Symbols and Arrows
1386 for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i)
1388 sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i);
1389 OString aMsg =
1390 "Char 0x" +
1391 OString::number(static_cast<sal_Int32>(std::u16string_view(aWeaks)[i]), 16) +
1392 " should have been weak";
1393 CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
1394 i18n::ScriptType::WEAK, nScript);
1399 //A test to ensure that certain ranges and codepoints that are categorized as
1400 //asian remain as asian, so that existing docs that depend on this don't silently
1401 //change font for those asian chars.
1402 //See https://bugs.libreoffice.org/show_bug.cgi?id=38095
1403 void TestBreakIterator::testAsian()
1405 lang::Locale aLocale;
1406 aLocale.Language = "en";
1407 aLocale.Country = "US";
1410 static constexpr OUString aAsians =
1411 //some typical CJK chars
1412 u"\u4E00\u62FF"
1413 //The full HalfWidth and FullWidth block has historically been
1414 //designated as taking the CJK font :-(
1415 //HalfWidth and FullWidth forms of ASCII 0-9, categorized under
1416 //UAX24 as "Common" i.e. by that logic WEAK
1417 "\uFF10\uFF19"
1418 //HalfWidth and FullWidth forms of ASCII A-z, categorized under
1419 //UAX25 as "Latin", i.e. by that logic LATIN
1420 "\uFF21\uFF5A"_ustr;
1422 for (sal_Int32 i = 0; i < aAsians.getLength(); ++i)
1424 sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i);
1425 OString aMsg =
1426 "Char 0x" +
1427 OString::number(static_cast<sal_Int32>(std::u16string_view(aAsians)[i]), 16) +
1428 " should have been asian";
1429 CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
1430 i18n::ScriptType::ASIAN, nScript);
1435 //A test to ensure that our Lao word boundary detection is useful
1436 void TestBreakIterator::testLao()
1438 lang::Locale aLocale;
1439 aLocale.Language = "lo";
1440 aLocale.Country = "LA";
1442 static constexpr OUString aTest = u"\u0e8d\u0eb4\u0e99\u0e94\u0eb5\u0e95\u0ec9\u0ead\u0e99\u0eae\u0eb1\u0e9a"_ustr;
1443 i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
1444 i18n::WordType::DICTIONARY_WORD, true);
1446 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1447 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
1449 aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
1450 i18n::WordType::DICTIONARY_WORD, true);
1452 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
1453 #if (U_ICU_VERSION_MAJOR_NUM < 70)
1454 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
1455 #else
1456 // FIXME:
1457 // In ICU 70/71 for yet unknown reason the word boundary 9 is not detected and
1458 // instead the length 12 is returned as endpos.
1459 // Deep in
1460 // icu_70::RuleBasedBreakIterator::BreakCache::next()
1461 // icu_70::RuleBasedBreakIterator::BreakCache::following()
1462 // icu_70::RuleBasedBreakIterator::following()
1463 // i18npool::BreakIterator_Unicode::getWordBoundary()
1464 CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
1465 #endif
1468 //A test to ensure that our thai word boundary detection is useful
1469 void TestBreakIterator::testThai()
1471 lang::Locale aLocale;
1472 aLocale.Language = "th";
1473 aLocale.Country = "TH";
1475 //See http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html
1477 static constexpr OUString aTest = u"\u0E01\u0E38\u0E2B\u0E25\u0E32\u0E1A"_ustr;
1478 i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
1479 i18n::WordType::DICTIONARY_WORD, true);
1480 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full word",
1481 sal_Int32(0), aBounds.startPos);
1482 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full word",
1483 aTest.getLength(), aBounds.endPos);
1486 //See https://bz.apache.org/ooo/show_bug.cgi?id=29548
1487 //make sure forwards and back are consistent
1489 static constexpr OUString aTest =
1490 u"\u0E2D\u0E38\u0E17\u0E22\u0E32\u0E19\u0E41"
1491 "\u0E2B\u0E48\u0E07\u0E0A\u0E32\u0E15\u0E34"
1492 "\u0E19\u0E49\u0E33\u0E2B\u0E19\u0E32\u0E27"
1493 "\u0E2D\u0E38\u0E17\u0E22\u0E32\u0E19\u0E41"
1494 "\u0E2B\u0E48\u0E07\u0E0A\u0E32\u0E15\u0E34"
1495 "\u0E19\u0E49\u0E33\u0E2B\u0E19\u0E32\u0E27"_ustr;
1497 std::stack<sal_Int32> aPositions;
1498 sal_Int32 nPos = -1;
1501 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
1502 aPositions.push(nPos);
1504 while (nPos < aTest.getLength());
1505 nPos = aTest.getLength();
1506 CPPUNIT_ASSERT(!aPositions.empty());
1507 aPositions.pop();
1510 CPPUNIT_ASSERT(!aPositions.empty());
1511 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
1512 CPPUNIT_ASSERT_EQUAL(aPositions.top(), nPos);
1513 aPositions.pop();
1515 while (nPos > 0);
1518 // tdf#113694
1520 static constexpr OUString aTest = u"\U00010000"_ustr;
1522 sal_Int32 nDone=0;
1523 sal_Int32 nPos;
1525 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
1526 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1527 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", aTest.getLength(), nPos);
1528 nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
1529 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
1530 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos);
1532 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
1533 i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
1534 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", aTest.getLength(), nPos);
1535 nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
1536 i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
1537 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos);
1541 #ifdef TODO
1542 void TestBreakIterator::testNorthernThai()
1544 lang::Locale aLocale;
1545 aLocale.Language = "nod";
1546 aLocale.Country = "TH";
1548 const sal_Unicode NORTHERN_THAI1[] = { 0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
1549 OUString aTest(NORTHERN_THAI1, SAL_N_ELEMENTS(NORTHERN_THAI1));
1550 i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
1551 i18n::WordType::DICTIONARY_WORD, true);
1552 CPPUNIT_ASSERT_MESSAGE("Should skip full word",
1553 aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
1556 // Not sure if any version earlier than 49 did have Khmer word boundary
1557 // dictionaries, 4.6 does not.
1559 // As of icu 54, word boundary detection for Khmer is still considered
1560 // insufficient, so icu khmer stuff is disabled
1562 //A test to ensure that our khmer word boundary detection is useful
1563 //https://bugs.libreoffice.org/show_bug.cgi?id=52020
1564 void TestBreakIterator::testKhmer()
1566 lang::Locale aLocale;
1567 aLocale.Language = "km";
1568 aLocale.Country = "KH";
1570 const sal_Unicode KHMER[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 };
1572 OUString aTest(KHMER, SAL_N_ELEMENTS(KHMER));
1573 i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
1574 i18n::WordType::DICTIONARY_WORD, true);
1576 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
1578 aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
1579 i18n::WordType::DICTIONARY_WORD, true);
1581 CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 5);
1583 #endif
1585 void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak)
1587 lang::Locale aLocale;
1588 aLocale.Language = "ja";
1589 aLocale.Country = "JP";
1590 i18n::Boundary aBounds;
1593 static constexpr OUString aTest = u"シャットダウン"_ustr;
1595 aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
1596 i18n::WordType::DICTIONARY_WORD, true);
1598 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos);
1599 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
1603 static constexpr OUString aTest = u"\u9EBB\u306E\u8449\u9EBB\u306E\u8449"_ustr;
1605 aBounds = xBreak->getWordBoundary(aTest, 1, aLocale,
1606 i18n::WordType::DICTIONARY_WORD, true);
1608 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1609 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
1611 aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
1612 i18n::WordType::DICTIONARY_WORD, true);
1614 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
1615 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
1619 // tdf#162912: Double-clicking should only select one Basic identifier
1620 static constexpr OUString aTest = u"ThisComponent.CurrentSelection"_ustr;
1622 aBounds = xBreak->getWordBoundary(aTest, 5, aLocale, i18n::WordType::DICTIONARY_WORD, true);
1623 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1624 CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
1626 aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
1627 i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
1628 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1629 CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.endPos);
1631 aBounds = xBreak->getWordBoundary(aTest, 15, aLocale,
1632 i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
1633 CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.startPos);
1634 CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
1638 void TestBreakIterator::testJapanese()
1640 doTestJapanese(m_xBreak);
1642 // fdo#78479 - test second / cached instantiation of xdictionary
1643 uno::Reference< i18n::XBreakIterator > xTmpBreak(m_xSFactory->createInstance(
1644 u"com.sun.star.i18n.BreakIterator"_ustr), uno::UNO_QUERY_THROW);
1646 doTestJapanese(xTmpBreak);
1649 void TestBreakIterator::testChinese()
1651 lang::Locale aLocale;
1652 aLocale.Language = "zh";
1653 aLocale.Country = "CN";
1656 static constexpr OUStringLiteral aTest = u"\u6A35\u6A30\u69FE\u8919\U00029EDB";
1658 i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale,
1659 i18n::WordType::DICTIONARY_WORD, true);
1660 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos);
1661 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
1665 void TestBreakIterator::testDictWordPrepostDash()
1667 std::vector<lang::Locale> aLocale{ { "de", "DE", "" },
1668 { "nds", "DE", "" },
1669 { "nl", "NL", "" },
1670 { "sv", "SE", "" },
1671 { "da", "DK", "" } };
1673 for (const auto& rLocale : aLocale)
1675 auto aTest = u"Arbeits- -nehmer"_ustr;
1677 i18n::Boundary aBounds
1678 = m_xBreak->getWordBoundary(aTest, 3, rLocale, i18n::WordType::DICTIONARY_WORD, false);
1679 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1680 CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
1682 aBounds
1683 = m_xBreak->getWordBoundary(aTest, 13, rLocale, i18n::WordType::DICTIONARY_WORD, false);
1684 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
1685 CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
1689 void TestBreakIterator::testDictWordAbbreviation()
1691 std::vector<lang::Locale> aLocale{
1692 { "en", "US", "" }, // dict_word locale
1693 { "de", "DE", "" } // dict_word_prepostdash locale
1696 for (const auto& rLocale : aLocale)
1698 auto aTest = u"Examples: e.g. i.e. etc. and such"_ustr;
1700 i18n::Boundary aBounds
1701 = m_xBreak->getWordBoundary(aTest, 3, rLocale, i18n::WordType::DICTIONARY_WORD, false);
1702 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1703 CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
1705 aBounds
1706 = m_xBreak->getWordBoundary(aTest, 10, rLocale, i18n::WordType::DICTIONARY_WORD, false);
1707 CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
1708 CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
1710 aBounds
1711 = m_xBreak->getWordBoundary(aTest, 15, rLocale, i18n::WordType::DICTIONARY_WORD, false);
1712 CPPUNIT_ASSERT_EQUAL(sal_Int32(15), aBounds.startPos);
1713 CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
1715 aBounds
1716 = m_xBreak->getWordBoundary(aTest, 20, rLocale, i18n::WordType::DICTIONARY_WORD, false);
1717 CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.startPos);
1718 CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos);
1720 aBounds
1721 = m_xBreak->getWordBoundary(aTest, 26, rLocale, i18n::WordType::DICTIONARY_WORD, false);
1722 CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
1723 CPPUNIT_ASSERT_EQUAL(sal_Int32(28), aBounds.endPos);
1725 aBounds
1726 = m_xBreak->getWordBoundary(aTest, 30, rLocale, i18n::WordType::DICTIONARY_WORD, false);
1727 CPPUNIT_ASSERT_EQUAL(sal_Int32(29), aBounds.startPos);
1728 CPPUNIT_ASSERT_EQUAL(sal_Int32(33), aBounds.endPos);
1732 void TestBreakIterator::testHebrewGereshGershaim()
1734 // In Hebrew documents, there are multiple valid ways to represent the geresh and gershaim
1735 // intra-word punctuation marks. This test exhaustively exercises them.
1737 // See the following bugs:
1738 // i#51661: Add quotation mark as middle letter for Hebrew
1739 // tdf#46950: Spell-checking breaks Hebrew words at intra-word single and double quotes
1741 lang::Locale aLocale;
1743 aLocale.Language = "he";
1744 aLocale.Country = "IL";
1746 // Unicode U+05F3 HEBREW PUNCTUATION GERESH
1748 auto aTest = u"ג׳ירפה"_ustr;
1750 auto aBounds
1751 = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1752 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1753 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
1755 aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale,
1756 i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
1757 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1758 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
1761 // Apostrophe as geresh
1763 auto aTest = u"ג'ירפה"_ustr;
1765 auto aBounds
1766 = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1767 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1768 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
1770 aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale,
1771 i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
1772 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1773 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
1776 // Right single quote as geresh
1778 auto aTest = u"ג’ירפה"_ustr;
1780 auto aBounds
1781 = m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1782 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1783 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
1785 aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale,
1786 i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
1787 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1788 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
1791 // Unicode U+05F4 HEBREW PUNCTUATION GERSHAYIM
1793 auto aTest = u"דו״ח"_ustr;
1795 auto aBounds
1796 = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1797 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1798 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
1800 aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale,
1801 i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
1802 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1803 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
1806 // Double quote as gershayim
1808 auto aTest = u"דו\"ח"_ustr;
1810 auto aBounds
1811 = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1812 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1813 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
1815 aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale,
1816 i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
1817 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1818 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
1821 // Right double quote as gershayim
1823 auto aTest = u"דו”ח"_ustr;
1825 auto aBounds
1826 = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1827 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1828 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
1830 aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale,
1831 i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
1832 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1833 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
1837 void TestBreakIterator::testLegacySurrogatePairs()
1839 lang::Locale aLocale;
1841 aLocale.Language = "ja";
1842 aLocale.Country = "JP";
1844 // i#75632: [surrogate pair] Japanese word break does not work properly for surrogate pairs.
1845 // and many others to address bugs: i#75631 i#75633 i#75412 etc.
1847 // BreakIterator supports surrogate pairs (UTF-16). This is a simple characteristic test.
1849 static constexpr OUString aTest = u"X 𠮟 X"_ustr;
1851 auto aBounds
1852 = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1853 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1854 CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
1856 aBounds
1857 = m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1858 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
1859 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
1861 aBounds
1862 = m_xBreak->getWordBoundary(aTest, 5, aLocale, i18n::WordType::DICTIONARY_WORD, false);
1863 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
1864 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
1868 void TestBreakIterator::testWordCount()
1870 auto fnCountWords = [&](const OUString& aStr, const lang::Locale& aLocale) -> int
1872 int nWords = 0;
1873 sal_Int32 nNextPos = 0;
1874 int nIterGuard = 0;
1876 if (m_xBreak->isBeginWord(aStr, nNextPos, aLocale, i18n::WordType::WORD_COUNT))
1878 ++nWords;
1881 while (true)
1883 CPPUNIT_ASSERT_MESSAGE("Tripped infinite loop check", ++nIterGuard < 100);
1885 auto aBounds = m_xBreak->nextWord(aStr, nNextPos, aLocale, i18n::WordType::WORD_COUNT);
1886 if (aBounds.endPos == aBounds.startPos)
1888 break;
1891 nNextPos = aBounds.endPos;
1892 ++nWords;
1895 return nWords;
1898 // i#80815: "Word count differs from MS Word"
1899 // This is a characteristic test for word count using test data from the linked bug.
1901 lang::Locale aLocale;
1902 aLocale.Language = "en";
1903 aLocale.Country = "US";
1905 const OUString aStr = u""
1906 "test data for word count issue #80815\n"
1907 "fo\\\'sforos\n"
1908 "archipi\\\'elago\n"
1909 "do\\^me\n"
1910 "f**k\n"
1911 "\n"
1912 "battery-driven\n"
1913 "and/or\n"
1914 "apple(s)\n"
1915 "money+opportunity\n"
1916 "Micro$oft\n"
1917 "\n"
1918 "300$\n"
1919 "I(not you)\n"
1920 "a****n\n"
1921 "1+3=4\n"
1922 "\n"
1923 "aaaaaaa.aaaaaaa\n"
1924 "aaaaaaa,aaaaaaa\n"
1925 "aaaaaaa;aaaaaaa\n"_ustr;
1927 CPPUNIT_ASSERT_EQUAL(24, fnCountWords(aStr, aLocale));
1930 // Test that the switch to upstream ICU for CJ word boundary analysis doesn't change word count.
1932 lang::Locale aLocale;
1933 aLocale.Language = "ja";
1934 aLocale.Country = "JP";
1936 const OUString aStr = u"Wordの様にワード数をするのにTest\n植松町"_ustr;
1938 CPPUNIT_ASSERT_EQUAL(8, fnCountWords(aStr, aLocale));
1941 // tdf#150621 Korean words should be counted individually, rather than by syllable.
1943 // Per i#80815, the intention for the word count feature is to emulate the behavior of MS Word.
1945 lang::Locale aLocale;
1946 aLocale.Language = "ko";
1947 aLocale.Country = "KR";
1949 // Basic case: Korean words are counted as space-delimited. In particular, grammatical
1950 // particles are treated as part of the previous word.
1951 CPPUNIT_ASSERT_EQUAL(3, fnCountWords(u"저는 영화를 봤어요"_ustr, aLocale));
1953 // Mixed script: Korean is mostly written in hangul, but hanja are still used in certain
1954 // situations (e.g. abbreviations in newspaper articles). For Chinese and Japanese, such
1955 // ideographs would be counted individually as words. In Korean, however, they are treated
1956 // no differently than hangul characters.
1957 CPPUNIT_ASSERT_EQUAL(1, fnCountWords(u"불렀다...與"_ustr, aLocale));
1958 CPPUNIT_ASSERT_EQUAL(2, fnCountWords(u"불렀다 ...與"_ustr, aLocale));
1959 CPPUNIT_ASSERT_EQUAL(3, fnCountWords(u"불렀다 ... 與"_ustr, aLocale));
1960 CPPUNIT_ASSERT_EQUAL(1, fnCountWords(u"尹탄핵"_ustr, aLocale));
1961 CPPUNIT_ASSERT_EQUAL(2, fnCountWords(u"尹 탄핵"_ustr, aLocale));
1965 void TestBreakIterator::testDictionaryIteratorLanguages()
1967 // Thai
1969 lang::Locale aLocale{ "th", "TH", "" };
1971 const OUString aStr = u"รอนานหรือเปล่า"_ustr;
1973 i18n::Boundary aBounds;
1975 aBounds
1976 = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true);
1977 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1978 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
1980 aBounds
1981 = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::DICTIONARY_WORD, true);
1982 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
1983 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
1985 aBounds
1986 = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::DICTIONARY_WORD, true);
1987 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
1988 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
1990 aBounds
1991 = m_xBreak->getWordBoundary(aStr, 10, aLocale, i18n::WordType::DICTIONARY_WORD, true);
1992 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
1993 CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
1995 aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true);
1996 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
1997 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
1999 aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::ANY_WORD, true);
2000 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
2001 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
2003 aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::ANY_WORD, true);
2004 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
2005 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
2007 aBounds = m_xBreak->getWordBoundary(aStr, 10, aLocale, i18n::WordType::ANY_WORD, true);
2008 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
2009 CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
2011 aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale,
2012 i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2013 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
2014 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
2016 aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale,
2017 i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2018 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
2019 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
2021 aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale,
2022 i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2023 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
2024 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
2026 aBounds = m_xBreak->getWordBoundary(aStr, 10, aLocale,
2027 i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2028 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
2029 CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
2032 // Japanese
2034 lang::Locale aLocale{ "ja", "JP", "" };
2036 const OUString aStr = u"通産省工業技術院北海道"_ustr;
2038 i18n::Boundary aBounds;
2040 aBounds
2041 = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2042 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
2043 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
2045 aBounds
2046 = m_xBreak->getWordBoundary(aStr, 2, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2047 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
2048 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
2050 aBounds
2051 = m_xBreak->getWordBoundary(aStr, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2052 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
2053 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
2055 aBounds
2056 = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2057 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
2058 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
2060 aBounds
2061 = m_xBreak->getWordBoundary(aStr, 7, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2062 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
2063 CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
2065 aBounds
2066 = m_xBreak->getWordBoundary(aStr, 9, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2067 CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
2068 CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
2070 aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true);
2071 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
2072 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
2074 aBounds = m_xBreak->getWordBoundary(aStr, 2, aLocale, i18n::WordType::ANY_WORD, true);
2075 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
2076 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
2078 aBounds = m_xBreak->getWordBoundary(aStr, 4, aLocale, i18n::WordType::ANY_WORD, true);
2079 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
2080 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
2082 aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::ANY_WORD, true);
2083 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
2084 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
2086 aBounds = m_xBreak->getWordBoundary(aStr, 7, aLocale, i18n::WordType::ANY_WORD, true);
2087 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
2088 CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
2090 aBounds = m_xBreak->getWordBoundary(aStr, 9, aLocale, i18n::WordType::ANY_WORD, true);
2091 CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
2092 CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
2094 aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale,
2095 i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2096 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
2097 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
2099 aBounds = m_xBreak->getWordBoundary(aStr, 2, aLocale,
2100 i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2101 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
2102 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
2104 aBounds = m_xBreak->getWordBoundary(aStr, 4, aLocale,
2105 i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2106 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
2107 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
2109 aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale,
2110 i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2111 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
2112 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
2114 aBounds = m_xBreak->getWordBoundary(aStr, 7, aLocale,
2115 i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2116 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
2117 CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
2119 aBounds = m_xBreak->getWordBoundary(aStr, 9, aLocale,
2120 i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2121 CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
2122 CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
2125 // Chinese
2127 lang::Locale aLocale{ "zh", "CN", "" };
2129 const OUString aStr = u"很高兴认识你"_ustr;
2131 i18n::Boundary aBounds;
2133 aBounds
2134 = m_xBreak->getWordBoundary(aStr, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2135 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
2136 CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
2138 aBounds
2139 = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2140 CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos);
2141 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
2143 aBounds
2144 = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2145 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
2146 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
2148 aBounds
2149 = m_xBreak->getWordBoundary(aStr, 5, aLocale, i18n::WordType::DICTIONARY_WORD, true);
2150 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
2151 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
2153 aBounds = m_xBreak->getWordBoundary(aStr, 0, aLocale, i18n::WordType::ANY_WORD, true);
2154 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
2155 CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
2157 aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true);
2158 CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos);
2159 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
2161 aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::ANY_WORD, true);
2162 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
2163 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
2165 aBounds = m_xBreak->getWordBoundary(aStr, 5, aLocale, i18n::WordType::ANY_WORD, true);
2166 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
2167 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
2169 aBounds = m_xBreak->getWordBoundary(aStr, 0, aLocale,
2170 i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2171 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
2172 CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
2174 aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale,
2175 i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2176 CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos);
2177 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
2179 aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale,
2180 i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2181 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
2182 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
2184 aBounds = m_xBreak->getWordBoundary(aStr, 5, aLocale,
2185 i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
2186 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
2187 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
2191 void TestBreakIterator::setUp()
2193 BootstrapFixtureBase::setUp();
2194 m_xBreak.set(m_xSFactory->createInstance(u"com.sun.star.i18n.BreakIterator"_ustr), uno::UNO_QUERY_THROW);
2197 void TestBreakIterator::tearDown()
2199 m_xBreak.clear();
2200 BootstrapFixtureBase::tearDown();
// Register the TestBreakIterator suite with CppUnit's test factory registry.
2203 CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator);
// Provide the CppUnit test plug-in implementation for this test library.
2205 CPPUNIT_PLUGIN_IMPLEMENT();
2207 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */