1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 #include <com/sun/star/i18n/XBreakIterator.hpp>
11 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
12 #include <com/sun/star/i18n/ScriptType.hpp>
13 #include <com/sun/star/i18n/WordType.hpp>
14 #include <o3tl/cppunittraitshelper.hxx>
15 #include <unotest/bootstrapfixturebase.hxx>
17 #include <unicode/uvernum.h>
22 #include <string_view>
24 using namespace ::com::sun::star
;
// CppUnit fixture for the break iterator tests. It derives from
// test::BootstrapFixtureBase, overrides setUp()/tearDown(), registers every
// test method with the CPPUNIT_TEST_SUITE macros below, and holds the
// i18n::XBreakIterator reference (m_xBreak) that the tests call into.
// NOTE(review): several tests registered below (e.g. testWeak, testAsian,
// testWordCount) have no declaration visible in this view -- confirm against
// the full file.
26 class TestBreakIterator
: public test::BootstrapFixtureBase
29 virtual void setUp() override
;
30 virtual void tearDown() override
;
32 void testLineBreaking();
33 void testWordBoundaries();
34 void testSentenceBoundaries();
35 void testGraphemeIteration();
41 void testNorthernThai();
47 void testDictWordAbbreviation();
48 void testDictWordPrepostDash();
49 void testHebrewGereshGershaim();
50 void testLegacySurrogatePairs();
52 void testDictionaryIteratorLanguages();
// Suite registration: one CPPUNIT_TEST entry per test method.
54 CPPUNIT_TEST_SUITE(TestBreakIterator
);
55 CPPUNIT_TEST(testLineBreaking
);
56 CPPUNIT_TEST(testWordBoundaries
);
57 CPPUNIT_TEST(testSentenceBoundaries
);
58 CPPUNIT_TEST(testGraphemeIteration
);
59 CPPUNIT_TEST(testWeak
);
60 CPPUNIT_TEST(testAsian
);
61 CPPUNIT_TEST(testThai
);
62 CPPUNIT_TEST(testLao
);
64 CPPUNIT_TEST(testKhmer
);
65 CPPUNIT_TEST(testNorthernThai
);
67 CPPUNIT_TEST(testJapanese
);
68 CPPUNIT_TEST(testChinese
);
69 CPPUNIT_TEST(testDictWordAbbreviation
);
70 CPPUNIT_TEST(testDictWordPrepostDash
);
71 CPPUNIT_TEST(testHebrewGereshGershaim
);
72 CPPUNIT_TEST(testLegacySurrogatePairs
);
73 CPPUNIT_TEST(testWordCount
);
74 CPPUNIT_TEST(testDictionaryIteratorLanguages
);
75 CPPUNIT_TEST_SUITE_END();
// Break iterator instance that the test methods invoke.
78 uno::Reference
<i18n::XBreakIterator
> m_xBreak
;
// Helper that takes the break iterator to exercise as an explicit argument.
79 void doTestJapanese(uno::Reference
< i18n::XBreakIterator
> const &xBreak
);
// Regression tests for XBreakIterator::getLineBreak(). Each case below sets a
// locale, asks m_xBreak for the line break of a sample string at a given
// position, and asserts the resulting breakIndex. The comment preceding each
// case names the bug report it was written for.
// NOTE(review): the declaration of aLocale and the scoping braces around the
// individual cases are not visible in this view.
82 void TestBreakIterator::testLineBreaking()
84 i18n::LineBreakHyphenationOptions aHyphOptions
;
85 i18n::LineBreakUserOptions aUserOptions
;
88 //See https://bugs.libreoffice.org/show_bug.cgi?id=31271
90 OUString
aTest(u
"(some text here)"_ustr
);
92 aLocale
.Language
= "en";
93 aLocale
.Country
= "US";
96 //Here we want the line break to leave text here) on the next line
97 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(aTest
, strlen("(some tex"), aLocale
, 0, aHyphOptions
, aUserOptions
);
98 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32
>(6), aResult
.breakIndex
);
102 //Here we want the line break to leave "here)" on the next line
103 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(aTest
, strlen("(some text here"), aLocale
, 0, aHyphOptions
, aUserOptions
);
104 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32
>(11), aResult
.breakIndex
);
108 //See https://bugs.libreoffice.org/show_bug.cgi?id=49849
110 static constexpr OUString aWord
= u
"\u05DE\u05D9\u05DC\u05D9\u05DD"_ustr
;
111 OUString
aTest(aWord
+ " " + aWord
);
113 aLocale
.Language
= "he";
114 aLocale
.Country
= "IL";
117 //Here we want the line break to happen at the whitespace
118 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(aTest
, aTest
.getLength()-1, aLocale
, 0, aHyphOptions
, aUserOptions
);
119 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", aWord
.getLength()+1, aResult
.breakIndex
);
123 //See https://bz.apache.org/ooo/show_bug.cgi?id=17155
125 aLocale
.Language
= "en";
126 aLocale
.Country
= "US";
129 //Here we want the line break to leave /bar/baz clumped together on the next line
130 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(u
"foo /bar/baz"_ustr
, strlen("foo /bar/ba"), aLocale
, 0,
131 aHyphOptions
, aUserOptions
);
132 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first slash", static_cast<sal_Int32
>(4), aResult
.breakIndex
);
136 // i#22602: writer breaks word after dot immediately followed by a letter
138 aLocale
.Language
= "en";
139 aLocale
.Country
= "US";
142 //Here we want the line break to leave ./bar/baz clumped together on the next line
143 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(
144 u
"foo ./bar/baz"_ustr
, strlen("foo ./bar/ba"), aLocale
, 0, aHyphOptions
, aUserOptions
);
145 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first period",
146 static_cast<sal_Int32
>(4), aResult
.breakIndex
);
150 // i#81448: slash and backslash make non-breaking spaces of preceding spaces
152 aLocale
.Language
= "en";
153 aLocale
.Country
= "US";
156 // Per the bug, the line break should leave ...BE clumped together on the next line.
157 // However, the current behavior does not wrap the string at all. This test asserts the
158 // current behavior as a point of reference.
159 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(
160 u
"THIS... ...BE"_ustr
, strlen("THIS... ...B"), aLocale
, 0, aHyphOptions
, aUserOptions
);
161 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32
>(0), aResult
.breakIndex
);
165 // i#81448: slash and backslash make non-breaking spaces of preceding spaces
167 aLocale
.Language
= "en";
168 aLocale
.Country
= "US";
171 // The line break should leave /BE clumped together on the next line.
172 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(
173 u
"THIS... /BE"_ustr
, strlen("THIS... /B"), aLocale
, 0, aHyphOptions
, aUserOptions
);
174 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32
>(8), aResult
.breakIndex
);
178 // i#80548: Bad word wrap between dash and word
180 aLocale
.Language
= "fi";
181 aLocale
.Country
= "FI";
184 // Per the bug, the line break should leave -bar clumped together on the next line.
185 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(
186 u
"foo -bar"_ustr
, strlen("foo -ba"), aLocale
, 0, aHyphOptions
, aUserOptions
);
187 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first dash",
188 static_cast<sal_Int32
>(4), aResult
.breakIndex
);
192 // i#80645: Line erroneously breaks at backslash
194 aLocale
.Language
= "en";
195 aLocale
.Country
= "US";
198 // Note that the current behavior deviates from the original fix for this bug.
200 // The original report was filed due to wrapping all of "\Program Files\aaaa" to the
201 // next line, even though only "aaaa" overflowed. The original fix was to simply make
202 // U+005C reverse solidus (backslash) a breaking character.
204 // However, the root cause for this bug was not the behavior of '\', but rather some
205 // other bug making all of "\Program Files\" behave like a single token, despite it
206 // even containing whitespace.
208 // Reverting to the ICU line rules fixes this root issue. Now, in the following,
209 // "C:\Program" and "Files\LibreOffice" are treated as separate tokens. This is also
210 // consistent with the behavior of other office programs.
211 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(
212 u
"C:\\Program Files\\LibreOffice"_ustr
, strlen("C:\\Program Files\\Libre"), aLocale
, 0,
213 aHyphOptions
, aUserOptions
);
214 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32
>(11), aResult
.breakIndex
);
216 // An identical result should be generated for solidus.
217 aResult
= m_xBreak
->getLineBreak(
218 u
"C:/Program Files/LibreOffice"_ustr
, strlen("C:/Program Files/Libre"), aLocale
, 0,
219 aHyphOptions
, aUserOptions
);
220 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32
>(11), aResult
.breakIndex
);
224 // i#80841: Words separated by hyphens will always break to next line
226 aLocale
.Language
= "en";
227 aLocale
.Country
= "US";
230 // Here we want the line break to leave toll- on the first line
231 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(
232 u
"toll-free"_ustr
, strlen("toll-fr"), aLocale
, 0, aHyphOptions
, aUserOptions
);
233 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32
>(5), aResult
.breakIndex
);
237 // i#83464: Line break between letter and $
239 aLocale
.Language
= "en";
240 aLocale
.Country
= "US";
243 // Here we want the line break to leave US$ clumped on the next line.
244 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(
245 u
"word US$ 123"_ustr
, strlen("word U"), aLocale
, 0, aHyphOptions
, aUserOptions
);
246 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32
>(5), aResult
.breakIndex
);
250 // Unknown bug number: "fix line break problem of dot after letter and before number"
252 aLocale
.Language
= "en";
253 aLocale
.Country
= "US";
256 // Here we want the line break to leave L.5 clumped on the next line.
257 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(
258 u
"word L.5 word"_ustr
, strlen("word L"), aLocale
, 0, aHyphOptions
, aUserOptions
);
259 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32
>(5), aResult
.breakIndex
);
263 // i#83229: Wrong line break when word contains a hyphen
265 aLocale
.Language
= "en";
266 aLocale
.Country
= "US";
269 // The root cause for this bug was the Unicode standard introducing special treatment
270 // for '-' in a number range context. This change makes number ranges (e.g. "100-199")
271 // behave as if they are single tokens for the purposes of line breaking. Unfortunately,
272 // this caused a significant appearance change to existing documents.
274 // Despite being a user-visible layout change, this isn't exactly a bug. Wrapping
275 // number ranges as a single token is consistent with other applications, including web
276 // browsers, and other office suites as mentioned in the bug discussion. Removing this
277 // customization seems like it would be a major change, however.
279 // Here we want the line break to leave 100- clumped on the first line.
281 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(
282 u
"word 100-199 word"_ustr
, strlen("word 100-1"), aLocale
, 0, aHyphOptions
, aUserOptions
);
283 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32
>(9), aResult
.breakIndex
);
287 // From the same bug: "the leading minus must stay with numbers and strings"
289 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(
290 u
"range of -100.000 to 100.000"_ustr
, strlen("range of -1"), aLocale
, 0,
291 aHyphOptions
, aUserOptions
);
292 CPPUNIT_ASSERT_EQUAL(sal_Int32
{9}, aResult
.breakIndex
);
294 static constexpr OUString str
= u
"range of \u2212100.000 to 100.000"_ustr
;
295 aResult
= m_xBreak
->getLineBreak(
296 str
, strlen("range of -"), aLocale
, 0, aHyphOptions
, aUserOptions
);
297 CPPUNIT_ASSERT_EQUAL(sal_Int32
{9}, aResult
.breakIndex
);
300 aLocale
.Language
= "de";
301 aLocale
.Country
= "DE";
304 // From the same bug: "the leading minus must stay with numbers and strings"
306 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(
307 u
"EURO is -10,50"_ustr
, strlen("EURO is -1"), aLocale
, 0, aHyphOptions
, aUserOptions
);
308 CPPUNIT_ASSERT_EQUAL(sal_Int32
{8}, aResult
.breakIndex
);
310 // Also the mathematical minus sign:
312 static constexpr OUString str
= u
"EURO is \u221210,50"_ustr
;
313 aResult
= m_xBreak
->getLineBreak(
314 str
, strlen("EURO is -"), aLocale
, 0, aHyphOptions
, aUserOptions
);
315 CPPUNIT_ASSERT_EQUAL(sal_Int32
{8}, aResult
.breakIndex
);
319 // From the same bug: "the leading minus must stay with numbers and strings"
321 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(
322 u
"und -kosten"_ustr
, strlen("und -ko"), aLocale
, 0,
323 aHyphOptions
, aUserOptions
);
324 CPPUNIT_ASSERT_EQUAL(sal_Int32
{4}, aResult
.breakIndex
);
326 // But not the non-breaking hyphen:
328 static constexpr OUString str
= u
"und \u2011"_ustr
;
329 aResult
= m_xBreak
->getLineBreak(
330 str
, strlen("und -ko"), aLocale
, 0, aHyphOptions
, aUserOptions
);
331 CPPUNIT_ASSERT_EQUAL(sal_Int32
{5}, aResult
.breakIndex
);
335 // i#83649: "Line break should be between typographical quote and left bracket"
336 // - Actually: Spaces between quotation mark and opening punctuation not treated as a break.
337 // - Note that per the Unicode standard, prohibiting breaks in this context is intentional
338 // because it may cause issues in certain languages due to the various ways quotation
339 // characters are used.
340 // - We do it anyway by customizing the ICU line breaking rules.
343 // This uses the sample text provided in the bug report. Based on usage, it is assumed
344 // they were in the de_DE locale.
346 aLocale
.Language
= "de";
347 aLocale
.Country
= "DE";
349 // Per the bug report, it is expected that »angetan werden« remains on the first line.
350 const OUString str
= u
"»angetan werden« [Passiv]"_ustr
;
351 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(
352 str
, str
.getLength() - 4, aLocale
, 0, aHyphOptions
, aUserOptions
);
353 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32
>(17), aResult
.breakIndex
);
355 // The same result should be returned for this and the first case.
356 const OUString str2
= u
"»angetan werden« Passiv"_ustr
;
357 aResult
= m_xBreak
->getLineBreak(
358 str2
, str2
.getLength() - 4, aLocale
, 0, aHyphOptions
, aUserOptions
);
359 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32
>(17), aResult
.breakIndex
);
361 // Under ICU rules, no amount of spaces would cause this to wrap.
362 const OUString str3
= u
"»angetan werden« [Passiv]"_ustr
;
363 aResult
= m_xBreak
->getLineBreak(
364 str3
, str3
.getLength() - 4, aLocale
, 0, aHyphOptions
, aUserOptions
);
365 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32
>(20), aResult
.breakIndex
);
367 // However, tabs will
368 const OUString str4
= u
"»angetan werden«\t[Passiv]"_ustr
;
369 aResult
= m_xBreak
->getLineBreak(
370 str4
, str4
.getLength() - 4, aLocale
, 0, aHyphOptions
, aUserOptions
);
371 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32
>(17), aResult
.breakIndex
);
375 // The same behavior is seen in English
377 aLocale
.Language
= "en";
378 aLocale
.Country
= "US";
380 const OUString str
= u
"\"angetan werden\" [Passiv]"_ustr
;
381 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(
382 str
, str
.getLength() - 4, aLocale
, 0, aHyphOptions
, aUserOptions
);
383 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32
>(17), aResult
.breakIndex
);
385 const OUString str2
= u
"\"angetan werden\" Passiv"_ustr
;
386 aResult
= m_xBreak
->getLineBreak(
387 str2
, str2
.getLength() - 4, aLocale
, 0, aHyphOptions
, aUserOptions
);
388 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32
>(17), aResult
.breakIndex
);
392 // i#72868: Writer/Impress line does not break after Chinese punctuation and Latin letters
394 aLocale
.Language
= "zh";
395 aLocale
.Country
= "HK";
398 // Per the bug, this should break at the ideographic comma. However, this change has
399 // been reverted at some point. This test only verifies current behavior.
400 const OUString str
= u
"word word、word word"_ustr
;
401 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(
402 str
, strlen("word wordXwor"), aLocale
, 0, aHyphOptions
, aUserOptions
);
403 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32
>(13), aResult
.breakIndex
);
407 // i#80891: Character in the forbidden list sometimes appears at the start of line
409 aLocale
.Language
= "zh";
410 aLocale
.Country
= "HK";
413 // Per the bug, the ideographic two-dot leader should be a forbidden character. However,
414 // this change seems to have been reverted or broken at some point.
415 const OUString str
= u
"電話︰電話"_ustr
;
416 i18n::LineBreakResults aResult
417 = m_xBreak
->getLineBreak(str
, 2, aLocale
, 0, aHyphOptions
, aUserOptions
);
418 CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32
>(2), aResult
.breakIndex
);
422 //See https://bz.apache.org/ooo/show_bug.cgi?id=19716
424 aLocale
.Language
= "en";
425 aLocale
.Country
= "US";
428 OUString
aTest(u
"aaa]aaa"_ustr
);
429 //Here we want the line break to move the whole lot to the next line
430 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(aTest
, aTest
.getLength()-2, aLocale
, 0,
431 aHyphOptions
, aUserOptions
);
432 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the line, not at ]", static_cast<sal_Int32
>(0), aResult
.breakIndex
);
436 //this is an example sequence from tdf92993-1.docx caught by the load crashtesting
438 static constexpr OUStringLiteral aTest
= u
"\U0001f356\U0001f357\U0001f346"
439 "\U0001f364\u2668\ufe0f\U0001f3c6";
441 aLocale
.Language
= "en";
442 aLocale
.Country
= "US";
445 //This must not assert/crash
446 (void)m_xBreak
->getLineBreak(aTest
, 0, aLocale
, 0, aHyphOptions
, aUserOptions
);
450 //See https://bugs.documentfoundation.org/show_bug.cgi?id=96197
452 static constexpr OUString aTest
= u
"\uc560\uad6D\uac00\uc758 \uac00"
455 aLocale
.Language
= "ko";
456 aLocale
.Country
= "KR";
459 i18n::LineBreakResults aResult
= m_xBreak
->getLineBreak(aTest
, aTest
.getLength()-2, aLocale
, 0,
460 aHyphOptions
, aUserOptions
);
461 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32
>(5), aResult
.breakIndex
);
465 // i#65267: Comma is badly broken at end of line
466 // - The word should be wrapped along with the comma
468 aLocale
.Language
= "de";
469 aLocale
.Country
= "DE";
472 auto res
= m_xBreak
->getLineBreak(u
"Wort -prinzessinnen, wort"_ustr
,
473 strlen("Wort -prinzessinnen,"), aLocale
, 0,
474 aHyphOptions
, aUserOptions
);
475 CPPUNIT_ASSERT_EQUAL(sal_Int32
{ 5 }, res
.breakIndex
);
479 // tdf#114160: ZWJ shouldn't be treated as a breaking character
481 aLocale
.Language
= "mn";
482 aLocale
.Country
= "MN";
485 auto res
= m_xBreak
->getLineBreak(u
"\u1828\u1820\u200d\u00a0\u200d\u1873\u1873"_ustr
, 6,
486 aLocale
, 0, aHyphOptions
, aUserOptions
);
487 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), res
.breakIndex
);
490 aLocale
.Language
= "en";
491 aLocale
.Country
= "US";
494 auto res
= m_xBreak
->getLineBreak(u
"AB\u200d\u00a0\u200dCD"_ustr
, 6, aLocale
, 0,
495 aHyphOptions
, aUserOptions
);
496 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), res
.breakIndex
);
501 //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
502 void TestBreakIterator::testWordBoundaries()
504 lang::Locale aLocale
;
505 aLocale
.Language
= "en";
506 aLocale
.Country
= "US";
508 i18n::Boundary aBounds
;
510 //See https://bz.apache.org/ooo/show_bug.cgi?id=11993
512 OUString
aTest(u
"abcd ef ghi??? KLM"_ustr
);
514 CPPUNIT_ASSERT(!m_xBreak
->isBeginWord(aTest
, 4, aLocale
, i18n::WordType::DICTIONARY_WORD
));
515 CPPUNIT_ASSERT(m_xBreak
->isEndWord(aTest
, 4, aLocale
, i18n::WordType::DICTIONARY_WORD
));
516 aBounds
= m_xBreak
->getWordBoundary(aTest
, 4, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
517 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
518 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds
.endPos
);
520 CPPUNIT_ASSERT(!m_xBreak
->isBeginWord(aTest
, 8, aLocale
, i18n::WordType::DICTIONARY_WORD
));
521 CPPUNIT_ASSERT(!m_xBreak
->isEndWord(aTest
, 8, aLocale
, i18n::WordType::DICTIONARY_WORD
));
524 aBounds
= m_xBreak
->getWordBoundary(aTest
, 8, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
525 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds
.startPos
);
526 CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds
.endPos
);
529 aBounds
= m_xBreak
->getWordBoundary(aTest
, 8, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
530 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.startPos
);
531 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds
.endPos
);
533 CPPUNIT_ASSERT(!m_xBreak
->isBeginWord(aTest
, 12, aLocale
, i18n::WordType::DICTIONARY_WORD
));
534 CPPUNIT_ASSERT(m_xBreak
->isEndWord(aTest
, 12, aLocale
, i18n::WordType::DICTIONARY_WORD
));
535 aBounds
= m_xBreak
->getWordBoundary(aTest
, 12, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
536 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds
.startPos
);
537 CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds
.endPos
);
539 CPPUNIT_ASSERT(m_xBreak
->isBeginWord(aTest
, 16, aLocale
, i18n::WordType::DICTIONARY_WORD
));
540 CPPUNIT_ASSERT(!m_xBreak
->isEndWord(aTest
, 16, aLocale
, i18n::WordType::DICTIONARY_WORD
));
541 aBounds
= m_xBreak
->getWordBoundary(aTest
, 16, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
542 CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds
.startPos
);
543 CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds
.endPos
);
546 //See https://bz.apache.org/ooo/show_bug.cgi?id=21907
548 OUString
aTest(u
"b a?"_ustr
);
550 CPPUNIT_ASSERT(m_xBreak
->isBeginWord(aTest
, 1, aLocale
, i18n::WordType::ANY_WORD
));
551 CPPUNIT_ASSERT(m_xBreak
->isBeginWord(aTest
, 2, aLocale
, i18n::WordType::ANY_WORD
));
552 CPPUNIT_ASSERT(m_xBreak
->isBeginWord(aTest
, 3, aLocale
, i18n::WordType::ANY_WORD
));
554 CPPUNIT_ASSERT(m_xBreak
->isBeginWord(aTest
, 3, aLocale
, i18n::WordType::ANYWORD_IGNOREWHITESPACES
));
556 CPPUNIT_ASSERT(m_xBreak
->isEndWord(aTest
, 1, aLocale
, i18n::WordType::ANY_WORD
));
557 CPPUNIT_ASSERT(m_xBreak
->isEndWord(aTest
, 2, aLocale
, i18n::WordType::ANY_WORD
));
558 CPPUNIT_ASSERT(m_xBreak
->isEndWord(aTest
, 3, aLocale
, i18n::WordType::ANY_WORD
));
560 CPPUNIT_ASSERT(m_xBreak
->isEndWord(aTest
, 3, aLocale
, i18n::WordType::ANYWORD_IGNOREWHITESPACES
));
563 //See https://bz.apache.org/ooo/show_bug.cgi?id=14904
565 static constexpr OUString aTest
=
566 u
"Working \u201CWords"
568 "h quotes\u201D Work"
569 "ing \u2018Broken\u2019 "
572 "t even \u00BFreal? "
575 aBounds
= m_xBreak
->getWordBoundary(aTest
, 4, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
576 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
577 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds
.endPos
);
579 aBounds
= m_xBreak
->getWordBoundary(aTest
, 12, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
580 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds
.startPos
);
581 CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds
.endPos
);
583 aBounds
= m_xBreak
->getWordBoundary(aTest
, 40, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
584 CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds
.startPos
);
585 CPPUNIT_ASSERT_EQUAL(sal_Int32(44), aBounds
.endPos
);
587 aBounds
= m_xBreak
->getWordBoundary(aTest
, 49, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
588 CPPUNIT_ASSERT_EQUAL(sal_Int32(46), aBounds
.startPos
);
589 CPPUNIT_ASSERT_EQUAL(sal_Int32(52), aBounds
.endPos
);
591 aBounds
= m_xBreak
->getWordBoundary(aTest
, 58, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
592 CPPUNIT_ASSERT_EQUAL(sal_Int32(55), aBounds
.startPos
);
593 CPPUNIT_ASSERT_EQUAL(sal_Int32(62), aBounds
.endPos
);
595 aBounds
= m_xBreak
->getWordBoundary(aTest
, 67, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
596 CPPUNIT_ASSERT_EQUAL(sal_Int32(64), aBounds
.startPos
);
597 CPPUNIT_ASSERT_EQUAL(sal_Int32(71), aBounds
.endPos
);
599 aBounds
= m_xBreak
->getWordBoundary(aTest
, 90, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
600 CPPUNIT_ASSERT_EQUAL(sal_Int32(88), aBounds
.startPos
);
601 CPPUNIT_ASSERT_EQUAL(sal_Int32(92), aBounds
.endPos
);
604 //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
605 sal_Unicode aBreakTests
[] = { ' ', 1, 2, 3, 4, 5, 6, 7, 0x91, 0x92, 0x200B, 0xE8FF, 0xF8FF };
606 for (int mode
= i18n::WordType::ANY_WORD
; mode
<= i18n::WordType::WORD_COUNT
; ++mode
)
608 //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
609 for (auto const& i
: aBreakTests
)
611 OUString aTest
= "Word" + OUStringChar(i
) + "Word";
612 aBounds
= m_xBreak
->getWordBoundary(aTest
, 0, aLocale
, mode
, true);
615 case i18n::WordType::ANY_WORD
:
616 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
617 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds
.endPos
);
619 case i18n::WordType::ANYWORD_IGNOREWHITESPACES
:
620 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
621 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds
.endPos
);
623 case i18n::WordType::DICTIONARY_WORD
:
624 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
625 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds
.endPos
);
627 case i18n::WordType::WORD_COUNT
:
628 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
629 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds
.endPos
);
633 CPPUNIT_ASSERT(m_xBreak
->isBeginWord(aTest
, aBounds
.startPos
, aLocale
, mode
));
634 CPPUNIT_ASSERT(m_xBreak
->isEndWord(aTest
, aBounds
.endPos
, aLocale
, mode
));
638 sal_Unicode aJoinTests
[] = { 'X', 0x200C, 0x200D, 0x2060, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB };
639 for (int mode
= i18n::WordType::ANY_WORD
; mode
<= i18n::WordType::WORD_COUNT
; ++mode
)
641 //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
642 for (auto const& p
: aJoinTests
)
644 OUString aTest
= "Word" + OUStringChar(p
) + "Word";
645 aBounds
= m_xBreak
->getWordBoundary(aTest
, 0, aLocale
, mode
, true);
648 case i18n::WordType::ANY_WORD
:
649 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
650 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds
.endPos
);
652 case i18n::WordType::ANYWORD_IGNOREWHITESPACES
:
653 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
654 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds
.endPos
);
656 case i18n::WordType::DICTIONARY_WORD
:
657 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
658 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds
.endPos
);
660 case i18n::WordType::WORD_COUNT
:
661 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
662 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds
.endPos
);
666 CPPUNIT_ASSERT(m_xBreak
->isBeginWord(aTest
, aBounds
.startPos
, aLocale
, mode
));
667 CPPUNIT_ASSERT(m_xBreak
->isEndWord(aTest
, aBounds
.endPos
, aLocale
, mode
));
671 //See https://bz.apache.org/ooo/show_bug.cgi?id=13494
673 static constexpr OUString
aBase(u
"xxAAxxBBxxCCxx"_ustr
);
674 const sal_Unicode aTests
[] =
676 '\'', ';', ',', '.', '!', '@', '#', '%', '&', '*',
677 '(', ')', '_', '-', '{', '}', '[', ']', '\"', '/',
678 '\\', '?', '~', '$', '+', '^', '=', '<', '>', '|'
681 const sal_Int32 aDoublePositions
[] = {0, 2, 4, 6, 8, 10, 12, 14};
682 for (auto const& r
: aTests
)
684 OUString aTest
= aBase
.replace('x', r
);
689 CPPUNIT_ASSERT(i
< std::size(aDoublePositions
));
690 nPos
= m_xBreak
->nextWord(aTest
, nPos
, aLocale
, i18n::WordType::ANYWORD_IGNOREWHITESPACES
).startPos
;
691 CPPUNIT_ASSERT_EQUAL(aDoublePositions
[i
], nPos
);
694 while (nPos
< aTest
.getLength());
695 nPos
= aTest
.getLength();
696 i
= std::size(aDoublePositions
)-1;
699 nPos
= m_xBreak
->previousWord(aTest
, nPos
, aLocale
, i18n::WordType::ANYWORD_IGNOREWHITESPACES
).startPos
;
701 CPPUNIT_ASSERT_EQUAL(aDoublePositions
[i
], nPos
);
706 const sal_Int32 aSinglePositions
[] = {0, 1, 3, 4, 6, 7, 9, 10};
707 for (size_t j
= 1; j
< std::size(aTests
); ++j
)
709 OUString aTest
= aBase
.replaceAll("xx", OUStringChar(aTests
[j
]));
714 CPPUNIT_ASSERT(i
< std::size(aSinglePositions
));
715 nPos
= m_xBreak
->nextWord(aTest
, nPos
, aLocale
, i18n::WordType::ANYWORD_IGNOREWHITESPACES
).startPos
;
716 CPPUNIT_ASSERT_EQUAL(aSinglePositions
[i
], nPos
);
719 while (nPos
< aTest
.getLength());
720 nPos
= aTest
.getLength();
721 i
= std::size(aSinglePositions
)-1;
724 nPos
= m_xBreak
->previousWord(aTest
, nPos
, aLocale
, i18n::WordType::ANYWORD_IGNOREWHITESPACES
).startPos
;
726 CPPUNIT_ASSERT_EQUAL(aSinglePositions
[i
], nPos
);
731 const sal_Int32 aSingleQuotePositions
[] = {0, 1, 9, 10};
732 CPPUNIT_ASSERT_EQUAL(u
'\'', aTests
[0]);
734 OUString aTest
= aBase
.replaceAll("xx", OUStringChar(aTests
[0]));
739 CPPUNIT_ASSERT(i
< std::size(aSingleQuotePositions
));
740 nPos
= m_xBreak
->nextWord(aTest
, nPos
, aLocale
, i18n::WordType::ANYWORD_IGNOREWHITESPACES
).startPos
;
741 CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions
[i
], nPos
);
744 while (nPos
< aTest
.getLength());
745 nPos
= aTest
.getLength();
746 i
= std::size(aSingleQuotePositions
)-1;
749 nPos
= m_xBreak
->previousWord(aTest
, nPos
, aLocale
, i18n::WordType::ANYWORD_IGNOREWHITESPACES
).startPos
;
751 CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions
[i
], nPos
);
757 //See https://bz.apache.org/ooo/show_bug.cgi?id=13451
759 aLocale
.Language
= "ca";
760 aLocale
.Country
= "ES";
762 OUString
aTest(u
"mirar-se comprar-vos donem-nos les mans aneu-vos-en!"_ustr
);
765 sal_Int32 aExpected
[] = {8, 20, 30, 34, 39, 51, 52};
769 CPPUNIT_ASSERT(i
< std::size(aExpected
));
770 nPos
= m_xBreak
->getWordBoundary(aTest
, nPos
, aLocale
,
771 i18n::WordType::DICTIONARY_WORD
, true).endPos
;
772 CPPUNIT_ASSERT_EQUAL(aExpected
[i
], nPos
);
775 while (nPos
++ < aTest
.getLength());
776 CPPUNIT_ASSERT_EQUAL(std::size(aExpected
), i
);
779 // i#85411: ZWSP should be a word separator for spellchecking
780 // - This fix was applied to both dict and edit customizations
781 for (int j
= 0; j
< 3; ++j
)
786 aLocale
.Language
= "en";
787 aLocale
.Country
= "US";
790 aLocale
.Language
= "ca";
791 aLocale
.Country
= "ES";
794 aLocale
.Language
= "fi";
795 aLocale
.Country
= "FI";
798 CPPUNIT_ASSERT(false);
802 static constexpr OUString aTest
= u
"I\u200Bwant\u200Bto\u200Bgo"_ustr
;
805 sal_Int32 aExpected
[] = { 1, 6, 9, 12 };
809 CPPUNIT_ASSERT(i
< std::size(aExpected
));
810 auto dwPos
= m_xBreak
->getWordBoundary(aTest
, nPos
, aLocale
,
811 i18n::WordType::DICTIONARY_WORD
, true);
812 CPPUNIT_ASSERT_EQUAL(aExpected
[i
], dwPos
.endPos
);
813 auto ewPos
= m_xBreak
->getWordBoundary(aTest
, nPos
, aLocale
,
814 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, true);
815 CPPUNIT_ASSERT_EQUAL(aExpected
[i
], ewPos
.endPos
);
818 } while (nPos
++ < aTest
.getLength());
819 CPPUNIT_ASSERT_EQUAL(std::size(aExpected
), i
);
822 //https://bz.apache.org/ooo/show_bug.cgi?id=21290
823 for (int j
= 0; j
< 2; ++j
)
828 aLocale
.Language
= "en";
829 aLocale
.Country
= "US";
832 aLocale
.Language
= "grc";
833 aLocale
.Country
.clear();
836 CPPUNIT_ASSERT(false);
840 static constexpr OUString aTest
=
841 u
"\u1F0C\u03BD\u03B4\u03C1\u03B1 \u1F00"
842 "\u03C1\u03BD\u1F7B\u03BC\u03B5\u03BD\u03BF"
843 "\u03C2 \u1F00\u03BB\u03BB \u1F24"
844 "\u03C3\u03B8\u03B9\u03BF\u03BD"_ustr
;
847 sal_Int32 aExpected
[] = {5, 15, 19, 26};
851 CPPUNIT_ASSERT(i
< std::size(aExpected
));
852 nPos
= m_xBreak
->getWordBoundary(aTest
, nPos
, aLocale
,
853 i18n::WordType::DICTIONARY_WORD
, true).endPos
;
854 CPPUNIT_ASSERT_EQUAL(aExpected
[i
], nPos
);
857 while (nPos
++ < aTest
.getLength());
858 CPPUNIT_ASSERT_EQUAL(std::size(aExpected
), i
);
861 //See https://bz.apache.org/ooo/show_bug.cgi?id=58513
862 //See https://bugs.libreoffice.org/show_bug.cgi?id=55707
864 aLocale
.Language
= "fi";
865 aLocale
.Country
= "FI";
867 OUString
aTest(u
"Kuorma-auto kaakkois- ja Keski-Suomi USA:n 90:n %:n"_ustr
);
871 sal_Int32 aExpected
[] = {11, 21, 24, 36, 42, 47, 51};
875 CPPUNIT_ASSERT(i
< std::size(aExpected
));
876 nPos
= m_xBreak
->getWordBoundary(aTest
, nPos
, aLocale
,
877 i18n::WordType::WORD_COUNT
, true).endPos
;
878 CPPUNIT_ASSERT_EQUAL(aExpected
[i
], nPos
);
881 while (nPos
++ < aTest
.getLength());
882 CPPUNIT_ASSERT_EQUAL(std::size(aExpected
), i
);
887 sal_Int32 aExpected
[] = {0, 11, 12, 20, 22, 24, 25, 36, 37,
888 40, 41, 42, 43, 45, 46, 47, 50, 51};
892 CPPUNIT_ASSERT(i
< std::size(aExpected
));
893 aBounds
= m_xBreak
->getWordBoundary(aTest
, nPos
, aLocale
,
894 i18n::WordType::DICTIONARY_WORD
, true);
895 CPPUNIT_ASSERT_EQUAL(aExpected
[i
], aBounds
.startPos
);
897 CPPUNIT_ASSERT_EQUAL(aExpected
[i
], aBounds
.endPos
);
899 nPos
= aBounds
.endPos
;
901 while (nPos
++ < aTest
.getLength());
902 CPPUNIT_ASSERT_EQUAL(std::size(aExpected
), i
);
906 //See https://bz.apache.org/ooo/show_bug.cgi?id=107843
908 aLocale
.Language
= "en";
909 aLocale
.Country
= "US";
911 static constexpr OUString aTest
=
912 u
"ru\uFB00le \uFB01sh"_ustr
;
914 aBounds
= m_xBreak
->getWordBoundary(aTest
, 1, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
915 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
916 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.endPos
);
918 aBounds
= m_xBreak
->getWordBoundary(aTest
, 7, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
919 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds
.startPos
);
920 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds
.endPos
);
923 //See https://bz.apache.org/ooo/show_bug.cgi?id=113785
925 aLocale
.Language
= "en";
926 aLocale
.Country
= "US";
928 static constexpr OUString aTest
=
929 u
"a\u2013b\u2014c"_ustr
;
931 aBounds
= m_xBreak
->getWordBoundary(aTest
, 0, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
932 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
933 CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds
.endPos
);
935 aBounds
= m_xBreak
->nextWord(aTest
, 0, aLocale
, i18n::WordType::DICTIONARY_WORD
);
936 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds
.startPos
);
937 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds
.endPos
);
939 aBounds
= m_xBreak
->nextWord(aTest
, aBounds
.endPos
, aLocale
, i18n::WordType::DICTIONARY_WORD
);
940 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds
.startPos
);
941 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.endPos
);
944 // i#55778: Words containing numbers get broken up
946 aLocale
.Language
= "en";
947 aLocale
.Country
= "US";
949 static constexpr OUString aTest
= u
"first i18n third"_ustr
;
952 = m_xBreak
->getWordBoundary(aTest
, 8, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
953 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds
.startPos
);
954 CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds
.endPos
);
957 // i#56347: "BreakIterator patch for Hungarian"
958 // i#56348: Special chars in first pos not handled by spell checking in Writer (Hungarian)
959 // Rules for Hungarian affixes after numbers and certain symbols
961 aLocale
.Language
= "hu";
962 aLocale
.Country
= "HU";
964 OUString aTest
= u
"szavak 15 15-tel 15%-kal €-val szavak"_ustr
;
967 { i18n::WordType::DICTIONARY_WORD
, i18n::WordType::ANYWORD_IGNOREWHITESPACES
})
969 aBounds
= m_xBreak
->getWordBoundary(aTest
, 2, aLocale
, mode
, true);
970 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
971 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds
.endPos
);
973 aBounds
= m_xBreak
->getWordBoundary(aTest
, 7, aLocale
, mode
, true);
974 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds
.startPos
);
975 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds
.endPos
);
977 aBounds
= m_xBreak
->getWordBoundary(aTest
, 11, aLocale
, mode
, true);
978 CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds
.startPos
);
979 CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds
.endPos
);
981 aBounds
= m_xBreak
->getWordBoundary(aTest
, 18, aLocale
, mode
, true);
982 CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds
.startPos
);
983 CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds
.endPos
);
985 aBounds
= m_xBreak
->getWordBoundary(aTest
, 25, aLocale
, mode
, true);
986 CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds
.startPos
);
987 CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds
.endPos
);
989 aBounds
= m_xBreak
->getWordBoundary(aTest
, 27, aLocale
, mode
, true);
990 CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds
.startPos
);
991 CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds
.endPos
);
993 aBounds
= m_xBreak
->getWordBoundary(aTest
, 34, aLocale
, mode
, true);
994 CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds
.startPos
);
995 CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds
.endPos
);
999 // tdf#49885: Upgrade CJ word boundary analysis to ICU frequency-based analysis
1001 aLocale
.Language
= "ja";
1002 aLocale
.Country
= "JP";
1004 static constexpr OUString aTest
= u
"通産省工業技術院北海道工業開発試験所"_ustr
;
1007 = m_xBreak
->getWordBoundary(aTest
, 9, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1009 // When using the old LO custom dictionaries, this will select the entire phrase.
1010 // When using ICU, it will select only 北海道.
1011 CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds
.startPos
);
1012 CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds
.endPos
);
1015 // tdf#161737: narrow no-break space at the end of words resulted spelling mistakes
1017 aLocale
.Language
= "en";
1018 aLocale
.Country
= "US";
1020 OUString
aTest(u
"L’espace fine insécable\u202F!"_ustr
);
1022 = m_xBreak
->getWordBoundary(aTest
, 14, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1023 CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds
.startPos
);
1024 // This was 24 (word + NNBSP)
1025 CPPUNIT_ASSERT_EQUAL(sal_Int32(23), aBounds
.endPos
);
1028 // tdf#161737: narrow no-break space between digits resulted spelling mistakes
1029 // as a quick fix, limit NBSP as word-part character only for editing, and not for spell checking
1030 // TODO: remove NBSP by the linguistic module or by the spell checking dictionaries to allow
1031 // to check numbers with thousand separators and with correct suffix
1033 aLocale
.Language
= "en";
1034 aLocale
.Country
= "US";
1036 OUString
aTest(u
"1\u202F000\u202F000"_ustr
);
1038 = m_xBreak
->getWordBoundary(aTest
, 2, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1039 // This was 0 (word + NNBSP)
1040 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds
.startPos
);
1041 // This was 8 (word + NNBSP)
1042 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.endPos
);
1045 // tdf#161737: narrow no-break space at the end of words resulted spelling mistakes
1047 aLocale
.Language
= "hu";
1048 aLocale
.Country
= "HU";
1050 OUString
aTest(u
"L’espace fine insécable\u202F!"_ustr
);
1052 = m_xBreak
->getWordBoundary(aTest
, 14, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1053 CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds
.startPos
);
1054 // This was 24 (word + NNBSP)
1055 CPPUNIT_ASSERT_EQUAL(sal_Int32(23), aBounds
.endPos
);
1058 // tdf#161737: narrow no-break space between digits resulted spelling mistakes
1059 // as a quick fix, limit NBSP as word-part character only for editing, and not for spell checking
1060 // TODO: remove NBSP by the linguistic module or by the spell checking dictionaries to allow
1061 // to check numbers with thousand separators and with correct suffix
1063 aLocale
.Language
= "hu";
1064 aLocale
.Country
= "HU";
1066 OUString
aTest(u
"1\u202F000\u202F000"_ustr
);
1068 = m_xBreak
->getWordBoundary(aTest
, 2, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1069 // This was 0 (word + NNBSP)
1070 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds
.startPos
);
1071 // This was 8 (word + NNBSP)
1072 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.endPos
);
1076 void TestBreakIterator::testSentenceBoundaries()
1078 lang::Locale aLocale
;
1079 aLocale
.Language
= "en";
1080 aLocale
.Country
= "US";
1082 // Trivial characteristic test for sentence boundary detection
1084 OUString
aTest(u
"This is a sentence. This is a different sentence."_ustr
);
1086 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak
->beginOfSentence(aTest
, 5, aLocale
));
1087 CPPUNIT_ASSERT_EQUAL(sal_Int32(19), m_xBreak
->endOfSentence(aTest
, 5, aLocale
));
1088 CPPUNIT_ASSERT_EQUAL(sal_Int32(20), m_xBreak
->beginOfSentence(aTest
, 31, aLocale
));
1089 CPPUNIT_ASSERT_EQUAL(sal_Int32(49), m_xBreak
->endOfSentence(aTest
, 31, aLocale
));
1092 // i#24098: i18n API beginOfSentence/endOfSentence
1093 // fix beginOfSentence, ... when cursor is on the beginning of the sentence
1095 OUString
aTest(u
"This is a sentence. This is a different sentence."_ustr
);
1097 CPPUNIT_ASSERT_EQUAL(sal_Int32(20), m_xBreak
->beginOfSentence(aTest
, 20, aLocale
));
1098 CPPUNIT_ASSERT_EQUAL(sal_Int32(49), m_xBreak
->endOfSentence(aTest
, 20, aLocale
));
1101 // i#24098: i18n API beginOfSentence/endOfSentence
1102 // "skip preceding space for beginOfSentence"
1104 OUString
aTest(u
"This is a sentence. This is a different sentence."_ustr
);
1106 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak
->beginOfSentence(aTest
, 20, aLocale
));
1107 CPPUNIT_ASSERT_EQUAL(sal_Int32(19), m_xBreak
->endOfSentence(aTest
, 20, aLocale
));
1108 CPPUNIT_ASSERT_EQUAL(sal_Int32(24), m_xBreak
->beginOfSentence(aTest
, 26, aLocale
));
1109 CPPUNIT_ASSERT_EQUAL(sal_Int32(53), m_xBreak
->endOfSentence(aTest
, 26, aLocale
));
1112 // i#55063: Sentence selection in Thai should select a space-delimited phrase.
1113 // - This customization broke at some point. It works in an English locale in a synthetic test
1114 // like this one, but does not work in the Thai locale, nor on Thai text in practice.
1116 static constexpr OUString aTest
= u
"ว้อย โหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอก โปรโมเตอร์"_ustr
;
1118 aLocale
.Language
= "en";
1119 aLocale
.Country
= "US";
1120 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak
->beginOfSentence(aTest
, 23, aLocale
));
1121 CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak
->endOfSentence(aTest
, 23, aLocale
));
1123 aLocale
.Language
= "th";
1124 aLocale
.Country
= "TH";
1125 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak
->beginOfSentence(aTest
, 23, aLocale
));
1126 CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak
->endOfSentence(aTest
, 23, aLocale
));
1129 // i#55063: Thai phrases should delimit English sentence selection.
1130 // - This customization broke at some point. It works in an English locale in a synthetic test
1131 // like this one, but does not work in the Thai locale, nor on Thai text in practice.
1133 static constexpr OUString aTest
= u
"ว้อย English usually ends with a period โปรโมเตอร์."_ustr
;
1135 aLocale
.Language
= "en";
1136 aLocale
.Country
= "US";
1137 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak
->beginOfSentence(aTest
, 23, aLocale
));
1138 CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak
->endOfSentence(aTest
, 23, aLocale
));
1140 aLocale
.Language
= "th";
1141 aLocale
.Country
= "TH";
1142 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak
->beginOfSentence(aTest
, 23, aLocale
));
1143 CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak
->endOfSentence(aTest
, 23, aLocale
));
1146 // i#55063: Characteristic test for English text delimiting Thai phrases (sentences)
1147 // - English text should not delimit Thai phrases.
1149 static constexpr OUString aTest
= u
"Englishโหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอกEnglish"_ustr
;
1151 aLocale
.Language
= "en";
1152 aLocale
.Country
= "US";
1153 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak
->beginOfSentence(aTest
, 23, aLocale
));
1154 CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak
->endOfSentence(aTest
, 23, aLocale
));
1156 aLocale
.Language
= "th";
1157 aLocale
.Country
= "TH";
1158 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak
->beginOfSentence(aTest
, 23, aLocale
));
1159 CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak
->endOfSentence(aTest
, 23, aLocale
));
1163 //See https://bugs.libreoffice.org/show_bug.cgi?id=40292
1164 //See https://bz.apache.org/ooo/show_bug.cgi?id=80412
1165 //See https://bz.apache.org/ooo/show_bug.cgi?id=111152
1166 //See https://bz.apache.org/ooo/show_bug.cgi?id=50172
1167 void TestBreakIterator::testGraphemeIteration()
1169 lang::Locale aLocale
;
1170 aLocale
.Language
= "bn";
1171 aLocale
.Country
= "IN";
1174 static constexpr OUString aTest
= u
"\u09AC\u09CD\u09AF"_ustr
; // BA HALANT LA
1178 nPos
= m_xBreak
->nextCharacters(aTest
, 0, aLocale
,
1179 i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1180 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest
.getLength(), nPos
);
1181 nPos
= m_xBreak
->previousCharacters(aTest
, aTest
.getLength(), aLocale
,
1182 i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1183 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32
>(0), nPos
);
1187 static constexpr OUString aTest
= u
"\u09B9\u09CD\u09A3\u09BF"_ustr
;
1188 // HA HALANT NA VOWELSIGNI
1192 nPos
= m_xBreak
->nextCharacters(aTest
, 0, aLocale
,
1193 i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1194 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest
.getLength(), nPos
);
1195 nPos
= m_xBreak
->previousCharacters(aTest
, aTest
.getLength(), aLocale
,
1196 i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1197 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32
>(0), nPos
);
1201 static constexpr OUString aTest
= u
"\u09A4\u09CD\u09AE\u09CD\u09AF"_ustr
;
1202 // TA HALANT MA HALANT YA
1206 nPos
= m_xBreak
->nextCharacters(aTest
, 0, aLocale
,
1207 i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1208 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest
.getLength(), nPos
);
1209 nPos
= m_xBreak
->previousCharacters(aTest
, aTest
.getLength(), aLocale
,
1210 i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1211 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32
>(0), nPos
);
1214 aLocale
.Language
= "ta";
1215 aLocale
.Country
= "IN";
1218 static constexpr OUString aTest
= u
"\u0B9A\u0BBF\u0BA4\u0BCD\u0BA4\u0BBF\u0BB0\u0BC8"_ustr
; // CA VOWELSIGNI TA VIRAMA TA VOWELSIGNI RA VOWELSIGNAI
1223 nPos
= m_xBreak
->nextCharacters(aTest
, 0, aLocale
, i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1224 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32
>(2), nPos
);
1225 nPos
= m_xBreak
->nextCharacters(aTest
, nPos
, aLocale
, i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1226 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32
>(4), nPos
);
1227 nPos
= m_xBreak
->nextCharacters(aTest
, nPos
, aLocale
, i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1228 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32
>(6), nPos
);
1229 nPos
= m_xBreak
->nextCharacters(aTest
, nPos
, aLocale
, i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1230 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest
.getLength(), nPos
);
1231 nPos
= m_xBreak
->previousCharacters(aTest
, aTest
.getLength(), aLocale
,
1232 i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1233 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32
>(6), nPos
);
1234 nPos
= m_xBreak
->previousCharacters(aTest
, nPos
, aLocale
, i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1235 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32
>(4), nPos
);
1236 nPos
= m_xBreak
->previousCharacters(aTest
, nPos
, aLocale
, i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1237 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32
>(2), nPos
);
1238 nPos
= m_xBreak
->previousCharacters(aTest
, nPos
, aLocale
, i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1239 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32
>(0), nPos
);
1243 static constexpr OUString aTest
= u
"\u0B95\u0BC1"_ustr
; // KA VOWELSIGNU
1248 nPos
= m_xBreak
->nextCharacters(aTest
, 0, aLocale
,
1249 i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1250 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest
.getLength(), nPos
);
1251 nPos
= m_xBreak
->previousCharacters(aTest
, aTest
.getLength(), aLocale
,
1252 i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1253 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32
>(0), nPos
);
1257 static constexpr OUString aTest
=
1258 u
"\u0B9A\u0BBF\u0BA4\u0BCD\u0BA4\u0BBF\u0BB0\u0BC8"_ustr
;
1259 // CA VOWELSIGNI TA VIRAMA TA VOWELSIGNI RA VOWELSIGNAI
1264 for (sal_Int32 i
= 0; i
< 4; ++i
)
1266 sal_Int32 nOldPos
= nPos
;
1267 nPos
= m_xBreak
->nextCharacters(aTest
, nPos
, aLocale
,
1268 i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1269 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos
+2, nPos
);
1272 for (sal_Int32 i
= 0; i
< 4; ++i
)
1274 sal_Int32 nOldPos
= nPos
;
1275 nPos
= m_xBreak
->previousCharacters(aTest
, nPos
, aLocale
,
1276 i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1277 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos
-2, nPos
);
1282 static constexpr OUString aText
= u
"\u05D0\u05B8"_ustr
; // ALEF QAMATS
1284 sal_Int32 nGraphemeCount
= 0;
1286 sal_Int32 nCurPos
= 0;
1287 while (nCurPos
< aText
.getLength())
1289 sal_Int32 nCount2
= 1;
1290 nCurPos
= m_xBreak
->nextCharacters(aText
, nCurPos
, lang::Locale(),
1291 i18n::CharacterIteratorMode::SKIPCELL
, nCount2
, nCount2
);
1295 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be considered 1 grapheme", static_cast<sal_Int32
>(1), nGraphemeCount
);
1298 aLocale
.Language
= "hi";
1299 aLocale
.Country
= "IN";
1302 static constexpr OUString aTest
= u
"\u0936\u0940"_ustr
; // SHA VOWELSIGNII
1307 nPos
= m_xBreak
->nextCharacters(aTest
, 0, aLocale
,
1308 i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1309 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest
.getLength(), nPos
);
1310 nPos
= m_xBreak
->previousCharacters(aTest
, aTest
.getLength(), aLocale
,
1311 i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1312 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32
>(0), nPos
);
1315 // tdf#49885: Replace custom Thai implementation with ICU
1317 aLocale
.Language
= "th";
1318 aLocale
.Country
= "TH";
1320 static constexpr OUString aTest
= u
"กำ"_ustr
;
1322 CPPUNIT_ASSERT_EQUAL(sal_Int32
{ 2 }, aTest
.getLength());
1324 sal_Int32 nDone
= 0;
1327 nPos
= m_xBreak
->nextCharacters(aTest
, 0, aLocale
, i18n::CharacterIteratorMode::SKIPCELL
, 1,
1329 CPPUNIT_ASSERT_EQUAL(aTest
.getLength(), nPos
);
1331 nPos
= m_xBreak
->previousCharacters(aTest
, aTest
.getLength(), aLocale
,
1332 i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1333 CPPUNIT_ASSERT_EQUAL(sal_Int32
{ 0 }, nPos
);
1336 // Korean may also use grapheme clusters for character composition
1338 aLocale
.Language
= "ko";
1339 aLocale
.Country
= "KR";
1341 static constexpr OUString aTest
= u
"각"_ustr
;
1343 CPPUNIT_ASSERT_EQUAL(sal_Int32
{ 3 }, aTest
.getLength());
1345 sal_Int32 nDone
= 0;
1348 nPos
= m_xBreak
->nextCharacters(aTest
, 0, aLocale
, i18n::CharacterIteratorMode::SKIPCELL
, 1,
1350 CPPUNIT_ASSERT_EQUAL(aTest
.getLength(), nPos
);
1352 nPos
= m_xBreak
->previousCharacters(aTest
, aTest
.getLength(), aLocale
,
1353 i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1354 CPPUNIT_ASSERT_EQUAL(sal_Int32
{ 0 }, nPos
);
1358 //A test to ensure that certain ranges and codepoints that are categorized as
1359 //weak remain as weak, so that existing docs that depend on this don't silently
1360 //change font for those weak chars
//
// Every UTF-16 code unit of aWeaks is passed to getScriptType() and must be
// reported as i18n::ScriptType::WEAK. The trailing comments on the literal
// pieces name the Unicode block each pair of code points is taken from.
1361 void TestBreakIterator::testWeak()
// NOTE(review): aLocale is filled in but not referenced by any statement visible
// in this function - confirm whether it is still needed.
1363 lang::Locale aLocale
;
1364 aLocale
.Language
= "en";
1365 aLocale
.Country
= "US";
1368 static constexpr OUString aWeaks
=
1371 "\u0300\u036F" //Combining Diacritical Marks
1372 "\u1AB0\u1AFF" //Combining Diacritical Marks Extended
1373 "\u1DC0\u1DFF" //Combining Diacritical Marks Supplement
1374 "\u20D0\u20FF" //Combining Diacritical Marks for Symbols
1375 "\u2150\u215F" //Number Forms, fractions
1376 "\u2160\u2180" //Number Forms, roman numerals
1377 "\u2200\u22FF" //Mathematical Operators
1378 "\u27C0\u27EF" //Miscellaneous Mathematical Symbols-A
1379 "\u2980\u29FF" //Miscellaneous Mathematical Symbols-B
1380 "\u2A00\u2AFF" //Supplemental Mathematical Operators
1381 "\u2100\u214F" //Letterlike Symbols
1382 "\u2308\u230B" //Miscellaneous technical
1383 "\u25A0\u25FF" //Geometric Shapes
1384 "\u2B30\u2B4C"_ustr
; //Miscellaneous Symbols and Arrows
// Check the script type at each index of the sample string.
1386 for (sal_Int32 i
= 0; i
< aWeaks
.getLength(); ++i
)
1388 sal_Int16 nScript
= m_xBreak
->getScriptType(aWeaks
, i
);
// The failure message embeds the offending code unit in hexadecimal.
1391 OString::number(static_cast<sal_Int32
>(std::u16string_view(aWeaks
)[i
]), 16) +
1392 " should have been weak";
1393 CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg
.getStr(),
1394 i18n::ScriptType::WEAK
, nScript
);
1399 //A test to ensure that certain ranges and codepoints that are categorized as
1400 //asian remain as asian, so that existing docs that depend on this don't silently
1401 //change font for those asian chars.
1402 //See https://bugs.libreoffice.org/show_bug.cgi?id=38095
//
// Every UTF-16 code unit of aAsians is passed to getScriptType() and must be
// reported as i18n::ScriptType::ASIAN.
1403 void TestBreakIterator::testAsian()
// NOTE(review): aLocale is filled in but not referenced by any statement visible
// in this function - confirm whether it is still needed.
1405 lang::Locale aLocale
;
1406 aLocale
.Language
= "en";
1407 aLocale
.Country
= "US";
1410 static constexpr OUString aAsians
=
1411 //some typical CJK chars
1413 //The full HalfWidth and FullWidth block has historically been
1414 //designated as taking the CJK font :-(
1415 //HalfWidth and FullWidth forms of ASCII 0-9, categorized under
1416 //UAX24 as "Common" i.e. by that logic WEAK
1418 //HalfWidth and FullWidth forms of ASCII A-z, categorized under
1419 //UAX24 as "Latin", i.e. by that logic LATIN
1420 "\uFF21\uFF5A"_ustr
;
// Check the script type at each index of the sample string.
1422 for (sal_Int32 i
= 0; i
< aAsians
.getLength(); ++i
)
1424 sal_Int16 nScript
= m_xBreak
->getScriptType(aAsians
, i
);
// The failure message embeds the offending code unit in hexadecimal.
1427 OString::number(static_cast<sal_Int32
>(std::u16string_view(aAsians
)[i
]), 16) +
1428 " should have been asian";
1429 CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg
.getStr(),
1430 i18n::ScriptType::ASIAN
, nScript
);
1435 //A test to ensure that our Lao word boundary detection is useful
//
// Looks up two consecutive DICTIONARY_WORD boundaries in a 12-code-unit Lao
// string under the lo-LA locale: first at offset 0, then at the end of the
// first word.
1436 void TestBreakIterator::testLao()
1438 lang::Locale aLocale
;
1439 aLocale
.Language
= "lo";
1440 aLocale
.Country
= "LA";
1442 static constexpr OUString aTest
= u
"\u0e8d\u0eb4\u0e99\u0e94\u0eb5\u0e95\u0ec9\u0ead\u0e99\u0eae\u0eb1\u0e9a"_ustr
;
1443 i18n::Boundary aBounds
= m_xBreak
->getWordBoundary(aTest
, 0, aLocale
,
1444 i18n::WordType::DICTIONARY_WORD
, true);
// The first word is expected to span [0, 5).
1446 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1447 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.endPos
);
// Continue from the end of the first word.
1449 aBounds
= m_xBreak
->getWordBoundary(aTest
, aBounds
.endPos
, aLocale
,
1450 i18n::WordType::DICTIONARY_WORD
, true);
1452 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.startPos
);
// The expected end of the second word depends on the ICU major version: 9 before
// ICU 70, otherwise 12 (the end of the string).
// NOTE(review): the directives that separate and close the two alternative
// endPos assertions are not visible in this excerpt - confirm they are present.
1453 #if (U_ICU_VERSION_MAJOR_NUM < 70)
1454 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds
.endPos
);
1457 // In ICU 70/71 for yet unknown reason the word boundary 9 is not detected and
1458 // instead the length 12 is returned as endpos.
1460 // icu_70::RuleBasedBreakIterator::BreakCache::next()
1461 // icu_70::RuleBasedBreakIterator::BreakCache::following()
1462 // icu_70::RuleBasedBreakIterator::following()
1463 // i18npool::BreakIterator_Unicode::getWordBoundary()
1464 CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds
.endPos
);
1468 //A test to ensure that our Thai word boundary detection is useful
//
// Three checks under the th-TH locale:
//  - a six-code-unit Thai string is reported as one dictionary word;
//  - word starts found by nextWord() are matched against previousWord();
//  - a surrogate pair is stepped over as a whole by character iteration.
1469 void TestBreakIterator::testThai()
1471 lang::Locale aLocale
;
1472 aLocale
.Language
= "th";
1473 aLocale
.Country
= "TH";
1475 //See http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html
1477 static constexpr OUString aTest
= u
"\u0E01\u0E38\u0E2B\u0E25\u0E32\u0E1A"_ustr
;
1478 i18n::Boundary aBounds
= m_xBreak
->getWordBoundary(aTest
, 0, aLocale
,
1479 i18n::WordType::DICTIONARY_WORD
, true);
// The word found at offset 0 must cover the whole string.
1480 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full word",
1481 sal_Int32(0), aBounds
.startPos
);
1482 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full word",
1483 aTest
.getLength(), aBounds
.endPos
);
1486 //See https://bz.apache.org/ooo/show_bug.cgi?id=29548
1487 //make sure forwards and back are consistent
1489 static constexpr OUString aTest
=
1490 u
"\u0E2D\u0E38\u0E17\u0E22\u0E32\u0E19\u0E41"
1491 "\u0E2B\u0E48\u0E07\u0E0A\u0E32\u0E15\u0E34"
1492 "\u0E19\u0E49\u0E33\u0E2B\u0E19\u0E32\u0E27"
1493 "\u0E2D\u0E38\u0E17\u0E22\u0E32\u0E19\u0E41"
1494 "\u0E2B\u0E48\u0E07\u0E0A\u0E32\u0E15\u0E34"
1495 "\u0E19\u0E49\u0E33\u0E2B\u0E19\u0E32\u0E27"_ustr
;
// Forward pass: starting from -1, push the start of each word returned by
// nextWord() until the position reaches the end of the string.
// NOTE(review): the loop headers and any pop of aPositions are not visible in
// this excerpt - confirm the exact loop structure against the full file.
1497 std::stack
<sal_Int32
> aPositions
;
1498 sal_Int32 nPos
= -1;
1501 nPos
= m_xBreak
->nextWord(aTest
, nPos
, aLocale
, i18n::WordType::ANYWORD_IGNOREWHITESPACES
).startPos
;
1502 aPositions
.push(nPos
);
1504 while (nPos
< aTest
.getLength());
// Backward pass: restart at the end of the string and compare each
// previousWord() start against the top of the recorded stack.
1505 nPos
= aTest
.getLength();
1506 CPPUNIT_ASSERT(!aPositions
.empty());
1510 CPPUNIT_ASSERT(!aPositions
.empty());
1511 nPos
= m_xBreak
->previousWord(aTest
, nPos
, aLocale
, i18n::WordType::ANYWORD_IGNOREWHITESPACES
).startPos
;
1512 CPPUNIT_ASSERT_EQUAL(aPositions
.top(), nPos
);
// U+10000 occupies two UTF-16 code units; both SKIPCELL and SKIPCHARACTER must
// move across the whole pair in either direction.
1520 static constexpr OUString aTest
= u
"\U00010000"_ustr
;
1525 nPos
= m_xBreak
->nextCharacters(aTest
, 0, aLocale
,
1526 i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1527 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", aTest
.getLength(), nPos
);
1528 nPos
= m_xBreak
->previousCharacters(aTest
, aTest
.getLength(), aLocale
,
1529 i18n::CharacterIteratorMode::SKIPCELL
, 1, nDone
);
1530 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32
>(0), nPos
);
1532 nPos
= m_xBreak
->nextCharacters(aTest
, 0, aLocale
,
1533 i18n::CharacterIteratorMode::SKIPCHARACTER
, 1, nDone
);
1534 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", aTest
.getLength(), nPos
);
1535 nPos
= m_xBreak
->previousCharacters(aTest
, aTest
.getLength(), aLocale
,
1536 i18n::CharacterIteratorMode::SKIPCHARACTER
, 1, nDone
);
1537 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32
>(0), nPos
);
1542 void TestBreakIterator::testNorthernThai()
1544 lang::Locale aLocale
;
1545 aLocale
.Language
= "nod";
1546 aLocale
.Country
= "TH";
1548 const sal_Unicode NORTHERN_THAI1
[] = { 0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
1549 OUString
aTest(NORTHERN_THAI1
, SAL_N_ELEMENTS(NORTHERN_THAI1
));
1550 i18n::Boundary aBounds
= m_xBreak
->getWordBoundary(aTest
, 0, aLocale
,
1551 i18n::WordType::DICTIONARY_WORD
, true);
1552 CPPUNIT_ASSERT_MESSAGE("Should skip full word",
1553 aBounds
.startPos
== 0 && aBounds
.endPos
== aTest
.getLength());
1556 // Not sure if any version earlier than 49 did have Khmer word boundary
1557 // dictionaries, 4.6 does not.
1559 // As of icu 54, word boundary detection for Khmer is still considered
1560 // insufficient, so icu khmer stuff is disabled
1562 //A test to ensure that our khmer word boundary detection is useful
1563 //https://bugs.libreoffice.org/show_bug.cgi?id=52020
1564 void TestBreakIterator::testKhmer()
1566 lang::Locale aLocale
;
1567 aLocale
.Language
= "km";
1568 aLocale
.Country
= "KH";
1570 const sal_Unicode KHMER
[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 };
1572 OUString
aTest(KHMER
, SAL_N_ELEMENTS(KHMER
));
1573 i18n::Boundary aBounds
= m_xBreak
->getWordBoundary(aTest
, 0, aLocale
,
1574 i18n::WordType::DICTIONARY_WORD
, true);
1576 CPPUNIT_ASSERT(aBounds
.startPos
== 0 && aBounds
.endPos
== 3);
1578 aBounds
= m_xBreak
->getWordBoundary(aTest
, aBounds
.endPos
, aLocale
,
1579 i18n::WordType::DICTIONARY_WORD
, true);
1581 CPPUNIT_ASSERT(aBounds
.startPos
== 3 && aBounds
.endPos
== 5);
1585 void TestBreakIterator::doTestJapanese(uno::Reference
< i18n::XBreakIterator
> const &xBreak
)
1587 lang::Locale aLocale
;
1588 aLocale
.Language
= "ja";
1589 aLocale
.Country
= "JP";
1590 i18n::Boundary aBounds
;
1593 static constexpr OUString aTest
= u
"シャットダウン"_ustr
;
1595 aBounds
= xBreak
->getWordBoundary(aTest
, 5, aLocale
,
1596 i18n::WordType::DICTIONARY_WORD
, true);
1598 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds
.startPos
);
1599 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds
.endPos
);
1603 static constexpr OUString aTest
= u
"\u9EBB\u306E\u8449\u9EBB\u306E\u8449"_ustr
;
1605 aBounds
= xBreak
->getWordBoundary(aTest
, 1, aLocale
,
1606 i18n::WordType::DICTIONARY_WORD
, true);
1608 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1609 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds
.endPos
);
1611 aBounds
= xBreak
->getWordBoundary(aTest
, 5, aLocale
,
1612 i18n::WordType::DICTIONARY_WORD
, true);
1614 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds
.startPos
);
1615 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds
.endPos
);
1619 // tdf#162912: Double-clicking should only select one Basic identifier
1620 static constexpr OUString aTest
= u
"ThisComponent.CurrentSelection"_ustr
;
1622 aBounds
= xBreak
->getWordBoundary(aTest
, 5, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
1623 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1624 CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds
.endPos
);
1626 aBounds
= xBreak
->getWordBoundary(aTest
, 5, aLocale
,
1627 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, true);
1628 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1629 CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds
.endPos
);
1631 aBounds
= xBreak
->getWordBoundary(aTest
, 15, aLocale
,
1632 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, true);
1633 CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds
.startPos
);
1634 CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds
.endPos
);
1638 void TestBreakIterator::testJapanese()
1640 doTestJapanese(m_xBreak
);
1642 // fdo#78479 - test second / cached instantiation of xdictionary
1643 uno::Reference
< i18n::XBreakIterator
> xTmpBreak(m_xSFactory
->createInstance(
1644 u
"com.sun.star.i18n.BreakIterator"_ustr
), uno::UNO_QUERY_THROW
);
1646 doTestJapanese(xTmpBreak
);
1649 void TestBreakIterator::testChinese()
1651 lang::Locale aLocale
;
1652 aLocale
.Language
= "zh";
1653 aLocale
.Country
= "CN";
1656 static constexpr OUStringLiteral aTest
= u
"\u6A35\u6A30\u69FE\u8919\U00029EDB";
1658 i18n::Boundary aBounds
= m_xBreak
->getWordBoundary(aTest
, 4, aLocale
,
1659 i18n::WordType::DICTIONARY_WORD
, true);
1660 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds
.startPos
);
1661 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds
.endPos
);
// For every locale in the list below, checks the DICTIONARY_WORD boundaries in
// "Arbeits- -nehmer": the word at offset 3 must be [0, 8), i.e. it includes the
// trailing dash, and the word at offset 13 must be [9, 16), i.e. it includes the
// leading dash.
1665 void TestBreakIterator::testDictWordPrepostDash()
// Locales under test.
1667 std::vector
<lang::Locale
> aLocale
{ { "de", "DE", "" },
1668 { "nds", "DE", "" },
1671 { "da", "DK", "" } };
1673 for (const auto& rLocale
: aLocale
)
1675 auto aTest
= u
"Arbeits- -nehmer"_ustr
;
// Word with a trailing dash.
1677 i18n::Boundary aBounds
1678 = m_xBreak
->getWordBoundary(aTest
, 3, rLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1679 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1680 CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds
.endPos
);
// Word with a leading dash.
1683 = m_xBreak
->getWordBoundary(aTest
, 13, rLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1684 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds
.startPos
);
1685 CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds
.endPos
);
1689 void TestBreakIterator::testDictWordAbbreviation()
1691 std::vector
<lang::Locale
> aLocale
{
1692 { "en", "US", "" }, // dict_word locale
1693 { "de", "DE", "" } // dict_word_prepostdash locale
1696 for (const auto& rLocale
: aLocale
)
1698 auto aTest
= u
"Examples: e.g. i.e. etc. and such"_ustr
;
1700 i18n::Boundary aBounds
1701 = m_xBreak
->getWordBoundary(aTest
, 3, rLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1702 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1703 CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds
.endPos
);
1706 = m_xBreak
->getWordBoundary(aTest
, 10, rLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1707 CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds
.startPos
);
1708 CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds
.endPos
);
1711 = m_xBreak
->getWordBoundary(aTest
, 15, rLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1712 CPPUNIT_ASSERT_EQUAL(sal_Int32(15), aBounds
.startPos
);
1713 CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds
.endPos
);
1716 = m_xBreak
->getWordBoundary(aTest
, 20, rLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1717 CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds
.startPos
);
1718 CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds
.endPos
);
1721 = m_xBreak
->getWordBoundary(aTest
, 26, rLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1722 CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds
.startPos
);
1723 CPPUNIT_ASSERT_EQUAL(sal_Int32(28), aBounds
.endPos
);
1726 = m_xBreak
->getWordBoundary(aTest
, 30, rLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1727 CPPUNIT_ASSERT_EQUAL(sal_Int32(29), aBounds
.startPos
);
1728 CPPUNIT_ASSERT_EQUAL(sal_Int32(33), aBounds
.endPos
);
// Verifies that getWordBoundary() treats a Hebrew word containing an
// intra-word geresh or gershayim mark as a single word under the he-IL locale.
//
// Six spellings are exercised. The three geresh variants are six UTF-16 units
// long and are queried at index 3; the three gershayim variants are four units
// long and are queried at index 2. Every spelling is checked with both
// DICTIONARY_WORD and ANYWORD_IGNOREWHITESPACES, and in every case the
// expected boundary covers the whole string (startPos 0, endPos == length).
1732 void TestBreakIterator::testHebrewGereshGershaim()
1734 // In Hebrew documents, there are multiple valid ways to represent the geresh and gershaim
1735 // intra-word punctuation marks. This test exhaustively exercises them.
1737 // See the following bugs:
1738 // i#51661: Add quotation mark as middle letter for Hebrew
1739 // tdf#46950: Spell-checking breaks Hebrew words at intra-word single and double quotes
1741 lang::Locale aLocale
;
1743 aLocale
.Language
= "he";
1744 aLocale
.Country
= "IL";
1746 // Unicode U+05F3 HEBREW PUNCTUATION GERESH
1748 auto aTest
= u
"ג׳ירפה"_ustr
;
1751 = m_xBreak
->getWordBoundary(aTest
, 3, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1752 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1753 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds
.endPos
);
1755 aBounds
= m_xBreak
->getWordBoundary(aTest
, 3, aLocale
,
1756 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, false);
1757 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1758 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds
.endPos
);
1761 // Apostrophe as geresh
1763 auto aTest
= u
"ג'ירפה"_ustr
;
1766 = m_xBreak
->getWordBoundary(aTest
, 3, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1767 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1768 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds
.endPos
);
1770 aBounds
= m_xBreak
->getWordBoundary(aTest
, 3, aLocale
,
1771 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, false);
1772 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1773 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds
.endPos
);
1776 // Right single quote as geresh
1778 auto aTest
= u
"ג’ירפה"_ustr
;
1781 = m_xBreak
->getWordBoundary(aTest
, 3, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1782 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1783 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds
.endPos
);
1785 aBounds
= m_xBreak
->getWordBoundary(aTest
, 3, aLocale
,
1786 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, false);
1787 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1788 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds
.endPos
);
1791 // Unicode U+05F4 HEBREW PUNCTUATION GERSHAYIM
1793 auto aTest
= u
"דו״ח"_ustr
;
1796 = m_xBreak
->getWordBoundary(aTest
, 2, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1797 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1798 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds
.endPos
);
1800 aBounds
= m_xBreak
->getWordBoundary(aTest
, 2, aLocale
,
1801 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, false);
1802 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1803 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds
.endPos
);
1806 // Double quote as gershayim
1808 auto aTest
= u
"דו\"ח"_ustr
;
1811 = m_xBreak
->getWordBoundary(aTest
, 2, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1812 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1813 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds
.endPos
);
1815 aBounds
= m_xBreak
->getWordBoundary(aTest
, 2, aLocale
,
1816 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, false);
1817 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1818 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds
.endPos
);
1821 // Right double quote as gershayim
1823 auto aTest
= u
"דו”ח"_ustr
;
1826 = m_xBreak
->getWordBoundary(aTest
, 2, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1827 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1828 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds
.endPos
);
1830 aBounds
= m_xBreak
->getWordBoundary(aTest
, 2, aLocale
,
1831 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, false);
1832 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1833 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds
.endPos
);
// Characteristic test for DICTIONARY_WORD boundaries around a UTF-16 surrogate
// pair, using the ja-JP locale.
//
// In the sample string the ideograph lies outside the BMP and therefore
// occupies two UTF-16 units (indices 2 and 3). The three queries at indices
// 1, 2 and 5 are expected to return startPos/endPos of 0/1 (first "X"),
// 2/4 (the ideograph, both units together) and 5/6 (last "X").
1837 void TestBreakIterator::testLegacySurrogatePairs()
1839 lang::Locale aLocale
;
1841 aLocale
.Language
= "ja";
1842 aLocale
.Country
= "JP";
1844 // i#75632: [surrogate pair] Japanese word break does not work properly for surrogate pairs.
1845 // and many others to address bugs: i#75631 i#75633 i#75412 etc.
1847 // BreakIterator supports surrogate pairs (UTF-16). This is a simple characteristic test.
1849 static constexpr OUString aTest
= u
"X 𠮟 X"_ustr
;
1852 = m_xBreak
->getWordBoundary(aTest
, 1, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1853 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1854 CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds
.endPos
);
1857 = m_xBreak
->getWordBoundary(aTest
, 2, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1858 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds
.startPos
);
1859 CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds
.endPos
);
1862 = m_xBreak
->getWordBoundary(aTest
, 5, aLocale
, i18n::WordType::DICTIONARY_WORD
, false);
1863 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.startPos
);
1864 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds
.endPos
);
// Characteristic tests for word counting with WordType::WORD_COUNT.
//
// fnCountWords(aStr, aLocale) is a local lambda returning an int. It starts at
// position 0, consults isBeginWord(), then calls nextWord() and advances to
// the returned endPos; an empty boundary (endPos == startPos) is tested for,
// and a CPPUNIT assertion fails the test once the iteration guard reaches 100.
// NOTE(review): confirm the counter update and the loop/exit statements
// against the complete lambda body before relying on this description.
//
// Expected counts: 24 for the en-US sample from i#80815, 8 for the mixed
// Latin/Japanese ja-JP sample, and between 1 and 3 for the ko-KR samples.
1868 void TestBreakIterator::testWordCount()
1870 auto fnCountWords
= [&](const OUString
& aStr
, const lang::Locale
& aLocale
) -> int
1873 sal_Int32 nNextPos
= 0;
1876 if (m_xBreak
->isBeginWord(aStr
, nNextPos
, aLocale
, i18n::WordType::WORD_COUNT
))
1883 CPPUNIT_ASSERT_MESSAGE("Tripped infinite loop check", ++nIterGuard
< 100);
1885 auto aBounds
= m_xBreak
->nextWord(aStr
, nNextPos
, aLocale
, i18n::WordType::WORD_COUNT
);
1886 if (aBounds
.endPos
== aBounds
.startPos
)
1891 nNextPos
= aBounds
.endPos
;
1898 // i#80815: "Word count differs from MS Word"
1899 // This is a characteristic test for word count using test data from the linked bug.
1901 lang::Locale aLocale
;
1902 aLocale
.Language
= "en";
1903 aLocale
.Country
= "US";
1905 const OUString aStr
= u
""
1906 "test data for word count issue #80815\n"
1908 "archipi\\\'elago\n"
1915 "money+opportunity\n"
1925 "aaaaaaa;aaaaaaa\n"_ustr
;
1927 CPPUNIT_ASSERT_EQUAL(24, fnCountWords(aStr
, aLocale
));
1930 // Test that the switch to upstream ICU for CJ word boundary analysis doesn't change word count.
1932 lang::Locale aLocale
;
1933 aLocale
.Language
= "ja";
1934 aLocale
.Country
= "JP";
1936 const OUString aStr
= u
"Wordの様にワード数をするのにTest\n植松町"_ustr
;
1938 CPPUNIT_ASSERT_EQUAL(8, fnCountWords(aStr
, aLocale
));
1941 // tdf#150621 Korean words should be counted individually, rather than by syllable.
1943 // Per i#80815, the intention for the word count feature is to emulate the behavior of MS Word.
1945 lang::Locale aLocale
;
1946 aLocale
.Language
= "ko";
1947 aLocale
.Country
= "KR";
1949 // Basic case: Korean words are counted as space-delimited. In particular, grammatical
1950 // particles are treated as part of the previous word.
1951 CPPUNIT_ASSERT_EQUAL(3, fnCountWords(u
"저는 영화를 봤어요"_ustr
, aLocale
));
1953 // Mixed script: Korean is mostly written in hangul, but hanja are still used in certain
1954 // situations (e.g. abbreviations in newspaper articles). For Chinese and Japanese, such
1955 // ideographs would be counted individually as words. In Korean, however, they are treated
1956 // no differently than hangul characters.
1957 CPPUNIT_ASSERT_EQUAL(1, fnCountWords(u
"불렀다...與"_ustr
, aLocale
));
1958 CPPUNIT_ASSERT_EQUAL(2, fnCountWords(u
"불렀다 ...與"_ustr
, aLocale
));
1959 CPPUNIT_ASSERT_EQUAL(3, fnCountWords(u
"불렀다 ... 與"_ustr
, aLocale
));
1960 CPPUNIT_ASSERT_EQUAL(1, fnCountWords(u
"尹탄핵"_ustr
, aLocale
));
1961 CPPUNIT_ASSERT_EQUAL(2, fnCountWords(u
"尹 탄핵"_ustr
, aLocale
));
// Exercises getWordBoundary() on text written without spaces for three
// locales: th-TH, ja-JP and zh-CN.
//
// For each locale the same set of positions is queried three times, once per
// word type (DICTIONARY_WORD, ANY_WORD, ANYWORD_IGNOREWHITESPACES), and all
// three word types are expected to report identical boundaries. Every query
// in this test passes true as the final getWordBoundary() argument.
1965 void TestBreakIterator::testDictionaryIteratorLanguages()
// Thai: the 14-unit string is expected to split into 0-2, 2-5, 5-9 and 9-14.
1969 lang::Locale aLocale
{ "th", "TH", "" };
1971 const OUString aStr
= u
"รอนานหรือเปล่า"_ustr
;
1973 i18n::Boundary aBounds
;
1976 = m_xBreak
->getWordBoundary(aStr
, 1, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
1977 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1978 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds
.endPos
);
1981 = m_xBreak
->getWordBoundary(aStr
, 3, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
1982 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds
.startPos
);
1983 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.endPos
);
1986 = m_xBreak
->getWordBoundary(aStr
, 6, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
1987 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.startPos
);
1988 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds
.endPos
);
1991 = m_xBreak
->getWordBoundary(aStr
, 10, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
1992 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds
.startPos
);
1993 CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds
.endPos
);
1995 aBounds
= m_xBreak
->getWordBoundary(aStr
, 1, aLocale
, i18n::WordType::ANY_WORD
, true);
1996 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
1997 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds
.endPos
);
1999 aBounds
= m_xBreak
->getWordBoundary(aStr
, 3, aLocale
, i18n::WordType::ANY_WORD
, true);
2000 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds
.startPos
);
2001 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.endPos
);
2003 aBounds
= m_xBreak
->getWordBoundary(aStr
, 6, aLocale
, i18n::WordType::ANY_WORD
, true);
2004 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.startPos
);
2005 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds
.endPos
);
2007 aBounds
= m_xBreak
->getWordBoundary(aStr
, 10, aLocale
, i18n::WordType::ANY_WORD
, true);
2008 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds
.startPos
);
2009 CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds
.endPos
);
2011 aBounds
= m_xBreak
->getWordBoundary(aStr
, 1, aLocale
,
2012 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, true);
2013 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
2014 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds
.endPos
);
2016 aBounds
= m_xBreak
->getWordBoundary(aStr
, 3, aLocale
,
2017 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, true);
2018 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds
.startPos
);
2019 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.endPos
);
2021 aBounds
= m_xBreak
->getWordBoundary(aStr
, 6, aLocale
,
2022 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, true);
2023 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.startPos
);
2024 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds
.endPos
);
2026 aBounds
= m_xBreak
->getWordBoundary(aStr
, 10, aLocale
,
2027 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, true);
2028 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds
.startPos
);
2029 CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds
.endPos
);
// Japanese: the 11-unit string is expected to split into 0-2, 2-3, 3-5, 5-7,
// 7-8 and 8-11.
2034 lang::Locale aLocale
{ "ja", "JP", "" };
2036 const OUString aStr
= u
"通産省工業技術院北海道"_ustr
;
2038 i18n::Boundary aBounds
;
2041 = m_xBreak
->getWordBoundary(aStr
, 1, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
2042 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
2043 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds
.endPos
);
2046 = m_xBreak
->getWordBoundary(aStr
, 2, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
2047 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds
.startPos
);
2048 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds
.endPos
);
2051 = m_xBreak
->getWordBoundary(aStr
, 4, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
2052 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds
.startPos
);
2053 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.endPos
);
2056 = m_xBreak
->getWordBoundary(aStr
, 6, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
2057 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.startPos
);
2058 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds
.endPos
);
2061 = m_xBreak
->getWordBoundary(aStr
, 7, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
2062 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds
.startPos
);
2063 CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds
.endPos
);
2066 = m_xBreak
->getWordBoundary(aStr
, 9, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
2067 CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds
.startPos
);
2068 CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds
.endPos
);
2070 aBounds
= m_xBreak
->getWordBoundary(aStr
, 1, aLocale
, i18n::WordType::ANY_WORD
, true);
2071 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
2072 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds
.endPos
);
2074 aBounds
= m_xBreak
->getWordBoundary(aStr
, 2, aLocale
, i18n::WordType::ANY_WORD
, true);
2075 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds
.startPos
);
2076 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds
.endPos
);
2078 aBounds
= m_xBreak
->getWordBoundary(aStr
, 4, aLocale
, i18n::WordType::ANY_WORD
, true);
2079 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds
.startPos
);
2080 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.endPos
);
2082 aBounds
= m_xBreak
->getWordBoundary(aStr
, 6, aLocale
, i18n::WordType::ANY_WORD
, true);
2083 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.startPos
);
2084 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds
.endPos
);
2086 aBounds
= m_xBreak
->getWordBoundary(aStr
, 7, aLocale
, i18n::WordType::ANY_WORD
, true);
2087 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds
.startPos
);
2088 CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds
.endPos
);
2090 aBounds
= m_xBreak
->getWordBoundary(aStr
, 9, aLocale
, i18n::WordType::ANY_WORD
, true);
2091 CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds
.startPos
);
2092 CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds
.endPos
);
2094 aBounds
= m_xBreak
->getWordBoundary(aStr
, 1, aLocale
,
2095 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, true);
2096 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
2097 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds
.endPos
);
2099 aBounds
= m_xBreak
->getWordBoundary(aStr
, 2, aLocale
,
2100 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, true);
2101 CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds
.startPos
);
2102 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds
.endPos
);
2104 aBounds
= m_xBreak
->getWordBoundary(aStr
, 4, aLocale
,
2105 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, true);
2106 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds
.startPos
);
2107 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.endPos
);
2109 aBounds
= m_xBreak
->getWordBoundary(aStr
, 6, aLocale
,
2110 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, true);
2111 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.startPos
);
2112 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds
.endPos
);
2114 aBounds
= m_xBreak
->getWordBoundary(aStr
, 7, aLocale
,
2115 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, true);
2116 CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds
.startPos
);
2117 CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds
.endPos
);
2119 aBounds
= m_xBreak
->getWordBoundary(aStr
, 9, aLocale
,
2120 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, true);
2121 CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds
.startPos
);
2122 CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds
.endPos
);
// Chinese: the 6-unit string is expected to split into 0-1, 1-3, 3-5 and 5-6.
2127 lang::Locale aLocale
{ "zh", "CN", "" };
2129 const OUString aStr
= u
"很高兴认识你"_ustr
;
2131 i18n::Boundary aBounds
;
2134 = m_xBreak
->getWordBoundary(aStr
, 0, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
2135 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
2136 CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds
.endPos
);
2139 = m_xBreak
->getWordBoundary(aStr
, 1, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
2140 CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds
.startPos
);
2141 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds
.endPos
);
2144 = m_xBreak
->getWordBoundary(aStr
, 3, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
2145 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds
.startPos
);
2146 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.endPos
);
2149 = m_xBreak
->getWordBoundary(aStr
, 5, aLocale
, i18n::WordType::DICTIONARY_WORD
, true);
2150 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.startPos
);
2151 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds
.endPos
);
2153 aBounds
= m_xBreak
->getWordBoundary(aStr
, 0, aLocale
, i18n::WordType::ANY_WORD
, true);
2154 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
2155 CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds
.endPos
);
2157 aBounds
= m_xBreak
->getWordBoundary(aStr
, 1, aLocale
, i18n::WordType::ANY_WORD
, true);
2158 CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds
.startPos
);
2159 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds
.endPos
);
2161 aBounds
= m_xBreak
->getWordBoundary(aStr
, 3, aLocale
, i18n::WordType::ANY_WORD
, true);
2162 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds
.startPos
);
2163 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.endPos
);
2165 aBounds
= m_xBreak
->getWordBoundary(aStr
, 5, aLocale
, i18n::WordType::ANY_WORD
, true);
2166 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.startPos
);
2167 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds
.endPos
);
2169 aBounds
= m_xBreak
->getWordBoundary(aStr
, 0, aLocale
,
2170 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, true);
2171 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds
.startPos
);
2172 CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds
.endPos
);
2174 aBounds
= m_xBreak
->getWordBoundary(aStr
, 1, aLocale
,
2175 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, true);
2176 CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds
.startPos
);
2177 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds
.endPos
);
2179 aBounds
= m_xBreak
->getWordBoundary(aStr
, 3, aLocale
,
2180 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, true);
2181 CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds
.startPos
);
2182 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.endPos
);
2184 aBounds
= m_xBreak
->getWordBoundary(aStr
, 5, aLocale
,
2185 i18n::WordType::ANYWORD_IGNOREWHITESPACES
, true);
2186 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds
.startPos
);
2187 CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds
.endPos
);
// Fixture set-up: runs BootstrapFixtureBase::setUp() first, then asks
// m_xSFactory to create the "com.sun.star.i18n.BreakIterator" service and
// stores the result in m_xBreak, the reference every test in this class uses.
// NOTE(review): uno::UNO_QUERY_THROW presumably makes set() throw if the
// created instance cannot be queried for the target interface - confirm.
2191 void TestBreakIterator::setUp()
2193 BootstrapFixtureBase::setUp();
2194 m_xBreak
.set(m_xSFactory
->createInstance(u
"com.sun.star.i18n.BreakIterator"_ustr
), uno::UNO_QUERY_THROW
);
// Fixture tear-down: hands control to BootstrapFixtureBase::tearDown().
2197 void TestBreakIterator::tearDown()
2200 BootstrapFixtureBase::tearDown();
// Register the TestBreakIterator suite with CppUnit so the test runner can
// discover it.
2203 CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator
);
// Emit the CppUnit plug-in boilerplate for this test library.
2205 CPPUNIT_PLUGIN_IMPLEMENT();
2207 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */