Version 6.4.0.0.beta1, tag libreoffice-6.4.0.0.beta1
[LibreOffice.git] / i18npool / qa / cppunit / test_breakiterator.cxx
blob a1d423879109e42f420385a1ac02eb04f11808a4
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */
10 #include <com/sun/star/i18n/XBreakIterator.hpp>
11 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
12 #include <com/sun/star/i18n/ScriptType.hpp>
13 #include <com/sun/star/i18n/WordType.hpp>
14 #include <unotest/bootstrapfixturebase.hxx>
16 #include <unicode/uvernum.h>
18 #include <rtl/strbuf.hxx>
20 #include <string.h>
22 #include <stack>
24 using namespace ::com::sun::star;
26 class TestBreakIterator : public test::BootstrapFixtureBase
28 public:
29 virtual void setUp() override;
30 virtual void tearDown() override;
32 void testLineBreaking();
33 void testWordBoundaries();
34 void testGraphemeIteration();
35 void testWeak();
36 void testAsian();
37 void testThai();
38 #if (U_ICU_VERSION_MAJOR_NUM > 51)
39 void testLao();
40 #ifdef TODO
41 void testNorthernThai();
42 void testKhmer();
43 #endif
44 #endif
45 void testJapanese();
46 void testChinese();
48 CPPUNIT_TEST_SUITE(TestBreakIterator);
49 CPPUNIT_TEST(testLineBreaking);
50 CPPUNIT_TEST(testWordBoundaries);
51 CPPUNIT_TEST(testGraphemeIteration);
52 CPPUNIT_TEST(testWeak);
53 CPPUNIT_TEST(testAsian);
54 CPPUNIT_TEST(testThai);
55 #if (U_ICU_VERSION_MAJOR_NUM > 51)
56 CPPUNIT_TEST(testLao);
57 #ifdef TODO
58 CPPUNIT_TEST(testKhmer);
59 CPPUNIT_TEST(testNorthernThai);
60 #endif
61 #endif
62 CPPUNIT_TEST(testJapanese);
63 CPPUNIT_TEST(testChinese);
64 CPPUNIT_TEST_SUITE_END();
66 private:
67 uno::Reference<i18n::XBreakIterator> m_xBreak;
68 void doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak);
71 void TestBreakIterator::testLineBreaking()
73 i18n::LineBreakHyphenationOptions aHyphOptions;
74 i18n::LineBreakUserOptions aUserOptions;
75 lang::Locale aLocale;
77 //See https://bugs.libreoffice.org/show_bug.cgi?id=31271
79 OUString aTest("(some text here)");
81 aLocale.Language = "en";
82 aLocale.Country = "US";
85 //Here we want the line break to leave text here) on the next line
86 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions);
87 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(6), aResult.breakIndex);
91 //Here we want the line break to leave "here)" on the next line
92 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions);
93 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(11), aResult.breakIndex);
97 //See https://bugs.libreoffice.org/show_bug.cgi?id=49849
99 const sal_Unicode HEBREW1[] = { 0x05DE, 0x05D9, 0x05DC, 0x05D9, 0x5DD };
100 OUString aWord(HEBREW1, SAL_N_ELEMENTS(HEBREW1));
101 OUString aTest(aWord + " " + aWord);
103 aLocale.Language = "he";
104 aLocale.Country = "IL";
107 //Here we want the line break to happen at the whitespace
108 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions);
109 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", aWord.getLength()+1, aResult.breakIndex);
113 //See https://bz.apache.org/ooo/show_bug.cgi?id=17155
115 OUString const aTest("foo /bar/baz");
117 aLocale.Language = "en";
118 aLocale.Country = "US";
121 //Here we want the line break to leave /bar/ba clumped together on the next line
122 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("foo /bar/ba"), aLocale, 0,
123 aHyphOptions, aUserOptions);
124 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first slash", static_cast<sal_Int32>(4), aResult.breakIndex);
128 //See https://bz.apache.org/ooo/show_bug.cgi?id=19716
130 OUString aTest("aaa]aaa");
132 aLocale.Language = "en";
133 aLocale.Country = "US";
136 //Here we want the line break to move the whole lot to the next line
137 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
138 aHyphOptions, aUserOptions);
139 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the line, not at ]", static_cast<sal_Int32>(0), aResult.breakIndex);
143 //this is an example sequence from tdf92993-1.docx caught by the load crashtesting
145 const sal_Unicode WEIRD1[] = { 0xd83c, 0xdf56, 0xd83c, 0xdf57, 0xd83c, 0xdf46,
146 0xd83c, 0xdf64, 0x2668, 0xfe0f, 0xd83c, 0xdfc6};
148 OUString aTest(WEIRD1, SAL_N_ELEMENTS(WEIRD1));
150 aLocale.Language = "en";
151 aLocale.Country = "US";
154 //This must not assert/crash
155 (void)m_xBreak->getLineBreak(aTest, 0, aLocale, 0, aHyphOptions, aUserOptions);
159 //See https://bugs.documentfoundation.org/show_bug.cgi?id=96197
161 const sal_Unicode HANGUL[] = { 0xc560, 0xad6D, 0xac00, 0xc758, 0x0020, 0xac00,
162 0xc0ac, 0xb294};
163 OUString aTest(HANGUL, SAL_N_ELEMENTS(HANGUL));
165 aLocale.Language = "ko";
166 aLocale.Country = "KR";
169 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
170 aHyphOptions, aUserOptions);
171 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex);
176 //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
177 void TestBreakIterator::testWordBoundaries()
179 lang::Locale aLocale;
180 aLocale.Language = "en";
181 aLocale.Country = "US";
183 i18n::Boundary aBounds;
185 //See https://bz.apache.org/ooo/show_bug.cgi?id=11993
187 OUString aTest("abcd ef ghi??? KLM");
189 CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
190 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
191 aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
192 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
194 CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
195 CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
197 //next word
198 aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, true);
199 CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
201 //previous word
202 aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
203 CPPUNIT_ASSERT(aBounds.startPos == 5 && aBounds.endPos == 7);
205 CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
206 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
207 aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, true);
208 CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
210 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
211 CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
212 aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD, true);
213 CPPUNIT_ASSERT(aBounds.startPos == 16 && aBounds.endPos == 19);
216 //See https://bz.apache.org/ooo/show_bug.cgi?id=21907
218 OUString aTest("b a?");
220 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
221 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
222 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
224 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
226 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
227 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
228 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
230 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
233 //See https://bz.apache.org/ooo/show_bug.cgi?id=14904
235 const sal_Unicode TEST[] =
237 'W', 'o', 'r', 'k', 'i', 'n', 'g', ' ', 0x201C, 'W', 'o', 'r', 'd', 's',
238 ' ', 's', 't', 'a', 'r', 't', 'i', 'n', 'g', ' ', 'w', 'i', 't',
239 'h', ' ', 'q', 'u', 'o', 't', 'e', 's', 0x201D, ' ', 'W', 'o', 'r', 'k',
240 'i', 'n', 'g', ' ', 0x2018, 'B', 'r', 'o', 'k', 'e', 'n', 0x2019, ' ',
241 '?', 'S', 'p', 'a', 'n', 'i', 's', 'h', '?', ' ', 'd', 'o', 'e',
242 's', 'n', 0x2019, 't', ' ', 'w', 'o', 'r', 'k', '.', ' ', 'N', 'o',
243 't', ' ', 'e', 'v', 'e', 'n', ' ' , 0x00BF, 'r', 'e', 'a', 'l', '?', ' ',
244 'S', 'p', 'a', 'n', 'i', 's', 'h'
246 OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
248 aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, false);
249 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
251 aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, false);
252 CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 14);
254 aBounds = m_xBreak->getWordBoundary(aTest, 40, aLocale, i18n::WordType::DICTIONARY_WORD, false);
255 CPPUNIT_ASSERT(aBounds.startPos == 37 && aBounds.endPos == 44);
257 aBounds = m_xBreak->getWordBoundary(aTest, 49, aLocale, i18n::WordType::DICTIONARY_WORD, false);
258 CPPUNIT_ASSERT(aBounds.startPos == 46 && aBounds.endPos == 52);
260 aBounds = m_xBreak->getWordBoundary(aTest, 58, aLocale, i18n::WordType::DICTIONARY_WORD, false);
261 CPPUNIT_ASSERT(aBounds.startPos == 55 && aBounds.endPos == 62);
263 aBounds = m_xBreak->getWordBoundary(aTest, 67, aLocale, i18n::WordType::DICTIONARY_WORD, false);
264 CPPUNIT_ASSERT(aBounds.startPos == 64 && aBounds.endPos == 71);
266 aBounds = m_xBreak->getWordBoundary(aTest, 90, aLocale, i18n::WordType::DICTIONARY_WORD, false);
267 CPPUNIT_ASSERT(aBounds.startPos == 88 && aBounds.endPos == 92);
270 //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
271 sal_Unicode aBreakTests[] = { ' ', 1, 2, 3, 4, 5, 6, 7, 0x91, 0x92, 0x200B, 0xE8FF, 0xF8FF };
272 for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
274 //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
275 for (size_t i = 0; i < SAL_N_ELEMENTS(aBreakTests); ++i)
277 #if (U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM <= 2)
278 //Note the breakiterator test is known to fail on older icu
279 //versions (4.2.1) for the 200B (ZWSP) Zero Width Space testcase.
280 if (aBreakTests[i] == 0x200B)
281 continue;
282 #endif
283 OUString aTest = "Word" + OUStringChar(aBreakTests[i]) + "Word";
284 aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
285 switch (mode)
287 case i18n::WordType::ANY_WORD:
288 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
289 break;
290 case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
291 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
292 break;
293 case i18n::WordType::DICTIONARY_WORD:
294 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
295 break;
296 case i18n::WordType::WORD_COUNT:
297 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
298 break;
301 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
302 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
306 sal_Unicode aJoinTests[] = { 'X', 0x200C, 0x200D, 0x2060, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB };
307 for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
309 //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
310 for (size_t i = 0; i < SAL_N_ELEMENTS(aJoinTests); ++i)
312 OUString aTest = "Word" + OUStringChar(aJoinTests[i]) + "Word";
313 aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
314 switch (mode)
316 case i18n::WordType::ANY_WORD:
317 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
318 break;
319 case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
320 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
321 break;
322 case i18n::WordType::DICTIONARY_WORD:
323 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
324 break;
325 case i18n::WordType::WORD_COUNT:
326 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
327 break;
330 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
331 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
335 //See https://bz.apache.org/ooo/show_bug.cgi?id=13494
337 const OUString aBase("xxAAxxBBxxCCxx");
338 const sal_Unicode aTests[] =
340 '\'', ';', ',', '.', '!', '@', '#', '%', '&', '*',
341 '(', ')', '_', '-', '{', '}', '[', ']', '\"', '/',
342 '\\', '?', '~', '$', '+', '^', '=', '<', '>', '|'
345 const sal_Int32 aDoublePositions[] = {0, 2, 4, 6, 8, 10, 12, 14};
346 for (size_t j = 0; j < SAL_N_ELEMENTS(aTests); ++j)
348 OUString aTest = aBase.replace('x', aTests[j]);
349 sal_Int32 nPos = -1;
350 size_t i = 0;
353 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aDoublePositions));
354 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
355 CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
356 ++i;
358 while (nPos < aTest.getLength());
359 nPos = aTest.getLength();
360 i = SAL_N_ELEMENTS(aDoublePositions)-1;
363 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
364 --i;
365 CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
367 while (nPos > 0);
370 const sal_Int32 aSinglePositions[] = {0, 1, 3, 4, 6, 7, 9, 10};
371 for (size_t j = 1; j < SAL_N_ELEMENTS(aTests); ++j)
373 OUString aTest = aBase.replaceAll("xx", OUStringChar(aTests[j]));
374 sal_Int32 nPos = -1;
375 size_t i = 0;
378 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSinglePositions));
379 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
380 CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
381 ++i;
383 while (nPos < aTest.getLength());
384 nPos = aTest.getLength();
385 i = SAL_N_ELEMENTS(aSinglePositions)-1;
388 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
389 --i;
390 CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
392 while (nPos > 0);
395 const sal_Int32 aSingleQuotePositions[] = {0, 1, 9, 10};
396 CPPUNIT_ASSERT_EQUAL(u'\'', aTests[0]);
398 OUString aTest = aBase.replaceAll("xx", OUStringChar(aTests[0]));
399 sal_Int32 nPos = -1;
400 size_t i = 0;
403 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSingleQuotePositions));
404 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
405 CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
406 ++i;
408 while (nPos < aTest.getLength());
409 nPos = aTest.getLength();
410 i = SAL_N_ELEMENTS(aSingleQuotePositions)-1;
413 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
414 --i;
415 CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
417 while (nPos > 0);
421 //See https://bz.apache.org/ooo/show_bug.cgi?id=13451
423 aLocale.Language = "ca";
424 aLocale.Country = "ES";
426 OUString aTest("mirar-se comprar-vos donem-nos les mans aneu-vos-en!");
428 sal_Int32 nPos = 0;
429 sal_Int32 aExpected[] = {8, 20, 30, 34, 39, 51, 52};
430 size_t i = 0;
433 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
434 nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
435 i18n::WordType::DICTIONARY_WORD, true).endPos;
436 CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
437 ++i;
439 while (nPos++ < aTest.getLength());
440 CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
443 //See https://bz.apache.org/ooo/show_bug.cgi?id=85411
444 for (int j = 0; j < 3; ++j)
446 switch (j)
448 case 0:
449 aLocale.Language = "en";
450 aLocale.Country = "US";
451 break;
452 case 1:
453 aLocale.Language = "ca";
454 aLocale.Country = "ES";
455 break;
456 case 2:
457 aLocale.Language = "fi";
458 aLocale.Country = "FI";
459 break;
460 default:
461 CPPUNIT_ASSERT(false);
462 break;
465 const sal_Unicode TEST[] =
467 'I', 0x200B, 'w', 'a', 'n', 't', 0x200B, 't', 'o', 0x200B, 'g', 'o'
469 OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
471 sal_Int32 nPos = 0;
472 sal_Int32 aExpected[] = {1, 6, 9, 12};
473 size_t i = 0;
476 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
477 nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
478 i18n::WordType::DICTIONARY_WORD, true).endPos;
479 CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
480 ++i;
482 while (nPos++ < aTest.getLength());
483 CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
486 //https://bz.apache.org/ooo/show_bug.cgi?id=21290
487 for (int j = 0; j < 2; ++j)
489 switch (j)
491 case 0:
492 aLocale.Language = "en";
493 aLocale.Country = "US";
494 break;
495 case 1:
496 aLocale.Language = "grc";
497 aLocale.Country.clear();
498 break;
499 default:
500 CPPUNIT_ASSERT(false);
501 break;
504 const sal_Unicode TEST[] =
506 0x1F0C, 0x03BD, 0x03B4, 0x03C1, 0x03B1, 0x0020, 0x1F00,
507 0x03C1, 0x03BD, 0x1F7B, 0x03BC, 0x03B5, 0x03BD, 0x03BF,
508 0x03C2, 0x0020, 0x1F00, 0x03BB, 0x03BB, 0x0020, 0x1F24,
509 0x03C3, 0x03B8, 0x03B9, 0x03BF, 0x03BD
511 OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
513 sal_Int32 nPos = 0;
514 sal_Int32 aExpected[] = {5, 15, 19, 26};
515 size_t i = 0;
518 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
519 nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
520 i18n::WordType::DICTIONARY_WORD, true).endPos;
521 CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
522 ++i;
524 while (nPos++ < aTest.getLength());
525 CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
528 //See https://bz.apache.org/ooo/show_bug.cgi?id=58513
529 //See https://bugs.libreoffice.org/show_bug.cgi?id=55707
531 aLocale.Language = "fi";
532 aLocale.Country = "FI";
534 OUString aTest("Kuorma-auto kaakkois- ja Keski-Suomi USA:n 90:n %:n");
537 sal_Int32 nPos = 0;
538 sal_Int32 aExpected[] = {11, 21, 24, 36, 42, 47, 51};
539 size_t i = 0;
542 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
543 nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
544 i18n::WordType::WORD_COUNT, true).endPos;
545 CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
546 ++i;
548 while (nPos++ < aTest.getLength());
549 CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
553 sal_Int32 nPos = 0;
554 sal_Int32 aExpected[] = {0, 11, 12, 20, 22, 24, 25, 36, 37,
555 40, 41, 42, 43, 45, 46, 47, 50, 51};
556 size_t i = 0;
559 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
560 aBounds = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
561 i18n::WordType::DICTIONARY_WORD, true);
562 CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.startPos);
563 ++i;
564 CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.endPos);
565 ++i;
566 nPos = aBounds.endPos;
568 while (nPos++ < aTest.getLength());
569 CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
573 //See https://bz.apache.org/ooo/show_bug.cgi?id=107843
575 aLocale.Language = "en";
576 aLocale.Country = "US";
578 const sal_Unicode TEST[] =
580 'r', 'u', 0xFB00, 'l', 'e', ' ', 0xFB01, 's', 'h'
582 OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
584 aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
585 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 5);
587 aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, i18n::WordType::DICTIONARY_WORD, false);
588 CPPUNIT_ASSERT(aBounds.startPos == 6 && aBounds.endPos == 9);
591 //See https://bz.apache.org/ooo/show_bug.cgi?id=113785
593 aLocale.Language = "en";
594 aLocale.Country = "US";
596 const sal_Unicode TEST[] =
598 'a', 0x2013, 'b', 0x2014, 'c'
600 OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
602 aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
603 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 1);
605 aBounds = m_xBreak->nextWord(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD);
606 CPPUNIT_ASSERT(aBounds.startPos == 2 && aBounds.endPos == 3);
608 aBounds = m_xBreak->nextWord(aTest, aBounds.endPos, aLocale, i18n::WordType::DICTIONARY_WORD);
609 CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 5);
613 //See https://bugs.libreoffice.org/show_bug.cgi?id=40292
614 //See https://bz.apache.org/ooo/show_bug.cgi?id=80412
615 //See https://bz.apache.org/ooo/show_bug.cgi?id=111152
616 //See https://bz.apache.org/ooo/show_bug.cgi?id=50172
617 void TestBreakIterator::testGraphemeIteration()
619 lang::Locale aLocale;
620 aLocale.Language = "bn";
621 aLocale.Country = "IN";
624 const sal_Unicode BA_HALANT_LA[] = { 0x09AC, 0x09CD, 0x09AF };
625 OUString aTest(BA_HALANT_LA, SAL_N_ELEMENTS(BA_HALANT_LA));
627 sal_Int32 nDone=0;
628 sal_Int32 nPos;
629 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
630 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
631 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(BA_HALANT_LA)), nPos);
632 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(BA_HALANT_LA), aLocale,
633 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
634 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
638 const sal_Unicode HA_HALANT_NA_VOWELSIGNI[] = { 0x09B9, 0x09CD, 0x09A3, 0x09BF };
639 OUString aTest(HA_HALANT_NA_VOWELSIGNI, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
641 sal_Int32 nDone=0;
642 sal_Int32 nPos;
643 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
644 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
645 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI)), nPos);
646 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI), aLocale,
647 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
648 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
652 const sal_Unicode TA_HALANT_MA_HALANT_YA [] = { 0x09A4, 0x09CD, 0x09AE, 0x09CD, 0x09AF };
653 OUString aTest(TA_HALANT_MA_HALANT_YA, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
655 sal_Int32 nDone=0;
656 sal_Int32 nPos;
657 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
658 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
659 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA)), nPos);
660 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA), aLocale,
661 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
662 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
665 aLocale.Language = "ta";
666 aLocale.Country = "IN";
669 const sal_Unicode KA_VIRAMA_SSA[] = { 0x0B95, 0x0BCD, 0x0BB7 };
670 OUString aTest(KA_VIRAMA_SSA, SAL_N_ELEMENTS(KA_VIRAMA_SSA));
672 sal_Int32 nDone=0;
673 sal_Int32 nPos = 0;
675 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
676 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
677 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(KA_VIRAMA_SSA)), nPos);
678 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VIRAMA_SSA), aLocale,
679 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
680 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
684 const sal_Unicode KA_VOWELSIGNU[] = { 0x0B95, 0x0BC1 };
685 OUString aTest(KA_VOWELSIGNU, SAL_N_ELEMENTS(KA_VOWELSIGNU));
687 sal_Int32 nDone=0;
688 sal_Int32 nPos = 0;
690 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
691 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
692 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(KA_VOWELSIGNU)), nPos);
693 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VOWELSIGNU), aLocale,
694 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
695 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
699 const sal_Unicode CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI[] =
700 { 0x0B9A, 0x0BBF, 0x0BA4, 0x0BCD, 0x0BA4, 0x0BBF, 0x0BB0, 0x0BC8 };
701 OUString aTest(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI,
702 SAL_N_ELEMENTS(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI));
704 sal_Int32 nDone=0;
705 sal_Int32 nPos=0;
707 for (sal_Int32 i = 0; i < 4; ++i)
709 sal_Int32 nOldPos = nPos;
710 nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale,
711 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
712 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos+2, nPos);
715 for (sal_Int32 i = 0; i < 4; ++i)
717 sal_Int32 nOldPos = nPos;
718 nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale,
719 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
720 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos-2, nPos);
725 const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 };
726 OUString aText(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS));
728 sal_Int32 nGraphemeCount = 0;
730 sal_Int32 nCurPos = 0;
731 while (nCurPos < aText.getLength())
733 sal_Int32 nCount2 = 1;
734 nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(),
735 i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
736 ++nGraphemeCount;
739 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be considered 1 grapheme", static_cast<sal_Int32>(1), nGraphemeCount);
742 aLocale.Language = "hi";
743 aLocale.Country = "IN";
746 const sal_Unicode SHA_VOWELSIGNII[] = { 0x936, 0x940 };
747 OUString aTest(SHA_VOWELSIGNII, SAL_N_ELEMENTS(SHA_VOWELSIGNII));
749 sal_Int32 nDone=0;
750 sal_Int32 nPos = 0;
752 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
753 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
754 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(SHA_VOWELSIGNII)), nPos);
755 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(SHA_VOWELSIGNII), aLocale,
756 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
757 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
761 //A test to ensure that certain ranges and codepoints that are categorized as
762 //weak remain as weak, so that existing docs that depend on this don't silently
763 //change font for those weak chars
764 void TestBreakIterator::testWeak()
766 lang::Locale aLocale;
767 aLocale.Language = "en";
768 aLocale.Country = "US";
771 const sal_Unicode WEAKS[] =
773 0x0001, 0x0002,
774 0x0020, 0x00A0,
775 0x0300, 0x036F, //Combining Diacritical Marks
776 0x1AB0, 0x1AFF, //Combining Diacritical Marks Extended
777 0x1DC0, 0x1DFF, //Combining Diacritical Marks Supplement
778 0x20D0, 0x20FF, //Combining Diacritical Marks for Symbols
779 0x2150, 0x215F, //Number Forms, fractions
780 0x2160, 0x2180, //Number Forms, roman numerals
781 0x2200, 0x22FF, //Mathematical Operators
782 0x27C0, 0x27EF, //Miscellaneous Mathematical Symbols-A
783 0x2980, 0x29FF, //Miscellaneous Mathematical Symbols-B
784 0x2A00, 0x2AFF, //Supplemental Mathematical Operators
785 0x2100, 0x214F, //Letterlike Symbols
786 0x2308, 0x230B, //Miscellaneous technical
787 0x25A0, 0x25FF, //Geometric Shapes
788 0x2B30, 0x2B4C //Miscellaneous Symbols and Arrows
790 OUString aWeaks(WEAKS, SAL_N_ELEMENTS(WEAKS));
792 for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i)
794 sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i);
795 OString aMsg =
796 "Char 0x" +
797 OString::number(static_cast<sal_Int32>(aWeaks[i]), 16) +
798 " should have been weak";
799 CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
800 i18n::ScriptType::WEAK, nScript);
805 //A test to ensure that certain ranges and codepoints that are categorized as
806 //asian remain as asian, so that existing docs that depend on this don't silently
807 //change font for those asian chars.
808 //See https://bugs.libreoffice.org/show_bug.cgi?id=38095
809 void TestBreakIterator::testAsian()
811 lang::Locale aLocale;
812 aLocale.Language = "en";
813 aLocale.Country = "US";
816 const sal_Unicode ASIANS[] =
818 //some typical CJK chars
819 0x4E00, 0x62FF,
820 //The full HalfWidth and FullWidth block has historically been
821 //designated as taking the CJK font :-(
822 //HalfWidth and FullWidth forms of ASCII 0-9, categorized under
823 //UAX24 as "Common" i.e. by that logic WEAK
824 0xFF10, 0xFF19,
825 //HalfWidth and FullWidth forms of ASCII A-z, categorized under
826 //UAX25 as "Latin", i.e. by that logic LATIN
827 0xFF21, 0xFF5A
829 OUString aAsians(ASIANS, SAL_N_ELEMENTS(ASIANS));
831 for (sal_Int32 i = 0; i < aAsians.getLength(); ++i)
833 sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i);
834 OString aMsg =
835 "Char 0x" +
836 OString::number(static_cast<sal_Int32>(aAsians[i]), 16) +
837 " should have been asian";
838 CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
839 i18n::ScriptType::ASIAN, nScript);
#if (U_ICU_VERSION_MAJOR_NUM > 51)
//A test to ensure that our Lao word boundary detection is useful
//
// Starting at position 0, and then at the end of the first word, the
// dictionary-word boundaries of the sample must be [0,5) and [5,9).
void TestBreakIterator::testLao()
{
    lang::Locale aLao;
    aLao.Language = "lo";
    aLao.Country = "LA";

    const sal_Unicode LAO[] = { 0x0e8d, 0x0eb4, 0x0e99, 0x0e94, 0x0eb5, 0x0e95, 0x0ec9, 0x0ead, 0x0e99, 0x0eae, 0x0eb1, 0x0e9a };
    OUString const aSample(LAO, SAL_N_ELEMENTS(LAO));

    // {startPos, endPos} expected for each successive query.
    const sal_Int32 aExpectedWords[][2] = { { 0, 5 }, { 5, 9 } };

    sal_Int32 nQueryPos = 0;
    for (auto const& rExpected : aExpectedWords)
    {
        i18n::Boundary const aWord = m_xBreak->getWordBoundary(aSample, nQueryPos, aLao,
            i18n::WordType::DICTIONARY_WORD, true);

        CPPUNIT_ASSERT_EQUAL(rExpected[0], aWord.startPos);
        CPPUNIT_ASSERT_EQUAL(rExpected[1], aWord.endPos);

        // The next query starts where this word ended.
        nQueryPos = aWord.endPos;
    }
}
#endif
868 //A test to ensure that our thai word boundary detection is useful
869 void TestBreakIterator::testThai()
871 lang::Locale aLocale;
872 aLocale.Language = "th";
873 aLocale.Country = "TH";
875 //See http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html
877 const sal_Unicode THAI[] = { 0x0E01, 0x0E38, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
878 OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
879 i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
880 i18n::WordType::DICTIONARY_WORD, true);
881 CPPUNIT_ASSERT_MESSAGE("Should skip full word",
882 aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
885 //See https://bz.apache.org/ooo/show_bug.cgi?id=29548
886 //make sure forwards and back are consistent
888 const sal_Unicode THAI[] =
890 0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
891 0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
892 0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27,
893 0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
894 0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
895 0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27
897 OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
899 std::stack<sal_Int32> aPositions;
900 sal_Int32 nPos = -1;
903 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
904 aPositions.push(nPos);
906 while (nPos < aTest.getLength());
907 nPos = aTest.getLength();
908 CPPUNIT_ASSERT(!aPositions.empty());
909 aPositions.pop();
912 CPPUNIT_ASSERT(!aPositions.empty());
913 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
914 CPPUNIT_ASSERT_EQUAL(aPositions.top(), nPos);
915 aPositions.pop();
917 while (nPos > 0);
920 // tdf#113694
922 const sal_Unicode NON_BMP[] = { 0xD800, 0xDC00 };
923 OUString aTest(NON_BMP, SAL_N_ELEMENTS(NON_BMP));
925 sal_Int32 nDone=0;
926 sal_Int32 nPos;
928 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
929 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
930 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(SAL_N_ELEMENTS(NON_BMP)), nPos);
931 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(NON_BMP), aLocale,
932 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
933 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos);
935 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
936 i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
937 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(SAL_N_ELEMENTS(NON_BMP)), nPos);
938 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(NON_BMP), aLocale,
939 i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
940 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos);
944 #ifdef TODO
945 void TestBreakIterator::testNorthernThai()
947 lang::Locale aLocale;
948 aLocale.Language = "nod";
949 aLocale.Country = "TH";
951 const sal_Unicode NORTHERN_THAI1[] = { 0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
952 OUString aTest(NORTHERN_THAI1, SAL_N_ELEMENTS(NORTHERN_THAI1));
953 i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
954 i18n::WordType::DICTIONARY_WORD, true);
955 CPPUNIT_ASSERT_MESSAGE("Should skip full word",
956 aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
// Not sure whether any ICU version earlier than 49 had Khmer word boundary
// dictionaries; 4.6 does not.

// As of ICU 54, word boundary detection for Khmer is still considered
// insufficient, so the ICU Khmer support is disabled.

// A test to ensure that our Khmer word boundary detection is useful
// https://bugs.libreoffice.org/show_bug.cgi?id=52020
967 void TestBreakIterator::testKhmer()
969 lang::Locale aLocale;
970 aLocale.Language = "km";
971 aLocale.Country = "KH";
973 const sal_Unicode KHMER[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 };
975 OUString aTest(KHMER, SAL_N_ELEMENTS(KHMER));
976 i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
977 i18n::WordType::DICTIONARY_WORD, true);
979 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
981 aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
982 i18n::WordType::DICTIONARY_WORD, true);
984 CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 5);
986 #endif
988 void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak)
990 lang::Locale aLocale;
991 aLocale.Language = "ja";
992 aLocale.Country = "JP";
993 i18n::Boundary aBounds;
996 const sal_Unicode JAPANESE[] = { 0x30B7, 0x30E3, 0x30C3, 0x30C8, 0x30C0, 0x30A6, 0x30F3 };
998 OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
999 aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
1000 i18n::WordType::DICTIONARY_WORD, true);
1002 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
1006 const sal_Unicode JAPANESE[] = { 0x9EBB, 0x306E, 0x8449, 0x9EBB, 0x306E, 0x8449 };
1008 OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
1009 aBounds = xBreak->getWordBoundary(aTest, 1, aLocale,
1010 i18n::WordType::DICTIONARY_WORD, true);
1012 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
1014 aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
1015 i18n::WordType::DICTIONARY_WORD, true);
1017 CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 6);
1021 void TestBreakIterator::testJapanese()
1023 doTestJapanese(m_xBreak);
1025 // fdo#78479 - test second / cached instantiation of xdictionary
1026 uno::Reference< i18n::XBreakIterator > xTmpBreak(m_xSFactory->createInstance(
1027 "com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
1029 doTestJapanese(xTmpBreak);
1032 void TestBreakIterator::testChinese()
1034 lang::Locale aLocale;
1035 aLocale.Language = "zh";
1036 aLocale.Country = "CN";
1037 i18n::Boundary aBounds;
1040 const sal_Unicode CHINESE[] = { 0x6A35, 0x6A30, 0x69FE, 0x8919, 0xD867, 0xDEDB };
1042 OUString aTest(CHINESE, SAL_N_ELEMENTS(CHINESE));
1043 aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale,
1044 i18n::WordType::DICTIONARY_WORD, true);
1045 CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 6);
1048 void TestBreakIterator::setUp()
1050 BootstrapFixtureBase::setUp();
1051 m_xBreak.set(m_xSFactory->createInstance("com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
1054 void TestBreakIterator::tearDown()
1056 m_xBreak.clear();
1057 BootstrapFixtureBase::tearDown();
// Register the TestBreakIterator suite with CppUnit.
1060 CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator);
// CppUnit plug-in boilerplate; presumably provides the entry point through
// which the test runner loads this library - confirm against the CppUnit docs.
1062 CPPUNIT_PLUGIN_IMPLEMENT();
1064 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */