lok: vcl: fix multiple floatwin removal case more robustly.
[LibreOffice.git] / i18npool / qa / cppunit / test_breakiterator.cxx
blob0c132acf3a436877b4e1d4a1fd33a7ce1030a104
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 */
10 #include <cppuhelper/bootstrap.hxx>
11 #include <cppuhelper/basemutex.hxx>
12 #include <com/sun/star/i18n/XBreakIterator.hpp>
13 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
14 #include <com/sun/star/i18n/ScriptType.hpp>
15 #include <com/sun/star/i18n/WordType.hpp>
16 #include <unotest/bootstrapfixturebase.hxx>
18 #include <unicode/uversion.h>
20 #include <rtl/strbuf.hxx>
21 #include <rtl/ustrbuf.hxx>
23 #include <string.h>
25 #include <stack>
27 using namespace ::com::sun::star;
//CppUnit fixture for the XBreakIterator tests, derived from test::BootstrapFixtureBase.
//Declares one method per test case and registers each of them in the suite below.
29 class TestBreakIterator : public test::BootstrapFixtureBase
31 public:
32 virtual void setUp() override;
33 virtual void tearDown() override;
35 void testLineBreaking();
36 void testWordBoundaries();
37 void testGraphemeIteration();
38 void testWeak();
39 void testAsian();
40 void testThai();
//The Lao test is only declared when the ICU major version is above 51; the
//Northern Thai and Khmer tests additionally require TODO to be defined.
41 #if (U_ICU_VERSION_MAJOR_NUM > 51)
42 void testLao();
43 #ifdef TODO
44 void testNorthernThai();
45 void testKhmer();
46 #endif
47 #endif
48 void testJapanese();
49 void testChinese();
//Suite registration; the preprocessor guards mirror those on the declarations above.
51 CPPUNIT_TEST_SUITE(TestBreakIterator);
52 CPPUNIT_TEST(testLineBreaking);
53 CPPUNIT_TEST(testWordBoundaries);
54 CPPUNIT_TEST(testGraphemeIteration);
55 CPPUNIT_TEST(testWeak);
56 CPPUNIT_TEST(testAsian);
57 CPPUNIT_TEST(testThai);
58 #if (U_ICU_VERSION_MAJOR_NUM > 51)
59 CPPUNIT_TEST(testLao);
60 #ifdef TODO
61 CPPUNIT_TEST(testKhmer);
62 CPPUNIT_TEST(testNorthernThai);
63 #endif
64 #endif
65 CPPUNIT_TEST(testJapanese);
66 CPPUNIT_TEST(testChinese);
67 CPPUNIT_TEST_SUITE_END();
69 private:
//Break iterator under test, used by every test method.
//NOTE(review): presumably created in setUp() and released in tearDown(); their bodies are not visible here.
70 uno::Reference<i18n::XBreakIterator> m_xBreak;
//Private helper that takes the break iterator to exercise; its definition is not visible here.
71 void doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak);
//Regression checks for XBreakIterator::getLineBreak: each section builds a string,
//requests a break at a given offset and verifies the reported breakIndex.
74 void TestBreakIterator::testLineBreaking()
76 i18n::LineBreakHyphenationOptions aHyphOptions;
77 i18n::LineBreakUserOptions aUserOptions;
78 lang::Locale aLocale;
80 //See https://bugs.libreoffice.org/show_bug.cgi?id=31271
82 OUString aTest("(some text here)");
84 aLocale.Language = "en";
85 aLocale.Country = "US";
88 //Here we want the line break to leave "text here)" on the next line
//The requested offset lies inside "text"; index 6 is where that word starts
89 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions);
90 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(6), aResult.breakIndex);
94 //Here we want the line break to leave "here)" on the next line
//The requested offset is the closing parenthesis; index 11 is where "here" starts
95 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions);
96 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(11), aResult.breakIndex);
100 //See https://bugs.libreoffice.org/show_bug.cgi?id=49849
102 const sal_Unicode HEBREW1[] = { 0x05DE, 0x05D9, 0x05DC, 0x05D9, 0x5DD };
103 OUString aWord(HEBREW1, SAL_N_ELEMENTS(HEBREW1));
//Two copies of the same word separated by a single space
104 OUString aTest(aWord + " " + aWord);
106 aLocale.Language = "he";
107 aLocale.Country = "IL";
110 //Here we want the line break to happen at the whitespace
//The offset is the last unit of the second word; the expected index is the first unit after the space
111 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions);
112 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", aWord.getLength()+1, aResult.breakIndex);
116 //See https://bz.apache.org/ooo/show_bug.cgi?id=17155
118 OUString const aTest("foo /bar/baz");
120 aLocale.Language = "en";
121 aLocale.Country = "US";
124 //Here we want the line break to leave /bar/ba clumped together on the next line
//Index 4 is the '/' that follows "foo "
125 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("foo /bar/ba"), aLocale, 0,
126 aHyphOptions, aUserOptions);
127 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first slash", static_cast<sal_Int32>(4), aResult.breakIndex);
131 //See https://bz.apache.org/ooo/show_bug.cgi?id=19716
133 OUString aTest("aaa]aaa");
135 aLocale.Language = "en";
136 aLocale.Country = "US";
139 //Here we want the line break to move the whole lot to the next line
140 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
141 aHyphOptions, aUserOptions);
142 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the line, not at ]", static_cast<sal_Int32>(0), aResult.breakIndex);
146 //this is an example sequence from tdf92993-1.docx caught by the load crashtesting
//The sequence is made up almost entirely of UTF-16 surrogate pairs
148 const sal_Unicode WEIRD1[] = { 0xd83c, 0xdf56, 0xd83c, 0xdf57, 0xd83c, 0xdf46,
149 0xd83c, 0xdf64, 0x2668, 0xfe0f, 0xd83c, 0xdfc6};
151 OUString aTest(WEIRD1, SAL_N_ELEMENTS(WEIRD1));
153 aLocale.Language = "en";
154 aLocale.Country = "US";
157 //This must not assert/crash
//The result is deliberately discarded; only surviving the call is checked
158 (void)m_xBreak->getLineBreak(aTest, 0, aLocale, 0, aHyphOptions, aUserOptions);
162 //See https://bugs.documentfoundation.org/show_bug.cgi?id=96197
164 const sal_Unicode HANGUL[] = { 0xc560, 0xad6D, 0xac00, 0xc758, 0x0020, 0xac00,
165 0xc0ac, 0xb294};
166 OUString aTest(HANGUL, SAL_N_ELEMENTS(HANGUL));
168 aLocale.Language = "ko";
169 aLocale.Country = "KR";
//Index 5 is the first unit after the space (0x0020 sits at index 4)
172 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
173 aHyphOptions, aUserOptions);
174 CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex);
179 //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
//Checks the word boundary queries of XBreakIterator (getWordBoundary, isBeginWord,
//isEndWord, nextWord, previousWord) against expected positions for a series of
//regression cases in several locales.
180 void TestBreakIterator::testWordBoundaries()
182 lang::Locale aLocale;
183 aLocale.Language = "en";
184 aLocale.Country = "US";
//Reused for the results of the boundary queries throughout this test
186 i18n::Boundary aBounds;
188 //See https://bz.apache.org/ooo/show_bug.cgi?id=11993
190 OUString aTest("abcd ef ghi??? KLM");
192 CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
193 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
194 aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
195 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
197 CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
198 CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
200 //next word
201 aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, true);
202 CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
204 //previous word
205 aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
206 CPPUNIT_ASSERT(aBounds.startPos == 5 && aBounds.endPos == 7);
208 CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
209 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
210 aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, true);
211 CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
213 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
214 CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
215 aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD, true);
216 CPPUNIT_ASSERT(aBounds.startPos == 16 && aBounds.endPos == 19);
219 //See https://bz.apache.org/ooo/show_bug.cgi?id=21907
221 OUString aTest("b a?");
223 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
224 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
225 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
227 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
229 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
230 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
231 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
233 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
236 //See https://bz.apache.org/ooo/show_bug.cgi?id=14904
//Sentence mixing plain words with typographic quotes (0x201C/0x201D, 0x2018/0x2019) and 0x00BF
238 const sal_Unicode TEST[] =
240 'W', 'o', 'r', 'k', 'i', 'n', 'g', ' ', 0x201C, 'W', 'o', 'r', 'd', 's',
241 ' ', 's', 't', 'a', 'r', 't', 'i', 'n', 'g', ' ', 'w', 'i', 't',
242 'h', ' ', 'q', 'u', 'o', 't', 'e', 's', 0x201D, ' ', 'W', 'o', 'r', 'k',
243 'i', 'n', 'g', ' ', 0x2018, 'B', 'r', 'o', 'k', 'e', 'n', 0x2019, ' ',
244 '?', 'S', 'p', 'a', 'n', 'i', 's', 'h', '?', ' ', 'd', 'o', 'e',
245 's', 'n', 0x2019, 't', ' ', 'w', 'o', 'r', 'k', '.', ' ', 'N', 'o',
246 't', ' ', 'e', 'v', 'e', 'n', ' ' , 0x00BF, 'r', 'e', 'a', 'l', '?', ' ',
247 'S', 'p', 'a', 'n', 'i', 's', 'h'
249 OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
//Each query starts inside a word; the expected boundaries exclude the adjacent quote/punctuation
251 aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, false);
252 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
254 aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, false);
255 CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 14);
257 aBounds = m_xBreak->getWordBoundary(aTest, 40, aLocale, i18n::WordType::DICTIONARY_WORD, false);
258 CPPUNIT_ASSERT(aBounds.startPos == 37 && aBounds.endPos == 44);
260 aBounds = m_xBreak->getWordBoundary(aTest, 49, aLocale, i18n::WordType::DICTIONARY_WORD, false);
261 CPPUNIT_ASSERT(aBounds.startPos == 46 && aBounds.endPos == 52);
263 aBounds = m_xBreak->getWordBoundary(aTest, 58, aLocale, i18n::WordType::DICTIONARY_WORD, false);
264 CPPUNIT_ASSERT(aBounds.startPos == 55 && aBounds.endPos == 62);
266 aBounds = m_xBreak->getWordBoundary(aTest, 67, aLocale, i18n::WordType::DICTIONARY_WORD, false);
267 CPPUNIT_ASSERT(aBounds.startPos == 64 && aBounds.endPos == 71);
269 aBounds = m_xBreak->getWordBoundary(aTest, 90, aLocale, i18n::WordType::DICTIONARY_WORD, false);
270 CPPUNIT_ASSERT(aBounds.startPos == 88 && aBounds.endPos == 92);
273 //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
//Each of these characters is expected to end the first word: for "Word"+c+"Word"
//the first word must be [0,4) in every word type from ANY_WORD to WORD_COUNT
274 sal_Unicode aBreakTests[] = { ' ', 1, 2, 3, 4, 5, 6, 7, 0x91, 0x92, 0x200B, 0xE8FF, 0xF8FF };
275 for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
277 //make sure that in all cases isBeginWord and isEndWord match getWordBoundary
278 for (size_t i = 0; i < SAL_N_ELEMENTS(aBreakTests); ++i)
280 #if (U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM <= 2)
281 //Note the breakiterator test is known to fail on older icu
282 //versions (4.2.1) for the 200B (ZWSP) Zero Width Space testcase.
283 if (aBreakTests[i] == 0x200B)
284 continue;
285 #endif
286 OUString aTest = "Word" + OUStringLiteral1(aBreakTests[i]) + "Word";
287 aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
288 switch (mode)
290 case i18n::WordType::ANY_WORD:
291 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
292 break;
293 case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
294 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
295 break;
296 case i18n::WordType::DICTIONARY_WORD:
297 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
298 break;
299 case i18n::WordType::WORD_COUNT:
300 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
301 break;
304 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
305 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
//Each of these characters is expected to join: for "Word"+c+"Word" the whole
//9-unit string must be reported as a single word in every word type
309 sal_Unicode aJoinTests[] = { 'X', 0x200C, 0x200D, 0x2060, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB };
310 for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
312 //make sure that in all cases isBeginWord and isEndWord match getWordBoundary
313 for (size_t i = 0; i < SAL_N_ELEMENTS(aJoinTests); ++i)
315 OUString aTest = "Word" + OUStringLiteral1(aJoinTests[i]) + "Word";
316 aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
317 switch (mode)
319 case i18n::WordType::ANY_WORD:
320 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
321 break;
322 case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
323 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
324 break;
325 case i18n::WordType::DICTIONARY_WORD:
326 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
327 break;
328 case i18n::WordType::WORD_COUNT:
329 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
330 break;
333 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
334 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
338 //See https://bz.apache.org/ooo/show_bug.cgi?id=13494
//The 'x' placeholders in aBase are replaced by each punctuation character in aTests
340 const OUString aBase("xxAAxxBBxxCCxx");
341 const sal_Unicode aTests[] =
343 '\'', ';', ',', '.', '!', '@', '#', '%', '&', '*',
344 '(', ')', '_', '-', '{', '}', '[', ']', '\"', '/',
345 '\\', '?', '~', '$', '+', '^', '=', '<', '>', '|'
//With doubled punctuation a word start is expected at every second index
348 const sal_Int32 aDoublePositions[] = {0, 2, 4, 6, 8, 10, 12, 14};
349 for (size_t j = 0; j < SAL_N_ELEMENTS(aTests); ++j)
351 OUString aTest = aBase.replace('x', aTests[j]);
352 sal_Int32 nPos = -1;
353 size_t i = 0;
//Walk forwards with nextWord, starting from -1, until the reported start reaches the end of the string
356 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aDoublePositions));
357 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
358 CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
359 ++i;
361 while (nPos < aTest.getLength());
//Walk backwards with previousWord from the end; i is decremented before each
//comparison, so the last table entry (the end-of-string position) is not re-checked
362 nPos = aTest.getLength();
363 i = SAL_N_ELEMENTS(aDoublePositions)-1;
366 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
367 --i;
368 CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
370 while (nPos > 0);
//Same walk with each "xx" pair collapsed to a single character; j starts at 1
//because the apostrophe (aTests[0]) has its own expectations further down
373 const sal_Int32 aSinglePositions[] = {0, 1, 3, 4, 6, 7, 9, 10};
374 for (size_t j = 1; j < SAL_N_ELEMENTS(aTests); ++j)
376 OUString aTest = aBase.replaceAll("xx", OUStringLiteral1(aTests[j]));
377 sal_Int32 nPos = -1;
378 size_t i = 0;
381 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSinglePositions));
382 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
383 CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
384 ++i;
386 while (nPos < aTest.getLength());
387 nPos = aTest.getLength();
388 i = SAL_N_ELEMENTS(aSinglePositions)-1;
391 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
392 --i;
393 CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
395 while (nPos > 0);
//Single apostrophes: fewer word starts are expected than for the other punctuation characters
398 const sal_Int32 aSingleQuotePositions[] = {0, 1, 9, 10};
399 CPPUNIT_ASSERT_EQUAL(u'\'', aTests[0]);
401 OUString aTest = aBase.replaceAll("xx", OUStringLiteral1(aTests[0]));
402 sal_Int32 nPos = -1;
403 size_t i = 0;
406 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSingleQuotePositions));
407 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
408 CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
409 ++i;
411 while (nPos < aTest.getLength());
412 nPos = aTest.getLength();
413 i = SAL_N_ELEMENTS(aSingleQuotePositions)-1;
416 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
417 --i;
418 CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
420 while (nPos > 0);
424 //See https://bz.apache.org/ooo/show_bug.cgi?id=13451
426 aLocale.Language = "ca";
427 aLocale.Country = "ES";
429 OUString aTest("mirar-se comprar-vos donem-nos les mans aneu-vos-en!");
//aExpected lists the end position of each successive dictionary word; the loop
//condition steps one unit past each end before the next query
431 sal_Int32 nPos = 0;
432 sal_Int32 aExpected[] = {8, 20, 30, 34, 39, 51, 52};
433 size_t i = 0;
436 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
437 nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
438 i18n::WordType::DICTIONARY_WORD, true).endPos;
439 CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
440 ++i;
442 while (nPos++ < aTest.getLength());
//Every expected end position must have been consumed
443 CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
446 //See https://bz.apache.org/ooo/show_bug.cgi?id=85411
//The same check is run for the en-US, ca-ES and fi-FI locales
447 for (int j = 0; j < 3; ++j)
449 switch (j)
451 case 0:
452 aLocale.Language = "en";
453 aLocale.Country = "US";
454 break;
455 case 1:
456 aLocale.Language = "ca";
457 aLocale.Country = "ES";
458 break;
459 case 2:
460 aLocale.Language = "fi";
461 aLocale.Country = "FI";
462 break;
463 default:
464 CPPUNIT_ASSERT(false);
465 break;
//Words separated only by 0x200B (the zero width space, per the ICU note earlier in this test)
468 const sal_Unicode TEST[] =
470 'I', 0x200B, 'w', 'a', 'n', 't', 0x200B, 't', 'o', 0x200B, 'g', 'o'
472 OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
474 sal_Int32 nPos = 0;
475 sal_Int32 aExpected[] = {1, 6, 9, 12};
476 size_t i = 0;
479 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
480 nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
481 i18n::WordType::DICTIONARY_WORD, true).endPos;
482 CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
483 ++i;
485 while (nPos++ < aTest.getLength());
486 CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
489 //https://bz.apache.org/ooo/show_bug.cgi?id=21290
//Run once for en-US and once for language "grc" with the country cleared
490 for (int j = 0; j < 2; ++j)
492 switch (j)
494 case 0:
495 aLocale.Language = "en";
496 aLocale.Country = "US";
497 break;
498 case 1:
499 aLocale.Language = "grc";
500 aLocale.Country.clear();
501 break;
502 default:
503 CPPUNIT_ASSERT(false);
504 break;
//Four words separated by spaces (0x0020); aExpected holds the end of each word
507 const sal_Unicode TEST[] =
509 0x1F0C, 0x03BD, 0x03B4, 0x03C1, 0x03B1, 0x0020, 0x1F00,
510 0x03C1, 0x03BD, 0x1F7B, 0x03BC, 0x03B5, 0x03BD, 0x03BF,
511 0x03C2, 0x0020, 0x1F00, 0x03BB, 0x03BB, 0x0020, 0x1F24,
512 0x03C3, 0x03B8, 0x03B9, 0x03BF, 0x03BD
514 OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
516 sal_Int32 nPos = 0;
517 sal_Int32 aExpected[] = {5, 15, 19, 26};
518 size_t i = 0;
521 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
522 nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
523 i18n::WordType::DICTIONARY_WORD, true).endPos;
524 CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
525 ++i;
527 while (nPos++ < aTest.getLength());
528 CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
531 //See https://bz.apache.org/ooo/show_bug.cgi?id=58513
532 //See https://bugs.libreoffice.org/show_bug.cgi?id=55707
534 aLocale.Language = "fi";
535 aLocale.Country = "FI";
537 OUString aTest("Kuorma-auto kaakkois- ja Keski-Suomi USA:n 90:n %:n");
//First pass: WORD_COUNT boundaries, checking only the end of each word
540 sal_Int32 nPos = 0;
541 sal_Int32 aExpected[] = {11, 21, 24, 36, 42, 47, 51};
542 size_t i = 0;
545 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
546 nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
547 i18n::WordType::WORD_COUNT, true).endPos;
548 CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
549 ++i;
551 while (nPos++ < aTest.getLength());
552 CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
//Second pass: DICTIONARY_WORD boundaries; aExpected holds start/end pairs, two entries per word
556 sal_Int32 nPos = 0;
557 sal_Int32 aExpected[] = {0, 11, 12, 20, 22, 24, 25, 36, 37,
558 40, 41, 42, 43, 45, 46, 47, 50, 51};
559 size_t i = 0;
562 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
563 aBounds = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
564 i18n::WordType::DICTIONARY_WORD, true);
565 CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.startPos);
566 ++i;
567 CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.endPos);
568 ++i;
569 nPos = aBounds.endPos;
571 while (nPos++ < aTest.getLength());
572 CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
576 //See https://bz.apache.org/ooo/show_bug.cgi?id=107843
578 aLocale.Language = "en";
579 aLocale.Country = "US";
//0xFB00 and 0xFB01 are the Latin "ff" and "fi" ligatures; each is expected to stay inside its word
581 const sal_Unicode TEST[] =
583 'r', 'u', 0xFB00, 'l', 'e', ' ', 0xFB01, 's', 'h'
585 OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
587 aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
588 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 5);
590 aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, i18n::WordType::DICTIONARY_WORD, false);
591 CPPUNIT_ASSERT(aBounds.startPos == 6 && aBounds.endPos == 9);
594 //See https://bz.apache.org/ooo/show_bug.cgi?id=113785
596 aLocale.Language = "en";
597 aLocale.Country = "US";
//0x2013 (en dash) and 0x2014 (em dash) are expected to separate the single-letter words
599 const sal_Unicode TEST[] =
601 'a', 0x2013, 'b', 0x2014, 'c'
603 OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
605 aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
606 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 1);
608 aBounds = m_xBreak->nextWord(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD);
609 CPPUNIT_ASSERT(aBounds.startPos == 2 && aBounds.endPos == 3);
611 aBounds = m_xBreak->nextWord(aTest, aBounds.endPos, aLocale, i18n::WordType::DICTIONARY_WORD);
612 CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 5);
616 //See https://bugs.libreoffice.org/show_bug.cgi?id=40292
617 //See https://bz.apache.org/ooo/show_bug.cgi?id=80412
618 //See https://bz.apache.org/ooo/show_bug.cgi?id=111152
619 //See https://bz.apache.org/ooo/show_bug.cgi?id=50172
//Checks that nextCharacters/previousCharacters in SKIPCELL mode step over whole
//clusters rather than single UTF-16 units, using Bengali (bn-IN), Tamil (ta-IN),
//Hebrew (default locale) and Hindi (hi-IN) samples.
620 void TestBreakIterator::testGraphemeIteration()
622 lang::Locale aLocale;
623 aLocale.Language = "bn";
624 aLocale.Country = "IN";
//Each of the three Bengali samples must be crossed in one step, forwards and backwards
627 const sal_Unicode BA_HALANT_LA[] = { 0x09AC, 0x09CD, 0x09AF };
628 OUString aTest(BA_HALANT_LA, SAL_N_ELEMENTS(BA_HALANT_LA));
630 sal_Int32 nDone=0;
631 sal_Int32 nPos;
632 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
633 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
634 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(BA_HALANT_LA)), nPos);
635 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(BA_HALANT_LA), aLocale,
636 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
637 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
641 const sal_Unicode HA_HALANT_NA_VOWELSIGNI[] = { 0x09B9, 0x09CD, 0x09A3, 0x09BF };
642 OUString aTest(HA_HALANT_NA_VOWELSIGNI, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
644 sal_Int32 nDone=0;
645 sal_Int32 nPos;
646 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
647 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
648 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI)), nPos);
649 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI), aLocale,
650 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
651 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
655 const sal_Unicode TA_HALANT_MA_HALANT_YA [] = { 0x09A4, 0x09CD, 0x09AE, 0x09CD, 0x09AF };
656 OUString aTest(TA_HALANT_MA_HALANT_YA, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
658 sal_Int32 nDone=0;
659 sal_Int32 nPos;
660 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
661 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
662 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA)), nPos);
663 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA), aLocale,
664 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
665 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
//Tamil samples
668 aLocale.Language = "ta";
669 aLocale.Country = "IN";
672 const sal_Unicode KA_VIRAMA_SSA[] = { 0x0B95, 0x0BCD, 0x0BB7 };
673 OUString aTest(KA_VIRAMA_SSA, SAL_N_ELEMENTS(KA_VIRAMA_SSA));
675 sal_Int32 nDone=0;
676 sal_Int32 nPos = 0;
678 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
679 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
680 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(KA_VIRAMA_SSA)), nPos);
681 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VIRAMA_SSA), aLocale,
682 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
683 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
687 const sal_Unicode KA_VOWELSIGNU[] = { 0x0B95, 0x0BC1 };
688 OUString aTest(KA_VOWELSIGNU, SAL_N_ELEMENTS(KA_VOWELSIGNU));
690 sal_Int32 nDone=0;
691 sal_Int32 nPos = 0;
693 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
694 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
695 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(KA_VOWELSIGNU)), nPos);
696 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VOWELSIGNU), aLocale,
697 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
698 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
//Eight units expected to form four clusters of two units each
702 const sal_Unicode CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI[] =
703 { 0x0B9A, 0x0BBF, 0x0BA4, 0x0BCD, 0x0BA4, 0x0BBF, 0x0BB0, 0x0BC8 };
704 OUString aTest(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI,
705 SAL_N_ELEMENTS(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI));
707 sal_Int32 nDone=0;
708 sal_Int32 nPos=0;
//Forwards: every step must advance by exactly two units
710 for (sal_Int32 i = 0; i < 4; ++i)
712 sal_Int32 nOldPos = nPos;
713 nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale,
714 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
715 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos+2, nPos);
//Backwards from where the forward pass ended: every step must retreat by exactly two units
718 for (sal_Int32 i = 0; i < 4; ++i)
720 sal_Int32 nOldPos = nPos;
721 nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale,
722 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
723 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos-2, nPos);
//Hebrew letter plus point, iterated with a default-constructed locale
728 const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 };
729 OUString aText(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS));
//Count how many SKIPCELL steps are needed to traverse the whole string
731 sal_Int32 nGraphemeCount = 0;
733 sal_Int32 nCurPos = 0;
734 while (nCurPos < aText.getLength())
//nCount2 is passed both as the requested count and as the variable receiving the done count
736 sal_Int32 nCount2 = 1;
737 nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(),
738 i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
739 ++nGraphemeCount;
742 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be considered 1 grapheme", static_cast<sal_Int32>(1), nGraphemeCount);
//Hindi sample
745 aLocale.Language = "hi";
746 aLocale.Country = "IN";
749 const sal_Unicode SHA_VOWELSIGNII[] = { 0x936, 0x940 };
750 OUString aTest(SHA_VOWELSIGNII, SAL_N_ELEMENTS(SHA_VOWELSIGNII));
752 sal_Int32 nDone=0;
753 sal_Int32 nPos = 0;
755 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
756 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
757 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(SHA_VOWELSIGNII)), nPos);
758 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(SHA_VOWELSIGNII), aLocale,
759 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
760 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
764 //A test to ensure that certain ranges and codepoints that are categorized as
765 //weak remain as weak, so that existing docs that depend on this don't silently
766 //change font for those weak chars
//Asserts that getScriptType reports i18n::ScriptType::WEAK for every code point listed in WEAKS.
767 void TestBreakIterator::testWeak()
//aLocale is filled in but is not passed to getScriptType below
769 lang::Locale aLocale;
770 aLocale.Language = "en";
771 aLocale.Country = "US";
//Only the code points listed here are checked (the loop walks this array element
//by element); the trailing comments name the Unicode block each pair comes from
774 const sal_Unicode WEAKS[] =
776 0x0001, 0x0002,
777 0x0020, 0x00A0,
778 0x0300, 0x036F, //Combining Diacritical Marks
779 0x1AB0, 0x1AFF, //Combining Diacritical Marks Extended
780 0x1DC0, 0x1DFF, //Combining Diacritical Marks Supplement
781 0x20D0, 0x20FF, //Combining Diacritical Marks for Symbols
782 0x2150, 0x215F, //Number Forms, fractions
783 0x2160, 0x2180, //Number Forms, roman numerals
784 0x2200, 0x22FF, //Mathematical Operators
785 0x27C0, 0x27EF, //Miscellaneous Mathematical Symbols-A
786 0x2980, 0x29FF, //Miscellaneous Mathematical Symbols-B
787 0x2A00, 0x2AFF, //Supplemental Mathematical Operators
788 0x2100, 0x214F, //Letterlike Symbols
789 0x2308, 0x230B, //Miscellaneous technical
790 0x25A0, 0x25FF, //Geometric Shapes
791 0x2B30, 0x2B4C //Miscellaneous Symbols and Arrows
793 OUString aWeaks(WEAKS, SAL_N_ELEMENTS(WEAKS));
795 for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i)
797 sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i);
//Build a failure message naming the offending code point in hexadecimal
798 OStringBuffer aMsg;
799 aMsg.append("Char 0x");
800 aMsg.append(static_cast<sal_Int32>(aWeaks[i]), 16);
801 aMsg.append(" should have been weak");
802 CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
803 i18n::ScriptType::WEAK, nScript);
808 //A test to ensure that certain ranges and codepoints that are categorized as
809 //asian remain as asian, so that existing docs that depend on this don't silently
810 //change font for those asian chars.
811 //See https://bugs.libreoffice.org/show_bug.cgi?id=38095
//Asserts that getScriptType reports i18n::ScriptType::ASIAN for every code point listed in ASIANS.
812 void TestBreakIterator::testAsian()
//aLocale is filled in but is not passed to getScriptType below
814 lang::Locale aLocale;
815 aLocale.Language = "en";
816 aLocale.Country = "US";
//Only the code points listed here are checked; the loop walks this array element by element
819 const sal_Unicode ASIANS[] =
821 //some typical CJK chars
822 0x4E00, 0x62FF,
823 //The full HalfWidth and FullWidth block has historically been
824 //designated as taking the CJK font :-(
825 //HalfWidth and FullWidth forms of ASCII 0-9, categorized under
826 //UAX24 as "Common" i.e. by that logic WEAK
827 0xFF10, 0xFF19,
828 //HalfWidth and FullWidth forms of ASCII A-z, categorized under
829 //UAX24 as "Latin", i.e. by that logic LATIN
830 0xFF21, 0xFF5A
832 OUString aAsians(ASIANS, SAL_N_ELEMENTS(ASIANS));
834 for (sal_Int32 i = 0; i < aAsians.getLength(); ++i)
836 sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i);
//Build a failure message naming the offending code point in hexadecimal
837 OStringBuffer aMsg;
838 aMsg.append("Char 0x");
839 aMsg.append(static_cast<sal_Int32>(aAsians[i]), 16);
840 aMsg.append(" should have been asian");
841 CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
842 i18n::ScriptType::ASIAN, nScript);
847 #if (U_ICU_VERSION_MAJOR_NUM > 51)
848 //A test to ensure that our Lao word boundary detection is useful
//Queries DICTIONARY_WORD boundaries in a 12-unit Lao string (locale lo-LA) and
//checks the first two words reported.
849 void TestBreakIterator::testLao()
851 lang::Locale aLocale;
852 aLocale.Language = "lo";
853 aLocale.Country = "LA";
855 const sal_Unicode LAO[] = { 0x0e8d, 0x0eb4, 0x0e99, 0x0e94, 0x0eb5, 0x0e95, 0x0ec9, 0x0ead, 0x0e99, 0x0eae, 0x0eb1, 0x0e9a };
856 OUString aTest(LAO, SAL_N_ELEMENTS(LAO));
//The first word is expected to span units [0,5)
857 i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
858 i18n::WordType::DICTIONARY_WORD, true);
860 CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
861 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
//Query again from the end of the first word; the second word is expected at [5,9)
863 aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
864 i18n::WordType::DICTIONARY_WORD, true);
866 CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
867 CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
869 #endif
871 //A test to ensure that our thai word boundary detection is useful
//Thai (th-TH) checks: a single word must be reported whole, forward and backward
//word iteration must visit the same positions, and character iteration must not
//split a surrogate pair.
872 void TestBreakIterator::testThai()
874 lang::Locale aLocale;
875 aLocale.Language = "th";
876 aLocale.Country = "TH";
878 //See http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html
880 const sal_Unicode THAI[] = { 0x0E01, 0x0E38, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
881 OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
//The whole six-unit string is expected to be one dictionary word
882 i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
883 i18n::WordType::DICTIONARY_WORD, true);
884 CPPUNIT_ASSERT_MESSAGE("Should skip full word",
885 aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
888 //See https://bz.apache.org/ooo/show_bug.cgi?id=29548
889 //make sure forwards and back are consistent
//The sample is the same 21-unit sequence repeated twice
891 const sal_Unicode THAI[] =
893 0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
894 0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
895 0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27,
896 0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
897 0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
898 0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27
900 OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
//Forward pass: record every word start reported by nextWord, beginning from -1
902 std::stack<sal_Int32> aPositions;
903 sal_Int32 nPos = -1;
906 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
907 aPositions.push(nPos);
909 while (nPos < aTest.getLength());
//Drop the last recorded position (the one that ended the forward pass), then walk
//backwards with previousWord and compare each start against the stack, top first
910 nPos = aTest.getLength();
911 CPPUNIT_ASSERT(!aPositions.empty());
912 aPositions.pop();
915 CPPUNIT_ASSERT(!aPositions.empty());
916 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
917 CPPUNIT_ASSERT_EQUAL(aPositions.top(), nPos);
918 aPositions.pop();
920 while (nPos > 0);
923 // tdf#113694
//A single UTF-16 surrogate pair (0xD800 0xDC00); one step must cross both units in
//SKIPCELL mode and in SKIPCHARACTER mode
925 const sal_Unicode NON_BMP[] = { 0xD800, 0xDC00 };
926 OUString aTest(NON_BMP, SAL_N_ELEMENTS(NON_BMP));
928 sal_Int32 nDone=0;
929 sal_Int32 nPos;
931 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
932 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
933 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(SAL_N_ELEMENTS(NON_BMP)), nPos);
934 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(NON_BMP), aLocale,
935 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
936 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos);
938 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
939 i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
940 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(SAL_N_ELEMENTS(NON_BMP)), nPos);
941 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(NON_BMP), aLocale,
942 i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
943 CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos);
#ifdef TODO
// Northern Thai ("nod") word boundary check: the whole sample string is
// expected to come back as a single dictionary word.
void TestBreakIterator::testNorthernThai()
{
    lang::Locale aLocale;
    aLocale.Language = "nod";
    aLocale.Country = "TH";

    const sal_Unicode NORTHERN_THAI1[] = { 0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
    OUString aTest(NORTHERN_THAI1, SAL_N_ELEMENTS(NORTHERN_THAI1));
    const sal_Int32 nLength = aTest.getLength();
    i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
        i18n::WordType::DICTIONARY_WORD, true);
    // Separate equality checks so a failure reports the actual boundary.
    CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full word",
        sal_Int32(0), aBounds.startPos);
    CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full word",
        nLength, aBounds.endPos);
}

// Not sure if any version earlier than 49 did have Khmer word boundary
// dictionaries, 4.6 does not.

// As of icu 54, word boundary detection for Khmer is still considered
// insufficient, so icu khmer stuff is disabled

//A test to ensure that our khmer word boundary detection is useful
//https://bugs.libreoffice.org/show_bug.cgi?id=52020
void TestBreakIterator::testKhmer()
{
    lang::Locale aLocale;
    aLocale.Language = "km";
    aLocale.Country = "KH";

    const sal_Unicode KHMER[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 };

    OUString aTest(KHMER, SAL_N_ELEMENTS(KHMER));
    i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
        i18n::WordType::DICTIONARY_WORD, true);

    // First word is expected to cover code units [0, 3).
    CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
    CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);

    // Continue from the end of the first word; the second is expected at [3, 5).
    aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
        i18n::WordType::DICTIONARY_WORD, true);

    CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
    CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
}
#endif

991 void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak)
993 lang::Locale aLocale;
994 aLocale.Language = "ja";
995 aLocale.Country = "JP";
996 i18n::Boundary aBounds;
999 const sal_Unicode JAPANESE[] = { 0x30B7, 0x30E3, 0x30C3, 0x30C8, 0x30C0, 0x30A6, 0x30F3 };
1001 OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
1002 aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
1003 i18n::WordType::DICTIONARY_WORD, true);
1005 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
1009 const sal_Unicode JAPANESE[] = { 0x9EBB, 0x306E, 0x8449, 0x9EBB, 0x306E, 0x8449 };
1011 OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
1012 aBounds = xBreak->getWordBoundary(aTest, 1, aLocale,
1013 i18n::WordType::DICTIONARY_WORD, true);
1015 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
1017 aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
1018 i18n::WordType::DICTIONARY_WORD, true);
1020 CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 6);
1024 void TestBreakIterator::testJapanese()
1026 doTestJapanese(m_xBreak);
1028 // fdo#78479 - test second / cached instantiation of xdictionary
1029 uno::Reference< i18n::XBreakIterator > xTmpBreak(m_xSFactory->createInstance(
1030 "com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
1032 doTestJapanese(xTmpBreak);
1035 void TestBreakIterator::testChinese()
1037 lang::Locale aLocale;
1038 aLocale.Language = "zh";
1039 aLocale.Country = "CN";
1040 i18n::Boundary aBounds;
1043 const sal_Unicode CHINESE[] = { 0x6A35, 0x6A30, 0x69FE, 0x8919, 0xD867, 0xDEDB };
1045 OUString aTest(CHINESE, SAL_N_ELEMENTS(CHINESE));
1046 aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale,
1047 i18n::WordType::DICTIONARY_WORD, true);
1048 CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 6);
1051 void TestBreakIterator::setUp()
1053 BootstrapFixtureBase::setUp();
1054 m_xBreak.set(m_xSFactory->createInstance("com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
1057 void TestBreakIterator::tearDown()
1059 m_xBreak.clear();
1060 BootstrapFixtureBase::tearDown();
1063 CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator);
1065 CPPUNIT_PLUGIN_IMPLEMENT();
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */