Branch libreoffice-5-0-4
[LibreOffice.git] / i18npool / qa / cppunit / test_breakiterator.cxx
blobbe4dd6d13935f9682331a246d080b7590aa17a30
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 */
10 #include <cppuhelper/compbase1.hxx>
11 #include <cppuhelper/bootstrap.hxx>
12 #include <cppuhelper/basemutex.hxx>
13 #include <com/sun/star/i18n/XBreakIterator.hpp>
14 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
15 #include <com/sun/star/i18n/ScriptType.hpp>
16 #include <com/sun/star/i18n/WordType.hpp>
17 #include <unotest/bootstrapfixturebase.hxx>
19 #include <unicode/uversion.h>
21 #include <rtl/strbuf.hxx>
22 #include <rtl/ustrbuf.hxx>
24 #include <string.h>
26 #include <stack>
28 using namespace ::com::sun::star;
30 class TestBreakIterator : public test::BootstrapFixtureBase
32 public:
33 virtual void setUp() SAL_OVERRIDE;
34 virtual void tearDown() SAL_OVERRIDE;
36 void testLineBreaking();
37 void testWordBoundaries();
38 void testGraphemeIteration();
39 void testWeak();
40 void testAsian();
41 void testThai();
42 void testLao();
43 #ifdef TODO
44 void testNorthernThai();
45 #endif
46 void testKhmer();
47 void testJapanese();
48 void testChinese();
49 CPPUNIT_TEST_SUITE(TestBreakIterator);
50 CPPUNIT_TEST(testLineBreaking);
51 CPPUNIT_TEST(testGraphemeIteration);
52 CPPUNIT_TEST(testWeak);
53 CPPUNIT_TEST(testAsian);
54 CPPUNIT_TEST(testThai);
55 #ifdef TODO
56 CPPUNIT_TEST(testNorthernThai);
57 #endif
59 CPPUNIT_TEST(testWordBoundaries);
60 #if (U_ICU_VERSION_MAJOR_NUM > 4)
61 CPPUNIT_TEST(testKhmer);
62 #endif
63 #if (U_ICU_VERSION_MAJOR_NUM > 51)
64 CPPUNIT_TEST(testLao);
65 #endif
66 CPPUNIT_TEST(testJapanese);
67 CPPUNIT_TEST(testChinese);
68 CPPUNIT_TEST_SUITE_END();
69 private:
70 uno::Reference<i18n::XBreakIterator> m_xBreak;
71 void doTestJapanese(uno::Reference< i18n::XBreakIterator > &xBreak);
74 void TestBreakIterator::testLineBreaking()
76 i18n::LineBreakHyphenationOptions aHyphOptions;
77 i18n::LineBreakUserOptions aUserOptions;
78 lang::Locale aLocale;
80 //See https://bugs.libreoffice.org/show_bug.cgi?id=31271
82 OUString aTest("(some text here)");
84 aLocale.Language = "en";
85 aLocale.Country = "US";
88 //Here we want the line break to leave text here) on the next line
89 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions);
90 CPPUNIT_ASSERT_MESSAGE("Expected a break at the start of the word", aResult.breakIndex == 6);
94 //Here we want the line break to leave "here)" on the next line
95 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions);
96 CPPUNIT_ASSERT_MESSAGE("Expected a break at the start of the word", aResult.breakIndex == 11);
100 //See https://bugs.libreoffice.org/show_bug.cgi?id=49849
102 const sal_Unicode HEBREW1[] = { 0x05DE, 0x05D9, 0x05DC, 0x05D9, 0x5DD };
103 OUString aWord(HEBREW1, SAL_N_ELEMENTS(HEBREW1));
104 OUString aTest(OUStringBuffer(aWord).append(' ').append(aWord).makeStringAndClear());
106 aLocale.Language = "he";
107 aLocale.Country = "IL";
110 //Here we want the line break to happen at the whitespace
111 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions);
112 CPPUNIT_ASSERT_MESSAGE("Expected a break at the start of the word", aResult.breakIndex == aWord.getLength()+1);
116 //See https://bz.apache.org/ooo/show_bug.cgi?id=17155
118 OUString aTest("foo /bar/baz");
120 aLocale.Language = "en";
121 aLocale.Country = "US";
124 //Here we want the line break to leave /bar/ba clumped together on the next line
125 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("foo /bar/ba"), aLocale, 0,
126 aHyphOptions, aUserOptions);
127 CPPUNIT_ASSERT_MESSAGE("Expected a break at the first slash", aResult.breakIndex == 4);
131 //See https://bz.apache.org/ooo/show_bug.cgi?id=19716
133 OUString aTest("aaa]aaa");
135 aLocale.Language = "en";
136 aLocale.Country = "US";
139 //Here we want the line break to move the whole lot to the next line
140 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
141 aHyphOptions, aUserOptions);
142 CPPUNIT_ASSERT_MESSAGE("Expected a break at the start of the line, not at ]", aResult.breakIndex == 0);
147 //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
148 void TestBreakIterator::testWordBoundaries()
150 lang::Locale aLocale;
151 aLocale.Language = "en";
152 aLocale.Country = "US";
154 i18n::Boundary aBounds;
156 //See https://bz.apache.org/ooo/show_bug.cgi?id=11993
158 OUString aTest("abcd ef ghi??? KLM");
160 CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
161 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
162 aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
163 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
165 CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
166 CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
168 //next word
169 aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, true);
170 CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
172 //previous word
173 aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
174 CPPUNIT_ASSERT(aBounds.startPos == 5 && aBounds.endPos == 7);
176 CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
177 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
178 aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, true);
179 CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
181 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
182 CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
183 aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD, true);
184 CPPUNIT_ASSERT(aBounds.startPos == 16 && aBounds.endPos == 19);
187 //See https://bz.apache.org/ooo/show_bug.cgi?id=21907
189 OUString aTest("b a?");
191 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
192 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
193 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
195 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
197 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
198 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
199 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
201 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
204 //See https://bz.apache.org/ooo/show_bug.cgi?id=14904
206 const sal_Unicode TEST[] =
208 'W', 'o', 'r', 'k', 'i', 'n', 'g', ' ', 0x201C, 'W', 'o', 'r', 'd', 's',
209 ' ', 's', 't', 'a', 'r', 't', 'i', 'n', 'g', ' ', 'w', 'i', 't',
210 'h', ' ', 'q', 'u', 'o', 't', 'e', 's', 0x201D, ' ', 'W', 'o', 'r', 'k',
211 'i', 'n', 'g', ' ', 0x2018, 'B', 'r', 'o', 'k', 'e', 'n', 0x2019, ' ',
212 '?', 'S', 'p', 'a', 'n', 'i', 's', 'h', '?', ' ', 'd', 'o', 'e',
213 's', 'n', 0x2019, 't', ' ', 'w', 'o', 'r', 'k', '.', ' ', 'N', 'o',
214 't', ' ', 'e', 'v', 'e', 'n', ' ' , 0x00BF, 'r', 'e', 'a', 'l', '?', ' ',
215 'S', 'p', 'a', 'n', 'i', 's', 'h'
217 OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
219 aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, false);
220 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
222 aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, false);
223 CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 14);
225 aBounds = m_xBreak->getWordBoundary(aTest, 40, aLocale, i18n::WordType::DICTIONARY_WORD, false);
226 CPPUNIT_ASSERT(aBounds.startPos == 37 && aBounds.endPos == 44);
228 aBounds = m_xBreak->getWordBoundary(aTest, 49, aLocale, i18n::WordType::DICTIONARY_WORD, false);
229 CPPUNIT_ASSERT(aBounds.startPos == 46 && aBounds.endPos == 52);
231 aBounds = m_xBreak->getWordBoundary(aTest, 58, aLocale, i18n::WordType::DICTIONARY_WORD, false);
232 CPPUNIT_ASSERT(aBounds.startPos == 55 && aBounds.endPos == 62);
234 aBounds = m_xBreak->getWordBoundary(aTest, 67, aLocale, i18n::WordType::DICTIONARY_WORD, false);
235 CPPUNIT_ASSERT(aBounds.startPos == 64 && aBounds.endPos == 71);
237 aBounds = m_xBreak->getWordBoundary(aTest, 90, aLocale, i18n::WordType::DICTIONARY_WORD, false);
238 CPPUNIT_ASSERT(aBounds.startPos == 88 && aBounds.endPos == 92);
241 //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
242 sal_Unicode aBreakTests[] = { ' ', 1, 2, 3, 4, 5, 6, 7, 0x91, 0x92, 0x200B, 0xE8FF, 0xF8FF };
243 for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
245 //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
246 for (size_t i = 0; i < SAL_N_ELEMENTS(aBreakTests); ++i)
248 #if (U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM <= 2)
249 //Note the breakiterator test is known to fail on older icu
250 //versions (4.2.1) for the 200B (ZWSP) Zero Width Space testcase.
251 if (aBreakTests[i] == 0x200B)
252 continue;
253 #endif
254 OUString aTest = "Word" + OUString(aBreakTests[i]) + "Word";
255 aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
256 switch (mode)
258 case i18n::WordType::ANY_WORD:
259 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
260 break;
261 case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
262 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
263 break;
264 case i18n::WordType::DICTIONARY_WORD:
265 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
266 break;
267 case i18n::WordType::WORD_COUNT:
268 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
269 break;
272 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
273 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
277 sal_Unicode aJoinTests[] = { 'X', 0x200C, 0x200D, 0x2060, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB };
278 for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
280 //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
281 for (size_t i = 0; i < SAL_N_ELEMENTS(aJoinTests); ++i)
283 OUString aTest = "Word" + OUString(aJoinTests[i]) + "Word";
284 aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
285 switch (mode)
287 case i18n::WordType::ANY_WORD:
288 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
289 break;
290 case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
291 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
292 break;
293 case i18n::WordType::DICTIONARY_WORD:
294 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
295 break;
296 case i18n::WordType::WORD_COUNT:
297 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
298 break;
301 CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
302 CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
306 //See https://bz.apache.org/ooo/show_bug.cgi?id=13494
308 const OUString aBase("xxAAxxBBxxCCxx");
309 const sal_Unicode aTests[] =
311 '\'', ';', ',', '.', '!', '@', '#', '%', '&', '*',
312 '(', ')', '_', '-', '{', '}', '[', ']', '\"', '/',
313 '\\', '?', '~', '$', '+', '^', '=', '<', '>', '|'
316 const sal_Int32 aDoublePositions[] = {0, 2, 4, 6, 8, 10, 12, 14};
317 for (size_t j = 0; j < SAL_N_ELEMENTS(aTests); ++j)
319 OUString aTest = aBase.replace('x', aTests[j]);
320 sal_Int32 nPos = -1;
321 size_t i = 0;
324 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aDoublePositions));
325 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
326 CPPUNIT_ASSERT(nPos == aDoublePositions[i++]);
328 while (nPos < aTest.getLength());
329 nPos = aTest.getLength();
330 i = SAL_N_ELEMENTS(aDoublePositions)-1;
333 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
334 CPPUNIT_ASSERT(nPos == aDoublePositions[--i]);
336 while (nPos > 0);
339 const sal_Int32 aSinglePositions[] = {0, 1, 3, 4, 6, 7, 9, 10};
340 for (size_t j = 1; j < SAL_N_ELEMENTS(aTests); ++j)
342 OUString aTest = aBase.replaceAll(OUString("xx"), OUString(aTests[j]));
343 sal_Int32 nPos = -1;
344 size_t i = 0;
347 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSinglePositions));
348 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
349 CPPUNIT_ASSERT(nPos == aSinglePositions[i++]);
351 while (nPos < aTest.getLength());
352 nPos = aTest.getLength();
353 i = SAL_N_ELEMENTS(aSinglePositions)-1;
356 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
357 CPPUNIT_ASSERT(nPos == aSinglePositions[--i]);
359 while (nPos > 0);
362 const sal_Int32 aSingleQuotePositions[] = {0, 1, 9, 10};
363 CPPUNIT_ASSERT(aTests[0] == '\'');
365 OUString aTest = aBase.replaceAll(OUString("xx"), OUString(aTests[0]));
366 sal_Int32 nPos = -1;
367 size_t i = 0;
370 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSingleQuotePositions));
371 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
372 CPPUNIT_ASSERT(nPos == aSingleQuotePositions[i++]);
374 while (nPos < aTest.getLength());
375 nPos = aTest.getLength();
376 i = SAL_N_ELEMENTS(aSingleQuotePositions)-1;
379 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
380 CPPUNIT_ASSERT(nPos == aSingleQuotePositions[--i]);
382 while (nPos > 0);
386 //See https://bz.apache.org/ooo/show_bug.cgi?id=13451
388 aLocale.Language = "ca";
389 aLocale.Country = "ES";
391 OUString aTest("mirar-se comprar-vos donem-nos les mans aneu-vos-en!");
393 sal_Int32 nPos = 0;
394 sal_Int32 aExpected[] = {8, 20, 30, 34, 39, 51, 52};
395 size_t i = 0;
398 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
399 nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
400 i18n::WordType::DICTIONARY_WORD, true).endPos;
401 CPPUNIT_ASSERT(aExpected[i++] == nPos);
403 while (nPos++ < aTest.getLength());
404 CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
407 //See https://bz.apache.org/ooo/show_bug.cgi?id=85411
408 for (int j = 0; j < 3; ++j)
410 switch (j)
412 case 0:
413 aLocale.Language = "en";
414 aLocale.Country = "US";
415 break;
416 case 1:
417 aLocale.Language = "ca";
418 aLocale.Country = "ES";
419 break;
420 case 2:
421 aLocale.Language = "fi";
422 aLocale.Country = "FI";
423 break;
424 default:
425 CPPUNIT_ASSERT(false);
426 break;
429 const sal_Unicode TEST[] =
431 'I', 0x200B, 'w', 'a', 'n', 't', 0x200B, 't', 'o', 0x200B, 'g', 'o'
433 OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
435 sal_Int32 nPos = 0;
436 sal_Int32 aExpected[] = {1, 6, 9, 12};
437 size_t i = 0;
440 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
441 nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
442 i18n::WordType::DICTIONARY_WORD, true).endPos;
443 CPPUNIT_ASSERT(aExpected[i++] == nPos);
445 while (nPos++ < aTest.getLength());
446 CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
449 //https://bz.apache.org/ooo/show_bug.cgi?id=21290
450 for (int j = 0; j < 2; ++j)
452 switch (j)
454 case 0:
455 aLocale.Language = "en";
456 aLocale.Country = "US";
457 break;
458 case 1:
459 aLocale.Language = "grc";
460 aLocale.Country.clear();
461 break;
462 default:
463 CPPUNIT_ASSERT(false);
464 break;
467 const sal_Unicode TEST[] =
469 0x1F0C, 0x03BD, 0x03B4, 0x03C1, 0x03B1, 0x0020, 0x1F00,
470 0x03C1, 0x03BD, 0x1F7B, 0x03BC, 0x03B5, 0x03BD, 0x03BF,
471 0x03C2, 0x0020, 0x1F00, 0x03BB, 0x03BB, 0x0020, 0x1F24,
472 0x03C3, 0x03B8, 0x03B9, 0x03BF, 0x03BD
474 OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
476 sal_Int32 nPos = 0;
477 sal_Int32 aExpected[] = {5, 15, 19, 26};
478 size_t i = 0;
481 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
482 nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
483 i18n::WordType::DICTIONARY_WORD, true).endPos;
484 CPPUNIT_ASSERT(aExpected[i++] == nPos);
486 while (nPos++ < aTest.getLength());
487 CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
490 //See https://bz.apache.org/ooo/show_bug.cgi?id=58513
491 //See https://bugs.libreoffice.org/show_bug.cgi?id=55707
493 aLocale.Language = "fi";
494 aLocale.Country = "FI";
496 OUString aTest("Kuorma-auto kaakkois- ja Keski-Suomi USA:n 90:n %:n");
499 sal_Int32 nPos = 0;
500 sal_Int32 aExpected[] = {11, 21, 24, 36, 42, 47, 51};
501 size_t i = 0;
504 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
505 nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
506 i18n::WordType::WORD_COUNT, true).endPos;
507 CPPUNIT_ASSERT(aExpected[i++] == nPos);
509 while (nPos++ < aTest.getLength());
510 CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
514 sal_Int32 nPos = 0;
515 sal_Int32 aExpected[] = {0, 11, 12, 20, 22, 24, 25, 36, 37,
516 40, 41, 42, 43, 45, 46, 47, 50, 51};
517 size_t i = 0;
520 CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
521 aBounds = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
522 i18n::WordType::DICTIONARY_WORD, true);
523 CPPUNIT_ASSERT(aExpected[i++] == aBounds.startPos);
524 CPPUNIT_ASSERT(aExpected[i++] == aBounds.endPos);
525 nPos = aBounds.endPos;
527 while (nPos++ < aTest.getLength());
528 CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
532 //See https://bz.apache.org/ooo/show_bug.cgi?id=107843
534 aLocale.Language = "en";
535 aLocale.Country = "US";
537 const sal_Unicode TEST[] =
539 'r', 'u', 0xFB00, 'l', 'e', ' ', 0xFB01, 's', 'h'
541 OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
543 aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
544 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 5);
546 aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, i18n::WordType::DICTIONARY_WORD, false);
547 CPPUNIT_ASSERT(aBounds.startPos == 6 && aBounds.endPos == 9);
550 //See https://bz.apache.org/ooo/show_bug.cgi?id=113785
552 aLocale.Language = "en";
553 aLocale.Country = "US";
555 const sal_Unicode TEST[] =
557 'a', 0x2013, 'b', 0x2014, 'c'
559 OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
561 aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
562 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 1);
564 aBounds = m_xBreak->nextWord(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD);
565 CPPUNIT_ASSERT(aBounds.startPos == 2 && aBounds.endPos == 3);
567 aBounds = m_xBreak->nextWord(aTest, aBounds.endPos, aLocale, i18n::WordType::DICTIONARY_WORD);
568 CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 5);
572 //See https://bugs.libreoffice.org/show_bug.cgi?id=40292
573 //See https://bz.apache.org/ooo/show_bug.cgi?id=80412
574 //See https://bz.apache.org/ooo/show_bug.cgi?id=111152
575 //See https://bz.apache.org/ooo/show_bug.cgi?id=50172
576 void TestBreakIterator::testGraphemeIteration()
578 lang::Locale aLocale;
579 aLocale.Language = "bn";
580 aLocale.Country = "IN";
583 const sal_Unicode BA_HALANT_LA[] = { 0x09AC, 0x09CD, 0x09AF };
584 OUString aTest(BA_HALANT_LA, SAL_N_ELEMENTS(BA_HALANT_LA));
586 sal_Int32 nDone=0;
587 sal_Int32 nPos;
588 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
589 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
590 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(BA_HALANT_LA));
591 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(BA_HALANT_LA), aLocale,
592 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
593 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
597 const sal_Unicode HA_HALANT_NA_VOWELSIGNI[] = { 0x09B9, 0x09CD, 0x09A3, 0x09BF };
598 OUString aTest(HA_HALANT_NA_VOWELSIGNI, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
600 sal_Int32 nDone=0;
601 sal_Int32 nPos;
602 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
603 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
604 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
605 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI), aLocale,
606 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
607 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
611 const sal_Unicode TA_HALANT_MA_HALANT_YA [] = { 0x09A4, 0x09CD, 0x09AE, 0x09CD, 0x09AF };
612 OUString aTest(TA_HALANT_MA_HALANT_YA, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
614 sal_Int32 nDone=0;
615 sal_Int32 nPos;
616 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
617 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
618 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
619 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA), aLocale,
620 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
621 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
624 aLocale.Language = "ta";
625 aLocale.Country = "IN";
628 const sal_Unicode KA_VIRAMA_SSA[] = { 0x0B95, 0x0BCD, 0x0BB7 };
629 OUString aTest(KA_VIRAMA_SSA, SAL_N_ELEMENTS(KA_VIRAMA_SSA));
631 sal_Int32 nDone=0;
632 sal_Int32 nPos = 0;
634 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
635 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
636 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(KA_VIRAMA_SSA));
637 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VIRAMA_SSA), aLocale,
638 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
639 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
643 const sal_Unicode KA_VOWELSIGNU[] = { 0x0B95, 0x0BC1 };
644 OUString aTest(KA_VOWELSIGNU, SAL_N_ELEMENTS(KA_VOWELSIGNU));
646 sal_Int32 nDone=0;
647 sal_Int32 nPos = 0;
649 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
650 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
651 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(KA_VOWELSIGNU));
652 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VOWELSIGNU), aLocale,
653 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
654 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
658 const sal_Unicode CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI[] =
659 { 0x0B9A, 0x0BBF, 0x0BA4, 0x0BCD, 0x0BA4, 0x0BBF, 0x0BB0, 0x0BC8 };
660 OUString aTest(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI,
661 SAL_N_ELEMENTS(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI));
663 sal_Int32 nDone=0;
664 sal_Int32 nPos=0;
666 for (sal_Int32 i = 0; i < 4; ++i)
668 sal_Int32 nOldPos = nPos;
669 nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale,
670 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
671 CPPUNIT_ASSERT_MESSAGE("Should skip 2 units", nPos == nOldPos+2);
674 for (sal_Int32 i = 0; i < 4; ++i)
676 sal_Int32 nOldPos = nPos;
677 nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale,
678 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
679 CPPUNIT_ASSERT_MESSAGE("Should skip 2 units", nPos == nOldPos-2);
684 const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 };
685 OUString aText(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS));
687 sal_Int32 nGraphemeCount = 0;
689 sal_Int32 nCurPos = 0;
690 while (nCurPos < aText.getLength())
692 sal_Int32 nCount2 = 1;
693 nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(),
694 i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
695 ++nGraphemeCount;
698 CPPUNIT_ASSERT_MESSAGE("Should be considered 1 grapheme", nGraphemeCount == 1);
701 aLocale.Language = "hi";
702 aLocale.Country = "IN";
705 const sal_Unicode SHA_VOWELSIGNII[] = { 0x936, 0x940 };
706 OUString aTest(SHA_VOWELSIGNII, SAL_N_ELEMENTS(SHA_VOWELSIGNII));
708 sal_Int32 nDone=0;
709 sal_Int32 nPos = 0;
711 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
712 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
713 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(SHA_VOWELSIGNII));
714 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(SHA_VOWELSIGNII), aLocale,
715 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
716 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
720 //A test to ensure that certain ranges and codepoints that are categorized as
721 //weak remain as weak, so that existing docs that depend on this don't silently
722 //change font for those weak chars
723 void TestBreakIterator::testWeak()
725 lang::Locale aLocale;
726 aLocale.Language = "en";
727 aLocale.Country = "US";
730 const sal_Unicode WEAKS[] =
732 0x0001, 0x0002,
733 0x0020, 0x00A0,
734 0x2150, 0x215F, //Number Forms, fractions
735 0x2160, 0x2180, //Number Forms, roman numerals
736 0x2200, 0x22FF, //Mathematical Operators
737 0x27C0, 0x27EF, //Miscellaneous Mathematical Symbols-A
738 0x2980, 0x29FF, //Miscellaneous Mathematical Symbols-B
739 0x2A00, 0x2AFF, //Supplemental Mathematical Operators
740 0x2100, 0x214F, //Letterlike Symbols
741 0x2308, 0x230B, //Miscellaneous technical
742 0x25A0, 0x25FF, //Geometric Shapes
743 0x2B30, 0x2B4C //Miscellaneous Symbols and Arrows
745 OUString aWeaks(WEAKS, SAL_N_ELEMENTS(WEAKS));
747 for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i)
749 sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i);
750 OStringBuffer aMsg;
751 aMsg.append("Char 0x");
752 aMsg.append(static_cast<sal_Int32>(aWeaks[i]), 16);
753 aMsg.append(" should have been weak");
754 CPPUNIT_ASSERT_MESSAGE(aMsg.getStr(),
755 nScript == i18n::ScriptType::WEAK);
760 //A test to ensure that certain ranges and codepoints that are categorized as
761 //asian remain as asian, so that existing docs that depend on this don't silently
762 //change font for those asian chars.
763 //See https://bugs.libreoffice.org/show_bug.cgi?id=38095
764 void TestBreakIterator::testAsian()
766 lang::Locale aLocale;
767 aLocale.Language = "en";
768 aLocale.Country = "US";
771 const sal_Unicode ASIANS[] =
773 //some typical CJK chars
774 0x4E00, 0x62FF,
775 //The full HalfWidth and FullWidth block has historically been
776 //designated as taking the CJK font :-(
777 //HalfWidth and FullWidth forms of ASCII 0-9, categorized under
778 //UAX24 as "Common" i.e. by that logic WEAK
779 0xFF10, 0xFF19,
780 //HalfWidth and FullWidth forms of ASCII A-z, categorized under
781 //UAX25 as "Latin", i.e. by that logic LATIN
782 0xFF21, 0xFF5A
784 OUString aAsians(ASIANS, SAL_N_ELEMENTS(ASIANS));
786 for (sal_Int32 i = 0; i < aAsians.getLength(); ++i)
788 sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i);
789 OStringBuffer aMsg;
790 aMsg.append("Char 0x");
791 aMsg.append(static_cast<sal_Int32>(aAsians[i]), 16);
792 aMsg.append(" should have been asian");
793 CPPUNIT_ASSERT_MESSAGE(aMsg.getStr(),
794 nScript == i18n::ScriptType::ASIAN);
799 //A test to ensure that our Lao word boundary detection is useful
800 void TestBreakIterator::testLao()
802 lang::Locale aLocale;
803 aLocale.Language = "lo";
804 aLocale.Country = "LA";
806 const sal_Unicode LAO[] = { 0x0e8d, 0x0eb4, 0x0e99, 0x0e94, 0x0eb5, 0x0e95, 0x0ec9, 0x0ead, 0x0e99, 0x0eae, 0x0eb1, 0x0e9a };
807 OUString aTest(LAO, SAL_N_ELEMENTS(LAO));
808 i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
809 i18n::WordType::DICTIONARY_WORD, true);
811 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 5);
813 aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
814 i18n::WordType::DICTIONARY_WORD, true);
816 CPPUNIT_ASSERT(aBounds.startPos == 5 && aBounds.endPos == 9);
820 //A test to ensure that our thai word boundary detection is useful
821 void TestBreakIterator::testThai()
823 lang::Locale aLocale;
824 aLocale.Language = "th";
825 aLocale.Country = "TH";
827 //See http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html
829 const sal_Unicode THAI[] = { 0x0E01, 0x0E38, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
830 OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
831 i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
832 i18n::WordType::DICTIONARY_WORD, true);
833 CPPUNIT_ASSERT_MESSAGE("Should skip full word",
834 aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
837 //See https://bz.apache.org/ooo/show_bug.cgi?id=29548
838 //make sure forwards and back are consistent
840 const sal_Unicode THAI[] =
842 0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
843 0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
844 0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27,
845 0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
846 0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
847 0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27
849 OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
851 std::stack<sal_Int32> aPositions;
852 sal_Int32 nPos = -1;
855 nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
856 aPositions.push(nPos);
858 while (nPos < aTest.getLength());
859 nPos = aTest.getLength();
860 CPPUNIT_ASSERT(!aPositions.empty());
861 aPositions.pop();
864 CPPUNIT_ASSERT(!aPositions.empty());
865 nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
866 CPPUNIT_ASSERT(nPos == aPositions.top());
867 aPositions.pop();
869 while (nPos > 0);
#ifdef TODO
// Only compiled when TODO is defined. Expects the whole Northern Thai
// ("nod") sample to be reported as a single dictionary word.
void TestBreakIterator::testNorthernThai()
{
    lang::Locale aLocale;
    aLocale.Language = "nod";
    aLocale.Country = "TH";

    const sal_Unicode NORTHERN_THAI1[] = { 0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
    OUString aTest(NORTHERN_THAI1, SAL_N_ELEMENTS(NORTHERN_THAI1));
    i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
        i18n::WordType::DICTIONARY_WORD, true);
    CPPUNIT_ASSERT_MESSAGE("Should skip full word",
        aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
}
#endif
#if (U_ICU_VERSION_MAJOR_NUM > 4)
// Not sure if any version earlier than 49 did have Khmer word boundary
// dictionaries, 4.6 does not.

//A test to ensure that our khmer word boundary detection is useful
//https://bugs.libreoffice.org/show_bug.cgi?id=52020
//
//Asks for the dictionary word at offset 0 and then for the word starting
//where the first one ended; expects [0,3) followed by [3,5).
void TestBreakIterator::testKhmer()
{
    lang::Locale aLocale;
    aLocale.Language = "km";
    aLocale.Country = "KH";

    const sal_Unicode KHMER[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 };

    OUString aTest(KHMER, SAL_N_ELEMENTS(KHMER));
    i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
        i18n::WordType::DICTIONARY_WORD, true);

    CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);

    aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
        i18n::WordType::DICTIONARY_WORD, true);

    CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 5);
}
#endif
916 void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > &xBreak)
918 lang::Locale aLocale;
919 aLocale.Language = "ja";
920 aLocale.Country = "JP";
921 i18n::Boundary aBounds;
924 const sal_Unicode JAPANESE[] = { 0x30B7, 0x30E3, 0x30C3, 0x30C8, 0x30C0, 0x30A6, 0x30F3 };
926 OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
927 aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
928 i18n::WordType::DICTIONARY_WORD, true);
930 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
934 const sal_Unicode JAPANESE[] = { 0x9EBB, 0x306E, 0x8449, 0x9EBB, 0x306E, 0x8449 };
936 OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
937 aBounds = xBreak->getWordBoundary(aTest, 1, aLocale,
938 i18n::WordType::DICTIONARY_WORD, true);
940 CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
942 aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
943 i18n::WordType::DICTIONARY_WORD, true);
945 CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 6);
949 void TestBreakIterator::testJapanese()
951 doTestJapanese(m_xBreak);
953 // fdo#78479 - test second / cached instantiation of xdictionary
954 uno::Reference< i18n::XBreakIterator > xTmpBreak(m_xSFactory->createInstance(
955 "com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
957 doTestJapanese(xTmpBreak);
960 void TestBreakIterator::testChinese()
962 lang::Locale aLocale;
963 aLocale.Language = "zh";
964 aLocale.Country = "CN";
965 i18n::Boundary aBounds;
968 const sal_Unicode CHINESE[] = { 0x6A35, 0x6A30, 0x69FE, 0x8919, 0xD867, 0xDEDB };
970 OUString aTest(CHINESE, SAL_N_ELEMENTS(CHINESE));
971 aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale,
972 i18n::WordType::DICTIONARY_WORD, true);
973 CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 6);
976 void TestBreakIterator::setUp()
978 BootstrapFixtureBase::setUp();
979 m_xBreak = uno::Reference< i18n::XBreakIterator >(m_xSFactory->createInstance(
980 "com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
983 void TestBreakIterator::tearDown()
985 m_xBreak.clear();
986 BootstrapFixtureBase::tearDown();
989 CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator);
991 CPPUNIT_PLUGIN_IMPLEMENT();
993 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */