Version 3.6.0.2, tag libreoffice-3.6.0.2
[LibreOffice.git] / i18npool / qa / cppunit / test_breakiterator.cxx
blob3a64c31025e4f824d0464145fd04d85c115ba240
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * Version: MPL 1.1 / GPLv3+ / LGPLv3+
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
13 * License.
15 * The Initial Developer of the Original Code is
16 * Caolán McNamara <caolanm@redhat.com>
18 * Contributor(s):
19 * Caolán McNamara <caolanm@redhat.com>
21 * Alternatively, the contents of this file may be used under the terms of
22 * either the GNU General Public License Version 3 or later (the "GPLv3+"), or
23 * the GNU Lesser General Public License Version 3 or later (the "LGPLv3+"),
24 * in which case the provisions of the GPLv3+ or the LGPLv3+ are applicable
25 * instead of those above.
28 #include "sal/config.h"
29 #include "sal/precppunit.hxx"
31 #ifdef IOS
32 #define CPPUNIT_PLUGIN_EXPORTED_NAME cppunitTest_i18npool_breakiterator
33 #endif
35 #include <cppuhelper/compbase1.hxx>
36 #include <cppuhelper/bootstrap.hxx>
37 #include <cppuhelper/basemutex.hxx>
38 #include <com/sun/star/i18n/XBreakIterator.hpp>
39 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
40 #include <com/sun/star/i18n/ScriptType.hpp>
41 #include <com/sun/star/i18n/WordType.hpp>
42 #include <unotest/bootstrapfixturebase.hxx>
44 #include <unicode/uvernum.h>
46 #include <rtl/strbuf.hxx>
47 #include <rtl/ustrbuf.hxx>
49 #include <string.h>
51 using namespace ::com::sun::star;
53 class TestBreakIterator : public test::BootstrapFixtureBase
55 public:
56 virtual void setUp();
57 virtual void tearDown();
59 void testLineBreaking();
60 void testGraphemeIteration();
61 void testWeak();
62 void testAsian();
63 void testThai();
64 #if TODO
65 void testNorthernThai();
66 #endif
67 #if (U_ICU_VERSION_MAJOR_NUM > 4)
68 void testKhmer();
69 #endif
71 CPPUNIT_TEST_SUITE(TestBreakIterator);
72 CPPUNIT_TEST(testLineBreaking);
73 CPPUNIT_TEST(testGraphemeIteration);
74 CPPUNIT_TEST(testWeak);
75 CPPUNIT_TEST(testAsian);
76 CPPUNIT_TEST(testThai);
77 #if TODO
78 CPPUNIT_TEST(testNorthernThai);
79 #endif
80 #if (U_ICU_VERSION_MAJOR_NUM > 4)
81 CPPUNIT_TEST(testKhmer);
82 #endif
83 CPPUNIT_TEST_SUITE_END();
84 private:
85 uno::Reference<i18n::XBreakIterator> m_xBreak;
88 void TestBreakIterator::testLineBreaking()
90 i18n::LineBreakHyphenationOptions aHyphOptions;
91 i18n::LineBreakUserOptions aUserOptions;
92 lang::Locale aLocale;
94 //See https://bugs.freedesktop.org/show_bug.cgi?id=31271
96 ::rtl::OUString aTest(RTL_CONSTASCII_USTRINGPARAM("(some text here)"));
98 aLocale.Language = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("en"));
99 aLocale.Country = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("US"));
102 //Here we want the line break to leave text here) on the next line
103 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions);
104 CPPUNIT_ASSERT_MESSAGE("Expected a break at the the start of the word", aResult.breakIndex == 6);
108 //Here we want the line break to leave "here)" on the next line
109 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions);
110 CPPUNIT_ASSERT_MESSAGE("Expected a break at the the start of the word", aResult.breakIndex == 11);
114 //See https://bugs.freedesktop.org/show_bug.cgi?id=49849
116 const sal_Unicode HEBREW1[] = { 0x05DE, 0x05D9, 0x05DC, 0x05D9, 0x5DD };
117 ::rtl::OUString aWord(HEBREW1, SAL_N_ELEMENTS(HEBREW1));
118 ::rtl::OUString aTest(rtl::OUStringBuffer(aWord).append(' ').append(aWord).makeStringAndClear());
120 aLocale.Language = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("he"));
121 aLocale.Country = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("IL"));
124 //Here we want the line break to happen at the whitespace
125 i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions);
126 CPPUNIT_ASSERT_MESSAGE("Expected a break at the the start of the word", aResult.breakIndex == aWord.getLength()+1);
131 //See http://qa.openoffice.org/issues/show_bug.cgi?id=111152
132 //See https://bugs.freedesktop.org/show_bug.cgi?id=40292
133 void TestBreakIterator::testGraphemeIteration()
135 lang::Locale aLocale;
136 aLocale.Language = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bn"));
137 aLocale.Country = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("IN"));
140 const sal_Unicode BA_HALANT_LA[] = { 0x09AC, 0x09CD, 0x09AF };
141 ::rtl::OUString aTest(BA_HALANT_LA, SAL_N_ELEMENTS(BA_HALANT_LA));
143 sal_Int32 nDone=0;
144 sal_Int32 nPos;
145 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
146 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
147 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(BA_HALANT_LA));
148 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(BA_HALANT_LA), aLocale,
149 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
150 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
154 const sal_Unicode HA_HALANT_NA_VOWELSIGNI[] = { 0x09B9, 0x09CD, 0x09A3, 0x09BF };
155 ::rtl::OUString aTest(HA_HALANT_NA_VOWELSIGNI, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
157 sal_Int32 nDone=0;
158 sal_Int32 nPos;
159 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
160 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
161 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
162 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI), aLocale,
163 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
164 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
168 const sal_Unicode TA_HALANT_MA_HALANT_YA [] = { 0x09A4, 0x09CD, 0x09AE, 0x09CD, 0x09AF };
169 ::rtl::OUString aTest(TA_HALANT_MA_HALANT_YA, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
171 sal_Int32 nDone=0;
172 sal_Int32 nPos;
173 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
174 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
175 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
176 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA), aLocale,
177 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
178 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
181 aLocale.Language = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("ta"));
182 aLocale.Country = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("IN"));
185 const sal_Unicode KA_VIRAMA_SSA[] = { 0x0B95, 0x0BCD, 0x0BB7 };
186 ::rtl::OUString aTest(KA_VIRAMA_SSA, SAL_N_ELEMENTS(KA_VIRAMA_SSA));
188 sal_Int32 nDone=0;
189 sal_Int32 nPos = 0;
191 nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
192 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
193 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(KA_VIRAMA_SSA));
194 nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VIRAMA_SSA), aLocale,
195 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
196 CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
200 const sal_Unicode CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI[] =
201 { 0x0B9A, 0x0BBF, 0x0BA4, 0x0BCD, 0x0BA4, 0x0BBF, 0x0BB0, 0x0BC8 };
202 ::rtl::OUString aTest(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI,
203 SAL_N_ELEMENTS(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI));
205 sal_Int32 nDone=0;
206 sal_Int32 nPos=0;
208 for (sal_Int32 i = 0; i < 4; ++i)
210 sal_Int32 nOldPos = nPos;
211 nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale,
212 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
213 CPPUNIT_ASSERT_MESSAGE("Should skip 2 units", nPos == nOldPos+2);
216 for (sal_Int32 i = 0; i < 4; ++i)
218 sal_Int32 nOldPos = nPos;
219 nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale,
220 i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
221 CPPUNIT_ASSERT_MESSAGE("Should skip 2 units", nPos == nOldPos-2);
226 const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 };
227 ::rtl::OUString aText(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS));
229 sal_Int32 nGraphemeCount = 0;
231 sal_Int32 nCurPos = 0;
232 while (nCurPos < aText.getLength())
234 sal_Int32 nCount2 = 1;
235 nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(),
236 i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
237 ++nGraphemeCount;
240 CPPUNIT_ASSERT_MESSAGE("Should be considered 1 grapheme", nGraphemeCount == 1);
244 //A test to ensure that certain ranges and codepoints that are categorized as
245 //weak remain as weak, so that existing docs that depend on this don't silently
246 //change font for those weak chars
247 void TestBreakIterator::testWeak()
249 lang::Locale aLocale;
250 aLocale.Language = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("en"));
251 aLocale.Country = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("US"));
254 const sal_Unicode WEAKS[] =
256 0x0001, 0x0002,
257 0x0020, 0x00A0,
258 0x2150, 0x215F, //Number Forms, fractions
259 0x2160, 0x2180, //Number Forms, roman numerals
260 0x2200, 0x22FF, //Mathematical Operators
261 0x27C0, 0x27EF, //Miscellaneous Mathematical Symbols-A
262 0x2980, 0x29FF, //Miscellaneous Mathematical Symbols-B
263 0x2A00, 0x2AFF, //Supplemental Mathematical Operators
264 0x2100, 0x214F, //Letterlike Symbols
265 0x2308, 0x230B, //Miscellaneous technical
266 0x25A0, 0x25FF, //Geometric Shapes
267 0x2B30, 0x2B4C //Miscellaneous Symbols and Arrows
269 ::rtl::OUString aWeaks(WEAKS, SAL_N_ELEMENTS(WEAKS));
271 for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i)
273 sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i);
274 rtl::OStringBuffer aMsg;
275 aMsg.append(RTL_CONSTASCII_STRINGPARAM("Char 0x"));
276 aMsg.append(static_cast<sal_Int32>(aWeaks.getStr()[i]), 16);
277 aMsg.append(RTL_CONSTASCII_STRINGPARAM(" should have been weak"));
278 CPPUNIT_ASSERT_MESSAGE(aMsg.getStr(),
279 nScript == i18n::ScriptType::WEAK);
284 //A test to ensure that certain ranges and codepoints that are categorized as
285 //asian remain as asian, so that existing docs that depend on this don't silently
286 //change font for those asian chars.
287 //See https://bugs.freedesktop.org/show_bug.cgi?id=38095
288 void TestBreakIterator::testAsian()
290 lang::Locale aLocale;
291 aLocale.Language = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("en"));
292 aLocale.Country = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("US"));
295 const sal_Unicode ASIANS[] =
297 //some typical CJK chars
298 0x4E00, 0x62FF,
299 //The full HalfWidth and FullWidth block has historically been
300 //designated as taking the CJK font :-(
301 //HalfWidth and FullWidth forms of ASCII 0-9, categorized under
302 //UAX24 as "Common" i.e. by that logic WEAK
303 0xFF10, 0xFF19,
304 //HalfWidth and FullWidth forms of ASCII A-z, categorized under
305 //UAX25 as "Latin", i.e. by that logic LATIN
306 0xFF21, 0xFF5A
308 ::rtl::OUString aAsians(ASIANS, SAL_N_ELEMENTS(ASIANS));
310 for (sal_Int32 i = 0; i < aAsians.getLength(); ++i)
312 sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i);
313 rtl::OStringBuffer aMsg;
314 aMsg.append(RTL_CONSTASCII_STRINGPARAM("Char 0x"));
315 aMsg.append(static_cast<sal_Int32>(aAsians.getStr()[i]), 16);
316 aMsg.append(RTL_CONSTASCII_STRINGPARAM(" should have been asian"));
317 CPPUNIT_ASSERT_MESSAGE(aMsg.getStr(),
318 nScript == i18n::ScriptType::ASIAN);
323 //A test to ensure that our thai word boundary detection is useful
324 //http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html
325 void TestBreakIterator::testThai()
327 lang::Locale aLocale;
328 aLocale.Language = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("th"));
329 aLocale.Country = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("TH"));
331 const sal_Unicode THAI1[] = { 0x0E01, 0x0E38, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
332 ::rtl::OUString aTest(THAI1, SAL_N_ELEMENTS(THAI1));
333 i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
334 i18n::WordType::DICTIONARY_WORD, true);
335 CPPUNIT_ASSERT_MESSAGE("Should skip full word",
336 aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
#if TODO
//Same whole-word expectation as the Thai test, but for a sample under the
//"nod"/"TH" locale. Compiled only when TODO is defined to a non-zero value.
void TestBreakIterator::testNorthernThai()
{
    const sal_Unicode aNorthernThaiUnits[] = { 0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
    const ::rtl::OUString aTest(aNorthernThaiUnits, SAL_N_ELEMENTS(aNorthernThaiUnits));

    lang::Locale aNorthernThai;
    aNorthernThai.Language = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("nod"));
    aNorthernThai.Country = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("TH"));

    const i18n::Boundary aBounds(
        m_xBreak->getWordBoundary(aTest, 0, aNorthernThai, i18n::WordType::DICTIONARY_WORD, true));

    //The reported word has to span the sample from its first to its last unit
    CPPUNIT_ASSERT_MESSAGE("Should skip full word",
        aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
}
#endif
#if (U_ICU_VERSION_MAJOR_NUM >= 5)
//Guards the usefulness of Khmer word boundary detection.
//https://bugs.freedesktop.org/show_bug.cgi?id=52020
//
//icu lacks the Khmer word boundary dictionaries in <= 4.0.0 but ships them in
//the current 49.x.y; which release introduced them is not known, hence the
//conservative version guard.
void TestBreakIterator::testKhmer()
{
    const sal_Unicode aKhmerUnits[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 };
    const ::rtl::OUString aTest(aKhmerUnits, SAL_N_ELEMENTS(aKhmerUnits));

    lang::Locale aKhmer;
    aKhmer.Language = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("km"));
    aKhmer.Country = ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("KH"));

    //The word found at offset 0 is expected to cover units [0, 3)
    i18n::Boundary aBounds(
        m_xBreak->getWordBoundary(aTest, 0, aKhmer, i18n::WordType::DICTIONARY_WORD, true));
    CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);

    //Continuing from the end of that word, the next one is expected to cover [3, 5)
    aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aKhmer,
        i18n::WordType::DICTIONARY_WORD, true);
    CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 5);
}
#endif
382 void TestBreakIterator::setUp()
384 BootstrapFixtureBase::setUp();
385 m_xBreak = uno::Reference< i18n::XBreakIterator >(m_xSFactory->createInstance(
386 "com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
389 void TestBreakIterator::tearDown()
391 BootstrapFixtureBase::tearDown();
392 m_xBreak.clear();
// Register the TestBreakIterator suite with cppunit's test factory registry so
// that the test runner picks it up.
395 CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator);
// Emit the cppunit test plug-in entry point for this library (on IOS the
// exported name is overridden via CPPUNIT_PLUGIN_EXPORTED_NAME near the top of
// the file).
397 CPPUNIT_PLUGIN_IMPLEMENT();
399 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */