bump product version to 4.1.6.2
[LibreOffice.git] / i18npool / source / breakiterator / breakiterator_unicode.cxx
blob549abd1c12893ddb85071a36c82fb39e3346dc4b
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
20 #include <breakiterator_unicode.hxx>
21 #include <localedata.hxx>
22 #include <i18nlangtag/languagetag.hxx>
23 #include <i18nlangtag/languagetagicu.hxx>
24 #include <unicode/uchar.h>
25 #include <unicode/locid.h>
26 #include <unicode/rbbi.h>
27 #include <unicode/udata.h>
28 #include <rtl/strbuf.hxx>
29 #include <rtl/ustring.hxx>
30 #include <string.h>
32 U_CDECL_BEGIN
33 extern const char OpenOffice_dat[];
34 U_CDECL_END
36 using namespace ::com::sun::star;
37 using namespace ::com::sun::star::lang;
38 using namespace ::rtl;
40 namespace com { namespace sun { namespace star { namespace i18n {
43 BreakIterator_Unicode::BreakIterator_Unicode() :
44 cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ), // implementation name
45 wordRule( "word" ),
46 lineRule( "line" ),
47 icuBI( NULL )
51 BreakIterator_Unicode::~BreakIterator_Unicode()
53 delete character.aBreakIterator;
54 delete sentence.aBreakIterator;
55 delete line.aBreakIterator;
56 for (size_t i = 0; i < SAL_N_ELEMENTS(words); i++)
57 delete words[i].aBreakIterator;
61 Wrapper class to provide public access to the RuleBasedBreakIterator's
62 setbreakType method.
64 class OOoRuleBasedBreakIterator : public RuleBasedBreakIterator {
65 public:
66 inline void publicSetBreakType(int32_t type) {
67 setBreakType(type);
69 OOoRuleBasedBreakIterator(UDataMemory* image,
70 UErrorCode &status) :
71 RuleBasedBreakIterator(image, status) { };
75 // loading ICU breakiterator on demand.
76 void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale,
77 sal_Int16 rBreakType, sal_Int16 nWordType, const sal_Char *rule, const OUString& rText) throw(uno::RuntimeException)
79 sal_Bool newBreak = sal_False;
80 UErrorCode status = U_ZERO_ERROR;
81 sal_Int16 breakType = 0;
82 switch (rBreakType) {
83 case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
84 case LOAD_WORD_BREAKITERATOR:
85 assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT);
86 icuBI=&words[nWordType];
87 switch (nWordType) {
88 case WordType::ANY_WORD: break; // odd but previous behavior
89 case WordType::ANYWORD_IGNOREWHITESPACES:
90 breakType = 0; rule = wordRule = "edit_word"; break;
91 case WordType::DICTIONARY_WORD:
92 breakType = 1; rule = wordRule = "dict_word"; break;
93 default:
94 case WordType::WORD_COUNT:
95 breakType = 2; rule = wordRule = "count_word"; break;
97 break;
98 case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
99 case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
101 if (!icuBI->aBreakIterator ||
102 rLocale.Language != icuBI->maLocale.Language ||
103 rLocale.Country != icuBI->maLocale.Country ||
104 rLocale.Variant != icuBI->maLocale.Variant) {
105 if (icuBI->aBreakIterator) {
106 delete icuBI->aBreakIterator;
107 icuBI->aBreakIterator=NULL;
109 if (rule) {
110 uno::Sequence< OUString > breakRules = LocaleData().getBreakIteratorRules(rLocale);
112 status = U_ZERO_ERROR;
113 udata_setAppData("OpenOffice", OpenOffice_dat, &status);
114 if ( !U_SUCCESS(status) ) throw uno::RuntimeException();
116 OOoRuleBasedBreakIterator *rbi = NULL;
118 if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())
120 rbi = new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
121 OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
123 //use icu's breakiterator for Thai, Khmer, Tibetan and Dzongkha
124 else if (rLocale.Language != "th" && rLocale.Language != "km" && rLocale.Language != "bo" && rLocale.Language != "dz")
126 status = U_ZERO_ERROR;
127 OStringBuffer aUDName(64);
128 aUDName.append(rule);
129 aUDName.append('_');
130 aUDName.append( OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US));
131 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
132 if( U_SUCCESS(status) )
133 rbi = new OOoRuleBasedBreakIterator( pUData, status);
134 if (!U_SUCCESS(status) ) {
135 status = U_ZERO_ERROR;
136 pUData = udata_open("OpenOffice", "brk", rule, &status);
137 if( U_SUCCESS(status) )
138 rbi = new OOoRuleBasedBreakIterator( pUData, status);
139 if (!U_SUCCESS(status) ) icuBI->aBreakIterator=NULL;
142 if (rbi) {
143 switch (rBreakType) {
144 case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
145 case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
146 case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
147 case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
149 icuBI->aBreakIterator = rbi;
153 if (!icuBI->aBreakIterator) {
154 icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale)));
156 status = U_ZERO_ERROR;
157 switch (rBreakType) {
158 case LOAD_CHARACTER_BREAKITERATOR:
159 icuBI->aBreakIterator = icu::BreakIterator::createCharacterInstance(icuLocale, status);
160 break;
161 case LOAD_WORD_BREAKITERATOR:
162 icuBI->aBreakIterator = icu::BreakIterator::createWordInstance(icuLocale, status);
163 break;
164 case LOAD_SENTENCE_BREAKITERATOR:
165 icuBI->aBreakIterator = icu::BreakIterator::createSentenceInstance(icuLocale, status);
166 break;
167 case LOAD_LINE_BREAKITERATOR:
168 icuBI->aBreakIterator = icu::BreakIterator::createLineInstance(icuLocale, status);
169 break;
171 if ( !U_SUCCESS(status) ) {
172 icuBI->aBreakIterator=NULL;
173 throw uno::RuntimeException();
176 if (icuBI->aBreakIterator) {
177 icuBI->maLocale=rLocale;
178 newBreak=sal_True;
179 } else {
180 throw uno::RuntimeException();
184 if (newBreak || !icuBI->aICUText.equals(rText))
186 // UChar != sal_Unicode in MinGW
187 const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());
189 icuBI->ut = utext_openUChars(icuBI->ut, pText, rText.getLength(), &status);
191 if (!U_SUCCESS(status))
192 throw uno::RuntimeException();
194 icuBI->aBreakIterator->setText(icuBI->ut, status);
196 if (!U_SUCCESS(status))
197 throw uno::RuntimeException();
199 icuBI->aICUText = rText;
203 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
204 sal_Int32 nStartPos, const lang::Locale &rLocale,
205 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
206 throw(uno::RuntimeException)
208 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
209 loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
210 for (nDone = 0; nDone < nCount; nDone++) {
211 nStartPos = character.aBreakIterator->following(nStartPos);
212 if (nStartPos == BreakIterator::DONE)
213 return Text.getLength();
215 } else { // for CHARACTER mode
216 for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
217 Text.iterateCodePoints(&nStartPos, 1);
219 return nStartPos;
222 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
223 sal_Int32 nStartPos, const lang::Locale& rLocale,
224 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
225 throw(uno::RuntimeException)
227 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
228 loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
229 for (nDone = 0; nDone < nCount; nDone++) {
230 nStartPos = character.aBreakIterator->preceding(nStartPos);
231 if (nStartPos == BreakIterator::DONE)
232 return 0;
234 } else { // for BS to delete one char and CHARACTER mode.
235 for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
236 Text.iterateCodePoints(&nStartPos, -1);
238 return nStartPos;
242 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
243 const lang::Locale& rLocale, sal_Int16 rWordType ) throw(uno::RuntimeException)
245 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
247 result.startPos = icuBI->aBreakIterator->following(nStartPos);
248 if( result.startPos >= Text.getLength() || result.startPos == BreakIterator::DONE )
249 result.endPos = result.startPos;
250 else {
251 if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
252 rWordType == WordType::DICTIONARY_WORD ) &&
253 u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
254 result.startPos = icuBI->aBreakIterator->following(result.startPos);
256 result.endPos = icuBI->aBreakIterator->following(result.startPos);
257 if(result.endPos == BreakIterator::DONE)
258 result.endPos = result.startPos;
260 return result;
264 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
265 const lang::Locale& rLocale, sal_Int16 rWordType) throw(uno::RuntimeException)
267 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
269 result.startPos = icuBI->aBreakIterator->preceding(nStartPos);
270 if( result.startPos < 0 || result.startPos == BreakIterator::DONE)
271 result.endPos = result.startPos;
272 else {
273 if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
274 rWordType == WordType::DICTIONARY_WORD) &&
275 u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
276 result.startPos = icuBI->aBreakIterator->preceding(result.startPos);
278 result.endPos = icuBI->aBreakIterator->following(result.startPos);
279 if(result.endPos == BreakIterator::DONE)
280 result.endPos = result.startPos;
282 return result;
286 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
287 sal_Int16 rWordType, sal_Bool bDirection ) throw(uno::RuntimeException)
289 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
290 sal_Int32 len = Text.getLength();
292 if(icuBI->aBreakIterator->isBoundary(nPos)) {
293 result.startPos = result.endPos = nPos;
294 if((bDirection || nPos == 0) && nPos < len) //forward
295 result.endPos = icuBI->aBreakIterator->following(nPos);
296 else
297 result.startPos = icuBI->aBreakIterator->preceding(nPos);
298 } else {
299 if(nPos <= 0) {
300 result.startPos = 0;
301 result.endPos = len ? icuBI->aBreakIterator->following((sal_Int32)0) : 0;
302 } else if(nPos >= len) {
303 result.startPos = icuBI->aBreakIterator->preceding(len);
304 result.endPos = len;
305 } else {
306 result.startPos = icuBI->aBreakIterator->preceding(nPos);
307 result.endPos = icuBI->aBreakIterator->following(nPos);
310 if (result.startPos == BreakIterator::DONE)
311 result.startPos = result.endPos;
312 else if (result.endPos == BreakIterator::DONE)
313 result.endPos = result.startPos;
315 return result;
319 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
320 const lang::Locale &rLocale ) throw(uno::RuntimeException)
322 loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
324 sal_Int32 len = Text.getLength();
325 if (len > 0 && nStartPos == len)
326 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
327 if (!sentence.aBreakIterator->isBoundary(nStartPos))
328 nStartPos = sentence.aBreakIterator->preceding(nStartPos);
330 // skip preceding space.
331 sal_uInt32 ch = Text.iterateCodePoints(&nStartPos, 1);
332 while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos, 1);
333 Text.iterateCodePoints(&nStartPos, -1);
335 return nStartPos;
338 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
339 const lang::Locale &rLocale ) throw(uno::RuntimeException)
341 loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
343 sal_Int32 len = Text.getLength();
344 if (len > 0 && nStartPos == len)
345 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
346 nStartPos = sentence.aBreakIterator->following(nStartPos);
348 sal_Int32 nPos=nStartPos;
349 while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
351 return nStartPos;
354 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
355 const OUString& Text, sal_Int32 nStartPos,
356 const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
357 const LineBreakHyphenationOptions& hOptions,
358 const LineBreakUserOptions& /*rOptions*/ ) throw(uno::RuntimeException)
360 LineBreakResults lbr;
362 if (nStartPos >= Text.getLength()) {
363 lbr.breakIndex = Text.getLength();
364 lbr.breakType = BreakType::WORDBOUNDARY;
365 return lbr;
368 loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
370 sal_Bool GlueSpace=sal_True;
371 while (GlueSpace) {
372 if (line.aBreakIterator->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
373 lbr.breakIndex = nStartPos;
374 lbr.breakType = BreakType::WORDBOUNDARY;
375 } else if (hOptions.rHyphenator.is()) { //Hyphenation break
376 Boundary wBoundary = getWordBoundary( Text, nStartPos, rLocale,
377 WordType::DICTIONARY_WORD, false);
378 uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord;
379 aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
380 wBoundary.endPos - wBoundary.startPos), rLocale,
381 (sal_Int16) (hOptions.hyphenIndex - wBoundary.startPos), hOptions.aHyphenationOptions);
382 if (aHyphenatedWord.is()) {
383 lbr.rHyphenatedWord = aHyphenatedWord;
384 if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
385 lbr.breakIndex = -1;
386 else
387 lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
388 lbr.breakType = BreakType::HYPHENATION;
389 } else {
390 lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
391 lbr.breakType = BreakType::WORDBOUNDARY;;
393 } else { //word boundary break
394 lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
395 lbr.breakType = BreakType::WORDBOUNDARY;
398 #define WJ 0x2060 // Word Joiner
399 GlueSpace=sal_False;
400 if (lbr.breakType == BreakType::WORDBOUNDARY) {
401 nStartPos = lbr.breakIndex;
402 if (Text[nStartPos--] == WJ)
403 GlueSpace=sal_True;
404 while (nStartPos >= 0 &&
405 (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
406 if (Text[nStartPos--] == WJ)
407 GlueSpace=sal_True;
409 if (GlueSpace && nStartPos < 0) {
410 lbr.breakIndex = 0;
411 break;
416 return lbr;
421 OUString SAL_CALL
422 BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException )
424 return OUString::createFromAscii(cBreakIterator);
427 sal_Bool SAL_CALL
428 BreakIterator_Unicode::supportsService(const OUString& rServiceName) throw( uno::RuntimeException )
430 return !rServiceName.compareToAscii(cBreakIterator);
433 uno::Sequence< OUString > SAL_CALL
434 BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException )
436 uno::Sequence< OUString > aRet(1);
437 aRet[0] = OUString::createFromAscii(cBreakIterator);
438 return aRet;
441 } } } }
443 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */