Bump version to 4.3-4
[LibreOffice.git] / i18npool / source / breakiterator / breakiterator_unicode.cxx
blob5270b1db5bad61989eacd42f4d9ed5c62ab7c4d0
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <breakiterator_unicode.hxx>
21 #include <cppuhelper/supportsservice.hxx>
22 #include <localedata.hxx>
23 #include <i18nlangtag/languagetag.hxx>
24 #include <i18nlangtag/languagetagicu.hxx>
25 #include <unicode/uchar.h>
26 #include <unicode/locid.h>
27 #include <unicode/rbbi.h>
28 #include <unicode/udata.h>
29 #include <rtl/strbuf.hxx>
30 #include <rtl/ustring.hxx>
31 #include <string.h>
33 U_CDECL_BEGIN
34 extern const char OpenOffice_dat[];
35 U_CDECL_END
37 using namespace ::com::sun::star;
38 using namespace ::com::sun::star::lang;
40 namespace com { namespace sun { namespace star { namespace i18n {
43 BreakIterator_Unicode::BreakIterator_Unicode()
44 : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ) // implementation name
45 , wordRule( "word" )
46 , lineRule( "line" )
47 , icuBI( NULL )
48 , aBreakType(0)
52 BreakIterator_Unicode::~BreakIterator_Unicode()
54 delete character.aBreakIterator;
55 delete sentence.aBreakIterator;
56 delete line.aBreakIterator;
57 for (size_t i = 0; i < SAL_N_ELEMENTS(words); i++)
58 delete words[i].aBreakIterator;
62 Wrapper class to provide public access to the RuleBasedBreakIterator's
63 setbreakType method.
65 class OOoRuleBasedBreakIterator : public RuleBasedBreakIterator
67 public:
68 inline void publicSetBreakType(int32_t type)
70 setBreakType(type);
72 OOoRuleBasedBreakIterator(UDataMemory* image,
73 UErrorCode &status)
74 : RuleBasedBreakIterator(image, status)
75 { };
79 // loading ICU breakiterator on demand.
80 void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale,
81 sal_Int16 rBreakType, sal_Int16 nWordType, const sal_Char *rule, const OUString& rText) throw(uno::RuntimeException)
83 bool newBreak = false;
84 UErrorCode status = U_ZERO_ERROR;
85 sal_Int16 breakType = 0;
86 switch (rBreakType) {
87 case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
88 case LOAD_WORD_BREAKITERATOR:
89 assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT);
90 icuBI=&words[nWordType];
91 switch (nWordType) {
92 case WordType::ANY_WORD: break; // odd but previous behavior
93 case WordType::ANYWORD_IGNOREWHITESPACES:
94 breakType = 0; rule = wordRule = "edit_word"; break;
95 case WordType::DICTIONARY_WORD:
96 breakType = 1; rule = wordRule = "dict_word"; break;
97 default:
98 case WordType::WORD_COUNT:
99 breakType = 2; rule = wordRule = "count_word"; break;
101 break;
102 case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
103 case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
105 if (!icuBI->aBreakIterator ||
106 rLocale.Language != icuBI->maLocale.Language ||
107 rLocale.Country != icuBI->maLocale.Country ||
108 rLocale.Variant != icuBI->maLocale.Variant) {
109 if (icuBI->aBreakIterator) {
110 delete icuBI->aBreakIterator;
111 icuBI->aBreakIterator=NULL;
113 if (rule) {
114 uno::Sequence< OUString > breakRules = LocaleDataImpl().getBreakIteratorRules(rLocale);
116 status = U_ZERO_ERROR;
117 udata_setAppData("OpenOffice", OpenOffice_dat, &status);
118 if ( !U_SUCCESS(status) ) throw uno::RuntimeException();
120 OOoRuleBasedBreakIterator *rbi = NULL;
122 if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())
124 rbi = new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
125 OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
127 //use icu's breakiterator for Thai, Khmer, Tibetan and Dzongkha
128 else if (rLocale.Language != "th" && rLocale.Language != "lo" && rLocale.Language != "km" && rLocale.Language != "bo" && rLocale.Language != "dz")
130 status = U_ZERO_ERROR;
131 OStringBuffer aUDName(64);
132 aUDName.append(rule);
133 aUDName.append('_');
134 aUDName.append( OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US));
135 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
136 if( U_SUCCESS(status) )
137 rbi = new OOoRuleBasedBreakIterator( pUData, status);
138 if (!U_SUCCESS(status) ) {
139 status = U_ZERO_ERROR;
140 pUData = udata_open("OpenOffice", "brk", rule, &status);
141 if( U_SUCCESS(status) )
142 rbi = new OOoRuleBasedBreakIterator( pUData, status);
143 if (!U_SUCCESS(status) ) icuBI->aBreakIterator=NULL;
146 if (rbi) {
147 switch (rBreakType) {
148 case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
149 case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
150 case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
151 case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
153 icuBI->aBreakIterator = rbi;
157 if (!icuBI->aBreakIterator) {
158 icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale)));
160 status = U_ZERO_ERROR;
161 switch (rBreakType) {
162 case LOAD_CHARACTER_BREAKITERATOR:
163 icuBI->aBreakIterator = icu::BreakIterator::createCharacterInstance(icuLocale, status);
164 break;
165 case LOAD_WORD_BREAKITERATOR:
166 icuBI->aBreakIterator = icu::BreakIterator::createWordInstance(icuLocale, status);
167 break;
168 case LOAD_SENTENCE_BREAKITERATOR:
169 icuBI->aBreakIterator = icu::BreakIterator::createSentenceInstance(icuLocale, status);
170 break;
171 case LOAD_LINE_BREAKITERATOR:
172 icuBI->aBreakIterator = icu::BreakIterator::createLineInstance(icuLocale, status);
173 break;
175 if ( !U_SUCCESS(status) ) {
176 icuBI->aBreakIterator=NULL;
177 throw uno::RuntimeException();
180 if (icuBI->aBreakIterator) {
181 icuBI->maLocale=rLocale;
182 newBreak=true;
183 } else {
184 throw uno::RuntimeException();
188 if (newBreak || !icuBI->aICUText.equals(rText))
190 // UChar != sal_Unicode in MinGW
191 const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());
193 icuBI->ut = utext_openUChars(icuBI->ut, pText, rText.getLength(), &status);
195 if (!U_SUCCESS(status))
196 throw uno::RuntimeException();
198 icuBI->aBreakIterator->setText(icuBI->ut, status);
200 if (!U_SUCCESS(status))
201 throw uno::RuntimeException();
203 icuBI->aICUText = rText;
207 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
208 sal_Int32 nStartPos, const lang::Locale &rLocale,
209 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
210 throw(uno::RuntimeException, std::exception)
212 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
213 loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
214 for (nDone = 0; nDone < nCount; nDone++) {
215 nStartPos = character.aBreakIterator->following(nStartPos);
216 if (nStartPos == BreakIterator::DONE)
217 return Text.getLength();
219 } else { // for CHARACTER mode
220 for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
221 Text.iterateCodePoints(&nStartPos, 1);
223 return nStartPos;
226 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
227 sal_Int32 nStartPos, const lang::Locale& rLocale,
228 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
229 throw(uno::RuntimeException, std::exception)
231 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
232 loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
233 for (nDone = 0; nDone < nCount; nDone++) {
234 nStartPos = character.aBreakIterator->preceding(nStartPos);
235 if (nStartPos == BreakIterator::DONE)
236 return 0;
238 } else { // for BS to delete one char and CHARACTER mode.
239 for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
240 Text.iterateCodePoints(&nStartPos, -1);
242 return nStartPos;
246 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
247 const lang::Locale& rLocale, sal_Int16 rWordType ) throw(uno::RuntimeException, std::exception)
249 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
251 result.startPos = icuBI->aBreakIterator->following(nStartPos);
252 if( result.startPos >= Text.getLength() || result.startPos == BreakIterator::DONE )
253 result.endPos = result.startPos;
254 else {
255 if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
256 rWordType == WordType::DICTIONARY_WORD ) &&
257 u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
258 result.startPos = icuBI->aBreakIterator->following(result.startPos);
260 result.endPos = icuBI->aBreakIterator->following(result.startPos);
261 if(result.endPos == BreakIterator::DONE)
262 result.endPos = result.startPos;
264 return result;
268 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
269 const lang::Locale& rLocale, sal_Int16 rWordType) throw(uno::RuntimeException, std::exception)
271 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
273 result.startPos = icuBI->aBreakIterator->preceding(nStartPos);
274 if( result.startPos < 0 || result.startPos == BreakIterator::DONE)
275 result.endPos = result.startPos;
276 else {
277 if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
278 rWordType == WordType::DICTIONARY_WORD) &&
279 u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
280 result.startPos = icuBI->aBreakIterator->preceding(result.startPos);
282 result.endPos = icuBI->aBreakIterator->following(result.startPos);
283 if(result.endPos == BreakIterator::DONE)
284 result.endPos = result.startPos;
286 return result;
290 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
291 sal_Int16 rWordType, sal_Bool bDirection ) throw(uno::RuntimeException, std::exception)
293 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
294 sal_Int32 len = Text.getLength();
296 if(icuBI->aBreakIterator->isBoundary(nPos)) {
297 result.startPos = result.endPos = nPos;
298 if((bDirection || nPos == 0) && nPos < len) //forward
299 result.endPos = icuBI->aBreakIterator->following(nPos);
300 else
301 result.startPos = icuBI->aBreakIterator->preceding(nPos);
302 } else {
303 if(nPos <= 0) {
304 result.startPos = 0;
305 result.endPos = len ? icuBI->aBreakIterator->following((sal_Int32)0) : 0;
306 } else if(nPos >= len) {
307 result.startPos = icuBI->aBreakIterator->preceding(len);
308 result.endPos = len;
309 } else {
310 result.startPos = icuBI->aBreakIterator->preceding(nPos);
311 result.endPos = icuBI->aBreakIterator->following(nPos);
314 if (result.startPos == BreakIterator::DONE)
315 result.startPos = result.endPos;
316 else if (result.endPos == BreakIterator::DONE)
317 result.endPos = result.startPos;
319 return result;
323 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
324 const lang::Locale &rLocale ) throw(uno::RuntimeException, std::exception)
326 loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
328 sal_Int32 len = Text.getLength();
329 if (len > 0 && nStartPos == len)
330 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
331 if (!sentence.aBreakIterator->isBoundary(nStartPos))
332 nStartPos = sentence.aBreakIterator->preceding(nStartPos);
334 // skip preceding space.
335 sal_uInt32 ch = Text.iterateCodePoints(&nStartPos, 1);
336 while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos, 1);
337 Text.iterateCodePoints(&nStartPos, -1);
339 return nStartPos;
342 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
343 const lang::Locale &rLocale ) throw(uno::RuntimeException, std::exception)
345 loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
347 sal_Int32 len = Text.getLength();
348 if (len > 0 && nStartPos == len)
349 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
350 nStartPos = sentence.aBreakIterator->following(nStartPos);
352 sal_Int32 nPos=nStartPos;
353 while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
355 return nStartPos;
358 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
359 const OUString& Text, sal_Int32 nStartPos,
360 const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
361 const LineBreakHyphenationOptions& hOptions,
362 const LineBreakUserOptions& /*rOptions*/ ) throw(uno::RuntimeException, std::exception)
364 LineBreakResults lbr;
366 if (nStartPos >= Text.getLength()) {
367 lbr.breakIndex = Text.getLength();
368 lbr.breakType = BreakType::WORDBOUNDARY;
369 return lbr;
372 loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
374 bool GlueSpace=true;
375 while (GlueSpace) {
376 if (line.aBreakIterator->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
377 lbr.breakIndex = nStartPos;
378 lbr.breakType = BreakType::WORDBOUNDARY;
379 } else if (hOptions.rHyphenator.is()) { //Hyphenation break
380 sal_Int32 boundary_with_punctuation = (line.aBreakIterator->next() != BreakIterator::DONE) ? line.aBreakIterator->current() : 0;
381 line.aBreakIterator->preceding(nStartPos + 1); // reset to check correct hyphenation of "word-word"
383 sal_Int32 nStartPosWordEnd = nStartPos;
384 while (line.aBreakIterator->current() < nStartPosWordEnd && u_ispunct((sal_uInt32)Text[nStartPosWordEnd])) // starting punctuation
385 nStartPosWordEnd --;
387 Boundary wBoundary = getWordBoundary( Text, nStartPosWordEnd, rLocale,
388 WordType::DICTIONARY_WORD, false);
390 nStartPosWordEnd = wBoundary.endPos;
391 while (nStartPosWordEnd < Text.getLength() && (u_ispunct((sal_uInt32)Text[nStartPosWordEnd]))) // ending punctuation
392 nStartPosWordEnd ++;
393 nStartPosWordEnd = nStartPosWordEnd - wBoundary.endPos;
394 if (hOptions.hyphenIndex - wBoundary.startPos < nStartPosWordEnd) nStartPosWordEnd = hOptions.hyphenIndex - wBoundary.startPos;
395 #define SPACE 0x0020
396 while (boundary_with_punctuation > wBoundary.endPos && Text[--boundary_with_punctuation] == SPACE);
397 if (boundary_with_punctuation != 0) boundary_with_punctuation += 1 - wBoundary.endPos;
398 uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord;
399 aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
400 wBoundary.endPos - wBoundary.startPos), rLocale,
401 (sal_Int16) (hOptions.hyphenIndex - wBoundary.startPos - ((hOptions.hyphenIndex == wBoundary.endPos)? nStartPosWordEnd : 0)), hOptions.aHyphenationOptions);
402 if (aHyphenatedWord.is()) {
403 lbr.rHyphenatedWord = aHyphenatedWord;
404 if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
405 lbr.breakIndex = -1;
406 else
407 lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
408 lbr.breakType = BreakType::HYPHENATION;
410 // check not optimal hyphenation of "word-word" (word with hyphens)
411 if (lbr.breakIndex > -1 && wBoundary.startPos + aHyphenatedWord->getHyphenationPos() < line.aBreakIterator->current()) {
412 lbr.breakIndex = line.aBreakIterator->current();
413 lbr.breakType = BreakType::WORDBOUNDARY;
416 } else {
417 lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
418 lbr.breakType = BreakType::WORDBOUNDARY;;
420 } else { //word boundary break
421 lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
422 lbr.breakType = BreakType::WORDBOUNDARY;
425 #define WJ 0x2060 // Word Joiner
426 GlueSpace=false;
427 if (lbr.breakType == BreakType::WORDBOUNDARY) {
428 nStartPos = lbr.breakIndex;
429 if (Text[nStartPos--] == WJ)
430 GlueSpace=true;
431 while (nStartPos >= 0 &&
432 (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
433 if (Text[nStartPos--] == WJ)
434 GlueSpace=true;
436 if (GlueSpace && nStartPos < 0) {
437 lbr.breakIndex = 0;
438 break;
443 return lbr;
446 OUString SAL_CALL
447 BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException, std::exception )
449 return OUString::createFromAscii(cBreakIterator);
452 sal_Bool SAL_CALL
453 BreakIterator_Unicode::supportsService(const OUString& rServiceName) throw( uno::RuntimeException, std::exception )
455 return cppu::supportsService(this, rServiceName);
458 uno::Sequence< OUString > SAL_CALL
459 BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException, std::exception )
461 uno::Sequence< OUString > aRet(1);
462 aRet[0] = OUString::createFromAscii(cBreakIterator);
463 return aRet;
466 } } } }
468 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface * SAL_CALL
469 com_sun_star_i18n_BreakIterator_Unicode_get_implementation(
470 css::uno::XComponentContext *,
471 css::uno::Sequence<css::uno::Any> const &)
473 return cppu::acquire(new css::i18n::BreakIterator_Unicode());
476 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */