1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <breakiterator_unicode.hxx>
21 #include <cppuhelper/supportsservice.hxx>
22 #include <localedata.hxx>
23 #include <i18nlangtag/languagetag.hxx>
24 #include <i18nlangtag/languagetagicu.hxx>
25 #include <unicode/uchar.h>
26 #include <unicode/locid.h>
27 #include <unicode/rbbi.h>
28 #include <unicode/udata.h>
29 #include <rtl/strbuf.hxx>
30 #include <rtl/ustring.hxx>
// Declared extern here; no definition is visible in this excerpt. It is handed
// to udata_setAppData() under the package name "OpenOffice" in
// loadICUBreakIterator() below.
34 extern const char OpenOffice_dat
[];
// Bring the UNO namespaces used throughout this file into scope.
37 using namespace ::com::sun::star
;
38 using namespace ::com::sun::star::lang
;
// The definitions that follow are placed in com::sun::star::i18n.
// NOTE(review): the matching closing braces are not visible in this excerpt.
40 namespace com
{ namespace sun
{ namespace star
{ namespace i18n
{
// Constructor. Stores the implementation name string in cBreakIterator, which
// getImplementationName() and getSupportedServiceNames() later return.
// NOTE(review): any further member initializers and the constructor body are
// not visible in this excerpt.
43 BreakIterator_Unicode::BreakIterator_Unicode()
44 : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ) // implementation name
// Destructor. Deletes the break iterator held by each of the character,
// sentence and line slots, then the one held by every entry of words[].
52 BreakIterator_Unicode::~BreakIterator_Unicode()
54 delete character
.aBreakIterator
;
55 delete sentence
.aBreakIterator
;
56 delete line
.aBreakIterator
;
// Walk all entries of words[]; SAL_N_ELEMENTS(words) supplies the loop bound.
57 for (size_t i
= 0; i
< SAL_N_ELEMENTS(words
); i
++)
58 delete words
[i
].aBreakIterator
;
62 // Wrapper class to provide public access to the RuleBasedBreakIterator's break-type setter (see publicSetBreakType).
// Subclass of RuleBasedBreakIterator that adds publicSetBreakType() and a
// constructor forwarding a UDataMemory image and a status value to the base
// class constructor.
// NOTE(review): the body of publicSetBreakType() is not visible in this
// excerpt; presumably it forwards `type` to the base class -- confirm.
65 class OOoRuleBasedBreakIterator
: public RuleBasedBreakIterator
68 inline void publicSetBreakType(int32_t type
)
// Builds the iterator from a UDataMemory image; `image` and `status` are
// passed straight through to the RuleBasedBreakIterator constructor.
72 OOoRuleBasedBreakIterator(UDataMemory
* image
,
74 : RuleBasedBreakIterator(image
, status
)
79 // loading ICU breakiterator on demand.
//
// Selects the cache slot (icuBI) that matches rBreakType / nWordType,
// (re)creates its break iterator when the slot is empty or was built for a
// different locale, and attaches rText to the iterator.
//
//   rLocale     locale the iterator is needed for
//   rBreakType  one of the LOAD_*_BREAKITERATOR values handled below
//   nWordType   WordType value; used for LOAD_WORD_BREAKITERATOR
//   rule        name of the "brk" data item opened through udata_open()
//   rText       text the iterator is to operate on
//
// Throws uno::RuntimeException when ICU reports an error status.
// NOTE(review): several lines of this function (braces, switch headers,
// else branches) are not visible in this excerpt.
80 void SAL_CALL
BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale
& rLocale
,
81 sal_Int16 rBreakType
, sal_Int16 nWordType
, const sal_Char
*rule
, const OUString
& rText
) throw(uno::RuntimeException
)
// NOTE(review): newBreak is initialised to false and no later assignment to
// it is visible in this excerpt; it is only read in the text check at the end.
83 bool newBreak
= false;
84 UErrorCode status
= U_ZERO_ERROR
;
// breakType is used further down as the index into the locale's break rules.
85 sal_Int16 breakType
= 0;
// Pick the cache slot and the break-rule index for the requested iterator kind.
87 case LOAD_CHARACTER_BREAKITERATOR
: icuBI
=&character
; breakType
= 3; break;
88 case LOAD_WORD_BREAKITERATOR
:
// NOTE(review): nWordType indexes words[] directly and the assert admits
// WordType::WORD_COUNT itself; confirm words[] has an entry for that value.
89 assert (nWordType
>= 0 && nWordType
<= WordType::WORD_COUNT
);
90 icuBI
=&words
[nWordType
];
// Each word type selects its own rule index and rule name; the chosen name is
// stored in both `rule` and the member wordRule.
92 case WordType::ANY_WORD
: break; // odd but previous behavior
93 case WordType::ANYWORD_IGNOREWHITESPACES
:
94 breakType
= 0; rule
= wordRule
= "edit_word"; break;
95 case WordType::DICTIONARY_WORD
:
96 breakType
= 1; rule
= wordRule
= "dict_word"; break;
98 case WordType::WORD_COUNT
:
99 breakType
= 2; rule
= wordRule
= "count_word"; break;
102 case LOAD_SENTENCE_BREAKITERATOR
: icuBI
=&sentence
; breakType
= 5; break;
103 case LOAD_LINE_BREAKITERATOR
: icuBI
=&line
; breakType
= 4; break;
// Rebuild when the slot has no iterator yet or was created for another
// locale (language, country or variant differs).
105 if (!icuBI
->aBreakIterator
||
106 rLocale
.Language
!= icuBI
->maLocale
.Language
||
107 rLocale
.Country
!= icuBI
->maLocale
.Country
||
108 rLocale
.Variant
!= icuBI
->maLocale
.Variant
) {
// Drop the stale iterator before building a new one.
109 if (icuBI
->aBreakIterator
) {
110 delete icuBI
->aBreakIterator
;
111 icuBI
->aBreakIterator
=NULL
;
// Locale-specific rule names from the locale data, indexed by breakType.
114 uno::Sequence
< OUString
> breakRules
= LocaleDataImpl().getBreakIteratorRules(rLocale
);
// Register the application data under the package name "OpenOffice";
// failure is reported as a RuntimeException.
116 status
= U_ZERO_ERROR
;
117 udata_setAppData("OpenOffice", OpenOffice_dat
, &status
);
118 if ( !U_SUCCESS(status
) ) throw uno::RuntimeException();
120 OOoRuleBasedBreakIterator
*rbi
= NULL
;
// Prefer the data item named by the locale data when it provides a non-empty
// entry for this break type.
122 if (breakRules
.getLength() > breakType
&& !breakRules
[breakType
].isEmpty())
124 rbi
= new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
125 OUStringToOString(breakRules
[breakType
], RTL_TEXTENCODING_ASCII_US
).getStr(), &status
), status
);
127 //use icu's breakiterator for Thai, Lao, Khmer, Tibetan and Dzongkha
128 else if (rLocale
.Language
!= "th" && rLocale
.Language
!= "lo" && rLocale
.Language
!= "km" && rLocale
.Language
!= "bo" && rLocale
.Language
!= "dz")
130 status
= U_ZERO_ERROR
;
// First try a language-specific data item whose name is built from `rule`
// and the locale's language code.
131 OStringBuffer
aUDName(64);
132 aUDName
.append(rule
);
134 aUDName
.append( OUStringToOString(rLocale
.Language
, RTL_TEXTENCODING_ASCII_US
));
135 UDataMemory
* pUData
= udata_open("OpenOffice", "brk", aUDName
.getStr(), &status
);
136 if( U_SUCCESS(status
) )
137 rbi
= new OOoRuleBasedBreakIterator( pUData
, status
);
// On failure, fall back to the data item named by `rule` alone.
138 if (!U_SUCCESS(status
) ) {
139 status
= U_ZERO_ERROR
;
140 pUData
= udata_open("OpenOffice", "brk", rule
, &status
);
141 if( U_SUCCESS(status
) )
142 rbi
= new OOoRuleBasedBreakIterator( pUData
, status
);
143 if (!U_SUCCESS(status
) ) icuBI
->aBreakIterator
=NULL
;
// Record the ICU break type on the newly created iterator, then store it in
// the cache slot.
147 switch (rBreakType
) {
148 case LOAD_CHARACTER_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_CHARACTER
); break;
149 case LOAD_WORD_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_WORD
); break;
150 case LOAD_SENTENCE_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_SENTENCE
); break;
151 case LOAD_LINE_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_LINE
); break;
153 icuBI
->aBreakIterator
= rbi
;
// The slot is still empty: create ICU's own iterator for the locale instead.
157 if (!icuBI
->aBreakIterator
) {
158 icu::Locale
icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale
)));
160 status
= U_ZERO_ERROR
;
161 switch (rBreakType
) {
162 case LOAD_CHARACTER_BREAKITERATOR
:
163 icuBI
->aBreakIterator
= icu::BreakIterator::createCharacterInstance(icuLocale
, status
);
165 case LOAD_WORD_BREAKITERATOR
:
166 icuBI
->aBreakIterator
= icu::BreakIterator::createWordInstance(icuLocale
, status
);
168 case LOAD_SENTENCE_BREAKITERATOR
:
169 icuBI
->aBreakIterator
= icu::BreakIterator::createSentenceInstance(icuLocale
, status
);
171 case LOAD_LINE_BREAKITERATOR
:
172 icuBI
->aBreakIterator
= icu::BreakIterator::createLineInstance(icuLocale
, status
);
// Creation failed: clear the slot and report the error.
175 if ( !U_SUCCESS(status
) ) {
176 icuBI
->aBreakIterator
=NULL
;
177 throw uno::RuntimeException();
// Remember which locale the slot was built for so the check above can reuse
// the iterator next time.
180 if (icuBI
->aBreakIterator
) {
181 icuBI
->maLocale
=rLocale
;
184 throw uno::RuntimeException();
// (Re)attach the text when newBreak is set or the text differs from the one
// cached in the slot.
188 if (newBreak
|| !icuBI
->aICUText
.equals(rText
))
190 // UChar != sal_Unicode in MinGW
191 const UChar
*pText
= reinterpret_cast<const UChar
*>(rText
.getStr());
// The slot's existing UText is passed in and replaced by the returned one.
193 icuBI
->ut
= utext_openUChars(icuBI
->ut
, pText
, rText
.getLength(), &status
);
195 if (!U_SUCCESS(status
))
196 throw uno::RuntimeException();
198 icuBI
->aBreakIterator
->setText(icuBI
->ut
, status
);
200 if (!U_SUCCESS(status
))
201 throw uno::RuntimeException();
// Cache the text so an identical follow-up call can skip the setText() above.
203 icuBI
->aICUText
= rText
;
// Moves forward from nStartPos in Text by up to nCount steps.
// In SKIPCELL mode each step is decided by the character break iterator for
// rLocale; otherwise each step advances one code point. nDone receives the
// number of steps taken.
// NOTE(review): the final return statement is not visible in this excerpt.
207 sal_Int32 SAL_CALL
BreakIterator_Unicode::nextCharacters( const OUString
& Text
,
208 sal_Int32 nStartPos
, const lang::Locale
&rLocale
,
209 sal_Int16 nCharacterIteratorMode
, sal_Int32 nCount
, sal_Int32
& nDone
)
210 throw(uno::RuntimeException
, std::exception
)
212 if (nCharacterIteratorMode
== CharacterIteratorMode::SKIPCELL
) { // for CELL mode
213 loadICUBreakIterator(rLocale
, LOAD_CHARACTER_BREAKITERATOR
, 0, "char", Text
);
214 for (nDone
= 0; nDone
< nCount
; nDone
++) {
215 nStartPos
= character
.aBreakIterator
->following(nStartPos
);
// No further boundary: report the end of the text.
216 if (nStartPos
== BreakIterator::DONE
)
217 return Text
.getLength();
219 } else { // for CHARACTER mode
// Stop early when the end of the text is reached.
220 for (nDone
= 0; nDone
< nCount
&& nStartPos
< Text
.getLength(); nDone
++)
221 Text
.iterateCodePoints(&nStartPos
, 1);
// Moves backward from nStartPos in Text by up to nCount steps.
// In SKIPCELL mode each step is decided by the character break iterator for
// rLocale; otherwise each step goes back one code point. nDone receives the
// number of steps taken.
// NOTE(review): the statement executed when the iterator reports DONE and
// the final return statement are not visible in this excerpt.
226 sal_Int32 SAL_CALL
BreakIterator_Unicode::previousCharacters( const OUString
& Text
,
227 sal_Int32 nStartPos
, const lang::Locale
& rLocale
,
228 sal_Int16 nCharacterIteratorMode
, sal_Int32 nCount
, sal_Int32
& nDone
)
229 throw(uno::RuntimeException
, std::exception
)
231 if (nCharacterIteratorMode
== CharacterIteratorMode::SKIPCELL
) { // for CELL mode
232 loadICUBreakIterator(rLocale
, LOAD_CHARACTER_BREAKITERATOR
, 0, "char", Text
);
233 for (nDone
= 0; nDone
< nCount
; nDone
++) {
234 nStartPos
= character
.aBreakIterator
->preceding(nStartPos
);
235 if (nStartPos
== BreakIterator::DONE
)
238 } else { // for BS to delete one char and CHARACTER mode.
// Stop early when the start of the text is reached.
239 for (nDone
= 0; nDone
< nCount
&& nStartPos
> 0; nDone
++)
240 Text
.iterateCodePoints(&nStartPos
, -1);
// Computes the boundary of the word that follows nStartPos in Text, using
// the word break iterator for rLocale and rWordType.
// NOTE(review): the declaration of `result`, the branch structure between
// the statements below and the return statement are not visible in this
// excerpt.
246 Boundary SAL_CALL
BreakIterator_Unicode::nextWord( const OUString
& Text
, sal_Int32 nStartPos
,
247 const lang::Locale
& rLocale
, sal_Int16 rWordType
) throw(uno::RuntimeException
, std::exception
)
249 loadICUBreakIterator(rLocale
, LOAD_WORD_BREAKITERATOR
, rWordType
, NULL
, Text
);
// Start at the first boundary after nStartPos.
251 result
.startPos
= icuBI
->aBreakIterator
->following(nStartPos
);
// At or beyond the end of the text (or no boundary): empty range.
252 if( result
.startPos
>= Text
.getLength() || result
.startPos
== BreakIterator::DONE
)
253 result
.endPos
= result
.startPos
;
// For these two word types a boundary that lands on whitespace is skipped by
// advancing to the next boundary.
255 if ( (rWordType
== WordType::ANYWORD_IGNOREWHITESPACES
||
256 rWordType
== WordType::DICTIONARY_WORD
) &&
257 u_isWhitespace(Text
.iterateCodePoints(&result
.startPos
, 0)) )
258 result
.startPos
= icuBI
->aBreakIterator
->following(result
.startPos
);
// The word ends at the boundary after its start; without one, the range
// collapses to startPos.
260 result
.endPos
= icuBI
->aBreakIterator
->following(result
.startPos
);
261 if(result
.endPos
== BreakIterator::DONE
)
262 result
.endPos
= result
.startPos
;
// Computes the boundary of the word that precedes nStartPos in Text, using
// the word break iterator for rLocale and rWordType.
// NOTE(review): the declaration of `result`, the branch structure between
// the statements below and the return statement are not visible in this
// excerpt.
268 Boundary SAL_CALL
BreakIterator_Unicode::previousWord(const OUString
& Text
, sal_Int32 nStartPos
,
269 const lang::Locale
& rLocale
, sal_Int16 rWordType
) throw(uno::RuntimeException
, std::exception
)
271 loadICUBreakIterator(rLocale
, LOAD_WORD_BREAKITERATOR
, rWordType
, NULL
, Text
);
// Start at the last boundary before nStartPos.
273 result
.startPos
= icuBI
->aBreakIterator
->preceding(nStartPos
);
// Before the start of the text (or no boundary): empty range.
274 if( result
.startPos
< 0 || result
.startPos
== BreakIterator::DONE
)
275 result
.endPos
= result
.startPos
;
// For these two word types a boundary that lands on whitespace is skipped by
// stepping back to the previous boundary.
277 if ( (rWordType
== WordType::ANYWORD_IGNOREWHITESPACES
||
278 rWordType
== WordType::DICTIONARY_WORD
) &&
279 u_isWhitespace(Text
.iterateCodePoints(&result
.startPos
, 0)) )
280 result
.startPos
= icuBI
->aBreakIterator
->preceding(result
.startPos
);
// The word ends at the boundary after its start; without one, the range
// collapses to startPos.
282 result
.endPos
= icuBI
->aBreakIterator
->following(result
.startPos
);
283 if(result
.endPos
== BreakIterator::DONE
)
284 result
.endPos
= result
.startPos
;
// Computes the start and end of the word around nPos in Text, using the word
// break iterator for rLocale and rWordType. bDirection selects whether a
// position that is itself a boundary is extended forward or backward.
// NOTE(review): the declaration of `result`, some else/condition lines and
// the return statement are not visible in this excerpt.
290 Boundary SAL_CALL
BreakIterator_Unicode::getWordBoundary( const OUString
& Text
, sal_Int32 nPos
, const lang::Locale
& rLocale
,
291 sal_Int16 rWordType
, sal_Bool bDirection
) throw(uno::RuntimeException
, std::exception
)
293 loadICUBreakIterator(rLocale
, LOAD_WORD_BREAKITERATOR
, rWordType
, NULL
, Text
);
294 sal_Int32 len
= Text
.getLength();
// nPos is itself a boundary: start from an empty range at nPos and extend it
// on one side only.
296 if(icuBI
->aBreakIterator
->isBoundary(nPos
)) {
297 result
.startPos
= result
.endPos
= nPos
;
298 if((bDirection
|| nPos
== 0) && nPos
< len
) //forward
299 result
.endPos
= icuBI
->aBreakIterator
->following(nPos
);
301 result
.startPos
= icuBI
->aBreakIterator
->preceding(nPos
);
// Empty text yields end 0; otherwise the end is the first boundary after 0.
305 result
.endPos
= len
? icuBI
->aBreakIterator
->following((sal_Int32
)0) : 0;
// Position at or past the end: the word starts at the last boundary before
// the end of the text.
306 } else if(nPos
>= len
) {
307 result
.startPos
= icuBI
->aBreakIterator
->preceding(len
);
// Position inside a word: take the surrounding boundaries.
310 result
.startPos
= icuBI
->aBreakIterator
->preceding(nPos
);
311 result
.endPos
= icuBI
->aBreakIterator
->following(nPos
);
// If one side has no boundary (DONE), collapse it onto the other side.
314 if (result
.startPos
== BreakIterator::DONE
)
315 result
.startPos
= result
.endPos
;
316 else if (result
.endPos
== BreakIterator::DONE
)
317 result
.endPos
= result
.startPos
;
// Locates the beginning of the sentence around nStartPos in Text, using the
// sentence break iterator for rLocale, and skips whitespace that follows the
// sentence boundary.
// NOTE(review): the return statement is not visible in this excerpt.
323 sal_Int32 SAL_CALL
BreakIterator_Unicode::beginOfSentence( const OUString
& Text
, sal_Int32 nStartPos
,
324 const lang::Locale
&rLocale
) throw(uno::RuntimeException
, std::exception
)
326 loadICUBreakIterator(rLocale
, LOAD_SENTENCE_BREAKITERATOR
, 0, "sent", Text
);
328 sal_Int32 len
= Text
.getLength();
329 if (len
> 0 && nStartPos
== len
)
330 Text
.iterateCodePoints(&nStartPos
, -1); // issue #i27703# treat end position as part of last sentence
// Not on a boundary: move back to the previous sentence boundary.
331 if (!sentence
.aBreakIterator
->isBoundary(nStartPos
))
332 nStartPos
= sentence
.aBreakIterator
->preceding(nStartPos
);
334 // skip preceding space.
// Read code points forward while they are whitespace ...
335 sal_uInt32 ch
= Text
.iterateCodePoints(&nStartPos
, 1);
336 while (nStartPos
< len
&& u_isWhitespace(ch
)) ch
= Text
.iterateCodePoints(&nStartPos
, 1);
// ... then step back one code point to undo the last advance.
337 Text
.iterateCodePoints(&nStartPos
, -1);
// Locates the end of the sentence around nStartPos in Text, using the
// sentence break iterator for rLocale, and moves the result back over any
// whitespace that directly precedes the boundary.
// NOTE(review): the return statement is not visible in this excerpt.
342 sal_Int32 SAL_CALL
BreakIterator_Unicode::endOfSentence( const OUString
& Text
, sal_Int32 nStartPos
,
343 const lang::Locale
&rLocale
) throw(uno::RuntimeException
, std::exception
)
345 loadICUBreakIterator(rLocale
, LOAD_SENTENCE_BREAKITERATOR
, 0, "sent", Text
);
347 sal_Int32 len
= Text
.getLength();
348 if (len
> 0 && nStartPos
== len
)
349 Text
.iterateCodePoints(&nStartPos
, -1); // issue #i27703# treat end position as part of last sentence
// Next sentence boundary after the (possibly adjusted) position.
350 nStartPos
= sentence
.aBreakIterator
->following(nStartPos
);
// Walk backwards; each whitespace code point found pulls nStartPos back to it.
352 sal_Int32 nPos
=nStartPos
;
353 while (nPos
> 0 && u_isWhitespace(Text
.iterateCodePoints(&nPos
, -1))) nStartPos
=nPos
;
// Determines where a line may be broken at or before nStartPos in Text.
// The result carries breakIndex and breakType, plus rHyphenatedWord when a
// hyphenation break was found. Three cases are handled below: nStartPos is
// itself a line boundary, a hyphenator is supplied in hOptions, or a plain
// word-boundary break before nStartPos is used.
// NOTE(review): many lines of this function (braces, else branches, loop
// bodies, the GlueSpace declaration, return statements) are not visible in
// this excerpt.
358 LineBreakResults SAL_CALL
BreakIterator_Unicode::getLineBreak(
359 const OUString
& Text
, sal_Int32 nStartPos
,
360 const lang::Locale
& rLocale
, sal_Int32 nMinBreakPos
,
361 const LineBreakHyphenationOptions
& hOptions
,
362 const LineBreakUserOptions
& /*rOptions*/ ) throw(uno::RuntimeException
, std::exception
)
364 LineBreakResults lbr
;
// At or past the end of the text: break at the end.
366 if (nStartPos
>= Text
.getLength()) {
367 lbr
.breakIndex
= Text
.getLength();
368 lbr
.breakType
= BreakType::WORDBOUNDARY
;
// lineRule names the rule data used for the line break iterator.
372 loadICUBreakIterator(rLocale
, LOAD_LINE_BREAKITERATOR
, 0, lineRule
, Text
);
// preceding(nStartPos + 1) == nStartPos means nStartPos itself is a line
// boundary.
376 if (line
.aBreakIterator
->preceding(nStartPos
+ 1) == nStartPos
) { //Line boundary break
377 lbr
.breakIndex
= nStartPos
;
378 lbr
.breakType
= BreakType::WORDBOUNDARY
;
379 } else if (hOptions
.rHyphenator
.is()) { //Hyphenation break
// Line boundary after the current iterator position, or 0 when there is none.
380 sal_Int32 boundary_with_punctuation
= (line
.aBreakIterator
->next() != BreakIterator::DONE
) ? line
.aBreakIterator
->current() : 0;
381 line
.aBreakIterator
->preceding(nStartPos
+ 1); // reset to check correct hyphenation of "word-word"
383 sal_Int32 nStartPosWordEnd
= nStartPos
;
384 while (line
.aBreakIterator
->current() < nStartPosWordEnd
&& u_ispunct((sal_uInt32
)Text
[nStartPosWordEnd
])) // starting punctuation
// Word around the adjusted position, using dictionary word rules.
387 Boundary wBoundary
= getWordBoundary( Text
, nStartPosWordEnd
, rLocale
,
388 WordType::DICTIONARY_WORD
, false);
390 nStartPosWordEnd
= wBoundary
.endPos
;
391 while (nStartPosWordEnd
< Text
.getLength() && (u_ispunct((sal_uInt32
)Text
[nStartPosWordEnd
]))) // ending punctuation
// From here on nStartPosWordEnd holds a length relative to the word end, not
// an absolute position; it is capped by hyphenIndex relative to the word start.
393 nStartPosWordEnd
= nStartPosWordEnd
- wBoundary
.endPos
;
394 if (hOptions
.hyphenIndex
- wBoundary
.startPos
< nStartPosWordEnd
) nStartPosWordEnd
= hOptions
.hyphenIndex
- wBoundary
.startPos
;
// Step boundary_with_punctuation back over SPACE characters, stopping at the
// word end, then make it relative to the word end.
396 while (boundary_with_punctuation
> wBoundary
.endPos
&& Text
[--boundary_with_punctuation
] == SPACE
);
397 if (boundary_with_punctuation
!= 0) boundary_with_punctuation
+= 1 - wBoundary
.endPos
;
// Ask the hyphenator for a break inside the word; the index passed is
// relative to the start of the word.
398 uno::Reference
< linguistic2::XHyphenatedWord
> aHyphenatedWord
;
399 aHyphenatedWord
= hOptions
.rHyphenator
->hyphenate(Text
.copy(wBoundary
.startPos
,
400 wBoundary
.endPos
- wBoundary
.startPos
), rLocale
,
401 (sal_Int16
) (hOptions
.hyphenIndex
- wBoundary
.startPos
- ((hOptions
.hyphenIndex
== wBoundary
.endPos
)? nStartPosWordEnd
: 0)), hOptions
.aHyphenationOptions
);
402 if (aHyphenatedWord
.is()) {
403 lbr
.rHyphenatedWord
= aHyphenatedWord
;
// NOTE(review): the statements guarded by this minimum-position check are not
// visible in this excerpt.
404 if(wBoundary
.startPos
+ aHyphenatedWord
->getHyphenationPos() + 1 < nMinBreakPos
)
407 lbr
.breakIndex
= wBoundary
.startPos
; //aHyphenatedWord->getHyphenationPos();
408 lbr
.breakType
= BreakType::HYPHENATION
;
410 // check not optimal hyphenation of "word-word" (word with hyphens)
411 if (lbr
.breakIndex
> -1 && wBoundary
.startPos
+ aHyphenatedWord
->getHyphenationPos() < line
.aBreakIterator
->current()) {
412 lbr
.breakIndex
= line
.aBreakIterator
->current();
413 lbr
.breakType
= BreakType::WORDBOUNDARY
;
// Fall back to the line boundary before nStartPos.
417 lbr
.breakIndex
= line
.aBreakIterator
->preceding(nStartPos
);
418 lbr
.breakType
= BreakType::WORDBOUNDARY
;;
420 } else { //word boundary break
421 lbr
.breakIndex
= line
.aBreakIterator
->preceding(nStartPos
);
422 lbr
.breakType
= BreakType::WORDBOUNDARY
;
425 #define WJ 0x2060 // Word Joiner
// For word-boundary breaks, scan backwards from the break position over
// whitespace and WJ characters.
// NOTE(review): the statements executed when a WJ is found, and the body of
// the final GlueSpace check, are not visible in this excerpt.
427 if (lbr
.breakType
== BreakType::WORDBOUNDARY
) {
428 nStartPos
= lbr
.breakIndex
;
429 if (Text
[nStartPos
--] == WJ
)
431 while (nStartPos
>= 0 &&
432 (u_isWhitespace(Text
.iterateCodePoints(&nStartPos
, 0)) || Text
[nStartPos
] == WJ
)) {
433 if (Text
[nStartPos
--] == WJ
)
436 if (GlueSpace
&& nStartPos
< 0) {
// Returns the implementation name stored in cBreakIterator, converted from
// ASCII to an OUString.
// NOTE(review): the return-type line of this definition is not visible in
// this excerpt.
447 BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException
, std::exception
)
449 return OUString::createFromAscii(cBreakIterator
);
// Reports whether rServiceName is supported by delegating to
// cppu::supportsService() for this object.
// NOTE(review): the return-type line of this definition is not visible in
// this excerpt.
453 BreakIterator_Unicode::supportsService(const OUString
& rServiceName
) throw( uno::RuntimeException
, std::exception
)
455 return cppu::supportsService(this, rServiceName
);
// Builds a one-element sequence whose single entry is the string stored in
// cBreakIterator.
// NOTE(review): the return statement is not visible in this excerpt.
458 uno::Sequence
< OUString
> SAL_CALL
459 BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException
, std::exception
)
461 uno::Sequence
< OUString
> aRet(1);
462 aRet
[0] = OUString::createFromAscii(cBreakIterator
);
// Exported C-linkage factory function: creates a new BreakIterator_Unicode
// and returns it through cppu::acquire(). Both parameters (component context
// and argument sequence) are unnamed and unused.
468 extern "C" SAL_DLLPUBLIC_EXPORT
css::uno::XInterface
* SAL_CALL
469 com_sun_star_i18n_BreakIterator_Unicode_get_implementation(
470 css::uno::XComponentContext
*,
471 css::uno::Sequence
<css::uno::Any
> const &)
473 return cppu::acquire(new css::i18n::BreakIterator_Unicode());
476 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */