1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <breakiterator_unicode.hxx>
21 #include <localedata.hxx>
22 #include <i18nlangtag/languagetag.hxx>
23 #include <i18nlangtag/languagetagicu.hxx>
24 #include <unicode/uchar.h>
25 #include <unicode/locid.h>
26 #include <unicode/rbbi.h>
27 #include <unicode/udata.h>
28 #include <rtl/strbuf.hxx>
29 #include <rtl/ustring.hxx>
// Data blob defined outside this file. loadICUBreakIterator() hands it to
// udata_setAppData() so ICU can resolve items of the "OpenOffice" package.
33 extern const char OpenOffice_dat
[];
36 using namespace ::com::sun::star
;
37 using namespace ::com::sun::star::lang
;
38 using namespace ::rtl
;
40 namespace com
{ namespace sun
{ namespace star
{ namespace i18n
{
// Constructor. Stores the implementation-name string in cBreakIterator; that
// member is what getImplementationName(), supportsService() and
// getSupportedServiceNames() report.
43 BreakIterator_Unicode::BreakIterator_Unicode() :
44 cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ), // implementation name
// Destructor. Deletes the break iterator held by each cache slot: the
// character, sentence and line slots and every element of the words array.
// Slots that never had an iterator loaded hold a null pointer, for which
// delete is a no-op.
51 BreakIterator_Unicode::~BreakIterator_Unicode()
53 delete character
.aBreakIterator
;
54 delete sentence
.aBreakIterator
;
55 delete line
.aBreakIterator
;
// One slot per word type.
56 for (size_t i
= 0; i
< SAL_N_ELEMENTS(words
); i
++)
57 delete words
[i
].aBreakIterator
;
61 Wrapper class to provide public access to the RuleBasedBreakIterator's
// Subclass of ICU's RuleBasedBreakIterator used by loadICUBreakIterator().
// It adds publicSetBreakType() and a constructor that forwards a UDataMemory
// image plus a status variable to the base-class constructor.
64 class OOoRuleBasedBreakIterator
: public RuleBasedBreakIterator
{
// NOTE(review): presumably forwards 'type' to the base class's break-type
// setter - confirm against the method body.
66 inline void publicSetBreakType(int32_t type
) {
// Builds the iterator from a rule image obtained via udata_open().
69 OOoRuleBasedBreakIterator(UDataMemory
* image
,
71 RuleBasedBreakIterator(image
, status
) { };
75 // loading ICU breakiterator on demand.
// Makes sure the cached break iterator selected by rBreakType (and, for word
// iterators, by nWordType) exists for rLocale and is attached to rText.
//
//   rLocale    - locale to build the iterator for; compared field by field
//                (Language, Country, Variant) with the cached maLocale
//   rBreakType - one of the LOAD_*_BREAKITERATOR selectors
//   nWordType  - WordType constant; indexes words[] when a word iterator is
//                requested
//   rule       - base name of the rule data item; replaced by a fixed name
//                for the word types other than ANY_WORD
//   rText      - text the iterator should operate on
//
// Throws uno::RuntimeException when ICU reports an error while registering
// the data, creating the stock iterator, opening the text or setting it.
76 void SAL_CALL
BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale
& rLocale
,
77 sal_Int16 rBreakType
, sal_Int16 nWordType
, const sal_Char
*rule
, const OUString
& rText
) throw(uno::RuntimeException
)
// newBreak is tested near the end together with the cached text to decide
// whether the text has to be (re)attached to the iterator.
79 sal_Bool newBreak
= sal_False
;
80 UErrorCode status
= U_ZERO_ERROR
;
81 sal_Int16 breakType
= 0;
// Pick the cache slot (icuBI) for the requested iterator kind. breakType is
// the index later used to look up a rule name in the locale's breakRules.
83 case LOAD_CHARACTER_BREAKITERATOR
: icuBI
=&character
; breakType
= 3; break;
84 case LOAD_WORD_BREAKITERATOR
:
85 assert (nWordType
>= 0 && nWordType
<= WordType::WORD_COUNT
);
86 icuBI
=&words
[nWordType
];
88 case WordType::ANY_WORD
: break; // odd but previous behavior
89 case WordType::ANYWORD_IGNOREWHITESPACES
:
90 breakType
= 0; rule
= wordRule
= "edit_word"; break;
91 case WordType::DICTIONARY_WORD
:
92 breakType
= 1; rule
= wordRule
= "dict_word"; break;
94 case WordType::WORD_COUNT
:
95 breakType
= 2; rule
= wordRule
= "count_word"; break;
98 case LOAD_SENTENCE_BREAKITERATOR
: icuBI
=&sentence
; breakType
= 5; break;
99 case LOAD_LINE_BREAKITERATOR
: icuBI
=&line
; breakType
= 4; break;
// (Re)build the iterator when the slot is empty or was built for a
// different locale.
101 if (!icuBI
->aBreakIterator
||
102 rLocale
.Language
!= icuBI
->maLocale
.Language
||
103 rLocale
.Country
!= icuBI
->maLocale
.Country
||
104 rLocale
.Variant
!= icuBI
->maLocale
.Variant
) {
// Drop the iterator that belonged to the previous locale.
105 if (icuBI
->aBreakIterator
) {
106 delete icuBI
->aBreakIterator
;
107 icuBI
->aBreakIterator
=NULL
;
// Rule names supplied by the locale data for rLocale.
110 uno::Sequence
< OUString
> breakRules
= LocaleData().getBreakIteratorRules(rLocale
);
// Register the OpenOffice_dat blob with ICU as package "OpenOffice".
112 status
= U_ZERO_ERROR
;
113 udata_setAppData("OpenOffice", OpenOffice_dat
, &status
);
114 if ( !U_SUCCESS(status
) ) throw uno::RuntimeException();
116 OOoRuleBasedBreakIterator
*rbi
= NULL
;
// First choice: a non-empty rule name given by the locale data at index
// breakType.
118 if (breakRules
.getLength() > breakType
&& !breakRules
[breakType
].isEmpty())
120 rbi
= new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
121 OUStringToOString(breakRules
[breakType
], RTL_TEXTENCODING_ASCII_US
).getStr(), &status
), status
);
123 //use icu's breakiterator for Thai, Khmer, Tibetan and Dzongkha
124 else if (rLocale
.Language
!= "th" && rLocale
.Language
!= "km" && rLocale
.Language
!= "bo" && rLocale
.Language
!= "dz")
// Second choice: a data item whose name is built from 'rule' and the
// language code of rLocale.
126 status
= U_ZERO_ERROR
;
127 OStringBuffer
aUDName(64);
128 aUDName
.append(rule
);
130 aUDName
.append( OUStringToOString(rLocale
.Language
, RTL_TEXTENCODING_ASCII_US
));
131 UDataMemory
* pUData
= udata_open("OpenOffice", "brk", aUDName
.getStr(), &status
);
132 if( U_SUCCESS(status
) )
133 rbi
= new OOoRuleBasedBreakIterator( pUData
, status
);
// Third choice: the data item named by 'rule' alone.
134 if (!U_SUCCESS(status
) ) {
135 status
= U_ZERO_ERROR
;
136 pUData
= udata_open("OpenOffice", "brk", rule
, &status
);
137 if( U_SUCCESS(status
) )
138 rbi
= new OOoRuleBasedBreakIterator( pUData
, status
);
139 if (!U_SUCCESS(status
) ) icuBI
->aBreakIterator
=NULL
;
// Tell the rule-based iterator which ICU break kind it represents, then
// store it in the cache slot.
143 switch (rBreakType
) {
144 case LOAD_CHARACTER_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_CHARACTER
); break;
145 case LOAD_WORD_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_WORD
); break;
146 case LOAD_SENTENCE_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_SENTENCE
); break;
147 case LOAD_LINE_BREAKITERATOR
: rbi
->publicSetBreakType(UBRK_LINE
); break;
149 icuBI
->aBreakIterator
= rbi
;
// Fallback: no rule-based iterator was installed, so ask ICU for its stock
// iterator of the requested kind for this locale.
153 if (!icuBI
->aBreakIterator
) {
154 icu::Locale
icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale
)));
156 status
= U_ZERO_ERROR
;
157 switch (rBreakType
) {
158 case LOAD_CHARACTER_BREAKITERATOR
:
159 icuBI
->aBreakIterator
= icu::BreakIterator::createCharacterInstance(icuLocale
, status
);
161 case LOAD_WORD_BREAKITERATOR
:
162 icuBI
->aBreakIterator
= icu::BreakIterator::createWordInstance(icuLocale
, status
);
164 case LOAD_SENTENCE_BREAKITERATOR
:
165 icuBI
->aBreakIterator
= icu::BreakIterator::createSentenceInstance(icuLocale
, status
);
167 case LOAD_LINE_BREAKITERATOR
:
168 icuBI
->aBreakIterator
= icu::BreakIterator::createLineInstance(icuLocale
, status
);
// ICU failed to create the stock iterator: clear the slot and report.
171 if ( !U_SUCCESS(status
) ) {
172 icuBI
->aBreakIterator
=NULL
;
173 throw uno::RuntimeException();
// Remember which locale the cached iterator was built for.
176 if (icuBI
->aBreakIterator
) {
177 icuBI
->maLocale
=rLocale
;
180 throw uno::RuntimeException();
// Attach the text when newBreak is set or the text differs from the copy
// cached in aICUText.
184 if (newBreak
|| !icuBI
->aICUText
.equals(rText
))
186 // UChar != sal_Unicode in MinGW
187 const UChar
*pText
= reinterpret_cast<const UChar
*>(rText
.getStr());
// Passing the existing icuBI->ut lets ICU reuse that UText object.
189 icuBI
->ut
= utext_openUChars(icuBI
->ut
, pText
, rText
.getLength(), &status
);
191 if (!U_SUCCESS(status
))
192 throw uno::RuntimeException();
194 icuBI
->aBreakIterator
->setText(icuBI
->ut
, status
);
196 if (!U_SUCCESS(status
))
197 throw uno::RuntimeException();
// Keep a copy so the next call with the same text can skip setText.
199 icuBI
->aICUText
= rText
;
// Moves nStartPos forward through Text by up to nCount steps and records the
// number of completed steps in nDone.
//
//   nCharacterIteratorMode - SKIPCELL steps with the character break
//                            iterator; any other mode steps one code point
//                            at a time
//
// In SKIPCELL mode, Text.getLength() is returned as soon as the iterator
// reports BreakIterator::DONE.
203 sal_Int32 SAL_CALL
BreakIterator_Unicode::nextCharacters( const OUString
& Text
,
204 sal_Int32 nStartPos
, const lang::Locale
&rLocale
,
205 sal_Int16 nCharacterIteratorMode
, sal_Int32 nCount
, sal_Int32
& nDone
)
206 throw(uno::RuntimeException
)
208 if (nCharacterIteratorMode
== CharacterIteratorMode::SKIPCELL
) { // for CELL mode
209 loadICUBreakIterator(rLocale
, LOAD_CHARACTER_BREAKITERATOR
, 0, "char", Text
);
// Each step jumps to the next boundary reported by the character iterator.
210 for (nDone
= 0; nDone
< nCount
; nDone
++) {
211 nStartPos
= character
.aBreakIterator
->following(nStartPos
);
212 if (nStartPos
== BreakIterator::DONE
)
213 return Text
.getLength();
215 } else { // for CHARACTER mode
// Stops early once nStartPos reaches the end of the text.
216 for (nDone
= 0; nDone
< nCount
&& nStartPos
< Text
.getLength(); nDone
++)
217 Text
.iterateCodePoints(&nStartPos
, 1);
// Moves nStartPos backward through Text by up to nCount steps and records
// the number of completed steps in nDone.
//
//   nCharacterIteratorMode - SKIPCELL steps with the character break
//                            iterator; any other mode steps one code point
//                            at a time
222 sal_Int32 SAL_CALL
BreakIterator_Unicode::previousCharacters( const OUString
& Text
,
223 sal_Int32 nStartPos
, const lang::Locale
& rLocale
,
224 sal_Int16 nCharacterIteratorMode
, sal_Int32 nCount
, sal_Int32
& nDone
)
225 throw(uno::RuntimeException
)
227 if (nCharacterIteratorMode
== CharacterIteratorMode::SKIPCELL
) { // for CELL mode
228 loadICUBreakIterator(rLocale
, LOAD_CHARACTER_BREAKITERATOR
, 0, "char", Text
);
// Each step jumps to the boundary before nStartPos; DONE ends the walk.
229 for (nDone
= 0; nDone
< nCount
; nDone
++) {
230 nStartPos
= character
.aBreakIterator
->preceding(nStartPos
);
231 if (nStartPos
== BreakIterator::DONE
)
234 } else { // for BS to delete one char and CHARACTER mode.
// Stops early once nStartPos reaches the start of the text.
235 for (nDone
= 0; nDone
< nCount
&& nStartPos
> 0; nDone
++)
236 Text
.iterateCodePoints(&nStartPos
, -1);
// Finds the word that follows nStartPos in Text, using the word break
// iterator loaded for rLocale and rWordType. The boundary is written to
// 'result' (startPos / endPos).
242 Boundary SAL_CALL
BreakIterator_Unicode::nextWord( const OUString
& Text
, sal_Int32 nStartPos
,
243 const lang::Locale
& rLocale
, sal_Int16 rWordType
) throw(uno::RuntimeException
)
// No rule name is passed: loadICUBreakIterator() chooses one per word type.
245 loadICUBreakIterator(rLocale
, LOAD_WORD_BREAKITERATOR
, rWordType
, NULL
, Text
);
247 result
.startPos
= icuBI
->aBreakIterator
->following(nStartPos
);
// No further boundary inside the text: report an empty range.
248 if( result
.startPos
>= Text
.getLength() || result
.startPos
== BreakIterator::DONE
)
249 result
.endPos
= result
.startPos
;
// For these two word types a segment that starts with whitespace is not
// treated as a word; move on to the next boundary.
251 if ( (rWordType
== WordType::ANYWORD_IGNOREWHITESPACES
||
252 rWordType
== WordType::DICTIONARY_WORD
) &&
253 u_isWhitespace(Text
.iterateCodePoints(&result
.startPos
, 0)) )
254 result
.startPos
= icuBI
->aBreakIterator
->following(result
.startPos
);
// The word ends at the next boundary; without one the range is empty.
256 result
.endPos
= icuBI
->aBreakIterator
->following(result
.startPos
);
257 if(result
.endPos
== BreakIterator::DONE
)
258 result
.endPos
= result
.startPos
;
// Finds the word that precedes nStartPos in Text, using the word break
// iterator loaded for rLocale and rWordType. The boundary is written to
// 'result' (startPos / endPos).
264 Boundary SAL_CALL
BreakIterator_Unicode::previousWord(const OUString
& Text
, sal_Int32 nStartPos
,
265 const lang::Locale
& rLocale
, sal_Int16 rWordType
) throw(uno::RuntimeException
)
// No rule name is passed: loadICUBreakIterator() chooses one per word type.
267 loadICUBreakIterator(rLocale
, LOAD_WORD_BREAKITERATOR
, rWordType
, NULL
, Text
);
269 result
.startPos
= icuBI
->aBreakIterator
->preceding(nStartPos
);
// No earlier boundary: report an empty range.
270 if( result
.startPos
< 0 || result
.startPos
== BreakIterator::DONE
)
271 result
.endPos
= result
.startPos
;
// For these two word types a segment that starts with whitespace is not
// treated as a word; step back one more boundary.
273 if ( (rWordType
== WordType::ANYWORD_IGNOREWHITESPACES
||
274 rWordType
== WordType::DICTIONARY_WORD
) &&
275 u_isWhitespace(Text
.iterateCodePoints(&result
.startPos
, 0)) )
276 result
.startPos
= icuBI
->aBreakIterator
->preceding(result
.startPos
);
// The word ends at the boundary after startPos; without one the range is
// empty.
278 result
.endPos
= icuBI
->aBreakIterator
->following(result
.startPos
);
279 if(result
.endPos
== BreakIterator::DONE
)
280 result
.endPos
= result
.startPos
;
// Determines the word boundary around nPos in Text, using the word break
// iterator loaded for rLocale and rWordType. The boundary is written to
// 'result'.
//
//   bDirection - when nPos is itself a boundary, a true value selects the
//                word after nPos (see the "forward" branch below)
286 Boundary SAL_CALL
BreakIterator_Unicode::getWordBoundary( const OUString
& Text
, sal_Int32 nPos
, const lang::Locale
& rLocale
,
287 sal_Int16 rWordType
, sal_Bool bDirection
) throw(uno::RuntimeException
)
289 loadICUBreakIterator(rLocale
, LOAD_WORD_BREAKITERATOR
, rWordType
, NULL
, Text
);
290 sal_Int32 len
= Text
.getLength();
// nPos sits exactly on a boundary: start from an empty range at nPos and
// extend it to one side.
292 if(icuBI
->aBreakIterator
->isBoundary(nPos
)) {
293 result
.startPos
= result
.endPos
= nPos
;
// Position 0 always extends forward; the end of the text never does.
294 if((bDirection
|| nPos
== 0) && nPos
< len
) //forward
295 result
.endPos
= icuBI
->aBreakIterator
->following(nPos
);
297 result
.startPos
= icuBI
->aBreakIterator
->preceding(nPos
);
// An empty text yields end position 0 without consulting the iterator.
301 result
.endPos
= len
? icuBI
->aBreakIterator
->following((sal_Int32
)0) : 0;
// nPos at or beyond the end: the search for the start begins at len.
302 } else if(nPos
>= len
) {
303 result
.startPos
= icuBI
->aBreakIterator
->preceding(len
);
// nPos inside a word: take the boundaries on both sides.
306 result
.startPos
= icuBI
->aBreakIterator
->preceding(nPos
);
307 result
.endPos
= icuBI
->aBreakIterator
->following(nPos
);
// Replace a DONE marker on one side with the other side's position.
310 if (result
.startPos
== BreakIterator::DONE
)
311 result
.startPos
= result
.endPos
;
312 else if (result
.endPos
== BreakIterator::DONE
)
313 result
.endPos
= result
.startPos
;
// Locates the start of the sentence that contains nStartPos, using the
// sentence break iterator for rLocale, and skips whitespace at the start of
// that sentence.
319 sal_Int32 SAL_CALL
BreakIterator_Unicode::beginOfSentence( const OUString
& Text
, sal_Int32 nStartPos
,
320 const lang::Locale
&rLocale
) throw(uno::RuntimeException
)
322 loadICUBreakIterator(rLocale
, LOAD_SENTENCE_BREAKITERATOR
, 0, "sent", Text
);
324 sal_Int32 len
= Text
.getLength();
325 if (len
> 0 && nStartPos
== len
)
326 Text
.iterateCodePoints(&nStartPos
, -1); // issue #i27703# treat end position as part of last sentence
// Not on a sentence boundary: go back to the boundary before nStartPos.
327 if (!sentence
.aBreakIterator
->isBoundary(nStartPos
))
328 nStartPos
= sentence
.aBreakIterator
->preceding(nStartPos
);
330 // skip preceding space.
// Each iterateCodePoints(.., 1) call reads one code point and advances
// nStartPos past it.
331 sal_uInt32 ch
= Text
.iterateCodePoints(&nStartPos
, 1);
332 while (nStartPos
< len
&& u_isWhitespace(ch
)) ch
= Text
.iterateCodePoints(&nStartPos
, 1);
// Step back one code point to undo the final read-ahead.
333 Text
.iterateCodePoints(&nStartPos
, -1);
// Locates the end of the sentence that contains nStartPos, using the
// sentence break iterator for rLocale, and moves the result back over
// whitespace that precedes the boundary.
338 sal_Int32 SAL_CALL
BreakIterator_Unicode::endOfSentence( const OUString
& Text
, sal_Int32 nStartPos
,
339 const lang::Locale
&rLocale
) throw(uno::RuntimeException
)
341 loadICUBreakIterator(rLocale
, LOAD_SENTENCE_BREAKITERATOR
, 0, "sent", Text
);
343 sal_Int32 len
= Text
.getLength();
344 if (len
> 0 && nStartPos
== len
)
345 Text
.iterateCodePoints(&nStartPos
, -1); // issue #i27703# treat end position as part of last sentence
346 nStartPos
= sentence
.aBreakIterator
->following(nStartPos
);
// Walk backwards with nPos; nStartPos follows for as long as the code point
// just stepped over is whitespace.
348 sal_Int32 nPos
=nStartPos
;
349 while (nPos
> 0 && u_isWhitespace(Text
.iterateCodePoints(&nPos
, -1))) nStartPos
=nPos
;
// Computes a line-break position in Text for a line that has to be broken
// at or before nStartPos.
//
//   nMinBreakPos - compared with the candidate hyphenation position
//   hOptions     - supplies the optional hyphenator (rHyphenator), the
//                  hyphenIndex and aHyphenationOptions
//
// The result is collected in lbr: breakIndex, breakType and, when the
// hyphenator returned a word, rHyphenatedWord.
354 LineBreakResults SAL_CALL
BreakIterator_Unicode::getLineBreak(
355 const OUString
& Text
, sal_Int32 nStartPos
,
356 const lang::Locale
& rLocale
, sal_Int32 nMinBreakPos
,
357 const LineBreakHyphenationOptions
& hOptions
,
358 const LineBreakUserOptions
& /*rOptions*/ ) throw(uno::RuntimeException
)
360 LineBreakResults lbr
;
// Start position at or beyond the end of the text: break at the text end.
362 if (nStartPos
>= Text
.getLength()) {
363 lbr
.breakIndex
= Text
.getLength();
364 lbr
.breakType
= BreakType::WORDBOUNDARY
;
368 loadICUBreakIterator(rLocale
, LOAD_LINE_BREAKITERATOR
, 0, lineRule
, Text
);
370 sal_Bool GlueSpace
=sal_True
;
// preceding(nStartPos + 1) == nStartPos means nStartPos itself is a boundary
// of the line iterator.
372 if (line
.aBreakIterator
->preceding(nStartPos
+ 1) == nStartPos
) { //Line boundary break
373 lbr
.breakIndex
= nStartPos
;
374 lbr
.breakType
= BreakType::WORDBOUNDARY
;
375 } else if (hOptions
.rHyphenator
.is()) { //Hyphenation break
// Hand the dictionary word around nStartPos to the hyphenator; the hyphen
// index is passed relative to the start of that word.
376 Boundary wBoundary
= getWordBoundary( Text
, nStartPos
, rLocale
,
377 WordType::DICTIONARY_WORD
, false);
378 uno::Reference
< linguistic2::XHyphenatedWord
> aHyphenatedWord
;
379 aHyphenatedWord
= hOptions
.rHyphenator
->hyphenate(Text
.copy(wBoundary
.startPos
,
380 wBoundary
.endPos
- wBoundary
.startPos
), rLocale
,
381 (sal_Int16
) (hOptions
.hyphenIndex
- wBoundary
.startPos
), hOptions
.aHyphenationOptions
);
382 if (aHyphenatedWord
.is()) {
383 lbr
.rHyphenatedWord
= aHyphenatedWord
;
// Checks whether the hyphenation position lies before nMinBreakPos.
384 if(wBoundary
.startPos
+ aHyphenatedWord
->getHyphenationPos() + 1 < nMinBreakPos
)
387 lbr
.breakIndex
= wBoundary
.startPos
; //aHyphenatedWord->getHyphenationPos();
388 lbr
.breakType
= BreakType::HYPHENATION
;
// Use the line boundary before nStartPos instead.
390 lbr
.breakIndex
= line
.aBreakIterator
->preceding(nStartPos
);
391 lbr
.breakType
= BreakType::WORDBOUNDARY
;;
393 } else { //word boundary break
394 lbr
.breakIndex
= line
.aBreakIterator
->preceding(nStartPos
);
395 lbr
.breakType
= BreakType::WORDBOUNDARY
;
398 #define WJ 0x2060 // Word Joiner
// For word-boundary breaks, walk backwards from the break index over
// whitespace and WORD JOINER characters.
400 if (lbr
.breakType
== BreakType::WORDBOUNDARY
) {
401 nStartPos
= lbr
.breakIndex
;
402 if (Text
[nStartPos
--] == WJ
)
404 while (nStartPos
>= 0 &&
405 (u_isWhitespace(Text
.iterateCodePoints(&nStartPos
, 0)) || Text
[nStartPos
] == WJ
)) {
406 if (Text
[nStartPos
--] == WJ
)
// nStartPos < 0 here means the backward walk went past the start of the
// text.
409 if (GlueSpace
&& nStartPos
< 0) {
// Returns the implementation name stored in cBreakIterator as an OUString.
422 BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException
)
424 return OUString::createFromAscii(cBreakIterator
);
// Reports whether rServiceName names this service: the result is true
// exactly when compareToAscii() against cBreakIterator yields 0.
428 BreakIterator_Unicode::supportsService(const OUString
& rServiceName
) throw( uno::RuntimeException
)
430 return !rServiceName
.compareToAscii(cBreakIterator
);
// Builds a one-element sequence whose only entry is the name stored in
// cBreakIterator.
433 uno::Sequence
< OUString
> SAL_CALL
434 BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException
)
436 uno::Sequence
< OUString
> aRet(1);
437 aRet
[0] = OUString::createFromAscii(cBreakIterator
);
443 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */