merged tag ooo/OOO330_m14
[LibreOffice.git] / i18npool / source / breakiterator / breakiterator_unicode.cxx
blobd7242d180d8586bcbe00c5731567762cc37493e0
1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * Copyright 2000, 2010 Oracle and/or its affiliates.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * This file is part of OpenOffice.org.
11 * OpenOffice.org is free software: you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser General Public License version 3
13 * only, as published by the Free Software Foundation.
15 * OpenOffice.org is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License version 3 for more details
19 * (a copy is included in the LICENSE file that accompanied this code).
21 * You should have received a copy of the GNU Lesser General Public License
22 * version 3 along with OpenOffice.org. If not, see
23 * <http://www.openoffice.org/license.html>
24 * for a copy of the LGPLv3 License.
26 ************************************************************************/
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_i18npool.hxx"
30 #include <breakiterator_unicode.hxx>
31 #include <localedata.hxx>
32 #include <unicode/uchar.h>
33 #include <unicode/locid.h>
34 #include <unicode/rbbi.h>
35 #include <unicode/udata.h>
36 #include <rtl/strbuf.hxx>
37 #include <rtl/ustring.hxx>
39 U_CDECL_BEGIN
40 extern const char OpenOffice_dat[];
41 U_CDECL_END
43 using namespace ::com::sun::star;
44 using namespace ::com::sun::star::lang;
45 using namespace ::rtl;
47 namespace com { namespace sun { namespace star { namespace i18n {
49 #define ERROR ::com::sun::star::uno::RuntimeException()
51 //#define ImplementName "com.sun.star.i18n.BreakIterator_Unicode";
54 BreakIterator_Unicode::BreakIterator_Unicode() :
55 cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ), // implementation name
56 wordRule( "word" ),
57 lineRule( "line" ),
58 result(),
59 character(),
60 word(),
61 sentence(),
62 line(),
63 icuBI( NULL ),
64 aLocale(),
65 aBreakType(),
66 aWordType()
71 BreakIterator_Unicode::~BreakIterator_Unicode()
73 if (icuBI && icuBI->aBreakIterator) {
74 delete icuBI->aBreakIterator;
75 icuBI->aBreakIterator=NULL;
77 if (character.aBreakIterator) delete character.aBreakIterator;
78 if (word.aBreakIterator) delete word.aBreakIterator;
79 if (sentence.aBreakIterator) delete sentence.aBreakIterator;
80 if (line.aBreakIterator) delete line.aBreakIterator;
84 Wrapper class to provide public access to the RuleBasedBreakIterator's
85 setbreakType method.
87 class OOoRuleBasedBreakIterator : public RuleBasedBreakIterator {
88 public:
89 inline void publicSetBreakType(int32_t type) {
90 setBreakType(type);
92 OOoRuleBasedBreakIterator(UDataMemory* image,
93 UErrorCode &status) :
94 RuleBasedBreakIterator(image, status) { };
98 // loading ICU breakiterator on demand.
99 void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale,
100 sal_Int16 rBreakType, sal_Int16 rWordType, const sal_Char *rule, const OUString& rText) throw(uno::RuntimeException)
102 sal_Bool newBreak = sal_False;
103 UErrorCode status = U_ZERO_ERROR;
104 sal_Int16 breakType = 0;
105 switch (rBreakType) {
106 case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
107 case LOAD_WORD_BREAKITERATOR: icuBI=&word;
108 switch (rWordType) {
109 case WordType::ANYWORD_IGNOREWHITESPACES: breakType = 0; rule=wordRule = "edit_word"; break;
110 case WordType::DICTIONARY_WORD: breakType = 1; rule=wordRule = "dict_word"; break;
111 case WordType::WORD_COUNT: breakType = 2; rule=wordRule = "count_word"; break;
113 break;
114 case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
115 case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
117 if (!icuBI->aBreakIterator || rWordType != aWordType ||
118 rLocale.Language != aLocale.Language || rLocale.Country != aLocale.Country ||
119 rLocale.Variant != aLocale.Variant) {
120 if (icuBI->aBreakIterator) {
121 delete icuBI->aBreakIterator;
122 icuBI->aBreakIterator=NULL;
124 if (rule) {
125 uno::Sequence< OUString > breakRules = LocaleData().getBreakIteratorRules(rLocale);
127 status = U_ZERO_ERROR;
128 udata_setAppData("OpenOffice", OpenOffice_dat, &status);
129 if ( !U_SUCCESS(status) ) throw ERROR;
131 OOoRuleBasedBreakIterator *rbi = NULL;
133 if (breakRules.getLength() > breakType && breakRules[breakType].getLength() > 0) {
134 rbi = new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
135 OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
136 } else {
137 status = U_ZERO_ERROR;
138 OStringBuffer aUDName(64);
139 aUDName.append(rule);
140 aUDName.append('_');
141 aUDName.append( OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US));
142 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
143 if( U_SUCCESS(status) )
144 rbi = new OOoRuleBasedBreakIterator( pUData, status);
145 if (!U_SUCCESS(status) ) {
146 status = U_ZERO_ERROR;
147 pUData = udata_open("OpenOffice", "brk", rule, &status);
148 if( U_SUCCESS(status) )
149 rbi = new OOoRuleBasedBreakIterator( pUData, status);
150 if (!U_SUCCESS(status) ) icuBI->aBreakIterator=NULL;
153 if (rbi) {
154 switch (rBreakType) {
155 case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
156 case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
157 case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
158 case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
160 icuBI->aBreakIterator = rbi;
164 if (!icuBI->aBreakIterator) {
165 icu::Locale icuLocale(
166 OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US).getStr(),
167 OUStringToOString(rLocale.Country, RTL_TEXTENCODING_ASCII_US).getStr(),
168 OUStringToOString(rLocale.Variant, RTL_TEXTENCODING_ASCII_US).getStr());
170 status = U_ZERO_ERROR;
171 switch (rBreakType) {
172 case LOAD_CHARACTER_BREAKITERATOR:
173 icuBI->aBreakIterator = icu::BreakIterator::createCharacterInstance(icuLocale, status);
174 break;
175 case LOAD_WORD_BREAKITERATOR:
176 icuBI->aBreakIterator = icu::BreakIterator::createWordInstance(icuLocale, status);
177 break;
178 case LOAD_SENTENCE_BREAKITERATOR:
179 icuBI->aBreakIterator = icu::BreakIterator::createSentenceInstance(icuLocale, status);
180 break;
181 case LOAD_LINE_BREAKITERATOR:
182 icuBI->aBreakIterator = icu::BreakIterator::createLineInstance(icuLocale, status);
183 break;
185 if ( !U_SUCCESS(status) ) {
186 icuBI->aBreakIterator=NULL;
187 throw ERROR;
190 if (icuBI->aBreakIterator) {
191 aLocale=rLocale;
192 aWordType=rWordType;
193 aBreakType=rBreakType;
194 newBreak=sal_True;
195 } else {
196 throw ERROR;
200 if (newBreak || icuBI->aICUText.compare(UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength()))) { // UChar != sal_Unicode in MinGW
201 icuBI->aICUText=UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength());
202 icuBI->aBreakIterator->setText(icuBI->aICUText);
207 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
208 sal_Int32 nStartPos, const lang::Locale &rLocale,
209 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
210 throw(uno::RuntimeException)
212 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
213 loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
214 for (nDone = 0; nDone < nCount; nDone++) {
215 nStartPos = character.aBreakIterator->following(nStartPos);
216 if (nStartPos == BreakIterator::DONE)
217 return Text.getLength();
219 } else { // for CHARACTER mode
220 for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
221 Text.iterateCodePoints(&nStartPos, 1);
223 return nStartPos;
226 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
227 sal_Int32 nStartPos, const lang::Locale& rLocale,
228 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
229 throw(uno::RuntimeException)
231 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
232 loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
233 for (nDone = 0; nDone < nCount; nDone++) {
234 nStartPos = character.aBreakIterator->preceding(nStartPos);
235 if (nStartPos == BreakIterator::DONE)
236 return 0;
238 } else { // for BS to delete one char and CHARACTER mode.
239 for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
240 Text.iterateCodePoints(&nStartPos, -1);
242 return nStartPos;
246 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
247 const lang::Locale& rLocale, sal_Int16 rWordType ) throw(uno::RuntimeException)
249 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
251 result.startPos = word.aBreakIterator->following(nStartPos);
252 if( result.startPos >= Text.getLength() || result.startPos == BreakIterator::DONE )
253 result.endPos = result.startPos;
254 else {
255 if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
256 rWordType == WordType::DICTIONARY_WORD ) &&
257 u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
258 result.startPos = word.aBreakIterator->following(result.startPos);
260 result.endPos = word.aBreakIterator->following(result.startPos);
261 if(result.endPos == BreakIterator::DONE)
262 result.endPos = result.startPos;
264 return result;
268 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
269 const lang::Locale& rLocale, sal_Int16 rWordType) throw(uno::RuntimeException)
271 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
273 result.startPos = word.aBreakIterator->preceding(nStartPos);
274 if( result.startPos < 0 || result.startPos == BreakIterator::DONE)
275 result.endPos = result.startPos;
276 else {
277 if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
278 rWordType == WordType::DICTIONARY_WORD) &&
279 u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
280 result.startPos = word.aBreakIterator->preceding(result.startPos);
282 result.endPos = word.aBreakIterator->following(result.startPos);
283 if(result.endPos == BreakIterator::DONE)
284 result.endPos = result.startPos;
286 return result;
290 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
291 sal_Int16 rWordType, sal_Bool bDirection ) throw(uno::RuntimeException)
293 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
294 sal_Int32 len = Text.getLength();
296 if(word.aBreakIterator->isBoundary(nPos)) {
297 result.startPos = result.endPos = nPos;
298 if((bDirection || nPos == 0) && nPos < len) //forward
299 result.endPos = word.aBreakIterator->following(nPos);
300 else
301 result.startPos = word.aBreakIterator->preceding(nPos);
302 } else {
303 if(nPos <= 0) {
304 result.startPos = 0;
305 result.endPos = len ? word.aBreakIterator->following((sal_Int32)0) : 0;
306 } else if(nPos >= len) {
307 result.startPos = word.aBreakIterator->preceding(len);
308 result.endPos = len;
309 } else {
310 result.startPos = word.aBreakIterator->preceding(nPos);
311 result.endPos = word.aBreakIterator->following(nPos);
314 if (result.startPos == BreakIterator::DONE)
315 result.startPos = result.endPos;
316 else if (result.endPos == BreakIterator::DONE)
317 result.endPos = result.startPos;
319 return result;
323 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
324 const lang::Locale &rLocale ) throw(uno::RuntimeException)
326 loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
328 sal_Int32 len = Text.getLength();
329 if (len > 0 && nStartPos == len)
330 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
331 if (!sentence.aBreakIterator->isBoundary(nStartPos))
332 nStartPos = sentence.aBreakIterator->preceding(nStartPos);
334 // skip preceding space.
335 sal_uInt32 ch = Text.iterateCodePoints(&nStartPos, 1);
336 while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos, 1);
337 Text.iterateCodePoints(&nStartPos, -1);
339 return nStartPos;
342 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
343 const lang::Locale &rLocale ) throw(uno::RuntimeException)
345 loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
347 sal_Int32 len = Text.getLength();
348 if (len > 0 && nStartPos == len)
349 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
350 nStartPos = sentence.aBreakIterator->following(nStartPos);
352 sal_Int32 nPos=nStartPos;
353 while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
355 return nStartPos;
358 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
359 const OUString& Text, sal_Int32 nStartPos,
360 const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
361 const LineBreakHyphenationOptions& hOptions,
362 const LineBreakUserOptions& /*rOptions*/ ) throw(uno::RuntimeException)
364 LineBreakResults lbr;
366 if (nStartPos >= Text.getLength()) {
367 lbr.breakIndex = Text.getLength();
368 lbr.breakType = BreakType::WORDBOUNDARY;
369 return lbr;
372 loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
374 sal_Bool GlueSpace=sal_True;
375 while (GlueSpace) {
376 if (line.aBreakIterator->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
377 lbr.breakIndex = nStartPos;
378 lbr.breakType = BreakType::WORDBOUNDARY;
379 } else if (hOptions.rHyphenator.is()) { //Hyphenation break
380 Boundary wBoundary = getWordBoundary( Text, nStartPos, rLocale,
381 WordType::DICTIONARY_WORD, false);
382 uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord;
383 aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
384 wBoundary.endPos - wBoundary.startPos), rLocale,
385 (sal_Int16) (hOptions.hyphenIndex - wBoundary.startPos), hOptions.aHyphenationOptions);
386 if (aHyphenatedWord.is()) {
387 lbr.rHyphenatedWord = aHyphenatedWord;
388 if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
389 lbr.breakIndex = -1;
390 else
391 lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
392 lbr.breakType = BreakType::HYPHENATION;
393 } else {
394 lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
395 lbr.breakType = BreakType::WORDBOUNDARY;;
397 } else { //word boundary break
398 lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
399 lbr.breakType = BreakType::WORDBOUNDARY;
402 #define WJ 0x2060 // Word Joiner
403 GlueSpace=sal_False;
404 if (lbr.breakType == BreakType::WORDBOUNDARY) {
405 nStartPos = lbr.breakIndex;
406 if (Text[nStartPos--] == WJ)
407 GlueSpace=sal_True;
408 while (nStartPos >= 0 &&
409 (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
410 if (Text[nStartPos--] == WJ)
411 GlueSpace=sal_True;
413 if (GlueSpace && nStartPos < 0) {
414 lbr.breakIndex = 0;
415 break;
420 return lbr;
425 OUString SAL_CALL
426 BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException )
428 return OUString::createFromAscii(cBreakIterator);
431 sal_Bool SAL_CALL
432 BreakIterator_Unicode::supportsService(const OUString& rServiceName) throw( uno::RuntimeException )
434 return !rServiceName.compareToAscii(cBreakIterator);
437 uno::Sequence< OUString > SAL_CALL
438 BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException )
440 uno::Sequence< OUString > aRet(1);
441 aRet[0] = OUString::createFromAscii(cBreakIterator);
442 return aRet;
445 } } } }