Update ooo320-m1
[ooovba.git] / i18npool / source / breakiterator / breakiterator_unicode.cxx
blob5a6e1ab1315674dab2619ec9f53118311ff99c54
1 /*************************************************************************
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * Copyright 2008 by Sun Microsystems, Inc.
7 * OpenOffice.org - a multi-platform office productivity suite
9 * $RCSfile: breakiterator_unicode.cxx,v $
10 * $Revision: 1.36.2.1 $
12 * This file is part of OpenOffice.org.
14 * OpenOffice.org is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License version 3
16 * only, as published by the Free Software Foundation.
18 * OpenOffice.org is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU Lesser General Public License version 3 for more details
22 * (a copy is included in the LICENSE file that accompanied this code).
24 * You should have received a copy of the GNU Lesser General Public License
25 * version 3 along with OpenOffice.org. If not, see
26 * <http://www.openoffice.org/license.html>
27 * for a copy of the LGPLv3 License.
29 ************************************************************************/
31 // MARKER(update_precomp.py): autogen include statement, do not remove
32 #include "precompiled_i18npool.hxx"
33 #include <breakiterator_unicode.hxx>
34 #include <localedata.hxx>
35 #include <unicode/uchar.h>
36 #include <unicode/locid.h>
37 #include <unicode/rbbi.h>
38 #include <unicode/udata.h>
39 #include <rtl/strbuf.hxx>
40 #include <rtl/ustring.hxx>
42 U_CDECL_BEGIN
43 extern const char OpenOffice_dat[];
44 U_CDECL_END
46 using namespace ::com::sun::star;
47 using namespace ::com::sun::star::lang;
48 using namespace ::rtl;
50 namespace com { namespace sun { namespace star { namespace i18n {
52 #define ERROR ::com::sun::star::uno::RuntimeException()
54 //#define ImplementName "com.sun.star.i18n.BreakIterator_Unicode";
57 BreakIterator_Unicode::BreakIterator_Unicode() :
58 cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ), // implementation name
59 wordRule( "word" ),
60 lineRule( "line" ),
61 result(),
62 character(),
63 word(),
64 sentence(),
65 line(),
66 icuBI( NULL ),
67 aLocale(),
68 aBreakType(),
69 aWordType()
74 BreakIterator_Unicode::~BreakIterator_Unicode()
76 if (icuBI && icuBI->aBreakIterator) {
77 delete icuBI->aBreakIterator;
78 icuBI->aBreakIterator=NULL;
80 if (character.aBreakIterator) delete character.aBreakIterator;
81 if (word.aBreakIterator) delete word.aBreakIterator;
82 if (sentence.aBreakIterator) delete sentence.aBreakIterator;
83 if (line.aBreakIterator) delete line.aBreakIterator;
87 Wrapper class to provide public access to the RuleBasedBreakIterator's
88 setbreakType method.
90 class OOoRuleBasedBreakIterator : public RuleBasedBreakIterator {
91 public:
92 inline void publicSetBreakType(int32_t type) {
93 setBreakType(type);
95 OOoRuleBasedBreakIterator(UDataMemory* image,
96 UErrorCode &status) :
97 RuleBasedBreakIterator(image, status) { };
101 // loading ICU breakiterator on demand.
102 void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale,
103 sal_Int16 rBreakType, sal_Int16 rWordType, const sal_Char *rule, const OUString& rText) throw(uno::RuntimeException)
105 sal_Bool newBreak = sal_False;
106 UErrorCode status = U_ZERO_ERROR;
107 sal_Int16 breakType = 0;
108 switch (rBreakType) {
109 case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
110 case LOAD_WORD_BREAKITERATOR: icuBI=&word;
111 switch (rWordType) {
112 case WordType::ANYWORD_IGNOREWHITESPACES: breakType = 0; rule=wordRule = "edit_word"; break;
113 case WordType::DICTIONARY_WORD: breakType = 1; rule=wordRule = "dict_word"; break;
114 case WordType::WORD_COUNT: breakType = 2; rule=wordRule = "count_word"; break;
116 break;
117 case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
118 case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
120 if (!icuBI->aBreakIterator || rWordType != aWordType ||
121 rLocale.Language != aLocale.Language || rLocale.Country != aLocale.Country ||
122 rLocale.Variant != aLocale.Variant) {
123 if (icuBI->aBreakIterator) {
124 delete icuBI->aBreakIterator;
125 icuBI->aBreakIterator=NULL;
127 if (rule) {
128 uno::Sequence< OUString > breakRules = LocaleData().getBreakIteratorRules(rLocale);
130 status = U_ZERO_ERROR;
131 udata_setAppData("OpenOffice", OpenOffice_dat, &status);
132 if ( !U_SUCCESS(status) ) throw ERROR;
134 OOoRuleBasedBreakIterator *rbi = NULL;
136 if (breakRules.getLength() > breakType && breakRules[breakType].getLength() > 0) {
137 rbi = new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
138 OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
139 } else {
140 status = U_ZERO_ERROR;
141 OStringBuffer aUDName(64);
142 aUDName.append(rule);
143 aUDName.append('_');
144 aUDName.append( OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US));
145 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
146 if( U_SUCCESS(status) )
147 rbi = new OOoRuleBasedBreakIterator( pUData, status);
148 if (!U_SUCCESS(status) ) {
149 status = U_ZERO_ERROR;
150 pUData = udata_open("OpenOffice", "brk", rule, &status);
151 if( U_SUCCESS(status) )
152 rbi = new OOoRuleBasedBreakIterator( pUData, status);
153 if (!U_SUCCESS(status) ) icuBI->aBreakIterator=NULL;
156 if (rbi) {
157 switch (rBreakType) {
158 case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
159 case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
160 case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
161 case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
163 icuBI->aBreakIterator = rbi;
167 if (!icuBI->aBreakIterator) {
168 icu::Locale icuLocale(
169 OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US).getStr(),
170 OUStringToOString(rLocale.Country, RTL_TEXTENCODING_ASCII_US).getStr(),
171 OUStringToOString(rLocale.Variant, RTL_TEXTENCODING_ASCII_US).getStr());
173 status = U_ZERO_ERROR;
174 switch (rBreakType) {
175 case LOAD_CHARACTER_BREAKITERATOR:
176 icuBI->aBreakIterator = icu::BreakIterator::createCharacterInstance(icuLocale, status);
177 break;
178 case LOAD_WORD_BREAKITERATOR:
179 icuBI->aBreakIterator = icu::BreakIterator::createWordInstance(icuLocale, status);
180 break;
181 case LOAD_SENTENCE_BREAKITERATOR:
182 icuBI->aBreakIterator = icu::BreakIterator::createSentenceInstance(icuLocale, status);
183 break;
184 case LOAD_LINE_BREAKITERATOR:
185 icuBI->aBreakIterator = icu::BreakIterator::createLineInstance(icuLocale, status);
186 break;
188 if ( !U_SUCCESS(status) ) {
189 icuBI->aBreakIterator=NULL;
190 throw ERROR;
193 if (icuBI->aBreakIterator) {
194 aLocale=rLocale;
195 aWordType=rWordType;
196 aBreakType=rBreakType;
197 newBreak=sal_True;
198 } else {
199 throw ERROR;
203 if (newBreak || icuBI->aICUText.compare(UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength()))) { // UChar != sal_Unicode in MinGW
204 icuBI->aICUText=UnicodeString(reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength());
205 icuBI->aBreakIterator->setText(icuBI->aICUText);
210 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
211 sal_Int32 nStartPos, const lang::Locale &rLocale,
212 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
213 throw(uno::RuntimeException)
215 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
216 loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
217 for (nDone = 0; nDone < nCount; nDone++) {
218 nStartPos = character.aBreakIterator->following(nStartPos);
219 if (nStartPos == BreakIterator::DONE)
220 return Text.getLength();
222 } else { // for CHARACTER mode
223 for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
224 Text.iterateCodePoints(&nStartPos, 1);
226 return nStartPos;
229 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
230 sal_Int32 nStartPos, const lang::Locale& rLocale,
231 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
232 throw(uno::RuntimeException)
234 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
235 loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
236 for (nDone = 0; nDone < nCount; nDone++) {
237 nStartPos = character.aBreakIterator->preceding(nStartPos);
238 if (nStartPos == BreakIterator::DONE)
239 return 0;
241 } else { // for BS to delete one char and CHARACTER mode.
242 for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
243 Text.iterateCodePoints(&nStartPos, -1);
245 return nStartPos;
249 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
250 const lang::Locale& rLocale, sal_Int16 rWordType ) throw(uno::RuntimeException)
252 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
254 result.startPos = word.aBreakIterator->following(nStartPos);
255 if( result.startPos >= Text.getLength() || result.startPos == BreakIterator::DONE )
256 result.endPos = result.startPos;
257 else {
258 if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
259 rWordType == WordType::DICTIONARY_WORD ) &&
260 u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
261 result.startPos = word.aBreakIterator->following(result.startPos);
263 result.endPos = word.aBreakIterator->following(result.startPos);
264 if(result.endPos == BreakIterator::DONE)
265 result.endPos = result.startPos;
267 return result;
271 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
272 const lang::Locale& rLocale, sal_Int16 rWordType) throw(uno::RuntimeException)
274 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
276 result.startPos = word.aBreakIterator->preceding(nStartPos);
277 if( result.startPos < 0 || result.startPos == BreakIterator::DONE)
278 result.endPos = result.startPos;
279 else {
280 if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
281 rWordType == WordType::DICTIONARY_WORD) &&
282 u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
283 result.startPos = word.aBreakIterator->preceding(result.startPos);
285 result.endPos = word.aBreakIterator->following(result.startPos);
286 if(result.endPos == BreakIterator::DONE)
287 result.endPos = result.startPos;
289 return result;
293 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
294 sal_Int16 rWordType, sal_Bool bDirection ) throw(uno::RuntimeException)
296 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
297 sal_Int32 len = Text.getLength();
299 if(word.aBreakIterator->isBoundary(nPos)) {
300 result.startPos = result.endPos = nPos;
301 if((bDirection || nPos == 0) && nPos < len) //forward
302 result.endPos = word.aBreakIterator->following(nPos);
303 else
304 result.startPos = word.aBreakIterator->preceding(nPos);
305 } else {
306 if(nPos <= 0) {
307 result.startPos = 0;
308 result.endPos = len ? word.aBreakIterator->following((sal_Int32)0) : 0;
309 } else if(nPos >= len) {
310 result.startPos = word.aBreakIterator->preceding(len);
311 result.endPos = len;
312 } else {
313 result.startPos = word.aBreakIterator->preceding(nPos);
314 result.endPos = word.aBreakIterator->following(nPos);
317 if (result.startPos == BreakIterator::DONE)
318 result.startPos = result.endPos;
319 else if (result.endPos == BreakIterator::DONE)
320 result.endPos = result.startPos;
322 return result;
326 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
327 const lang::Locale &rLocale ) throw(uno::RuntimeException)
329 loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
331 sal_Int32 len = Text.getLength();
332 if (len > 0 && nStartPos == len)
333 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
334 if (!sentence.aBreakIterator->isBoundary(nStartPos))
335 nStartPos = sentence.aBreakIterator->preceding(nStartPos);
337 // skip preceding space.
338 sal_uInt32 ch = Text.iterateCodePoints(&nStartPos, 1);
339 while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos, 1);
340 Text.iterateCodePoints(&nStartPos, -1);
342 return nStartPos;
345 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
346 const lang::Locale &rLocale ) throw(uno::RuntimeException)
348 loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
350 sal_Int32 len = Text.getLength();
351 if (len > 0 && nStartPos == len)
352 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
353 nStartPos = sentence.aBreakIterator->following(nStartPos);
355 sal_Int32 nPos=nStartPos;
356 while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
358 return nStartPos;
361 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
362 const OUString& Text, sal_Int32 nStartPos,
363 const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
364 const LineBreakHyphenationOptions& hOptions,
365 const LineBreakUserOptions& /*rOptions*/ ) throw(uno::RuntimeException)
367 LineBreakResults lbr;
369 if (nStartPos >= Text.getLength()) {
370 lbr.breakIndex = Text.getLength();
371 lbr.breakType = BreakType::WORDBOUNDARY;
372 return lbr;
375 loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
377 sal_Bool GlueSpace=sal_True;
378 while (GlueSpace) {
379 if (line.aBreakIterator->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
380 lbr.breakIndex = nStartPos;
381 lbr.breakType = BreakType::WORDBOUNDARY;
382 } else if (hOptions.rHyphenator.is()) { //Hyphenation break
383 Boundary wBoundary = getWordBoundary( Text, nStartPos, rLocale,
384 WordType::DICTIONARY_WORD, false);
385 uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord;
386 aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
387 wBoundary.endPos - wBoundary.startPos), rLocale,
388 (sal_Int16) (hOptions.hyphenIndex - wBoundary.startPos), hOptions.aHyphenationOptions);
389 if (aHyphenatedWord.is()) {
390 lbr.rHyphenatedWord = aHyphenatedWord;
391 if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
392 lbr.breakIndex = -1;
393 else
394 lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
395 lbr.breakType = BreakType::HYPHENATION;
396 } else {
397 lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
398 lbr.breakType = BreakType::WORDBOUNDARY;;
400 } else { //word boundary break
401 lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
402 lbr.breakType = BreakType::WORDBOUNDARY;
405 #define WJ 0x2060 // Word Joiner
406 GlueSpace=sal_False;
407 if (lbr.breakType == BreakType::WORDBOUNDARY) {
408 nStartPos = lbr.breakIndex;
409 if (Text[nStartPos--] == WJ)
410 GlueSpace=sal_True;
411 while (nStartPos >= 0 &&
412 (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
413 if (Text[nStartPos--] == WJ)
414 GlueSpace=sal_True;
416 if (GlueSpace && nStartPos < 0) {
417 lbr.breakIndex = 0;
418 break;
423 return lbr;
428 OUString SAL_CALL
429 BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException )
431 return OUString::createFromAscii(cBreakIterator);
434 sal_Bool SAL_CALL
435 BreakIterator_Unicode::supportsService(const OUString& rServiceName) throw( uno::RuntimeException )
437 return !rServiceName.compareToAscii(cBreakIterator);
440 uno::Sequence< OUString > SAL_CALL
441 BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException )
443 uno::Sequence< OUString > aRet(1);
444 aRet[0] = OUString::createFromAscii(cBreakIterator);
445 return aRet;
448 } } } }