Version 6.1.4.1, tag libreoffice-6.1.4.1
[LibreOffice.git] / i18npool / source / breakiterator / breakiterator_unicode.cxx
blob6c8148fe048a1fe2015ecd2368b444d9afad1a6e
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <breakiterator_unicode.hxx>
21 #include <cppuhelper/supportsservice.hxx>
22 #include <localedata.hxx>
23 #include <i18nlangtag/languagetag.hxx>
24 #include <i18nlangtag/languagetagicu.hxx>
25 #include <unicode/uchar.h>
26 #include <unicode/locid.h>
27 #include <unicode/rbbi.h>
28 #include <unicode/udata.h>
29 #include <rtl/strbuf.hxx>
30 #include <rtl/ustring.hxx>
31 #include <string.h>
33 U_CDECL_BEGIN
34 extern const char OpenOffice_dat[];
35 U_CDECL_END
37 using namespace ::com::sun::star;
38 using namespace ::com::sun::star::i18n;
39 using namespace ::com::sun::star::lang;
41 namespace i18npool {
43 // Cache map of breakiterators, stores state information so has to be
44 // thread_local.
45 thread_local static BreakIterator_Unicode::BIMap theBIMap;
47 BreakIterator_Unicode::BreakIterator_Unicode()
48 : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ) // implementation name
49 , lineRule( "line" )
50 , icuBI( nullptr )
54 BreakIterator_Unicode::~BreakIterator_Unicode()
59 Wrapper class to provide public access to the icu::RuleBasedBreakIterator's
60 setbreakType method.
62 class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator
64 public:
65 #if (U_ICU_VERSION_MAJOR_NUM < 58)
66 // icu::RuleBasedBreakIterator::setBreakType() is private as of ICU 58.
67 void publicSetBreakType(int32_t type)
69 setBreakType(type);
71 #endif
72 OOoRuleBasedBreakIterator(UDataMemory* image,
73 UErrorCode &status)
74 : icu::RuleBasedBreakIterator(image, status)
75 { };
79 // loading ICU breakiterator on demand.
80 void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocale,
81 sal_Int16 rBreakType, sal_Int16 nWordType, const sal_Char *rule, const OUString& rText)
83 bool bNewBreak = false;
84 UErrorCode status = U_ZERO_ERROR;
85 sal_Int16 breakType = 0;
86 switch (rBreakType) {
87 case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
88 case LOAD_WORD_BREAKITERATOR:
89 assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT);
90 icuBI=&words[nWordType];
91 switch (nWordType) {
92 case WordType::ANY_WORD: break; // odd but previous behavior
93 case WordType::ANYWORD_IGNOREWHITESPACES:
94 breakType = 0; rule = "edit_word"; break;
95 case WordType::DICTIONARY_WORD:
96 breakType = 1; rule = "dict_word"; break;
97 default:
98 case WordType::WORD_COUNT:
99 breakType = 2; rule = "count_word"; break;
101 break;
102 case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
103 case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
106 // Using the cache map prevents accessing the file system for each
107 // udata_open() where ICU tries first files then data objects. And that for
108 // two fallbacks worst case.. for each new allocated EditEngine, layout
109 // cell, ... *ouch* Also non-rule locale based iterators can be mapped.
110 // This also speeds up loading iterators for alternating or generally more
111 // than one language/locale in that iterators are not constructed and
112 // destroyed en masse.
113 // Four possible keys, locale rule based with break type, locale rule based
114 // only, rule based only, locale based with break type. A fifth global key
115 // for the initial lookup.
116 // Multiple global keys may map to identical value data.
117 // All enums used here should be in the range 0..9 so assert that and avoid
118 // expensive numeric conversion in append() for faster construction of the
119 // always used global key.
120 assert( 0 <= breakType && breakType <= 9 && 0 <= rBreakType && rBreakType <= 9 && 0 <= nWordType && nWordType <= 9);
121 const OString aLangtagStr( LanguageTag::convertToBcp47( rLocale).toUtf8());
122 OStringBuffer aKeyBuf(64);
123 aKeyBuf.append( aLangtagStr).append(';');
124 if (rule)
125 aKeyBuf.append(rule);
126 aKeyBuf.append(';').append( static_cast<sal_Char>('0'+breakType)).append(';').
127 append( static_cast<sal_Char>('0'+rBreakType)).append(';').append( static_cast<sal_Char>('0'+nWordType));
128 // langtag;rule;breakType;rBreakType;nWordType
129 const OString aBIMapGlobalKey( aKeyBuf.makeStringAndClear());
131 if (icuBI->maBIMapKey != aBIMapGlobalKey || !icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)
134 auto aMapIt( theBIMap.find( aBIMapGlobalKey));
135 bool bInMap = (aMapIt != theBIMap.end());
136 if (bInMap)
137 icuBI->mpValue = aMapIt->second;
138 else
139 icuBI->mpValue.reset();
141 if (!bInMap && rule) do {
142 uno::Sequence< OUString > breakRules = LocaleDataImpl::get()->getBreakIteratorRules(rLocale);
144 status = U_ZERO_ERROR;
145 udata_setAppData("OpenOffice", OpenOffice_dat, &status);
146 if ( !U_SUCCESS(status) ) throw uno::RuntimeException();
148 OOoRuleBasedBreakIterator *rbi = nullptr;
150 if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())
152 // langtag;rule;breakType
153 const OString aBIMapRuleTypeKey( aLangtagStr + ";" + rule + ";" + OString::number(breakType));
154 aMapIt = theBIMap.find( aBIMapRuleTypeKey);
155 bInMap = (aMapIt != theBIMap.end());
156 if (bInMap)
158 icuBI->mpValue = aMapIt->second;
159 icuBI->maBIMapKey = aBIMapGlobalKey;
160 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
161 break; // do
164 rbi = new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
165 OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
167 if (U_SUCCESS(status))
169 icuBI->mpValue.reset( new BI_ValueData);
170 icuBI->mpValue->mpBreakIterator.reset( rbi);
171 theBIMap.insert( std::make_pair( aBIMapRuleTypeKey, icuBI->mpValue));
173 else
175 delete rbi;
176 rbi = nullptr;
179 //use icu's breakiterator for Thai, Tibetan and Dzongkha
180 else if (rLocale.Language != "th" && rLocale.Language != "lo" && rLocale.Language != "bo" && rLocale.Language != "dz" && rLocale.Language != "km")
182 // language;rule (not langtag, unless we'd actually load such)
183 OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8());
184 const OString aBIMapRuleKey( aLanguage + ";" + rule);
185 aMapIt = theBIMap.find( aBIMapRuleKey);
186 bInMap = (aMapIt != theBIMap.end());
187 if (bInMap)
189 icuBI->mpValue = aMapIt->second;
190 icuBI->maBIMapKey = aBIMapGlobalKey;
191 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
192 break; // do
195 status = U_ZERO_ERROR;
196 OStringBuffer aUDName(64);
197 aUDName.append(rule);
198 aUDName.append('_');
199 aUDName.append( aLanguage);
200 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
201 if( U_SUCCESS(status) )
202 rbi = new OOoRuleBasedBreakIterator( pUData, status);
203 if ( U_SUCCESS(status) )
205 icuBI->mpValue.reset( new BI_ValueData);
206 icuBI->mpValue->mpBreakIterator.reset( rbi);
207 theBIMap.insert( std::make_pair( aBIMapRuleKey, icuBI->mpValue));
209 else
211 delete rbi;
212 rbi = nullptr;
214 // ;rule (only)
215 const OString aBIMapRuleOnlyKey( OString(";") + rule);
216 aMapIt = theBIMap.find( aBIMapRuleOnlyKey);
217 bInMap = (aMapIt != theBIMap.end());
218 if (bInMap)
220 icuBI->mpValue = aMapIt->second;
221 icuBI->maBIMapKey = aBIMapGlobalKey;
222 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
223 break; // do
226 status = U_ZERO_ERROR;
227 pUData = udata_open("OpenOffice", "brk", rule, &status);
228 if( U_SUCCESS(status) )
229 rbi = new OOoRuleBasedBreakIterator( pUData, status);
230 if ( U_SUCCESS(status) )
232 icuBI->mpValue.reset( new BI_ValueData);
233 icuBI->mpValue->mpBreakIterator.reset( rbi);
234 theBIMap.insert( std::make_pair( aBIMapRuleOnlyKey, icuBI->mpValue));
236 else
238 delete rbi;
239 rbi = nullptr;
243 if (rbi) {
244 #if (U_ICU_VERSION_MAJOR_NUM < 58)
245 // ICU 58 made RuleBasedBreakIterator::setBreakType() private
246 // instead of protected, so the old workaround of
247 // https://ssl.icu-project.org/trac/ticket/5498
248 // doesn't work anymore. However, they also claim to have fixed
249 // the cause that an initial fBreakType==-1 would lead to an
250 // endless loop under some circumstances.
251 // Let's see ...
252 switch (rBreakType) {
253 case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
254 case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
255 case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
256 case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
258 #endif
260 } while (false);
262 if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) do {
263 // langtag;;;rBreakType (empty rule; empty breakType)
264 const OString aBIMapLocaleTypeKey( aLangtagStr + ";;;" + OString::number(rBreakType));
265 aMapIt = theBIMap.find( aBIMapLocaleTypeKey);
266 bInMap = (aMapIt != theBIMap.end());
267 if (bInMap)
269 icuBI->mpValue = aMapIt->second;
270 icuBI->maBIMapKey = aBIMapGlobalKey;
271 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
272 break; // do
275 icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale)));
276 std::shared_ptr< icu::BreakIterator > pBI;
278 status = U_ZERO_ERROR;
279 switch (rBreakType) {
280 case LOAD_CHARACTER_BREAKITERATOR:
281 pBI.reset( icu::BreakIterator::createCharacterInstance(icuLocale, status) );
282 break;
283 case LOAD_WORD_BREAKITERATOR:
284 pBI.reset( icu::BreakIterator::createWordInstance(icuLocale, status) );
285 break;
286 case LOAD_SENTENCE_BREAKITERATOR:
287 pBI.reset( icu::BreakIterator::createSentenceInstance(icuLocale, status) );
288 break;
289 case LOAD_LINE_BREAKITERATOR:
290 pBI.reset( icu::BreakIterator::createLineInstance(icuLocale, status) );
291 break;
293 if ( !U_SUCCESS(status) || !pBI ) {
294 throw uno::RuntimeException();
296 icuBI->mpValue.reset( new BI_ValueData);
297 icuBI->mpValue->mpBreakIterator = pBI;
298 theBIMap.insert( std::make_pair( aBIMapLocaleTypeKey, icuBI->mpValue));
299 } while (false);
300 if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) {
301 throw uno::RuntimeException();
303 icuBI->maBIMapKey = aBIMapGlobalKey;
304 if (!bInMap)
305 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
306 bNewBreak=true;
309 if (bNewBreak || icuBI->mpValue->maICUText.pData != rText.pData)
311 const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());
313 status = U_ZERO_ERROR;
314 icuBI->mpValue->mpUt = utext_openUChars(icuBI->mpValue->mpUt, pText, rText.getLength(), &status);
316 if (!U_SUCCESS(status))
317 throw uno::RuntimeException();
319 icuBI->mpValue->mpBreakIterator->setText(icuBI->mpValue->mpUt, status);
321 if (!U_SUCCESS(status))
322 throw uno::RuntimeException();
324 icuBI->mpValue->maICUText = rText;
328 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
329 sal_Int32 nStartPos, const lang::Locale &rLocale,
330 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
332 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
333 loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
334 icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
335 for (nDone = 0; nDone < nCount; nDone++) {
336 nStartPos = pBI->following(nStartPos);
337 if (nStartPos == icu::BreakIterator::DONE)
338 return Text.getLength();
340 } else { // for CHARACTER mode
341 for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
342 Text.iterateCodePoints(&nStartPos);
344 return nStartPos;
347 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
348 sal_Int32 nStartPos, const lang::Locale& rLocale,
349 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
351 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
352 loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
353 icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
354 for (nDone = 0; nDone < nCount; nDone++) {
355 nStartPos = pBI->preceding(nStartPos);
356 if (nStartPos == icu::BreakIterator::DONE)
357 return 0;
359 } else { // for BS to delete one char and CHARACTER mode.
360 for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
361 Text.iterateCodePoints(&nStartPos, -1);
363 return nStartPos;
367 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
368 const lang::Locale& rLocale, sal_Int16 rWordType )
370 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
372 Boundary rv;
373 rv.startPos = icuBI->mpValue->mpBreakIterator->following(nStartPos);
374 if( rv.startPos >= Text.getLength() || rv.startPos == icu::BreakIterator::DONE )
375 rv.endPos = result.startPos;
376 else {
377 if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
378 rWordType == WordType::DICTIONARY_WORD ) &&
379 u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0)) )
380 rv.startPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
382 rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
383 if(rv.endPos == icu::BreakIterator::DONE)
384 rv.endPos = rv.startPos;
386 return rv;
390 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
391 const lang::Locale& rLocale, sal_Int16 rWordType)
393 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
395 Boundary rv;
396 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nStartPos);
397 if( rv.startPos < 0 || rv.startPos == icu::BreakIterator::DONE)
398 rv.endPos = rv.startPos;
399 else {
400 if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
401 rWordType == WordType::DICTIONARY_WORD) &&
402 u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0)) )
403 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(rv.startPos);
405 rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
406 if(rv.endPos == icu::BreakIterator::DONE)
407 rv.endPos = rv.startPos;
409 return rv;
413 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
414 sal_Int16 rWordType, sal_Bool bDirection )
416 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
417 sal_Int32 len = Text.getLength();
419 Boundary rv;
420 if(icuBI->mpValue->mpBreakIterator->isBoundary(nPos)) {
421 rv.startPos = rv.endPos = nPos;
422 if((bDirection || nPos == 0) && nPos < len) //forward
423 rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
424 else
425 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
426 } else {
427 if(nPos <= 0) {
428 rv.startPos = 0;
429 rv.endPos = len ? icuBI->mpValue->mpBreakIterator->following(sal_Int32(0)) : 0;
430 } else if(nPos >= len) {
431 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(len);
432 rv.endPos = len;
433 } else {
434 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
435 rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
438 if (rv.startPos == icu::BreakIterator::DONE)
439 rv.startPos = rv.endPos;
440 else if (rv.endPos == icu::BreakIterator::DONE)
441 rv.endPos = rv.startPos;
443 return rv;
447 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
448 const lang::Locale &rLocale )
450 loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
452 sal_Int32 len = Text.getLength();
453 if (len > 0 && nStartPos == len)
454 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
455 if (!sentence.mpValue->mpBreakIterator->isBoundary(nStartPos))
456 nStartPos = sentence.mpValue->mpBreakIterator->preceding(nStartPos);
458 // skip preceding space.
459 sal_uInt32 ch = Text.iterateCodePoints(&nStartPos);
460 while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos);
461 Text.iterateCodePoints(&nStartPos, -1);
463 return nStartPos;
466 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
467 const lang::Locale &rLocale )
469 loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
471 sal_Int32 len = Text.getLength();
472 if (len > 0 && nStartPos == len)
473 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
474 nStartPos = sentence.mpValue->mpBreakIterator->following(nStartPos);
476 sal_Int32 nPos=nStartPos;
477 while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
479 return nStartPos;
482 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
483 const OUString& Text, sal_Int32 nStartPos,
484 const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
485 const LineBreakHyphenationOptions& hOptions,
486 const LineBreakUserOptions& /*rOptions*/ )
488 LineBreakResults lbr;
490 if (nStartPos >= Text.getLength()) {
491 lbr.breakIndex = Text.getLength();
492 lbr.breakType = BreakType::WORDBOUNDARY;
493 return lbr;
496 loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
498 icu::BreakIterator* pLineBI = line.mpValue->mpBreakIterator.get();
499 bool GlueSpace=true;
500 while (GlueSpace) {
501 if (pLineBI->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
502 lbr.breakIndex = nStartPos;
503 lbr.breakType = BreakType::WORDBOUNDARY;
504 } else if (hOptions.rHyphenator.is()) { //Hyphenation break
505 sal_Int32 boundary_with_punctuation = (pLineBI->next() != icu::BreakIterator::DONE) ? pLineBI->current() : 0;
506 pLineBI->preceding(nStartPos + 1); // reset to check correct hyphenation of "word-word"
508 sal_Int32 nStartPosWordEnd = nStartPos;
509 while (pLineBI->current() < nStartPosWordEnd && u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd]))) // starting punctuation
510 nStartPosWordEnd --;
512 Boundary wBoundary = getWordBoundary( Text, nStartPosWordEnd, rLocale,
513 WordType::DICTIONARY_WORD, false);
515 nStartPosWordEnd = wBoundary.endPos;
516 while (nStartPosWordEnd < Text.getLength() && (u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd])))) // ending punctuation
517 nStartPosWordEnd ++;
518 nStartPosWordEnd = nStartPosWordEnd - wBoundary.endPos;
519 if (hOptions.hyphenIndex - wBoundary.startPos < nStartPosWordEnd) nStartPosWordEnd = hOptions.hyphenIndex - wBoundary.startPos;
520 #define SPACE 0x0020
521 while (boundary_with_punctuation > wBoundary.endPos && Text[--boundary_with_punctuation] == SPACE);
522 uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord;
523 aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
524 wBoundary.endPos - wBoundary.startPos), rLocale,
525 static_cast<sal_Int16>(hOptions.hyphenIndex - wBoundary.startPos - ((hOptions.hyphenIndex == wBoundary.endPos)? nStartPosWordEnd : 0)), hOptions.aHyphenationOptions);
526 if (aHyphenatedWord.is()) {
527 lbr.rHyphenatedWord = aHyphenatedWord;
528 if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
529 lbr.breakIndex = -1;
530 else
531 lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
532 lbr.breakType = BreakType::HYPHENATION;
534 // check not optimal hyphenation of "word-word" (word with hyphens)
535 if (lbr.breakIndex > -1 && wBoundary.startPos + aHyphenatedWord->getHyphenationPos() < pLineBI->current()) {
536 lbr.breakIndex = pLineBI->current();
537 lbr.breakType = BreakType::WORDBOUNDARY;
540 } else {
541 lbr.breakIndex = pLineBI->preceding(nStartPos);
542 lbr.breakType = BreakType::WORDBOUNDARY;
544 } else { //word boundary break
545 lbr.breakIndex = pLineBI->preceding(nStartPos);
546 lbr.breakType = BreakType::WORDBOUNDARY;
548 // Special case for Slash U+002F SOLIDUS in URI and path names.
549 // TR14 defines that as SY: Symbols Allowing Break After (A).
550 // This is unwanted in paths, see also i#17155
551 if (lbr.breakIndex > 0 && Text[lbr.breakIndex-1] == '/')
553 // Look backward and take any whitespace before as a break
554 // opportunity. This also glues something like "w/o".
555 // Avoid an overly long path and break it as was indicated.
556 // Overly long here is arbitrarily defined.
557 const sal_Int32 nOverlyLong = 66;
558 sal_Int32 nPos = lbr.breakIndex - 1;
559 while (nPos > 0 && lbr.breakIndex - nPos < nOverlyLong)
561 if (u_isWhitespace(Text.iterateCodePoints( &nPos, -1)))
563 lbr.breakIndex = nPos + 1;
564 break;
570 #define WJ 0x2060 // Word Joiner
571 GlueSpace=false;
572 if (lbr.breakType == BreakType::WORDBOUNDARY) {
573 nStartPos = lbr.breakIndex;
574 if (nStartPos >= 0 && Text[nStartPos--] == WJ)
575 GlueSpace=true;
576 while (nStartPos >= 0 &&
577 (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
578 if (Text[nStartPos--] == WJ)
579 GlueSpace=true;
581 if (GlueSpace && nStartPos < 0) {
582 lbr.breakIndex = 0;
583 break;
588 return lbr;
591 OUString SAL_CALL
592 BreakIterator_Unicode::getImplementationName()
594 return OUString::createFromAscii(cBreakIterator);
597 sal_Bool SAL_CALL
598 BreakIterator_Unicode::supportsService(const OUString& rServiceName)
600 return cppu::supportsService(this, rServiceName);
603 uno::Sequence< OUString > SAL_CALL
604 BreakIterator_Unicode::getSupportedServiceNames()
606 uno::Sequence< OUString > aRet { OUString::createFromAscii(cBreakIterator) };
607 return aRet;
612 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
613 com_sun_star_i18n_BreakIterator_Unicode_get_implementation(
614 css::uno::XComponentContext *,
615 css::uno::Sequence<css::uno::Any> const &)
617 return cppu::acquire(new i18npool::BreakIterator_Unicode());
620 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */