Version 6.4.0.0.beta1, tag libreoffice-6.4.0.0.beta1
[LibreOffice.git] / i18npool / source / breakiterator / breakiterator_unicode.cxx
blobe1675ec6a41dcc1cf00441024292d48000de0cde
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <breakiterator_unicode.hxx>
21 #include <cppuhelper/supportsservice.hxx>
22 #include <localedata.hxx>
23 #include <i18nlangtag/languagetag.hxx>
24 #include <i18nlangtag/languagetagicu.hxx>
25 #include <unicode/uchar.h>
26 #include <unicode/locid.h>
27 #include <unicode/rbbi.h>
28 #include <unicode/udata.h>
29 #include <rtl/strbuf.hxx>
30 #include <rtl/ustring.hxx>
32 #include <com/sun/star/i18n/BreakType.hpp>
33 #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
34 #include <com/sun/star/i18n/WordType.hpp>
36 U_CDECL_BEGIN
37 extern const char OpenOffice_dat[];
38 U_CDECL_END
40 using namespace ::com::sun::star;
41 using namespace ::com::sun::star::i18n;
42 using namespace ::com::sun::star::lang;
44 namespace i18npool {
46 // Per-thread cache of break iterators, looked up and filled by
47 // loadICUBreakIterator() under the string keys it builds. The cached values
// store state information (such as the text currently attached to an
// iterator), so the map has to be thread_local.
48 thread_local static BreakIterator_Unicode::BIMap theBIMap;
50 BreakIterator_Unicode::BreakIterator_Unicode()
51 : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ) // implementation name
52 , lineRule( "line" )
53 , icuBI( nullptr )
57 BreakIterator_Unicode::~BreakIterator_Unicode()
62 Wrapper class to provide public access to the icu::RuleBasedBreakIterator's
63 setbreakType method.
65 class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator
67 public:
68 #if (U_ICU_VERSION_MAJOR_NUM < 58)
69 // icu::RuleBasedBreakIterator::setBreakType() is private as of ICU 58.
70 void publicSetBreakType(int32_t type)
72 setBreakType(type);
74 #endif
75 OOoRuleBasedBreakIterator(UDataMemory* image,
76 UErrorCode &status)
77 : icu::RuleBasedBreakIterator(image, status)
78 { };
82 // loading ICU breakiterator on demand.
//
// Points icuBI at the member slot that belongs to rBreakType (character,
// words[nWordType], sentence or line), makes sure that slot holds a break
// iterator for rLocale and the effective rule, and attaches rText to it.
//
// rLocale    locale the iterator is requested for
// rBreakType one of the LOAD_*_BREAKITERATOR values handled in the switch
// nWordType  WordType value; only evaluated for LOAD_WORD_BREAKITERATOR
// rule       name of the rule data to load, may be null; for word iterators
//            other than ANY_WORD it is replaced by a fixed rule name
// rText      text the iterator is to operate on
//
// Throws uno::RuntimeException if the application data cannot be registered,
// if no iterator could be obtained, or if attaching the text fails.
83 void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocale,
84 sal_Int16 rBreakType, sal_Int16 nWordType, const sal_Char *rule, const OUString& rText)
86 bool bNewBreak = false;
87 UErrorCode status = U_ZERO_ERROR;
// breakType is later used as index into the locale's break iterator rules
// sequence and as part of the cache keys.
88 sal_Int16 breakType = 0;
89 switch (rBreakType) {
90 case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
91 case LOAD_WORD_BREAKITERATOR:
92 assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT);
93 icuBI=&words[nWordType];
94 switch (nWordType) {
95 case WordType::ANY_WORD: break; // odd but previous behavior
96 case WordType::ANYWORD_IGNOREWHITESPACES:
97 breakType = 0; rule = "edit_word"; break;
98 case WordType::DICTIONARY_WORD:
99 breakType = 1; rule = "dict_word"; break;
100 default:
101 case WordType::WORD_COUNT:
102 breakType = 2; rule = "count_word"; break;
104 break;
105 case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
106 case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
109 // Using the cache map prevents accessing the file system for each
110 // udata_open() where ICU tries first files then data objects. And that for
111 // two fallbacks worst case... for each new allocated EditEngine, layout
112 // cell, ... *ouch* Also non-rule locale based iterators can be mapped.
113 // This also speeds up loading iterators for alternating or generally more
114 // than one language/locale in that iterators are not constructed and
115 // destroyed en masse.
116 // Four possible keys, locale rule based with break type, locale rule based
117 // only, rule based only, locale based with break type. A fifth global key
118 // for the initial lookup.
119 // Multiple global keys may map to identical value data.
120 // All enums used here should be in the range 0..9 so assert that and avoid
121 // expensive numeric conversion in append() for faster construction of the
122 // always used global key.
123 assert( 0 <= breakType && breakType <= 9 && 0 <= rBreakType && rBreakType <= 9 && 0 <= nWordType && nWordType <= 9);
124 const OString aLangtagStr( LanguageTag::convertToBcp47( rLocale).toUtf8());
125 OStringBuffer aKeyBuf(64);
126 aKeyBuf.append( aLangtagStr).append(';');
127 if (rule)
128 aKeyBuf.append(rule);
129 aKeyBuf.append(';').append( static_cast<sal_Char>('0'+breakType)).append(';').
130 append( static_cast<sal_Char>('0'+rBreakType)).append(';').append( static_cast<sal_Char>('0'+nWordType));
131 // langtag;rule;breakType;rBreakType;nWordType
132 const OString aBIMapGlobalKey( aKeyBuf.makeStringAndClear());
// Only search for or create an iterator if the selected slot does not already
// hold one that was stored under exactly this global key.
134 if (icuBI->maBIMapKey != aBIMapGlobalKey || !icuBI->mpValue || !icuBI->mpValue->mpBreakIterator)
// First try the per-thread cache with the global key.
137 auto aMapIt( theBIMap.find( aBIMapGlobalKey));
138 bool bInMap = (aMapIt != theBIMap.end());
139 if (bInMap)
140 icuBI->mpValue = aMapIt->second;
141 else
142 icuBI->mpValue.reset();
// Rule based iterators from the "OpenOffice" application data. The
// do { ... } while (false) wrapper lets a cache hit leave early via break.
144 if (!bInMap && rule) do {
145 const uno::Sequence< OUString > breakRules = LocaleDataImpl::get()->getBreakIteratorRules(rLocale);
147 status = U_ZERO_ERROR;
148 udata_setAppData("OpenOffice", OpenOffice_dat, &status);
149 if ( !U_SUCCESS(status) ) throw uno::RuntimeException();
151 std::unique_ptr<OOoRuleBasedBreakIterator> rbi;
// The locale data names a specific rule for this break type: use that.
153 if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())
155 // langtag;rule;breakType
156 const OString aBIMapRuleTypeKey( aLangtagStr + ";" + rule + ";" + OString::number(breakType));
157 aMapIt = theBIMap.find( aBIMapRuleTypeKey);
158 bInMap = (aMapIt != theBIMap.end());
159 if (bInMap)
161 icuBI->mpValue = aMapIt->second;
162 icuBI->maBIMapKey = aBIMapGlobalKey;
163 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
164 break; // do
167 rbi.reset(new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
168 OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status));
170 if (U_SUCCESS(status))
172 icuBI->mpValue.reset( new BI_ValueData);
173 icuBI->mpValue->mpBreakIterator = std::move( rbi);
174 theBIMap.insert( std::make_pair( aBIMapRuleTypeKey, icuBI->mpValue));
176 else
178 rbi.reset();
181 // use ICU's own break iterators for Thai, Lao, Tibetan, Dzongkha and Khmer;
// the OpenOffice rule data below is only tried for all other languages
182 else if (rLocale.Language != "th" && rLocale.Language != "lo" && rLocale.Language != "bo" && rLocale.Language != "dz" && rLocale.Language != "km")
184 // language;rule (not langtag, unless we'd actually load such)
185 OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8());
186 const OString aBIMapRuleKey( aLanguage + ";" + rule);
187 aMapIt = theBIMap.find( aBIMapRuleKey);
188 bInMap = (aMapIt != theBIMap.end());
189 if (bInMap)
191 icuBI->mpValue = aMapIt->second;
192 icuBI->maBIMapKey = aBIMapGlobalKey;
193 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
194 break; // do
// Try the language specific data item "<rule>_<language>" first.
197 status = U_ZERO_ERROR;
198 OString aUDName = rtl::OStringView(rule) + "_" + aLanguage;
199 UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
200 if( U_SUCCESS(status) )
201 rbi.reset(new OOoRuleBasedBreakIterator( pUData, status));
202 if ( U_SUCCESS(status) )
204 icuBI->mpValue.reset( new BI_ValueData);
205 icuBI->mpValue->mpBreakIterator = std::move( rbi);
206 theBIMap.insert( std::make_pair( aBIMapRuleKey, icuBI->mpValue));
208 else
210 rbi.reset();
212 // ;rule (only)
213 const OString aBIMapRuleOnlyKey( OStringLiteral(";") + rule);
214 aMapIt = theBIMap.find( aBIMapRuleOnlyKey);
215 bInMap = (aMapIt != theBIMap.end());
216 if (bInMap)
218 icuBI->mpValue = aMapIt->second;
219 icuBI->maBIMapKey = aBIMapGlobalKey;
220 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
221 break; // do
// Otherwise fall back to the data item named by the plain rule.
224 status = U_ZERO_ERROR;
225 pUData = udata_open("OpenOffice", "brk", rule, &status);
226 if( U_SUCCESS(status) )
227 rbi.reset(new OOoRuleBasedBreakIterator( pUData, status));
228 if ( U_SUCCESS(status) )
230 icuBI->mpValue.reset( new BI_ValueData);
231 icuBI->mpValue->mpBreakIterator = std::move( rbi);
232 theBIMap.insert( std::make_pair( aBIMapRuleOnlyKey, icuBI->mpValue));
234 else
236 rbi.reset();
240 if (rbi) {
241 #if (U_ICU_VERSION_MAJOR_NUM < 58)
242 // ICU 58 made RuleBasedBreakIterator::setBreakType() private
243 // instead of protected, so the old workaround of
244 // https://ssl.icu-project.org/trac/ticket/5498
245 // doesn't work anymore. However, they also claim to have fixed
246 // the cause that an initial fBreakType==-1 would lead to an
247 // endless loop under some circumstances.
248 // Let's see ...
249 switch (rBreakType) {
250 case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
251 case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
252 case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
253 case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
255 #endif
257 } while (false);
// Still no iterator: fall back to the iterator ICU itself provides for the
// locale and break type, again consulting the cache first.
259 if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) do {
260 // langtag;;;rBreakType (empty rule; empty breakType)
261 const OString aBIMapLocaleTypeKey( aLangtagStr + ";;;" + OString::number(rBreakType));
262 aMapIt = theBIMap.find( aBIMapLocaleTypeKey);
263 bInMap = (aMapIt != theBIMap.end());
264 if (bInMap)
266 icuBI->mpValue = aMapIt->second;
267 icuBI->maBIMapKey = aBIMapGlobalKey;
268 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
269 break; // do
272 icu::Locale icuLocale( LanguageTagIcu::getIcuLocale( LanguageTag( rLocale)));
273 std::shared_ptr< icu::BreakIterator > pBI;
275 status = U_ZERO_ERROR;
276 switch (rBreakType) {
277 case LOAD_CHARACTER_BREAKITERATOR:
278 pBI.reset( icu::BreakIterator::createCharacterInstance(icuLocale, status) );
279 break;
280 case LOAD_WORD_BREAKITERATOR:
281 pBI.reset( icu::BreakIterator::createWordInstance(icuLocale, status) );
282 break;
283 case LOAD_SENTENCE_BREAKITERATOR:
284 pBI.reset( icu::BreakIterator::createSentenceInstance(icuLocale, status) );
285 break;
286 case LOAD_LINE_BREAKITERATOR:
287 pBI.reset( icu::BreakIterator::createLineInstance(icuLocale, status) );
288 break;
290 if ( !U_SUCCESS(status) || !pBI ) {
291 throw uno::RuntimeException();
293 icuBI->mpValue.reset( new BI_ValueData);
294 icuBI->mpValue->mpBreakIterator = pBI;
295 theBIMap.insert( std::make_pair( aBIMapLocaleTypeKey, icuBI->mpValue));
296 } while (false);
297 if (!icuBI->mpValue || !icuBI->mpValue->mpBreakIterator) {
298 throw uno::RuntimeException();
// Remember under which global key the slot's iterator was obtained and make
// sure the cache also knows it under that key.
300 icuBI->maBIMapKey = aBIMapGlobalKey;
301 if (!bInMap)
302 theBIMap.insert( std::make_pair( aBIMapGlobalKey, icuBI->mpValue));
303 bNewBreak=true;
// (Re)attach the text if the slot's iterator was just (re)assigned or if rText
// does not share its string data pointer with the text attached last time.
306 if (bNewBreak || icuBI->mpValue->maICUText.pData != rText.pData)
308 const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());
310 status = U_ZERO_ERROR;
311 icuBI->mpValue->mpUt = utext_openUChars(icuBI->mpValue->mpUt, pText, rText.getLength(), &status);
313 if (!U_SUCCESS(status))
314 throw uno::RuntimeException();
316 icuBI->mpValue->mpBreakIterator->setText(icuBI->mpValue->mpUt, status);
318 if (!U_SUCCESS(status))
319 throw uno::RuntimeException();
321 icuBI->mpValue->maICUText = rText;
325 sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
326 sal_Int32 nStartPos, const lang::Locale &rLocale,
327 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
329 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
330 loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
331 icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
332 for (nDone = 0; nDone < nCount; nDone++) {
333 nStartPos = pBI->following(nStartPos);
334 if (nStartPos == icu::BreakIterator::DONE)
335 return Text.getLength();
337 } else { // for CHARACTER mode
338 for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
339 Text.iterateCodePoints(&nStartPos);
341 return nStartPos;
344 sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
345 sal_Int32 nStartPos, const lang::Locale& rLocale,
346 sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
348 if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
349 loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
350 icu::BreakIterator* pBI = character.mpValue->mpBreakIterator.get();
351 for (nDone = 0; nDone < nCount; nDone++) {
352 nStartPos = pBI->preceding(nStartPos);
353 if (nStartPos == icu::BreakIterator::DONE)
354 return 0;
356 } else { // for BS to delete one char and CHARACTER mode.
357 for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
358 Text.iterateCodePoints(&nStartPos, -1);
360 return nStartPos;
364 Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
365 const lang::Locale& rLocale, sal_Int16 rWordType )
367 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
369 Boundary rv;
370 rv.startPos = icuBI->mpValue->mpBreakIterator->following(nStartPos);
371 if( rv.startPos >= Text.getLength() || rv.startPos == icu::BreakIterator::DONE )
372 rv.endPos = result.startPos;
373 else {
374 if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
375 rWordType == WordType::DICTIONARY_WORD ) &&
376 u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0)) )
377 rv.startPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
379 rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
380 if(rv.endPos == icu::BreakIterator::DONE)
381 rv.endPos = rv.startPos;
383 return rv;
387 Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
388 const lang::Locale& rLocale, sal_Int16 rWordType)
390 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
392 Boundary rv;
393 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nStartPos);
394 if( rv.startPos < 0)
395 rv.endPos = rv.startPos;
396 else {
397 if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
398 rWordType == WordType::DICTIONARY_WORD) &&
399 u_isWhitespace(Text.iterateCodePoints(&rv.startPos, 0)) )
400 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(rv.startPos);
402 rv.endPos = icuBI->mpValue->mpBreakIterator->following(rv.startPos);
403 if(rv.endPos == icu::BreakIterator::DONE)
404 rv.endPos = rv.startPos;
406 return rv;
410 Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
411 sal_Int16 rWordType, sal_Bool bDirection )
413 loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, nullptr, Text);
414 sal_Int32 len = Text.getLength();
416 Boundary rv;
417 if(icuBI->mpValue->mpBreakIterator->isBoundary(nPos)) {
418 rv.startPos = rv.endPos = nPos;
419 if((bDirection || nPos == 0) && nPos < len) //forward
420 rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
421 else
422 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
423 } else {
424 if(nPos <= 0) {
425 rv.startPos = 0;
426 rv.endPos = len ? icuBI->mpValue->mpBreakIterator->following(sal_Int32(0)) : 0;
427 } else if(nPos >= len) {
428 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(len);
429 rv.endPos = len;
430 } else {
431 rv.startPos = icuBI->mpValue->mpBreakIterator->preceding(nPos);
432 rv.endPos = icuBI->mpValue->mpBreakIterator->following(nPos);
435 if (rv.startPos == icu::BreakIterator::DONE)
436 rv.startPos = rv.endPos;
437 else if (rv.endPos == icu::BreakIterator::DONE)
438 rv.endPos = rv.startPos;
440 return rv;
444 sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
445 const lang::Locale &rLocale )
447 loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
449 sal_Int32 len = Text.getLength();
450 if (len > 0 && nStartPos == len)
451 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
452 if (!sentence.mpValue->mpBreakIterator->isBoundary(nStartPos))
453 nStartPos = sentence.mpValue->mpBreakIterator->preceding(nStartPos);
455 // skip preceding space.
456 sal_uInt32 ch = Text.iterateCodePoints(&nStartPos);
457 while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos);
458 Text.iterateCodePoints(&nStartPos, -1);
460 return nStartPos;
463 sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
464 const lang::Locale &rLocale )
466 loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
468 sal_Int32 len = Text.getLength();
469 if (len > 0 && nStartPos == len)
470 Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
471 nStartPos = sentence.mpValue->mpBreakIterator->following(nStartPos);
473 sal_Int32 nPos=nStartPos;
474 while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
476 return nStartPos;
// Determines where a line of Text can be broken at or before nStartPos.
//
// Text          text to break
// nStartPos     index at which a break is wanted
// rLocale       locale selecting the line break iterator and passed on to the
//               hyphenator
// nMinBreakPos  smallest acceptable index for a hyphenation break
// hOptions      hyphenation options; hyphenation is only attempted if
//               hOptions.rHyphenator is set
// The LineBreakUserOptions argument is not used.
//
// Returns a LineBreakResults whose breakType is BreakType::WORDBOUNDARY or
// BreakType::HYPHENATION; for a hyphenation break rHyphenatedWord is set and
// breakIndex is -1 if the hyphenation position lies before nMinBreakPos.
479 LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
480 const OUString& Text, sal_Int32 nStartPos,
481 const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
482 const LineBreakHyphenationOptions& hOptions,
483 const LineBreakUserOptions& /*rOptions*/ )
485 LineBreakResults lbr;
// At or beyond the end of the text: report the text length as word boundary
// without loading an iterator.
487 if (nStartPos >= Text.getLength()) {
488 lbr.breakIndex = Text.getLength();
489 lbr.breakType = BreakType::WORDBOUNDARY;
490 return lbr;
493 loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
495 icu::BreakIterator* pLineBI = line.mpValue->mpBreakIterator.get();
// The search is repeated from an earlier position as long as the break found
// is glued to a Word Joiner (see the end of the loop).
496 bool GlueSpace=true;
497 while (GlueSpace) {
498 if (pLineBI->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
499 lbr.breakIndex = nStartPos;
500 lbr.breakType = BreakType::WORDBOUNDARY;
501 } else if (hOptions.rHyphenator.is()) { //Hyphenation break
502 sal_Int32 boundary_with_punctuation = (pLineBI->next() != icu::BreakIterator::DONE) ? pLineBI->current() : 0;
503 pLineBI->preceding(nStartPos + 1); // reset to check correct hyphenation of "word-word"
505 sal_Int32 nStartPosWordEnd = nStartPos;
506 while (pLineBI->current() < nStartPosWordEnd && u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd]))) // starting punctuation
507 nStartPosWordEnd --;
// Find the dictionary word around that position; it is the word handed to the
// hyphenator below.
509 Boundary wBoundary = getWordBoundary( Text, nStartPosWordEnd, rLocale,
510 WordType::DICTIONARY_WORD, false);
// From here on nStartPosWordEnd is reused: first as index past the punctuation
// following the word, then as the length of that punctuation run, capped at
// the distance from the word start to hOptions.hyphenIndex.
512 nStartPosWordEnd = wBoundary.endPos;
513 while (nStartPosWordEnd < Text.getLength() && (u_ispunct(static_cast<sal_uInt32>(Text[nStartPosWordEnd])))) // ending punctuation
514 nStartPosWordEnd ++;
515 nStartPosWordEnd = nStartPosWordEnd - wBoundary.endPos;
516 if (hOptions.hyphenIndex - wBoundary.startPos < nStartPosWordEnd) nStartPosWordEnd = hOptions.hyphenIndex - wBoundary.startPos;
517 #define SPACE 0x0020
// Move boundary_with_punctuation back over trailing spaces, but not before the
// end of the word.
518 while (boundary_with_punctuation > wBoundary.endPos && Text[--boundary_with_punctuation] == SPACE);
519 uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
520 wBoundary.endPos - wBoundary.startPos), rLocale,
521 static_cast<sal_Int16>(hOptions.hyphenIndex - wBoundary.startPos - ((hOptions.hyphenIndex == wBoundary.endPos)? nStartPosWordEnd : 0)), hOptions.aHyphenationOptions);
522 if (aHyphenatedWord.is()) {
523 lbr.rHyphenatedWord = aHyphenatedWord;
524 if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
525 lbr.breakIndex = -1;
526 else
527 lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
528 lbr.breakType = BreakType::HYPHENATION;
530 // check not optimal hyphenation of "word-word" (word with hyphens)
531 if (lbr.breakIndex > -1 && wBoundary.startPos + aHyphenatedWord->getHyphenationPos() < pLineBI->current()) {
532 lbr.breakIndex = pLineBI->current();
533 lbr.breakType = BreakType::WORDBOUNDARY;
// The hyphenator returned no hyphenated word: break at the line boundary
// before nStartPos.
536 } else {
537 lbr.breakIndex = pLineBI->preceding(nStartPos);
538 lbr.breakType = BreakType::WORDBOUNDARY;
540 } else { //word boundary break
541 lbr.breakIndex = pLineBI->preceding(nStartPos);
542 lbr.breakType = BreakType::WORDBOUNDARY;
544 // Special case for Slash U+002F SOLIDUS in URI and path names.
545 // TR14 defines that as SY: Symbols Allowing Break After (A).
546 // This is unwanted in paths, see also i#17155
547 if (lbr.breakIndex > 0 && Text[lbr.breakIndex-1] == '/')
549 // Look backward and take any whitespace before as a break
550 // opportunity. This also glues something like "w/o".
551 // Avoid an overly long path and break it as was indicated.
552 // Overly long here is arbitrarily defined.
553 const sal_Int32 nOverlyLong = 66;
554 sal_Int32 nPos = lbr.breakIndex - 1;
555 while (nPos > 0 && lbr.breakIndex - nPos < nOverlyLong)
557 if (u_isWhitespace(Text.iterateCodePoints( &nPos, -1)))
559 lbr.breakIndex = nPos + 1;
560 break;
566 #define WJ 0x2060 // Word Joiner
// For a word boundary break, inspect the character at the break index and the
// whitespace / Word Joiner characters before it. If a Word Joiner is among
// them, GlueSpace is set again and the loop retries from the position in front
// of that run; if the run reaches the start of the text the break index
// becomes 0.
567 GlueSpace=false;
568 if (lbr.breakType == BreakType::WORDBOUNDARY) {
569 nStartPos = lbr.breakIndex;
570 if (nStartPos >= 0 && Text[nStartPos--] == WJ)
571 GlueSpace=true;
572 while (nStartPos >= 0 &&
573 (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
574 if (Text[nStartPos--] == WJ)
575 GlueSpace=true;
577 if (GlueSpace && nStartPos < 0) {
578 lbr.breakIndex = 0;
579 break;
584 return lbr;
587 OUString SAL_CALL
588 BreakIterator_Unicode::getImplementationName()
590 return OUString::createFromAscii(cBreakIterator);
593 sal_Bool SAL_CALL
594 BreakIterator_Unicode::supportsService(const OUString& rServiceName)
596 return cppu::supportsService(this, rServiceName);
599 uno::Sequence< OUString > SAL_CALL
600 BreakIterator_Unicode::getSupportedServiceNames()
602 uno::Sequence< OUString > aRet { OUString::createFromAscii(cBreakIterator) };
603 return aRet;
608 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface *
609 com_sun_star_i18n_BreakIterator_Unicode_get_implementation(
610 css::uno::XComponentContext *,
611 css::uno::Sequence<css::uno::Any> const &)
613 return cppu::acquire(new i18npool::BreakIterator_Unicode());
616 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */