1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
24 #include <string_view>
29 #include <o3tl/safeint.hxx>
30 #include <o3tl/string_view.hxx>
31 #include <rtl/character.hxx>
32 #include <rtl/ustring.hxx>
33 #include <rtl/ustrbuf.hxx>
34 #include <rtl/string.hxx>
35 #include <rtl/strbuf.hxx>
36 #include <sal/log.hxx>
37 #include <sal/types.h>
39 #include <comphelper/string.hxx>
40 #include <comphelper/stl_types.hxx>
41 #include <comphelper/sequence.hxx>
43 #include <com/sun/star/i18n/BreakIterator.hpp>
44 #include <com/sun/star/i18n/CharType.hpp>
45 #include <com/sun/star/i18n/Collator.hpp>
48 namespace comphelper::string
{
// Returns rIn without its leading run of cRemove characters.
// T must offer empty(), size(), operator[] and substr() (e.g. std::string_view).
template <typename T, typename C> T tmpl_stripStart(const T &rIn,
    const C cRemove)
{
    if (rIn.empty())
        return rIn;

    // Advance past every leading character equal to cRemove.
    typename T::size_type i = 0;
    while (i < rIn.size() && rIn[i] == cRemove)
        ++i;

    return rIn.substr(i);
}
69 template <typename T
, typename C
> T
tmpl_stripStartString(const T
&rIn
,
77 while (i
< rIn
.getLength())
79 if (rIn
[i
] != cRemove
)
88 OString
stripStart(const OString
& rIn
, char c
)
90 return tmpl_stripStartString
<OString
, char>(rIn
, c
);
93 std::string_view
stripStart(std::string_view rIn
, char c
)
95 return tmpl_stripStart
<std::string_view
, char>(rIn
, c
);
98 OUString
stripStart(const OUString
& rIn
, sal_Unicode c
)
100 return tmpl_stripStartString
<OUString
, sal_Unicode
>(rIn
, c
);
103 std::u16string_view
stripStart(std::u16string_view rIn
, sal_Unicode c
)
105 return tmpl_stripStart
<std::u16string_view
, sal_Unicode
>(rIn
, c
);
// Returns rIn without its trailing run of cRemove characters.
// T must offer empty(), size(), operator[] and substr() (e.g. std::string_view).
template <typename T, typename C> T tmpl_stripEnd(const T &rIn,
    const C cRemove)
{
    if (rIn.empty())
        return rIn;

    // i is one past the last character to keep; walk it backwards.
    typename T::size_type i = rIn.size();
    while (i > 0 && rIn[i-1] == cRemove)
        --i;

    return rIn.substr(0, i);
}
127 template <typename T
, typename C
> T
tmpl_stripEndString(const T
&rIn
,
133 sal_Int32 i
= rIn
.getLength();
137 if (rIn
[i
-1] != cRemove
)
142 return rIn
.copy(0, i
);
146 OString
stripEnd(const OString
& rIn
, char c
)
148 return tmpl_stripEndString
<OString
, char>(rIn
, c
);
151 std::string_view
stripEnd(std::string_view rIn
, char c
)
153 return tmpl_stripEnd
<std::string_view
, char>(rIn
, c
);
156 OUString
stripEnd(const OUString
& rIn
, sal_Unicode c
)
158 return tmpl_stripEndString
<OUString
, sal_Unicode
>(rIn
, c
);
161 std::u16string_view
stripEnd(std::u16string_view rIn
, sal_Unicode c
)
163 return tmpl_stripEnd
<std::u16string_view
, sal_Unicode
>(rIn
, c
);
// Returns rIn with both the leading and the trailing run of cRemove removed.
// T must offer empty(), size(), operator[] and substr() (e.g. std::string_view).
template <typename T, typename C> T tmpl_strip(const T &rIn,
    const C cRemove)
{
    if (rIn.empty())
        return rIn;

    // Trim from the back first so the forward scan can stop at 'end'.
    typename T::size_type end = rIn.size();
    while (end > 0 && rIn[end-1] == cRemove)
        --end;

    typename T::size_type start = 0;
    while (start < end && rIn[start] == cRemove)
        ++start;

    return rIn.substr(start, end - start);
}
192 template <typename T
, typename C
> T
tmpl_stripString(const T
&rIn
,
198 sal_Int32 end
= rIn
.getLength();
201 if (rIn
[end
-1] != cRemove
)
208 if (rIn
[start
] != cRemove
)
213 return rIn
.copy(start
, end
- start
);
217 OString
strip(const OString
& rIn
, char c
)
219 return tmpl_stripString
<OString
, char>(rIn
, c
);
222 std::string_view
strip(std::string_view rIn
, char c
)
224 return tmpl_strip
<std::string_view
, char>(rIn
, c
);
227 OUString
strip(const OUString
& rIn
, sal_Unicode c
)
229 return tmpl_stripString
<OUString
, sal_Unicode
>(rIn
, c
);
232 std::u16string_view
strip(std::u16string_view rIn
, sal_Unicode c
)
234 return tmpl_strip
<std::u16string_view
, sal_Unicode
>(rIn
, c
);
239 template <typename T
, typename C
> sal_Int32
tmpl_getTokenCount( T rIn
,
242 // Empty String: TokenCount by Definition is 0
246 sal_Int32 nTokCount
= 1;
247 for (typename
T::size_type i
= 0; i
< rIn
.size(); ++i
)
256 sal_Int32
getTokenCount(std::string_view rIn
, char cTok
)
258 return tmpl_getTokenCount
<std::string_view
, char>(rIn
, cTok
);
261 sal_Int32
getTokenCount(std::u16string_view rIn
, sal_Unicode cTok
)
263 return tmpl_getTokenCount
<std::u16string_view
, sal_Unicode
>(rIn
, cTok
);
266 sal_uInt32
decimalStringToNumber(std::u16string_view str
)
268 sal_uInt32 result
= 0;
269 for( sal_Int32 i
= 0; i
< static_cast<sal_Int32
>(str
.size()); )
271 sal_uInt32 c
= o3tl::iterateCodePoints(str
, &i
);
272 sal_uInt32 value
= 0;
273 if( c
<= 0x0039) // ASCII decimal digits, most common
275 else if( c
>= 0x1D7F6 ) // mathematical monospace digits
277 else if( c
>= 0x1D7EC ) // mathematical sans-serif bold digits
279 else if( c
>= 0x1D7E2 ) // mathematical sans-serif digits
281 else if( c
>= 0x1D7D8 ) // mathematical double-struck digits
283 else if( c
>= 0x1D7CE ) // mathematical bold digits
285 else if( c
>= 0x11066 ) // brahmi digits
287 else if( c
>= 0x104A0 ) // osmanya digits
289 else if( c
>= 0xFF10 ) // fullwidth digits
291 else if( c
>= 0xABF0 ) // meetei mayek digits
293 else if( c
>= 0xAA50 ) // cham digits
295 else if( c
>= 0xA9D0 ) // javanese digits
297 else if( c
>= 0xA900 ) // kayah li digits
299 else if( c
>= 0xA8D0 ) // saurashtra digits
301 else if( c
>= 0xA620 ) // vai digits
303 else if( c
>= 0x1C50 ) // ol chiki digits
305 else if( c
>= 0x1C40 ) // lepcha digits
307 else if( c
>= 0x1BB0 ) // sundanese digits
309 else if( c
>= 0x1B50 ) // balinese digits
311 else if( c
>= 0x1A90 ) // tai tham tham digits
313 else if( c
>= 0x1A80 ) // tai tham hora digits
315 else if( c
>= 0x19D0 ) // new tai lue digits
317 else if( c
>= 0x1946 ) // limbu digits
319 else if( c
>= 0x1810 ) // mongolian digits
321 else if( c
>= 0x17E0 ) // khmer digits
323 else if( c
>= 0x1090 ) // myanmar shan digits
325 else if( c
>= 0x1040 ) // myanmar digits
327 else if( c
>= 0x0F20 ) // tibetan digits
329 else if( c
>= 0x0ED0 ) // lao digits
331 else if( c
>= 0x0E50 ) // thai digits
333 else if( c
>= 0x0D66 ) // malayalam digits
335 else if( c
>= 0x0CE6 ) // kannada digits
337 else if( c
>= 0x0C66 ) // telugu digits
339 else if( c
>= 0x0BE6 ) // tamil digits
341 else if( c
>= 0x0B66 ) // odia digits
343 else if( c
>= 0x0AE6 ) // gujarati digits
345 else if( c
>= 0x0A66 ) // gurmukhi digits
347 else if( c
>= 0x09E6 ) // bengali digits
349 else if( c
>= 0x0966 ) // devanagari digit
351 else if( c
>= 0x07C0 ) // nko digits
353 else if( c
>= 0x06F0 ) // extended arabic-indic digits
355 else if( c
>= 0x0660 ) // arabic-indic digits
357 result
= result
* 10 + value
;
362 using namespace ::com::sun::star
;
364 // convert between sequence of string and comma separated string
366 OUString
convertCommaSeparated(
367 uno::Sequence
< OUString
> const& i_rSeq
)
370 ::comphelper::intersperse(
371 i_rSeq
.begin(), i_rSeq
.end(), ::comphelper::OUStringBufferAppender(buf
), OUString( ", " ));
372 return buf
.makeStringAndClear();
375 std::vector
<OUString
>
376 split(std::u16string_view rStr
, sal_Unicode cSeparator
)
378 std::vector
< OUString
> vec
;
382 std::u16string_view kw
= o3tl::getToken(rStr
, cSeparator
, idx
);
386 vec
.push_back(OUString(kw
));
389 } while (idx
!= std::u16string_view::npos
);
394 uno::Sequence
< OUString
>
395 convertCommaSeparated( std::u16string_view i_rString
)
397 std::vector
< OUString
> vec
= split(i_rString
, ',');
398 return comphelper::containerToSequence(vec
);
401 OString
join(std::string_view rSeparator
, const std::vector
<OString
>& rSequence
)
403 OStringBuffer aBuffer
;
404 for (size_t i
= 0; i
< rSequence
.size(); ++i
)
407 aBuffer
.append(rSeparator
);
408 aBuffer
.append(rSequence
[i
]);
410 return aBuffer
.makeStringAndClear();
413 sal_Int32
compareNatural( const OUString
& rLHS
, const OUString
& rRHS
,
414 const uno::Reference
< i18n::XCollator
> &rCollator
,
415 const uno::Reference
< i18n::XBreakIterator
> &rBI
,
416 const lang::Locale
&rLocale
)
420 sal_Int32 nLHSLastNonDigitPos
= 0;
421 sal_Int32 nRHSLastNonDigitPos
= 0;
422 sal_Int32 nLHSFirstDigitPos
= 0;
423 sal_Int32 nRHSFirstDigitPos
= 0;
425 // Check if the string starts with a digit
426 sal_Int32 nStartsDigitLHS
= rBI
->endOfCharBlock(rLHS
, nLHSFirstDigitPos
, rLocale
, i18n::CharType::DECIMAL_DIGIT_NUMBER
);
427 sal_Int32 nStartsDigitRHS
= rBI
->endOfCharBlock(rRHS
, nRHSFirstDigitPos
, rLocale
, i18n::CharType::DECIMAL_DIGIT_NUMBER
);
429 if (nStartsDigitLHS
> 0 && nStartsDigitRHS
> 0)
431 sal_uInt32 nLHS
= comphelper::string::decimalStringToNumber(rLHS
.subView(0, nStartsDigitLHS
));
432 sal_uInt32 nRHS
= comphelper::string::decimalStringToNumber(rRHS
.subView(0, nStartsDigitRHS
));
435 return nLHS
< nRHS
? -1 : 1;
436 nLHSLastNonDigitPos
= nStartsDigitLHS
;
437 nRHSLastNonDigitPos
= nStartsDigitRHS
;
439 else if (nStartsDigitLHS
> 0)
441 else if (nStartsDigitRHS
> 0)
444 while (nLHSFirstDigitPos
< rLHS
.getLength() || nRHSFirstDigitPos
< rRHS
.getLength())
446 sal_Int32 nLHSChunkLen
;
447 sal_Int32 nRHSChunkLen
;
449 //Compare non digit block as normal strings
450 nLHSFirstDigitPos
= rBI
->nextCharBlock(rLHS
, nLHSLastNonDigitPos
, rLocale
, i18n::CharType::DECIMAL_DIGIT_NUMBER
);
451 nRHSFirstDigitPos
= rBI
->nextCharBlock(rRHS
, nRHSLastNonDigitPos
, rLocale
, i18n::CharType::DECIMAL_DIGIT_NUMBER
);
453 if (nLHSFirstDigitPos
== -1)
454 nLHSFirstDigitPos
= rLHS
.getLength();
456 if (nRHSFirstDigitPos
== -1)
457 nRHSFirstDigitPos
= rRHS
.getLength();
459 nLHSChunkLen
= nLHSFirstDigitPos
- nLHSLastNonDigitPos
;
460 nRHSChunkLen
= nRHSFirstDigitPos
- nRHSLastNonDigitPos
;
462 nRet
= rCollator
->compareSubstring(rLHS
, nLHSLastNonDigitPos
, nLHSChunkLen
, rRHS
, nRHSLastNonDigitPos
, nRHSChunkLen
);
466 //Compare digit block as one number vs another
467 nLHSLastNonDigitPos
= rBI
->endOfCharBlock(rLHS
, nLHSFirstDigitPos
, rLocale
, i18n::CharType::DECIMAL_DIGIT_NUMBER
);
468 nRHSLastNonDigitPos
= rBI
->endOfCharBlock(rRHS
, nRHSFirstDigitPos
, rLocale
, i18n::CharType::DECIMAL_DIGIT_NUMBER
);
469 if (nLHSLastNonDigitPos
== -1)
470 nLHSLastNonDigitPos
= rLHS
.getLength();
471 if (nRHSLastNonDigitPos
== -1)
472 nRHSLastNonDigitPos
= rRHS
.getLength();
473 nLHSChunkLen
= nLHSLastNonDigitPos
- nLHSFirstDigitPos
;
474 nRHSChunkLen
= nRHSLastNonDigitPos
- nRHSFirstDigitPos
;
476 //To-Do: Possibly scale down those unicode codepoints that relate to
477 //numbers outside of the normal 0-9 range, e.g. see GetLocalizedChar in
480 sal_uInt32 nLHS
= comphelper::string::decimalStringToNumber(rLHS
.subView(nLHSFirstDigitPos
, nLHSChunkLen
));
481 sal_uInt32 nRHS
= comphelper::string::decimalStringToNumber(rRHS
.subView(nRHSFirstDigitPos
, nRHSChunkLen
));
485 nRet
= (nLHS
< nRHS
) ? -1 : 1;
493 NaturalStringSorter::NaturalStringSorter(
494 const uno::Reference
< uno::XComponentContext
> &rContext
,
495 lang::Locale aLocale
) : m_aLocale(std::move(aLocale
))
497 m_xCollator
= i18n::Collator::create( rContext
);
498 m_xCollator
->loadDefaultCollator(m_aLocale
, 0);
499 m_xBI
= i18n::BreakIterator::create( rContext
);
502 bool isdigitAsciiString(std::string_view rString
)
505 rString
.data(), rString
.data() + rString
.size(),
506 [](unsigned char c
){ return rtl::isAsciiDigit(c
); });
509 bool isdigitAsciiString(std::u16string_view rString
)
512 rString
.data(), rString
.data() + rString
.size(),
513 [](sal_Unicode c
){ return rtl::isAsciiDigit(c
); });
516 OUString
reverseString(std::u16string_view rStr
)
521 std::size_t i
= rStr
.size();
522 OUStringBuffer
sBuf(static_cast<sal_Int32
>(i
));
524 sBuf
.append(rStr
[--i
]);
525 return sBuf
.makeStringAndClear();
528 OUString
reverseCodePoints(OUString
const & str
) {
529 auto const len
= str
.getLength();
530 OUStringBuffer
buf(len
);
531 for (auto i
= len
; i
!= 0;) {
532 buf
.appendUtf32(str
.iterateCodePoints(&i
, -1));
534 return buf
.makeStringAndClear();
537 sal_Int32
indexOfAny(std::u16string_view rIn
,
538 sal_Unicode
const*const pChars
, sal_Int32
const nPos
)
540 for (std::u16string_view::size_type i
= nPos
; i
< rIn
.size(); ++i
)
542 sal_Unicode
const c
= rIn
[i
];
543 for (sal_Unicode
const* pChar
= pChars
; *pChar
; ++pChar
)
554 OUString
removeAny(std::u16string_view rIn
,
555 sal_Unicode
const*const pChars
)
559 for (std::u16string_view::size_type i
= 0; i
< rIn
.size(); ++i
)
561 sal_Unicode
const c
= rIn
[i
];
563 for (sal_Unicode
const* pChar
= pChars
; *pChar
; ++pChar
)
577 buf
.append(rIn
.substr(0, i
));
587 return isFound
? buf
.makeStringAndClear() : OUString(rIn
);
590 OUString
setToken(const OUString
& rIn
, sal_Int32 nToken
, sal_Unicode cTok
,
591 std::u16string_view rNewToken
)
593 sal_Int32 nLen
= rIn
.getLength();
595 sal_Int32 nFirstChar
= 0;
598 // Determine token position and length
601 // Increase token count if match
608 else if (nTok
> nToken
)
616 return rIn
.replaceAt(nFirstChar
, i
-nFirstChar
, rNewToken
);
620 /** Similar to OUString::replaceAt, but for an OUStringBuffer.
622 Replace n = count characters
623 from position index in this string with newStr.
625 void replaceAt(OUStringBuffer
& rIn
, sal_Int32 nIndex
, sal_Int32 nCount
, std::u16string_view newStr
)
627 assert(nIndex
>= 0 && nIndex
<= rIn
.getLength());
629 assert(nCount
<= rIn
.getLength() - nIndex
);
632 const sal_Int32 nOldLength
= rIn
.getLength();
633 if ( nIndex
== nOldLength
)
639 sal_Int32 nNewLength
= nOldLength
+ newStr
.size() - nCount
;
640 if (newStr
.size() > o3tl::make_unsigned(nCount
))
641 rIn
.ensureCapacity(nOldLength
+ newStr
.size() - nCount
);
643 sal_Unicode
* pStr
= const_cast<sal_Unicode
*>(rIn
.getStr());
644 memmove(pStr
+ nIndex
+ newStr
.size(), pStr
+ nIndex
+ nCount
, nOldLength
- nIndex
+ nCount
);
645 memcpy(pStr
+ nIndex
, newStr
.data(), newStr
.size());
647 rIn
.setLength(nNewLength
);
650 OUString
sanitizeStringSurrogates(const OUString
& rString
)
653 while (i
< rString
.getLength())
655 sal_Unicode c
= rString
[i
];
656 if (rtl::isHighSurrogate(c
))
658 if (i
+1 == rString
.getLength()
659 || !rtl::isLowSurrogate(rString
[i
+1]))
661 SAL_WARN("comphelper", "Surrogate error: high without low");
662 return rString
.copy(0, i
);
664 ++i
; //skip correct low
666 if (rtl::isLowSurrogate(c
)) //bare low without preceding high
668 SAL_WARN("comphelper", "Surrogate error: low without high");
669 return rString
.copy(0, i
);
678 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */