Version 7.6.3.2-android, tag libreoffice-7.6.3.2-android
[LibreOffice.git] / comphelper / source / misc / string.cxx
blobe17951fc43bec4621300f190a1b44d9b9c3ef5f5
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <sal/config.h>
22 #include <cassert>
23 #include <cstddef>
24 #include <string_view>
25 #include <utility>
26 #include <vector>
27 #include <algorithm>
29 #include <o3tl/safeint.hxx>
30 #include <o3tl/string_view.hxx>
31 #include <rtl/character.hxx>
32 #include <rtl/ustring.hxx>
33 #include <rtl/ustrbuf.hxx>
34 #include <rtl/string.hxx>
35 #include <rtl/strbuf.hxx>
36 #include <sal/log.hxx>
37 #include <sal/types.h>
39 #include <comphelper/string.hxx>
40 #include <comphelper/stl_types.hxx>
41 #include <comphelper/sequence.hxx>
43 #include <com/sun/star/i18n/BreakIterator.hpp>
44 #include <com/sun/star/i18n/CharType.hpp>
45 #include <com/sun/star/i18n/Collator.hpp>
48 namespace comphelper::string {
namespace
{
    /** Strips every leading occurrence of cRemove from a view-like value.

        T must offer empty(), size(), operator[] and substr().

        @param rIn      the text to strip
        @param cRemove  the character to drop from the front
        @return rIn starting at its first character that differs from
                cRemove; empty if rIn contains nothing but cRemove
    */
    template <typename T, typename C> T tmpl_stripStart(const T &rIn,
        const C cRemove)
    {
        if (rIn.empty())
            return rIn;

        const typename T::size_type nLen = rIn.size();
        typename T::size_type nFirstKept = 0;
        while (nFirstKept < nLen && rIn[nFirstKept] == cRemove)
            ++nFirstKept;

        return rIn.substr(nFirstKept);
    }

    /** Strips every leading occurrence of cRemove from a string-like value.

        T must offer isEmpty(), getLength(), operator[] and copy().

        @param rIn      the text to strip
        @param cRemove  the character to drop from the front
        @return a copy of rIn starting at its first character that differs
                from cRemove
    */
    template <typename T, typename C> T tmpl_stripStartString(const T &rIn,
        const C cRemove)
    {
        if (rIn.isEmpty())
            return rIn;

        auto nLen = rIn.getLength();
        decltype(nLen) nFirstKept = 0;
        while (nFirstKept < nLen && rIn[nFirstKept] == cRemove)
            ++nFirstKept;

        return rIn.copy(nFirstKept);
    }
}
88 OString stripStart(const OString& rIn, char c)
90 return tmpl_stripStartString<OString, char>(rIn, c);
93 std::string_view stripStart(std::string_view rIn, char c)
95 return tmpl_stripStart<std::string_view, char>(rIn, c);
98 OUString stripStart(const OUString& rIn, sal_Unicode c)
100 return tmpl_stripStartString<OUString, sal_Unicode>(rIn, c);
103 std::u16string_view stripStart(std::u16string_view rIn, sal_Unicode c)
105 return tmpl_stripStart<std::u16string_view, sal_Unicode>(rIn, c);
namespace
{
    /** Strips every trailing occurrence of cRemove from a view-like value.

        T must offer empty(), size(), operator[] and substr().

        @param rIn      the text to strip
        @param cRemove  the character to drop from the back
        @return rIn up to and including its last character that differs
                from cRemove; empty if rIn contains nothing but cRemove
    */
    template <typename T, typename C> T tmpl_stripEnd(const T &rIn,
        const C cRemove)
    {
        if (rIn.empty())
            return rIn;

        // nKept is the length of the prefix that survives
        typename T::size_type nKept = rIn.size();
        while (nKept > 0 && rIn[nKept - 1] == cRemove)
            --nKept;

        return rIn.substr(0, nKept);
    }

    /** Strips every trailing occurrence of cRemove from a string-like value.

        T must offer isEmpty(), getLength(), operator[] and copy().

        @param rIn      the text to strip
        @param cRemove  the character to drop from the back
        @return a copy of rIn up to and including its last character that
                differs from cRemove
    */
    template <typename T, typename C> T tmpl_stripEndString(const T &rIn,
        const C cRemove)
    {
        if (rIn.isEmpty())
            return rIn;

        // nKept is the length of the prefix that survives
        auto nKept = rIn.getLength();
        while (nKept > 0 && rIn[nKept - 1] == cRemove)
            --nKept;

        return rIn.copy(0, nKept);
    }
}
146 OString stripEnd(const OString& rIn, char c)
148 return tmpl_stripEndString<OString, char>(rIn, c);
151 std::string_view stripEnd(std::string_view rIn, char c)
153 return tmpl_stripEnd<std::string_view, char>(rIn, c);
156 OUString stripEnd(const OUString& rIn, sal_Unicode c)
158 return tmpl_stripEndString<OUString, sal_Unicode>(rIn, c);
161 std::u16string_view stripEnd(std::u16string_view rIn, sal_Unicode c)
163 return tmpl_stripEnd<std::u16string_view, sal_Unicode>(rIn, c);
namespace
{
    /** Strips cRemove from both ends of a view-like value.

        T must offer empty(), size(), operator[] and substr(). The trailing
        run is measured first, then the leading run within what remains.

        @param rIn      the text to strip
        @param cRemove  the character to drop from both ends
        @return the part of rIn between its first and last characters that
                differ from cRemove; empty if there is no such character
    */
    template <typename T, typename C> T tmpl_strip(const T &rIn,
        const C cRemove)
    {
        if (rIn.empty())
            return rIn;

        typename T::size_type nEnd = rIn.size();
        while (nEnd > 0 && rIn[nEnd - 1] == cRemove)
            --nEnd;

        typename T::size_type nBegin = 0;
        while (nBegin < nEnd && rIn[nBegin] == cRemove)
            ++nBegin;

        return rIn.substr(nBegin, nEnd - nBegin);
    }

    /** Strips cRemove from both ends of a string-like value.

        T must offer isEmpty(), getLength(), operator[] and copy(). The
        trailing run is measured first, then the leading run within what
        remains.

        @param rIn      the text to strip
        @param cRemove  the character to drop from both ends
        @return a copy of the part of rIn between its first and last
                characters that differ from cRemove
    */
    template <typename T, typename C> T tmpl_stripString(const T &rIn,
        const C cRemove)
    {
        if (rIn.isEmpty())
            return rIn;

        auto nEnd = rIn.getLength();
        while (nEnd > 0 && rIn[nEnd - 1] == cRemove)
            --nEnd;

        decltype(nEnd) nBegin = 0;
        while (nBegin < nEnd && rIn[nBegin] == cRemove)
            ++nBegin;

        return rIn.copy(nBegin, nEnd - nBegin);
    }
}
217 OString strip(const OString& rIn, char c)
219 return tmpl_stripString<OString, char>(rIn, c);
222 std::string_view strip(std::string_view rIn, char c)
224 return tmpl_strip<std::string_view, char>(rIn, c);
227 OUString strip(const OUString& rIn, sal_Unicode c)
229 return tmpl_stripString<OUString, sal_Unicode>(rIn, c);
232 std::u16string_view strip(std::u16string_view rIn, sal_Unicode c)
234 return tmpl_strip<std::u16string_view, sal_Unicode>(rIn, c);
237 namespace
239 template <typename T, typename C> sal_Int32 tmpl_getTokenCount( T rIn,
240 C cTok)
242 // Empty String: TokenCount by Definition is 0
243 if (rIn.empty())
244 return 0;
246 sal_Int32 nTokCount = 1;
247 for (typename T::size_type i = 0; i < rIn.size(); ++i)
249 if (rIn[i] == cTok)
250 ++nTokCount;
252 return nTokCount;
256 sal_Int32 getTokenCount(std::string_view rIn, char cTok)
258 return tmpl_getTokenCount<std::string_view, char>(rIn, cTok);
261 sal_Int32 getTokenCount(std::u16string_view rIn, sal_Unicode cTok)
263 return tmpl_getTokenCount<std::u16string_view, sal_Unicode>(rIn, cTok);
266 sal_uInt32 decimalStringToNumber(std::u16string_view str)
268 sal_uInt32 result = 0;
269 for( sal_Int32 i = 0; i < static_cast<sal_Int32>(str.size()); )
271 sal_uInt32 c = o3tl::iterateCodePoints(str, &i);
272 sal_uInt32 value = 0;
273 if( c <= 0x0039) // ASCII decimal digits, most common
274 value = c - 0x0030;
275 else if( c >= 0x1D7F6 ) // mathematical monospace digits
276 value = c - 0x1D7F6;
277 else if( c >= 0x1D7EC ) // mathematical sans-serif bold digits
278 value = c - 0x1D7EC;
279 else if( c >= 0x1D7E2 ) // mathematical sans-serif digits
280 value = c - 0x1D7E2;
281 else if( c >= 0x1D7D8 ) // mathematical double-struck digits
282 value = c - 0x1D7D8;
283 else if( c >= 0x1D7CE ) // mathematical bold digits
284 value = c - 0x1D7CE;
285 else if( c >= 0x11066 ) // brahmi digits
286 value = c - 0x11066;
287 else if( c >= 0x104A0 ) // osmanya digits
288 value = c - 0x104A0;
289 else if( c >= 0xFF10 ) // fullwidth digits
290 value = c - 0xFF10;
291 else if( c >= 0xABF0 ) // meetei mayek digits
292 value = c - 0xABF0;
293 else if( c >= 0xAA50 ) // cham digits
294 value = c - 0xAA50;
295 else if( c >= 0xA9D0 ) // javanese digits
296 value = c - 0xA9D0;
297 else if( c >= 0xA900 ) // kayah li digits
298 value = c - 0xA900;
299 else if( c >= 0xA8D0 ) // saurashtra digits
300 value = c - 0xA8D0;
301 else if( c >= 0xA620 ) // vai digits
302 value = c - 0xA620;
303 else if( c >= 0x1C50 ) // ol chiki digits
304 value = c - 0x1C50;
305 else if( c >= 0x1C40 ) // lepcha digits
306 value = c - 0x1C40;
307 else if( c >= 0x1BB0 ) // sundanese digits
308 value = c - 0x1BB0;
309 else if( c >= 0x1B50 ) // balinese digits
310 value = c - 0x1B50;
311 else if( c >= 0x1A90 ) // tai tham tham digits
312 value = c - 0x1A90;
313 else if( c >= 0x1A80 ) // tai tham hora digits
314 value = c - 0x1A80;
315 else if( c >= 0x19D0 ) // new tai lue digits
316 value = c - 0x19D0;
317 else if( c >= 0x1946 ) // limbu digits
318 value = c - 0x1946;
319 else if( c >= 0x1810 ) // mongolian digits
320 value = c - 0x1810;
321 else if( c >= 0x17E0 ) // khmer digits
322 value = c - 0x17E0;
323 else if( c >= 0x1090 ) // myanmar shan digits
324 value = c - 0x1090;
325 else if( c >= 0x1040 ) // myanmar digits
326 value = c - 0x1040;
327 else if( c >= 0x0F20 ) // tibetan digits
328 value = c - 0x0F20;
329 else if( c >= 0x0ED0 ) // lao digits
330 value = c - 0x0ED0;
331 else if( c >= 0x0E50 ) // thai digits
332 value = c - 0x0E50;
333 else if( c >= 0x0D66 ) // malayalam digits
334 value = c - 0x0D66;
335 else if( c >= 0x0CE6 ) // kannada digits
336 value = c - 0x0CE6;
337 else if( c >= 0x0C66 ) // telugu digits
338 value = c - 0x0C66;
339 else if( c >= 0x0BE6 ) // tamil digits
340 value = c - 0x0BE6;
341 else if( c >= 0x0B66 ) // odia digits
342 value = c - 0x0B66;
343 else if( c >= 0x0AE6 ) // gujarati digits
344 value = c - 0x0AE6;
345 else if( c >= 0x0A66 ) // gurmukhi digits
346 value = c - 0x0A66;
347 else if( c >= 0x09E6 ) // bengali digits
348 value = c - 0x09E6;
349 else if( c >= 0x0966 ) // devanagari digit
350 value = c - 0x0966;
351 else if( c >= 0x07C0 ) // nko digits
352 value = c - 0x07C0;
353 else if( c >= 0x06F0 ) // extended arabic-indic digits
354 value = c - 0x06F0;
355 else if( c >= 0x0660 ) // arabic-indic digits
356 value = c - 0x0660;
357 result = result * 10 + value;
359 return result;
362 using namespace ::com::sun::star;
364 // convert between sequence of string and comma separated string
366 OUString convertCommaSeparated(
367 uno::Sequence< OUString > const& i_rSeq)
369 OUStringBuffer buf;
370 ::comphelper::intersperse(
371 i_rSeq.begin(), i_rSeq.end(), ::comphelper::OUStringBufferAppender(buf), OUString( ", " ));
372 return buf.makeStringAndClear();
375 std::vector<OUString>
376 split(std::u16string_view rStr, sal_Unicode cSeparator)
378 std::vector< OUString > vec;
379 std::size_t idx = 0;
382 std::u16string_view kw = o3tl::getToken(rStr, cSeparator, idx);
383 kw = o3tl::trim(kw);
384 if (!kw.empty())
386 vec.push_back(OUString(kw));
389 } while (idx != std::u16string_view::npos);
391 return vec;
394 uno::Sequence< OUString >
395 convertCommaSeparated( std::u16string_view i_rString )
397 std::vector< OUString > vec = split(i_rString, ',');
398 return comphelper::containerToSequence(vec);
401 OString join(std::string_view rSeparator, const std::vector<OString>& rSequence)
403 OStringBuffer aBuffer;
404 for (size_t i = 0; i < rSequence.size(); ++i)
406 if (i != 0)
407 aBuffer.append(rSeparator);
408 aBuffer.append(rSequence[i]);
410 return aBuffer.makeStringAndClear();
413 sal_Int32 compareNatural( const OUString & rLHS, const OUString & rRHS,
414 const uno::Reference< i18n::XCollator > &rCollator,
415 const uno::Reference< i18n::XBreakIterator > &rBI,
416 const lang::Locale &rLocale )
418 sal_Int32 nRet = 0;
420 sal_Int32 nLHSLastNonDigitPos = 0;
421 sal_Int32 nRHSLastNonDigitPos = 0;
422 sal_Int32 nLHSFirstDigitPos = 0;
423 sal_Int32 nRHSFirstDigitPos = 0;
425 // Check if the string starts with a digit
426 sal_Int32 nStartsDigitLHS = rBI->endOfCharBlock(rLHS, nLHSFirstDigitPos, rLocale, i18n::CharType::DECIMAL_DIGIT_NUMBER);
427 sal_Int32 nStartsDigitRHS = rBI->endOfCharBlock(rRHS, nRHSFirstDigitPos, rLocale, i18n::CharType::DECIMAL_DIGIT_NUMBER);
429 if (nStartsDigitLHS > 0 && nStartsDigitRHS > 0)
431 sal_uInt32 nLHS = comphelper::string::decimalStringToNumber(rLHS.subView(0, nStartsDigitLHS));
432 sal_uInt32 nRHS = comphelper::string::decimalStringToNumber(rRHS.subView(0, nStartsDigitRHS));
434 if (nLHS != nRHS)
435 return nLHS < nRHS ? -1 : 1;
436 nLHSLastNonDigitPos = nStartsDigitLHS;
437 nRHSLastNonDigitPos = nStartsDigitRHS;
439 else if (nStartsDigitLHS > 0)
440 return -1;
441 else if (nStartsDigitRHS > 0)
442 return 1;
444 while (nLHSFirstDigitPos < rLHS.getLength() || nRHSFirstDigitPos < rRHS.getLength())
446 sal_Int32 nLHSChunkLen;
447 sal_Int32 nRHSChunkLen;
449 //Compare non digit block as normal strings
450 nLHSFirstDigitPos = rBI->nextCharBlock(rLHS, nLHSLastNonDigitPos, rLocale, i18n::CharType::DECIMAL_DIGIT_NUMBER);
451 nRHSFirstDigitPos = rBI->nextCharBlock(rRHS, nRHSLastNonDigitPos, rLocale, i18n::CharType::DECIMAL_DIGIT_NUMBER);
453 if (nLHSFirstDigitPos == -1)
454 nLHSFirstDigitPos = rLHS.getLength();
456 if (nRHSFirstDigitPos == -1)
457 nRHSFirstDigitPos = rRHS.getLength();
459 nLHSChunkLen = nLHSFirstDigitPos - nLHSLastNonDigitPos;
460 nRHSChunkLen = nRHSFirstDigitPos - nRHSLastNonDigitPos;
462 nRet = rCollator->compareSubstring(rLHS, nLHSLastNonDigitPos, nLHSChunkLen, rRHS, nRHSLastNonDigitPos, nRHSChunkLen);
463 if (nRet != 0)
464 break;
466 //Compare digit block as one number vs another
467 nLHSLastNonDigitPos = rBI->endOfCharBlock(rLHS, nLHSFirstDigitPos, rLocale, i18n::CharType::DECIMAL_DIGIT_NUMBER);
468 nRHSLastNonDigitPos = rBI->endOfCharBlock(rRHS, nRHSFirstDigitPos, rLocale, i18n::CharType::DECIMAL_DIGIT_NUMBER);
469 if (nLHSLastNonDigitPos == -1)
470 nLHSLastNonDigitPos = rLHS.getLength();
471 if (nRHSLastNonDigitPos == -1)
472 nRHSLastNonDigitPos = rRHS.getLength();
473 nLHSChunkLen = nLHSLastNonDigitPos - nLHSFirstDigitPos;
474 nRHSChunkLen = nRHSLastNonDigitPos - nRHSFirstDigitPos;
476 //To-Do: Possibly scale down those unicode codepoints that relate to
477 //numbers outside of the normal 0-9 range, e.g. see GetLocalizedChar in
478 //vcl
480 sal_uInt32 nLHS = comphelper::string::decimalStringToNumber(rLHS.subView(nLHSFirstDigitPos, nLHSChunkLen));
481 sal_uInt32 nRHS = comphelper::string::decimalStringToNumber(rRHS.subView(nRHSFirstDigitPos, nRHSChunkLen));
483 if (nLHS != nRHS)
485 nRet = (nLHS < nRHS) ? -1 : 1;
486 break;
490 return nRet;
493 NaturalStringSorter::NaturalStringSorter(
494 const uno::Reference< uno::XComponentContext > &rContext,
495 lang::Locale aLocale) : m_aLocale(std::move(aLocale))
497 m_xCollator = i18n::Collator::create( rContext );
498 m_xCollator->loadDefaultCollator(m_aLocale, 0);
499 m_xBI = i18n::BreakIterator::create( rContext );
502 bool isdigitAsciiString(std::string_view rString)
504 return std::all_of(
505 rString.data(), rString.data() + rString.size(),
506 [](unsigned char c){ return rtl::isAsciiDigit(c); });
509 bool isdigitAsciiString(std::u16string_view rString)
511 return std::all_of(
512 rString.data(), rString.data() + rString.size(),
513 [](sal_Unicode c){ return rtl::isAsciiDigit(c); });
516 OUString reverseString(std::u16string_view rStr)
518 if (rStr.empty())
519 return OUString();
521 std::size_t i = rStr.size();
522 OUStringBuffer sBuf(static_cast<sal_Int32>(i));
523 while (i)
524 sBuf.append(rStr[--i]);
525 return sBuf.makeStringAndClear();
528 OUString reverseCodePoints(OUString const & str) {
529 auto const len = str.getLength();
530 OUStringBuffer buf(len);
531 for (auto i = len; i != 0;) {
532 buf.appendUtf32(str.iterateCodePoints(&i, -1));
534 return buf.makeStringAndClear();
537 sal_Int32 indexOfAny(std::u16string_view rIn,
538 sal_Unicode const*const pChars, sal_Int32 const nPos)
540 for (std::u16string_view::size_type i = nPos; i < rIn.size(); ++i)
542 sal_Unicode const c = rIn[i];
543 for (sal_Unicode const* pChar = pChars; *pChar; ++pChar)
545 if (c == *pChar)
547 return i;
551 return -1;
554 OUString removeAny(std::u16string_view rIn,
555 sal_Unicode const*const pChars)
557 OUStringBuffer buf;
558 bool isFound(false);
559 for (std::u16string_view::size_type i = 0; i < rIn.size(); ++i)
561 sal_Unicode const c = rIn[i];
562 bool removeC(false);
563 for (sal_Unicode const* pChar = pChars; *pChar; ++pChar)
565 if (c == *pChar)
567 removeC = true;
568 break;
571 if (removeC)
573 if (!isFound)
575 if (i > 0)
577 buf.append(rIn.substr(0, i));
579 isFound = true;
582 else if (isFound)
584 buf.append(c);
587 return isFound ? buf.makeStringAndClear() : OUString(rIn);
590 OUString setToken(const OUString& rIn, sal_Int32 nToken, sal_Unicode cTok,
591 std::u16string_view rNewToken)
593 sal_Int32 nLen = rIn.getLength();
594 sal_Int32 nTok = 0;
595 sal_Int32 nFirstChar = 0;
596 sal_Int32 i = 0;
598 // Determine token position and length
599 while ( i < nLen )
601 // Increase token count if match
602 if (rIn[i] == cTok)
604 ++nTok;
606 if (nTok == nToken)
607 nFirstChar = i+1;
608 else if (nTok > nToken)
609 break;
612 ++i;
615 if (nTok >= nToken)
616 return rIn.replaceAt(nFirstChar, i-nFirstChar, rNewToken);
617 return rIn;
620 /** Similar to OUString::replaceAt, but for an OUStringBuffer.
622 Replace n = count characters
623 from position index in this string with newStr.
625 void replaceAt(OUStringBuffer& rIn, sal_Int32 nIndex, sal_Int32 nCount, std::u16string_view newStr )
627 assert(nIndex >= 0 && nIndex <= rIn.getLength());
628 assert(nCount >= 0);
629 assert(nCount <= rIn.getLength() - nIndex);
631 /* Append? */
632 const sal_Int32 nOldLength = rIn.getLength();
633 if ( nIndex == nOldLength )
635 rIn.append(newStr);
636 return;
639 sal_Int32 nNewLength = nOldLength + newStr.size() - nCount;
640 if (newStr.size() > o3tl::make_unsigned(nCount))
641 rIn.ensureCapacity(nOldLength + newStr.size() - nCount);
643 sal_Unicode* pStr = const_cast<sal_Unicode*>(rIn.getStr());
644 memmove(pStr + nIndex + newStr.size(), pStr + nIndex + nCount, nOldLength - nIndex + nCount);
645 memcpy(pStr + nIndex, newStr.data(), newStr.size());
647 rIn.setLength(nNewLength);
650 OUString sanitizeStringSurrogates(const OUString& rString)
652 sal_Int32 i=0;
653 while (i < rString.getLength())
655 sal_Unicode c = rString[i];
656 if (rtl::isHighSurrogate(c))
658 if (i+1 == rString.getLength()
659 || !rtl::isLowSurrogate(rString[i+1]))
661 SAL_WARN("comphelper", "Surrogate error: high without low");
662 return rString.copy(0, i);
664 ++i; //skip correct low
666 if (rtl::isLowSurrogate(c)) //bare low without preceding high
668 SAL_WARN("comphelper", "Surrogate error: low without high");
669 return rString.copy(0, i);
671 ++i;
673 return rString;
678 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */