Merge pull request #26148 from ksooo/fix-secondstotimestring-warning
[xbmc.git] / xbmc / utils / CharsetConverter.cpp
blob7def229fcbfa612c44f3980e71f03d7a46284807
1 /*
2 * Copyright (C) 2005-2018 Team Kodi
3 * This file is part of Kodi - https://kodi.tv
5 * SPDX-License-Identifier: GPL-2.0-or-later
6 * See LICENSES/README.md for more information.
7 */
9 #include "CharsetConverter.h"
11 #include "LangInfo.h"
12 #include "guilib/LocalizeStrings.h"
13 #include "log.h"
14 #include "settings/Settings.h"
15 #include "settings/lib/Setting.h"
16 #include "settings/lib/SettingDefinitions.h"
17 #include "utils/StringUtils.h"
18 #include "utils/Utf8Utils.h"
20 #include <algorithm>
21 #include <mutex>
23 #include <fribidi.h>
24 #include <iconv.h>
// Byte-order suffix appended to the UTF-16 / UTF-32 charset names given to iconv.
#ifdef WORDS_BIGENDIAN
#define ENDIAN_SUFFIX "BE"
#else
#define ENDIAN_SUFFIX "LE"
#endif

// These two names are spelled the same way on every platform.
#define UTF16_CHARSET "UTF-16" ENDIAN_SUFFIX
#define UTF32_CHARSET "UTF-32" ENDIAN_SUFFIX

// UTF-8 charset name given to iconv; Darwin uses its own variant.
#if defined(TARGET_DARWIN)
#define UTF8_SOURCE "UTF-8-MAC"
#else
#define UTF8_SOURCE "UTF-8"
#endif

// wchar_t representation flag and the charset name used for wchar_t strings.
#if defined(TARGET_DARWIN)
#define WCHAR_IS_UCS_4 1
#define WCHAR_CHARSET UTF32_CHARSET
#elif defined(TARGET_WINDOWS)
#define WCHAR_IS_UTF16 1
#define WCHAR_CHARSET UTF16_CHARSET
#elif defined(TARGET_FREEBSD) || defined(TARGET_ANDROID)
#define WCHAR_IS_UCS_4 1
#define WCHAR_CHARSET UTF32_CHARSET
#else
#define WCHAR_CHARSET "WCHAR_T"
#if __STDC_ISO_10646__
#ifdef SIZEOF_WCHAR_T
#if SIZEOF_WCHAR_T == 4
#define WCHAR_IS_UCS_4 1
#elif SIZEOF_WCHAR_T == 2
#define WCHAR_IS_UCS_2 1
#endif
#endif
#endif
#endif

// Value of an iconv descriptor that is not (or could not be) opened.
#define NO_ICONV ((iconv_t)-1)
/* Charsets that are not named directly; CConverterType::ResolveSpecialCharset()
   maps them to a concrete charset name when a converter is opened. */
enum SpecialCharset
{
  NotSpecialCharset = 0, /* charset is given by name */
  SystemCharset,         /* resolved to an empty charset name */
  UserCharset,           /* locale.charset */
  SubtitleCharset,       /* subtitles.charset */
};
82 class CConverterType : public CCriticalSection
84 public:
85 CConverterType(const std::string& sourceCharset, const std::string& targetCharset, unsigned int targetSingleCharMaxLen = 1);
86 CConverterType(enum SpecialCharset sourceSpecialCharset, const std::string& targetCharset, unsigned int targetSingleCharMaxLen = 1);
87 CConverterType(const std::string& sourceCharset, enum SpecialCharset targetSpecialCharset, unsigned int targetSingleCharMaxLen = 1);
88 CConverterType(enum SpecialCharset sourceSpecialCharset, enum SpecialCharset targetSpecialCharset, unsigned int targetSingleCharMaxLen = 1);
89 CConverterType(const CConverterType& other);
90 ~CConverterType();
92 iconv_t GetConverter(std::unique_lock<CCriticalSection>& converterLock);
94 void Reset(void);
95 void ReinitTo(const std::string& sourceCharset, const std::string& targetCharset, unsigned int targetSingleCharMaxLen = 1);
96 const std::string& GetSourceCharset() const { return m_sourceCharset; }
97 const std::string& GetTargetCharset() const { return m_targetCharset; }
98 unsigned int GetTargetSingleCharMaxLen(void) const { return m_targetSingleCharMaxLen; }
100 private:
101 static std::string ResolveSpecialCharset(enum SpecialCharset charset);
103 enum SpecialCharset m_sourceSpecialCharset;
104 std::string m_sourceCharset;
105 enum SpecialCharset m_targetSpecialCharset;
106 std::string m_targetCharset;
107 iconv_t m_iconv;
108 unsigned int m_targetSingleCharMaxLen;
111 CConverterType::CConverterType(const std::string& sourceCharset, const std::string& targetCharset, unsigned int targetSingleCharMaxLen /*= 1*/) : CCriticalSection(),
112 m_sourceSpecialCharset(NotSpecialCharset),
113 m_sourceCharset(sourceCharset),
114 m_targetSpecialCharset(NotSpecialCharset),
115 m_targetCharset(targetCharset),
116 m_iconv(NO_ICONV),
117 m_targetSingleCharMaxLen(targetSingleCharMaxLen)
121 CConverterType::CConverterType(enum SpecialCharset sourceSpecialCharset, const std::string& targetCharset, unsigned int targetSingleCharMaxLen /*= 1*/) : CCriticalSection(),
122 m_sourceSpecialCharset(sourceSpecialCharset),
123 m_sourceCharset(),
124 m_targetSpecialCharset(NotSpecialCharset),
125 m_targetCharset(targetCharset),
126 m_iconv(NO_ICONV),
127 m_targetSingleCharMaxLen(targetSingleCharMaxLen)
131 CConverterType::CConverterType(const std::string& sourceCharset, enum SpecialCharset targetSpecialCharset, unsigned int targetSingleCharMaxLen /*= 1*/) : CCriticalSection(),
132 m_sourceSpecialCharset(NotSpecialCharset),
133 m_sourceCharset(sourceCharset),
134 m_targetSpecialCharset(targetSpecialCharset),
135 m_targetCharset(),
136 m_iconv(NO_ICONV),
137 m_targetSingleCharMaxLen(targetSingleCharMaxLen)
141 CConverterType::CConverterType(enum SpecialCharset sourceSpecialCharset, enum SpecialCharset targetSpecialCharset, unsigned int targetSingleCharMaxLen /*= 1*/) : CCriticalSection(),
142 m_sourceSpecialCharset(sourceSpecialCharset),
143 m_sourceCharset(),
144 m_targetSpecialCharset(targetSpecialCharset),
145 m_targetCharset(),
146 m_iconv(NO_ICONV),
147 m_targetSingleCharMaxLen(targetSingleCharMaxLen)
151 CConverterType::CConverterType(const CConverterType& other) : CCriticalSection(),
152 m_sourceSpecialCharset(other.m_sourceSpecialCharset),
153 m_sourceCharset(other.m_sourceCharset),
154 m_targetSpecialCharset(other.m_targetSpecialCharset),
155 m_targetCharset(other.m_targetCharset),
156 m_iconv(NO_ICONV),
157 m_targetSingleCharMaxLen(other.m_targetSingleCharMaxLen)
161 CConverterType::~CConverterType()
163 std::unique_lock<CCriticalSection> lock(*this);
164 if (m_iconv != NO_ICONV)
165 iconv_close(m_iconv);
166 lock.unlock(); // ensure unlocking before final destruction
169 iconv_t CConverterType::GetConverter(std::unique_lock<CCriticalSection>& converterLock)
171 // ensure that this unique instance is locked externally
172 if (converterLock.mutex() != this)
173 return NO_ICONV;
175 if (m_iconv == NO_ICONV)
177 if (m_sourceSpecialCharset)
178 m_sourceCharset = ResolveSpecialCharset(m_sourceSpecialCharset);
179 if (m_targetSpecialCharset)
180 m_targetCharset = ResolveSpecialCharset(m_targetSpecialCharset);
182 m_iconv = iconv_open(m_targetCharset.c_str(), m_sourceCharset.c_str());
184 if (m_iconv == NO_ICONV)
185 CLog::Log(LOGERROR, "{}: iconv_open() for \"{}\" -> \"{}\" failed, errno = {} ({})",
186 __FUNCTION__, m_sourceCharset, m_targetCharset, errno, strerror(errno));
189 return m_iconv;
192 void CConverterType::Reset(void)
194 std::unique_lock<CCriticalSection> lock(*this);
195 if (m_iconv != NO_ICONV)
197 iconv_close(m_iconv);
198 m_iconv = NO_ICONV;
201 if (m_sourceSpecialCharset)
202 m_sourceCharset.clear();
203 if (m_targetSpecialCharset)
204 m_targetCharset.clear();
208 void CConverterType::ReinitTo(const std::string& sourceCharset, const std::string& targetCharset, unsigned int targetSingleCharMaxLen /*= 1*/)
210 std::unique_lock<CCriticalSection> lock(*this);
211 if (sourceCharset != m_sourceCharset || targetCharset != m_targetCharset)
213 if (m_iconv != NO_ICONV)
215 iconv_close(m_iconv);
216 m_iconv = NO_ICONV;
219 m_sourceSpecialCharset = NotSpecialCharset;
220 m_sourceCharset = sourceCharset;
221 m_targetSpecialCharset = NotSpecialCharset;
222 m_targetCharset = targetCharset;
223 m_targetSingleCharMaxLen = targetSingleCharMaxLen;
227 std::string CConverterType::ResolveSpecialCharset(enum SpecialCharset charset)
229 switch (charset)
231 case SystemCharset:
232 return "";
233 case UserCharset:
234 return g_langInfo.GetGuiCharSet();
235 case SubtitleCharset:
236 return g_langInfo.GetSubtitleCharSet();
237 case NotSpecialCharset:
238 default:
239 return "UTF-8"; /* dummy value */
/* Index of every predefined conversion.
   Keep it in sync with CCharsetConverter::CInnerConverter::m_stdConversion */
enum StdConversionType
{
  NoConversion = -1,
  Utf8ToUtf32 = 0,
  Utf32ToUtf8,
  Utf32ToW,
  WToUtf32,
  SubtitleCharsetToUtf8,
  Utf8ToUserCharset,
  UserCharsetToUtf8,
  Utf32ToUserCharset,
  WtoUtf8,
  Utf16LEtoW,
  Utf16BEtoUtf8,
  Utf16LEtoUtf8,
  Utf8toW,
  Utf8ToSystem,
  SystemToUtf8,
  Ucs2CharsetToUtf8,
  MacintoshToUtf8,
  NumberOfStdConversionTypes /* Dummy sentinel entry, equals the number of conversions */
};
266 /* We don't want to pollute header file with many additional includes and definitions, so put
267 here all staff that require usage of types defined in this file or in additional headers */
268 class CCharsetConverter::CInnerConverter
270 public:
271 static bool logicalToVisualBiDi(const std::u32string& stringSrc,
272 std::u32string& stringDst,
273 FriBidiCharType base = FRIBIDI_TYPE_LTR,
274 const bool failOnBadString = false,
275 int* visualToLogicalMap = nullptr);
276 static bool isBidiDirectionRTL(const std::string& stringSrc);
278 template<class INPUT,class OUTPUT>
279 static bool stdConvert(StdConversionType convertType, const INPUT& strSource, OUTPUT& strDest, bool failOnInvalidChar = false);
280 template<class INPUT,class OUTPUT>
281 static bool customConvert(const std::string& sourceCharset, const std::string& targetCharset, const INPUT& strSource, OUTPUT& strDest, bool failOnInvalidChar = false);
283 template<class INPUT,class OUTPUT>
284 static bool convert(iconv_t type, int multiplier, const INPUT& strSource, OUTPUT& strDest, bool failOnInvalidChar = false);
286 static CConverterType m_stdConversion[NumberOfStdConversionTypes];
287 static CCriticalSection m_critSectionFriBiDi;
290 /* single symbol sizes in chars */
291 const int CCharsetConverter::m_Utf8CharMinSize = 1;
292 const int CCharsetConverter::m_Utf8CharMaxSize = 4;
294 // clang-format off
295 CConverterType CCharsetConverter::CInnerConverter::m_stdConversion[NumberOfStdConversionTypes] = /* keep it in sync with enum StdConversionType */
297 /* Utf8ToUtf32 */ CConverterType(UTF8_SOURCE, UTF32_CHARSET),
298 /* Utf32ToUtf8 */ CConverterType(UTF32_CHARSET, "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
299 /* Utf32ToW */ CConverterType(UTF32_CHARSET, WCHAR_CHARSET),
300 /* WToUtf32 */ CConverterType(WCHAR_CHARSET, UTF32_CHARSET),
301 /* SubtitleCharsetToUtf8*/CConverterType(SubtitleCharset, "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
302 /* Utf8ToUserCharset */ CConverterType(UTF8_SOURCE, UserCharset),
303 /* UserCharsetToUtf8 */ CConverterType(UserCharset, "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
304 /* Utf32ToUserCharset */ CConverterType(UTF32_CHARSET, UserCharset),
305 /* WtoUtf8 */ CConverterType(WCHAR_CHARSET, "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
306 /* Utf16LEtoW */ CConverterType("UTF-16LE", WCHAR_CHARSET),
307 /* Utf16BEtoUtf8 */ CConverterType("UTF-16BE", "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
308 /* Utf16LEtoUtf8 */ CConverterType("UTF-16LE", "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
309 /* Utf8toW */ CConverterType(UTF8_SOURCE, WCHAR_CHARSET),
310 /* Utf8ToSystem */ CConverterType(UTF8_SOURCE, SystemCharset),
311 /* SystemToUtf8 */ CConverterType(SystemCharset, UTF8_SOURCE),
312 /* Ucs2CharsetToUtf8 */ CConverterType("UCS-2LE", "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
313 /* MacintoshToUtf8 */ CConverterType("macintosh", "UTF-8", CCharsetConverter::m_Utf8CharMaxSize)
315 // clang-format on
317 CCriticalSection CCharsetConverter::CInnerConverter::m_critSectionFriBiDi;
319 template<class INPUT,class OUTPUT>
320 bool CCharsetConverter::CInnerConverter::stdConvert(StdConversionType convertType, const INPUT& strSource, OUTPUT& strDest, bool failOnInvalidChar /*= false*/)
322 strDest.clear();
323 if (strSource.empty())
324 return true;
326 if (convertType < 0 || convertType >= NumberOfStdConversionTypes)
327 return false;
329 CConverterType& convType = m_stdConversion[convertType];
330 std::unique_lock<CCriticalSection> converterLock(convType);
332 return convert(convType.GetConverter(converterLock), convType.GetTargetSingleCharMaxLen(), strSource, strDest, failOnInvalidChar);
335 template<class INPUT,class OUTPUT>
336 bool CCharsetConverter::CInnerConverter::customConvert(const std::string& sourceCharset, const std::string& targetCharset, const INPUT& strSource, OUTPUT& strDest, bool failOnInvalidChar /*= false*/)
338 strDest.clear();
339 if (strSource.empty())
340 return true;
342 iconv_t conv = iconv_open(targetCharset.c_str(), sourceCharset.c_str());
343 if (conv == NO_ICONV)
345 CLog::Log(LOGERROR, "{}: iconv_open() for \"{}\" -> \"{}\" failed, errno = {} ({})",
346 __FUNCTION__, sourceCharset, targetCharset, errno, strerror(errno));
347 return false;
349 const int dstMultp = (targetCharset.compare(0, 5, "UTF-8") == 0) ? CCharsetConverter::m_Utf8CharMaxSize : 1;
350 const bool result = convert(conv, dstMultp, strSource, strDest, failOnInvalidChar);
351 iconv_close(conv);
353 return result;
/* Depending on platform and version, iconv() declares its input buffer argument as
   char** or as const char**. This wrapper converts implicitly to either type. */
struct charPtrPtrAdapter
{
  const char** pointer;

  explicit charPtrPtrAdapter(const char** p) : pointer(p) {}

  operator char**() { return const_cast<char**>(pointer); }
  operator const char**() { return pointer; }
};
369 template<class INPUT,class OUTPUT>
370 bool CCharsetConverter::CInnerConverter::convert(iconv_t type, int multiplier, const INPUT& strSource, OUTPUT& strDest, bool failOnInvalidChar /*= false*/)
372 if (type == NO_ICONV)
373 return false;
375 //input buffer for iconv() is the buffer from strSource
376 size_t inBufSize = (strSource.length() + 1) * sizeof(typename INPUT::value_type);
377 const char* inBuf = (const char*)strSource.c_str();
379 //allocate output buffer for iconv()
380 size_t outBufSize = (strSource.length() + 1) * sizeof(typename OUTPUT::value_type) * multiplier;
381 char* outBuf = (char*)malloc(outBufSize);
382 if (outBuf == NULL)
384 CLog::Log(LOGFATAL, "{}: malloc failed", __FUNCTION__);
385 return false;
388 size_t inBytesAvail = inBufSize; //how many bytes iconv() can read
389 size_t outBytesAvail = outBufSize; //how many bytes iconv() can write
390 const char* inBufStart = inBuf; //where in our input buffer iconv() should start reading
391 char* outBufStart = outBuf; //where in out output buffer iconv() should start writing
393 size_t returnV;
394 while(true)
396 //iconv() will update inBufStart, inBytesAvail, outBufStart and outBytesAvail
397 returnV = iconv(type, charPtrPtrAdapter(&inBufStart), &inBytesAvail, &outBufStart, &outBytesAvail);
399 if (returnV == (size_t)-1)
401 if (errno == E2BIG) //output buffer is not big enough
403 //save where iconv() ended converting, realloc might make outBufStart invalid
404 size_t bytesConverted = outBufSize - outBytesAvail;
406 //make buffer twice as big
407 outBufSize *= 2;
408 char* newBuf = (char*)realloc(outBuf, outBufSize);
409 if (!newBuf)
411 CLog::Log(LOGFATAL, "{} realloc failed with errno={}({})", __FUNCTION__, errno,
412 strerror(errno));
413 break;
415 outBuf = newBuf;
417 //update the buffer pointer and counter
418 outBufStart = outBuf + bytesConverted;
419 outBytesAvail = outBufSize - bytesConverted;
421 //continue in the loop and convert the rest
422 continue;
424 else if (errno == EILSEQ) //An invalid multibyte sequence has been encountered in the input
426 if (failOnInvalidChar)
427 break;
429 //skip invalid byte
430 inBufStart++;
431 inBytesAvail--;
432 //continue in the loop and convert the rest
433 continue;
435 else if (errno == EINVAL) /* Invalid sequence at the end of input buffer */
437 if (!failOnInvalidChar)
438 returnV = 0; /* reset error status to use converted part */
440 break;
442 else //iconv() had some other error
444 CLog::Log(LOGERROR, "{}: iconv() failed, errno={} ({})", __FUNCTION__, errno,
445 strerror(errno));
448 break;
451 //complete the conversion (reset buffers), otherwise the current data will prefix the data on the next call
452 if (iconv(type, NULL, NULL, &outBufStart, &outBytesAvail) == (size_t)-1)
453 CLog::Log(LOGERROR, "{} failed cleanup errno={}({})", __FUNCTION__, errno, strerror(errno));
455 if (returnV == (size_t)-1)
457 free(outBuf);
458 return false;
460 //we're done
462 const typename OUTPUT::size_type sizeInChars = (typename OUTPUT::size_type) (outBufSize - outBytesAvail) / sizeof(typename OUTPUT::value_type);
463 typename OUTPUT::const_pointer strPtr = (typename OUTPUT::const_pointer) outBuf;
464 /* Make sure that all buffer is assigned and string is stopped at end of buffer */
465 if (sizeInChars > 0 && strPtr[sizeInChars - 1] == 0 && strSource[strSource.length() - 1] != 0)
466 strDest.assign(strPtr, sizeInChars-1);
467 else
468 strDest.assign(strPtr, sizeInChars);
470 free(outBuf);
472 return true;
475 bool CCharsetConverter::CInnerConverter::logicalToVisualBiDi(
476 const std::u32string& stringSrc,
477 std::u32string& stringDst,
478 FriBidiCharType base /*= FRIBIDI_TYPE_LTR*/,
479 const bool failOnBadString /*= false*/,
480 int* visualToLogicalMap /*= nullptr*/)
482 stringDst.clear();
484 const size_t srcLen = stringSrc.length();
485 if (srcLen == 0)
486 return true;
488 stringDst.reserve(srcLen);
489 size_t lineStart = 0;
491 // libfribidi is not threadsafe, so make sure we make it so
492 std::unique_lock<CCriticalSection> lock(m_critSectionFriBiDi);
495 size_t lineEnd = stringSrc.find('\n', lineStart);
496 if (lineEnd >= srcLen) // equal to 'lineEnd == std::string::npos'
497 lineEnd = srcLen;
498 else
499 lineEnd++; // include '\n'
501 const size_t lineLen = lineEnd - lineStart;
503 FriBidiChar* visual = (FriBidiChar*) malloc((lineLen + 1) * sizeof(FriBidiChar));
504 if (visual == NULL)
506 free(visual);
507 CLog::Log(LOGFATAL, "{}: can't allocate memory", __FUNCTION__);
508 return false;
511 bool bidiFailed = false;
512 FriBidiCharType baseCopy = base; // preserve same value for all lines, required because fribidi_log2vis will modify parameter value
513 if (fribidi_log2vis(reinterpret_cast<const FriBidiChar*>(stringSrc.c_str() + lineStart),
514 lineLen, &baseCopy, visual, nullptr,
515 !visualToLogicalMap ? nullptr : visualToLogicalMap + lineStart, nullptr))
517 // Removes bidirectional marks
518 const int newLen = fribidi_remove_bidi_marks(
519 visual, lineLen, nullptr, !visualToLogicalMap ? nullptr : visualToLogicalMap + lineStart,
520 nullptr);
521 if (newLen > 0)
522 stringDst.append((const char32_t*)visual, (size_t)newLen);
523 else if (newLen < 0)
524 bidiFailed = failOnBadString;
526 else
527 bidiFailed = failOnBadString;
529 free(visual);
531 if (bidiFailed)
532 return false;
534 lineStart = lineEnd;
535 } while (lineStart < srcLen);
537 return !stringDst.empty();
540 bool CCharsetConverter::CInnerConverter::isBidiDirectionRTL(const std::string& str)
542 std::u32string converted;
543 if (!CInnerConverter::stdConvert(Utf8ToUtf32, str, converted, true))
544 return false;
546 int lineLen = static_cast<int>(str.size());
547 FriBidiCharType* charTypes = new FriBidiCharType[lineLen];
548 fribidi_get_bidi_types(reinterpret_cast<const FriBidiChar*>(converted.c_str()),
549 (FriBidiStrIndex)lineLen, charTypes);
550 FriBidiCharType charType = fribidi_get_par_direction(charTypes, (FriBidiStrIndex)lineLen);
551 delete[] charTypes;
552 return charType == FRIBIDI_PAR_RTL;
/* Selectable charsets: iconv charset name and the caption shown for it.
   The list ends with an entry whose members are both null. */
static struct SCharsetMapping
{
  const char* charset;
  const char* caption;
} g_charsets[] = {
  { "ISO-8859-1", "Western Europe (ISO)" },
  { "ISO-8859-2", "Central Europe (ISO)" },
  { "ISO-8859-3", "South Europe (ISO)" },
  { "ISO-8859-4", "Baltic (ISO)" },
  { "ISO-8859-5", "Cyrillic (ISO)" },
  { "ISO-8859-6", "Arabic (ISO)" },
  { "ISO-8859-7", "Greek (ISO)" },
  { "ISO-8859-8", "Hebrew (ISO)" },
  { "ISO-8859-9", "Turkish (ISO)" },
  { "CP1250", "Central Europe (Windows)" },
  { "CP1251", "Cyrillic (Windows)" },
  { "CP1252", "Western Europe (Windows)" },
  { "CP1253", "Greek (Windows)" },
  { "CP1254", "Turkish (Windows)" },
  { "CP1255", "Hebrew (Windows)" },
  { "CP1256", "Arabic (Windows)" },
  { "CP1257", "Baltic (Windows)" },
  { "CP1258", "Vietnamese (Windows)" },
  { "CP874", "Thai (Windows)" },
  { "BIG5", "Chinese Traditional (Big5)" },
  { "GBK", "Chinese Simplified (GBK)" },
  { "SHIFT_JIS", "Japanese (Shift-JIS)" },
  { "CP949", "Korean" },
  { "BIG5-HKSCS", "Hong Kong (Big5-HKSCS)" },
  { nullptr, nullptr }
};
587 CCharsetConverter::CCharsetConverter() = default;
589 void CCharsetConverter::OnSettingChanged(const std::shared_ptr<const CSetting>& setting)
591 if (setting == NULL)
592 return;
594 const std::string& settingId = setting->GetId();
595 if (settingId == CSettings::SETTING_LOCALE_CHARSET)
596 resetUserCharset();
597 else if (settingId == CSettings::SETTING_SUBTITLES_CHARSET)
598 resetSubtitleCharset();
601 void CCharsetConverter::clear()
605 std::vector<std::string> CCharsetConverter::getCharsetLabels()
607 std::vector<std::string> lab;
608 for(SCharsetMapping* c = g_charsets; c->charset; c++)
609 lab.emplace_back(c->caption);
611 return lab;
614 std::string CCharsetConverter::getCharsetLabelByName(const std::string& charsetName)
616 for(SCharsetMapping* c = g_charsets; c->charset; c++)
618 if (StringUtils::EqualsNoCase(charsetName,c->charset))
619 return c->caption;
622 return "";
625 std::string CCharsetConverter::getCharsetNameByLabel(const std::string& charsetLabel)
627 for(SCharsetMapping* c = g_charsets; c->charset; c++)
629 if (StringUtils::EqualsNoCase(charsetLabel, c->caption))
630 return c->charset;
633 return "";
636 void CCharsetConverter::reset(void)
638 for (CConverterType& conversion : CInnerConverter::m_stdConversion)
639 conversion.Reset();
642 void CCharsetConverter::resetSystemCharset(void)
644 CInnerConverter::m_stdConversion[Utf8ToSystem].Reset();
645 CInnerConverter::m_stdConversion[SystemToUtf8].Reset();
648 void CCharsetConverter::resetUserCharset(void)
650 CInnerConverter::m_stdConversion[UserCharsetToUtf8].Reset();
651 CInnerConverter::m_stdConversion[UserCharsetToUtf8].Reset();
652 CInnerConverter::m_stdConversion[Utf32ToUserCharset].Reset();
653 resetSubtitleCharset();
656 void CCharsetConverter::resetSubtitleCharset(void)
658 CInnerConverter::m_stdConversion[SubtitleCharsetToUtf8].Reset();
661 void CCharsetConverter::reinitCharsetsFromSettings(void)
663 resetUserCharset(); // this will also reinit Subtitle charsets
666 bool CCharsetConverter::utf8ToUtf32(const std::string& utf8StringSrc, std::u32string& utf32StringDst, bool failOnBadChar /*= true*/)
668 return CInnerConverter::stdConvert(Utf8ToUtf32, utf8StringSrc, utf32StringDst, failOnBadChar);
671 std::u32string CCharsetConverter::utf8ToUtf32(const std::string& utf8StringSrc, bool failOnBadChar /*= true*/)
673 std::u32string converted;
674 utf8ToUtf32(utf8StringSrc, converted, failOnBadChar);
675 return converted;
678 bool CCharsetConverter::utf8ToUtf32Visual(const std::string& utf8StringSrc, std::u32string& utf32StringDst, bool bVisualBiDiFlip /*= false*/, bool forceLTRReadingOrder /*= false*/, bool failOnBadChar /*= false*/)
680 if (bVisualBiDiFlip)
682 std::u32string converted;
683 if (!CInnerConverter::stdConvert(Utf8ToUtf32, utf8StringSrc, converted, failOnBadChar))
684 return false;
686 return CInnerConverter::logicalToVisualBiDi(converted, utf32StringDst, forceLTRReadingOrder ? FRIBIDI_TYPE_LTR : FRIBIDI_TYPE_PDF, failOnBadChar);
688 return CInnerConverter::stdConvert(Utf8ToUtf32, utf8StringSrc, utf32StringDst, failOnBadChar);
691 bool CCharsetConverter::utf32ToUtf8(const std::u32string& utf32StringSrc, std::string& utf8StringDst, bool failOnBadChar /*= true*/)
693 return CInnerConverter::stdConvert(Utf32ToUtf8, utf32StringSrc, utf8StringDst, failOnBadChar);
696 std::string CCharsetConverter::utf32ToUtf8(const std::u32string& utf32StringSrc, bool failOnBadChar /*= false*/)
698 std::string converted;
699 utf32ToUtf8(utf32StringSrc, converted, failOnBadChar);
700 return converted;
703 bool CCharsetConverter::utf32ToW(const std::u32string& utf32StringSrc, std::wstring& wStringDst, bool failOnBadChar /*= true*/)
705 #ifdef WCHAR_IS_UCS_4
706 wStringDst.assign((const wchar_t*)utf32StringSrc.c_str(), utf32StringSrc.length());
707 return true;
708 #else // !WCHAR_IS_UCS_4
709 return CInnerConverter::stdConvert(Utf32ToW, utf32StringSrc, wStringDst, failOnBadChar);
710 #endif // !WCHAR_IS_UCS_4
713 bool CCharsetConverter::utf32logicalToVisualBiDi(const std::u32string& logicalStringSrc,
714 std::u32string& visualStringDst,
715 bool forceLTRReadingOrder /*= false*/,
716 bool failOnBadString /*= false*/,
717 int* visualToLogicalMap /*= nullptr*/)
719 return CInnerConverter::logicalToVisualBiDi(
720 logicalStringSrc, visualStringDst, forceLTRReadingOrder ? FRIBIDI_TYPE_LTR : FRIBIDI_TYPE_PDF,
721 failOnBadString, visualToLogicalMap);
724 bool CCharsetConverter::wToUtf32(const std::wstring& wStringSrc, std::u32string& utf32StringDst, bool failOnBadChar /*= true*/)
726 #ifdef WCHAR_IS_UCS_4
727 /* UCS-4 is almost equal to UTF-32, but UTF-32 has strict limits on possible values, while UCS-4 is usually unchecked.
728 * With this "conversion" we ensure that output will be valid UTF-32 string. */
729 #endif
730 return CInnerConverter::stdConvert(WToUtf32, wStringSrc, utf32StringDst, failOnBadChar);
733 // The bVisualBiDiFlip forces a flip of characters for hebrew/arabic languages, only set to false if the flipping
734 // of the string is already made or the string is not displayed in the GUI
735 bool CCharsetConverter::utf8ToW(const std::string& utf8StringSrc, std::wstring& wStringDst, bool bVisualBiDiFlip /*= true*/,
736 bool forceLTRReadingOrder /*= false*/, bool failOnBadChar /*= false*/)
738 // Try to flip hebrew/arabic characters, if any
739 if (bVisualBiDiFlip)
741 wStringDst.clear();
742 std::u32string utf32str;
743 if (!CInnerConverter::stdConvert(Utf8ToUtf32, utf8StringSrc, utf32str, failOnBadChar))
744 return false;
746 std::u32string utf32flipped;
747 const bool bidiResult = CInnerConverter::logicalToVisualBiDi(utf32str, utf32flipped, forceLTRReadingOrder ? FRIBIDI_TYPE_LTR : FRIBIDI_TYPE_PDF, failOnBadChar);
749 return CInnerConverter::stdConvert(Utf32ToW, utf32flipped, wStringDst, failOnBadChar) && bidiResult;
752 return CInnerConverter::stdConvert(Utf8toW, utf8StringSrc, wStringDst, failOnBadChar);
755 bool CCharsetConverter::subtitleCharsetToUtf8(const std::string& stringSrc, std::string& utf8StringDst)
757 return CInnerConverter::stdConvert(SubtitleCharsetToUtf8, stringSrc, utf8StringDst, false);
760 bool CCharsetConverter::fromW(const std::wstring& wStringSrc,
761 std::string& stringDst, const std::string& enc)
763 return CInnerConverter::customConvert(WCHAR_CHARSET, enc, wStringSrc, stringDst);
766 bool CCharsetConverter::toW(const std::string& stringSrc,
767 std::wstring& wStringDst, const std::string& enc)
769 return CInnerConverter::customConvert(enc, WCHAR_CHARSET, stringSrc, wStringDst);
772 bool CCharsetConverter::utf8ToStringCharset(const std::string& utf8StringSrc, std::string& stringDst)
774 return CInnerConverter::stdConvert(Utf8ToUserCharset, utf8StringSrc, stringDst);
777 bool CCharsetConverter::utf8ToStringCharset(std::string& stringSrcDst)
779 std::string strSrc(stringSrcDst);
780 return utf8ToStringCharset(strSrc, stringSrcDst);
783 bool CCharsetConverter::ToUtf8(const std::string& strSourceCharset, const std::string& stringSrc, std::string& utf8StringDst, bool failOnBadChar /*= false*/)
785 if (strSourceCharset == "UTF-8")
786 { // simple case - no conversion necessary
787 utf8StringDst = stringSrc;
788 return true;
791 return CInnerConverter::customConvert(strSourceCharset, "UTF-8", stringSrc, utf8StringDst, failOnBadChar);
794 bool CCharsetConverter::utf8To(const std::string& strDestCharset, const std::string& utf8StringSrc, std::string& stringDst)
796 if (strDestCharset == "UTF-8")
797 { // simple case - no conversion necessary
798 stringDst = utf8StringSrc;
799 return true;
802 return CInnerConverter::customConvert(UTF8_SOURCE, strDestCharset, utf8StringSrc, stringDst);
805 bool CCharsetConverter::utf8To(const std::string& strDestCharset, const std::string& utf8StringSrc, std::u16string& utf16StringDst)
807 return CInnerConverter::customConvert(UTF8_SOURCE, strDestCharset, utf8StringSrc, utf16StringDst);
810 bool CCharsetConverter::utf8To(const std::string& strDestCharset, const std::string& utf8StringSrc, std::u32string& utf32StringDst)
812 return CInnerConverter::customConvert(UTF8_SOURCE, strDestCharset, utf8StringSrc, utf32StringDst);
815 bool CCharsetConverter::unknownToUTF8(std::string& stringSrcDst)
817 std::string source(stringSrcDst);
818 return unknownToUTF8(source, stringSrcDst);
821 bool CCharsetConverter::unknownToUTF8(const std::string& stringSrc, std::string& utf8StringDst, bool failOnBadChar /*= false*/)
823 // checks whether it's utf8 already, and if not converts using the sourceCharset if given, else the string charset
824 if (CUtf8Utils::isValidUtf8(stringSrc))
826 utf8StringDst = stringSrc;
827 return true;
829 return CInnerConverter::stdConvert(UserCharsetToUtf8, stringSrc, utf8StringDst, failOnBadChar);
832 bool CCharsetConverter::wToUTF8(const std::wstring& wStringSrc, std::string& utf8StringDst, bool failOnBadChar /*= false*/)
834 return CInnerConverter::stdConvert(WtoUtf8, wStringSrc, utf8StringDst, failOnBadChar);
837 bool CCharsetConverter::utf16BEtoUTF8(const std::u16string& utf16StringSrc, std::string& utf8StringDst)
839 return CInnerConverter::stdConvert(Utf16BEtoUtf8, utf16StringSrc, utf8StringDst);
842 bool CCharsetConverter::utf16BEtoUTF8(const std::string& utf16StringSrc, std::string& utf8StringDst)
844 return CInnerConverter::stdConvert(Utf16BEtoUtf8, utf16StringSrc, utf8StringDst);
847 bool CCharsetConverter::utf16LEtoUTF8(const std::u16string& utf16StringSrc,
848 std::string& utf8StringDst)
850 return CInnerConverter::stdConvert(Utf16LEtoUtf8, utf16StringSrc, utf8StringDst);
853 bool CCharsetConverter::ucs2ToUTF8(const std::u16string& ucs2StringSrc, std::string& utf8StringDst)
855 return CInnerConverter::stdConvert(Ucs2CharsetToUtf8, ucs2StringSrc,utf8StringDst);
858 bool CCharsetConverter::utf16LEtoW(const std::u16string& utf16String, std::wstring& wString)
860 return CInnerConverter::stdConvert(Utf16LEtoW, utf16String, wString);
863 bool CCharsetConverter::utf32ToStringCharset(const std::u32string& utf32StringSrc, std::string& stringDst)
865 return CInnerConverter::stdConvert(Utf32ToUserCharset, utf32StringSrc, stringDst);
868 bool CCharsetConverter::utf8ToSystem(std::string& stringSrcDst, bool failOnBadChar /*= false*/)
870 std::string strSrc(stringSrcDst);
871 return CInnerConverter::stdConvert(Utf8ToSystem, strSrc, stringSrcDst, failOnBadChar);
874 bool CCharsetConverter::systemToUtf8(const std::string& sysStringSrc, std::string& utf8StringDst, bool failOnBadChar /*= false*/)
876 return CInnerConverter::stdConvert(SystemToUtf8, sysStringSrc, utf8StringDst, failOnBadChar);
879 bool CCharsetConverter::MacintoshToUTF8(const std::string& macStringSrc, std::string& utf8StringDst)
881 return CInnerConverter::stdConvert(MacintoshToUtf8, macStringSrc, utf8StringDst);
884 bool CCharsetConverter::utf8logicalToVisualBiDi(const std::string& utf8StringSrc, std::string& utf8StringDst, bool failOnBadString /*= false*/)
886 utf8StringDst.clear();
887 std::u32string utf32flipped;
888 if (!utf8ToUtf32Visual(utf8StringSrc, utf32flipped, true, true, failOnBadString))
889 return false;
891 return CInnerConverter::stdConvert(Utf32ToUtf8, utf32flipped, utf8StringDst, failOnBadString);
894 bool CCharsetConverter::utf8IsRTLBidiDirection(const std::string& utf8String)
896 return CInnerConverter::isBidiDirectionRTL(utf8String);
899 void CCharsetConverter::SettingOptionsCharsetsFiller(const SettingConstPtr& setting,
900 std::vector<StringSettingOption>& list,
901 std::string& current,
902 void* data)
904 std::vector<std::string> vecCharsets = g_charsetConverter.getCharsetLabels();
905 sort(vecCharsets.begin(), vecCharsets.end(), sortstringbyname());
907 list.emplace_back(g_localizeStrings.Get(13278), "DEFAULT"); // "Default"
908 for (int i = 0; i < (int) vecCharsets.size(); ++i)
909 list.emplace_back(vecCharsets[i], g_charsetConverter.getCharsetNameByLabel(vecCharsets[i]));