//===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----===
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SwapByteOrder.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

namespace llvm {
19 bool ConvertUTF8toWide(unsigned WideCharWidth
, llvm::StringRef Source
,
20 char *&ResultPtr
, const UTF8
*&ErrorPtr
) {
21 assert(WideCharWidth
== 1 || WideCharWidth
== 2 || WideCharWidth
== 4);
22 ConversionResult result
= conversionOK
;
23 // Copy the character span over.
24 if (WideCharWidth
== 1) {
25 const UTF8
*Pos
= reinterpret_cast<const UTF8
*>(Source
.begin());
26 if (!isLegalUTF8String(&Pos
, reinterpret_cast<const UTF8
*>(Source
.end()))) {
27 result
= sourceIllegal
;
30 memcpy(ResultPtr
, Source
.data(), Source
.size());
31 ResultPtr
+= Source
.size();
33 } else if (WideCharWidth
== 2) {
34 const UTF8
*sourceStart
= (const UTF8
*)Source
.data();
35 // FIXME: Make the type of the result buffer correct instead of
36 // using reinterpret_cast.
37 UTF16
*targetStart
= reinterpret_cast<UTF16
*>(ResultPtr
);
38 ConversionFlags flags
= strictConversion
;
39 result
= ConvertUTF8toUTF16(
40 &sourceStart
, sourceStart
+ Source
.size(),
41 &targetStart
, targetStart
+ Source
.size(), flags
);
42 if (result
== conversionOK
)
43 ResultPtr
= reinterpret_cast<char*>(targetStart
);
45 ErrorPtr
= sourceStart
;
46 } else if (WideCharWidth
== 4) {
47 const UTF8
*sourceStart
= (const UTF8
*)Source
.data();
48 // FIXME: Make the type of the result buffer correct instead of
49 // using reinterpret_cast.
50 UTF32
*targetStart
= reinterpret_cast<UTF32
*>(ResultPtr
);
51 ConversionFlags flags
= strictConversion
;
52 result
= ConvertUTF8toUTF32(
53 &sourceStart
, sourceStart
+ Source
.size(),
54 &targetStart
, targetStart
+ Source
.size(), flags
);
55 if (result
== conversionOK
)
56 ResultPtr
= reinterpret_cast<char*>(targetStart
);
58 ErrorPtr
= sourceStart
;
60 assert((result
!= targetExhausted
)
61 && "ConvertUTF8toUTFXX exhausted target buffer");
62 return result
== conversionOK
;
65 bool ConvertCodePointToUTF8(unsigned Source
, char *&ResultPtr
) {
66 const UTF32
*SourceStart
= &Source
;
67 const UTF32
*SourceEnd
= SourceStart
+ 1;
68 UTF8
*TargetStart
= reinterpret_cast<UTF8
*>(ResultPtr
);
69 UTF8
*TargetEnd
= TargetStart
+ 4;
70 ConversionResult CR
= ConvertUTF32toUTF8(&SourceStart
, SourceEnd
,
71 &TargetStart
, TargetEnd
,
73 if (CR
!= conversionOK
)
76 ResultPtr
= reinterpret_cast<char*>(TargetStart
);
80 bool hasUTF16ByteOrderMark(ArrayRef
<char> S
) {
81 return (S
.size() >= 2 &&
82 ((S
[0] == '\xff' && S
[1] == '\xfe') ||
83 (S
[0] == '\xfe' && S
[1] == '\xff')));
86 bool convertUTF16ToUTF8String(ArrayRef
<char> SrcBytes
, std::string
&Out
) {
89 // Error out on an uneven byte count.
90 if (SrcBytes
.size() % 2)
93 // Avoid OOB by returning early on empty input.
97 const UTF16
*Src
= reinterpret_cast<const UTF16
*>(SrcBytes
.begin());
98 const UTF16
*SrcEnd
= reinterpret_cast<const UTF16
*>(SrcBytes
.end());
100 assert((uintptr_t)Src
% sizeof(UTF16
) == 0);
102 // Byteswap if necessary.
103 std::vector
<UTF16
> ByteSwapped
;
104 if (Src
[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED
) {
105 ByteSwapped
.insert(ByteSwapped
.end(), Src
, SrcEnd
);
106 for (unsigned I
= 0, E
= ByteSwapped
.size(); I
!= E
; ++I
)
107 ByteSwapped
[I
] = llvm::ByteSwap_16(ByteSwapped
[I
]);
108 Src
= &ByteSwapped
[0];
109 SrcEnd
= &ByteSwapped
[ByteSwapped
.size() - 1] + 1;
112 // Skip the BOM for conversion.
113 if (Src
[0] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE
)
116 // Just allocate enough space up front. We'll shrink it later. Allocate
117 // enough that we can fit a null terminator without reallocating.
118 Out
.resize(SrcBytes
.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT
+ 1);
119 UTF8
*Dst
= reinterpret_cast<UTF8
*>(&Out
[0]);
120 UTF8
*DstEnd
= Dst
+ Out
.size();
122 ConversionResult CR
=
123 ConvertUTF16toUTF8(&Src
, SrcEnd
, &Dst
, DstEnd
, strictConversion
);
124 assert(CR
!= targetExhausted
);
126 if (CR
!= conversionOK
) {
131 Out
.resize(reinterpret_cast<char *>(Dst
) - &Out
[0]);
137 bool convertUTF16ToUTF8String(ArrayRef
<UTF16
> Src
, std::string
&Out
)
139 return convertUTF16ToUTF8String(
140 llvm::ArrayRef
<char>(reinterpret_cast<const char *>(Src
.data()),
141 Src
.size() * sizeof(UTF16
)), Out
);
144 bool convertUTF8ToUTF16String(StringRef SrcUTF8
,
145 SmallVectorImpl
<UTF16
> &DstUTF16
) {
146 assert(DstUTF16
.empty());
148 // Avoid OOB by returning early on empty input.
149 if (SrcUTF8
.empty()) {
150 DstUTF16
.push_back(0);
155 const UTF8
*Src
= reinterpret_cast<const UTF8
*>(SrcUTF8
.begin());
156 const UTF8
*SrcEnd
= reinterpret_cast<const UTF8
*>(SrcUTF8
.end());
158 // Allocate the same number of UTF-16 code units as UTF-8 code units. Encoding
159 // as UTF-16 should always require the same amount or less code units than the
160 // UTF-8 encoding. Allocate one extra byte for the null terminator though,
161 // so that someone calling DstUTF16.data() gets a null terminated string.
162 // We resize down later so we don't have to worry that this over allocates.
163 DstUTF16
.resize(SrcUTF8
.size()+1);
164 UTF16
*Dst
= &DstUTF16
[0];
165 UTF16
*DstEnd
= Dst
+ DstUTF16
.size();
167 ConversionResult CR
=
168 ConvertUTF8toUTF16(&Src
, SrcEnd
, &Dst
, DstEnd
, strictConversion
);
169 assert(CR
!= targetExhausted
);
171 if (CR
!= conversionOK
) {
176 DstUTF16
.resize(Dst
- &DstUTF16
[0]);
177 DstUTF16
.push_back(0);
// The wide-character helpers below branch on sizeof(wchar_t) and only handle
// these three widths, so reject any other platform at compile time.
static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 ||
                  sizeof(wchar_t) == 4,
              "Expected wchar_t to be 1, 2, or 4 bytes");
186 template <typename TResult
>
187 static inline bool ConvertUTF8toWideInternal(llvm::StringRef Source
,
189 // Even in the case of UTF-16, the number of bytes in a UTF-8 string is
190 // at least as large as the number of elements in the resulting wide
191 // string, because surrogate pairs take at least 4 bytes in UTF-8.
192 Result
.resize(Source
.size() + 1);
193 char *ResultPtr
= reinterpret_cast<char *>(&Result
[0]);
194 const UTF8
*ErrorPtr
;
195 if (!ConvertUTF8toWide(sizeof(wchar_t), Source
, ResultPtr
, ErrorPtr
)) {
199 Result
.resize(reinterpret_cast<wchar_t *>(ResultPtr
) - &Result
[0]);
203 bool ConvertUTF8toWide(llvm::StringRef Source
, std::wstring
&Result
) {
204 return ConvertUTF8toWideInternal(Source
, Result
);
207 bool ConvertUTF8toWide(const char *Source
, std::wstring
&Result
) {
212 return ConvertUTF8toWide(llvm::StringRef(Source
), Result
);
215 bool convertWideToUTF8(const std::wstring
&Source
, std::string
&Result
) {
216 if (sizeof(wchar_t) == 1) {
217 const UTF8
*Start
= reinterpret_cast<const UTF8
*>(Source
.data());
219 reinterpret_cast<const UTF8
*>(Source
.data() + Source
.size());
220 if (!isLegalUTF8String(&Start
, End
))
222 Result
.resize(Source
.size());
223 memcpy(&Result
[0], Source
.data(), Source
.size());
225 } else if (sizeof(wchar_t) == 2) {
226 return convertUTF16ToUTF8String(
227 llvm::ArrayRef
<UTF16
>(reinterpret_cast<const UTF16
*>(Source
.data()),
230 } else if (sizeof(wchar_t) == 4) {
231 const UTF32
*Start
= reinterpret_cast<const UTF32
*>(Source
.data());
233 reinterpret_cast<const UTF32
*>(Source
.data() + Source
.size());
234 Result
.resize(UNI_MAX_UTF8_BYTES_PER_CODE_POINT
* Source
.size());
235 UTF8
*ResultPtr
= reinterpret_cast<UTF8
*>(&Result
[0]);
236 UTF8
*ResultEnd
= reinterpret_cast<UTF8
*>(&Result
[0] + Result
.size());
237 if (ConvertUTF32toUTF8(&Start
, End
, &ResultPtr
, ResultEnd
,
238 strictConversion
) == conversionOK
) {
239 Result
.resize(reinterpret_cast<char *>(ResultPtr
) - &Result
[0]);
247 "Control should never reach this point; see static_assert further up");
} // end namespace llvm