1 //===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----===
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "llvm/ADT/ArrayRef.h"
10 #include "llvm/ADT/StringRef.h"
11 #include "llvm/Support/ConvertUTF.h"
12 #include "llvm/Support/ErrorHandling.h"
13 #include "llvm/Support/SwapByteOrder.h"
19 bool ConvertUTF8toWide(unsigned WideCharWidth
, llvm::StringRef Source
,
20 char *&ResultPtr
, const UTF8
*&ErrorPtr
) {
21 assert(WideCharWidth
== 1 || WideCharWidth
== 2 || WideCharWidth
== 4);
22 ConversionResult result
= conversionOK
;
23 // Copy the character span over.
24 if (WideCharWidth
== 1) {
25 const UTF8
*Pos
= reinterpret_cast<const UTF8
*>(Source
.begin());
26 if (!isLegalUTF8String(&Pos
, reinterpret_cast<const UTF8
*>(Source
.end()))) {
27 result
= sourceIllegal
;
30 memcpy(ResultPtr
, Source
.data(), Source
.size());
31 ResultPtr
+= Source
.size();
33 } else if (WideCharWidth
== 2) {
34 const UTF8
*sourceStart
= (const UTF8
*)Source
.data();
35 // FIXME: Make the type of the result buffer correct instead of
36 // using reinterpret_cast.
37 UTF16
*targetStart
= reinterpret_cast<UTF16
*>(ResultPtr
);
38 ConversionFlags flags
= strictConversion
;
40 ConvertUTF8toUTF16(&sourceStart
, sourceStart
+ Source
.size(),
41 &targetStart
, targetStart
+ Source
.size(), flags
);
42 if (result
== conversionOK
)
43 ResultPtr
= reinterpret_cast<char *>(targetStart
);
45 ErrorPtr
= sourceStart
;
46 } else if (WideCharWidth
== 4) {
47 const UTF8
*sourceStart
= (const UTF8
*)Source
.data();
48 // FIXME: Make the type of the result buffer correct instead of
49 // using reinterpret_cast.
50 UTF32
*targetStart
= reinterpret_cast<UTF32
*>(ResultPtr
);
51 ConversionFlags flags
= strictConversion
;
53 ConvertUTF8toUTF32(&sourceStart
, sourceStart
+ Source
.size(),
54 &targetStart
, targetStart
+ Source
.size(), flags
);
55 if (result
== conversionOK
)
56 ResultPtr
= reinterpret_cast<char *>(targetStart
);
58 ErrorPtr
= sourceStart
;
60 assert((result
!= targetExhausted
) &&
61 "ConvertUTF8toUTFXX exhausted target buffer");
62 return result
== conversionOK
;
65 bool ConvertCodePointToUTF8(unsigned Source
, char *&ResultPtr
) {
66 const UTF32
*SourceStart
= &Source
;
67 const UTF32
*SourceEnd
= SourceStart
+ 1;
68 UTF8
*TargetStart
= reinterpret_cast<UTF8
*>(ResultPtr
);
69 UTF8
*TargetEnd
= TargetStart
+ 4;
70 ConversionResult CR
= ConvertUTF32toUTF8(
71 &SourceStart
, SourceEnd
, &TargetStart
, TargetEnd
, strictConversion
);
72 if (CR
!= conversionOK
)
75 ResultPtr
= reinterpret_cast<char *>(TargetStart
);
79 bool hasUTF16ByteOrderMark(ArrayRef
<char> S
) {
80 return (S
.size() >= 2 && ((S
[0] == '\xff' && S
[1] == '\xfe') ||
81 (S
[0] == '\xfe' && S
[1] == '\xff')));
84 bool convertUTF16ToUTF8String(ArrayRef
<char> SrcBytes
, std::string
&Out
) {
87 // Error out on an uneven byte count.
88 if (SrcBytes
.size() % 2)
91 // Avoid OOB by returning early on empty input.
95 const UTF16
*Src
= reinterpret_cast<const UTF16
*>(SrcBytes
.begin());
96 const UTF16
*SrcEnd
= reinterpret_cast<const UTF16
*>(SrcBytes
.end());
98 assert((uintptr_t)Src
% sizeof(UTF16
) == 0);
100 // Byteswap if necessary.
101 std::vector
<UTF16
> ByteSwapped
;
102 if (Src
[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED
) {
103 ByteSwapped
.insert(ByteSwapped
.end(), Src
, SrcEnd
);
104 for (UTF16
&I
: ByteSwapped
)
105 I
= llvm::byteswap
<uint16_t>(I
);
106 Src
= &ByteSwapped
[0];
107 SrcEnd
= &ByteSwapped
[ByteSwapped
.size() - 1] + 1;
110 // Skip the BOM for conversion.
111 if (Src
[0] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE
)
114 // Just allocate enough space up front. We'll shrink it later. Allocate
115 // enough that we can fit a null terminator without reallocating.
116 Out
.resize(SrcBytes
.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT
+ 1);
117 UTF8
*Dst
= reinterpret_cast<UTF8
*>(&Out
[0]);
118 UTF8
*DstEnd
= Dst
+ Out
.size();
120 ConversionResult CR
=
121 ConvertUTF16toUTF8(&Src
, SrcEnd
, &Dst
, DstEnd
, strictConversion
);
122 assert(CR
!= targetExhausted
);
124 if (CR
!= conversionOK
) {
129 Out
.resize(reinterpret_cast<char *>(Dst
) - &Out
[0]);
135 bool convertUTF16ToUTF8String(ArrayRef
<UTF16
> Src
, std::string
&Out
) {
136 return convertUTF16ToUTF8String(
137 llvm::ArrayRef
<char>(reinterpret_cast<const char *>(Src
.data()),
138 Src
.size() * sizeof(UTF16
)),
142 bool convertUTF32ToUTF8String(ArrayRef
<char> SrcBytes
, std::string
&Out
) {
145 // Error out on an uneven byte count.
146 if (SrcBytes
.size() % 4)
149 // Avoid OOB by returning early on empty input.
150 if (SrcBytes
.empty())
153 const UTF32
*Src
= reinterpret_cast<const UTF32
*>(SrcBytes
.begin());
154 const UTF32
*SrcEnd
= reinterpret_cast<const UTF32
*>(SrcBytes
.end());
156 assert((uintptr_t)Src
% sizeof(UTF32
) == 0);
158 // Byteswap if necessary.
159 std::vector
<UTF32
> ByteSwapped
;
160 if (Src
[0] == UNI_UTF32_BYTE_ORDER_MARK_SWAPPED
) {
161 ByteSwapped
.insert(ByteSwapped
.end(), Src
, SrcEnd
);
162 for (UTF32
&I
: ByteSwapped
)
163 I
= llvm::byteswap
<uint32_t>(I
);
164 Src
= &ByteSwapped
[0];
165 SrcEnd
= &ByteSwapped
[ByteSwapped
.size() - 1] + 1;
168 // Skip the BOM for conversion.
169 if (Src
[0] == UNI_UTF32_BYTE_ORDER_MARK_NATIVE
)
172 // Just allocate enough space up front. We'll shrink it later. Allocate
173 // enough that we can fit a null terminator without reallocating.
174 Out
.resize(SrcBytes
.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT
+ 1);
175 UTF8
*Dst
= reinterpret_cast<UTF8
*>(&Out
[0]);
176 UTF8
*DstEnd
= Dst
+ Out
.size();
178 ConversionResult CR
=
179 ConvertUTF32toUTF8(&Src
, SrcEnd
, &Dst
, DstEnd
, strictConversion
);
180 assert(CR
!= targetExhausted
);
182 if (CR
!= conversionOK
) {
187 Out
.resize(reinterpret_cast<char *>(Dst
) - &Out
[0]);
193 bool convertUTF32ToUTF8String(ArrayRef
<UTF32
> Src
, std::string
&Out
) {
194 return convertUTF32ToUTF8String(
195 llvm::ArrayRef
<char>(reinterpret_cast<const char *>(Src
.data()),
196 Src
.size() * sizeof(UTF32
)),
200 bool convertUTF8ToUTF16String(StringRef SrcUTF8
,
201 SmallVectorImpl
<UTF16
> &DstUTF16
) {
202 assert(DstUTF16
.empty());
204 // Avoid OOB by returning early on empty input.
205 if (SrcUTF8
.empty()) {
206 DstUTF16
.push_back(0);
211 const UTF8
*Src
= reinterpret_cast<const UTF8
*>(SrcUTF8
.begin());
212 const UTF8
*SrcEnd
= reinterpret_cast<const UTF8
*>(SrcUTF8
.end());
214 // Allocate the same number of UTF-16 code units as UTF-8 code units. Encoding
215 // as UTF-16 should always require the same amount or less code units than the
216 // UTF-8 encoding. Allocate one extra byte for the null terminator though,
217 // so that someone calling DstUTF16.data() gets a null terminated string.
218 // We resize down later so we don't have to worry that this over allocates.
219 DstUTF16
.resize(SrcUTF8
.size()+1);
220 UTF16
*Dst
= &DstUTF16
[0];
221 UTF16
*DstEnd
= Dst
+ DstUTF16
.size();
223 ConversionResult CR
=
224 ConvertUTF8toUTF16(&Src
, SrcEnd
, &Dst
, DstEnd
, strictConversion
);
225 assert(CR
!= targetExhausted
);
227 if (CR
!= conversionOK
) {
232 DstUTF16
.resize(Dst
- &DstUTF16
[0]);
233 DstUTF16
.push_back(0);
// The wchar_t conversion routines in this file select an encoding from
// sizeof(wchar_t) and only handle widths of 1, 2 and 4 bytes, so reject any
// other width at compile time.
static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 ||
                  sizeof(wchar_t) == 4,
              "Expected wchar_t to be 1, 2, or 4 bytes");
242 template <typename TResult
>
243 static inline bool ConvertUTF8toWideInternal(llvm::StringRef Source
,
245 // Even in the case of UTF-16, the number of bytes in a UTF-8 string is
246 // at least as large as the number of elements in the resulting wide
247 // string, because surrogate pairs take at least 4 bytes in UTF-8.
248 Result
.resize(Source
.size() + 1);
249 char *ResultPtr
= reinterpret_cast<char *>(&Result
[0]);
250 const UTF8
*ErrorPtr
;
251 if (!ConvertUTF8toWide(sizeof(wchar_t), Source
, ResultPtr
, ErrorPtr
)) {
255 Result
.resize(reinterpret_cast<wchar_t *>(ResultPtr
) - &Result
[0]);
259 bool ConvertUTF8toWide(llvm::StringRef Source
, std::wstring
&Result
) {
260 return ConvertUTF8toWideInternal(Source
, Result
);
263 bool ConvertUTF8toWide(const char *Source
, std::wstring
&Result
) {
268 return ConvertUTF8toWide(llvm::StringRef(Source
), Result
);
271 bool convertWideToUTF8(const std::wstring
&Source
, std::string
&Result
) {
272 if (sizeof(wchar_t) == 1) {
273 const UTF8
*Start
= reinterpret_cast<const UTF8
*>(Source
.data());
275 reinterpret_cast<const UTF8
*>(Source
.data() + Source
.size());
276 if (!isLegalUTF8String(&Start
, End
))
278 Result
.resize(Source
.size());
279 memcpy(&Result
[0], Source
.data(), Source
.size());
281 } else if (sizeof(wchar_t) == 2) {
282 return convertUTF16ToUTF8String(
283 llvm::ArrayRef
<UTF16
>(reinterpret_cast<const UTF16
*>(Source
.data()),
286 } else if (sizeof(wchar_t) == 4) {
287 const UTF32
*Start
= reinterpret_cast<const UTF32
*>(Source
.data());
289 reinterpret_cast<const UTF32
*>(Source
.data() + Source
.size());
290 Result
.resize(UNI_MAX_UTF8_BYTES_PER_CODE_POINT
* Source
.size());
291 UTF8
*ResultPtr
= reinterpret_cast<UTF8
*>(&Result
[0]);
292 UTF8
*ResultEnd
= reinterpret_cast<UTF8
*>(&Result
[0] + Result
.size());
293 if (ConvertUTF32toUTF8(&Start
, End
, &ResultPtr
, ResultEnd
,
294 strictConversion
) == conversionOK
) {
295 Result
.resize(reinterpret_cast<char *>(ResultPtr
) - &Result
[0]);
303 "Control should never reach this point; see static_assert further up");
307 } // end namespace llvm