//===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----===
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SwapByteOrder.h"
#include <cassert>
#include <cstring>
#include <string>
#include <vector>

namespace llvm {

19 bool ConvertUTF8toWide(unsigned WideCharWidth
, llvm::StringRef Source
,
20 char *&ResultPtr
, const UTF8
*&ErrorPtr
) {
21 assert(WideCharWidth
== 1 || WideCharWidth
== 2 || WideCharWidth
== 4);
22 ConversionResult result
= conversionOK
;
23 // Copy the character span over.
24 if (WideCharWidth
== 1) {
25 const UTF8
*Pos
= reinterpret_cast<const UTF8
*>(Source
.begin());
26 if (!isLegalUTF8String(&Pos
, reinterpret_cast<const UTF8
*>(Source
.end()))) {
27 result
= sourceIllegal
;
30 memcpy(ResultPtr
, Source
.data(), Source
.size());
31 ResultPtr
+= Source
.size();
33 } else if (WideCharWidth
== 2) {
34 const UTF8
*sourceStart
= (const UTF8
*)Source
.data();
35 // FIXME: Make the type of the result buffer correct instead of
36 // using reinterpret_cast.
37 UTF16
*targetStart
= reinterpret_cast<UTF16
*>(ResultPtr
);
38 ConversionFlags flags
= strictConversion
;
39 result
= ConvertUTF8toUTF16(
40 &sourceStart
, sourceStart
+ Source
.size(),
41 &targetStart
, targetStart
+ Source
.size(), flags
);
42 if (result
== conversionOK
)
43 ResultPtr
= reinterpret_cast<char*>(targetStart
);
45 ErrorPtr
= sourceStart
;
46 } else if (WideCharWidth
== 4) {
47 const UTF8
*sourceStart
= (const UTF8
*)Source
.data();
48 // FIXME: Make the type of the result buffer correct instead of
49 // using reinterpret_cast.
50 UTF32
*targetStart
= reinterpret_cast<UTF32
*>(ResultPtr
);
51 ConversionFlags flags
= strictConversion
;
52 result
= ConvertUTF8toUTF32(
53 &sourceStart
, sourceStart
+ Source
.size(),
54 &targetStart
, targetStart
+ Source
.size(), flags
);
55 if (result
== conversionOK
)
56 ResultPtr
= reinterpret_cast<char*>(targetStart
);
58 ErrorPtr
= sourceStart
;
60 assert((result
!= targetExhausted
)
61 && "ConvertUTF8toUTFXX exhausted target buffer");
62 return result
== conversionOK
;
65 bool ConvertCodePointToUTF8(unsigned Source
, char *&ResultPtr
) {
66 const UTF32
*SourceStart
= &Source
;
67 const UTF32
*SourceEnd
= SourceStart
+ 1;
68 UTF8
*TargetStart
= reinterpret_cast<UTF8
*>(ResultPtr
);
69 UTF8
*TargetEnd
= TargetStart
+ 4;
70 ConversionResult CR
= ConvertUTF32toUTF8(&SourceStart
, SourceEnd
,
71 &TargetStart
, TargetEnd
,
73 if (CR
!= conversionOK
)
76 ResultPtr
= reinterpret_cast<char*>(TargetStart
);
80 bool hasUTF16ByteOrderMark(ArrayRef
<char> S
) {
81 return (S
.size() >= 2 &&
82 ((S
[0] == '\xff' && S
[1] == '\xfe') ||
83 (S
[0] == '\xfe' && S
[1] == '\xff')));
86 bool convertUTF16ToUTF8String(ArrayRef
<char> SrcBytes
, std::string
&Out
) {
89 // Error out on an uneven byte count.
90 if (SrcBytes
.size() % 2)
93 // Avoid OOB by returning early on empty input.
97 const UTF16
*Src
= reinterpret_cast<const UTF16
*>(SrcBytes
.begin());
98 const UTF16
*SrcEnd
= reinterpret_cast<const UTF16
*>(SrcBytes
.end());
100 // Byteswap if necessary.
101 std::vector
<UTF16
> ByteSwapped
;
102 if (Src
[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED
) {
103 ByteSwapped
.insert(ByteSwapped
.end(), Src
, SrcEnd
);
104 for (unsigned I
= 0, E
= ByteSwapped
.size(); I
!= E
; ++I
)
105 ByteSwapped
[I
] = llvm::sys::SwapByteOrder_16(ByteSwapped
[I
]);
106 Src
= &ByteSwapped
[0];
107 SrcEnd
= &ByteSwapped
[ByteSwapped
.size() - 1] + 1;
110 // Skip the BOM for conversion.
111 if (Src
[0] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE
)
114 // Just allocate enough space up front. We'll shrink it later. Allocate
115 // enough that we can fit a null terminator without reallocating.
116 Out
.resize(SrcBytes
.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT
+ 1);
117 UTF8
*Dst
= reinterpret_cast<UTF8
*>(&Out
[0]);
118 UTF8
*DstEnd
= Dst
+ Out
.size();
120 ConversionResult CR
=
121 ConvertUTF16toUTF8(&Src
, SrcEnd
, &Dst
, DstEnd
, strictConversion
);
122 assert(CR
!= targetExhausted
);
124 if (CR
!= conversionOK
) {
129 Out
.resize(reinterpret_cast<char *>(Dst
) - &Out
[0]);
135 bool convertUTF16ToUTF8String(ArrayRef
<UTF16
> Src
, std::string
&Out
)
137 return convertUTF16ToUTF8String(
138 llvm::ArrayRef
<char>(reinterpret_cast<const char *>(Src
.data()),
139 Src
.size() * sizeof(UTF16
)), Out
);
142 bool convertUTF8ToUTF16String(StringRef SrcUTF8
,
143 SmallVectorImpl
<UTF16
> &DstUTF16
) {
144 assert(DstUTF16
.empty());
146 // Avoid OOB by returning early on empty input.
147 if (SrcUTF8
.empty()) {
148 DstUTF16
.push_back(0);
153 const UTF8
*Src
= reinterpret_cast<const UTF8
*>(SrcUTF8
.begin());
154 const UTF8
*SrcEnd
= reinterpret_cast<const UTF8
*>(SrcUTF8
.end());
156 // Allocate the same number of UTF-16 code units as UTF-8 code units. Encoding
157 // as UTF-16 should always require the same amount or less code units than the
158 // UTF-8 encoding. Allocate one extra byte for the null terminator though,
159 // so that someone calling DstUTF16.data() gets a null terminated string.
160 // We resize down later so we don't have to worry that this over allocates.
161 DstUTF16
.resize(SrcUTF8
.size()+1);
162 UTF16
*Dst
= &DstUTF16
[0];
163 UTF16
*DstEnd
= Dst
+ DstUTF16
.size();
165 ConversionResult CR
=
166 ConvertUTF8toUTF16(&Src
, SrcEnd
, &Dst
, DstEnd
, strictConversion
);
167 assert(CR
!= targetExhausted
);
169 if (CR
!= conversionOK
) {
174 DstUTF16
.resize(Dst
- &DstUTF16
[0]);
175 DstUTF16
.push_back(0);
// The wchar_t conversion routines in this file select their code path from
// sizeof(wchar_t) and only handle these three widths.
static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 ||
                  sizeof(wchar_t) == 4,
              "Expected wchar_t to be 1, 2, or 4 bytes");

184 template <typename TResult
>
185 static inline bool ConvertUTF8toWideInternal(llvm::StringRef Source
,
187 // Even in the case of UTF-16, the number of bytes in a UTF-8 string is
188 // at least as large as the number of elements in the resulting wide
189 // string, because surrogate pairs take at least 4 bytes in UTF-8.
190 Result
.resize(Source
.size() + 1);
191 char *ResultPtr
= reinterpret_cast<char *>(&Result
[0]);
192 const UTF8
*ErrorPtr
;
193 if (!ConvertUTF8toWide(sizeof(wchar_t), Source
, ResultPtr
, ErrorPtr
)) {
197 Result
.resize(reinterpret_cast<wchar_t *>(ResultPtr
) - &Result
[0]);
201 bool ConvertUTF8toWide(llvm::StringRef Source
, std::wstring
&Result
) {
202 return ConvertUTF8toWideInternal(Source
, Result
);
205 bool ConvertUTF8toWide(const char *Source
, std::wstring
&Result
) {
210 return ConvertUTF8toWide(llvm::StringRef(Source
), Result
);
213 bool convertWideToUTF8(const std::wstring
&Source
, std::string
&Result
) {
214 if (sizeof(wchar_t) == 1) {
215 const UTF8
*Start
= reinterpret_cast<const UTF8
*>(Source
.data());
217 reinterpret_cast<const UTF8
*>(Source
.data() + Source
.size());
218 if (!isLegalUTF8String(&Start
, End
))
220 Result
.resize(Source
.size());
221 memcpy(&Result
[0], Source
.data(), Source
.size());
223 } else if (sizeof(wchar_t) == 2) {
224 return convertUTF16ToUTF8String(
225 llvm::ArrayRef
<UTF16
>(reinterpret_cast<const UTF16
*>(Source
.data()),
228 } else if (sizeof(wchar_t) == 4) {
229 const UTF32
*Start
= reinterpret_cast<const UTF32
*>(Source
.data());
231 reinterpret_cast<const UTF32
*>(Source
.data() + Source
.size());
232 Result
.resize(UNI_MAX_UTF8_BYTES_PER_CODE_POINT
* Source
.size());
233 UTF8
*ResultPtr
= reinterpret_cast<UTF8
*>(&Result
[0]);
234 UTF8
*ResultEnd
= reinterpret_cast<UTF8
*>(&Result
[0] + Result
.size());
235 if (ConvertUTF32toUTF8(&Start
, End
, &ResultPtr
, ResultEnd
,
236 strictConversion
) == conversionOK
) {
237 Result
.resize(reinterpret_cast<char *>(ResultPtr
) - &Result
[0]);
245 "Control should never reach this point; see static_assert further up");
} // end namespace llvm