//===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----===
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SwapByteOrder.h"
#include <cassert>
#include <cstring>
#include <string>
#include <vector>

namespace llvm {

20 bool ConvertUTF8toWide(unsigned WideCharWidth
, llvm::StringRef Source
,
21 char *&ResultPtr
, const UTF8
*&ErrorPtr
) {
22 assert(WideCharWidth
== 1 || WideCharWidth
== 2 || WideCharWidth
== 4);
23 ConversionResult result
= conversionOK
;
24 // Copy the character span over.
25 if (WideCharWidth
== 1) {
26 const UTF8
*Pos
= reinterpret_cast<const UTF8
*>(Source
.begin());
27 if (!isLegalUTF8String(&Pos
, reinterpret_cast<const UTF8
*>(Source
.end()))) {
28 result
= sourceIllegal
;
31 memcpy(ResultPtr
, Source
.data(), Source
.size());
32 ResultPtr
+= Source
.size();
34 } else if (WideCharWidth
== 2) {
35 const UTF8
*sourceStart
= (const UTF8
*)Source
.data();
36 // FIXME: Make the type of the result buffer correct instead of
37 // using reinterpret_cast.
38 UTF16
*targetStart
= reinterpret_cast<UTF16
*>(ResultPtr
);
39 ConversionFlags flags
= strictConversion
;
40 result
= ConvertUTF8toUTF16(
41 &sourceStart
, sourceStart
+ Source
.size(),
42 &targetStart
, targetStart
+ Source
.size(), flags
);
43 if (result
== conversionOK
)
44 ResultPtr
= reinterpret_cast<char*>(targetStart
);
46 ErrorPtr
= sourceStart
;
47 } else if (WideCharWidth
== 4) {
48 const UTF8
*sourceStart
= (const UTF8
*)Source
.data();
49 // FIXME: Make the type of the result buffer correct instead of
50 // using reinterpret_cast.
51 UTF32
*targetStart
= reinterpret_cast<UTF32
*>(ResultPtr
);
52 ConversionFlags flags
= strictConversion
;
53 result
= ConvertUTF8toUTF32(
54 &sourceStart
, sourceStart
+ Source
.size(),
55 &targetStart
, targetStart
+ Source
.size(), flags
);
56 if (result
== conversionOK
)
57 ResultPtr
= reinterpret_cast<char*>(targetStart
);
59 ErrorPtr
= sourceStart
;
61 assert((result
!= targetExhausted
)
62 && "ConvertUTF8toUTFXX exhausted target buffer");
63 return result
== conversionOK
;
66 bool ConvertCodePointToUTF8(unsigned Source
, char *&ResultPtr
) {
67 const UTF32
*SourceStart
= &Source
;
68 const UTF32
*SourceEnd
= SourceStart
+ 1;
69 UTF8
*TargetStart
= reinterpret_cast<UTF8
*>(ResultPtr
);
70 UTF8
*TargetEnd
= TargetStart
+ 4;
71 ConversionResult CR
= ConvertUTF32toUTF8(&SourceStart
, SourceEnd
,
72 &TargetStart
, TargetEnd
,
74 if (CR
!= conversionOK
)
77 ResultPtr
= reinterpret_cast<char*>(TargetStart
);
81 bool hasUTF16ByteOrderMark(ArrayRef
<char> S
) {
82 return (S
.size() >= 2 &&
83 ((S
[0] == '\xff' && S
[1] == '\xfe') ||
84 (S
[0] == '\xfe' && S
[1] == '\xff')));
87 bool convertUTF16ToUTF8String(ArrayRef
<char> SrcBytes
, std::string
&Out
) {
90 // Error out on an uneven byte count.
91 if (SrcBytes
.size() % 2)
94 // Avoid OOB by returning early on empty input.
98 const UTF16
*Src
= reinterpret_cast<const UTF16
*>(SrcBytes
.begin());
99 const UTF16
*SrcEnd
= reinterpret_cast<const UTF16
*>(SrcBytes
.end());
101 // Byteswap if necessary.
102 std::vector
<UTF16
> ByteSwapped
;
103 if (Src
[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED
) {
104 ByteSwapped
.insert(ByteSwapped
.end(), Src
, SrcEnd
);
105 for (unsigned I
= 0, E
= ByteSwapped
.size(); I
!= E
; ++I
)
106 ByteSwapped
[I
] = llvm::sys::SwapByteOrder_16(ByteSwapped
[I
]);
107 Src
= &ByteSwapped
[0];
108 SrcEnd
= &ByteSwapped
[ByteSwapped
.size() - 1] + 1;
111 // Skip the BOM for conversion.
112 if (Src
[0] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE
)
115 // Just allocate enough space up front. We'll shrink it later. Allocate
116 // enough that we can fit a null terminator without reallocating.
117 Out
.resize(SrcBytes
.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT
+ 1);
118 UTF8
*Dst
= reinterpret_cast<UTF8
*>(&Out
[0]);
119 UTF8
*DstEnd
= Dst
+ Out
.size();
121 ConversionResult CR
=
122 ConvertUTF16toUTF8(&Src
, SrcEnd
, &Dst
, DstEnd
, strictConversion
);
123 assert(CR
!= targetExhausted
);
125 if (CR
!= conversionOK
) {
130 Out
.resize(reinterpret_cast<char *>(Dst
) - &Out
[0]);
136 bool convertUTF16ToUTF8String(ArrayRef
<UTF16
> Src
, std::string
&Out
)
138 return convertUTF16ToUTF8String(
139 llvm::ArrayRef
<char>(reinterpret_cast<const char *>(Src
.data()),
140 Src
.size() * sizeof(UTF16
)), Out
);
143 bool convertUTF8ToUTF16String(StringRef SrcUTF8
,
144 SmallVectorImpl
<UTF16
> &DstUTF16
) {
145 assert(DstUTF16
.empty());
147 // Avoid OOB by returning early on empty input.
148 if (SrcUTF8
.empty()) {
149 DstUTF16
.push_back(0);
154 const UTF8
*Src
= reinterpret_cast<const UTF8
*>(SrcUTF8
.begin());
155 const UTF8
*SrcEnd
= reinterpret_cast<const UTF8
*>(SrcUTF8
.end());
157 // Allocate the same number of UTF-16 code units as UTF-8 code units. Encoding
158 // as UTF-16 should always require the same amount or less code units than the
159 // UTF-8 encoding. Allocate one extra byte for the null terminator though,
160 // so that someone calling DstUTF16.data() gets a null terminated string.
161 // We resize down later so we don't have to worry that this over allocates.
162 DstUTF16
.resize(SrcUTF8
.size()+1);
163 UTF16
*Dst
= &DstUTF16
[0];
164 UTF16
*DstEnd
= Dst
+ DstUTF16
.size();
166 ConversionResult CR
=
167 ConvertUTF8toUTF16(&Src
, SrcEnd
, &Dst
, DstEnd
, strictConversion
);
168 assert(CR
!= targetExhausted
);
170 if (CR
!= conversionOK
) {
175 DstUTF16
.resize(Dst
- &DstUTF16
[0]);
176 DstUTF16
.push_back(0);
// The wchar_t helpers in this file dispatch on sizeof(wchar_t) and only
// handle widths of 1, 2 and 4 bytes; reject any other platform at compile time.
static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 ||
                  sizeof(wchar_t) == 4,
              "Expected wchar_t to be 1, 2, or 4 bytes");

185 template <typename TResult
>
186 static inline bool ConvertUTF8toWideInternal(llvm::StringRef Source
,
188 // Even in the case of UTF-16, the number of bytes in a UTF-8 string is
189 // at least as large as the number of elements in the resulting wide
190 // string, because surrogate pairs take at least 4 bytes in UTF-8.
191 Result
.resize(Source
.size() + 1);
192 char *ResultPtr
= reinterpret_cast<char *>(&Result
[0]);
193 const UTF8
*ErrorPtr
;
194 if (!ConvertUTF8toWide(sizeof(wchar_t), Source
, ResultPtr
, ErrorPtr
)) {
198 Result
.resize(reinterpret_cast<wchar_t *>(ResultPtr
) - &Result
[0]);
202 bool ConvertUTF8toWide(llvm::StringRef Source
, std::wstring
&Result
) {
203 return ConvertUTF8toWideInternal(Source
, Result
);
206 bool ConvertUTF8toWide(const char *Source
, std::wstring
&Result
) {
211 return ConvertUTF8toWide(llvm::StringRef(Source
), Result
);
214 bool convertWideToUTF8(const std::wstring
&Source
, std::string
&Result
) {
215 if (sizeof(wchar_t) == 1) {
216 const UTF8
*Start
= reinterpret_cast<const UTF8
*>(Source
.data());
218 reinterpret_cast<const UTF8
*>(Source
.data() + Source
.size());
219 if (!isLegalUTF8String(&Start
, End
))
221 Result
.resize(Source
.size());
222 memcpy(&Result
[0], Source
.data(), Source
.size());
224 } else if (sizeof(wchar_t) == 2) {
225 return convertUTF16ToUTF8String(
226 llvm::ArrayRef
<UTF16
>(reinterpret_cast<const UTF16
*>(Source
.data()),
229 } else if (sizeof(wchar_t) == 4) {
230 const UTF32
*Start
= reinterpret_cast<const UTF32
*>(Source
.data());
232 reinterpret_cast<const UTF32
*>(Source
.data() + Source
.size());
233 Result
.resize(UNI_MAX_UTF8_BYTES_PER_CODE_POINT
* Source
.size());
234 UTF8
*ResultPtr
= reinterpret_cast<UTF8
*>(&Result
[0]);
235 UTF8
*ResultEnd
= reinterpret_cast<UTF8
*>(&Result
[0] + Result
.size());
236 if (ConvertUTF32toUTF8(&Start
, End
, &ResultPtr
, ResultEnd
,
237 strictConversion
) == conversionOK
) {
238 Result
.resize(reinterpret_cast<char *>(ResultPtr
) - &Result
[0]);
246 "Control should never reach this point; see static_assert further up");
} // end namespace llvm