1 //===-- StringPrinter.cpp -------------------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "lldb/DataFormatters/StringPrinter.h"
11 #include "lldb/Core/Debugger.h"
12 #include "lldb/Core/ValueObject.h"
13 #include "lldb/Target/Language.h"
14 #include "lldb/Target/Process.h"
15 #include "lldb/Target/Target.h"
16 #include "lldb/Utility/Status.h"
18 #include "llvm/ADT/StringExtras.h"
19 #include "llvm/Support/ConvertUTF.h"
// Bring the lldb_private and lldb_private::formatters names into scope for the
// rest of this file.
26 using namespace lldb_private
;
27 using namespace lldb_private::formatters
;
// Short local aliases for two types nested in StringPrinter, so the code below
// can name them without the StringPrinter:: qualifier.
28 using GetPrintableElementType
= StringPrinter::GetPrintableElementType
;
29 using StringElementType
= StringPrinter::StringElementType
;
31 /// DecodedCharBuffer stores the decoded contents of a single character. It
32 /// avoids managing memory on the heap by copying decoded bytes into an in-line
/// buffer (m_data, MaxLength bytes).
34 class DecodedCharBuffer
{
/// Construct from nullptr. The body is empty, so no bytes are copied.
36 DecodedCharBuffer(std::nullptr_t
) {}
/// Construct from \p size bytes starting at \p bytes: records the size in
/// m_size and copies the bytes into m_data.
38 DecodedCharBuffer(const uint8_t *bytes
, size_t size
) : m_size(size
) {
// NOTE(review): the condition guarding this llvm_unreachable is not visible in
// this listing; presumably it rejects size > MaxLength -- confirm.
40 llvm_unreachable("unsupported length");
41 memcpy(m_data
, bytes
, size
);
/// Convenience overload for char data; delegates to the uint8_t constructor.
44 DecodedCharBuffer(const char *bytes
, size_t size
)
45 : DecodedCharBuffer(reinterpret_cast<const uint8_t *>(bytes
), size
) {}
/// Returns a pointer to the in-line byte storage.
47 const uint8_t *GetBytes() const { return m_data
; }
/// Returns the stored byte count.
49 size_t GetSize() const { return m_size
; }
/// Capacity of the in-line storage, in bytes.
52 static constexpr unsigned MaxLength
= 16;
/// In-line storage for the copied bytes, zero-initialized.
55 uint8_t m_data
[MaxLength
] = {0};
/// Callable type used to produce the printable form of one character. It takes
/// (uint8_t *, uint8_t *, uint8_t *&) and returns a DecodedCharBuffer. The
/// caller in this file passes the current position, the end of the buffer, and
/// an out-reference from which it reads the next position to scan.
58 using EscapingHelper
=
59 std::function
<DecodedCharBuffer(uint8_t *, uint8_t *, uint8_t *&)>;
61 // We declare this for all values of type, but only implement it for those we
62 // care about. That's good because we get linker errors for any unsupported type.
// Specializations for StringElementType::ASCII and StringElementType::UTF8
// follow later in this file.
63 template <StringElementType type
>
64 static DecodedCharBuffer
65 GetPrintableImpl(uint8_t *buffer
, uint8_t *buffer_end
, uint8_t *&next
,
66 StringPrinter::EscapeStyle escape_style
);
68 // Mimic isprint() for Unicode codepoints.
// The checks below single out, in order: C0 controls (and 0x7F), C1 controls,
// the line/paragraph separators, bidirectional text controls, and the
// 0xFFF9..0xFFFF specials.
// NOTE(review): the return statements are not visible in this listing;
// presumably each matching range is reported as non-printable and everything
// else as printable -- confirm.
69 static bool isprint32(char32_t codepoint
) {
70 if (codepoint
<= 0x1F || codepoint
== 0x7F) // C0
74 if (codepoint
>= 0x80 && codepoint
<= 0x9F) // C1
78 if (codepoint
== 0x2028 || codepoint
== 0x2029) // line/paragraph separators
82 if (codepoint
== 0x200E || codepoint
== 0x200F ||
83 (codepoint
>= 0x202A &&
84 codepoint
<= 0x202E)) // bidirectional text control
88 if (codepoint
>= 0xFFF9 &&
89 codepoint
<= 0xFFFF) // interlinears and generally specials
/// Try to produce an escape sequence for codepoint \p c under \p escape_style.
/// Callers in this file test the result with GetSize() to decide whether an
/// escape was produced.
// NOTE(review): the per-character cases and the returned values are not visible
// in this listing; only the four places where the Swift escape style is
// special-cased remain.
96 DecodedCharBuffer
attemptASCIIEscape(llvm::UTF32 c
,
97 StringPrinter::EscapeStyle escape_style
) {
// Computed once; consulted by each of the Swift-specific branches below.
98 const bool is_swift_escape_style
=
99 escape_style
== StringPrinter::EscapeStyle::Swift
;
106 if (is_swift_escape_style
)
110 if (is_swift_escape_style
)
120 if (is_swift_escape_style
)
126 if (is_swift_escape_style
)
/// ASCII specialization: produce the printable form of the single byte at
/// \p buffer. It consults attemptASCIIEscape first, then llvm::isPrint, and
/// otherwise formats a numeric escape with snprintf ("\\x%02x" for the CXX
/// style, "\\u{%x}" for the Swift style) into a local 7-byte buffer.
// NOTE(review): the statements executed when the two `if` tests succeed, and
// the assignment to `next`, are not visible in this listing.
136 DecodedCharBuffer GetPrintableImpl
<StringElementType::ASCII
>(
137 uint8_t *buffer
, uint8_t *buffer_end
, uint8_t *&next
,
138 StringPrinter::EscapeStyle escape_style
) {
139 // The ASCII helper always advances 1 byte at a time.
142 DecodedCharBuffer retval
= attemptASCIIEscape(*buffer
, escape_style
);
143 if (retval
.GetSize())
146 // Use llvm's locale-independent isPrint(char), instead of the libc
147 // implementation which may give different results on different platforms.
148 if (llvm::isPrint(*buffer
))
// Fallback: format the byte as a numeric escape. escaped_len receives the
// snprintf result and becomes the size of the returned buffer.
151 unsigned escaped_len
;
152 constexpr unsigned max_buffer_size
= 7;
153 uint8_t data
[max_buffer_size
];
154 switch (escape_style
) {
155 case StringPrinter::EscapeStyle::CXX
:
156 // Prints 4 characters, then a \0 terminator.
157 escaped_len
= snprintf((char *)data
, max_buffer_size
, "\\x%02x", *buffer
);
159 case StringPrinter::EscapeStyle::Swift
:
160 // Prints up to 6 characters, then a \0 terminator.
161 escaped_len
= snprintf((char *)data
, max_buffer_size
, "\\u{%x}", *buffer
);
164 lldbassert(escaped_len
> 0 && "unknown string escape style");
165 return {data
, escaped_len
};
/// UTF8 specialization: produce the printable form of the UTF-8 sequence that
/// starts at \p buffer. Illegal sequences are delegated to the ASCII
/// specialization. Legal ones are decoded to a UTF-32 codepoint, \p next is set
/// just past the sequence, and the codepoint is then escaped via
/// attemptASCIIEscape, passed through unchanged if isprint32 accepts it, or
/// formatted as "\\U%08x" (CXX) / "\\u{%x}" (Swift) into a local 13-byte buffer.
// NOTE(review): the statement executed when `retval.GetSize()` is non-zero is
// not visible in this listing.
169 DecodedCharBuffer GetPrintableImpl
<StringElementType::UTF8
>(
170 uint8_t *buffer
, uint8_t *buffer_end
, uint8_t *&next
,
171 StringPrinter::EscapeStyle escape_style
) {
172 // If the utf8 encoded length is invalid (i.e., not in the closed interval
173 // [1;4]), or if there aren't enough bytes to print, or if the subsequence
174 // isn't valid utf8, fall back to printing an ASCII-escaped subsequence.
175 if (!llvm::isLegalUTF8Sequence(buffer
, buffer_end
))
176 return GetPrintableImpl
<StringElementType::ASCII
>(buffer
, buffer_end
, next
,
179 // Convert the valid utf8 sequence to a utf32 codepoint. This cannot fail.
180 llvm::UTF32 codepoint
= 0;
181 const llvm::UTF8
*buffer_for_conversion
= buffer
;
182 llvm::ConversionResult result
= llvm::convertUTF8Sequence(
183 &buffer_for_conversion
, buffer_end
, &codepoint
, llvm::strictConversion
);
184 assert(result
== llvm::conversionOK
&&
185 "Failed to convert legal utf8 sequence");
186 UNUSED_IF_ASSERT_DISABLED(result
);
188 // The UTF8 helper always advances by the utf8 encoded length.
// The length is the distance between the pointer handed to
// convertUTF8Sequence by address and the start of the sequence.
189 const unsigned utf8_encoded_len
= buffer_for_conversion
- buffer
;
190 next
= buffer
+ utf8_encoded_len
;
192 DecodedCharBuffer retval
= attemptASCIIEscape(codepoint
, escape_style
);
193 if (retval
.GetSize())
// Printable codepoints are emitted as their original UTF-8 bytes.
195 if (isprint32(codepoint
))
196 return {buffer
, utf8_encoded_len
};
// Fallback: format the codepoint as a numeric escape.
198 unsigned escaped_len
;
199 constexpr unsigned max_buffer_size
= 13;
200 uint8_t data
[max_buffer_size
];
201 switch (escape_style
) {
202 case StringPrinter::EscapeStyle::CXX
:
203 // Prints 10 characters, then a \0 terminator.
204 escaped_len
= snprintf((char *)data
, max_buffer_size
, "\\U%08x", codepoint
);
206 case StringPrinter::EscapeStyle::Swift
:
207 // Prints up to 12 characters, then a \0 terminator.
208 escaped_len
= snprintf((char *)data
, max_buffer_size
, "\\u{%x}", codepoint
);
211 lldbassert(escaped_len
> 0 && "unknown string escape style");
212 return {data
, escaped_len
};
215 // Given a sequence of bytes, this function returns the sequence of bytes to
216 // actually print out, and reports the following unscanned position of the
// buffer through the `next` reference parameter.
// It dispatches on `type` to the matching GetPrintableImpl specialization
// (ASCII or UTF8).
// NOTE(review): the result for a null or exhausted buffer, and for any other
// element type, is not visible in this listing.
218 static DecodedCharBuffer
GetPrintable(StringElementType type
, uint8_t *buffer
,
219 uint8_t *buffer_end
, uint8_t *&next
,
220 StringPrinter::EscapeStyle escape_style
) {
// Guard against a null buffer or a position at/after the end.
221 if (!buffer
|| buffer
>= buffer_end
)
225 case StringElementType::ASCII
:
226 return GetPrintableImpl
<StringElementType::ASCII
>(buffer
, buffer_end
, next
,
228 case StringElementType::UTF8
:
229 return GetPrintableImpl
<StringElementType::UTF8
>(buffer
, buffer_end
, next
,
/// Build the default EscapingHelper for \p elem_type. For the UTF8 and ASCII
/// cases it returns a lambda that captures \p escape_style and \p elem_type by
/// value and forwards to GetPrintable with the matching StringElementType. The
/// trailing llvm_unreachable covers element types outside those two cases.
236 static EscapingHelper
237 GetDefaultEscapingHelper(GetPrintableElementType elem_type
,
238 StringPrinter::EscapeStyle escape_style
) {
240 case GetPrintableElementType::UTF8
:
241 case GetPrintableElementType::ASCII
:
242 return [escape_style
, elem_type
](uint8_t *buffer
, uint8_t *buffer_end
,
243 uint8_t *&next
) -> DecodedCharBuffer
{
// Map the printable-element kind onto the StringElementType that GetPrintable
// expects.
244 return GetPrintable(elem_type
== GetPrintableElementType::UTF8
245 ? StringElementType::UTF8
246 : StringElementType::ASCII
,
247 buffer
, buffer_end
, next
, escape_style
);
250 llvm_unreachable("bad element type");
253 /// Read a string encoded in accordance with \tparam SourceDataType from a
254 /// host-side LLDB buffer, then pretty-print it to a stream using \p style.
///
/// Output order, as written below: optional prefix token, optional opening
/// quote, the (optionally escaped) characters, optional closing quote, optional
/// suffix token, and "..." when the options report truncation.
///
/// \param ConvertFunction optional converter from SourceDataType to UTF-8. When
///        it is null the source pointers are reinterpreted as UTF-8 directly.
/// \param dump_options supplies the stream, the data, and the formatting flags.
// NOTE(review): the function's return statements are not visible in this
// listing.
255 template <typename SourceDataType
>
256 static bool DumpEncodedBufferToStream(
257 GetPrintableElementType style
,
258 llvm::ConversionResult (*ConvertFunction
)(const SourceDataType
**,
259 const SourceDataType
*,
260 llvm::UTF8
**, llvm::UTF8
*,
261 llvm::ConversionFlags
),
262 const StringPrinter::ReadBufferAndDumpToStreamOptions
&dump_options
) {
263 assert(dump_options
.GetStream() && "need a Stream to print the string to");
264 Stream
&stream(*dump_options
.GetStream());
265 if (dump_options
.GetPrefixToken() != nullptr)
266 stream
.Printf("%s", dump_options
.GetPrefixToken());
267 if (dump_options
.GetQuote() != 0)
268 stream
.Printf("%c", dump_options
.GetQuote());
269 auto data(dump_options
.GetData());
270 auto source_size(dump_options
.GetSourceSize());
// Only process the contents when the extractor actually holds bytes.
271 if (data
.GetByteSize() && data
.GetDataStart() && data
.GetDataEnd()) {
272 const int bufferSPSize
= data
.GetByteSize();
// No explicit source size was given: derive one from the buffer's byte size.
273 if (dump_options
.GetSourceSize() == 0) {
274 const int origin_encoding
= 8 * sizeof(SourceDataType
);
275 source_size
= bufferSPSize
/ (origin_encoding
/ 4);
278 const SourceDataType
*data_ptr
=
279 (const SourceDataType
*)data
.GetDataStart();
280 const SourceDataType
*data_end_ptr
= data_ptr
+ source_size
;
282 const bool zero_is_terminator
= dump_options
.GetBinaryZeroIsTerminator();
// When a zero element terminates the string, walk the source and pull the end
// pointer back to the stopping position, then rewind data_ptr to the start.
// NOTE(review): the loop's terminator test is not visible in this listing.
284 if (zero_is_terminator
) {
285 while (data_ptr
< data_end_ptr
) {
287 data_end_ptr
= data_ptr
;
293 data_ptr
= (const SourceDataType
*)data
.GetDataStart();
296 lldb::WritableDataBufferSP utf8_data_buffer_sp
;
297 llvm::UTF8
*utf8_data_ptr
= nullptr;
298 llvm::UTF8
*utf8_data_end_ptr
= nullptr;
300 if (ConvertFunction
) {
// Allocate a DataBufferHeap of 4 * bufferSPSize bytes to receive the converted
// UTF-8, then convert leniently into it.
301 utf8_data_buffer_sp
=
302 std::make_shared
<DataBufferHeap
>(4 * bufferSPSize
, 0);
303 utf8_data_ptr
= (llvm::UTF8
*)utf8_data_buffer_sp
->GetBytes();
304 utf8_data_end_ptr
= utf8_data_ptr
+ utf8_data_buffer_sp
->GetByteSize();
305 ConvertFunction(&data_ptr
, data_end_ptr
, &utf8_data_ptr
,
306 utf8_data_end_ptr
, llvm::lenientConversion
);
// Without a terminator, the end of the output is wherever the conversion left
// the write pointer.
307 if (!zero_is_terminator
)
308 utf8_data_end_ptr
= utf8_data_ptr
;
309 // needed because the ConvertFunction will change the value of the
// pointers that were passed to it by address.
312 (llvm::UTF8
*)utf8_data_buffer_sp
->GetBytes();
314 // just copy the pointers - the cast is necessary to make the compiler
315 // happy but this should only happen if we are reading UTF8 data
316 utf8_data_ptr
= const_cast<llvm::UTF8
*>(
317 reinterpret_cast<const llvm::UTF8
*>(data_ptr
));
318 utf8_data_end_ptr
= const_cast<llvm::UTF8
*>(
319 reinterpret_cast<const llvm::UTF8
*>(data_end_ptr
));
322 const bool escape_non_printables
= dump_options
.GetEscapeNonPrintables();
323 EscapingHelper escaping_callback
;
324 if (escape_non_printables
)
326 GetDefaultEscapingHelper(style
, dump_options
.GetEscapeStyle());
328 // since we tend to accept partial data (and even partially malformed data)
329 // we might end up with no NULL terminator before the end_ptr hence we need
330 // to take a slower route and ensure we stay within boundaries
331 for (; utf8_data_ptr
< utf8_data_end_ptr
;) {
332 if (zero_is_terminator
&& !*utf8_data_ptr
)
335 if (escape_non_printables
) {
// Ask the escaping callback for the printable form of the next character; it
// reports where to resume through next_data.
336 uint8_t *next_data
= nullptr;
338 escaping_callback(utf8_data_ptr
, utf8_data_end_ptr
, next_data
);
339 auto printable_bytes
= printable
.GetBytes();
340 auto printable_size
= printable
.GetSize();
342 // We failed to figure out how to print this string.
343 if (!printable_bytes
|| !next_data
)
// Emit the printable bytes one at a time, then resume at the position the
// callback reported.
346 for (unsigned c
= 0; c
< printable_size
; c
++)
347 stream
.Printf("%c", *(printable_bytes
+ c
));
348 utf8_data_ptr
= (uint8_t *)next_data
;
// Escaping disabled: emit the byte unchanged.
350 stream
.Printf("%c", *utf8_data_ptr
);
355 if (dump_options
.GetQuote() != 0)
356 stream
.Printf("%c", dump_options
.GetQuote());
357 if (dump_options
.GetSuffixToken() != nullptr)
358 stream
.Printf("%s", dump_options
.GetSuffixToken());
359 if (dump_options
.GetIsTruncated())
360 stream
.Printf("...");
/// Construct read-string options for \p valobj: start from the default options
/// and take the escape-non-printables setting from the Debugger that owns the
/// value object's target.
364 lldb_private::formatters::StringPrinter::ReadStringAndDumpToStreamOptions::
365 ReadStringAndDumpToStreamOptions(ValueObject
&valobj
)
366 : ReadStringAndDumpToStreamOptions() {
367 SetEscapeNonPrintables(
368 valobj
.GetTargetSP()->GetDebugger().GetEscapeNonPrintables());
/// Construct read-buffer options for \p valobj: start from the default options
/// and take the escape-non-printables setting from the Debugger that owns the
/// value object's target.
371 lldb_private::formatters::StringPrinter::ReadBufferAndDumpToStreamOptions::
372 ReadBufferAndDumpToStreamOptions(ValueObject
&valobj
)
373 : ReadBufferAndDumpToStreamOptions() {
374 SetEscapeNonPrintables(
375 valobj
.GetTargetSP()->GetDebugger().GetEscapeNonPrintables());
/// Construct read-buffer options from read-string options. Starting from the
/// defaults, this copies the stream, prefix and suffix tokens, quote character,
/// escape-non-printables flag, binary-zero-is-terminator flag, and escape
/// style. No other setting is copied here.
378 lldb_private::formatters::StringPrinter::ReadBufferAndDumpToStreamOptions::
379 ReadBufferAndDumpToStreamOptions(
380 const ReadStringAndDumpToStreamOptions
&options
)
381 : ReadBufferAndDumpToStreamOptions() {
382 SetStream(options
.GetStream());
383 SetPrefixToken(options
.GetPrefixToken());
384 SetSuffixToken(options
.GetSuffixToken());
385 SetQuote(options
.GetQuote());
386 SetEscapeNonPrintables(options
.GetEscapeNonPrintables());
387 SetBinaryZeroIsTerminator(options
.GetBinaryZeroIsTerminator());
388 SetEscapeStyle(options
.GetEscapeStyle());
391 namespace lldb_private
{
393 namespace formatters
{
/// Read a string of SourceDataType elements from target memory at
/// options.GetLocation() into a local heap buffer, then hand that buffer to
/// DumpEncodedBufferToStream for printing.
///
/// \param elem_type encoding of the string in the target.
/// \param options location, stream, size and formatting settings.
/// \param ConvertFunction converter to UTF-8; the check below requires it for
///        any element width other than 8 bits.
// NOTE(review): the early-return statements after the validation checks, and
// the declarations of `sourceSize` and `error`, are not visible in this
// listing.
395 template <typename SourceDataType
>
396 static bool ReadEncodedBufferAndDumpToStream(
397 StringElementType elem_type
,
398 const StringPrinter::ReadStringAndDumpToStreamOptions
&options
,
399 llvm::ConversionResult (*ConvertFunction
)(const SourceDataType
**,
400 const SourceDataType
*,
401 llvm::UTF8
**, llvm::UTF8
*,
402 llvm::ConversionFlags
)) {
403 assert(options
.GetStream() && "need a Stream to print the string to");
404 if (!options
.GetStream())
// Reject a null or invalid target address.
407 if (options
.GetLocation() == 0 ||
408 options
.GetLocation() == LLDB_INVALID_ADDRESS
)
411 lldb::TargetSP target_sp
= options
.GetTargetSP();
// Only 8-, 16- and 32-bit element widths pass the check below.
415 constexpr int type_width
= sizeof(SourceDataType
);
416 constexpr int origin_encoding
= 8 * type_width
;
417 if (origin_encoding
!= 8 && origin_encoding
!= 16 && origin_encoding
!= 32)
419 // If not UTF8 or ASCII, conversion to UTF8 is necessary.
420 if (origin_encoding
!= 8 && !ConvertFunction
)
423 bool needs_zero_terminator
= options
.GetNeedsZeroTermination();
425 bool is_truncated
= false;
426 const auto max_size
= target_sp
->GetMaximumSizeOfStringSummary();
429 if (elem_type
== StringElementType::ASCII
&& !options
.GetSourceSize()) {
430 // FIXME: The NSString formatter sets HasSourceSize(true) when the size is
431 // actually unknown, as well as SetBinaryZeroIsTerminator(false). IIUC the
432 // C++ formatter also sets SetBinaryZeroIsTerminator(false) when it doesn't
433 // mean to. I don't see how this makes sense: we should fix the formatters.
435 // Until then, the behavior that's expected for ASCII strings with unknown
436 // lengths is to read up to the max size and then null-terminate. Do that.
437 sourceSize
= max_size
;
438 needs_zero_terminator
= true;
439 } else if (options
.HasSourceSize()) {
// A source size was supplied: use it, clamped to max_size unless the options
// ask to ignore the maximum length.
440 sourceSize
= options
.GetSourceSize();
441 if (!options
.GetIgnoreMaxLength()) {
442 if (sourceSize
> max_size
) {
443 sourceSize
= max_size
;
// Remaining case: read up to max_size elements and require a terminator.
448 sourceSize
= max_size
;
449 needs_zero_terminator
= true;
// Size the local buffer in bytes: element count times element width.
452 const int bufferSPSize
= sourceSize
* type_width
;
453 lldb::WritableDataBufferSP
buffer_sp(new DataBufferHeap(bufferSPSize
, 0));
455 // Check if we got bytes. We never get any bytes if we have an empty
456 // string, but we still continue so that we end up actually printing
457 // an empty string ("").
458 if (sourceSize
!= 0 && !buffer_sp
->GetBytes())
462 char *buffer
= reinterpret_cast<char *>(buffer_sp
->GetBytes());
// Pick the read primitive: ReadCStringFromMemory for ASCII,
// ReadStringFromMemory (which is given the element width) when a terminator is
// required, and ReadMemory as the third option.
// NOTE(review): the `else` before ReadMemory and the error check that guards
// the "unable to read data" message are not visible in this listing.
464 if (elem_type
== StringElementType::ASCII
)
465 target_sp
->ReadCStringFromMemory(options
.GetLocation(), buffer
,
466 bufferSPSize
, error
);
467 else if (needs_zero_terminator
)
468 target_sp
->ReadStringFromMemory(options
.GetLocation(), buffer
,
469 bufferSPSize
, error
, type_width
);
471 target_sp
->ReadMemory(options
.GetLocation(), buffer
, bufferSPSize
, error
);
473 options
.GetStream()->Printf("unable to read data");
// Wrap the bytes in a DataExtractor using the target's byte order and address
// size, and carry the read settings over to the dump options.
477 StringPrinter::ReadBufferAndDumpToStreamOptions
dump_options(options
);
478 dump_options
.SetData(
479 DataExtractor(buffer_sp
, target_sp
->GetArchitecture().GetByteOrder(),
480 target_sp
->GetArchitecture().GetAddressByteSize()));
481 dump_options
.SetSourceSize(sourceSize
);
482 dump_options
.SetIsTruncated(is_truncated
);
483 dump_options
.SetNeedsZeroTermination(needs_zero_terminator
);
484 if (needs_zero_terminator
)
485 dump_options
.SetBinaryZeroIsTerminator(true);
// ASCII prints with the ASCII element style; every other element type prints
// as UTF8.
487 GetPrintableElementType print_style
= (elem_type
== StringElementType::ASCII
)
488 ? GetPrintableElementType::ASCII
489 : GetPrintableElementType::UTF8
;
490 return DumpEncodedBufferToStream(print_style
, ConvertFunction
, dump_options
);
/// UTF8 specialization: forwards to ReadEncodedBufferAndDumpToStream with
/// llvm::UTF8 as the element type.
// NOTE(review): the remaining call arguments are not visible in this listing.
494 bool StringPrinter::ReadStringAndDumpToStream
<StringElementType::UTF8
>(
495 const ReadStringAndDumpToStreamOptions
&options
) {
496 return ReadEncodedBufferAndDumpToStream
<llvm::UTF8
>(StringElementType::UTF8
,
/// UTF16 specialization: reads llvm::UTF16 elements and converts them to UTF-8
/// with llvm::ConvertUTF16toUTF8.
501 bool StringPrinter::ReadStringAndDumpToStream
<StringElementType::UTF16
>(
502 const ReadStringAndDumpToStreamOptions
&options
) {
503 return ReadEncodedBufferAndDumpToStream
<llvm::UTF16
>(
504 StringElementType::UTF16
, options
, llvm::ConvertUTF16toUTF8
);
/// UTF32 specialization: reads llvm::UTF32 elements and converts them to UTF-8
/// with llvm::ConvertUTF32toUTF8.
508 bool StringPrinter::ReadStringAndDumpToStream
<StringElementType::UTF32
>(
509 const ReadStringAndDumpToStreamOptions
&options
) {
510 return ReadEncodedBufferAndDumpToStream
<llvm::UTF32
>(
511 StringElementType::UTF32
, options
, llvm::ConvertUTF32toUTF8
);
/// ASCII specialization: forwards to ReadEncodedBufferAndDumpToStream with char
/// as the element type.
// NOTE(review): the remaining call arguments are not visible in this listing.
515 bool StringPrinter::ReadStringAndDumpToStream
<StringElementType::ASCII
>(
516 const ReadStringAndDumpToStreamOptions
&options
) {
517 return ReadEncodedBufferAndDumpToStream
<char>(StringElementType::ASCII
,
/// UTF8 buffer specialization: forwards to DumpEncodedBufferToStream with
/// llvm::UTF8 as the element type and the UTF8 printable element style.
// NOTE(review): the remaining call arguments are not visible in this listing.
522 bool StringPrinter::ReadBufferAndDumpToStream
<StringElementType::UTF8
>(
523 const ReadBufferAndDumpToStreamOptions
&options
) {
524 return DumpEncodedBufferToStream
<llvm::UTF8
>(GetPrintableElementType::UTF8
,
/// UTF16 buffer specialization: converts with llvm::ConvertUTF16toUTF8 and
/// prints using the UTF8 printable element style.
529 bool StringPrinter::ReadBufferAndDumpToStream
<StringElementType::UTF16
>(
530 const ReadBufferAndDumpToStreamOptions
&options
) {
531 return DumpEncodedBufferToStream(GetPrintableElementType::UTF8
,
532 llvm::ConvertUTF16toUTF8
, options
);
/// UTF32 buffer specialization: converts with llvm::ConvertUTF32toUTF8 and
/// prints using the UTF8 printable element style.
536 bool StringPrinter::ReadBufferAndDumpToStream
<StringElementType::UTF32
>(
537 const ReadBufferAndDumpToStreamOptions
&options
) {
538 return DumpEncodedBufferToStream(GetPrintableElementType::UTF8
,
539 llvm::ConvertUTF32toUTF8
, options
);
/// ASCII buffer specialization: delegates to the UTF8 buffer specialization
/// with the same options.
543 bool StringPrinter::ReadBufferAndDumpToStream
<StringElementType::ASCII
>(
544 const ReadBufferAndDumpToStreamOptions
&options
) {
545 // Treat ASCII the same as UTF8.
547 // FIXME: This is probably not the right thing to do (well, it's debatable).
548 // If an ASCII-encoded string happens to contain a sequence of invalid bytes
549 // that forms a valid UTF8 character, we'll print out that character. This is
550 // good if you're playing fast and loose with encodings (probably good for
551 // std::string users), but maybe not so good if you care about your string
552 // formatter respecting the semantics of your selected string encoding. In
553 // the latter case you'd want to see the character byte sequence ('\x..'), not
554 // the UTF8 character itself.
555 return ReadBufferAndDumpToStream
<StringElementType::UTF8
>(options
);
558 } // namespace formatters
560 } // namespace lldb_private