//===--- Markup.cpp -----------------------------------------*- C++-*------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "support/Markup.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iterator>
#include <memory>
#include <string>
#include <utility>
27 // Is <contents a plausible start to an HTML tag?
28 // Contents may not be the rest of the line, but it's the rest of the plain
29 // text, so we expect to see at least the tag name.
30 bool looksLikeTag(llvm::StringRef Contents
) {
33 if (Contents
.front() == '!' || Contents
.front() == '?' ||
34 Contents
.front() == '/')
36 // Check the start of the tag name.
37 if (!llvm::isAlpha(Contents
.front()))
39 // Drop rest of the tag name, and following whitespace.
41 .drop_while([](char C
) {
42 return llvm::isAlnum(C
) || C
== '-' || C
== '_' || C
== ':';
44 .drop_while(llvm::isSpace
);
45 // The rest of the tag consists of attributes, which have restrictive names.
46 // If we hit '=', all bets are off (attribute values can contain anything).
47 for (; !Contents
.empty(); Contents
= Contents
.drop_front()) {
48 if (llvm::isAlnum(Contents
.front()) || llvm::isSpace(Contents
.front()))
50 if (Contents
.front() == '>' || Contents
.starts_with("/>"))
51 return true; // May close the tag.
52 if (Contents
.front() == '=')
53 return true; // Don't try to parse attribute values.
54 return false; // Random punctuation means this isn't a tag.
56 return true; // Potentially incomplete tag.
59 // Tests whether C should be backslash-escaped in markdown.
60 // The string being escaped is Before + C + After. This is part of a paragraph.
61 // StartsLine indicates whether `Before` is the start of the line.
62 // After may not be everything until the end of the line.
64 // It's always safe to escape punctuation, but want minimal escaping.
65 // The strategy is to escape the first character of anything that might start
66 // a markdown grammar construct.
67 bool needsLeadingEscape(char C
, llvm::StringRef Before
, llvm::StringRef After
,
69 assert(Before
.take_while(llvm::isSpace
).empty());
70 auto RulerLength
= [&]() -> /*Length*/ unsigned {
71 if (!StartsLine
|| !Before
.empty())
73 llvm::StringRef A
= After
.rtrim();
74 return llvm::all_of(A
, [C
](char D
) { return C
== D
; }) ? 1 + A
.size() : 0;
76 auto IsBullet
= [&]() {
77 return StartsLine
&& Before
.empty() &&
78 (After
.empty() || After
.starts_with(" "));
80 auto SpaceSurrounds
= [&]() {
81 return (After
.empty() || llvm::isSpace(After
.front())) &&
82 (Before
.empty() || llvm::isSpace(Before
.back()));
84 auto WordSurrounds
= [&]() {
85 return (!After
.empty() && llvm::isAlnum(After
.front())) &&
86 (!Before
.empty() && llvm::isAlnum(Before
.back()));
90 case '\\': // Escaped character.
92 case '`': // Code block or inline code
93 // Any number of backticks can delimit an inline code block that can end
94 // anywhere (including on another line). We must escape them all.
96 case '~': // Code block
97 return StartsLine
&& Before
.empty() && After
.starts_with("~~");
98 case '#': { // ATX heading.
99 if (!StartsLine
|| !Before
.empty())
101 llvm::StringRef Rest
= After
.ltrim(C
);
102 return Rest
.empty() || Rest
.starts_with(" ");
104 case ']': // Link or link reference.
105 // We escape ] rather than [ here, because it's more constrained:
106 // ](...) is an in-line link
107 // ]: is a link reference
108 // The following are only links if the link reference exists:
109 // ] by itself is a shortcut link
110 // ][...] is an out-of-line link
111 // Because we never emit link references, we don't need to handle these.
112 return After
.starts_with(":") || After
.starts_with("(");
113 case '=': // Setex heading.
114 return RulerLength() > 0;
115 case '_': // Horizontal ruler or matched delimiter.
116 if (RulerLength() >= 3)
118 // Not a delimiter if surrounded by space, or inside a word.
119 // (The rules at word boundaries are subtle).
120 return !(SpaceSurrounds() || WordSurrounds());
121 case '-': // Setex heading, horizontal ruler, or bullet.
122 if (RulerLength() > 0)
125 case '+': // Bullet list.
127 case '*': // Bullet list, horizontal ruler, or delimiter.
128 return IsBullet() || RulerLength() >= 3 || !SpaceSurrounds();
129 case '<': // HTML tag (or autolink, which we choose not to escape)
130 return looksLikeTag(After
);
131 case '>': // Quote marker. Needs escaping at start of line.
132 return StartsLine
&& Before
.empty();
133 case '&': { // HTML entity reference
134 auto End
= After
.find(';');
135 if (End
== llvm::StringRef::npos
)
137 llvm::StringRef Content
= After
.substr(0, End
);
138 if (Content
.consume_front("#")) {
139 if (Content
.consume_front("x") || Content
.consume_front("X"))
140 return llvm::all_of(Content
, llvm::isHexDigit
);
141 return llvm::all_of(Content
, llvm::isDigit
);
143 return llvm::all_of(Content
, llvm::isAlpha
);
145 case '.': // Numbered list indicator. Escape 12. -> 12\. at start of line.
147 return StartsLine
&& !Before
.empty() &&
148 llvm::all_of(Before
, llvm::isDigit
) && After
.starts_with(" ");
154 /// Escape a markdown text block. Ensures the punctuation will not introduce
155 /// any of the markdown constructs.
156 std::string
renderText(llvm::StringRef Input
, bool StartsLine
) {
158 for (unsigned I
= 0; I
< Input
.size(); ++I
) {
159 if (needsLeadingEscape(Input
[I
], Input
.substr(0, I
), Input
.substr(I
+ 1),
162 R
.push_back(Input
[I
]);
167 /// Renders \p Input as an inline block of code in markdown. The returned value
168 /// is surrounded by backticks and the inner contents are properly escaped.
169 std::string
renderInlineBlock(llvm::StringRef Input
) {
171 // Double all backticks to make sure we don't close the inline block early.
172 for (size_t From
= 0; From
< Input
.size();) {
173 size_t Next
= Input
.find("`", From
);
174 R
+= Input
.substr(From
, Next
- From
);
175 if (Next
== llvm::StringRef::npos
)
177 R
+= "``"; // double the found backtick.
181 // If results starts with a backtick, add spaces on both sides. The spaces
182 // are ignored by markdown renderers.
183 if (llvm::StringRef(R
).starts_with("`") || llvm::StringRef(R
).ends_with("`"))
184 return "` " + std::move(R
) + " `";
185 // Markdown render should ignore first and last space if both are there. We
186 // add an extra pair of spaces in that case to make sure we render what the
188 if (llvm::StringRef(R
).starts_with(" ") && llvm::StringRef(R
).ends_with(" "))
189 return "` " + std::move(R
) + " `";
190 return "`" + std::move(R
) + "`";
193 /// Get marker required for \p Input to represent a markdown codeblock. It
194 /// consists of at least 3 backticks(`). Although markdown also allows to use
195 /// tilde(~) for code blocks, they are never used.
196 std::string
getMarkerForCodeBlock(llvm::StringRef Input
) {
197 // Count the maximum number of consecutive backticks in \p Input. We need to
198 // start and end the code block with more.
199 unsigned MaxBackticks
= 0;
200 unsigned Backticks
= 0;
201 for (char C
: Input
) {
206 MaxBackticks
= std::max(MaxBackticks
, Backticks
);
209 MaxBackticks
= std::max(Backticks
, MaxBackticks
);
210 // Use the corresponding number of backticks to start and end a code block.
211 return std::string(/*Repeat=*/std::max(3u, MaxBackticks
+ 1), '`');
214 // Trims the input and concatenates whitespace blocks into a single ` `.
215 std::string
canonicalizeSpaces(llvm::StringRef Input
) {
216 llvm::SmallVector
<llvm::StringRef
> Words
;
217 llvm::SplitString(Input
, Words
);
218 return llvm::join(Words
, " ");
221 std::string
renderBlocks(llvm::ArrayRef
<std::unique_ptr
<Block
>> Children
,
222 void (Block::*RenderFunc
)(llvm::raw_ostream
&) const) {
224 llvm::raw_string_ostream
OS(R
);
227 Children
= Children
.drop_while(
228 [](const std::unique_ptr
<Block
> &C
) { return C
->isRuler(); });
229 auto Last
= llvm::find_if(
230 llvm::reverse(Children
),
231 [](const std::unique_ptr
<Block
> &C
) { return !C
->isRuler(); });
232 Children
= Children
.drop_back(Children
.end() - Last
.base());
234 bool LastBlockWasRuler
= true;
235 for (const auto &C
: Children
) {
236 if (C
->isRuler() && LastBlockWasRuler
)
238 LastBlockWasRuler
= C
->isRuler();
239 ((*C
).*RenderFunc
)(OS
);
242 // Get rid of redundant empty lines introduced in plaintext while imitating
243 // padding in markdown.
244 std::string AdjustedResult
;
245 llvm::StringRef
TrimmedText(OS
.str());
246 TrimmedText
= TrimmedText
.trim();
248 llvm::copy_if(TrimmedText
, std::back_inserter(AdjustedResult
),
249 [&TrimmedText
](const char &C
) {
250 return !llvm::StringRef(TrimmedText
.data(),
251 &C
- TrimmedText
.data() + 1)
252 // We allow at most two newlines.
253 .ends_with("\n\n\n");
256 return AdjustedResult
;
259 // Separates two blocks with extra spacing. Note that it might render strangely
260 // in vscode if the trailing block is a codeblock, see
261 // https://github.com/microsoft/vscode/issues/88416 for details.
262 class Ruler
: public Block
{
264 void renderMarkdown(llvm::raw_ostream
&OS
) const override
{
265 // Note that we need an extra new line before the ruler, otherwise we might
266 // make previous block a title instead of introducing a ruler.
269 void renderPlainText(llvm::raw_ostream
&OS
) const override
{ OS
<< '\n'; }
270 std::unique_ptr
<Block
> clone() const override
{
271 return std::make_unique
<Ruler
>(*this);
273 bool isRuler() const override
{ return true; }
276 class CodeBlock
: public Block
{
278 void renderMarkdown(llvm::raw_ostream
&OS
) const override
{
279 std::string Marker
= getMarkerForCodeBlock(Contents
);
280 // No need to pad from previous blocks, as they should end with a new line.
281 OS
<< Marker
<< Language
<< '\n' << Contents
<< '\n' << Marker
<< '\n';
284 void renderPlainText(llvm::raw_ostream
&OS
) const override
{
285 // In plaintext we want one empty line before and after codeblocks.
286 OS
<< '\n' << Contents
<< "\n\n";
289 std::unique_ptr
<Block
> clone() const override
{
290 return std::make_unique
<CodeBlock
>(*this);
293 CodeBlock(std::string Contents
, std::string Language
)
294 : Contents(std::move(Contents
)), Language(std::move(Language
)) {}
297 std::string Contents
;
298 std::string Language
;
301 // Inserts two spaces after each `\n` to indent each line. First line is not
303 std::string
indentLines(llvm::StringRef Input
) {
304 assert(!Input
.ends_with("\n") && "Input should've been trimmed.");
305 std::string IndentedR
;
306 // We'll add 2 spaces after each new line.
307 IndentedR
.reserve(Input
.size() + Input
.count('\n') * 2);
308 for (char C
: Input
) {
311 IndentedR
.append(" ");
316 class Heading
: public Paragraph
{
318 Heading(size_t Level
) : Level(Level
) {}
319 void renderMarkdown(llvm::raw_ostream
&OS
) const override
{
320 OS
<< std::string(Level
, '#') << ' ';
321 Paragraph::renderMarkdown(OS
);
330 std::string
Block::asMarkdown() const {
332 llvm::raw_string_ostream
OS(R
);
334 return llvm::StringRef(OS
.str()).trim().str();
337 std::string
Block::asPlainText() const {
339 llvm::raw_string_ostream
OS(R
);
341 return llvm::StringRef(OS
.str()).trim().str();
344 void Paragraph::renderMarkdown(llvm::raw_ostream
&OS
) const {
345 bool NeedsSpace
= false;
346 bool HasChunks
= false;
347 for (auto &C
: Chunks
) {
348 if (C
.SpaceBefore
|| NeedsSpace
)
351 case Chunk::PlainText
:
352 OS
<< renderText(C
.Contents
, !HasChunks
);
354 case Chunk::InlineCode
:
355 OS
<< renderInlineBlock(C
.Contents
);
359 NeedsSpace
= C
.SpaceAfter
;
361 // Paragraphs are translated into markdown lines, not markdown paragraphs.
362 // Therefore it only has a single linebreak afterwards.
363 // VSCode requires two spaces at the end of line to start a new one.
367 std::unique_ptr
<Block
> Paragraph::clone() const {
368 return std::make_unique
<Paragraph
>(*this);
371 /// Choose a marker to delimit `Text` from a prioritized list of options.
372 /// This is more readable than escaping for plain-text.
373 llvm::StringRef
chooseMarker(llvm::ArrayRef
<llvm::StringRef
> Options
,
374 llvm::StringRef Text
) {
375 // Prefer a delimiter whose characters don't appear in the text.
376 for (llvm::StringRef S
: Options
)
377 if (Text
.find_first_of(S
) == llvm::StringRef::npos
)
379 return Options
.front();
382 void Paragraph::renderPlainText(llvm::raw_ostream
&OS
) const {
383 bool NeedsSpace
= false;
384 for (auto &C
: Chunks
) {
385 if (C
.SpaceBefore
|| NeedsSpace
)
387 llvm::StringRef Marker
= "";
388 if (C
.Preserve
&& C
.Kind
== Chunk::InlineCode
)
389 Marker
= chooseMarker({"`", "'", "\""}, C
.Contents
);
390 OS
<< Marker
<< C
.Contents
<< Marker
;
391 NeedsSpace
= C
.SpaceAfter
;
396 BulletList::BulletList() = default;
397 BulletList::~BulletList() = default;
399 void BulletList::renderMarkdown(llvm::raw_ostream
&OS
) const {
400 for (auto &D
: Items
) {
401 // Instead of doing this we might prefer passing Indent to children to get
402 // rid of the copies, if it turns out to be a bottleneck.
403 OS
<< "- " << indentLines(D
.asMarkdown()) << '\n';
405 // We need a new line after list to terminate it in markdown.
409 void BulletList::renderPlainText(llvm::raw_ostream
&OS
) const {
410 for (auto &D
: Items
) {
411 // Instead of doing this we might prefer passing Indent to children to get
412 // rid of the copies, if it turns out to be a bottleneck.
413 OS
<< "- " << indentLines(D
.asPlainText()) << '\n';
417 Paragraph
&Paragraph::appendSpace() {
419 Chunks
.back().SpaceAfter
= true;
423 Paragraph
&Paragraph::appendText(llvm::StringRef Text
) {
424 std::string Norm
= canonicalizeSpaces(Text
);
427 Chunks
.emplace_back();
428 Chunk
&C
= Chunks
.back();
429 C
.Contents
= std::move(Norm
);
430 C
.Kind
= Chunk::PlainText
;
431 C
.SpaceBefore
= llvm::isSpace(Text
.front());
432 C
.SpaceAfter
= llvm::isSpace(Text
.back());
436 Paragraph
&Paragraph::appendCode(llvm::StringRef Code
, bool Preserve
) {
438 !Chunks
.empty() && Chunks
.back().Kind
== Chunk::InlineCode
;
439 std::string Norm
= canonicalizeSpaces(std::move(Code
));
442 Chunks
.emplace_back();
443 Chunk
&C
= Chunks
.back();
444 C
.Contents
= std::move(Norm
);
445 C
.Kind
= Chunk::InlineCode
;
446 C
.Preserve
= Preserve
;
447 // Disallow adjacent code spans without spaces, markdown can't render them.
448 C
.SpaceBefore
= AdjacentCode
;
452 std::unique_ptr
<Block
> BulletList::clone() const {
453 return std::make_unique
<BulletList
>(*this);
456 class Document
&BulletList::addItem() {
457 Items
.emplace_back();
461 Document
&Document::operator=(const Document
&Other
) {
463 for (const auto &C
: Other
.Children
)
464 Children
.push_back(C
->clone());
468 void Document::append(Document Other
) {
469 std::move(Other
.Children
.begin(), Other
.Children
.end(),
470 std::back_inserter(Children
));
473 Paragraph
&Document::addParagraph() {
474 Children
.push_back(std::make_unique
<Paragraph
>());
475 return *static_cast<Paragraph
*>(Children
.back().get());
478 void Document::addRuler() { Children
.push_back(std::make_unique
<Ruler
>()); }
480 void Document::addCodeBlock(std::string Code
, std::string Language
) {
481 Children
.emplace_back(
482 std::make_unique
<CodeBlock
>(std::move(Code
), std::move(Language
)));
485 std::string
Document::asMarkdown() const {
486 return renderBlocks(Children
, &Block::renderMarkdown
);
489 std::string
Document::asPlainText() const {
490 return renderBlocks(Children
, &Block::renderPlainText
);
493 BulletList
&Document::addBulletList() {
494 Children
.emplace_back(std::make_unique
<BulletList
>());
495 return *static_cast<BulletList
*>(Children
.back().get());
498 Paragraph
&Document::addHeading(size_t Level
) {
500 Children
.emplace_back(std::make_unique
<Heading
>(Level
));
501 return *static_cast<Paragraph
*>(Children
.back().get());
503 } // namespace markup
504 } // namespace clangd