clang-tools-extra/clangd/support/Markup.cpp

   1 //===--- Markup.cpp -----------------------------------------*- C++-*------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 #include "support/Markup.h"
   9 #include "llvm/ADT/ArrayRef.h"
  10 #include "llvm/ADT/STLExtras.h"
  11 #include "llvm/ADT/SmallVector.h"
  12 #include "llvm/ADT/StringExtras.h"
  13 #include "llvm/ADT/StringRef.h"
  14 #include "llvm/Support/Compiler.h"
  15 #include "llvm/Support/raw_ostream.h"
  16 #include <cstddef>
  17 #include <iterator>
  18 #include <memory>
  19 #include <string>
  20 #include <vector>
  21
  22 namespace clang {
  23 namespace clangd {
  24 namespace markup {
  25 namespace {
  26
  27 // Is <contents a plausible start to an HTML tag?
  28 // Contents may not be the rest of the line, but it's the rest of the plain
  29 // text, so we expect to see at least the tag name.
  30 bool looksLikeTag(llvm::StringRef Contents) {
  31   if (Contents.empty())
  32     return false;
  33   if (Contents.front() == '!' || Contents.front() == '?' ||
  34       Contents.front() == '/')
  35     return true;
  36   // Check the start of the tag name.
  37   if (!llvm::isAlpha(Contents.front()))
  38     return false;
  39   // Drop rest of the tag name, and following whitespace.
  40   Contents = Contents
  41                  .drop_while([](char C) {
  42                    return llvm::isAlnum(C) || C == '-' || C == '_' || C == ':';
  43                  })
  44                  .drop_while(llvm::isSpace);
  45   // The rest of the tag consists of attributes, which have restrictive names.
  46   // If we hit '=', all bets are off (attribute values can contain anything).
  47   for (; !Contents.empty(); Contents = Contents.drop_front()) {
  48     if (llvm::isAlnum(Contents.front()) || llvm::isSpace(Contents.front()))
  49       continue;
  50     if (Contents.front() == '>' || Contents.starts_with("/>"))
  51       return true; // May close the tag.
  52     if (Contents.front() == '=')
  53       return true; // Don't try to parse attribute values.
  54     return false;  // Random punctuation means this isn't a tag.
  55   }
  56   return true; // Potentially incomplete tag.
  57 }
  58
  59 // Tests whether C should be backslash-escaped in markdown.
  60 // The string being escaped is Before + C + After. This is part of a paragraph.
  61 // StartsLine indicates whether `Before` is the start of the line.
  62 // After may not be everything until the end of the line.
  63 //
  64 // It's always safe to escape punctuation, but want minimal escaping.
  65 // The strategy is to escape the first character of anything that might start
  66 // a markdown grammar construct.
  67 bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After,
  68                         bool StartsLine) {
  69   assert(Before.take_while(llvm::isSpace).empty());
  70   auto RulerLength = [&]() -> /*Length*/ unsigned {
  71     if (!StartsLine || !Before.empty())
  72       return false;
  73     llvm::StringRef A = After.rtrim();
  74     return llvm::all_of(A, [C](char D) { return C == D; }) ? 1 + A.size() : 0;
  75   };
  76   auto IsBullet = [&]() {
  77     return StartsLine && Before.empty() &&
  78            (After.empty() || After.starts_with(" "));
  79   };
  80   auto SpaceSurrounds = [&]() {
  81     return (After.empty() || llvm::isSpace(After.front())) &&
  82            (Before.empty() || llvm::isSpace(Before.back()));
  83   };
  84   auto WordSurrounds = [&]() {
  85     return (!After.empty() && llvm::isAlnum(After.front())) &&
  86            (!Before.empty() && llvm::isAlnum(Before.back()));
  87   };
  88
  89   switch (C) {
  90   case '\\': // Escaped character.
  91     return true;
  92   case '`': // Code block or inline code
  93     // Any number of backticks can delimit an inline code block that can end
  94     // anywhere (including on another line). We must escape them all.
  95     return true;
  96   case '~': // Code block
  97     return StartsLine && Before.empty() && After.starts_with("~~");
  98   case '#': { // ATX heading.
  99     if (!StartsLine || !Before.empty())
 100       return false;
 101     llvm::StringRef Rest = After.ltrim(C);
 102     return Rest.empty() || Rest.starts_with(" ");
 103   }
 104   case ']': // Link or link reference.
 105     // We escape ] rather than [ here, because it's more constrained:
 106     //   ](...) is an in-line link
 107     //   ]: is a link reference
 108     // The following are only links if the link reference exists:
 109     //   ] by itself is a shortcut link
 110     //   ][...] is an out-of-line link
 111     // Because we never emit link references, we don't need to handle these.
 112     return After.starts_with(":") || After.starts_with("(");
 113   case '=': // Setex heading.
 114     return RulerLength() > 0;
 115   case '_': // Horizontal ruler or matched delimiter.
 116     if (RulerLength() >= 3)
 117       return true;
 118     // Not a delimiter if surrounded by space, or inside a word.
 119     // (The rules at word boundaries are subtle).
 120     return !(SpaceSurrounds() || WordSurrounds());
 121   case '-': // Setex heading, horizontal ruler, or bullet.
 122     if (RulerLength() > 0)
 123       return true;
 124     return IsBullet();
 125   case '+': // Bullet list.
 126     return IsBullet();
 127   case '*': // Bullet list, horizontal ruler, or delimiter.
 128     return IsBullet() || RulerLength() >= 3 || !SpaceSurrounds();
 129   case '<': // HTML tag (or autolink, which we choose not to escape)
 130     return looksLikeTag(After);
 131   case '>': // Quote marker. Needs escaping at start of line.
 132     return StartsLine && Before.empty();
 133   case '&': { // HTML entity reference
 134     auto End = After.find(';');
 135     if (End == llvm::StringRef::npos)
 136       return false;
 137     llvm::StringRef Content = After.substr(0, End);
 138     if (Content.consume_front("#")) {
 139       if (Content.consume_front("x") || Content.consume_front("X"))
 140         return llvm::all_of(Content, llvm::isHexDigit);
 141       return llvm::all_of(Content, llvm::isDigit);
 142     }
 143     return llvm::all_of(Content, llvm::isAlpha);
 144   }
 145   case '.': // Numbered list indicator. Escape 12. -> 12\. at start of line.
 146   case ')':
 147     return StartsLine && !Before.empty() &&
 148            llvm::all_of(Before, llvm::isDigit) && After.starts_with(" ");
 149   default:
 150     return false;
 151   }
 152 }
 153
 154 /// Escape a markdown text block. Ensures the punctuation will not introduce
 155 /// any of the markdown constructs.
 156 std::string renderText(llvm::StringRef Input, bool StartsLine) {
 157   std::string R;
 158   for (unsigned I = 0; I < Input.size(); ++I) {
 159     if (needsLeadingEscape(Input[I], Input.substr(0, I), Input.substr(I + 1),
 160                            StartsLine))
 161       R.push_back('\\');
 162     R.push_back(Input[I]);
 163   }
 164   return R;
 165 }
 166
 167 /// Renders \p Input as an inline block of code in markdown. The returned value
 168 /// is surrounded by backticks and the inner contents are properly escaped.
 169 std::string renderInlineBlock(llvm::StringRef Input) {
 170   std::string R;
 171   // Double all backticks to make sure we don't close the inline block early.
 172   for (size_t From = 0; From < Input.size();) {
 173     size_t Next = Input.find("`", From);
 174     R += Input.substr(From, Next - From);
 175     if (Next == llvm::StringRef::npos)
 176       break;
 177     R += "``"; // double the found backtick.
 178
 179     From = Next + 1;
 180   }
 181   // If results starts with a backtick, add spaces on both sides. The spaces
 182   // are ignored by markdown renderers.
 183   if (llvm::StringRef(R).starts_with("`") || llvm::StringRef(R).ends_with("`"))
 184     return "` " + std::move(R) + " `";
 185   // Markdown render should ignore first and last space if both are there. We
 186   // add an extra pair of spaces in that case to make sure we render what the
 187   // user intended.
 188   if (llvm::StringRef(R).starts_with(" ") && llvm::StringRef(R).ends_with(" "))
 189     return "` " + std::move(R) + " `";
 190   return "`" + std::move(R) + "`";
 191 }
 192
 193 /// Get marker required for \p Input to represent a markdown codeblock. It
 194 /// consists of at least 3 backticks(`). Although markdown also allows to use
 195 /// tilde(~) for code blocks, they are never used.
 196 std::string getMarkerForCodeBlock(llvm::StringRef Input) {
 197   // Count the maximum number of consecutive backticks in \p Input. We need to
 198   // start and end the code block with more.
 199   unsigned MaxBackticks = 0;
 200   unsigned Backticks = 0;
 201   for (char C : Input) {
 202     if (C == '`') {
 203       ++Backticks;
 204       continue;
 205     }
 206     MaxBackticks = std::max(MaxBackticks, Backticks);
 207     Backticks = 0;
 208   }
 209   MaxBackticks = std::max(Backticks, MaxBackticks);
 210   // Use the corresponding number of backticks to start and end a code block.
 211   return std::string(/*Repeat=*/std::max(3u, MaxBackticks + 1), '`');
 212 }
 213
 214 // Trims the input and concatenates whitespace blocks into a single ` `.
 215 std::string canonicalizeSpaces(llvm::StringRef Input) {
 216   llvm::SmallVector<llvm::StringRef> Words;
 217   llvm::SplitString(Input, Words);
 218   return llvm::join(Words, " ");
 219 }
 220
 221 std::string renderBlocks(llvm::ArrayRef<std::unique_ptr<Block>> Children,
 222                          void (Block::*RenderFunc)(llvm::raw_ostream &) const) {
 223   std::string R;
 224   llvm::raw_string_ostream OS(R);
 225
 226   // Trim rulers.
 227   Children = Children.drop_while(
 228       [](const std::unique_ptr<Block> &C) { return C->isRuler(); });
 229   auto Last = llvm::find_if(
 230       llvm::reverse(Children),
 231       [](const std::unique_ptr<Block> &C) { return !C->isRuler(); });
 232   Children = Children.drop_back(Children.end() - Last.base());
 233
 234   bool LastBlockWasRuler = true;
 235   for (const auto &C : Children) {
 236     if (C->isRuler() && LastBlockWasRuler)
 237       continue;
 238     LastBlockWasRuler = C->isRuler();
 239     ((*C).*RenderFunc)(OS);
 240   }
 241
 242   // Get rid of redundant empty lines introduced in plaintext while imitating
 243   // padding in markdown.
 244   std::string AdjustedResult;
 245   llvm::StringRef TrimmedText(OS.str());
 246   TrimmedText = TrimmedText.trim();
 247
 248   llvm::copy_if(TrimmedText, std::back_inserter(AdjustedResult),
 249                 [&TrimmedText](const char &C) {
 250                   return !llvm::StringRef(TrimmedText.data(),
 251                                           &C - TrimmedText.data() + 1)
 252                               // We allow at most two newlines.
 253                               .ends_with("\n\n\n");
 254                 });
 255
 256   return AdjustedResult;
 257 }
 258
 259 // Separates two blocks with extra spacing. Note that it might render strangely
 260 // in vscode if the trailing block is a codeblock, see
 261 // https://github.com/microsoft/vscode/issues/88416 for details.
 262 class Ruler : public Block {
 263 public:
 264   void renderMarkdown(llvm::raw_ostream &OS) const override {
 265     // Note that we need an extra new line before the ruler, otherwise we might
 266     // make previous block a title instead of introducing a ruler.
 267     OS << "\n---\n";
 268   }
 269   void renderPlainText(llvm::raw_ostream &OS) const override { OS << '\n'; }
 270   std::unique_ptr<Block> clone() const override {
 271     return std::make_unique<Ruler>(*this);
 272   }
 273   bool isRuler() const override { return true; }
 274 };
 275
 276 class CodeBlock : public Block {
 277 public:
 278   void renderMarkdown(llvm::raw_ostream &OS) const override {
 279     std::string Marker = getMarkerForCodeBlock(Contents);
 280     // No need to pad from previous blocks, as they should end with a new line.
 281     OS << Marker << Language << '\n' << Contents << '\n' << Marker << '\n';
 282   }
 283
 284   void renderPlainText(llvm::raw_ostream &OS) const override {
 285     // In plaintext we want one empty line before and after codeblocks.
 286     OS << '\n' << Contents << "\n\n";
 287   }
 288
 289   std::unique_ptr<Block> clone() const override {
 290     return std::make_unique<CodeBlock>(*this);
 291   }
 292
 293   CodeBlock(std::string Contents, std::string Language)
 294       : Contents(std::move(Contents)), Language(std::move(Language)) {}
 295
 296 private:
 297   std::string Contents;
 298   std::string Language;
 299 };
 300
 301 // Inserts two spaces after each `\n` to indent each line. First line is not
 302 // indented.
 303 std::string indentLines(llvm::StringRef Input) {
 304   assert(!Input.ends_with("\n") && "Input should've been trimmed.");
 305   std::string IndentedR;
 306   // We'll add 2 spaces after each new line.
 307   IndentedR.reserve(Input.size() + Input.count('\n') * 2);
 308   for (char C : Input) {
 309     IndentedR += C;
 310     if (C == '\n')
 311       IndentedR.append("  ");
 312   }
 313   return IndentedR;
 314 }
 315
 316 class Heading : public Paragraph {
 317 public:
 318   Heading(size_t Level) : Level(Level) {}
 319   void renderMarkdown(llvm::raw_ostream &OS) const override {
 320     OS << std::string(Level, '#') << ' ';
 321     Paragraph::renderMarkdown(OS);
 322   }
 323
 324 private:
 325   size_t Level;
 326 };
 327
 328 } // namespace
 329
 330 std::string Block::asMarkdown() const {
 331   std::string R;
 332   llvm::raw_string_ostream OS(R);
 333   renderMarkdown(OS);
 334   return llvm::StringRef(OS.str()).trim().str();
 335 }
 336
 337 std::string Block::asPlainText() const {
 338   std::string R;
 339   llvm::raw_string_ostream OS(R);
 340   renderPlainText(OS);
 341   return llvm::StringRef(OS.str()).trim().str();
 342 }
 343
 344 void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const {
 345   bool NeedsSpace = false;
 346   bool HasChunks = false;
 347   for (auto &C : Chunks) {
 348     if (C.SpaceBefore || NeedsSpace)
 349       OS << " ";
 350     switch (C.Kind) {
 351     case Chunk::PlainText:
 352       OS << renderText(C.Contents, !HasChunks);
 353       break;
 354     case Chunk::InlineCode:
 355       OS << renderInlineBlock(C.Contents);
 356       break;
 357     }
 358     HasChunks = true;
 359     NeedsSpace = C.SpaceAfter;
 360   }
 361   // Paragraphs are translated into markdown lines, not markdown paragraphs.
 362   // Therefore it only has a single linebreak afterwards.
 363   // VSCode requires two spaces at the end of line to start a new one.
 364   OS << "  \n";
 365 }
 366
 367 std::unique_ptr<Block> Paragraph::clone() const {
 368   return std::make_unique<Paragraph>(*this);
 369 }
 370
 371 /// Choose a marker to delimit `Text` from a prioritized list of options.
 372 /// This is more readable than escaping for plain-text.
 373 llvm::StringRef chooseMarker(llvm::ArrayRef<llvm::StringRef> Options,
 374                              llvm::StringRef Text) {
 375   // Prefer a delimiter whose characters don't appear in the text.
 376   for (llvm::StringRef S : Options)
 377     if (Text.find_first_of(S) == llvm::StringRef::npos)
 378       return S;
 379   return Options.front();
 380 }
 381
 382 void Paragraph::renderPlainText(llvm::raw_ostream &OS) const {
 383   bool NeedsSpace = false;
 384   for (auto &C : Chunks) {
 385     if (C.SpaceBefore || NeedsSpace)
 386       OS << " ";
 387     llvm::StringRef Marker = "";
 388     if (C.Preserve && C.Kind == Chunk::InlineCode)
 389       Marker = chooseMarker({"`", "'", "\""}, C.Contents);
 390     OS << Marker << C.Contents << Marker;
 391     NeedsSpace = C.SpaceAfter;
 392   }
 393   OS << '\n';
 394 }
 395
// Defaulted constructor and destructor, defined out of line.
// NOTE(review): presumably these live here rather than in the header because
// the element type of Items is incomplete there -- confirm against Markup.h.
BulletList::BulletList() = default;
BulletList::~BulletList() = default;
 398
 399 void BulletList::renderMarkdown(llvm::raw_ostream &OS) const {
 400   for (auto &D : Items) {
 401     // Instead of doing this we might prefer passing Indent to children to get
 402     // rid of the copies, if it turns out to be a bottleneck.
 403     OS << "- " << indentLines(D.asMarkdown()) << '\n';
 404   }
 405   // We need a new line after list to terminate it in markdown.
 406   OS << '\n';
 407 }
 408
 409 void BulletList::renderPlainText(llvm::raw_ostream &OS) const {
 410   for (auto &D : Items) {
 411     // Instead of doing this we might prefer passing Indent to children to get
 412     // rid of the copies, if it turns out to be a bottleneck.
 413     OS << "- " << indentLines(D.asPlainText()) << '\n';
 414   }
 415 }
 416
 417 Paragraph &Paragraph::appendSpace() {
 418   if (!Chunks.empty())
 419     Chunks.back().SpaceAfter = true;
 420   return *this;
 421 }
 422
 423 Paragraph &Paragraph::appendText(llvm::StringRef Text) {
 424   std::string Norm = canonicalizeSpaces(Text);
 425   if (Norm.empty())
 426     return *this;
 427   Chunks.emplace_back();
 428   Chunk &C = Chunks.back();
 429   C.Contents = std::move(Norm);
 430   C.Kind = Chunk::PlainText;
 431   C.SpaceBefore = llvm::isSpace(Text.front());
 432   C.SpaceAfter = llvm::isSpace(Text.back());
 433   return *this;
 434 }
 435
 436 Paragraph &Paragraph::appendCode(llvm::StringRef Code, bool Preserve) {
 437   bool AdjacentCode =
 438       !Chunks.empty() && Chunks.back().Kind == Chunk::InlineCode;
 439   std::string Norm = canonicalizeSpaces(std::move(Code));
 440   if (Norm.empty())
 441     return *this;
 442   Chunks.emplace_back();
 443   Chunk &C = Chunks.back();
 444   C.Contents = std::move(Norm);
 445   C.Kind = Chunk::InlineCode;
 446   C.Preserve = Preserve;
 447   // Disallow adjacent code spans without spaces, markdown can't render them.
 448   C.SpaceBefore = AdjacentCode;
 449   return *this;
 450 }
 451
 452 std::unique_ptr<Block> BulletList::clone() const {
 453   return std::make_unique<BulletList>(*this);
 454 }
 455
 456 class Document &BulletList::addItem() {
 457   Items.emplace_back();
 458   return Items.back();
 459 }
 460
 461 Document &Document::operator=(const Document &Other) {
 462   Children.clear();
 463   for (const auto &C : Other.Children)
 464     Children.push_back(C->clone());
 465   return *this;
 466 }
 467
 468 void Document::append(Document Other) {
 469   std::move(Other.Children.begin(), Other.Children.end(),
 470             std::back_inserter(Children));
 471 }
 472
 473 Paragraph &Document::addParagraph() {
 474   Children.push_back(std::make_unique<Paragraph>());
 475   return *static_cast<Paragraph *>(Children.back().get());
 476 }
 477
 478 void Document::addRuler() { Children.push_back(std::make_unique<Ruler>()); }
 479
 480 void Document::addCodeBlock(std::string Code, std::string Language) {
 481   Children.emplace_back(
 482       std::make_unique<CodeBlock>(std::move(Code), std::move(Language)));
 483 }
 484
 485 std::string Document::asMarkdown() const {
 486   return renderBlocks(Children, &Block::renderMarkdown);
 487 }
 488
 489 std::string Document::asPlainText() const {
 490   return renderBlocks(Children, &Block::renderPlainText);
 491 }
 492
 493 BulletList &Document::addBulletList() {
 494   Children.emplace_back(std::make_unique<BulletList>());
 495   return *static_cast<BulletList *>(Children.back().get());
 496 }
 497
 498 Paragraph &Document::addHeading(size_t Level) {
 499   assert(Level > 0);
 500   Children.emplace_back(std::make_unique<Heading>(Level));
 501   return *static_cast<Paragraph *>(Children.back().get());
 502 }
 503 } // namespace markup
 504 } // namespace clangd
 505 } // namespace clang