clang-tools-extra/clangd/support/Lex.cpp

   1 //===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #include "Token.h"
  10 #include "clang/Basic/IdentifierTable.h"
  11 #include "clang/Basic/SourceLocation.h"
  12 #include "clang/Basic/TokenKinds.h"
  13 #include "clang/Lex/Lexer.h"
  14 #include "clang/Lex/LiteralSupport.h"
  15
  16 namespace clang {
  17 namespace clangd {
  18
  19 TokenStream lex(const std::string &Code, const clang::LangOptions &LangOpts) {
  20   clang::SourceLocation Start;
  21   // Tokenize using clang's lexer in raw mode.
  22   // std::string guarantees null-termination, which the lexer needs.
  23   clang::Lexer Lexer(Start, LangOpts, Code.data(), Code.data(),
  24                      Code.data() + Code.size());
  25   Lexer.SetCommentRetentionState(true);
  26
  27   TokenStream Result;
  28   clang::Token CT;
  29   // Index into the token stream of original source code.
  30   Token::Index TokenIndex = 0;
  31   unsigned LastOffset = 0;
  32   unsigned Line = 0;
  33   unsigned Indent = 0;
  34   for (Lexer.LexFromRawLexer(CT); CT.getKind() != clang::tok::eof;
  35        Lexer.LexFromRawLexer(CT)) {
  36     unsigned Offset =
  37         CT.getLocation().getRawEncoding() - Start.getRawEncoding();
  38
  39     Token Tok;
  40     Tok.Data = &Code[Offset];
  41     Tok.Length = CT.getLength();
  42     Tok.Kind = CT.getKind();
  43
  44     // Update current line number and indentation from raw source code.
  45     unsigned NewLineStart = 0;
  46     for (unsigned I = LastOffset; I < Offset; ++I) {
  47       if (Code[I] == '\n') {
  48         NewLineStart = I + 1;
  49         ++Line;
  50       }
  51     }
  52     if (NewLineStart || !LastOffset) {
  53       Indent = 0;
  54       for (char C : StringRef(Code).slice(NewLineStart, Offset)) {
  55         if (C == ' ')
  56           ++Indent;
  57         else if (C == '\t')
  58           Indent += 8;
  59         else
  60           break;
  61       }
  62     }
  63     Tok.Indent = Indent;
  64     Tok.Line = Line;
  65
  66     if (CT.isAtStartOfLine())
  67       Tok.setFlag(LexFlags::StartsPPLine);
  68     if (CT.needsCleaning() || CT.hasUCN())
  69       Tok.setFlag(LexFlags::NeedsCleaning);
  70
  71     Tok.OriginalIndex = TokenIndex++;
  72     Result.push(Tok);
  73     LastOffset = Offset;
  74   }
  75   Result.finalize();
  76   return Result;
  77 }
  78
  79 TokenStream cook(const TokenStream &Code, const LangOptions &LangOpts) {
  80   auto CleanedStorage = std::make_shared<llvm::BumpPtrAllocator>();
  81   clang::IdentifierTable Identifiers(LangOpts);
  82   TokenStream Result(CleanedStorage);
  83   Result.addPayload(Code.getPayload());
  84   for (auto Tok : Code.tokens()) {
  85     if (Tok.flag(LexFlags::NeedsCleaning)) {
  86       // Remove escaped newlines and trigraphs.
  87       llvm::SmallString<64> CleanBuffer;
  88       const char *Pos = Tok.text().begin();
  89       while (Pos < Tok.text().end()) {
  90         auto [Char, CharSize] =
  91             clang::Lexer::getCharAndSizeNoWarn(Pos, LangOpts);
  92         CleanBuffer.push_back(Char);
  93         assert(CharSize != 0 && "no progress!");
  94         Pos += CharSize;
  95       }
  96       llvm::StringRef Text = CleanBuffer;
  97       llvm::SmallString<64> UCNBuffer;
  98       // A surface reading of the standard suggests UCNs might appear anywhere.
  99       // But we need only decode them in raw_identifiers.
 100       //  - they cannot appear in punctuation/keyword tokens, because UCNs
 101       //    cannot encode basic characters outside of literals [lex.charset]
 102       //  - they can appear in literals, but we need not unescape them now.
 103       //    We treat them as escape sequences when evaluating the literal.
 104       //  - comments are handled similarly to literals
 105       // This is good fortune, because expandUCNs requires its input to be a
 106       // reasonably valid identifier (e.g. without stray backslashes).
 107       if (Tok.Kind == tok::raw_identifier) {
 108         clang::expandUCNs(UCNBuffer, CleanBuffer);
 109         Text = UCNBuffer;
 110       }
 111
 112       Tok.Data = Text.copy(*CleanedStorage).data();
 113       Tok.Length = Text.size();
 114       Tok.Flags &= ~static_cast<decltype(Tok.Flags)>(LexFlags::NeedsCleaning);
 115     }
 116
 117     if (Tok.Kind == tok::raw_identifier) {
 118       // Cook raw_identifiers into identifier, keyword, etc.
 119       Tok.Kind = Identifiers.get(Tok.text()).getTokenID();
 120     } else if (Tok.Kind == tok::greatergreater) {
 121       // Split the greatergreater token.
 122       // FIXME: split lessless token to support Cuda triple angle brackets <<<.
 123       assert(Tok.text() == ">>");
 124       Tok.Kind = tok::greater;
 125       Tok.Length = 1;
 126       Result.push(Tok);
 127       // Line is wrong if the first greater is followed by an escaped newline!
 128       Tok.Data = Tok.text().data() + 1;
 129     }
 130
 131     Result.push(std::move(Tok));
 132   }
 133
 134   Result.finalize();
 135   return Result;
 136 }
 137
 138 } // namespace clangd
 139 } // namespace clang