1 //===--- Lex.cpp - extract token stream from source code ---------*- C++-*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 #include "clang/Basic/IdentifierTable.h"
11 #include "clang/Basic/SourceLocation.h"
12 #include "clang/Basic/TokenKinds.h"
13 #include "clang/Lex/Lexer.h"
14 #include "clang/Lex/LiteralSupport.h"
// Runs clang's raw lexer over Code and fills in one Token per lexed token:
// a pointer into Code, the length, the clang token kind, lexer flags and the
// token's index in the original stream.
//
// Code is the buffer handed to the lexer; LangOpts configures the lexer.
//
// NOTE(review): this listing is incomplete - the embedded listing numbers
// skip, and the declarations of CT, Offset and Tok as well as the closing
// braces and the return are not visible here. Confirm against the full file
// before relying on the comments below.
19 TokenStream
lex(const std::string
&Code
, const clang::LangOptions
&LangOpts
) {
// Default-constructed location; token offsets below are computed relative to
// its raw encoding.
20 clang::SourceLocation Start
;
21 // Tokenize using clang's lexer in raw mode.
22 // std::string guarantees null-termination, which the lexer needs.
// The lexer is given Code.data() both as buffer start and as the initial
// position, and Code.data() + Code.size() as the buffer end.
23 clang::Lexer
Lexer(Start
, LangOpts
, Code
.data(), Code
.data(),
24 Code
.data() + Code
.size());
// Ask the raw lexer to return comments as tokens rather than dropping them.
25 Lexer
.SetCommentRetentionState(true);
29 // Index into the token stream of original source code.
30 Token::Index TokenIndex
= 0;
// Lower bound of the newline scan further down; starts at the buffer start.
31 unsigned LastOffset
= 0;
// Lex one raw token at a time until the end-of-file token is produced.
34 for (Lexer
.LexFromRawLexer(CT
); CT
.getKind() != clang::tok::eof
;
35 Lexer
.LexFromRawLexer(CT
)) {
// Distance of the token's location from Start, in raw-encoding units.
// NOTE(review): presumably the initializer of Offset; the left-hand side is
// not visible in this listing.
37 CT
.getLocation().getRawEncoding() - Start
.getRawEncoding();
// The token's text points directly into Code; nothing is copied here.
40 Tok
.Data
= &Code
[Offset
];
41 Tok
.Length
= CT
.getLength();
42 Tok
.Kind
= CT
.getKind();
44 // Update current line number and indentation from raw source code.
45 unsigned NewLineStart
= 0;
// Scan Code[LastOffset, Offset) for newline characters.
// NOTE(review): the body of the newline branch is not visible here.
46 for (unsigned I
= LastOffset
; I
< Offset
; ++I
) {
47 if (Code
[I
] == '\n') {
// Taken when NewLineStart is nonzero or LastOffset is zero; walks the
// characters of Code from NewLineStart up to Offset.
// NOTE(review): what is done with each character is not visible here.
52 if (NewLineStart
|| !LastOffset
) {
54 for (char C
: StringRef(Code
).slice(NewLineStart
, Offset
)) {
// Carry clang's start-of-line bit over as the StartsPPLine flag.
66 if (CT
.isAtStartOfLine())
67 Tok
.setFlag(LexFlags::StartsPPLine
);
// Tokens that clang reports as needing cleaning, or as containing a UCN, are
// flagged NeedsCleaning.
68 if (CT
.needsCleaning() || CT
.hasUCN())
69 Tok
.setFlag(LexFlags::NeedsCleaning
);
// Record this token's position in the original stream, then advance the
// counter for the next token.
71 Tok
.OriginalIndex
= TokenIndex
++;
// Builds a new TokenStream from the tokens of Code:
//  - tokens flagged NeedsCleaning get their text decoded and copied into
//    storage created here, and the flag is cleared;
//  - raw_identifier tokens get their kind looked up in an IdentifierTable
//    built from LangOpts;
//  - greatergreater tokens are retagged as tok::greater.
//
// NOTE(review): this listing is incomplete - the embedded listing numbers
// skip, and e.g. the advance of Pos, the handling of the expandUCNs result,
// part of the ">>" handling, and the closing braces and return are not
// visible here. Confirm against the full file.
79 TokenStream
cook(const TokenStream
&Code
, const LangOptions
&LangOpts
) {
// Allocator that receives the cleaned token text; it is handed to the Result
// stream's constructor below.
80 auto CleanedStorage
= std::make_shared
<llvm::BumpPtrAllocator
>();
81 clang::IdentifierTable
Identifiers(LangOpts
);
82 TokenStream
Result(CleanedStorage
);
// Forward the input stream's payload to the result.
83 Result
.addPayload(Code
.getPayload());
// Tok is declared with plain `auto`, so the edits below apply to this loop
// variable and not to the const input stream.
84 for (auto Tok
: Code
.tokens()) {
85 if (Tok
.flag(LexFlags::NeedsCleaning
)) {
86 // Remove escaped newlines and trigraphs.
87 llvm::SmallString
<64> CleanBuffer
;
// Decode the token text one character at a time, appending each decoded
// character to CleanBuffer. The assert guards against a zero-sized step.
// NOTE(review): the statement that advances Pos is not visible here.
88 const char *Pos
= Tok
.text().begin();
89 while (Pos
< Tok
.text().end()) {
90 auto [Char
, CharSize
] =
91 clang::Lexer::getCharAndSizeNoWarn(Pos
, LangOpts
);
92 CleanBuffer
.push_back(Char
);
93 assert(CharSize
!= 0 && "no progress!");
// Text starts out viewing CleanBuffer.
96 llvm::StringRef Text
= CleanBuffer
;
97 llvm::SmallString
<64> UCNBuffer
;
98 // A surface reading of the standard suggests UCNs might appear anywhere.
99 // But we need only decode them in raw_identifiers.
100 // - they cannot appear in punctuation/keyword tokens, because UCNs
101 // cannot encode basic characters outside of literals [lex.charset]
102 // - they can appear in literals, but we need not unescape them now.
103 // We treat them as escape sequences when evaluating the literal.
104 // - comments are handled similarly to literals
105 // This is good fortune, because expandUCNs requires its input to be a
106 // reasonably valid identifier (e.g. without stray backslashes).
// NOTE(review): presumably Text is re-pointed at UCNBuffer after this call;
// that statement is not visible in this listing.
107 if (Tok
.Kind
== tok::raw_identifier
) {
108 clang::expandUCNs(UCNBuffer
, CleanBuffer
);
// Copy the final text into CleanedStorage and point the token at that copy,
// since CleanBuffer and UCNBuffer are locals.
112 Tok
.Data
= Text
.copy(*CleanedStorage
).data();
113 Tok
.Length
= Text
.size();
// Clear the NeedsCleaning bit on this token.
114 Tok
.Flags
&= ~static_cast<decltype(Tok
.Flags
)>(LexFlags::NeedsCleaning
);
117 if (Tok
.Kind
== tok::raw_identifier
) {
118 // Cook raw_identifiers into identifier, keyword, etc.
// The kind comes from looking the token text up in the identifier table.
119 Tok
.Kind
= Identifiers
.get(Tok
.text()).getTokenID();
120 } else if (Tok
.Kind
== tok::greatergreater
) {
121 // Split the greatergreater token.
122 // FIXME: split lessless token to support Cuda triple angle brackets <<<.
123 assert(Tok
.text() == ">>");
124 Tok
.Kind
= tok::greater
;
// NOTE(review): the statements between the retagging above and the Data
// adjustment below are not visible; presumably the first '>' is emitted
// there.
127 // Line is wrong if the first greater is followed by an escaped newline!
// Advance Data by one character, i.e. past the first '>'.
128 Tok
.Data
= Tok
.text().data() + 1;
// Append the (possibly rewritten) token to the result stream.
131 Result
.push(std::move(Tok
));
138 } // namespace clangd