1 //===--- Macros.h - Format C++ code -----------------------------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This file contains the main building blocks of macro support in
13 /// In order to not violate the requirement that clang-format can format files
14 /// in isolation, clang-format's macro support uses expansions users provide
15 /// as part of clang-format's style configuration.
17 /// Macro definitions are of the form "MACRO(p1, p2)=p1 + p2", but only support
18 /// one level of expansion (\see MacroExpander for a full description of what
21 /// As part of parsing, clang-format uses the MacroExpander to expand the
22 /// spelled token streams into expanded token streams when it encounters a
23 /// macro call. The UnwrappedLineParser continues to parse UnwrappedLines
24 /// from the expanded token stream.
25 /// After the expanded unwrapped lines are parsed, the MacroCallReconstructor
26 /// matches the spelled token stream into unwrapped lines that best resemble the
27 /// structure of the expanded unwrapped lines. These reconstructed unwrapped
28 /// lines are aliasing the tokens in the expanded token stream, so that token
29 /// annotations will be reused when formatting the spelled macro calls.
31 /// When formatting, clang-format annotates and formats the expanded unwrapped
32 /// lines first, determining the token types. Next, it formats the spelled
33 /// unwrapped lines, keeping the token types fixed, while allowing other
34 /// formatting decisions to change.
36 //===----------------------------------------------------------------------===//
38 #ifndef CLANG_LIB_FORMAT_MACROS_H
39 #define CLANG_LIB_FORMAT_MACROS_H
46 #include "FormatToken.h"
47 #include "llvm/ADT/ArrayRef.h"
48 #include "llvm/ADT/DenseMap.h"
49 #include "llvm/ADT/SmallVector.h"
50 #include "llvm/ADT/StringRef.h"
56 struct UnwrappedLineNode
;
58 /// Takes a set of macro definitions as strings and allows expanding calls to
62 /// Definition: A(x, y)=x + y
63 /// Call : A(int a = 1, 2)
64 /// Expansion : int a = 1 + 2
66 /// Expansion does not check arity of the definition.
67 /// If fewer arguments than expected are provided, the remaining parameters
68 /// are considered empty:
71 /// If more arguments than expected are provided, they will be discarded.
73 /// The expander does not support:
74 /// - recursive expansion
79 /// Furthermore, only a single expansion of each macro argument is supported,
80 /// so that we cannot get conflicting formatting decisions from different
82 /// Definition: A(x)=x+x
88 using ArgsList
= llvm::ArrayRef
<llvm::SmallVector
<FormatToken
*, 8>>;
90 /// Construct a macro expander from a set of macro definitions.
91 /// Macro definitions must be encoded as UTF-8.
93 /// Each entry in \p Macros must conform to the following simple
94 /// macro-definition language:
95 /// <definition> ::= <id> <expansion> | <id> "(" <params> ")" <expansion>
96 /// <params> ::= <id-list> | ""
97 /// <id-list> ::= <id> | <id> "," <params>
98 /// <expansion> ::= "=" <tail> | <eof>
99 /// <tail> ::= <tok> <tail> | <eof>
101 /// Macros that cannot be parsed will be silently discarded.
103 MacroExpander(const std::vector
<std::string
> &Macros
,
104 clang::SourceManager
&SourceMgr
, const FormatStyle
&Style
,
105 llvm::SpecificBumpPtrAllocator
<FormatToken
> &Allocator
,
106 IdentifierTable
&IdentTable
);
109 /// Returns whether any macro \p Name is defined, regardless of overloads.
110 bool defined(llvm::StringRef Name
) const;
112 /// Returns whether macro \p Name has an object-like overload, i.e. one where
113 /// the macro has no arguments and should not consume subsequent parentheses.
114 bool objectLike(llvm::StringRef Name
) const;
116 /// Returns whether macro \p Name provides an overload with the given arity.
117 bool hasArity(llvm::StringRef Name
, unsigned Arity
) const;
119 /// Returns the expanded stream of format tokens for \p ID, where each
120 /// element in \p OptionalArgs is a positional argument to the macro call.
121 /// If \p OptionalArgs is not set, the object-like overload is used.
122 /// If \p OptionalArgs is set, the overload with the arity equal to \c OptionalArgs->size() is used.
124 llvm::SmallVector
<FormatToken
*, 8>
125 expand(FormatToken
*ID
, std::optional
<ArgsList
> OptionalArgs
) const;
129 class DefinitionParser
;
131 void parseDefinition(const std::string
&Macro
);
133 clang::SourceManager
&SourceMgr
;
134 const FormatStyle
&Style
;
135 llvm::SpecificBumpPtrAllocator
<FormatToken
> &Allocator
;
136 IdentifierTable
&IdentTable
;
137 SmallVector
<std::unique_ptr
<llvm::MemoryBuffer
>> Buffers
;
138 llvm::StringMap
<llvm::DenseMap
<int, Definition
>> FunctionLike
;
139 llvm::StringMap
<Definition
> ObjectLike
;
142 /// Converts a sequence of UnwrappedLines containing expanded macros into a
143 /// single UnwrappedLine containing the macro calls. This UnwrappedLine may be
144 /// broken into child lines, in a way that best conveys the structure of the
147 /// In the simplest case, a spelled UnwrappedLine contains one macro, and after
148 /// expanding it we have one expanded UnwrappedLine. In general, macro
149 /// expansions can span UnwrappedLines, and multiple macros can contribute
150 /// tokens to the same line. We keep consuming expanded lines until:
151 /// * all expansions that started have finished (we're not chopping any macros
153 /// * *and* we've reached the end of a *spelled* unwrapped line.
155 /// A single UnwrappedLine represents this chunk of code.
157 /// After this point, the state of the spelled/expanded stream is "in sync"
158 /// (both at the start of an UnwrappedLine, with no macros open), so the
159 /// Reconstructor can be thrown away and parsing can continue.
161 /// Given a mapping from the macro name identifier token in the macro call
162 /// to the tokens of the macro call, for example:
163 /// CLASSA -> CLASSA({public: void x();})
165 /// When getting the formatted lines of the expansion via the \c addLine method
166 /// (each '->' specifies a call to \c addLine ):
172 /// Creates the tree of unwrapped lines containing the macro call tokens so that
173 /// the macro call tokens fit the semantic structure of the expanded formatted
179 class MacroCallReconstructor
{
181 /// Create a Reconstructor whose resulting \p UnwrappedLine will start at
182 /// \p Level, using the map from name identifier token to the corresponding
183 /// tokens of the spelled macro call.
184 MacroCallReconstructor(
186 const llvm::DenseMap
<FormatToken
*, std::unique_ptr
<UnwrappedLine
>>
189 /// For the given \p Line, match all occurrences of tokens expanded from a
190 /// macro to unwrapped lines in the spelled macro call so that the resulting
191 /// tree of unwrapped lines best resembles the structure of unwrapped lines
192 /// passed in via \c addLine.
193 void addLine(const UnwrappedLine
&Line
);
195 /// Check whether at the current state there is no open macro expansion
196 /// that needs to be processed to finish a macro call.
197 /// Only when \c finished() is true, \c takeResult() can be called to retrieve
198 /// the resulting \c UnwrappedLine.
199 /// If there are multiple subsequent macro calls within an unwrapped line in
200 /// the spelled token stream, the calling code may also continue to call
201 /// \c addLine() when \c finished() is true.
// Implemented as a check that the ActiveExpansions stack is empty.
202 bool finished() const { return ActiveExpansions
.empty(); }
204 /// Retrieve the formatted \c UnwrappedLine containing the original
205 /// macro calls, formatted according to the expanded token stream received
206 /// via \c addLine().
207 /// Generally, this line tries to have the same structure as the expanded,
208 /// formatted unwrapped lines handed in via \c addLine(), with the exception
209 /// that for multiple top-level lines, each subsequent line will be the
210 /// child of the last token in its predecessor. This representation is chosen
211 /// because it is a precondition to the formatter that we get what looks like
212 /// a single statement in a single \c UnwrappedLine (i.e. matching parens).
214 /// If a token in a macro argument is a child of a token in the expansion,
215 /// the parent will be the corresponding token in the macro call.
217 /// #define C(a, b) class C { a b
218 /// C(int x;, int y;)
220 /// class C { int x; int y;
221 /// where in a formatted line "int x;" and "int y;" would both be new separate
224 /// In the result, "int x;" will be a child of the opening parenthesis in "C("
225 /// and "int y;" will be a child of the "," token:
231 UnwrappedLine
takeResult() &&;
234 void add(FormatToken
*Token
, FormatToken
*ExpandedParent
, bool First
);
235 void prepareParent(FormatToken
*ExpandedParent
, bool First
);
236 FormatToken
*getParentInResult(FormatToken
*Parent
);
237 void reconstruct(FormatToken
*Token
);
238 void startReconstruction(FormatToken
*Token
);
239 bool reconstructActiveCallUntil(FormatToken
*Token
);
240 void endReconstruction(FormatToken
*Token
);
241 bool processNextReconstructed();
244 struct ReconstructedLine
;
246 void appendToken(FormatToken
*Token
, ReconstructedLine
*L
= nullptr);
247 UnwrappedLine
createUnwrappedLine(const ReconstructedLine
&Line
, int Level
);
248 void debug(const ReconstructedLine
&Line
, int Level
);
249 ReconstructedLine
&parentLine();
250 ReconstructedLine
*currentLine();
251 void debugParentMap() const;
254 enum ReconstructorState
{
255 Start
, // No macro expansion was found in the input yet.
256 InProgress
, // During a macro reconstruction.
257 Finalized
, // Past macro reconstruction, the result is finalized.
259 ReconstructorState State
= Start
;
262 // Node in which we build up the resulting unwrapped line; this type is
263 // analogous to UnwrappedLineNode.
265 LineNode() = default;
266 LineNode(FormatToken
*Tok
) : Tok(Tok
) {}
267 FormatToken
*Tok
= nullptr;
268 llvm::SmallVector
<std::unique_ptr
<ReconstructedLine
>> Children
;
271 // Line in which we build up the resulting unwrapped line.
272 // FIXME: Investigate changing UnwrappedLine to a pointer type and using it
273 // instead of rolling our own type.
274 struct ReconstructedLine
{
275 llvm::SmallVector
<std::unique_ptr
<LineNode
>> Tokens
;
278 // The line in which we collect the resulting reconstructed output.
279 // To reduce special cases in the algorithm, the first level of the line
280 // contains a single null token that has the reconstructed incoming
281 // lines as children.
282 // In the end, we stitch the lines together so that each subsequent line
283 // is a child of the last token of the previous line. This is necessary
284 // in order to format the overall expression as a single logical line -
285 // if we created separate lines, we'd format them with their own top-level
286 // indent depending on the semantic structure, which is not desired.
287 ReconstructedLine Result
;
289 // Stack of currently "open" lines, where each line's predecessor's last
290 // token is the parent token for that line.
291 llvm::SmallVector
<ReconstructedLine
*> ActiveReconstructedLines
;
293 // Maps from the expanded token to the token that takes its place in the
294 // reconstructed token stream in terms of parent-child relationships.
295 // Note that it might take multiple steps to arrive at the correct
296 // parent in the output.
297 // Given: #define C(a, b) []() { a; b; }
298 // And a call: C(f(), g())
299 // The structure in the incoming formatted unwrapped line will be:
304 // with f and g being children of the opening brace.
305 // In the reconstructed call:
309 // We want f to be a child of the opening parenthesis and g to be a child
310 // of the comma token in the macro call.
315 // once we're past the comma in the reconstruction.
316 llvm::DenseMap
<FormatToken
*, FormatToken
*>
317 SpelledParentToReconstructedParent
;
319 // Keeps track of a single expansion while we're reconstructing tokens it
322 // The identifier token of the macro call.
324 // Our current position in the reconstruction.
325 std::list
<UnwrappedLineNode
>::iterator SpelledI
;
326 // The end of the reconstructed token sequence.
327 std::list
<UnwrappedLineNode
>::iterator SpelledE
;
330 // Stack of macro calls for which we're in the middle of an expansion.
331 llvm::SmallVector
<Expansion
> ActiveExpansions
;
333 struct MacroCallState
{
334 MacroCallState(ReconstructedLine
*Line
, FormatToken
*ParentLastToken
,
335 FormatToken
*MacroCallLParen
);
337 ReconstructedLine
*Line
;
339 // The last token in the parent line or expansion, or nullptr if the macro
340 // expansion is on a top-level line.
342 // For example, in the macro call:
343 // auto f = []() { ID(1); };
344 // The MacroCallState for ID will have '{' as ParentLastToken.
346 // In the macro call:
348 // The MacroCallState of the outer ID will have nullptr as ParentLastToken,
349 // while the MacroCallState for the inner ID will have the '(' of the outer
350 // ID as ParentLastToken.
352 // In the macro call:
354 // The MacroCallState of ID will have ',' as ParentLastToken.
355 FormatToken
*ParentLastToken
;
357 // The l_paren of this MacroCallState's macro call.
358 FormatToken
*MacroCallLParen
;
361 // Keeps track of the lines into which the opening brace/parenthesis &
362 // argument separating commas for each level in the macro call go in order to
363 // put the corresponding closing brace/parenthesis into the same line in the
364 // output and keep track of which parents in the expanded token stream map to
365 // which tokens in the reconstructed stream.
366 // When an opening brace/parenthesis has children, we want the structure of
367 // the output line to be:
374 llvm::SmallVector
<MacroCallState
> MacroCallStructure
;
376 // Level the generated UnwrappedLine will be at.
377 const unsigned Level
;
379 // Maps from identifier of the macro call to an unwrapped line containing
380 // all tokens of the macro call.
381 const llvm::DenseMap
<FormatToken
*, std::unique_ptr
<UnwrappedLine
>>
385 } // namespace format