1 //===----- HipStdPar.cpp - HIP C++ Standard Parallelism Support Passes ----===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
8 // This file implements two passes that enable HIP C++ Standard Parallelism
9 // Support:
11 // 1. AcceleratorCodeSelection (required): Given that only algorithms are
12 // accelerated, and that the accelerated implementation exists in the form of
13 // a compute kernel, we assume that only the kernel, and all functions
14 // reachable from it, constitute code that the user expects the accelerator
15 // to execute. Thus, we identify the set of all functions reachable from
16 // kernels, and then remove all unreachable ones. This last part is necessary
17 // because it is possible for code that the user did not expect to execute on
18 // an accelerator to contain constructs that cannot be handled by the target
19 // BE, which cannot be provably demonstrated to be dead code in general, and
20 // thus can lead to mis-compilation. The degenerate case of this is when a
21 // Module contains no kernels (the parent TU had no algorithm invocations fit
22 // for acceleration), which we handle by completely emptying said module.
23 // **NOTE**: The above does not handle indirectly reachable functions i.e.
24 // it is possible to obtain a case where the target of an indirect
25 // call is otherwise unreachable and thus is removed; this
26 // restriction is aligned with the current `-hipstdpar` limitations
27 // and will be relaxed in the future.
29 // 2. AllocationInterposition (required only when on-demand paging is
30 // unsupported): Some accelerators or operating systems might not support
31 // transparent on-demand paging. Thus, they would only be able to access
32 // memory that is allocated by an accelerator-aware mechanism. For such cases
33 // the user can opt into enabling allocation / deallocation interposition,
34 // whereby we replace calls to known allocation / deallocation functions with
35 // calls to runtime implemented equivalents that forward the requests to
36 // accelerator-aware interfaces. We also support freeing system allocated
37 // memory that ends up in one of the runtime equivalents, since this can
38 // happen if e.g. a library that was compiled without interposition returns
39 // an allocation that can be validly passed to `free`.
40 //===----------------------------------------------------------------------===//
42 #include "llvm/Transforms/HipStdPar/HipStdPar.h"
44 #include "llvm/ADT/SmallPtrSet.h"
45 #include "llvm/ADT/SmallVector.h"
46 #include "llvm/ADT/STLExtras.h"
47 #include "llvm/Analysis/CallGraph.h"
48 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
49 #include "llvm/IR/Constants.h"
50 #include "llvm/IR/DebugInfoMetadata.h"
51 #include "llvm/IR/Function.h"
52 #include "llvm/IR/Module.h"
53 #include "llvm/Transforms/Utils/ModuleUtils.h"
// Detaches ToErase from the IR and deletes it: every use of ToErase is first
// rewritten to a poison value of ToErase's own type, after which ToErase is
// erased from its parent.
// NOTE(review): `T` is presumably a template parameter; its declaration is not
// visible in this listing - confirm against the full file.
62 static inline void eraseFromModule(T
&ToErase
) {
// Redirect remaining users to poison before the erase below.
63 ToErase
.replaceAllUsesWith(PoisonValue::get(ToErase
.getType()));
64 ToErase
.eraseFromParent();
// Inspects the global variable G for thread_local usage and, for the case that
// reaches the end of the function, reports a DS_Error diagnostic
// ("Accelerator does not support the thread_local variable ...") through G's
// LLVMContext, attributed to an instruction that (transitively) uses G.
// NOTE(review): several lines of this function are not visible in this listing
// (the statements guarded by the first two `if`s, the declaration of `W`, the
// tail of the message, the loop header and the final return). The comments
// below describe only what is shown - confirm the rest against the full file.
67 static inline bool checkIfSupported(GlobalVariable
&G
) {
// Guard on G not being thread_local; the guarded statement is not visible
// here (presumably an early return).
68 if (!G
.isThreadLocal())
// Drop droppable uses before querying whether G is still used.
71 G
.dropDroppableUses();
// Guard on G no longer being used; the guarded statement is not visible here
// (presumably an early return).
73 if (!G
.isConstantUsed())
// The diagnostic text is streamed through OS into W.
77 raw_string_ostream
OS(W
);
79 OS
<< "Accelerator does not support the thread_local variable "
// Worklist search for a user of G that is an Instruction. Tmp is seeded with
// G's direct users; Visited is consulted so a user already seen is not
// processed again.
82 Instruction
*I
= nullptr;
83 SmallVector
<User
*> Tmp(G
.user_begin(), G
.user_end());
84 SmallPtrSet
<User
*, 5> Visited
;
// Look at the most recently queued user.
86 auto U
= std::move(Tmp
.back());
89 if (Visited
.contains(U
))
// An Instruction user is recorded in I, which ends the loop (see the loop
// condition below).
92 if (isa
<Instruction
>(U
))
93 I
= cast
<Instruction
>(U
);
// Queue the users of U so they are inspected in turn.
95 Tmp
.insert(Tmp
.end(), U
->user_begin(), U
->user_end());
// Stop as soon as an instruction was found or the worklist ran dry.
98 } while (!I
&& !Tmp
.empty());
100 assert(I
&& "thread_local global should have at least one non-constant use.");
// Report the error against the function containing I, at I's debug location.
102 G
.getContext().diagnose(
103 DiagnosticInfoUnsupported(*I
->getParent()->getParent(), W
,
104 I
->getDebugLoc(), DS_Error
));
// Empties the module M: functions, global variables, aliases and ifuncs are
// removed one list at a time. Each loop erases the first element of its list
// via eraseFromModule until that list is empty.
109 static inline void clearModule(Module
&M
) { // TODO: simplify.
// Functions (M.begin() yields the first function of the module).
110 while (!M
.functions().empty())
111 eraseFromModule(*M
.begin());
// Global variables.
112 while (!M
.globals().empty())
113 eraseFromModule(*M
.globals().begin());
// Aliases.
114 while (!M
.aliases().empty())
115 eraseFromModule(*M
.aliases().begin());
// IFuncs.
116 while (!M
.ifuncs().empty())
117 eraseFromModule(*M
.ifuncs().begin());
// Walks the global variables of M. If checkIfSupported rejects a global, the
// whole module is cleared and the function returns. A global that reaches the
// end of the loop body is switched to ExternalWeakLinkage and marked as
// externally initialized.
// NOTE(review): the statements guarded by the filtering `if`s below are not
// visible in this listing (presumably `continue`), and other lines of the
// loop body may be missing as well - confirm against the full file.
120 static inline void maybeHandleGlobals(Module
&M
) {
// Default address space for globals, taken from the module's data layout.
121 unsigned GlobAS
= M
.getDataLayout().getDefaultGlobalsAddressSpace();
122 for (auto &&G
: M
.globals()) { // TODO: should we handle these in the FE?
// An unsupported global invalidates the entire module.
123 if (!checkIfSupported(G
))
124 return clearModule(M
);
// Filters: thread_local globals, globals outside the default globals address
// space, and globals whose linkage is not ExternalLinkage are tested here.
126 if (G
.isThreadLocal())
130 if (G
.getAddressSpace() != GlobAS
)
132 if (G
.getLinkage() != GlobalVariable::ExternalLinkage
)
// Rewrite the global's linkage and flag it as externally initialized.
135 G
.setLinkage(GlobalVariable::ExternalWeakLinkage
);
136 G
.setExternallyInitialized(true);
// Removes from M every non-intrinsic function that is not a member of
// Reachable. Functions outside Reachable are first handed to
// removeFromUsedLists (presumably dropping them from the module's used lists -
// see ModuleUtils.h), then collected and erased via eraseFromModule.
// NOTE(review): `N` is presumably a template parameter whose declaration is
// not visible in this listing; the tails of both lambdas are not visible
// either - confirm against the full file.
141 static inline void removeUnreachableFunctions(
142 const SmallPtrSet
<const Function
*, N
>& Reachable
, Module
&M
) {
// Predicate: a Constant that is a Function is selected when it is not in
// Reachable. The result for non-function constants is not visible here.
143 removeFromUsedLists(M
, [&](Constant
*C
) {
144 if (auto F
= dyn_cast
<Function
>(C
))
145 return !Reachable
.contains(F
);
// Candidates are gathered into ToRemove before any erasure takes place.
150 SmallVector
<std::reference_wrapper
<Function
>> ToRemove
;
151 copy_if(M
, std::back_inserter(ToRemove
), [&](auto &&F
) {
// Intrinsics are always kept; everything else must be in Reachable to stay.
152 return !F
.isIntrinsic() && !Reachable
.contains(&F
);
// Erase every collected function.
155 for_each(ToRemove
, eraseFromModule
<Function
>);
// Returns whether F uses the AMDGPU_KERNEL calling convention, which this file
// treats as marking a root of accelerator execution.
// NOTE(review): the lines between the signature and the return statement are
// not visible in this listing (possibly a null check on F, since F is
// dereferenced below) - confirm against the full file.
158 static inline bool isAcceleratorExecutionRoot(const Function
*F
) {
162 return F
->getCallingConv() == CallingConv::AMDGPU_KERNEL
;
// Looks for the "__hipstdpar_unsupported" marker in the name of F. For a name
// that carries the marker, a DS_Error diagnostic is emitted against the
// function containing the call CB, at CB's debug location.
// NOTE(review): the statement guarded by the `npos` check, the declaration of
// `W`, the condition that selects between the two messages below, and the
// final return are not visible in this listing - confirm against the full
// file.
165 static inline bool checkIfSupported(const Function
*F
, const CallBase
*CB
) {
// Position of the last occurrence of the marker in F's name.
166 const auto Dx
= F
->getName().rfind("__hipstdpar_unsupported");
// Guard for names without the marker (guarded statement not visible here,
// presumably an early return).
168 if (Dx
== StringRef::npos
)
// N is the part of the name that precedes the marker.
171 const auto N
= F
->getName().substr(0, Dx
);
// The diagnostic text is streamed through OS into W.
174 raw_string_ostream
OS(W
);
// Message variant for ASM blocks: the text is read from the call's first
// argument, which is cast to a ConstantDataArray and read as a C string.
177 OS
<< "Accelerator does not support the ASM block:\n"
178 << cast
<ConstantDataArray
>(CB
->getArgOperand(0))->getAsCString();
// Message variant naming the unsupported function N.
180 OS
<< "Accelerator does not support the " << N
<< " function.";
// The diagnostic is attributed to the function that contains the call.
182 auto Caller
= CB
->getParent()->getParent();
184 Caller
->getContext().diagnose(
185 DiagnosticInfoUnsupported(*Caller
, W
, CB
->getDebugLoc(), DS_Error
));
// Entry point of the accelerator code selection pass. Using the module's
// CallGraphAnalysis result, it builds the set of functions reachable from
// accelerator execution roots (see isAcceleratorExecutionRoot), hands that set
// to removeUnreachableFunctions, then runs maybeHandleGlobals. Every visible
// return path reports PreservedAnalyses::none().
// NOTE(review): the return type line, the `do {` loop header, the statements
// guarded by several `if`s and the closing braces are not visible in this
// listing; comments below describe only what is shown - confirm against the
// full file.
191 HipStdParAcceleratorCodeSelectionPass::run(Module
&M
,
192 ModuleAnalysisManager
&MAM
) {
193 auto &CGA
= MAM
.getResult
<CallGraphAnalysis
>(M
);
// Functions found to be reachable from an execution root.
195 SmallPtrSet
<const Function
*, 32> Reachable
;
// Each call-graph entry whose function is an execution root seeds a traversal.
196 for (auto &&CGN
: CGA
) {
197 if (!isAcceleratorExecutionRoot(CGN
.first
))
// The root itself is reachable.
200 Reachable
.insert(CGN
.first
);
// Worklist of functions whose callees still have to be visited.
202 SmallVector
<const Function
*> Tmp({CGN
.first
});
// Take the most recently queued function.
204 auto F
= std::move(Tmp
.back());
// Walk the call records of F's call-graph node; as used below, N.second is the
// callee's node and N.first refers to the call site.
207 for (auto &&N
: *CGA
[F
]) {
// Guards for records without a callee function and for callees that were
// already recorded (guarded statements not visible here).
210 if (!N
.second
->getFunction())
212 if (Reachable
.contains(N
.second
->getFunction()))
// A callee rejected by checkIfSupported ends the pass immediately.
215 if (!checkIfSupported(N
.second
->getFunction(),
216 dyn_cast
<CallBase
>(*N
.first
)))
217 return PreservedAnalyses::none();
// Record the callee and queue it so that its own callees get visited.
219 Reachable
.insert(N
.second
->getFunction());
220 Tmp
.push_back(N
.second
->getFunction());
222 } while (!std::empty(Tmp
));
// Guard for the case where nothing is reachable; the guarded statement is not
// visible here. The file header says a module with no kernels is emptied
// completely (presumably via clearModule).
225 if (std::empty(Reachable
))
228 removeUnreachableFunctions(Reachable
, M
);
230 maybeHandleGlobals(M
);
232 return PreservedAnalyses::none();
// Table of (original name, replacement name) pairs for allocation and
// deallocation functions. The first element is the symbol to interpose, the
// second is the `__hipstdpar_` prefixed function that takes its place.
// HipStdParAllocationInterpositionPass::run builds its lookup map from this
// table.
235 static constexpr std::pair
<StringLiteral
, StringLiteral
> ReplaceMap
[]{
// C / POSIX style allocation functions.
236 {"aligned_alloc", "__hipstdpar_aligned_alloc"},
237 {"calloc", "__hipstdpar_calloc"},
238 {"free", "__hipstdpar_free"},
239 {"malloc", "__hipstdpar_malloc"},
240 {"memalign", "__hipstdpar_aligned_alloc"},
241 {"posix_memalign", "__hipstdpar_posix_aligned_alloc"},
242 {"realloc", "__hipstdpar_realloc"},
243 {"reallocarray", "__hipstdpar_realloc_array"},
// Itanium-mangled operator delete[] (_Zda...) overloads.
244 {"_ZdaPv", "__hipstdpar_operator_delete"},
245 {"_ZdaPvm", "__hipstdpar_operator_delete_sized"},
246 {"_ZdaPvSt11align_val_t", "__hipstdpar_operator_delete_aligned"},
247 {"_ZdaPvmSt11align_val_t", "__hipstdpar_operator_delete_aligned_sized"},
// Itanium-mangled operator delete (_Zdl...) overloads.
248 {"_ZdlPv", "__hipstdpar_operator_delete"},
249 {"_ZdlPvm", "__hipstdpar_operator_delete_sized"},
250 {"_ZdlPvSt11align_val_t", "__hipstdpar_operator_delete_aligned"},
251 {"_ZdlPvmSt11align_val_t", "__hipstdpar_operator_delete_aligned_sized"},
// Itanium-mangled operator new[] (_Zna...) overloads.
252 {"_Znam", "__hipstdpar_operator_new"},
253 {"_ZnamRKSt9nothrow_t", "__hipstdpar_operator_new_nothrow"},
254 {"_ZnamSt11align_val_t", "__hipstdpar_operator_new_aligned"},
255 {"_ZnamSt11align_val_tRKSt9nothrow_t",
256 "__hipstdpar_operator_new_aligned_nothrow"},
// Itanium-mangled operator new (_Znw...) overloads.
258 {"_Znwm", "__hipstdpar_operator_new"},
259 {"_ZnwmRKSt9nothrow_t", "__hipstdpar_operator_new_nothrow"},
260 {"_ZnwmSt11align_val_t", "__hipstdpar_operator_new_aligned"},
261 {"_ZnwmSt11align_val_tRKSt9nothrow_t",
262 "__hipstdpar_operator_new_aligned_nothrow"},
// `__builtin_` prefixed spellings.
263 {"__builtin_calloc", "__hipstdpar_calloc"},
264 {"__builtin_free", "__hipstdpar_free"},
265 {"__builtin_malloc", "__hipstdpar_malloc"},
266 {"__builtin_operator_delete", "__hipstdpar_operator_delete"},
267 {"__builtin_operator_new", "__hipstdpar_operator_new"},
268 {"__builtin_realloc", "__hipstdpar_realloc"},
// `__libc_` prefixed entry points.
269 {"__libc_calloc", "__hipstdpar_calloc"},
270 {"__libc_free", "__hipstdpar_free"},
271 {"__libc_malloc", "__hipstdpar_malloc"},
272 {"__libc_memalign", "__hipstdpar_aligned_alloc"},
273 {"__libc_realloc", "__hipstdpar_realloc"}
// Entry point of the allocation interposition pass. A function F whose name is
// a key of ReplaceMap has all of its uses redirected to the replacement
// function when that replacement exists in M; the visible code also builds a
// "cannot be interposed" diagnostic that names the missing replacement.
// Finally, uses of `__hipstdpar_hidden_free` (when present) are redirected to
// `__libc_free`. Reports PreservedAnalyses::none().
// NOTE(review): the return type line, the loop that introduces `F`, the
// `else` / closing braces, the declaration of `W`, and the trailing arguments
// of two calls are not visible in this listing - confirm against the full
// file.
277 HipStdParAllocationInterpositionPass::run(Module
&M
, ModuleAnalysisManager
&) {
// Lookup map from original name to replacement name, built from ReplaceMap.
278 SmallDenseMap
<StringRef
, StringRef
> AllocReplacements(std::cbegin(ReplaceMap
),
279 std::cend(ReplaceMap
));
// Guard for functions that have no entry in the map (guarded statement not
// visible here).
284 if (!AllocReplacements
.contains(F
.getName()))
// The replacement is available in M: redirect every use of F to it.
287 if (auto R
= M
.getFunction(AllocReplacements
[F
.getName()])) {
288 F
.replaceAllUsesWith(R
);
// Diagnostic text naming the missing replacement, streamed through OS into W.
291 raw_string_ostream
OS(W
);
293 OS
<< "cannot be interposed, missing: " << AllocReplacements
[F
.getName()]
294 << ". Tried to run the allocation interposition pass without the "
295 << "replacement functions available.";
// Emitted through F's context; the remaining constructor arguments (and thus
// the severity) are not visible in this listing.
297 F
.getContext().diagnose(DiagnosticInfoUnsupported(F
, W
,
// Redirect uses of `__hipstdpar_hidden_free` to a `__libc_free` declaration
// that has the same function type.
303 if (auto F
= M
.getFunction("__hipstdpar_hidden_free")) {
304 auto LibcFree
= M
.getOrInsertFunction("__libc_free", F
->getFunctionType(),
306 F
->replaceAllUsesWith(LibcFree
.getCallee());
311 return PreservedAnalyses::none();