//===----- HipStdPar.cpp - HIP C++ Standard Parallelism Support Passes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file implements two passes that enable HIP C++ Standard Parallelism
// Support:
//
// 1. AcceleratorCodeSelection (required): Given that only algorithms are
//    accelerated, and that the accelerated implementation exists in the form of
//    a compute kernel, we assume that only the kernel, and all functions
//    reachable from it, constitute code that the user expects the accelerator
//    to execute. Thus, we identify the set of all functions reachable from
//    kernels, and then remove all unreachable ones. This last part is necessary
//    because it is possible for code that the user did not expect to execute on
//    an accelerator to contain constructs that cannot be handled by the target
//    BE, which cannot be provably demonstrated to be dead code in general, and
//    thus can lead to mis-compilation. The degenerate case of this is when a
//    Module contains no kernels (the parent TU had no algorithm invocations fit
//    for acceleration), which we handle by completely emptying said module.
//    **NOTE**: The above does not handle indirectly reachable functions i.e.
//              it is possible to obtain a case where the target of an indirect
//              call is otherwise unreachable and thus is removed; this
//              restriction is aligned with the current `-hipstdpar` limitations
//              and will be relaxed in the future.
//
// 2. AllocationInterposition (required only when on-demand paging is
//    unsupported): Some accelerators or operating systems might not support
//    transparent on-demand paging. Thus, they would only be able to access
//    memory that is allocated by an accelerator-aware mechanism. For such cases
//    the user can opt into enabling allocation / deallocation interposition,
//    whereby we replace calls to known allocation / deallocation functions with
//    calls to runtime implemented equivalents that forward the requests to
//    accelerator-aware interfaces. We also support freeing system allocated
//    memory that ends up in one of the runtime equivalents, since this can
//    happen if e.g. a library that was compiled without interposition returns
//    an allocation that can be validly passed to `free`.
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/HipStdPar/HipStdPar.h"

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"

#include <cassert>
#include <functional>
#include <iterator>
#include <string>
#include <utility>

using namespace llvm;

61 static inline void eraseFromModule(T
&ToErase
) {
62 ToErase
.replaceAllUsesWith(PoisonValue::get(ToErase
.getType()));
63 ToErase
.eraseFromParent();
66 static inline bool checkIfSupported(GlobalVariable
&G
) {
67 if (!G
.isThreadLocal())
70 G
.dropDroppableUses();
72 if (!G
.isConstantUsed())
76 raw_string_ostream
OS(W
);
78 OS
<< "Accelerator does not support the thread_local variable "
81 Instruction
*I
= nullptr;
82 SmallVector
<User
*> Tmp(G
.users());
83 SmallPtrSet
<User
*, 5> Visited
;
85 auto U
= std::move(Tmp
.back());
88 if (!Visited
.insert(U
).second
)
91 if (isa
<Instruction
>(U
))
92 I
= cast
<Instruction
>(U
);
94 Tmp
.insert(Tmp
.end(), U
->user_begin(), U
->user_end());
95 } while (!I
&& !Tmp
.empty());
97 assert(I
&& "thread_local global should have at least one non-constant use.");
99 G
.getContext().diagnose(
100 DiagnosticInfoUnsupported(*I
->getParent()->getParent(), W
,
101 I
->getDebugLoc(), DS_Error
));
106 static inline void clearModule(Module
&M
) { // TODO: simplify.
107 while (!M
.functions().empty())
108 eraseFromModule(*M
.begin());
109 while (!M
.globals().empty())
110 eraseFromModule(*M
.globals().begin());
111 while (!M
.aliases().empty())
112 eraseFromModule(*M
.aliases().begin());
113 while (!M
.ifuncs().empty())
114 eraseFromModule(*M
.ifuncs().begin());
117 static inline void maybeHandleGlobals(Module
&M
) {
118 unsigned GlobAS
= M
.getDataLayout().getDefaultGlobalsAddressSpace();
119 for (auto &&G
: M
.globals()) { // TODO: should we handle these in the FE?
120 if (!checkIfSupported(G
))
121 return clearModule(M
);
123 if (G
.isThreadLocal())
127 if (G
.getAddressSpace() != GlobAS
)
129 if (G
.getLinkage() != GlobalVariable::ExternalLinkage
)
132 G
.setLinkage(GlobalVariable::ExternalWeakLinkage
);
133 G
.setInitializer(nullptr);
134 G
.setExternallyInitialized(true);
139 static inline void removeUnreachableFunctions(
140 const SmallPtrSet
<const Function
*, N
>& Reachable
, Module
&M
) {
141 removeFromUsedLists(M
, [&](Constant
*C
) {
142 if (auto F
= dyn_cast
<Function
>(C
))
143 return !Reachable
.contains(F
);
148 SmallVector
<std::reference_wrapper
<Function
>> ToRemove
;
149 copy_if(M
, std::back_inserter(ToRemove
), [&](auto &&F
) {
150 return !F
.isIntrinsic() && !Reachable
.contains(&F
);
153 for_each(ToRemove
, eraseFromModule
<Function
>);
156 static inline bool isAcceleratorExecutionRoot(const Function
*F
) {
160 return F
->getCallingConv() == CallingConv::AMDGPU_KERNEL
;
163 static inline bool checkIfSupported(const Function
*F
, const CallBase
*CB
) {
164 const auto Dx
= F
->getName().rfind("__hipstdpar_unsupported");
166 if (Dx
== StringRef::npos
)
169 const auto N
= F
->getName().substr(0, Dx
);
172 raw_string_ostream
OS(W
);
175 OS
<< "Accelerator does not support the ASM block:\n"
176 << cast
<ConstantDataArray
>(CB
->getArgOperand(0))->getAsCString();
178 OS
<< "Accelerator does not support the " << N
<< " function.";
180 auto Caller
= CB
->getParent()->getParent();
182 Caller
->getContext().diagnose(
183 DiagnosticInfoUnsupported(*Caller
, W
, CB
->getDebugLoc(), DS_Error
));
189 HipStdParAcceleratorCodeSelectionPass::run(Module
&M
,
190 ModuleAnalysisManager
&MAM
) {
191 auto &CGA
= MAM
.getResult
<CallGraphAnalysis
>(M
);
193 SmallPtrSet
<const Function
*, 32> Reachable
;
194 for (auto &&CGN
: CGA
) {
195 if (!isAcceleratorExecutionRoot(CGN
.first
))
198 Reachable
.insert(CGN
.first
);
200 SmallVector
<const Function
*> Tmp({CGN
.first
});
202 auto F
= std::move(Tmp
.back());
205 for (auto &&N
: *CGA
[F
]) {
208 if (!N
.second
->getFunction())
210 if (Reachable
.contains(N
.second
->getFunction()))
213 if (!checkIfSupported(N
.second
->getFunction(),
214 dyn_cast
<CallBase
>(*N
.first
)))
215 return PreservedAnalyses::none();
217 Reachable
.insert(N
.second
->getFunction());
218 Tmp
.push_back(N
.second
->getFunction());
220 } while (!std::empty(Tmp
));
223 if (std::empty(Reachable
))
226 removeUnreachableFunctions(Reachable
, M
);
228 maybeHandleGlobals(M
);
230 return PreservedAnalyses::none();
233 static constexpr std::pair
<StringLiteral
, StringLiteral
> ReplaceMap
[]{
234 {"aligned_alloc", "__hipstdpar_aligned_alloc"},
235 {"calloc", "__hipstdpar_calloc"},
236 {"free", "__hipstdpar_free"},
237 {"malloc", "__hipstdpar_malloc"},
238 {"memalign", "__hipstdpar_aligned_alloc"},
239 {"posix_memalign", "__hipstdpar_posix_aligned_alloc"},
240 {"realloc", "__hipstdpar_realloc"},
241 {"reallocarray", "__hipstdpar_realloc_array"},
242 {"_ZdaPv", "__hipstdpar_operator_delete"},
243 {"_ZdaPvm", "__hipstdpar_operator_delete_sized"},
244 {"_ZdaPvSt11align_val_t", "__hipstdpar_operator_delete_aligned"},
245 {"_ZdaPvmSt11align_val_t", "__hipstdpar_operator_delete_aligned_sized"},
246 {"_ZdlPv", "__hipstdpar_operator_delete"},
247 {"_ZdlPvm", "__hipstdpar_operator_delete_sized"},
248 {"_ZdlPvSt11align_val_t", "__hipstdpar_operator_delete_aligned"},
249 {"_ZdlPvmSt11align_val_t", "__hipstdpar_operator_delete_aligned_sized"},
250 {"_Znam", "__hipstdpar_operator_new"},
251 {"_ZnamRKSt9nothrow_t", "__hipstdpar_operator_new_nothrow"},
252 {"_ZnamSt11align_val_t", "__hipstdpar_operator_new_aligned"},
253 {"_ZnamSt11align_val_tRKSt9nothrow_t",
254 "__hipstdpar_operator_new_aligned_nothrow"},
256 {"_Znwm", "__hipstdpar_operator_new"},
257 {"_ZnwmRKSt9nothrow_t", "__hipstdpar_operator_new_nothrow"},
258 {"_ZnwmSt11align_val_t", "__hipstdpar_operator_new_aligned"},
259 {"_ZnwmSt11align_val_tRKSt9nothrow_t",
260 "__hipstdpar_operator_new_aligned_nothrow"},
261 {"__builtin_calloc", "__hipstdpar_calloc"},
262 {"__builtin_free", "__hipstdpar_free"},
263 {"__builtin_malloc", "__hipstdpar_malloc"},
264 {"__builtin_operator_delete", "__hipstdpar_operator_delete"},
265 {"__builtin_operator_new", "__hipstdpar_operator_new"},
266 {"__builtin_realloc", "__hipstdpar_realloc"},
267 {"__libc_calloc", "__hipstdpar_calloc"},
268 {"__libc_free", "__hipstdpar_free"},
269 {"__libc_malloc", "__hipstdpar_malloc"},
270 {"__libc_memalign", "__hipstdpar_aligned_alloc"},
271 {"__libc_realloc", "__hipstdpar_realloc"}
275 HipStdParAllocationInterpositionPass::run(Module
&M
, ModuleAnalysisManager
&) {
276 SmallDenseMap
<StringRef
, StringRef
> AllocReplacements(std::cbegin(ReplaceMap
),
277 std::cend(ReplaceMap
));
282 if (!AllocReplacements
.contains(F
.getName()))
285 if (auto R
= M
.getFunction(AllocReplacements
[F
.getName()])) {
286 F
.replaceAllUsesWith(R
);
289 raw_string_ostream
OS(W
);
291 OS
<< "cannot be interposed, missing: " << AllocReplacements
[F
.getName()]
292 << ". Tried to run the allocation interposition pass without the "
293 << "replacement functions available.";
295 F
.getContext().diagnose(DiagnosticInfoUnsupported(F
, W
,
301 if (auto F
= M
.getFunction("__hipstdpar_hidden_free")) {
302 auto LibcFree
= M
.getOrInsertFunction("__libc_free", F
->getFunctionType(),
304 F
->replaceAllUsesWith(LibcFree
.getCallee());
309 return PreservedAnalyses::none();