1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
2 ; RUN: opt --mtriple=amdgcn-amd-amdhsa --data-layout=A5 -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=AMDGPU
3 ; RUN: opt --mtriple=nvptx64-- -S -passes=openmp-opt < %s | FileCheck %s --check-prefixes=NVPTX
4 ; RUN: opt --mtriple=amdgcn-amd-amdhsa --data-layout=A5 -S -passes=openmp-opt -openmp-opt-disable-spmdization < %s | FileCheck %s --check-prefix=AMDGPU-DISABLED
5 ; RUN: opt --mtriple=nvptx64-- -S -passes=openmp-opt -openmp-opt-disable-spmdization < %s | FileCheck %s --check-prefix=NVPTX-DISABLED
8 ;; void spmd_amenable(void) __attribute__((assume("ompx_spmd_amenable")));
10 ;; void sequential_loop() {
11 ;; #pragma omp target teams
13 ;; for (int i = 0; i < 100; ++i) {
14 ;; #pragma omp parallel
23 ;; void use(__attribute__((noescape)) int *) __attribute__((assume("ompx_spmd_amenable")));
25 ;; void sequential_loop_to_stack_var() {
26 ;; #pragma omp target teams
30 ;; for (int i = 0; i < 100; ++i) {
31 ;; #pragma omp parallel
40 ;; void sequential_loop_to_shared_var() {
41 ;; #pragma omp target teams
44 ;; for (int i = 0; i < 100; ++i) {
45 ;; #pragma omp parallel
55 ;; void sequential_loop_to_shared_var_guarded() {
56 ;; #pragma omp target teams
59 ;; for (int i = 0; i < 100; ++i) {
60 ;; #pragma omp parallel
70 ;; void do_not_spmdize_target() {
71 ;; #pragma omp target teams
73 ;; // Incompatible parallel level, called both
74 ;; // from parallel and target regions
79 ;; void do_not_spmdize_task() {
84 ;; #pragma omp parallel
89 %struct.ident_t = type { i32, i32, i32, i32, i8* }
90 %struct.kmp_task_t_with_privates = type { %struct.kmp_task_t }
91 %struct.kmp_task_t = type { i8*, i32 (i32, i8*)*, i32, %union.kmp_cmplrdata_t, %union.kmp_cmplrdata_t }
92 %union.kmp_cmplrdata_t = type { i32 (i32, i8*)* }
93 %struct.anon = type {}
95 @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
96 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
97 @__omp_offloading_fd02_2044372e_sequential_loop_l5_exec_mode = weak constant i8 1
98 @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_exec_mode = weak constant i8 1
99 @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_exec_mode = weak constant i8 1
100 @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_exec_mode = weak constant i8 1
101 @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_exec_mode = weak constant i8 1
102 @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_exec_mode = weak constant i8 1
103 @llvm.compiler.used = appending global [6 x i8*] [i8* @__omp_offloading_fd02_2044372e_sequential_loop_l5_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_exec_mode], section "llvm.metadata"
105 ; Function Attrs: alwaysinline convergent norecurse nounwind
107 ; AMDGPU: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
108 ; AMDGPU: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
109 ; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
110 ; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
111 ; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
112 ; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
113 ; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
114 ; AMDGPU: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
115 ; AMDGPU: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [6 x i8*] [i8* @__omp_offloading_fd02_2044372e_sequential_loop_l5_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_exec_mode], section "llvm.metadata"
116 ; AMDGPU: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
117 ; AMDGPU: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4
118 ; AMDGPU: @[[X_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4
119 ; AMDGPU: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
121 ; NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
122 ; NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
123 ; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
124 ; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
125 ; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
126 ; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
127 ; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
128 ; NVPTX: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
129 ; NVPTX: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [6 x i8*] [i8* @__omp_offloading_fd02_2044372e_sequential_loop_l5_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_exec_mode], section "llvm.metadata"
130 ; NVPTX: @[[GLOB2:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 22, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
131 ; NVPTX: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4
132 ; NVPTX: @[[X1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4
133 ; NVPTX: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
135 ; AMDGPU-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
136 ; AMDGPU-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
137 ; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
138 ; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
139 ; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
140 ; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
141 ; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
142 ; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
143 ; AMDGPU-DISABLED: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [6 x i8*] [i8* @__omp_offloading_fd02_2044372e_sequential_loop_l5_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_exec_mode], section "llvm.metadata"
144 ; AMDGPU-DISABLED: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4
145 ; AMDGPU-DISABLED: @[[X_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4
146 ; AMDGPU-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
147 ; AMDGPU-DISABLED: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
148 ; AMDGPU-DISABLED: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
149 ; AMDGPU-DISABLED: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
150 ; AMDGPU-DISABLED: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
152 ; NVPTX-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
153 ; NVPTX-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
154 ; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
155 ; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
156 ; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
157 ; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
158 ; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
159 ; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
160 ; NVPTX-DISABLED: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [6 x i8*] [i8* @__omp_offloading_fd02_2044372e_sequential_loop_l5_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_exec_mode, i8* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_exec_mode], section "llvm.metadata"
161 ; NVPTX-DISABLED: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4
162 ; NVPTX-DISABLED: @[[X1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4
163 ; NVPTX-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
164 ; NVPTX-DISABLED: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
165 ; NVPTX-DISABLED: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
166 ; NVPTX-DISABLED: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
167 ; NVPTX-DISABLED: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef
169 define weak void @__omp_offloading_fd02_2044372e_sequential_loop_l5() #0 {
170 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5
171 ; AMDGPU-SAME: () #[[ATTR0:[0-9]+]] {
172 ; AMDGPU-NEXT: entry:
173 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
174 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
175 ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false)
176 ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
177 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
178 ; AMDGPU: common.ret:
179 ; AMDGPU-NEXT: ret void
180 ; AMDGPU: user_code.entry:
181 ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]]
182 ; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]]
183 ; AMDGPU-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
184 ; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
185 ; AMDGPU-NEXT: br label [[COMMON_RET]]
187 ; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5
188 ; NVPTX-SAME: () #[[ATTR0:[0-9]+]] {
190 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
191 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
192 ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false)
193 ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
194 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
196 ; NVPTX-NEXT: ret void
197 ; NVPTX: user_code.entry:
198 ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]]
199 ; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]]
200 ; NVPTX-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
201 ; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
202 ; NVPTX-NEXT: br label [[COMMON_RET]]
204 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5
205 ; AMDGPU-DISABLED-SAME: () #[[ATTR0:[0-9]+]] {
206 ; AMDGPU-DISABLED-NEXT: entry:
207 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5)
208 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
209 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
210 ; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
211 ; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
212 ; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
213 ; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
214 ; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
215 ; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
216 ; AMDGPU-DISABLED: is_worker_check:
217 ; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
218 ; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
219 ; AMDGPU-DISABLED: worker_state_machine.begin:
220 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
221 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8**
222 ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]])
223 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
224 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
225 ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
226 ; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
227 ; AMDGPU-DISABLED: worker_state_machine.finished:
228 ; AMDGPU-DISABLED-NEXT: ret void
229 ; AMDGPU-DISABLED: worker_state_machine.is_active.check:
230 ; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
231 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.check:
232 ; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__1_wrapper.ID to void (i16, i32)*)
233 ; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
234 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute:
235 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]])
236 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
237 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute:
238 ; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
239 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
240 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.end:
241 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
242 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
243 ; AMDGPU-DISABLED: worker_state_machine.done.barrier:
244 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
245 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
246 ; AMDGPU-DISABLED: thread.user_code.check:
247 ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
248 ; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
249 ; AMDGPU-DISABLED: common.ret:
250 ; AMDGPU-DISABLED-NEXT: ret void
251 ; AMDGPU-DISABLED: user_code.entry:
252 ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]]
253 ; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]]
254 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
255 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
256 ; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]]
258 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5
259 ; NVPTX-DISABLED-SAME: () #[[ATTR0:[0-9]+]] {
260 ; NVPTX-DISABLED-NEXT: entry:
261 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
262 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
263 ; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
264 ; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
265 ; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
266 ; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
267 ; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
268 ; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
269 ; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
270 ; NVPTX-DISABLED: is_worker_check:
271 ; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
272 ; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
273 ; NVPTX-DISABLED: worker_state_machine.begin:
274 ; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
275 ; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
276 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
277 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
278 ; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
279 ; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
280 ; NVPTX-DISABLED: worker_state_machine.finished:
281 ; NVPTX-DISABLED-NEXT: ret void
282 ; NVPTX-DISABLED: worker_state_machine.is_active.check:
283 ; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
284 ; NVPTX-DISABLED: worker_state_machine.parallel_region.check:
285 ; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__1_wrapper.ID to void (i16, i32)*)
286 ; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
287 ; NVPTX-DISABLED: worker_state_machine.parallel_region.execute:
288 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]])
289 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
290 ; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute:
291 ; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
292 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
293 ; NVPTX-DISABLED: worker_state_machine.parallel_region.end:
294 ; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
295 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
296 ; NVPTX-DISABLED: worker_state_machine.done.barrier:
297 ; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
298 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
299 ; NVPTX-DISABLED: thread.user_code.check:
300 ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
301 ; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
302 ; NVPTX-DISABLED: common.ret:
303 ; NVPTX-DISABLED-NEXT: ret void
304 ; NVPTX-DISABLED: user_code.entry:
305 ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]]
306 ; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]]
307 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
308 ; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
309 ; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]]
312 %.zero.addr = alloca i32, align 4
313 %.threadid_temp. = alloca i32, align 4
314 %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true)
315 %exec_user_code = icmp eq i32 %0, -1
316 br i1 %exec_user_code, label %user_code.entry, label %common.ret
318 common.ret: ; preds = %entry, %user_code.entry
321 user_code.entry: ; preds = %entry
322 %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
323 store i32 0, i32* %.zero.addr, align 4
324 store i32 %1, i32* %.threadid_temp., align 4, !tbaa !18
325 call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) #6
326 call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true)
330 ; Function Attrs: alwaysinline convergent norecurse nounwind
331 define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
332 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__
333 ; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
334 ; AMDGPU-NEXT: entry:
335 ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
336 ; AMDGPU-NEXT: br label [[FOR_COND:%.*]]
338 ; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
339 ; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
340 ; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
341 ; AMDGPU: for.cond.cleanup:
342 ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6:[0-9]+]]
343 ; AMDGPU-NEXT: ret void
345 ; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
346 ; AMDGPU-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
347 ; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
348 ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
349 ; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
351 ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__
352 ; NVPTX-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
354 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
355 ; NVPTX-NEXT: br label [[FOR_COND:%.*]]
357 ; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
358 ; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
359 ; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
360 ; NVPTX: for.cond.cleanup:
361 ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6:[0-9]+]]
362 ; NVPTX-NEXT: ret void
364 ; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
365 ; NVPTX-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
366 ; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
367 ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
368 ; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
370 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__
371 ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
372 ; AMDGPU-DISABLED-NEXT: entry:
373 ; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
374 ; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]]
375 ; AMDGPU-DISABLED: for.cond:
376 ; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
377 ; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
378 ; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
379 ; AMDGPU-DISABLED: for.cond.cleanup:
380 ; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6:[0-9]+]]
381 ; AMDGPU-DISABLED-NEXT: ret void
382 ; AMDGPU-DISABLED: for.body:
383 ; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
384 ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
385 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef @__omp_outlined__1_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
386 ; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
387 ; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
389 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__
390 ; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
391 ; NVPTX-DISABLED-NEXT: entry:
392 ; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
393 ; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]]
394 ; NVPTX-DISABLED: for.cond:
395 ; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
396 ; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
397 ; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
398 ; NVPTX-DISABLED: for.cond.cleanup:
399 ; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6:[0-9]+]]
400 ; NVPTX-DISABLED-NEXT: ret void
401 ; NVPTX-DISABLED: for.body:
402 ; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
403 ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
404 ; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef @__omp_outlined__1_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
405 ; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
406 ; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
409 %captured_vars_addrs = alloca [0 x i8*], align 8
412 for.cond: ; preds = %for.body, %entry
413 %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
414 %cmp = icmp slt i32 %i.0, 100
415 br i1 %cmp, label %for.body, label %for.cond.cleanup
417 for.cond.cleanup: ; preds = %for.cond
418 call void @spmd_amenable() #10
421 for.body: ; preds = %for.cond
422 %0 = load i32, i32* %.global_tid., align 4, !tbaa !18
423 %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
424 call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** %1, i64 0)
425 %inc = add nsw i32 %i.0, 1
426 br label %for.cond, !llvm.loop !22
429 ; Function Attrs: alwaysinline convergent norecurse nounwind
430 define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
431 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__1
432 ; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
433 ; AMDGPU-NEXT: entry:
434 ; AMDGPU-NEXT: call void @unknown() #[[ATTR7:[0-9]+]]
435 ; AMDGPU-NEXT: ret void
437 ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__1
438 ; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
440 ; NVPTX-NEXT: call void @unknown() #[[ATTR7:[0-9]+]]
441 ; NVPTX-NEXT: ret void
443 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1
444 ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
445 ; AMDGPU-DISABLED-NEXT: entry:
446 ; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR7:[0-9]+]]
447 ; AMDGPU-DISABLED-NEXT: ret void
449 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1
450 ; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
451 ; NVPTX-DISABLED-NEXT: entry:
452 ; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR7:[0-9]+]]
453 ; NVPTX-DISABLED-NEXT: ret void
456 call void @unknown() #11
460 ; Function Attrs: convergent norecurse nounwind
461 define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #3 {
462 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
463 ; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
464 ; AMDGPU-NEXT: entry:
465 ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
466 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
467 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
468 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
469 ; AMDGPU-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
470 ; AMDGPU-NEXT: ret void
472 ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
473 ; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
475 ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
476 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
477 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
478 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
479 ; NVPTX-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
480 ; NVPTX-NEXT: ret void
482 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
483 ; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
484 ; AMDGPU-DISABLED-NEXT: entry:
485 ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
486 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
487 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
488 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
489 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
490 ; AMDGPU-DISABLED-NEXT: ret void
492 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
493 ; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] {
494 ; NVPTX-DISABLED-NEXT: entry:
495 ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
496 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
497 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
498 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
499 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
500 ; NVPTX-DISABLED-NEXT: ret void
503 %.addr1 = alloca i32, align 4
504 %.zero.addr = alloca i32, align 4
505 %global_args = alloca i8**, align 8
506 store i32 %1, i32* %.addr1, align 4, !tbaa !18
507 store i32 0, i32* %.zero.addr, align 4
508 call void @__kmpc_get_shared_variables(i8*** %global_args)
509 call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr) #6
513 ; Function Attrs: alwaysinline convergent norecurse nounwind
514 define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20() #0 {
515 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20
516 ; AMDGPU-SAME: () #[[ATTR0]] {
517 ; AMDGPU-NEXT: entry:
518 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
519 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
520 ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false)
521 ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
522 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
523 ; AMDGPU: common.ret:
524 ; AMDGPU-NEXT: ret void
525 ; AMDGPU: user_code.entry:
526 ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
527 ; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
528 ; AMDGPU-NEXT: call void @__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
529 ; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
530 ; AMDGPU-NEXT: br label [[COMMON_RET]]
532 ; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20
533 ; NVPTX-SAME: () #[[ATTR0]] {
535 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
536 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
537 ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false)
538 ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
539 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
541 ; NVPTX-NEXT: ret void
542 ; NVPTX: user_code.entry:
543 ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
544 ; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
545 ; NVPTX-NEXT: call void @__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
546 ; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
547 ; NVPTX-NEXT: br label [[COMMON_RET]]
549 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20
550 ; AMDGPU-DISABLED-SAME: () #[[ATTR0]] {
551 ; AMDGPU-DISABLED-NEXT: entry:
552 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5)
553 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
554 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
555 ; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
556 ; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
557 ; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
558 ; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
559 ; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
560 ; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
561 ; AMDGPU-DISABLED: is_worker_check:
562 ; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
563 ; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
564 ; AMDGPU-DISABLED: worker_state_machine.begin:
565 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
566 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8**
567 ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]])
568 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
569 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
570 ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
571 ; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
572 ; AMDGPU-DISABLED: worker_state_machine.finished:
573 ; AMDGPU-DISABLED-NEXT: ret void
574 ; AMDGPU-DISABLED: worker_state_machine.is_active.check:
575 ; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
576 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.check:
577 ; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__3_wrapper.ID to void (i16, i32)*)
578 ; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
579 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute:
580 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]])
581 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
582 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute:
583 ; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
584 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
585 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.end:
586 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
587 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
588 ; AMDGPU-DISABLED: worker_state_machine.done.barrier:
589 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
590 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
591 ; AMDGPU-DISABLED: thread.user_code.check:
592 ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
593 ; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
594 ; AMDGPU-DISABLED: common.ret:
595 ; AMDGPU-DISABLED-NEXT: ret void
596 ; AMDGPU-DISABLED: user_code.entry:
597 ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
598 ; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
599 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
600 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
601 ; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]]
603 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20
604 ; NVPTX-DISABLED-SAME: () #[[ATTR0]] {
605 ; NVPTX-DISABLED-NEXT: entry:
606 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
607 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
608 ; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
609 ; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
610 ; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
611 ; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
612 ; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
613 ; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
614 ; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
615 ; NVPTX-DISABLED: is_worker_check:
616 ; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
617 ; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
618 ; NVPTX-DISABLED: worker_state_machine.begin:
619 ; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
620 ; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
621 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
622 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
623 ; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
624 ; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
625 ; NVPTX-DISABLED: worker_state_machine.finished:
626 ; NVPTX-DISABLED-NEXT: ret void
627 ; NVPTX-DISABLED: worker_state_machine.is_active.check:
628 ; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
629 ; NVPTX-DISABLED: worker_state_machine.parallel_region.check:
630 ; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__3_wrapper.ID to void (i16, i32)*)
631 ; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
632 ; NVPTX-DISABLED: worker_state_machine.parallel_region.execute:
633 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]])
634 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
635 ; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute:
636 ; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
637 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
638 ; NVPTX-DISABLED: worker_state_machine.parallel_region.end:
639 ; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
640 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
641 ; NVPTX-DISABLED: worker_state_machine.done.barrier:
642 ; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
643 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
644 ; NVPTX-DISABLED: thread.user_code.check:
645 ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
646 ; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
647 ; NVPTX-DISABLED: common.ret:
648 ; NVPTX-DISABLED-NEXT: ret void
649 ; NVPTX-DISABLED: user_code.entry:
650 ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
651 ; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
652 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__2(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
653 ; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
654 ; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]]
657 %.zero.addr = alloca i32, align 4
658 %.threadid_temp. = alloca i32, align 4
659 %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true)
660 %exec_user_code = icmp eq i32 %0, -1
661 br i1 %exec_user_code, label %user_code.entry, label %common.ret
663 common.ret: ; preds = %entry, %user_code.entry
666 user_code.entry: ; preds = %entry
667 %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
668 store i32 0, i32* %.zero.addr, align 4
669 store i32 %1, i32* %.threadid_temp., align 4, !tbaa !18
670 call void @__omp_outlined__2(i32* %.threadid_temp., i32* %.zero.addr) #6
671 call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true)
675 ; Function Attrs: alwaysinline convergent norecurse nounwind
676 define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
677 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__2
678 ; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
679 ; AMDGPU-NEXT: entry:
680 ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
681 ; AMDGPU-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4
682 ; AMDGPU-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32*
683 ; AMDGPU-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR6]]
684 ; AMDGPU-NEXT: br label [[FOR_COND:%.*]]
686 ; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
687 ; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
688 ; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
689 ; AMDGPU: for.cond.cleanup:
690 ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6]]
691 ; AMDGPU-NEXT: ret void
693 ; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
694 ; AMDGPU-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
695 ; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
696 ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
697 ; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
699 ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__2
700 ; NVPTX-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
702 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
703 ; NVPTX-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4
704 ; NVPTX-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32*
705 ; NVPTX-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR6]]
706 ; NVPTX-NEXT: br label [[FOR_COND:%.*]]
708 ; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
709 ; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
710 ; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
711 ; NVPTX: for.cond.cleanup:
712 ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6]]
713 ; NVPTX-NEXT: ret void
715 ; NVPTX-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
716 ; NVPTX-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
717 ; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
718 ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
719 ; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
721 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2
722 ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
723 ; AMDGPU-DISABLED-NEXT: entry:
724 ; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
725 ; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4
726 ; AMDGPU-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32*
727 ; AMDGPU-DISABLED-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR6]]
728 ; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]]
729 ; AMDGPU-DISABLED: for.cond:
730 ; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
731 ; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
732 ; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
733 ; AMDGPU-DISABLED: for.cond.cleanup:
734 ; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]]
735 ; AMDGPU-DISABLED-NEXT: ret void
736 ; AMDGPU-DISABLED: for.body:
737 ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
738 ; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
739 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
740 ; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
741 ; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
743 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2
744 ; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
745 ; NVPTX-DISABLED-NEXT: entry:
746 ; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
747 ; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4
748 ; NVPTX-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32*
749 ; NVPTX-DISABLED-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR6]]
750 ; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]]
751 ; NVPTX-DISABLED: for.cond:
752 ; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
753 ; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
754 ; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
755 ; NVPTX-DISABLED: for.cond.cleanup:
756 ; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]]
757 ; NVPTX-DISABLED-NEXT: ret void
758 ; NVPTX-DISABLED: for.body:
759 ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
760 ; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
761 ; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
762 ; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
763 ; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
766 %captured_vars_addrs = alloca [0 x i8*], align 8
767 %x = call align 4 i8* @__kmpc_alloc_shared(i64 4)
768 %x_on_stack = bitcast i8* %x to i32*
769 call void @use(i32* nocapture %x_on_stack) #10
772 for.cond: ; preds = %for.body, %entry
773 %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
774 %cmp = icmp slt i32 %i.0, 100
775 br i1 %cmp, label %for.body, label %for.cond.cleanup
777 for.cond.cleanup: ; preds = %for.cond
778 call void @spmd_amenable() #10
779 call void @__kmpc_free_shared(i8* %x, i64 4)
782 for.body: ; preds = %for.cond
783 %0 = load i32, i32* %.global_tid., align 4, !tbaa !18
784 %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
785 call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** %1, i64 0)
786 %inc = add nsw i32 %i.0, 1
787 br label %for.cond, !llvm.loop !25
789 ; Function Attrs: alwaysinline convergent norecurse nounwind
790 define internal void @__omp_outlined__3(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
791 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__3
792 ; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
793 ; AMDGPU-NEXT: entry:
794 ; AMDGPU-NEXT: call void @unknown() #[[ATTR7]]
795 ; AMDGPU-NEXT: ret void
797 ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__3
798 ; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
800 ; NVPTX-NEXT: call void @unknown() #[[ATTR7]]
801 ; NVPTX-NEXT: ret void
803 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3
804 ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
805 ; AMDGPU-DISABLED-NEXT: entry:
806 ; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR7]]
807 ; AMDGPU-DISABLED-NEXT: ret void
809 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3
810 ; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
811 ; NVPTX-DISABLED-NEXT: entry:
812 ; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR7]]
813 ; NVPTX-DISABLED-NEXT: ret void
816 call void @unknown() #11
820 ; Function Attrs: convergent norecurse nounwind
821 define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #3 {
822 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
823 ; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
824 ; AMDGPU-NEXT: entry:
825 ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
826 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
827 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
828 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
829 ; AMDGPU-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
830 ; AMDGPU-NEXT: ret void
832 ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
833 ; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
835 ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
836 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
837 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
838 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
839 ; NVPTX-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
840 ; NVPTX-NEXT: ret void
842 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
843 ; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
844 ; AMDGPU-DISABLED-NEXT: entry:
845 ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
846 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
847 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
848 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
849 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
850 ; AMDGPU-DISABLED-NEXT: ret void
852 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper
853 ; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
854 ; NVPTX-DISABLED-NEXT: entry:
855 ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
856 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
857 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
858 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
859 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__3(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
860 ; NVPTX-DISABLED-NEXT: ret void
863 %.addr1 = alloca i32, align 4
864 %.zero.addr = alloca i32, align 4
865 %global_args = alloca i8**, align 8
866 store i32 %1, i32* %.addr1, align 4, !tbaa !18
867 store i32 0, i32* %.zero.addr, align 4
868 call void @__kmpc_get_shared_variables(i8*** %global_args)
869 call void @__omp_outlined__3(i32* %.addr1, i32* %.zero.addr) #6
874 ; Function Attrs: alwaysinline convergent norecurse nounwind
875 define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35() #0 {
876 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35
877 ; AMDGPU-SAME: () #[[ATTR0]] {
878 ; AMDGPU-NEXT: entry:
879 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
880 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
881 ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false)
882 ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
883 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
884 ; AMDGPU: common.ret:
885 ; AMDGPU-NEXT: ret void
886 ; AMDGPU: user_code.entry:
887 ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
888 ; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
889 ; AMDGPU-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
890 ; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
891 ; AMDGPU-NEXT: br label [[COMMON_RET]]
893 ; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35
894 ; NVPTX-SAME: () #[[ATTR0]] {
896 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
897 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
898 ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false)
899 ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
900 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
902 ; NVPTX-NEXT: ret void
903 ; NVPTX: user_code.entry:
904 ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
905 ; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
906 ; NVPTX-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
907 ; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
908 ; NVPTX-NEXT: br label [[COMMON_RET]]
910 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35
911 ; AMDGPU-DISABLED-SAME: () #[[ATTR0]] {
912 ; AMDGPU-DISABLED-NEXT: entry:
913 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5)
914 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
915 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
916 ; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
917 ; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
918 ; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
919 ; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
920 ; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
921 ; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
922 ; AMDGPU-DISABLED: is_worker_check:
923 ; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
924 ; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
925 ; AMDGPU-DISABLED: worker_state_machine.begin:
926 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
927 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8**
928 ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]])
929 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
930 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
931 ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
932 ; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
933 ; AMDGPU-DISABLED: worker_state_machine.finished:
934 ; AMDGPU-DISABLED-NEXT: ret void
935 ; AMDGPU-DISABLED: worker_state_machine.is_active.check:
936 ; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
937 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.check:
938 ; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__5_wrapper.ID to void (i16, i32)*)
939 ; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
940 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute:
941 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]])
942 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
943 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute:
944 ; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
945 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
946 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.end:
947 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
948 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
949 ; AMDGPU-DISABLED: worker_state_machine.done.barrier:
950 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
951 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
952 ; AMDGPU-DISABLED: thread.user_code.check:
953 ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
954 ; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
955 ; AMDGPU-DISABLED: common.ret:
956 ; AMDGPU-DISABLED-NEXT: ret void
957 ; AMDGPU-DISABLED: user_code.entry:
958 ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
959 ; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
960 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
961 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
962 ; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]]
964 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35
965 ; NVPTX-DISABLED-SAME: () #[[ATTR0]] {
966 ; NVPTX-DISABLED-NEXT: entry:
967 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
968 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
969 ; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
970 ; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
971 ; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
972 ; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
973 ; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
974 ; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
975 ; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
976 ; NVPTX-DISABLED: is_worker_check:
977 ; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
978 ; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
979 ; NVPTX-DISABLED: worker_state_machine.begin:
980 ; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
981 ; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
982 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
983 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
984 ; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
985 ; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
986 ; NVPTX-DISABLED: worker_state_machine.finished:
987 ; NVPTX-DISABLED-NEXT: ret void
988 ; NVPTX-DISABLED: worker_state_machine.is_active.check:
989 ; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
990 ; NVPTX-DISABLED: worker_state_machine.parallel_region.check:
991 ; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__5_wrapper.ID to void (i16, i32)*)
992 ; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
993 ; NVPTX-DISABLED: worker_state_machine.parallel_region.execute:
994 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]])
995 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
996 ; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute:
997 ; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
998 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
999 ; NVPTX-DISABLED: worker_state_machine.parallel_region.end:
1000 ; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
1001 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
1002 ; NVPTX-DISABLED: worker_state_machine.done.barrier:
1003 ; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
1004 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
1005 ; NVPTX-DISABLED: thread.user_code.check:
1006 ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
1007 ; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
1008 ; NVPTX-DISABLED: common.ret:
1009 ; NVPTX-DISABLED-NEXT: ret void
1010 ; NVPTX-DISABLED: user_code.entry:
1011 ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
1012 ; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
1013 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
1014 ; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
1015 ; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]]
1018 %.zero.addr = alloca i32, align 4
1019 %.threadid_temp. = alloca i32, align 4
1020 %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true)
1021 %exec_user_code = icmp eq i32 %0, -1
1022 br i1 %exec_user_code, label %user_code.entry, label %common.ret
1024 common.ret: ; preds = %entry, %user_code.entry
1027 user_code.entry: ; preds = %entry
1028 %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
1029 store i32 0, i32* %.zero.addr, align 4
1030 store i32 %1, i32* %.threadid_temp., align 4, !tbaa !18
1031 call void @__omp_outlined__4(i32* %.threadid_temp., i32* %.zero.addr) #6
1032 call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true)
1033 br label %common.ret
1036 ; Function Attrs: alwaysinline convergent norecurse nounwind
1037 define internal void @__omp_outlined__4(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
1038 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__4
1039 ; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
1040 ; AMDGPU-NEXT: entry:
1041 ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
1042 ; AMDGPU-NEXT: br label [[FOR_COND:%.*]]
1044 ; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
1045 ; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
1046 ; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
1047 ; AMDGPU: for.cond.cleanup:
1048 ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6]]
1049 ; AMDGPU-NEXT: ret void
1051 ; AMDGPU-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
1052 ; AMDGPU-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*), i8** [[TMP0]], align 8, !tbaa [[TBAA26:![0-9]+]]
1053 ; AMDGPU-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
1054 ; AMDGPU-NEXT: [[TMP2:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
1055 ; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 1)
1056 ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
1057 ; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]]
1059 ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__4
1060 ; NVPTX-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
1061 ; NVPTX-NEXT: entry:
1062 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
1063 ; NVPTX-NEXT: br label [[FOR_COND:%.*]]
1065 ; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
1066 ; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
1067 ; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
1068 ; NVPTX: for.cond.cleanup:
1069 ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6]]
1070 ; NVPTX-NEXT: ret void
1072 ; NVPTX-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
1073 ; NVPTX-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*), i8** [[TMP0]], align 8, !tbaa [[TBAA26:![0-9]+]]
1074 ; NVPTX-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
1075 ; NVPTX-NEXT: [[TMP2:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
1076 ; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 1)
1077 ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
1078 ; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]]
1080 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4
1081 ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
1082 ; AMDGPU-DISABLED-NEXT: entry:
1083 ; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
1084 ; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]]
1085 ; AMDGPU-DISABLED: for.cond:
1086 ; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
1087 ; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
1088 ; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
1089 ; AMDGPU-DISABLED: for.cond.cleanup:
1090 ; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]]
1091 ; AMDGPU-DISABLED-NEXT: ret void
1092 ; AMDGPU-DISABLED: for.body:
1093 ; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
1094 ; AMDGPU-DISABLED-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*), i8** [[TMP0]], align 8, !tbaa [[TBAA26:![0-9]+]]
1095 ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
1096 ; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
1097 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 1)
1098 ; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
1099 ; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]]
1101 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4
1102 ; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
1103 ; NVPTX-DISABLED-NEXT: entry:
1104 ; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
1105 ; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]]
1106 ; NVPTX-DISABLED: for.cond:
1107 ; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
1108 ; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
1109 ; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
1110 ; NVPTX-DISABLED: for.cond.cleanup:
1111 ; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]]
1112 ; NVPTX-DISABLED-NEXT: ret void
1113 ; NVPTX-DISABLED: for.body:
1114 ; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
1115 ; NVPTX-DISABLED-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x, i32 0, i32 0) to i8*), i8** [[TMP0]], align 8, !tbaa [[TBAA26:![0-9]+]]
1116 ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
1117 ; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
1118 ; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 1)
1119 ; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
1120 ; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]]
1123 %captured_vars_addrs = alloca [1 x i8*], align 8
1124 %x = call align 4 i8* @__kmpc_alloc_shared(i64 4)
1125 %x_on_stack = bitcast i8* %x to i32*
1128 for.cond: ; preds = %for.body, %entry
1129 %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
1130 %cmp = icmp slt i32 %i.0, 100
1131 br i1 %cmp, label %for.body, label %for.cond.cleanup
1133 for.cond.cleanup: ; preds = %for.cond
1134 call void @spmd_amenable() #10
1135 call void @__kmpc_free_shared(i8* %x, i64 4)
1138 for.body: ; preds = %for.cond
1139 %0 = getelementptr inbounds [1 x i8*], [1 x i8*]* %captured_vars_addrs, i64 0, i64 0
1140 store i8* %x, i8** %0, align 8, !tbaa !26
1141 %1 = load i32, i32* %.global_tid., align 4, !tbaa !18
1142 %2 = bitcast [1 x i8*]* %captured_vars_addrs to i8**
1143 call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__5 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** %2, i64 1)
1144 %inc = add nsw i32 %i.0, 1
1145 br label %for.cond, !llvm.loop !28
1148 ; Function Attrs: alwaysinline convergent norecurse nounwind
1149 define internal void @__omp_outlined__5(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %x) #0 {
1150 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5
1151 ; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] {
1152 ; AMDGPU-NEXT: entry:
1153 ; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4, !tbaa [[TBAA18]]
1154 ; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
1155 ; AMDGPU-NEXT: store i32 [[INC]], i32* [[X]], align 4, !tbaa [[TBAA18]]
1156 ; AMDGPU-NEXT: call void @unknown() #[[ATTR7]]
1157 ; AMDGPU-NEXT: ret void
1159 ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5
1160 ; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] {
1161 ; NVPTX-NEXT: entry:
1162 ; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4, !tbaa [[TBAA18]]
1163 ; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
1164 ; NVPTX-NEXT: store i32 [[INC]], i32* [[X]], align 4, !tbaa [[TBAA18]]
1165 ; NVPTX-NEXT: call void @unknown() #[[ATTR7]]
1166 ; NVPTX-NEXT: ret void
1168 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5
1169 ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] {
1170 ; AMDGPU-DISABLED-NEXT: entry:
1171 ; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4, !tbaa [[TBAA18]]
1172 ; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
1173 ; AMDGPU-DISABLED-NEXT: store i32 [[INC]], i32* [[X]], align 4, !tbaa [[TBAA18]]
1174 ; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR7]]
1175 ; AMDGPU-DISABLED-NEXT: ret void
1177 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5
1178 ; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] {
1179 ; NVPTX-DISABLED-NEXT: entry:
1180 ; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4, !tbaa [[TBAA18]]
1181 ; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
1182 ; NVPTX-DISABLED-NEXT: store i32 [[INC]], i32* [[X]], align 4, !tbaa [[TBAA18]]
1183 ; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR7]]
1184 ; NVPTX-DISABLED-NEXT: ret void
1187 %0 = load i32, i32* %x, align 4, !tbaa !18
1188 %inc = add nsw i32 %0, 1
1189 store i32 %inc, i32* %x, align 4, !tbaa !18
1190 call void @unknown() #11
1194 ; Function Attrs: convergent norecurse nounwind
1195 define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 {
1196 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
1197 ; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
1198 ; AMDGPU-NEXT: entry:
1199 ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1200 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1201 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
1202 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1203 ; AMDGPU-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
1204 ; AMDGPU-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32**
1205 ; AMDGPU-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP3]], align 8, !tbaa [[TBAA26]]
1206 ; AMDGPU-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP4]]) #[[ATTR3]]
1207 ; AMDGPU-NEXT: ret void
1209 ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
1210 ; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
1211 ; NVPTX-NEXT: entry:
1212 ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1213 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1214 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
1215 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1216 ; NVPTX-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
1217 ; NVPTX-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32**
1218 ; NVPTX-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP3]], align 8, !tbaa [[TBAA26]]
1219 ; NVPTX-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP4]]) #[[ATTR3]]
1220 ; NVPTX-NEXT: ret void
1222 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
1223 ; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
1224 ; AMDGPU-DISABLED-NEXT: entry:
1225 ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1226 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1227 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
1228 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1229 ; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
1230 ; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32**
1231 ; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP3]], align 8, !tbaa [[TBAA26]]
1232 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP4]]) #[[ATTR3]]
1233 ; AMDGPU-DISABLED-NEXT: ret void
1235 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper
1236 ; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
1237 ; NVPTX-DISABLED-NEXT: entry:
1238 ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1239 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1240 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
1241 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1242 ; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
1243 ; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32**
1244 ; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP3]], align 8, !tbaa [[TBAA26]]
1245 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__5(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP4]]) #[[ATTR3]]
1246 ; NVPTX-DISABLED-NEXT: ret void
1249 %.addr1 = alloca i32, align 4
1250 %.zero.addr = alloca i32, align 4
1251 %global_args = alloca i8**, align 8
1252 store i32 %1, i32* %.addr1, align 4, !tbaa !18
1253 store i32 0, i32* %.zero.addr, align 4
1254 call void @__kmpc_get_shared_variables(i8*** %global_args)
1255 %2 = load i8**, i8*** %global_args, align 8
1256 %3 = bitcast i8** %2 to i32**
1257 %4 = load i32*, i32** %3, align 8, !tbaa !26
1258 call void @__omp_outlined__5(i32* %.addr1, i32* %.zero.addr, i32* %4) #6
1262 ; Function Attrs: alwaysinline convergent norecurse nounwind
1263 define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50() #0 {
1264 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50
1265 ; AMDGPU-SAME: () #[[ATTR0]] {
1266 ; AMDGPU-NEXT: entry:
1267 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1268 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1269 ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false)
1270 ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
1271 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
1272 ; AMDGPU: common.ret:
1273 ; AMDGPU-NEXT: ret void
1274 ; AMDGPU: user_code.entry:
1275 ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
1276 ; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
1277 ; AMDGPU-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
1278 ; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
1279 ; AMDGPU-NEXT: br label [[COMMON_RET]]
1281 ; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50
1282 ; NVPTX-SAME: () #[[ATTR0]] {
1283 ; NVPTX-NEXT: entry:
1284 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1285 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1286 ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false)
1287 ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
1288 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
1289 ; NVPTX: common.ret:
1290 ; NVPTX-NEXT: ret void
1291 ; NVPTX: user_code.entry:
1292 ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
1293 ; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
1294 ; NVPTX-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
1295 ; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false)
1296 ; NVPTX-NEXT: br label [[COMMON_RET]]
1298 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50
1299 ; AMDGPU-DISABLED-SAME: () #[[ATTR0]] {
1300 ; AMDGPU-DISABLED-NEXT: entry:
1301 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5)
1302 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1303 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1304 ; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
1305 ; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
1306 ; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
1307 ; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
1308 ; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
1309 ; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
1310 ; AMDGPU-DISABLED: is_worker_check:
1311 ; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
1312 ; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
1313 ; AMDGPU-DISABLED: worker_state_machine.begin:
1314 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
1315 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8**
1316 ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]])
1317 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
1318 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
1319 ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
1320 ; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
1321 ; AMDGPU-DISABLED: worker_state_machine.finished:
1322 ; AMDGPU-DISABLED-NEXT: ret void
1323 ; AMDGPU-DISABLED: worker_state_machine.is_active.check:
1324 ; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
1325 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.check:
1326 ; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__7_wrapper.ID to void (i16, i32)*)
1327 ; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
1328 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute:
1329 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]])
1330 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
1331 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute:
1332 ; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
1333 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
1334 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.end:
1335 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
1336 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
1337 ; AMDGPU-DISABLED: worker_state_machine.done.barrier:
1338 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
1339 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
1340 ; AMDGPU-DISABLED: thread.user_code.check:
1341 ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
1342 ; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
1343 ; AMDGPU-DISABLED: common.ret:
1344 ; AMDGPU-DISABLED-NEXT: ret void
1345 ; AMDGPU-DISABLED: user_code.entry:
1346 ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
1347 ; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
1348 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
1349 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
1350 ; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]]
1352 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50
1353 ; NVPTX-DISABLED-SAME: () #[[ATTR0]] {
1354 ; NVPTX-DISABLED-NEXT: entry:
1355 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
1356 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1357 ; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1358 ; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
1359 ; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
1360 ; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
1361 ; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
1362 ; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
1363 ; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
1364 ; NVPTX-DISABLED: is_worker_check:
1365 ; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
1366 ; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
1367 ; NVPTX-DISABLED: worker_state_machine.begin:
1368 ; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
1369 ; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
1370 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
1371 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
1372 ; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
1373 ; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
1374 ; NVPTX-DISABLED: worker_state_machine.finished:
1375 ; NVPTX-DISABLED-NEXT: ret void
1376 ; NVPTX-DISABLED: worker_state_machine.is_active.check:
1377 ; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
1378 ; NVPTX-DISABLED: worker_state_machine.parallel_region.check:
1379 ; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__7_wrapper.ID to void (i16, i32)*)
1380 ; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
1381 ; NVPTX-DISABLED: worker_state_machine.parallel_region.execute:
1382 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]])
1383 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
1384 ; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute:
1385 ; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
1386 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
1387 ; NVPTX-DISABLED: worker_state_machine.parallel_region.end:
1388 ; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
1389 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
1390 ; NVPTX-DISABLED: worker_state_machine.done.barrier:
1391 ; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
1392 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
1393 ; NVPTX-DISABLED: thread.user_code.check:
1394 ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
1395 ; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
1396 ; NVPTX-DISABLED: common.ret:
1397 ; NVPTX-DISABLED-NEXT: ret void
1398 ; NVPTX-DISABLED: user_code.entry:
1399 ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
1400 ; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]]
1401 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
1402 ; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
1403 ; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]]
1406 %.zero.addr = alloca i32, align 4
1407 %.threadid_temp. = alloca i32, align 4
1408 %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true)
1409 %exec_user_code = icmp eq i32 %0, -1
1410 br i1 %exec_user_code, label %user_code.entry, label %common.ret
1412 common.ret: ; preds = %entry, %user_code.entry
1415 user_code.entry: ; preds = %entry
1416 %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
1417 store i32 0, i32* %.zero.addr, align 4
1418 store i32 %1, i32* %.threadid_temp., align 4, !tbaa !18
1419 call void @__omp_outlined__6(i32* %.threadid_temp., i32* %.zero.addr) #6
1420 call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true)
1421 br label %common.ret
1424 ; Function Attrs: alwaysinline convergent norecurse nounwind
1425 define internal void @__omp_outlined__6(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
1426 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__6
1427 ; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
1428 ; AMDGPU-NEXT: entry:
1429 ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
1430 ; AMDGPU-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x.1, i32 0, i32 0) to i8*) to i32*
1431 ; AMDGPU-NEXT: br label [[REGION_CHECK_TID:%.*]]
1432 ; AMDGPU: region.check.tid:
1433 ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1434 ; AMDGPU-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
1435 ; AMDGPU-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
1436 ; AMDGPU: region.guarded:
1437 ; AMDGPU-NEXT: store i32 42, i32* [[X_ON_STACK]], align 4, !tbaa [[TBAA18]]
1438 ; AMDGPU-NEXT: br label [[REGION_GUARDED_END:%.*]]
1439 ; AMDGPU: region.guarded.end:
1440 ; AMDGPU-NEXT: br label [[REGION_BARRIER]]
1441 ; AMDGPU: region.barrier:
1442 ; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
1443 ; AMDGPU-NEXT: br label [[REGION_EXIT:%.*]]
1444 ; AMDGPU: region.exit:
1445 ; AMDGPU-NEXT: br label [[FOR_COND:%.*]]
1447 ; AMDGPU-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[REGION_EXIT]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
1448 ; AMDGPU-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
1449 ; AMDGPU-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
1450 ; AMDGPU: for.cond.cleanup:
1451 ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6]]
1452 ; AMDGPU-NEXT: ret void
1454 ; AMDGPU-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
1455 ; AMDGPU-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x.1, i32 0, i32 0) to i8*), i8** [[TMP2]], align 8, !tbaa [[TBAA26]]
1456 ; AMDGPU-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
1457 ; AMDGPU-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
1458 ; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP3]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP4]], i64 noundef 1)
1459 ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
1460 ; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]]
1462 ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__6
1463 ; NVPTX-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
1464 ; NVPTX-NEXT: entry:
1465 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
1466 ; NVPTX-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*) to i32*
1467 ; NVPTX-NEXT: br label [[REGION_CHECK_TID:%.*]]
1468 ; NVPTX: region.check.tid:
1469 ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1470 ; NVPTX-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
1471 ; NVPTX-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]]
1472 ; NVPTX: region.guarded:
1473 ; NVPTX-NEXT: store i32 42, i32* [[X_ON_STACK]], align 4, !tbaa [[TBAA18]]
1474 ; NVPTX-NEXT: br label [[REGION_GUARDED_END:%.*]]
1475 ; NVPTX: region.guarded.end:
1476 ; NVPTX-NEXT: br label [[REGION_BARRIER]]
1477 ; NVPTX: region.barrier:
1478 ; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
1479 ; NVPTX-NEXT: br label [[REGION_EXIT:%.*]]
1480 ; NVPTX: region.exit:
1481 ; NVPTX-NEXT: br label [[FOR_COND:%.*]]
1483 ; NVPTX-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[REGION_EXIT]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
1484 ; NVPTX-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
1485 ; NVPTX-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
1486 ; NVPTX: for.cond.cleanup:
1487 ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6]]
1488 ; NVPTX-NEXT: ret void
1490 ; NVPTX-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
1491 ; NVPTX-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*), i8** [[TMP2]], align 8, !tbaa [[TBAA26]]
1492 ; NVPTX-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
1493 ; NVPTX-NEXT: [[TMP4:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
1494 ; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP3]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP4]], i64 noundef 1)
1495 ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
1496 ; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]]
1498 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6
1499 ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
1500 ; AMDGPU-DISABLED-NEXT: entry:
1501 ; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
1502 ; AMDGPU-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x.1, i32 0, i32 0) to i8*) to i32*
1503 ; AMDGPU-DISABLED-NEXT: store i32 42, i32* [[X_ON_STACK]], align 4, !tbaa [[TBAA18]]
1504 ; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]]
1505 ; AMDGPU-DISABLED: for.cond:
1506 ; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
1507 ; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
1508 ; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
1509 ; AMDGPU-DISABLED: for.cond.cleanup:
1510 ; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]]
1511 ; AMDGPU-DISABLED-NEXT: ret void
1512 ; AMDGPU-DISABLED: for.body:
1513 ; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
1514 ; AMDGPU-DISABLED-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x.1, i32 0, i32 0) to i8*), i8** [[TMP0]], align 8, !tbaa [[TBAA26]]
1515 ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
1516 ; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
1517 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 1)
1518 ; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
1519 ; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]]
1521 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6
1522 ; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
1523 ; NVPTX-DISABLED-NEXT: entry:
1524 ; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
1525 ; NVPTX-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*) to i32*
1526 ; NVPTX-DISABLED-NEXT: store i32 42, i32* [[X_ON_STACK]], align 4, !tbaa [[TBAA18]]
1527 ; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]]
1528 ; NVPTX-DISABLED: for.cond:
1529 ; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
1530 ; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
1531 ; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
1532 ; NVPTX-DISABLED: for.cond.cleanup:
1533 ; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]]
1534 ; NVPTX-DISABLED-NEXT: ret void
1535 ; NVPTX-DISABLED: for.body:
1536 ; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
1537 ; NVPTX-DISABLED-NEXT: store i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* @x1, i32 0, i32 0) to i8*), i8** [[TMP0]], align 8, !tbaa [[TBAA26]]
1538 ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]]
1539 ; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
1540 ; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP1]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 1)
1541 ; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
1542 ; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]]
1545 %captured_vars_addrs = alloca [1 x i8*], align 8
1546 %x = call align 4 i8* @__kmpc_alloc_shared(i64 4)
1547 %x_on_stack = bitcast i8* %x to i32*
1548 store i32 42, i32* %x_on_stack, align 4, !tbaa !18
1551 for.cond: ; preds = %for.body, %entry
1552 %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
1553 %cmp = icmp slt i32 %i.0, 100
1554 br i1 %cmp, label %for.body, label %for.cond.cleanup
1556 for.cond.cleanup: ; preds = %for.cond
1557 call void @spmd_amenable() #10
1558 call void @__kmpc_free_shared(i8* %x, i64 4)
1561 for.body: ; preds = %for.cond
1562 %0 = getelementptr inbounds [1 x i8*], [1 x i8*]* %captured_vars_addrs, i64 0, i64 0
1563 store i8* %x, i8** %0, align 8, !tbaa !26
1564 %1 = load i32, i32* %.global_tid., align 4, !tbaa !18
1565 %2 = bitcast [1 x i8*]* %captured_vars_addrs to i8**
1566 call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__7 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** %2, i64 1)
1567 %inc = add nsw i32 %i.0, 1
1568 br label %for.cond, !llvm.loop !29
1571 ; Function Attrs: alwaysinline convergent norecurse nounwind
1572 define internal void @__omp_outlined__7(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* nonnull align 4 dereferenceable(4) %x) #0 {
1573 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7
1574 ; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] {
1575 ; AMDGPU-NEXT: entry:
1576 ; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4, !tbaa [[TBAA18]]
1577 ; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
1578 ; AMDGPU-NEXT: store i32 [[INC]], i32* [[X]], align 4, !tbaa [[TBAA18]]
1579 ; AMDGPU-NEXT: call void @unknown() #[[ATTR7]]
1580 ; AMDGPU-NEXT: ret void
1582 ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7
1583 ; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] {
1584 ; NVPTX-NEXT: entry:
1585 ; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4, !tbaa [[TBAA18]]
1586 ; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
1587 ; NVPTX-NEXT: store i32 [[INC]], i32* [[X]], align 4, !tbaa [[TBAA18]]
1588 ; NVPTX-NEXT: call void @unknown() #[[ATTR7]]
1589 ; NVPTX-NEXT: ret void
1591 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7
1592 ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] {
1593 ; AMDGPU-DISABLED-NEXT: entry:
1594 ; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4, !tbaa [[TBAA18]]
1595 ; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
1596 ; AMDGPU-DISABLED-NEXT: store i32 [[INC]], i32* [[X]], align 4, !tbaa [[TBAA18]]
1597 ; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR7]]
1598 ; AMDGPU-DISABLED-NEXT: ret void
1600 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7
1601 ; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] {
1602 ; NVPTX-DISABLED-NEXT: entry:
1603 ; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, i32* [[X]], align 4, !tbaa [[TBAA18]]
1604 ; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1
1605 ; NVPTX-DISABLED-NEXT: store i32 [[INC]], i32* [[X]], align 4, !tbaa [[TBAA18]]
1606 ; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR7]]
1607 ; NVPTX-DISABLED-NEXT: ret void
1610 %0 = load i32, i32* %x, align 4, !tbaa !18
1611 %inc = add nsw i32 %0, 1
1612 store i32 %inc, i32* %x, align 4, !tbaa !18
1613 call void @unknown() #11
1617 ; Function Attrs: convergent norecurse nounwind
1618 define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 {
1619 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
1620 ; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
1621 ; AMDGPU-NEXT: entry:
1622 ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1623 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1624 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
1625 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1626 ; AMDGPU-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
1627 ; AMDGPU-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32**
1628 ; AMDGPU-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP3]], align 8, !tbaa [[TBAA26]]
1629 ; AMDGPU-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP4]]) #[[ATTR3]]
1630 ; AMDGPU-NEXT: ret void
1632 ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
1633 ; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
1634 ; NVPTX-NEXT: entry:
1635 ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1636 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1637 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
1638 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1639 ; NVPTX-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
1640 ; NVPTX-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32**
1641 ; NVPTX-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP3]], align 8, !tbaa [[TBAA26]]
1642 ; NVPTX-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP4]]) #[[ATTR3]]
1643 ; NVPTX-NEXT: ret void
1645 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
1646 ; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
1647 ; AMDGPU-DISABLED-NEXT: entry:
1648 ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1649 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1650 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
1651 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1652 ; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
1653 ; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32**
1654 ; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP3]], align 8, !tbaa [[TBAA26]]
1655 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP4]]) #[[ATTR3]]
1656 ; AMDGPU-DISABLED-NEXT: ret void
1658 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper
1659 ; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
1660 ; NVPTX-DISABLED-NEXT: entry:
1661 ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1662 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1663 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
1664 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
1665 ; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
1666 ; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = bitcast i8** [[TMP2]] to i32**
1667 ; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP3]], align 8, !tbaa [[TBAA26]]
1668 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__7(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP4]]) #[[ATTR3]]
1669 ; NVPTX-DISABLED-NEXT: ret void
1672 %.addr1 = alloca i32, align 4
1673 %.zero.addr = alloca i32, align 4
1674 %global_args = alloca i8**, align 8
1675 store i32 %1, i32* %.addr1, align 4, !tbaa !18
1676 store i32 0, i32* %.zero.addr, align 4
1677 call void @__kmpc_get_shared_variables(i8*** %global_args)
1678 %2 = load i8**, i8*** %global_args, align 8
1679 %3 = bitcast i8** %2 to i32**
1680 %4 = load i32*, i32** %3, align 8, !tbaa !26
1681 call void @__omp_outlined__7(i32* %.addr1, i32* %.zero.addr, i32* %4) #6
1685 ; Function Attrs: alwaysinline convergent norecurse nounwind
1686 define weak void @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65() #0 {
1687 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65
1688 ; AMDGPU-SAME: () #[[ATTR0]] {
1689 ; AMDGPU-NEXT: entry:
1690 ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5)
1691 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1692 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1693 ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
1694 ; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
1695 ; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
1696 ; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
1697 ; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
1698 ; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
1699 ; AMDGPU: is_worker_check:
1700 ; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
1701 ; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
1702 ; AMDGPU: worker_state_machine.begin:
1703 ; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
1704 ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8**
1705 ; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]])
1706 ; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
1707 ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
1708 ; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
1709 ; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
1710 ; AMDGPU: worker_state_machine.finished:
1711 ; AMDGPU-NEXT: ret void
1712 ; AMDGPU: worker_state_machine.is_active.check:
1713 ; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
1714 ; AMDGPU: worker_state_machine.parallel_region.fallback.execute:
1715 ; AMDGPU-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
1716 ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
1717 ; AMDGPU: worker_state_machine.parallel_region.end:
1718 ; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel()
1719 ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
1720 ; AMDGPU: worker_state_machine.done.barrier:
1721 ; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
1722 ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
1723 ; AMDGPU: thread.user_code.check:
1724 ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
1725 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
1726 ; AMDGPU: common.ret:
1727 ; AMDGPU-NEXT: ret void
1728 ; AMDGPU: user_code.entry:
1729 ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
1730 ; AMDGPU-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
1731 ; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
1732 ; AMDGPU-NEXT: br label [[COMMON_RET]]
1734 ; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65
1735 ; NVPTX-SAME: () #[[ATTR0]] {
1736 ; NVPTX-NEXT: entry:
1737 ; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
1738 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1739 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1740 ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
1741 ; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
1742 ; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
1743 ; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
1744 ; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
1745 ; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
1746 ; NVPTX: is_worker_check:
1747 ; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
1748 ; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
1749 ; NVPTX: worker_state_machine.begin:
1750 ; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
1751 ; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
1752 ; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
1753 ; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
1754 ; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
1755 ; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
1756 ; NVPTX: worker_state_machine.finished:
1757 ; NVPTX-NEXT: ret void
1758 ; NVPTX: worker_state_machine.is_active.check:
1759 ; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
1760 ; NVPTX: worker_state_machine.parallel_region.fallback.execute:
1761 ; NVPTX-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
1762 ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
1763 ; NVPTX: worker_state_machine.parallel_region.end:
1764 ; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel()
1765 ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
1766 ; NVPTX: worker_state_machine.done.barrier:
1767 ; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
1768 ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
1769 ; NVPTX: thread.user_code.check:
1770 ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
1771 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
1772 ; NVPTX: common.ret:
1773 ; NVPTX-NEXT: ret void
1774 ; NVPTX: user_code.entry:
1775 ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
1776 ; NVPTX-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
1777 ; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
1778 ; NVPTX-NEXT: br label [[COMMON_RET]]
1780 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65
1781 ; AMDGPU-DISABLED-SAME: () #[[ATTR0]] {
1782 ; AMDGPU-DISABLED-NEXT: entry:
1783 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5)
1784 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1785 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1786 ; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
1787 ; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
1788 ; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
1789 ; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
1790 ; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
1791 ; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
1792 ; AMDGPU-DISABLED: is_worker_check:
1793 ; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
1794 ; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
1795 ; AMDGPU-DISABLED: worker_state_machine.begin:
1796 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
1797 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8**
1798 ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]])
1799 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
1800 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
1801 ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
1802 ; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
1803 ; AMDGPU-DISABLED: worker_state_machine.finished:
1804 ; AMDGPU-DISABLED-NEXT: ret void
1805 ; AMDGPU-DISABLED: worker_state_machine.is_active.check:
1806 ; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
1807 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute:
1808 ; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
1809 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
1810 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.end:
1811 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
1812 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
1813 ; AMDGPU-DISABLED: worker_state_machine.done.barrier:
1814 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
1815 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
1816 ; AMDGPU-DISABLED: thread.user_code.check:
1817 ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
1818 ; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
1819 ; AMDGPU-DISABLED: common.ret:
1820 ; AMDGPU-DISABLED-NEXT: ret void
1821 ; AMDGPU-DISABLED: user_code.entry:
1822 ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
1823 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
1824 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
1825 ; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]]
1827 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65
1828 ; NVPTX-DISABLED-SAME: () #[[ATTR0]] {
1829 ; NVPTX-DISABLED-NEXT: entry:
1830 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
1831 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1832 ; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1833 ; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
1834 ; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
1835 ; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
1836 ; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
1837 ; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
1838 ; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
1839 ; NVPTX-DISABLED: is_worker_check:
1840 ; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
1841 ; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
1842 ; NVPTX-DISABLED: worker_state_machine.begin:
1843 ; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
1844 ; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
1845 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
1846 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
1847 ; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
1848 ; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
1849 ; NVPTX-DISABLED: worker_state_machine.finished:
1850 ; NVPTX-DISABLED-NEXT: ret void
1851 ; NVPTX-DISABLED: worker_state_machine.is_active.check:
1852 ; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
1853 ; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute:
1854 ; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
1855 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
1856 ; NVPTX-DISABLED: worker_state_machine.parallel_region.end:
1857 ; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
1858 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
1859 ; NVPTX-DISABLED: worker_state_machine.done.barrier:
1860 ; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
1861 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
1862 ; NVPTX-DISABLED: thread.user_code.check:
1863 ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
1864 ; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
1865 ; NVPTX-DISABLED: common.ret:
1866 ; NVPTX-DISABLED-NEXT: ret void
1867 ; NVPTX-DISABLED: user_code.entry:
1868 ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
1869 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__8(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]]
1870 ; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
1871 ; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]]
1874 %.zero.addr = alloca i32, align 4
1875 %.threadid_temp. = alloca i32, align 4
1876 %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true)
1877 %exec_user_code = icmp eq i32 %0, -1
1878 br i1 %exec_user_code, label %user_code.entry, label %common.ret
1880 common.ret: ; preds = %entry, %user_code.entry
1883 user_code.entry: ; preds = %entry
1884 %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
1885 store i32 0, i32* %.zero.addr, align 4
1886 store i32 %1, i32* %.threadid_temp., align 4, !tbaa !18
1887 call void @__omp_outlined__8(i32* %.threadid_temp., i32* %.zero.addr) #6
1888 call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true)
1889 br label %common.ret
1892 ; Function Attrs: alwaysinline convergent norecurse nounwind
1893 define internal void @__omp_outlined__8(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
1894 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__8
1895 ; AMDGPU-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
1896 ; AMDGPU-NEXT: entry:
1897 ; AMDGPU-NEXT: call void @unknown() #[[ATTR7]]
1898 ; AMDGPU-NEXT: ret void
1900 ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__8
1901 ; NVPTX-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
1902 ; NVPTX-NEXT: entry:
1903 ; NVPTX-NEXT: call void @unknown() #[[ATTR7]]
1904 ; NVPTX-NEXT: ret void
1906 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8
1907 ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
1908 ; AMDGPU-DISABLED-NEXT: entry:
1909 ; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR7]]
1910 ; AMDGPU-DISABLED-NEXT: ret void
1912 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8
1913 ; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
1914 ; NVPTX-DISABLED-NEXT: entry:
1915 ; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR7]]
1916 ; NVPTX-DISABLED-NEXT: ret void
1919 call void @unknown() #11
1923 ; Function Attrs: alwaysinline convergent norecurse nounwind
1924 define weak void @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74() #0 {
1925 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74
1926 ; AMDGPU-SAME: () #[[ATTR0]] {
1927 ; AMDGPU-NEXT: entry:
1928 ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5)
1929 ; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
1930 ; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
1931 ; AMDGPU-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
1932 ; AMDGPU-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
1933 ; AMDGPU-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
1934 ; AMDGPU-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
1935 ; AMDGPU-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
1936 ; AMDGPU: is_worker_check:
1937 ; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
1938 ; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
1939 ; AMDGPU: worker_state_machine.begin:
1940 ; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
1941 ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8**
1942 ; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]])
1943 ; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
1944 ; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
1945 ; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
1946 ; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
1947 ; AMDGPU: worker_state_machine.finished:
1948 ; AMDGPU-NEXT: ret void
1949 ; AMDGPU: worker_state_machine.is_active.check:
1950 ; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
1951 ; AMDGPU: worker_state_machine.parallel_region.check:
1952 ; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__9_wrapper.ID to void (i16, i32)*)
1953 ; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
1954 ; AMDGPU: worker_state_machine.parallel_region.execute:
1955 ; AMDGPU-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]])
1956 ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
1957 ; AMDGPU: worker_state_machine.parallel_region.fallback.execute:
1958 ; AMDGPU-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
1959 ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
1960 ; AMDGPU: worker_state_machine.parallel_region.end:
1961 ; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel()
1962 ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
1963 ; AMDGPU: worker_state_machine.done.barrier:
1964 ; AMDGPU-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
1965 ; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
1966 ; AMDGPU: thread.user_code.check:
1967 ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
1968 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
1969 ; AMDGPU: common.ret:
1970 ; AMDGPU-NEXT: ret void
1971 ; AMDGPU: user_code.entry:
1972 ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
1973 ; AMDGPU-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @"_omp_task_entry$" to i32 (i32, i8*)*)) #[[ATTR3]]
1974 ; AMDGPU-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP2]]) #[[ATTR3]]
1975 ; AMDGPU-NEXT: [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
1976 ; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__9 to i8*), i8* @__omp_outlined__9_wrapper.ID, i8** [[TMP4]], i64 0)
1977 ; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
1978 ; AMDGPU-NEXT: br label [[COMMON_RET]]
1980 ; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74
1981 ; NVPTX-SAME: () #[[ATTR0]] {
1982 ; NVPTX-NEXT: entry:
1983 ; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
1984 ; NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
1985 ; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
1986 ; NVPTX-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
1987 ; NVPTX-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
1988 ; NVPTX-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
1989 ; NVPTX-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
1990 ; NVPTX-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
1991 ; NVPTX: is_worker_check:
1992 ; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
1993 ; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
1994 ; NVPTX: worker_state_machine.begin:
1995 ; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
1996 ; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
1997 ; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
1998 ; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
1999 ; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
2000 ; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
2001 ; NVPTX: worker_state_machine.finished:
2002 ; NVPTX-NEXT: ret void
2003 ; NVPTX: worker_state_machine.is_active.check:
2004 ; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
2005 ; NVPTX: worker_state_machine.parallel_region.check:
2006 ; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__9_wrapper.ID to void (i16, i32)*)
2007 ; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
2008 ; NVPTX: worker_state_machine.parallel_region.execute:
2009 ; NVPTX-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]])
2010 ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
2011 ; NVPTX: worker_state_machine.parallel_region.fallback.execute:
2012 ; NVPTX-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
2013 ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
2014 ; NVPTX: worker_state_machine.parallel_region.end:
2015 ; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel()
2016 ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
2017 ; NVPTX: worker_state_machine.done.barrier:
2018 ; NVPTX-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
2019 ; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
2020 ; NVPTX: thread.user_code.check:
2021 ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
2022 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
2023 ; NVPTX: common.ret:
2024 ; NVPTX-NEXT: ret void
2025 ; NVPTX: user_code.entry:
2026 ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
2027 ; NVPTX-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @"_omp_task_entry$" to i32 (i32, i8*)*)) #[[ATTR3]]
2028 ; NVPTX-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP2]]) #[[ATTR3]]
2029 ; NVPTX-NEXT: [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
2030 ; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__9 to i8*), i8* @__omp_outlined__9_wrapper.ID, i8** [[TMP4]], i64 0)
2031 ; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
2032 ; NVPTX-NEXT: br label [[COMMON_RET]]
2034 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74
2035 ; AMDGPU-DISABLED-SAME: () #[[ATTR0]] {
2036 ; AMDGPU-DISABLED-NEXT: entry:
2037 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5)
2038 ; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
2039 ; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
2040 ; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
2041 ; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
2042 ; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
2043 ; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
2044 ; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
2045 ; AMDGPU-DISABLED: is_worker_check:
2046 ; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
2047 ; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
2048 ; AMDGPU-DISABLED: worker_state_machine.begin:
2049 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
2050 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8**
2051 ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]])
2052 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8
2053 ; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
2054 ; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
2055 ; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
2056 ; AMDGPU-DISABLED: worker_state_machine.finished:
2057 ; AMDGPU-DISABLED-NEXT: ret void
2058 ; AMDGPU-DISABLED: worker_state_machine.is_active.check:
2059 ; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
2060 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.check:
2061 ; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__9_wrapper.ID to void (i16, i32)*)
2062 ; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
2063 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute:
2064 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]])
2065 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
2066 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute:
2067 ; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
2068 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
2069 ; AMDGPU-DISABLED: worker_state_machine.parallel_region.end:
2070 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
2071 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
2072 ; AMDGPU-DISABLED: worker_state_machine.done.barrier:
2073 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
2074 ; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
2075 ; AMDGPU-DISABLED: thread.user_code.check:
2076 ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
2077 ; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
2078 ; AMDGPU-DISABLED: common.ret:
2079 ; AMDGPU-DISABLED-NEXT: ret void
2080 ; AMDGPU-DISABLED: user_code.entry:
2081 ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
2082 ; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @"_omp_task_entry$" to i32 (i32, i8*)*)) #[[ATTR3]]
2083 ; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP2]]) #[[ATTR3]]
2084 ; AMDGPU-DISABLED-NEXT: [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
2085 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__9 to i8*), i8* @__omp_outlined__9_wrapper.ID, i8** [[TMP4]], i64 0)
2086 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
2087 ; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]]
2089 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74
2090 ; NVPTX-DISABLED-SAME: () #[[ATTR0]] {
2091 ; NVPTX-DISABLED-NEXT: entry:
2092 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
2093 ; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
2094 ; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true)
2095 ; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
2096 ; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
2097 ; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]]
2098 ; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]]
2099 ; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]]
2100 ; NVPTX-DISABLED: is_worker_check:
2101 ; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
2102 ; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
2103 ; NVPTX-DISABLED: worker_state_machine.begin:
2104 ; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
2105 ; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
2106 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
2107 ; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
2108 ; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
2109 ; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
2110 ; NVPTX-DISABLED: worker_state_machine.finished:
2111 ; NVPTX-DISABLED-NEXT: ret void
2112 ; NVPTX-DISABLED: worker_state_machine.is_active.check:
2113 ; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
2114 ; NVPTX-DISABLED: worker_state_machine.parallel_region.check:
2115 ; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__9_wrapper.ID to void (i16, i32)*)
2116 ; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
2117 ; NVPTX-DISABLED: worker_state_machine.parallel_region.execute:
2118 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]])
2119 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
2120 ; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute:
2121 ; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]])
2122 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
2123 ; NVPTX-DISABLED: worker_state_machine.parallel_region.end:
2124 ; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel()
2125 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
2126 ; NVPTX-DISABLED: worker_state_machine.done.barrier:
2127 ; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
2128 ; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
2129 ; NVPTX-DISABLED: thread.user_code.check:
2130 ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
2131 ; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]]
2132 ; NVPTX-DISABLED: common.ret:
2133 ; NVPTX-DISABLED-NEXT: ret void
2134 ; NVPTX-DISABLED: user_code.entry:
2135 ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]]
2136 ; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @"_omp_task_entry$" to i32 (i32, i8*)*)) #[[ATTR3]]
2137 ; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i8* [[TMP2]]) #[[ATTR3]]
2138 ; NVPTX-DISABLED-NEXT: [[TMP4:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
2139 ; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__9 to i8*), i8* @__omp_outlined__9_wrapper.ID, i8** [[TMP4]], i64 0)
2140 ; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
2141 ; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]]
2144 %captured_vars_addrs = alloca [0 x i8*], align 8
2145 %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true)
2146 %exec_user_code = icmp eq i32 %0, -1
2147 br i1 %exec_user_code, label %user_code.entry, label %common.ret
2149 common.ret: ; preds = %entry, %user_code.entry
2152 user_code.entry: ; preds = %entry
2153 %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
2154 %2 = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @1, i32 %1, i32 1, i64 40, i64 0, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @"_omp_task_entry$" to i32 (i32, i8*)*))
2155 %3 = bitcast i8* %2 to %struct.kmp_task_t_with_privates*
2156 %4 = call i32 @__kmpc_omp_task(%struct.ident_t* @1, i32 %1, i8* %2)
2157 %5 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
2158 call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__9 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__9_wrapper to i8*), i8** %5, i64 0)
2159 call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true)
2160 br label %common.ret
2163 ; Function Attrs: alwaysinline convergent nounwind
2164 define internal void @.omp_outlined.(i32 %.global_tid., i32* noalias %.part_id., i8* noalias %.privates., void (i8*, ...)* noalias %.copy_fn., i8* %.task_t., %struct.anon* noalias %__context) #9 {
2165 ; AMDGPU-LABEL: define {{[^@]+}}@.omp_outlined.
2166 ; AMDGPU-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone [[DOTPART_ID_:%.*]], i8* noalias nocapture nofree readnone align 4294967296 [[DOTPRIVATES_:%.*]], void (i8*, ...)* noalias nocapture nofree readnone [[DOTCOPY_FN_:%.*]], i8* noalias nocapture nofree nonnull readnone align 8 dereferenceable(8) [[DOTTASK_T_:%.*]], %struct.anon* noalias nocapture nofree readnone [[__CONTEXT:%.*]]) #[[ATTR2:[0-9]+]] {
2167 ; AMDGPU-NEXT: entry:
2168 ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6]]
2169 ; AMDGPU-NEXT: ret void
2171 ; NVPTX-LABEL: define {{[^@]+}}@.omp_outlined.
2172 ; NVPTX-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone [[DOTPART_ID_:%.*]], i8* noalias nocapture nofree readnone align 4294967296 [[DOTPRIVATES_:%.*]], void (i8*, ...)* noalias nocapture nofree readnone [[DOTCOPY_FN_:%.*]], i8* noalias nocapture nofree nonnull readnone align 8 dereferenceable(8) [[DOTTASK_T_:%.*]], %struct.anon* noalias nocapture nofree readnone [[__CONTEXT:%.*]]) #[[ATTR2:[0-9]+]] {
2173 ; NVPTX-NEXT: entry:
2174 ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6]]
2175 ; NVPTX-NEXT: ret void
2177 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@.omp_outlined.
2178 ; AMDGPU-DISABLED-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone [[DOTPART_ID_:%.*]], i8* noalias nocapture nofree readnone align 4294967296 [[DOTPRIVATES_:%.*]], void (i8*, ...)* noalias nocapture nofree readnone [[DOTCOPY_FN_:%.*]], i8* noalias nocapture nofree nonnull readnone align 8 dereferenceable(8) [[DOTTASK_T_:%.*]], %struct.anon* noalias nocapture nofree readnone [[__CONTEXT:%.*]]) #[[ATTR2:[0-9]+]] {
2179 ; AMDGPU-DISABLED-NEXT: entry:
2180 ; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]]
2181 ; AMDGPU-DISABLED-NEXT: ret void
2183 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@.omp_outlined.
2184 ; NVPTX-DISABLED-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone [[DOTPART_ID_:%.*]], i8* noalias nocapture nofree readnone align 4294967296 [[DOTPRIVATES_:%.*]], void (i8*, ...)* noalias nocapture nofree readnone [[DOTCOPY_FN_:%.*]], i8* noalias nocapture nofree nonnull readnone align 8 dereferenceable(8) [[DOTTASK_T_:%.*]], %struct.anon* noalias nocapture nofree readnone [[__CONTEXT:%.*]]) #[[ATTR2:[0-9]+]] {
2185 ; NVPTX-DISABLED-NEXT: entry:
2186 ; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR6]]
2187 ; NVPTX-DISABLED-NEXT: ret void
2190 call void @spmd_amenable() #10
2194 ; Function Attrs: convergent norecurse nounwind
2195 define internal i32 @"_omp_task_entry$"(i32 %0, %struct.kmp_task_t_with_privates* noalias %1) #3 {
2197 %2 = getelementptr inbounds %struct.kmp_task_t_with_privates, %struct.kmp_task_t_with_privates* %1, i32 0, i32 0
2198 %3 = getelementptr inbounds %struct.kmp_task_t, %struct.kmp_task_t* %2, i32 0, i32 2
2199 %4 = getelementptr inbounds %struct.kmp_task_t, %struct.kmp_task_t* %2, i32 0, i32 0
2200 %5 = load i8*, i8** %4, align 8, !tbaa !30
2201 %6 = bitcast i8* %5 to %struct.anon*
2202 %7 = bitcast %struct.kmp_task_t_with_privates* %1 to i8*
2203 call void @.omp_outlined.(i32 %0, i32* %3, i8* null, void (i8*, ...)* null, i8* %7, %struct.anon* %6) #6
2207 ; Function Attrs: nounwind
2208 declare i8* @__kmpc_omp_task_alloc(%struct.ident_t*, i32, i32, i64, i64, i32 (i32, i8*)*) #6
2210 ; Function Attrs: nounwind
2211 declare i32 @__kmpc_omp_task(%struct.ident_t*, i32, i8*) #6
2213 ; Function Attrs: nosync nounwind
2214 declare void @__kmpc_free_shared(i8* nocapture, i64) #8
2216 ; Function Attrs: nofree nosync nounwind
2217 declare i8* @__kmpc_alloc_shared(i64) #7
2219 ; Function Attrs: convergent
2220 declare void @use(i32* nocapture) #5
2222 ; Function Attrs: convergent
2223 declare void @unknown() #2
2225 ; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn
2226 declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1
2228 declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1)
2230 declare void @__kmpc_get_shared_variables(i8***)
2232 ; Function Attrs: alwaysinline
2233 declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) #4
2235 ; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn
2236 declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1
2238 ; Function Attrs: convergent
2239 declare void @spmd_amenable() #5
2241 ; Function Attrs: nounwind
2242 declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #6
2244 declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1)
2247 ; Function Attrs: alwaysinline convergent norecurse nounwind
2248 define internal void @__omp_outlined__9(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
2249 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__9
2250 ; AMDGPU-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
2251 ; AMDGPU-NEXT: entry:
2252 ; AMDGPU-NEXT: call void @unknown() #[[ATTR7]]
2253 ; AMDGPU-NEXT: ret void
2255 ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__9
2256 ; NVPTX-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
2257 ; NVPTX-NEXT: entry:
2258 ; NVPTX-NEXT: call void @unknown() #[[ATTR7]]
2259 ; NVPTX-NEXT: ret void
2261 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9
2262 ; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
2263 ; AMDGPU-DISABLED-NEXT: entry:
2264 ; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR7]]
2265 ; AMDGPU-DISABLED-NEXT: ret void
2267 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9
2268 ; NVPTX-DISABLED-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
2269 ; NVPTX-DISABLED-NEXT: entry:
2270 ; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR7]]
2271 ; NVPTX-DISABLED-NEXT: ret void
2274 call void @unknown() #11
2278 ; Function Attrs: convergent norecurse nounwind
2279 define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #3 {
2280 ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper
2281 ; AMDGPU-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
2282 ; AMDGPU-NEXT: entry:
2283 ; AMDGPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2284 ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
2285 ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
2286 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
2287 ; AMDGPU-NEXT: call void @__omp_outlined__9(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
2288 ; AMDGPU-NEXT: ret void
2290 ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper
2291 ; NVPTX-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
2292 ; NVPTX-NEXT: entry:
2293 ; NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2294 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
2295 ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
2296 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
2297 ; NVPTX-NEXT: call void @__omp_outlined__9(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
2298 ; NVPTX-NEXT: ret void
2300 ; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper
2301 ; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
2302 ; AMDGPU-DISABLED-NEXT: entry:
2303 ; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2304 ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
2305 ; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
2306 ; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
2307 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__9(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
2308 ; AMDGPU-DISABLED-NEXT: ret void
2310 ; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper
2311 ; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR1]] {
2312 ; NVPTX-DISABLED-NEXT: entry:
2313 ; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2314 ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
2315 ; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
2316 ; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
2317 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__9(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR3]]
2318 ; NVPTX-DISABLED-NEXT: ret void
2321 %.addr1 = alloca i32, align 4
2322 %.zero.addr = alloca i32, align 4
2323 %global_args = alloca i8**, align 8
2324 store i32 %1, i32* %.addr1, align 4, !tbaa !18
2325 store i32 0, i32* %.zero.addr, align 4
2326 call void @__kmpc_get_shared_variables(i8*** %global_args)
2327 call void @__omp_outlined__9(i32* %.addr1, i32* %.zero.addr) #6
2331 attributes #0 = { alwaysinline convergent norecurse nounwind }
2332 attributes #1 = { argmemonly mustprogress nofree nosync nounwind willreturn }
2333 attributes #2 = { convergent }
2334 attributes #3 = { convergent norecurse nounwind }
2335 attributes #4 = { alwaysinline }
2336 attributes #5 = { convergent "llvm.assume"="ompx_spmd_amenable" }
2337 attributes #6 = { nounwind }
2338 attributes #7 = { nofree nosync nounwind }
2339 attributes #8 = { nosync nounwind }
2340 attributes #9 = { alwaysinline convergent nounwind }
2341 attributes #10 = { convergent "llvm.assume"="ompx_spmd_amenable" }
2342 attributes #11 = { convergent }
2344 !omp_offload.info = !{!0, !1, !2, !3, !4, !5}
2345 !nvvm.annotations = !{!6, !7, !8, !9, !10, !11}
2346 !llvm.module.flags = !{!12, !13, !14, !15, !16}
2347 !llvm.ident = !{!17}
2349 !0 = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
2350 !1 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
2351 !2 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
2352 !3 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
2353 !4 = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
2354 !5 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
2355 !6 = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1}
2356 !7 = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
2357 !8 = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
2358 !9 = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1}
2359 !10 = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1}
2360 !11 = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1}
2361 !12 = !{i32 1, !"wchar_size", i32 4}
2362 !13 = !{i32 7, !"openmp", i32 50}
2363 !14 = !{i32 7, !"openmp-device", i32 50}
2364 !15 = !{i32 7, !"PIC Level", i32 2}
2365 !16 = !{i32 7, !"frame-pointer", i32 2}
2366 !17 = !{!"clang version 14.0.0"}
2367 !18 = !{!19, !19, i64 0}
2368 !19 = !{!"int", !20, i64 0}
2369 !20 = !{!"omnipotent char", !21, i64 0}
2370 !21 = !{!"Simple C/C++ TBAA"}
2371 !22 = distinct !{!22, !23, !24}
2372 !23 = !{!"llvm.loop.mustprogress"}
2373 !24 = !{!"llvm.loop.unroll.disable"}
2374 !25 = distinct !{!25, !23, !24}
2375 !26 = !{!27, !27, i64 0}
2376 !27 = !{!"any pointer", !20, i64 0}
2377 !28 = distinct !{!28, !23, !24}
2378 !29 = distinct !{!29, !23, !24}
2379 !30 = !{!31, !27, i64 0}
2380 !31 = !{!"kmp_task_t_with_privates", !32, i64 0}
2381 !32 = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32}
2383 ; AMDGPU: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind }
2384 ; AMDGPU: attributes #[[ATTR1]] = { convergent norecurse nounwind }
2385 ; AMDGPU: attributes #[[ATTR2]] = { alwaysinline convergent nounwind }
2386 ; AMDGPU: attributes #[[ATTR3]] = { nounwind }
2387 ; AMDGPU: attributes #[[ATTR4:[0-9]+]] = { nosync nounwind }
2388 ; AMDGPU: attributes #[[ATTR5:[0-9]+]] = { nofree nosync nounwind }
2389 ; AMDGPU: attributes #[[ATTR6]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
2390 ; AMDGPU: attributes #[[ATTR7]] = { convergent }
2391 ; AMDGPU: attributes #[[ATTR8:[0-9]+]] = { argmemonly nofree nosync nounwind willreturn }
2392 ; AMDGPU: attributes #[[ATTR9:[0-9]+]] = { alwaysinline }
2393 ; AMDGPU: attributes #[[ATTR10:[0-9]+]] = { convergent nounwind }
2395 ; NVPTX: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind }
2396 ; NVPTX: attributes #[[ATTR1]] = { convergent norecurse nounwind }
2397 ; NVPTX: attributes #[[ATTR2]] = { alwaysinline convergent nounwind }
2398 ; NVPTX: attributes #[[ATTR3]] = { nounwind }
2399 ; NVPTX: attributes #[[ATTR4:[0-9]+]] = { nosync nounwind }
2400 ; NVPTX: attributes #[[ATTR5:[0-9]+]] = { nofree nosync nounwind }
2401 ; NVPTX: attributes #[[ATTR6]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
2402 ; NVPTX: attributes #[[ATTR7]] = { convergent }
2403 ; NVPTX: attributes #[[ATTR8:[0-9]+]] = { argmemonly nofree nosync nounwind willreturn }
2404 ; NVPTX: attributes #[[ATTR9:[0-9]+]] = { alwaysinline }
2405 ; NVPTX: attributes #[[ATTR10:[0-9]+]] = { convergent nounwind }
2407 ; AMDGPU-DISABLED: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind }
2408 ; AMDGPU-DISABLED: attributes #[[ATTR1]] = { convergent norecurse nounwind }
2409 ; AMDGPU-DISABLED: attributes #[[ATTR2]] = { alwaysinline convergent nounwind }
2410 ; AMDGPU-DISABLED: attributes #[[ATTR3]] = { nounwind }
2411 ; AMDGPU-DISABLED: attributes #[[ATTR4:[0-9]+]] = { nosync nounwind }
2412 ; AMDGPU-DISABLED: attributes #[[ATTR5:[0-9]+]] = { nofree nosync nounwind }
2413 ; AMDGPU-DISABLED: attributes #[[ATTR6]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
2414 ; AMDGPU-DISABLED: attributes #[[ATTR7]] = { convergent }
2415 ; AMDGPU-DISABLED: attributes #[[ATTR8:[0-9]+]] = { argmemonly nofree nosync nounwind willreturn }
2416 ; AMDGPU-DISABLED: attributes #[[ATTR9:[0-9]+]] = { alwaysinline }
2417 ; AMDGPU-DISABLED: attributes #[[ATTR10:[0-9]+]] = { convergent nounwind }
2419 ; NVPTX-DISABLED: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind }
2420 ; NVPTX-DISABLED: attributes #[[ATTR1]] = { convergent norecurse nounwind }
2421 ; NVPTX-DISABLED: attributes #[[ATTR2]] = { alwaysinline convergent nounwind }
2422 ; NVPTX-DISABLED: attributes #[[ATTR3]] = { nounwind }
2423 ; NVPTX-DISABLED: attributes #[[ATTR4:[0-9]+]] = { nosync nounwind }
2424 ; NVPTX-DISABLED: attributes #[[ATTR5:[0-9]+]] = { nofree nosync nounwind }
2425 ; NVPTX-DISABLED: attributes #[[ATTR6]] = { convergent "llvm.assume"="ompx_spmd_amenable" }
2426 ; NVPTX-DISABLED: attributes #[[ATTR7]] = { convergent }
2427 ; NVPTX-DISABLED: attributes #[[ATTR8:[0-9]+]] = { argmemonly nofree nosync nounwind willreturn }
2428 ; NVPTX-DISABLED: attributes #[[ATTR9:[0-9]+]] = { alwaysinline }
2429 ; NVPTX-DISABLED: attributes #[[ATTR10:[0-9]+]] = { convergent nounwind }
2431 ; AMDGPU: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
2432 ; AMDGPU: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
2433 ; AMDGPU: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
2434 ; AMDGPU: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
2435 ; AMDGPU: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
2436 ; AMDGPU: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
2437 ; AMDGPU: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1}
2438 ; AMDGPU: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
2439 ; AMDGPU: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
2440 ; AMDGPU: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1}
2441 ; AMDGPU: [[META10:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1}
2442 ; AMDGPU: [[META11:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1}
2443 ; AMDGPU: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
2444 ; AMDGPU: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50}
2445 ; AMDGPU: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
2446 ; AMDGPU: [[META15:![0-9]+]] = !{i32 7, !"PIC Level", i32 2}
2447 ; AMDGPU: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
2448 ; AMDGPU: [[META17:![0-9]+]] = !{!"clang version 14.0.0"}
2449 ; AMDGPU: [[TBAA18]] = !{!19, !19, i64 0}
2450 ; AMDGPU: [[META19:![0-9]+]] = !{!"int", !20, i64 0}
2451 ; AMDGPU: [[META20:![0-9]+]] = !{!"omnipotent char", !21, i64 0}
2452 ; AMDGPU: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"}
2453 ; AMDGPU: [[LOOP22]] = distinct !{!22, !23, !24}
2454 ; AMDGPU: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"}
2455 ; AMDGPU: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"}
2456 ; AMDGPU: [[LOOP25]] = distinct !{!25, !23, !24}
2457 ; AMDGPU: [[TBAA26]] = !{!27, !27, i64 0}
2458 ; AMDGPU: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0}
2459 ; AMDGPU: [[LOOP28]] = distinct !{!28, !23, !24}
2460 ; AMDGPU: [[LOOP29]] = distinct !{!29, !23, !24}
2461 ; AMDGPU: [[META30:![0-9]+]] = !{!31, !27, i64 0}
2462 ; AMDGPU: [[META31:![0-9]+]] = !{!"kmp_task_t_with_privates", !32, i64 0}
2463 ; AMDGPU: [[META32:![0-9]+]] = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32}
2465 ; NVPTX: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
2466 ; NVPTX: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
2467 ; NVPTX: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
2468 ; NVPTX: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
2469 ; NVPTX: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
2470 ; NVPTX: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
2471 ; NVPTX: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1}
2472 ; NVPTX: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
2473 ; NVPTX: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
2474 ; NVPTX: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1}
2475 ; NVPTX: [[META10:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1}
2476 ; NVPTX: [[META11:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1}
2477 ; NVPTX: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
2478 ; NVPTX: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50}
2479 ; NVPTX: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
2480 ; NVPTX: [[META15:![0-9]+]] = !{i32 7, !"PIC Level", i32 2}
2481 ; NVPTX: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
2482 ; NVPTX: [[META17:![0-9]+]] = !{!"clang version 14.0.0"}
2483 ; NVPTX: [[TBAA18]] = !{!19, !19, i64 0}
2484 ; NVPTX: [[META19:![0-9]+]] = !{!"int", !20, i64 0}
2485 ; NVPTX: [[META20:![0-9]+]] = !{!"omnipotent char", !21, i64 0}
2486 ; NVPTX: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"}
2487 ; NVPTX: [[LOOP22]] = distinct !{!22, !23, !24}
2488 ; NVPTX: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"}
2489 ; NVPTX: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"}
2490 ; NVPTX: [[LOOP25]] = distinct !{!25, !23, !24}
2491 ; NVPTX: [[TBAA26]] = !{!27, !27, i64 0}
2492 ; NVPTX: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0}
2493 ; NVPTX: [[LOOP28]] = distinct !{!28, !23, !24}
2494 ; NVPTX: [[LOOP29]] = distinct !{!29, !23, !24}
2495 ; NVPTX: [[META30:![0-9]+]] = !{!31, !27, i64 0}
2496 ; NVPTX: [[META31:![0-9]+]] = !{!"kmp_task_t_with_privates", !32, i64 0}
2497 ; NVPTX: [[META32:![0-9]+]] = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32}
2499 ; AMDGPU-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
2500 ; AMDGPU-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
2501 ; AMDGPU-DISABLED: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
2502 ; AMDGPU-DISABLED: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
2503 ; AMDGPU-DISABLED: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
2504 ; AMDGPU-DISABLED: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
2505 ; AMDGPU-DISABLED: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1}
2506 ; AMDGPU-DISABLED: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
2507 ; AMDGPU-DISABLED: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
2508 ; AMDGPU-DISABLED: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1}
2509 ; AMDGPU-DISABLED: [[META10:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1}
2510 ; AMDGPU-DISABLED: [[META11:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1}
2511 ; AMDGPU-DISABLED: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
2512 ; AMDGPU-DISABLED: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50}
2513 ; AMDGPU-DISABLED: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
2514 ; AMDGPU-DISABLED: [[META15:![0-9]+]] = !{i32 7, !"PIC Level", i32 2}
2515 ; AMDGPU-DISABLED: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
2516 ; AMDGPU-DISABLED: [[META17:![0-9]+]] = !{!"clang version 14.0.0"}
2517 ; AMDGPU-DISABLED: [[TBAA18]] = !{!19, !19, i64 0}
2518 ; AMDGPU-DISABLED: [[META19:![0-9]+]] = !{!"int", !20, i64 0}
2519 ; AMDGPU-DISABLED: [[META20:![0-9]+]] = !{!"omnipotent char", !21, i64 0}
2520 ; AMDGPU-DISABLED: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"}
2521 ; AMDGPU-DISABLED: [[LOOP22]] = distinct !{!22, !23, !24}
2522 ; AMDGPU-DISABLED: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"}
2523 ; AMDGPU-DISABLED: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"}
2524 ; AMDGPU-DISABLED: [[LOOP25]] = distinct !{!25, !23, !24}
2525 ; AMDGPU-DISABLED: [[TBAA26]] = !{!27, !27, i64 0}
2526 ; AMDGPU-DISABLED: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0}
2527 ; AMDGPU-DISABLED: [[LOOP28]] = distinct !{!28, !23, !24}
2528 ; AMDGPU-DISABLED: [[LOOP29]] = distinct !{!29, !23, !24}
2529 ; AMDGPU-DISABLED: [[META30:![0-9]+]] = !{!31, !27, i64 0}
2530 ; AMDGPU-DISABLED: [[META31:![0-9]+]] = !{!"kmp_task_t_with_privates", !32, i64 0}
2531 ; AMDGPU-DISABLED: [[META32:![0-9]+]] = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32}
2533 ; NVPTX-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
2534 ; NVPTX-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
2535 ; NVPTX-DISABLED: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0}
2536 ; NVPTX-DISABLED: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
2537 ; NVPTX-DISABLED: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
2538 ; NVPTX-DISABLED: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
2539 ; NVPTX-DISABLED: [[META6:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1}
2540 ; NVPTX-DISABLED: [[META7:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1}
2541 ; NVPTX-DISABLED: [[META8:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1}
2542 ; NVPTX-DISABLED: [[META9:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1}
2543 ; NVPTX-DISABLED: [[META10:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1}
2544 ; NVPTX-DISABLED: [[META11:![0-9]+]] = !{void ()* @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1}
2545 ; NVPTX-DISABLED: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
2546 ; NVPTX-DISABLED: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50}
2547 ; NVPTX-DISABLED: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
2548 ; NVPTX-DISABLED: [[META15:![0-9]+]] = !{i32 7, !"PIC Level", i32 2}
2549 ; NVPTX-DISABLED: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
2550 ; NVPTX-DISABLED: [[META17:![0-9]+]] = !{!"clang version 14.0.0"}
2551 ; NVPTX-DISABLED: [[TBAA18]] = !{!19, !19, i64 0}
2552 ; NVPTX-DISABLED: [[META19:![0-9]+]] = !{!"int", !20, i64 0}
2553 ; NVPTX-DISABLED: [[META20:![0-9]+]] = !{!"omnipotent char", !21, i64 0}
2554 ; NVPTX-DISABLED: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"}
2555 ; NVPTX-DISABLED: [[LOOP22]] = distinct !{!22, !23, !24}
2556 ; NVPTX-DISABLED: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"}
2557 ; NVPTX-DISABLED: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"}
2558 ; NVPTX-DISABLED: [[LOOP25]] = distinct !{!25, !23, !24}
2559 ; NVPTX-DISABLED: [[TBAA26]] = !{!27, !27, i64 0}
2560 ; NVPTX-DISABLED: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0}
2561 ; NVPTX-DISABLED: [[LOOP28]] = distinct !{!28, !23, !24}
2562 ; NVPTX-DISABLED: [[LOOP29]] = distinct !{!29, !23, !24}
2563 ; NVPTX-DISABLED: [[META30:![0-9]+]] = !{!31, !27, i64 0}
2564 ; NVPTX-DISABLED: [[META31:![0-9]+]] = !{!"kmp_task_t_with_privates", !32, i64 0}
2565 ; NVPTX-DISABLED: [[META32:![0-9]+]] = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32}