1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
2 // Test target codegen - host bc file has to be created first.
3 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
4 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK1
5 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
6 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK2
7 // RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK3
8 // expected-no-diagnostics
21 #pragma omp teams reduction(+: e)
27 #pragma omp teams reduction(^: c) reduction(*: d)
34 #pragma omp teams reduction(|: a) reduction(max: b)
35 #pragma omp parallel reduction(|: a) reduction(max: b)
47 a
+= ftemplate
<char>(n
);
53 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20
54 // CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[E:%.*]]) #[[ATTR0:[0-9]+]] {
55 // CHECK1-NEXT: entry:
56 // CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
57 // CHECK1-NEXT: [[E_ADDR:%.*]] = alloca i64, align 8
58 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
59 // CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
60 // CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
61 // CHECK1-NEXT: store i64 [[E]], ptr [[E_ADDR]], align 8
62 // CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_kernel_environment, ptr [[DYN_PTR]])
63 // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
64 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
65 // CHECK1: user_code.entry:
66 // CHECK1-NEXT: [[TMP1:%.*]] = load double, ptr [[E_ADDR]], align 8
67 // CHECK1-NEXT: [[E1:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 8)
68 // CHECK1-NEXT: store double [[TMP1]], ptr [[E1]], align 8
69 // CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
70 // CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
71 // CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
72 // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[E1]]) #[[ATTR4:[0-9]+]]
73 // CHECK1-NEXT: call void @__kmpc_free_shared(ptr [[E1]], i64 8)
74 // CHECK1-NEXT: call void @__kmpc_target_deinit()
75 // CHECK1-NEXT: ret void
76 // CHECK1: worker.exit:
77 // CHECK1-NEXT: ret void
80 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_omp_outlined
81 // CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR2:[0-9]+]] {
82 // CHECK1-NEXT: entry:
83 // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
84 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
85 // CHECK1-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8
86 // CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
87 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
88 // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
89 // CHECK1-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8
90 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[E_ADDR]], align 8
91 // CHECK1-NEXT: [[E1:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 8)
92 // CHECK1-NEXT: store double 0.000000e+00, ptr [[E1]], align 8
93 // CHECK1-NEXT: [[TMP1:%.*]] = load double, ptr [[E1]], align 8
94 // CHECK1-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00
95 // CHECK1-NEXT: store double [[ADD]], ptr [[E1]], align 8
96 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
97 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
98 // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
99 // CHECK1-NEXT: store ptr [[E1]], ptr [[TMP4]], align 8
100 // CHECK1-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
101 // CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
102 // CHECK1-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1
103 // CHECK1-NEXT: br i1 [[TMP6]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
104 // CHECK1: .omp.reduction.then:
105 // CHECK1-NEXT: [[TMP7:%.*]] = load double, ptr [[TMP0]], align 8
106 // CHECK1-NEXT: [[TMP8:%.*]] = load double, ptr [[E1]], align 8
107 // CHECK1-NEXT: [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]]
108 // CHECK1-NEXT: store double [[ADD2]], ptr [[TMP0]], align 8
109 // CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
110 // CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
111 // CHECK1: .omp.reduction.done:
112 // CHECK1-NEXT: call void @__kmpc_free_shared(ptr [[E1]], i64 8)
113 // CHECK1-NEXT: ret void
116 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
117 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
118 // CHECK1-NEXT: entry:
119 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
120 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
121 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
122 // CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
123 // CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8
124 // CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
125 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
126 // CHECK1-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
127 // CHECK1-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2
128 // CHECK1-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2
129 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
130 // CHECK1-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2
131 // CHECK1-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2
132 // CHECK1-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2
133 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
134 // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
135 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
136 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[TMP9]], i64 1
137 // CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP9]], align 8
138 // CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_get_warp_size()
139 // CHECK1-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i16
140 // CHECK1-NEXT: [[TMP15:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP12]], i16 [[TMP6]], i16 [[TMP14]])
141 // CHECK1-NEXT: store i64 [[TMP15]], ptr [[DOTOMP_REDUCTION_ELEMENT]], align 8
142 // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[TMP9]], i64 1
143 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr i64, ptr [[DOTOMP_REDUCTION_ELEMENT]], i64 1
144 // CHECK1-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 8
145 // CHECK1-NEXT: [[TMP18:%.*]] = icmp eq i16 [[TMP7]], 0
146 // CHECK1-NEXT: [[TMP19:%.*]] = icmp eq i16 [[TMP7]], 1
147 // CHECK1-NEXT: [[TMP20:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
148 // CHECK1-NEXT: [[TMP21:%.*]] = and i1 [[TMP19]], [[TMP20]]
149 // CHECK1-NEXT: [[TMP22:%.*]] = icmp eq i16 [[TMP7]], 2
150 // CHECK1-NEXT: [[TMP23:%.*]] = and i16 [[TMP5]], 1
151 // CHECK1-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP23]], 0
152 // CHECK1-NEXT: [[TMP25:%.*]] = and i1 [[TMP22]], [[TMP24]]
153 // CHECK1-NEXT: [[TMP26:%.*]] = icmp sgt i16 [[TMP6]], 0
154 // CHECK1-NEXT: [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
155 // CHECK1-NEXT: [[TMP28:%.*]] = or i1 [[TMP18]], [[TMP21]]
156 // CHECK1-NEXT: [[TMP29:%.*]] = or i1 [[TMP28]], [[TMP27]]
157 // CHECK1-NEXT: br i1 [[TMP29]], label [[THEN:%.*]], label [[ELSE:%.*]]
159 // CHECK1-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]]) #[[ATTR4]]
160 // CHECK1-NEXT: br label [[IFCONT:%.*]]
162 // CHECK1-NEXT: br label [[IFCONT]]
164 // CHECK1-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP7]], 1
165 // CHECK1-NEXT: [[TMP31:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
166 // CHECK1-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
167 // CHECK1-NEXT: br i1 [[TMP32]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
169 // CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
170 // CHECK1-NEXT: [[TMP34:%.*]] = load ptr, ptr [[TMP33]], align 8
171 // CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
172 // CHECK1-NEXT: [[TMP36:%.*]] = load ptr, ptr [[TMP35]], align 8
173 // CHECK1-NEXT: [[TMP37:%.*]] = load double, ptr [[TMP34]], align 8
174 // CHECK1-NEXT: store double [[TMP37]], ptr [[TMP36]], align 8
175 // CHECK1-NEXT: br label [[IFCONT6:%.*]]
177 // CHECK1-NEXT: br label [[IFCONT6]]
179 // CHECK1-NEXT: ret void
182 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
183 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
184 // CHECK1-NEXT: entry:
185 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
186 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
187 // CHECK1-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
188 // CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
189 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
190 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
191 // CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
192 // CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
193 // CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
194 // CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
195 // CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
196 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8
197 // CHECK1-NEXT: store i32 0, ptr [[DOTCNT_ADDR]], align 4
198 // CHECK1-NEXT: br label [[PRECOND:%.*]]
200 // CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4
201 // CHECK1-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 2
202 // CHECK1-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
204 // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]])
205 // CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
206 // CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
208 // CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
209 // CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
210 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
211 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
212 // CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
213 // CHECK1-NEXT: store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
214 // CHECK1-NEXT: br label [[IFCONT:%.*]]
216 // CHECK1-NEXT: br label [[IFCONT]]
218 // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
219 // CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4
220 // CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
221 // CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
223 // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
224 // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
225 // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
226 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
227 // CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
228 // CHECK1-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4
229 // CHECK1-NEXT: br label [[IFCONT4:%.*]]
231 // CHECK1-NEXT: br label [[IFCONT4]]
233 // CHECK1-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
234 // CHECK1-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4
235 // CHECK1-NEXT: br label [[PRECOND]]
237 // CHECK1-NEXT: ret void
240 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
241 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
242 // CHECK1-NEXT: entry:
243 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
244 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
245 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8
246 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
247 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
248 // CHECK1-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8
249 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
250 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
251 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
252 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
253 // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
254 // CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
255 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x double], ptr [[E]], i32 0, i32 [[TMP5]]
256 // CHECK1-NEXT: [[TMP9:%.*]] = load double, ptr [[TMP7]], align 8
257 // CHECK1-NEXT: store double [[TMP9]], ptr [[TMP8]], align 128
258 // CHECK1-NEXT: ret void
261 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
262 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
263 // CHECK1-NEXT: entry:
264 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
265 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
266 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8
267 // CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
268 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
269 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
270 // CHECK1-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8
271 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
272 // CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
273 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
274 // CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
275 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x double], ptr [[E]], i32 0, i32 [[TMP4]]
276 // CHECK1-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8
277 // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
278 // CHECK1-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_omp_outlined_omp$reduction$reduction_func"(ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr [[TMP7]]) #[[ATTR4]]
279 // CHECK1-NEXT: ret void
282 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
283 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
284 // CHECK1-NEXT: entry:
285 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
286 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
287 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8
288 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
289 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
290 // CHECK1-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8
291 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
292 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
293 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
294 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
295 // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
296 // CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
297 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x double], ptr [[E]], i32 0, i32 [[TMP5]]
298 // CHECK1-NEXT: [[TMP9:%.*]] = load double, ptr [[TMP8]], align 128
299 // CHECK1-NEXT: store double [[TMP9]], ptr [[TMP7]], align 8
300 // CHECK1-NEXT: ret void
303 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
304 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
305 // CHECK1-NEXT: entry:
306 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
307 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
308 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8
309 // CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
310 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
311 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
312 // CHECK1-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8
313 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
314 // CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
315 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
316 // CHECK1-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
317 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x double], ptr [[E]], i32 0, i32 [[TMP4]]
318 // CHECK1-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8
319 // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
320 // CHECK1-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP7]], ptr [[DOTOMP_REDUCTION_RED_LIST]]) #[[ATTR4]]
321 // CHECK1-NEXT: ret void
324 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26
325 // CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR0]] {
326 // CHECK1-NEXT: entry:
327 // CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
328 // CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i64, align 8
329 // CHECK1-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8
330 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
331 // CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
332 // CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
333 // CHECK1-NEXT: store i64 [[C]], ptr [[C_ADDR]], align 8
334 // CHECK1-NEXT: store i64 [[D]], ptr [[D_ADDR]], align 8
335 // CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_kernel_environment, ptr [[DYN_PTR]])
336 // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
337 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
338 // CHECK1: user_code.entry:
339 // CHECK1-NEXT: [[TMP1:%.*]] = load i8, ptr [[C_ADDR]], align 1
340 // CHECK1-NEXT: [[C1:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 1)
341 // CHECK1-NEXT: store i8 [[TMP1]], ptr [[C1]], align 1
342 // CHECK1-NEXT: [[TMP2:%.*]] = load float, ptr [[D_ADDR]], align 4
343 // CHECK1-NEXT: [[D2:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4)
344 // CHECK1-NEXT: store float [[TMP2]], ptr [[D2]], align 4
345 // CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
346 // CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
347 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTTHREADID_TEMP_]], align 4
348 // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[C1]], ptr [[D2]]) #[[ATTR4]]
349 // CHECK1-NEXT: call void @__kmpc_free_shared(ptr [[D2]], i64 4)
350 // CHECK1-NEXT: call void @__kmpc_free_shared(ptr [[C1]], i64 1)
351 // CHECK1-NEXT: call void @__kmpc_target_deinit()
352 // CHECK1-NEXT: ret void
353 // CHECK1: worker.exit:
354 // CHECK1-NEXT: ret void
357 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_omp_outlined
358 // CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR2]] {
359 // CHECK1-NEXT: entry:
360 // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
361 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
362 // CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
363 // CHECK1-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8
364 // CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 8
365 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
366 // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
367 // CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
368 // CHECK1-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8
369 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
370 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8
371 // CHECK1-NEXT: [[C1:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 1)
372 // CHECK1-NEXT: [[D2:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i64 4)
373 // CHECK1-NEXT: store i8 0, ptr [[C1]], align 1
374 // CHECK1-NEXT: store float 1.000000e+00, ptr [[D2]], align 4
375 // CHECK1-NEXT: [[TMP2:%.*]] = load i8, ptr [[C1]], align 1
376 // CHECK1-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32
377 // CHECK1-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
378 // CHECK1-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
379 // CHECK1-NEXT: store i8 [[CONV3]], ptr [[C1]], align 1
380 // CHECK1-NEXT: [[TMP3:%.*]] = load float, ptr [[D2]], align 4
381 // CHECK1-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01
382 // CHECK1-NEXT: store float [[MUL]], ptr [[D2]], align 4
383 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
384 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
385 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
386 // CHECK1-NEXT: store ptr [[C1]], ptr [[TMP6]], align 8
387 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
388 // CHECK1-NEXT: store ptr [[D2]], ptr [[TMP7]], align 8
389 // CHECK1-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
390 // CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
391 // CHECK1-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
392 // CHECK1-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
393 // CHECK1: .omp.reduction.then:
394 // CHECK1-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP0]], align 1
395 // CHECK1-NEXT: [[CONV4:%.*]] = sext i8 [[TMP10]] to i32
396 // CHECK1-NEXT: [[TMP11:%.*]] = load i8, ptr [[C1]], align 1
397 // CHECK1-NEXT: [[CONV5:%.*]] = sext i8 [[TMP11]] to i32
398 // CHECK1-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
399 // CHECK1-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
400 // CHECK1-NEXT: store i8 [[CONV7]], ptr [[TMP0]], align 1
401 // CHECK1-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP1]], align 4
402 // CHECK1-NEXT: [[TMP13:%.*]] = load float, ptr [[D2]], align 4
403 // CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]]
404 // CHECK1-NEXT: store float [[MUL8]], ptr [[TMP1]], align 4
405 // CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
406 // CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
407 // CHECK1: .omp.reduction.done:
408 // CHECK1-NEXT: call void @__kmpc_free_shared(ptr [[D2]], i64 4)
409 // CHECK1-NEXT: call void @__kmpc_free_shared(ptr [[C1]], i64 1)
410 // CHECK1-NEXT: ret void
413 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func1
414 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3]] {
415 // CHECK1-NEXT: entry:
416 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
417 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
418 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
419 // CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
420 // CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x ptr], align 8
421 // CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
422 // CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
423 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
424 // CHECK1-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
425 // CHECK1-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2
426 // CHECK1-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2
427 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
428 // CHECK1-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2
429 // CHECK1-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2
430 // CHECK1-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2
431 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i64 0, i64 0
432 // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
433 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
434 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP9]], i64 1
435 // CHECK1-NEXT: [[TMP12:%.*]] = load i8, ptr [[TMP9]], align 1
436 // CHECK1-NEXT: [[TMP13:%.*]] = sext i8 [[TMP12]] to i32
437 // CHECK1-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_get_warp_size()
438 // CHECK1-NEXT: [[TMP15:%.*]] = trunc i32 [[TMP14]] to i16
439 // CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP13]], i16 [[TMP6]], i16 [[TMP15]])
440 // CHECK1-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
441 // CHECK1-NEXT: store i8 [[TMP17]], ptr [[DOTOMP_REDUCTION_ELEMENT]], align 1
442 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP9]], i64 1
443 // CHECK1-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTOMP_REDUCTION_ELEMENT]], i64 1
444 // CHECK1-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 8
445 // CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i64 0, i64 1
446 // CHECK1-NEXT: [[TMP21:%.*]] = load ptr, ptr [[TMP20]], align 8
447 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
448 // CHECK1-NEXT: [[TMP23:%.*]] = getelementptr float, ptr [[TMP21]], i64 1
449 // CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP21]], align 4
450 // CHECK1-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_get_warp_size()
451 // CHECK1-NEXT: [[TMP26:%.*]] = trunc i32 [[TMP25]] to i16
452 // CHECK1-NEXT: [[TMP27:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP24]], i16 [[TMP6]], i16 [[TMP26]])
453 // CHECK1-NEXT: store i32 [[TMP27]], ptr [[DOTOMP_REDUCTION_ELEMENT4]], align 4
454 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[TMP21]], i64 1
455 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
456 // CHECK1-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT4]], ptr [[TMP22]], align 8
457 // CHECK1-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP7]], 0
458 // CHECK1-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP7]], 1
459 // CHECK1-NEXT: [[TMP32:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
460 // CHECK1-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
461 // CHECK1-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP7]], 2
462 // CHECK1-NEXT: [[TMP35:%.*]] = and i16 [[TMP5]], 1
463 // CHECK1-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP35]], 0
464 // CHECK1-NEXT: [[TMP37:%.*]] = and i1 [[TMP34]], [[TMP36]]
465 // CHECK1-NEXT: [[TMP38:%.*]] = icmp sgt i16 [[TMP6]], 0
466 // CHECK1-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
467 // CHECK1-NEXT: [[TMP40:%.*]] = or i1 [[TMP30]], [[TMP33]]
468 // CHECK1-NEXT: [[TMP41:%.*]] = or i1 [[TMP40]], [[TMP39]]
469 // CHECK1-NEXT: br i1 [[TMP41]], label [[THEN:%.*]], label [[ELSE:%.*]]
471 // CHECK1-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]]) #[[ATTR4]]
472 // CHECK1-NEXT: br label [[IFCONT:%.*]]
474 // CHECK1-NEXT: br label [[IFCONT]]
476 // CHECK1-NEXT: [[TMP42:%.*]] = icmp eq i16 [[TMP7]], 1
477 // CHECK1-NEXT: [[TMP43:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
478 // CHECK1-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
479 // CHECK1-NEXT: br i1 [[TMP44]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
481 // CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
482 // CHECK1-NEXT: [[TMP46:%.*]] = load ptr, ptr [[TMP45]], align 8
483 // CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i64 0, i64 0
484 // CHECK1-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP47]], align 8
485 // CHECK1-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP46]], align 1
486 // CHECK1-NEXT: store i8 [[TMP49]], ptr [[TMP48]], align 1
487 // CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
488 // CHECK1-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP50]], align 8
489 // CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i64 0, i64 1
490 // CHECK1-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP52]], align 8
491 // CHECK1-NEXT: [[TMP54:%.*]] = load float, ptr [[TMP51]], align 4
492 // CHECK1-NEXT: store float [[TMP54]], ptr [[TMP53]], align 4
493 // CHECK1-NEXT: br label [[IFCONT7:%.*]]
495 // CHECK1-NEXT: br label [[IFCONT7]]
497 // CHECK1-NEXT: ret void
500 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func2
501 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
502 // CHECK1-NEXT: entry:
503 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
504 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
505 // CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
506 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
507 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
508 // CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
509 // CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
510 // CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
511 // CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
512 // CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
513 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8
514 // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
515 // CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
516 // CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
518 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 0
519 // CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8
520 // CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
521 // CHECK1-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
522 // CHECK1-NEXT: store volatile i8 [[TMP10]], ptr addrspace(3) [[TMP9]], align 1
523 // CHECK1-NEXT: br label [[IFCONT:%.*]]
525 // CHECK1-NEXT: br label [[IFCONT]]
527 // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
528 // CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4
529 // CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]]
530 // CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
532 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
533 // CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 0
534 // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 8
535 // CHECK1-NEXT: [[TMP15:%.*]] = load volatile i8, ptr addrspace(3) [[TMP12]], align 1
536 // CHECK1-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1
537 // CHECK1-NEXT: br label [[IFCONT4:%.*]]
539 // CHECK1-NEXT: br label [[IFCONT4]]
541 // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
542 // CHECK1-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
543 // CHECK1-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
545 // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1
546 // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
547 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
548 // CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4
549 // CHECK1-NEXT: store volatile i32 [[TMP19]], ptr addrspace(3) [[TMP18]], align 4
550 // CHECK1-NEXT: br label [[IFCONT8:%.*]]
552 // CHECK1-NEXT: br label [[IFCONT8]]
554 // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
555 // CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4
556 // CHECK1-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]]
557 // CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
559 // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
560 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1
561 // CHECK1-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 8
562 // CHECK1-NEXT: [[TMP24:%.*]] = load volatile i32, ptr addrspace(3) [[TMP21]], align 4
563 // CHECK1-NEXT: store i32 [[TMP24]], ptr [[TMP23]], align 4
564 // CHECK1-NEXT: br label [[IFCONT12:%.*]]
566 // CHECK1-NEXT: br label [[IFCONT12]]
568 // CHECK1-NEXT: ret void
571 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func3
572 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
573 // CHECK1-NEXT: entry:
574 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
575 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
576 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8
577 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
578 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
579 // CHECK1-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8
580 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
581 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
582 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
583 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 0
584 // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
585 // CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 0, i32 0
586 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i8], ptr [[C]], i32 0, i32 [[TMP5]]
587 // CHECK1-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
588 // CHECK1-NEXT: store i8 [[TMP9]], ptr [[TMP8]], align 128
589 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
590 // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
591 // CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 0, i32 1
592 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x float], ptr [[D]], i32 0, i32 [[TMP5]]
593 // CHECK1-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP11]], align 4
594 // CHECK1-NEXT: store float [[TMP13]], ptr [[TMP12]], align 128
595 // CHECK1-NEXT: ret void
598 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func4
599 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
600 // CHECK1-NEXT: entry:
601 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
602 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
603 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8
604 // CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 8
605 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
606 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
607 // CHECK1-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8
608 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
609 // CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
610 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
611 // CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP3]], i32 0, i32 0
612 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i8], ptr [[C]], i32 0, i32 [[TMP4]]
613 // CHECK1-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8
614 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
615 // CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP3]], i32 0, i32 1
616 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x float], ptr [[D]], i32 0, i32 [[TMP4]]
617 // CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 8
618 // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
619 // CHECK1-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_omp_outlined_omp$reduction$reduction_func"(ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr [[TMP9]]) #[[ATTR4]]
620 // CHECK1-NEXT: ret void
623 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func5
624 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
625 // CHECK1-NEXT: entry:
626 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
627 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
628 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8
629 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
630 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
631 // CHECK1-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8
632 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
633 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
634 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
635 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 0
636 // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
637 // CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 0, i32 0
638 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i8], ptr [[C]], i32 0, i32 [[TMP5]]
639 // CHECK1-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 128
640 // CHECK1-NEXT: store i8 [[TMP9]], ptr [[TMP7]], align 1
641 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
642 // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
643 // CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 0, i32 1
644 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x float], ptr [[D]], i32 0, i32 [[TMP5]]
645 // CHECK1-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 128
646 // CHECK1-NEXT: store float [[TMP13]], ptr [[TMP11]], align 4
647 // CHECK1-NEXT: ret void
650 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func6
651 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
652 // CHECK1-NEXT: entry:
653 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
654 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
655 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8
656 // CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 8
657 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
658 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
659 // CHECK1-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8
660 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
661 // CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
662 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
663 // CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP3]], i32 0, i32 0
664 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i8], ptr [[C]], i32 0, i32 [[TMP4]]
665 // CHECK1-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8
666 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
667 // CHECK1-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP3]], i32 0, i32 1
668 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x float], ptr [[D]], i32 0, i32 [[TMP4]]
669 // CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 8
670 // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
671 // CHECK1-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP9]], ptr [[DOTOMP_REDUCTION_RED_LIST]]) #[[ATTR4]]
672 // CHECK1-NEXT: ret void
675 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33
676 // CHECK1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
677 // CHECK1-NEXT: entry:
678 // CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
679 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
680 // CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
681 // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
682 // CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
683 // CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
684 // CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
685 // CHECK1-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8
686 // CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_kernel_environment, ptr [[DYN_PTR]])
687 // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
688 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
689 // CHECK1: user_code.entry:
690 // CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
691 // CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
692 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4
693 // CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[A_ADDR]], ptr [[B_ADDR]]) #[[ATTR4]]
694 // CHECK1-NEXT: call void @__kmpc_target_deinit()
695 // CHECK1-NEXT: ret void
696 // CHECK1: worker.exit:
697 // CHECK1-NEXT: ret void
700 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined
701 // CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR2]] {
702 // CHECK1-NEXT: entry:
703 // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
704 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
705 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
706 // CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
707 // CHECK1-NEXT: [[A1:%.*]] = alloca i32, align 4
708 // CHECK1-NEXT: [[B2:%.*]] = alloca i16, align 2
709 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x ptr], align 8
710 // CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 8
711 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
712 // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
713 // CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
714 // CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
715 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
716 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8
717 // CHECK1-NEXT: store i32 0, ptr [[A1]], align 4
718 // CHECK1-NEXT: store i16 -32768, ptr [[B2]], align 2
719 // CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
720 // CHECK1-NEXT: store ptr [[A1]], ptr [[TMP2]], align 8
721 // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
722 // CHECK1-NEXT: store ptr [[B2]], ptr [[TMP3]], align 8
723 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
724 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
725 // CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 2)
726 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
727 // CHECK1-NEXT: store ptr [[A1]], ptr [[TMP6]], align 8
728 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
729 // CHECK1-NEXT: store ptr [[B2]], ptr [[TMP7]], align 8
730 // CHECK1-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
731 // CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
732 // CHECK1-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
733 // CHECK1-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
734 // CHECK1: .omp.reduction.then:
735 // CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP0]], align 4
736 // CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[A1]], align 4
737 // CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP10]], [[TMP11]]
738 // CHECK1-NEXT: store i32 [[OR]], ptr [[TMP0]], align 4
739 // CHECK1-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP1]], align 2
740 // CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP12]] to i32
741 // CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[B2]], align 2
742 // CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP13]] to i32
743 // CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
744 // CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
745 // CHECK1: cond.true:
746 // CHECK1-NEXT: [[TMP14:%.*]] = load i16, ptr [[TMP1]], align 2
747 // CHECK1-NEXT: br label [[COND_END:%.*]]
748 // CHECK1: cond.false:
749 // CHECK1-NEXT: [[TMP15:%.*]] = load i16, ptr [[B2]], align 2
750 // CHECK1-NEXT: br label [[COND_END]]
752 // CHECK1-NEXT: [[COND:%.*]] = phi i16 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
753 // CHECK1-NEXT: store i16 [[COND]], ptr [[TMP1]], align 2
754 // CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
755 // CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
756 // CHECK1: .omp.reduction.done:
757 // CHECK1-NEXT: ret void
760 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined_omp_outlined
761 // CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR2]] {
762 // CHECK1-NEXT: entry:
763 // CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
764 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
765 // CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
766 // CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
767 // CHECK1-NEXT: [[A1:%.*]] = alloca i32, align 4
768 // CHECK1-NEXT: [[B2:%.*]] = alloca i16, align 2
769 // CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 8
770 // CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
771 // CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
772 // CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
773 // CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
774 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
775 // CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8
776 // CHECK1-NEXT: store i32 0, ptr [[A1]], align 4
777 // CHECK1-NEXT: store i16 -32768, ptr [[B2]], align 2
778 // CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[A1]], align 4
779 // CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
780 // CHECK1-NEXT: store i32 [[OR]], ptr [[A1]], align 4
781 // CHECK1-NEXT: [[TMP3:%.*]] = load i16, ptr [[B2]], align 2
782 // CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
783 // CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
784 // CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
785 // CHECK1: cond.true:
786 // CHECK1-NEXT: br label [[COND_END:%.*]]
787 // CHECK1: cond.false:
788 // CHECK1-NEXT: [[TMP4:%.*]] = load i16, ptr [[B2]], align 2
789 // CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
790 // CHECK1-NEXT: br label [[COND_END]]
792 // CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
793 // CHECK1-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
794 // CHECK1-NEXT: store i16 [[CONV4]], ptr [[B2]], align 2
795 // CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
796 // CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
797 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
798 // CHECK1-NEXT: store ptr [[A1]], ptr [[TMP7]], align 8
799 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
800 // CHECK1-NEXT: store ptr [[B2]], ptr [[TMP8]], align 8
801 // CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP6]], i32 2, i64 16, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func7, ptr @_omp_reduction_inter_warp_copy_func8)
802 // CHECK1-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
803 // CHECK1-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
804 // CHECK1: .omp.reduction.then:
805 // CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP0]], align 4
806 // CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[A1]], align 4
807 // CHECK1-NEXT: [[OR5:%.*]] = or i32 [[TMP11]], [[TMP12]]
808 // CHECK1-NEXT: store i32 [[OR5]], ptr [[TMP0]], align 4
809 // CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP1]], align 2
810 // CHECK1-NEXT: [[CONV6:%.*]] = sext i16 [[TMP13]] to i32
811 // CHECK1-NEXT: [[TMP14:%.*]] = load i16, ptr [[B2]], align 2
812 // CHECK1-NEXT: [[CONV7:%.*]] = sext i16 [[TMP14]] to i32
813 // CHECK1-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
814 // CHECK1-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
815 // CHECK1: cond.true9:
816 // CHECK1-NEXT: [[TMP15:%.*]] = load i16, ptr [[TMP1]], align 2
817 // CHECK1-NEXT: br label [[COND_END11:%.*]]
818 // CHECK1: cond.false10:
819 // CHECK1-NEXT: [[TMP16:%.*]] = load i16, ptr [[B2]], align 2
820 // CHECK1-NEXT: br label [[COND_END11]]
821 // CHECK1: cond.end11:
822 // CHECK1-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE9]] ], [ [[TMP16]], [[COND_FALSE10]] ]
823 // CHECK1-NEXT: store i16 [[COND12]], ptr [[TMP1]], align 2
824 // CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
825 // CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
826 // CHECK1: .omp.reduction.done:
827 // CHECK1-NEXT: ret void
830 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func7
831 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3]] {
832 // CHECK1-NEXT: entry:
833 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
834 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
835 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
836 // CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
837 // CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x ptr], align 8
838 // CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
839 // CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
840 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
841 // CHECK1-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
842 // CHECK1-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2
843 // CHECK1-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2
844 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
845 // CHECK1-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2
846 // CHECK1-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2
847 // CHECK1-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2
848 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i64 0, i64 0
849 // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
850 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
851 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP9]], i64 1
852 // CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP9]], align 4
853 // CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_get_warp_size()
854 // CHECK1-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i16
855 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP12]], i16 [[TMP6]], i16 [[TMP14]])
856 // CHECK1-NEXT: store i32 [[TMP15]], ptr [[DOTOMP_REDUCTION_ELEMENT]], align 4
857 // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP9]], i64 1
858 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[DOTOMP_REDUCTION_ELEMENT]], i64 1
859 // CHECK1-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 8
860 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i64 0, i64 1
861 // CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP18]], align 8
862 // CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
863 // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[TMP19]], i64 1
864 // CHECK1-NEXT: [[TMP22:%.*]] = load i16, ptr [[TMP19]], align 2
865 // CHECK1-NEXT: [[TMP23:%.*]] = sext i16 [[TMP22]] to i32
866 // CHECK1-NEXT: [[TMP24:%.*]] = call i32 @__kmpc_get_warp_size()
867 // CHECK1-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
868 // CHECK1-NEXT: [[TMP26:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP23]], i16 [[TMP6]], i16 [[TMP25]])
869 // CHECK1-NEXT: [[TMP27:%.*]] = trunc i32 [[TMP26]] to i16
870 // CHECK1-NEXT: store i16 [[TMP27]], ptr [[DOTOMP_REDUCTION_ELEMENT4]], align 2
871 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[TMP19]], i64 1
872 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr i16, ptr [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
873 // CHECK1-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT4]], ptr [[TMP20]], align 8
874 // CHECK1-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP7]], 0
875 // CHECK1-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP7]], 1
876 // CHECK1-NEXT: [[TMP32:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
877 // CHECK1-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
878 // CHECK1-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP7]], 2
879 // CHECK1-NEXT: [[TMP35:%.*]] = and i16 [[TMP5]], 1
880 // CHECK1-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP35]], 0
881 // CHECK1-NEXT: [[TMP37:%.*]] = and i1 [[TMP34]], [[TMP36]]
882 // CHECK1-NEXT: [[TMP38:%.*]] = icmp sgt i16 [[TMP6]], 0
883 // CHECK1-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
884 // CHECK1-NEXT: [[TMP40:%.*]] = or i1 [[TMP30]], [[TMP33]]
885 // CHECK1-NEXT: [[TMP41:%.*]] = or i1 [[TMP40]], [[TMP39]]
886 // CHECK1-NEXT: br i1 [[TMP41]], label [[THEN:%.*]], label [[ELSE:%.*]]
888 // CHECK1-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]]) #[[ATTR4]]
889 // CHECK1-NEXT: br label [[IFCONT:%.*]]
891 // CHECK1-NEXT: br label [[IFCONT]]
893 // CHECK1-NEXT: [[TMP42:%.*]] = icmp eq i16 [[TMP7]], 1
894 // CHECK1-NEXT: [[TMP43:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
895 // CHECK1-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
896 // CHECK1-NEXT: br i1 [[TMP44]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
898 // CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
899 // CHECK1-NEXT: [[TMP46:%.*]] = load ptr, ptr [[TMP45]], align 8
900 // CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i64 0, i64 0
901 // CHECK1-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP47]], align 8
902 // CHECK1-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP46]], align 4
903 // CHECK1-NEXT: store i32 [[TMP49]], ptr [[TMP48]], align 4
904 // CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
905 // CHECK1-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP50]], align 8
906 // CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i64 0, i64 1
907 // CHECK1-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP52]], align 8
908 // CHECK1-NEXT: [[TMP54:%.*]] = load i16, ptr [[TMP51]], align 2
909 // CHECK1-NEXT: store i16 [[TMP54]], ptr [[TMP53]], align 2
910 // CHECK1-NEXT: br label [[IFCONT7:%.*]]
912 // CHECK1-NEXT: br label [[IFCONT7]]
914 // CHECK1-NEXT: ret void
917 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func8
918 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
919 // CHECK1-NEXT: entry:
920 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
921 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
922 // CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
923 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
924 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
925 // CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
926 // CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
927 // CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
928 // CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
929 // CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
930 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8
931 // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
932 // CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
933 // CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
935 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 0
936 // CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8
937 // CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
938 // CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 4
939 // CHECK1-NEXT: store volatile i32 [[TMP10]], ptr addrspace(3) [[TMP9]], align 4
940 // CHECK1-NEXT: br label [[IFCONT:%.*]]
942 // CHECK1-NEXT: br label [[IFCONT]]
944 // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
945 // CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4
946 // CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]]
947 // CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
949 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
950 // CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 0
951 // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 8
952 // CHECK1-NEXT: [[TMP15:%.*]] = load volatile i32, ptr addrspace(3) [[TMP12]], align 4
953 // CHECK1-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4
954 // CHECK1-NEXT: br label [[IFCONT4:%.*]]
956 // CHECK1-NEXT: br label [[IFCONT4]]
958 // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
959 // CHECK1-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
960 // CHECK1-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
962 // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1
963 // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
964 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
965 // CHECK1-NEXT: [[TMP19:%.*]] = load i16, ptr [[TMP17]], align 2
966 // CHECK1-NEXT: store volatile i16 [[TMP19]], ptr addrspace(3) [[TMP18]], align 2
967 // CHECK1-NEXT: br label [[IFCONT8:%.*]]
969 // CHECK1-NEXT: br label [[IFCONT8]]
971 // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
972 // CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4
973 // CHECK1-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]]
974 // CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
976 // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
977 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1
978 // CHECK1-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 8
979 // CHECK1-NEXT: [[TMP24:%.*]] = load volatile i16, ptr addrspace(3) [[TMP21]], align 2
980 // CHECK1-NEXT: store i16 [[TMP24]], ptr [[TMP23]], align 2
981 // CHECK1-NEXT: br label [[IFCONT12:%.*]]
983 // CHECK1-NEXT: br label [[IFCONT12]]
985 // CHECK1-NEXT: ret void
988 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func9
989 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3]] {
990 // CHECK1-NEXT: entry:
991 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
992 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
993 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
994 // CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
995 // CHECK1-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x ptr], align 8
996 // CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
997 // CHECK1-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
998 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
999 // CHECK1-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
1000 // CHECK1-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2
1001 // CHECK1-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2
1002 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
1003 // CHECK1-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2
1004 // CHECK1-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2
1005 // CHECK1-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2
1006 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i64 0, i64 0
1007 // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
1008 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
1009 // CHECK1-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP9]], i64 1
1010 // CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP9]], align 4
1011 // CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_get_warp_size()
1012 // CHECK1-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i16
1013 // CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP12]], i16 [[TMP6]], i16 [[TMP14]])
1014 // CHECK1-NEXT: store i32 [[TMP15]], ptr [[DOTOMP_REDUCTION_ELEMENT]], align 4
1015 // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP9]], i64 1
1016 // CHECK1-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[DOTOMP_REDUCTION_ELEMENT]], i64 1
1017 // CHECK1-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 8
1018 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i64 0, i64 1
1019 // CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP18]], align 8
1020 // CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
1021 // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[TMP19]], i64 1
1022 // CHECK1-NEXT: [[TMP22:%.*]] = load i16, ptr [[TMP19]], align 2
1023 // CHECK1-NEXT: [[TMP23:%.*]] = sext i16 [[TMP22]] to i32
1024 // CHECK1-NEXT: [[TMP24:%.*]] = call i32 @__kmpc_get_warp_size()
1025 // CHECK1-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
1026 // CHECK1-NEXT: [[TMP26:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP23]], i16 [[TMP6]], i16 [[TMP25]])
1027 // CHECK1-NEXT: [[TMP27:%.*]] = trunc i32 [[TMP26]] to i16
1028 // CHECK1-NEXT: store i16 [[TMP27]], ptr [[DOTOMP_REDUCTION_ELEMENT4]], align 2
1029 // CHECK1-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[TMP19]], i64 1
1030 // CHECK1-NEXT: [[TMP29:%.*]] = getelementptr i16, ptr [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
1031 // CHECK1-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT4]], ptr [[TMP20]], align 8
1032 // CHECK1-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP7]], 0
1033 // CHECK1-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP7]], 1
1034 // CHECK1-NEXT: [[TMP32:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
1035 // CHECK1-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
1036 // CHECK1-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP7]], 2
1037 // CHECK1-NEXT: [[TMP35:%.*]] = and i16 [[TMP5]], 1
1038 // CHECK1-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP35]], 0
1039 // CHECK1-NEXT: [[TMP37:%.*]] = and i1 [[TMP34]], [[TMP36]]
1040 // CHECK1-NEXT: [[TMP38:%.*]] = icmp sgt i16 [[TMP6]], 0
1041 // CHECK1-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
1042 // CHECK1-NEXT: [[TMP40:%.*]] = or i1 [[TMP30]], [[TMP33]]
1043 // CHECK1-NEXT: [[TMP41:%.*]] = or i1 [[TMP40]], [[TMP39]]
1044 // CHECK1-NEXT: br i1 [[TMP41]], label [[THEN:%.*]], label [[ELSE:%.*]]
1046 // CHECK1-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]]) #[[ATTR4]]
1047 // CHECK1-NEXT: br label [[IFCONT:%.*]]
1049 // CHECK1-NEXT: br label [[IFCONT]]
1051 // CHECK1-NEXT: [[TMP42:%.*]] = icmp eq i16 [[TMP7]], 1
1052 // CHECK1-NEXT: [[TMP43:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
1053 // CHECK1-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
1054 // CHECK1-NEXT: br i1 [[TMP44]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
1056 // CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
1057 // CHECK1-NEXT: [[TMP46:%.*]] = load ptr, ptr [[TMP45]], align 8
1058 // CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i64 0, i64 0
1059 // CHECK1-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP47]], align 8
1060 // CHECK1-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP46]], align 4
1061 // CHECK1-NEXT: store i32 [[TMP49]], ptr [[TMP48]], align 4
1062 // CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
1063 // CHECK1-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP50]], align 8
1064 // CHECK1-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i64 0, i64 1
1065 // CHECK1-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP52]], align 8
1066 // CHECK1-NEXT: [[TMP54:%.*]] = load i16, ptr [[TMP51]], align 2
1067 // CHECK1-NEXT: store i16 [[TMP54]], ptr [[TMP53]], align 2
1068 // CHECK1-NEXT: br label [[IFCONT7:%.*]]
1070 // CHECK1-NEXT: br label [[IFCONT7]]
1072 // CHECK1-NEXT: ret void
1075 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func10
1076 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
1077 // CHECK1-NEXT: entry:
1078 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
1079 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1080 // CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
1081 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
1082 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
1083 // CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1084 // CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1085 // CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
1086 // CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1087 // CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
1088 // CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8
1089 // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
1090 // CHECK1-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1091 // CHECK1-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
1093 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 0
1094 // CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8
1095 // CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1096 // CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 4
1097 // CHECK1-NEXT: store volatile i32 [[TMP10]], ptr addrspace(3) [[TMP9]], align 4
1098 // CHECK1-NEXT: br label [[IFCONT:%.*]]
1100 // CHECK1-NEXT: br label [[IFCONT]]
1102 // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
1103 // CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4
1104 // CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]]
1105 // CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
1107 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
1108 // CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 0
1109 // CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 8
1110 // CHECK1-NEXT: [[TMP15:%.*]] = load volatile i32, ptr addrspace(3) [[TMP12]], align 4
1111 // CHECK1-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4
1112 // CHECK1-NEXT: br label [[IFCONT4:%.*]]
1114 // CHECK1-NEXT: br label [[IFCONT4]]
1116 // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
1117 // CHECK1-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1118 // CHECK1-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
1120 // CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1
1121 // CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
1122 // CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1123 // CHECK1-NEXT: [[TMP19:%.*]] = load i16, ptr [[TMP17]], align 2
1124 // CHECK1-NEXT: store volatile i16 [[TMP19]], ptr addrspace(3) [[TMP18]], align 2
1125 // CHECK1-NEXT: br label [[IFCONT8:%.*]]
1127 // CHECK1-NEXT: br label [[IFCONT8]]
1129 // CHECK1-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
1130 // CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4
1131 // CHECK1-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]]
1132 // CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
1134 // CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
1135 // CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1
1136 // CHECK1-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 8
1137 // CHECK1-NEXT: [[TMP24:%.*]] = load volatile i16, ptr addrspace(3) [[TMP21]], align 2
1138 // CHECK1-NEXT: store i16 [[TMP24]], ptr [[TMP23]], align 2
1139 // CHECK1-NEXT: br label [[IFCONT12:%.*]]
1141 // CHECK1-NEXT: br label [[IFCONT12]]
1142 // CHECK1: ifcont12:
1143 // CHECK1-NEXT: ret void
1146 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func11
1147 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1148 // CHECK1-NEXT: entry:
1149 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
1150 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1151 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8
1152 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
1153 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
1154 // CHECK1-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8
1155 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
1156 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
1157 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
1158 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 0
1159 // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
1160 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
1161 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
1162 // CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
1163 // CHECK1-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 128
1164 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
1165 // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
1166 // CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
1167 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
1168 // CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
1169 // CHECK1-NEXT: store i16 [[TMP13]], ptr [[TMP12]], align 128
1170 // CHECK1-NEXT: ret void
1173 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func12
1174 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1175 // CHECK1-NEXT: entry:
1176 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
1177 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1178 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8
1179 // CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 8
1180 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
1181 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
1182 // CHECK1-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8
1183 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
1184 // CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
1185 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
1186 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
1187 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
1188 // CHECK1-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8
1189 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
1190 // CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
1191 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
1192 // CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 8
1193 // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
1194 // CHECK1-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined_omp$reduction$reduction_func"(ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr [[TMP9]]) #[[ATTR4]]
1195 // CHECK1-NEXT: ret void
1198 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func13
1199 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1200 // CHECK1-NEXT: entry:
1201 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
1202 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1203 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8
1204 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
1205 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
1206 // CHECK1-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8
1207 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
1208 // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
1209 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
1210 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 0
1211 // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
1212 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
1213 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
1214 // CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 128
1215 // CHECK1-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 4
1216 // CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
1217 // CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
1218 // CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
1219 // CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
1220 // CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 128
1221 // CHECK1-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 2
1222 // CHECK1-NEXT: ret void
1225 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func14
1226 // CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1227 // CHECK1-NEXT: entry:
1228 // CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
1229 // CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1230 // CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8
1231 // CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 8
1232 // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
1233 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
1234 // CHECK1-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8
1235 // CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
1236 // CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
1237 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
1238 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
1239 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
1240 // CHECK1-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8
1241 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
1242 // CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
1243 // CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
1244 // CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 8
1245 // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
1246 // CHECK1-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP9]], ptr [[DOTOMP_REDUCTION_RED_LIST]]) #[[ATTR4]]
1247 // CHECK1-NEXT: ret void
1250 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20
1251 // CHECK2-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0:[0-9]+]] {
1252 // CHECK2-NEXT: entry:
1253 // CHECK2-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
1254 // CHECK2-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 4
1255 // CHECK2-NEXT: [[E1:%.*]] = alloca double, align 8
1256 // CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1257 // CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1258 // CHECK2-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
1259 // CHECK2-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 4
1260 // CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[E_ADDR]], align 4
1261 // CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_kernel_environment, ptr [[DYN_PTR]])
1262 // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
1263 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
1264 // CHECK2: user_code.entry:
1265 // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
1266 // CHECK2-NEXT: [[TMP3:%.*]] = load double, ptr [[TMP0]], align 8
1267 // CHECK2-NEXT: store double [[TMP3]], ptr [[E1]], align 8
1268 // CHECK2-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
1269 // CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
1270 // CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[E1]]) #[[ATTR4:[0-9]+]]
1271 // CHECK2-NEXT: call void @__kmpc_target_deinit()
1272 // CHECK2-NEXT: ret void
1273 // CHECK2: worker.exit:
1274 // CHECK2-NEXT: ret void
1277 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_omp_outlined
1278 // CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
1279 // CHECK2-NEXT: entry:
1280 // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
1281 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
1282 // CHECK2-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 4
1283 // CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 4
1284 // CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
1285 // CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
1286 // CHECK2-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 4
1287 // CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[E_ADDR]], align 4
1288 // CHECK2-NEXT: [[E1:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i32 8)
1289 // CHECK2-NEXT: store double 0.000000e+00, ptr [[E1]], align 8
1290 // CHECK2-NEXT: [[TMP1:%.*]] = load double, ptr [[E1]], align 8
1291 // CHECK2-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00
1292 // CHECK2-NEXT: store double [[ADD]], ptr [[E1]], align 8
1293 // CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
1294 // CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
1295 // CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
1296 // CHECK2-NEXT: store ptr [[E1]], ptr [[TMP4]], align 4
1297 // CHECK2-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
1298 // CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
1299 // CHECK2-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1
1300 // CHECK2-NEXT: br i1 [[TMP6]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
1301 // CHECK2: .omp.reduction.then:
1302 // CHECK2-NEXT: [[TMP7:%.*]] = load double, ptr [[TMP0]], align 8
1303 // CHECK2-NEXT: [[TMP8:%.*]] = load double, ptr [[E1]], align 8
1304 // CHECK2-NEXT: [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]]
1305 // CHECK2-NEXT: store double [[ADD2]], ptr [[TMP0]], align 8
1306 // CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
1307 // CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
1308 // CHECK2: .omp.reduction.done:
1309 // CHECK2-NEXT: call void @__kmpc_free_shared(ptr [[E1]], i32 8)
1310 // CHECK2-NEXT: ret void
1313 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
1314 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
1315 // CHECK2-NEXT: entry:
1316 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
1317 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
1318 // CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
1319 // CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
1320 // CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 4
1321 // CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
1322 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
1323 // CHECK2-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
1324 // CHECK2-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2
1325 // CHECK2-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2
1326 // CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
1327 // CHECK2-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2
1328 // CHECK2-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2
1329 // CHECK2-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2
1330 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i32 0, i32 0
1331 // CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 4
1332 // CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
1333 // CHECK2-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[TMP9]], i32 1
1334 // CHECK2-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP9]], align 8
1335 // CHECK2-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_get_warp_size()
1336 // CHECK2-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i16
1337 // CHECK2-NEXT: [[TMP15:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP12]], i16 [[TMP6]], i16 [[TMP14]])
1338 // CHECK2-NEXT: store i64 [[TMP15]], ptr [[DOTOMP_REDUCTION_ELEMENT]], align 8
1339 // CHECK2-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[TMP9]], i32 1
1340 // CHECK2-NEXT: [[TMP17:%.*]] = getelementptr i64, ptr [[DOTOMP_REDUCTION_ELEMENT]], i32 1
1341 // CHECK2-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 4
1342 // CHECK2-NEXT: [[TMP18:%.*]] = icmp eq i16 [[TMP7]], 0
1343 // CHECK2-NEXT: [[TMP19:%.*]] = icmp eq i16 [[TMP7]], 1
1344 // CHECK2-NEXT: [[TMP20:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
1345 // CHECK2-NEXT: [[TMP21:%.*]] = and i1 [[TMP19]], [[TMP20]]
1346 // CHECK2-NEXT: [[TMP22:%.*]] = icmp eq i16 [[TMP7]], 2
1347 // CHECK2-NEXT: [[TMP23:%.*]] = and i16 [[TMP5]], 1
1348 // CHECK2-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP23]], 0
1349 // CHECK2-NEXT: [[TMP25:%.*]] = and i1 [[TMP22]], [[TMP24]]
1350 // CHECK2-NEXT: [[TMP26:%.*]] = icmp sgt i16 [[TMP6]], 0
1351 // CHECK2-NEXT: [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
1352 // CHECK2-NEXT: [[TMP28:%.*]] = or i1 [[TMP18]], [[TMP21]]
1353 // CHECK2-NEXT: [[TMP29:%.*]] = or i1 [[TMP28]], [[TMP27]]
1354 // CHECK2-NEXT: br i1 [[TMP29]], label [[THEN:%.*]], label [[ELSE:%.*]]
1356 // CHECK2-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]]) #[[ATTR4]]
1357 // CHECK2-NEXT: br label [[IFCONT:%.*]]
1359 // CHECK2-NEXT: br label [[IFCONT]]
1361 // CHECK2-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP7]], 1
1362 // CHECK2-NEXT: [[TMP31:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
1363 // CHECK2-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
1364 // CHECK2-NEXT: br i1 [[TMP32]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
1366 // CHECK2-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
1367 // CHECK2-NEXT: [[TMP34:%.*]] = load ptr, ptr [[TMP33]], align 4
1368 // CHECK2-NEXT: [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i32 0, i32 0
1369 // CHECK2-NEXT: [[TMP36:%.*]] = load ptr, ptr [[TMP35]], align 4
1370 // CHECK2-NEXT: [[TMP37:%.*]] = load double, ptr [[TMP34]], align 8
1371 // CHECK2-NEXT: store double [[TMP37]], ptr [[TMP36]], align 8
1372 // CHECK2-NEXT: br label [[IFCONT6:%.*]]
1374 // CHECK2-NEXT: br label [[IFCONT6]]
1376 // CHECK2-NEXT: ret void
1379 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
1380 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
1381 // CHECK2-NEXT: entry:
1382 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
1383 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1384 // CHECK2-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
1385 // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
1386 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
1387 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
1388 // CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1389 // CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1390 // CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
1391 // CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1392 // CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
1393 // CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 4
1394 // CHECK2-NEXT: store i32 0, ptr [[DOTCNT_ADDR]], align 4
1395 // CHECK2-NEXT: br label [[PRECOND:%.*]]
1397 // CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4
1398 // CHECK2-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 2
1399 // CHECK2-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
1401 // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]])
1402 // CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1403 // CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
1405 // CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i32 0, i32 0
1406 // CHECK2-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 4
1407 // CHECK2-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
1408 // CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1409 // CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
1410 // CHECK2-NEXT: store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
1411 // CHECK2-NEXT: br label [[IFCONT:%.*]]
1413 // CHECK2-NEXT: br label [[IFCONT]]
1415 // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
1416 // CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4
1417 // CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
1418 // CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
1420 // CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
1421 // CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i32 0, i32 0
1422 // CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4
1423 // CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
1424 // CHECK2-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
1425 // CHECK2-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4
1426 // CHECK2-NEXT: br label [[IFCONT4:%.*]]
1428 // CHECK2-NEXT: br label [[IFCONT4]]
1430 // CHECK2-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
1431 // CHECK2-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4
1432 // CHECK2-NEXT: br label [[PRECOND]]
1434 // CHECK2-NEXT: ret void
1437 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
1438 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1439 // CHECK2-NEXT: entry:
1440 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
1441 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1442 // CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
1443 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
1444 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
1445 // CHECK2-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
1446 // CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
1447 // CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
1448 // CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
1449 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i32 0, i32 0
1450 // CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
1451 // CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
1452 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x double], ptr [[E]], i32 0, i32 [[TMP5]]
1453 // CHECK2-NEXT: [[TMP9:%.*]] = load double, ptr [[TMP7]], align 8
1454 // CHECK2-NEXT: store double [[TMP9]], ptr [[TMP8]], align 128
1455 // CHECK2-NEXT: ret void
1458 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
1459 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1460 // CHECK2-NEXT: entry:
1461 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
1462 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1463 // CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
1464 // CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 4
1465 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
1466 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
1467 // CHECK2-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
1468 // CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
1469 // CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
1470 // CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
1471 // CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
1472 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x double], ptr [[E]], i32 0, i32 [[TMP4]]
1473 // CHECK2-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
1474 // CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
1475 // CHECK2-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_omp_outlined_omp$reduction$reduction_func"(ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr [[TMP7]]) #[[ATTR4]]
1476 // CHECK2-NEXT: ret void
1479 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
1480 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1481 // CHECK2-NEXT: entry:
1482 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
1483 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1484 // CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
1485 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
1486 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
1487 // CHECK2-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
1488 // CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
1489 // CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
1490 // CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
1491 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i32 0, i32 0
1492 // CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
1493 // CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
1494 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x double], ptr [[E]], i32 0, i32 [[TMP5]]
1495 // CHECK2-NEXT: [[TMP9:%.*]] = load double, ptr [[TMP8]], align 128
1496 // CHECK2-NEXT: store double [[TMP9]], ptr [[TMP7]], align 8
1497 // CHECK2-NEXT: ret void
1500 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
1501 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1502 // CHECK2-NEXT: entry:
1503 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
1504 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1505 // CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
1506 // CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 4
1507 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
1508 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
1509 // CHECK2-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
1510 // CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
1511 // CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
1512 // CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
1513 // CHECK2-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
1514 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x double], ptr [[E]], i32 0, i32 [[TMP4]]
1515 // CHECK2-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
1516 // CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
1517 // CHECK2-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP7]], ptr [[DOTOMP_REDUCTION_RED_LIST]]) #[[ATTR4]]
1518 // CHECK2-NEXT: ret void
1521 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26
1522 // CHECK2-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]]) #[[ATTR0]] {
1523 // CHECK2-NEXT: entry:
1524 // CHECK2-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
1525 // CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4
1526 // CHECK2-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4
1527 // CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1528 // CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1529 // CHECK2-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
1530 // CHECK2-NEXT: store i32 [[C]], ptr [[C_ADDR]], align 4
1531 // CHECK2-NEXT: store i32 [[D]], ptr [[D_ADDR]], align 4
1532 // CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_kernel_environment, ptr [[DYN_PTR]])
1533 // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
1534 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
1535 // CHECK2: user_code.entry:
1536 // CHECK2-NEXT: [[TMP1:%.*]] = load i8, ptr [[C_ADDR]], align 1
1537 // CHECK2-NEXT: [[C1:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i32 1)
1538 // CHECK2-NEXT: store i8 [[TMP1]], ptr [[C1]], align 1
1539 // CHECK2-NEXT: [[TMP2:%.*]] = load float, ptr [[D_ADDR]], align 4
1540 // CHECK2-NEXT: [[D2:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i32 4)
1541 // CHECK2-NEXT: store float [[TMP2]], ptr [[D2]], align 4
1542 // CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
1543 // CHECK2-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
1544 // CHECK2-NEXT: store i32 [[TMP3]], ptr [[DOTTHREADID_TEMP_]], align 4
1545 // CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[C1]], ptr [[D2]]) #[[ATTR4]]
1546 // CHECK2-NEXT: call void @__kmpc_free_shared(ptr [[D2]], i32 4)
1547 // CHECK2-NEXT: call void @__kmpc_free_shared(ptr [[C1]], i32 1)
1548 // CHECK2-NEXT: call void @__kmpc_target_deinit()
1549 // CHECK2-NEXT: ret void
1550 // CHECK2: worker.exit:
1551 // CHECK2-NEXT: ret void
1554 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_omp_outlined
1555 // CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
1556 // CHECK2-NEXT: entry:
1557 // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
1558 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
1559 // CHECK2-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 4
1560 // CHECK2-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 4
1561 // CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 4
1562 // CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
1563 // CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
1564 // CHECK2-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 4
1565 // CHECK2-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 4
1566 // CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4
1567 // CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 4
1568 // CHECK2-NEXT: [[C1:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i32 1)
1569 // CHECK2-NEXT: [[D2:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i32 4)
1570 // CHECK2-NEXT: store i8 0, ptr [[C1]], align 1
1571 // CHECK2-NEXT: store float 1.000000e+00, ptr [[D2]], align 4
1572 // CHECK2-NEXT: [[TMP2:%.*]] = load i8, ptr [[C1]], align 1
1573 // CHECK2-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32
1574 // CHECK2-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
1575 // CHECK2-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
1576 // CHECK2-NEXT: store i8 [[CONV3]], ptr [[C1]], align 1
1577 // CHECK2-NEXT: [[TMP3:%.*]] = load float, ptr [[D2]], align 4
1578 // CHECK2-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01
1579 // CHECK2-NEXT: store float [[MUL]], ptr [[D2]], align 4
1580 // CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
1581 // CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
1582 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
1583 // CHECK2-NEXT: store ptr [[C1]], ptr [[TMP6]], align 4
1584 // CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
1585 // CHECK2-NEXT: store ptr [[D2]], ptr [[TMP7]], align 4
1586 // CHECK2-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
1587 // CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
1588 // CHECK2-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
1589 // CHECK2-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
1590 // CHECK2: .omp.reduction.then:
1591 // CHECK2-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP0]], align 1
1592 // CHECK2-NEXT: [[CONV4:%.*]] = sext i8 [[TMP10]] to i32
1593 // CHECK2-NEXT: [[TMP11:%.*]] = load i8, ptr [[C1]], align 1
1594 // CHECK2-NEXT: [[CONV5:%.*]] = sext i8 [[TMP11]] to i32
1595 // CHECK2-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
1596 // CHECK2-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
1597 // CHECK2-NEXT: store i8 [[CONV7]], ptr [[TMP0]], align 1
1598 // CHECK2-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP1]], align 4
1599 // CHECK2-NEXT: [[TMP13:%.*]] = load float, ptr [[D2]], align 4
1600 // CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]]
1601 // CHECK2-NEXT: store float [[MUL8]], ptr [[TMP1]], align 4
1602 // CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
1603 // CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
1604 // CHECK2: .omp.reduction.done:
1605 // CHECK2-NEXT: call void @__kmpc_free_shared(ptr [[D2]], i32 4)
1606 // CHECK2-NEXT: call void @__kmpc_free_shared(ptr [[C1]], i32 1)
1607 // CHECK2-NEXT: ret void
1610 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func1
1611 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3]] {
1612 // CHECK2-NEXT: entry:
1613 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
1614 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
1615 // CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
1616 // CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
1617 // CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x ptr], align 4
1618 // CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
1619 // CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
1620 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
1621 // CHECK2-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
1622 // CHECK2-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2
1623 // CHECK2-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2
1624 // CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
1625 // CHECK2-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2
1626 // CHECK2-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2
1627 // CHECK2-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2
1628 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 0
1629 // CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 4
1630 // CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
1631 // CHECK2-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP9]], i32 1
1632 // CHECK2-NEXT: [[TMP12:%.*]] = load i8, ptr [[TMP9]], align 1
1633 // CHECK2-NEXT: [[TMP13:%.*]] = sext i8 [[TMP12]] to i32
1634 // CHECK2-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_get_warp_size()
1635 // CHECK2-NEXT: [[TMP15:%.*]] = trunc i32 [[TMP14]] to i16
1636 // CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP13]], i16 [[TMP6]], i16 [[TMP15]])
1637 // CHECK2-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
1638 // CHECK2-NEXT: store i8 [[TMP17]], ptr [[DOTOMP_REDUCTION_ELEMENT]], align 1
1639 // CHECK2-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP9]], i32 1
1640 // CHECK2-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTOMP_REDUCTION_ELEMENT]], i32 1
1641 // CHECK2-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 4
1642 // CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 1
1643 // CHECK2-NEXT: [[TMP21:%.*]] = load ptr, ptr [[TMP20]], align 4
1644 // CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
1645 // CHECK2-NEXT: [[TMP23:%.*]] = getelementptr float, ptr [[TMP21]], i32 1
1646 // CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP21]], align 4
1647 // CHECK2-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_get_warp_size()
1648 // CHECK2-NEXT: [[TMP26:%.*]] = trunc i32 [[TMP25]] to i16
1649 // CHECK2-NEXT: [[TMP27:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP24]], i16 [[TMP6]], i16 [[TMP26]])
1650 // CHECK2-NEXT: store i32 [[TMP27]], ptr [[DOTOMP_REDUCTION_ELEMENT4]], align 4
1651 // CHECK2-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[TMP21]], i32 1
1652 // CHECK2-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
1653 // CHECK2-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT4]], ptr [[TMP22]], align 4
1654 // CHECK2-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP7]], 0
1655 // CHECK2-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP7]], 1
1656 // CHECK2-NEXT: [[TMP32:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
1657 // CHECK2-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
1658 // CHECK2-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP7]], 2
1659 // CHECK2-NEXT: [[TMP35:%.*]] = and i16 [[TMP5]], 1
1660 // CHECK2-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP35]], 0
1661 // CHECK2-NEXT: [[TMP37:%.*]] = and i1 [[TMP34]], [[TMP36]]
1662 // CHECK2-NEXT: [[TMP38:%.*]] = icmp sgt i16 [[TMP6]], 0
1663 // CHECK2-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
1664 // CHECK2-NEXT: [[TMP40:%.*]] = or i1 [[TMP30]], [[TMP33]]
1665 // CHECK2-NEXT: [[TMP41:%.*]] = or i1 [[TMP40]], [[TMP39]]
1666 // CHECK2-NEXT: br i1 [[TMP41]], label [[THEN:%.*]], label [[ELSE:%.*]]
1668 // CHECK2-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]]) #[[ATTR4]]
1669 // CHECK2-NEXT: br label [[IFCONT:%.*]]
1671 // CHECK2-NEXT: br label [[IFCONT]]
1673 // CHECK2-NEXT: [[TMP42:%.*]] = icmp eq i16 [[TMP7]], 1
1674 // CHECK2-NEXT: [[TMP43:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
1675 // CHECK2-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
1676 // CHECK2-NEXT: br i1 [[TMP44]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
1678 // CHECK2-NEXT: [[TMP45:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
1679 // CHECK2-NEXT: [[TMP46:%.*]] = load ptr, ptr [[TMP45]], align 4
1680 // CHECK2-NEXT: [[TMP47:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 0
1681 // CHECK2-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP47]], align 4
1682 // CHECK2-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP46]], align 1
1683 // CHECK2-NEXT: store i8 [[TMP49]], ptr [[TMP48]], align 1
1684 // CHECK2-NEXT: [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
1685 // CHECK2-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP50]], align 4
1686 // CHECK2-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 1
1687 // CHECK2-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP52]], align 4
1688 // CHECK2-NEXT: [[TMP54:%.*]] = load float, ptr [[TMP51]], align 4
1689 // CHECK2-NEXT: store float [[TMP54]], ptr [[TMP53]], align 4
1690 // CHECK2-NEXT: br label [[IFCONT7:%.*]]
1692 // CHECK2-NEXT: br label [[IFCONT7]]
1694 // CHECK2-NEXT: ret void
1697 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func2
1698 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
1699 // CHECK2-NEXT: entry:
1700 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
1701 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1702 // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
1703 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
1704 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
1705 // CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1706 // CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1707 // CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
1708 // CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1709 // CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
1710 // CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 4
1711 // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
1712 // CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1713 // CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
1715 // CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0
1716 // CHECK2-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 4
1717 // CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1718 // CHECK2-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
1719 // CHECK2-NEXT: store volatile i8 [[TMP10]], ptr addrspace(3) [[TMP9]], align 1
1720 // CHECK2-NEXT: br label [[IFCONT:%.*]]
1722 // CHECK2-NEXT: br label [[IFCONT]]
1724 // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
1725 // CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4
1726 // CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]]
1727 // CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
1729 // CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
1730 // CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0
1731 // CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4
1732 // CHECK2-NEXT: [[TMP15:%.*]] = load volatile i8, ptr addrspace(3) [[TMP12]], align 1
1733 // CHECK2-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1
1734 // CHECK2-NEXT: br label [[IFCONT4:%.*]]
1736 // CHECK2-NEXT: br label [[IFCONT4]]
1738 // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
1739 // CHECK2-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1740 // CHECK2-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
1742 // CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1
1743 // CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4
1744 // CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1745 // CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4
1746 // CHECK2-NEXT: store volatile i32 [[TMP19]], ptr addrspace(3) [[TMP18]], align 4
1747 // CHECK2-NEXT: br label [[IFCONT8:%.*]]
1749 // CHECK2-NEXT: br label [[IFCONT8]]
1751 // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
1752 // CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4
1753 // CHECK2-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]]
1754 // CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
1756 // CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
1757 // CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1
1758 // CHECK2-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 4
1759 // CHECK2-NEXT: [[TMP24:%.*]] = load volatile i32, ptr addrspace(3) [[TMP21]], align 4
1760 // CHECK2-NEXT: store i32 [[TMP24]], ptr [[TMP23]], align 4
1761 // CHECK2-NEXT: br label [[IFCONT12:%.*]]
1763 // CHECK2-NEXT: br label [[IFCONT12]]
1764 // CHECK2: ifcont12:
1765 // CHECK2-NEXT: ret void
1768 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func3
1769 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1770 // CHECK2-NEXT: entry:
1771 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
1772 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1773 // CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
1774 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
1775 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
1776 // CHECK2-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
1777 // CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
1778 // CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
1779 // CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
1780 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
1781 // CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
1782 // CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 0, i32 0
1783 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i8], ptr [[C]], i32 0, i32 [[TMP5]]
1784 // CHECK2-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
1785 // CHECK2-NEXT: store i8 [[TMP9]], ptr [[TMP8]], align 128
1786 // CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
1787 // CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
1788 // CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 0, i32 1
1789 // CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x float], ptr [[D]], i32 0, i32 [[TMP5]]
1790 // CHECK2-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP11]], align 4
1791 // CHECK2-NEXT: store float [[TMP13]], ptr [[TMP12]], align 128
1792 // CHECK2-NEXT: ret void
1795 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func4
1796 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1797 // CHECK2-NEXT: entry:
1798 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
1799 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1800 // CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
1801 // CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 4
1802 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
1803 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
1804 // CHECK2-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
1805 // CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
1806 // CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
1807 // CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
1808 // CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP3]], i32 0, i32 0
1809 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i8], ptr [[C]], i32 0, i32 [[TMP4]]
1810 // CHECK2-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
1811 // CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
1812 // CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP3]], i32 0, i32 1
1813 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x float], ptr [[D]], i32 0, i32 [[TMP4]]
1814 // CHECK2-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4
1815 // CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
1816 // CHECK2-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_omp_outlined_omp$reduction$reduction_func"(ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr [[TMP9]]) #[[ATTR4]]
1817 // CHECK2-NEXT: ret void
1820 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func5
1821 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1822 // CHECK2-NEXT: entry:
1823 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
1824 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1825 // CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
1826 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
1827 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
1828 // CHECK2-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
1829 // CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
1830 // CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
1831 // CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
1832 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
1833 // CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
1834 // CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 0, i32 0
1835 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i8], ptr [[C]], i32 0, i32 [[TMP5]]
1836 // CHECK2-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 128
1837 // CHECK2-NEXT: store i8 [[TMP9]], ptr [[TMP7]], align 1
1838 // CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
1839 // CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
1840 // CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 0, i32 1
1841 // CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x float], ptr [[D]], i32 0, i32 [[TMP5]]
1842 // CHECK2-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 128
1843 // CHECK2-NEXT: store float [[TMP13]], ptr [[TMP11]], align 4
1844 // CHECK2-NEXT: ret void
1847 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func6
1848 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1849 // CHECK2-NEXT: entry:
1850 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
1851 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
1852 // CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
1853 // CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 4
1854 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
1855 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
1856 // CHECK2-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
1857 // CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
1858 // CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
1859 // CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
1860 // CHECK2-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP3]], i32 0, i32 0
1861 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i8], ptr [[C]], i32 0, i32 [[TMP4]]
1862 // CHECK2-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
1863 // CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
1864 // CHECK2-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP3]], i32 0, i32 1
1865 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x float], ptr [[D]], i32 0, i32 [[TMP4]]
1866 // CHECK2-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4
1867 // CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
1868 // CHECK2-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP9]], ptr [[DOTOMP_REDUCTION_RED_LIST]]) #[[ATTR4]]
1869 // CHECK2-NEXT: ret void
1872 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33
1873 // CHECK2-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
1874 // CHECK2-NEXT: entry:
1875 // CHECK2-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
1876 // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
1877 // CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
1878 // CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1879 // CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1880 // CHECK2-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
1881 // CHECK2-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
1882 // CHECK2-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4
1883 // CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_kernel_environment, ptr [[DYN_PTR]])
1884 // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
1885 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
1886 // CHECK2: user_code.entry:
1887 // CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
1888 // CHECK2-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
1889 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4
1890 // CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[A_ADDR]], ptr [[B_ADDR]]) #[[ATTR4]]
1891 // CHECK2-NEXT: call void @__kmpc_target_deinit()
1892 // CHECK2-NEXT: ret void
1893 // CHECK2: worker.exit:
1894 // CHECK2-NEXT: ret void
1897 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined
1898 // CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
1899 // CHECK2-NEXT: entry:
1900 // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
1901 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
1902 // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
1903 // CHECK2-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4
1904 // CHECK2-NEXT: [[A1:%.*]] = alloca i32, align 4
1905 // CHECK2-NEXT: [[B2:%.*]] = alloca i16, align 2
1906 // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x ptr], align 4
1907 // CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 4
1908 // CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
1909 // CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
1910 // CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
1911 // CHECK2-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4
1912 // CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
1913 // CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 4
1914 // CHECK2-NEXT: store i32 0, ptr [[A1]], align 4
1915 // CHECK2-NEXT: store i16 -32768, ptr [[B2]], align 2
1916 // CHECK2-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
1917 // CHECK2-NEXT: store ptr [[A1]], ptr [[TMP2]], align 4
1918 // CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
1919 // CHECK2-NEXT: store ptr [[B2]], ptr [[TMP3]], align 4
1920 // CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
1921 // CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
1922 // CHECK2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 2)
1923 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
1924 // CHECK2-NEXT: store ptr [[A1]], ptr [[TMP6]], align 4
1925 // CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
1926 // CHECK2-NEXT: store ptr [[B2]], ptr [[TMP7]], align 4
1927 // CHECK2-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
1928 // CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
1929 // CHECK2-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
1930 // CHECK2-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
1931 // CHECK2: .omp.reduction.then:
1932 // CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP0]], align 4
1933 // CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[A1]], align 4
1934 // CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP10]], [[TMP11]]
1935 // CHECK2-NEXT: store i32 [[OR]], ptr [[TMP0]], align 4
1936 // CHECK2-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP1]], align 2
1937 // CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP12]] to i32
1938 // CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[B2]], align 2
1939 // CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP13]] to i32
1940 // CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
1941 // CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
1942 // CHECK2: cond.true:
1943 // CHECK2-NEXT: [[TMP14:%.*]] = load i16, ptr [[TMP1]], align 2
1944 // CHECK2-NEXT: br label [[COND_END:%.*]]
1945 // CHECK2: cond.false:
1946 // CHECK2-NEXT: [[TMP15:%.*]] = load i16, ptr [[B2]], align 2
1947 // CHECK2-NEXT: br label [[COND_END]]
1948 // CHECK2: cond.end:
1949 // CHECK2-NEXT: [[COND:%.*]] = phi i16 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
1950 // CHECK2-NEXT: store i16 [[COND]], ptr [[TMP1]], align 2
1951 // CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
1952 // CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
1953 // CHECK2: .omp.reduction.done:
1954 // CHECK2-NEXT: ret void
1957 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined_omp_outlined
1958 // CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
1959 // CHECK2-NEXT: entry:
1960 // CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
1961 // CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
1962 // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
1963 // CHECK2-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4
1964 // CHECK2-NEXT: [[A1:%.*]] = alloca i32, align 4
1965 // CHECK2-NEXT: [[B2:%.*]] = alloca i16, align 2
1966 // CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 4
1967 // CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
1968 // CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
1969 // CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
1970 // CHECK2-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4
1971 // CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
1972 // CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 4
1973 // CHECK2-NEXT: store i32 0, ptr [[A1]], align 4
1974 // CHECK2-NEXT: store i16 -32768, ptr [[B2]], align 2
1975 // CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[A1]], align 4
1976 // CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
1977 // CHECK2-NEXT: store i32 [[OR]], ptr [[A1]], align 4
1978 // CHECK2-NEXT: [[TMP3:%.*]] = load i16, ptr [[B2]], align 2
1979 // CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
1980 // CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
1981 // CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
1982 // CHECK2: cond.true:
1983 // CHECK2-NEXT: br label [[COND_END:%.*]]
1984 // CHECK2: cond.false:
1985 // CHECK2-NEXT: [[TMP4:%.*]] = load i16, ptr [[B2]], align 2
1986 // CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
1987 // CHECK2-NEXT: br label [[COND_END]]
1988 // CHECK2: cond.end:
1989 // CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
1990 // CHECK2-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
1991 // CHECK2-NEXT: store i16 [[CONV4]], ptr [[B2]], align 2
1992 // CHECK2-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
1993 // CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
1994 // CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
1995 // CHECK2-NEXT: store ptr [[A1]], ptr [[TMP7]], align 4
1996 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
1997 // CHECK2-NEXT: store ptr [[B2]], ptr [[TMP8]], align 4
1998 // CHECK2-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP6]], i32 2, i32 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func7, ptr @_omp_reduction_inter_warp_copy_func8)
1999 // CHECK2-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
2000 // CHECK2-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
2001 // CHECK2: .omp.reduction.then:
2002 // CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP0]], align 4
2003 // CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[A1]], align 4
2004 // CHECK2-NEXT: [[OR5:%.*]] = or i32 [[TMP11]], [[TMP12]]
2005 // CHECK2-NEXT: store i32 [[OR5]], ptr [[TMP0]], align 4
2006 // CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP1]], align 2
2007 // CHECK2-NEXT: [[CONV6:%.*]] = sext i16 [[TMP13]] to i32
2008 // CHECK2-NEXT: [[TMP14:%.*]] = load i16, ptr [[B2]], align 2
2009 // CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP14]] to i32
2010 // CHECK2-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
2011 // CHECK2-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
2012 // CHECK2: cond.true9:
2013 // CHECK2-NEXT: [[TMP15:%.*]] = load i16, ptr [[TMP1]], align 2
2014 // CHECK2-NEXT: br label [[COND_END11:%.*]]
2015 // CHECK2: cond.false10:
2016 // CHECK2-NEXT: [[TMP16:%.*]] = load i16, ptr [[B2]], align 2
2017 // CHECK2-NEXT: br label [[COND_END11]]
2018 // CHECK2: cond.end11:
2019 // CHECK2-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE9]] ], [ [[TMP16]], [[COND_FALSE10]] ]
2020 // CHECK2-NEXT: store i16 [[COND12]], ptr [[TMP1]], align 2
2021 // CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
2022 // CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
2023 // CHECK2: .omp.reduction.done:
2024 // CHECK2-NEXT: ret void
2027 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func7
2028 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3]] {
2029 // CHECK2-NEXT: entry:
2030 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
2031 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
2032 // CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
2033 // CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
2034 // CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x ptr], align 4
2035 // CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
2036 // CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
2037 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
2038 // CHECK2-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
2039 // CHECK2-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2
2040 // CHECK2-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2
2041 // CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
2042 // CHECK2-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2
2043 // CHECK2-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2
2044 // CHECK2-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2
2045 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 0
2046 // CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 4
2047 // CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
2048 // CHECK2-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP9]], i32 1
2049 // CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP9]], align 4
2050 // CHECK2-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_get_warp_size()
2051 // CHECK2-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i16
2052 // CHECK2-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP12]], i16 [[TMP6]], i16 [[TMP14]])
2053 // CHECK2-NEXT: store i32 [[TMP15]], ptr [[DOTOMP_REDUCTION_ELEMENT]], align 4
2054 // CHECK2-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP9]], i32 1
2055 // CHECK2-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[DOTOMP_REDUCTION_ELEMENT]], i32 1
2056 // CHECK2-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 4
2057 // CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 1
2058 // CHECK2-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP18]], align 4
2059 // CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
2060 // CHECK2-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[TMP19]], i32 1
2061 // CHECK2-NEXT: [[TMP22:%.*]] = load i16, ptr [[TMP19]], align 2
2062 // CHECK2-NEXT: [[TMP23:%.*]] = sext i16 [[TMP22]] to i32
2063 // CHECK2-NEXT: [[TMP24:%.*]] = call i32 @__kmpc_get_warp_size()
2064 // CHECK2-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
2065 // CHECK2-NEXT: [[TMP26:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP23]], i16 [[TMP6]], i16 [[TMP25]])
2066 // CHECK2-NEXT: [[TMP27:%.*]] = trunc i32 [[TMP26]] to i16
2067 // CHECK2-NEXT: store i16 [[TMP27]], ptr [[DOTOMP_REDUCTION_ELEMENT4]], align 2
2068 // CHECK2-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[TMP19]], i32 1
2069 // CHECK2-NEXT: [[TMP29:%.*]] = getelementptr i16, ptr [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
2070 // CHECK2-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT4]], ptr [[TMP20]], align 4
2071 // CHECK2-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP7]], 0
2072 // CHECK2-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP7]], 1
2073 // CHECK2-NEXT: [[TMP32:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
2074 // CHECK2-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
2075 // CHECK2-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP7]], 2
2076 // CHECK2-NEXT: [[TMP35:%.*]] = and i16 [[TMP5]], 1
2077 // CHECK2-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP35]], 0
2078 // CHECK2-NEXT: [[TMP37:%.*]] = and i1 [[TMP34]], [[TMP36]]
2079 // CHECK2-NEXT: [[TMP38:%.*]] = icmp sgt i16 [[TMP6]], 0
2080 // CHECK2-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
2081 // CHECK2-NEXT: [[TMP40:%.*]] = or i1 [[TMP30]], [[TMP33]]
2082 // CHECK2-NEXT: [[TMP41:%.*]] = or i1 [[TMP40]], [[TMP39]]
2083 // CHECK2-NEXT: br i1 [[TMP41]], label [[THEN:%.*]], label [[ELSE:%.*]]
2085 // CHECK2-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]]) #[[ATTR4]]
2086 // CHECK2-NEXT: br label [[IFCONT:%.*]]
2088 // CHECK2-NEXT: br label [[IFCONT]]
2090 // CHECK2-NEXT: [[TMP42:%.*]] = icmp eq i16 [[TMP7]], 1
2091 // CHECK2-NEXT: [[TMP43:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
2092 // CHECK2-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
2093 // CHECK2-NEXT: br i1 [[TMP44]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
2095 // CHECK2-NEXT: [[TMP45:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
2096 // CHECK2-NEXT: [[TMP46:%.*]] = load ptr, ptr [[TMP45]], align 4
2097 // CHECK2-NEXT: [[TMP47:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 0
2098 // CHECK2-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP47]], align 4
2099 // CHECK2-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP46]], align 4
2100 // CHECK2-NEXT: store i32 [[TMP49]], ptr [[TMP48]], align 4
2101 // CHECK2-NEXT: [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
2102 // CHECK2-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP50]], align 4
2103 // CHECK2-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 1
2104 // CHECK2-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP52]], align 4
2105 // CHECK2-NEXT: [[TMP54:%.*]] = load i16, ptr [[TMP51]], align 2
2106 // CHECK2-NEXT: store i16 [[TMP54]], ptr [[TMP53]], align 2
2107 // CHECK2-NEXT: br label [[IFCONT7:%.*]]
2109 // CHECK2-NEXT: br label [[IFCONT7]]
2111 // CHECK2-NEXT: ret void
2114 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func8
2115 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
2116 // CHECK2-NEXT: entry:
2117 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
2118 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2119 // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
2120 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
2121 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
2122 // CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
2123 // CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
2124 // CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
2125 // CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
2126 // CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
2127 // CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 4
2128 // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
2129 // CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
2130 // CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
2132 // CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0
2133 // CHECK2-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 4
2134 // CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
2135 // CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 4
2136 // CHECK2-NEXT: store volatile i32 [[TMP10]], ptr addrspace(3) [[TMP9]], align 4
2137 // CHECK2-NEXT: br label [[IFCONT:%.*]]
2139 // CHECK2-NEXT: br label [[IFCONT]]
2141 // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
2142 // CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4
2143 // CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]]
2144 // CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
2146 // CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
2147 // CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0
2148 // CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4
2149 // CHECK2-NEXT: [[TMP15:%.*]] = load volatile i32, ptr addrspace(3) [[TMP12]], align 4
2150 // CHECK2-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4
2151 // CHECK2-NEXT: br label [[IFCONT4:%.*]]
2153 // CHECK2-NEXT: br label [[IFCONT4]]
2155 // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
2156 // CHECK2-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
2157 // CHECK2-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
2159 // CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1
2160 // CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4
2161 // CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
2162 // CHECK2-NEXT: [[TMP19:%.*]] = load i16, ptr [[TMP17]], align 2
2163 // CHECK2-NEXT: store volatile i16 [[TMP19]], ptr addrspace(3) [[TMP18]], align 2
2164 // CHECK2-NEXT: br label [[IFCONT8:%.*]]
2166 // CHECK2-NEXT: br label [[IFCONT8]]
2168 // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
2169 // CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4
2170 // CHECK2-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]]
2171 // CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
2173 // CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
2174 // CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1
2175 // CHECK2-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 4
2176 // CHECK2-NEXT: [[TMP24:%.*]] = load volatile i16, ptr addrspace(3) [[TMP21]], align 2
2177 // CHECK2-NEXT: store i16 [[TMP24]], ptr [[TMP23]], align 2
2178 // CHECK2-NEXT: br label [[IFCONT12:%.*]]
2180 // CHECK2-NEXT: br label [[IFCONT12]]
2181 // CHECK2: ifcont12:
2182 // CHECK2-NEXT: ret void
2185 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func9
2186 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3]] {
2187 // CHECK2-NEXT: entry:
2188 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
2189 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
2190 // CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
2191 // CHECK2-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
2192 // CHECK2-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x ptr], align 4
2193 // CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
2194 // CHECK2-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
2195 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
2196 // CHECK2-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
2197 // CHECK2-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2
2198 // CHECK2-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2
2199 // CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
2200 // CHECK2-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2
2201 // CHECK2-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2
2202 // CHECK2-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2
2203 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 0
2204 // CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 4
2205 // CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
2206 // CHECK2-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP9]], i32 1
2207 // CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP9]], align 4
2208 // CHECK2-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_get_warp_size()
2209 // CHECK2-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i16
2210 // CHECK2-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP12]], i16 [[TMP6]], i16 [[TMP14]])
2211 // CHECK2-NEXT: store i32 [[TMP15]], ptr [[DOTOMP_REDUCTION_ELEMENT]], align 4
2212 // CHECK2-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP9]], i32 1
2213 // CHECK2-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[DOTOMP_REDUCTION_ELEMENT]], i32 1
2214 // CHECK2-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 4
2215 // CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 1
2216 // CHECK2-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP18]], align 4
2217 // CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
2218 // CHECK2-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[TMP19]], i32 1
2219 // CHECK2-NEXT: [[TMP22:%.*]] = load i16, ptr [[TMP19]], align 2
2220 // CHECK2-NEXT: [[TMP23:%.*]] = sext i16 [[TMP22]] to i32
2221 // CHECK2-NEXT: [[TMP24:%.*]] = call i32 @__kmpc_get_warp_size()
2222 // CHECK2-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
2223 // CHECK2-NEXT: [[TMP26:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP23]], i16 [[TMP6]], i16 [[TMP25]])
2224 // CHECK2-NEXT: [[TMP27:%.*]] = trunc i32 [[TMP26]] to i16
2225 // CHECK2-NEXT: store i16 [[TMP27]], ptr [[DOTOMP_REDUCTION_ELEMENT4]], align 2
2226 // CHECK2-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[TMP19]], i32 1
2227 // CHECK2-NEXT: [[TMP29:%.*]] = getelementptr i16, ptr [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
2228 // CHECK2-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT4]], ptr [[TMP20]], align 4
2229 // CHECK2-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP7]], 0
2230 // CHECK2-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP7]], 1
2231 // CHECK2-NEXT: [[TMP32:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
2232 // CHECK2-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
2233 // CHECK2-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP7]], 2
2234 // CHECK2-NEXT: [[TMP35:%.*]] = and i16 [[TMP5]], 1
2235 // CHECK2-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP35]], 0
2236 // CHECK2-NEXT: [[TMP37:%.*]] = and i1 [[TMP34]], [[TMP36]]
2237 // CHECK2-NEXT: [[TMP38:%.*]] = icmp sgt i16 [[TMP6]], 0
2238 // CHECK2-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
2239 // CHECK2-NEXT: [[TMP40:%.*]] = or i1 [[TMP30]], [[TMP33]]
2240 // CHECK2-NEXT: [[TMP41:%.*]] = or i1 [[TMP40]], [[TMP39]]
2241 // CHECK2-NEXT: br i1 [[TMP41]], label [[THEN:%.*]], label [[ELSE:%.*]]
2243 // CHECK2-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]]) #[[ATTR4]]
2244 // CHECK2-NEXT: br label [[IFCONT:%.*]]
2246 // CHECK2-NEXT: br label [[IFCONT]]
2248 // CHECK2-NEXT: [[TMP42:%.*]] = icmp eq i16 [[TMP7]], 1
2249 // CHECK2-NEXT: [[TMP43:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
2250 // CHECK2-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
2251 // CHECK2-NEXT: br i1 [[TMP44]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
2253 // CHECK2-NEXT: [[TMP45:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
2254 // CHECK2-NEXT: [[TMP46:%.*]] = load ptr, ptr [[TMP45]], align 4
2255 // CHECK2-NEXT: [[TMP47:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 0
2256 // CHECK2-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP47]], align 4
2257 // CHECK2-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP46]], align 4
2258 // CHECK2-NEXT: store i32 [[TMP49]], ptr [[TMP48]], align 4
2259 // CHECK2-NEXT: [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
2260 // CHECK2-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP50]], align 4
2261 // CHECK2-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 1
2262 // CHECK2-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP52]], align 4
2263 // CHECK2-NEXT: [[TMP54:%.*]] = load i16, ptr [[TMP51]], align 2
2264 // CHECK2-NEXT: store i16 [[TMP54]], ptr [[TMP53]], align 2
2265 // CHECK2-NEXT: br label [[IFCONT7:%.*]]
2267 // CHECK2-NEXT: br label [[IFCONT7]]
2269 // CHECK2-NEXT: ret void
2272 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func10
2273 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
2274 // CHECK2-NEXT: entry:
2275 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
2276 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2277 // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
2278 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
2279 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
2280 // CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
2281 // CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
2282 // CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
2283 // CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
2284 // CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
2285 // CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 4
2286 // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
2287 // CHECK2-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
2288 // CHECK2-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
2290 // CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0
2291 // CHECK2-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 4
2292 // CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
2293 // CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 4
2294 // CHECK2-NEXT: store volatile i32 [[TMP10]], ptr addrspace(3) [[TMP9]], align 4
2295 // CHECK2-NEXT: br label [[IFCONT:%.*]]
2297 // CHECK2-NEXT: br label [[IFCONT]]
2299 // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
2300 // CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4
2301 // CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]]
2302 // CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
2304 // CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
2305 // CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0
2306 // CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4
2307 // CHECK2-NEXT: [[TMP15:%.*]] = load volatile i32, ptr addrspace(3) [[TMP12]], align 4
2308 // CHECK2-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4
2309 // CHECK2-NEXT: br label [[IFCONT4:%.*]]
2311 // CHECK2-NEXT: br label [[IFCONT4]]
2313 // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
2314 // CHECK2-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
2315 // CHECK2-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
2317 // CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1
2318 // CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4
2319 // CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
2320 // CHECK2-NEXT: [[TMP19:%.*]] = load i16, ptr [[TMP17]], align 2
2321 // CHECK2-NEXT: store volatile i16 [[TMP19]], ptr addrspace(3) [[TMP18]], align 2
2322 // CHECK2-NEXT: br label [[IFCONT8:%.*]]
2324 // CHECK2-NEXT: br label [[IFCONT8]]
2326 // CHECK2-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
2327 // CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4
2328 // CHECK2-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]]
2329 // CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
2331 // CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
2332 // CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1
2333 // CHECK2-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 4
2334 // CHECK2-NEXT: [[TMP24:%.*]] = load volatile i16, ptr addrspace(3) [[TMP21]], align 2
2335 // CHECK2-NEXT: store i16 [[TMP24]], ptr [[TMP23]], align 2
2336 // CHECK2-NEXT: br label [[IFCONT12:%.*]]
2338 // CHECK2-NEXT: br label [[IFCONT12]]
2339 // CHECK2: ifcont12:
2340 // CHECK2-NEXT: ret void
2343 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func11
2344 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
2345 // CHECK2-NEXT: entry:
2346 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
2347 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2348 // CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
2349 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
2350 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
2351 // CHECK2-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
2352 // CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
2353 // CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
2354 // CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
2355 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
2356 // CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
2357 // CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
2358 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
2359 // CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
2360 // CHECK2-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 128
2361 // CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
2362 // CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
2363 // CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
2364 // CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
2365 // CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
2366 // CHECK2-NEXT: store i16 [[TMP13]], ptr [[TMP12]], align 128
2367 // CHECK2-NEXT: ret void
2370 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func12
2371 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
2372 // CHECK2-NEXT: entry:
2373 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
2374 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2375 // CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
2376 // CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 4
2377 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
2378 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
2379 // CHECK2-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
2380 // CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
2381 // CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
2382 // CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
2383 // CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
2384 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
2385 // CHECK2-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
2386 // CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
2387 // CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
2388 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
2389 // CHECK2-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4
2390 // CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
2391 // CHECK2-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined_omp$reduction$reduction_func"(ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr [[TMP9]]) #[[ATTR4]]
2392 // CHECK2-NEXT: ret void
2395 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func13
2396 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
2397 // CHECK2-NEXT: entry:
2398 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
2399 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2400 // CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
2401 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
2402 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
2403 // CHECK2-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
2404 // CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
2405 // CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
2406 // CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
2407 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
2408 // CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
2409 // CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
2410 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
2411 // CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 128
2412 // CHECK2-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 4
2413 // CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
2414 // CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
2415 // CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
2416 // CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
2417 // CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 128
2418 // CHECK2-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 2
2419 // CHECK2-NEXT: ret void
2422 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func14
2423 // CHECK2-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
2424 // CHECK2-NEXT: entry:
2425 // CHECK2-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
2426 // CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2427 // CHECK2-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
2428 // CHECK2-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 4
2429 // CHECK2-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
2430 // CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
2431 // CHECK2-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
2432 // CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
2433 // CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
2434 // CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
2435 // CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
2436 // CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
2437 // CHECK2-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
2438 // CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
2439 // CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
2440 // CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
2441 // CHECK2-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4
2442 // CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
2443 // CHECK2-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP9]], ptr [[DOTOMP_REDUCTION_RED_LIST]]) #[[ATTR4]]
2444 // CHECK2-NEXT: ret void
2447 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20
2448 // CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0:[0-9]+]] {
2449 // CHECK3-NEXT: entry:
2450 // CHECK3-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
2451 // CHECK3-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 4
2452 // CHECK3-NEXT: [[E1:%.*]] = alloca double, align 8
2453 // CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
2454 // CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
2455 // CHECK3-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
2456 // CHECK3-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 4
2457 // CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[E_ADDR]], align 4
2458 // CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_kernel_environment, ptr [[DYN_PTR]])
2459 // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
2460 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
2461 // CHECK3: user_code.entry:
2462 // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
2463 // CHECK3-NEXT: [[TMP3:%.*]] = load double, ptr [[TMP0]], align 8
2464 // CHECK3-NEXT: store double [[TMP3]], ptr [[E1]], align 8
2465 // CHECK3-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
2466 // CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
2467 // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[E1]]) #[[ATTR4:[0-9]+]]
2468 // CHECK3-NEXT: call void @__kmpc_target_deinit()
2469 // CHECK3-NEXT: ret void
2470 // CHECK3: worker.exit:
2471 // CHECK3-NEXT: ret void
2474 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_omp_outlined
2475 // CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
2476 // CHECK3-NEXT: entry:
2477 // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
2478 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
2479 // CHECK3-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 4
2480 // CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 4
2481 // CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
2482 // CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
2483 // CHECK3-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 4
2484 // CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[E_ADDR]], align 4
2485 // CHECK3-NEXT: [[E1:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i32 8)
2486 // CHECK3-NEXT: store double 0.000000e+00, ptr [[E1]], align 8
2487 // CHECK3-NEXT: [[TMP1:%.*]] = load double, ptr [[E1]], align 8
2488 // CHECK3-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00
2489 // CHECK3-NEXT: store double [[ADD]], ptr [[E1]], align 8
2490 // CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
2491 // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
2492 // CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
2493 // CHECK3-NEXT: store ptr [[E1]], ptr [[TMP4]], align 4
2494 // CHECK3-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
2495 // CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
2496 // CHECK3-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1
2497 // CHECK3-NEXT: br i1 [[TMP6]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
2498 // CHECK3: .omp.reduction.then:
2499 // CHECK3-NEXT: [[TMP7:%.*]] = load double, ptr [[TMP0]], align 8
2500 // CHECK3-NEXT: [[TMP8:%.*]] = load double, ptr [[E1]], align 8
2501 // CHECK3-NEXT: [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]]
2502 // CHECK3-NEXT: store double [[ADD2]], ptr [[TMP0]], align 8
2503 // CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
2504 // CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
2505 // CHECK3: .omp.reduction.done:
2506 // CHECK3-NEXT: call void @__kmpc_free_shared(ptr [[E1]], i32 8)
2507 // CHECK3-NEXT: ret void
2510 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
2511 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
2512 // CHECK3-NEXT: entry:
2513 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
2514 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
2515 // CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
2516 // CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
2517 // CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 4
2518 // CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
2519 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
2520 // CHECK3-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
2521 // CHECK3-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2
2522 // CHECK3-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2
2523 // CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
2524 // CHECK3-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2
2525 // CHECK3-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2
2526 // CHECK3-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2
2527 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i32 0, i32 0
2528 // CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 4
2529 // CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
2530 // CHECK3-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[TMP9]], i32 1
2531 // CHECK3-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP9]], align 8
2532 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_get_warp_size()
2533 // CHECK3-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i16
2534 // CHECK3-NEXT: [[TMP15:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP12]], i16 [[TMP6]], i16 [[TMP14]])
2535 // CHECK3-NEXT: store i64 [[TMP15]], ptr [[DOTOMP_REDUCTION_ELEMENT]], align 8
2536 // CHECK3-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[TMP9]], i32 1
2537 // CHECK3-NEXT: [[TMP17:%.*]] = getelementptr i64, ptr [[DOTOMP_REDUCTION_ELEMENT]], i32 1
2538 // CHECK3-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 4
2539 // CHECK3-NEXT: [[TMP18:%.*]] = icmp eq i16 [[TMP7]], 0
2540 // CHECK3-NEXT: [[TMP19:%.*]] = icmp eq i16 [[TMP7]], 1
2541 // CHECK3-NEXT: [[TMP20:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
2542 // CHECK3-NEXT: [[TMP21:%.*]] = and i1 [[TMP19]], [[TMP20]]
2543 // CHECK3-NEXT: [[TMP22:%.*]] = icmp eq i16 [[TMP7]], 2
2544 // CHECK3-NEXT: [[TMP23:%.*]] = and i16 [[TMP5]], 1
2545 // CHECK3-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP23]], 0
2546 // CHECK3-NEXT: [[TMP25:%.*]] = and i1 [[TMP22]], [[TMP24]]
2547 // CHECK3-NEXT: [[TMP26:%.*]] = icmp sgt i16 [[TMP6]], 0
2548 // CHECK3-NEXT: [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
2549 // CHECK3-NEXT: [[TMP28:%.*]] = or i1 [[TMP18]], [[TMP21]]
2550 // CHECK3-NEXT: [[TMP29:%.*]] = or i1 [[TMP28]], [[TMP27]]
2551 // CHECK3-NEXT: br i1 [[TMP29]], label [[THEN:%.*]], label [[ELSE:%.*]]
2553 // CHECK3-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]]) #[[ATTR4]]
2554 // CHECK3-NEXT: br label [[IFCONT:%.*]]
2556 // CHECK3-NEXT: br label [[IFCONT]]
2558 // CHECK3-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP7]], 1
2559 // CHECK3-NEXT: [[TMP31:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
2560 // CHECK3-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
2561 // CHECK3-NEXT: br i1 [[TMP32]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
2563 // CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
2564 // CHECK3-NEXT: [[TMP34:%.*]] = load ptr, ptr [[TMP33]], align 4
2565 // CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i32 0, i32 0
2566 // CHECK3-NEXT: [[TMP36:%.*]] = load ptr, ptr [[TMP35]], align 4
2567 // CHECK3-NEXT: [[TMP37:%.*]] = load double, ptr [[TMP34]], align 8
2568 // CHECK3-NEXT: store double [[TMP37]], ptr [[TMP36]], align 8
2569 // CHECK3-NEXT: br label [[IFCONT6:%.*]]
2571 // CHECK3-NEXT: br label [[IFCONT6]]
2573 // CHECK3-NEXT: ret void
2576 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
2577 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
2578 // CHECK3-NEXT: entry:
2579 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
2580 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2581 // CHECK3-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
2582 // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
2583 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
2584 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
2585 // CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
2586 // CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
2587 // CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
2588 // CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
2589 // CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
2590 // CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 4
2591 // CHECK3-NEXT: store i32 0, ptr [[DOTCNT_ADDR]], align 4
2592 // CHECK3-NEXT: br label [[PRECOND:%.*]]
2594 // CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4
2595 // CHECK3-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 2
2596 // CHECK3-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
2598 // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]])
2599 // CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
2600 // CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
2602 // CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i32 0, i32 0
2603 // CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 4
2604 // CHECK3-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
2605 // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
2606 // CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
2607 // CHECK3-NEXT: store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
2608 // CHECK3-NEXT: br label [[IFCONT:%.*]]
2610 // CHECK3-NEXT: br label [[IFCONT]]
2612 // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
2613 // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4
2614 // CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
2615 // CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
2617 // CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
2618 // CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i32 0, i32 0
2619 // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4
2620 // CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
2621 // CHECK3-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
2622 // CHECK3-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4
2623 // CHECK3-NEXT: br label [[IFCONT4:%.*]]
2625 // CHECK3-NEXT: br label [[IFCONT4]]
2627 // CHECK3-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
2628 // CHECK3-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4
2629 // CHECK3-NEXT: br label [[PRECOND]]
2631 // CHECK3-NEXT: ret void
2634 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
2635 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
2636 // CHECK3-NEXT: entry:
2637 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
2638 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2639 // CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
2640 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
2641 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
2642 // CHECK3-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
2643 // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
2644 // CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
2645 // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
2646 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i32 0, i32 0
2647 // CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
2648 // CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
2649 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x double], ptr [[E]], i32 0, i32 [[TMP5]]
2650 // CHECK3-NEXT: [[TMP9:%.*]] = load double, ptr [[TMP7]], align 8
2651 // CHECK3-NEXT: store double [[TMP9]], ptr [[TMP8]], align 128
2652 // CHECK3-NEXT: ret void
2655 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
2656 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
2657 // CHECK3-NEXT: entry:
2658 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
2659 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2660 // CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
2661 // CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 4
2662 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
2663 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
2664 // CHECK3-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
2665 // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
2666 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
2667 // CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
2668 // CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
2669 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x double], ptr [[E]], i32 0, i32 [[TMP4]]
2670 // CHECK3-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
2671 // CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
2672 // CHECK3-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_omp_outlined_omp$reduction$reduction_func"(ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr [[TMP7]]) #[[ATTR4]]
2673 // CHECK3-NEXT: ret void
2676 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
2677 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
2678 // CHECK3-NEXT: entry:
2679 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
2680 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2681 // CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
2682 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
2683 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
2684 // CHECK3-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
2685 // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
2686 // CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
2687 // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
2688 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i32 0, i32 0
2689 // CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
2690 // CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
2691 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x double], ptr [[E]], i32 0, i32 [[TMP5]]
2692 // CHECK3-NEXT: [[TMP9:%.*]] = load double, ptr [[TMP8]], align 128
2693 // CHECK3-NEXT: store double [[TMP9]], ptr [[TMP7]], align 8
2694 // CHECK3-NEXT: ret void
2697 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
2698 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
2699 // CHECK3-NEXT: entry:
2700 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
2701 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2702 // CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
2703 // CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 4
2704 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
2705 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
2706 // CHECK3-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
2707 // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
2708 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
2709 // CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
2710 // CHECK3-NEXT: [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
2711 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x double], ptr [[E]], i32 0, i32 [[TMP4]]
2712 // CHECK3-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
2713 // CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
2714 // CHECK3-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP7]], ptr [[DOTOMP_REDUCTION_RED_LIST]]) #[[ATTR4]]
2715 // CHECK3-NEXT: ret void
2718 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26
2719 // CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]]) #[[ATTR0]] {
2720 // CHECK3-NEXT: entry:
2721 // CHECK3-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
2722 // CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4
2723 // CHECK3-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4
2724 // CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
2725 // CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
2726 // CHECK3-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
2727 // CHECK3-NEXT: store i32 [[C]], ptr [[C_ADDR]], align 4
2728 // CHECK3-NEXT: store i32 [[D]], ptr [[D_ADDR]], align 4
2729 // CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_kernel_environment, ptr [[DYN_PTR]])
2730 // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
2731 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
2732 // CHECK3: user_code.entry:
2733 // CHECK3-NEXT: [[TMP1:%.*]] = load i8, ptr [[C_ADDR]], align 1
2734 // CHECK3-NEXT: [[C1:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i32 1)
2735 // CHECK3-NEXT: store i8 [[TMP1]], ptr [[C1]], align 1
2736 // CHECK3-NEXT: [[TMP2:%.*]] = load float, ptr [[D_ADDR]], align 4
2737 // CHECK3-NEXT: [[D2:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i32 4)
2738 // CHECK3-NEXT: store float [[TMP2]], ptr [[D2]], align 4
2739 // CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
2740 // CHECK3-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
2741 // CHECK3-NEXT: store i32 [[TMP3]], ptr [[DOTTHREADID_TEMP_]], align 4
2742 // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[C1]], ptr [[D2]]) #[[ATTR4]]
2743 // CHECK3-NEXT: call void @__kmpc_free_shared(ptr [[D2]], i32 4)
2744 // CHECK3-NEXT: call void @__kmpc_free_shared(ptr [[C1]], i32 1)
2745 // CHECK3-NEXT: call void @__kmpc_target_deinit()
2746 // CHECK3-NEXT: ret void
2747 // CHECK3: worker.exit:
2748 // CHECK3-NEXT: ret void
2751 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_omp_outlined
2752 // CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
2753 // CHECK3-NEXT: entry:
2754 // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
2755 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
2756 // CHECK3-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 4
2757 // CHECK3-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 4
2758 // CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 4
2759 // CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
2760 // CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
2761 // CHECK3-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 4
2762 // CHECK3-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 4
2763 // CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4
2764 // CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 4
2765 // CHECK3-NEXT: [[C1:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i32 1)
2766 // CHECK3-NEXT: [[D2:%.*]] = call align 8 ptr @__kmpc_alloc_shared(i32 4)
2767 // CHECK3-NEXT: store i8 0, ptr [[C1]], align 1
2768 // CHECK3-NEXT: store float 1.000000e+00, ptr [[D2]], align 4
2769 // CHECK3-NEXT: [[TMP2:%.*]] = load i8, ptr [[C1]], align 1
2770 // CHECK3-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32
2771 // CHECK3-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2
2772 // CHECK3-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
2773 // CHECK3-NEXT: store i8 [[CONV3]], ptr [[C1]], align 1
2774 // CHECK3-NEXT: [[TMP3:%.*]] = load float, ptr [[D2]], align 4
2775 // CHECK3-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01
2776 // CHECK3-NEXT: store float [[MUL]], ptr [[D2]], align 4
2777 // CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
2778 // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
2779 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
2780 // CHECK3-NEXT: store ptr [[C1]], ptr [[TMP6]], align 4
2781 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
2782 // CHECK3-NEXT: store ptr [[D2]], ptr [[TMP7]], align 4
2783 // CHECK3-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
2784 // CHECK3-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
2785 // CHECK3-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
2786 // CHECK3-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
2787 // CHECK3: .omp.reduction.then:
2788 // CHECK3-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP0]], align 1
2789 // CHECK3-NEXT: [[CONV4:%.*]] = sext i8 [[TMP10]] to i32
2790 // CHECK3-NEXT: [[TMP11:%.*]] = load i8, ptr [[C1]], align 1
2791 // CHECK3-NEXT: [[CONV5:%.*]] = sext i8 [[TMP11]] to i32
2792 // CHECK3-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
2793 // CHECK3-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
2794 // CHECK3-NEXT: store i8 [[CONV7]], ptr [[TMP0]], align 1
2795 // CHECK3-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP1]], align 4
2796 // CHECK3-NEXT: [[TMP13:%.*]] = load float, ptr [[D2]], align 4
2797 // CHECK3-NEXT: [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]]
2798 // CHECK3-NEXT: store float [[MUL8]], ptr [[TMP1]], align 4
2799 // CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
2800 // CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
2801 // CHECK3: .omp.reduction.done:
2802 // CHECK3-NEXT: call void @__kmpc_free_shared(ptr [[D2]], i32 4)
2803 // CHECK3-NEXT: call void @__kmpc_free_shared(ptr [[C1]], i32 1)
2804 // CHECK3-NEXT: ret void
2807 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func1
2808 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3]] {
2809 // CHECK3-NEXT: entry:
2810 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
2811 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
2812 // CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
2813 // CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
2814 // CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x ptr], align 4
2815 // CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
2816 // CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
2817 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
2818 // CHECK3-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
2819 // CHECK3-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2
2820 // CHECK3-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2
2821 // CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
2822 // CHECK3-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2
2823 // CHECK3-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2
2824 // CHECK3-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2
2825 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 0
2826 // CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 4
2827 // CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
2828 // CHECK3-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP9]], i32 1
2829 // CHECK3-NEXT: [[TMP12:%.*]] = load i8, ptr [[TMP9]], align 1
2830 // CHECK3-NEXT: [[TMP13:%.*]] = sext i8 [[TMP12]] to i32
2831 // CHECK3-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_get_warp_size()
2832 // CHECK3-NEXT: [[TMP15:%.*]] = trunc i32 [[TMP14]] to i16
2833 // CHECK3-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP13]], i16 [[TMP6]], i16 [[TMP15]])
2834 // CHECK3-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
2835 // CHECK3-NEXT: store i8 [[TMP17]], ptr [[DOTOMP_REDUCTION_ELEMENT]], align 1
2836 // CHECK3-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP9]], i32 1
2837 // CHECK3-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DOTOMP_REDUCTION_ELEMENT]], i32 1
2838 // CHECK3-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 4
2839 // CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 1
2840 // CHECK3-NEXT: [[TMP21:%.*]] = load ptr, ptr [[TMP20]], align 4
2841 // CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
2842 // CHECK3-NEXT: [[TMP23:%.*]] = getelementptr float, ptr [[TMP21]], i32 1
2843 // CHECK3-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP21]], align 4
2844 // CHECK3-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_get_warp_size()
2845 // CHECK3-NEXT: [[TMP26:%.*]] = trunc i32 [[TMP25]] to i16
2846 // CHECK3-NEXT: [[TMP27:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP24]], i16 [[TMP6]], i16 [[TMP26]])
2847 // CHECK3-NEXT: store i32 [[TMP27]], ptr [[DOTOMP_REDUCTION_ELEMENT4]], align 4
2848 // CHECK3-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[TMP21]], i32 1
2849 // CHECK3-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
2850 // CHECK3-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT4]], ptr [[TMP22]], align 4
2851 // CHECK3-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP7]], 0
2852 // CHECK3-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP7]], 1
2853 // CHECK3-NEXT: [[TMP32:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
2854 // CHECK3-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
2855 // CHECK3-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP7]], 2
2856 // CHECK3-NEXT: [[TMP35:%.*]] = and i16 [[TMP5]], 1
2857 // CHECK3-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP35]], 0
2858 // CHECK3-NEXT: [[TMP37:%.*]] = and i1 [[TMP34]], [[TMP36]]
2859 // CHECK3-NEXT: [[TMP38:%.*]] = icmp sgt i16 [[TMP6]], 0
2860 // CHECK3-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
2861 // CHECK3-NEXT: [[TMP40:%.*]] = or i1 [[TMP30]], [[TMP33]]
2862 // CHECK3-NEXT: [[TMP41:%.*]] = or i1 [[TMP40]], [[TMP39]]
2863 // CHECK3-NEXT: br i1 [[TMP41]], label [[THEN:%.*]], label [[ELSE:%.*]]
2865 // CHECK3-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]]) #[[ATTR4]]
2866 // CHECK3-NEXT: br label [[IFCONT:%.*]]
2868 // CHECK3-NEXT: br label [[IFCONT]]
2870 // CHECK3-NEXT: [[TMP42:%.*]] = icmp eq i16 [[TMP7]], 1
2871 // CHECK3-NEXT: [[TMP43:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
2872 // CHECK3-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
2873 // CHECK3-NEXT: br i1 [[TMP44]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
2875 // CHECK3-NEXT: [[TMP45:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
2876 // CHECK3-NEXT: [[TMP46:%.*]] = load ptr, ptr [[TMP45]], align 4
2877 // CHECK3-NEXT: [[TMP47:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 0
2878 // CHECK3-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP47]], align 4
2879 // CHECK3-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP46]], align 1
2880 // CHECK3-NEXT: store i8 [[TMP49]], ptr [[TMP48]], align 1
2881 // CHECK3-NEXT: [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
2882 // CHECK3-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP50]], align 4
2883 // CHECK3-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 1
2884 // CHECK3-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP52]], align 4
2885 // CHECK3-NEXT: [[TMP54:%.*]] = load float, ptr [[TMP51]], align 4
2886 // CHECK3-NEXT: store float [[TMP54]], ptr [[TMP53]], align 4
2887 // CHECK3-NEXT: br label [[IFCONT7:%.*]]
2889 // CHECK3-NEXT: br label [[IFCONT7]]
2891 // CHECK3-NEXT: ret void
2894 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func2
2895 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
2896 // CHECK3-NEXT: entry:
2897 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
2898 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2899 // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
2900 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
2901 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
2902 // CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
2903 // CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
2904 // CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
2905 // CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
2906 // CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
2907 // CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 4
2908 // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
2909 // CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
2910 // CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
2912 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0
2913 // CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 4
2914 // CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
2915 // CHECK3-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
2916 // CHECK3-NEXT: store volatile i8 [[TMP10]], ptr addrspace(3) [[TMP9]], align 1
2917 // CHECK3-NEXT: br label [[IFCONT:%.*]]
2919 // CHECK3-NEXT: br label [[IFCONT]]
2921 // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
2922 // CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4
2923 // CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]]
2924 // CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
2926 // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
2927 // CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0
2928 // CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4
2929 // CHECK3-NEXT: [[TMP15:%.*]] = load volatile i8, ptr addrspace(3) [[TMP12]], align 1
2930 // CHECK3-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1
2931 // CHECK3-NEXT: br label [[IFCONT4:%.*]]
2933 // CHECK3-NEXT: br label [[IFCONT4]]
2935 // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
2936 // CHECK3-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
2937 // CHECK3-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
2939 // CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1
2940 // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4
2941 // CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
2942 // CHECK3-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4
2943 // CHECK3-NEXT: store volatile i32 [[TMP19]], ptr addrspace(3) [[TMP18]], align 4
2944 // CHECK3-NEXT: br label [[IFCONT8:%.*]]
2946 // CHECK3-NEXT: br label [[IFCONT8]]
2948 // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
2949 // CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4
2950 // CHECK3-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]]
2951 // CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
2953 // CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
2954 // CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1
2955 // CHECK3-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 4
2956 // CHECK3-NEXT: [[TMP24:%.*]] = load volatile i32, ptr addrspace(3) [[TMP21]], align 4
2957 // CHECK3-NEXT: store i32 [[TMP24]], ptr [[TMP23]], align 4
2958 // CHECK3-NEXT: br label [[IFCONT12:%.*]]
2960 // CHECK3-NEXT: br label [[IFCONT12]]
2961 // CHECK3: ifcont12:
2962 // CHECK3-NEXT: ret void
2965 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func3
2966 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
2967 // CHECK3-NEXT: entry:
2968 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
2969 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2970 // CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
2971 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
2972 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
2973 // CHECK3-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
2974 // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
2975 // CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
2976 // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
2977 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
2978 // CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
2979 // CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 0, i32 0
2980 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i8], ptr [[C]], i32 0, i32 [[TMP5]]
2981 // CHECK3-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
2982 // CHECK3-NEXT: store i8 [[TMP9]], ptr [[TMP8]], align 128
2983 // CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
2984 // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
2985 // CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 0, i32 1
2986 // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2048 x float], ptr [[D]], i32 0, i32 [[TMP5]]
2987 // CHECK3-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP11]], align 4
2988 // CHECK3-NEXT: store float [[TMP13]], ptr [[TMP12]], align 128
2989 // CHECK3-NEXT: ret void
2992 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func4
2993 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
2994 // CHECK3-NEXT: entry:
2995 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
2996 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
2997 // CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
2998 // CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 4
2999 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
3000 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
3001 // CHECK3-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
3002 // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
3003 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
3004 // CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
3005 // CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP3]], i32 0, i32 0
3006 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x i8], ptr [[C]], i32 0, i32 [[TMP4]]
3007 // CHECK3-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
3008 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
3009 // CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP3]], i32 0, i32 1
3010 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x float], ptr [[D]], i32 0, i32 [[TMP4]]
3011 // CHECK3-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4
3012 // CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
3013 // CHECK3-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_omp_outlined_omp$reduction$reduction_func"(ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr [[TMP9]]) #[[ATTR4]]
3014 // CHECK3-NEXT: ret void
3017 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func5
3018 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
3019 // CHECK3-NEXT: entry:
3020 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
3021 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
3022 // CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
3023 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
3024 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
3025 // CHECK3-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
3026 // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
3027 // CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
3028 // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
3029 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
3030 // CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
3031 // CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 0, i32 0
3032 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i8], ptr [[C]], i32 0, i32 [[TMP5]]
3033 // CHECK3-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 128
3034 // CHECK3-NEXT: store i8 [[TMP9]], ptr [[TMP7]], align 1
3035 // CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
3036 // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
3037 // CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 0, i32 1
3038 // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2048 x float], ptr [[D]], i32 0, i32 [[TMP5]]
3039 // CHECK3-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 128
3040 // CHECK3-NEXT: store float [[TMP13]], ptr [[TMP11]], align 4
3041 // CHECK3-NEXT: ret void
3044 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func6
3045 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
3046 // CHECK3-NEXT: entry:
3047 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
3048 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
3049 // CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
3050 // CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 4
3051 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
3052 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
3053 // CHECK3-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
3054 // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
3055 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
3056 // CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
3057 // CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP3]], i32 0, i32 0
3058 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x i8], ptr [[C]], i32 0, i32 [[TMP4]]
3059 // CHECK3-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
3060 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
3061 // CHECK3-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP3]], i32 0, i32 1
3062 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x float], ptr [[D]], i32 0, i32 [[TMP4]]
3063 // CHECK3-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4
3064 // CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
3065 // CHECK3-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP9]], ptr [[DOTOMP_REDUCTION_RED_LIST]]) #[[ATTR4]]
3066 // CHECK3-NEXT: ret void
3069 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33
3070 // CHECK3-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
3071 // CHECK3-NEXT: entry:
3072 // CHECK3-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
3073 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
3074 // CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
3075 // CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
3076 // CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
3077 // CHECK3-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
3078 // CHECK3-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
3079 // CHECK3-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4
3080 // CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_kernel_environment, ptr [[DYN_PTR]])
3081 // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
3082 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
3083 // CHECK3: user_code.entry:
3084 // CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
3085 // CHECK3-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
3086 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4
3087 // CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[A_ADDR]], ptr [[B_ADDR]]) #[[ATTR4]]
3088 // CHECK3-NEXT: call void @__kmpc_target_deinit()
3089 // CHECK3-NEXT: ret void
3090 // CHECK3: worker.exit:
3091 // CHECK3-NEXT: ret void
3094 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined
3095 // CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
3096 // CHECK3-NEXT: entry:
3097 // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
3098 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
3099 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
3100 // CHECK3-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4
3101 // CHECK3-NEXT: [[A1:%.*]] = alloca i32, align 4
3102 // CHECK3-NEXT: [[B2:%.*]] = alloca i16, align 2
3103 // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x ptr], align 4
3104 // CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 4
3105 // CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
3106 // CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
3107 // CHECK3-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
3108 // CHECK3-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4
3109 // CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
3110 // CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 4
3111 // CHECK3-NEXT: store i32 0, ptr [[A1]], align 4
3112 // CHECK3-NEXT: store i16 -32768, ptr [[B2]], align 2
3113 // CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
3114 // CHECK3-NEXT: store ptr [[A1]], ptr [[TMP2]], align 4
3115 // CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
3116 // CHECK3-NEXT: store ptr [[B2]], ptr [[TMP3]], align 4
3117 // CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
3118 // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
3119 // CHECK3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 2)
3120 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
3121 // CHECK3-NEXT: store ptr [[A1]], ptr [[TMP6]], align 4
3122 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
3123 // CHECK3-NEXT: store ptr [[B2]], ptr [[TMP7]], align 4
3124 // CHECK3-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
3125 // CHECK3-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
3126 // CHECK3-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
3127 // CHECK3-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
3128 // CHECK3: .omp.reduction.then:
3129 // CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP0]], align 4
3130 // CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[A1]], align 4
3131 // CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP10]], [[TMP11]]
3132 // CHECK3-NEXT: store i32 [[OR]], ptr [[TMP0]], align 4
3133 // CHECK3-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP1]], align 2
3134 // CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP12]] to i32
3135 // CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[B2]], align 2
3136 // CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP13]] to i32
3137 // CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
3138 // CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
3139 // CHECK3: cond.true:
3140 // CHECK3-NEXT: [[TMP14:%.*]] = load i16, ptr [[TMP1]], align 2
3141 // CHECK3-NEXT: br label [[COND_END:%.*]]
3142 // CHECK3: cond.false:
3143 // CHECK3-NEXT: [[TMP15:%.*]] = load i16, ptr [[B2]], align 2
3144 // CHECK3-NEXT: br label [[COND_END]]
3145 // CHECK3: cond.end:
3146 // CHECK3-NEXT: [[COND:%.*]] = phi i16 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
3147 // CHECK3-NEXT: store i16 [[COND]], ptr [[TMP1]], align 2
3148 // CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
3149 // CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
3150 // CHECK3: .omp.reduction.done:
3151 // CHECK3-NEXT: ret void
3154 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined_omp_outlined
3155 // CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
3156 // CHECK3-NEXT: entry:
3157 // CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
3158 // CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
3159 // CHECK3-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
3160 // CHECK3-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4
3161 // CHECK3-NEXT: [[A1:%.*]] = alloca i32, align 4
3162 // CHECK3-NEXT: [[B2:%.*]] = alloca i16, align 2
3163 // CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 4
3164 // CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
3165 // CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
3166 // CHECK3-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
3167 // CHECK3-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4
3168 // CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
3169 // CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 4
3170 // CHECK3-NEXT: store i32 0, ptr [[A1]], align 4
3171 // CHECK3-NEXT: store i16 -32768, ptr [[B2]], align 2
3172 // CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[A1]], align 4
3173 // CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1
3174 // CHECK3-NEXT: store i32 [[OR]], ptr [[A1]], align 4
3175 // CHECK3-NEXT: [[TMP3:%.*]] = load i16, ptr [[B2]], align 2
3176 // CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32
3177 // CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
3178 // CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
3179 // CHECK3: cond.true:
3180 // CHECK3-NEXT: br label [[COND_END:%.*]]
3181 // CHECK3: cond.false:
3182 // CHECK3-NEXT: [[TMP4:%.*]] = load i16, ptr [[B2]], align 2
3183 // CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
3184 // CHECK3-NEXT: br label [[COND_END]]
3185 // CHECK3: cond.end:
3186 // CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
3187 // CHECK3-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
3188 // CHECK3-NEXT: store i16 [[CONV4]], ptr [[B2]], align 2
3189 // CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
3190 // CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
3191 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
3192 // CHECK3-NEXT: store ptr [[A1]], ptr [[TMP7]], align 4
3193 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
3194 // CHECK3-NEXT: store ptr [[B2]], ptr [[TMP8]], align 4
3195 // CHECK3-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP6]], i32 2, i32 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func7, ptr @_omp_reduction_inter_warp_copy_func8)
3196 // CHECK3-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
3197 // CHECK3-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
3198 // CHECK3: .omp.reduction.then:
3199 // CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP0]], align 4
3200 // CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[A1]], align 4
3201 // CHECK3-NEXT: [[OR5:%.*]] = or i32 [[TMP11]], [[TMP12]]
3202 // CHECK3-NEXT: store i32 [[OR5]], ptr [[TMP0]], align 4
3203 // CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP1]], align 2
3204 // CHECK3-NEXT: [[CONV6:%.*]] = sext i16 [[TMP13]] to i32
3205 // CHECK3-NEXT: [[TMP14:%.*]] = load i16, ptr [[B2]], align 2
3206 // CHECK3-NEXT: [[CONV7:%.*]] = sext i16 [[TMP14]] to i32
3207 // CHECK3-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
3208 // CHECK3-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
3209 // CHECK3: cond.true9:
3210 // CHECK3-NEXT: [[TMP15:%.*]] = load i16, ptr [[TMP1]], align 2
3211 // CHECK3-NEXT: br label [[COND_END11:%.*]]
3212 // CHECK3: cond.false10:
3213 // CHECK3-NEXT: [[TMP16:%.*]] = load i16, ptr [[B2]], align 2
3214 // CHECK3-NEXT: br label [[COND_END11]]
3215 // CHECK3: cond.end11:
3216 // CHECK3-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE9]] ], [ [[TMP16]], [[COND_FALSE10]] ]
3217 // CHECK3-NEXT: store i16 [[COND12]], ptr [[TMP1]], align 2
3218 // CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
3219 // CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
3220 // CHECK3: .omp.reduction.done:
3221 // CHECK3-NEXT: ret void
3224 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func7
3225 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3]] {
3226 // CHECK3-NEXT: entry:
3227 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
3228 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
3229 // CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
3230 // CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
3231 // CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x ptr], align 4
3232 // CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
3233 // CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
3234 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
3235 // CHECK3-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
3236 // CHECK3-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2
3237 // CHECK3-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2
3238 // CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
3239 // CHECK3-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2
3240 // CHECK3-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2
3241 // CHECK3-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2
3242 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 0
3243 // CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 4
3244 // CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
3245 // CHECK3-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP9]], i32 1
3246 // CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP9]], align 4
3247 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_get_warp_size()
3248 // CHECK3-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i16
3249 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP12]], i16 [[TMP6]], i16 [[TMP14]])
3250 // CHECK3-NEXT: store i32 [[TMP15]], ptr [[DOTOMP_REDUCTION_ELEMENT]], align 4
3251 // CHECK3-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP9]], i32 1
3252 // CHECK3-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[DOTOMP_REDUCTION_ELEMENT]], i32 1
3253 // CHECK3-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 4
3254 // CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 1
3255 // CHECK3-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP18]], align 4
3256 // CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
3257 // CHECK3-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[TMP19]], i32 1
3258 // CHECK3-NEXT: [[TMP22:%.*]] = load i16, ptr [[TMP19]], align 2
3259 // CHECK3-NEXT: [[TMP23:%.*]] = sext i16 [[TMP22]] to i32
3260 // CHECK3-NEXT: [[TMP24:%.*]] = call i32 @__kmpc_get_warp_size()
3261 // CHECK3-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
3262 // CHECK3-NEXT: [[TMP26:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP23]], i16 [[TMP6]], i16 [[TMP25]])
3263 // CHECK3-NEXT: [[TMP27:%.*]] = trunc i32 [[TMP26]] to i16
3264 // CHECK3-NEXT: store i16 [[TMP27]], ptr [[DOTOMP_REDUCTION_ELEMENT4]], align 2
3265 // CHECK3-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[TMP19]], i32 1
3266 // CHECK3-NEXT: [[TMP29:%.*]] = getelementptr i16, ptr [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
3267 // CHECK3-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT4]], ptr [[TMP20]], align 4
3268 // CHECK3-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP7]], 0
3269 // CHECK3-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP7]], 1
3270 // CHECK3-NEXT: [[TMP32:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
3271 // CHECK3-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
3272 // CHECK3-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP7]], 2
3273 // CHECK3-NEXT: [[TMP35:%.*]] = and i16 [[TMP5]], 1
3274 // CHECK3-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP35]], 0
3275 // CHECK3-NEXT: [[TMP37:%.*]] = and i1 [[TMP34]], [[TMP36]]
3276 // CHECK3-NEXT: [[TMP38:%.*]] = icmp sgt i16 [[TMP6]], 0
3277 // CHECK3-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
3278 // CHECK3-NEXT: [[TMP40:%.*]] = or i1 [[TMP30]], [[TMP33]]
3279 // CHECK3-NEXT: [[TMP41:%.*]] = or i1 [[TMP40]], [[TMP39]]
3280 // CHECK3-NEXT: br i1 [[TMP41]], label [[THEN:%.*]], label [[ELSE:%.*]]
3282 // CHECK3-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]]) #[[ATTR4]]
3283 // CHECK3-NEXT: br label [[IFCONT:%.*]]
3285 // CHECK3-NEXT: br label [[IFCONT]]
3287 // CHECK3-NEXT: [[TMP42:%.*]] = icmp eq i16 [[TMP7]], 1
3288 // CHECK3-NEXT: [[TMP43:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
3289 // CHECK3-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
3290 // CHECK3-NEXT: br i1 [[TMP44]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
3292 // CHECK3-NEXT: [[TMP45:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
3293 // CHECK3-NEXT: [[TMP46:%.*]] = load ptr, ptr [[TMP45]], align 4
3294 // CHECK3-NEXT: [[TMP47:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 0
3295 // CHECK3-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP47]], align 4
3296 // CHECK3-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP46]], align 4
3297 // CHECK3-NEXT: store i32 [[TMP49]], ptr [[TMP48]], align 4
3298 // CHECK3-NEXT: [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
3299 // CHECK3-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP50]], align 4
3300 // CHECK3-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 1
3301 // CHECK3-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP52]], align 4
3302 // CHECK3-NEXT: [[TMP54:%.*]] = load i16, ptr [[TMP51]], align 2
3303 // CHECK3-NEXT: store i16 [[TMP54]], ptr [[TMP53]], align 2
3304 // CHECK3-NEXT: br label [[IFCONT7:%.*]]
3306 // CHECK3-NEXT: br label [[IFCONT7]]
3308 // CHECK3-NEXT: ret void
3311 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func8
3312 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
3313 // CHECK3-NEXT: entry:
3314 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
3315 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
3316 // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
3317 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
3318 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
3319 // CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
3320 // CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
3321 // CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
3322 // CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
3323 // CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
3324 // CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 4
3325 // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
3326 // CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
3327 // CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
3329 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0
3330 // CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 4
3331 // CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
3332 // CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 4
3333 // CHECK3-NEXT: store volatile i32 [[TMP10]], ptr addrspace(3) [[TMP9]], align 4
3334 // CHECK3-NEXT: br label [[IFCONT:%.*]]
3336 // CHECK3-NEXT: br label [[IFCONT]]
3338 // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
3339 // CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4
3340 // CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]]
3341 // CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
3343 // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
3344 // CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0
3345 // CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4
3346 // CHECK3-NEXT: [[TMP15:%.*]] = load volatile i32, ptr addrspace(3) [[TMP12]], align 4
3347 // CHECK3-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4
3348 // CHECK3-NEXT: br label [[IFCONT4:%.*]]
3350 // CHECK3-NEXT: br label [[IFCONT4]]
3352 // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
3353 // CHECK3-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
3354 // CHECK3-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
3356 // CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1
3357 // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4
3358 // CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
3359 // CHECK3-NEXT: [[TMP19:%.*]] = load i16, ptr [[TMP17]], align 2
3360 // CHECK3-NEXT: store volatile i16 [[TMP19]], ptr addrspace(3) [[TMP18]], align 2
3361 // CHECK3-NEXT: br label [[IFCONT8:%.*]]
3363 // CHECK3-NEXT: br label [[IFCONT8]]
3365 // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
3366 // CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4
3367 // CHECK3-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]]
3368 // CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
3370 // CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
3371 // CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1
3372 // CHECK3-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 4
3373 // CHECK3-NEXT: [[TMP24:%.*]] = load volatile i16, ptr addrspace(3) [[TMP21]], align 2
3374 // CHECK3-NEXT: store i16 [[TMP24]], ptr [[TMP23]], align 2
3375 // CHECK3-NEXT: br label [[IFCONT12:%.*]]
3377 // CHECK3-NEXT: br label [[IFCONT12]]
3378 // CHECK3: ifcont12:
3379 // CHECK3-NEXT: ret void
3382 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func9
3383 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3]] {
3384 // CHECK3-NEXT: entry:
3385 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
3386 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2
3387 // CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2
3388 // CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2
3389 // CHECK3-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x ptr], align 4
3390 // CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
3391 // CHECK3-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
3392 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
3393 // CHECK3-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
3394 // CHECK3-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2]], align 2
3395 // CHECK3-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3]], align 2
3396 // CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
3397 // CHECK3-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2
3398 // CHECK3-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2
3399 // CHECK3-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2
3400 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 0
3401 // CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 4
3402 // CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
3403 // CHECK3-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP9]], i32 1
3404 // CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP9]], align 4
3405 // CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_get_warp_size()
3406 // CHECK3-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i16
3407 // CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP12]], i16 [[TMP6]], i16 [[TMP14]])
3408 // CHECK3-NEXT: store i32 [[TMP15]], ptr [[DOTOMP_REDUCTION_ELEMENT]], align 4
3409 // CHECK3-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP9]], i32 1
3410 // CHECK3-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[DOTOMP_REDUCTION_ELEMENT]], i32 1
3411 // CHECK3-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 4
3412 // CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 1
3413 // CHECK3-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP18]], align 4
3414 // CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
3415 // CHECK3-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[TMP19]], i32 1
3416 // CHECK3-NEXT: [[TMP22:%.*]] = load i16, ptr [[TMP19]], align 2
3417 // CHECK3-NEXT: [[TMP23:%.*]] = sext i16 [[TMP22]] to i32
3418 // CHECK3-NEXT: [[TMP24:%.*]] = call i32 @__kmpc_get_warp_size()
3419 // CHECK3-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
3420 // CHECK3-NEXT: [[TMP26:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP23]], i16 [[TMP6]], i16 [[TMP25]])
3421 // CHECK3-NEXT: [[TMP27:%.*]] = trunc i32 [[TMP26]] to i16
3422 // CHECK3-NEXT: store i16 [[TMP27]], ptr [[DOTOMP_REDUCTION_ELEMENT4]], align 2
3423 // CHECK3-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[TMP19]], i32 1
3424 // CHECK3-NEXT: [[TMP29:%.*]] = getelementptr i16, ptr [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
3425 // CHECK3-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT4]], ptr [[TMP20]], align 4
3426 // CHECK3-NEXT: [[TMP30:%.*]] = icmp eq i16 [[TMP7]], 0
3427 // CHECK3-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP7]], 1
3428 // CHECK3-NEXT: [[TMP32:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
3429 // CHECK3-NEXT: [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
3430 // CHECK3-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP7]], 2
3431 // CHECK3-NEXT: [[TMP35:%.*]] = and i16 [[TMP5]], 1
3432 // CHECK3-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP35]], 0
3433 // CHECK3-NEXT: [[TMP37:%.*]] = and i1 [[TMP34]], [[TMP36]]
3434 // CHECK3-NEXT: [[TMP38:%.*]] = icmp sgt i16 [[TMP6]], 0
3435 // CHECK3-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
3436 // CHECK3-NEXT: [[TMP40:%.*]] = or i1 [[TMP30]], [[TMP33]]
3437 // CHECK3-NEXT: [[TMP41:%.*]] = or i1 [[TMP40]], [[TMP39]]
3438 // CHECK3-NEXT: br i1 [[TMP41]], label [[THEN:%.*]], label [[ELSE:%.*]]
3440 // CHECK3-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]]) #[[ATTR4]]
3441 // CHECK3-NEXT: br label [[IFCONT:%.*]]
3443 // CHECK3-NEXT: br label [[IFCONT]]
3445 // CHECK3-NEXT: [[TMP42:%.*]] = icmp eq i16 [[TMP7]], 1
3446 // CHECK3-NEXT: [[TMP43:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
3447 // CHECK3-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
3448 // CHECK3-NEXT: br i1 [[TMP44]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
3450 // CHECK3-NEXT: [[TMP45:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
3451 // CHECK3-NEXT: [[TMP46:%.*]] = load ptr, ptr [[TMP45]], align 4
3452 // CHECK3-NEXT: [[TMP47:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 0
3453 // CHECK3-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP47]], align 4
3454 // CHECK3-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP46]], align 4
3455 // CHECK3-NEXT: store i32 [[TMP49]], ptr [[TMP48]], align 4
3456 // CHECK3-NEXT: [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
3457 // CHECK3-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP50]], align 4
3458 // CHECK3-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i32 0, i32 1
3459 // CHECK3-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP52]], align 4
3460 // CHECK3-NEXT: [[TMP54:%.*]] = load i16, ptr [[TMP51]], align 2
3461 // CHECK3-NEXT: store i16 [[TMP54]], ptr [[TMP53]], align 2
3462 // CHECK3-NEXT: br label [[IFCONT7:%.*]]
3464 // CHECK3-NEXT: br label [[IFCONT7]]
3466 // CHECK3-NEXT: ret void
3469 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func10
3470 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
3471 // CHECK3-NEXT: entry:
3472 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
3473 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
3474 // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
3475 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
3476 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
3477 // CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
3478 // CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
3479 // CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
3480 // CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
3481 // CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
3482 // CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 4
3483 // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
3484 // CHECK3-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
3485 // CHECK3-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
3487 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0
3488 // CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 4
3489 // CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
3490 // CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 4
3491 // CHECK3-NEXT: store volatile i32 [[TMP10]], ptr addrspace(3) [[TMP9]], align 4
3492 // CHECK3-NEXT: br label [[IFCONT:%.*]]
3494 // CHECK3-NEXT: br label [[IFCONT]]
3496 // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
3497 // CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4
3498 // CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]]
3499 // CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
3501 // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
3502 // CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 0
3503 // CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4
3504 // CHECK3-NEXT: [[TMP15:%.*]] = load volatile i32, ptr addrspace(3) [[TMP12]], align 4
3505 // CHECK3-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4
3506 // CHECK3-NEXT: br label [[IFCONT4:%.*]]
3508 // CHECK3-NEXT: br label [[IFCONT4]]
3510 // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
3511 // CHECK3-NEXT: [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
3512 // CHECK3-NEXT: br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
3514 // CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1
3515 // CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 4
3516 // CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
3517 // CHECK3-NEXT: [[TMP19:%.*]] = load i16, ptr [[TMP17]], align 2
3518 // CHECK3-NEXT: store volatile i16 [[TMP19]], ptr addrspace(3) [[TMP18]], align 2
3519 // CHECK3-NEXT: br label [[IFCONT8:%.*]]
3521 // CHECK3-NEXT: br label [[IFCONT8]]
3523 // CHECK3-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
3524 // CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4
3525 // CHECK3-NEXT: [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]]
3526 // CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
3528 // CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
3529 // CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i32 0, i32 1
3530 // CHECK3-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 4
3531 // CHECK3-NEXT: [[TMP24:%.*]] = load volatile i16, ptr addrspace(3) [[TMP21]], align 2
3532 // CHECK3-NEXT: store i16 [[TMP24]], ptr [[TMP23]], align 2
3533 // CHECK3-NEXT: br label [[IFCONT12:%.*]]
3535 // CHECK3-NEXT: br label [[IFCONT12]]
3536 // CHECK3: ifcont12:
3537 // CHECK3-NEXT: ret void
3540 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func11
3541 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
3542 // CHECK3-NEXT: entry:
3543 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
3544 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
3545 // CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
3546 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
3547 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
3548 // CHECK3-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
3549 // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
3550 // CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
3551 // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
3552 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
3553 // CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
3554 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
3555 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
3556 // CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
3557 // CHECK3-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 128
3558 // CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
3559 // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
3560 // CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
3561 // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
3562 // CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
3563 // CHECK3-NEXT: store i16 [[TMP13]], ptr [[TMP12]], align 128
3564 // CHECK3-NEXT: ret void
3567 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func12
3568 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
3569 // CHECK3-NEXT: entry:
3570 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
3571 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
3572 // CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
3573 // CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 4
3574 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
3575 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
3576 // CHECK3-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
3577 // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
3578 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
3579 // CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
3580 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
3581 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
3582 // CHECK3-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
3583 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
3584 // CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
3585 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
3586 // CHECK3-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4
3587 // CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
3588 // CHECK3-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined_omp$reduction$reduction_func"(ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr [[TMP9]]) #[[ATTR4]]
3589 // CHECK3-NEXT: ret void
3592 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func13
3593 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
3594 // CHECK3-NEXT: entry:
3595 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
3596 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
3597 // CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
3598 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
3599 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
3600 // CHECK3-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
3601 // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
3602 // CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
3603 // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
3604 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
3605 // CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
3606 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
3607 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
3608 // CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 128
3609 // CHECK3-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 4
3610 // CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
3611 // CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
3612 // CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
3613 // CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
3614 // CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 128
3615 // CHECK3-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 2
3616 // CHECK3-NEXT: ret void
3619 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func14
3620 // CHECK3-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
3621 // CHECK3-NEXT: entry:
3622 // CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
3623 // CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
3624 // CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
3625 // CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 4
3626 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
3627 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
3628 // CHECK3-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
3629 // CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
3630 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
3631 // CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
3632 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
3633 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
3634 // CHECK3-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
3635 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
3636 // CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
3637 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
3638 // CHECK3-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4
3639 // CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
3640 // CHECK3-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP9]], ptr [[DOTOMP_REDUCTION_RED_LIST]]) #[[ATTR4]]
3641 // CHECK3-NEXT: ret void