// Provenance (gitweb export, not part of the original test file):
//   llvm-project.git / clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
//   blob f9aef3acb1c611e9be9db1ebc25d9ad2c6df4d73
//   commit title: "Run DCE after a LoopFlatten test to reduce spurious output [nfc]"
1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
2 // REQUIRES: amdgpu-registered-target
4 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
5 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=IR-GPU
7 // RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -emit-llvm %s -o - | FileCheck %s --check-prefix=IR
9 // Check same results after serialization round-trip
10 // RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -emit-pch -o %t %s
11 // RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR-PCH
13 // expected-no-diagnostics
15 #ifndef HEADER
16 #define HEADER
17 int foo() { // Test fixture: codegen for '#pragma omp target teams loop' with collapse(2), reduction, lastprivate.
18 int i;
19 int j; // lastprivate(j) below: final value of j must be written back after the loop (checked at blob lines 145-147).
20 int sum[10][10]; // reduction(+:sum) target; 400-byte array, see dereferenceable(400) in the generated checks.
22 #pragma omp target teams loop reduction(+:sum) collapse(2) \
23 bind(parallel) order(concurrent) lastprivate(j) map(tofrom:sum)
24 for(i=0; i<10; i++)
25 for(j=0; j<10; j++)
26 sum[i][j] += i; // collapsed 10x10 space -> 100 iterations (matches the '99' upper-bound constants in the checks).
// NOTE(review): the directive must stay on blob line 22 — the autogenerated checks match the
// mangled offload entry name '__Z3foov_l22', which encodes this source line. Do not insert or
// delete lines above the pragma without regenerating checks via update_cc_test_checks.py.
28 return 0;
// NOTE(review): the closing '}' of foo() (blob line 29) appears to have been lost in this
// export — confirm against the upstream file before treating this excerpt as compilable.
30 #endif
31 // IR-PCH-HOST-LABEL: define {{[^@]+}}@_Z3foov
32 // IR-PCH-HOST-SAME: () #[[ATTR0:[0-9]+]] {
33 // IR-PCH-HOST-NEXT: entry:
34 // IR-PCH-HOST-NEXT: [[I:%.*]] = alloca i32, align 4
35 // IR-PCH-HOST-NEXT: [[J:%.*]] = alloca i32, align 4
36 // IR-PCH-HOST-NEXT: [[SUM:%.*]] = alloca [10 x [10 x i32]], align 16
37 // IR-PCH-HOST-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8
38 // IR-PCH-HOST-NEXT: [[TMP0:%.*]] = load i32, ptr [[J]], align 4
39 // IR-PCH-HOST-NEXT: store i32 [[TMP0]], ptr [[J_CASTED]], align 4
40 // IR-PCH-HOST-NEXT: [[TMP1:%.*]] = load i64, ptr [[J_CASTED]], align 8
41 // IR-PCH-HOST-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22(i64 [[TMP1]], ptr [[SUM]]) #[[ATTR2:[0-9]+]]
42 // IR-PCH-HOST-NEXT: ret i32 0
43 // IR-PCH-HOST-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22
44 // IR-PCH-HOST-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
45 // IR-PCH-HOST-NEXT: entry:
46 // IR-PCH-HOST-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8
47 // IR-PCH-HOST-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
48 // IR-PCH-HOST-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8
49 // IR-PCH-HOST-NEXT: store i64 [[J]], ptr [[J_ADDR]], align 8
50 // IR-PCH-HOST-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
51 // IR-PCH-HOST-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
52 // IR-PCH-HOST-NEXT: [[TMP1:%.*]] = load i32, ptr [[J_ADDR]], align 4
53 // IR-PCH-HOST-NEXT: store i32 [[TMP1]], ptr [[J_CASTED]], align 4
54 // IR-PCH-HOST-NEXT: [[TMP2:%.*]] = load i64, ptr [[J_CASTED]], align 8
55 // IR-PCH-HOST-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4:[0-9]+]], i32 2, ptr @.omp_outlined., i64 [[TMP2]], ptr [[TMP0]])
56 // IR-PCH-HOST-NEXT: ret void
57 // IR-PCH-HOST-LABEL: define {{[^@]+}}@.omp_outlined.
58 // IR-PCH-HOST-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
59 // IR-PCH-HOST-NEXT: entry:
60 // IR-PCH-HOST-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
61 // IR-PCH-HOST-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
62 // IR-PCH-HOST-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8
63 // IR-PCH-HOST-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
64 // IR-PCH-HOST-NEXT: [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 16
65 // IR-PCH-HOST-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
66 // IR-PCH-HOST-NEXT: [[TMP:%.*]] = alloca i32, align 4
67 // IR-PCH-HOST-NEXT: [[_TMP2:%.*]] = alloca i32, align 4
68 // IR-PCH-HOST-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
69 // IR-PCH-HOST-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
70 // IR-PCH-HOST-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
71 // IR-PCH-HOST-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
72 // IR-PCH-HOST-NEXT: [[J3:%.*]] = alloca i32, align 4
73 // IR-PCH-HOST-NEXT: [[I:%.*]] = alloca i32, align 4
74 // IR-PCH-HOST-NEXT: [[J4:%.*]] = alloca i32, align 4
75 // IR-PCH-HOST-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8
76 // IR-PCH-HOST-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
77 // IR-PCH-HOST-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
78 // IR-PCH-HOST-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
79 // IR-PCH-HOST-NEXT: store i64 [[J]], ptr [[J_ADDR]], align 8
80 // IR-PCH-HOST-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
81 // IR-PCH-HOST-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
82 // IR-PCH-HOST-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1]], i32 0, i32 0, i32 0
83 // IR-PCH-HOST-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
84 // IR-PCH-HOST-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
85 // IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
86 // IR-PCH-HOST: omp.arrayinit.body:
87 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
88 // IR-PCH-HOST-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
89 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
90 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
91 // IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
92 // IR-PCH-HOST: omp.arrayinit.done:
93 // IR-PCH-HOST-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
94 // IR-PCH-HOST-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
95 // IR-PCH-HOST-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
96 // IR-PCH-HOST-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
97 // IR-PCH-HOST-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
98 // IR-PCH-HOST-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
99 // IR-PCH-HOST-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
100 // IR-PCH-HOST-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
101 // IR-PCH-HOST-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
102 // IR-PCH-HOST-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
103 // IR-PCH-HOST: cond.true:
104 // IR-PCH-HOST-NEXT: br label [[COND_END:%.*]]
105 // IR-PCH-HOST: cond.false:
106 // IR-PCH-HOST-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
107 // IR-PCH-HOST-NEXT: br label [[COND_END]]
108 // IR-PCH-HOST: cond.end:
109 // IR-PCH-HOST-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
110 // IR-PCH-HOST-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
111 // IR-PCH-HOST-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
112 // IR-PCH-HOST-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
113 // IR-PCH-HOST-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
114 // IR-PCH-HOST: omp.inner.for.cond:
115 // IR-PCH-HOST-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
116 // IR-PCH-HOST-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
117 // IR-PCH-HOST-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
118 // IR-PCH-HOST-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
119 // IR-PCH-HOST: omp.inner.for.body:
120 // IR-PCH-HOST-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
121 // IR-PCH-HOST-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
122 // IR-PCH-HOST-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
123 // IR-PCH-HOST-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
124 // IR-PCH-HOST-NEXT: [[TMP13:%.*]] = load i32, ptr [[J3]], align 4
125 // IR-PCH-HOST-NEXT: store i32 [[TMP13]], ptr [[J_CASTED]], align 4
126 // IR-PCH-HOST-NEXT: [[TMP14:%.*]] = load i64, ptr [[J_CASTED]], align 8
127 // IR-PCH-HOST-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 4, ptr @.omp_outlined..1, i64 [[TMP10]], i64 [[TMP12]], i64 [[TMP14]], ptr [[SUM1]])
128 // IR-PCH-HOST-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
129 // IR-PCH-HOST: omp.inner.for.inc:
130 // IR-PCH-HOST-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
131 // IR-PCH-HOST-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
132 // IR-PCH-HOST-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
133 // IR-PCH-HOST-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
134 // IR-PCH-HOST-NEXT: br label [[OMP_INNER_FOR_COND]]
135 // IR-PCH-HOST: omp.inner.for.end:
136 // IR-PCH-HOST-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
137 // IR-PCH-HOST: omp.loop.exit:
138 // IR-PCH-HOST-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
139 // IR-PCH-HOST-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
140 // IR-PCH-HOST-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP18]])
141 // IR-PCH-HOST-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
142 // IR-PCH-HOST-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
143 // IR-PCH-HOST-NEXT: br i1 [[TMP20]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
144 // IR-PCH-HOST: .omp.lastprivate.then:
145 // IR-PCH-HOST-NEXT: store i32 10, ptr [[J3]], align 4
146 // IR-PCH-HOST-NEXT: [[TMP21:%.*]] = load i32, ptr [[J3]], align 4
147 // IR-PCH-HOST-NEXT: store i32 [[TMP21]], ptr [[J_ADDR]], align 4
148 // IR-PCH-HOST-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
149 // IR-PCH-HOST: .omp.lastprivate.done:
150 // IR-PCH-HOST-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
151 // IR-PCH-HOST-NEXT: store ptr [[SUM1]], ptr [[TMP22]], align 8
152 // IR-PCH-HOST-NEXT: [[TMP23:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
153 // IR-PCH-HOST-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
154 // IR-PCH-HOST-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3:[0-9]+]], i32 [[TMP24]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @.omp.reduction.reduction_func.2, ptr @.gomp_critical_user_.reduction.var)
155 // IR-PCH-HOST-NEXT: switch i32 [[TMP25]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
156 // IR-PCH-HOST-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
157 // IR-PCH-HOST-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
158 // IR-PCH-HOST-NEXT: ]
159 // IR-PCH-HOST: .omp.reduction.case1:
160 // IR-PCH-HOST-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
161 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP26]]
162 // IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE10:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
163 // IR-PCH-HOST: omp.arraycpy.body:
164 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
165 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST6:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT8:%.*]], [[OMP_ARRAYCPY_BODY]] ]
166 // IR-PCH-HOST-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
167 // IR-PCH-HOST-NEXT: [[TMP28:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
168 // IR-PCH-HOST-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
169 // IR-PCH-HOST-NEXT: store i32 [[ADD7]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
170 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT8]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], i32 1
171 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
172 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DONE9:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT8]], [[TMP26]]
173 // IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_DONE9]], label [[OMP_ARRAYCPY_DONE10]], label [[OMP_ARRAYCPY_BODY]]
174 // IR-PCH-HOST: omp.arraycpy.done10:
175 // IR-PCH-HOST-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
176 // IR-PCH-HOST-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
177 // IR-PCH-HOST: .omp.reduction.case2:
178 // IR-PCH-HOST-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
179 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_ISEMPTY11:%.*]] = icmp eq ptr [[TMP0]], [[TMP29]]
180 // IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY11]], label [[OMP_ARRAYCPY_DONE18:%.*]], label [[OMP_ARRAYCPY_BODY12:%.*]]
181 // IR-PCH-HOST: omp.arraycpy.body12:
182 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST13:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT16:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
183 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST14:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
184 // IR-PCH-HOST-NEXT: [[TMP30:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], align 4
185 // IR-PCH-HOST-NEXT: [[TMP31:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 [[TMP30]] monotonic, align 4
186 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 1
187 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT16]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], i32 1
188 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DONE17:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP29]]
189 // IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_BODY12]]
190 // IR-PCH-HOST: omp.arraycpy.done18:
191 // IR-PCH-HOST-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
192 // IR-PCH-HOST-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
193 // IR-PCH-HOST: .omp.reduction.default:
194 // IR-PCH-HOST-NEXT: ret void
195 // IR-PCH-HOST-LABEL: define {{[^@]+}}@.omp_outlined..1
196 // IR-PCH-HOST-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
197 // IR-PCH-HOST-NEXT: entry:
198 // IR-PCH-HOST-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
199 // IR-PCH-HOST-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
200 // IR-PCH-HOST-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
201 // IR-PCH-HOST-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
202 // IR-PCH-HOST-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8
203 // IR-PCH-HOST-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
204 // IR-PCH-HOST-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
205 // IR-PCH-HOST-NEXT: [[TMP:%.*]] = alloca i32, align 4
206 // IR-PCH-HOST-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
207 // IR-PCH-HOST-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
208 // IR-PCH-HOST-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
209 // IR-PCH-HOST-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
210 // IR-PCH-HOST-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
211 // IR-PCH-HOST-NEXT: [[J3:%.*]] = alloca i32, align 4
212 // IR-PCH-HOST-NEXT: [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 16
213 // IR-PCH-HOST-NEXT: [[I:%.*]] = alloca i32, align 4
214 // IR-PCH-HOST-NEXT: [[J5:%.*]] = alloca i32, align 4
215 // IR-PCH-HOST-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
216 // IR-PCH-HOST-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
217 // IR-PCH-HOST-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
218 // IR-PCH-HOST-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
219 // IR-PCH-HOST-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
220 // IR-PCH-HOST-NEXT: store i64 [[J]], ptr [[J_ADDR]], align 8
221 // IR-PCH-HOST-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
222 // IR-PCH-HOST-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
223 // IR-PCH-HOST-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
224 // IR-PCH-HOST-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
225 // IR-PCH-HOST-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
226 // IR-PCH-HOST-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
227 // IR-PCH-HOST-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
228 // IR-PCH-HOST-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
229 // IR-PCH-HOST-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
230 // IR-PCH-HOST-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
231 // IR-PCH-HOST-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
232 // IR-PCH-HOST-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
233 // IR-PCH-HOST-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i32 0, i32 0, i32 0
234 // IR-PCH-HOST-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
235 // IR-PCH-HOST-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
236 // IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
237 // IR-PCH-HOST: omp.arrayinit.body:
238 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
239 // IR-PCH-HOST-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
240 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
241 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
242 // IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
243 // IR-PCH-HOST: omp.arrayinit.done:
244 // IR-PCH-HOST-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
245 // IR-PCH-HOST-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
246 // IR-PCH-HOST-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP5]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
247 // IR-PCH-HOST-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
248 // IR-PCH-HOST-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP6]], 99
249 // IR-PCH-HOST-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
250 // IR-PCH-HOST: cond.true:
251 // IR-PCH-HOST-NEXT: br label [[COND_END:%.*]]
252 // IR-PCH-HOST: cond.false:
253 // IR-PCH-HOST-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
254 // IR-PCH-HOST-NEXT: br label [[COND_END]]
255 // IR-PCH-HOST: cond.end:
256 // IR-PCH-HOST-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP7]], [[COND_FALSE]] ]
257 // IR-PCH-HOST-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
258 // IR-PCH-HOST-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
259 // IR-PCH-HOST-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV]], align 4
260 // IR-PCH-HOST-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
261 // IR-PCH-HOST: omp.inner.for.cond:
262 // IR-PCH-HOST-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]]
263 // IR-PCH-HOST-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP3]]
264 // IR-PCH-HOST-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]]
265 // IR-PCH-HOST-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
266 // IR-PCH-HOST: omp.inner.for.body:
267 // IR-PCH-HOST-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
268 // IR-PCH-HOST-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP11]], 10
269 // IR-PCH-HOST-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
270 // IR-PCH-HOST-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
271 // IR-PCH-HOST-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
272 // IR-PCH-HOST-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
273 // IR-PCH-HOST-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
274 // IR-PCH-HOST-NEXT: [[DIV7:%.*]] = sdiv i32 [[TMP13]], 10
275 // IR-PCH-HOST-NEXT: [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
276 // IR-PCH-HOST-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP12]], [[MUL8]]
277 // IR-PCH-HOST-NEXT: [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
278 // IR-PCH-HOST-NEXT: [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
279 // IR-PCH-HOST-NEXT: store i32 [[ADD10]], ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
280 // IR-PCH-HOST-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
281 // IR-PCH-HOST-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
282 // IR-PCH-HOST-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
283 // IR-PCH-HOST-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i64 0, i64 [[IDXPROM]]
284 // IR-PCH-HOST-NEXT: [[TMP16:%.*]] = load i32, ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
285 // IR-PCH-HOST-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP16]] to i64
286 // IR-PCH-HOST-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
287 // IR-PCH-HOST-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
288 // IR-PCH-HOST-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP17]], [[TMP14]]
289 // IR-PCH-HOST-NEXT: store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
290 // IR-PCH-HOST-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
291 // IR-PCH-HOST: omp.body.continue:
292 // IR-PCH-HOST-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
293 // IR-PCH-HOST: omp.inner.for.inc:
294 // IR-PCH-HOST-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
295 // IR-PCH-HOST-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP18]], 1
296 // IR-PCH-HOST-NEXT: store i32 [[ADD14]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
297 // IR-PCH-HOST-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
298 // IR-PCH-HOST: omp.inner.for.end:
299 // IR-PCH-HOST-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
300 // IR-PCH-HOST: omp.loop.exit:
301 // IR-PCH-HOST-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
302 // IR-PCH-HOST-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
303 // IR-PCH-HOST-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
304 // IR-PCH-HOST-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
305 // IR-PCH-HOST-NEXT: store ptr [[SUM4]], ptr [[TMP21]], align 8
306 // IR-PCH-HOST-NEXT: [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
307 // IR-PCH-HOST-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
308 // IR-PCH-HOST-NEXT: [[TMP24:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP23]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
309 // IR-PCH-HOST-NEXT: switch i32 [[TMP24]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
310 // IR-PCH-HOST-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
311 // IR-PCH-HOST-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
312 // IR-PCH-HOST-NEXT: ]
313 // IR-PCH-HOST: .omp.reduction.case1:
314 // IR-PCH-HOST-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
315 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
316 // IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
317 // IR-PCH-HOST: omp.arraycpy.body:
318 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
319 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
320 // IR-PCH-HOST-NEXT: [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
321 // IR-PCH-HOST-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
322 // IR-PCH-HOST-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
323 // IR-PCH-HOST-NEXT: store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
324 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
325 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
326 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
327 // IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
328 // IR-PCH-HOST: omp.arraycpy.done19:
329 // IR-PCH-HOST-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
330 // IR-PCH-HOST-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
331 // IR-PCH-HOST: .omp.reduction.case2:
332 // IR-PCH-HOST-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
333 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_ISEMPTY20:%.*]] = icmp eq ptr [[TMP0]], [[TMP28]]
334 // IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY20]], label [[OMP_ARRAYCPY_DONE27:%.*]], label [[OMP_ARRAYCPY_BODY21:%.*]]
335 // IR-PCH-HOST: omp.arraycpy.body21:
336 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST22:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT25:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
337 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST23:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT24:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
338 // IR-PCH-HOST-NEXT: [[TMP29:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], align 4
339 // IR-PCH-HOST-NEXT: [[TMP30:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 [[TMP29]] monotonic, align 4
340 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT24]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 1
341 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT25]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], i32 1
342 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DONE26:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT24]], [[TMP28]]
343 // IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_DONE26]], label [[OMP_ARRAYCPY_DONE27]], label [[OMP_ARRAYCPY_BODY21]]
344 // IR-PCH-HOST: omp.arraycpy.done27:
345 // IR-PCH-HOST-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
346 // IR-PCH-HOST-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
347 // IR-PCH-HOST: .omp.reduction.default:
348 // IR-PCH-HOST-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
349 // IR-PCH-HOST-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
350 // IR-PCH-HOST-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
351 // IR-PCH-HOST: .omp.lastprivate.then:
352 // IR-PCH-HOST-NEXT: store i32 10, ptr [[J3]], align 4
353 // IR-PCH-HOST-NEXT: [[TMP33:%.*]] = load i32, ptr [[J3]], align 4
354 // IR-PCH-HOST-NEXT: store i32 [[TMP33]], ptr [[J_ADDR]], align 4
355 // IR-PCH-HOST-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
356 // IR-PCH-HOST: .omp.lastprivate.done:
357 // IR-PCH-HOST-NEXT: ret void
358 // IR-PCH-HOST-LABEL: define {{[^@]+}}@.omp.reduction.reduction_func
359 // IR-PCH-HOST-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
360 // IR-PCH-HOST-NEXT: entry:
361 // IR-PCH-HOST-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
362 // IR-PCH-HOST-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
363 // IR-PCH-HOST-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
364 // IR-PCH-HOST-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
365 // IR-PCH-HOST-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
366 // IR-PCH-HOST-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
367 // IR-PCH-HOST-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
368 // IR-PCH-HOST-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
369 // IR-PCH-HOST-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
370 // IR-PCH-HOST-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
371 // IR-PCH-HOST-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
372 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
373 // IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
374 // IR-PCH-HOST: omp.arraycpy.body:
375 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
376 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
377 // IR-PCH-HOST-NEXT: [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
378 // IR-PCH-HOST-NEXT: [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
379 // IR-PCH-HOST-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
380 // IR-PCH-HOST-NEXT: store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
381 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
382 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
383 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
384 // IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
385 // IR-PCH-HOST: omp.arraycpy.done2:
386 // IR-PCH-HOST-NEXT: ret void
387 // IR-PCH-HOST-LABEL: define {{[^@]+}}@.omp.reduction.reduction_func.2
388 // IR-PCH-HOST-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
389 // IR-PCH-HOST-NEXT: entry:
390 // IR-PCH-HOST-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
391 // IR-PCH-HOST-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
392 // IR-PCH-HOST-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
393 // IR-PCH-HOST-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
394 // IR-PCH-HOST-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
395 // IR-PCH-HOST-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
396 // IR-PCH-HOST-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
397 // IR-PCH-HOST-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
398 // IR-PCH-HOST-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
399 // IR-PCH-HOST-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
400 // IR-PCH-HOST-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
401 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
402 // IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
403 // IR-PCH-HOST: omp.arraycpy.body:
404 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
405 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
406 // IR-PCH-HOST-NEXT: [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
407 // IR-PCH-HOST-NEXT: [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
408 // IR-PCH-HOST-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
409 // IR-PCH-HOST-NEXT: store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
410 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
411 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
412 // IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
413 // IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
414 // IR-PCH-HOST: omp.arraycpy.done2:
415 // IR-PCH-HOST-NEXT: ret void
416 // CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l23
417 // CHECK-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
418 // CHECK-NEXT: entry:
419 // CHECK-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
420 // CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
421 // CHECK-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
422 // CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
423 // CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4, addrspace(5)
424 // CHECK-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
425 // CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
426 // CHECK-NEXT: [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
427 // CHECK-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
428 // CHECK-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
429 // CHECK-NEXT: store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
430 // CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
431 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
432 // CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 2, i1 false)
433 // CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
434 // CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
435 // CHECK: user_code.entry:
436 // CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
437 // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[J_ADDR_ASCAST]], align 4
438 // CHECK-NEXT: store i32 [[TMP3]], ptr [[J_CASTED_ASCAST]], align 4
439 // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8
440 // CHECK-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
441 // CHECK-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
442 // CHECK-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR2:[0-9]+]]
443 // CHECK-NEXT: call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 2)
444 // CHECK-NEXT: ret void
445 // CHECK: worker.exit:
446 // CHECK-NEXT: ret void
447 // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
448 // CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
449 // CHECK-NEXT: entry:
450 // CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
451 // CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
452 // CHECK-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
453 // CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
454 // CHECK-NEXT: [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
455 // CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
456 // CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
457 // CHECK-NEXT: [[_TMP2:%.*]] = alloca i32, align 4, addrspace(5)
458 // CHECK-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4, addrspace(5)
459 // CHECK-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4, addrspace(5)
460 // CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5)
461 // CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5)
462 // CHECK-NEXT: [[J3:%.*]] = alloca i32, align 4, addrspace(5)
463 // CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
464 // CHECK-NEXT: [[J4:%.*]] = alloca i32, align 4, addrspace(5)
465 // CHECK-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
466 // CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8, addrspace(5)
467 // CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
468 // CHECK-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
469 // CHECK-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
470 // CHECK-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
471 // CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
472 // CHECK-NEXT: [[SUM1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM1]] to ptr
473 // CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
474 // CHECK-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
475 // CHECK-NEXT: [[TMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[_TMP2]] to ptr
476 // CHECK-NEXT: [[DOTOMP_COMB_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_LB]] to ptr
477 // CHECK-NEXT: [[DOTOMP_COMB_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_UB]] to ptr
478 // CHECK-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
479 // CHECK-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
480 // CHECK-NEXT: [[J3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J3]] to ptr
481 // CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
482 // CHECK-NEXT: [[J4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J4]] to ptr
483 // CHECK-NEXT: [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
484 // CHECK-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
485 // CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
486 // CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
487 // CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
488 // CHECK-NEXT: store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
489 // CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
490 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
491 // CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1_ASCAST]], i32 0, i32 0, i32 0
492 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
493 // CHECK-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
494 // CHECK-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
495 // CHECK: omp.arrayinit.body:
496 // CHECK-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
497 // CHECK-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
498 // CHECK-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
499 // CHECK-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
500 // CHECK-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
501 // CHECK: omp.arrayinit.done:
502 // CHECK-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
503 // CHECK-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
504 // CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
505 // CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
506 // CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
507 // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
508 // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
509 // CHECK-NEXT: call void @__kmpc_distribute_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), i32 [[TMP3]], i32 91, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_COMB_LB_ASCAST]], ptr [[DOTOMP_COMB_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 [[NVPTX_NUM_THREADS]])
510 // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
511 // CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
512 // CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
513 // CHECK: cond.true:
514 // CHECK-NEXT: br label [[COND_END:%.*]]
515 // CHECK: cond.false:
516 // CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
517 // CHECK-NEXT: br label [[COND_END]]
518 // CHECK: cond.end:
519 // CHECK-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
520 // CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
521 // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
522 // CHECK-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV_ASCAST]], align 4
523 // CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
524 // CHECK: omp.inner.for.cond:
525 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
526 // CHECK-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP7]], 100
527 // CHECK-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
528 // CHECK: omp.inner.for.body:
529 // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
530 // CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
531 // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
532 // CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
533 // CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
534 // CHECK-NEXT: store i32 [[TMP12]], ptr [[J_CASTED_ASCAST]], align 4
535 // CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8
536 // CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
537 // CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP9]] to ptr
538 // CHECK-NEXT: store ptr [[TMP15]], ptr [[TMP14]], align 8
539 // CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 1
540 // CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP11]] to ptr
541 // CHECK-NEXT: store ptr [[TMP17]], ptr [[TMP16]], align 8
542 // CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 2
543 // CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP13]] to ptr
544 // CHECK-NEXT: store ptr [[TMP19]], ptr [[TMP18]], align 8
545 // CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 3
546 // CHECK-NEXT: store ptr [[SUM1_ASCAST]], ptr [[TMP20]], align 8
547 // CHECK-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
548 // CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
549 // CHECK-NEXT: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP22]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__.1, ptr null, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 4)
550 // CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
551 // CHECK: omp.inner.for.inc:
552 // CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
553 // CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
554 // CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], [[TMP24]]
555 // CHECK-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV_ASCAST]], align 4
556 // CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
557 // CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
558 // CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP25]], [[TMP26]]
559 // CHECK-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
560 // CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
561 // CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
562 // CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
563 // CHECK-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
564 // CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
565 // CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[TMP29]], 99
566 // CHECK-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
567 // CHECK: cond.true9:
568 // CHECK-NEXT: br label [[COND_END11:%.*]]
569 // CHECK: cond.false10:
570 // CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
571 // CHECK-NEXT: br label [[COND_END11]]
572 // CHECK: cond.end11:
573 // CHECK-NEXT: [[COND12:%.*]] = phi i32 [ 99, [[COND_TRUE9]] ], [ [[TMP30]], [[COND_FALSE10]] ]
574 // CHECK-NEXT: store i32 [[COND12]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
575 // CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
576 // CHECK-NEXT: store i32 [[TMP31]], ptr [[DOTOMP_IV_ASCAST]], align 4
577 // CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
578 // CHECK: omp.inner.for.end:
579 // CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
580 // CHECK: omp.loop.exit:
581 // CHECK-NEXT: [[TMP32:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
582 // CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
583 // CHECK-NEXT: call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), i32 [[TMP33]])
584 // CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
585 // CHECK-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
586 // CHECK-NEXT: br i1 [[TMP35]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
587 // CHECK: .omp.lastprivate.then:
588 // CHECK-NEXT: store i32 10, ptr [[J3_ASCAST]], align 4
589 // CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
590 // CHECK-NEXT: store i32 [[TMP36]], ptr [[J_ADDR_ASCAST]], align 4
591 // CHECK-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
592 // CHECK: .omp.lastprivate.done:
593 // CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
594 // CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4
595 // CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
596 // CHECK-NEXT: store ptr [[SUM1_ASCAST]], ptr [[TMP39]], align 8
597 // CHECK-NEXT: [[TMP40:%.*]] = load ptr, ptr addrspace(1) @"_openmp_teams_reductions_buffer_$_$ptr", align 8
598 // CHECK-NEXT: [[TMP41:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP38]], ptr [[TMP40]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func.3, ptr @_omp_reduction_inter_warp_copy_func.4, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
599 // CHECK-NEXT: [[TMP42:%.*]] = icmp eq i32 [[TMP41]], 1
600 // CHECK-NEXT: br i1 [[TMP42]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
601 // CHECK: .omp.reduction.then:
602 // CHECK-NEXT: [[TMP43:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
603 // CHECK-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP43]]
604 // CHECK-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE17:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
605 // CHECK: omp.arraycpy.body:
606 // CHECK-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
607 // CHECK-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST13:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY]] ]
608 // CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
609 // CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
610 // CHECK-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP44]], [[TMP45]]
611 // CHECK-NEXT: store i32 [[ADD14]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
612 // CHECK-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], i32 1
613 // CHECK-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
614 // CHECK-NEXT: [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP43]]
615 // CHECK-NEXT: br i1 [[OMP_ARRAYCPY_DONE16]], label [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_BODY]]
616 // CHECK: omp.arraycpy.done17:
617 // CHECK-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP38]])
618 // CHECK-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
619 // CHECK: .omp.reduction.done:
620 // CHECK-NEXT: ret void
621 // CHECK-LABEL: define {{[^@]+}}@__omp_outlined__.1
622 // CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
623 // CHECK-NEXT: entry:
624 // CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
625 // CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
626 // CHECK-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8, addrspace(5)
627 // CHECK-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8, addrspace(5)
628 // CHECK-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
629 // CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
630 // CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
631 // CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
632 // CHECK-NEXT: [[_TMP1:%.*]] = alloca i32, align 4, addrspace(5)
633 // CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5)
634 // CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5)
635 // CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5)
636 // CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5)
637 // CHECK-NEXT: [[J3:%.*]] = alloca i32, align 4, addrspace(5)
638 // CHECK-NEXT: [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
639 // CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
640 // CHECK-NEXT: [[J5:%.*]] = alloca i32, align 4, addrspace(5)
641 // CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
642 // CHECK-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
643 // CHECK-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
644 // CHECK-NEXT: [[DOTPREVIOUS_LB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_LB__ADDR]] to ptr
645 // CHECK-NEXT: [[DOTPREVIOUS_UB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_UB__ADDR]] to ptr
646 // CHECK-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
647 // CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
648 // CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
649 // CHECK-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
650 // CHECK-NEXT: [[TMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[_TMP1]] to ptr
651 // CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr
652 // CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr
653 // CHECK-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
654 // CHECK-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
655 // CHECK-NEXT: [[J3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J3]] to ptr
656 // CHECK-NEXT: [[SUM4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM4]] to ptr
657 // CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
658 // CHECK-NEXT: [[J5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J5]] to ptr
659 // CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
660 // CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
661 // CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
662 // CHECK-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
663 // CHECK-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8
664 // CHECK-NEXT: store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
665 // CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
666 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
667 // CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4
668 // CHECK-NEXT: store i32 99, ptr [[DOTOMP_UB_ASCAST]], align 4
669 // CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
670 // CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
671 // CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8
672 // CHECK-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
673 // CHECK-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB_ASCAST]], align 4
674 // CHECK-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB_ASCAST]], align 4
675 // CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
676 // CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
677 // CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4_ASCAST]], i32 0, i32 0, i32 0
678 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
679 // CHECK-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
680 // CHECK-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
681 // CHECK: omp.arrayinit.body:
682 // CHECK-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
683 // CHECK-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
684 // CHECK-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
685 // CHECK-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
686 // CHECK-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
687 // CHECK: omp.arrayinit.done:
688 // CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
689 // CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
690 // CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[TMP5]], i32 33, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_LB_ASCAST]], ptr [[DOTOMP_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 1)
691 // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4
692 // CHECK-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV_ASCAST]], align 4
693 // CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
694 // CHECK: omp.inner.for.cond:
695 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7:![0-9]+]]
696 // CHECK-NEXT: [[CONV6:%.*]] = sext i32 [[TMP7]] to i64
697 // CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8, !llvm.access.group [[ACC_GRP7]]
698 // CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV6]], [[TMP8]]
699 // CHECK-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
700 // CHECK: omp.inner.for.body:
701 // CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
702 // CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP9]], 10
703 // CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
704 // CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
705 // CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
706 // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
707 // CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
708 // CHECK-NEXT: [[DIV7:%.*]] = sdiv i32 [[TMP11]], 10
709 // CHECK-NEXT: [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
710 // CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP10]], [[MUL8]]
711 // CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
712 // CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
713 // CHECK-NEXT: store i32 [[ADD10]], ptr [[J3_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
714 // CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
715 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
716 // CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
717 // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4_ASCAST]], i64 0, i64 [[IDXPROM]]
718 // CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[J3_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
719 // CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP14]] to i64
720 // CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
721 // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP7]]
722 // CHECK-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP15]], [[TMP12]]
723 // CHECK-NEXT: store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP7]]
724 // CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
725 // CHECK: omp.body.continue:
726 // CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
727 // CHECK: omp.inner.for.inc:
728 // CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
729 // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
730 // CHECK-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
731 // CHECK-NEXT: store i32 [[ADD14]], ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
732 // CHECK-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
733 // CHECK: omp.inner.for.end:
734 // CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
735 // CHECK: omp.loop.exit:
736 // CHECK-NEXT: [[TMP18:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
737 // CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4
738 // CHECK-NEXT: call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[TMP19]])
739 // CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
740 // CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
741 // CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
742 // CHECK-NEXT: store ptr [[SUM4_ASCAST]], ptr [[TMP22]], align 8
743 // CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP21]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func)
744 // CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP23]], 1
745 // CHECK-NEXT: br i1 [[TMP24]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
746 // CHECK: .omp.reduction.then:
747 // CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
748 // CHECK-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
749 // CHECK-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
750 // CHECK: omp.arraycpy.body:
751 // CHECK-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
752 // CHECK-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
753 // CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
754 // CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
755 // CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
756 // CHECK-NEXT: store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
757 // CHECK-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
758 // CHECK-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
759 // CHECK-NEXT: [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
760 // CHECK-NEXT: br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
761 // CHECK: omp.arraycpy.done19:
762 // CHECK-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP21]])
763 // CHECK-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
764 // CHECK: .omp.reduction.done:
765 // CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
766 // CHECK-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
767 // CHECK-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
768 // CHECK: .omp.lastprivate.then:
769 // CHECK-NEXT: store i32 10, ptr [[J3_ASCAST]], align 4
770 // CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
771 // CHECK-NEXT: store i32 [[TMP30]], ptr [[J_ADDR_ASCAST]], align 4
772 // CHECK-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
773 // CHECK: .omp.lastprivate.done:
774 // CHECK-NEXT: ret void
775 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
776 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
777 // CHECK-NEXT: entry:
778 // CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
779 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2, addrspace(5)
780 // CHECK-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2, addrspace(5)
781 // CHECK-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
782 // CHECK-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
783 // CHECK-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
784 // CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
785 // CHECK-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
786 // CHECK-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
787 // CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
788 // CHECK-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
789 // CHECK-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
790 // CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
791 // CHECK-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
792 // CHECK-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
793 // CHECK-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3_ASCAST]], align 2
794 // CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
795 // CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1_ASCAST]], align 2
796 // CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2_ASCAST]], align 2
797 // CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3_ASCAST]], align 2
798 // CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
799 // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
800 // CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
801 // CHECK-NEXT: [[TMP11:%.*]] = getelementptr [10 x [10 x i32]], ptr [[TMP9]], i64 1
802 // CHECK-NEXT: br label [[DOTSHUFFLE_PRE_COND:%.*]]
803 // CHECK: .shuffle.pre_cond:
804 // CHECK-NEXT: [[TMP12:%.*]] = phi ptr [ [[TMP9]], [[ENTRY:%.*]] ], [ [[TMP23:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
805 // CHECK-NEXT: [[TMP13:%.*]] = phi ptr [ [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], [[ENTRY]] ], [ [[TMP24:%.*]], [[DOTSHUFFLE_THEN]] ]
806 // CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP11]] to i64
807 // CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP12]] to i64
808 // CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[TMP14]], [[TMP15]]
809 // CHECK-NEXT: [[TMP17:%.*]] = sdiv exact i64 [[TMP16]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
810 // CHECK-NEXT: [[TMP18:%.*]] = icmp sgt i64 [[TMP17]], 7
811 // CHECK-NEXT: br i1 [[TMP18]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
812 // CHECK: .shuffle.then:
813 // CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP12]], align 4
814 // CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_warp_size()
815 // CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
816 // CHECK-NEXT: [[TMP22:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP19]], i16 [[TMP6]], i16 [[TMP21]])
817 // CHECK-NEXT: store i64 [[TMP22]], ptr [[TMP13]], align 4
818 // CHECK-NEXT: [[TMP23]] = getelementptr i64, ptr [[TMP12]], i64 1
819 // CHECK-NEXT: [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1
820 // CHECK-NEXT: br label [[DOTSHUFFLE_PRE_COND]]
821 // CHECK: .shuffle.exit:
822 // CHECK-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], ptr [[TMP10]], align 8
823 // CHECK-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0
824 // CHECK-NEXT: [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1
825 // CHECK-NEXT: [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
826 // CHECK-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
827 // CHECK-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP7]], 2
828 // CHECK-NEXT: [[TMP30:%.*]] = and i16 [[TMP5]], 1
829 // CHECK-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP30]], 0
830 // CHECK-NEXT: [[TMP32:%.*]] = and i1 [[TMP29]], [[TMP31]]
831 // CHECK-NEXT: [[TMP33:%.*]] = icmp sgt i16 [[TMP6]], 0
832 // CHECK-NEXT: [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]]
833 // CHECK-NEXT: [[TMP35:%.*]] = or i1 [[TMP25]], [[TMP28]]
834 // CHECK-NEXT: [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP34]]
835 // CHECK-NEXT: br i1 [[TMP36]], label [[THEN:%.*]], label [[ELSE:%.*]]
836 // CHECK: then:
837 // CHECK-NEXT: call void @"_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]]) #[[ATTR2]]
838 // CHECK-NEXT: br label [[IFCONT:%.*]]
839 // CHECK: else:
840 // CHECK-NEXT: br label [[IFCONT]]
841 // CHECK: ifcont:
842 // CHECK-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP7]], 1
843 // CHECK-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
844 // CHECK-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
845 // CHECK-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
846 // CHECK: then4:
847 // CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
848 // CHECK-NEXT: [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8
849 // CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
850 // CHECK-NEXT: [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8
851 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP43]], ptr align 4 [[TMP41]], i64 400, i1 false)
852 // CHECK-NEXT: br label [[IFCONT6:%.*]]
853 // CHECK: else5:
854 // CHECK-NEXT: br label [[IFCONT6]]
855 // CHECK: ifcont6:
856 // CHECK-NEXT: ret void
857 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
858 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
859 // CHECK-NEXT: entry:
860 // CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
861 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
862 // CHECK-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
863 // CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
864 // CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
865 // CHECK-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
866 // CHECK-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
867 // CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
868 // CHECK-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
869 // CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
870 // CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
871 // CHECK-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 63
872 // CHECK-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
873 // CHECK-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 6
874 // CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
875 // CHECK-NEXT: store i32 0, ptr [[DOTCNT_ADDR_ASCAST]], align 4
876 // CHECK-NEXT: br label [[PRECOND:%.*]]
877 // CHECK: precond:
878 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR_ASCAST]], align 4
879 // CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100
880 // CHECK-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
881 // CHECK: body:
882 // CHECK-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4:[0-9]+]] to ptr), i32 [[TMP2]])
883 // CHECK-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
884 // CHECK-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
885 // CHECK: then:
886 // CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
887 // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
888 // CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
889 // CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
890 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
891 // CHECK-NEXT: store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
892 // CHECK-NEXT: br label [[IFCONT:%.*]]
893 // CHECK: else:
894 // CHECK-NEXT: br label [[IFCONT]]
895 // CHECK: ifcont:
896 // CHECK-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
897 // CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
898 // CHECK-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
899 // CHECK-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
900 // CHECK: then2:
901 // CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
902 // CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
903 // CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
904 // CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
905 // CHECK-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
906 // CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4
907 // CHECK-NEXT: br label [[IFCONT4:%.*]]
908 // CHECK: else3:
909 // CHECK-NEXT: br label [[IFCONT4]]
910 // CHECK: ifcont4:
911 // CHECK-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
912 // CHECK-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4
913 // CHECK-NEXT: br label [[PRECOND]]
914 // CHECK: exit:
915 // CHECK-NEXT: ret void
916 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func.3
917 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3]] {
918 // CHECK-NEXT: entry:
919 // CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
920 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2, addrspace(5)
921 // CHECK-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2, addrspace(5)
922 // CHECK-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
923 // CHECK-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
924 // CHECK-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
925 // CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
926 // CHECK-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
927 // CHECK-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
928 // CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
929 // CHECK-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
930 // CHECK-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
931 // CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
932 // CHECK-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
933 // CHECK-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
934 // CHECK-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3_ASCAST]], align 2
935 // CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
936 // CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1_ASCAST]], align 2
937 // CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2_ASCAST]], align 2
938 // CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3_ASCAST]], align 2
939 // CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
940 // CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
941 // CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
942 // CHECK-NEXT: [[TMP11:%.*]] = getelementptr [10 x [10 x i32]], ptr [[TMP9]], i64 1
943 // CHECK-NEXT: br label [[DOTSHUFFLE_PRE_COND:%.*]]
944 // CHECK: .shuffle.pre_cond:
945 // CHECK-NEXT: [[TMP12:%.*]] = phi ptr [ [[TMP9]], [[ENTRY:%.*]] ], [ [[TMP23:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
946 // CHECK-NEXT: [[TMP13:%.*]] = phi ptr [ [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], [[ENTRY]] ], [ [[TMP24:%.*]], [[DOTSHUFFLE_THEN]] ]
947 // CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP11]] to i64
948 // CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP12]] to i64
949 // CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[TMP14]], [[TMP15]]
950 // CHECK-NEXT: [[TMP17:%.*]] = sdiv exact i64 [[TMP16]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
951 // CHECK-NEXT: [[TMP18:%.*]] = icmp sgt i64 [[TMP17]], 7
952 // CHECK-NEXT: br i1 [[TMP18]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
953 // CHECK: .shuffle.then:
954 // CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP12]], align 4
955 // CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_warp_size()
956 // CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
957 // CHECK-NEXT: [[TMP22:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP19]], i16 [[TMP6]], i16 [[TMP21]])
958 // CHECK-NEXT: store i64 [[TMP22]], ptr [[TMP13]], align 4
959 // CHECK-NEXT: [[TMP23]] = getelementptr i64, ptr [[TMP12]], i64 1
960 // CHECK-NEXT: [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1
961 // CHECK-NEXT: br label [[DOTSHUFFLE_PRE_COND]]
962 // CHECK: .shuffle.exit:
963 // CHECK-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], ptr [[TMP10]], align 8
964 // CHECK-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0
965 // CHECK-NEXT: [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1
966 // CHECK-NEXT: [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
967 // CHECK-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
968 // CHECK-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP7]], 2
969 // CHECK-NEXT: [[TMP30:%.*]] = and i16 [[TMP5]], 1
970 // CHECK-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP30]], 0
971 // CHECK-NEXT: [[TMP32:%.*]] = and i1 [[TMP29]], [[TMP31]]
972 // CHECK-NEXT: [[TMP33:%.*]] = icmp sgt i16 [[TMP6]], 0
973 // CHECK-NEXT: [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]]
974 // CHECK-NEXT: [[TMP35:%.*]] = or i1 [[TMP25]], [[TMP28]]
975 // CHECK-NEXT: [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP34]]
976 // CHECK-NEXT: br i1 [[TMP36]], label [[THEN:%.*]], label [[ELSE:%.*]]
977 // CHECK: then:
978 // CHECK-NEXT: call void @"_omp$reduction$reduction_func.2"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]]) #[[ATTR2]]
979 // CHECK-NEXT: br label [[IFCONT:%.*]]
980 // CHECK: else:
981 // CHECK-NEXT: br label [[IFCONT]]
982 // CHECK: ifcont:
983 // CHECK-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP7]], 1
984 // CHECK-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
985 // CHECK-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
986 // CHECK-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
987 // CHECK: then4:
988 // CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
989 // CHECK-NEXT: [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8
990 // CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
991 // CHECK-NEXT: [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8
992 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP43]], ptr align 4 [[TMP41]], i64 400, i1 false)
993 // CHECK-NEXT: br label [[IFCONT6:%.*]]
994 // CHECK: else5:
995 // CHECK-NEXT: br label [[IFCONT6]]
996 // CHECK: ifcont6:
997 // CHECK-NEXT: ret void
998 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func.4
999 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
1000 // CHECK-NEXT: entry:
1001 // CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1002 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1003 // CHECK-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1004 // CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
1005 // CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1006 // CHECK-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1007 // CHECK-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
1008 // CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1009 // CHECK-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1010 // CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1011 // CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1012 // CHECK-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 63
1013 // CHECK-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1014 // CHECK-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 6
1015 // CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1016 // CHECK-NEXT: store i32 0, ptr [[DOTCNT_ADDR_ASCAST]], align 4
1017 // CHECK-NEXT: br label [[PRECOND:%.*]]
1018 // CHECK: precond:
1019 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR_ASCAST]], align 4
1020 // CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100
1021 // CHECK-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
1022 // CHECK: body:
1023 // CHECK-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
1024 // CHECK-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1025 // CHECK-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
1026 // CHECK: then:
1027 // CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
1028 // CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
1029 // CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
1030 // CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1031 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
1032 // CHECK-NEXT: store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
1033 // CHECK-NEXT: br label [[IFCONT:%.*]]
1034 // CHECK: else:
1035 // CHECK-NEXT: br label [[IFCONT]]
1036 // CHECK: ifcont:
1037 // CHECK-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
1038 // CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1039 // CHECK-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
1040 // CHECK-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
1041 // CHECK: then2:
1042 // CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
1043 // CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
1044 // CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
1045 // CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
1046 // CHECK-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
1047 // CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4
1048 // CHECK-NEXT: br label [[IFCONT4:%.*]]
1049 // CHECK: else3:
1050 // CHECK-NEXT: br label [[IFCONT4]]
1051 // CHECK: ifcont4:
1052 // CHECK-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
1053 // CHECK-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4
1054 // CHECK-NEXT: br label [[PRECOND]]
1055 // CHECK: exit:
1056 // CHECK-NEXT: ret void
1057 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
1058 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1059 // CHECK-NEXT: entry:
1060 // CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1061 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1062 // CHECK-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1063 // CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1064 // CHECK-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1065 // CHECK-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1066 // CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1067 // CHECK-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1068 // CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1069 // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1070 // CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1071 // CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1072 // CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
1073 // CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
1074 // CHECK-NEXT: [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
1075 // CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP5]]
1076 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[TMP8]], ptr align 4 [[TMP7]], i64 400, i1 false)
1077 // CHECK-NEXT: ret void
1078 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
1079 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1080 // CHECK-NEXT: entry:
1081 // CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1082 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1083 // CHECK-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1084 // CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1085 // CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1086 // CHECK-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1087 // CHECK-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1088 // CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
1089 // CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1090 // CHECK-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1091 // CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1092 // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1093 // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1094 // CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
1095 // CHECK-NEXT: [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
1096 // CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP4]]
1097 // CHECK-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8
1098 // CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1099 // CHECK-NEXT: call void @"_omp$reduction$reduction_func.2"(ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr [[TMP7]]) #[[ATTR2]]
1100 // CHECK-NEXT: ret void
1101 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
1102 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1103 // CHECK-NEXT: entry:
1104 // CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1105 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1106 // CHECK-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1107 // CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1108 // CHECK-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1109 // CHECK-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1110 // CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1111 // CHECK-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1112 // CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1113 // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1114 // CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1115 // CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1116 // CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
1117 // CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
1118 // CHECK-NEXT: [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
1119 // CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP5]]
1120 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP7]], ptr align 128 [[TMP8]], i64 400, i1 false)
1121 // CHECK-NEXT: ret void
1122 // CHECK-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
1123 // CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1124 // CHECK-NEXT: entry:
1125 // CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1126 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1127 // CHECK-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1128 // CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1129 // CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1130 // CHECK-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1131 // CHECK-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1132 // CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
1133 // CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1134 // CHECK-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1135 // CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1136 // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1137 // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1138 // CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
1139 // CHECK-NEXT: [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
1140 // CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP4]]
1141 // CHECK-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8
1142 // CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1143 // CHECK-NEXT: call void @"_omp$reduction$reduction_func.2"(ptr [[TMP7]], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]]) #[[ATTR2]]
1144 // CHECK-NEXT: ret void
1145 // IR-GPU-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22
1146 // IR-GPU-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
1147 // IR-GPU-NEXT: entry:
1148 // IR-GPU-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1149 // IR-GPU-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
1150 // IR-GPU-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1151 // IR-GPU-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
1152 // IR-GPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1153 // IR-GPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4, addrspace(5)
1154 // IR-GPU-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
1155 // IR-GPU-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
1156 // IR-GPU-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
1157 // IR-GPU-NEXT: [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
1158 // IR-GPU-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
1159 // IR-GPU-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
1160 // IR-GPU-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
1161 // IR-GPU-NEXT: store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
1162 // IR-GPU-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
1163 // IR-GPU-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
1164 // IR-GPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_kernel_environment to ptr), ptr [[DYN_PTR]])
1165 // IR-GPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
1166 // IR-GPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
1167 // IR-GPU: user_code.entry:
1168 // IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr))
1169 // IR-GPU-NEXT: [[TMP3:%.*]] = load i32, ptr [[J_ADDR_ASCAST]], align 4
1170 // IR-GPU-NEXT: store i32 [[TMP3]], ptr [[J_CASTED_ASCAST]], align 4
1171 // IR-GPU-NEXT: [[TMP4:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8
1172 // IR-GPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
1173 // IR-GPU-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
1174 // IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR2:[0-9]+]]
1175 // IR-GPU-NEXT: call void @__kmpc_target_deinit()
1176 // IR-GPU-NEXT: ret void
1177 // IR-GPU: worker.exit:
1178 // IR-GPU-NEXT: ret void
1181 // IR-GPU-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined
1182 // IR-GPU-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
1183 // IR-GPU-NEXT: entry:
1184 // IR-GPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1185 // IR-GPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1186 // IR-GPU-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
1187 // IR-GPU-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1188 // IR-GPU-NEXT: [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
1189 // IR-GPU-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
1190 // IR-GPU-NEXT: [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
1191 // IR-GPU-NEXT: [[_TMP2:%.*]] = alloca i32, align 4, addrspace(5)
1192 // IR-GPU-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4, addrspace(5)
1193 // IR-GPU-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4, addrspace(5)
1194 // IR-GPU-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5)
1195 // IR-GPU-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5)
1196 // IR-GPU-NEXT: [[J3:%.*]] = alloca i32, align 4, addrspace(5)
1197 // IR-GPU-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
1198 // IR-GPU-NEXT: [[J4:%.*]] = alloca i32, align 4, addrspace(5)
1199 // IR-GPU-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
1200 // IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8, addrspace(5)
1201 // IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1202 // IR-GPU-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
1203 // IR-GPU-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
1204 // IR-GPU-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
1205 // IR-GPU-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
1206 // IR-GPU-NEXT: [[SUM1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM1]] to ptr
1207 // IR-GPU-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
1208 // IR-GPU-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
1209 // IR-GPU-NEXT: [[TMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[_TMP2]] to ptr
1210 // IR-GPU-NEXT: [[DOTOMP_COMB_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_LB]] to ptr
1211 // IR-GPU-NEXT: [[DOTOMP_COMB_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_UB]] to ptr
1212 // IR-GPU-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
1213 // IR-GPU-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
1214 // IR-GPU-NEXT: [[J3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J3]] to ptr
1215 // IR-GPU-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
1216 // IR-GPU-NEXT: [[J4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J4]] to ptr
1217 // IR-GPU-NEXT: [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
1218 // IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
1219 // IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
1220 // IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1221 // IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
1222 // IR-GPU-NEXT: store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
1223 // IR-GPU-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
1224 // IR-GPU-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
1225 // IR-GPU-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1_ASCAST]], i32 0, i32 0, i32 0
1226 // IR-GPU-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
1227 // IR-GPU-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
1228 // IR-GPU-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
1229 // IR-GPU: omp.arrayinit.body:
1230 // IR-GPU-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
1231 // IR-GPU-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
1232 // IR-GPU-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
1233 // IR-GPU-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
1234 // IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
1235 // IR-GPU: omp.arrayinit.done:
1236 // IR-GPU-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
1237 // IR-GPU-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1238 // IR-GPU-NEXT: store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
1239 // IR-GPU-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
1240 // IR-GPU-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
1241 // IR-GPU-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1242 // IR-GPU-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
1243 // IR-GPU-NEXT: call void @__kmpc_distribute_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), i32 [[TMP3]], i32 91, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_COMB_LB_ASCAST]], ptr [[DOTOMP_COMB_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 [[NVPTX_NUM_THREADS]])
1244 // IR-GPU-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1245 // IR-GPU-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
1246 // IR-GPU-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
1247 // IR-GPU: cond.true:
1248 // IR-GPU-NEXT: br label [[COND_END:%.*]]
1249 // IR-GPU: cond.false:
1250 // IR-GPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1251 // IR-GPU-NEXT: br label [[COND_END]]
1252 // IR-GPU: cond.end:
1253 // IR-GPU-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
1254 // IR-GPU-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1255 // IR-GPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
1256 // IR-GPU-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV_ASCAST]], align 4
1257 // IR-GPU-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
1258 // IR-GPU: omp.inner.for.cond:
1259 // IR-GPU-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
1260 // IR-GPU-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP7]], 100
1261 // IR-GPU-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
1262 // IR-GPU: omp.inner.for.body:
1263 // IR-GPU-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
1264 // IR-GPU-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
1265 // IR-GPU-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1266 // IR-GPU-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
1267 // IR-GPU-NEXT: [[TMP12:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
1268 // IR-GPU-NEXT: store i32 [[TMP12]], ptr [[J_CASTED_ASCAST]], align 4
1269 // IR-GPU-NEXT: [[TMP13:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8
1270 // IR-GPU-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
1271 // IR-GPU-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP9]] to ptr
1272 // IR-GPU-NEXT: store ptr [[TMP15]], ptr [[TMP14]], align 8
1273 // IR-GPU-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 1
1274 // IR-GPU-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP11]] to ptr
1275 // IR-GPU-NEXT: store ptr [[TMP17]], ptr [[TMP16]], align 8
1276 // IR-GPU-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 2
1277 // IR-GPU-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP13]] to ptr
1278 // IR-GPU-NEXT: store ptr [[TMP19]], ptr [[TMP18]], align 8
1279 // IR-GPU-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 3
1280 // IR-GPU-NEXT: store ptr [[SUM1_ASCAST]], ptr [[TMP20]], align 8
1281 // IR-GPU-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1282 // IR-GPU-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
1283 // IR-GPU-NEXT: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP22]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 4)
1284 // IR-GPU-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
1285 // IR-GPU: omp.inner.for.inc:
1286 // IR-GPU-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
1287 // IR-GPU-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
1288 // IR-GPU-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], [[TMP24]]
1289 // IR-GPU-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV_ASCAST]], align 4
1290 // IR-GPU-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
1291 // IR-GPU-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
1292 // IR-GPU-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP25]], [[TMP26]]
1293 // IR-GPU-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
1294 // IR-GPU-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1295 // IR-GPU-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
1296 // IR-GPU-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
1297 // IR-GPU-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1298 // IR-GPU-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1299 // IR-GPU-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[TMP29]], 99
1300 // IR-GPU-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
1301 // IR-GPU: cond.true9:
1302 // IR-GPU-NEXT: br label [[COND_END11:%.*]]
1303 // IR-GPU: cond.false10:
1304 // IR-GPU-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1305 // IR-GPU-NEXT: br label [[COND_END11]]
1306 // IR-GPU: cond.end11:
1307 // IR-GPU-NEXT: [[COND12:%.*]] = phi i32 [ 99, [[COND_TRUE9]] ], [ [[TMP30]], [[COND_FALSE10]] ]
1308 // IR-GPU-NEXT: store i32 [[COND12]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
1309 // IR-GPU-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
1310 // IR-GPU-NEXT: store i32 [[TMP31]], ptr [[DOTOMP_IV_ASCAST]], align 4
1311 // IR-GPU-NEXT: br label [[OMP_INNER_FOR_COND]]
1312 // IR-GPU: omp.inner.for.end:
1313 // IR-GPU-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
1314 // IR-GPU: omp.loop.exit:
1315 // IR-GPU-NEXT: [[TMP32:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1316 // IR-GPU-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
1317 // IR-GPU-NEXT: call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), i32 [[TMP33]])
1318 // IR-GPU-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
1319 // IR-GPU-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
1320 // IR-GPU-NEXT: br i1 [[TMP35]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
1321 // IR-GPU: .omp.lastprivate.then:
1322 // IR-GPU-NEXT: store i32 10, ptr [[J3_ASCAST]], align 4
1323 // IR-GPU-NEXT: [[TMP36:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
1324 // IR-GPU-NEXT: store i32 [[TMP36]], ptr [[J_ADDR_ASCAST]], align 4
1325 // IR-GPU-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
1326 // IR-GPU: .omp.lastprivate.done:
1327 // IR-GPU-NEXT: [[TMP37:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1328 // IR-GPU-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4
1329 // IR-GPU-NEXT: [[TMP39:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
1330 // IR-GPU-NEXT: store ptr [[SUM1_ASCAST]], ptr [[TMP39]], align 8
1331 // IR-GPU-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
1332 // IR-GPU-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP38]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func.1, ptr @_omp_reduction_inter_warp_copy_func.2, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
1333 // IR-GPU-NEXT: [[TMP41:%.*]] = icmp eq i32 [[TMP40]], 1
1334 // IR-GPU-NEXT: br i1 [[TMP41]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
1335 // IR-GPU: .omp.reduction.then:
1336 // IR-GPU-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
1337 // IR-GPU-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP42]]
1338 // IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE17:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
1339 // IR-GPU: omp.arraycpy.body:
1340 // IR-GPU-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
1341 // IR-GPU-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST13:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY]] ]
1342 // IR-GPU-NEXT: [[TMP43:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
1343 // IR-GPU-NEXT: [[TMP44:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
1344 // IR-GPU-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP43]], [[TMP44]]
1345 // IR-GPU-NEXT: store i32 [[ADD14]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
1346 // IR-GPU-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], i32 1
1347 // IR-GPU-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
1348 // IR-GPU-NEXT: [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP42]]
1349 // IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_DONE16]], label [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_BODY]]
1350 // IR-GPU: omp.arraycpy.done17:
1351 // IR-GPU-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP38]])
1352 // IR-GPU-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
1353 // IR-GPU: .omp.reduction.done:
1354 // IR-GPU-NEXT: ret void
1357 // IR-GPU-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp_outlined
1358 // IR-GPU-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
1359 // IR-GPU-NEXT: entry:
1360 // IR-GPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1361 // IR-GPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1362 // IR-GPU-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8, addrspace(5)
1363 // IR-GPU-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8, addrspace(5)
1364 // IR-GPU-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
1365 // IR-GPU-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1366 // IR-GPU-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
1367 // IR-GPU-NEXT: [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
1368 // IR-GPU-NEXT: [[_TMP1:%.*]] = alloca i32, align 4, addrspace(5)
1369 // IR-GPU-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5)
1370 // IR-GPU-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5)
1371 // IR-GPU-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5)
1372 // IR-GPU-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5)
1373 // IR-GPU-NEXT: [[J3:%.*]] = alloca i32, align 4, addrspace(5)
1374 // IR-GPU-NEXT: [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
1375 // IR-GPU-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
1376 // IR-GPU-NEXT: [[J5:%.*]] = alloca i32, align 4, addrspace(5)
1377 // IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1378 // IR-GPU-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
1379 // IR-GPU-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
1380 // IR-GPU-NEXT: [[DOTPREVIOUS_LB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_LB__ADDR]] to ptr
1381 // IR-GPU-NEXT: [[DOTPREVIOUS_UB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_UB__ADDR]] to ptr
1382 // IR-GPU-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
1383 // IR-GPU-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
1384 // IR-GPU-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
1385 // IR-GPU-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
1386 // IR-GPU-NEXT: [[TMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[_TMP1]] to ptr
1387 // IR-GPU-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr
1388 // IR-GPU-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr
1389 // IR-GPU-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
1390 // IR-GPU-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
1391 // IR-GPU-NEXT: [[J3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J3]] to ptr
1392 // IR-GPU-NEXT: [[SUM4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM4]] to ptr
1393 // IR-GPU-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
1394 // IR-GPU-NEXT: [[J5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J5]] to ptr
1395 // IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
1396 // IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1397 // IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
1398 // IR-GPU-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
1399 // IR-GPU-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8
1400 // IR-GPU-NEXT: store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
1401 // IR-GPU-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
1402 // IR-GPU-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
1403 // IR-GPU-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4
1404 // IR-GPU-NEXT: store i32 99, ptr [[DOTOMP_UB_ASCAST]], align 4
1405 // IR-GPU-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
1406 // IR-GPU-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
1407 // IR-GPU-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8
1408 // IR-GPU-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
1409 // IR-GPU-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB_ASCAST]], align 4
1410 // IR-GPU-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB_ASCAST]], align 4
1411 // IR-GPU-NEXT: store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
1412 // IR-GPU-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
1413 // IR-GPU-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4_ASCAST]], i32 0, i32 0, i32 0
1414 // IR-GPU-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
1415 // IR-GPU-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
1416 // IR-GPU-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
1417 // IR-GPU: omp.arrayinit.body:
1418 // IR-GPU-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
1419 // IR-GPU-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
1420 // IR-GPU-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
1421 // IR-GPU-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
1422 // IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
1423 // IR-GPU: omp.arrayinit.done:
1424 // IR-GPU-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1425 // IR-GPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
1426 // IR-GPU-NEXT: call void @__kmpc_for_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[TMP5]], i32 33, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_LB_ASCAST]], ptr [[DOTOMP_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 1)
1427 // IR-GPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4
1428 // IR-GPU-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV_ASCAST]], align 4
1429 // IR-GPU-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
1430 // IR-GPU: omp.inner.for.cond:
1431 // IR-GPU-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7:![0-9]+]]
1432 // IR-GPU-NEXT: [[CONV6:%.*]] = sext i32 [[TMP7]] to i64
1433 // IR-GPU-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8, !llvm.access.group [[ACC_GRP7]]
1434 // IR-GPU-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV6]], [[TMP8]]
1435 // IR-GPU-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
1436 // IR-GPU: omp.inner.for.body:
1437 // IR-GPU-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1438 // IR-GPU-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP9]], 10
1439 // IR-GPU-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
1440 // IR-GPU-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
1441 // IR-GPU-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1442 // IR-GPU-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1443 // IR-GPU-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1444 // IR-GPU-NEXT: [[DIV7:%.*]] = sdiv i32 [[TMP11]], 10
1445 // IR-GPU-NEXT: [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
1446 // IR-GPU-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP10]], [[MUL8]]
1447 // IR-GPU-NEXT: [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
1448 // IR-GPU-NEXT: [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
1449 // IR-GPU-NEXT: store i32 [[ADD10]], ptr [[J3_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1450 // IR-GPU-NEXT: [[TMP12:%.*]] = load i32, ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1451 // IR-GPU-NEXT: [[TMP13:%.*]] = load i32, ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1452 // IR-GPU-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
1453 // IR-GPU-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4_ASCAST]], i64 0, i64 [[IDXPROM]]
1454 // IR-GPU-NEXT: [[TMP14:%.*]] = load i32, ptr [[J3_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1455 // IR-GPU-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP14]] to i64
1456 // IR-GPU-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
1457 // IR-GPU-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP7]]
1458 // IR-GPU-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP15]], [[TMP12]]
1459 // IR-GPU-NEXT: store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP7]]
1460 // IR-GPU-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
1461 // IR-GPU: omp.body.continue:
1462 // IR-GPU-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
1463 // IR-GPU: omp.inner.for.inc:
1464 // IR-GPU-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1465 // IR-GPU-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1466 // IR-GPU-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
1467 // IR-GPU-NEXT: store i32 [[ADD14]], ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
1468 // IR-GPU-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
1469 // IR-GPU: omp.inner.for.end:
1470 // IR-GPU-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
1471 // IR-GPU: omp.loop.exit:
1472 // IR-GPU-NEXT: [[TMP18:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1473 // IR-GPU-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4
1474 // IR-GPU-NEXT: call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[TMP19]])
1475 // IR-GPU-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
1476 // IR-GPU-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
1477 // IR-GPU-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
1478 // IR-GPU-NEXT: store ptr [[SUM4_ASCAST]], ptr [[TMP22]], align 8
1479 // IR-GPU-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP21]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func)
1480 // IR-GPU-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP23]], 1
1481 // IR-GPU-NEXT: br i1 [[TMP24]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
1482 // IR-GPU: .omp.reduction.then:
1483 // IR-GPU-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
1484 // IR-GPU-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
1485 // IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
1486 // IR-GPU: omp.arraycpy.body:
1487 // IR-GPU-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
1488 // IR-GPU-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
1489 // IR-GPU-NEXT: [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
1490 // IR-GPU-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
1491 // IR-GPU-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
1492 // IR-GPU-NEXT: store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
1493 // IR-GPU-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
1494 // IR-GPU-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
1495 // IR-GPU-NEXT: [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
1496 // IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
1497 // IR-GPU: omp.arraycpy.done19:
1498 // IR-GPU-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP21]])
1499 // IR-GPU-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
1500 // IR-GPU: .omp.reduction.done:
1501 // IR-GPU-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
1502 // IR-GPU-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
1503 // IR-GPU-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
1504 // IR-GPU: .omp.lastprivate.then:
1505 // IR-GPU-NEXT: store i32 10, ptr [[J3_ASCAST]], align 4
1506 // IR-GPU-NEXT: [[TMP30:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
1507 // IR-GPU-NEXT: store i32 [[TMP30]], ptr [[J_ADDR_ASCAST]], align 4
1508 // IR-GPU-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
1509 // IR-GPU: .omp.lastprivate.done:
1510 // IR-GPU-NEXT: ret void
1513 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
1514 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
1515 // IR-GPU-NEXT: entry:
1516 // IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1517 // IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2, addrspace(5)
1518 // IR-GPU-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2, addrspace(5)
1519 // IR-GPU-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
1520 // IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1521 // IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
1522 // IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1523 // IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1524 // IR-GPU-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1525 // IR-GPU-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
1526 // IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
1527 // IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
1528 // IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1529 // IR-GPU-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
1530 // IR-GPU-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
1531 // IR-GPU-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3_ASCAST]], align 2
1532 // IR-GPU-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1533 // IR-GPU-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1_ASCAST]], align 2
1534 // IR-GPU-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2_ASCAST]], align 2
1535 // IR-GPU-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3_ASCAST]], align 2
1536 // IR-GPU-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
1537 // IR-GPU-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
1538 // IR-GPU-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
1539 // IR-GPU-NEXT: [[TMP11:%.*]] = getelementptr [10 x [10 x i32]], ptr [[TMP9]], i64 1
1540 // IR-GPU-NEXT: br label [[DOTSHUFFLE_PRE_COND:%.*]]
1541 // IR-GPU: .shuffle.pre_cond:
1542 // IR-GPU-NEXT: [[TMP12:%.*]] = phi ptr [ [[TMP9]], [[ENTRY:%.*]] ], [ [[TMP23:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
1543 // IR-GPU-NEXT: [[TMP13:%.*]] = phi ptr [ [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], [[ENTRY]] ], [ [[TMP24:%.*]], [[DOTSHUFFLE_THEN]] ]
1544 // IR-GPU-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP11]] to i64
1545 // IR-GPU-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP12]] to i64
1546 // IR-GPU-NEXT: [[TMP16:%.*]] = sub i64 [[TMP14]], [[TMP15]]
1547 // IR-GPU-NEXT: [[TMP17:%.*]] = sdiv exact i64 [[TMP16]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
1548 // IR-GPU-NEXT: [[TMP18:%.*]] = icmp sgt i64 [[TMP17]], 7
1549 // IR-GPU-NEXT: br i1 [[TMP18]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
1550 // IR-GPU: .shuffle.then:
1551 // IR-GPU-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP12]], align 4
1552 // IR-GPU-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_warp_size()
1553 // IR-GPU-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
1554 // IR-GPU-NEXT: [[TMP22:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP19]], i16 [[TMP6]], i16 [[TMP21]])
1555 // IR-GPU-NEXT: store i64 [[TMP22]], ptr [[TMP13]], align 4
1556 // IR-GPU-NEXT: [[TMP23]] = getelementptr i64, ptr [[TMP12]], i64 1
1557 // IR-GPU-NEXT: [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1
1558 // IR-GPU-NEXT: br label [[DOTSHUFFLE_PRE_COND]]
1559 // IR-GPU: .shuffle.exit:
1560 // IR-GPU-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], ptr [[TMP10]], align 8
1561 // IR-GPU-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0
1562 // IR-GPU-NEXT: [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1
1563 // IR-GPU-NEXT: [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
1564 // IR-GPU-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
1565 // IR-GPU-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP7]], 2
1566 // IR-GPU-NEXT: [[TMP30:%.*]] = and i16 [[TMP5]], 1
1567 // IR-GPU-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP30]], 0
1568 // IR-GPU-NEXT: [[TMP32:%.*]] = and i1 [[TMP29]], [[TMP31]]
1569 // IR-GPU-NEXT: [[TMP33:%.*]] = icmp sgt i16 [[TMP6]], 0
1570 // IR-GPU-NEXT: [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]]
1571 // IR-GPU-NEXT: [[TMP35:%.*]] = or i1 [[TMP25]], [[TMP28]]
1572 // IR-GPU-NEXT: [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP34]]
1573 // IR-GPU-NEXT: br i1 [[TMP36]], label [[THEN:%.*]], label [[ELSE:%.*]]
1574 // IR-GPU: then:
1575 // IR-GPU-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]]) #[[ATTR2]]
1576 // IR-GPU-NEXT: br label [[IFCONT:%.*]]
1577 // IR-GPU: else:
1578 // IR-GPU-NEXT: br label [[IFCONT]]
1579 // IR-GPU: ifcont:
1580 // IR-GPU-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP7]], 1
1581 // IR-GPU-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
1582 // IR-GPU-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
1583 // IR-GPU-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
1584 // IR-GPU: then4:
1585 // IR-GPU-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
1586 // IR-GPU-NEXT: [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8
1587 // IR-GPU-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
1588 // IR-GPU-NEXT: [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8
1589 // IR-GPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP43]], ptr align 4 [[TMP41]], i64 400, i1 false)
1590 // IR-GPU-NEXT: br label [[IFCONT6:%.*]]
1591 // IR-GPU: else5:
1592 // IR-GPU-NEXT: br label [[IFCONT6]]
1593 // IR-GPU: ifcont6:
1594 // IR-GPU-NEXT: ret void
1597 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
1598 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
1599 // IR-GPU-NEXT: entry:
1600 // IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1601 // IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1602 // IR-GPU-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1603 // IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
1604 // IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1605 // IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1606 // IR-GPU-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
1607 // IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1608 // IR-GPU-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1609 // IR-GPU-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1610 // IR-GPU-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1611 // IR-GPU-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 63
1612 // IR-GPU-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1613 // IR-GPU-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 6
1614 // IR-GPU-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1615 // IR-GPU-NEXT: store i32 0, ptr [[DOTCNT_ADDR_ASCAST]], align 4
1616 // IR-GPU-NEXT: br label [[PRECOND:%.*]]
1617 // IR-GPU: precond:
1618 // IR-GPU-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR_ASCAST]], align 4
1619 // IR-GPU-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100
1620 // IR-GPU-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
1621 // IR-GPU: body:
1622 // IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4:[0-9]+]] to ptr), i32 [[TMP2]])
1623 // IR-GPU-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1624 // IR-GPU-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
1625 // IR-GPU: then:
1626 // IR-GPU-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
1627 // IR-GPU-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
1628 // IR-GPU-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
1629 // IR-GPU-NEXT: [[TMP12:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1630 // IR-GPU-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
1631 // IR-GPU-NEXT: store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
1632 // IR-GPU-NEXT: br label [[IFCONT:%.*]]
1633 // IR-GPU: else:
1634 // IR-GPU-NEXT: br label [[IFCONT]]
1635 // IR-GPU: ifcont:
1636 // IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
1637 // IR-GPU-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1638 // IR-GPU-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
1639 // IR-GPU-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
1640 // IR-GPU: then2:
1641 // IR-GPU-NEXT: [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
1642 // IR-GPU-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
1643 // IR-GPU-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
1644 // IR-GPU-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
1645 // IR-GPU-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
1646 // IR-GPU-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4
1647 // IR-GPU-NEXT: br label [[IFCONT4:%.*]]
1648 // IR-GPU: else3:
1649 // IR-GPU-NEXT: br label [[IFCONT4]]
1650 // IR-GPU: ifcont4:
1651 // IR-GPU-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
1652 // IR-GPU-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4
1653 // IR-GPU-NEXT: br label [[PRECOND]]
1654 // IR-GPU: exit:
1655 // IR-GPU-NEXT: ret void
1658 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func.1
1659 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3]] {
1660 // IR-GPU-NEXT: entry:
1661 // IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1662 // IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2, addrspace(5)
1663 // IR-GPU-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2, addrspace(5)
1664 // IR-GPU-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
1665 // IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1666 // IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
1667 // IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1668 // IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1669 // IR-GPU-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1670 // IR-GPU-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
1671 // IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
1672 // IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
1673 // IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1674 // IR-GPU-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
1675 // IR-GPU-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
1676 // IR-GPU-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3_ASCAST]], align 2
1677 // IR-GPU-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1678 // IR-GPU-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1_ASCAST]], align 2
1679 // IR-GPU-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2_ASCAST]], align 2
1680 // IR-GPU-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3_ASCAST]], align 2
1681 // IR-GPU-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
1682 // IR-GPU-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
1683 // IR-GPU-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
1684 // IR-GPU-NEXT: [[TMP11:%.*]] = getelementptr [10 x [10 x i32]], ptr [[TMP9]], i64 1
1685 // IR-GPU-NEXT: br label [[DOTSHUFFLE_PRE_COND:%.*]]
1686 // IR-GPU: .shuffle.pre_cond:
1687 // IR-GPU-NEXT: [[TMP12:%.*]] = phi ptr [ [[TMP9]], [[ENTRY:%.*]] ], [ [[TMP23:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
1688 // IR-GPU-NEXT: [[TMP13:%.*]] = phi ptr [ [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], [[ENTRY]] ], [ [[TMP24:%.*]], [[DOTSHUFFLE_THEN]] ]
1689 // IR-GPU-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP11]] to i64
1690 // IR-GPU-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP12]] to i64
1691 // IR-GPU-NEXT: [[TMP16:%.*]] = sub i64 [[TMP14]], [[TMP15]]
1692 // IR-GPU-NEXT: [[TMP17:%.*]] = sdiv exact i64 [[TMP16]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
1693 // IR-GPU-NEXT: [[TMP18:%.*]] = icmp sgt i64 [[TMP17]], 7
1694 // IR-GPU-NEXT: br i1 [[TMP18]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
1695 // IR-GPU: .shuffle.then:
1696 // IR-GPU-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP12]], align 4
1697 // IR-GPU-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_warp_size()
1698 // IR-GPU-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
1699 // IR-GPU-NEXT: [[TMP22:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP19]], i16 [[TMP6]], i16 [[TMP21]])
1700 // IR-GPU-NEXT: store i64 [[TMP22]], ptr [[TMP13]], align 4
1701 // IR-GPU-NEXT: [[TMP23]] = getelementptr i64, ptr [[TMP12]], i64 1
1702 // IR-GPU-NEXT: [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1
1703 // IR-GPU-NEXT: br label [[DOTSHUFFLE_PRE_COND]]
1704 // IR-GPU: .shuffle.exit:
1705 // IR-GPU-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], ptr [[TMP10]], align 8
1706 // IR-GPU-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0
1707 // IR-GPU-NEXT: [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1
1708 // IR-GPU-NEXT: [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
1709 // IR-GPU-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
1710 // IR-GPU-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP7]], 2
1711 // IR-GPU-NEXT: [[TMP30:%.*]] = and i16 [[TMP5]], 1
1712 // IR-GPU-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP30]], 0
1713 // IR-GPU-NEXT: [[TMP32:%.*]] = and i1 [[TMP29]], [[TMP31]]
1714 // IR-GPU-NEXT: [[TMP33:%.*]] = icmp sgt i16 [[TMP6]], 0
1715 // IR-GPU-NEXT: [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]]
1716 // IR-GPU-NEXT: [[TMP35:%.*]] = or i1 [[TMP25]], [[TMP28]]
1717 // IR-GPU-NEXT: [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP34]]
1718 // IR-GPU-NEXT: br i1 [[TMP36]], label [[THEN:%.*]], label [[ELSE:%.*]]
1719 // IR-GPU: then:
1720 // IR-GPU-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]]) #[[ATTR2]]
1721 // IR-GPU-NEXT: br label [[IFCONT:%.*]]
1722 // IR-GPU: else:
1723 // IR-GPU-NEXT: br label [[IFCONT]]
1724 // IR-GPU: ifcont:
1725 // IR-GPU-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP7]], 1
1726 // IR-GPU-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
1727 // IR-GPU-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
1728 // IR-GPU-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
1729 // IR-GPU: then4:
1730 // IR-GPU-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
1731 // IR-GPU-NEXT: [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8
1732 // IR-GPU-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
1733 // IR-GPU-NEXT: [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8
1734 // IR-GPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP43]], ptr align 4 [[TMP41]], i64 400, i1 false)
1735 // IR-GPU-NEXT: br label [[IFCONT6:%.*]]
1736 // IR-GPU: else5:
1737 // IR-GPU-NEXT: br label [[IFCONT6]]
1738 // IR-GPU: ifcont6:
1739 // IR-GPU-NEXT: ret void
1742 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func.2
1743 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
1744 // IR-GPU-NEXT: entry:
1745 // IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1746 // IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1747 // IR-GPU-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1748 // IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
1749 // IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1750 // IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1751 // IR-GPU-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
1752 // IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1753 // IR-GPU-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1754 // IR-GPU-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1755 // IR-GPU-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1756 // IR-GPU-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 63
1757 // IR-GPU-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
1758 // IR-GPU-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 6
1759 // IR-GPU-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1760 // IR-GPU-NEXT: store i32 0, ptr [[DOTCNT_ADDR_ASCAST]], align 4
1761 // IR-GPU-NEXT: br label [[PRECOND:%.*]]
1762 // IR-GPU: precond:
1763 // IR-GPU-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR_ASCAST]], align 4
1764 // IR-GPU-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100
1765 // IR-GPU-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
1766 // IR-GPU: body:
1767 // IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
1768 // IR-GPU-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1769 // IR-GPU-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
1770 // IR-GPU: then:
1771 // IR-GPU-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
1772 // IR-GPU-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
1773 // IR-GPU-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
1774 // IR-GPU-NEXT: [[TMP12:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1775 // IR-GPU-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
1776 // IR-GPU-NEXT: store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
1777 // IR-GPU-NEXT: br label [[IFCONT:%.*]]
1778 // IR-GPU: else:
1779 // IR-GPU-NEXT: br label [[IFCONT]]
1780 // IR-GPU: ifcont:
1781 // IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
1782 // IR-GPU-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1783 // IR-GPU-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
1784 // IR-GPU-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
1785 // IR-GPU: then2:
1786 // IR-GPU-NEXT: [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
1787 // IR-GPU-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
1788 // IR-GPU-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
1789 // IR-GPU-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
1790 // IR-GPU-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
1791 // IR-GPU-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4
1792 // IR-GPU-NEXT: br label [[IFCONT4:%.*]]
1793 // IR-GPU: else3:
1794 // IR-GPU-NEXT: br label [[IFCONT4]]
1795 // IR-GPU: ifcont4:
1796 // IR-GPU-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
1797 // IR-GPU-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4
1798 // IR-GPU-NEXT: br label [[PRECOND]]
1799 // IR-GPU: exit:
1800 // IR-GPU-NEXT: ret void
1803 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
1804 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1805 // IR-GPU-NEXT: entry:
1806 // IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1807 // IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1808 // IR-GPU-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1809 // IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1810 // IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1811 // IR-GPU-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1812 // IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1813 // IR-GPU-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1814 // IR-GPU-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1815 // IR-GPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1816 // IR-GPU-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1817 // IR-GPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1818 // IR-GPU-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
1819 // IR-GPU-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
1820 // IR-GPU-NEXT: [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
1821 // IR-GPU-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP5]]
1822 // IR-GPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[TMP8]], ptr align 4 [[TMP7]], i64 400, i1 false)
1823 // IR-GPU-NEXT: ret void
1826 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
1827 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1828 // IR-GPU-NEXT: entry:
1829 // IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1830 // IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1831 // IR-GPU-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1832 // IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1833 // IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1834 // IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1835 // IR-GPU-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1836 // IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
1837 // IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1838 // IR-GPU-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1839 // IR-GPU-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1840 // IR-GPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1841 // IR-GPU-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1842 // IR-GPU-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
1843 // IR-GPU-NEXT: [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
1844 // IR-GPU-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP4]]
1845 // IR-GPU-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8
1846 // IR-GPU-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1847 // IR-GPU-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp$reduction$reduction_func"(ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr [[TMP7]]) #[[ATTR2]]
1848 // IR-GPU-NEXT: ret void
1851 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
1852 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1853 // IR-GPU-NEXT: entry:
1854 // IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1855 // IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1856 // IR-GPU-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1857 // IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1858 // IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1859 // IR-GPU-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1860 // IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1861 // IR-GPU-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1862 // IR-GPU-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1863 // IR-GPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1864 // IR-GPU-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1865 // IR-GPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1866 // IR-GPU-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
1867 // IR-GPU-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
1868 // IR-GPU-NEXT: [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
1869 // IR-GPU-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP5]]
1870 // IR-GPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP7]], ptr align 128 [[TMP8]], i64 400, i1 false)
1871 // IR-GPU-NEXT: ret void
1874 // IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
1875 // IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
1876 // IR-GPU-NEXT: entry:
1877 // IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
1878 // IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
1879 // IR-GPU-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
1880 // IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
1881 // IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
1882 // IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
1883 // IR-GPU-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
1884 // IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
1885 // IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
1886 // IR-GPU-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
1887 // IR-GPU-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
1888 // IR-GPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
1889 // IR-GPU-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
1890 // IR-GPU-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
1891 // IR-GPU-NEXT: [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
1892 // IR-GPU-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP4]]
1893 // IR-GPU-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8
1894 // IR-GPU-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
1895 // IR-GPU-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP7]], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]]) #[[ATTR2]]
1896 // IR-GPU-NEXT: ret void
1899 // IR-LABEL: define {{[^@]+}}@_Z3foov
1900 // IR-SAME: () #[[ATTR0:[0-9]+]] {
1901 // IR-NEXT: entry:
1902 // IR-NEXT: [[I:%.*]] = alloca i32, align 4
1903 // IR-NEXT: [[J:%.*]] = alloca i32, align 4
1904 // IR-NEXT: [[SUM:%.*]] = alloca [10 x [10 x i32]], align 16
1905 // IR-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8
1906 // IR-NEXT: [[TMP0:%.*]] = load i32, ptr [[J]], align 4
1907 // IR-NEXT: store i32 [[TMP0]], ptr [[J_CASTED]], align 4
1908 // IR-NEXT: [[TMP1:%.*]] = load i64, ptr [[J_CASTED]], align 8
1909 // IR-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22(i64 [[TMP1]], ptr [[SUM]]) #[[ATTR2:[0-9]+]]
1910 // IR-NEXT: ret i32 0
1913 // IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22
1914 // IR-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
1915 // IR-NEXT: entry:
1916 // IR-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8
1917 // IR-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
1918 // IR-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8
1919 // IR-NEXT: store i64 [[J]], ptr [[J_ADDR]], align 8
1920 // IR-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
1921 // IR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
1922 // IR-NEXT: [[TMP1:%.*]] = load i32, ptr [[J_ADDR]], align 4
1923 // IR-NEXT: store i32 [[TMP1]], ptr [[J_CASTED]], align 4
1924 // IR-NEXT: [[TMP2:%.*]] = load i64, ptr [[J_CASTED]], align 8
1925 // IR-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4:[0-9]+]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined, i64 [[TMP2]], ptr [[TMP0]])
1926 // IR-NEXT: ret void
1929 // IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined
1930 // IR-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
1931 // IR-NEXT: entry:
1932 // IR-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
1933 // IR-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
1934 // IR-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8
1935 // IR-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
1936 // IR-NEXT: [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 16
1937 // IR-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
1938 // IR-NEXT: [[TMP:%.*]] = alloca i32, align 4
1939 // IR-NEXT: [[_TMP2:%.*]] = alloca i32, align 4
1940 // IR-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
1941 // IR-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
1942 // IR-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
1943 // IR-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
1944 // IR-NEXT: [[J3:%.*]] = alloca i32, align 4
1945 // IR-NEXT: [[I:%.*]] = alloca i32, align 4
1946 // IR-NEXT: [[J4:%.*]] = alloca i32, align 4
1947 // IR-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8
1948 // IR-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
1949 // IR-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
1950 // IR-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
1951 // IR-NEXT: store i64 [[J]], ptr [[J_ADDR]], align 8
1952 // IR-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
1953 // IR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
1954 // IR-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1]], i32 0, i32 0, i32 0
1955 // IR-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
1956 // IR-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
1957 // IR-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
1958 // IR: omp.arrayinit.body:
1959 // IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
1960 // IR-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
1961 // IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
1962 // IR-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
1963 // IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
1964 // IR: omp.arrayinit.done:
1965 // IR-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
1966 // IR-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
1967 // IR-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
1968 // IR-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
1969 // IR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
1970 // IR-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
1971 // IR-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
1972 // IR-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
1973 // IR-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
1974 // IR-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
1975 // IR: cond.true:
1976 // IR-NEXT: br label [[COND_END:%.*]]
1977 // IR: cond.false:
1978 // IR-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
1979 // IR-NEXT: br label [[COND_END]]
1980 // IR: cond.end:
1981 // IR-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
1982 // IR-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
1983 // IR-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
1984 // IR-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
1985 // IR-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
1986 // IR: omp.inner.for.cond:
1987 // IR-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
1988 // IR-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
1989 // IR-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
1990 // IR-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
1991 // IR: omp.inner.for.body:
1992 // IR-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
1993 // IR-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
1994 // IR-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
1995 // IR-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
1996 // IR-NEXT: [[TMP13:%.*]] = load i32, ptr [[J3]], align 4
1997 // IR-NEXT: store i32 [[TMP13]], ptr [[J_CASTED]], align 4
1998 // IR-NEXT: [[TMP14:%.*]] = load i64, ptr [[J_CASTED]], align 8
1999 // IR-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined, i64 [[TMP10]], i64 [[TMP12]], i64 [[TMP14]], ptr [[SUM1]])
2000 // IR-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
2001 // IR: omp.inner.for.inc:
2002 // IR-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
2003 // IR-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
2004 // IR-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
2005 // IR-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
2006 // IR-NEXT: br label [[OMP_INNER_FOR_COND]]
2007 // IR: omp.inner.for.end:
2008 // IR-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
2009 // IR: omp.loop.exit:
2010 // IR-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2011 // IR-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
2012 // IR-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP18]])
2013 // IR-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
2014 // IR-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
2015 // IR-NEXT: br i1 [[TMP20]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
2016 // IR: .omp.lastprivate.then:
2017 // IR-NEXT: store i32 10, ptr [[J3]], align 4
2018 // IR-NEXT: [[TMP21:%.*]] = load i32, ptr [[J3]], align 4
2019 // IR-NEXT: store i32 [[TMP21]], ptr [[J_ADDR]], align 4
2020 // IR-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
2021 // IR: .omp.lastprivate.done:
2022 // IR-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
2023 // IR-NEXT: store ptr [[SUM1]], ptr [[TMP22]], align 8
2024 // IR-NEXT: [[TMP23:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2025 // IR-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
2026 // IR-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3:[0-9]+]], i32 [[TMP24]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
2027 // IR-NEXT: switch i32 [[TMP25]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
2028 // IR-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
2029 // IR-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
2030 // IR-NEXT: ]
2031 // IR: .omp.reduction.case1:
2032 // IR-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2033 // IR-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP26]]
2034 // IR-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE10:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2035 // IR: omp.arraycpy.body:
2036 // IR-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2037 // IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST6:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT8:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2038 // IR-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
2039 // IR-NEXT: [[TMP28:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2040 // IR-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
2041 // IR-NEXT: store i32 [[ADD7]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
2042 // IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT8]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], i32 1
2043 // IR-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2044 // IR-NEXT: [[OMP_ARRAYCPY_DONE9:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT8]], [[TMP26]]
2045 // IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE9]], label [[OMP_ARRAYCPY_DONE10]], label [[OMP_ARRAYCPY_BODY]]
2046 // IR: omp.arraycpy.done10:
2047 // IR-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
2048 // IR-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
2049 // IR: .omp.reduction.case2:
2050 // IR-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2051 // IR-NEXT: [[OMP_ARRAYCPY_ISEMPTY11:%.*]] = icmp eq ptr [[TMP0]], [[TMP29]]
2052 // IR-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY11]], label [[OMP_ARRAYCPY_DONE18:%.*]], label [[OMP_ARRAYCPY_BODY12:%.*]]
2053 // IR: omp.arraycpy.body12:
2054 // IR-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST13:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT16:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
2055 // IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST14:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
2056 // IR-NEXT: [[TMP30:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], align 4
2057 // IR-NEXT: [[TMP31:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 [[TMP30]] monotonic, align 4
2058 // IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 1
2059 // IR-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT16]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], i32 1
2060 // IR-NEXT: [[OMP_ARRAYCPY_DONE17:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP29]]
2061 // IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_BODY12]]
2062 // IR: omp.arraycpy.done18:
2063 // IR-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
2064 // IR-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
2065 // IR: .omp.reduction.default:
2066 // IR-NEXT: ret void
2069 // IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined
2070 // IR-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
2071 // IR-NEXT: entry:
2072 // IR-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
2073 // IR-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
2074 // IR-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
2075 // IR-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
2076 // IR-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8
2077 // IR-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
2078 // IR-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
2079 // IR-NEXT: [[TMP:%.*]] = alloca i32, align 4
2080 // IR-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
2081 // IR-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
2082 // IR-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
2083 // IR-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
2084 // IR-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
2085 // IR-NEXT: [[J3:%.*]] = alloca i32, align 4
2086 // IR-NEXT: [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 16
2087 // IR-NEXT: [[I:%.*]] = alloca i32, align 4
2088 // IR-NEXT: [[J5:%.*]] = alloca i32, align 4
2089 // IR-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
2090 // IR-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
2091 // IR-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
2092 // IR-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
2093 // IR-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
2094 // IR-NEXT: store i64 [[J]], ptr [[J_ADDR]], align 8
2095 // IR-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
2096 // IR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
2097 // IR-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
2098 // IR-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
2099 // IR-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
2100 // IR-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
2101 // IR-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
2102 // IR-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
2103 // IR-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
2104 // IR-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
2105 // IR-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
2106 // IR-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
2107 // IR-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i32 0, i32 0, i32 0
2108 // IR-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
2109 // IR-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
2110 // IR-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
2111 // IR: omp.arrayinit.body:
2112 // IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
2113 // IR-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2114 // IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2115 // IR-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
2116 // IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
2117 // IR: omp.arrayinit.done:
2118 // IR-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2119 // IR-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
2120 // IR-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP5]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
2121 // IR-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
2122 // IR-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP6]], 99
2123 // IR-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
2124 // IR: cond.true:
2125 // IR-NEXT: br label [[COND_END:%.*]]
2126 // IR: cond.false:
2127 // IR-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
2128 // IR-NEXT: br label [[COND_END]]
2129 // IR: cond.end:
2130 // IR-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP7]], [[COND_FALSE]] ]
2131 // IR-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
2132 // IR-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
2133 // IR-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV]], align 4
2134 // IR-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
2135 // IR: omp.inner.for.cond:
2136 // IR-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]]
2137 // IR-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP3]]
2138 // IR-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]]
2139 // IR-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
2140 // IR: omp.inner.for.body:
2141 // IR-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2142 // IR-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP11]], 10
2143 // IR-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
2144 // IR-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
2145 // IR-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
2146 // IR-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2147 // IR-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2148 // IR-NEXT: [[DIV7:%.*]] = sdiv i32 [[TMP13]], 10
2149 // IR-NEXT: [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
2150 // IR-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP12]], [[MUL8]]
2151 // IR-NEXT: [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
2152 // IR-NEXT: [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
2153 // IR-NEXT: store i32 [[ADD10]], ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
2154 // IR-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
2155 // IR-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
2156 // IR-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
2157 // IR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i64 0, i64 [[IDXPROM]]
2158 // IR-NEXT: [[TMP16:%.*]] = load i32, ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
2159 // IR-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP16]] to i64
2160 // IR-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
2161 // IR-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
2162 // IR-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP17]], [[TMP14]]
2163 // IR-NEXT: store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
2164 // IR-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
2165 // IR: omp.body.continue:
2166 // IR-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
2167 // IR: omp.inner.for.inc:
2168 // IR-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2169 // IR-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP18]], 1
2170 // IR-NEXT: store i32 [[ADD14]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2171 // IR-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
2172 // IR: omp.inner.for.end:
2173 // IR-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
2174 // IR: omp.loop.exit:
2175 // IR-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2176 // IR-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
2177 // IR-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
2178 // IR-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
2179 // IR-NEXT: store ptr [[SUM4]], ptr [[TMP21]], align 8
2180 // IR-NEXT: [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2181 // IR-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
2182 // IR-NEXT: [[TMP24:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP23]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
2183 // IR-NEXT: switch i32 [[TMP24]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
2184 // IR-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
2185 // IR-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
2186 // IR-NEXT: ]
2187 // IR: .omp.reduction.case1:
2188 // IR-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2189 // IR-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
2190 // IR-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2191 // IR: omp.arraycpy.body:
2192 // IR-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2193 // IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2194 // IR-NEXT: [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
2195 // IR-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2196 // IR-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
2197 // IR-NEXT: store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
2198 // IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
2199 // IR-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2200 // IR-NEXT: [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
2201 // IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
2202 // IR: omp.arraycpy.done19:
2203 // IR-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
2204 // IR-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
2205 // IR: .omp.reduction.case2:
2206 // IR-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2207 // IR-NEXT: [[OMP_ARRAYCPY_ISEMPTY20:%.*]] = icmp eq ptr [[TMP0]], [[TMP28]]
2208 // IR-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY20]], label [[OMP_ARRAYCPY_DONE27:%.*]], label [[OMP_ARRAYCPY_BODY21:%.*]]
2209 // IR: omp.arraycpy.body21:
2210 // IR-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST22:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT25:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
2211 // IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST23:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT24:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
2212 // IR-NEXT: [[TMP29:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], align 4
2213 // IR-NEXT: [[TMP30:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 [[TMP29]] monotonic, align 4
2214 // IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT24]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 1
2215 // IR-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT25]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], i32 1
2216 // IR-NEXT: [[OMP_ARRAYCPY_DONE26:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT24]], [[TMP28]]
2217 // IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE26]], label [[OMP_ARRAYCPY_DONE27]], label [[OMP_ARRAYCPY_BODY21]]
2218 // IR: omp.arraycpy.done27:
2219 // IR-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
2220 // IR-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
2221 // IR: .omp.reduction.default:
2222 // IR-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
2223 // IR-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
2224 // IR-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
2225 // IR: .omp.lastprivate.then:
2226 // IR-NEXT: store i32 10, ptr [[J3]], align 4
2227 // IR-NEXT: [[TMP33:%.*]] = load i32, ptr [[J3]], align 4
2228 // IR-NEXT: store i32 [[TMP33]], ptr [[J_ADDR]], align 4
2229 // IR-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
2230 // IR: .omp.lastprivate.done:
2231 // IR-NEXT: ret void
2234 // IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined.omp.reduction.reduction_func
2235 // IR-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
2236 // IR-NEXT: entry:
2237 // IR-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
2238 // IR-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
2239 // IR-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
2240 // IR-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
2241 // IR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
2242 // IR-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
2243 // IR-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
2244 // IR-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
2245 // IR-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
2246 // IR-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
2247 // IR-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
2248 // IR-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
2249 // IR-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2250 // IR: omp.arraycpy.body:
2251 // IR-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2252 // IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2253 // IR-NEXT: [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2254 // IR-NEXT: [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2255 // IR-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
2256 // IR-NEXT: store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2257 // IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2258 // IR-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2259 // IR-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
2260 // IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
2261 // IR: omp.arraycpy.done2:
2262 // IR-NEXT: ret void
2265 // IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp.reduction.reduction_func
2266 // IR-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
2267 // IR-NEXT: entry:
2268 // IR-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
2269 // IR-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
2270 // IR-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
2271 // IR-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
2272 // IR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
2273 // IR-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
2274 // IR-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
2275 // IR-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
2276 // IR-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
2277 // IR-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
2278 // IR-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
2279 // IR-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
2280 // IR-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2281 // IR: omp.arraycpy.body:
2282 // IR-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2283 // IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2284 // IR-NEXT: [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2285 // IR-NEXT: [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2286 // IR-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
2287 // IR-NEXT: store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2288 // IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2289 // IR-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2290 // IR-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
2291 // IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
2292 // IR: omp.arraycpy.done2:
2293 // IR-NEXT: ret void
2296 // IR-PCH-LABEL: define {{[^@]+}}@_Z3foov
2297 // IR-PCH-SAME: () #[[ATTR0:[0-9]+]] {
2298 // IR-PCH-NEXT: entry:
2299 // IR-PCH-NEXT: [[I:%.*]] = alloca i32, align 4
2300 // IR-PCH-NEXT: [[J:%.*]] = alloca i32, align 4
2301 // IR-PCH-NEXT: [[SUM:%.*]] = alloca [10 x [10 x i32]], align 16
2302 // IR-PCH-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8
2303 // IR-PCH-NEXT: [[TMP0:%.*]] = load i32, ptr [[J]], align 4
2304 // IR-PCH-NEXT: store i32 [[TMP0]], ptr [[J_CASTED]], align 4
2305 // IR-PCH-NEXT: [[TMP1:%.*]] = load i64, ptr [[J_CASTED]], align 8
2306 // IR-PCH-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22(i64 [[TMP1]], ptr [[SUM]]) #[[ATTR2:[0-9]+]]
2307 // IR-PCH-NEXT: ret i32 0
2310 // IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22
2311 // IR-PCH-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
2312 // IR-PCH-NEXT: entry:
2313 // IR-PCH-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8
2314 // IR-PCH-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
2315 // IR-PCH-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8
2316 // IR-PCH-NEXT: store i64 [[J]], ptr [[J_ADDR]], align 8
2317 // IR-PCH-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
2318 // IR-PCH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
2319 // IR-PCH-NEXT: [[TMP1:%.*]] = load i32, ptr [[J_ADDR]], align 4
2320 // IR-PCH-NEXT: store i32 [[TMP1]], ptr [[J_CASTED]], align 4
2321 // IR-PCH-NEXT: [[TMP2:%.*]] = load i64, ptr [[J_CASTED]], align 8
2322 // IR-PCH-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4:[0-9]+]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined, i64 [[TMP2]], ptr [[TMP0]])
2323 // IR-PCH-NEXT: ret void
2326 // IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined
2327 // IR-PCH-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
2328 // IR-PCH-NEXT: entry:
2329 // IR-PCH-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
2330 // IR-PCH-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
2331 // IR-PCH-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8
2332 // IR-PCH-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
2333 // IR-PCH-NEXT: [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 16
2334 // IR-PCH-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
2335 // IR-PCH-NEXT: [[TMP:%.*]] = alloca i32, align 4
2336 // IR-PCH-NEXT: [[_TMP2:%.*]] = alloca i32, align 4
2337 // IR-PCH-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
2338 // IR-PCH-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
2339 // IR-PCH-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
2340 // IR-PCH-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
2341 // IR-PCH-NEXT: [[J3:%.*]] = alloca i32, align 4
2342 // IR-PCH-NEXT: [[I:%.*]] = alloca i32, align 4
2343 // IR-PCH-NEXT: [[J4:%.*]] = alloca i32, align 4
2344 // IR-PCH-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8
2345 // IR-PCH-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
2346 // IR-PCH-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
2347 // IR-PCH-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
2348 // IR-PCH-NEXT: store i64 [[J]], ptr [[J_ADDR]], align 8
2349 // IR-PCH-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
2350 // IR-PCH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
2351 // IR-PCH-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1]], i32 0, i32 0, i32 0
2352 // IR-PCH-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
2353 // IR-PCH-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
2354 // IR-PCH-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
2355 // IR-PCH: omp.arrayinit.body:
2356 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
2357 // IR-PCH-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2358 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2359 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
2360 // IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
2361 // IR-PCH: omp.arrayinit.done:
2362 // IR-PCH-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
2363 // IR-PCH-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
2364 // IR-PCH-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
2365 // IR-PCH-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
2366 // IR-PCH-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2367 // IR-PCH-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
2368 // IR-PCH-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
2369 // IR-PCH-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
2370 // IR-PCH-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
2371 // IR-PCH-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
2372 // IR-PCH: cond.true:
2373 // IR-PCH-NEXT: br label [[COND_END:%.*]]
2374 // IR-PCH: cond.false:
2375 // IR-PCH-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
2376 // IR-PCH-NEXT: br label [[COND_END]]
2377 // IR-PCH: cond.end:
2378 // IR-PCH-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
2379 // IR-PCH-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
2380 // IR-PCH-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
2381 // IR-PCH-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
2382 // IR-PCH-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
2383 // IR-PCH: omp.inner.for.cond:
2384 // IR-PCH-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
2385 // IR-PCH-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
2386 // IR-PCH-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
2387 // IR-PCH-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
2388 // IR-PCH: omp.inner.for.body:
2389 // IR-PCH-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
2390 // IR-PCH-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
2391 // IR-PCH-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
2392 // IR-PCH-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
2393 // IR-PCH-NEXT: [[TMP13:%.*]] = load i32, ptr [[J3]], align 4
2394 // IR-PCH-NEXT: store i32 [[TMP13]], ptr [[J_CASTED]], align 4
2395 // IR-PCH-NEXT: [[TMP14:%.*]] = load i64, ptr [[J_CASTED]], align 8
2396 // IR-PCH-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined, i64 [[TMP10]], i64 [[TMP12]], i64 [[TMP14]], ptr [[SUM1]])
2397 // IR-PCH-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
2398 // IR-PCH: omp.inner.for.inc:
2399 // IR-PCH-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
2400 // IR-PCH-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
2401 // IR-PCH-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
2402 // IR-PCH-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
2403 // IR-PCH-NEXT: br label [[OMP_INNER_FOR_COND]]
2404 // IR-PCH: omp.inner.for.end:
2405 // IR-PCH-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
2406 // IR-PCH: omp.loop.exit:
2407 // IR-PCH-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2408 // IR-PCH-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
2409 // IR-PCH-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP18]])
2410 // IR-PCH-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
2411 // IR-PCH-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
2412 // IR-PCH-NEXT: br i1 [[TMP20]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
2413 // IR-PCH: .omp.lastprivate.then:
2414 // IR-PCH-NEXT: store i32 10, ptr [[J3]], align 4
2415 // IR-PCH-NEXT: [[TMP21:%.*]] = load i32, ptr [[J3]], align 4
2416 // IR-PCH-NEXT: store i32 [[TMP21]], ptr [[J_ADDR]], align 4
2417 // IR-PCH-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
2418 // IR-PCH: .omp.lastprivate.done:
2419 // IR-PCH-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
2420 // IR-PCH-NEXT: store ptr [[SUM1]], ptr [[TMP22]], align 8
2421 // IR-PCH-NEXT: [[TMP23:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2422 // IR-PCH-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
2423 // IR-PCH-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3:[0-9]+]], i32 [[TMP24]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
2424 // IR-PCH-NEXT: switch i32 [[TMP25]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
2425 // IR-PCH-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
2426 // IR-PCH-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
2427 // IR-PCH-NEXT: ]
2428 // IR-PCH: .omp.reduction.case1:
2429 // IR-PCH-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2430 // IR-PCH-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP26]]
2431 // IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE10:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2432 // IR-PCH: omp.arraycpy.body:
2433 // IR-PCH-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2434 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST6:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT8:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2435 // IR-PCH-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
2436 // IR-PCH-NEXT: [[TMP28:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2437 // IR-PCH-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
2438 // IR-PCH-NEXT: store i32 [[ADD7]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
2439 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT8]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], i32 1
2440 // IR-PCH-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2441 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE9:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT8]], [[TMP26]]
2442 // IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE9]], label [[OMP_ARRAYCPY_DONE10]], label [[OMP_ARRAYCPY_BODY]]
2443 // IR-PCH: omp.arraycpy.done10:
2444 // IR-PCH-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
2445 // IR-PCH-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
2446 // IR-PCH: .omp.reduction.case2:
2447 // IR-PCH-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2448 // IR-PCH-NEXT: [[OMP_ARRAYCPY_ISEMPTY11:%.*]] = icmp eq ptr [[TMP0]], [[TMP29]]
2449 // IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY11]], label [[OMP_ARRAYCPY_DONE18:%.*]], label [[OMP_ARRAYCPY_BODY12:%.*]]
2450 // IR-PCH: omp.arraycpy.body12:
2451 // IR-PCH-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST13:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT16:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
2452 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST14:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
2453 // IR-PCH-NEXT: [[TMP30:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], align 4
2454 // IR-PCH-NEXT: [[TMP31:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 [[TMP30]] monotonic, align 4
2455 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 1
2456 // IR-PCH-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT16]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], i32 1
2457 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE17:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP29]]
2458 // IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_BODY12]]
2459 // IR-PCH: omp.arraycpy.done18:
2460 // IR-PCH-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
2461 // IR-PCH-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
2462 // IR-PCH: .omp.reduction.default:
2463 // IR-PCH-NEXT: ret void
2466 // IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined
2467 // IR-PCH-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
2468 // IR-PCH-NEXT: entry:
2469 // IR-PCH-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
2470 // IR-PCH-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
2471 // IR-PCH-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
2472 // IR-PCH-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
2473 // IR-PCH-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8
2474 // IR-PCH-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
2475 // IR-PCH-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
2476 // IR-PCH-NEXT: [[TMP:%.*]] = alloca i32, align 4
2477 // IR-PCH-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
2478 // IR-PCH-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
2479 // IR-PCH-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
2480 // IR-PCH-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
2481 // IR-PCH-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
2482 // IR-PCH-NEXT: [[J3:%.*]] = alloca i32, align 4
2483 // IR-PCH-NEXT: [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 16
2484 // IR-PCH-NEXT: [[I:%.*]] = alloca i32, align 4
2485 // IR-PCH-NEXT: [[J5:%.*]] = alloca i32, align 4
2486 // IR-PCH-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
2487 // IR-PCH-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
2488 // IR-PCH-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
2489 // IR-PCH-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
2490 // IR-PCH-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
2491 // IR-PCH-NEXT: store i64 [[J]], ptr [[J_ADDR]], align 8
2492 // IR-PCH-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
2493 // IR-PCH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
2494 // IR-PCH-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
2495 // IR-PCH-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
2496 // IR-PCH-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
2497 // IR-PCH-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
2498 // IR-PCH-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
2499 // IR-PCH-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
2500 // IR-PCH-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
2501 // IR-PCH-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
2502 // IR-PCH-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
2503 // IR-PCH-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
2504 // IR-PCH-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i32 0, i32 0, i32 0
2505 // IR-PCH-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
2506 // IR-PCH-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
2507 // IR-PCH-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
2508 // IR-PCH: omp.arrayinit.body:
2509 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
2510 // IR-PCH-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2511 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2512 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
2513 // IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
2514 // IR-PCH: omp.arrayinit.done:
2515 // IR-PCH-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2516 // IR-PCH-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
2517 // IR-PCH-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP5]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
2518 // IR-PCH-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
2519 // IR-PCH-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP6]], 99
2520 // IR-PCH-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
2521 // IR-PCH: cond.true:
2522 // IR-PCH-NEXT: br label [[COND_END:%.*]]
2523 // IR-PCH: cond.false:
2524 // IR-PCH-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
2525 // IR-PCH-NEXT: br label [[COND_END]]
2526 // IR-PCH: cond.end:
2527 // IR-PCH-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP7]], [[COND_FALSE]] ]
2528 // IR-PCH-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
2529 // IR-PCH-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
2530 // IR-PCH-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV]], align 4
2531 // IR-PCH-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
2532 // IR-PCH: omp.inner.for.cond:
2533 // IR-PCH-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]]
2534 // IR-PCH-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP3]]
2535 // IR-PCH-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]]
2536 // IR-PCH-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
2537 // IR-PCH: omp.inner.for.body:
2538 // IR-PCH-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2539 // IR-PCH-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP11]], 10
2540 // IR-PCH-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
2541 // IR-PCH-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
2542 // IR-PCH-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
2543 // IR-PCH-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2544 // IR-PCH-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2545 // IR-PCH-NEXT: [[DIV7:%.*]] = sdiv i32 [[TMP13]], 10
2546 // IR-PCH-NEXT: [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
2547 // IR-PCH-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP12]], [[MUL8]]
2548 // IR-PCH-NEXT: [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
2549 // IR-PCH-NEXT: [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
2550 // IR-PCH-NEXT: store i32 [[ADD10]], ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
2551 // IR-PCH-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
2552 // IR-PCH-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
2553 // IR-PCH-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
2554 // IR-PCH-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i64 0, i64 [[IDXPROM]]
2555 // IR-PCH-NEXT: [[TMP16:%.*]] = load i32, ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
2556 // IR-PCH-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP16]] to i64
2557 // IR-PCH-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
2558 // IR-PCH-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
2559 // IR-PCH-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP17]], [[TMP14]]
2560 // IR-PCH-NEXT: store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
2561 // IR-PCH-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
2562 // IR-PCH: omp.body.continue:
2563 // IR-PCH-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
2564 // IR-PCH: omp.inner.for.inc:
2565 // IR-PCH-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2566 // IR-PCH-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP18]], 1
2567 // IR-PCH-NEXT: store i32 [[ADD14]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
2568 // IR-PCH-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
2569 // IR-PCH: omp.inner.for.end:
2570 // IR-PCH-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
2571 // IR-PCH: omp.loop.exit:
2572 // IR-PCH-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2573 // IR-PCH-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
2574 // IR-PCH-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
2575 // IR-PCH-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
2576 // IR-PCH-NEXT: store ptr [[SUM4]], ptr [[TMP21]], align 8
2577 // IR-PCH-NEXT: [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
2578 // IR-PCH-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
2579 // IR-PCH-NEXT: [[TMP24:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP23]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
2580 // IR-PCH-NEXT: switch i32 [[TMP24]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
2581 // IR-PCH-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
2582 // IR-PCH-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
2583 // IR-PCH-NEXT: ]
2584 // IR-PCH: .omp.reduction.case1:
2585 // IR-PCH-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2586 // IR-PCH-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
2587 // IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2588 // IR-PCH: omp.arraycpy.body:
2589 // IR-PCH-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2590 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2591 // IR-PCH-NEXT: [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
2592 // IR-PCH-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2593 // IR-PCH-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
2594 // IR-PCH-NEXT: store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
2595 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
2596 // IR-PCH-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2597 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
2598 // IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
2599 // IR-PCH: omp.arraycpy.done19:
2600 // IR-PCH-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
2601 // IR-PCH-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
2602 // IR-PCH: .omp.reduction.case2:
2603 // IR-PCH-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
2604 // IR-PCH-NEXT: [[OMP_ARRAYCPY_ISEMPTY20:%.*]] = icmp eq ptr [[TMP0]], [[TMP28]]
2605 // IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY20]], label [[OMP_ARRAYCPY_DONE27:%.*]], label [[OMP_ARRAYCPY_BODY21:%.*]]
2606 // IR-PCH: omp.arraycpy.body21:
2607 // IR-PCH-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST22:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT25:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
2608 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST23:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT24:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
2609 // IR-PCH-NEXT: [[TMP29:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], align 4
2610 // IR-PCH-NEXT: [[TMP30:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 [[TMP29]] monotonic, align 4
2611 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT24]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 1
2612 // IR-PCH-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT25]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], i32 1
2613 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE26:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT24]], [[TMP28]]
2614 // IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE26]], label [[OMP_ARRAYCPY_DONE27]], label [[OMP_ARRAYCPY_BODY21]]
2615 // IR-PCH: omp.arraycpy.done27:
2616 // IR-PCH-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
2617 // IR-PCH-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
2618 // IR-PCH: .omp.reduction.default:
2619 // IR-PCH-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
2620 // IR-PCH-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
2621 // IR-PCH-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
2622 // IR-PCH: .omp.lastprivate.then:
2623 // IR-PCH-NEXT: store i32 10, ptr [[J3]], align 4
2624 // IR-PCH-NEXT: [[TMP33:%.*]] = load i32, ptr [[J3]], align 4
2625 // IR-PCH-NEXT: store i32 [[TMP33]], ptr [[J_ADDR]], align 4
2626 // IR-PCH-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
2627 // IR-PCH: .omp.lastprivate.done:
2628 // IR-PCH-NEXT: ret void
2631 // IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined.omp.reduction.reduction_func
2632 // IR-PCH-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
2633 // IR-PCH-NEXT: entry:
2634 // IR-PCH-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
2635 // IR-PCH-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
2636 // IR-PCH-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
2637 // IR-PCH-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
2638 // IR-PCH-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
2639 // IR-PCH-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
2640 // IR-PCH-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
2641 // IR-PCH-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
2642 // IR-PCH-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
2643 // IR-PCH-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
2644 // IR-PCH-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
2645 // IR-PCH-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
2646 // IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2647 // IR-PCH: omp.arraycpy.body:
2648 // IR-PCH-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2649 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2650 // IR-PCH-NEXT: [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2651 // IR-PCH-NEXT: [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2652 // IR-PCH-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
2653 // IR-PCH-NEXT: store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2654 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2655 // IR-PCH-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2656 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
2657 // IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
2658 // IR-PCH: omp.arraycpy.done2:
2659 // IR-PCH-NEXT: ret void
2662 // IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp.reduction.reduction_func
2663 // IR-PCH-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
2664 // IR-PCH-NEXT: entry:
2665 // IR-PCH-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
2666 // IR-PCH-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
2667 // IR-PCH-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
2668 // IR-PCH-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
2669 // IR-PCH-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
2670 // IR-PCH-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
2671 // IR-PCH-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
2672 // IR-PCH-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
2673 // IR-PCH-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
2674 // IR-PCH-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
2675 // IR-PCH-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
2676 // IR-PCH-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
2677 // IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
2678 // IR-PCH: omp.arraycpy.body:
2679 // IR-PCH-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2680 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
2681 // IR-PCH-NEXT: [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2682 // IR-PCH-NEXT: [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
2683 // IR-PCH-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
2684 // IR-PCH-NEXT: store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
2685 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
2686 // IR-PCH-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
2687 // IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
2688 // IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
2689 // IR-PCH: omp.arraycpy.done2:
2690 // IR-PCH-NEXT: ret void