1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=riscv64 -mattr=+v -S %s | FileCheck --check-prefixes=CHECK,NON-POW2 %s
3 ; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=riscv64 -mattr=+v -S %s | FileCheck --check-prefixes=CHECK,POW2-ONLY %s
; Three consecutive i32 loads, each multiplied by the constant 10, stored
; contiguously. NON-POW2 vectorizes the whole chain as <3 x i32>; POW2-ONLY
; splits it into a <2 x i32> vector part plus a scalar lane 2.
5 define void @v3_load_i32_mul_by_constant_store(ptr %src, ptr %dst) {
6 ; NON-POW2-LABEL: @v3_load_i32_mul_by_constant_store(
7 ; NON-POW2-NEXT: entry:
8 ; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
9 ; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
10 ; NON-POW2-NEXT: [[TMP1:%.*]] = mul nsw <3 x i32> [[TMP0]], splat (i32 10)
11 ; NON-POW2-NEXT: store <3 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
12 ; NON-POW2-NEXT: ret void
14 ; POW2-ONLY-LABEL: @v3_load_i32_mul_by_constant_store(
15 ; POW2-ONLY-NEXT: entry:
16 ; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
17 ; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
18 ; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
19 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
20 ; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_0]], align 4
21 ; POW2-ONLY-NEXT: [[TMP1:%.*]] = mul nsw <2 x i32> [[TMP0]], splat (i32 10)
22 ; POW2-ONLY-NEXT: store <2 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
23 ; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
24 ; POW2-ONLY-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
25 ; POW2-ONLY-NEXT: ret void
28 %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
29 %l.src.0 = load i32, ptr %gep.src.0, align 4
30 %mul.0 = mul nsw i32 %l.src.0, 10
32 %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
33 %l.src.1 = load i32, ptr %gep.src.1, align 4
34 %mul.1 = mul nsw i32 %l.src.1, 10
36 %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
37 %l.src.2 = load i32, ptr %gep.src.2, align 4
38 %mul.2 = mul nsw i32 %l.src.2, 10
40 store i32 %mul.0, ptr %dst
42 %dst.1 = getelementptr i32, ptr %dst, i32 1
43 store i32 %mul.1, ptr %dst.1
45 %dst.2 = getelementptr i32, ptr %dst, i32 2
46 store i32 %mul.2, ptr %dst.2
51 ; Should not be vectorized with an undef/poison element as padding, as
52 ; division by undef/poison may cause UB. Must use VL predication or
53 ; masking instead, where RISCV wins.
; Same v3 pattern but with udiv (constant numerator, loaded divisor).
; NON-POW2 still vectorizes as an exact <3 x i32> (no padding lanes needed);
; POW2-ONLY keeps all three udivs scalar, since padding to a power-of-2
; width would require an extra divisor lane.
54 define void @v3_load_i32_udiv_by_constant_store(ptr %src, ptr %dst) {
55 ; NON-POW2-LABEL: @v3_load_i32_udiv_by_constant_store(
56 ; NON-POW2-NEXT: entry:
57 ; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
58 ; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
59 ; NON-POW2-NEXT: [[TMP1:%.*]] = udiv <3 x i32> splat (i32 10), [[TMP0]]
60 ; NON-POW2-NEXT: store <3 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
61 ; NON-POW2-NEXT: ret void
63 ; POW2-ONLY-LABEL: @v3_load_i32_udiv_by_constant_store(
64 ; POW2-ONLY-NEXT: entry:
65 ; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
66 ; POW2-ONLY-NEXT: [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
67 ; POW2-ONLY-NEXT: [[MUL_0:%.*]] = udiv i32 10, [[L_SRC_0]]
68 ; POW2-ONLY-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
69 ; POW2-ONLY-NEXT: [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
70 ; POW2-ONLY-NEXT: [[MUL_1:%.*]] = udiv i32 10, [[L_SRC_1]]
71 ; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
72 ; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
73 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = udiv i32 10, [[L_SRC_2]]
74 ; POW2-ONLY-NEXT: store i32 [[MUL_0]], ptr [[DST:%.*]], align 4
75 ; POW2-ONLY-NEXT: [[DST_1:%.*]] = getelementptr i32, ptr [[DST]], i32 1
76 ; POW2-ONLY-NEXT: store i32 [[MUL_1]], ptr [[DST_1]], align 4
77 ; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
78 ; POW2-ONLY-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
79 ; POW2-ONLY-NEXT: ret void
82 %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
83 %l.src.0 = load i32, ptr %gep.src.0, align 4
84 %mul.0 = udiv i32 10, %l.src.0
86 %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
87 %l.src.1 = load i32, ptr %gep.src.1, align 4
88 %mul.1 = udiv i32 10, %l.src.1
90 %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
91 %l.src.2 = load i32, ptr %gep.src.2, align 4
92 %mul.2 = udiv i32 10, %l.src.2
94 store i32 %mul.0, ptr %dst
96 %dst.1 = getelementptr i32, ptr %dst, i32 1
97 store i32 %mul.1, ptr %dst.1
99 %dst.2 = getelementptr i32, ptr %dst, i32 2
100 store i32 %mul.2, ptr %dst.2
; v3 element-wise multiply of two loaded operand vectors. NON-POW2 emits a
; single <3 x i32> mul; POW2-ONLY emits a <2 x i32> mul plus a scalar mul for
; lane 2.
107 define void @v3_load_i32_mul_store(ptr %src.1, ptr %src.2, ptr %dst) {
108 ; NON-POW2-LABEL: @v3_load_i32_mul_store(
109 ; NON-POW2-NEXT: entry:
110 ; NON-POW2-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
111 ; NON-POW2-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
112 ; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_1_0]], align 4
113 ; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_2_0]], align 4
114 ; NON-POW2-NEXT: [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP0]], [[TMP1]]
115 ; NON-POW2-NEXT: store <3 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
116 ; NON-POW2-NEXT: ret void
118 ; POW2-ONLY-LABEL: @v3_load_i32_mul_store(
119 ; POW2-ONLY-NEXT: entry:
120 ; POW2-ONLY-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
121 ; POW2-ONLY-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
122 ; POW2-ONLY-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
123 ; POW2-ONLY-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
124 ; POW2-ONLY-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
125 ; POW2-ONLY-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
126 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
127 ; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
128 ; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
129 ; POW2-ONLY-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
130 ; POW2-ONLY-NEXT: store <2 x i32> [[TMP2]], ptr [[DST:%.*]], align 4
131 ; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
132 ; POW2-ONLY-NEXT: store i32 [[MUL_2]], ptr [[DST_2]], align 4
133 ; POW2-ONLY-NEXT: ret void
136 %gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0
137 %l.src.1.0 = load i32, ptr %gep.src.1.0, align 4
138 %gep.src.2.0 = getelementptr inbounds i32, ptr %src.2, i32 0
139 %l.src.2.0 = load i32, ptr %gep.src.2.0, align 4
140 %mul.0 = mul nsw i32 %l.src.1.0, %l.src.2.0
142 %gep.src.1.1 = getelementptr inbounds i32, ptr %src.1, i32 1
143 %l.src.1.1 = load i32, ptr %gep.src.1.1, align 4
144 %gep.src.2.1 = getelementptr inbounds i32, ptr %src.2, i32 1
145 %l.src.2.1 = load i32, ptr %gep.src.2.1, align 4
146 %mul.1 = mul nsw i32 %l.src.1.1, %l.src.2.1
148 %gep.src.1.2 = getelementptr inbounds i32, ptr %src.1, i32 2
149 %l.src.1.2 = load i32, ptr %gep.src.1.2, align 4
150 %gep.src.2.2 = getelementptr inbounds i32, ptr %src.2, i32 2
151 %l.src.2.2 = load i32, ptr %gep.src.2.2, align 4
152 %mul.2 = mul nsw i32 %l.src.1.2, %l.src.2.2
154 store i32 %mul.0, ptr %dst
156 %dst.1 = getelementptr i32, ptr %dst, i32 1
157 store i32 %mul.1, ptr %dst.1
159 %dst.2 = getelementptr i32, ptr %dst, i32 2
160 store i32 %mul.2, ptr %dst.2
; v3 mul followed by add-of-constant 9 before the store, checking that a
; two-instruction chain still vectorizes: <3 x i32> under NON-POW2, and a
; <2 x i32> + scalar lane-2 split under POW2-ONLY.
165 define void @v3_load_i32_mul_add_const_store(ptr %src.1, ptr %src.2, ptr %dst) {
166 ; NON-POW2-LABEL: @v3_load_i32_mul_add_const_store(
167 ; NON-POW2-NEXT: entry:
168 ; NON-POW2-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
169 ; NON-POW2-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
170 ; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr [[GEP_SRC_1_0]], align 4
171 ; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_2_0]], align 4
172 ; NON-POW2-NEXT: [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP0]], [[TMP1]]
173 ; NON-POW2-NEXT: [[TMP3:%.*]] = add <3 x i32> [[TMP2]], splat (i32 9)
174 ; NON-POW2-NEXT: store <3 x i32> [[TMP3]], ptr [[DST:%.*]], align 4
175 ; NON-POW2-NEXT: ret void
177 ; POW2-ONLY-LABEL: @v3_load_i32_mul_add_const_store(
178 ; POW2-ONLY-NEXT: entry:
179 ; POW2-ONLY-NEXT: [[GEP_SRC_1_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_1:%.*]], i32 0
180 ; POW2-ONLY-NEXT: [[GEP_SRC_2_0:%.*]] = getelementptr inbounds i32, ptr [[SRC_2:%.*]], i32 0
181 ; POW2-ONLY-NEXT: [[GEP_SRC_1_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_1]], i32 2
182 ; POW2-ONLY-NEXT: [[L_SRC_1_2:%.*]] = load i32, ptr [[GEP_SRC_1_2]], align 4
183 ; POW2-ONLY-NEXT: [[GEP_SRC_2_2:%.*]] = getelementptr inbounds i32, ptr [[SRC_2]], i32 2
184 ; POW2-ONLY-NEXT: [[L_SRC_2_2:%.*]] = load i32, ptr [[GEP_SRC_2_2]], align 4
185 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_1_2]], [[L_SRC_2_2]]
186 ; POW2-ONLY-NEXT: [[ADD_2:%.*]] = add i32 [[MUL_2]], 9
187 ; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[GEP_SRC_1_0]], align 4
188 ; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_SRC_2_0]], align 4
189 ; POW2-ONLY-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP0]], [[TMP1]]
190 ; POW2-ONLY-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], splat (i32 9)
191 ; POW2-ONLY-NEXT: store <2 x i32> [[TMP3]], ptr [[DST:%.*]], align 4
192 ; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
193 ; POW2-ONLY-NEXT: store i32 [[ADD_2]], ptr [[DST_2]], align 4
194 ; POW2-ONLY-NEXT: ret void
197 %gep.src.1.0 = getelementptr inbounds i32, ptr %src.1, i32 0
198 %l.src.1.0 = load i32, ptr %gep.src.1.0, align 4
199 %gep.src.2.0 = getelementptr inbounds i32, ptr %src.2, i32 0
200 %l.src.2.0 = load i32, ptr %gep.src.2.0, align 4
201 %mul.0 = mul nsw i32 %l.src.1.0, %l.src.2.0
202 %add.0 = add i32 %mul.0, 9
204 %gep.src.1.1 = getelementptr inbounds i32, ptr %src.1, i32 1
205 %l.src.1.1 = load i32, ptr %gep.src.1.1, align 4
206 %gep.src.2.1 = getelementptr inbounds i32, ptr %src.2, i32 1
207 %l.src.2.1 = load i32, ptr %gep.src.2.1, align 4
208 %mul.1 = mul nsw i32 %l.src.1.1, %l.src.2.1
209 %add.1 = add i32 %mul.1, 9
211 %gep.src.1.2 = getelementptr inbounds i32, ptr %src.1, i32 2
212 %l.src.1.2 = load i32, ptr %gep.src.1.2, align 4
213 %gep.src.2.2 = getelementptr inbounds i32, ptr %src.2, i32 2
214 %l.src.2.2 = load i32, ptr %gep.src.2.2, align 4
215 %mul.2 = mul nsw i32 %l.src.1.2, %l.src.2.2
216 %add.2 = add i32 %mul.2, 9
218 store i32 %add.0, ptr %dst
220 %dst.1 = getelementptr i32, ptr %dst, i32 1
221 store i32 %add.1, ptr %dst.1
223 %dst.2 = getelementptr i32, ptr %dst, i32 2
224 store i32 %add.2, ptr %dst.2
; Floating-point variant of the v3 constant-op chain (fadd of 10.0).
; NON-POW2 vectorizes as <3 x float>; POW2-ONLY uses <2 x float> plus a
; scalar fadd for lane 2.
229 define void @v3_load_f32_fadd_fadd_by_constant_store(ptr %src, ptr %dst) {
230 ; NON-POW2-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
231 ; NON-POW2-NEXT: entry:
232 ; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
233 ; NON-POW2-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4
234 ; NON-POW2-NEXT: [[TMP1:%.*]] = fadd <3 x float> [[TMP0]], splat (float 1.000000e+01)
235 ; NON-POW2-NEXT: store <3 x float> [[TMP1]], ptr [[DST:%.*]], align 4
236 ; NON-POW2-NEXT: ret void
238 ; POW2-ONLY-LABEL: @v3_load_f32_fadd_fadd_by_constant_store(
239 ; POW2-ONLY-NEXT: entry:
240 ; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
241 ; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
242 ; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
243 ; POW2-ONLY-NEXT: [[FADD_2:%.*]] = fadd float [[L_SRC_2]], 1.000000e+01
244 ; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[GEP_SRC_0]], align 4
245 ; POW2-ONLY-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[TMP0]], splat (float 1.000000e+01)
246 ; POW2-ONLY-NEXT: store <2 x float> [[TMP1]], ptr [[DST:%.*]], align 4
247 ; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr float, ptr [[DST]], i32 2
248 ; POW2-ONLY-NEXT: store float [[FADD_2]], ptr [[DST_2]], align 4
249 ; POW2-ONLY-NEXT: ret void
252 %gep.src.0 = getelementptr inbounds float, ptr %src, i32 0
253 %l.src.0 = load float , ptr %gep.src.0, align 4
254 %fadd.0 = fadd float %l.src.0, 10.0
256 %gep.src.1 = getelementptr inbounds float , ptr %src, i32 1
257 %l.src.1 = load float, ptr %gep.src.1, align 4
258 %fadd.1 = fadd float %l.src.1, 10.0
260 %gep.src.2 = getelementptr inbounds float, ptr %src, i32 2
261 %l.src.2 = load float, ptr %gep.src.2, align 4
262 %fadd.2 = fadd float %l.src.2, 10.0
264 store float %fadd.0, ptr %dst
266 %dst.1 = getelementptr float, ptr %dst, i32 1
267 store float %fadd.1, ptr %dst.1
269 %dst.2 = getelementptr float, ptr %dst, i32 2
270 store float %fadd.2, ptr %dst.2
; Stores fed by three parallel phis (one incoming edge is an unreachable
; block, so its lanes become poison/0). NON-POW2 merges them into a single
; <3 x i32> phi; POW2-ONLY forms a <2 x i32> phi plus a scalar phi for
; lane 2.
275 define void @phi_store3(ptr %dst) {
276 ; NON-POW2-LABEL: @phi_store3(
277 ; NON-POW2-NEXT: entry:
278 ; NON-POW2-NEXT: br label [[EXIT:%.*]]
279 ; NON-POW2: invoke.cont8.loopexit:
280 ; NON-POW2-NEXT: br label [[EXIT]]
282 ; NON-POW2-NEXT: [[TMP0:%.*]] = phi <3 x i32> [ <i32 1, i32 2, i32 3>, [[ENTRY:%.*]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
283 ; NON-POW2-NEXT: store <3 x i32> [[TMP0]], ptr [[DST:%.*]], align 4
284 ; NON-POW2-NEXT: ret void
286 ; POW2-ONLY-LABEL: @phi_store3(
287 ; POW2-ONLY-NEXT: entry:
288 ; POW2-ONLY-NEXT: br label [[EXIT:%.*]]
289 ; POW2-ONLY: invoke.cont8.loopexit:
290 ; POW2-ONLY-NEXT: br label [[EXIT]]
292 ; POW2-ONLY-NEXT: [[P_2:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ 0, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
293 ; POW2-ONLY-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ <i32 1, i32 2>, [[ENTRY]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT]] ]
294 ; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 2
295 ; POW2-ONLY-NEXT: store <2 x i32> [[TMP0]], ptr [[DST]], align 4
296 ; POW2-ONLY-NEXT: store i32 [[P_2]], ptr [[DST_2]], align 4
297 ; POW2-ONLY-NEXT: ret void
302 invoke.cont8.loopexit: ; No predecessors!
306 %p.0 = phi i32 [ 1, %entry ], [ 0, %invoke.cont8.loopexit ]
307 %p.1 = phi i32 [ 2, %entry ], [ 0, %invoke.cont8.loopexit ]
308 %p.2 = phi i32 [ 3, %entry ], [ 0, %invoke.cont8.loopexit ]
310 %dst.1 = getelementptr i32, ptr %dst, i32 1
311 %dst.2 = getelementptr i32, ptr %dst, i32 2
313 store i32 %p.0, ptr %dst, align 4
314 store i32 %p.1, ptr %dst.1, align 4
315 store i32 %p.2, ptr %dst.2, align 4
; Three stores of zero-valued expressions. NON-POW2 collapses them into one
; <3 x i32> zeroinitializer store; POW2-ONLY keeps the first store scalar and
; vectorizes the trailing pair at offset 1 as <2 x i32> zeroinitializer —
; exercising the store-group reordering logic.
; NOTE(review): the definition of %add is not visible in this chunk.
319 define void @store_try_reorder(ptr %dst) {
320 ; NON-POW2-LABEL: @store_try_reorder(
321 ; NON-POW2-NEXT: entry:
322 ; NON-POW2-NEXT: store <3 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
323 ; NON-POW2-NEXT: ret void
325 ; POW2-ONLY-LABEL: @store_try_reorder(
326 ; POW2-ONLY-NEXT: entry:
327 ; POW2-ONLY-NEXT: [[ADD:%.*]] = add i32 0, 0
328 ; POW2-ONLY-NEXT: store i32 [[ADD]], ptr [[DST:%.*]], align 4
329 ; POW2-ONLY-NEXT: [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
330 ; POW2-ONLY-NEXT: store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
331 ; POW2-ONLY-NEXT: ret void
335 store i32 %add, ptr %dst, align 4
336 %add207 = sub i32 0, 0
337 %arrayidx.i1887 = getelementptr i32, ptr %dst, i64 1
338 store i32 %add207, ptr %arrayidx.i1887, align 4
339 %add216 = sub i32 0, 0
340 %arrayidx.i1891 = getelementptr i32, ptr %dst, i64 2
341 store i32 %add216, ptr %arrayidx.i1891, align 4
; Cost-model check for a v3 fpext/fmuladd/fptrunc chain built from a single
; splatted float. NON-POW2 emits the whole chain as <3 x ...> (including a
; v3f64 fmuladd intrinsic); POW2-ONLY vectorizes two lanes and leaves lane 2
; as scalar fpext/fmuladd/fptrunc.
345 define void @vec3_fpext_cost(ptr %Colour, float %0) {
346 ; NON-POW2-LABEL: @vec3_fpext_cost(
347 ; NON-POW2-NEXT: entry:
348 ; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 0
349 ; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> zeroinitializer
350 ; NON-POW2-NEXT: [[TMP3:%.*]] = fpext <3 x float> [[TMP2]] to <3 x double>
351 ; NON-POW2-NEXT: [[TMP4:%.*]] = call <3 x double> @llvm.fmuladd.v3f64(<3 x double> [[TMP3]], <3 x double> zeroinitializer, <3 x double> zeroinitializer)
352 ; NON-POW2-NEXT: [[TMP5:%.*]] = fptrunc <3 x double> [[TMP4]] to <3 x float>
353 ; NON-POW2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR:%.*]], align 4
354 ; NON-POW2-NEXT: ret void
356 ; POW2-ONLY-LABEL: @vec3_fpext_cost(
357 ; POW2-ONLY-NEXT: entry:
358 ; POW2-ONLY-NEXT: [[ARRAYIDX80:%.*]] = getelementptr float, ptr [[COLOUR:%.*]], i64 2
359 ; POW2-ONLY-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0:%.*]], i32 0
360 ; POW2-ONLY-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
361 ; POW2-ONLY-NEXT: [[TMP3:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double>
362 ; POW2-ONLY-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP3]], <2 x double> zeroinitializer, <2 x double> zeroinitializer)
363 ; POW2-ONLY-NEXT: [[TMP5:%.*]] = fptrunc <2 x double> [[TMP4]] to <2 x float>
364 ; POW2-ONLY-NEXT: store <2 x float> [[TMP5]], ptr [[COLOUR]], align 4
365 ; POW2-ONLY-NEXT: [[CONV78:%.*]] = fpext float [[TMP0]] to double
366 ; POW2-ONLY-NEXT: [[TMP6:%.*]] = call double @llvm.fmuladd.f64(double [[CONV78]], double 0.000000e+00, double 0.000000e+00)
367 ; POW2-ONLY-NEXT: [[CONV82:%.*]] = fptrunc double [[TMP6]] to float
368 ; POW2-ONLY-NEXT: store float [[CONV82]], ptr [[ARRAYIDX80]], align 4
369 ; POW2-ONLY-NEXT: ret void
372 %arrayidx72 = getelementptr float, ptr %Colour, i64 1
373 %arrayidx80 = getelementptr float, ptr %Colour, i64 2
374 %conv62 = fpext float %0 to double
375 %1 = call double @llvm.fmuladd.f64(double %conv62, double 0.000000e+00, double 0.000000e+00)
376 %conv66 = fptrunc double %1 to float
377 store float %conv66, ptr %Colour, align 4
378 %conv70 = fpext float %0 to double
379 %2 = call double @llvm.fmuladd.f64(double %conv70, double 0.000000e+00, double 0.000000e+00)
380 %conv74 = fptrunc double %2 to float
381 store float %conv74, ptr %arrayidx72, align 4
382 %conv78 = fpext float %0 to double
383 %3 = call double @llvm.fmuladd.f64(double %conv78, double 0.000000e+00, double 0.000000e+00)
384 %conv82 = fptrunc double %3 to float
385 store float %conv82, ptr %arrayidx80, align 4
; Three stores of the same fptrunc'd scalar. The shared CHECK prefix shows
; neither run vectorizes this — all three stores stay scalar.
389 define void @fpext_scatter(ptr %dst, double %conv) {
390 ; CHECK-LABEL: @fpext_scatter(
392 ; CHECK-NEXT: [[CONV25:%.*]] = fptrunc double [[CONV:%.*]] to float
393 ; CHECK-NEXT: [[LENGTHS:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 0
394 ; CHECK-NEXT: store float [[CONV25]], ptr [[LENGTHS]], align 4
395 ; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr float, ptr [[DST]], i64 1
396 ; CHECK-NEXT: store float [[CONV25]], ptr [[ARRAYIDX32]], align 4
397 ; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr float, ptr [[DST]], i64 2
398 ; CHECK-NEXT: store float [[CONV25]], ptr [[ARRAYIDX37]], align 4
399 ; CHECK-NEXT: ret void
402 %conv25 = fptrunc double %conv to float
403 %Lengths = getelementptr float, ptr %dst, i64 0
404 store float %conv25, ptr %Lengths, align 4
405 %arrayidx32 = getelementptr float, ptr %dst, i64 1
406 store float %conv25, ptr %arrayidx32, align 4
407 %arrayidx37 = getelementptr float, ptr %dst, i64 2
408 store float %conv25, ptr %arrayidx37, align 4
; Plain 3-element integer add reduction. The shared CHECK prefix shows it is
; not vectorized in either configuration — loads and adds remain scalar.
412 define i32 @reduce_add(ptr %src) {
413 ; CHECK-LABEL: @reduce_add(
414 ; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
415 ; CHECK-NEXT: [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
416 ; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
417 ; CHECK-NEXT: [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
418 ; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
419 ; CHECK-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
420 ; CHECK-NEXT: [[ADD_0:%.*]] = add i32 [[L_SRC_0]], [[L_SRC_1]]
421 ; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[L_SRC_2]]
422 ; CHECK-NEXT: ret i32 [[ADD_1]]
424 %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
425 %l.src.0 = load i32, ptr %gep.src.0, align 4
426 %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
427 %l.src.1 = load i32, ptr %gep.src.1, align 4
428 %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
429 %l.src.2 = load i32, ptr %gep.src.2, align 4
431 %add.0 = add i32 %l.src.0, %l.src.1
432 %add.1 = add i32 %add.0, %l.src.2
; Fast-math 3-element fadd reduction. NON-POW2 turns it into a <3 x float>
; load plus llvm.vector.reduce.fadd.v3f32; POW2-ONLY keeps everything scalar.
436 define float @reduce_fadd(ptr %src) {
437 ; NON-POW2-LABEL: @reduce_fadd(
438 ; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
439 ; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_SRC_0]], align 4
440 ; NON-POW2-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP1]])
441 ; NON-POW2-NEXT: ret float [[TMP2]]
443 ; POW2-ONLY-LABEL: @reduce_fadd(
444 ; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i32 0
445 ; POW2-ONLY-NEXT: [[L_SRC_0:%.*]] = load float, ptr [[GEP_SRC_0]], align 4
446 ; POW2-ONLY-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 1
447 ; POW2-ONLY-NEXT: [[L_SRC_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
448 ; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i32 2
449 ; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
450 ; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[L_SRC_0]], [[L_SRC_1]]
451 ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[L_SRC_2]]
452 ; POW2-ONLY-NEXT: ret float [[ADD_1]]
454 %gep.src.0 = getelementptr inbounds float, ptr %src, i32 0
455 %l.src.0 = load float, ptr %gep.src.0, align 4
456 %gep.src.1 = getelementptr inbounds float, ptr %src, i32 1
457 %l.src.1 = load float, ptr %gep.src.1, align 4
458 %gep.src.2 = getelementptr inbounds float, ptr %src, i32 2
459 %l.src.2 = load float, ptr %gep.src.2, align 4
461 %add.0 = fadd fast float %l.src.0, %l.src.1
462 %add.1 = fadd fast float %add.0, %l.src.2
; Integer add reduction over multiply-by-10 results. NON-POW2 emits a
; <3 x i32> mul feeding llvm.vector.reduce.add.v3i32; POW2-ONLY stays scalar.
466 define i32 @reduce_add_after_mul(ptr %src) {
467 ; NON-POW2-LABEL: @reduce_add_after_mul(
468 ; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
469 ; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_SRC_0]], align 4
470 ; NON-POW2-NEXT: [[TMP2:%.*]] = mul nsw <3 x i32> [[TMP1]], splat (i32 10)
471 ; NON-POW2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP2]])
472 ; NON-POW2-NEXT: ret i32 [[TMP3]]
474 ; POW2-ONLY-LABEL: @reduce_add_after_mul(
475 ; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 0
476 ; POW2-ONLY-NEXT: [[L_SRC_0:%.*]] = load i32, ptr [[GEP_SRC_0]], align 4
477 ; POW2-ONLY-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 1
478 ; POW2-ONLY-NEXT: [[L_SRC_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 4
479 ; POW2-ONLY-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 2
480 ; POW2-ONLY-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
481 ; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_SRC_0]], 10
482 ; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_SRC_1]], 10
483 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_SRC_2]], 10
484 ; POW2-ONLY-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
485 ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
486 ; POW2-ONLY-NEXT: ret i32 [[ADD_1]]
488 %gep.src.0 = getelementptr inbounds i32, ptr %src, i32 0
489 %l.src.0 = load i32, ptr %gep.src.0, align 4
490 %gep.src.1 = getelementptr inbounds i32, ptr %src, i32 1
491 %l.src.1 = load i32, ptr %gep.src.1, align 4
492 %gep.src.2 = getelementptr inbounds i32, ptr %src, i32 2
493 %l.src.2 = load i32, ptr %gep.src.2, align 4
495 %mul.0 = mul nsw i32 %l.src.0, 10
496 %mul.1 = mul nsw i32 %l.src.1, 10
497 %mul.2 = mul nsw i32 %l.src.2, 10
499 %add.0 = add i32 %mul.0, %mul.1
500 %add.1 = add i32 %add.0, %mul.2
; 3-element integer dot product. NON-POW2 vectorizes loads and muls as
; <3 x i32> and reduces with llvm.vector.reduce.add.v3i32; POW2-ONLY keeps
; the whole computation scalar.
504 define i32 @dot_product_i32(ptr %a, ptr %b) {
505 ; NON-POW2-LABEL: @dot_product_i32(
506 ; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
507 ; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
508 ; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_A_0]], align 4
509 ; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x i32>, ptr [[GEP_B_0]], align 4
510 ; NON-POW2-NEXT: [[TMP3:%.*]] = mul nsw <3 x i32> [[TMP1]], [[TMP2]]
511 ; NON-POW2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP3]])
512 ; NON-POW2-NEXT: ret i32 [[TMP4]]
514 ; POW2-ONLY-LABEL: @dot_product_i32(
515 ; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
516 ; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
517 ; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
518 ; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
519 ; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
520 ; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
521 ; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
522 ; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
523 ; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
524 ; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
525 ; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
526 ; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
527 ; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
528 ; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
529 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
530 ; POW2-ONLY-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
531 ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
532 ; POW2-ONLY-NEXT: ret i32 [[ADD_1]]
534 %gep.a.0 = getelementptr inbounds i32, ptr %a, i32 0
535 %l.a.0 = load i32, ptr %gep.a.0, align 4
536 %gep.a.1 = getelementptr inbounds i32, ptr %a, i32 1
537 %l.a.1 = load i32, ptr %gep.a.1, align 4
538 %gep.a.2 = getelementptr inbounds i32, ptr %a, i32 2
539 %l.a.2 = load i32, ptr %gep.a.2, align 4
541 %gep.b.0 = getelementptr inbounds i32, ptr %b, i32 0
542 %l.b.0 = load i32, ptr %gep.b.0, align 4
543 %gep.b.1 = getelementptr inbounds i32, ptr %b, i32 1
544 %l.b.1 = load i32, ptr %gep.b.1, align 4
545 %gep.b.2 = getelementptr inbounds i32, ptr %b, i32 2
546 %l.b.2 = load i32, ptr %gep.b.2, align 4
548 %mul.0 = mul nsw i32 %l.a.0, %l.b.0
549 %mul.1 = mul nsw i32 %l.a.1, %l.b.1
550 %mul.2 = mul nsw i32 %l.a.2, %l.b.2
552 %add.0 = add i32 %mul.0, %mul.1
553 %add.1 = add i32 %add.0, %mul.2
557 ; Same as above, except the reduction order has been perturbed. This
558 ; is checking for our ability to reorder.
; (The only IR difference is %add.0 = add %mul.1, %mul.0; NON-POW2 still
; produces the identical <3 x i32> reduction, POW2-ONLY stays scalar.)
559 define i32 @dot_product_i32_reorder(ptr %a, ptr %b) {
560 ; NON-POW2-LABEL: @dot_product_i32_reorder(
561 ; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
562 ; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
563 ; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[GEP_A_0]], align 4
564 ; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x i32>, ptr [[GEP_B_0]], align 4
565 ; NON-POW2-NEXT: [[TMP3:%.*]] = mul nsw <3 x i32> [[TMP1]], [[TMP2]]
566 ; NON-POW2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[TMP3]])
567 ; NON-POW2-NEXT: ret i32 [[TMP4]]
569 ; POW2-ONLY-LABEL: @dot_product_i32_reorder(
570 ; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
571 ; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
572 ; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
573 ; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
574 ; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
575 ; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
576 ; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
577 ; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
578 ; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
579 ; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
580 ; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
581 ; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
582 ; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
583 ; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
584 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
585 ; POW2-ONLY-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_1]], [[MUL_0]]
586 ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
587 ; POW2-ONLY-NEXT: ret i32 [[ADD_1]]
589 %gep.a.0 = getelementptr inbounds i32, ptr %a, i32 0
590 %l.a.0 = load i32, ptr %gep.a.0, align 4
591 %gep.a.1 = getelementptr inbounds i32, ptr %a, i32 1
592 %l.a.1 = load i32, ptr %gep.a.1, align 4
593 %gep.a.2 = getelementptr inbounds i32, ptr %a, i32 2
594 %l.a.2 = load i32, ptr %gep.a.2, align 4
596 %gep.b.0 = getelementptr inbounds i32, ptr %b, i32 0
597 %l.b.0 = load i32, ptr %gep.b.0, align 4
598 %gep.b.1 = getelementptr inbounds i32, ptr %b, i32 1
599 %l.b.1 = load i32, ptr %gep.b.1, align 4
600 %gep.b.2 = getelementptr inbounds i32, ptr %b, i32 2
601 %l.b.2 = load i32, ptr %gep.b.2, align 4
603 %mul.0 = mul nsw i32 %l.a.0, %l.b.0
604 %mul.1 = mul nsw i32 %l.a.1, %l.b.1
605 %mul.2 = mul nsw i32 %l.a.2, %l.b.2
607 %add.0 = add i32 %mul.1, %mul.0
608 %add.1 = add i32 %add.0, %mul.2
; 3-element fast-math float dot product. NON-POW2 uses <3 x float> fmul plus
; llvm.vector.reduce.fadd.v3f32; POW2-ONLY vectorizes only lanes 0-1 as
; <2 x float> and extracts them for a scalar fadd chain with the scalar
; lane-2 fmul.
612 define float @dot_product_fp32(ptr %a, ptr %b) {
613 ; NON-POW2-LABEL: @dot_product_fp32(
614 ; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
615 ; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
616 ; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4
617 ; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4
618 ; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]]
619 ; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]])
620 ; NON-POW2-NEXT: ret float [[TMP4]]
622 ; POW2-ONLY-LABEL: @dot_product_fp32(
623 ; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
624 ; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
625 ; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
626 ; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
627 ; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
628 ; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
629 ; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4
630 ; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
631 ; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
632 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
633 ; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
634 ; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
635 ; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
636 ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
637 ; POW2-ONLY-NEXT: ret float [[ADD_1]]
639 %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0
640 %l.a.0 = load float, ptr %gep.a.0, align 4
641 %gep.a.1 = getelementptr inbounds float, ptr %a, i32 1
642 %l.a.1 = load float, ptr %gep.a.1, align 4
643 %gep.a.2 = getelementptr inbounds float, ptr %a, i32 2
644 %l.a.2 = load float, ptr %gep.a.2, align 4
646 %gep.b.0 = getelementptr inbounds float, ptr %b, i32 0
647 %l.b.0 = load float, ptr %gep.b.0, align 4
648 %gep.b.1 = getelementptr inbounds float, ptr %b, i32 1
649 %l.b.1 = load float, ptr %gep.b.1, align 4
650 %gep.b.2 = getelementptr inbounds float, ptr %b, i32 2
651 %l.b.2 = load float, ptr %gep.b.2, align 4
653 %mul.0 = fmul fast float %l.a.0, %l.b.0
654 %mul.1 = fmul fast float %l.a.1, %l.b.1
655 %mul.2 = fmul fast float %l.a.2, %l.b.2
657 %add.0 = fadd fast float %mul.0, %mul.1
658 %add.1 = fadd fast float %add.0, %mul.2
662 ; Same as above, except the reduction order has been perturbed. This
663 ; is checking for our ability to reorder.
664 define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
665 ; NON-POW2-LABEL: @dot_product_fp32_reorder(
666 ; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
667 ; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
668 ; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4
669 ; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4
670 ; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]]
671 ; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]])
672 ; NON-POW2-NEXT: ret float [[TMP4]]
674 ; POW2-ONLY-LABEL: @dot_product_fp32_reorder(
675 ; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
676 ; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
677 ; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
678 ; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
679 ; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
680 ; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
681 ; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_A_0]], align 4
682 ; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
683 ; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
684 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
685 ; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
686 ; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
687 ; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP5]], [[TMP4]]
688 ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
689 ; POW2-ONLY-NEXT: ret float [[ADD_1]]
691 %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0
692 %l.a.0 = load float, ptr %gep.a.0, align 4
693 %gep.a.1 = getelementptr inbounds float, ptr %a, i32 1
694 %l.a.1 = load float, ptr %gep.a.1, align 4
695 %gep.a.2 = getelementptr inbounds float, ptr %a, i32 2
696 %l.a.2 = load float, ptr %gep.a.2, align 4
698 %gep.b.0 = getelementptr inbounds float, ptr %b, i32 0
699 %l.b.0 = load float, ptr %gep.b.0, align 4
700 %gep.b.1 = getelementptr inbounds float, ptr %b, i32 1
701 %l.b.1 = load float, ptr %gep.b.1, align 4
702 %gep.b.2 = getelementptr inbounds float, ptr %b, i32 2
703 %l.b.2 = load float, ptr %gep.b.2, align 4
705 %mul.0 = fmul fast float %l.a.0, %l.b.0
706 %mul.1 = fmul fast float %l.a.1, %l.b.1
707 %mul.2 = fmul fast float %l.a.2, %l.b.2
709 %add.0 = fadd fast float %mul.1, %mul.0
710 %add.1 = fadd fast float %add.0, %mul.2
715 define double @dot_product_fp64(ptr %a, ptr %b) {
716 ; NON-POW2-LABEL: @dot_product_fp64(
717 ; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
718 ; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
719 ; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x double>, ptr [[GEP_A_0]], align 4
720 ; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x double>, ptr [[GEP_B_0]], align 4
721 ; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x double> [[TMP1]], [[TMP2]]
722 ; NON-POW2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]])
723 ; NON-POW2-NEXT: ret double [[TMP4]]
725 ; POW2-ONLY-LABEL: @dot_product_fp64(
726 ; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
727 ; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
728 ; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
729 ; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
730 ; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
731 ; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
732 ; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[GEP_A_0]], align 4
733 ; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[GEP_B_0]], align 4
734 ; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]]
735 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
736 ; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
737 ; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
738 ; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
739 ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
740 ; POW2-ONLY-NEXT: ret double [[ADD_1]]
742 %gep.a.0 = getelementptr inbounds double, ptr %a, i32 0
743 %l.a.0 = load double, ptr %gep.a.0, align 4
744 %gep.a.1 = getelementptr inbounds double, ptr %a, i32 1
745 %l.a.1 = load double, ptr %gep.a.1, align 4
746 %gep.a.2 = getelementptr inbounds double, ptr %a, i32 2
747 %l.a.2 = load double, ptr %gep.a.2, align 4
749 %gep.b.0 = getelementptr inbounds double, ptr %b, i32 0
750 %l.b.0 = load double, ptr %gep.b.0, align 4
751 %gep.b.1 = getelementptr inbounds double, ptr %b, i32 1
752 %l.b.1 = load double, ptr %gep.b.1, align 4
753 %gep.b.2 = getelementptr inbounds double, ptr %b, i32 2
754 %l.b.2 = load double, ptr %gep.b.2, align 4
756 %mul.0 = fmul fast double %l.a.0, %l.b.0
757 %mul.1 = fmul fast double %l.a.1, %l.b.1
758 %mul.2 = fmul fast double %l.a.2, %l.b.2
760 %add.0 = fadd fast double %mul.0, %mul.1
761 %add.1 = fadd fast double %add.0, %mul.2
765 ;; Covers a case where SLP would previously crash due to a
766 ;; missing bailout in TryToFindDuplicates for the case
767 ;; where a VL=3 list was vectorized directly (without
768 ;; a root instruction such as a store or reduce).
769 define double @no_root_reshuffle(ptr %ptr) {
770 ; CHECK-LABEL: @no_root_reshuffle(
772 ; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[PTR:%.*]], align 8
773 ; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[TMP0]], [[TMP0]]
774 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8
775 ; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX2]], align 8
776 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 16
777 ; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
778 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast double [[TMP2]], [[TMP2]]
779 ; CHECK-NEXT: [[MUL6:%.*]] = fmul fast double [[TMP3]], [[TMP1]]
780 ; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[MUL6]], [[MUL]]
781 ; CHECK-NEXT: ret double [[ADD]]
784 %0 = load double, ptr %ptr, align 8
785 %mul = fmul fast double %0, %0
786 %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 8
787 %1 = load double, ptr %arrayidx2, align 8
788 %arrayidx3 = getelementptr inbounds i8, ptr %ptr, i64 16
789 %2 = load double, ptr %arrayidx3, align 8
790 %3 = fmul fast double %2, %2
791 %mul6 = fmul fast double %3, %1
792 %add = fadd fast double %mul6, %mul
796 define float @reduce_fadd_after_fmul_of_buildvec(float %a, float %b, float %c) {
797 ; NON-POW2-LABEL: @reduce_fadd_after_fmul_of_buildvec(
798 ; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <3 x float> poison, float [[A:%.*]], i32 0
799 ; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[B:%.*]], i32 1
800 ; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[C:%.*]], i32 2
801 ; NON-POW2-NEXT: [[TMP4:%.*]] = fmul fast <3 x float> [[TMP3]], splat (float 1.000000e+01)
802 ; NON-POW2-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP4]])
803 ; NON-POW2-NEXT: ret float [[TMP5]]
805 ; POW2-ONLY-LABEL: @reduce_fadd_after_fmul_of_buildvec(
806 ; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
807 ; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
808 ; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
809 ; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
810 ; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
811 ; POW2-ONLY-NEXT: ret float [[ADD_1]]
813 %mul.0 = fmul fast float %a, 10.0
814 %mul.1 = fmul fast float %b, 10.0
815 %mul.2 = fmul fast float %c, 10.0
817 %add.0 = fadd fast float %mul.0, %mul.1
818 %add.1 = fadd fast float %add.0, %mul.2
823 declare float @llvm.fmuladd.f32(float, float, float)
825 declare double @llvm.fmuladd.f64(double, double, double)