1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -passes=slp-vectorizer -S %s | FileCheck %s
4 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
5 target triple = "arm64-apple-darwin"
7 declare void @use(double)
9 ; The extracts %v1.lane.0 and %v1.lane.1 should be considered free during SLP,
10 ; because they will be directly in a vector register on AArch64.
11 define void @noop_extracts_first_2_lanes(ptr %ptr.1, ptr %ptr.2) {
12 ; CHECK-LABEL: @noop_extracts_first_2_lanes(
14 ; CHECK-NEXT: [[V_1:%.*]] = load <2 x double>, ptr [[PTR_1:%.*]], align 8
15 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
16 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
17 ; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[V_1]], [[TMP0]]
18 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[V_1]], i32 0
19 ; CHECK-NEXT: call void @use(double [[TMP2]])
20 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 1
21 ; CHECK-NEXT: call void @use(double [[TMP3]])
22 ; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[PTR_1]], align 8
23 ; CHECK-NEXT: ret void
; Scalar input: extract lanes 0/1 of %v.1 and lanes 2/3 of %v.2, multiply
; pairwise, reinsert into a <2 x double>, and store. The CHECK lines above
; show SLP forming a single <2 x double> fmul from these scalars.
26 %v.1 = load <2 x double>, ptr %ptr.1, align 8
27 %v1.lane.0 = extractelement <2 x double> %v.1, i32 0
28 %v1.lane.1 = extractelement <2 x double> %v.1, i32 1
30 %v.2 = load <4 x double>, ptr %ptr.2, align 16
31 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
32 %v2.lane.3 = extractelement <4 x double> %v.2, i32 3
34 %a.lane.0 = fmul double %v1.lane.0, %v2.lane.2
35 %a.lane.1 = fmul double %v1.lane.1, %v2.lane.3
37 %a.ins.0 = insertelement <2 x double> undef, double %a.lane.0, i32 0
38 %a.ins.1 = insertelement <2 x double> %a.ins.0, double %a.lane.1, i32 1
; Extra scalar uses keep the lane extracts live after vectorization.
40 call void @use(double %v1.lane.0)
41 call void @use(double %v1.lane.1)
43 store <2 x double> %a.ins.1, ptr %ptr.1, align 8
47 ; Extracts of consecutive indices, but different vector operand.
48 define void @extracts_first_2_lanes_different_vectors(ptr %ptr.1, ptr %ptr.2, ptr %ptr.3) {
49 ; CHECK-LABEL: @extracts_first_2_lanes_different_vectors(
51 ; CHECK-NEXT: [[V_1:%.*]] = load <2 x double>, ptr [[PTR_1:%.*]], align 8
52 ; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <2 x double> [[V_1]], i32 0
53 ; CHECK-NEXT: [[V_3:%.*]] = load <2 x double>, ptr [[PTR_3:%.*]], align 8
54 ; CHECK-NEXT: [[V3_LANE_1:%.*]] = extractelement <2 x double> [[V_3]], i32 1
55 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
56 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x double> [[V_1]], <2 x double> [[V_3]], <2 x i32> <i32 0, i32 3>
57 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> <i32 2, i32 2>
58 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
59 ; CHECK-NEXT: call void @use(double [[V1_LANE_0]])
60 ; CHECK-NEXT: call void @use(double [[V3_LANE_1]])
61 ; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[PTR_1]], align 8
62 ; CHECK-NEXT: ret void
; The two multiplied lanes come from two different source vectors
; (%v.1 lane 0 and %v.3 lane 1), so SLP needs a two-operand shuffle
; (CHECK line 56 above) to gather them before the vector fmul.
65 %v.1 = load <2 x double>, ptr %ptr.1, align 8
66 %v1.lane.0 = extractelement <2 x double> %v.1, i32 0
67 %v.3 = load <2 x double>, ptr %ptr.3, align 8
68 %v3.lane.1 = extractelement <2 x double> %v.3, i32 1
70 %v.2 = load <4 x double>, ptr %ptr.2, align 16
71 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
73 %a.lane.0 = fmul double %v1.lane.0, %v2.lane.2
74 %a.lane.1 = fmul double %v3.lane.1, %v2.lane.2
76 %a.ins.0 = insertelement <2 x double> undef, double %a.lane.0, i32 0
77 %a.ins.1 = insertelement <2 x double> %a.ins.0, double %a.lane.1, i32 1
; Extra scalar uses keep the lane extracts live after vectorization.
79 call void @use(double %v1.lane.0)
80 call void @use(double %v3.lane.1)
82 store <2 x double> %a.ins.1, ptr %ptr.1, align 8
86 ; The extracts %v1.lane.2 and %v1.lane.3 should be considered free during SLP,
87 ; because they will be directly in a vector register on AArch64.
88 define void @noop_extract_second_2_lanes(ptr %ptr.1, ptr %ptr.2) {
89 ; CHECK-LABEL: @noop_extract_second_2_lanes(
91 ; CHECK-NEXT: [[V_1:%.*]] = load <4 x double>, ptr [[PTR_1:%.*]], align 8
92 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <4 x double> [[V_1]], i32 2
93 ; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <4 x double> [[V_1]], i32 3
94 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
95 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_1]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
96 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> <i32 2, i32 2>
97 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
98 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
99 ; CHECK-NEXT: call void @use(double [[V1_LANE_2]])
100 ; CHECK-NEXT: call void @use(double [[V1_LANE_3]])
101 ; CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[PTR_1]], align 8
102 ; CHECK-NEXT: ret void
; Consecutive lanes 2/3 of a <4 x double> form the high half of the source
; register, so the extracts should again be treated as free; the products
; are inserted at lanes 0/1 of the stored <4 x double>.
105 %v.1 = load <4 x double>, ptr %ptr.1, align 8
106 %v1.lane.2 = extractelement <4 x double> %v.1, i32 2
107 %v1.lane.3 = extractelement <4 x double> %v.1, i32 3
109 %v.2 = load <4 x double>, ptr %ptr.2, align 16
110 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
112 %a.lane.0 = fmul double %v1.lane.2, %v2.lane.2
113 %a.lane.1 = fmul double %v1.lane.3, %v2.lane.2
115 %a.ins.0 = insertelement <4 x double> undef, double %a.lane.0, i32 0
116 %a.ins.1 = insertelement <4 x double> %a.ins.0, double %a.lane.1, i32 1
; Extra scalar uses keep the lane extracts live after vectorization.
118 call void @use(double %v1.lane.2)
119 call void @use(double %v1.lane.3)
120 store <4 x double> %a.ins.1, ptr %ptr.1, align 8
124 ; %v1.lane.0 and %v1.lane.1 are used in reverse-order, so they won't be
125 ; directly in a vector register on AArch64.
126 define void @extract_reverse_order(ptr %ptr.1, ptr %ptr.2) {
127 ; CHECK-LABEL: @extract_reverse_order(
129 ; CHECK-NEXT: [[V_1:%.*]] = load <2 x double>, ptr [[PTR_1:%.*]], align 8
130 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
131 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> <i32 2, i32 2>
132 ; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[V_1]], [[TMP0]]
133 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
134 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 0
135 ; CHECK-NEXT: call void @use(double [[TMP3]])
136 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[V_1]], i32 1
137 ; CHECK-NEXT: call void @use(double [[TMP4]])
138 ; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[PTR_1]], align 8
139 ; CHECK-NEXT: ret void
; Lane 1 feeds result lane 0 and vice versa, so SLP multiplies in source
; order and then reverses the result with a shuffle (CHECK line 133 above).
142 %v.1 = load <2 x double>, ptr %ptr.1, align 8
143 %v1.lane.0 = extractelement <2 x double> %v.1, i32 0
144 %v1.lane.1 = extractelement <2 x double> %v.1, i32 1
146 %v.2 = load <4 x double>, ptr %ptr.2, align 16
147 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
; Note the swapped lane order relative to the insert positions below.
149 %a.lane.0 = fmul double %v1.lane.1, %v2.lane.2
150 %a.lane.1 = fmul double %v1.lane.0, %v2.lane.2
152 %a.ins.0 = insertelement <2 x double> undef, double %a.lane.0, i32 0
153 %a.ins.1 = insertelement <2 x double> %a.ins.0, double %a.lane.1, i32 1
; Extra scalar uses keep the lane extracts live after vectorization.
155 call void @use(double %v1.lane.0)
156 call void @use(double %v1.lane.1)
158 store <2 x double> %a.ins.1, ptr %ptr.1, align 8
162 ; %v1.lane.1 and %v1.lane.2 are extracted from different vector registers on AArch64.
163 define void @extract_lanes_1_and_2(ptr %ptr.1, ptr %ptr.2) {
164 ; CHECK-LABEL: @extract_lanes_1_and_2(
166 ; CHECK-NEXT: [[V_1:%.*]] = load <4 x double>, ptr [[PTR_1:%.*]], align 8
167 ; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <4 x double> [[V_1]], i32 1
168 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <4 x double> [[V_1]], i32 2
169 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
170 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_1]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
171 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> <i32 2, i32 2>
172 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
173 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
174 ; CHECK-NEXT: call void @use(double [[V1_LANE_1]])
175 ; CHECK-NEXT: call void @use(double [[V1_LANE_2]])
176 ; CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[PTR_1]], align 8
177 ; CHECK-NEXT: ret void
; Lanes 1 and 2 straddle the two 128-bit halves of a <4 x double> on
; AArch64, so a shuffle is needed to gather them (CHECK line 170 above).
180 %v.1 = load <4 x double>, ptr %ptr.1, align 8
181 %v1.lane.1 = extractelement <4 x double> %v.1, i32 1
182 %v1.lane.2 = extractelement <4 x double> %v.1, i32 2
184 %v.2 = load <4 x double>, ptr %ptr.2, align 16
185 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
187 %a.lane.0 = fmul double %v1.lane.1, %v2.lane.2
188 %a.lane.1 = fmul double %v1.lane.2, %v2.lane.2
190 %a.ins.0 = insertelement <4 x double> undef, double %a.lane.0, i32 0
191 %a.ins.1 = insertelement <4 x double> %a.ins.0, double %a.lane.1, i32 1
; Extra scalar uses keep the lane extracts live after vectorization.
193 call void @use(double %v1.lane.1)
194 call void @use(double %v1.lane.2)
196 store <4 x double> %a.ins.1, ptr %ptr.1, align 8
200 ; More complex case where the extracted lanes are directly from a vector
201 ; register on AArch64 and should be considered free, because we can
202 ; directly use the source vector register.
203 define void @noop_extracts_existing_vector_4_lanes(ptr %ptr.1, ptr %ptr.2) {
204 ; CHECK-LABEL: @noop_extracts_existing_vector_4_lanes(
206 ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, ptr [[PTR_1:%.*]], align 8
207 ; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0
208 ; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1
209 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2
210 ; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <9 x double> [[V_1]], i32 3
211 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
212 ; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1
213 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
214 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <4 x i32> <i32 2, i32 0, i32 2, i32 2>
215 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP0]], [[TMP1]]
216 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <9 x i32> <i32 2, i32 3, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 7>
217 ; CHECK-NEXT: call void @use(double [[V1_LANE_0]])
218 ; CHECK-NEXT: call void @use(double [[V1_LANE_1]])
219 ; CHECK-NEXT: call void @use(double [[V1_LANE_2]])
220 ; CHECK-NEXT: call void @use(double [[V1_LANE_3]])
221 ; CHECK-NEXT: store <9 x double> [[TMP3]], ptr [[PTR_1]], align 8
222 ; CHECK-NEXT: ret void
; 4-wide case: lanes 0-3 of the <9 x double> are used (rotated: 2,3,0,1),
; so SLP can read them straight from the source register and fix up the
; result order with the final shuffle (CHECK line 216 above).
225 %v.1 = load <9 x double>, ptr %ptr.1, align 8
226 %v1.lane.0 = extractelement <9 x double> %v.1, i32 0
227 %v1.lane.1 = extractelement <9 x double> %v.1, i32 1
228 %v1.lane.2 = extractelement <9 x double> %v.1, i32 2
229 %v1.lane.3 = extractelement <9 x double> %v.1, i32 3
230 %v.2 = load <4 x double>, ptr %ptr.2, align 16
231 %v2.lane.0 = extractelement <4 x double> %v.2, i32 0
232 %v2.lane.1 = extractelement <4 x double> %v.2, i32 1
233 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
234 %a.lane.0 = fmul double %v1.lane.2, %v2.lane.2
235 %a.lane.1 = fmul double %v1.lane.3, %v2.lane.2
236 %a.lane.2 = fmul double %v1.lane.0, %v2.lane.2
237 %a.lane.3 = fmul double %v1.lane.1, %v2.lane.0
238 %a.ins.0 = insertelement <9 x double> undef, double %a.lane.0, i32 0
239 %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1
240 %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2
241 %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3
; Extra scalar uses keep the lane extracts live after vectorization.
242 call void @use(double %v1.lane.0)
243 call void @use(double %v1.lane.1)
244 call void @use(double %v1.lane.2)
245 call void @use(double %v1.lane.3)
246 store <9 x double> %a.ins.3, ptr %ptr.1, align 8
250 ; Extracted lanes are not used in the right order, so we cannot reuse the
251 ; source vector registers directly.
252 define void @extracts_jumbled_4_lanes(ptr %ptr.1, ptr %ptr.2) {
253 ; CHECK-LABEL: @extracts_jumbled_4_lanes(
255 ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, ptr [[PTR_1:%.*]], align 8
256 ; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0
257 ; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1
258 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2
259 ; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <9 x double> [[V_1]], i32 3
260 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
261 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
262 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <4 x i32> <i32 2, i32 2, i32 1, i32 0>
263 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP0]], [[TMP1]]
264 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <9 x i32> <i32 0, i32 2, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 7>
265 ; CHECK-NEXT: call void @use(double [[V1_LANE_0]])
266 ; CHECK-NEXT: call void @use(double [[V1_LANE_1]])
267 ; CHECK-NEXT: call void @use(double [[V1_LANE_2]])
268 ; CHECK-NEXT: call void @use(double [[V1_LANE_3]])
269 ; CHECK-NEXT: store <9 x double> [[TMP3]], ptr [[PTR_1]], align 8
270 ; CHECK-NEXT: ret void
; Same shape as the previous test, but the %v.1 lanes feed the fmuls in a
; jumbled order (0,2,1,3), so the extracts cannot map 1:1 onto a register.
273 %v.1 = load <9 x double>, ptr %ptr.1, align 8
274 %v1.lane.0 = extractelement <9 x double> %v.1, i32 0
275 %v1.lane.1 = extractelement <9 x double> %v.1, i32 1
276 %v1.lane.2 = extractelement <9 x double> %v.1, i32 2
277 %v1.lane.3 = extractelement <9 x double> %v.1, i32 3
278 %v.2 = load <4 x double>, ptr %ptr.2, align 16
279 %v2.lane.0 = extractelement <4 x double> %v.2, i32 0
280 %v2.lane.1 = extractelement <4 x double> %v.2, i32 1
281 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
282 %a.lane.0 = fmul double %v1.lane.0, %v2.lane.2
283 %a.lane.1 = fmul double %v1.lane.2, %v2.lane.1
284 %a.lane.2 = fmul double %v1.lane.1, %v2.lane.2
285 %a.lane.3 = fmul double %v1.lane.3, %v2.lane.0
286 %a.ins.0 = insertelement <9 x double> undef, double %a.lane.0, i32 0
287 %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1
288 %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2
289 %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3
; Extra scalar uses keep the lane extracts live after vectorization.
290 call void @use(double %v1.lane.0)
291 call void @use(double %v1.lane.1)
292 call void @use(double %v1.lane.2)
293 call void @use(double %v1.lane.3)
294 store <9 x double> %a.ins.3, ptr %ptr.1, align 8
299 ; Even more complex case where the extracted lanes are directly from a vector
300 ; register on AArch64 and should be considered free, because we can
301 ; directly use the source vector register.
302 define void @noop_extracts_9_lanes(ptr %ptr.1, ptr %ptr.2) {
303 ; CHECK-LABEL: @noop_extracts_9_lanes(
305 ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, ptr [[PTR_1:%.*]], align 8
306 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2
307 ; CHECK-NEXT: [[V1_LANE_5:%.*]] = extractelement <9 x double> [[V_1]], i32 5
308 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
309 ; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0
310 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 0, i32 1>
311 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> <i32 0, i32 2, i32 1, i32 0, i32 2, i32 0, i32 2, i32 1>
312 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <8 x double> [[TMP0]], [[TMP1]]
313 ; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_0]]
314 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
315 ; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP3]], double [[A_LANE_8]], i32 8
316 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> <i32 6, i32 7, i32 8, i32 0, i32 1, i32 2, i32 3, i32 4>
317 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> <i32 2, i32 1, i32 0, i32 2, i32 1, i32 0, i32 2, i32 1>
318 ; CHECK-NEXT: [[TMP6:%.*]] = fmul <8 x double> [[TMP4]], [[TMP5]]
319 ; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_5]], [[V2_LANE_0]]
320 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
321 ; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP7]], double [[B_LANE_8]], i32 8
322 ; CHECK-NEXT: [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]]
323 ; CHECK-NEXT: store <9 x double> [[RES]], ptr [[PTR_1]], align 8
324 ; CHECK-NEXT: ret void
; Two 9-lane scalar fmul chains: the first 8 lanes of each chain use %v.1
; lanes in a rotated-but-consecutive order, so SLP vectorizes them 8-wide
; and leaves the 9th multiply scalar (inserted at lane 8).
327 %v.1 = load <9 x double>, ptr %ptr.1, align 8
328 %v1.lane.0 = extractelement <9 x double> %v.1, i32 0
329 %v1.lane.1 = extractelement <9 x double> %v.1, i32 1
330 %v1.lane.2 = extractelement <9 x double> %v.1, i32 2
331 %v1.lane.3 = extractelement <9 x double> %v.1, i32 3
332 %v1.lane.4 = extractelement <9 x double> %v.1, i32 4
333 %v1.lane.5 = extractelement <9 x double> %v.1, i32 5
334 %v1.lane.6 = extractelement <9 x double> %v.1, i32 6
335 %v1.lane.7 = extractelement <9 x double> %v.1, i32 7
336 %v1.lane.8 = extractelement <9 x double> %v.1, i32 8
338 %v.2 = load <4 x double>, ptr %ptr.2, align 16
339 %v2.lane.0 = extractelement <4 x double> %v.2, i32 0
340 %v2.lane.1 = extractelement <4 x double> %v.2, i32 1
341 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
; First chain: %v.1 lanes in rotated order 3,4,...,8,0,1,2.
343 %a.lane.0 = fmul double %v1.lane.3, %v2.lane.0
344 %a.lane.1 = fmul double %v1.lane.4, %v2.lane.2
345 %a.lane.2 = fmul double %v1.lane.5, %v2.lane.1
346 %a.lane.3 = fmul double %v1.lane.6, %v2.lane.0
347 %a.lane.4 = fmul double %v1.lane.7, %v2.lane.2
348 %a.lane.5 = fmul double %v1.lane.8, %v2.lane.0
349 %a.lane.6 = fmul double %v1.lane.0, %v2.lane.2
350 %a.lane.7 = fmul double %v1.lane.1, %v2.lane.1
351 %a.lane.8 = fmul double %v1.lane.2, %v2.lane.0
353 %a.ins.0 = insertelement <9 x double> undef, double %a.lane.0, i32 0
354 %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1
355 %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2
356 %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3
357 %a.ins.4 = insertelement <9 x double> %a.ins.3, double %a.lane.4, i32 4
358 %a.ins.5 = insertelement <9 x double> %a.ins.4, double %a.lane.5, i32 5
359 %a.ins.6 = insertelement <9 x double> %a.ins.5, double %a.lane.6, i32 6
360 %a.ins.7 = insertelement <9 x double> %a.ins.6, double %a.lane.7, i32 7
361 %a.ins.8 = insertelement <9 x double> %a.ins.7, double %a.lane.8, i32 8
; Second chain: %v.1 lanes in rotated order 6,7,8,0,...,5.
363 %b.lane.0 = fmul double %v1.lane.6, %v2.lane.2
364 %b.lane.1 = fmul double %v1.lane.7, %v2.lane.1
365 %b.lane.2 = fmul double %v1.lane.8, %v2.lane.0
366 %b.lane.3 = fmul double %v1.lane.0, %v2.lane.2
367 %b.lane.4 = fmul double %v1.lane.1, %v2.lane.1
368 %b.lane.5 = fmul double %v1.lane.2, %v2.lane.0
369 %b.lane.6 = fmul double %v1.lane.3, %v2.lane.2
370 %b.lane.7 = fmul double %v1.lane.4, %v2.lane.1
371 %b.lane.8 = fmul double %v1.lane.5, %v2.lane.0
373 %b.ins.0 = insertelement <9 x double> undef, double %b.lane.0, i32 0
374 %b.ins.1 = insertelement <9 x double> %b.ins.0, double %b.lane.1, i32 1
375 %b.ins.2 = insertelement <9 x double> %b.ins.1, double %b.lane.2, i32 2
376 %b.ins.3 = insertelement <9 x double> %b.ins.2, double %b.lane.3, i32 3
377 %b.ins.4 = insertelement <9 x double> %b.ins.3, double %b.lane.4, i32 4
378 %b.ins.5 = insertelement <9 x double> %b.ins.4, double %b.lane.5, i32 5
379 %b.ins.6 = insertelement <9 x double> %b.ins.5, double %b.lane.6, i32 6
380 %b.ins.7 = insertelement <9 x double> %b.ins.6, double %b.lane.7, i32 7
381 %b.ins.8 = insertelement <9 x double> %b.ins.7, double %b.lane.8, i32 8
383 %res = fsub <9 x double> %a.ins.8, %b.ins.8
384 store <9 x double> %res, ptr %ptr.1, align 8
388 ; Extracted lanes used in the first fmul chain are not used in the right order,
389 ; so we cannot reuse the source vector registers directly.
390 define void @first_mul_chain_jumbled(ptr %ptr.1, ptr %ptr.2) {
391 ; CHECK-LABEL: @first_mul_chain_jumbled(
393 ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, ptr [[PTR_1:%.*]], align 8
394 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2
395 ; CHECK-NEXT: [[V1_LANE_5:%.*]] = extractelement <9 x double> [[V_1]], i32 5
396 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
397 ; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0
398 ; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1
399 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> <i32 4, i32 3, i32 6, i32 5, i32 8, i32 7, i32 1, i32 0>
400 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 2, i32 1, i32 0, i32 2>
401 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <8 x double> [[TMP0]], [[TMP1]]
402 ; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_1]]
403 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
404 ; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP3]], double [[A_LANE_8]], i32 8
405 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> <i32 6, i32 7, i32 8, i32 0, i32 1, i32 2, i32 3, i32 4>
406 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <8 x double> [[TMP4]], [[TMP1]]
407 ; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_5]], [[V2_LANE_0]]
408 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
409 ; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP6]], double [[B_LANE_8]], i32 8
410 ; CHECK-NEXT: [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]]
411 ; CHECK-NEXT: store <9 x double> [[RES]], ptr [[PTR_1]], align 8
412 ; CHECK-NEXT: ret void
; Like @noop_extracts_9_lanes, but the first chain's %v.1 lane order is
; jumbled pairwise (4,3,6,5,8,7,...,1,0), forcing a non-identity shuffle
; for its vector operand (CHECK line 399 above); the second chain is still
; in rotated-consecutive order.
415 %v.1 = load <9 x double>, ptr %ptr.1, align 8
416 %v1.lane.0 = extractelement <9 x double> %v.1, i32 0
417 %v1.lane.1 = extractelement <9 x double> %v.1, i32 1
418 %v1.lane.2 = extractelement <9 x double> %v.1, i32 2
419 %v1.lane.3 = extractelement <9 x double> %v.1, i32 3
420 %v1.lane.4 = extractelement <9 x double> %v.1, i32 4
421 %v1.lane.5 = extractelement <9 x double> %v.1, i32 5
422 %v1.lane.6 = extractelement <9 x double> %v.1, i32 6
423 %v1.lane.7 = extractelement <9 x double> %v.1, i32 7
424 %v1.lane.8 = extractelement <9 x double> %v.1, i32 8
426 %v.2 = load <4 x double>, ptr %ptr.2, align 16
427 %v2.lane.0 = extractelement <4 x double> %v.2, i32 0
428 %v2.lane.1 = extractelement <4 x double> %v.2, i32 1
429 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
; First chain: jumbled %v.1 lane order (4,3,6,5,8,7,1,0,2).
431 %a.lane.0 = fmul double %v1.lane.4, %v2.lane.1
432 %a.lane.1 = fmul double %v1.lane.3, %v2.lane.0
433 %a.lane.2 = fmul double %v1.lane.6, %v2.lane.2
434 %a.lane.3 = fmul double %v1.lane.5, %v2.lane.0
435 %a.lane.4 = fmul double %v1.lane.8, %v2.lane.2
436 %a.lane.5 = fmul double %v1.lane.7, %v2.lane.1
437 %a.lane.6 = fmul double %v1.lane.1, %v2.lane.0
438 %a.lane.7 = fmul double %v1.lane.0, %v2.lane.2
439 %a.lane.8 = fmul double %v1.lane.2, %v2.lane.1
441 %a.ins.0 = insertelement <9 x double> undef, double %a.lane.0, i32 0
442 %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1
443 %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2
444 %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3
445 %a.ins.4 = insertelement <9 x double> %a.ins.3, double %a.lane.4, i32 4
446 %a.ins.5 = insertelement <9 x double> %a.ins.4, double %a.lane.5, i32 5
447 %a.ins.6 = insertelement <9 x double> %a.ins.5, double %a.lane.6, i32 6
448 %a.ins.7 = insertelement <9 x double> %a.ins.6, double %a.lane.7, i32 7
449 %a.ins.8 = insertelement <9 x double> %a.ins.7, double %a.lane.8, i32 8
; Second chain: rotated-consecutive %v.1 lane order (6,7,8,0,...,5).
451 %b.lane.0 = fmul double %v1.lane.6, %v2.lane.1
452 %b.lane.1 = fmul double %v1.lane.7, %v2.lane.0
453 %b.lane.2 = fmul double %v1.lane.8, %v2.lane.2
454 %b.lane.3 = fmul double %v1.lane.0, %v2.lane.0
455 %b.lane.4 = fmul double %v1.lane.1, %v2.lane.2
456 %b.lane.5 = fmul double %v1.lane.2, %v2.lane.1
457 %b.lane.6 = fmul double %v1.lane.3, %v2.lane.0
458 %b.lane.7 = fmul double %v1.lane.4, %v2.lane.2
459 %b.lane.8 = fmul double %v1.lane.5, %v2.lane.0
461 %b.ins.0 = insertelement <9 x double> undef, double %b.lane.0, i32 0
462 %b.ins.1 = insertelement <9 x double> %b.ins.0, double %b.lane.1, i32 1
463 %b.ins.2 = insertelement <9 x double> %b.ins.1, double %b.lane.2, i32 2
464 %b.ins.3 = insertelement <9 x double> %b.ins.2, double %b.lane.3, i32 3
465 %b.ins.4 = insertelement <9 x double> %b.ins.3, double %b.lane.4, i32 4
466 %b.ins.5 = insertelement <9 x double> %b.ins.4, double %b.lane.5, i32 5
467 %b.ins.6 = insertelement <9 x double> %b.ins.5, double %b.lane.6, i32 6
468 %b.ins.7 = insertelement <9 x double> %b.ins.6, double %b.lane.7, i32 7
469 %b.ins.8 = insertelement <9 x double> %b.ins.7, double %b.lane.8, i32 8
471 %res = fsub <9 x double> %a.ins.8, %b.ins.8
472 store <9 x double> %res, ptr %ptr.1, align 8
476 ; Extracted lanes used in both fmul chains are not used in the right order, so
477 ; we cannot reuse the source vector registers directly.
478 define void @first_and_second_mul_chain_jumbled(ptr %ptr.1, ptr %ptr.2) {
479 ; CHECK-LABEL: @first_and_second_mul_chain_jumbled(
481 ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, ptr [[PTR_1:%.*]], align 8
482 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2
483 ; CHECK-NEXT: [[V1_LANE_4:%.*]] = extractelement <9 x double> [[V_1]], i32 4
484 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
485 ; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0
486 ; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
487 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> <i32 4, i32 3, i32 5, i32 6, i32 8, i32 7, i32 1, i32 0>
488 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> <i32 0, i32 2, i32 1, i32 2, i32 1, i32 0, i32 2, i32 1>
489 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <8 x double> [[TMP0]], [[TMP1]]
490 ; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_0]]
491 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
492 ; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP3]], double [[A_LANE_8]], i32 8
493 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> <i32 7, i32 6, i32 8, i32 1, i32 0, i32 3, i32 2, i32 5>
494 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> <i32 2, i32 1, i32 0, i32 2, i32 0, i32 2, i32 1, i32 0>
495 ; CHECK-NEXT: [[TMP6:%.*]] = fmul <8 x double> [[TMP4]], [[TMP5]]
496 ; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_4]], [[V2_LANE_2]]
497 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
498 ; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP7]], double [[B_LANE_8]], i32 8
499 ; CHECK-NEXT: [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]]
500 ; CHECK-NEXT: store <9 x double> [[RES]], ptr [[PTR_1]], align 8
501 ; CHECK-NEXT: ret void
; Like @first_mul_chain_jumbled, but BOTH chains use jumbled %v.1 lane
; orders, so both 8-wide fmuls need non-identity shuffles for their
; operands (CHECK lines 487 and 493 above).
504 %v.1 = load <9 x double>, ptr %ptr.1, align 8
505 %v1.lane.0 = extractelement <9 x double> %v.1, i32 0
506 %v1.lane.1 = extractelement <9 x double> %v.1, i32 1
507 %v1.lane.2 = extractelement <9 x double> %v.1, i32 2
508 %v1.lane.3 = extractelement <9 x double> %v.1, i32 3
509 %v1.lane.4 = extractelement <9 x double> %v.1, i32 4
510 %v1.lane.5 = extractelement <9 x double> %v.1, i32 5
511 %v1.lane.6 = extractelement <9 x double> %v.1, i32 6
512 %v1.lane.7 = extractelement <9 x double> %v.1, i32 7
513 %v1.lane.8 = extractelement <9 x double> %v.1, i32 8
515 %v.2 = load <4 x double>, ptr %ptr.2, align 16
516 %v2.lane.0 = extractelement <4 x double> %v.2, i32 0
517 %v2.lane.1 = extractelement <4 x double> %v.2, i32 1
518 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
; First chain: jumbled %v.1 lane order (4,3,5,6,8,7,1,0,2).
520 %a.lane.0 = fmul double %v1.lane.4, %v2.lane.0
521 %a.lane.1 = fmul double %v1.lane.3, %v2.lane.2
522 %a.lane.2 = fmul double %v1.lane.5, %v2.lane.1
523 %a.lane.3 = fmul double %v1.lane.6, %v2.lane.2
524 %a.lane.4 = fmul double %v1.lane.8, %v2.lane.1
525 %a.lane.5 = fmul double %v1.lane.7, %v2.lane.0
526 %a.lane.6 = fmul double %v1.lane.1, %v2.lane.2
527 %a.lane.7 = fmul double %v1.lane.0, %v2.lane.1
528 %a.lane.8 = fmul double %v1.lane.2, %v2.lane.0
530 %a.ins.0 = insertelement <9 x double> undef, double %a.lane.0, i32 0
531 %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1
532 %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2
533 %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3
534 %a.ins.4 = insertelement <9 x double> %a.ins.3, double %a.lane.4, i32 4
535 %a.ins.5 = insertelement <9 x double> %a.ins.4, double %a.lane.5, i32 5
536 %a.ins.6 = insertelement <9 x double> %a.ins.5, double %a.lane.6, i32 6
537 %a.ins.7 = insertelement <9 x double> %a.ins.6, double %a.lane.7, i32 7
538 %a.ins.8 = insertelement <9 x double> %a.ins.7, double %a.lane.8, i32 8
; Second chain: jumbled %v.1 lane order (7,6,8,1,0,3,2,5,4).
540 %b.lane.0 = fmul double %v1.lane.7, %v2.lane.2
541 %b.lane.1 = fmul double %v1.lane.6, %v2.lane.1
542 %b.lane.2 = fmul double %v1.lane.8, %v2.lane.0
543 %b.lane.3 = fmul double %v1.lane.1, %v2.lane.2
544 %b.lane.4 = fmul double %v1.lane.0, %v2.lane.0
545 %b.lane.5 = fmul double %v1.lane.3, %v2.lane.2
546 %b.lane.6 = fmul double %v1.lane.2, %v2.lane.1
547 %b.lane.7 = fmul double %v1.lane.5, %v2.lane.0
548 %b.lane.8 = fmul double %v1.lane.4, %v2.lane.2
550 %b.ins.0 = insertelement <9 x double> undef, double %b.lane.0, i32 0
551 %b.ins.1 = insertelement <9 x double> %b.ins.0, double %b.lane.1, i32 1
552 %b.ins.2 = insertelement <9 x double> %b.ins.1, double %b.lane.2, i32 2
553 %b.ins.3 = insertelement <9 x double> %b.ins.2, double %b.lane.3, i32 3
554 %b.ins.4 = insertelement <9 x double> %b.ins.3, double %b.lane.4, i32 4
555 %b.ins.5 = insertelement <9 x double> %b.ins.4, double %b.lane.5, i32 5
556 %b.ins.6 = insertelement <9 x double> %b.ins.5, double %b.lane.6, i32 6
557 %b.ins.7 = insertelement <9 x double> %b.ins.6, double %b.lane.7, i32 7
558 %b.ins.8 = insertelement <9 x double> %b.ins.7, double %b.lane.8, i32 8
560 %res = fsub <9 x double> %a.ins.8, %b.ins.8
561 store <9 x double> %res, ptr %ptr.1, align 8