1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -passes=slp-vectorizer -S %s | FileCheck %s
4 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
5 target triple = "arm64-apple-darwin"
; @use is only declared here (no body in this file). Several tests below call
; it on extracted scalars, so those extractelement results have a scalar user
; besides the fmul that consumes them.
7 declare void @use(double)
9 ; The extracts %v1.lane.0 and %v1.lane.1 should be considered free during SLP,
10 ; because they will be directly in a vector register on AArch64.
; The two fmuls consume lanes 0 and 1 of %v.1 in lane order, and the expected
; output passes %v.1 itself to the vector fmul without shuffling it. The scalar
; extracts remain in the output because the calls to @use still consume them.
11 define void @noop_extracts_first_2_lanes(ptr %ptr.1, ptr %ptr.2) {
12 ; CHECK-LABEL: @noop_extracts_first_2_lanes(
14 ; CHECK-NEXT: [[V_1:%.*]] = load <2 x double>, ptr [[PTR_1:%.*]], align 8
15 ; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <2 x double> [[V_1]], i32 0
16 ; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <2 x double> [[V_1]], i32 1
17 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
18 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
19 ; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[V_1]], [[TMP0]]
20 ; CHECK-NEXT: call void @use(double [[V1_LANE_0]])
21 ; CHECK-NEXT: call void @use(double [[V1_LANE_1]])
22 ; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[PTR_1]], align 8
23 ; CHECK-NEXT: ret void
26 %v.1 = load <2 x double>, ptr %ptr.1, align 8
27 %v1.lane.0 = extractelement <2 x double> %v.1, i32 0
28 %v1.lane.1 = extractelement <2 x double> %v.1, i32 1
30 %v.2 = load <4 x double>, ptr %ptr.2, align 16
31 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
32 %v2.lane.3 = extractelement <4 x double> %v.2, i32 3
; Scalar products: %v.1 lanes 0 and 1 times %v.2 lanes 2 and 3, respectively.
34 %a.lane.0 = fmul double %v1.lane.0, %v2.lane.2
35 %a.lane.1 = fmul double %v1.lane.1, %v2.lane.3
; The products are rebuilt into a <2 x double> at indices 0 and 1.
37 %a.ins.0 = insertelement <2 x double> zeroinitializer, double %a.lane.0, i32 0
38 %a.ins.1 = insertelement <2 x double> %a.ins.0, double %a.lane.1, i32 1
40 call void @use(double %v1.lane.0)
41 call void @use(double %v1.lane.1)
43 store <2 x double> %a.ins.1, ptr %ptr.1, align 8
47 ; Extracts of consecutive indices, but different vector operand.
; Lane 0 is taken from %v.1 and lane 1 from %v.3, so the expected output builds
; the first fmul operand with a two-source shufflevector (mask <0, 3>). Both
; products use the same %v.2 lane (2), which becomes a <2, 2> splat shuffle.
48 define void @extracts_first_2_lanes_different_vectors(ptr %ptr.1, ptr %ptr.2, ptr %ptr.3) {
49 ; CHECK-LABEL: @extracts_first_2_lanes_different_vectors(
51 ; CHECK-NEXT: [[V_1:%.*]] = load <2 x double>, ptr [[PTR_1:%.*]], align 8
52 ; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <2 x double> [[V_1]], i32 0
53 ; CHECK-NEXT: [[V_3:%.*]] = load <2 x double>, ptr [[PTR_3:%.*]], align 8
54 ; CHECK-NEXT: [[V3_LANE_1:%.*]] = extractelement <2 x double> [[V_3]], i32 1
55 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
56 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x double> [[V_1]], <2 x double> [[V_3]], <2 x i32> <i32 0, i32 3>
57 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> <i32 2, i32 2>
58 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
59 ; CHECK-NEXT: call void @use(double [[V1_LANE_0]])
60 ; CHECK-NEXT: call void @use(double [[V3_LANE_1]])
61 ; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[PTR_1]], align 8
62 ; CHECK-NEXT: ret void
65 %v.1 = load <2 x double>, ptr %ptr.1, align 8
66 %v1.lane.0 = extractelement <2 x double> %v.1, i32 0
67 %v.3 = load <2 x double>, ptr %ptr.3, align 8
68 %v3.lane.1 = extractelement <2 x double> %v.3, i32 1
70 %v.2 = load <4 x double>, ptr %ptr.2, align 16
71 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
; Scalar products: one operand from %v.1, the other from %v.3.
73 %a.lane.0 = fmul double %v1.lane.0, %v2.lane.2
74 %a.lane.1 = fmul double %v3.lane.1, %v2.lane.2
76 %a.ins.0 = insertelement <2 x double> zeroinitializer, double %a.lane.0, i32 0
77 %a.ins.1 = insertelement <2 x double> %a.ins.0, double %a.lane.1, i32 1
79 call void @use(double %v1.lane.0)
80 call void @use(double %v3.lane.1)
82 store <2 x double> %a.ins.1, ptr %ptr.1, align 8
86 ; The extracts %v1.lane.2 and %v1.lane.3 should be considered free during SLP,
87 ; because they will be directly in a vector register on AArch64.
; The expected output selects lanes 2 and 3 of %v.1 with a single-source
; shufflevector (mask <2, 3>), widens the 2-wide product to 4 lanes, and blends
; it into lanes 0 and 1 of zeroinitializer before the <4 x double> store.
88 define void @noop_extract_second_2_lanes(ptr %ptr.1, ptr %ptr.2) {
89 ; CHECK-LABEL: @noop_extract_second_2_lanes(
91 ; CHECK-NEXT: [[V_1:%.*]] = load <4 x double>, ptr [[PTR_1:%.*]], align 8
92 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <4 x double> [[V_1]], i32 2
93 ; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <4 x double> [[V_1]], i32 3
94 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
95 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_1]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
96 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> <i32 2, i32 2>
97 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
98 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
99 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP4]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
100 ; CHECK-NEXT: call void @use(double [[V1_LANE_2]])
101 ; CHECK-NEXT: call void @use(double [[V1_LANE_3]])
102 ; CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[PTR_1]], align 8
103 ; CHECK-NEXT: ret void
106 %v.1 = load <4 x double>, ptr %ptr.1, align 8
107 %v1.lane.2 = extractelement <4 x double> %v.1, i32 2
108 %v1.lane.3 = extractelement <4 x double> %v.1, i32 3
110 %v.2 = load <4 x double>, ptr %ptr.2, align 16
111 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
; Scalar products: %v.1 lanes 2 and 3, each times lane 2 of %v.2.
113 %a.lane.0 = fmul double %v1.lane.2, %v2.lane.2
114 %a.lane.1 = fmul double %v1.lane.3, %v2.lane.2
; Only indices 0 and 1 of the 4-wide result are written; 2 and 3 stay zero.
116 %a.ins.0 = insertelement <4 x double> zeroinitializer, double %a.lane.0, i32 0
117 %a.ins.1 = insertelement <4 x double> %a.ins.0, double %a.lane.1, i32 1
119 call void @use(double %v1.lane.2)
120 call void @use(double %v1.lane.3)
121 store <4 x double> %a.ins.1, ptr %ptr.1, align 8
125 ; %v1.lane.0 and %v1.lane.1 are used in reverse order, so they won't be
126 ; directly in a vector register on AArch64.
; The expected output multiplies %v.1 in its natural lane order and then
; reverses the product with a shufflevector (mask <1, 0>) before the store.
127 define void @extract_reverse_order(ptr %ptr.1, ptr %ptr.2) {
128 ; CHECK-LABEL: @extract_reverse_order(
130 ; CHECK-NEXT: [[V_1:%.*]] = load <2 x double>, ptr [[PTR_1:%.*]], align 8
131 ; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <2 x double> [[V_1]], i32 0
132 ; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <2 x double> [[V_1]], i32 1
133 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
134 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> <i32 2, i32 2>
135 ; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[V_1]], [[TMP0]]
136 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
137 ; CHECK-NEXT: call void @use(double [[V1_LANE_0]])
138 ; CHECK-NEXT: call void @use(double [[V1_LANE_1]])
139 ; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[PTR_1]], align 8
140 ; CHECK-NEXT: ret void
143 %v.1 = load <2 x double>, ptr %ptr.1, align 8
144 %v1.lane.0 = extractelement <2 x double> %v.1, i32 0
145 %v1.lane.1 = extractelement <2 x double> %v.1, i32 1
147 %v.2 = load <4 x double>, ptr %ptr.2, align 16
148 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
; Result lane 0 uses %v.1 lane 1 and result lane 1 uses %v.1 lane 0.
150 %a.lane.0 = fmul double %v1.lane.1, %v2.lane.2
151 %a.lane.1 = fmul double %v1.lane.0, %v2.lane.2
153 %a.ins.0 = insertelement <2 x double> zeroinitializer, double %a.lane.0, i32 0
154 %a.ins.1 = insertelement <2 x double> %a.ins.0, double %a.lane.1, i32 1
156 call void @use(double %v1.lane.0)
157 call void @use(double %v1.lane.1)
159 store <2 x double> %a.ins.1, ptr %ptr.1, align 8
163 ; %v1.lane.1 and %v1.lane.2 are extracted from different vector registers on AArch64.
; The expected output gathers lanes 1 and 2 of the <4 x double> %v.1 with a
; shufflevector (mask <1, 2>), then widens the 2-wide product and blends it
; into lanes 0 and 1 of zeroinitializer.
164 define void @extract_lanes_1_and_2(ptr %ptr.1, ptr %ptr.2) {
165 ; CHECK-LABEL: @extract_lanes_1_and_2(
167 ; CHECK-NEXT: [[V_1:%.*]] = load <4 x double>, ptr [[PTR_1:%.*]], align 8
168 ; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <4 x double> [[V_1]], i32 1
169 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <4 x double> [[V_1]], i32 2
170 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
171 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[V_1]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
172 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <2 x i32> <i32 2, i32 2>
173 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
174 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
175 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP4]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
176 ; CHECK-NEXT: call void @use(double [[V1_LANE_1]])
177 ; CHECK-NEXT: call void @use(double [[V1_LANE_2]])
178 ; CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[PTR_1]], align 8
179 ; CHECK-NEXT: ret void
182 %v.1 = load <4 x double>, ptr %ptr.1, align 8
183 %v1.lane.1 = extractelement <4 x double> %v.1, i32 1
184 %v1.lane.2 = extractelement <4 x double> %v.1, i32 2
186 %v.2 = load <4 x double>, ptr %ptr.2, align 16
187 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
; Scalar products: %v.1 lanes 1 and 2, each times lane 2 of %v.2.
189 %a.lane.0 = fmul double %v1.lane.1, %v2.lane.2
190 %a.lane.1 = fmul double %v1.lane.2, %v2.lane.2
192 %a.ins.0 = insertelement <4 x double> zeroinitializer, double %a.lane.0, i32 0
193 %a.ins.1 = insertelement <4 x double> %a.ins.0, double %a.lane.1, i32 1
195 call void @use(double %v1.lane.1)
196 call void @use(double %v1.lane.2)
198 store <4 x double> %a.ins.1, ptr %ptr.1, align 8
202 ; More complex case where the extracted lanes are directly from a vector
203 ; register on AArch64 and should be considered free, because we can
204 ; directly use the source vector register.
; The four products use %v.1 lanes 2, 3, 0, 1 in that order. The expected
; output multiplies lanes 0..3 of %v.1 in natural order and then permutes the
; product with mask <2, 3, 0, 1> while widening it to <9 x double>.
; %v2.lane.1 has no users in this function; its extractelement still appears
; in the expected output.
205 define void @noop_extracts_existing_vector_4_lanes(ptr %ptr.1, ptr %ptr.2) {
206 ; CHECK-LABEL: @noop_extracts_existing_vector_4_lanes(
208 ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, ptr [[PTR_1:%.*]], align 8
209 ; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0
210 ; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1
211 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2
212 ; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <9 x double> [[V_1]], i32 3
213 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
214 ; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1
215 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
216 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <4 x i32> <i32 2, i32 0, i32 2, i32 2>
217 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP0]], [[TMP1]]
218 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <9 x i32> <i32 2, i32 3, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
219 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <9 x double> zeroinitializer, <9 x double> [[TMP4]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 4, i32 5, i32 6, i32 7, i32 8>
220 ; CHECK-NEXT: call void @use(double [[V1_LANE_0]])
221 ; CHECK-NEXT: call void @use(double [[V1_LANE_1]])
222 ; CHECK-NEXT: call void @use(double [[V1_LANE_2]])
223 ; CHECK-NEXT: call void @use(double [[V1_LANE_3]])
224 ; CHECK-NEXT: store <9 x double> [[TMP3]], ptr [[PTR_1]], align 8
225 ; CHECK-NEXT: ret void
228 %v.1 = load <9 x double>, ptr %ptr.1, align 8
229 %v1.lane.0 = extractelement <9 x double> %v.1, i32 0
230 %v1.lane.1 = extractelement <9 x double> %v.1, i32 1
231 %v1.lane.2 = extractelement <9 x double> %v.1, i32 2
232 %v1.lane.3 = extractelement <9 x double> %v.1, i32 3
233 %v.2 = load <4 x double>, ptr %ptr.2, align 16
234 %v2.lane.0 = extractelement <4 x double> %v.2, i32 0
235 %v2.lane.1 = extractelement <4 x double> %v.2, i32 1
236 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
; %v.1 lanes are consumed as 2, 3, 0, 1; only the last product uses %v.2 lane 0.
237 %a.lane.0 = fmul double %v1.lane.2, %v2.lane.2
238 %a.lane.1 = fmul double %v1.lane.3, %v2.lane.2
239 %a.lane.2 = fmul double %v1.lane.0, %v2.lane.2
240 %a.lane.3 = fmul double %v1.lane.1, %v2.lane.0
; Only indices 0..3 of the 9-wide result are written; 4..8 stay zero.
241 %a.ins.0 = insertelement <9 x double> zeroinitializer, double %a.lane.0, i32 0
242 %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1
243 %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2
244 %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3
245 call void @use(double %v1.lane.0)
246 call void @use(double %v1.lane.1)
247 call void @use(double %v1.lane.2)
248 call void @use(double %v1.lane.3)
249 store <9 x double> %a.ins.3, ptr %ptr.1, align 8
253 ; Extracted lanes are not used in the right order, so we cannot reuse the
254 ; source vector registers directly.
; The four products use %v.1 lanes 0, 2, 1, 3 in that order. The expected
; output multiplies lanes 0..3 of %v.1 in natural order and then permutes the
; product with mask <0, 2, 1, 3> while widening it to <9 x double>.
255 define void @extracts_jumbled_4_lanes(ptr %ptr.1, ptr %ptr.2) {
256 ; CHECK-LABEL: @extracts_jumbled_4_lanes(
258 ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, ptr [[PTR_1:%.*]], align 8
259 ; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0
260 ; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1
261 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2
262 ; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <9 x double> [[V_1]], i32 3
263 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
264 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
265 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <4 x i32> <i32 2, i32 2, i32 1, i32 0>
266 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP0]], [[TMP1]]
267 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <9 x i32> <i32 0, i32 2, i32 1, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
268 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <9 x double> zeroinitializer, <9 x double> [[TMP4]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 4, i32 5, i32 6, i32 7, i32 8>
269 ; CHECK-NEXT: call void @use(double [[V1_LANE_0]])
270 ; CHECK-NEXT: call void @use(double [[V1_LANE_1]])
271 ; CHECK-NEXT: call void @use(double [[V1_LANE_2]])
272 ; CHECK-NEXT: call void @use(double [[V1_LANE_3]])
273 ; CHECK-NEXT: store <9 x double> [[TMP3]], ptr [[PTR_1]], align 8
274 ; CHECK-NEXT: ret void
277 %v.1 = load <9 x double>, ptr %ptr.1, align 8
278 %v1.lane.0 = extractelement <9 x double> %v.1, i32 0
279 %v1.lane.1 = extractelement <9 x double> %v.1, i32 1
280 %v1.lane.2 = extractelement <9 x double> %v.1, i32 2
281 %v1.lane.3 = extractelement <9 x double> %v.1, i32 3
282 %v.2 = load <4 x double>, ptr %ptr.2, align 16
283 %v2.lane.0 = extractelement <4 x double> %v.2, i32 0
284 %v2.lane.1 = extractelement <4 x double> %v.2, i32 1
285 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
; %v.1 lanes are consumed as 0, 2, 1, 3 (the middle two are swapped).
286 %a.lane.0 = fmul double %v1.lane.0, %v2.lane.2
287 %a.lane.1 = fmul double %v1.lane.2, %v2.lane.1
288 %a.lane.2 = fmul double %v1.lane.1, %v2.lane.2
289 %a.lane.3 = fmul double %v1.lane.3, %v2.lane.0
290 %a.ins.0 = insertelement <9 x double> zeroinitializer, double %a.lane.0, i32 0
291 %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1
292 %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2
293 %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3
294 call void @use(double %v1.lane.0)
295 call void @use(double %v1.lane.1)
296 call void @use(double %v1.lane.2)
297 call void @use(double %v1.lane.3)
298 store <9 x double> %a.ins.3, ptr %ptr.1, align 8
303 ; Even more complex case where the extracted lanes are directly from a vector
304 ; register on AArch64 and should be considered free, because we can
305 ; directly use the source vector register.
; Two chains of nine fmuls feed a <9 x double> fsub. In the expected output the
; first eight products of each chain become one <8 x double> fmul, while the
; ninth product (result lane 8) stays a scalar fmul followed by an
; insertelement.
306 define void @noop_extracts_9_lanes(ptr %ptr.1, ptr %ptr.2) {
307 ; CHECK-LABEL: @noop_extracts_9_lanes(
309 ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, ptr [[PTR_1:%.*]], align 8
310 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2
311 ; CHECK-NEXT: [[V1_LANE_5:%.*]] = extractelement <9 x double> [[V_1]], i32 5
312 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
313 ; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0
314 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> <i32 0, i32 2, i32 1, i32 0, i32 2, i32 0, i32 2, i32 1>
315 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> <i32 2, i32 1, i32 0, i32 2, i32 1, i32 0, i32 2, i32 1>
316 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 0, i32 1>
317 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <8 x double> [[TMP0]], [[TMP1]]
318 ; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_0]]
319 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
320 ; CHECK-NEXT: [[A_INS_72:%.*]] = shufflevector <9 x double> zeroinitializer, <9 x double> [[TMP3]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 8>
321 ; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[A_INS_72]], double [[A_LANE_8]], i32 8
322 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> <i32 6, i32 7, i32 8, i32 0, i32 1, i32 2, i32 3, i32 4>
323 ; CHECK-NEXT: [[TMP6:%.*]] = fmul <8 x double> [[TMP4]], [[TMP5]]
324 ; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_5]], [[V2_LANE_0]]
325 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
326 ; CHECK-NEXT: [[B_INS_71:%.*]] = shufflevector <9 x double> zeroinitializer, <9 x double> [[TMP7]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 8>
327 ; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[B_INS_71]], double [[B_LANE_8]], i32 8
328 ; CHECK-NEXT: [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]]
329 ; CHECK-NEXT: store <9 x double> [[RES]], ptr [[PTR_1]], align 8
330 ; CHECK-NEXT: ret void
333 %v.1 = load <9 x double>, ptr %ptr.1, align 8
334 %v1.lane.0 = extractelement <9 x double> %v.1, i32 0
335 %v1.lane.1 = extractelement <9 x double> %v.1, i32 1
336 %v1.lane.2 = extractelement <9 x double> %v.1, i32 2
337 %v1.lane.3 = extractelement <9 x double> %v.1, i32 3
338 %v1.lane.4 = extractelement <9 x double> %v.1, i32 4
339 %v1.lane.5 = extractelement <9 x double> %v.1, i32 5
340 %v1.lane.6 = extractelement <9 x double> %v.1, i32 6
341 %v1.lane.7 = extractelement <9 x double> %v.1, i32 7
342 %v1.lane.8 = extractelement <9 x double> %v.1, i32 8
344 %v.2 = load <4 x double>, ptr %ptr.2, align 16
345 %v2.lane.0 = extractelement <4 x double> %v.2, i32 0
346 %v2.lane.1 = extractelement <4 x double> %v.2, i32 1
347 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
; First chain: %v.1 lanes are consumed as 3, 4, 5, 6, 7, 8, 0, 1, 2.
349 %a.lane.0 = fmul double %v1.lane.3, %v2.lane.0
350 %a.lane.1 = fmul double %v1.lane.4, %v2.lane.2
351 %a.lane.2 = fmul double %v1.lane.5, %v2.lane.1
352 %a.lane.3 = fmul double %v1.lane.6, %v2.lane.0
353 %a.lane.4 = fmul double %v1.lane.7, %v2.lane.2
354 %a.lane.5 = fmul double %v1.lane.8, %v2.lane.0
355 %a.lane.6 = fmul double %v1.lane.0, %v2.lane.2
356 %a.lane.7 = fmul double %v1.lane.1, %v2.lane.1
357 %a.lane.8 = fmul double %v1.lane.2, %v2.lane.0
359 %a.ins.0 = insertelement <9 x double> zeroinitializer, double %a.lane.0, i32 0
360 %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1
361 %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2
362 %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3
363 %a.ins.4 = insertelement <9 x double> %a.ins.3, double %a.lane.4, i32 4
364 %a.ins.5 = insertelement <9 x double> %a.ins.4, double %a.lane.5, i32 5
365 %a.ins.6 = insertelement <9 x double> %a.ins.5, double %a.lane.6, i32 6
366 %a.ins.7 = insertelement <9 x double> %a.ins.6, double %a.lane.7, i32 7
367 %a.ins.8 = insertelement <9 x double> %a.ins.7, double %a.lane.8, i32 8
; Second chain: %v.1 lanes are consumed as 6, 7, 8, 0, 1, 2, 3, 4, 5.
369 %b.lane.0 = fmul double %v1.lane.6, %v2.lane.2
370 %b.lane.1 = fmul double %v1.lane.7, %v2.lane.1
371 %b.lane.2 = fmul double %v1.lane.8, %v2.lane.0
372 %b.lane.3 = fmul double %v1.lane.0, %v2.lane.2
373 %b.lane.4 = fmul double %v1.lane.1, %v2.lane.1
374 %b.lane.5 = fmul double %v1.lane.2, %v2.lane.0
375 %b.lane.6 = fmul double %v1.lane.3, %v2.lane.2
376 %b.lane.7 = fmul double %v1.lane.4, %v2.lane.1
377 %b.lane.8 = fmul double %v1.lane.5, %v2.lane.0
379 %b.ins.0 = insertelement <9 x double> zeroinitializer, double %b.lane.0, i32 0
380 %b.ins.1 = insertelement <9 x double> %b.ins.0, double %b.lane.1, i32 1
381 %b.ins.2 = insertelement <9 x double> %b.ins.1, double %b.lane.2, i32 2
382 %b.ins.3 = insertelement <9 x double> %b.ins.2, double %b.lane.3, i32 3
383 %b.ins.4 = insertelement <9 x double> %b.ins.3, double %b.lane.4, i32 4
384 %b.ins.5 = insertelement <9 x double> %b.ins.4, double %b.lane.5, i32 5
385 %b.ins.6 = insertelement <9 x double> %b.ins.5, double %b.lane.6, i32 6
386 %b.ins.7 = insertelement <9 x double> %b.ins.6, double %b.lane.7, i32 7
387 %b.ins.8 = insertelement <9 x double> %b.ins.7, double %b.lane.8, i32 8
389 %res = fsub <9 x double> %a.ins.8, %b.ins.8
390 store <9 x double> %res, ptr %ptr.1, align 8
394 ; Extracted lanes used in the first fmul chain are not used in the right order, so
395 ; we cannot reuse the source vector registers directly.
; The second chain consumes %v.1 lanes in the rotated order 6, 7, 8, 0, ..., 5.
; The first eight products of both chains pair with the same %v.2 lanes
; (1, 0, 2, 0, 2, 1, 0, 2), so the expected output reuses one %v.2 shuffle for
; both <8 x double> fmuls. The ninth product of each chain stays scalar.
396 define void @first_mul_chain_jumbled(ptr %ptr.1, ptr %ptr.2) {
397 ; CHECK-LABEL: @first_mul_chain_jumbled(
399 ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, ptr [[PTR_1:%.*]], align 8
400 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2
401 ; CHECK-NEXT: [[V1_LANE_5:%.*]] = extractelement <9 x double> [[V_1]], i32 5
402 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
403 ; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0
404 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 2, i32 1, i32 0, i32 2>
405 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> <i32 4, i32 3, i32 6, i32 5, i32 8, i32 7, i32 1, i32 0>
406 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <8 x double> [[TMP0]], [[TMP1]]
407 ; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1
408 ; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_1]]
409 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
410 ; CHECK-NEXT: [[A_INS_72:%.*]] = shufflevector <9 x double> zeroinitializer, <9 x double> [[TMP3]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 8>
411 ; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[A_INS_72]], double [[A_LANE_8]], i32 8
412 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> <i32 6, i32 7, i32 8, i32 0, i32 1, i32 2, i32 3, i32 4>
413 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <8 x double> [[TMP4]], [[TMP1]]
414 ; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_5]], [[V2_LANE_0]]
415 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
416 ; CHECK-NEXT: [[B_INS_71:%.*]] = shufflevector <9 x double> zeroinitializer, <9 x double> [[TMP6]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 8>
417 ; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[B_INS_71]], double [[B_LANE_8]], i32 8
418 ; CHECK-NEXT: [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]]
419 ; CHECK-NEXT: store <9 x double> [[RES]], ptr [[PTR_1]], align 8
420 ; CHECK-NEXT: ret void
423 %v.1 = load <9 x double>, ptr %ptr.1, align 8
424 %v1.lane.0 = extractelement <9 x double> %v.1, i32 0
425 %v1.lane.1 = extractelement <9 x double> %v.1, i32 1
426 %v1.lane.2 = extractelement <9 x double> %v.1, i32 2
427 %v1.lane.3 = extractelement <9 x double> %v.1, i32 3
428 %v1.lane.4 = extractelement <9 x double> %v.1, i32 4
429 %v1.lane.5 = extractelement <9 x double> %v.1, i32 5
430 %v1.lane.6 = extractelement <9 x double> %v.1, i32 6
431 %v1.lane.7 = extractelement <9 x double> %v.1, i32 7
432 %v1.lane.8 = extractelement <9 x double> %v.1, i32 8
434 %v.2 = load <4 x double>, ptr %ptr.2, align 16
435 %v2.lane.0 = extractelement <4 x double> %v.2, i32 0
436 %v2.lane.1 = extractelement <4 x double> %v.2, i32 1
437 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
; First chain (jumbled): %v.1 lanes are consumed as 4, 3, 6, 5, 8, 7, 1, 0, 2.
439 %a.lane.0 = fmul double %v1.lane.4, %v2.lane.1
440 %a.lane.1 = fmul double %v1.lane.3, %v2.lane.0
441 %a.lane.2 = fmul double %v1.lane.6, %v2.lane.2
442 %a.lane.3 = fmul double %v1.lane.5, %v2.lane.0
443 %a.lane.4 = fmul double %v1.lane.8, %v2.lane.2
444 %a.lane.5 = fmul double %v1.lane.7, %v2.lane.1
445 %a.lane.6 = fmul double %v1.lane.1, %v2.lane.0
446 %a.lane.7 = fmul double %v1.lane.0, %v2.lane.2
447 %a.lane.8 = fmul double %v1.lane.2, %v2.lane.1
449 %a.ins.0 = insertelement <9 x double> zeroinitializer, double %a.lane.0, i32 0
450 %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1
451 %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2
452 %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3
453 %a.ins.4 = insertelement <9 x double> %a.ins.3, double %a.lane.4, i32 4
454 %a.ins.5 = insertelement <9 x double> %a.ins.4, double %a.lane.5, i32 5
455 %a.ins.6 = insertelement <9 x double> %a.ins.5, double %a.lane.6, i32 6
456 %a.ins.7 = insertelement <9 x double> %a.ins.6, double %a.lane.7, i32 7
457 %a.ins.8 = insertelement <9 x double> %a.ins.7, double %a.lane.8, i32 8
; Second chain (rotated): %v.1 lanes are consumed as 6, 7, 8, 0, 1, 2, 3, 4, 5.
459 %b.lane.0 = fmul double %v1.lane.6, %v2.lane.1
460 %b.lane.1 = fmul double %v1.lane.7, %v2.lane.0
461 %b.lane.2 = fmul double %v1.lane.8, %v2.lane.2
462 %b.lane.3 = fmul double %v1.lane.0, %v2.lane.0
463 %b.lane.4 = fmul double %v1.lane.1, %v2.lane.2
464 %b.lane.5 = fmul double %v1.lane.2, %v2.lane.1
465 %b.lane.6 = fmul double %v1.lane.3, %v2.lane.0
466 %b.lane.7 = fmul double %v1.lane.4, %v2.lane.2
467 %b.lane.8 = fmul double %v1.lane.5, %v2.lane.0
469 %b.ins.0 = insertelement <9 x double> zeroinitializer, double %b.lane.0, i32 0
470 %b.ins.1 = insertelement <9 x double> %b.ins.0, double %b.lane.1, i32 1
471 %b.ins.2 = insertelement <9 x double> %b.ins.1, double %b.lane.2, i32 2
472 %b.ins.3 = insertelement <9 x double> %b.ins.2, double %b.lane.3, i32 3
473 %b.ins.4 = insertelement <9 x double> %b.ins.3, double %b.lane.4, i32 4
474 %b.ins.5 = insertelement <9 x double> %b.ins.4, double %b.lane.5, i32 5
475 %b.ins.6 = insertelement <9 x double> %b.ins.5, double %b.lane.6, i32 6
476 %b.ins.7 = insertelement <9 x double> %b.ins.6, double %b.lane.7, i32 7
477 %b.ins.8 = insertelement <9 x double> %b.ins.7, double %b.lane.8, i32 8
479 %res = fsub <9 x double> %a.ins.8, %b.ins.8
480 store <9 x double> %res, ptr %ptr.1, align 8
484 ; Extracted lanes used in both fmul chains are not used in the right order, so
485 ; we cannot reuse the source vector registers directly.
; In the expected output each chain's first eight products become one
; <8 x double> fmul whose %v.1 operand is a shufflevector with a non-sequential
; mask; the ninth product of each chain stays a scalar fmul.
486 define void @first_and_second_mul_chain_jumbled(ptr %ptr.1, ptr %ptr.2) {
487 ; CHECK-LABEL: @first_and_second_mul_chain_jumbled(
489 ; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, ptr [[PTR_1:%.*]], align 8
490 ; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2
491 ; CHECK-NEXT: [[V1_LANE_4:%.*]] = extractelement <9 x double> [[V_1]], i32 4
492 ; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, ptr [[PTR_2:%.*]], align 16
493 ; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0
494 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> <i32 0, i32 2, i32 1, i32 2, i32 1, i32 0, i32 2, i32 1>
495 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[V_2]], <4 x double> poison, <8 x i32> <i32 2, i32 1, i32 0, i32 2, i32 0, i32 2, i32 1, i32 0>
496 ; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
497 ; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> <i32 4, i32 3, i32 5, i32 6, i32 8, i32 7, i32 1, i32 0>
498 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <8 x double> [[TMP0]], [[TMP1]]
499 ; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_0]]
500 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
501 ; CHECK-NEXT: [[A_INS_72:%.*]] = shufflevector <9 x double> zeroinitializer, <9 x double> [[TMP3]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 8>
502 ; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[A_INS_72]], double [[A_LANE_8]], i32 8
503 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <9 x double> [[V_1]], <9 x double> poison, <8 x i32> <i32 7, i32 6, i32 8, i32 1, i32 0, i32 3, i32 2, i32 5>
504 ; CHECK-NEXT: [[TMP6:%.*]] = fmul <8 x double> [[TMP4]], [[TMP5]]
505 ; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_4]], [[V2_LANE_2]]
506 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison>
507 ; CHECK-NEXT: [[B_INS_71:%.*]] = shufflevector <9 x double> zeroinitializer, <9 x double> [[TMP7]], <9 x i32> <i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 8>
508 ; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[B_INS_71]], double [[B_LANE_8]], i32 8
509 ; CHECK-NEXT: [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]]
510 ; CHECK-NEXT: store <9 x double> [[RES]], ptr [[PTR_1]], align 8
511 ; CHECK-NEXT: ret void
514 %v.1 = load <9 x double>, ptr %ptr.1, align 8
515 %v1.lane.0 = extractelement <9 x double> %v.1, i32 0
516 %v1.lane.1 = extractelement <9 x double> %v.1, i32 1
517 %v1.lane.2 = extractelement <9 x double> %v.1, i32 2
518 %v1.lane.3 = extractelement <9 x double> %v.1, i32 3
519 %v1.lane.4 = extractelement <9 x double> %v.1, i32 4
520 %v1.lane.5 = extractelement <9 x double> %v.1, i32 5
521 %v1.lane.6 = extractelement <9 x double> %v.1, i32 6
522 %v1.lane.7 = extractelement <9 x double> %v.1, i32 7
523 %v1.lane.8 = extractelement <9 x double> %v.1, i32 8
525 %v.2 = load <4 x double>, ptr %ptr.2, align 16
526 %v2.lane.0 = extractelement <4 x double> %v.2, i32 0
527 %v2.lane.1 = extractelement <4 x double> %v.2, i32 1
528 %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
; First chain (jumbled): %v.1 lanes are consumed as 4, 3, 5, 6, 8, 7, 1, 0, 2.
530 %a.lane.0 = fmul double %v1.lane.4, %v2.lane.0
531 %a.lane.1 = fmul double %v1.lane.3, %v2.lane.2
532 %a.lane.2 = fmul double %v1.lane.5, %v2.lane.1
533 %a.lane.3 = fmul double %v1.lane.6, %v2.lane.2
534 %a.lane.4 = fmul double %v1.lane.8, %v2.lane.1
535 %a.lane.5 = fmul double %v1.lane.7, %v2.lane.0
536 %a.lane.6 = fmul double %v1.lane.1, %v2.lane.2
537 %a.lane.7 = fmul double %v1.lane.0, %v2.lane.1
538 %a.lane.8 = fmul double %v1.lane.2, %v2.lane.0
540 %a.ins.0 = insertelement <9 x double> zeroinitializer, double %a.lane.0, i32 0
541 %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1
542 %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2
543 %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3
544 %a.ins.4 = insertelement <9 x double> %a.ins.3, double %a.lane.4, i32 4
545 %a.ins.5 = insertelement <9 x double> %a.ins.4, double %a.lane.5, i32 5
546 %a.ins.6 = insertelement <9 x double> %a.ins.5, double %a.lane.6, i32 6
547 %a.ins.7 = insertelement <9 x double> %a.ins.6, double %a.lane.7, i32 7
548 %a.ins.8 = insertelement <9 x double> %a.ins.7, double %a.lane.8, i32 8
; Second chain (jumbled): %v.1 lanes are consumed as 7, 6, 8, 1, 0, 3, 2, 5, 4.
550 %b.lane.0 = fmul double %v1.lane.7, %v2.lane.2
551 %b.lane.1 = fmul double %v1.lane.6, %v2.lane.1
552 %b.lane.2 = fmul double %v1.lane.8, %v2.lane.0
553 %b.lane.3 = fmul double %v1.lane.1, %v2.lane.2
554 %b.lane.4 = fmul double %v1.lane.0, %v2.lane.0
555 %b.lane.5 = fmul double %v1.lane.3, %v2.lane.2
556 %b.lane.6 = fmul double %v1.lane.2, %v2.lane.1
557 %b.lane.7 = fmul double %v1.lane.5, %v2.lane.0
558 %b.lane.8 = fmul double %v1.lane.4, %v2.lane.2
560 %b.ins.0 = insertelement <9 x double> zeroinitializer, double %b.lane.0, i32 0
561 %b.ins.1 = insertelement <9 x double> %b.ins.0, double %b.lane.1, i32 1
562 %b.ins.2 = insertelement <9 x double> %b.ins.1, double %b.lane.2, i32 2
563 %b.ins.3 = insertelement <9 x double> %b.ins.2, double %b.lane.3, i32 3
564 %b.ins.4 = insertelement <9 x double> %b.ins.3, double %b.lane.4, i32 4
565 %b.ins.5 = insertelement <9 x double> %b.ins.4, double %b.lane.5, i32 5
566 %b.ins.6 = insertelement <9 x double> %b.ins.5, double %b.lane.6, i32 6
567 %b.ins.7 = insertelement <9 x double> %b.ins.6, double %b.lane.7, i32 7
568 %b.ins.8 = insertelement <9 x double> %b.ins.7, double %b.lane.8, i32 8
570 %res = fsub <9 x double> %a.ins.8, %b.ins.8
571 store <9 x double> %res, ptr %ptr.1, align 8