; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -S %s | FileCheck %s

target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-darwin"

declare void @use(double)

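; Summary comment (added for readability, based on the per-function comments
; below): this file exercises the SLP vectorizer's cost modeling of
; extractelement instructions on AArch64. Extracts of consecutive lanes that
; line up with a source vector register should be treated as free, while
; reordered, jumbled, or cross-register extracts should not.
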
; The extracts %v1.lane.0 and %v1.lane.1 should be considered free during SLP,
; because they will be directly in a vector register on AArch64.
define void @noop_extracts_first_2_lanes(<2 x double>* %ptr.1, <4 x double>* %ptr.2) {
; CHECK-LABEL: @noop_extracts_first_2_lanes(
; CHECK-NEXT: [[V_1:%.*]] = load <2 x double>, <2 x double>* [[PTR_1:%.*]], align 8
; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16
; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
; CHECK-NEXT: [[V2_LANE_3:%.*]] = extractelement <4 x double> [[V_2]], i32 3
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V2_LANE_3]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[V_1]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[V_1]], i32 0
; CHECK-NEXT: call void @use(double [[TMP3]])
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[V_1]], i32 1
; CHECK-NEXT: call void @use(double [[TMP4]])
; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[PTR_1]], align 8
; CHECK-NEXT: ret void
;
  %v.1 = load <2 x double>, <2 x double>* %ptr.1, align 8
  %v1.lane.0 = extractelement <2 x double> %v.1, i32 0
  %v1.lane.1 = extractelement <2 x double> %v.1, i32 1

  %v.2 = load <4 x double>, <4 x double>* %ptr.2, align 16
  %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
  %v2.lane.3 = extractelement <4 x double> %v.2, i32 3

  %a.lane.0 = fmul double %v1.lane.0, %v2.lane.2
  %a.lane.1 = fmul double %v1.lane.1, %v2.lane.3

  %a.ins.0 = insertelement <2 x double> undef, double %a.lane.0, i32 0
  %a.ins.1 = insertelement <2 x double> %a.ins.0, double %a.lane.1, i32 1

  call void @use(double %v1.lane.0)
  call void @use(double %v1.lane.1)

  store <2 x double> %a.ins.1, <2 x double>* %ptr.1, align 8
  ret void
}

; Extracts of consecutive indices, but from different vector operands.
define void @extracts_first_2_lanes_different_vectors(<2 x double>* %ptr.1, <4 x double>* %ptr.2, <2 x double>* %ptr.3) {
; CHECK-LABEL: @extracts_first_2_lanes_different_vectors(
; CHECK-NEXT: [[V_1:%.*]] = load <2 x double>, <2 x double>* [[PTR_1:%.*]], align 8
; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <2 x double> [[V_1]], i32 0
; CHECK-NEXT: [[V_3:%.*]] = load <2 x double>, <2 x double>* [[PTR_3:%.*]], align 8
; CHECK-NEXT: [[V3_LANE_1:%.*]] = extractelement <2 x double> [[V_3]], i32 1
; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16
; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_0]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V3_LANE_1]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT: call void @use(double [[V1_LANE_0]])
; CHECK-NEXT: call void @use(double [[V3_LANE_1]])
; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[PTR_1]], align 8
; CHECK-NEXT: ret void
;
  %v.1 = load <2 x double>, <2 x double>* %ptr.1, align 8
  %v1.lane.0 = extractelement <2 x double> %v.1, i32 0
  %v.3 = load <2 x double>, <2 x double>* %ptr.3, align 8
  %v3.lane.1 = extractelement <2 x double> %v.3, i32 1

  %v.2 = load <4 x double>, <4 x double>* %ptr.2, align 16
  %v2.lane.2 = extractelement <4 x double> %v.2, i32 2

  %a.lane.0 = fmul double %v1.lane.0, %v2.lane.2
  %a.lane.1 = fmul double %v3.lane.1, %v2.lane.2

  %a.ins.0 = insertelement <2 x double> undef, double %a.lane.0, i32 0
  %a.ins.1 = insertelement <2 x double> %a.ins.0, double %a.lane.1, i32 1

  call void @use(double %v1.lane.0)
  call void @use(double %v3.lane.1)

  store <2 x double> %a.ins.1, <2 x double>* %ptr.1, align 8
  ret void
}

; The extracts %v1.lane.2 and %v1.lane.3 should be considered free during SLP,
; because they will be directly in a vector register on AArch64.
define void @noop_extract_second_2_lanes(<4 x double>* %ptr.1, <4 x double>* %ptr.2) {
; CHECK-LABEL: @noop_extract_second_2_lanes(
; CHECK-NEXT: [[V_1:%.*]] = load <4 x double>, <4 x double>* [[PTR_1:%.*]], align 8
; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <4 x double> [[V_1]], i32 2
; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <4 x double> [[V_1]], i32 3
; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16
; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_2]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1_LANE_3]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: call void @use(double [[V1_LANE_2]])
; CHECK-NEXT: call void @use(double [[V1_LANE_3]])
; CHECK-NEXT: store <4 x double> [[TMP5]], <4 x double>* [[PTR_1]], align 8
; CHECK-NEXT: ret void
;
  %v.1 = load <4 x double>, <4 x double>* %ptr.1, align 8
  %v1.lane.2 = extractelement <4 x double> %v.1, i32 2
  %v1.lane.3 = extractelement <4 x double> %v.1, i32 3

  %v.2 = load <4 x double>, <4 x double>* %ptr.2, align 16
  %v2.lane.2 = extractelement <4 x double> %v.2, i32 2

  %a.lane.0 = fmul double %v1.lane.2, %v2.lane.2
  %a.lane.1 = fmul double %v1.lane.3, %v2.lane.2

  %a.ins.0 = insertelement <4 x double> undef, double %a.lane.0, i32 0
  %a.ins.1 = insertelement <4 x double> %a.ins.0, double %a.lane.1, i32 1

  call void @use(double %v1.lane.2)
  call void @use(double %v1.lane.3)
  store <4 x double> %a.ins.1, <4 x double>* %ptr.1, align 8
  ret void
}

; %v1.lane.0 and %v1.lane.1 are used in reverse order, so they won't be
; directly in a vector register on AArch64.
define void @extract_reverse_order(<2 x double>* %ptr.1, <4 x double>* %ptr.2) {
; CHECK-LABEL: @extract_reverse_order(
; CHECK-NEXT: [[V_1:%.*]] = load <2 x double>, <2 x double>* [[PTR_1:%.*]], align 8
; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16
; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V2_LANE_2]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[V_1]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[V_1]], i32 0
; CHECK-NEXT: call void @use(double [[TMP4]])
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[V_1]], i32 1
; CHECK-NEXT: call void @use(double [[TMP5]])
; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[PTR_1]], align 8
; CHECK-NEXT: ret void
;
  %v.1 = load <2 x double>, <2 x double>* %ptr.1, align 8
  %v1.lane.0 = extractelement <2 x double> %v.1, i32 0
  %v1.lane.1 = extractelement <2 x double> %v.1, i32 1

  %v.2 = load <4 x double>, <4 x double>* %ptr.2, align 16
  %v2.lane.2 = extractelement <4 x double> %v.2, i32 2

  %a.lane.0 = fmul double %v1.lane.1, %v2.lane.2
  %a.lane.1 = fmul double %v1.lane.0, %v2.lane.2

  %a.ins.0 = insertelement <2 x double> undef, double %a.lane.0, i32 0
  %a.ins.1 = insertelement <2 x double> %a.ins.0, double %a.lane.1, i32 1

  call void @use(double %v1.lane.0)
  call void @use(double %v1.lane.1)

  store <2 x double> %a.ins.1, <2 x double>* %ptr.1, align 8
  ret void
}

; %v1.lane.1 and %v1.lane.2 are extracted from different vector registers on AArch64.
define void @extract_lanes_1_and_2(<4 x double>* %ptr.1, <4 x double>* %ptr.2) {
; CHECK-LABEL: @extract_lanes_1_and_2(
; CHECK-NEXT: [[V_1:%.*]] = load <4 x double>, <4 x double>* [[PTR_1:%.*]], align 8
; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <4 x double> [[V_1]], i32 1
; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <4 x double> [[V_1]], i32 2
; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16
; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V1_LANE_1]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1_LANE_2]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V2_LANE_2]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2_LANE_2]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: call void @use(double [[V1_LANE_1]])
; CHECK-NEXT: call void @use(double [[V1_LANE_2]])
; CHECK-NEXT: store <4 x double> [[TMP5]], <4 x double>* [[PTR_1]], align 8
; CHECK-NEXT: ret void
;
  %v.1 = load <4 x double>, <4 x double>* %ptr.1, align 8
  %v1.lane.1 = extractelement <4 x double> %v.1, i32 1
  %v1.lane.2 = extractelement <4 x double> %v.1, i32 2

  %v.2 = load <4 x double>, <4 x double>* %ptr.2, align 16
  %v2.lane.2 = extractelement <4 x double> %v.2, i32 2

  %a.lane.0 = fmul double %v1.lane.1, %v2.lane.2
  %a.lane.1 = fmul double %v1.lane.2, %v2.lane.2

  %a.ins.0 = insertelement <4 x double> undef, double %a.lane.0, i32 0
  %a.ins.1 = insertelement <4 x double> %a.ins.0, double %a.lane.1, i32 1

  call void @use(double %v1.lane.1)
  call void @use(double %v1.lane.2)

  store <4 x double> %a.ins.1, <4 x double>* %ptr.1, align 8
  ret void
}

; More complex case where the extracted lanes are directly from a vector
; register on AArch64 and should be considered free, because we can
; directly use the source vector register.
define void @noop_extracts_existing_vector_4_lanes(<9 x double>* %ptr.1, <4 x double>* %ptr.2) {
; CHECK-LABEL: @noop_extracts_existing_vector_4_lanes(
; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, <9 x double>* [[PTR_1:%.*]], align 8
; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0
; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1
; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2
; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <9 x double> [[V_1]], i32 3
; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16
; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0
; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1
; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> poison, double [[V1_LANE_2]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[V1_LANE_3]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[V1_LANE_0]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[V1_LANE_1]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> poison, double [[V2_LANE_2]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[V2_LANE_0]], i32 1
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP5]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP3]], [[SHUFFLE]]
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: call void @use(double [[V1_LANE_0]])
; CHECK-NEXT: call void @use(double [[V1_LANE_1]])
; CHECK-NEXT: call void @use(double [[V1_LANE_2]])
; CHECK-NEXT: call void @use(double [[V1_LANE_3]])
; CHECK-NEXT: store <9 x double> [[TMP7]], <9 x double>* [[PTR_1]], align 8
; CHECK-NEXT: ret void
;
  %v.1 = load <9 x double>, <9 x double>* %ptr.1, align 8
  %v1.lane.0 = extractelement <9 x double> %v.1, i32 0
  %v1.lane.1 = extractelement <9 x double> %v.1, i32 1
  %v1.lane.2 = extractelement <9 x double> %v.1, i32 2
  %v1.lane.3 = extractelement <9 x double> %v.1, i32 3
  %v.2 = load <4 x double>, <4 x double>* %ptr.2, align 16
  %v2.lane.0 = extractelement <4 x double> %v.2, i32 0
  %v2.lane.1 = extractelement <4 x double> %v.2, i32 1
  %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
  %a.lane.0 = fmul double %v1.lane.2, %v2.lane.2
  %a.lane.1 = fmul double %v1.lane.3, %v2.lane.2
  %a.lane.2 = fmul double %v1.lane.0, %v2.lane.2
  %a.lane.3 = fmul double %v1.lane.1, %v2.lane.0
  %a.ins.0 = insertelement <9 x double> undef, double %a.lane.0, i32 0
  %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1
  %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2
  %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3
  call void @use(double %v1.lane.0)
  call void @use(double %v1.lane.1)
  call void @use(double %v1.lane.2)
  call void @use(double %v1.lane.3)
  store <9 x double> %a.ins.3, <9 x double>* %ptr.1, align 8
  ret void
}

; Extracted lanes are not used in the right order, so we cannot reuse the
; source vector registers directly.
define void @extracts_jumbled_4_lanes(<9 x double>* %ptr.1, <4 x double>* %ptr.2) {
; CHECK-LABEL: @extracts_jumbled_4_lanes(
; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, <9 x double>* [[PTR_1:%.*]], align 8
; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0
; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1
; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2
; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <9 x double> [[V_1]], i32 3
; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16
; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0
; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1
; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
; CHECK-NEXT: [[A_LANE_0:%.*]] = fmul double [[V1_LANE_0]], [[V2_LANE_2]]
; CHECK-NEXT: [[A_LANE_1:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_1]]
; CHECK-NEXT: [[A_LANE_2:%.*]] = fmul double [[V1_LANE_1]], [[V2_LANE_2]]
; CHECK-NEXT: [[A_LANE_3:%.*]] = fmul double [[V1_LANE_3]], [[V2_LANE_0]]
; CHECK-NEXT: [[A_INS_0:%.*]] = insertelement <9 x double> undef, double [[A_LANE_0]], i32 0
; CHECK-NEXT: [[A_INS_1:%.*]] = insertelement <9 x double> [[A_INS_0]], double [[A_LANE_1]], i32 1
; CHECK-NEXT: [[A_INS_2:%.*]] = insertelement <9 x double> [[A_INS_1]], double [[A_LANE_2]], i32 2
; CHECK-NEXT: [[A_INS_3:%.*]] = insertelement <9 x double> [[A_INS_2]], double [[A_LANE_3]], i32 3
; CHECK-NEXT: call void @use(double [[V1_LANE_0]])
; CHECK-NEXT: call void @use(double [[V1_LANE_1]])
; CHECK-NEXT: call void @use(double [[V1_LANE_2]])
; CHECK-NEXT: call void @use(double [[V1_LANE_3]])
; CHECK-NEXT: store <9 x double> [[A_INS_3]], <9 x double>* [[PTR_1]], align 8
; CHECK-NEXT: ret void
;
  %v.1 = load <9 x double>, <9 x double>* %ptr.1, align 8
  %v1.lane.0 = extractelement <9 x double> %v.1, i32 0
  %v1.lane.1 = extractelement <9 x double> %v.1, i32 1
  %v1.lane.2 = extractelement <9 x double> %v.1, i32 2
  %v1.lane.3 = extractelement <9 x double> %v.1, i32 3
  %v.2 = load <4 x double>, <4 x double>* %ptr.2, align 16
  %v2.lane.0 = extractelement <4 x double> %v.2, i32 0
  %v2.lane.1 = extractelement <4 x double> %v.2, i32 1
  %v2.lane.2 = extractelement <4 x double> %v.2, i32 2
  %a.lane.0 = fmul double %v1.lane.0, %v2.lane.2
  %a.lane.1 = fmul double %v1.lane.2, %v2.lane.1
  %a.lane.2 = fmul double %v1.lane.1, %v2.lane.2
  %a.lane.3 = fmul double %v1.lane.3, %v2.lane.0
  %a.ins.0 = insertelement <9 x double> undef, double %a.lane.0, i32 0
  %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1
  %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2
  %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3
  call void @use(double %v1.lane.0)
  call void @use(double %v1.lane.1)
  call void @use(double %v1.lane.2)
  call void @use(double %v1.lane.3)
  store <9 x double> %a.ins.3, <9 x double>* %ptr.1, align 8
  ret void
}

; Even more complex case where the extracted lanes are directly from a vector
; register on AArch64 and should be considered free, because we can
; directly use the source vector register.
define void @noop_extracts_9_lanes(<9 x double>* %ptr.1, <4 x double>* %ptr.2) {
; CHECK-LABEL: @noop_extracts_9_lanes(
; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, <9 x double>* [[PTR_1:%.*]], align 8
; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0
; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1
; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2
; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <9 x double> [[V_1]], i32 3
; CHECK-NEXT: [[V1_LANE_4:%.*]] = extractelement <9 x double> [[V_1]], i32 4
; CHECK-NEXT: [[V1_LANE_5:%.*]] = extractelement <9 x double> [[V_1]], i32 5
; CHECK-NEXT: [[V1_LANE_6:%.*]] = extractelement <9 x double> [[V_1]], i32 6
; CHECK-NEXT: [[V1_LANE_7:%.*]] = extractelement <9 x double> [[V_1]], i32 7
; CHECK-NEXT: [[V1_LANE_8:%.*]] = extractelement <9 x double> [[V_1]], i32 8
; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16
; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0
; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1
; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_3]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x double> [[TMP0]], double [[V1_LANE_4]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x double> [[TMP1]], double [[V1_LANE_5]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x double> [[TMP2]], double [[V1_LANE_6]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x double> [[TMP3]], double [[V1_LANE_7]], i32 4
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x double> [[TMP4]], double [[V1_LANE_8]], i32 5
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[V1_LANE_0]], i32 6
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[V1_LANE_1]], i32 7
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_0]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[V2_LANE_2]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[V2_LANE_1]], i32 2
; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x double> [[TMP10]], <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 0, i32 1, i32 0, i32 1, i32 2>
; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x double> [[TMP7]], [[SHUFFLE1]]
; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_0]]
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef>
; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP12]], double [[A_LANE_8]], i32 8
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_6]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[V1_LANE_7]], i32 1
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[V1_LANE_8]], i32 2
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x double> [[TMP15]], double [[V1_LANE_0]], i32 3
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x double> [[TMP16]], double [[V1_LANE_1]], i32 4
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x double> [[TMP17]], double [[V1_LANE_2]], i32 5
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x double> [[TMP18]], double [[V1_LANE_3]], i32 6
; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x double> [[TMP19]], double [[V1_LANE_4]], i32 7
; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_2]], i32 0
; CHECK-NEXT: [[TMP22:%.*]] = insertelement <8 x double> [[TMP21]], double [[V2_LANE_1]], i32 1
; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x double> [[TMP22]], double [[V2_LANE_0]], i32 2
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x double> [[TMP23]], <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 0, i32 1, i32 2, i32 0, i32 1>
; CHECK-NEXT: [[TMP24:%.*]] = fmul <8 x double> [[TMP20]], [[SHUFFLE]]
; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_5]], [[V2_LANE_0]]
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <8 x double> [[TMP24]], <8 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef>
; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP25]], double [[B_LANE_8]], i32 8
; CHECK-NEXT: [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]]
; CHECK-NEXT: store <9 x double> [[RES]], <9 x double>* [[PTR_1]], align 8
; CHECK-NEXT: ret void
;
  %v.1 = load <9 x double>, <9 x double>* %ptr.1, align 8
  %v1.lane.0 = extractelement <9 x double> %v.1, i32 0
  %v1.lane.1 = extractelement <9 x double> %v.1, i32 1
  %v1.lane.2 = extractelement <9 x double> %v.1, i32 2
  %v1.lane.3 = extractelement <9 x double> %v.1, i32 3
  %v1.lane.4 = extractelement <9 x double> %v.1, i32 4
  %v1.lane.5 = extractelement <9 x double> %v.1, i32 5
  %v1.lane.6 = extractelement <9 x double> %v.1, i32 6
  %v1.lane.7 = extractelement <9 x double> %v.1, i32 7
  %v1.lane.8 = extractelement <9 x double> %v.1, i32 8

  %v.2 = load <4 x double>, <4 x double>* %ptr.2, align 16
  %v2.lane.0 = extractelement <4 x double> %v.2, i32 0
  %v2.lane.1 = extractelement <4 x double> %v.2, i32 1
  %v2.lane.2 = extractelement <4 x double> %v.2, i32 2

  %a.lane.0 = fmul double %v1.lane.3, %v2.lane.0
  %a.lane.1 = fmul double %v1.lane.4, %v2.lane.2
  %a.lane.2 = fmul double %v1.lane.5, %v2.lane.1
  %a.lane.3 = fmul double %v1.lane.6, %v2.lane.0
  %a.lane.4 = fmul double %v1.lane.7, %v2.lane.2
  %a.lane.5 = fmul double %v1.lane.8, %v2.lane.0
  %a.lane.6 = fmul double %v1.lane.0, %v2.lane.2
  %a.lane.7 = fmul double %v1.lane.1, %v2.lane.1
  %a.lane.8 = fmul double %v1.lane.2, %v2.lane.0

  %a.ins.0 = insertelement <9 x double> undef, double %a.lane.0, i32 0
  %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1
  %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2
  %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3
  %a.ins.4 = insertelement <9 x double> %a.ins.3, double %a.lane.4, i32 4
  %a.ins.5 = insertelement <9 x double> %a.ins.4, double %a.lane.5, i32 5
  %a.ins.6 = insertelement <9 x double> %a.ins.5, double %a.lane.6, i32 6
  %a.ins.7 = insertelement <9 x double> %a.ins.6, double %a.lane.7, i32 7
  %a.ins.8 = insertelement <9 x double> %a.ins.7, double %a.lane.8, i32 8

  %b.lane.0 = fmul double %v1.lane.6, %v2.lane.2
  %b.lane.1 = fmul double %v1.lane.7, %v2.lane.1
  %b.lane.2 = fmul double %v1.lane.8, %v2.lane.0
  %b.lane.3 = fmul double %v1.lane.0, %v2.lane.2
  %b.lane.4 = fmul double %v1.lane.1, %v2.lane.1
  %b.lane.5 = fmul double %v1.lane.2, %v2.lane.0
  %b.lane.6 = fmul double %v1.lane.3, %v2.lane.2
  %b.lane.7 = fmul double %v1.lane.4, %v2.lane.1
  %b.lane.8 = fmul double %v1.lane.5, %v2.lane.0

  %b.ins.0 = insertelement <9 x double> undef, double %b.lane.0, i32 0
  %b.ins.1 = insertelement <9 x double> %b.ins.0, double %b.lane.1, i32 1
  %b.ins.2 = insertelement <9 x double> %b.ins.1, double %b.lane.2, i32 2
  %b.ins.3 = insertelement <9 x double> %b.ins.2, double %b.lane.3, i32 3
  %b.ins.4 = insertelement <9 x double> %b.ins.3, double %b.lane.4, i32 4
  %b.ins.5 = insertelement <9 x double> %b.ins.4, double %b.lane.5, i32 5
  %b.ins.6 = insertelement <9 x double> %b.ins.5, double %b.lane.6, i32 6
  %b.ins.7 = insertelement <9 x double> %b.ins.6, double %b.lane.7, i32 7
  %b.ins.8 = insertelement <9 x double> %b.ins.7, double %b.lane.8, i32 8

  %res = fsub <9 x double> %a.ins.8, %b.ins.8
  store <9 x double> %res, <9 x double>* %ptr.1, align 8
  ret void
}

; Extracted lanes used in the first fmul chain are not used in the right order,
; so we cannot reuse the source vector registers directly.
define void @first_mul_chain_jumbled(<9 x double>* %ptr.1, <4 x double>* %ptr.2) {
; CHECK-LABEL: @first_mul_chain_jumbled(
; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, <9 x double>* [[PTR_1:%.*]], align 8
; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0
; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1
; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2
; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <9 x double> [[V_1]], i32 3
; CHECK-NEXT: [[V1_LANE_4:%.*]] = extractelement <9 x double> [[V_1]], i32 4
; CHECK-NEXT: [[V1_LANE_5:%.*]] = extractelement <9 x double> [[V_1]], i32 5
; CHECK-NEXT: [[V1_LANE_6:%.*]] = extractelement <9 x double> [[V_1]], i32 6
; CHECK-NEXT: [[V1_LANE_7:%.*]] = extractelement <9 x double> [[V_1]], i32 7
; CHECK-NEXT: [[V1_LANE_8:%.*]] = extractelement <9 x double> [[V_1]], i32 8
; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16
; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0
; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1
; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_4]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x double> [[TMP0]], double [[V1_LANE_3]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x double> [[TMP1]], double [[V1_LANE_6]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x double> [[TMP2]], double [[V1_LANE_5]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x double> [[TMP3]], double [[V1_LANE_8]], i32 4
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x double> [[TMP4]], double [[V1_LANE_7]], i32 5
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[V1_LANE_1]], i32 6
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[V1_LANE_0]], i32 7
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_1]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[V2_LANE_0]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[V2_LANE_2]], i32 2
; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x double> [[TMP10]], <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 1, i32 2, i32 0, i32 1, i32 2>
; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x double> [[TMP7]], [[SHUFFLE1]]
; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_1]]
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef>
; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP12]], double [[A_LANE_8]], i32 8
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_6]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[V1_LANE_7]], i32 1
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[V1_LANE_8]], i32 2
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x double> [[TMP15]], double [[V1_LANE_0]], i32 3
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x double> [[TMP16]], double [[V1_LANE_1]], i32 4
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x double> [[TMP17]], double [[V1_LANE_2]], i32 5
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x double> [[TMP18]], double [[V1_LANE_3]], i32 6
; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x double> [[TMP19]], double [[V1_LANE_4]], i32 7
; CHECK-NEXT: [[TMP21:%.*]] = fmul <8 x double> [[TMP20]], [[SHUFFLE1]]
; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_5]], [[V2_LANE_0]]
; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x double> [[TMP21]], <8 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef>
; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP22]], double [[B_LANE_8]], i32 8
; CHECK-NEXT: [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]]
; CHECK-NEXT: store <9 x double> [[RES]], <9 x double>* [[PTR_1]], align 8
; CHECK-NEXT: ret void
;
  %v.1 = load <9 x double>, <9 x double>* %ptr.1, align 8
  %v1.lane.0 = extractelement <9 x double> %v.1, i32 0
  %v1.lane.1 = extractelement <9 x double> %v.1, i32 1
  %v1.lane.2 = extractelement <9 x double> %v.1, i32 2
  %v1.lane.3 = extractelement <9 x double> %v.1, i32 3
  %v1.lane.4 = extractelement <9 x double> %v.1, i32 4
  %v1.lane.5 = extractelement <9 x double> %v.1, i32 5
  %v1.lane.6 = extractelement <9 x double> %v.1, i32 6
  %v1.lane.7 = extractelement <9 x double> %v.1, i32 7
  %v1.lane.8 = extractelement <9 x double> %v.1, i32 8

  %v.2 = load <4 x double>, <4 x double>* %ptr.2, align 16
  %v2.lane.0 = extractelement <4 x double> %v.2, i32 0
  %v2.lane.1 = extractelement <4 x double> %v.2, i32 1
  %v2.lane.2 = extractelement <4 x double> %v.2, i32 2

  %a.lane.0 = fmul double %v1.lane.4, %v2.lane.1
  %a.lane.1 = fmul double %v1.lane.3, %v2.lane.0
  %a.lane.2 = fmul double %v1.lane.6, %v2.lane.2
  %a.lane.3 = fmul double %v1.lane.5, %v2.lane.0
  %a.lane.4 = fmul double %v1.lane.8, %v2.lane.2
  %a.lane.5 = fmul double %v1.lane.7, %v2.lane.1
  %a.lane.6 = fmul double %v1.lane.1, %v2.lane.0
  %a.lane.7 = fmul double %v1.lane.0, %v2.lane.2
  %a.lane.8 = fmul double %v1.lane.2, %v2.lane.1

  %a.ins.0 = insertelement <9 x double> undef, double %a.lane.0, i32 0
  %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1
  %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2
  %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3
  %a.ins.4 = insertelement <9 x double> %a.ins.3, double %a.lane.4, i32 4
  %a.ins.5 = insertelement <9 x double> %a.ins.4, double %a.lane.5, i32 5
  %a.ins.6 = insertelement <9 x double> %a.ins.5, double %a.lane.6, i32 6
  %a.ins.7 = insertelement <9 x double> %a.ins.6, double %a.lane.7, i32 7
  %a.ins.8 = insertelement <9 x double> %a.ins.7, double %a.lane.8, i32 8

  %b.lane.0 = fmul double %v1.lane.6, %v2.lane.1
  %b.lane.1 = fmul double %v1.lane.7, %v2.lane.0
  %b.lane.2 = fmul double %v1.lane.8, %v2.lane.2
  %b.lane.3 = fmul double %v1.lane.0, %v2.lane.0
  %b.lane.4 = fmul double %v1.lane.1, %v2.lane.2
  %b.lane.5 = fmul double %v1.lane.2, %v2.lane.1
  %b.lane.6 = fmul double %v1.lane.3, %v2.lane.0
  %b.lane.7 = fmul double %v1.lane.4, %v2.lane.2
  %b.lane.8 = fmul double %v1.lane.5, %v2.lane.0

  %b.ins.0 = insertelement <9 x double> undef, double %b.lane.0, i32 0
  %b.ins.1 = insertelement <9 x double> %b.ins.0, double %b.lane.1, i32 1
  %b.ins.2 = insertelement <9 x double> %b.ins.1, double %b.lane.2, i32 2
  %b.ins.3 = insertelement <9 x double> %b.ins.2, double %b.lane.3, i32 3
  %b.ins.4 = insertelement <9 x double> %b.ins.3, double %b.lane.4, i32 4
  %b.ins.5 = insertelement <9 x double> %b.ins.4, double %b.lane.5, i32 5
  %b.ins.6 = insertelement <9 x double> %b.ins.5, double %b.lane.6, i32 6
  %b.ins.7 = insertelement <9 x double> %b.ins.6, double %b.lane.7, i32 7
  %b.ins.8 = insertelement <9 x double> %b.ins.7, double %b.lane.8, i32 8

  %res = fsub <9 x double> %a.ins.8, %b.ins.8
  store <9 x double> %res, <9 x double>* %ptr.1, align 8
  ret void
}

; Extracted lanes used in both fmul chains are not used in the right order, so
; we cannot reuse the source vector registers directly.
define void @first_and_second_mul_chain_jumbled(<9 x double>* %ptr.1, <4 x double>* %ptr.2) {
; CHECK-LABEL: @first_and_second_mul_chain_jumbled(
; CHECK-NEXT: [[V_1:%.*]] = load <9 x double>, <9 x double>* [[PTR_1:%.*]], align 8
; CHECK-NEXT: [[V1_LANE_0:%.*]] = extractelement <9 x double> [[V_1]], i32 0
; CHECK-NEXT: [[V1_LANE_1:%.*]] = extractelement <9 x double> [[V_1]], i32 1
; CHECK-NEXT: [[V1_LANE_2:%.*]] = extractelement <9 x double> [[V_1]], i32 2
; CHECK-NEXT: [[V1_LANE_3:%.*]] = extractelement <9 x double> [[V_1]], i32 3
; CHECK-NEXT: [[V1_LANE_4:%.*]] = extractelement <9 x double> [[V_1]], i32 4
; CHECK-NEXT: [[V1_LANE_5:%.*]] = extractelement <9 x double> [[V_1]], i32 5
; CHECK-NEXT: [[V1_LANE_6:%.*]] = extractelement <9 x double> [[V_1]], i32 6
; CHECK-NEXT: [[V1_LANE_7:%.*]] = extractelement <9 x double> [[V_1]], i32 7
; CHECK-NEXT: [[V1_LANE_8:%.*]] = extractelement <9 x double> [[V_1]], i32 8
; CHECK-NEXT: [[V_2:%.*]] = load <4 x double>, <4 x double>* [[PTR_2:%.*]], align 16
; CHECK-NEXT: [[V2_LANE_0:%.*]] = extractelement <4 x double> [[V_2]], i32 0
; CHECK-NEXT: [[V2_LANE_1:%.*]] = extractelement <4 x double> [[V_2]], i32 1
; CHECK-NEXT: [[V2_LANE_2:%.*]] = extractelement <4 x double> [[V_2]], i32 2
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_4]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x double> [[TMP0]], double [[V1_LANE_3]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x double> [[TMP1]], double [[V1_LANE_5]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x double> [[TMP2]], double [[V1_LANE_6]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x double> [[TMP3]], double [[V1_LANE_8]], i32 4
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x double> [[TMP4]], double [[V1_LANE_7]], i32 5
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[V1_LANE_1]], i32 6
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[V1_LANE_0]], i32 7
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_0]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[V2_LANE_2]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[V2_LANE_1]], i32 2
; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x double> [[TMP10]], <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 1, i32 2, i32 0, i32 1, i32 2>
; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x double> [[TMP7]], [[SHUFFLE1]]
; CHECK-NEXT: [[A_LANE_8:%.*]] = fmul double [[V1_LANE_2]], [[V2_LANE_0]]
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x double> [[TMP11]], <8 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef>
; CHECK-NEXT: [[A_INS_8:%.*]] = insertelement <9 x double> [[TMP12]], double [[A_LANE_8]], i32 8
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> poison, double [[V1_LANE_7]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[V1_LANE_6]], i32 1
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[V1_LANE_8]], i32 2
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x double> [[TMP15]], double [[V1_LANE_1]], i32 3
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x double> [[TMP16]], double [[V1_LANE_0]], i32 4
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x double> [[TMP17]], double [[V1_LANE_3]], i32 5
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x double> [[TMP18]], double [[V1_LANE_2]], i32 6
; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x double> [[TMP19]], double [[V1_LANE_5]], i32 7
; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x double> poison, double [[V2_LANE_2]], i32 0
; CHECK-NEXT: [[TMP22:%.*]] = insertelement <8 x double> [[TMP21]], double [[V2_LANE_1]], i32 1
; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x double> [[TMP22]], double [[V2_LANE_0]], i32 2
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x double> [[TMP23]], <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 0, i32 2, i32 0, i32 1, i32 2>
; CHECK-NEXT: [[TMP24:%.*]] = fmul <8 x double> [[TMP20]], [[SHUFFLE]]
; CHECK-NEXT: [[B_LANE_8:%.*]] = fmul double [[V1_LANE_4]], [[V2_LANE_2]]
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <8 x double> [[TMP24]], <8 x double> poison, <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef>
; CHECK-NEXT: [[B_INS_8:%.*]] = insertelement <9 x double> [[TMP25]], double [[B_LANE_8]], i32 8
; CHECK-NEXT: [[RES:%.*]] = fsub <9 x double> [[A_INS_8]], [[B_INS_8]]
; CHECK-NEXT: store <9 x double> [[RES]], <9 x double>* [[PTR_1]], align 8
; CHECK-NEXT: ret void
;
  %v.1 = load <9 x double>, <9 x double>* %ptr.1, align 8
  %v1.lane.0 = extractelement <9 x double> %v.1, i32 0
  %v1.lane.1 = extractelement <9 x double> %v.1, i32 1
  %v1.lane.2 = extractelement <9 x double> %v.1, i32 2
  %v1.lane.3 = extractelement <9 x double> %v.1, i32 3
  %v1.lane.4 = extractelement <9 x double> %v.1, i32 4
  %v1.lane.5 = extractelement <9 x double> %v.1, i32 5
  %v1.lane.6 = extractelement <9 x double> %v.1, i32 6
  %v1.lane.7 = extractelement <9 x double> %v.1, i32 7
  %v1.lane.8 = extractelement <9 x double> %v.1, i32 8

  %v.2 = load <4 x double>, <4 x double>* %ptr.2, align 16
  %v2.lane.0 = extractelement <4 x double> %v.2, i32 0
  %v2.lane.1 = extractelement <4 x double> %v.2, i32 1
  %v2.lane.2 = extractelement <4 x double> %v.2, i32 2

  %a.lane.0 = fmul double %v1.lane.4, %v2.lane.0
  %a.lane.1 = fmul double %v1.lane.3, %v2.lane.2
  %a.lane.2 = fmul double %v1.lane.5, %v2.lane.1
  %a.lane.3 = fmul double %v1.lane.6, %v2.lane.2
  %a.lane.4 = fmul double %v1.lane.8, %v2.lane.1
  %a.lane.5 = fmul double %v1.lane.7, %v2.lane.0
  %a.lane.6 = fmul double %v1.lane.1, %v2.lane.2
  %a.lane.7 = fmul double %v1.lane.0, %v2.lane.1
  %a.lane.8 = fmul double %v1.lane.2, %v2.lane.0

  %a.ins.0 = insertelement <9 x double> undef, double %a.lane.0, i32 0
  %a.ins.1 = insertelement <9 x double> %a.ins.0, double %a.lane.1, i32 1
  %a.ins.2 = insertelement <9 x double> %a.ins.1, double %a.lane.2, i32 2
  %a.ins.3 = insertelement <9 x double> %a.ins.2, double %a.lane.3, i32 3
  %a.ins.4 = insertelement <9 x double> %a.ins.3, double %a.lane.4, i32 4
  %a.ins.5 = insertelement <9 x double> %a.ins.4, double %a.lane.5, i32 5
  %a.ins.6 = insertelement <9 x double> %a.ins.5, double %a.lane.6, i32 6
  %a.ins.7 = insertelement <9 x double> %a.ins.6, double %a.lane.7, i32 7
  %a.ins.8 = insertelement <9 x double> %a.ins.7, double %a.lane.8, i32 8

  %b.lane.0 = fmul double %v1.lane.7, %v2.lane.2
  %b.lane.1 = fmul double %v1.lane.6, %v2.lane.1
  %b.lane.2 = fmul double %v1.lane.8, %v2.lane.0
  %b.lane.3 = fmul double %v1.lane.1, %v2.lane.2
  %b.lane.4 = fmul double %v1.lane.0, %v2.lane.0
  %b.lane.5 = fmul double %v1.lane.3, %v2.lane.2
  %b.lane.6 = fmul double %v1.lane.2, %v2.lane.1
  %b.lane.7 = fmul double %v1.lane.5, %v2.lane.0
  %b.lane.8 = fmul double %v1.lane.4, %v2.lane.2

  %b.ins.0 = insertelement <9 x double> undef, double %b.lane.0, i32 0
  %b.ins.1 = insertelement <9 x double> %b.ins.0, double %b.lane.1, i32 1
  %b.ins.2 = insertelement <9 x double> %b.ins.1, double %b.lane.2, i32 2
  %b.ins.3 = insertelement <9 x double> %b.ins.2, double %b.lane.3, i32 3
  %b.ins.4 = insertelement <9 x double> %b.ins.3, double %b.lane.4, i32 4
  %b.ins.5 = insertelement <9 x double> %b.ins.4, double %b.lane.5, i32 5
  %b.ins.6 = insertelement <9 x double> %b.ins.5, double %b.lane.6, i32 6
  %b.ins.7 = insertelement <9 x double> %b.ins.6, double %b.lane.7, i32 7
  %b.ins.8 = insertelement <9 x double> %b.ins.7, double %b.lane.8, i32 8

  %res = fsub <9 x double> %a.ins.8, %b.ins.8
  store <9 x double> %res, <9 x double>* %ptr.1, align 8
  ret void
}