; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL

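; Two adjacent <2 x float> fadd/store pairs, i.e. a single <4 x float>'s worth of work.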
define void @widen_fadd_v2f32_v4f32(ptr %a0, ptr %b0, ptr %c0) {
; SSE-LABEL: widen_fadd_v2f32_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    addps %xmm0, %xmm2
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    movlps %xmm2, (%rdx)
; SSE-NEXT:    movlps %xmm0, 8(%rdx)
; SSE-NEXT:    retq
;
; AVX-LABEL: widen_fadd_v2f32_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vmovups %xmm0, (%rdx)
; AVX-NEXT:    retq
  %a2 = getelementptr inbounds i8, ptr %a0, i64 8
  %b2 = getelementptr inbounds i8, ptr %b0, i64 8
  %c2 = getelementptr inbounds i8, ptr %c0, i64 8
  %va0 = load <2 x float>, ptr %a0, align 4
  %vb0 = load <2 x float>, ptr %b0, align 4
  %va2 = load <2 x float>, ptr %a2, align 4
  %vb2 = load <2 x float>, ptr %b2, align 4
  %vc0 = fadd <2 x float> %va0, %vb0
  %vc2 = fadd <2 x float> %va2, %vb2
  store <2 x float> %vc0, ptr %c0, align 4
  store <2 x float> %vc2, ptr %c2, align 4
  ret void
}

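; Four adjacent <2 x float> fadd/store pairs, i.e. a single <8 x float>'s worth of work.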
define void @widen_fadd_v2f32_v8f32(ptr %a0, ptr %b0, ptr %c0) {
; SSE-LABEL: widen_fadd_v2f32_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT:    addps %xmm0, %xmm4
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    addps %xmm2, %xmm1
; SSE-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    addps %xmm3, %xmm2
; SSE-NEXT:    movlps %xmm4, (%rdx)
; SSE-NEXT:    movlps %xmm0, 8(%rdx)
; SSE-NEXT:    movlps %xmm1, 16(%rdx)
; SSE-NEXT:    movlps %xmm2, 24(%rdx)
; SSE-NEXT:    retq
;
; AVX1OR2-LABEL: widen_fadd_v2f32_v8f32:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX1OR2-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX1OR2-NEXT:    vaddps %xmm4, %xmm1, %xmm1
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX1OR2-NEXT:    vaddps %xmm4, %xmm2, %xmm2
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX1OR2-NEXT:    vaddps %xmm4, %xmm3, %xmm3
; AVX1OR2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1OR2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1OR2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1OR2-NEXT:    vmovups %ymm0, (%rdx)
; AVX1OR2-NEXT:    vzeroupper
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: widen_fadd_v2f32_v8f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512F-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512F-NEXT:    vaddps %xmm4, %xmm1, %xmm1
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512F-NEXT:    vaddps %xmm4, %xmm2, %xmm2
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512F-NEXT:    vaddps %xmm4, %xmm3, %xmm3
; AVX512F-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX512F-NEXT:    vmovups %ymm0, (%rdx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: widen_fadd_v2f32_v8f32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT:    vaddps %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT:    vaddps %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT:    vaddps %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX512VL-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX512VL-NEXT:    vmovups %ymm0, (%rdx)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
  %a2 = getelementptr inbounds i8, ptr %a0, i64 8
  %b2 = getelementptr inbounds i8, ptr %b0, i64 8
  %c2 = getelementptr inbounds i8, ptr %c0, i64 8
  %a4 = getelementptr inbounds i8, ptr %a0, i64 16
  %b4 = getelementptr inbounds i8, ptr %b0, i64 16
  %c4 = getelementptr inbounds i8, ptr %c0, i64 16
  %a6 = getelementptr inbounds i8, ptr %a0, i64 24
  %b6 = getelementptr inbounds i8, ptr %b0, i64 24
  %c6 = getelementptr inbounds i8, ptr %c0, i64 24
  %va0 = load <2 x float>, ptr %a0, align 4
  %vb0 = load <2 x float>, ptr %b0, align 4
  %va2 = load <2 x float>, ptr %a2, align 4
  %vb2 = load <2 x float>, ptr %b2, align 4
  %va4 = load <2 x float>, ptr %a4, align 4
  %vb4 = load <2 x float>, ptr %b4, align 4
  %va6 = load <2 x float>, ptr %a6, align 4
  %vb6 = load <2 x float>, ptr %b6, align 4
  %vc0 = fadd <2 x float> %va0, %vb0
  %vc2 = fadd <2 x float> %va2, %vb2
  %vc4 = fadd <2 x float> %va4, %vb4
  %vc6 = fadd <2 x float> %va6, %vb6
  store <2 x float> %vc0, ptr %c0, align 4
  store <2 x float> %vc2, ptr %c2, align 4
  store <2 x float> %vc4, ptr %c4, align 4
  store <2 x float> %vc6, ptr %c6, align 4
  ret void
}

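; Eight adjacent <2 x float> fadd/store pairs, i.e. a single <16 x float>'s worth of work.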
define void @widen_fadd_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; SSE-LABEL: widen_fadd_v2f32_v16f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT:    addps %xmm0, %xmm4
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    addps %xmm2, %xmm1
; SSE-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    addps %xmm3, %xmm2
; SSE-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm5 = mem[0],zero
; SSE-NEXT:    addps %xmm3, %xmm5
; SSE-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm6 = mem[0],zero
; SSE-NEXT:    addps %xmm3, %xmm6
; SSE-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm7 = mem[0],zero
; SSE-NEXT:    addps %xmm3, %xmm7
; SSE-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm8 = mem[0],zero
; SSE-NEXT:    addps %xmm3, %xmm8
; SSE-NEXT:    movlps %xmm4, (%rdx)
; SSE-NEXT:    movlps %xmm0, 8(%rdx)
; SSE-NEXT:    movlps %xmm1, 16(%rdx)
; SSE-NEXT:    movlps %xmm2, 24(%rdx)
; SSE-NEXT:    movlps %xmm5, 32(%rdx)
; SSE-NEXT:    movlps %xmm6, 40(%rdx)
; SSE-NEXT:    movlps %xmm7, 48(%rdx)
; SSE-NEXT:    movlps %xmm8, 56(%rdx)
; SSE-NEXT:    retq
;
; AVX1OR2-LABEL: widen_fadd_v2f32_v16f32:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX1OR2-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX1OR2-NEXT:    vaddps %xmm4, %xmm1, %xmm1
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX1OR2-NEXT:    vaddps %xmm4, %xmm2, %xmm2
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX1OR2-NEXT:    vaddps %xmm4, %xmm3, %xmm3
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm5 = mem[0],zero
; AVX1OR2-NEXT:    vaddps %xmm5, %xmm4, %xmm4
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm5 = mem[0],zero
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm6 = mem[0],zero
; AVX1OR2-NEXT:    vaddps %xmm6, %xmm5, %xmm5
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm6 = mem[0],zero
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX1OR2-NEXT:    vaddps %xmm7, %xmm6, %xmm6
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm8 = mem[0],zero
; AVX1OR2-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX1OR2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1OR2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1OR2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1OR2-NEXT:    vmovups %ymm0, (%rdx)
; AVX1OR2-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm0
; AVX1OR2-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm1
; AVX1OR2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX1OR2-NEXT:    vmovups %ymm0, 32(%rdx)
; AVX1OR2-NEXT:    vzeroupper
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: widen_fadd_v2f32_v16f32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512F-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512F-NEXT:    vaddps %xmm4, %xmm1, %xmm1
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512F-NEXT:    vaddps %xmm4, %xmm2, %xmm2
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512F-NEXT:    vaddps %xmm4, %xmm3, %xmm3
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm5 = mem[0],zero
; AVX512F-NEXT:    vaddps %xmm5, %xmm4, %xmm4
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm5 = mem[0],zero
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm6 = mem[0],zero
; AVX512F-NEXT:    vaddps %xmm6, %xmm5, %xmm5
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm6 = mem[0],zero
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX512F-NEXT:    vaddps %xmm7, %xmm6, %xmm6
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm8 = mem[0],zero
; AVX512F-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX512F-NEXT:    vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
; AVX512F-NEXT:    vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
; AVX512F-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT:    vpermt2pd %zmm6, %zmm5, %zmm4
; AVX512F-NEXT:    vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
; AVX512F-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpermt2pd %zmm2, %zmm5, %zmm0
; AVX512F-NEXT:    vinsertf64x4 $0, %ymm0, %zmm4, %zmm0
; AVX512F-NEXT:    vmovupd %zmm0, (%rdx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: widen_fadd_v2f32_v16f32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT:    vaddps %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT:    vaddps %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT:    vaddps %xmm4, %xmm3, %xmm3
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm5 = mem[0],zero
; AVX512VL-NEXT:    vaddps %xmm5, %xmm4, %xmm4
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm5 = mem[0],zero
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm6 = mem[0],zero
; AVX512VL-NEXT:    vaddps %xmm6, %xmm5, %xmm5
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm6 = mem[0],zero
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX512VL-NEXT:    vaddps %xmm7, %xmm6, %xmm6
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm8 = mem[0],zero
; AVX512VL-NEXT:    vaddps %xmm7, %xmm8, %xmm7
; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
; AVX512VL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
; AVX512VL-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512VL-NEXT:    vpermi2pd %zmm6, %zmm4, %zmm5
; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
; AVX512VL-NEXT:    vmovapd {{.*#+}} ymm1 = [0,2,4,6]
; AVX512VL-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1
; AVX512VL-NEXT:    vinsertf64x4 $0, %ymm1, %zmm5, %zmm0
; AVX512VL-NEXT:    vmovupd %zmm0, (%rdx)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
  %a2 = getelementptr inbounds i8, ptr %a0, i64 8
  %b2 = getelementptr inbounds i8, ptr %b0, i64 8
  %c2 = getelementptr inbounds i8, ptr %c0, i64 8
  %a4 = getelementptr inbounds i8, ptr %a0, i64 16
  %b4 = getelementptr inbounds i8, ptr %b0, i64 16
  %c4 = getelementptr inbounds i8, ptr %c0, i64 16
  %a6 = getelementptr inbounds i8, ptr %a0, i64 24
  %b6 = getelementptr inbounds i8, ptr %b0, i64 24
  %c6 = getelementptr inbounds i8, ptr %c0, i64 24
  %a8 = getelementptr inbounds i8, ptr %a0, i64 32
  %b8 = getelementptr inbounds i8, ptr %b0, i64 32
  %c8 = getelementptr inbounds i8, ptr %c0, i64 32
  %a10 = getelementptr inbounds i8, ptr %a0, i64 40
  %b10 = getelementptr inbounds i8, ptr %b0, i64 40
  %c10 = getelementptr inbounds i8, ptr %c0, i64 40
  %a12 = getelementptr inbounds i8, ptr %a0, i64 48
  %b12 = getelementptr inbounds i8, ptr %b0, i64 48
  %c12 = getelementptr inbounds i8, ptr %c0, i64 48
  %a14 = getelementptr inbounds i8, ptr %a0, i64 56
  %b14 = getelementptr inbounds i8, ptr %b0, i64 56
  %c14 = getelementptr inbounds i8, ptr %c0, i64 56
  %va0 = load <2 x float>, ptr %a0, align 4
  %vb0 = load <2 x float>, ptr %b0, align 4
  %va2 = load <2 x float>, ptr %a2, align 4
  %vb2 = load <2 x float>, ptr %b2, align 4
  %va4 = load <2 x float>, ptr %a4, align 4
  %vb4 = load <2 x float>, ptr %b4, align 4
  %va6 = load <2 x float>, ptr %a6, align 4
  %vb6 = load <2 x float>, ptr %b6, align 4
  %va8 = load <2 x float>, ptr %a8, align 4
  %vb8 = load <2 x float>, ptr %b8, align 4
  %va10 = load <2 x float>, ptr %a10, align 4
  %vb10 = load <2 x float>, ptr %b10, align 4
  %va12 = load <2 x float>, ptr %a12, align 4
  %vb12 = load <2 x float>, ptr %b12, align 4
  %va14 = load <2 x float>, ptr %a14, align 4
  %vb14 = load <2 x float>, ptr %b14, align 4
  %vc0 = fadd <2 x float> %va0, %vb0
  %vc2 = fadd <2 x float> %va2, %vb2
  %vc4 = fadd <2 x float> %va4, %vb4
  %vc6 = fadd <2 x float> %va6, %vb6
  %vc8 = fadd <2 x float> %va8, %vb8
  %vc10 = fadd <2 x float> %va10, %vb10
  %vc12 = fadd <2 x float> %va12, %vb12
  %vc14 = fadd <2 x float> %va14, %vb14
  store <2 x float> %vc0, ptr %c0, align 4
  store <2 x float> %vc2, ptr %c2, align 4
  store <2 x float> %vc4, ptr %c4, align 4
  store <2 x float> %vc6, ptr %c6, align 4
  store <2 x float> %vc8, ptr %c8, align 4
  store <2 x float> %vc10, ptr %c10, align 4
  store <2 x float> %vc12, ptr %c12, align 4
  store <2 x float> %vc14, ptr %c14, align 4
  ret void
}