1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1
3 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX2
4 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
5 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
6 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512
14 ; Subvector Load + Broadcast
17 define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
18 ; X86-LABEL: test_broadcast_2f64_4f64:
20 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
21 ; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
24 ; X64-LABEL: test_broadcast_2f64_4f64:
26 ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
28 %1 = load <2 x double>, <2 x double> *%p
29 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
33 define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
34 ; X86-AVX-LABEL: test_broadcast_2f64_8f64:
36 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
37 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
38 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
41 ; X86-AVX512-LABEL: test_broadcast_2f64_8f64:
42 ; X86-AVX512: # %bb.0:
43 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
44 ; X86-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
45 ; X86-AVX512-NEXT: retl
47 ; X64-AVX-LABEL: test_broadcast_2f64_8f64:
49 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
50 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
53 ; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
54 ; X64-AVX512: # %bb.0:
55 ; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
56 ; X64-AVX512-NEXT: retq
57 %1 = load <2 x double>, <2 x double> *%p
58 %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
62 define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
63 ; X86-AVX-LABEL: test_broadcast_4f64_8f64:
65 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
66 ; X86-AVX-NEXT: vmovaps (%eax), %ymm0
67 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
70 ; X86-AVX512-LABEL: test_broadcast_4f64_8f64:
71 ; X86-AVX512: # %bb.0:
72 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
73 ; X86-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
74 ; X86-AVX512-NEXT: retl
76 ; X64-AVX-LABEL: test_broadcast_4f64_8f64:
78 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
79 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
82 ; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
83 ; X64-AVX512: # %bb.0:
84 ; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
85 ; X64-AVX512-NEXT: retq
86 %1 = load <4 x double>, <4 x double> *%p
87 %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
91 define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
92 ; X86-AVX-LABEL: test_broadcast_2i64_4i64:
94 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
95 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
98 ; X86-AVX512-LABEL: test_broadcast_2i64_4i64:
99 ; X86-AVX512: # %bb.0:
100 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
101 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
102 ; X86-AVX512-NEXT: retl
104 ; X64-AVX-LABEL: test_broadcast_2i64_4i64:
106 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
109 ; X64-AVX512-LABEL: test_broadcast_2i64_4i64:
110 ; X64-AVX512: # %bb.0:
111 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
112 ; X64-AVX512-NEXT: retq
113 %1 = load <2 x i64>, <2 x i64> *%p
114 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
118 define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
119 ; X86-AVX-LABEL: test_broadcast_2i64_8i64:
121 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
122 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
123 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
126 ; X86-AVX512-LABEL: test_broadcast_2i64_8i64:
127 ; X86-AVX512: # %bb.0:
128 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
129 ; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
130 ; X86-AVX512-NEXT: retl
132 ; X64-AVX-LABEL: test_broadcast_2i64_8i64:
134 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
135 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
138 ; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
139 ; X64-AVX512: # %bb.0:
140 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
141 ; X64-AVX512-NEXT: retq
142 %1 = load <2 x i64>, <2 x i64> *%p
143 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
147 define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
148 ; X86-AVX-LABEL: test_broadcast_4i64_8i64:
150 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
151 ; X86-AVX-NEXT: vmovaps (%eax), %ymm0
152 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
155 ; X86-AVX512-LABEL: test_broadcast_4i64_8i64:
156 ; X86-AVX512: # %bb.0:
157 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
158 ; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
159 ; X86-AVX512-NEXT: retl
161 ; X64-AVX-LABEL: test_broadcast_4i64_8i64:
163 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
164 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
167 ; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
168 ; X64-AVX512: # %bb.0:
169 ; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
170 ; X64-AVX512-NEXT: retq
171 %1 = load <4 x i64>, <4 x i64> *%p
172 %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
176 define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
177 ; X86-LABEL: test_broadcast_4f32_8f32:
179 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
180 ; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
183 ; X64-LABEL: test_broadcast_4f32_8f32:
185 ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
187 %1 = load <4 x float>, <4 x float> *%p
188 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
192 define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind {
193 ; X86-AVX-LABEL: test_broadcast_4f32_16f32:
195 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
196 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
197 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
200 ; X86-AVX512-LABEL: test_broadcast_4f32_16f32:
201 ; X86-AVX512: # %bb.0:
202 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
203 ; X86-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
204 ; X86-AVX512-NEXT: retl
206 ; X64-AVX-LABEL: test_broadcast_4f32_16f32:
208 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
209 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
212 ; X64-AVX512-LABEL: test_broadcast_4f32_16f32:
213 ; X64-AVX512: # %bb.0:
214 ; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
215 ; X64-AVX512-NEXT: retq
216 %1 = load <4 x float>, <4 x float> *%p
217 %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
221 define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
222 ; X86-AVX-LABEL: test_broadcast_8f32_16f32:
224 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
225 ; X86-AVX-NEXT: vmovaps (%eax), %ymm0
226 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
229 ; X86-AVX512-LABEL: test_broadcast_8f32_16f32:
230 ; X86-AVX512: # %bb.0:
231 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
232 ; X86-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
233 ; X86-AVX512-NEXT: retl
235 ; X64-AVX-LABEL: test_broadcast_8f32_16f32:
237 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
238 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
241 ; X64-AVX512-LABEL: test_broadcast_8f32_16f32:
242 ; X64-AVX512: # %bb.0:
243 ; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
244 ; X64-AVX512-NEXT: retq
245 %1 = load <8 x float>, <8 x float> *%p
246 %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
250 define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
251 ; X86-AVX-LABEL: test_broadcast_4i32_8i32:
253 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
254 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
257 ; X86-AVX512-LABEL: test_broadcast_4i32_8i32:
258 ; X86-AVX512: # %bb.0:
259 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
260 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
261 ; X86-AVX512-NEXT: retl
263 ; X64-AVX-LABEL: test_broadcast_4i32_8i32:
265 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
268 ; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
269 ; X64-AVX512: # %bb.0:
270 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
271 ; X64-AVX512-NEXT: retq
272 %1 = load <4 x i32>, <4 x i32> *%p
273 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
277 define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind {
278 ; X86-AVX-LABEL: test_broadcast_4i32_16i32:
280 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
281 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
282 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
285 ; X86-AVX512-LABEL: test_broadcast_4i32_16i32:
286 ; X86-AVX512: # %bb.0:
287 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
288 ; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
289 ; X86-AVX512-NEXT: retl
291 ; X64-AVX-LABEL: test_broadcast_4i32_16i32:
293 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
294 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
297 ; X64-AVX512-LABEL: test_broadcast_4i32_16i32:
298 ; X64-AVX512: # %bb.0:
299 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
300 ; X64-AVX512-NEXT: retq
301 %1 = load <4 x i32>, <4 x i32> *%p
302 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
306 define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
307 ; X86-AVX-LABEL: test_broadcast_8i32_16i32:
309 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
310 ; X86-AVX-NEXT: vmovaps (%eax), %ymm0
311 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
314 ; X86-AVX512-LABEL: test_broadcast_8i32_16i32:
315 ; X86-AVX512: # %bb.0:
316 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
317 ; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
318 ; X86-AVX512-NEXT: retl
320 ; X64-AVX-LABEL: test_broadcast_8i32_16i32:
322 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
323 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
326 ; X64-AVX512-LABEL: test_broadcast_8i32_16i32:
327 ; X64-AVX512: # %bb.0:
328 ; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
329 ; X64-AVX512-NEXT: retq
330 %1 = load <8 x i32>, <8 x i32> *%p
331 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
335 define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
336 ; X86-AVX-LABEL: test_broadcast_8i16_16i16:
338 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
339 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
342 ; X86-AVX512-LABEL: test_broadcast_8i16_16i16:
343 ; X86-AVX512: # %bb.0:
344 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
345 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
346 ; X86-AVX512-NEXT: retl
348 ; X64-AVX-LABEL: test_broadcast_8i16_16i16:
350 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
353 ; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
354 ; X64-AVX512: # %bb.0:
355 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
356 ; X64-AVX512-NEXT: retq
357 %1 = load <8 x i16>, <8 x i16> *%p
358 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
362 define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
363 ; X86-AVX-LABEL: test_broadcast_8i16_32i16:
365 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
366 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
367 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
370 ; X86-AVX512-LABEL: test_broadcast_8i16_32i16:
371 ; X86-AVX512: # %bb.0:
372 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
373 ; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
374 ; X86-AVX512-NEXT: retl
376 ; X64-AVX-LABEL: test_broadcast_8i16_32i16:
378 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
379 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
382 ; X64-AVX512-LABEL: test_broadcast_8i16_32i16:
383 ; X64-AVX512: # %bb.0:
384 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
385 ; X64-AVX512-NEXT: retq
386 %1 = load <8 x i16>, <8 x i16> *%p
387 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
391 define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
392 ; X86-AVX-LABEL: test_broadcast_16i16_32i16:
394 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
395 ; X86-AVX-NEXT: vmovaps (%eax), %ymm0
396 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
399 ; X86-AVX512-LABEL: test_broadcast_16i16_32i16:
400 ; X86-AVX512: # %bb.0:
401 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
402 ; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
403 ; X86-AVX512-NEXT: retl
405 ; X64-AVX-LABEL: test_broadcast_16i16_32i16:
407 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
408 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
411 ; X64-AVX512-LABEL: test_broadcast_16i16_32i16:
412 ; X64-AVX512: # %bb.0:
413 ; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
414 ; X64-AVX512-NEXT: retq
415 %1 = load <16 x i16>, <16 x i16> *%p
416 %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
420 define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
421 ; X86-AVX-LABEL: test_broadcast_16i8_32i8:
423 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
424 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
427 ; X86-AVX512-LABEL: test_broadcast_16i8_32i8:
428 ; X86-AVX512: # %bb.0:
429 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
430 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
431 ; X86-AVX512-NEXT: retl
433 ; X64-AVX-LABEL: test_broadcast_16i8_32i8:
435 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
438 ; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
439 ; X64-AVX512: # %bb.0:
440 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
441 ; X64-AVX512-NEXT: retq
442 %1 = load <16 x i8>, <16 x i8> *%p
443 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
447 define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
448 ; X86-AVX-LABEL: test_broadcast_16i8_64i8:
450 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
451 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
452 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
455 ; X86-AVX512-LABEL: test_broadcast_16i8_64i8:
456 ; X86-AVX512: # %bb.0:
457 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
458 ; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
459 ; X86-AVX512-NEXT: retl
461 ; X64-AVX-LABEL: test_broadcast_16i8_64i8:
463 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
464 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
467 ; X64-AVX512-LABEL: test_broadcast_16i8_64i8:
468 ; X64-AVX512: # %bb.0:
469 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
470 ; X64-AVX512-NEXT: retq
471 %1 = load <16 x i8>, <16 x i8> *%p
472 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
476 define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
477 ; X86-AVX-LABEL: test_broadcast_32i8_64i8:
479 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
480 ; X86-AVX-NEXT: vmovaps (%eax), %ymm0
481 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
484 ; X86-AVX512-LABEL: test_broadcast_32i8_64i8:
485 ; X86-AVX512: # %bb.0:
486 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
487 ; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
488 ; X86-AVX512-NEXT: retl
490 ; X64-AVX-LABEL: test_broadcast_32i8_64i8:
492 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
493 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
496 ; X64-AVX512-LABEL: test_broadcast_32i8_64i8:
497 ; X64-AVX512: # %bb.0:
498 ; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
499 ; X64-AVX512-NEXT: retq
500 %1 = load <32 x i8>, <32 x i8> *%p
501 %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
506 ; Subvector Load + Broadcast + Store
509 define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) {
510 ; X86-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
512 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
513 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
514 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
515 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
518 ; X86-AVX512-LABEL: test_broadcast_2f64_4f64_reuse:
519 ; X86-AVX512: # %bb.0:
520 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
521 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
522 ; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
523 ; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax)
524 ; X86-AVX512-NEXT: retl
526 ; X64-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
528 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
529 ; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
532 ; X64-AVX512-LABEL: test_broadcast_2f64_4f64_reuse:
533 ; X64-AVX512: # %bb.0:
534 ; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
535 ; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
536 ; X64-AVX512-NEXT: retq
537 %1 = load <2 x double>, <2 x double>* %p0
538 store <2 x double> %1, <2 x double>* %p1
539 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
543 define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) {
544 ; X86-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
546 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
547 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
548 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
549 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
552 ; X86-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
553 ; X86-AVX512: # %bb.0:
554 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
555 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
556 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
557 ; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax)
558 ; X86-AVX512-NEXT: retl
560 ; X64-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
562 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
563 ; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
566 ; X64-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
567 ; X64-AVX512: # %bb.0:
568 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
569 ; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
570 ; X64-AVX512-NEXT: retq
571 %1 = load <2 x i64>, <2 x i64>* %p0
572 store <2 x i64> %1, <2 x i64>* %p1
573 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
577 define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
578 ; X86-AVX-LABEL: test_broadcast_4f32_8f32_reuse:
580 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
581 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
582 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
583 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
586 ; X86-AVX512-LABEL: test_broadcast_4f32_8f32_reuse:
587 ; X86-AVX512: # %bb.0:
588 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
589 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
590 ; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
591 ; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax)
592 ; X86-AVX512-NEXT: retl
594 ; X64-AVX-LABEL: test_broadcast_4f32_8f32_reuse:
596 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
597 ; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
600 ; X64-AVX512-LABEL: test_broadcast_4f32_8f32_reuse:
601 ; X64-AVX512: # %bb.0:
602 ; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
603 ; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
604 ; X64-AVX512-NEXT: retq
605 %1 = load <4 x float>, <4 x float>* %p0
606 store <4 x float> %1, <4 x float>* %p1
607 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
611 define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) {
612 ; X86-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
614 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
615 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
616 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
617 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
620 ; X86-AVX512-LABEL: test_broadcast_4i32_8i32_reuse:
621 ; X86-AVX512: # %bb.0:
622 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
623 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
624 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
625 ; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax)
626 ; X86-AVX512-NEXT: retl
628 ; X64-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
630 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
631 ; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
634 ; X64-AVX512-LABEL: test_broadcast_4i32_8i32_reuse:
635 ; X64-AVX512: # %bb.0:
636 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
637 ; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
638 ; X64-AVX512-NEXT: retq
639 %1 = load <4 x i32>, <4 x i32>* %p0
640 store <4 x i32> %1, <4 x i32>* %p1
641 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
645 define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
646 ; X86-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
648 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
649 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
650 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
651 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
654 ; X86-AVX512-LABEL: test_broadcast_8i16_16i16_reuse:
655 ; X86-AVX512: # %bb.0:
656 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
657 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
658 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
659 ; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax)
660 ; X86-AVX512-NEXT: retl
662 ; X64-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
664 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
665 ; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
668 ; X64-AVX512-LABEL: test_broadcast_8i16_16i16_reuse:
669 ; X64-AVX512: # %bb.0:
670 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
671 ; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
672 ; X64-AVX512-NEXT: retq
673 %1 = load <8 x i16>, <8 x i16> *%p0
674 store <8 x i16> %1, <8 x i16>* %p1
675 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
679 define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
680 ; X86-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
682 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
683 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
684 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
685 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
688 ; X86-AVX512-LABEL: test_broadcast_16i8_32i8_reuse:
689 ; X86-AVX512: # %bb.0:
690 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
691 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
692 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
693 ; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax)
694 ; X86-AVX512-NEXT: retl
696 ; X64-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
698 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
699 ; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
702 ; X64-AVX512-LABEL: test_broadcast_16i8_32i8_reuse:
703 ; X64-AVX512: # %bb.0:
704 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
705 ; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
706 ; X64-AVX512-NEXT: retq
707 %1 = load <16 x i8>, <16 x i8> *%p0
708 store <16 x i8> %1, <16 x i8>* %p1
709 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
714 ; Subvector Load + Broadcast with Separate Store
717 define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
718 ; X86-AVX-LABEL: test_broadcast_4i32_8i32_chain:
720 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
721 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
722 ; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
723 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
724 ; X86-AVX-NEXT: vmovaps %xmm1, (%eax)
727 ; X86-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
728 ; X86-AVX512: # %bb.0:
729 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
730 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
731 ; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
732 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
733 ; X86-AVX512-NEXT: vmovaps %xmm1, (%eax)
734 ; X86-AVX512-NEXT: retl
736 ; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
738 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
739 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
740 ; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
743 ; X64-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
744 ; X64-AVX512: # %bb.0:
745 ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
746 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
747 ; X64-AVX512-NEXT: vmovaps %xmm1, (%rsi)
748 ; X64-AVX512-NEXT: retq
749 %1 = load <4 x i32>, <4 x i32>* %p0
750 store <4 x float> zeroinitializer, <4 x float>* %p1
751 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
755 define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
756 ; X86-AVX-LABEL: test_broadcast_4i32_16i32_chain:
758 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
759 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
760 ; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
761 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
762 ; X86-AVX-NEXT: vmovaps %xmm1, (%eax)
763 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
766 ; X86-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
767 ; X86-AVX512: # %bb.0:
768 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
769 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
770 ; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
771 ; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
772 ; X86-AVX512-NEXT: vmovaps %xmm1, (%eax)
773 ; X86-AVX512-NEXT: retl
775 ; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
777 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
778 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
779 ; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
780 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
783 ; X64-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
784 ; X64-AVX512: # %bb.0:
785 ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
786 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
787 ; X64-AVX512-NEXT: vmovaps %xmm1, (%rsi)
788 ; X64-AVX512-NEXT: retq
789 %1 = load <4 x i32>, <4 x i32>* %p0
790 store <4 x float> zeroinitializer, <4 x float>* %p1
791 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
796 ; subvector Load with multiple uses + broadcast
797 ; Fallback to the broadcast should be done
800 @ga4 = dso_local global <4 x i64> zeroinitializer, align 8
801 @gb4 = dso_local global <8 x i64> zeroinitializer, align 8
803 define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
804 ; X86-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
805 ; X86-AVX1: # %bb.0: # %entry
806 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,2,0]
807 ; X86-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4
808 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
809 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,0,4,0]
810 ; X86-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
811 ; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [1,0,2,0,3,0,4,0]
812 ; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
813 ; X86-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm7
814 ; X86-AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
815 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2
816 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
817 ; X86-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm5
818 ; X86-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
819 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
820 ; X86-AVX1-NEXT: vandps %ymm6, %ymm1, %ymm1
821 ; X86-AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
822 ; X86-AVX1-NEXT: vmovdqu %xmm0, ga4+16
823 ; X86-AVX1-NEXT: vmovdqu %xmm4, ga4
824 ; X86-AVX1-NEXT: vmovups %ymm2, gb4+32
825 ; X86-AVX1-NEXT: vmovups %ymm1, gb4
826 ; X86-AVX1-NEXT: vzeroupper
827 ; X86-AVX1-NEXT: retl
829 ; X86-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
830 ; X86-AVX2: # %bb.0: # %entry
831 ; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
832 ; X86-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
833 ; X86-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
834 ; X86-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
835 ; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
836 ; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
837 ; X86-AVX2-NEXT: vmovdqu %ymm0, ga4
838 ; X86-AVX2-NEXT: vmovdqu %ymm2, gb4+32
839 ; X86-AVX2-NEXT: vmovdqu %ymm1, gb4
840 ; X86-AVX2-NEXT: vzeroupper
841 ; X86-AVX2-NEXT: retl
843 ; X86-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
844 ; X86-AVX512: # %bb.0: # %entry
845 ; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0]
846 ; X86-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
847 ; X86-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0
848 ; X86-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1
849 ; X86-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
850 ; X86-AVX512-NEXT: vmovdqu %ymm0, ga4
851 ; X86-AVX512-NEXT: vmovdqu64 %zmm1, gb4
852 ; X86-AVX512-NEXT: vzeroupper
853 ; X86-AVX512-NEXT: retl
855 ; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
856 ; X64-AVX1: # %bb.0: # %entry
857 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2]
858 ; X64-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4
859 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
860 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,4]
861 ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
862 ; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [1,2,3,4]
863 ; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
864 ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm7
865 ; X64-AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
866 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2
867 ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
868 ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm5
869 ; X64-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
870 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
871 ; X64-AVX1-NEXT: vandps %ymm6, %ymm1, %ymm1
872 ; X64-AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
873 ; X64-AVX1-NEXT: vmovdqu %xmm0, ga4+16(%rip)
874 ; X64-AVX1-NEXT: vmovdqu %xmm4, ga4(%rip)
875 ; X64-AVX1-NEXT: vmovups %ymm2, gb4+32(%rip)
876 ; X64-AVX1-NEXT: vmovups %ymm1, gb4(%rip)
877 ; X64-AVX1-NEXT: vzeroupper
878 ; X64-AVX1-NEXT: retq
880 ; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
881 ; X64-AVX2: # %bb.0: # %entry
882 ; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4]
883 ; X64-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
884 ; X64-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
885 ; X64-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
886 ; X64-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
887 ; X64-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
888 ; X64-AVX2-NEXT: vmovdqu %ymm0, ga4(%rip)
889 ; X64-AVX2-NEXT: vmovdqu %ymm2, gb4+32(%rip)
890 ; X64-AVX2-NEXT: vmovdqu %ymm1, gb4(%rip)
891 ; X64-AVX2-NEXT: vzeroupper
892 ; X64-AVX2-NEXT: retq
894 ; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
895 ; X64-AVX512: # %bb.0: # %entry
896 ; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,2,3,4,1,2,3,4]
897 ; X64-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
898 ; X64-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0
899 ; X64-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1
900 ; X64-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
901 ; X64-AVX512-NEXT: vmovdqu %ymm0, ga4(%rip)
902 ; X64-AVX512-NEXT: vmovdqu64 %zmm1, gb4(%rip)
903 ; X64-AVX512-NEXT: vzeroupper
904 ; X64-AVX512-NEXT: retq
906 %0 = add <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
907 %1 = add <8 x i64> %b, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
908 %2 = and <8 x i64> %1, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
909 store <4 x i64> %0, <4 x i64>* @ga4, align 8
910 store <8 x i64> %2, <8 x i64>* @gb4, align 8
915 @ga2 = dso_local global <4 x double> zeroinitializer, align 8
916 @gb2 = dso_local global <8 x double> zeroinitializer, align 8
918 define dso_local void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) {
919 ; X86-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
920 ; X86-AVX: # %bb.0: # %entry
921 ; X86-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
922 ; X86-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0
923 ; X86-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2
924 ; X86-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1
925 ; X86-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1
926 ; X86-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2
927 ; X86-AVX-NEXT: vmovupd %ymm0, ga2
928 ; X86-AVX-NEXT: vmovupd %ymm2, gb2+32
929 ; X86-AVX-NEXT: vmovupd %ymm1, gb2
930 ; X86-AVX-NEXT: vzeroupper
933 ; X86-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
934 ; X86-AVX512: # %bb.0: # %entry
935 ; X86-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0]
936 ; X86-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
937 ; X86-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
938 ; X86-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
939 ; X86-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
940 ; X86-AVX512-NEXT: vmovupd %ymm0, ga2
941 ; X86-AVX512-NEXT: vmovupd %zmm1, gb2
942 ; X86-AVX512-NEXT: vzeroupper
943 ; X86-AVX512-NEXT: retl
945 ; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
946 ; X64-AVX: # %bb.0: # %entry
947 ; X64-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
948 ; X64-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0
949 ; X64-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2
950 ; X64-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1
951 ; X64-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1
952 ; X64-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2
953 ; X64-AVX-NEXT: vmovupd %ymm0, ga2(%rip)
954 ; X64-AVX-NEXT: vmovupd %ymm2, gb2+32(%rip)
955 ; X64-AVX-NEXT: vmovupd %ymm1, gb2(%rip)
956 ; X64-AVX-NEXT: vzeroupper
959 ; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
960 ; X64-AVX512: # %bb.0: # %entry
961 ; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0]
962 ; X64-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
963 ; X64-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
964 ; X64-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
965 ; X64-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
966 ; X64-AVX512-NEXT: vmovupd %ymm0, ga2(%rip)
967 ; X64-AVX512-NEXT: vmovupd %zmm1, gb2(%rip)
968 ; X64-AVX512-NEXT: vzeroupper
969 ; X64-AVX512-NEXT: retq
971 %0 = fadd <4 x double> %a, <double 1.0, double 2.0, double 3.0, double 4.0>
972 %1 = fadd <8 x double> %b, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
973 %2 = fdiv <8 x double> %1, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
974 store <4 x double> %0, <4 x double>* @ga2, align 8
975 store <8 x double> %2, <8 x double>* @gb2, align 8
979 @ha4 = dso_local global <4 x i32> zeroinitializer, align 8
980 @hb4 = dso_local global <8 x i32> zeroinitializer, align 8
981 @hc4 = dso_local global <16 x i32> zeroinitializer, align 8
983 define dso_local void @fallback_broadcast_v4i32_v8i32_v16i32(<4 x i32> %a, <8 x i32> %b, <16 x i32> %c) nounwind {
984 ; X86-AVX1-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
985 ; X86-AVX1: # %bb.0: # %entry
986 ; X86-AVX1-NEXT: pushl %ebp
987 ; X86-AVX1-NEXT: movl %esp, %ebp
988 ; X86-AVX1-NEXT: andl $-32, %esp
989 ; X86-AVX1-NEXT: subl $32, %esp
990 ; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4]
991 ; X86-AVX1-NEXT: # ymm3 = mem[0,1,0,1]
992 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
993 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
994 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4
995 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
996 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
997 ; X86-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
998 ; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
999 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4
1000 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1001 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
1002 ; X86-AVX1-NEXT: vpaddd 8(%ebp), %xmm3, %xmm4
1003 ; X86-AVX1-NEXT: vpaddd 24(%ebp), %xmm3, %xmm5
1004 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
1005 ; X86-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
1006 ; X86-AVX1-NEXT: vandps %ymm3, %ymm4, %ymm3
1007 ; X86-AVX1-NEXT: vmovdqu %xmm0, ha4
1008 ; X86-AVX1-NEXT: vmovups %ymm1, hb4
1009 ; X86-AVX1-NEXT: vmovups %ymm3, hc4+32
1010 ; X86-AVX1-NEXT: vmovups %ymm2, hc4
1011 ; X86-AVX1-NEXT: movl %ebp, %esp
1012 ; X86-AVX1-NEXT: popl %ebp
1013 ; X86-AVX1-NEXT: vzeroupper
1014 ; X86-AVX1-NEXT: retl
1016 ; X86-AVX2-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
1017 ; X86-AVX2: # %bb.0: # %entry
1018 ; X86-AVX2-NEXT: pushl %ebp
1019 ; X86-AVX2-NEXT: movl %esp, %ebp
1020 ; X86-AVX2-NEXT: andl $-32, %esp
1021 ; X86-AVX2-NEXT: subl $32, %esp
1022 ; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4]
1023 ; X86-AVX2-NEXT: # ymm3 = mem[0,1,0,1]
1024 ; X86-AVX2-NEXT: vpaddd %xmm3, %xmm0, %xmm0
1025 ; X86-AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
1026 ; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
1027 ; X86-AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2
1028 ; X86-AVX2-NEXT: vpaddd 8(%ebp), %ymm3, %ymm4
1029 ; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
1030 ; X86-AVX2-NEXT: vpand %ymm3, %ymm4, %ymm3
1031 ; X86-AVX2-NEXT: vmovdqu %xmm0, ha4
1032 ; X86-AVX2-NEXT: vmovdqu %ymm1, hb4
1033 ; X86-AVX2-NEXT: vmovdqu %ymm3, hc4+32
1034 ; X86-AVX2-NEXT: vmovdqu %ymm2, hc4
1035 ; X86-AVX2-NEXT: movl %ebp, %esp
1036 ; X86-AVX2-NEXT: popl %ebp
1037 ; X86-AVX2-NEXT: vzeroupper
1038 ; X86-AVX2-NEXT: retl
1040 ; X86-AVX512-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
1041 ; X86-AVX512: # %bb.0: # %entry
1042 ; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4]
1043 ; X86-AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1044 ; X86-AVX512-NEXT: vpaddd %xmm3, %xmm0, %xmm0
1045 ; X86-AVX512-NEXT: vpaddd %ymm3, %ymm1, %ymm1
1046 ; X86-AVX512-NEXT: vpand %ymm3, %ymm1, %ymm1
1047 ; X86-AVX512-NEXT: vpaddd %zmm3, %zmm2, %zmm2
1048 ; X86-AVX512-NEXT: vpandd %zmm3, %zmm2, %zmm2
1049 ; X86-AVX512-NEXT: vmovdqu %xmm0, ha4
1050 ; X86-AVX512-NEXT: vmovdqu %ymm1, hb4
1051 ; X86-AVX512-NEXT: vmovdqu64 %zmm2, hc4
1052 ; X86-AVX512-NEXT: vzeroupper
1053 ; X86-AVX512-NEXT: retl
1055 ; X64-AVX1-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
1056 ; X64-AVX1: # %bb.0: # %entry
1057 ; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4]
1058 ; X64-AVX1-NEXT: # ymm4 = mem[0,1,0,1]
1059 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
1060 ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
1061 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5
1062 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
1063 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
1064 ; X64-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
1065 ; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
1066 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5
1067 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
1068 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
1069 ; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
1070 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5
1071 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
1072 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
1073 ; X64-AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
1074 ; X64-AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
1075 ; X64-AVX1-NEXT: vmovdqu %xmm0, ha4(%rip)
1076 ; X64-AVX1-NEXT: vmovups %ymm1, hb4(%rip)
1077 ; X64-AVX1-NEXT: vmovups %ymm3, hc4+32(%rip)
1078 ; X64-AVX1-NEXT: vmovups %ymm2, hc4(%rip)
1079 ; X64-AVX1-NEXT: vzeroupper
1080 ; X64-AVX1-NEXT: retq
1082 ; X64-AVX2-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
1083 ; X64-AVX2: # %bb.0: # %entry
1084 ; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4]
1085 ; X64-AVX2-NEXT: # ymm4 = mem[0,1,0,1]
1086 ; X64-AVX2-NEXT: vpaddd %xmm4, %xmm0, %xmm0
1087 ; X64-AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
1088 ; X64-AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
1089 ; X64-AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
1090 ; X64-AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
1091 ; X64-AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
1092 ; X64-AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
1093 ; X64-AVX2-NEXT: vmovdqu %xmm0, ha4(%rip)
1094 ; X64-AVX2-NEXT: vmovdqu %ymm1, hb4(%rip)
1095 ; X64-AVX2-NEXT: vmovdqu %ymm3, hc4+32(%rip)
1096 ; X64-AVX2-NEXT: vmovdqu %ymm2, hc4(%rip)
1097 ; X64-AVX2-NEXT: vzeroupper
1098 ; X64-AVX2-NEXT: retq
1100 ; X64-AVX512-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
1101 ; X64-AVX512: # %bb.0: # %entry
1102 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4]
1103 ; X64-AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1104 ; X64-AVX512-NEXT: vpaddd %xmm3, %xmm0, %xmm0
1105 ; X64-AVX512-NEXT: vpaddd %ymm3, %ymm1, %ymm1
1106 ; X64-AVX512-NEXT: vpand %ymm3, %ymm1, %ymm1
1107 ; X64-AVX512-NEXT: vpaddd %zmm3, %zmm2, %zmm2
1108 ; X64-AVX512-NEXT: vpandd %zmm3, %zmm2, %zmm2
1109 ; X64-AVX512-NEXT: vmovdqu %xmm0, ha4(%rip)
1110 ; X64-AVX512-NEXT: vmovdqu %ymm1, hb4(%rip)
1111 ; X64-AVX512-NEXT: vmovdqu64 %zmm2, hc4(%rip)
1112 ; X64-AVX512-NEXT: vzeroupper
1113 ; X64-AVX512-NEXT: retq
1115 %0 = add <4 x i32> %a, <i32 1, i32 2, i32 3, i32 4>
1116 %1 = add <8 x i32> %b, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
1117 %2 = and <8 x i32> %1, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
1118 %3 = add <16 x i32> %c, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
1119 %4 = and <16 x i32> %3, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
1120 store <4 x i32> %0, <4 x i32>* @ha4, align 8
1121 store <8 x i32> %2, <8 x i32>* @hb4, align 8
1122 store <16 x i32> %4, <16 x i32>* @hc4, align 8
1127 ; Subvector Broadcast from register
1130 define <4 x double> @reg_broadcast_2f64_4f64(<2 x double> %a0) nounwind {
1131 ; X86-LABEL: reg_broadcast_2f64_4f64:
1133 ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1134 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1137 ; X64-LABEL: reg_broadcast_2f64_4f64:
1139 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1140 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1142 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
1146 define <8 x double> @reg_broadcast_2f64_8f64(<2 x double> %a0) nounwind {
1147 ; X86-AVX-LABEL: reg_broadcast_2f64_8f64:
1149 ; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1150 ; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1151 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1152 ; X86-AVX-NEXT: retl
1154 ; X86-AVX512-LABEL: reg_broadcast_2f64_8f64:
1155 ; X86-AVX512: # %bb.0:
1156 ; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1157 ; X86-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1158 ; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1159 ; X86-AVX512-NEXT: retl
1161 ; X64-AVX-LABEL: reg_broadcast_2f64_8f64:
1163 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1164 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1165 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1166 ; X64-AVX-NEXT: retq
1168 ; X64-AVX512-LABEL: reg_broadcast_2f64_8f64:
1169 ; X64-AVX512: # %bb.0:
1170 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1171 ; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1172 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1173 ; X64-AVX512-NEXT: retq
1174 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
1178 define <8 x double> @reg_broadcast_4f64_8f64(<4 x double> %a0) nounwind {
1179 ; X86-AVX-LABEL: reg_broadcast_4f64_8f64:
1181 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1182 ; X86-AVX-NEXT: retl
1184 ; X86-AVX512-LABEL: reg_broadcast_4f64_8f64:
1185 ; X86-AVX512: # %bb.0:
1186 ; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1187 ; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1188 ; X86-AVX512-NEXT: retl
1190 ; X64-AVX-LABEL: reg_broadcast_4f64_8f64:
1192 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1193 ; X64-AVX-NEXT: retq
1195 ; X64-AVX512-LABEL: reg_broadcast_4f64_8f64:
1196 ; X64-AVX512: # %bb.0:
1197 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1198 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1199 ; X64-AVX512-NEXT: retq
1200 %1 = shufflevector <4 x double> %a0, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1204 define <4 x i64> @reg_broadcast_2i64_4i64(<2 x i64> %a0) nounwind {
1205 ; X86-LABEL: reg_broadcast_2i64_4i64:
1207 ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1208 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1211 ; X64-LABEL: reg_broadcast_2i64_4i64:
1213 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1214 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1216 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
1220 define <8 x i64> @reg_broadcast_2i64_8i64(<2 x i64> %a0) nounwind {
1221 ; X86-AVX-LABEL: reg_broadcast_2i64_8i64:
1223 ; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1224 ; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1225 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1226 ; X86-AVX-NEXT: retl
1228 ; X86-AVX512-LABEL: reg_broadcast_2i64_8i64:
1229 ; X86-AVX512: # %bb.0:
1230 ; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1231 ; X86-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1232 ; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1233 ; X86-AVX512-NEXT: retl
1235 ; X64-AVX-LABEL: reg_broadcast_2i64_8i64:
1237 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1238 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1239 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1240 ; X64-AVX-NEXT: retq
1242 ; X64-AVX512-LABEL: reg_broadcast_2i64_8i64:
1243 ; X64-AVX512: # %bb.0:
1244 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1245 ; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1246 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1247 ; X64-AVX512-NEXT: retq
1248 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
1252 define <8 x i64> @reg_broadcast_4i64_8i64(<4 x i64> %a0) nounwind {
1253 ; X86-AVX-LABEL: reg_broadcast_4i64_8i64:
1255 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1256 ; X86-AVX-NEXT: retl
1258 ; X86-AVX512-LABEL: reg_broadcast_4i64_8i64:
1259 ; X86-AVX512: # %bb.0:
1260 ; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1261 ; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1262 ; X86-AVX512-NEXT: retl
1264 ; X64-AVX-LABEL: reg_broadcast_4i64_8i64:
1266 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1267 ; X64-AVX-NEXT: retq
1269 ; X64-AVX512-LABEL: reg_broadcast_4i64_8i64:
1270 ; X64-AVX512: # %bb.0:
1271 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1272 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1273 ; X64-AVX512-NEXT: retq
1274 %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1278 define <8 x float> @reg_broadcast_4f32_8f32(<4 x float> %a0) nounwind {
1279 ; X86-LABEL: reg_broadcast_4f32_8f32:
1281 ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1282 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1285 ; X64-LABEL: reg_broadcast_4f32_8f32:
1287 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1288 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1290 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1294 define <16 x float> @reg_broadcast_4f32_16f32(<4 x float> %a0) nounwind {
1295 ; X86-AVX-LABEL: reg_broadcast_4f32_16f32:
1297 ; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1298 ; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1299 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1300 ; X86-AVX-NEXT: retl
1302 ; X86-AVX512-LABEL: reg_broadcast_4f32_16f32:
1303 ; X86-AVX512: # %bb.0:
1304 ; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1305 ; X86-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1306 ; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1307 ; X86-AVX512-NEXT: retl
1309 ; X64-AVX-LABEL: reg_broadcast_4f32_16f32:
1311 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1312 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1313 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1314 ; X64-AVX-NEXT: retq
1316 ; X64-AVX512-LABEL: reg_broadcast_4f32_16f32:
1317 ; X64-AVX512: # %bb.0:
1318 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1319 ; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1320 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1321 ; X64-AVX512-NEXT: retq
1322 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1326 define <16 x float> @reg_broadcast_8f32_16f32(<8 x float> %a0) nounwind {
1327 ; X86-AVX-LABEL: reg_broadcast_8f32_16f32:
1329 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1330 ; X86-AVX-NEXT: retl
1332 ; X86-AVX512-LABEL: reg_broadcast_8f32_16f32:
1333 ; X86-AVX512: # %bb.0:
1334 ; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1335 ; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1336 ; X86-AVX512-NEXT: retl
1338 ; X64-AVX-LABEL: reg_broadcast_8f32_16f32:
1340 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1341 ; X64-AVX-NEXT: retq
1343 ; X64-AVX512-LABEL: reg_broadcast_8f32_16f32:
1344 ; X64-AVX512: # %bb.0:
1345 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1346 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1347 ; X64-AVX512-NEXT: retq
1348 %1 = shufflevector <8 x float> %a0, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1352 define <8 x i32> @reg_broadcast_4i32_8i32(<4 x i32> %a0) nounwind {
1353 ; X86-LABEL: reg_broadcast_4i32_8i32:
1355 ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1356 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1359 ; X64-LABEL: reg_broadcast_4i32_8i32:
1361 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1362 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1364 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1368 define <16 x i32> @reg_broadcast_4i32_16i32(<4 x i32> %a0) nounwind {
1369 ; X86-AVX-LABEL: reg_broadcast_4i32_16i32:
1371 ; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1372 ; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1373 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1374 ; X86-AVX-NEXT: retl
1376 ; X86-AVX512-LABEL: reg_broadcast_4i32_16i32:
1377 ; X86-AVX512: # %bb.0:
1378 ; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1379 ; X86-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1380 ; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1381 ; X86-AVX512-NEXT: retl
1383 ; X64-AVX-LABEL: reg_broadcast_4i32_16i32:
1385 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1386 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1387 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1388 ; X64-AVX-NEXT: retq
1390 ; X64-AVX512-LABEL: reg_broadcast_4i32_16i32:
1391 ; X64-AVX512: # %bb.0:
1392 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1393 ; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1394 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1395 ; X64-AVX512-NEXT: retq
1396 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1400 define <16 x i32> @reg_broadcast_8i32_16i32(<8 x i32> %a0) nounwind {
1401 ; X86-AVX-LABEL: reg_broadcast_8i32_16i32:
1403 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1404 ; X86-AVX-NEXT: retl
1406 ; X86-AVX512-LABEL: reg_broadcast_8i32_16i32:
1407 ; X86-AVX512: # %bb.0:
1408 ; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1409 ; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1410 ; X86-AVX512-NEXT: retl
1412 ; X64-AVX-LABEL: reg_broadcast_8i32_16i32:
1414 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1415 ; X64-AVX-NEXT: retq
1417 ; X64-AVX512-LABEL: reg_broadcast_8i32_16i32:
1418 ; X64-AVX512: # %bb.0:
1419 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1420 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1421 ; X64-AVX512-NEXT: retq
1422 %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1426 define <16 x i16> @reg_broadcast_8i16_16i16(<8 x i16> %a0) nounwind {
1427 ; X86-LABEL: reg_broadcast_8i16_16i16:
1429 ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1430 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1433 ; X64-LABEL: reg_broadcast_8i16_16i16:
1435 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1436 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1438 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1442 define <32 x i16> @reg_broadcast_8i16_32i16(<8 x i16> %a0) nounwind {
1443 ; X86-AVX-LABEL: reg_broadcast_8i16_32i16:
1445 ; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1446 ; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1447 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1448 ; X86-AVX-NEXT: retl
1450 ; X86-AVX512-LABEL: reg_broadcast_8i16_32i16:
1451 ; X86-AVX512: # %bb.0:
1452 ; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1453 ; X86-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1454 ; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1455 ; X86-AVX512-NEXT: retl
1457 ; X64-AVX-LABEL: reg_broadcast_8i16_32i16:
1459 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1460 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1461 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1462 ; X64-AVX-NEXT: retq
1464 ; X64-AVX512-LABEL: reg_broadcast_8i16_32i16:
1465 ; X64-AVX512: # %bb.0:
1466 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1467 ; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1468 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1469 ; X64-AVX512-NEXT: retq
1470 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1474 define <32 x i16> @reg_broadcast_16i16_32i16(<16 x i16> %a0) nounwind {
1475 ; X86-AVX-LABEL: reg_broadcast_16i16_32i16:
1477 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1478 ; X86-AVX-NEXT: retl
1480 ; X86-AVX512-LABEL: reg_broadcast_16i16_32i16:
1481 ; X86-AVX512: # %bb.0:
1482 ; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1483 ; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1484 ; X86-AVX512-NEXT: retl
1486 ; X64-AVX-LABEL: reg_broadcast_16i16_32i16:
1488 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1489 ; X64-AVX-NEXT: retq
1491 ; X64-AVX512-LABEL: reg_broadcast_16i16_32i16:
1492 ; X64-AVX512: # %bb.0:
1493 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1494 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1495 ; X64-AVX512-NEXT: retq
1496 %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1500 define <32 x i8> @reg_broadcast_16i8_32i8(<16 x i8> %a0) nounwind {
1501 ; X86-LABEL: reg_broadcast_16i8_32i8:
1503 ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1504 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1507 ; X64-LABEL: reg_broadcast_16i8_32i8:
1509 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1510 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1512 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1516 define <64 x i8> @reg_broadcast_16i8_64i8(<16 x i8> %a0) nounwind {
1517 ; X86-AVX-LABEL: reg_broadcast_16i8_64i8:
1519 ; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1520 ; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1521 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1522 ; X86-AVX-NEXT: retl
1524 ; X86-AVX512-LABEL: reg_broadcast_16i8_64i8:
1525 ; X86-AVX512: # %bb.0:
1526 ; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1527 ; X86-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1528 ; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1529 ; X86-AVX512-NEXT: retl
1531 ; X64-AVX-LABEL: reg_broadcast_16i8_64i8:
1533 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1534 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1535 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1536 ; X64-AVX-NEXT: retq
1538 ; X64-AVX512-LABEL: reg_broadcast_16i8_64i8:
1539 ; X64-AVX512: # %bb.0:
1540 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1541 ; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1542 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1543 ; X64-AVX512-NEXT: retq
1544 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1548 define <64 x i8> @reg_broadcast_32i8_64i8(<32 x i8> %a0) nounwind {
1549 ; X86-AVX-LABEL: reg_broadcast_32i8_64i8:
1551 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1552 ; X86-AVX-NEXT: retl
1554 ; X86-AVX512-LABEL: reg_broadcast_32i8_64i8:
1555 ; X86-AVX512: # %bb.0:
1556 ; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1557 ; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1558 ; X86-AVX512-NEXT: retl
1560 ; X64-AVX-LABEL: reg_broadcast_32i8_64i8:
1562 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1563 ; X64-AVX-NEXT: retq
1565 ; X64-AVX512-LABEL: reg_broadcast_32i8_64i8:
1566 ; X64-AVX512: # %bb.0:
1567 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1568 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1569 ; X64-AVX512-NEXT: retq
1570 %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1578 define <4 x i32> @test_2xi32_to_4xi32_mem(<2 x i32>* %vp) {
1579 ; X86-LABEL: test_2xi32_to_4xi32_mem:
1581 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1582 ; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
1585 ; X64-LABEL: test_2xi32_to_4xi32_mem:
1587 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
1589 %vec = load <2 x i32>, <2 x i32>* %vp
1590 %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
1594 define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) {
1595 ; X86-LABEL: test_2xi32_to_8xi32_mem:
1597 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1598 ; X86-NEXT: vbroadcastsd (%eax), %ymm0
1601 ; X64-LABEL: test_2xi32_to_8xi32_mem:
1603 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0
1605 %vec = load <2 x i32>, <2 x i32>* %vp
1606 %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; 512-bit version of the broadcast above. AVX/AVX2 split the result across
; two YMM registers (broadcast into ymm0, then copy to ymm1); AVX512 does a
; single vbroadcastsd into a ZMM register.
1610 define <16 x i32> @test_2xi32_to_16xi32_mem(<2 x i32>* %vp) {
1611 ; X86-AVX-LABEL: test_2xi32_to_16xi32_mem:
1613 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1614 ; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm0
1615 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1616 ; X86-AVX-NEXT: retl
1618 ; X86-AVX512-LABEL: test_2xi32_to_16xi32_mem:
1619 ; X86-AVX512: # %bb.0:
1620 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1621 ; X86-AVX512-NEXT: vbroadcastsd (%eax), %zmm0
1622 ; X86-AVX512-NEXT: retl
1624 ; X64-AVX-LABEL: test_2xi32_to_16xi32_mem:
1626 ; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm0
1627 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1628 ; X64-AVX-NEXT: retq
1630 ; X64-AVX512-LABEL: test_2xi32_to_16xi32_mem:
1631 ; X64-AVX512: # %bb.0:
1632 ; X64-AVX512-NEXT: vbroadcastsd (%rdi), %zmm0
1633 ; X64-AVX512-NEXT: retq
1634 %vec = load <2 x i32>, <2 x i32>* %vp
1635 %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; Scalar-load broadcast with one undef result lane. Only lane 0 of %vec is
; defined, so shuffle index 1 (result lane 0) is undef -- the "u" in the
; name -- and a plain vbroadcastsd of the scalar is a valid lowering.
1643 define <4 x double> @broadcast_v4f64_f64_u000(double* %p) {
1644 ; X86-LABEL: broadcast_v4f64_f64_u000:
1646 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1647 ; X86-NEXT: vbroadcastsd (%eax), %ymm0
1650 ; X64-LABEL: broadcast_v4f64_f64_u000:
1652 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0
1654 %s = load double, double* %p
1655 %vec = insertelement <2 x double> undef, double %s, i32 0
1656 %res = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
1657 ret <4 x double> %res
; Partial broadcast merged with a default vector. The select keeps %default
; in lanes 0 and 2 and takes the shuffle in lanes 1 and 3; lane 1 of the
; shuffle is undef (index 3 points into the undef operand), so the result
; lanes are default[0], undef, default[2], vec[1] -- the "4u61" in the name.
; Lowered as insert of the loaded pair into the high half plus a blend.
1660 define <4 x double> @broadcast_v4f64_v2f64_4u61(<2 x double>* %vp, <4 x double> %default) {
1661 ; X86-LABEL: broadcast_v4f64_v2f64_4u61:
1663 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1664 ; X86-NEXT: vinsertf128 $1, (%eax), %ymm0, %ymm1
1665 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1668 ; X64-LABEL: broadcast_v4f64_v2f64_4u61:
1670 ; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm1
1671 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1673 %vec = load <2 x double>, <2 x double>* %vp
1674 %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 1>
1675 %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %default
1676 ret <4 x double> %res
; <2 x float> broadcast with undef lanes and one lane (position 6, the "E"
; in the name) taken from %default via the select. The loaded float pair is
; splat as a 64-bit element with vbroadcastsd, then combined with the
; default vector using vshufpd.
1679 define <8 x float> @broadcast_v8f32_v2f32_u1uu0uEu(<2 x float>* %vp, <8 x float> %default) {
1680 ; X86-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
1682 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1683 ; X86-NEXT: vbroadcastsd (%eax), %ymm1
1684 ; X86-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
1687 ; X64-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
1689 ; X64-NEXT: vbroadcastsd (%rdi), %ymm1
1690 ; X64-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
1692 %vec = load <2 x float>, <2 x float>* %vp
1693 %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 0, i32 2, i32 3, i32 undef>
1694 %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %default
1695 ret <8 x float> %res
; 512-bit subvector broadcast where some lanes are undef: mask index 3
; selects from the undef second operand, so it is effectively undef (the
; leading "u" in the name). AVX lowers this as a 128-bit vbroadcastf128
; into ymm0 plus a register copy for the upper half; AVX512 uses a single
; vbroadcastf32x4 into a ZMM register.
1698 define <8 x double> @broadcast_v8f64_v2f64_u1u10101(<2 x double>* %vp) {
1699 ; X86-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
1701 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1702 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
1703 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1704 ; X86-AVX-NEXT: retl
1706 ; X86-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
1707 ; X86-AVX512: # %bb.0:
1708 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1709 ; X86-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1710 ; X86-AVX512-NEXT: retl
1712 ; X64-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
1714 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
1715 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1716 ; X64-AVX-NEXT: retq
1718 ; X64-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
1719 ; X64-AVX512: # %bb.0:
1720 ; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1721 ; X64-AVX512-NEXT: retq
1722 %vec = load <2 x double>, <2 x double>* %vp
1723 %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 3, i32 1, i32 undef, i32 1, i32 0, i32 1, i32 0, i32 1>
1724 ret <8 x double> %res
; Like the previous test but with undef lanes 1-3: despite the undefs, the
; whole pattern can still be treated as a full 128-bit subvector broadcast.
; AVX uses vbroadcastf128 plus a YMM copy for the upper half; AVX512 uses a
; single vbroadcastf32x4 into a ZMM register.
1727 define <8 x double> @broadcast_v8f64_v2f64_0uuu0101(<2 x double>* %vp) {
1728 ; X86-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
1730 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1731 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
1732 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1733 ; X86-AVX-NEXT: retl
1735 ; X86-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
1736 ; X86-AVX512: # %bb.0:
1737 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1738 ; X86-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1739 ; X86-AVX512-NEXT: retl
1741 ; X64-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
1743 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
1744 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1745 ; X64-AVX-NEXT: retq
1747 ; X64-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
1748 ; X64-AVX512: # %bb.0:
1749 ; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1750 ; X64-AVX512-NEXT: retq
1751 %vec = load <2 x double>, <2 x double>* %vp
1752 %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 0, i32 1>
1753 ret <8 x double> %res