1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1
3 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX2
4 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
5 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
6 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512
14 ; Subvector Load + Broadcast
; Loads a <2 x double> from %p and repeats it twice (shuffle mask 0,1,0,1)
; to build a <4 x double>. Every 32-bit and 64-bit configuration is expected
; to fold the load and shuffle into one vbroadcastf128 from memory.
17 define <4 x double> @test_broadcast_2f64_4f64(ptr%p) nounwind {
18 ; X86-LABEL: test_broadcast_2f64_4f64:
20 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
21 ; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
24 ; X64-LABEL: test_broadcast_2f64_4f64:
26 ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
28 %1 = load <2 x double>, ptr%p
29 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Loads a <2 x double> from %p and repeats it four times to build an
; <8 x double>. AVX/AVX2 targets are expected to use vbroadcastf128 into ymm0
; and copy it to ymm1; AVX-512 targets a single vbroadcastf32x4 into zmm0.
33 define <8 x double> @test_broadcast_2f64_8f64(ptr%p) nounwind {
34 ; X86-AVX-LABEL: test_broadcast_2f64_8f64:
36 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
37 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
38 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
41 ; X86-AVX512-LABEL: test_broadcast_2f64_8f64:
42 ; X86-AVX512: # %bb.0:
43 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
44 ; X86-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
45 ; X86-AVX512-NEXT: retl
47 ; X64-AVX-LABEL: test_broadcast_2f64_8f64:
49 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
50 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
53 ; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
54 ; X64-AVX512: # %bb.0:
55 ; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
56 ; X64-AVX512-NEXT: retq
57 %1 = load <2 x double>, ptr%p
58 %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; Loads a <4 x double> from %p and repeats it twice to build an <8 x double>.
; AVX/AVX2 targets are expected to do a plain 256-bit vmovaps load into ymm0
; plus a copy to ymm1; AVX-512 targets a single vbroadcastf64x4 into zmm0.
62 define <8 x double> @test_broadcast_4f64_8f64(ptr%p) nounwind {
63 ; X86-AVX-LABEL: test_broadcast_4f64_8f64:
65 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
66 ; X86-AVX-NEXT: vmovaps (%eax), %ymm0
67 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
70 ; X86-AVX512-LABEL: test_broadcast_4f64_8f64:
71 ; X86-AVX512: # %bb.0:
72 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
73 ; X86-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
74 ; X86-AVX512-NEXT: retl
76 ; X64-AVX-LABEL: test_broadcast_4f64_8f64:
78 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
79 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
82 ; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
83 ; X64-AVX512: # %bb.0:
84 ; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
85 ; X64-AVX512-NEXT: retq
86 %1 = load <4 x double>, ptr%p
87 %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Loads a <2 x i64> from %p and repeats it twice to build a <4 x i64>.
; AVX/AVX2 targets are expected to use the floating-point form vbroadcastf128;
; AVX-512 targets the integer form vbroadcasti128.
91 define <4 x i64> @test_broadcast_2i64_4i64(ptr%p) nounwind {
92 ; X86-AVX-LABEL: test_broadcast_2i64_4i64:
94 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
95 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
98 ; X86-AVX512-LABEL: test_broadcast_2i64_4i64:
99 ; X86-AVX512: # %bb.0:
100 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
101 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
102 ; X86-AVX512-NEXT: retl
104 ; X64-AVX-LABEL: test_broadcast_2i64_4i64:
106 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
109 ; X64-AVX512-LABEL: test_broadcast_2i64_4i64:
110 ; X64-AVX512: # %bb.0:
111 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
112 ; X64-AVX512-NEXT: retq
113 %1 = load <2 x i64>, ptr%p
114 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Loads a <2 x i64> from %p and repeats it four times to build an <8 x i64>.
; AVX/AVX2 targets are expected to use vbroadcastf128 into ymm0 and copy it to
; ymm1; AVX-512 targets a single vbroadcasti32x4 into zmm0.
118 define <8 x i64> @test_broadcast_2i64_8i64(ptr%p) nounwind {
119 ; X86-AVX-LABEL: test_broadcast_2i64_8i64:
121 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
122 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
123 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
126 ; X86-AVX512-LABEL: test_broadcast_2i64_8i64:
127 ; X86-AVX512: # %bb.0:
128 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
129 ; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
130 ; X86-AVX512-NEXT: retl
132 ; X64-AVX-LABEL: test_broadcast_2i64_8i64:
134 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
135 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
138 ; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
139 ; X64-AVX512: # %bb.0:
140 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
141 ; X64-AVX512-NEXT: retq
142 %1 = load <2 x i64>, ptr%p
143 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; Loads a <4 x i64> from %p and repeats it twice to build an <8 x i64>.
; AVX/AVX2 targets are expected to do a plain 256-bit vmovaps load into ymm0
; plus a copy to ymm1; AVX-512 targets a single vbroadcasti64x4 into zmm0.
147 define <8 x i64> @test_broadcast_4i64_8i64(ptr%p) nounwind {
148 ; X86-AVX-LABEL: test_broadcast_4i64_8i64:
150 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
151 ; X86-AVX-NEXT: vmovaps (%eax), %ymm0
152 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
155 ; X86-AVX512-LABEL: test_broadcast_4i64_8i64:
156 ; X86-AVX512: # %bb.0:
157 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
158 ; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
159 ; X86-AVX512-NEXT: retl
161 ; X64-AVX-LABEL: test_broadcast_4i64_8i64:
163 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
164 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
167 ; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
168 ; X64-AVX512: # %bb.0:
169 ; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
170 ; X64-AVX512-NEXT: retq
171 %1 = load <4 x i64>, ptr%p
172 %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Loads a <4 x float> from %p and repeats it twice (mask 0..3,0..3) to build
; an <8 x float>. Every configuration is expected to emit one vbroadcastf128
; from memory.
176 define <8 x float> @test_broadcast_4f32_8f32(ptr%p) nounwind {
177 ; X86-LABEL: test_broadcast_4f32_8f32:
179 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
180 ; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
183 ; X64-LABEL: test_broadcast_4f32_8f32:
185 ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
187 %1 = load <4 x float>, ptr%p
188 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Loads a <4 x float> from %p and repeats it four times to build a
; <16 x float>. AVX/AVX2 targets are expected to use vbroadcastf128 into ymm0
; and copy it to ymm1; AVX-512 targets a single vbroadcastf32x4 into zmm0.
192 define <16 x float> @test_broadcast_4f32_16f32(ptr%p) nounwind {
193 ; X86-AVX-LABEL: test_broadcast_4f32_16f32:
195 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
196 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
197 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
200 ; X86-AVX512-LABEL: test_broadcast_4f32_16f32:
201 ; X86-AVX512: # %bb.0:
202 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
203 ; X86-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
204 ; X86-AVX512-NEXT: retl
206 ; X64-AVX-LABEL: test_broadcast_4f32_16f32:
208 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
209 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
212 ; X64-AVX512-LABEL: test_broadcast_4f32_16f32:
213 ; X64-AVX512: # %bb.0:
214 ; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
215 ; X64-AVX512-NEXT: retq
216 %1 = load <4 x float>, ptr%p
217 %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Loads an <8 x float> from %p and repeats it twice to build a <16 x float>.
; AVX/AVX2 targets are expected to do a plain 256-bit vmovaps load into ymm0
; plus a copy to ymm1; AVX-512 targets a single vbroadcastf64x4 into zmm0.
221 define <16 x float> @test_broadcast_8f32_16f32(ptr%p) nounwind {
222 ; X86-AVX-LABEL: test_broadcast_8f32_16f32:
224 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
225 ; X86-AVX-NEXT: vmovaps (%eax), %ymm0
226 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
229 ; X86-AVX512-LABEL: test_broadcast_8f32_16f32:
230 ; X86-AVX512: # %bb.0:
231 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
232 ; X86-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
233 ; X86-AVX512-NEXT: retl
235 ; X64-AVX-LABEL: test_broadcast_8f32_16f32:
237 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
238 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
241 ; X64-AVX512-LABEL: test_broadcast_8f32_16f32:
242 ; X64-AVX512: # %bb.0:
243 ; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
244 ; X64-AVX512-NEXT: retq
245 %1 = load <8 x float>, ptr%p
246 %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Loads a <4 x i32> from %p and repeats it twice to build an <8 x i32>.
; AVX/AVX2 targets are expected to use the floating-point form vbroadcastf128;
; AVX-512 targets the integer form vbroadcasti128.
250 define <8 x i32> @test_broadcast_4i32_8i32(ptr%p) nounwind {
251 ; X86-AVX-LABEL: test_broadcast_4i32_8i32:
253 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
254 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
257 ; X86-AVX512-LABEL: test_broadcast_4i32_8i32:
258 ; X86-AVX512: # %bb.0:
259 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
260 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
261 ; X86-AVX512-NEXT: retl
263 ; X64-AVX-LABEL: test_broadcast_4i32_8i32:
265 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
268 ; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
269 ; X64-AVX512: # %bb.0:
270 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
271 ; X64-AVX512-NEXT: retq
272 %1 = load <4 x i32>, ptr%p
273 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Loads a <4 x i32> from %p and repeats it four times to build a <16 x i32>.
; AVX/AVX2 targets are expected to use vbroadcastf128 into ymm0 and copy it to
; ymm1; AVX-512 targets a single vbroadcasti32x4 into zmm0.
277 define <16 x i32> @test_broadcast_4i32_16i32(ptr%p) nounwind {
278 ; X86-AVX-LABEL: test_broadcast_4i32_16i32:
280 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
281 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
282 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
285 ; X86-AVX512-LABEL: test_broadcast_4i32_16i32:
286 ; X86-AVX512: # %bb.0:
287 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
288 ; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
289 ; X86-AVX512-NEXT: retl
291 ; X64-AVX-LABEL: test_broadcast_4i32_16i32:
293 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
294 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
297 ; X64-AVX512-LABEL: test_broadcast_4i32_16i32:
298 ; X64-AVX512: # %bb.0:
299 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
300 ; X64-AVX512-NEXT: retq
301 %1 = load <4 x i32>, ptr%p
302 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Loads an <8 x i32> from %p and repeats it twice to build a <16 x i32>.
; AVX/AVX2 targets are expected to do a plain 256-bit vmovaps load into ymm0
; plus a copy to ymm1; AVX-512 targets a single vbroadcasti64x4 into zmm0.
306 define <16 x i32> @test_broadcast_8i32_16i32(ptr%p) nounwind {
307 ; X86-AVX-LABEL: test_broadcast_8i32_16i32:
309 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
310 ; X86-AVX-NEXT: vmovaps (%eax), %ymm0
311 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
314 ; X86-AVX512-LABEL: test_broadcast_8i32_16i32:
315 ; X86-AVX512: # %bb.0:
316 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
317 ; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
318 ; X86-AVX512-NEXT: retl
320 ; X64-AVX-LABEL: test_broadcast_8i32_16i32:
322 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
323 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
326 ; X64-AVX512-LABEL: test_broadcast_8i32_16i32:
327 ; X64-AVX512: # %bb.0:
328 ; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
329 ; X64-AVX512-NEXT: retq
330 %1 = load <8 x i32>, ptr%p
331 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Loads an <8 x i16> from %p and repeats it twice to build a <16 x i16>.
; AVX/AVX2 targets are expected to use the floating-point form vbroadcastf128;
; AVX-512 targets the integer form vbroadcasti128.
335 define <16 x i16> @test_broadcast_8i16_16i16(ptr%p) nounwind {
336 ; X86-AVX-LABEL: test_broadcast_8i16_16i16:
338 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
339 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
342 ; X86-AVX512-LABEL: test_broadcast_8i16_16i16:
343 ; X86-AVX512: # %bb.0:
344 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
345 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
346 ; X86-AVX512-NEXT: retl
348 ; X64-AVX-LABEL: test_broadcast_8i16_16i16:
350 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
353 ; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
354 ; X64-AVX512: # %bb.0:
355 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
356 ; X64-AVX512-NEXT: retq
357 %1 = load <8 x i16>, ptr%p
358 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Loads an <8 x i16> from %p and repeats it four times to build a <32 x i16>.
; AVX/AVX2 targets are expected to use vbroadcastf128 into ymm0 and copy it to
; ymm1; AVX-512 targets a single vbroadcasti32x4 into zmm0.
362 define <32 x i16> @test_broadcast_8i16_32i16(ptr%p) nounwind {
363 ; X86-AVX-LABEL: test_broadcast_8i16_32i16:
365 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
366 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
367 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
370 ; X86-AVX512-LABEL: test_broadcast_8i16_32i16:
371 ; X86-AVX512: # %bb.0:
372 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
373 ; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
374 ; X86-AVX512-NEXT: retl
376 ; X64-AVX-LABEL: test_broadcast_8i16_32i16:
378 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
379 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
382 ; X64-AVX512-LABEL: test_broadcast_8i16_32i16:
383 ; X64-AVX512: # %bb.0:
384 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
385 ; X64-AVX512-NEXT: retq
386 %1 = load <8 x i16>, ptr%p
387 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Loads a <16 x i16> from %p and repeats it twice to build a <32 x i16>.
; AVX/AVX2 targets are expected to do a plain 256-bit vmovaps load into ymm0
; plus a copy to ymm1; AVX-512 targets a single vbroadcasti64x4 into zmm0.
391 define <32 x i16> @test_broadcast_16i16_32i16(ptr%p) nounwind {
392 ; X86-AVX-LABEL: test_broadcast_16i16_32i16:
394 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
395 ; X86-AVX-NEXT: vmovaps (%eax), %ymm0
396 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
399 ; X86-AVX512-LABEL: test_broadcast_16i16_32i16:
400 ; X86-AVX512: # %bb.0:
401 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
402 ; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
403 ; X86-AVX512-NEXT: retl
405 ; X64-AVX-LABEL: test_broadcast_16i16_32i16:
407 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
408 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
411 ; X64-AVX512-LABEL: test_broadcast_16i16_32i16:
412 ; X64-AVX512: # %bb.0:
413 ; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
414 ; X64-AVX512-NEXT: retq
415 %1 = load <16 x i16>, ptr%p
416 %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Loads a <16 x i8> from %p and repeats it twice to build a <32 x i8>.
; AVX/AVX2 targets are expected to use the floating-point form vbroadcastf128;
; AVX-512 targets the integer form vbroadcasti128.
420 define <32 x i8> @test_broadcast_16i8_32i8(ptr%p) nounwind {
421 ; X86-AVX-LABEL: test_broadcast_16i8_32i8:
423 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
424 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
427 ; X86-AVX512-LABEL: test_broadcast_16i8_32i8:
428 ; X86-AVX512: # %bb.0:
429 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
430 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
431 ; X86-AVX512-NEXT: retl
433 ; X64-AVX-LABEL: test_broadcast_16i8_32i8:
435 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
438 ; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
439 ; X64-AVX512: # %bb.0:
440 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
441 ; X64-AVX512-NEXT: retq
442 %1 = load <16 x i8>, ptr%p
443 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Loads a <16 x i8> from %p and repeats it four times to build a <64 x i8>.
; AVX/AVX2 targets are expected to use vbroadcastf128 into ymm0 and copy it to
; ymm1; AVX-512 targets a single vbroadcasti32x4 into zmm0.
447 define <64 x i8> @test_broadcast_16i8_64i8(ptr%p) nounwind {
448 ; X86-AVX-LABEL: test_broadcast_16i8_64i8:
450 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
451 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
452 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
455 ; X86-AVX512-LABEL: test_broadcast_16i8_64i8:
456 ; X86-AVX512: # %bb.0:
457 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
458 ; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
459 ; X86-AVX512-NEXT: retl
461 ; X64-AVX-LABEL: test_broadcast_16i8_64i8:
463 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
464 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
467 ; X64-AVX512-LABEL: test_broadcast_16i8_64i8:
468 ; X64-AVX512: # %bb.0:
469 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
470 ; X64-AVX512-NEXT: retq
471 %1 = load <16 x i8>, ptr%p
472 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Loads a <32 x i8> from %p and repeats it twice to build a <64 x i8>.
; AVX/AVX2 targets are expected to do a plain 256-bit vmovaps load into ymm0
; plus a copy to ymm1; AVX-512 targets a single vbroadcasti64x4 into zmm0.
476 define <64 x i8> @test_broadcast_32i8_64i8(ptr%p) nounwind {
477 ; X86-AVX-LABEL: test_broadcast_32i8_64i8:
479 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
480 ; X86-AVX-NEXT: vmovaps (%eax), %ymm0
481 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
484 ; X86-AVX512-LABEL: test_broadcast_32i8_64i8:
485 ; X86-AVX512: # %bb.0:
486 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
487 ; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
488 ; X86-AVX512-NEXT: retl
490 ; X64-AVX-LABEL: test_broadcast_32i8_64i8:
492 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
493 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
496 ; X64-AVX512-LABEL: test_broadcast_32i8_64i8:
497 ; X64-AVX512: # %bb.0:
498 ; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
499 ; X64-AVX512-NEXT: retq
500 %1 = load <32 x i8>, ptr%p
501 %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
506 ; Subvector Load + Broadcast + Store
; Loads a <2 x double> from %p0, stores that same value to %p1, and also
; repeats it twice to build a <4 x double>. The expected code broadcasts from
; memory once and stores %xmm0 (the low 128 bits of the broadcast result), so
; no separate 128-bit load is expected.
509 define <4 x double> @test_broadcast_2f64_4f64_reuse(ptr %p0, ptr %p1) {
510 ; X86-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
512 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
513 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
514 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
515 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
518 ; X86-AVX512-LABEL: test_broadcast_2f64_4f64_reuse:
519 ; X86-AVX512: # %bb.0:
520 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
521 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
522 ; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
523 ; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax)
524 ; X86-AVX512-NEXT: retl
526 ; X64-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
528 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
529 ; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
532 ; X64-AVX512-LABEL: test_broadcast_2f64_4f64_reuse:
533 ; X64-AVX512: # %bb.0:
534 ; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
535 ; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
536 ; X64-AVX512-NEXT: retq
537 %1 = load <2 x double>, ptr %p0
538 store <2 x double> %1, ptr %p1
539 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Loads a <2 x i64> from %p0, stores that same value to %p1, and also repeats
; it twice to build a <4 x i64>. The expected code broadcasts from memory once
; (vbroadcastf128 on AVX/AVX2, vbroadcasti128 on AVX-512) and stores %xmm0,
; the low 128 bits of the broadcast result.
543 define <4 x i64> @test_broadcast_2i64_4i64_reuse(ptr %p0, ptr %p1) {
544 ; X86-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
546 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
547 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
548 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
549 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
552 ; X86-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
553 ; X86-AVX512: # %bb.0:
554 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
555 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
556 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
557 ; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax)
558 ; X86-AVX512-NEXT: retl
560 ; X64-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
562 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
563 ; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
566 ; X64-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
567 ; X64-AVX512: # %bb.0:
568 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
569 ; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
570 ; X64-AVX512-NEXT: retq
571 %1 = load <2 x i64>, ptr %p0
572 store <2 x i64> %1, ptr %p1
573 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Loads a <4 x float> from %p0, stores that same value to %p1, and also
; repeats it twice to build an <8 x float>. The expected code broadcasts from
; memory once with vbroadcastf128 and stores %xmm0, the low 128 bits of the
; broadcast result.
577 define <8 x float> @test_broadcast_4f32_8f32_reuse(ptr %p0, ptr %p1) {
578 ; X86-AVX-LABEL: test_broadcast_4f32_8f32_reuse:
580 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
581 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
582 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
583 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
586 ; X86-AVX512-LABEL: test_broadcast_4f32_8f32_reuse:
587 ; X86-AVX512: # %bb.0:
588 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
589 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
590 ; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
591 ; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax)
592 ; X86-AVX512-NEXT: retl
594 ; X64-AVX-LABEL: test_broadcast_4f32_8f32_reuse:
596 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
597 ; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
600 ; X64-AVX512-LABEL: test_broadcast_4f32_8f32_reuse:
601 ; X64-AVX512: # %bb.0:
602 ; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
603 ; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
604 ; X64-AVX512-NEXT: retq
605 %1 = load <4 x float>, ptr %p0
606 store <4 x float> %1, ptr %p1
607 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Loads a <4 x i32> from %p0, stores that same value to %p1, and also repeats
; it twice to build an <8 x i32>. The expected code broadcasts from memory once
; (vbroadcastf128 on AVX/AVX2, vbroadcasti128 on AVX-512) and stores %xmm0,
; the low 128 bits of the broadcast result.
611 define <8 x i32> @test_broadcast_4i32_8i32_reuse(ptr %p0, ptr %p1) {
612 ; X86-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
614 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
615 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
616 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
617 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
620 ; X86-AVX512-LABEL: test_broadcast_4i32_8i32_reuse:
621 ; X86-AVX512: # %bb.0:
622 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
623 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
624 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
625 ; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax)
626 ; X86-AVX512-NEXT: retl
628 ; X64-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
630 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
631 ; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
634 ; X64-AVX512-LABEL: test_broadcast_4i32_8i32_reuse:
635 ; X64-AVX512: # %bb.0:
636 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
637 ; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
638 ; X64-AVX512-NEXT: retq
639 %1 = load <4 x i32>, ptr %p0
640 store <4 x i32> %1, ptr %p1
641 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Loads an <8 x i16> from %p0, stores that same value to %p1, and also repeats
; it twice to build a <16 x i16>. The expected code broadcasts from memory once
; (vbroadcastf128 on AVX/AVX2, vbroadcasti128 on AVX-512) and stores %xmm0,
; the low 128 bits of the broadcast result.
645 define <16 x i16> @test_broadcast_8i16_16i16_reuse(ptr%p0, ptr%p1) nounwind {
646 ; X86-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
648 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
649 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
650 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
651 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
654 ; X86-AVX512-LABEL: test_broadcast_8i16_16i16_reuse:
655 ; X86-AVX512: # %bb.0:
656 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
657 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
658 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
659 ; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax)
660 ; X86-AVX512-NEXT: retl
662 ; X64-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
664 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
665 ; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
668 ; X64-AVX512-LABEL: test_broadcast_8i16_16i16_reuse:
669 ; X64-AVX512: # %bb.0:
670 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
671 ; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
672 ; X64-AVX512-NEXT: retq
673 %1 = load <8 x i16>, ptr%p0
674 store <8 x i16> %1, ptr %p1
675 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Loads a <16 x i8> from %p0, stores that same value to %p1, and also repeats
; it twice to build a <32 x i8>. The expected code broadcasts from memory once
; (vbroadcastf128 on AVX/AVX2, vbroadcasti128 on AVX-512) and stores %xmm0,
; the low 128 bits of the broadcast result.
679 define <32 x i8> @test_broadcast_16i8_32i8_reuse(ptr%p0, ptr%p1) nounwind {
680 ; X86-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
682 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
683 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
684 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
685 ; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
688 ; X86-AVX512-LABEL: test_broadcast_16i8_32i8_reuse:
689 ; X86-AVX512: # %bb.0:
690 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
691 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
692 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
693 ; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax)
694 ; X86-AVX512-NEXT: retl
696 ; X64-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
698 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
699 ; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
702 ; X64-AVX512-LABEL: test_broadcast_16i8_32i8_reuse:
703 ; X64-AVX512: # %bb.0:
704 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
705 ; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
706 ; X64-AVX512-NEXT: retq
707 %1 = load <16 x i8>, ptr%p0
708 store <16 x i8> %1, ptr %p1
709 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
714 ; Subvector Load + Broadcast with Separate Store
; Loads a <4 x i32> from %p0, then stores an unrelated all-zero <4 x float> to
; %p1, then repeats the loaded value twice to build an <8 x i32>. The expected
; code zeroes %xmm1, issues the broadcast from memory, and only afterwards
; stores %xmm1.
; NOTE(review): presumably this guards the load/store ordering when the load
; is folded into the broadcast, since %p1 may alias %p0 -- confirm.
717 define <8 x i32> @test_broadcast_4i32_8i32_chain(ptr %p0, ptr %p1) {
718 ; X86-AVX-LABEL: test_broadcast_4i32_8i32_chain:
720 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
721 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
722 ; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
723 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
724 ; X86-AVX-NEXT: vmovaps %xmm1, (%eax)
727 ; X86-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
728 ; X86-AVX512: # %bb.0:
729 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
730 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
731 ; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
732 ; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
733 ; X86-AVX512-NEXT: vmovaps %xmm1, (%eax)
734 ; X86-AVX512-NEXT: retl
736 ; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
738 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
739 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
740 ; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
743 ; X64-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
744 ; X64-AVX512: # %bb.0:
745 ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
746 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
747 ; X64-AVX512-NEXT: vmovaps %xmm1, (%rsi)
748 ; X64-AVX512-NEXT: retq
749 %1 = load <4 x i32>, ptr %p0
750 store <4 x float> zeroinitializer, ptr %p1
751 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Loads a <4 x i32> from %p0, then stores an unrelated all-zero <4 x float> to
; %p1, then repeats the loaded value four times to build a <16 x i32>. The
; expected code zeroes %xmm1, issues the broadcast from memory, and only
; afterwards stores %xmm1. AVX/AVX2 targets then copy ymm0 to ymm1.
; NOTE(review): presumably this guards the load/store ordering when the load
; is folded into the broadcast, since %p1 may alias %p0 -- confirm.
755 define <16 x i32> @test_broadcast_4i32_16i32_chain(ptr %p0, ptr %p1) {
756 ; X86-AVX-LABEL: test_broadcast_4i32_16i32_chain:
758 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
759 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
760 ; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
761 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
762 ; X86-AVX-NEXT: vmovaps %xmm1, (%eax)
763 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
766 ; X86-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
767 ; X86-AVX512: # %bb.0:
768 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
769 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
770 ; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
771 ; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
772 ; X86-AVX512-NEXT: vmovaps %xmm1, (%eax)
773 ; X86-AVX512-NEXT: retl
775 ; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
777 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
778 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
779 ; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
780 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
783 ; X64-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
784 ; X64-AVX512: # %bb.0:
785 ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
786 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
787 ; X64-AVX512-NEXT: vmovaps %xmm1, (%rsi)
788 ; X64-AVX512-NEXT: retq
789 %1 = load <4 x i32>, ptr %p0
790 store <4 x float> zeroinitializer, ptr %p1
791 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
796 ; Subvector Load with Multiple Uses + Broadcast
797 ; The lowering should still fall back to a broadcast in this case.
; Zero-initialized global vectors (<4 x i64> and <8 x i64>) that the expected
; assembly of fallback_broadcast_v4i64_to_v8i64 stores to. Both are declared
; with only 8-byte alignment.
; NOTE(review): the reduced alignment is presumably why the expected stores
; are the unaligned forms (vmovdqu / vmovups / vmovdqu64) -- confirm.
800 @ga4 = dso_local global <4 x i64> zeroinitializer, align 8
801 @gb4 = dso_local global <8 x i64> zeroinitializer, align 8
803 define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
804 ; X86-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
805 ; X86-AVX1: # %bb.0: # %entry
806 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,0,2,0]
807 ; X86-AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm3
808 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
809 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,0,4,0]
810 ; X86-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
811 ; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [1,0,2,0,3,0,4,0]
812 ; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
813 ; X86-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm7
814 ; X86-AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
815 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2
816 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
817 ; X86-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm5
818 ; X86-AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
819 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
820 ; X86-AVX1-NEXT: vandps %ymm6, %ymm1, %ymm1
821 ; X86-AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
822 ; X86-AVX1-NEXT: vmovdqu %xmm0, ga4+16
823 ; X86-AVX1-NEXT: vmovdqu %xmm3, ga4
824 ; X86-AVX1-NEXT: vmovups %ymm2, gb4+32
825 ; X86-AVX1-NEXT: vmovups %ymm1, gb4
826 ; X86-AVX1-NEXT: vzeroupper
827 ; X86-AVX1-NEXT: retl
829 ; X86-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
830 ; X86-AVX2: # %bb.0: # %entry
831 ; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
832 ; X86-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
833 ; X86-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
834 ; X86-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
835 ; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
836 ; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
837 ; X86-AVX2-NEXT: vmovdqu %ymm0, ga4
838 ; X86-AVX2-NEXT: vmovdqu %ymm2, gb4+32
839 ; X86-AVX2-NEXT: vmovdqu %ymm1, gb4
840 ; X86-AVX2-NEXT: vzeroupper
841 ; X86-AVX2-NEXT: retl
843 ; X86-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
844 ; X86-AVX512: # %bb.0: # %entry
845 ; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0]
846 ; X86-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
847 ; X86-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0
848 ; X86-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1
849 ; X86-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
850 ; X86-AVX512-NEXT: vmovdqu %ymm0, ga4
851 ; X86-AVX512-NEXT: vmovdqu64 %zmm1, gb4
852 ; X86-AVX512-NEXT: vzeroupper
853 ; X86-AVX512-NEXT: retl
855 ; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
856 ; X64-AVX1: # %bb.0: # %entry
857 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2]
858 ; X64-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4
859 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
860 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,4]
861 ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
862 ; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [1,2,3,4]
863 ; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
864 ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm7
865 ; X64-AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
866 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2
867 ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
868 ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm5
869 ; X64-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
870 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
871 ; X64-AVX1-NEXT: vandps %ymm6, %ymm1, %ymm1
872 ; X64-AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
873 ; X64-AVX1-NEXT: vmovdqu %xmm0, ga4+16(%rip)
874 ; X64-AVX1-NEXT: vmovdqu %xmm4, ga4(%rip)
875 ; X64-AVX1-NEXT: vmovups %ymm2, gb4+32(%rip)
876 ; X64-AVX1-NEXT: vmovups %ymm1, gb4(%rip)
877 ; X64-AVX1-NEXT: vzeroupper
878 ; X64-AVX1-NEXT: retq
880 ; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
881 ; X64-AVX2: # %bb.0: # %entry
882 ; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4]
883 ; X64-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
884 ; X64-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
885 ; X64-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
886 ; X64-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
887 ; X64-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
888 ; X64-AVX2-NEXT: vmovdqu %ymm0, ga4(%rip)
889 ; X64-AVX2-NEXT: vmovdqu %ymm2, gb4+32(%rip)
890 ; X64-AVX2-NEXT: vmovdqu %ymm1, gb4(%rip)
891 ; X64-AVX2-NEXT: vzeroupper
892 ; X64-AVX2-NEXT: retq
894 ; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
895 ; X64-AVX512: # %bb.0: # %entry
896 ; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,2,3,4,1,2,3,4]
897 ; X64-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
898 ; X64-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0
899 ; X64-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1
900 ; X64-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
901 ; X64-AVX512-NEXT: vmovdqu %ymm0, ga4(%rip)
902 ; X64-AVX512-NEXT: vmovdqu64 %zmm1, gb4(%rip)
903 ; X64-AVX512-NEXT: vzeroupper
904 ; X64-AVX512-NEXT: retq
906 %0 = add <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
907 %1 = add <8 x i64> %b, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
908 %2 = and <8 x i64> %1, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
909 store <4 x i64> %0, ptr @ga4, align 8
910 store <8 x i64> %2, ptr @gb4, align 8
915 @ga2 = dso_local global <4 x double> zeroinitializer, align 8
916 @gb2 = dso_local global <8 x double> zeroinitializer, align 8
; Floating-point variant: <1.0,2.0,3.0,4.0> is added to the <4 x double> %a, and
; the pattern repeated twice is added to the <8 x double> %b, whose sum is then
; divided by the same repeated pattern. The results are stored to @ga2 and @gb2.
; The AVX1/AVX2 checks expect one ymm constant load reused for every operation;
; the AVX512 checks expect a single vbroadcastf64x4 of the constant.
918 define dso_local void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) {
919 ; X86-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
920 ; X86-AVX: # %bb.0: # %entry
921 ; X86-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
922 ; X86-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0
923 ; X86-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2
924 ; X86-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1
925 ; X86-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1
926 ; X86-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2
927 ; X86-AVX-NEXT: vmovupd %ymm0, ga2
928 ; X86-AVX-NEXT: vmovupd %ymm2, gb2+32
929 ; X86-AVX-NEXT: vmovupd %ymm1, gb2
930 ; X86-AVX-NEXT: vzeroupper
933 ; X86-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
934 ; X86-AVX512: # %bb.0: # %entry
935 ; X86-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0]
936 ; X86-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
937 ; X86-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
938 ; X86-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
939 ; X86-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
940 ; X86-AVX512-NEXT: vmovupd %ymm0, ga2
941 ; X86-AVX512-NEXT: vmovupd %zmm1, gb2
942 ; X86-AVX512-NEXT: vzeroupper
943 ; X86-AVX512-NEXT: retl
945 ; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
946 ; X64-AVX: # %bb.0: # %entry
947 ; X64-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
948 ; X64-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0
949 ; X64-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2
950 ; X64-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1
951 ; X64-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1
952 ; X64-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2
953 ; X64-AVX-NEXT: vmovupd %ymm0, ga2(%rip)
954 ; X64-AVX-NEXT: vmovupd %ymm2, gb2+32(%rip)
955 ; X64-AVX-NEXT: vmovupd %ymm1, gb2(%rip)
956 ; X64-AVX-NEXT: vzeroupper
959 ; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
960 ; X64-AVX512: # %bb.0: # %entry
961 ; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0]
962 ; X64-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
963 ; X64-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
964 ; X64-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
965 ; X64-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
966 ; X64-AVX512-NEXT: vmovupd %ymm0, ga2(%rip)
967 ; X64-AVX512-NEXT: vmovupd %zmm1, gb2(%rip)
968 ; X64-AVX512-NEXT: vzeroupper
969 ; X64-AVX512-NEXT: retq
971 %0 = fadd <4 x double> %a, <double 1.0, double 2.0, double 3.0, double 4.0>
972 %1 = fadd <8 x double> %b, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
973 %2 = fdiv <8 x double> %1, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
974 store <4 x double> %0, ptr @ga2, align 8
975 store <8 x double> %2, ptr @gb2, align 8
979 @ha4 = dso_local global <4 x i32> zeroinitializer, align 8
980 @hb4 = dso_local global <8 x i32> zeroinitializer, align 8
981 @hc4 = dso_local global <16 x i32> zeroinitializer, align 8
; Uses the 128-bit pattern <1,2,3,4> at three widths: added to the <4 x i32> %a,
; and (repeated 2x and 4x) added to and then ANDed with the <8 x i32> %b and the
; <16 x i32> %c. The results are stored to @ha4, @hb4 and @hc4. Every run's
; checks expect a single 128-bit broadcast of the constant (vbroadcastf128,
; vbroadcasti128 or vbroadcasti32x4) whose register is reused by all operations.
; In the i686 AVX1/AVX2 checks part of %c is read from the stack via 8(%ebp).
983 define dso_local void @fallback_broadcast_v4i32_v8i32_v16i32(<4 x i32> %a, <8 x i32> %b, <16 x i32> %c) nounwind {
984 ; X86-AVX1-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
985 ; X86-AVX1: # %bb.0: # %entry
986 ; X86-AVX1-NEXT: pushl %ebp
987 ; X86-AVX1-NEXT: movl %esp, %ebp
988 ; X86-AVX1-NEXT: andl $-32, %esp
989 ; X86-AVX1-NEXT: subl $32, %esp
990 ; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4]
991 ; X86-AVX1-NEXT: # ymm3 = mem[0,1,0,1]
992 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
993 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
994 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4
995 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
996 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
997 ; X86-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
998 ; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
999 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4
1000 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
1001 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
1002 ; X86-AVX1-NEXT: vpaddd 8(%ebp), %xmm3, %xmm4
1003 ; X86-AVX1-NEXT: vpaddd 24(%ebp), %xmm3, %xmm5
1004 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
1005 ; X86-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
1006 ; X86-AVX1-NEXT: vandps %ymm3, %ymm4, %ymm3
1007 ; X86-AVX1-NEXT: vmovdqu %xmm0, ha4
1008 ; X86-AVX1-NEXT: vmovups %ymm1, hb4
1009 ; X86-AVX1-NEXT: vmovups %ymm3, hc4+32
1010 ; X86-AVX1-NEXT: vmovups %ymm2, hc4
1011 ; X86-AVX1-NEXT: movl %ebp, %esp
1012 ; X86-AVX1-NEXT: popl %ebp
1013 ; X86-AVX1-NEXT: vzeroupper
1014 ; X86-AVX1-NEXT: retl
1016 ; X86-AVX2-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
1017 ; X86-AVX2: # %bb.0: # %entry
1018 ; X86-AVX2-NEXT: pushl %ebp
1019 ; X86-AVX2-NEXT: movl %esp, %ebp
1020 ; X86-AVX2-NEXT: andl $-32, %esp
1021 ; X86-AVX2-NEXT: subl $32, %esp
1022 ; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4]
1023 ; X86-AVX2-NEXT: # ymm3 = mem[0,1,0,1]
1024 ; X86-AVX2-NEXT: vpaddd %xmm3, %xmm0, %xmm0
1025 ; X86-AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
1026 ; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
1027 ; X86-AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2
1028 ; X86-AVX2-NEXT: vpaddd 8(%ebp), %ymm3, %ymm4
1029 ; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
1030 ; X86-AVX2-NEXT: vpand %ymm3, %ymm4, %ymm3
1031 ; X86-AVX2-NEXT: vmovdqu %xmm0, ha4
1032 ; X86-AVX2-NEXT: vmovdqu %ymm1, hb4
1033 ; X86-AVX2-NEXT: vmovdqu %ymm3, hc4+32
1034 ; X86-AVX2-NEXT: vmovdqu %ymm2, hc4
1035 ; X86-AVX2-NEXT: movl %ebp, %esp
1036 ; X86-AVX2-NEXT: popl %ebp
1037 ; X86-AVX2-NEXT: vzeroupper
1038 ; X86-AVX2-NEXT: retl
1040 ; X86-AVX512-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
1041 ; X86-AVX512: # %bb.0: # %entry
1042 ; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4]
1043 ; X86-AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1044 ; X86-AVX512-NEXT: vpaddd %xmm3, %xmm0, %xmm0
1045 ; X86-AVX512-NEXT: vpaddd %ymm3, %ymm1, %ymm1
1046 ; X86-AVX512-NEXT: vpand %ymm3, %ymm1, %ymm1
1047 ; X86-AVX512-NEXT: vpaddd %zmm3, %zmm2, %zmm2
1048 ; X86-AVX512-NEXT: vpandd %zmm3, %zmm2, %zmm2
1049 ; X86-AVX512-NEXT: vmovdqu %xmm0, ha4
1050 ; X86-AVX512-NEXT: vmovdqu %ymm1, hb4
1051 ; X86-AVX512-NEXT: vmovdqu64 %zmm2, hc4
1052 ; X86-AVX512-NEXT: vzeroupper
1053 ; X86-AVX512-NEXT: retl
1055 ; X64-AVX1-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
1056 ; X64-AVX1: # %bb.0: # %entry
1057 ; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4]
1058 ; X64-AVX1-NEXT: # ymm4 = mem[0,1,0,1]
1059 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
1060 ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
1061 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5
1062 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
1063 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
1064 ; X64-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
1065 ; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
1066 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5
1067 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
1068 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
1069 ; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
1070 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5
1071 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
1072 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
1073 ; X64-AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
1074 ; X64-AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
1075 ; X64-AVX1-NEXT: vmovdqu %xmm0, ha4(%rip)
1076 ; X64-AVX1-NEXT: vmovups %ymm1, hb4(%rip)
1077 ; X64-AVX1-NEXT: vmovups %ymm3, hc4+32(%rip)
1078 ; X64-AVX1-NEXT: vmovups %ymm2, hc4(%rip)
1079 ; X64-AVX1-NEXT: vzeroupper
1080 ; X64-AVX1-NEXT: retq
1082 ; X64-AVX2-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
1083 ; X64-AVX2: # %bb.0: # %entry
1084 ; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4]
1085 ; X64-AVX2-NEXT: # ymm4 = mem[0,1,0,1]
1086 ; X64-AVX2-NEXT: vpaddd %xmm4, %xmm0, %xmm0
1087 ; X64-AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
1088 ; X64-AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
1089 ; X64-AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
1090 ; X64-AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
1091 ; X64-AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
1092 ; X64-AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
1093 ; X64-AVX2-NEXT: vmovdqu %xmm0, ha4(%rip)
1094 ; X64-AVX2-NEXT: vmovdqu %ymm1, hb4(%rip)
1095 ; X64-AVX2-NEXT: vmovdqu %ymm3, hc4+32(%rip)
1096 ; X64-AVX2-NEXT: vmovdqu %ymm2, hc4(%rip)
1097 ; X64-AVX2-NEXT: vzeroupper
1098 ; X64-AVX2-NEXT: retq
1100 ; X64-AVX512-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
1101 ; X64-AVX512: # %bb.0: # %entry
1102 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4]
1103 ; X64-AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1104 ; X64-AVX512-NEXT: vpaddd %xmm3, %xmm0, %xmm0
1105 ; X64-AVX512-NEXT: vpaddd %ymm3, %ymm1, %ymm1
1106 ; X64-AVX512-NEXT: vpand %ymm3, %ymm1, %ymm1
1107 ; X64-AVX512-NEXT: vpaddd %zmm3, %zmm2, %zmm2
1108 ; X64-AVX512-NEXT: vpandd %zmm3, %zmm2, %zmm2
1109 ; X64-AVX512-NEXT: vmovdqu %xmm0, ha4(%rip)
1110 ; X64-AVX512-NEXT: vmovdqu %ymm1, hb4(%rip)
1111 ; X64-AVX512-NEXT: vmovdqu64 %zmm2, hc4(%rip)
1112 ; X64-AVX512-NEXT: vzeroupper
1113 ; X64-AVX512-NEXT: retq
1115 %0 = add <4 x i32> %a, <i32 1, i32 2, i32 3, i32 4>
1116 %1 = add <8 x i32> %b, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
1117 %2 = and <8 x i32> %1, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
1118 %3 = add <16 x i32> %c, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
1119 %4 = and <16 x i32> %3, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
1120 store <4 x i32> %0, ptr @ha4, align 8
1121 store <8 x i32> %2, ptr @hb4, align 8
1122 store <16 x i32> %4, ptr @hc4, align 8
1127 ; Subvector Broadcast from register
; Repeats the <2 x double> register argument into both 128-bit halves of a
; <4 x double>; the checks expect a single vinsertf128 of xmm0 into ymm0.
1130 define <4 x double> @reg_broadcast_2f64_4f64(<2 x double> %a0) nounwind {
1131 ; X86-LABEL: reg_broadcast_2f64_4f64:
1133 ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1134 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1137 ; X64-LABEL: reg_broadcast_2f64_4f64:
1139 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1140 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1142 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Repeats the <2 x double> register argument four times to fill an <8 x double>.
; The AVX1/AVX2 checks expect vinsertf128 followed by a ymm0 -> ymm1 copy; the
; AVX512 checks expect one vshuff64x2 of zmm0.
1146 define <8 x double> @reg_broadcast_2f64_8f64(<2 x double> %a0) nounwind {
1147 ; X86-AVX-LABEL: reg_broadcast_2f64_8f64:
1149 ; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1150 ; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1151 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1152 ; X86-AVX-NEXT: retl
1154 ; X86-AVX512-LABEL: reg_broadcast_2f64_8f64:
1155 ; X86-AVX512: # %bb.0:
1156 ; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1157 ; X86-AVX512-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
1158 ; X86-AVX512-NEXT: retl
1160 ; X64-AVX-LABEL: reg_broadcast_2f64_8f64:
1162 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1163 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1164 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1165 ; X64-AVX-NEXT: retq
1167 ; X64-AVX512-LABEL: reg_broadcast_2f64_8f64:
1168 ; X64-AVX512: # %bb.0:
1169 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1170 ; X64-AVX512-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
1171 ; X64-AVX512-NEXT: retq
1172 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; Repeats the <4 x double> register argument twice to fill an <8 x double>.
; The AVX1/AVX2 checks expect only a ymm0 -> ymm1 copy; the AVX512 checks expect
; vinsertf64x4 $1 of ymm0 into zmm0.
1176 define <8 x double> @reg_broadcast_4f64_8f64(<4 x double> %a0) nounwind {
1177 ; X86-AVX-LABEL: reg_broadcast_4f64_8f64:
1179 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1180 ; X86-AVX-NEXT: retl
1182 ; X86-AVX512-LABEL: reg_broadcast_4f64_8f64:
1183 ; X86-AVX512: # %bb.0:
1184 ; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1185 ; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1186 ; X86-AVX512-NEXT: retl
1188 ; X64-AVX-LABEL: reg_broadcast_4f64_8f64:
1190 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1191 ; X64-AVX-NEXT: retq
1193 ; X64-AVX512-LABEL: reg_broadcast_4f64_8f64:
1194 ; X64-AVX512: # %bb.0:
1195 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1196 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1197 ; X64-AVX512-NEXT: retq
1198 %1 = shufflevector <4 x double> %a0, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Repeats the <2 x i64> register argument into both 128-bit halves of a
; <4 x i64>; the checks expect vinsertf128 (the float-named form) of xmm0 into ymm0.
1202 define <4 x i64> @reg_broadcast_2i64_4i64(<2 x i64> %a0) nounwind {
1203 ; X86-LABEL: reg_broadcast_2i64_4i64:
1205 ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1206 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1209 ; X64-LABEL: reg_broadcast_2i64_4i64:
1211 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1212 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1214 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Repeats the <2 x i64> register argument four times to fill an <8 x i64>.
; The AVX1/AVX2 checks expect vinsertf128 followed by a ymm0 -> ymm1 copy; the
; AVX512 checks expect one vshufi64x2 of zmm0.
1218 define <8 x i64> @reg_broadcast_2i64_8i64(<2 x i64> %a0) nounwind {
1219 ; X86-AVX-LABEL: reg_broadcast_2i64_8i64:
1221 ; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1222 ; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1223 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1224 ; X86-AVX-NEXT: retl
1226 ; X86-AVX512-LABEL: reg_broadcast_2i64_8i64:
1227 ; X86-AVX512: # %bb.0:
1228 ; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1229 ; X86-AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
1230 ; X86-AVX512-NEXT: retl
1232 ; X64-AVX-LABEL: reg_broadcast_2i64_8i64:
1234 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1235 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1236 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1237 ; X64-AVX-NEXT: retq
1239 ; X64-AVX512-LABEL: reg_broadcast_2i64_8i64:
1240 ; X64-AVX512: # %bb.0:
1241 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1242 ; X64-AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
1243 ; X64-AVX512-NEXT: retq
1244 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; Repeats the <4 x i64> register argument twice to fill an <8 x i64>.
; The AVX1/AVX2 checks expect only a ymm0 -> ymm1 copy; the AVX512 checks expect
; vinsertf64x4 $1 (the float-named form) of ymm0 into zmm0.
1248 define <8 x i64> @reg_broadcast_4i64_8i64(<4 x i64> %a0) nounwind {
1249 ; X86-AVX-LABEL: reg_broadcast_4i64_8i64:
1251 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1252 ; X86-AVX-NEXT: retl
1254 ; X86-AVX512-LABEL: reg_broadcast_4i64_8i64:
1255 ; X86-AVX512: # %bb.0:
1256 ; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1257 ; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1258 ; X86-AVX512-NEXT: retl
1260 ; X64-AVX-LABEL: reg_broadcast_4i64_8i64:
1262 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1263 ; X64-AVX-NEXT: retq
1265 ; X64-AVX512-LABEL: reg_broadcast_4i64_8i64:
1266 ; X64-AVX512: # %bb.0:
1267 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1268 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1269 ; X64-AVX512-NEXT: retq
1270 %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Repeats the <4 x float> register argument into both 128-bit halves of an
; <8 x float>; the checks expect a single vinsertf128 of xmm0 into ymm0.
1274 define <8 x float> @reg_broadcast_4f32_8f32(<4 x float> %a0) nounwind {
1275 ; X86-LABEL: reg_broadcast_4f32_8f32:
1277 ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1278 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1281 ; X64-LABEL: reg_broadcast_4f32_8f32:
1283 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1284 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1286 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Repeats the <4 x float> register argument four times to fill a <16 x float>.
; The AVX1/AVX2 checks expect vinsertf128 followed by a ymm0 -> ymm1 copy; the
; AVX512 checks expect one vshuff64x2 of zmm0.
1290 define <16 x float> @reg_broadcast_4f32_16f32(<4 x float> %a0) nounwind {
1291 ; X86-AVX-LABEL: reg_broadcast_4f32_16f32:
1293 ; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1294 ; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1295 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1296 ; X86-AVX-NEXT: retl
1298 ; X86-AVX512-LABEL: reg_broadcast_4f32_16f32:
1299 ; X86-AVX512: # %bb.0:
1300 ; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1301 ; X86-AVX512-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
1302 ; X86-AVX512-NEXT: retl
1304 ; X64-AVX-LABEL: reg_broadcast_4f32_16f32:
1306 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1307 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1308 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1309 ; X64-AVX-NEXT: retq
1311 ; X64-AVX512-LABEL: reg_broadcast_4f32_16f32:
1312 ; X64-AVX512: # %bb.0:
1313 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1314 ; X64-AVX512-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
1315 ; X64-AVX512-NEXT: retq
1316 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Repeats the <8 x float> register argument twice to fill a <16 x float>.
; The AVX1/AVX2 checks expect only a ymm0 -> ymm1 copy; the AVX512 checks expect
; vinsertf64x4 $1 of ymm0 into zmm0.
1320 define <16 x float> @reg_broadcast_8f32_16f32(<8 x float> %a0) nounwind {
1321 ; X86-AVX-LABEL: reg_broadcast_8f32_16f32:
1323 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1324 ; X86-AVX-NEXT: retl
1326 ; X86-AVX512-LABEL: reg_broadcast_8f32_16f32:
1327 ; X86-AVX512: # %bb.0:
1328 ; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1329 ; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1330 ; X86-AVX512-NEXT: retl
1332 ; X64-AVX-LABEL: reg_broadcast_8f32_16f32:
1334 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1335 ; X64-AVX-NEXT: retq
1337 ; X64-AVX512-LABEL: reg_broadcast_8f32_16f32:
1338 ; X64-AVX512: # %bb.0:
1339 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1340 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1341 ; X64-AVX512-NEXT: retq
1342 %1 = shufflevector <8 x float> %a0, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Repeats the <4 x i32> register argument into both 128-bit halves of an
; <8 x i32>; the checks expect vinsertf128 (the float-named form) of xmm0 into ymm0.
1346 define <8 x i32> @reg_broadcast_4i32_8i32(<4 x i32> %a0) nounwind {
1347 ; X86-LABEL: reg_broadcast_4i32_8i32:
1349 ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1350 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1353 ; X64-LABEL: reg_broadcast_4i32_8i32:
1355 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1356 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1358 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Repeats the <4 x i32> register argument four times to fill a <16 x i32>.
; The AVX1/AVX2 checks expect vinsertf128 followed by a ymm0 -> ymm1 copy; the
; AVX512 checks expect one vshufi64x2 of zmm0.
1362 define <16 x i32> @reg_broadcast_4i32_16i32(<4 x i32> %a0) nounwind {
1363 ; X86-AVX-LABEL: reg_broadcast_4i32_16i32:
1365 ; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1366 ; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1367 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1368 ; X86-AVX-NEXT: retl
1370 ; X86-AVX512-LABEL: reg_broadcast_4i32_16i32:
1371 ; X86-AVX512: # %bb.0:
1372 ; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1373 ; X86-AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
1374 ; X86-AVX512-NEXT: retl
1376 ; X64-AVX-LABEL: reg_broadcast_4i32_16i32:
1378 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1379 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1380 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1381 ; X64-AVX-NEXT: retq
1383 ; X64-AVX512-LABEL: reg_broadcast_4i32_16i32:
1384 ; X64-AVX512: # %bb.0:
1385 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1386 ; X64-AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
1387 ; X64-AVX512-NEXT: retq
1388 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Repeats the <8 x i32> register argument twice to fill a <16 x i32>.
; The AVX1/AVX2 checks expect only a ymm0 -> ymm1 copy; the AVX512 checks expect
; vinsertf64x4 $1 (the float-named form) of ymm0 into zmm0.
1392 define <16 x i32> @reg_broadcast_8i32_16i32(<8 x i32> %a0) nounwind {
1393 ; X86-AVX-LABEL: reg_broadcast_8i32_16i32:
1395 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1396 ; X86-AVX-NEXT: retl
1398 ; X86-AVX512-LABEL: reg_broadcast_8i32_16i32:
1399 ; X86-AVX512: # %bb.0:
1400 ; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1401 ; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1402 ; X86-AVX512-NEXT: retl
1404 ; X64-AVX-LABEL: reg_broadcast_8i32_16i32:
1406 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1407 ; X64-AVX-NEXT: retq
1409 ; X64-AVX512-LABEL: reg_broadcast_8i32_16i32:
1410 ; X64-AVX512: # %bb.0:
1411 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1412 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1413 ; X64-AVX512-NEXT: retq
1414 %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Repeats the <8 x i16> register argument into both 128-bit halves of a
; <16 x i16>; the checks expect vinsertf128 (the float-named form) of xmm0 into ymm0.
1418 define <16 x i16> @reg_broadcast_8i16_16i16(<8 x i16> %a0) nounwind {
1419 ; X86-LABEL: reg_broadcast_8i16_16i16:
1421 ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1422 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1425 ; X64-LABEL: reg_broadcast_8i16_16i16:
1427 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1428 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1430 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Repeats the <8 x i16> register argument four times to fill a <32 x i16>.
; The AVX1/AVX2 checks expect vinsertf128 followed by a ymm0 -> ymm1 copy; the
; AVX512 checks expect one vshufi64x2 of zmm0.
1434 define <32 x i16> @reg_broadcast_8i16_32i16(<8 x i16> %a0) nounwind {
1435 ; X86-AVX-LABEL: reg_broadcast_8i16_32i16:
1437 ; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1438 ; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1439 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1440 ; X86-AVX-NEXT: retl
1442 ; X86-AVX512-LABEL: reg_broadcast_8i16_32i16:
1443 ; X86-AVX512: # %bb.0:
1444 ; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1445 ; X86-AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
1446 ; X86-AVX512-NEXT: retl
1448 ; X64-AVX-LABEL: reg_broadcast_8i16_32i16:
1450 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1451 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1452 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1453 ; X64-AVX-NEXT: retq
1455 ; X64-AVX512-LABEL: reg_broadcast_8i16_32i16:
1456 ; X64-AVX512: # %bb.0:
1457 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1458 ; X64-AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
1459 ; X64-AVX512-NEXT: retq
1460 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Repeats the <16 x i16> register argument twice to fill a <32 x i16>.
; The AVX1/AVX2 checks expect only a ymm0 -> ymm1 copy; the AVX512 checks expect
; vinsertf64x4 $1 (the float-named form) of ymm0 into zmm0.
1464 define <32 x i16> @reg_broadcast_16i16_32i16(<16 x i16> %a0) nounwind {
1465 ; X86-AVX-LABEL: reg_broadcast_16i16_32i16:
1467 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1468 ; X86-AVX-NEXT: retl
1470 ; X86-AVX512-LABEL: reg_broadcast_16i16_32i16:
1471 ; X86-AVX512: # %bb.0:
1472 ; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1473 ; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1474 ; X86-AVX512-NEXT: retl
1476 ; X64-AVX-LABEL: reg_broadcast_16i16_32i16:
1478 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1479 ; X64-AVX-NEXT: retq
1481 ; X64-AVX512-LABEL: reg_broadcast_16i16_32i16:
1482 ; X64-AVX512: # %bb.0:
1483 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1484 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1485 ; X64-AVX512-NEXT: retq
1486 %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Repeats the <16 x i8> register argument into both 128-bit halves of a
; <32 x i8>; the checks expect vinsertf128 (the float-named form) of xmm0 into ymm0.
1490 define <32 x i8> @reg_broadcast_16i8_32i8(<16 x i8> %a0) nounwind {
1491 ; X86-LABEL: reg_broadcast_16i8_32i8:
1493 ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1494 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1497 ; X64-LABEL: reg_broadcast_16i8_32i8:
1499 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1500 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1502 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Repeats the <16 x i8> register argument four times to fill a <64 x i8>.
; The AVX1/AVX2 checks expect vinsertf128 followed by a ymm0 -> ymm1 copy; the
; AVX512 checks expect one vshufi64x2 of zmm0.
1506 define <64 x i8> @reg_broadcast_16i8_64i8(<16 x i8> %a0) nounwind {
1507 ; X86-AVX-LABEL: reg_broadcast_16i8_64i8:
1509 ; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1510 ; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1511 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1512 ; X86-AVX-NEXT: retl
1514 ; X86-AVX512-LABEL: reg_broadcast_16i8_64i8:
1515 ; X86-AVX512: # %bb.0:
1516 ; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1517 ; X86-AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
1518 ; X86-AVX512-NEXT: retl
1520 ; X64-AVX-LABEL: reg_broadcast_16i8_64i8:
1522 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1523 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1524 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1525 ; X64-AVX-NEXT: retq
1527 ; X64-AVX512-LABEL: reg_broadcast_16i8_64i8:
1528 ; X64-AVX512: # %bb.0:
1529 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1530 ; X64-AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
1531 ; X64-AVX512-NEXT: retq
1532 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Repeats the <32 x i8> register argument twice to fill a <64 x i8>.
; The AVX1/AVX2 checks expect only a ymm0 -> ymm1 copy; the AVX512 checks expect
; vinsertf64x4 $1 (the float-named form) of ymm0 into zmm0.
1536 define <64 x i8> @reg_broadcast_32i8_64i8(<32 x i8> %a0) nounwind {
1537 ; X86-AVX-LABEL: reg_broadcast_32i8_64i8:
1539 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1540 ; X86-AVX-NEXT: retl
1542 ; X86-AVX512-LABEL: reg_broadcast_32i8_64i8:
1543 ; X86-AVX512: # %bb.0:
1544 ; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1545 ; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1546 ; X86-AVX512-NEXT: retl
1548 ; X64-AVX-LABEL: reg_broadcast_32i8_64i8:
1550 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1551 ; X64-AVX-NEXT: retq
1553 ; X64-AVX512-LABEL: reg_broadcast_32i8_64i8:
1554 ; X64-AVX512: # %bb.0:
1555 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1556 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1557 ; X64-AVX512-NEXT: retq
1558 %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; Loads a 64-bit <2 x i32> from %vp and repeats it twice into a <4 x i32>;
; the checks expect a single vmovddup from memory.
1566 define <4 x i32> @test_2xi32_to_4xi32_mem(ptr %vp) {
1567 ; X86-LABEL: test_2xi32_to_4xi32_mem:
1569 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1570 ; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
1573 ; X64-LABEL: test_2xi32_to_4xi32_mem:
1575 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
1577 %vec = load <2 x i32>, ptr %vp
1578 %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Loads a 64-bit <2 x i32> from %vp and repeats it four times into an <8 x i32>;
; the checks expect a single vbroadcastsd from memory into ymm0.
1582 define <8 x i32> @test_2xi32_to_8xi32_mem(ptr %vp) {
1583 ; X86-LABEL: test_2xi32_to_8xi32_mem:
1585 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1586 ; X86-NEXT: vbroadcastsd (%eax), %ymm0
1589 ; X64-LABEL: test_2xi32_to_8xi32_mem:
1591 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0
1593 %vec = load <2 x i32>, ptr %vp
1594 %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; Loads a 64-bit <2 x i32> from %vp and repeats it eight times into a <16 x i32>.
; The AVX1/AVX2 checks expect vbroadcastsd into ymm0 plus a ymm0 -> ymm1 copy; the
; AVX512 checks expect a single vbroadcastsd into zmm0.
1598 define <16 x i32> @test_2xi32_to_16xi32_mem(ptr %vp) {
1599 ; X86-AVX-LABEL: test_2xi32_to_16xi32_mem:
1601 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1602 ; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm0
1603 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1604 ; X86-AVX-NEXT: retl
1606 ; X86-AVX512-LABEL: test_2xi32_to_16xi32_mem:
1607 ; X86-AVX512: # %bb.0:
1608 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1609 ; X86-AVX512-NEXT: vbroadcastsd (%eax), %zmm0
1610 ; X86-AVX512-NEXT: retl
1612 ; X64-AVX-LABEL: test_2xi32_to_16xi32_mem:
1614 ; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm0
1615 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1616 ; X64-AVX-NEXT: retq
1618 ; X64-AVX512-LABEL: test_2xi32_to_16xi32_mem:
1619 ; X64-AVX512: # %bb.0:
1620 ; X64-AVX512-NEXT: vbroadcastsd (%rdi), %zmm0
1621 ; X64-AVX512-NEXT: retq
1622 %vec = load <2 x i32>, ptr %vp
1623 %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; Loads a scalar double, places it in element 0 of an otherwise-undef
; <2 x double>, then shuffles with mask <1,0,0,0>: result lane 0 reads the undef
; element and lanes 1-3 read the loaded scalar. The checks expect a plain
; vbroadcastsd from memory.
1631 define <4 x double> @broadcast_v4f64_f64_u000(ptr %p) {
1632 ; X86-LABEL: broadcast_v4f64_f64_u000:
1634 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1635 ; X86-NEXT: vbroadcastsd (%eax), %ymm0
1638 ; X64-LABEL: broadcast_v4f64_f64_u000:
1640 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0
1642 %s = load double, ptr %p
1643 %vec = insertelement <2 x double> undef, double %s, i32 0
1644 %res = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
1645 ret <4 x double> %res
; Loads a <2 x double> from %vp, widens it to <4 x double> with mask
; <0,3,undef,1>, then merges it with %default through a constant select mask
; <0,1,0,1>: lanes 0 and 2 come from %default, lanes 1 and 3 from the shuffle.
; Shuffle lane 1 uses index 3, which selects from the undef second operand,
; so only lane 3 (element 1 of the loaded vector) carries loaded data.
; The checks expect a vinsertf128 of the memory operand into the upper half
; of a temporary, followed by a vblendps that takes only float lanes 6-7 from
; that temporary and everything else from the incoming ymm0 (%default).
1648 define <4 x double> @broadcast_v4f64_v2f64_4u61(ptr %vp, <4 x double> %default) {
1649 ; X86-LABEL: broadcast_v4f64_v2f64_4u61:
1651 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1652 ; X86-NEXT: vinsertf128 $1, (%eax), %ymm0, %ymm1
1653 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1656 ; X64-LABEL: broadcast_v4f64_v2f64_4u61:
1658 ; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm1
1659 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1661 %vec = load <2 x double>, ptr %vp
1662 %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 1>
1663 %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %default
1664 ret <4 x double> %res
; Loads a <2 x float> from %vp, widens it to <8 x float> with mask
; <undef,1,undef,undef,0,2,3,undef>, then selects with constant mask
; <1,1,1,1,1,1,0,1>: only lane 6 is taken from %default, every other lane
; from the shuffle. Shuffle indices 2 and 3 refer to the undef second operand,
; so the lanes that carry loaded data are lane 1 (element 1) and lane 4
; (element 0).
; The checks expect a vbroadcastsd of the loaded 64-bit pair into ymm1
; followed by a single vshufpd combining it with the incoming ymm0 (%default).
1667 define <8 x float> @broadcast_v8f32_v2f32_u1uu0uEu(ptr %vp, <8 x float> %default) {
1668 ; X86-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
1670 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1671 ; X86-NEXT: vbroadcastsd (%eax), %ymm1
1672 ; X86-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
1675 ; X64-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
1677 ; X64-NEXT: vbroadcastsd (%rdi), %ymm1
1678 ; X64-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
1680 %vec = load <2 x float>, ptr %vp
1681 %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 0, i32 2, i32 3, i32 undef>
1682 %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %default
1683 ret <8 x float> %res
; Loads a <2 x double> from %vp and shuffles it into <8 x double> with mask
; <3,1,undef,1,0,1,0,1>. Index 3 selects from the undef second operand, so
; lanes 0 and 2 are unconstrained and every defined lane is consistent with
; repeating the loaded pair (0,1) across the result.
; Expected codegen:
;   - X86-AVX / X64-AVX prefixes: vbroadcastf128 of the 128-bit memory operand
;     into ymm0, then a vmovaps copy into ymm1 for the second 256-bit half.
;   - X86-AVX512 / X64-AVX512 prefixes: one vbroadcastf32x4 into zmm0.
1686 define <8 x double> @broadcast_v8f64_v2f64_u1u10101(ptr %vp) {
1687 ; X86-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
1689 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1690 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
1691 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1692 ; X86-AVX-NEXT: retl
1694 ; X86-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
1695 ; X86-AVX512: # %bb.0:
1696 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1697 ; X86-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1698 ; X86-AVX512-NEXT: retl
1700 ; X64-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
1702 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
1703 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1704 ; X64-AVX-NEXT: retq
1706 ; X64-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
1707 ; X64-AVX512: # %bb.0:
1708 ; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1709 ; X64-AVX512-NEXT: retq
1710 %vec = load <2 x double>, ptr %vp
1711 %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 3, i32 1, i32 undef, i32 1, i32 0, i32 1, i32 0, i32 1>
1712 ret <8 x double> %res
; Loads a <2 x double> from %vp and shuffles it into <8 x double> with mask
; <0,undef,undef,undef,0,1,0,1>. Lanes 1-3 are undef in the mask, and all
; defined lanes are consistent with repeating the loaded pair (0,1).
; Expected codegen:
;   - X86-AVX / X64-AVX prefixes: vbroadcastf128 of the 128-bit memory operand
;     into ymm0, then a vmovaps copy into ymm1 for the second 256-bit half.
;   - X86-AVX512 / X64-AVX512 prefixes: one vbroadcastf32x4 into zmm0.
1715 define <8 x double> @broadcast_v8f64_v2f64_0uuu0101(ptr %vp) {
1716 ; X86-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
1718 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1719 ; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
1720 ; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
1721 ; X86-AVX-NEXT: retl
1723 ; X86-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
1724 ; X86-AVX512: # %bb.0:
1725 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1726 ; X86-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1727 ; X86-AVX512-NEXT: retl
1729 ; X64-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
1731 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
1732 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1733 ; X64-AVX-NEXT: retq
1735 ; X64-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
1736 ; X64-AVX512: # %bb.0:
1737 ; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1738 ; X64-AVX512-NEXT: retq
1739 %vec = load <2 x double>, ptr %vp
1740 %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 0, i32 1>
1741 ret <8 x double> %res
; NOTE(review): presumably a regression test for the bug report referenced by
; the function name - confirm against the tracker.
; IR under test: load <4 x i16> (from an undef pointer), zero-extend to i32,
; shift each lane left by 16, bitcast to <4 x float>, replicate the four lanes
; into both halves of an <8 x float>, then select %shuffle99 where
; 0.0 > %shuffle99 and 0.0 otherwise, and store the result.
; Expected codegen on every prefix: vpmovzxwd + vpslld $16 on xmm0, an insert
; of xmm0 into the upper half of ymm0 to duplicate it, a zeroed register via
; vxorps, one vminps for the compare+select, and an unaligned vmovups store.
; The only difference between prefixes is the insert opcode: vinsertf128 for
; the AVX1 runs, vinserti128 for the AVX2 and AVX512 runs.
1744 define void @PR51226() {
1745 ; X86-AVX1-LABEL: PR51226:
1746 ; X86-AVX1: # %bb.0:
1747 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1748 ; X86-AVX1-NEXT: vpslld $16, %xmm0, %xmm0
1749 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1750 ; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1751 ; X86-AVX1-NEXT: vminps %ymm1, %ymm0, %ymm0
1752 ; X86-AVX1-NEXT: vmovups %ymm0, (%eax)
1753 ; X86-AVX1-NEXT: vzeroupper
1754 ; X86-AVX1-NEXT: retl
1756 ; X86-AVX2-LABEL: PR51226:
1757 ; X86-AVX2: # %bb.0:
1758 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1759 ; X86-AVX2-NEXT: vpslld $16, %xmm0, %xmm0
1760 ; X86-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1761 ; X86-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
1762 ; X86-AVX2-NEXT: vminps %ymm1, %ymm0, %ymm0
1763 ; X86-AVX2-NEXT: vmovups %ymm0, (%eax)
1764 ; X86-AVX2-NEXT: vzeroupper
1765 ; X86-AVX2-NEXT: retl
1767 ; X86-AVX512-LABEL: PR51226:
1768 ; X86-AVX512: # %bb.0:
1769 ; X86-AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1770 ; X86-AVX512-NEXT: vpslld $16, %xmm0, %xmm0
1771 ; X86-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1772 ; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
1773 ; X86-AVX512-NEXT: vminps %ymm1, %ymm0, %ymm0
1774 ; X86-AVX512-NEXT: vmovups %ymm0, (%eax)
1775 ; X86-AVX512-NEXT: vzeroupper
1776 ; X86-AVX512-NEXT: retl
1778 ; X64-AVX1-LABEL: PR51226:
1779 ; X64-AVX1: # %bb.0:
1780 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1781 ; X64-AVX1-NEXT: vpslld $16, %xmm0, %xmm0
1782 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1783 ; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1784 ; X64-AVX1-NEXT: vminps %ymm1, %ymm0, %ymm0
1785 ; X64-AVX1-NEXT: vmovups %ymm0, (%rax)
1786 ; X64-AVX1-NEXT: vzeroupper
1787 ; X64-AVX1-NEXT: retq
1789 ; X64-AVX2-LABEL: PR51226:
1790 ; X64-AVX2: # %bb.0:
1791 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1792 ; X64-AVX2-NEXT: vpslld $16, %xmm0, %xmm0
1793 ; X64-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1794 ; X64-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
1795 ; X64-AVX2-NEXT: vminps %ymm1, %ymm0, %ymm0
1796 ; X64-AVX2-NEXT: vmovups %ymm0, (%rax)
1797 ; X64-AVX2-NEXT: vzeroupper
1798 ; X64-AVX2-NEXT: retq
1800 ; X64-AVX512-LABEL: PR51226:
1801 ; X64-AVX512: # %bb.0:
1802 ; X64-AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1803 ; X64-AVX512-NEXT: vpslld $16, %xmm0, %xmm0
1804 ; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1805 ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
1806 ; X64-AVX512-NEXT: vminps %ymm1, %ymm0, %ymm0
1807 ; X64-AVX512-NEXT: vmovups %ymm0, (%rax)
1808 ; X64-AVX512-NEXT: vzeroupper
1809 ; X64-AVX512-NEXT: retq
1810 %i = load <4 x i16>, ptr undef, align 8
1811 %i1 = zext <4 x i16> %i to <4 x i32>
1812 %i2 = shl nuw <4 x i32> %i1, <i32 16, i32 16, i32 16, i32 16>
1813 %i3 = bitcast <4 x i32> %i2 to <4 x float>
1814 %shuffle99 = shufflevector <4 x float> %i3, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1815 %i4 = fcmp reassoc nsz contract ogt <8 x float> zeroinitializer, %shuffle99
1816 %i5 = select <8 x i1> %i4, <8 x float> %shuffle99, <8 x float> zeroinitializer
1817 store <8 x float> %i5, ptr undef, align 16