1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X32,X32-AVX,X32-AVX1
3 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-AVX,X32-AVX2
4 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X32,X32-AVX512,X32-AVX512F
5 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X32,X32-AVX512,X32-AVX512BW
6 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X32,X32-AVX512,X32-AVX512DQ
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512,X64-AVX512F
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512,X64-AVX512BW
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512,X64-AVX512DQ
14 ; Subvector Load + Broadcast
; Load a 128-bit <2 x double> and splat it into both halves of a <4 x double>;
; the shuffle mask <0,1,0,1> repeats the whole subvector, so every target is
; expected to fold load+shuffle into a single vbroadcastf128 from memory.
17 define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
18 ; X32-LABEL: test_broadcast_2f64_4f64:
20 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
21 ; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
24 ; X64-LABEL: test_broadcast_2f64_4f64:
26 ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
28 %1 = load <2 x double>, <2 x double> *%p
29 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Splat a 128-bit <2 x double> load to a 512-bit <8 x double>. AVX/AVX2 return
; 512 bits as a ymm0/ymm1 pair, so they broadcast into ymm0 and copy it to
; ymm1; AVX512 selects a single vbroadcastf32x4 into zmm0.
33 define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
34 ; X32-AVX-LABEL: test_broadcast_2f64_8f64:
36 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
37 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
38 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
41 ; X32-AVX512-LABEL: test_broadcast_2f64_8f64:
42 ; X32-AVX512: # %bb.0:
43 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
44 ; X32-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
45 ; X32-AVX512-NEXT: retl
47 ; X64-AVX-LABEL: test_broadcast_2f64_8f64:
49 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
50 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
53 ; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
54 ; X64-AVX512: # %bb.0:
55 ; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
56 ; X64-AVX512-NEXT: retq
57 %1 = load <2 x double>, <2 x double> *%p
58 %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; Splat a 256-bit <4 x double> load to a 512-bit <8 x double>. AVX/AVX2 just
; load ymm0 and duplicate it to ymm1; AVX512 uses one vbroadcastf64x4 (the
; natural 64x4 broadcast for a 256-bit source subvector).
62 define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
63 ; X32-AVX-LABEL: test_broadcast_4f64_8f64:
65 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
66 ; X32-AVX-NEXT: vmovaps (%eax), %ymm0
67 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
70 ; X32-AVX512-LABEL: test_broadcast_4f64_8f64:
71 ; X32-AVX512: # %bb.0:
72 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
73 ; X32-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
74 ; X32-AVX512-NEXT: retl
76 ; X64-AVX-LABEL: test_broadcast_4f64_8f64:
78 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
79 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
82 ; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
83 ; X64-AVX512: # %bb.0:
84 ; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
85 ; X64-AVX512-NEXT: retq
86 %1 = load <4 x double>, <4 x double> *%p
87 %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Integer variant of the 128->256-bit broadcast. Plain AVX has no ymm integer
; ops, so it uses the float-domain vbroadcastf128; AVX512 targets keep the
; integer domain with vbroadcasti128.
91 define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
92 ; X32-AVX-LABEL: test_broadcast_2i64_4i64:
94 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
95 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
98 ; X32-AVX512-LABEL: test_broadcast_2i64_4i64:
99 ; X32-AVX512: # %bb.0:
100 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
101 ; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
102 ; X32-AVX512-NEXT: retl
104 ; X64-AVX-LABEL: test_broadcast_2i64_4i64:
106 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
109 ; X64-AVX512-LABEL: test_broadcast_2i64_4i64:
110 ; X64-AVX512: # %bb.0:
111 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
112 ; X64-AVX512-NEXT: retq
113 %1 = load <2 x i64>, <2 x i64> *%p
114 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Splat a 128-bit <2 x i64> load to 512 bits. AVX/AVX2 broadcast into ymm0 and
; copy to ymm1 (ymm pair return); AVX512 emits a single vbroadcasti32x4 zmm.
118 define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
119 ; X32-AVX-LABEL: test_broadcast_2i64_8i64:
121 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
122 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
123 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
126 ; X32-AVX512-LABEL: test_broadcast_2i64_8i64:
127 ; X32-AVX512: # %bb.0:
128 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
129 ; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
130 ; X32-AVX512-NEXT: retl
132 ; X64-AVX-LABEL: test_broadcast_2i64_8i64:
134 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
135 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
138 ; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
139 ; X64-AVX512: # %bb.0:
140 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
141 ; X64-AVX512-NEXT: retq
142 %1 = load <2 x i64>, <2 x i64> *%p
143 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; Splat a 256-bit <4 x i64> load to 512 bits. AVX/AVX2 load ymm0 and copy to
; ymm1; AVX512 selects one vbroadcasti64x4 into zmm0.
147 define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
148 ; X32-AVX-LABEL: test_broadcast_4i64_8i64:
150 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
151 ; X32-AVX-NEXT: vmovaps (%eax), %ymm0
152 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
155 ; X32-AVX512-LABEL: test_broadcast_4i64_8i64:
156 ; X32-AVX512: # %bb.0:
157 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
158 ; X32-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
159 ; X32-AVX512-NEXT: retl
161 ; X64-AVX-LABEL: test_broadcast_4i64_8i64:
163 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
164 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
167 ; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
168 ; X64-AVX512: # %bb.0:
169 ; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
170 ; X64-AVX512-NEXT: retq
171 %1 = load <4 x i64>, <4 x i64> *%p
172 %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Splat a 128-bit <4 x float> load into both halves of a <8 x float>; all
; targets fold to a single vbroadcastf128 from memory.
176 define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
177 ; X32-LABEL: test_broadcast_4f32_8f32:
179 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
180 ; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
183 ; X64-LABEL: test_broadcast_4f32_8f32:
185 ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
187 %1 = load <4 x float>, <4 x float> *%p
188 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Splat a 128-bit <4 x float> load to 512 bits. AVX/AVX2 broadcast into ymm0
; and copy to ymm1; AVX512 emits a single vbroadcastf32x4 into zmm0.
192 define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind {
193 ; X32-AVX-LABEL: test_broadcast_4f32_16f32:
195 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
196 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
197 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
200 ; X32-AVX512-LABEL: test_broadcast_4f32_16f32:
201 ; X32-AVX512: # %bb.0:
202 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
203 ; X32-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
204 ; X32-AVX512-NEXT: retl
206 ; X64-AVX-LABEL: test_broadcast_4f32_16f32:
208 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
209 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
212 ; X64-AVX512-LABEL: test_broadcast_4f32_16f32:
213 ; X64-AVX512: # %bb.0:
214 ; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
215 ; X64-AVX512-NEXT: retq
216 %1 = load <4 x float>, <4 x float> *%p
217 %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Splat a 256-bit <8 x float> load to 512 bits. AVX/AVX2 load ymm0 and copy to
; ymm1; AVX512 uses one vbroadcastf64x4 (same bit pattern, 64x4 granularity).
221 define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
222 ; X32-AVX-LABEL: test_broadcast_8f32_16f32:
224 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
225 ; X32-AVX-NEXT: vmovaps (%eax), %ymm0
226 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
229 ; X32-AVX512-LABEL: test_broadcast_8f32_16f32:
230 ; X32-AVX512: # %bb.0:
231 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
232 ; X32-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
233 ; X32-AVX512-NEXT: retl
235 ; X64-AVX-LABEL: test_broadcast_8f32_16f32:
237 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
238 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
241 ; X64-AVX512-LABEL: test_broadcast_8f32_16f32:
242 ; X64-AVX512: # %bb.0:
243 ; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
244 ; X64-AVX512-NEXT: retq
245 %1 = load <8 x float>, <8 x float> *%p
246 %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Integer 128->256-bit broadcast: plain AVX uses float-domain vbroadcastf128,
; AVX512 keeps the integer domain with vbroadcasti128.
250 define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
251 ; X32-AVX-LABEL: test_broadcast_4i32_8i32:
253 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
254 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
257 ; X32-AVX512-LABEL: test_broadcast_4i32_8i32:
258 ; X32-AVX512: # %bb.0:
259 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
260 ; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
261 ; X32-AVX512-NEXT: retl
263 ; X64-AVX-LABEL: test_broadcast_4i32_8i32:
265 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
268 ; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
269 ; X64-AVX512: # %bb.0:
270 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
271 ; X64-AVX512-NEXT: retq
272 %1 = load <4 x i32>, <4 x i32> *%p
273 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Splat a 128-bit <4 x i32> load to 512 bits: ymm broadcast + register copy on
; AVX/AVX2, single vbroadcasti32x4 zmm on AVX512.
277 define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind {
278 ; X32-AVX-LABEL: test_broadcast_4i32_16i32:
280 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
281 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
282 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
285 ; X32-AVX512-LABEL: test_broadcast_4i32_16i32:
286 ; X32-AVX512: # %bb.0:
287 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
288 ; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
289 ; X32-AVX512-NEXT: retl
291 ; X64-AVX-LABEL: test_broadcast_4i32_16i32:
293 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
294 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
297 ; X64-AVX512-LABEL: test_broadcast_4i32_16i32:
298 ; X64-AVX512: # %bb.0:
299 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
300 ; X64-AVX512-NEXT: retq
301 %1 = load <4 x i32>, <4 x i32> *%p
302 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Splat a 256-bit <8 x i32> load to 512 bits: ymm load + register copy on
; AVX/AVX2, single vbroadcasti64x4 zmm on AVX512.
306 define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
307 ; X32-AVX-LABEL: test_broadcast_8i32_16i32:
309 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
310 ; X32-AVX-NEXT: vmovaps (%eax), %ymm0
311 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
314 ; X32-AVX512-LABEL: test_broadcast_8i32_16i32:
315 ; X32-AVX512: # %bb.0:
316 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
317 ; X32-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
318 ; X32-AVX512-NEXT: retl
320 ; X64-AVX-LABEL: test_broadcast_8i32_16i32:
322 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
323 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
326 ; X64-AVX512-LABEL: test_broadcast_8i32_16i32:
327 ; X64-AVX512: # %bb.0:
328 ; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
329 ; X64-AVX512-NEXT: retq
330 %1 = load <8 x i32>, <8 x i32> *%p
331 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Splat a 128-bit <8 x i16> load to 256 bits; float-domain vbroadcastf128 on
; plain AVX, integer vbroadcasti128 on AVX512.
335 define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
336 ; X32-AVX-LABEL: test_broadcast_8i16_16i16:
338 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
339 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
342 ; X32-AVX512-LABEL: test_broadcast_8i16_16i16:
343 ; X32-AVX512: # %bb.0:
344 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
345 ; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
346 ; X32-AVX512-NEXT: retl
348 ; X64-AVX-LABEL: test_broadcast_8i16_16i16:
350 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
353 ; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
354 ; X64-AVX512: # %bb.0:
355 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
356 ; X64-AVX512-NEXT: retq
357 %1 = load <8 x i16>, <8 x i16> *%p
358 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Splat a 128-bit <8 x i16> load to 512 bits. Only AVX512BW legalizes 512-bit
; i16 vectors, so it gets a single vbroadcasti32x4 zmm; AVX512F/AVX512DQ (and
; plain AVX) fall back to a ymm broadcast plus a register copy into ymm1.
362 define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
363 ; X32-AVX-LABEL: test_broadcast_8i16_32i16:
365 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
366 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
367 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
370 ; X32-AVX512F-LABEL: test_broadcast_8i16_32i16:
371 ; X32-AVX512F: # %bb.0:
372 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
373 ; X32-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
374 ; X32-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
375 ; X32-AVX512F-NEXT: retl
377 ; X32-AVX512BW-LABEL: test_broadcast_8i16_32i16:
378 ; X32-AVX512BW: # %bb.0:
379 ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
380 ; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
381 ; X32-AVX512BW-NEXT: retl
383 ; X32-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
384 ; X32-AVX512DQ: # %bb.0:
385 ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
386 ; X32-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
387 ; X32-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
388 ; X32-AVX512DQ-NEXT: retl
390 ; X64-AVX-LABEL: test_broadcast_8i16_32i16:
392 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
393 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
396 ; X64-AVX512F-LABEL: test_broadcast_8i16_32i16:
397 ; X64-AVX512F: # %bb.0:
398 ; X64-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
399 ; X64-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
400 ; X64-AVX512F-NEXT: retq
402 ; X64-AVX512BW-LABEL: test_broadcast_8i16_32i16:
403 ; X64-AVX512BW: # %bb.0:
404 ; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
405 ; X64-AVX512BW-NEXT: retq
407 ; X64-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
408 ; X64-AVX512DQ: # %bb.0:
409 ; X64-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
410 ; X64-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
411 ; X64-AVX512DQ-NEXT: retq
412 %1 = load <8 x i16>, <8 x i16> *%p
413 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Splat a 256-bit <16 x i16> load to 512 bits. AVX512BW uses vbroadcasti64x4
; into zmm0; the non-BW AVX512 targets and plain AVX return a ymm pair via a
; plain load plus register copy.
417 define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
418 ; X32-AVX-LABEL: test_broadcast_16i16_32i16:
420 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
421 ; X32-AVX-NEXT: vmovaps (%eax), %ymm0
422 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
425 ; X32-AVX512F-LABEL: test_broadcast_16i16_32i16:
426 ; X32-AVX512F: # %bb.0:
427 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
428 ; X32-AVX512F-NEXT: vmovaps (%eax), %ymm0
429 ; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
430 ; X32-AVX512F-NEXT: retl
432 ; X32-AVX512BW-LABEL: test_broadcast_16i16_32i16:
433 ; X32-AVX512BW: # %bb.0:
434 ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
435 ; X32-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
436 ; X32-AVX512BW-NEXT: retl
438 ; X32-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
439 ; X32-AVX512DQ: # %bb.0:
440 ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
441 ; X32-AVX512DQ-NEXT: vmovaps (%eax), %ymm0
442 ; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
443 ; X32-AVX512DQ-NEXT: retl
445 ; X64-AVX-LABEL: test_broadcast_16i16_32i16:
447 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
448 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
451 ; X64-AVX512F-LABEL: test_broadcast_16i16_32i16:
452 ; X64-AVX512F: # %bb.0:
453 ; X64-AVX512F-NEXT: vmovaps (%rdi), %ymm0
454 ; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
455 ; X64-AVX512F-NEXT: retq
457 ; X64-AVX512BW-LABEL: test_broadcast_16i16_32i16:
458 ; X64-AVX512BW: # %bb.0:
459 ; X64-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
460 ; X64-AVX512BW-NEXT: retq
462 ; X64-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
463 ; X64-AVX512DQ: # %bb.0:
464 ; X64-AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
465 ; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
466 ; X64-AVX512DQ-NEXT: retq
467 %1 = load <16 x i16>, <16 x i16> *%p
468 %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Splat a 128-bit <16 x i8> load to 256 bits; float-domain vbroadcastf128 on
; plain AVX, integer vbroadcasti128 on AVX512.
472 define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
473 ; X32-AVX-LABEL: test_broadcast_16i8_32i8:
475 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
476 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
479 ; X32-AVX512-LABEL: test_broadcast_16i8_32i8:
480 ; X32-AVX512: # %bb.0:
481 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
482 ; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
483 ; X32-AVX512-NEXT: retl
485 ; X64-AVX-LABEL: test_broadcast_16i8_32i8:
487 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
490 ; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
491 ; X64-AVX512: # %bb.0:
492 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
493 ; X64-AVX512-NEXT: retq
494 %1 = load <16 x i8>, <16 x i8> *%p
495 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Splat a 128-bit <16 x i8> load to 512 bits. Like the i16 cases, only
; AVX512BW has legal 512-bit byte vectors (single vbroadcasti32x4 zmm);
; AVX512F/DQ and plain AVX use a ymm broadcast plus a register copy.
499 define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
500 ; X32-AVX-LABEL: test_broadcast_16i8_64i8:
502 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
503 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
504 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
507 ; X32-AVX512F-LABEL: test_broadcast_16i8_64i8:
508 ; X32-AVX512F: # %bb.0:
509 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
510 ; X32-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
511 ; X32-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
512 ; X32-AVX512F-NEXT: retl
514 ; X32-AVX512BW-LABEL: test_broadcast_16i8_64i8:
515 ; X32-AVX512BW: # %bb.0:
516 ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
517 ; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
518 ; X32-AVX512BW-NEXT: retl
520 ; X32-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
521 ; X32-AVX512DQ: # %bb.0:
522 ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
523 ; X32-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
524 ; X32-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
525 ; X32-AVX512DQ-NEXT: retl
527 ; X64-AVX-LABEL: test_broadcast_16i8_64i8:
529 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
530 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
533 ; X64-AVX512F-LABEL: test_broadcast_16i8_64i8:
534 ; X64-AVX512F: # %bb.0:
535 ; X64-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
536 ; X64-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
537 ; X64-AVX512F-NEXT: retq
539 ; X64-AVX512BW-LABEL: test_broadcast_16i8_64i8:
540 ; X64-AVX512BW: # %bb.0:
541 ; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
542 ; X64-AVX512BW-NEXT: retq
544 ; X64-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
545 ; X64-AVX512DQ: # %bb.0:
546 ; X64-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
547 ; X64-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
548 ; X64-AVX512DQ-NEXT: retq
549 %1 = load <16 x i8>, <16 x i8> *%p
550 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Splat a 256-bit <32 x i8> load to 512 bits. AVX512BW selects vbroadcasti64x4
; into zmm0; AVX512F/DQ and plain AVX fall back to a ymm load + register copy.
554 define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
555 ; X32-AVX-LABEL: test_broadcast_32i8_64i8:
557 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
558 ; X32-AVX-NEXT: vmovaps (%eax), %ymm0
559 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
562 ; X32-AVX512F-LABEL: test_broadcast_32i8_64i8:
563 ; X32-AVX512F: # %bb.0:
564 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
565 ; X32-AVX512F-NEXT: vmovaps (%eax), %ymm0
566 ; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
567 ; X32-AVX512F-NEXT: retl
569 ; X32-AVX512BW-LABEL: test_broadcast_32i8_64i8:
570 ; X32-AVX512BW: # %bb.0:
571 ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
572 ; X32-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
573 ; X32-AVX512BW-NEXT: retl
575 ; X32-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
576 ; X32-AVX512DQ: # %bb.0:
577 ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
578 ; X32-AVX512DQ-NEXT: vmovaps (%eax), %ymm0
579 ; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
580 ; X32-AVX512DQ-NEXT: retl
582 ; X64-AVX-LABEL: test_broadcast_32i8_64i8:
584 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
585 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
588 ; X64-AVX512F-LABEL: test_broadcast_32i8_64i8:
589 ; X64-AVX512F: # %bb.0:
590 ; X64-AVX512F-NEXT: vmovaps (%rdi), %ymm0
591 ; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
592 ; X64-AVX512F-NEXT: retq
594 ; X64-AVX512BW-LABEL: test_broadcast_32i8_64i8:
595 ; X64-AVX512BW: # %bb.0:
596 ; X64-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
597 ; X64-AVX512BW-NEXT: retq
599 ; X64-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
600 ; X64-AVX512DQ: # %bb.0:
601 ; X64-AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
602 ; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
603 ; X64-AVX512DQ-NEXT: retq
604 %1 = load <32 x i8>, <32 x i8> *%p
605 %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
610 ; Subvector Load + Broadcast + Store
; The loaded value is also stored to %p1, so the load cannot be folded into a
; memory broadcast: expect a plain xmm load, the store, then vinsertf128 to
; duplicate the register into the upper half.
613 define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) {
614 ; X32-LABEL: test_broadcast_2f64_4f64_reuse:
616 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
617 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
618 ; X32-NEXT: vmovaps (%ecx), %xmm0
619 ; X32-NEXT: vmovaps %xmm0, (%eax)
620 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
623 ; X64-LABEL: test_broadcast_2f64_4f64_reuse:
625 ; X64-NEXT: vmovaps (%rdi), %xmm0
626 ; X64-NEXT: vmovaps %xmm0, (%rsi)
627 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
629 %1 = load <2 x double>, <2 x double>* %p0
630 store <2 x double> %1, <2 x double>* %p1
631 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; i64 variant of the reuse test: the load feeds both a store and the
; broadcast, so codegen keeps it in xmm0 and uses vinsertf128.
635 define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) {
636 ; X32-LABEL: test_broadcast_2i64_4i64_reuse:
638 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
639 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
640 ; X32-NEXT: vmovaps (%ecx), %xmm0
641 ; X32-NEXT: vmovaps %xmm0, (%eax)
642 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
645 ; X64-LABEL: test_broadcast_2i64_4i64_reuse:
647 ; X64-NEXT: vmovaps (%rdi), %xmm0
648 ; X64-NEXT: vmovaps %xmm0, (%rsi)
649 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
651 %1 = load <2 x i64>, <2 x i64>* %p0
652 store <2 x i64> %1, <2 x i64>* %p1
653 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; f32 variant of the reuse test: load kept in xmm0 for the store, then
; duplicated into the upper 128 bits with vinsertf128.
657 define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
658 ; X32-LABEL: test_broadcast_4f32_8f32_reuse:
660 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
661 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
662 ; X32-NEXT: vmovaps (%ecx), %xmm0
663 ; X32-NEXT: vmovaps %xmm0, (%eax)
664 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
667 ; X64-LABEL: test_broadcast_4f32_8f32_reuse:
669 ; X64-NEXT: vmovaps (%rdi), %xmm0
670 ; X64-NEXT: vmovaps %xmm0, (%rsi)
671 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
673 %1 = load <4 x float>, <4 x float>* %p0
674 store <4 x float> %1, <4 x float>* %p1
675 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; i32 variant of the reuse test: load kept in xmm0 for the store, then
; duplicated with vinsertf128 instead of a memory broadcast.
679 define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) {
680 ; X32-LABEL: test_broadcast_4i32_8i32_reuse:
682 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
683 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
684 ; X32-NEXT: vmovaps (%ecx), %xmm0
685 ; X32-NEXT: vmovaps %xmm0, (%eax)
686 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
689 ; X64-LABEL: test_broadcast_4i32_8i32_reuse:
691 ; X64-NEXT: vmovaps (%rdi), %xmm0
692 ; X64-NEXT: vmovaps %xmm0, (%rsi)
693 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
695 %1 = load <4 x i32>, <4 x i32>* %p0
696 store <4 x i32> %1, <4 x i32>* %p1
697 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; i16 variant of the reuse test: load kept in xmm0 for the store, then
; duplicated with vinsertf128.
701 define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
702 ; X32-LABEL: test_broadcast_8i16_16i16_reuse:
704 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
705 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
706 ; X32-NEXT: vmovaps (%ecx), %xmm0
707 ; X32-NEXT: vmovaps %xmm0, (%eax)
708 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
711 ; X64-LABEL: test_broadcast_8i16_16i16_reuse:
713 ; X64-NEXT: vmovaps (%rdi), %xmm0
714 ; X64-NEXT: vmovaps %xmm0, (%rsi)
715 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
717 %1 = load <8 x i16>, <8 x i16> *%p0
718 store <8 x i16> %1, <8 x i16>* %p1
719 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; i8 variant of the reuse test: load kept in xmm0 for the store, then
; duplicated with vinsertf128.
723 define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
724 ; X32-LABEL: test_broadcast_16i8_32i8_reuse:
726 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
727 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
728 ; X32-NEXT: vmovaps (%ecx), %xmm0
729 ; X32-NEXT: vmovaps %xmm0, (%eax)
730 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
733 ; X64-LABEL: test_broadcast_16i8_32i8_reuse:
735 ; X64-NEXT: vmovaps (%rdi), %xmm0
736 ; X64-NEXT: vmovaps %xmm0, (%rsi)
737 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
739 %1 = load <16 x i8>, <16 x i8> *%p0
740 store <16 x i8> %1, <16 x i8>* %p1
741 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
746 ; Subvector Load + Broadcast with Separate Store
; An unrelated store (zeroinitializer to %p1) sits between the load and the
; shuffle; the chain must not block folding the load into a memory broadcast.
749 define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
750 ; X32-AVX-LABEL: test_broadcast_4i32_8i32_chain:
752 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
753 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
754 ; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
755 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
756 ; X32-AVX-NEXT: vmovaps %xmm1, (%eax)
759 ; X32-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
760 ; X32-AVX512: # %bb.0:
761 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
762 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
763 ; X32-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
764 ; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
765 ; X32-AVX512-NEXT: vmovaps %xmm1, (%eax)
766 ; X32-AVX512-NEXT: retl
768 ; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
770 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
771 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
772 ; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
775 ; X64-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
776 ; X64-AVX512: # %bb.0:
777 ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
778 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
779 ; X64-AVX512-NEXT: vmovaps %xmm1, (%rsi)
780 ; X64-AVX512-NEXT: retq
781 %1 = load <4 x i32>, <4 x i32>* %p0
782 store <4 x float> zeroinitializer, <4 x float>* %p1
783 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; 512-bit version of the chain test: the intervening zero store must not block
; the memory broadcast (vbroadcastf128 ymm on AVX, vbroadcasti32x4 zmm on
; AVX512).
787 define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
788 ; X32-AVX-LABEL: test_broadcast_4i32_16i32_chain:
790 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
791 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
792 ; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
793 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
794 ; X32-AVX-NEXT: vmovaps %xmm1, (%eax)
795 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
798 ; X32-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
799 ; X32-AVX512: # %bb.0:
800 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
801 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
802 ; X32-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
803 ; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
804 ; X32-AVX512-NEXT: vmovaps %xmm1, (%eax)
805 ; X32-AVX512-NEXT: retl
807 ; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
809 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
810 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
811 ; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
812 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
815 ; X64-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
816 ; X64-AVX512: # %bb.0:
817 ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
818 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
819 ; X64-AVX512-NEXT: vmovaps %xmm1, (%rsi)
820 ; X64-AVX512-NEXT: retq
821 %1 = load <4 x i32>, <4 x i32>* %p0
822 store <4 x float> zeroinitializer, <4 x float>* %p1
823 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
828 ; subvector Load with multiple uses + broadcast
829 ; Fallback to the broadcast should be done
; Output globals for the multi-use-load fallback test below: the function
; stores its <4 x i64> result to @ga4 and its <8 x i64> result to @gb4
; (referenced by the fallback_broadcast_v4i64_to_v8i64 CHECK lines).
832 @ga4 = global <4 x i64> zeroinitializer, align 8
833 @gb4 = global <8 x i64> zeroinitializer, align 8
835 define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
836 ; X32-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
837 ; X32-AVX1: # %bb.0: # %entry
838 ; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,2,0]
839 ; X32-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4
840 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
841 ; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,0,4,0]
842 ; X32-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
843 ; X32-AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [1,0,2,0,3,0,4,0]
844 ; X32-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
845 ; X32-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm7
846 ; X32-AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
847 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2
848 ; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
849 ; X32-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm5
850 ; X32-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
851 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
852 ; X32-AVX1-NEXT: vandps %ymm6, %ymm1, %ymm1
853 ; X32-AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
854 ; X32-AVX1-NEXT: vmovdqu %xmm0, ga4+16
855 ; X32-AVX1-NEXT: vmovdqu %xmm4, ga4
856 ; X32-AVX1-NEXT: vmovups %ymm2, gb4+32
857 ; X32-AVX1-NEXT: vmovups %ymm1, gb4
858 ; X32-AVX1-NEXT: vzeroupper
859 ; X32-AVX1-NEXT: retl
861 ; X32-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
862 ; X32-AVX2: # %bb.0: # %entry
863 ; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
864 ; X32-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
865 ; X32-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
866 ; X32-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
867 ; X32-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
868 ; X32-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
869 ; X32-AVX2-NEXT: vmovdqu %ymm0, ga4
870 ; X32-AVX2-NEXT: vmovdqu %ymm2, gb4+32
871 ; X32-AVX2-NEXT: vmovdqu %ymm1, gb4
872 ; X32-AVX2-NEXT: vzeroupper
873 ; X32-AVX2-NEXT: retl
875 ; X32-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
876 ; X32-AVX512: # %bb.0: # %entry
877 ; X32-AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,2,0,3,0,4,0]
878 ; X32-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0
879 ; X32-AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
880 ; X32-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1
881 ; X32-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
882 ; X32-AVX512-NEXT: vmovdqu %ymm0, ga4
883 ; X32-AVX512-NEXT: vmovdqu64 %zmm1, gb4
884 ; X32-AVX512-NEXT: vzeroupper
885 ; X32-AVX512-NEXT: retl
887 ; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
888 ; X64-AVX1: # %bb.0: # %entry
889 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2]
890 ; X64-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4
891 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
892 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,4]
893 ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
894 ; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [1,2,3,4]
895 ; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
896 ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm7
897 ; X64-AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
898 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2
899 ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
900 ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm5
901 ; X64-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
902 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
903 ; X64-AVX1-NEXT: vandps %ymm6, %ymm1, %ymm1
904 ; X64-AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
905 ; X64-AVX1-NEXT: vmovdqu %xmm0, ga4+{{.*}}(%rip)
906 ; X64-AVX1-NEXT: vmovdqu %xmm4, {{.*}}(%rip)
907 ; X64-AVX1-NEXT: vmovups %ymm2, gb4+{{.*}}(%rip)
908 ; X64-AVX1-NEXT: vmovups %ymm1, {{.*}}(%rip)
909 ; X64-AVX1-NEXT: vzeroupper
910 ; X64-AVX1-NEXT: retq
912 ; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
913 ; X64-AVX2: # %bb.0: # %entry
914 ; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4]
915 ; X64-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
916 ; X64-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
917 ; X64-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
918 ; X64-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
919 ; X64-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
920 ; X64-AVX2-NEXT: vmovdqu %ymm0, {{.*}}(%rip)
921 ; X64-AVX2-NEXT: vmovdqu %ymm2, gb4+{{.*}}(%rip)
922 ; X64-AVX2-NEXT: vmovdqu %ymm1, {{.*}}(%rip)
923 ; X64-AVX2-NEXT: vzeroupper
924 ; X64-AVX2-NEXT: retq
926 ; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
927 ; X64-AVX512: # %bb.0: # %entry
928 ; X64-AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,3,4]
929 ; X64-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0
930 ; X64-AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
931 ; X64-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1
932 ; X64-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
933 ; X64-AVX512-NEXT: vmovdqu %ymm0, {{.*}}(%rip)
934 ; X64-AVX512-NEXT: vmovdqu64 %zmm1, {{.*}}(%rip)
935 ; X64-AVX512-NEXT: vzeroupper
936 ; X64-AVX512-NEXT: retq
938 %0 = add <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
939 %1 = add <8 x i64> %b, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
940 %2 = and <8 x i64> %1, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
941 store <4 x i64> %0, <4 x i64>* @ga4, align 8
942 store <8 x i64> %2, <8 x i64>* @gb4, align 8
947 @ga2 = global <4 x double> zeroinitializer, align 8
948 @gb2 = global <8 x double> zeroinitializer, align 8
950 define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) {
951 ; X32-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
952 ; X32-AVX: # %bb.0: # %entry
953 ; X32-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
954 ; X32-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0
955 ; X32-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2
956 ; X32-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1
957 ; X32-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1
958 ; X32-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2
959 ; X32-AVX-NEXT: vmovupd %ymm0, ga2
960 ; X32-AVX-NEXT: vmovupd %ymm2, gb2+32
961 ; X32-AVX-NEXT: vmovupd %ymm1, gb2
962 ; X32-AVX-NEXT: vzeroupper
965 ; X32-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
966 ; X32-AVX512: # %bb.0: # %entry
967 ; X32-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
968 ; X32-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
969 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
970 ; X32-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
971 ; X32-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
972 ; X32-AVX512-NEXT: vmovupd %ymm0, ga2
973 ; X32-AVX512-NEXT: vmovupd %zmm1, gb2
974 ; X32-AVX512-NEXT: vzeroupper
975 ; X32-AVX512-NEXT: retl
977 ; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
978 ; X64-AVX: # %bb.0: # %entry
979 ; X64-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
980 ; X64-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0
981 ; X64-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2
982 ; X64-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1
983 ; X64-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1
984 ; X64-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2
985 ; X64-AVX-NEXT: vmovupd %ymm0, {{.*}}(%rip)
986 ; X64-AVX-NEXT: vmovupd %ymm2, gb2+{{.*}}(%rip)
987 ; X64-AVX-NEXT: vmovupd %ymm1, {{.*}}(%rip)
988 ; X64-AVX-NEXT: vzeroupper
991 ; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
992 ; X64-AVX512: # %bb.0: # %entry
993 ; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
994 ; X64-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
995 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
996 ; X64-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
997 ; X64-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
998 ; X64-AVX512-NEXT: vmovupd %ymm0, {{.*}}(%rip)
999 ; X64-AVX512-NEXT: vmovupd %zmm1, {{.*}}(%rip)
1000 ; X64-AVX512-NEXT: vzeroupper
1001 ; X64-AVX512-NEXT: retq
1003 %0 = fadd <4 x double> %a, <double 1.0, double 2.0, double 3.0, double 4.0>
1004 %1 = fadd <8 x double> %b, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
1005 %2 = fdiv <8 x double> %1, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
1006 store <4 x double> %0, <4 x double>* @ga2, align 8
1007 store <8 x double> %2, <8 x double>* @gb2, align 8
1012 ; Subvector Broadcast from register
1015 define <4 x double> @reg_broadcast_2f64_4f64(<2 x double> %a0) nounwind {
1016 ; X32-LABEL: reg_broadcast_2f64_4f64:
1018 ; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1019 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1022 ; X64-LABEL: reg_broadcast_2f64_4f64:
1024 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1025 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1027 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
1031 define <8 x double> @reg_broadcast_2f64_8f64(<2 x double> %a0) nounwind {
1032 ; X32-AVX-LABEL: reg_broadcast_2f64_8f64:
1034 ; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1035 ; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1036 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1037 ; X32-AVX-NEXT: retl
1039 ; X32-AVX512-LABEL: reg_broadcast_2f64_8f64:
1040 ; X32-AVX512: # %bb.0:
1041 ; X32-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1042 ; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1043 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1044 ; X32-AVX512-NEXT: retl
1046 ; X64-AVX-LABEL: reg_broadcast_2f64_8f64:
1048 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1049 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1050 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1051 ; X64-AVX-NEXT: retq
1053 ; X64-AVX512-LABEL: reg_broadcast_2f64_8f64:
1054 ; X64-AVX512: # %bb.0:
1055 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1056 ; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1057 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1058 ; X64-AVX512-NEXT: retq
1059 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
1063 define <8 x double> @reg_broadcast_4f64_8f64(<4 x double> %a0) nounwind {
1064 ; X32-AVX-LABEL: reg_broadcast_4f64_8f64:
1066 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1067 ; X32-AVX-NEXT: retl
1069 ; X32-AVX512-LABEL: reg_broadcast_4f64_8f64:
1070 ; X32-AVX512: # %bb.0:
1071 ; X32-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1072 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1073 ; X32-AVX512-NEXT: retl
1075 ; X64-AVX-LABEL: reg_broadcast_4f64_8f64:
1077 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1078 ; X64-AVX-NEXT: retq
1080 ; X64-AVX512-LABEL: reg_broadcast_4f64_8f64:
1081 ; X64-AVX512: # %bb.0:
1082 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1083 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1084 ; X64-AVX512-NEXT: retq
1085 %1 = shufflevector <4 x double> %a0, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1089 define <4 x i64> @reg_broadcast_2i64_4i64(<2 x i64> %a0) nounwind {
1090 ; X32-LABEL: reg_broadcast_2i64_4i64:
1092 ; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1093 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1096 ; X64-LABEL: reg_broadcast_2i64_4i64:
1098 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1099 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1101 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
1105 define <8 x i64> @reg_broadcast_2i64_8i64(<2 x i64> %a0) nounwind {
1106 ; X32-AVX-LABEL: reg_broadcast_2i64_8i64:
1108 ; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1109 ; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1110 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1111 ; X32-AVX-NEXT: retl
1113 ; X32-AVX512-LABEL: reg_broadcast_2i64_8i64:
1114 ; X32-AVX512: # %bb.0:
1115 ; X32-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1116 ; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1117 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1118 ; X32-AVX512-NEXT: retl
1120 ; X64-AVX-LABEL: reg_broadcast_2i64_8i64:
1122 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1123 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1124 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1125 ; X64-AVX-NEXT: retq
1127 ; X64-AVX512-LABEL: reg_broadcast_2i64_8i64:
1128 ; X64-AVX512: # %bb.0:
1129 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1130 ; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1131 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1132 ; X64-AVX512-NEXT: retq
1133 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
1137 define <8 x i64> @reg_broadcast_4i64_8i64(<4 x i64> %a0) nounwind {
1138 ; X32-AVX-LABEL: reg_broadcast_4i64_8i64:
1140 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1141 ; X32-AVX-NEXT: retl
1143 ; X32-AVX512-LABEL: reg_broadcast_4i64_8i64:
1144 ; X32-AVX512: # %bb.0:
1145 ; X32-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1146 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1147 ; X32-AVX512-NEXT: retl
1149 ; X64-AVX-LABEL: reg_broadcast_4i64_8i64:
1151 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1152 ; X64-AVX-NEXT: retq
1154 ; X64-AVX512-LABEL: reg_broadcast_4i64_8i64:
1155 ; X64-AVX512: # %bb.0:
1156 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1157 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1158 ; X64-AVX512-NEXT: retq
1159 %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1163 define <8 x float> @reg_broadcast_4f32_8f32(<4 x float> %a0) nounwind {
1164 ; X32-LABEL: reg_broadcast_4f32_8f32:
1166 ; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1167 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1170 ; X64-LABEL: reg_broadcast_4f32_8f32:
1172 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1173 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1175 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1179 define <16 x float> @reg_broadcast_4f32_16f32(<4 x float> %a0) nounwind {
1180 ; X32-AVX-LABEL: reg_broadcast_4f32_16f32:
1182 ; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1183 ; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1184 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1185 ; X32-AVX-NEXT: retl
1187 ; X32-AVX512-LABEL: reg_broadcast_4f32_16f32:
1188 ; X32-AVX512: # %bb.0:
1189 ; X32-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1190 ; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1191 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1192 ; X32-AVX512-NEXT: retl
1194 ; X64-AVX-LABEL: reg_broadcast_4f32_16f32:
1196 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1197 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1198 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1199 ; X64-AVX-NEXT: retq
1201 ; X64-AVX512-LABEL: reg_broadcast_4f32_16f32:
1202 ; X64-AVX512: # %bb.0:
1203 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1204 ; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1205 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1206 ; X64-AVX512-NEXT: retq
1207 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1211 define <16 x float> @reg_broadcast_8f32_16f32(<8 x float> %a0) nounwind {
1212 ; X32-AVX-LABEL: reg_broadcast_8f32_16f32:
1214 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1215 ; X32-AVX-NEXT: retl
1217 ; X32-AVX512-LABEL: reg_broadcast_8f32_16f32:
1218 ; X32-AVX512: # %bb.0:
1219 ; X32-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1220 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1221 ; X32-AVX512-NEXT: retl
1223 ; X64-AVX-LABEL: reg_broadcast_8f32_16f32:
1225 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1226 ; X64-AVX-NEXT: retq
1228 ; X64-AVX512-LABEL: reg_broadcast_8f32_16f32:
1229 ; X64-AVX512: # %bb.0:
1230 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1231 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1232 ; X64-AVX512-NEXT: retq
1233 %1 = shufflevector <8 x float> %a0, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1237 define <8 x i32> @reg_broadcast_4i32_8i32(<4 x i32> %a0) nounwind {
1238 ; X32-LABEL: reg_broadcast_4i32_8i32:
1240 ; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1241 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1244 ; X64-LABEL: reg_broadcast_4i32_8i32:
1246 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1247 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1249 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1253 define <16 x i32> @reg_broadcast_4i32_16i32(<4 x i32> %a0) nounwind {
1254 ; X32-AVX-LABEL: reg_broadcast_4i32_16i32:
1256 ; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1257 ; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1258 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1259 ; X32-AVX-NEXT: retl
1261 ; X32-AVX512-LABEL: reg_broadcast_4i32_16i32:
1262 ; X32-AVX512: # %bb.0:
1263 ; X32-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1264 ; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1265 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1266 ; X32-AVX512-NEXT: retl
1268 ; X64-AVX-LABEL: reg_broadcast_4i32_16i32:
1270 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1271 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1272 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1273 ; X64-AVX-NEXT: retq
1275 ; X64-AVX512-LABEL: reg_broadcast_4i32_16i32:
1276 ; X64-AVX512: # %bb.0:
1277 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1278 ; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1279 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1280 ; X64-AVX512-NEXT: retq
1281 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1285 define <16 x i32> @reg_broadcast_8i32_16i32(<8 x i32> %a0) nounwind {
1286 ; X32-AVX-LABEL: reg_broadcast_8i32_16i32:
1288 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1289 ; X32-AVX-NEXT: retl
1291 ; X32-AVX512-LABEL: reg_broadcast_8i32_16i32:
1292 ; X32-AVX512: # %bb.0:
1293 ; X32-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1294 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1295 ; X32-AVX512-NEXT: retl
1297 ; X64-AVX-LABEL: reg_broadcast_8i32_16i32:
1299 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1300 ; X64-AVX-NEXT: retq
1302 ; X64-AVX512-LABEL: reg_broadcast_8i32_16i32:
1303 ; X64-AVX512: # %bb.0:
1304 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1305 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1306 ; X64-AVX512-NEXT: retq
1307 %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1311 define <16 x i16> @reg_broadcast_8i16_16i16(<8 x i16> %a0) nounwind {
1312 ; X32-LABEL: reg_broadcast_8i16_16i16:
1314 ; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1315 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1318 ; X64-LABEL: reg_broadcast_8i16_16i16:
1320 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1321 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1323 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1327 define <32 x i16> @reg_broadcast_8i16_32i16(<8 x i16> %a0) nounwind {
1328 ; X32-AVX-LABEL: reg_broadcast_8i16_32i16:
1330 ; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1331 ; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1332 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1333 ; X32-AVX-NEXT: retl
1335 ; X32-AVX512F-LABEL: reg_broadcast_8i16_32i16:
1336 ; X32-AVX512F: # %bb.0:
1337 ; X32-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1338 ; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1339 ; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
1340 ; X32-AVX512F-NEXT: retl
1342 ; X32-AVX512BW-LABEL: reg_broadcast_8i16_32i16:
1343 ; X32-AVX512BW: # %bb.0:
1344 ; X32-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1345 ; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1346 ; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1347 ; X32-AVX512BW-NEXT: retl
1349 ; X32-AVX512DQ-LABEL: reg_broadcast_8i16_32i16:
1350 ; X32-AVX512DQ: # %bb.0:
1351 ; X32-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1352 ; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1353 ; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
1354 ; X32-AVX512DQ-NEXT: retl
1356 ; X64-AVX-LABEL: reg_broadcast_8i16_32i16:
1358 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1359 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1360 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1361 ; X64-AVX-NEXT: retq
1363 ; X64-AVX512F-LABEL: reg_broadcast_8i16_32i16:
1364 ; X64-AVX512F: # %bb.0:
1365 ; X64-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1366 ; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1367 ; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
1368 ; X64-AVX512F-NEXT: retq
1370 ; X64-AVX512BW-LABEL: reg_broadcast_8i16_32i16:
1371 ; X64-AVX512BW: # %bb.0:
1372 ; X64-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1373 ; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1374 ; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1375 ; X64-AVX512BW-NEXT: retq
1377 ; X64-AVX512DQ-LABEL: reg_broadcast_8i16_32i16:
1378 ; X64-AVX512DQ: # %bb.0:
1379 ; X64-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1380 ; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1381 ; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
1382 ; X64-AVX512DQ-NEXT: retq
1383 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1387 define <32 x i16> @reg_broadcast_16i16_32i16(<16 x i16> %a0) nounwind {
1388 ; X32-AVX-LABEL: reg_broadcast_16i16_32i16:
1390 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1391 ; X32-AVX-NEXT: retl
1393 ; X32-AVX512F-LABEL: reg_broadcast_16i16_32i16:
1394 ; X32-AVX512F: # %bb.0:
1395 ; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
1396 ; X32-AVX512F-NEXT: retl
1398 ; X32-AVX512BW-LABEL: reg_broadcast_16i16_32i16:
1399 ; X32-AVX512BW: # %bb.0:
1400 ; X32-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1401 ; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1402 ; X32-AVX512BW-NEXT: retl
1404 ; X32-AVX512DQ-LABEL: reg_broadcast_16i16_32i16:
1405 ; X32-AVX512DQ: # %bb.0:
1406 ; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
1407 ; X32-AVX512DQ-NEXT: retl
1409 ; X64-AVX-LABEL: reg_broadcast_16i16_32i16:
1411 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1412 ; X64-AVX-NEXT: retq
1414 ; X64-AVX512F-LABEL: reg_broadcast_16i16_32i16:
1415 ; X64-AVX512F: # %bb.0:
1416 ; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
1417 ; X64-AVX512F-NEXT: retq
1419 ; X64-AVX512BW-LABEL: reg_broadcast_16i16_32i16:
1420 ; X64-AVX512BW: # %bb.0:
1421 ; X64-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1422 ; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1423 ; X64-AVX512BW-NEXT: retq
1425 ; X64-AVX512DQ-LABEL: reg_broadcast_16i16_32i16:
1426 ; X64-AVX512DQ: # %bb.0:
1427 ; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
1428 ; X64-AVX512DQ-NEXT: retq
1429 %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1433 define <32 x i8> @reg_broadcast_16i8_32i8(<16 x i8> %a0) nounwind {
1434 ; X32-LABEL: reg_broadcast_16i8_32i8:
1436 ; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1437 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1440 ; X64-LABEL: reg_broadcast_16i8_32i8:
1442 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1443 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1445 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1449 define <64 x i8> @reg_broadcast_16i8_64i8(<16 x i8> %a0) nounwind {
1450 ; X32-AVX-LABEL: reg_broadcast_16i8_64i8:
1452 ; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1453 ; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1454 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1455 ; X32-AVX-NEXT: retl
1457 ; X32-AVX512F-LABEL: reg_broadcast_16i8_64i8:
1458 ; X32-AVX512F: # %bb.0:
1459 ; X32-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1460 ; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1461 ; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
1462 ; X32-AVX512F-NEXT: retl
1464 ; X32-AVX512BW-LABEL: reg_broadcast_16i8_64i8:
1465 ; X32-AVX512BW: # %bb.0:
1466 ; X32-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1467 ; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1468 ; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1469 ; X32-AVX512BW-NEXT: retl
1471 ; X32-AVX512DQ-LABEL: reg_broadcast_16i8_64i8:
1472 ; X32-AVX512DQ: # %bb.0:
1473 ; X32-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1474 ; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1475 ; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
1476 ; X32-AVX512DQ-NEXT: retl
1478 ; X64-AVX-LABEL: reg_broadcast_16i8_64i8:
1480 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1481 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1482 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1483 ; X64-AVX-NEXT: retq
1485 ; X64-AVX512F-LABEL: reg_broadcast_16i8_64i8:
1486 ; X64-AVX512F: # %bb.0:
1487 ; X64-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1488 ; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1489 ; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
1490 ; X64-AVX512F-NEXT: retq
1492 ; X64-AVX512BW-LABEL: reg_broadcast_16i8_64i8:
1493 ; X64-AVX512BW: # %bb.0:
1494 ; X64-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1495 ; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1496 ; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1497 ; X64-AVX512BW-NEXT: retq
1499 ; X64-AVX512DQ-LABEL: reg_broadcast_16i8_64i8:
1500 ; X64-AVX512DQ: # %bb.0:
1501 ; X64-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1502 ; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1503 ; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
1504 ; X64-AVX512DQ-NEXT: retq
1505 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1509 define <64 x i8> @reg_broadcast_32i8_64i8(<32 x i8> %a0) nounwind {
1510 ; X32-AVX-LABEL: reg_broadcast_32i8_64i8:
1512 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1513 ; X32-AVX-NEXT: retl
1515 ; X32-AVX512F-LABEL: reg_broadcast_32i8_64i8:
1516 ; X32-AVX512F: # %bb.0:
1517 ; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
1518 ; X32-AVX512F-NEXT: retl
1520 ; X32-AVX512BW-LABEL: reg_broadcast_32i8_64i8:
1521 ; X32-AVX512BW: # %bb.0:
1522 ; X32-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1523 ; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1524 ; X32-AVX512BW-NEXT: retl
1526 ; X32-AVX512DQ-LABEL: reg_broadcast_32i8_64i8:
1527 ; X32-AVX512DQ: # %bb.0:
1528 ; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
1529 ; X32-AVX512DQ-NEXT: retl
1531 ; X64-AVX-LABEL: reg_broadcast_32i8_64i8:
1533 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1534 ; X64-AVX-NEXT: retq
1536 ; X64-AVX512F-LABEL: reg_broadcast_32i8_64i8:
1537 ; X64-AVX512F: # %bb.0:
1538 ; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
1539 ; X64-AVX512F-NEXT: retq
1541 ; X64-AVX512BW-LABEL: reg_broadcast_32i8_64i8:
1542 ; X64-AVX512BW: # %bb.0:
1543 ; X64-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1544 ; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1545 ; X64-AVX512BW-NEXT: retq
1547 ; X64-AVX512DQ-LABEL: reg_broadcast_32i8_64i8:
1548 ; X64-AVX512DQ: # %bb.0:
1549 ; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
1550 ; X64-AVX512DQ-NEXT: retq
1551 %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1559 define <4 x i32> @test_2xi32_to_4xi32_mem(<2 x i32>* %vp) {
1560 ; X32-LABEL: test_2xi32_to_4xi32_mem:
1562 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
1563 ; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
1566 ; X64-LABEL: test_2xi32_to_4xi32_mem:
1568 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
1570 %vec = load <2 x i32>, <2 x i32>* %vp
1571 %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
1575 define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) {
1576 ; X32-LABEL: test_2xi32_to_8xi32_mem:
1578 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
1579 ; X32-NEXT: vbroadcastsd (%eax), %ymm0
1582 ; X64-LABEL: test_2xi32_to_8xi32_mem:
1584 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0
1586 %vec = load <2 x i32>, <2 x i32>* %vp
1587 %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
1591 define <16 x i32> @test_2xi32_to_16xi32_mem(<2 x i32>* %vp) {
1592 ; X32-AVX-LABEL: test_2xi32_to_16xi32_mem:
1594 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1595 ; X32-AVX-NEXT: vbroadcastsd (%eax), %ymm0
1596 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1597 ; X32-AVX-NEXT: retl
1599 ; X32-AVX512-LABEL: test_2xi32_to_16xi32_mem:
1600 ; X32-AVX512: # %bb.0:
1601 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1602 ; X32-AVX512-NEXT: vbroadcastsd (%eax), %zmm0
1603 ; X32-AVX512-NEXT: retl
1605 ; X64-AVX-LABEL: test_2xi32_to_16xi32_mem:
1607 ; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm0
1608 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1609 ; X64-AVX-NEXT: retq
1611 ; X64-AVX512-LABEL: test_2xi32_to_16xi32_mem:
1612 ; X64-AVX512: # %bb.0:
1613 ; X64-AVX512-NEXT: vbroadcastsd (%rdi), %zmm0
1614 ; X64-AVX512-NEXT: retq
1615 %vec = load <2 x i32>, <2 x i32>* %vp
1616 %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
1624 define <4 x double> @broadcast_v4f64_f64_u000(double* %p) {
1625 ; X32-LABEL: broadcast_v4f64_f64_u000:
1627 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
1628 ; X32-NEXT: vbroadcastsd (%eax), %ymm0
1631 ; X64-LABEL: broadcast_v4f64_f64_u000:
1633 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0
1635 %s = load double, double* %p
1636 %vec = insertelement <2 x double> undef, double %s, i32 0
1637 %res = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
1638 ret <4 x double> %res
1641 define <4 x double> @broadcast_v4f64_v2f64_4u61(<2 x double>* %vp, <4 x double> %default) {
1642 ; X32-LABEL: broadcast_v4f64_v2f64_4u61:
1644 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
1645 ; X32-NEXT: vinsertf128 $1, (%eax), %ymm0, %ymm1
1646 ; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1649 ; X64-LABEL: broadcast_v4f64_v2f64_4u61:
1651 ; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm1
1652 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1654 %vec = load <2 x double>, <2 x double>* %vp
1655 %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 1>
1656 %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %default
1657 ret <4 x double> %res
; The select keeps shuffle lanes 0-5 and 7, but only lanes 1 (= loaded elt 1)
; and 4 (= loaded elt 0) are defined; shuffle indices 2 and 3 reference the
; undef second operand.  vbroadcastsd splats the loaded <2 x float> pair
; across all four 64-bit lanes (satisfying both defined lanes), and the blend
; keeps %default in float lanes 6-7.
1660 define <8 x float> @broadcast_v8f32_v2f32_u1uu0uEu(<2 x float>* %vp, <8 x float> %default) {
1661 ; X32-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
1663 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
1664 ; X32-NEXT: vbroadcastsd (%eax), %ymm1
1665 ; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
1668 ; X64-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
1670 ; X64-NEXT: vbroadcastsd (%rdi), %ymm1
1671 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
1673 %vec = load <2 x float>, <2 x float>* %vp
1674 %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 0, i32 2, i32 3, i32 undef>
1675 %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %default
1676 ret <8 x float> %res
; Splat the loaded <2 x double> into all four 128-bit quarters of a 512-bit
; result.  Mask element 0 is i32 3, which indexes the undef second operand
; (the leading "u" in the name), so a full subvector broadcast is legal.
; AVX/AVX2 broadcast one ymm and copy it; AVX512 emits a single
; vbroadcastf32x4 into zmm0.
1679 define <8 x double> @broadcast_v8f64_v2f64_u1u10101(<2 x double>* %vp) {
1680 ; X32-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
1682 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1683 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
1684 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1685 ; X32-AVX-NEXT: retl
1687 ; X32-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
1688 ; X32-AVX512: # %bb.0:
1689 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1690 ; X32-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1691 ; X32-AVX512-NEXT: retl
1693 ; X64-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
1695 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
1696 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1697 ; X64-AVX-NEXT: retq
1699 ; X64-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
1700 ; X64-AVX512: # %bb.0:
1701 ; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1702 ; X64-AVX512-NEXT: retq
1703 %vec = load <2 x double>, <2 x double>* %vp
1704 %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 3, i32 1, i32 undef, i32 1, i32 0, i32 1, i32 0, i32 1>
1705 ret <8 x double> %res
; As above, but with lanes 1-3 undef: the defined lanes (0 and 4-7) all
; repeat the loaded <2 x double> pair, so the same subvector-broadcast
; lowering applies — two ymm copies of vbroadcastf128 on AVX/AVX2, one
; vbroadcastf32x4 into zmm0 on AVX512.
1708 define <8 x double> @broadcast_v8f64_v2f64_0uuu0101(<2 x double>* %vp) {
1709 ; X32-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
1711 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1712 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
1713 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1714 ; X32-AVX-NEXT: retl
1716 ; X32-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
1717 ; X32-AVX512: # %bb.0:
1718 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1719 ; X32-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1720 ; X32-AVX512-NEXT: retl
1722 ; X64-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
1724 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
1725 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1726 ; X64-AVX-NEXT: retq
1728 ; X64-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
1729 ; X64-AVX512: # %bb.0:
1730 ; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1731 ; X64-AVX512-NEXT: retq
1732 %vec = load <2 x double>, <2 x double>* %vp
1733 %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 0, i32 1>
1734 ret <8 x double> %res