1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X32,X32-AVX,X32-AVX1
3 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-AVX,X32-AVX2
4 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X32,X32-AVX512,X32-AVX512F
5 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X32,X32-AVX512,X32-AVX512BW
6 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X32,X32-AVX512,X32-AVX512DQ
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512,X64-AVX512F
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512,X64-AVX512BW
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512,X64-AVX512DQ
14 ; Subvector Load + Broadcast
; Loads a <2 x double> from %p and repeats it twice (shuffle mask 0,1,0,1) to
; form a <4 x double>. Every 32-bit and 64-bit run is expected to emit a single
; vbroadcastf128 from memory into ymm0.
17 define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
18 ; X32-LABEL: test_broadcast_2f64_4f64:
20 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
21 ; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
24 ; X64-LABEL: test_broadcast_2f64_4f64:
26 ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
28 %1 = load <2 x double>, <2 x double> *%p
29 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Loads a <2 x double> from %p and repeats it four times to form an
; <8 x double>. The AVX1/AVX2 runs expect a vbroadcastf128 into ymm0 followed by
; a vmovaps copy into ymm1; the AVX512 runs expect one vbroadcastf32x4 into zmm0.
33 define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
34 ; X32-AVX-LABEL: test_broadcast_2f64_8f64:
36 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
37 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
38 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
41 ; X32-AVX512-LABEL: test_broadcast_2f64_8f64:
42 ; X32-AVX512: # %bb.0:
43 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
44 ; X32-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
45 ; X32-AVX512-NEXT: retl
47 ; X64-AVX-LABEL: test_broadcast_2f64_8f64:
49 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
50 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
53 ; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
54 ; X64-AVX512: # %bb.0:
55 ; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
56 ; X64-AVX512-NEXT: retq
57 %1 = load <2 x double>, <2 x double> *%p
58 %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; Loads a <4 x double> from %p and repeats it twice to form an <8 x double>.
; The AVX1/AVX2 runs expect a plain vmovaps load into ymm0 plus a vmovaps copy
; into ymm1; the AVX512 runs expect one vbroadcastf64x4 into zmm0.
62 define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
63 ; X32-AVX-LABEL: test_broadcast_4f64_8f64:
65 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
66 ; X32-AVX-NEXT: vmovaps (%eax), %ymm0
67 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
70 ; X32-AVX512-LABEL: test_broadcast_4f64_8f64:
71 ; X32-AVX512: # %bb.0:
72 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
73 ; X32-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
74 ; X32-AVX512-NEXT: retl
76 ; X64-AVX-LABEL: test_broadcast_4f64_8f64:
78 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
79 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
82 ; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
83 ; X64-AVX512: # %bb.0:
84 ; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
85 ; X64-AVX512-NEXT: retq
86 %1 = load <4 x double>, <4 x double> *%p
87 %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Loads a <2 x i64> from %p and repeats it twice (shuffle mask 0,1,0,1) to form
; a <4 x i64>. The AVX1/AVX2 runs expect vbroadcastf128; the AVX512 runs expect
; the integer-domain vbroadcasti128.
91 define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
92 ; X32-AVX-LABEL: test_broadcast_2i64_4i64:
94 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
95 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
98 ; X32-AVX512-LABEL: test_broadcast_2i64_4i64:
99 ; X32-AVX512: # %bb.0:
100 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
101 ; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
102 ; X32-AVX512-NEXT: retl
104 ; X64-AVX-LABEL: test_broadcast_2i64_4i64:
106 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
109 ; X64-AVX512-LABEL: test_broadcast_2i64_4i64:
110 ; X64-AVX512: # %bb.0:
111 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
112 ; X64-AVX512-NEXT: retq
113 %1 = load <2 x i64>, <2 x i64> *%p
114 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Loads a <2 x i64> from %p and repeats it four times to form an <8 x i64>.
; The AVX1/AVX2 runs expect vbroadcastf128 into ymm0 plus a vmovaps copy into
; ymm1; the AVX512 runs expect one vbroadcasti32x4 into zmm0.
118 define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
119 ; X32-AVX-LABEL: test_broadcast_2i64_8i64:
121 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
122 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
123 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
126 ; X32-AVX512-LABEL: test_broadcast_2i64_8i64:
127 ; X32-AVX512: # %bb.0:
128 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
129 ; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
130 ; X32-AVX512-NEXT: retl
132 ; X64-AVX-LABEL: test_broadcast_2i64_8i64:
134 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
135 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
138 ; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
139 ; X64-AVX512: # %bb.0:
140 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
141 ; X64-AVX512-NEXT: retq
142 %1 = load <2 x i64>, <2 x i64> *%p
143 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; Loads a <4 x i64> from %p and repeats it twice to form an <8 x i64>.
; The AVX1/AVX2 runs expect a plain vmovaps load into ymm0 plus a vmovaps copy
; into ymm1; the AVX512 runs expect one vbroadcasti64x4 into zmm0.
147 define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
148 ; X32-AVX-LABEL: test_broadcast_4i64_8i64:
150 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
151 ; X32-AVX-NEXT: vmovaps (%eax), %ymm0
152 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
155 ; X32-AVX512-LABEL: test_broadcast_4i64_8i64:
156 ; X32-AVX512: # %bb.0:
157 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
158 ; X32-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
159 ; X32-AVX512-NEXT: retl
161 ; X64-AVX-LABEL: test_broadcast_4i64_8i64:
163 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
164 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
167 ; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
168 ; X64-AVX512: # %bb.0:
169 ; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
170 ; X64-AVX512-NEXT: retq
171 %1 = load <4 x i64>, <4 x i64> *%p
172 %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Loads a <4 x float> from %p and repeats it twice (shuffle mask 0..3,0..3) to
; form an <8 x float>. Every 32-bit and 64-bit run is expected to emit a single
; vbroadcastf128 from memory into ymm0.
176 define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
177 ; X32-LABEL: test_broadcast_4f32_8f32:
179 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
180 ; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
183 ; X64-LABEL: test_broadcast_4f32_8f32:
185 ; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
187 %1 = load <4 x float>, <4 x float> *%p
188 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Loads a <4 x float> from %p and repeats it four times to form a <16 x float>.
; The AVX1/AVX2 runs expect vbroadcastf128 into ymm0 plus a vmovaps copy into
; ymm1; the AVX512 runs expect one vbroadcastf32x4 into zmm0.
192 define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind {
193 ; X32-AVX-LABEL: test_broadcast_4f32_16f32:
195 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
196 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
197 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
200 ; X32-AVX512-LABEL: test_broadcast_4f32_16f32:
201 ; X32-AVX512: # %bb.0:
202 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
203 ; X32-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
204 ; X32-AVX512-NEXT: retl
206 ; X64-AVX-LABEL: test_broadcast_4f32_16f32:
208 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
209 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
212 ; X64-AVX512-LABEL: test_broadcast_4f32_16f32:
213 ; X64-AVX512: # %bb.0:
214 ; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
215 ; X64-AVX512-NEXT: retq
216 %1 = load <4 x float>, <4 x float> *%p
217 %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Loads an <8 x float> from %p and repeats it twice to form a <16 x float>.
; The AVX1/AVX2 runs expect a plain vmovaps load into ymm0 plus a vmovaps copy
; into ymm1; the AVX512 runs expect one vbroadcastf64x4 into zmm0.
221 define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
222 ; X32-AVX-LABEL: test_broadcast_8f32_16f32:
224 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
225 ; X32-AVX-NEXT: vmovaps (%eax), %ymm0
226 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
229 ; X32-AVX512-LABEL: test_broadcast_8f32_16f32:
230 ; X32-AVX512: # %bb.0:
231 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
232 ; X32-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
233 ; X32-AVX512-NEXT: retl
235 ; X64-AVX-LABEL: test_broadcast_8f32_16f32:
237 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
238 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
241 ; X64-AVX512-LABEL: test_broadcast_8f32_16f32:
242 ; X64-AVX512: # %bb.0:
243 ; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
244 ; X64-AVX512-NEXT: retq
245 %1 = load <8 x float>, <8 x float> *%p
246 %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Loads a <4 x i32> from %p and repeats it twice to form an <8 x i32>.
; The AVX1/AVX2 runs expect vbroadcastf128; the AVX512 runs expect the
; integer-domain vbroadcasti128.
250 define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
251 ; X32-AVX-LABEL: test_broadcast_4i32_8i32:
253 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
254 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
257 ; X32-AVX512-LABEL: test_broadcast_4i32_8i32:
258 ; X32-AVX512: # %bb.0:
259 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
260 ; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
261 ; X32-AVX512-NEXT: retl
263 ; X64-AVX-LABEL: test_broadcast_4i32_8i32:
265 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
268 ; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
269 ; X64-AVX512: # %bb.0:
270 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
271 ; X64-AVX512-NEXT: retq
272 %1 = load <4 x i32>, <4 x i32> *%p
273 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Loads a <4 x i32> from %p and repeats it four times to form a <16 x i32>.
; The AVX1/AVX2 runs expect vbroadcastf128 into ymm0 plus a vmovaps copy into
; ymm1; the AVX512 runs expect one vbroadcasti32x4 into zmm0.
277 define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind {
278 ; X32-AVX-LABEL: test_broadcast_4i32_16i32:
280 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
281 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
282 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
285 ; X32-AVX512-LABEL: test_broadcast_4i32_16i32:
286 ; X32-AVX512: # %bb.0:
287 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
288 ; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
289 ; X32-AVX512-NEXT: retl
291 ; X64-AVX-LABEL: test_broadcast_4i32_16i32:
293 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
294 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
297 ; X64-AVX512-LABEL: test_broadcast_4i32_16i32:
298 ; X64-AVX512: # %bb.0:
299 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
300 ; X64-AVX512-NEXT: retq
301 %1 = load <4 x i32>, <4 x i32> *%p
302 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Loads an <8 x i32> from %p and repeats it twice to form a <16 x i32>.
; The AVX1/AVX2 runs expect a plain vmovaps load into ymm0 plus a vmovaps copy
; into ymm1; the AVX512 runs expect one vbroadcasti64x4 into zmm0.
306 define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
307 ; X32-AVX-LABEL: test_broadcast_8i32_16i32:
309 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
310 ; X32-AVX-NEXT: vmovaps (%eax), %ymm0
311 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
314 ; X32-AVX512-LABEL: test_broadcast_8i32_16i32:
315 ; X32-AVX512: # %bb.0:
316 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
317 ; X32-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
318 ; X32-AVX512-NEXT: retl
320 ; X64-AVX-LABEL: test_broadcast_8i32_16i32:
322 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
323 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
326 ; X64-AVX512-LABEL: test_broadcast_8i32_16i32:
327 ; X64-AVX512: # %bb.0:
328 ; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
329 ; X64-AVX512-NEXT: retq
330 %1 = load <8 x i32>, <8 x i32> *%p
331 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Loads an <8 x i16> from %p and repeats it twice to form a <16 x i16>.
; The AVX1/AVX2 runs expect vbroadcastf128; the AVX512 runs expect the
; integer-domain vbroadcasti128.
335 define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
336 ; X32-AVX-LABEL: test_broadcast_8i16_16i16:
338 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
339 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
342 ; X32-AVX512-LABEL: test_broadcast_8i16_16i16:
343 ; X32-AVX512: # %bb.0:
344 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
345 ; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
346 ; X32-AVX512-NEXT: retl
348 ; X64-AVX-LABEL: test_broadcast_8i16_16i16:
350 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
353 ; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
354 ; X64-AVX512: # %bb.0:
355 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
356 ; X64-AVX512-NEXT: retq
357 %1 = load <8 x i16>, <8 x i16> *%p
358 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Loads an <8 x i16> from %p and repeats it four times to form a <32 x i16>.
; Expected code differs by feature set: the AVX1/AVX2 runs use vbroadcastf128
; plus a vmovaps ymm copy; the +avx512vl-only and +avx512dq runs use
; vbroadcasti128 plus a vmovdqa ymm copy; only the +avx512bw runs produce a
; single vbroadcasti32x4 into zmm0.
362 define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
363 ; X32-AVX-LABEL: test_broadcast_8i16_32i16:
365 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
366 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
367 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
370 ; X32-AVX512F-LABEL: test_broadcast_8i16_32i16:
371 ; X32-AVX512F: # %bb.0:
372 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
373 ; X32-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
374 ; X32-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
375 ; X32-AVX512F-NEXT: retl
377 ; X32-AVX512BW-LABEL: test_broadcast_8i16_32i16:
378 ; X32-AVX512BW: # %bb.0:
379 ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
380 ; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
381 ; X32-AVX512BW-NEXT: retl
383 ; X32-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
384 ; X32-AVX512DQ: # %bb.0:
385 ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
386 ; X32-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
387 ; X32-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
388 ; X32-AVX512DQ-NEXT: retl
390 ; X64-AVX-LABEL: test_broadcast_8i16_32i16:
392 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
393 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
396 ; X64-AVX512F-LABEL: test_broadcast_8i16_32i16:
397 ; X64-AVX512F: # %bb.0:
398 ; X64-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
399 ; X64-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
400 ; X64-AVX512F-NEXT: retq
402 ; X64-AVX512BW-LABEL: test_broadcast_8i16_32i16:
403 ; X64-AVX512BW: # %bb.0:
404 ; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
405 ; X64-AVX512BW-NEXT: retq
407 ; X64-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
408 ; X64-AVX512DQ: # %bb.0:
409 ; X64-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
410 ; X64-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
411 ; X64-AVX512DQ-NEXT: retq
412 %1 = load <8 x i16>, <8 x i16> *%p
413 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Loads a <16 x i16> from %p and repeats it twice to form a <32 x i16>.
; The AVX1/AVX2, +avx512vl-only and +avx512dq runs all expect a plain vmovaps
; load into ymm0 plus a vmovaps copy into ymm1; only the +avx512bw runs expect
; a single vbroadcasti64x4 into zmm0.
417 define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
418 ; X32-AVX-LABEL: test_broadcast_16i16_32i16:
420 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
421 ; X32-AVX-NEXT: vmovaps (%eax), %ymm0
422 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
425 ; X32-AVX512F-LABEL: test_broadcast_16i16_32i16:
426 ; X32-AVX512F: # %bb.0:
427 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
428 ; X32-AVX512F-NEXT: vmovaps (%eax), %ymm0
429 ; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
430 ; X32-AVX512F-NEXT: retl
432 ; X32-AVX512BW-LABEL: test_broadcast_16i16_32i16:
433 ; X32-AVX512BW: # %bb.0:
434 ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
435 ; X32-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
436 ; X32-AVX512BW-NEXT: retl
438 ; X32-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
439 ; X32-AVX512DQ: # %bb.0:
440 ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
441 ; X32-AVX512DQ-NEXT: vmovaps (%eax), %ymm0
442 ; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
443 ; X32-AVX512DQ-NEXT: retl
445 ; X64-AVX-LABEL: test_broadcast_16i16_32i16:
447 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
448 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
451 ; X64-AVX512F-LABEL: test_broadcast_16i16_32i16:
452 ; X64-AVX512F: # %bb.0:
453 ; X64-AVX512F-NEXT: vmovaps (%rdi), %ymm0
454 ; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
455 ; X64-AVX512F-NEXT: retq
457 ; X64-AVX512BW-LABEL: test_broadcast_16i16_32i16:
458 ; X64-AVX512BW: # %bb.0:
459 ; X64-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
460 ; X64-AVX512BW-NEXT: retq
462 ; X64-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
463 ; X64-AVX512DQ: # %bb.0:
464 ; X64-AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
465 ; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
466 ; X64-AVX512DQ-NEXT: retq
467 %1 = load <16 x i16>, <16 x i16> *%p
468 %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Loads a <16 x i8> from %p and repeats it twice to form a <32 x i8>.
; The AVX1/AVX2 runs expect vbroadcastf128; the AVX512 runs expect the
; integer-domain vbroadcasti128.
472 define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
473 ; X32-AVX-LABEL: test_broadcast_16i8_32i8:
475 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
476 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
479 ; X32-AVX512-LABEL: test_broadcast_16i8_32i8:
480 ; X32-AVX512: # %bb.0:
481 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
482 ; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
483 ; X32-AVX512-NEXT: retl
485 ; X64-AVX-LABEL: test_broadcast_16i8_32i8:
487 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
490 ; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
491 ; X64-AVX512: # %bb.0:
492 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
493 ; X64-AVX512-NEXT: retq
494 %1 = load <16 x i8>, <16 x i8> *%p
495 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Loads a <16 x i8> from %p and repeats it four times to form a <64 x i8>.
; Expected code differs by feature set: the AVX1/AVX2 runs use vbroadcastf128
; plus a vmovaps ymm copy; the +avx512vl-only and +avx512dq runs use
; vbroadcasti128 plus a vmovdqa ymm copy; only the +avx512bw runs produce a
; single vbroadcasti32x4 into zmm0.
499 define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
500 ; X32-AVX-LABEL: test_broadcast_16i8_64i8:
502 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
503 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
504 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
507 ; X32-AVX512F-LABEL: test_broadcast_16i8_64i8:
508 ; X32-AVX512F: # %bb.0:
509 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
510 ; X32-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
511 ; X32-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
512 ; X32-AVX512F-NEXT: retl
514 ; X32-AVX512BW-LABEL: test_broadcast_16i8_64i8:
515 ; X32-AVX512BW: # %bb.0:
516 ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
517 ; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
518 ; X32-AVX512BW-NEXT: retl
520 ; X32-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
521 ; X32-AVX512DQ: # %bb.0:
522 ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
523 ; X32-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
524 ; X32-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
525 ; X32-AVX512DQ-NEXT: retl
527 ; X64-AVX-LABEL: test_broadcast_16i8_64i8:
529 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
530 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
533 ; X64-AVX512F-LABEL: test_broadcast_16i8_64i8:
534 ; X64-AVX512F: # %bb.0:
535 ; X64-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
536 ; X64-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
537 ; X64-AVX512F-NEXT: retq
539 ; X64-AVX512BW-LABEL: test_broadcast_16i8_64i8:
540 ; X64-AVX512BW: # %bb.0:
541 ; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
542 ; X64-AVX512BW-NEXT: retq
544 ; X64-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
545 ; X64-AVX512DQ: # %bb.0:
546 ; X64-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
547 ; X64-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
548 ; X64-AVX512DQ-NEXT: retq
549 %1 = load <16 x i8>, <16 x i8> *%p
550 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Loads a <32 x i8> from %p and repeats it twice to form a <64 x i8>.
; The AVX1/AVX2, +avx512vl-only and +avx512dq runs all expect a plain vmovaps
; load into ymm0 plus a vmovaps copy into ymm1; only the +avx512bw runs expect
; a single vbroadcasti64x4 into zmm0.
554 define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
555 ; X32-AVX-LABEL: test_broadcast_32i8_64i8:
557 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
558 ; X32-AVX-NEXT: vmovaps (%eax), %ymm0
559 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
562 ; X32-AVX512F-LABEL: test_broadcast_32i8_64i8:
563 ; X32-AVX512F: # %bb.0:
564 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
565 ; X32-AVX512F-NEXT: vmovaps (%eax), %ymm0
566 ; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
567 ; X32-AVX512F-NEXT: retl
569 ; X32-AVX512BW-LABEL: test_broadcast_32i8_64i8:
570 ; X32-AVX512BW: # %bb.0:
571 ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
572 ; X32-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
573 ; X32-AVX512BW-NEXT: retl
575 ; X32-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
576 ; X32-AVX512DQ: # %bb.0:
577 ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
578 ; X32-AVX512DQ-NEXT: vmovaps (%eax), %ymm0
579 ; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
580 ; X32-AVX512DQ-NEXT: retl
582 ; X64-AVX-LABEL: test_broadcast_32i8_64i8:
584 ; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
585 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
588 ; X64-AVX512F-LABEL: test_broadcast_32i8_64i8:
589 ; X64-AVX512F: # %bb.0:
590 ; X64-AVX512F-NEXT: vmovaps (%rdi), %ymm0
591 ; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
592 ; X64-AVX512F-NEXT: retq
594 ; X64-AVX512BW-LABEL: test_broadcast_32i8_64i8:
595 ; X64-AVX512BW: # %bb.0:
596 ; X64-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
597 ; X64-AVX512BW-NEXT: retq
599 ; X64-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
600 ; X64-AVX512DQ: # %bb.0:
601 ; X64-AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
602 ; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
603 ; X64-AVX512DQ-NEXT: retq
604 %1 = load <32 x i8>, <32 x i8> *%p
605 %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
610 ; Subvector Load + Broadcast + Store
; Loads a <2 x double> from %p0, stores that same value to %p1, and repeats it
; twice to form a <4 x double>. The assertions expect an xmm load, a store of
; that register, and a vinsertf128 building ymm0 from the register rather than
; a broadcast from memory.
613 define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) {
614 ; X32-LABEL: test_broadcast_2f64_4f64_reuse:
616 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
617 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
618 ; X32-NEXT: vmovaps (%ecx), %xmm0
619 ; X32-NEXT: vmovaps %xmm0, (%eax)
620 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
623 ; X64-LABEL: test_broadcast_2f64_4f64_reuse:
625 ; X64-NEXT: vmovaps (%rdi), %xmm0
626 ; X64-NEXT: vmovaps %xmm0, (%rsi)
627 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
629 %1 = load <2 x double>, <2 x double>* %p0
630 store <2 x double> %1, <2 x double>* %p1
631 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Loads a <2 x i64> from %p0, stores that same value to %p1, and repeats it
; twice to form a <4 x i64>. The assertions expect an xmm load, a store of that
; register, and a vinsertf128 building ymm0 from the register rather than a
; broadcast from memory.
635 define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) {
636 ; X32-LABEL: test_broadcast_2i64_4i64_reuse:
638 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
639 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
640 ; X32-NEXT: vmovaps (%ecx), %xmm0
641 ; X32-NEXT: vmovaps %xmm0, (%eax)
642 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
645 ; X64-LABEL: test_broadcast_2i64_4i64_reuse:
647 ; X64-NEXT: vmovaps (%rdi), %xmm0
648 ; X64-NEXT: vmovaps %xmm0, (%rsi)
649 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
651 %1 = load <2 x i64>, <2 x i64>* %p0
652 store <2 x i64> %1, <2 x i64>* %p1
653 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Loads a <4 x float> from %p0, stores that same value to %p1, and repeats it
; twice to form an <8 x float>. The assertions expect an xmm load, a store of
; that register, and a vinsertf128 building ymm0 from the register rather than
; a broadcast from memory.
657 define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
658 ; X32-LABEL: test_broadcast_4f32_8f32_reuse:
660 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
661 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
662 ; X32-NEXT: vmovaps (%ecx), %xmm0
663 ; X32-NEXT: vmovaps %xmm0, (%eax)
664 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
667 ; X64-LABEL: test_broadcast_4f32_8f32_reuse:
669 ; X64-NEXT: vmovaps (%rdi), %xmm0
670 ; X64-NEXT: vmovaps %xmm0, (%rsi)
671 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
673 %1 = load <4 x float>, <4 x float>* %p0
674 store <4 x float> %1, <4 x float>* %p1
675 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Loads a <4 x i32> from %p0, stores that same value to %p1, and repeats it
; twice to form an <8 x i32>. The assertions expect an xmm load, a store of
; that register, and a vinsertf128 building ymm0 from the register rather than
; a broadcast from memory.
679 define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) {
680 ; X32-LABEL: test_broadcast_4i32_8i32_reuse:
682 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
683 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
684 ; X32-NEXT: vmovaps (%ecx), %xmm0
685 ; X32-NEXT: vmovaps %xmm0, (%eax)
686 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
689 ; X64-LABEL: test_broadcast_4i32_8i32_reuse:
691 ; X64-NEXT: vmovaps (%rdi), %xmm0
692 ; X64-NEXT: vmovaps %xmm0, (%rsi)
693 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
695 %1 = load <4 x i32>, <4 x i32>* %p0
696 store <4 x i32> %1, <4 x i32>* %p1
697 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Loads an <8 x i16> from %p0, stores that same value to %p1, and repeats it
; twice to form a <16 x i16>. The assertions expect an xmm load, a store of
; that register, and a vinsertf128 building ymm0 from the register rather than
; a broadcast from memory.
701 define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
702 ; X32-LABEL: test_broadcast_8i16_16i16_reuse:
704 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
705 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
706 ; X32-NEXT: vmovaps (%ecx), %xmm0
707 ; X32-NEXT: vmovaps %xmm0, (%eax)
708 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
711 ; X64-LABEL: test_broadcast_8i16_16i16_reuse:
713 ; X64-NEXT: vmovaps (%rdi), %xmm0
714 ; X64-NEXT: vmovaps %xmm0, (%rsi)
715 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
717 %1 = load <8 x i16>, <8 x i16> *%p0
718 store <8 x i16> %1, <8 x i16>* %p1
719 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Loads a <16 x i8> from %p0, stores that same value to %p1, and repeats it
; twice to form a <32 x i8>. The assertions expect an xmm load, a store of that
; register, and a vinsertf128 building ymm0 from the register rather than a
; broadcast from memory.
723 define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
724 ; X32-LABEL: test_broadcast_16i8_32i8_reuse:
726 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
727 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
728 ; X32-NEXT: vmovaps (%ecx), %xmm0
729 ; X32-NEXT: vmovaps %xmm0, (%eax)
730 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
733 ; X64-LABEL: test_broadcast_16i8_32i8_reuse:
735 ; X64-NEXT: vmovaps (%rdi), %xmm0
736 ; X64-NEXT: vmovaps %xmm0, (%rsi)
737 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
739 %1 = load <16 x i8>, <16 x i8> *%p0
740 store <16 x i8> %1, <16 x i8>* %p1
741 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
746 ; Subvector Load + Broadcast with Separate Store
; Loads a <4 x i32> from %p0, stores an unrelated zero <4 x float> to %p1, and
; repeats the loaded value twice to form an <8 x i32>. The assertions expect the
; load to still be folded into a broadcast (vbroadcastf128 on AVX1/AVX2,
; vbroadcasti128 on AVX512) that is emitted before the vmovaps store of the
; zeroed xmm1.
749 define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
750 ; X32-AVX-LABEL: test_broadcast_4i32_8i32_chain:
752 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
753 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
754 ; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
755 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
756 ; X32-AVX-NEXT: vmovaps %xmm1, (%eax)
759 ; X32-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
760 ; X32-AVX512: # %bb.0:
761 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
762 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
763 ; X32-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
764 ; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
765 ; X32-AVX512-NEXT: vmovaps %xmm1, (%eax)
766 ; X32-AVX512-NEXT: retl
768 ; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
770 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
771 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
772 ; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
775 ; X64-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
776 ; X64-AVX512: # %bb.0:
777 ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
778 ; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
779 ; X64-AVX512-NEXT: vmovaps %xmm1, (%rsi)
780 ; X64-AVX512-NEXT: retq
781 %1 = load <4 x i32>, <4 x i32>* %p0
782 store <4 x float> zeroinitializer, <4 x float>* %p1
783 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Loads a <4 x i32> from %p0, stores an unrelated zero <4 x float> to %p1, and
; repeats the loaded value four times to form a <16 x i32>. The AVX1/AVX2 runs
; expect vbroadcastf128 into ymm0, the zero store, then a vmovaps copy of ymm0
; into ymm1; the AVX512 runs expect one vbroadcasti32x4 into zmm0 followed by
; the zero store.
787 define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
788 ; X32-AVX-LABEL: test_broadcast_4i32_16i32_chain:
790 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
791 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
792 ; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
793 ; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
794 ; X32-AVX-NEXT: vmovaps %xmm1, (%eax)
795 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
798 ; X32-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
799 ; X32-AVX512: # %bb.0:
800 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
801 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
802 ; X32-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
803 ; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
804 ; X32-AVX512-NEXT: vmovaps %xmm1, (%eax)
805 ; X32-AVX512-NEXT: retl
807 ; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
809 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
810 ; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
811 ; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
812 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
815 ; X64-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
816 ; X64-AVX512: # %bb.0:
817 ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
818 ; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
819 ; X64-AVX512-NEXT: vmovaps %xmm1, (%rsi)
820 ; X64-AVX512-NEXT: retq
821 %1 = load <4 x i32>, <4 x i32>* %p0
822 store <4 x float> zeroinitializer, <4 x float>* %p1
823 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
828 ; Subvector Load with Multiple Uses + Broadcast
829 ; The fallback to the broadcast should be taken
; Zero-initialized, 8-byte-aligned global vectors. The assertions of
; fallback_broadcast_v4i64_to_v8i64 below expect its results to be stored to
; ga4 and gb4.
832 @ga4 = global <4 x i64> zeroinitializer, align 8
833 @gb4 = global <8 x i64> zeroinitializer, align 8
; Adds <1,2,3,4> to %a, and adds then ANDs %b with the same pattern repeated
; twice (<1,2,3,4,1,2,3,4>); the results are stored to @ga4 and @gb4, so the
; constant has several users.
; Expected codegen per prefix:
;  - AVX1: the adds are done on 128-bit halves with two xmm constants, and a
;    ymm constant is used for the two vandps.
;  - AVX2: one ymm constant feeds every vpaddq/vpand.
;  - X64-AVX512: the ymm constant is widened with vinserti64x4 for the zmm ops.
;  - X32-AVX512: the ymm add takes a constant-pool memory operand and the full
;    zmm constant is loaded separately.
; On the 32-bit targets the i64 constants are printed as pairs of i32 values.
835 define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
836 ; X32-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
837 ; X32-AVX1: # %bb.0: # %entry
838 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
839 ; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [3,0,4,0]
840 ; X32-AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
841 ; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,0,2,0]
842 ; X32-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
843 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
844 ; X32-AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
845 ; X32-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
846 ; X32-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm6
847 ; X32-AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
848 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
849 ; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
850 ; X32-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm4
851 ; X32-AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
852 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
853 ; X32-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
854 ; X32-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
855 ; X32-AVX1-NEXT: vmovups %ymm0, ga4
856 ; X32-AVX1-NEXT: vmovups %ymm2, gb4+32
857 ; X32-AVX1-NEXT: vmovups %ymm1, gb4
858 ; X32-AVX1-NEXT: vzeroupper
859 ; X32-AVX1-NEXT: retl
861 ; X32-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
862 ; X32-AVX2: # %bb.0: # %entry
863 ; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
864 ; X32-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
865 ; X32-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
866 ; X32-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
867 ; X32-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
868 ; X32-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
869 ; X32-AVX2-NEXT: vmovdqu %ymm0, ga4
870 ; X32-AVX2-NEXT: vmovdqu %ymm2, gb4+32
871 ; X32-AVX2-NEXT: vmovdqu %ymm1, gb4
872 ; X32-AVX2-NEXT: vzeroupper
873 ; X32-AVX2-NEXT: retl
875 ; X32-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
876 ; X32-AVX512: # %bb.0: # %entry
877 ; X32-AVX512-NEXT: vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
878 ; X32-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0]
879 ; X32-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1
880 ; X32-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
881 ; X32-AVX512-NEXT: vmovdqu %ymm0, ga4
882 ; X32-AVX512-NEXT: vmovdqu64 %zmm1, gb4
883 ; X32-AVX512-NEXT: vzeroupper
884 ; X32-AVX512-NEXT: retl
886 ; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
887 ; X64-AVX1: # %bb.0: # %entry
888 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
889 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [3,4]
890 ; X64-AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
891 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2]
892 ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
893 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
894 ; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [1,2,3,4]
895 ; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
896 ; X64-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm6
897 ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
898 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
899 ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
900 ; X64-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm4
901 ; X64-AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
902 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
903 ; X64-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
904 ; X64-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
905 ; X64-AVX1-NEXT: vmovups %ymm0, {{.*}}(%rip)
906 ; X64-AVX1-NEXT: vmovups %ymm2, gb4+{{.*}}(%rip)
907 ; X64-AVX1-NEXT: vmovups %ymm1, {{.*}}(%rip)
908 ; X64-AVX1-NEXT: vzeroupper
909 ; X64-AVX1-NEXT: retq
911 ; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
912 ; X64-AVX2: # %bb.0: # %entry
913 ; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4]
914 ; X64-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
915 ; X64-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
916 ; X64-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
917 ; X64-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
918 ; X64-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
919 ; X64-AVX2-NEXT: vmovdqu %ymm0, {{.*}}(%rip)
920 ; X64-AVX2-NEXT: vmovdqu %ymm2, gb4+{{.*}}(%rip)
921 ; X64-AVX2-NEXT: vmovdqu %ymm1, {{.*}}(%rip)
922 ; X64-AVX2-NEXT: vzeroupper
923 ; X64-AVX2-NEXT: retq
925 ; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
926 ; X64-AVX512: # %bb.0: # %entry
927 ; X64-AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,3,4]
928 ; X64-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0
929 ; X64-AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
930 ; X64-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1
931 ; X64-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
932 ; X64-AVX512-NEXT: vmovdqu %ymm0, {{.*}}(%rip)
933 ; X64-AVX512-NEXT: vmovdqu64 %zmm1, {{.*}}(%rip)
934 ; X64-AVX512-NEXT: vzeroupper
935 ; X64-AVX512-NEXT: retq
937 %0 = add <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
938 %1 = add <8 x i64> %b, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
939 %2 = and <8 x i64> %1, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
940 store <4 x i64> %0, <4 x i64>* @ga4, align 8
941 store <8 x i64> %2, <8 x i64>* @gb4, align 8
; Destination globals written by fallback_broadcast_v4f64_to_v8f64. Both are
; declared with align 8, and the expected stores to them use vmovupd.
946 @ga2 = global <4 x double> zeroinitializer, align 8
947 @gb2 = global <8 x double> zeroinitializer, align 8
; Floating-point variant: adds <1.0,2.0,3.0,4.0> to %a, and adds then divides
; %b by the same pattern repeated twice; results are stored to @ga2 and @gb2.
; Expected codegen: the AVX prefixes load the constant once into ymm3 and reuse
; it for all three vaddpd and both vdivpd; the AVX512 prefixes load it into
; ymm2 and widen it to zmm2 with vinsertf64x4 for the 512-bit operations.
949 define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) {
950 ; X32-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
951 ; X32-AVX: # %bb.0: # %entry
952 ; X32-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
953 ; X32-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0
954 ; X32-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2
955 ; X32-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1
956 ; X32-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1
957 ; X32-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2
958 ; X32-AVX-NEXT: vmovupd %ymm0, ga2
959 ; X32-AVX-NEXT: vmovupd %ymm2, gb2+32
960 ; X32-AVX-NEXT: vmovupd %ymm1, gb2
961 ; X32-AVX-NEXT: vzeroupper
964 ; X32-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
965 ; X32-AVX512: # %bb.0: # %entry
966 ; X32-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
967 ; X32-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
968 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
969 ; X32-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
970 ; X32-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
971 ; X32-AVX512-NEXT: vmovupd %ymm0, ga2
972 ; X32-AVX512-NEXT: vmovupd %zmm1, gb2
973 ; X32-AVX512-NEXT: vzeroupper
974 ; X32-AVX512-NEXT: retl
976 ; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
977 ; X64-AVX: # %bb.0: # %entry
978 ; X64-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
979 ; X64-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0
980 ; X64-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2
981 ; X64-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1
982 ; X64-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1
983 ; X64-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2
984 ; X64-AVX-NEXT: vmovupd %ymm0, {{.*}}(%rip)
985 ; X64-AVX-NEXT: vmovupd %ymm2, gb2+{{.*}}(%rip)
986 ; X64-AVX-NEXT: vmovupd %ymm1, {{.*}}(%rip)
987 ; X64-AVX-NEXT: vzeroupper
990 ; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
991 ; X64-AVX512: # %bb.0: # %entry
992 ; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
993 ; X64-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
994 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
995 ; X64-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
996 ; X64-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
997 ; X64-AVX512-NEXT: vmovupd %ymm0, {{.*}}(%rip)
998 ; X64-AVX512-NEXT: vmovupd %zmm1, {{.*}}(%rip)
999 ; X64-AVX512-NEXT: vzeroupper
1000 ; X64-AVX512-NEXT: retq
1002 %0 = fadd <4 x double> %a, <double 1.0, double 2.0, double 3.0, double 4.0>
1003 %1 = fadd <8 x double> %b, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
1004 %2 = fdiv <8 x double> %1, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
1005 store <4 x double> %0, <4 x double>* @ga2, align 8
1006 store <8 x double> %2, <8 x double>* @gb2, align 8
1011 ; Subvector Broadcast from register
; Repeats the <2 x double> register argument twice (mask 0,1,0,1) to form a
; <4 x double>. Expected on both targets: a single vinsertf128 $1 of xmm0 into
; ymm0.
1014 define <4 x double> @reg_broadcast_2f64_4f64(<2 x double> %a0) nounwind {
1015 ; X32-LABEL: reg_broadcast_2f64_4f64:
1017 ; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1018 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1021 ; X64-LABEL: reg_broadcast_2f64_4f64:
1023 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1024 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1026 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Repeats the <2 x double> register argument four times (mask 0,1 x4) to form a
; <8 x double>. Expected: the AVX prefixes build ymm0 with vinsertf128 and copy
; it to ymm1; the AVX512 prefixes follow the vinsertf128 with vinsertf64x4 to
; fill zmm0.
1030 define <8 x double> @reg_broadcast_2f64_8f64(<2 x double> %a0) nounwind {
1031 ; X32-AVX-LABEL: reg_broadcast_2f64_8f64:
1033 ; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1034 ; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1035 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1036 ; X32-AVX-NEXT: retl
1038 ; X32-AVX512-LABEL: reg_broadcast_2f64_8f64:
1039 ; X32-AVX512: # %bb.0:
1040 ; X32-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1041 ; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1042 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1043 ; X32-AVX512-NEXT: retl
1045 ; X64-AVX-LABEL: reg_broadcast_2f64_8f64:
1047 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1048 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1049 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1050 ; X64-AVX-NEXT: retq
1052 ; X64-AVX512-LABEL: reg_broadcast_2f64_8f64:
1053 ; X64-AVX512: # %bb.0:
1054 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1055 ; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1056 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1057 ; X64-AVX512-NEXT: retq
1058 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; Repeats the <4 x double> register argument twice (mask 0..3, 0..3) to form a
; <8 x double>. Expected: the AVX prefixes only copy ymm0 to ymm1; the AVX512
; prefixes use one vinsertf64x4 $1 of ymm0 into zmm0.
1062 define <8 x double> @reg_broadcast_4f64_8f64(<4 x double> %a0) nounwind {
1063 ; X32-AVX-LABEL: reg_broadcast_4f64_8f64:
1065 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1066 ; X32-AVX-NEXT: retl
1068 ; X32-AVX512-LABEL: reg_broadcast_4f64_8f64:
1069 ; X32-AVX512: # %bb.0:
1070 ; X32-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1071 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1072 ; X32-AVX512-NEXT: retl
1074 ; X64-AVX-LABEL: reg_broadcast_4f64_8f64:
1076 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1077 ; X64-AVX-NEXT: retq
1079 ; X64-AVX512-LABEL: reg_broadcast_4f64_8f64:
1080 ; X64-AVX512: # %bb.0:
1081 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1082 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1083 ; X64-AVX512-NEXT: retq
1084 %1 = shufflevector <4 x double> %a0, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Repeats the <2 x i64> register argument twice (mask 0,1,0,1) to form a
; <4 x i64>. Expected on both targets: a single vinsertf128 $1 of xmm0 into
; ymm0 (the floating-point insert form, despite the integer element type).
1088 define <4 x i64> @reg_broadcast_2i64_4i64(<2 x i64> %a0) nounwind {
1089 ; X32-LABEL: reg_broadcast_2i64_4i64:
1091 ; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1092 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1095 ; X64-LABEL: reg_broadcast_2i64_4i64:
1097 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1098 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1100 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Repeats the <2 x i64> register argument four times (mask 0,1 x4) to form a
; <8 x i64>. Expected: the AVX prefixes build ymm0 with vinsertf128 and copy it
; to ymm1; the AVX512 prefixes follow the vinsertf128 with vinsertf64x4 to fill
; zmm0.
1104 define <8 x i64> @reg_broadcast_2i64_8i64(<2 x i64> %a0) nounwind {
1105 ; X32-AVX-LABEL: reg_broadcast_2i64_8i64:
1107 ; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1108 ; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1109 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1110 ; X32-AVX-NEXT: retl
1112 ; X32-AVX512-LABEL: reg_broadcast_2i64_8i64:
1113 ; X32-AVX512: # %bb.0:
1114 ; X32-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1115 ; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1116 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1117 ; X32-AVX512-NEXT: retl
1119 ; X64-AVX-LABEL: reg_broadcast_2i64_8i64:
1121 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1122 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1123 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1124 ; X64-AVX-NEXT: retq
1126 ; X64-AVX512-LABEL: reg_broadcast_2i64_8i64:
1127 ; X64-AVX512: # %bb.0:
1128 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1129 ; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1130 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1131 ; X64-AVX512-NEXT: retq
1132 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; Repeats the <4 x i64> register argument twice (mask 0..3, 0..3) to form a
; <8 x i64>. Expected: the AVX prefixes only copy ymm0 to ymm1; the AVX512
; prefixes use one vinsertf64x4 $1 of ymm0 into zmm0.
1136 define <8 x i64> @reg_broadcast_4i64_8i64(<4 x i64> %a0) nounwind {
1137 ; X32-AVX-LABEL: reg_broadcast_4i64_8i64:
1139 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1140 ; X32-AVX-NEXT: retl
1142 ; X32-AVX512-LABEL: reg_broadcast_4i64_8i64:
1143 ; X32-AVX512: # %bb.0:
1144 ; X32-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1145 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1146 ; X32-AVX512-NEXT: retl
1148 ; X64-AVX-LABEL: reg_broadcast_4i64_8i64:
1150 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1151 ; X64-AVX-NEXT: retq
1153 ; X64-AVX512-LABEL: reg_broadcast_4i64_8i64:
1154 ; X64-AVX512: # %bb.0:
1155 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1156 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1157 ; X64-AVX512-NEXT: retq
1158 %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Repeats the <4 x float> register argument twice (mask 0..3, 0..3) to form a
; <8 x float>. Expected on both targets: a single vinsertf128 $1 of xmm0 into
; ymm0.
1162 define <8 x float> @reg_broadcast_4f32_8f32(<4 x float> %a0) nounwind {
1163 ; X32-LABEL: reg_broadcast_4f32_8f32:
1165 ; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1166 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1169 ; X64-LABEL: reg_broadcast_4f32_8f32:
1171 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1172 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1174 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Repeats the <4 x float> register argument four times (mask 0..3 x4) to form a
; <16 x float>. Expected: the AVX prefixes build ymm0 with vinsertf128 and copy
; it to ymm1; the AVX512 prefixes follow the vinsertf128 with vinsertf64x4 to
; fill zmm0.
1178 define <16 x float> @reg_broadcast_4f32_16f32(<4 x float> %a0) nounwind {
1179 ; X32-AVX-LABEL: reg_broadcast_4f32_16f32:
1181 ; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1182 ; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1183 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1184 ; X32-AVX-NEXT: retl
1186 ; X32-AVX512-LABEL: reg_broadcast_4f32_16f32:
1187 ; X32-AVX512: # %bb.0:
1188 ; X32-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1189 ; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1190 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1191 ; X32-AVX512-NEXT: retl
1193 ; X64-AVX-LABEL: reg_broadcast_4f32_16f32:
1195 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1196 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1197 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1198 ; X64-AVX-NEXT: retq
1200 ; X64-AVX512-LABEL: reg_broadcast_4f32_16f32:
1201 ; X64-AVX512: # %bb.0:
1202 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1203 ; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1204 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1205 ; X64-AVX512-NEXT: retq
1206 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Repeats the <8 x float> register argument twice (mask 0..7, 0..7) to form a
; <16 x float>. Expected: the AVX prefixes only copy ymm0 to ymm1; the AVX512
; prefixes use one vinsertf64x4 $1 of ymm0 into zmm0.
1210 define <16 x float> @reg_broadcast_8f32_16f32(<8 x float> %a0) nounwind {
1211 ; X32-AVX-LABEL: reg_broadcast_8f32_16f32:
1213 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1214 ; X32-AVX-NEXT: retl
1216 ; X32-AVX512-LABEL: reg_broadcast_8f32_16f32:
1217 ; X32-AVX512: # %bb.0:
1218 ; X32-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1219 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1220 ; X32-AVX512-NEXT: retl
1222 ; X64-AVX-LABEL: reg_broadcast_8f32_16f32:
1224 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1225 ; X64-AVX-NEXT: retq
1227 ; X64-AVX512-LABEL: reg_broadcast_8f32_16f32:
1228 ; X64-AVX512: # %bb.0:
1229 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1230 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1231 ; X64-AVX512-NEXT: retq
1232 %1 = shufflevector <8 x float> %a0, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Repeats the <4 x i32> register argument twice (mask 0..3, 0..3) to form a
; <8 x i32>. Expected on both targets: a single vinsertf128 $1 of xmm0 into
; ymm0 (the floating-point insert form, despite the integer element type).
1236 define <8 x i32> @reg_broadcast_4i32_8i32(<4 x i32> %a0) nounwind {
1237 ; X32-LABEL: reg_broadcast_4i32_8i32:
1239 ; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1240 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1243 ; X64-LABEL: reg_broadcast_4i32_8i32:
1245 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1246 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1248 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Repeats the <4 x i32> register argument four times (mask 0..3 x4) to form a
; <16 x i32>. Expected: the AVX prefixes build ymm0 with vinsertf128 and copy
; it to ymm1; the AVX512 prefixes follow the vinsertf128 with vinsertf64x4 to
; fill zmm0.
1252 define <16 x i32> @reg_broadcast_4i32_16i32(<4 x i32> %a0) nounwind {
1253 ; X32-AVX-LABEL: reg_broadcast_4i32_16i32:
1255 ; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1256 ; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1257 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1258 ; X32-AVX-NEXT: retl
1260 ; X32-AVX512-LABEL: reg_broadcast_4i32_16i32:
1261 ; X32-AVX512: # %bb.0:
1262 ; X32-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1263 ; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1264 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1265 ; X32-AVX512-NEXT: retl
1267 ; X64-AVX-LABEL: reg_broadcast_4i32_16i32:
1269 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1270 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1271 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1272 ; X64-AVX-NEXT: retq
1274 ; X64-AVX512-LABEL: reg_broadcast_4i32_16i32:
1275 ; X64-AVX512: # %bb.0:
1276 ; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1277 ; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1278 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1279 ; X64-AVX512-NEXT: retq
1280 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; Repeats the <8 x i32> register argument twice (mask 0..7, 0..7) to form a
; <16 x i32>. Expected: the AVX prefixes only copy ymm0 to ymm1; the AVX512
; prefixes use one vinsertf64x4 $1 of ymm0 into zmm0.
1284 define <16 x i32> @reg_broadcast_8i32_16i32(<8 x i32> %a0) nounwind {
1285 ; X32-AVX-LABEL: reg_broadcast_8i32_16i32:
1287 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1288 ; X32-AVX-NEXT: retl
1290 ; X32-AVX512-LABEL: reg_broadcast_8i32_16i32:
1291 ; X32-AVX512: # %bb.0:
1292 ; X32-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1293 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1294 ; X32-AVX512-NEXT: retl
1296 ; X64-AVX-LABEL: reg_broadcast_8i32_16i32:
1298 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1299 ; X64-AVX-NEXT: retq
1301 ; X64-AVX512-LABEL: reg_broadcast_8i32_16i32:
1302 ; X64-AVX512: # %bb.0:
1303 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1304 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1305 ; X64-AVX512-NEXT: retq
1306 %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Repeats the <8 x i16> register argument twice (mask 0..7, 0..7) to form a
; <16 x i16>. Expected on both targets: a single vinsertf128 $1 of xmm0 into
; ymm0.
1310 define <16 x i16> @reg_broadcast_8i16_16i16(<8 x i16> %a0) nounwind {
1311 ; X32-LABEL: reg_broadcast_8i16_16i16:
1313 ; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1314 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1317 ; X64-LABEL: reg_broadcast_8i16_16i16:
1319 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1320 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1322 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Repeats the <8 x i16> register argument four times (mask 0..7 x4) to form a
; <32 x i16>. Expected: only the AVX512BW prefixes produce a zmm result
; (vinsertf128 followed by vinsertf64x4); the AVX, AVX512F and AVX512DQ
; prefixes all use vinsertf128 plus a copy of ymm0 to ymm1.
1326 define <32 x i16> @reg_broadcast_8i16_32i16(<8 x i16> %a0) nounwind {
1327 ; X32-AVX-LABEL: reg_broadcast_8i16_32i16:
1329 ; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1330 ; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1331 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1332 ; X32-AVX-NEXT: retl
1334 ; X32-AVX512F-LABEL: reg_broadcast_8i16_32i16:
1335 ; X32-AVX512F: # %bb.0:
1336 ; X32-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1337 ; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1338 ; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
1339 ; X32-AVX512F-NEXT: retl
1341 ; X32-AVX512BW-LABEL: reg_broadcast_8i16_32i16:
1342 ; X32-AVX512BW: # %bb.0:
1343 ; X32-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1344 ; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1345 ; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1346 ; X32-AVX512BW-NEXT: retl
1348 ; X32-AVX512DQ-LABEL: reg_broadcast_8i16_32i16:
1349 ; X32-AVX512DQ: # %bb.0:
1350 ; X32-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1351 ; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1352 ; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
1353 ; X32-AVX512DQ-NEXT: retl
1355 ; X64-AVX-LABEL: reg_broadcast_8i16_32i16:
1357 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1358 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1359 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1360 ; X64-AVX-NEXT: retq
1362 ; X64-AVX512F-LABEL: reg_broadcast_8i16_32i16:
1363 ; X64-AVX512F: # %bb.0:
1364 ; X64-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1365 ; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1366 ; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
1367 ; X64-AVX512F-NEXT: retq
1369 ; X64-AVX512BW-LABEL: reg_broadcast_8i16_32i16:
1370 ; X64-AVX512BW: # %bb.0:
1371 ; X64-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1372 ; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1373 ; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1374 ; X64-AVX512BW-NEXT: retq
1376 ; X64-AVX512DQ-LABEL: reg_broadcast_8i16_32i16:
1377 ; X64-AVX512DQ: # %bb.0:
1378 ; X64-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1379 ; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1380 ; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
1381 ; X64-AVX512DQ-NEXT: retq
1382 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Repeats the <16 x i16> register argument twice (mask 0..15, 0..15) to form a
; <32 x i16>. Expected: only the AVX512BW prefixes produce a zmm result (one
; vinsertf64x4 $1 of ymm0 into zmm0); the AVX, AVX512F and AVX512DQ prefixes
; just copy ymm0 to ymm1.
1386 define <32 x i16> @reg_broadcast_16i16_32i16(<16 x i16> %a0) nounwind {
1387 ; X32-AVX-LABEL: reg_broadcast_16i16_32i16:
1389 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1390 ; X32-AVX-NEXT: retl
1392 ; X32-AVX512F-LABEL: reg_broadcast_16i16_32i16:
1393 ; X32-AVX512F: # %bb.0:
1394 ; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
1395 ; X32-AVX512F-NEXT: retl
1397 ; X32-AVX512BW-LABEL: reg_broadcast_16i16_32i16:
1398 ; X32-AVX512BW: # %bb.0:
1399 ; X32-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1400 ; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1401 ; X32-AVX512BW-NEXT: retl
1403 ; X32-AVX512DQ-LABEL: reg_broadcast_16i16_32i16:
1404 ; X32-AVX512DQ: # %bb.0:
1405 ; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
1406 ; X32-AVX512DQ-NEXT: retl
1408 ; X64-AVX-LABEL: reg_broadcast_16i16_32i16:
1410 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1411 ; X64-AVX-NEXT: retq
1413 ; X64-AVX512F-LABEL: reg_broadcast_16i16_32i16:
1414 ; X64-AVX512F: # %bb.0:
1415 ; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
1416 ; X64-AVX512F-NEXT: retq
1418 ; X64-AVX512BW-LABEL: reg_broadcast_16i16_32i16:
1419 ; X64-AVX512BW: # %bb.0:
1420 ; X64-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1421 ; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1422 ; X64-AVX512BW-NEXT: retq
1424 ; X64-AVX512DQ-LABEL: reg_broadcast_16i16_32i16:
1425 ; X64-AVX512DQ: # %bb.0:
1426 ; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
1427 ; X64-AVX512DQ-NEXT: retq
1428 %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Repeats the <16 x i8> register argument twice (mask 0..15, 0..15) to form a
; <32 x i8>. Expected on both targets: a single vinsertf128 $1 of xmm0 into
; ymm0.
1432 define <32 x i8> @reg_broadcast_16i8_32i8(<16 x i8> %a0) nounwind {
1433 ; X32-LABEL: reg_broadcast_16i8_32i8:
1435 ; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1436 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1439 ; X64-LABEL: reg_broadcast_16i8_32i8:
1441 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1442 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1444 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Repeats the <16 x i8> register argument four times (mask 0..15 x4) to form a
; <64 x i8>. Expected: only the AVX512BW prefixes produce a zmm result
; (vinsertf128 followed by vinsertf64x4); the AVX, AVX512F and AVX512DQ
; prefixes all use vinsertf128 plus a copy of ymm0 to ymm1.
1448 define <64 x i8> @reg_broadcast_16i8_64i8(<16 x i8> %a0) nounwind {
1449 ; X32-AVX-LABEL: reg_broadcast_16i8_64i8:
1451 ; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1452 ; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1453 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1454 ; X32-AVX-NEXT: retl
1456 ; X32-AVX512F-LABEL: reg_broadcast_16i8_64i8:
1457 ; X32-AVX512F: # %bb.0:
1458 ; X32-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1459 ; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1460 ; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
1461 ; X32-AVX512F-NEXT: retl
1463 ; X32-AVX512BW-LABEL: reg_broadcast_16i8_64i8:
1464 ; X32-AVX512BW: # %bb.0:
1465 ; X32-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1466 ; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1467 ; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1468 ; X32-AVX512BW-NEXT: retl
1470 ; X32-AVX512DQ-LABEL: reg_broadcast_16i8_64i8:
1471 ; X32-AVX512DQ: # %bb.0:
1472 ; X32-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1473 ; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1474 ; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
1475 ; X32-AVX512DQ-NEXT: retl
1477 ; X64-AVX-LABEL: reg_broadcast_16i8_64i8:
1479 ; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1480 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1481 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1482 ; X64-AVX-NEXT: retq
1484 ; X64-AVX512F-LABEL: reg_broadcast_16i8_64i8:
1485 ; X64-AVX512F: # %bb.0:
1486 ; X64-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1487 ; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1488 ; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
1489 ; X64-AVX512F-NEXT: retq
1491 ; X64-AVX512BW-LABEL: reg_broadcast_16i8_64i8:
1492 ; X64-AVX512BW: # %bb.0:
1493 ; X64-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1494 ; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1495 ; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1496 ; X64-AVX512BW-NEXT: retq
1498 ; X64-AVX512DQ-LABEL: reg_broadcast_16i8_64i8:
1499 ; X64-AVX512DQ: # %bb.0:
1500 ; X64-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1501 ; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1502 ; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
1503 ; X64-AVX512DQ-NEXT: retq
1504 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Repeats the <32 x i8> register argument twice (mask 0..31, 0..31) to form a
; <64 x i8>. Expected: only the AVX512BW prefixes produce a zmm result (one
; vinsertf64x4 $1 of ymm0 into zmm0); the AVX, AVX512F and AVX512DQ prefixes
; just copy ymm0 to ymm1.
1508 define <64 x i8> @reg_broadcast_32i8_64i8(<32 x i8> %a0) nounwind {
1509 ; X32-AVX-LABEL: reg_broadcast_32i8_64i8:
1511 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1512 ; X32-AVX-NEXT: retl
1514 ; X32-AVX512F-LABEL: reg_broadcast_32i8_64i8:
1515 ; X32-AVX512F: # %bb.0:
1516 ; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
1517 ; X32-AVX512F-NEXT: retl
1519 ; X32-AVX512BW-LABEL: reg_broadcast_32i8_64i8:
1520 ; X32-AVX512BW: # %bb.0:
1521 ; X32-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1522 ; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1523 ; X32-AVX512BW-NEXT: retl
1525 ; X32-AVX512DQ-LABEL: reg_broadcast_32i8_64i8:
1526 ; X32-AVX512DQ: # %bb.0:
1527 ; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
1528 ; X32-AVX512DQ-NEXT: retl
1530 ; X64-AVX-LABEL: reg_broadcast_32i8_64i8:
1532 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1533 ; X64-AVX-NEXT: retq
1535 ; X64-AVX512F-LABEL: reg_broadcast_32i8_64i8:
1536 ; X64-AVX512F: # %bb.0:
1537 ; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
1538 ; X64-AVX512F-NEXT: retq
1540 ; X64-AVX512BW-LABEL: reg_broadcast_32i8_64i8:
1541 ; X64-AVX512BW: # %bb.0:
1542 ; X64-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1543 ; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1544 ; X64-AVX512BW-NEXT: retq
1546 ; X64-AVX512DQ-LABEL: reg_broadcast_32i8_64i8:
1547 ; X64-AVX512DQ: # %bb.0:
1548 ; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
1549 ; X64-AVX512DQ-NEXT: retq
1550 %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; Loads a <2 x i32> from %vp and repeats it twice (mask 0,1,0,1) to form a
; <4 x i32>. Expected on both targets: the load and shuffle fold into one
; vmovddup from memory (X32 first loads the pointer from the stack).
1558 define <4 x i32> @test_2xi32_to_4xi32_mem(<2 x i32>* %vp) {
1559 ; X32-LABEL: test_2xi32_to_4xi32_mem:
1561 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
1562 ; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
1565 ; X64-LABEL: test_2xi32_to_4xi32_mem:
1567 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
1569 %vec = load <2 x i32>, <2 x i32>* %vp
1570 %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; Loads a <2 x i32> from %vp and repeats it four times (mask 0,1 x4) to form a
; <8 x i32>. Expected on both targets: the load and shuffle fold into one
; vbroadcastsd from memory into ymm0.
1574 define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) {
1575 ; X32-LABEL: test_2xi32_to_8xi32_mem:
1577 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
1578 ; X32-NEXT: vbroadcastsd (%eax), %ymm0
1581 ; X64-LABEL: test_2xi32_to_8xi32_mem:
1583 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0
1585 %vec = load <2 x i32>, <2 x i32>* %vp
1586 %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; Loads a <2 x i32> from %vp and repeats it eight times (mask 0,1 x8) to form a
; <16 x i32>. Expected: the AVX prefixes use vbroadcastsd into ymm0 and copy it
; to ymm1; the AVX512 prefixes use a single vbroadcastsd into zmm0.
1590 define <16 x i32> @test_2xi32_to_16xi32_mem(<2 x i32>* %vp) {
1591 ; X32-AVX-LABEL: test_2xi32_to_16xi32_mem:
1593 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1594 ; X32-AVX-NEXT: vbroadcastsd (%eax), %ymm0
1595 ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
1596 ; X32-AVX-NEXT: retl
1598 ; X32-AVX512-LABEL: test_2xi32_to_16xi32_mem:
1599 ; X32-AVX512: # %bb.0:
1600 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1601 ; X32-AVX512-NEXT: vbroadcastsd (%eax), %zmm0
1602 ; X32-AVX512-NEXT: retl
1604 ; X64-AVX-LABEL: test_2xi32_to_16xi32_mem:
1606 ; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm0
1607 ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
1608 ; X64-AVX-NEXT: retq
1610 ; X64-AVX512-LABEL: test_2xi32_to_16xi32_mem:
1611 ; X64-AVX512: # %bb.0:
1612 ; X64-AVX512-NEXT: vbroadcastsd (%rdi), %zmm0
1613 ; X64-AVX512-NEXT: retq
1614 %vec = load <2 x i32>, <2 x i32>* %vp
1615 %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; Loads a scalar double, inserts it at element 0 of an otherwise-undef
; <2 x double>, then shuffles with mask <1,0,0,0>. Result lane 0 reads the
; never-written element 1 (undef) and lanes 1-3 read the loaded value.
; Expected on both targets: one vbroadcastsd from memory into ymm0.
1623 define <4 x double> @broadcast_v4f64_f64_u000(double* %p) {
1624 ; X32-LABEL: broadcast_v4f64_f64_u000:
1626 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
1627 ; X32-NEXT: vbroadcastsd (%eax), %ymm0
1630 ; X64-LABEL: broadcast_v4f64_f64_u000:
1632 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0
1634 %s = load double, double* %p
1635 %vec = insertelement <2 x double> undef, double %s, i32 0
1636 %res = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
1637 ret <4 x double> %res
; Loads a <2 x double>, shuffles it with mask <0,3,undef,1> (index 3 selects
; from the undef second operand), then selects %shuf in lanes 1 and 3 and
; %default in lanes 0 and 2. Only lane 3 (element 1 of the load) carries a
; defined value from the shuffle.
; Expected on both targets: vinsertf128 $1 from memory into ymm1, then a
; vblendps that takes elements 6-7 from ymm1 and the rest from ymm0.
1640 define <4 x double> @broadcast_v4f64_v2f64_4u61(<2 x double>* %vp, <4 x double> %default) {
1641 ; X32-LABEL: broadcast_v4f64_v2f64_4u61:
1643 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
1644 ; X32-NEXT: vinsertf128 $1, (%eax), %ymm0, %ymm1
1645 ; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1648 ; X64-LABEL: broadcast_v4f64_v2f64_4u61:
1650 ; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm1
1651 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1653 %vec = load <2 x double>, <2 x double>* %vp
1654 %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 1>
1655 %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %default
1656 ret <4 x double> %res
; 64-bit (<2 x float>) load widened to 8 lanes by a shuffle, then merged with
; %default by a constant select.
; Shuffle mask <u,1,u,u,0,2,3,u>: indices 2 and 3 address the undef second
; operand, so only lanes 1 and 4 of %shuf carry loaded data.
; Select mask <1,1,1,1,1,1,0,1>: only lane 6 is taken from %default (the "E"
; in the test name); every other lane comes from %shuf.
; Expected codegen differs per feature level:
;   AVX1         - vmovsd load, vinsertf128 duplicating the low half, vblendps.
;   AVX2/AVX512  - vmovsd load, vblendps, then vpermpd with [0,1,0,3].
1659 define <8 x float> @broadcast_v8f32_v2f32_u1uu0uEu(<2 x float>* %vp, <8 x float> %default) {
1660 ; X32-AVX1-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
1661 ; X32-AVX1: # %bb.0:
1662 ; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
1663 ; X32-AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1664 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
1665 ; X32-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
1666 ; X32-AVX1-NEXT: retl
1668 ; X32-AVX2-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
1669 ; X32-AVX2: # %bb.0:
1670 ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
1671 ; X32-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1672 ; X32-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
1673 ; X32-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
1674 ; X32-AVX2-NEXT: retl
1676 ; X32-AVX512-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
1677 ; X32-AVX512: # %bb.0:
1678 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1679 ; X32-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1680 ; X32-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
1681 ; X32-AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
1682 ; X32-AVX512-NEXT: retl
1684 ; X64-AVX1-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
1685 ; X64-AVX1: # %bb.0:
1686 ; X64-AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1687 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
1688 ; X64-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
1689 ; X64-AVX1-NEXT: retq
1691 ; X64-AVX2-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
1692 ; X64-AVX2: # %bb.0:
1693 ; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1694 ; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
1695 ; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
1696 ; X64-AVX2-NEXT: retq
1698 ; X64-AVX512-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
1699 ; X64-AVX512: # %bb.0:
1700 ; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1701 ; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
1702 ; X64-AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3]
1703 ; X64-AVX512-NEXT: retq
1704 %vec = load <2 x float>, <2 x float>* %vp
1705 %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 0, i32 2, i32 3, i32 undef>
1706 %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %default
1707 ret <8 x float> %res
; 128-bit load repeated across a 512-bit result, with two undefined lanes.
; Shuffle mask <3,1,undef,1,0,1,0,1>: index 3 addresses the undef second
; operand and lane 2 is explicitly undef, giving the "u1u10101" pattern in the
; test name.
; Expected codegen differs per feature level:
;   AVX1    - vbroadcastf128 into ymm0, then vmovaps copies ymm0 to ymm1.
;   AVX2    - vmovaps load to xmm0, vinsertf128 builds ymm1, vpermpd with
;             [0,1,2,1] builds ymm0.
;   AVX512  - a single vbroadcastf32x4 from memory into zmm0.
1710 define <8 x double> @broadcast_v8f64_v2f64_u1u10101(<2 x double>* %vp) {
1711 ; X32-AVX1-LABEL: broadcast_v8f64_v2f64_u1u10101:
1712 ; X32-AVX1: # %bb.0:
1713 ; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
1714 ; X32-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
1715 ; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
1716 ; X32-AVX1-NEXT: retl
1718 ; X32-AVX2-LABEL: broadcast_v8f64_v2f64_u1u10101:
1719 ; X32-AVX2: # %bb.0:
1720 ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
1721 ; X32-AVX2-NEXT: vmovaps (%eax), %xmm0
1722 ; X32-AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
1723 ; X32-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
1724 ; X32-AVX2-NEXT: retl
1726 ; X32-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
1727 ; X32-AVX512: # %bb.0:
1728 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1729 ; X32-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1730 ; X32-AVX512-NEXT: retl
1732 ; X64-AVX1-LABEL: broadcast_v8f64_v2f64_u1u10101:
1733 ; X64-AVX1: # %bb.0:
1734 ; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
1735 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
1736 ; X64-AVX1-NEXT: retq
1738 ; X64-AVX2-LABEL: broadcast_v8f64_v2f64_u1u10101:
1739 ; X64-AVX2: # %bb.0:
1740 ; X64-AVX2-NEXT: vmovaps (%rdi), %xmm0
1741 ; X64-AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
1742 ; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
1743 ; X64-AVX2-NEXT: retq
1745 ; X64-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
1746 ; X64-AVX512: # %bb.0:
1747 ; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1748 ; X64-AVX512-NEXT: retq
1749 %vec = load <2 x double>, <2 x double>* %vp
1750 %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 3, i32 1, i32 undef, i32 1, i32 0, i32 1, i32 0, i32 1>
1751 ret <8 x double> %res
; 128-bit load repeated across a 512-bit result, with lanes 1-3 undefined.
; Shuffle mask <0,undef,undef,undef,0,1,0,1>: only lane 0 of the low 256 bits
; is defined, while the high 256 bits repeat the loaded pair twice.
; Expected codegen:
;   AVX (the shared prefix of the +avx and +avx2 RUN lines) - vmovaps load to
;       xmm0, then vinsertf128 builds ymm1 from it.
;   AVX512 - the same two instructions, then vinsertf64x4 places ymm1 in the
;       upper half of zmm0.
1754 define <8 x double> @broadcast_v8f64_v2f64_0uuu0101(<2 x double>* %vp) {
1755 ; X32-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
1757 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1758 ; X32-AVX-NEXT: vmovaps (%eax), %xmm0
1759 ; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
1760 ; X32-AVX-NEXT: retl
1762 ; X32-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
1763 ; X32-AVX512: # %bb.0:
1764 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1765 ; X32-AVX512-NEXT: vmovaps (%eax), %xmm0
1766 ; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
1767 ; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
1768 ; X32-AVX512-NEXT: retl
1770 ; X64-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
1772 ; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
1773 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
1774 ; X64-AVX-NEXT: retq
1776 ; X64-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
1777 ; X64-AVX512: # %bb.0:
1778 ; X64-AVX512-NEXT: vmovaps (%rdi), %xmm0
1779 ; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
1780 ; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
1781 ; X64-AVX512-NEXT: retq
1782 %vec = load <2 x double>, <2 x double>* %vp
1783 %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 0, i32 1>
1784 ret <8 x double> %res