; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=X64-AVX512 --check-prefix=X64-AVX512VL
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X64-AVX512 --check-prefix=X64-AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64-AVX512 --check-prefix=X64-AVX512DQVL
;
; 128-bit Subvector Broadcast to 256-bit
;

define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_2f64_4f64:
; X64-AVX512:         ## %bb.0:
; X64-AVX512-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512-NEXT:    retq
 %1 = load <2 x double>, <2 x double> *%p
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 %3 = fadd <4 x double> %2, <double 1.0, double 2.0, double 3.0, double 4.0>
 ret <4 x double> %3
}

define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_2i64_4i64:
; X64-AVX512:         ## %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512-NEXT:    retq
 %1 = load <2 x i64>, <2 x i64> *%p
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 %3 = add <4 x i64> %2, <i64 1, i64 2, i64 3, i64 4>
 ret <4 x i64> %3
}

define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_4f32_8f32:
; X64-AVX512:         ## %bb.0:
; X64-AVX512-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512-NEXT:    retq
 %1 = load <4 x float>, <4 x float> *%p
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 %3 = fadd <8 x float> %2, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>
 ret <8 x float> %3
}

define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
; X64-AVX512:         ## %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i32>, <4 x i32> *%p
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 %3 = add <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
 ret <8 x i32> %3
}

define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
; X64-AVX512:         ## %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512-NEXT:    retq
 %1 = load <8 x i16>, <8 x i16> *%p
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 %3 = add <16 x i16> %2, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16>
 ret <16 x i16> %3
}

define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
; X64-AVX512:         ## %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512-NEXT:    retq
 %1 = load <16 x i8>, <16 x i8> *%p
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 %3 = add <32 x i8> %2, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32>
 ret <32 x i8> %3
}
;
; 128-bit Subvector Broadcast to 512-bit
;

define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
; X64-AVX512:         ## %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    vaddpd {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = load <2 x double>, <2 x double> *%p
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 %3 = fadd <8 x double> %2, <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0>
 ret <8 x double> %3
}

define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
; X64-AVX512:         ## %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = load <2 x i64>, <2 x i64> *%p
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 %3 = add <8 x i64> %2, <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8>
 ret <8 x i64> %3
}

define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_4f32_16f32:
; X64-AVX512:         ## %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = load <4 x float>, <4 x float> *%p
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 %3 = fadd <16 x float> %2, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>
 ret <16 x float> %3
}

define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_4i32_16i32:
; X64-AVX512:         ## %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i32>, <4 x i32> *%p
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 %3 = add <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
 ret <16 x i32> %3
}
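
; Note: 512-bit vpaddw/vpaddb require AVX512BW. In the v32i16/v64i8 tests
; below, the plain VL and DQVL runs therefore split the add into two 256-bit
; vpadds and reassemble the zmm result with vinserti64x4, while the BWVL run
; keeps the arithmetic in a single 512-bit add.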
define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
; X64-AVX512VL-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512VL:         ## %bb.0:
; X64-AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512VL-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm1
; X64-AVX512VL-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; X64-AVX512VL-NEXT:    retq
;
; X64-AVX512BWVL-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512BWVL:         ## %bb.0:
; X64-AVX512BWVL-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BWVL-NEXT:    vpaddw {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512BWVL-NEXT:    retq
;
; X64-AVX512DQVL-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512DQVL:         ## %bb.0:
; X64-AVX512DQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512DQVL-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm1
; X64-AVX512DQVL-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512DQVL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; X64-AVX512DQVL-NEXT:    retq
 %1 = load <8 x i16>, <8 x i16> *%p
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 %3 = add <32 x i16> %2, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23, i16 24, i16 25, i16 26, i16 27, i16 28, i16 29, i16 30, i16 31, i16 32>
 ret <32 x i16> %3
}

define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
; X64-AVX512VL-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512VL:         ## %bb.0:
; X64-AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512VL-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm1
; X64-AVX512VL-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; X64-AVX512VL-NEXT:    retq
;
; X64-AVX512BWVL-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512BWVL:         ## %bb.0:
; X64-AVX512BWVL-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BWVL-NEXT:    vpaddb {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512BWVL-NEXT:    retq
;
; X64-AVX512DQVL-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512DQVL:         ## %bb.0:
; X64-AVX512DQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512DQVL-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm1
; X64-AVX512DQVL-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512DQVL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; X64-AVX512DQVL-NEXT:    retq
 %1 = load <16 x i8>, <16 x i8> *%p
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 %3 = add <64 x i8> %2, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39, i8 40, i8 41, i8 42, i8 43, i8 44, i8 45, i8 46, i8 47, i8 48, i8 49, i8 50, i8 51, i8 52, i8 53, i8 54, i8 55, i8 56, i8 57, i8 58, i8 59, i8 60, i8 61, i8 62, i8 63, i8 64>
 ret <64 x i8> %3
}
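
; PR29088: check that the 128-bit subvector broadcast is still formed (with
; the load folded into vbroadcasti128) when an unrelated zero store sits
; between the load and the shuffle.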
define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) {
; X64-AVX512-LABEL: PR29088:
; X64-AVX512:         ## %bb.0:
; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovaps %ymm1, (%rsi)
; X64-AVX512-NEXT:    retq
 %ld = load <4 x i32>, <4 x i32>* %p0
 store <8 x float> zeroinitializer, <8 x float>* %p1
 %shuf = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i32> %shuf
}