; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s
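
;; Floating-Point Gather/Scatter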
declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, i8*, <16 x i32>, i16, i32)
declare void @llvm.x86.avx512.scatter.dps.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
declare <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double>, i8*, <8 x i32>, i8, i32)
declare void @llvm.x86.avx512.scatter.dpd.512 (i8*, i8, <8 x i32>, <8 x double>, i32)

declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qps.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
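
; Each gather_mask_* test gathers through the given mask and then scatters the
; result through the same mask with adjusted indices, so both the gather and
; the scatter form of each instruction are exercised.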
define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dps:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
  ret void
}
define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dpd:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
  ret void
}
define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qps:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
  ret void
}
define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qpd:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
  ret void
}
;; Integer Gather/Scatter

declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, i8*, <16 x i32>, i16, i32)
declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64>, i8*, <8 x i32>, i8, i32)
declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, i8, <8 x i32>, <8 x i64>, i32)

declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)
define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dd:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
  ret void
}
define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qd:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
  ret void
}
define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qq:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
  ret void
}
define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dq:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dpq.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
  ret void
}
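
; The *_execdomain tests below check that gathers and scatters of float/double
; data are lowered to the floating-point forms (vgather*ps/pd, vmovaps/vmovapd)
; rather than to their integer-domain equivalents.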
define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
; CHECK-LABEL: gather_mask_dpd_execdomain:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
; CHECK-NEXT: vmovapd %zmm1, (%rdx)
; CHECK-NEXT: vzeroupper
  %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
  store <8 x double> %x, <8 x double>* %stbuf
  ret void
}

define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
; CHECK-LABEL: gather_mask_qpd_execdomain:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT: vmovapd %zmm1, (%rdx)
; CHECK-NEXT: vzeroupper
  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
  store <8 x double> %x, <8 x double>* %stbuf
  ret void
}
define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
; CHECK-LABEL: gather_mask_dps_execdomain:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
  %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
  ret <16 x float> %res
}

define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
; CHECK-LABEL: gather_mask_qps_execdomain:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
; CHECK-NEXT: vmovaps %ymm1, %ymm0
  %res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
  ret <8 x float> %res
}
define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_dpd_execdomain:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovapd (%rdi), %zmm1
; CHECK-NEXT: vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  %x = load <8 x double>, <8 x double>* %src, align 64
  call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
  ret void
}

define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_qpd_execdomain:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovapd (%rdi), %zmm1
; CHECK-NEXT: vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  %x = load <8 x double>, <8 x double>* %src, align 64
  call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
  ret void
}

define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_dps_execdomain:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  %x = load <16 x float>, <16 x float>* %src, align 64
  call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
  ret void
}

define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_qps_execdomain:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovaps (%rdi), %ymm1
; CHECK-NEXT: vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  %x = load <8 x float>, <8 x float>* %src, align 32
  call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
  ret void
}
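
; With a constant all-ones mask the mask register is materialized with kxnorw
; instead of being moved in from a GPR.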
define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_qps:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 -1, <8 x i64>%ind2, <8 x float> %x, i32 4)
  ret void
}
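
; For the prefetch intrinsics the last i32 operand selects the variant (3 gives
; the pf0/T0-hint form, 2 the pf1/T1-hint form) and the preceding i32 is the
; index scale that shows up in the addressing mode.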
declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8* , i32, i32);

define void @prefetch(<8 x i64> %ind, i8* %base) {
; CHECK-LABEL: prefetch:
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1}
; CHECK-NEXT: kxorw %k0, %k0, %k1
; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1}
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1}
; CHECK-NEXT: movb $120, %al
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1}
; CHECK-NEXT: vzeroupper
  call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 3)
  call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 2)
  call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, i8* %base, i32 2, i32 3)
  call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, i8* %base, i32 2, i32 2)
  ret void
}
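
;; 128-bit and 256-bit Gather (AVX512VL forms); the "div" intrinsics take qword
;; indices, the "siv" intrinsics take dword indices.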
declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64>, i8, i32)

define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div2_df:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0
  %res = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64>, i8*, <2 x i64>, i8, i32)

define <2 x i64>@test_int_x86_avx512_gather3div2_di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div2_di:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm0
  %res = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
  %res1 = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}
declare <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double>, i8*, <4 x i64>, i8, i32)

define <4 x double>@test_int_x86_avx512_gather3div4_df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_df:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,4), %ymm0 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0
  %res = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64>, i8*, <4 x i64>, i8, i32)

define <4 x i64>@test_int_x86_avx512_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_di:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0
  %res = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 8)
  %res1 = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 8)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}
declare <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float>, i8*, <2 x i64>, i8, i32)

define <4 x float>@test_int_x86_avx512_gather3div4_sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vgatherqps (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vgatherqps (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT: vaddps %xmm2, %xmm0, %xmm0
  %res = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32>, i8*, <2 x i64>, i8, i32)

define <4 x i32>@test_int_x86_avx512_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_si:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k2}
; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
  %res = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 4)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}
declare <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float>, i8*, <4 x i64>, i8, i32)

define <4 x float>@test_int_x86_avx512_gather3div8_sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vgatherqps (%rdi,%ymm1,4), %xmm0 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vgatherqps (%rdi,%ymm1,2), %xmm2 {%k1}
; CHECK-NEXT: vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
  %res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32>, i8*, <4 x i64>, i8, i32)

define <4 x i32>@test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div8_si:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm2
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT: vzeroupper
  %res = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}
declare <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double>, i8*, <4 x i32>, i8, i32)

define <2 x double>@test_int_x86_avx512_gather3siv2_df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0
  %res = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64>, i8*, <4 x i32>, i8, i32)

define <2 x i64>@test_int_x86_avx512_gather3siv2_di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv2_di:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm0
  %res = call <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res1 = call <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}
declare <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double>, i8*, <4 x i32>, i8, i32)

define <4 x double>@test_int_x86_avx512_gather3siv4_df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %ymm0 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,2), %ymm2 {%k1}
; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0
  %res = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64>, i8*, <4 x i32>, i8, i32)

define <4 x i64>@test_int_x86_avx512_gather3siv4_di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_di:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm0
  %res = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res1 = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}
declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, i8*, <4 x i32>, i8, i32)

define <4 x float>@test_int_x86_avx512_gather3siv4_sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vgatherdps (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT: vaddps %xmm2, %xmm0, %xmm0
  %res = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, i8, i32)

define <4 x i32>@test_int_x86_avx512_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_si:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k2}
; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
  %res = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 4)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}
declare <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float>, i8*, <8 x i32>, i8, i32)

define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0
  %res = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

declare <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, i8, i32)

define <8 x i32>@test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv8_si:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm2
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,2), %ymm0 {%k1}
; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
  %res = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 2)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}
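
;; 128-bit and 256-bit Scatter (AVX512VL forms)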
declare void @llvm.x86.avx512.scatterdiv2.df(i8*, i8, <2 x i64>, <2 x double>, i32)

define void@test_int_x86_avx512_scatterdiv2_df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vscatterqpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT: vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1}
  call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 -1, <2 x i64> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv2.di(i8*, i8, <2 x i64>, <2 x i64>, i32)

define void@test_int_x86_avx512_scatterdiv2_di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpscatterqq %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1}
  call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 -1, <2 x i64> %x2, <2 x i64> %x3, i32 4)
  ret void
}
declare void @llvm.x86.avx512.scatterdiv4.df(i8*, i8, <4 x i64>, <4 x double>, i32)

define void@test_int_x86_avx512_scatterdiv4_df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 -1, <4 x i64> %x2, <4 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.di(i8*, i8, <4 x i64>, <4 x i64>, i32)

define void@test_int_x86_avx512_scatterdiv4_di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i64> %x3, i32 4)
  ret void
}
declare void @llvm.x86.avx512.scatterdiv4.sf(i8*, i8, <2 x i64>, <4 x float>, i32)

define void@test_int_x86_avx512_scatterdiv4_sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1}
  call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 -1, <2 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.si(i8*, i8, <2 x i64>, <4 x i32>, i32)

define void@test_int_x86_avx512_scatterdiv4_si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1}
  call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 -1, <2 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}
declare void @llvm.x86.avx512.scatterdiv8.sf(i8*, i8, <4 x i64>, <4 x float>, i32)

define void@test_int_x86_avx512_scatterdiv8_sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 -1, <4 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv8.si(i8*, i8, <4 x i64>, <4 x i32>, i32)

define void@test_int_x86_avx512_scatterdiv8_si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}
declare void @llvm.x86.avx512.scattersiv2.df(i8*, i8, <4 x i32>, <2 x double>, i32)

define void@test_int_x86_avx512_scattersiv2_df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vscatterdpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT: vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1}
  call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 -1, <4 x i32> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv2.di(i8*, i8, <4 x i32>, <2 x i64>, i32)

define void@test_int_x86_avx512_scattersiv2_di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vpscatterdq %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT: vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1}
  call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 -1, <4 x i32> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3, i32 4)
  ret void
}
declare void @llvm.x86.avx512.scattersiv4.df(i8*, i8, <4 x i32>, <4 x double>, i32)

define void@test_int_x86_avx512_scattersiv4_df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 -1, <4 x i32> %x2, <4 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv4.di(i8*, i8, <4 x i32>, <4 x i64>, i32)

define void@test_int_x86_avx512_scattersiv4_di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3, i32 4)
  ret void
}
declare void @llvm.x86.avx512.scattersiv4.sf(i8*, i8, <4 x i32>, <4 x float>, i32)

define void@test_int_x86_avx512_scattersiv4_sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vscatterdps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1}
  call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 -1, <4 x i32> %x2, <4 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv4.si(i8*, i8, <4 x i32>, <4 x i32>, i32)

define void@test_int_x86_avx512_scattersiv4_si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
  call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i32> %x3, i32 4)
  ret void
}
declare void @llvm.x86.avx512.scattersiv8.sf(i8*, i8, <8 x i32>, <8 x float>, i32)

define void@test_int_x86_avx512_scattersiv8_sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 -1, <8 x i32> %x2, <8 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv8.si(i8*, i8, <8 x i32>, <8 x i32>, i32)

define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ret void
}
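
; Constant masks: all-ones lowers to kxnorw, all-zeroes to kxorw, and any other
; immediate is moved in through a GPR (movb/movw + kmovd).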
define void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: scatter_mask_test:
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: movb $96, %al
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT: vzeroupper
  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 96, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ret void
}
define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %base) {
; CHECK-LABEL: gather_mask_test:
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
; CHECK-NEXT: kxorw %k0, %k0, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2
; CHECK-NEXT: movw $1, %ax
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT: movw $220, %ax
; CHECK-NEXT: kmovd %eax, %k1
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0
; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
  %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4)
  %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4)
  %res2 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 1, i32 4)
  %res3 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 220, i32 4)

  %res4 = fadd <16 x float> %res, %res1
  %res5 = fadd <16 x float> %res3, %res2
  %res6 = fadd <16 x float> %res5, %res4
  ret <16 x float> %res6
}