; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s
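
;; Floating-Point Gather/Scatter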

declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, ptr, <16 x i32>, i16, i32)
declare void @llvm.x86.avx512.scatter.dps.512 (ptr, i16, <16 x i32>, <16 x float>, i32)
declare <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double>, ptr, <8 x i32>, i8, i32)
declare void @llvm.x86.avx512.scatter.dpd.512 (ptr, i8, <8 x i32>, <8 x double>, i32)

declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float>, ptr, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qps.512 (ptr, i8, <8 x i64>, <8 x float>, i32)
declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double>, ptr, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpd.512 (ptr, i8, <8 x i64>, <8 x double>, i32)

define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_dps:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, ptr %base, <16 x i32> %ind, i16 %mask, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dps.512 (ptr %stbuf, i16 %mask, <16 x i32> %ind2, <16 x float> %x, i32 4)
  ret void
}

define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_dpd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, ptr %base, <8 x i32> %ind, i8 %mask, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dpd.512 (ptr %stbuf, i8 %mask, <8 x i32> %ind2, <8 x double> %x, i32 4)
  ret void
}

define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_qps:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, ptr %base, <8 x i64> %ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qps.512 (ptr %stbuf, i8 %mask, <8 x i64> %ind2, <8 x float> %x, i32 4)
  ret void
}

define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_qpd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, ptr %base, <8 x i64> %ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qpd.512 (ptr %stbuf, i8 %mask, <8 x i64> %ind2, <8 x double> %x, i32 4)
  ret void
}

;; Integer Gather/Scatter

declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, ptr, <16 x i32>, i16, i32)
declare void @llvm.x86.avx512.scatter.dpi.512 (ptr, i16, <16 x i32>, <16 x i32>, i32)
declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64>, ptr, <8 x i32>, i8, i32)
declare void @llvm.x86.avx512.scatter.dpq.512 (ptr, i8, <8 x i32>, <8 x i64>, i32)

declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32>, ptr, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpi.512 (ptr, i8, <8 x i64>, <8 x i32>, i32)
declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, ptr, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpq.512 (ptr, i8, <8 x i64>, <8 x i64>, i32)

define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_dd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, ptr %base, <16 x i32> %ind, i16 %mask, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dpi.512 (ptr %stbuf, i16 %mask, <16 x i32> %ind2, <16 x i32> %x, i32 4)
  ret void
}

define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_qd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, ptr %base, <8 x i64> %ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qpi.512 (ptr %stbuf, i8 %mask, <8 x i64> %ind2, <8 x i32> %x, i32 4)
  ret void
}

define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_qq:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, ptr %base, <8 x i64> %ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qpq.512 (ptr %stbuf, i8 %mask, <8 x i64> %ind2, <8 x i64> %x, i32 4)
  ret void
}

define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_dq:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, ptr %base, <8 x i32> %ind, i8 %mask, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dpq.512 (ptr %stbuf, i8 %mask, <8 x i32> %ind2, <8 x i64> %x, i32 4)
  ret void
}
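
; The *_execdomain tests below verify that FP gathers and scatters stay in
; the floating-point execution domain: values move through vmovaps/vmovapd
; rather than vmovdqa.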

define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_dpd_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, ptr %base, <8 x i32> %ind, i8 %mask, i32 4)
  store <8 x double> %x, ptr %stbuf
  ret void
}

define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_qpd_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, ptr %base, <8 x i64> %ind, i8 %mask, i32 4)
  store <8 x double> %x, ptr %stbuf
  ret void
}

define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, ptr %base) {
; CHECK-LABEL: gather_mask_dps_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, ptr %base, <16 x i32> %ind, i16 %mask, i32 4)
  ret <16 x float> %res
}

define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, ptr %base) {
; CHECK-LABEL: gather_mask_qps_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, ptr %base, <8 x i64> %ind, i8 %mask, i32 4)
  ret <8 x float> %res
}

define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, ptr %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: scatter_mask_dpd_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm1
; CHECK-NEXT:    vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <8 x double>, ptr %src, align 64
  call void @llvm.x86.avx512.scatter.dpd.512 (ptr %stbuf, i8 %mask, <8 x i32> %ind, <8 x double> %x, i32 4)
  ret void
}

define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, ptr %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: scatter_mask_qpd_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm1
; CHECK-NEXT:    vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <8 x double>, ptr %src, align 64
  call void @llvm.x86.avx512.scatter.qpd.512 (ptr %stbuf, i8 %mask, <8 x i64> %ind, <8 x double> %x, i32 4)
  ret void
}

define void @scatter_mask_dps_execdomain(<16 x i32> %ind, ptr %src, i16 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: scatter_mask_dps_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm1
; CHECK-NEXT:    vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <16 x float>, ptr %src, align 64
  call void @llvm.x86.avx512.scatter.dps.512 (ptr %stbuf, i16 %mask, <16 x i32> %ind, <16 x float> %x, i32 4)
  ret void
}

define void @scatter_mask_qps_execdomain(<8 x i64> %ind, ptr %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: scatter_mask_qps_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %ymm1
; CHECK-NEXT:    vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <8 x float>, ptr %src, align 32
  call void @llvm.x86.avx512.scatter.qps.512 (ptr %stbuf, i8 %mask, <8 x i64> %ind, <8 x float> %x, i32 4)
  ret void
}
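
; An all-ones mask is materialized with kxnorw rather than a kmov from a GPR.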

define void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_qps:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, ptr %base, <8 x i64> %ind, i8 -1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qps.512 (ptr %stbuf, i8 -1, <8 x i64> %ind2, <8 x float> %x, i32 4)
  ret void
}
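
;; 128/256-bit Gather, Qword Indices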

declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, ptr, <2 x i64>, i8, i32)

define <2 x double> @test_int_x86_avx512_gather3div2_df(<2 x double> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div2_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, ptr %x1, <2 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, ptr %x1, <2 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64>, ptr, <2 x i64>, i8, i32)

define <2 x i64> @test_int_x86_avx512_gather3div2_di(<2 x i64> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div2_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, ptr %x1, <2 x i64> %x2, i8 %x3, i32 8)
  %res1 = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, ptr %x1, <2 x i64> %x2, i8 %x3, i32 8)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

declare <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double>, ptr, <4 x i64>, i8, i32)

define <4 x double> @test_int_x86_avx512_gather3div4_df(<4 x double> %x0, ptr %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, ptr %x1, <4 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, ptr %x1, <4 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64>, ptr, <4 x i64>, i8, i32)

define <4 x i64> @test_int_x86_avx512_gather3div4_di(<4 x i64> %x0, ptr %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
; CHECK-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, ptr %x1, <4 x i64> %x2, i8 %x3, i32 8)
  %res1 = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, ptr %x1, <4 x i64> %x2, i8 -1, i32 8)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

declare <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float>, ptr, <2 x i64>, i8, i32)

define <4 x float> @test_int_x86_avx512_gather3div4_sf(<4 x float> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, ptr %x1, <2 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, ptr %x1, <2 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32>, ptr, <2 x i64>, i8, i32)

define <4 x i32> @test_int_x86_avx512_gather3div4_si(<4 x i32> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k2}
; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, ptr %x1, <2 x i64> %x2, i8 -1, i32 4)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, ptr %x1, <2 x i64> %x2, i8 %x3, i32 4)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float>, ptr, <4 x i64>, i8, i32)

define <4 x float> @test_int_x86_avx512_gather3div8_sf(<4 x float> %x0, ptr %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, ptr %x1, <4 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, ptr %x1, <4 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32>, ptr, <4 x i64>, i8, i32)

define <4 x i32> @test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, ptr %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div8_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa %xmm0, %xmm2
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, ptr %x1, <4 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, ptr %x1, <4 x i64> %x2, i8 %x3, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}
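
;; 128/256-bit Gather, Dword Indices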

declare <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double>, ptr, <4 x i32>, i8, i32)

define <2 x double> @test_int_x86_avx512_gather3siv2_df(<2 x double> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, ptr %x1, <4 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, ptr %x1, <4 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64>, ptr, <4 x i32>, i8, i32)

define <2 x i64> @test_int_x86_avx512_gather3siv2_di(<2 x i64> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv2_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, ptr %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res1 = call <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, ptr %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

declare <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double>, ptr, <4 x i32>, i8, i32)

define <4 x double> @test_int_x86_avx512_gather3siv4_df(<4 x double> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, ptr %x1, <4 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, ptr %x1, <4 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64>, ptr, <4 x i32>, i8, i32)

define <4 x i64> @test_int_x86_avx512_gather3siv4_di(<4 x i64> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, ptr %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res1 = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, ptr %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, ptr, <4 x i32>, i8, i32)

define <4 x float> @test_int_x86_avx512_gather3siv4_sf(<4 x float> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, ptr %x1, <4 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, ptr %x1, <4 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32>, ptr, <4 x i32>, i8, i32)

define <4 x i32> @test_int_x86_avx512_gather3siv4_si(<4 x i32> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k2}
; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, ptr %x1, <4 x i32> %x2, i8 -1, i32 4)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, ptr %x1, <4 x i32> %x2, i8 %x3, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float>, ptr, <8 x i32>, i8, i32)

define <8 x float> @test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, ptr %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, ptr %x1, <8 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, ptr %x1, <8 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

declare <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32>, ptr, <8 x i32>, i8, i32)

define <8 x i32> @test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, ptr %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv8_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa %ymm0, %ymm2
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,2), %ymm0 {%k1}
; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, ptr %x1, <8 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, ptr %x1, <8 x i32> %x2, i8 %x3, i32 2)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}
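
;; 128/256-bit Scatter, Qword Indices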

declare void @llvm.x86.avx512.scatterdiv2.df(ptr, i8, <2 x i64>, <2 x double>, i32)

define void @test_int_x86_avx512_scatterdiv2_df(ptr %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv2.df(ptr %x0, i8 -1, <2 x i64> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv2.df(ptr %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv2.di(ptr, i8, <2 x i64>, <2 x i64>, i32)

define void @test_int_x86_avx512_scatterdiv2_di(ptr %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv2.di(ptr %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv2.di(ptr %x0, i8 -1, <2 x i64> %x2, <2 x i64> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.df(ptr, i8, <4 x i64>, <4 x double>, i32)

define void @test_int_x86_avx512_scatterdiv4_df(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv4.df(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.df(ptr %x0, i8 -1, <4 x i64> %x2, <4 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.di(ptr, i8, <4 x i64>, <4 x i64>, i32)

define void @test_int_x86_avx512_scatterdiv4_di(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv4.di(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.di(ptr %x0, i8 -1, <4 x i64> %x2, <4 x i64> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.sf(ptr, i8, <2 x i64>, <4 x float>, i32)

define void @test_int_x86_avx512_scatterdiv4_sf(ptr %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv4.sf(ptr %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.sf(ptr %x0, i8 -1, <2 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.si(ptr, i8, <2 x i64>, <4 x i32>, i32)

define void @test_int_x86_avx512_scatterdiv4_si(ptr %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv4.si(ptr %x0, i8 -1, <2 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.si(ptr %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv8.sf(ptr, i8, <4 x i64>, <4 x float>, i32)

define void @test_int_x86_avx512_scatterdiv8_sf(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv8.sf(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv8.sf(ptr %x0, i8 -1, <4 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv8.si(ptr, i8, <4 x i64>, <4 x i32>, i32)

define void @test_int_x86_avx512_scatterdiv8_si(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv8.si(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv8.si(ptr %x0, i8 -1, <4 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}
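
;; 128/256-bit Scatter, Dword Indices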

declare void @llvm.x86.avx512.scattersiv2.df(ptr, i8, <4 x i32>, <2 x double>, i32)

define void @test_int_x86_avx512_scattersiv2_df(ptr %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv2.df(ptr %x0, i8 -1, <4 x i32> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv2.df(ptr %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv2.di(ptr, i8, <4 x i32>, <2 x i64>, i32)

define void @test_int_x86_avx512_scattersiv2_di(ptr %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv2.di(ptr %x0, i8 -1, <4 x i32> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv2.di(ptr %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv4.df(ptr, i8, <4 x i32>, <4 x double>, i32)

define void @test_int_x86_avx512_scattersiv4_df(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv4.df(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.df(ptr %x0, i8 -1, <4 x i32> %x2, <4 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv4.di(ptr, i8, <4 x i32>, <4 x i64>, i32)

define void @test_int_x86_avx512_scattersiv4_di(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv4.di(ptr %x0, i8 -1, <4 x i32> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.di(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv4.sf(ptr, i8, <4 x i32>, <4 x float>, i32)

define void @test_int_x86_avx512_scattersiv4_sf(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv4.sf(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.sf(ptr %x0, i8 -1, <4 x i32> %x2, <4 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv4.si(ptr, i8, <4 x i32>, <4 x i32>, i32)

define void @test_int_x86_avx512_scattersiv4_si(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv4.si(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.si(ptr %x0, i8 -1, <4 x i32> %x2, <4 x i32> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv8.sf(ptr, i8, <8 x i32>, <8 x float>, i32)

define void @test_int_x86_avx512_scattersiv8_sf(ptr %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv8.sf(ptr %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.sf(ptr %x0, i8 -1, <8 x i32> %x2, <8 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv8.si(ptr, i8, <8 x i32>, <8 x i32>, i32)

define void @test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv8.si(ptr %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.si(ptr %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ret void
}
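
; Constant scatter masks: all-ones lowers to kxnorw, zero to kxorw, and other
; immediates are loaded into a GPR and moved with kmovd.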

define void @scatter_mask_test(ptr %x0, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: scatter_mask_test:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    movb $1, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    movb $96, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv8.si(ptr %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.si(ptr %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  call void @llvm.x86.avx512.scattersiv8.si(ptr %x0, i8 1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.si(ptr %x0, i8 96, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ret void
}
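
; Same checks for constant gather masks (-1, 0, 1 and 220).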

define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, ptr %base) {
; CHECK-LABEL: gather_mask_test:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm2
; CHECK-NEXT:    movw $1, %ax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT:    movw $220, %ax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vaddps %zmm3, %zmm1, %zmm0
; CHECK-NEXT:    vaddps %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, ptr %base, <16 x i32> %ind, i16 -1, i32 4)
  %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, ptr %base, <16 x i32> %ind, i16 0, i32 4)
  %res2 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, ptr %base, <16 x i32> %ind, i16 1, i32 4)
  %res3 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, ptr %base, <16 x i32> %ind, i16 220, i32 4)

  %res4 = fadd <16 x float> %res, %res1
  %res5 = fadd <16 x float> %res3, %res2
  %res6 = fadd <16 x float> %res5, %res4
  ret <16 x float> %res6
}