1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s
; Masked gather of <16 x float> through <16 x i32> indices, then a masked
; scatter of the gathered value to %stbuf through the indices plus a constant
; vector. Both intrinsic calls use the same <16 x i1> mask bitcast from %mask;
; the CHECK lines expect it copied into %k2 for the gather while %k1 is kept
; for the scatter.
4 define dso_local void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
5 ; CHECK-LABEL: gather_mask_dps:
7 ; CHECK-NEXT: kmovd %edi, %k1
8 ; CHECK-NEXT: kmovq %k1, %k2
9 ; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
10 ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
11 ; CHECK-NEXT: vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
12 ; CHECK-NEXT: vzeroupper
14 %1 = bitcast i16 %mask to <16 x i1>
15 %x = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
16 %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
17 call void @llvm.x86.avx512.mask.scatter.dps.512(i8* %stbuf, <16 x i1> %1, <16 x i32> %ind2, <16 x float> %x, i32 4)
; Masked gather of <8 x double> through <8 x i32> indices, then a masked
; scatter to %stbuf through the indices plus a constant vector. The same
; <8 x i1> mask (bitcast from %mask) feeds both calls; the CHECK lines expect
; a %k1 -> %k2 copy so each instruction gets its own mask register.
21 define dso_local void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
22 ; CHECK-LABEL: gather_mask_dpd:
24 ; CHECK-NEXT: kmovd %edi, %k1
25 ; CHECK-NEXT: kmovq %k1, %k2
26 ; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
27 ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
28 ; CHECK-NEXT: vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
29 ; CHECK-NEXT: vzeroupper
31 %1 = bitcast i8 %mask to <8 x i1>
32 %x = call <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double> %src, i8* %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
33 %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
34 call void @llvm.x86.avx512.mask.scatter.dpd.512(i8* %stbuf, <8 x i1> %1, <8 x i32> %ind2, <8 x double> %x, i32 4)
; Masked gather of <8 x float> through <8 x i64> indices, then a masked
; scatter to %stbuf through the indices plus a constant vector. The CHECK
; lines expect a ymm data register with zmm indices and the shared mask
; copied from %k1 into %k2 for the gather.
38 define dso_local void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
39 ; CHECK-LABEL: gather_mask_qps:
41 ; CHECK-NEXT: kmovd %edi, %k1
42 ; CHECK-NEXT: kmovq %k1, %k2
43 ; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
44 ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
45 ; CHECK-NEXT: vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
46 ; CHECK-NEXT: vzeroupper
48 %1 = bitcast i8 %mask to <8 x i1>
49 %x = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
50 %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
51 call void @llvm.x86.avx512.mask.scatter.qps.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x float> %x, i32 4)
; Masked gather of <8 x double> through <8 x i64> indices, then a masked
; scatter to %stbuf through the indices plus a constant vector. Both calls
; share one <8 x i1> mask; the CHECK lines expect it duplicated into %k2 for
; the gather and %k1 used by the scatter.
55 define dso_local void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
56 ; CHECK-LABEL: gather_mask_qpd:
58 ; CHECK-NEXT: kmovd %edi, %k1
59 ; CHECK-NEXT: kmovq %k1, %k2
60 ; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
61 ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
62 ; CHECK-NEXT: vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
63 ; CHECK-NEXT: vzeroupper
65 %1 = bitcast i8 %mask to <8 x i1>
66 %x = call <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
67 %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
68 call void @llvm.x86.avx512.mask.scatter.qpd.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x double> %x, i32 4)
72 ;; Integer Gather/Scatter
; Integer variant: masked gather of <16 x i32> through <16 x i32> indices
; (gather.dpi.512), then a masked scatter (scatter.dpi.512) through the
; indices plus a constant vector. The CHECK lines expect vpgatherdd /
; vpscatterdd with the shared mask copied from %k1 into %k2 for the gather.
75 define dso_local void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
76 ; CHECK-LABEL: gather_mask_dd:
78 ; CHECK-NEXT: kmovd %edi, %k1
79 ; CHECK-NEXT: kmovq %k1, %k2
80 ; CHECK-NEXT: vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
81 ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
82 ; CHECK-NEXT: vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
83 ; CHECK-NEXT: vzeroupper
85 %1 = bitcast i16 %mask to <16 x i1>
86 %x = call <16 x i32> @llvm.x86.avx512.mask.gather.dpi.512(<16 x i32> %src, i8* %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
87 %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
88 call void @llvm.x86.avx512.mask.scatter.dpi.512(i8* %stbuf, <16 x i1> %1, <16 x i32> %ind2, <16 x i32> %x, i32 4)
; Integer variant: masked gather of <8 x i32> through <8 x i64> indices
; (gather.qpi.512), then a masked scatter (scatter.qpi.512) through the
; indices plus a constant vector. The CHECK lines expect vpgatherqd /
; vpscatterqd on a ymm data register, with the mask copied into %k2.
92 define dso_local void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
93 ; CHECK-LABEL: gather_mask_qd:
95 ; CHECK-NEXT: kmovd %edi, %k1
96 ; CHECK-NEXT: kmovq %k1, %k2
97 ; CHECK-NEXT: vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
98 ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
99 ; CHECK-NEXT: vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
100 ; CHECK-NEXT: vzeroupper
102 %1 = bitcast i8 %mask to <8 x i1>
103 %x = call <8 x i32> @llvm.x86.avx512.mask.gather.qpi.512(<8 x i32> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
104 %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
105 call void @llvm.x86.avx512.mask.scatter.qpi.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x i32> %x, i32 4)
; Integer variant: masked gather of <8 x i64> through <8 x i64> indices
; (gather.qpq.512), then a masked scatter (scatter.qpq.512) through the
; indices plus a constant vector. The CHECK lines expect vpgatherqq /
; vpscatterqq with the shared mask copied from %k1 into %k2 for the gather.
109 define dso_local void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
110 ; CHECK-LABEL: gather_mask_qq:
112 ; CHECK-NEXT: kmovd %edi, %k1
113 ; CHECK-NEXT: kmovq %k1, %k2
114 ; CHECK-NEXT: vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
115 ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
116 ; CHECK-NEXT: vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
117 ; CHECK-NEXT: vzeroupper
119 %1 = bitcast i8 %mask to <8 x i1>
120 %x = call <8 x i64> @llvm.x86.avx512.mask.gather.qpq.512(<8 x i64> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
121 %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
122 call void @llvm.x86.avx512.mask.scatter.qpq.512(i8* %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x i64> %x, i32 4)
; Integer variant: masked gather of <8 x i64> through <8 x i32> indices
; (gather.dpq.512), then a masked scatter (scatter.dpq.512) through the
; indices plus a constant vector. The CHECK lines expect vpgatherdq /
; vpscatterdq with ymm indices and the mask copied from %k1 into %k2.
126 define dso_local void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
127 ; CHECK-LABEL: gather_mask_dq:
129 ; CHECK-NEXT: kmovd %edi, %k1
130 ; CHECK-NEXT: kmovq %k1, %k2
131 ; CHECK-NEXT: vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
132 ; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
133 ; CHECK-NEXT: vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
134 ; CHECK-NEXT: vzeroupper
136 %1 = bitcast i8 %mask to <8 x i1>
137 %x = call <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512(<8 x i64> %src, i8* %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
138 %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
139 call void @llvm.x86.avx512.mask.scatter.dpq.512(i8* %stbuf, <8 x i1> %1, <8 x i32> %ind2, <8 x i64> %x, i32 4)
; Masked gather of <8 x double> through <8 x i32> indices whose result is
; stored straight to %stbuf. The CHECK lines expect the store to be emitted
; as vmovapd and the mask to be used directly from %k1 (no copy).
143 define dso_local void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
144 ; CHECK-LABEL: gather_mask_dpd_execdomain:
146 ; CHECK-NEXT: kmovd %edi, %k1
147 ; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
148 ; CHECK-NEXT: vmovapd %zmm1, (%rdx)
149 ; CHECK-NEXT: vzeroupper
151 %1 = bitcast i8 %mask to <8 x i1>
152 %x = call <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double> %src, i8* %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
153 store <8 x double> %x, <8 x double>* %stbuf
; Masked gather of <8 x double> through <8 x i64> indices whose result is
; stored straight to %stbuf. The CHECK lines expect the store to be emitted
; as vmovapd and the mask to be used directly from %k1.
157 define dso_local void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
158 ; CHECK-LABEL: gather_mask_qpd_execdomain:
160 ; CHECK-NEXT: kmovd %edi, %k1
161 ; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
162 ; CHECK-NEXT: vmovapd %zmm1, (%rdx)
163 ; CHECK-NEXT: vzeroupper
165 %1 = bitcast i8 %mask to <8 x i1>
166 %x = call <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
167 store <8 x double> %x, <8 x double>* %stbuf
; Masked gather of <16 x float> through <16 x i32> indices whose result is
; returned. The CHECK lines expect the gather into %zmm1 followed by a
; vmovaps register move into %zmm0.
171 define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
172 ; CHECK-LABEL: gather_mask_dps_execdomain:
174 ; CHECK-NEXT: kmovd %edi, %k1
175 ; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
176 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
178 %1 = bitcast i16 %mask to <16 x i1>
179 %res = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
180 ret <16 x float> %res
; Masked gather of <8 x float> through <8 x i64> indices. The CHECK lines
; expect the gather into %ymm1 followed by a vmovaps register move into
; %ymm0.
183 define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
184 ; CHECK-LABEL: gather_mask_qps_execdomain:
186 ; CHECK-NEXT: kmovd %edi, %k1
187 ; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
188 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
190 %1 = bitcast i8 %mask to <8 x i1>
191 %res = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
; Loads <8 x double> from %src (align 64) and scatters it to %stbuf through
; <8 x i32> indices under the mask bitcast from %mask. The CHECK lines expect
; the load to be emitted as vmovapd ahead of vscatterdpd. %base is not
; referenced by the visible body.
195 define dso_local void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
196 ; CHECK-LABEL: scatter_mask_dpd_execdomain:
198 ; CHECK-NEXT: kmovd %esi, %k1
199 ; CHECK-NEXT: vmovapd (%rdi), %zmm1
200 ; CHECK-NEXT: vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
201 ; CHECK-NEXT: vzeroupper
203 %1 = bitcast i8 %mask to <8 x i1>
204 %x = load <8 x double>, <8 x double>* %src, align 64
205 call void @llvm.x86.avx512.mask.scatter.dpd.512(i8* %stbuf, <8 x i1> %1, <8 x i32>%ind, <8 x double> %x, i32 4)
; Loads <8 x double> from %src (align 64) and scatters it to %stbuf through
; <8 x i64> indices under the mask bitcast from %mask. The CHECK lines expect
; the load to be emitted as vmovapd ahead of vscatterqpd. %base is not
; referenced by the visible body.
209 define dso_local void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
210 ; CHECK-LABEL: scatter_mask_qpd_execdomain:
212 ; CHECK-NEXT: kmovd %esi, %k1
213 ; CHECK-NEXT: vmovapd (%rdi), %zmm1
214 ; CHECK-NEXT: vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
215 ; CHECK-NEXT: vzeroupper
217 %1 = bitcast i8 %mask to <8 x i1>
218 %x = load <8 x double>, <8 x double>* %src, align 64
219 call void @llvm.x86.avx512.mask.scatter.qpd.512(i8* %stbuf, <8 x i1> %1, <8 x i64>%ind, <8 x double> %x, i32 4)
; Loads <16 x float> from %src (align 64) and scatters it to %stbuf through
; <16 x i32> indices under the mask bitcast from %mask. The CHECK lines
; expect the load to be emitted as vmovaps ahead of vscatterdps. %base is not
; referenced by the visible body.
223 define dso_local void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) {
224 ; CHECK-LABEL: scatter_mask_dps_execdomain:
226 ; CHECK-NEXT: kmovd %esi, %k1
227 ; CHECK-NEXT: vmovaps (%rdi), %zmm1
228 ; CHECK-NEXT: vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
229 ; CHECK-NEXT: vzeroupper
231 %1 = bitcast i16 %mask to <16 x i1>
232 %x = load <16 x float>, <16 x float>* %src, align 64
233 call void @llvm.x86.avx512.mask.scatter.dps.512(i8* %stbuf, <16 x i1> %1, <16 x i32>%ind, <16 x float> %x, i32 4)
; Loads <8 x float> from %src (align 32) and scatters it to %stbuf through
; <8 x i64> indices under the mask bitcast from %mask. The CHECK lines expect
; a ymm vmovaps load ahead of vscatterqps. %base is not referenced by the
; visible body.
237 define dso_local void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) {
238 ; CHECK-LABEL: scatter_mask_qps_execdomain:
240 ; CHECK-NEXT: kmovd %esi, %k1
241 ; CHECK-NEXT: vmovaps (%rdi), %ymm1
242 ; CHECK-NEXT: vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
243 ; CHECK-NEXT: vzeroupper
245 %1 = bitcast i8 %mask to <8 x i1>
246 %x = load <8 x float>, <8 x float>* %src, align 32
247 call void @llvm.x86.avx512.mask.scatter.qps.512(i8* %stbuf, <8 x i1> %1, <8 x i64>%ind, <8 x float> %x, i32 4)
; Gather/scatter pair like gather_mask_qps, but both intrinsic calls take a
; constant all-true <8 x i1> mask instead of a function argument. The CHECK
; lines expect each mask to be materialised with kxnorw (%k1 and %k2) and the
; gather destination register to be zeroed with vxorps first.
251 define dso_local void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) {
252 ; CHECK-LABEL: gather_qps:
254 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
255 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
256 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
257 ; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
258 ; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
259 ; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
260 ; CHECK-NEXT: vzeroupper
262 %x = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, i8* %base, <8 x i64> %ind, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
263 %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
264 call void @llvm.x86.avx512.mask.scatter.qps.512(i8* %stbuf, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i64> %ind2, <8 x float> %x, i32 4)
268 declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
269 declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
; Four gather/scatter prefetch intrinsic calls with constant i8 masks
; (-1, 0, 1, 120). The CHECK lines expect kxnorw for the -1 mask, kxorw for
; the 0 mask, and movb + kmovd for 1 and 120. The fourth operand (i32 4 or
; i32 2) appears as the address scale in the expected instructions, and the
; last operand (i32 3 vs i32 2) lines up with the pf0 vs pf1 mnemonics.
270 define dso_local void @prefetch(<8 x i64> %ind, i8* %base) {
271 ; CHECK-LABEL: prefetch:
273 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
274 ; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1}
275 ; CHECK-NEXT: kxorw %k0, %k0, %k1
276 ; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1}
277 ; CHECK-NEXT: movb $1, %al
278 ; CHECK-NEXT: kmovd %eax, %k1
279 ; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1}
280 ; CHECK-NEXT: movb $120, %al
281 ; CHECK-NEXT: kmovd %eax, %k1
282 ; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1}
283 ; CHECK-NEXT: vzeroupper
285 call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 3)
286 call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 2)
287 call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, i8* %base, i32 2, i32 3)
288 call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, i8* %base, i32 2, i32 2)
; Two gather3div2.df calls on the same base and indices, summed with fadd:
; the first is masked by elements 0-1 of %x3 (as <8 x i1>) with scale 4, the
; second uses an all-true mask with scale 2. The CHECK lines expect kxnorw
; for the all-true mask and a vxorpd-zeroed destination for the second gather.
292 define <2 x double> @test_int_x86_avx512_mask_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
293 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3div2_df:
295 ; CHECK-NEXT: kmovd %esi, %k1
296 ; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,4), %xmm0 {%k1}
297 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
298 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
299 ; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,2), %xmm2 {%k1}
300 ; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0
302 %1 = bitcast i8 %x3 to <8 x i1>
303 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
304 %res = call <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract, i32 4)
305 %res1 = call <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> <i1 true, i1 true>, i32 2)
306 %res2 = fadd <2 x double> %res, %res1
307 ret <2 x double> %res2
; Two gather3div2.di calls with identical operands (same mask derivation
; from %x3, scale 8), whose results are added. The CHECK lines expect a
; single vpgatherqq followed by vpaddq of the result with itself.
310 define <2 x i64> @test_int_x86_avx512_mask_gather3div2_di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
311 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3div2_di:
313 ; CHECK-NEXT: kmovd %esi, %k1
314 ; CHECK-NEXT: vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
315 ; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm0
317 %1 = bitcast i8 %x3 to <8 x i1>
318 %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
319 %res = call <2 x i64> @llvm.x86.avx512.mask.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract1, i32 8)
320 %2 = bitcast i8 %x3 to <8 x i1>
321 %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
322 %res1 = call <2 x i64> @llvm.x86.avx512.mask.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract, i32 8)
323 %res2 = add <2 x i64> %res, %res1
; Two gather3div4.df calls summed with fadd: the first masked by elements
; 0-3 of %x3 (as <8 x i1>) with scale 4, the second with an all-true mask and
; scale 2. The CHECK lines expect kxnorw for the all-true mask and a
; vxorpd-zeroed destination for the second gather.
327 define <4 x double> @test_int_x86_avx512_mask_gather3div4_df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
328 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_df:
330 ; CHECK-NEXT: kmovd %esi, %k1
331 ; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,4), %ymm0 {%k1}
332 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
333 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
334 ; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,2), %ymm2 {%k1}
335 ; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0
337 %1 = bitcast i8 %x3 to <8 x i1>
338 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
339 %res = call <4 x double> @llvm.x86.avx512.mask.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract, i32 4)
340 %res1 = call <4 x double> @llvm.x86.avx512.mask.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
341 %res2 = fadd <4 x double> %res, %res1
342 ret <4 x double> %res2
; Two gather3div4.di calls, both with scale 8, whose results are added: the
; first masked by elements 0-3 of %x3 (as <8 x i1>), the second with an
; all-true mask. The CHECK lines expect two vpgatherqq instructions, the
; second using a kxnorw mask and a vpxor-zeroed destination.
345 define <4 x i64> @test_int_x86_avx512_mask_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
346 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_di:
348 ; CHECK-NEXT: kmovd %esi, %k1
349 ; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
350 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
351 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
352 ; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
353 ; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0
355 %1 = bitcast i8 %x3 to <8 x i1>
356 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
357 %res = call <4 x i64> @llvm.x86.avx512.mask.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract, i32 8)
358 %res1 = call <4 x i64> @llvm.x86.avx512.mask.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 8)
359 %res2 = add <4 x i64> %res, %res1
; Two gather3div4.sf calls (<4 x float> data, <2 x i64> indices, <2 x i1>
; mask) summed with fadd: the first masked by elements 0-1 of %x3 with scale
; 4, the second with an all-true mask and scale 2. The CHECK lines expect
; kxnorw and a vxorps-zeroed destination for the second gather.
363 define <4 x float> @test_int_x86_avx512_mask_gather3div4_sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
364 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_sf:
366 ; CHECK-NEXT: kmovd %esi, %k1
367 ; CHECK-NEXT: vgatherqps (%rdi,%xmm1,4), %xmm0 {%k1}
368 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
369 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
370 ; CHECK-NEXT: vgatherqps (%rdi,%xmm1,2), %xmm2 {%k1}
371 ; CHECK-NEXT: vaddps %xmm2, %xmm0, %xmm0
373 %1 = bitcast i8 %x3 to <8 x i1>
374 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
375 %res = call <4 x float> @llvm.x86.avx512.mask.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract, i32 4)
376 %res1 = call <4 x float> @llvm.x86.avx512.mask.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> <i1 true, i1 true>, i32 2)
377 %res2 = fadd <4 x float> %res, %res1
378 ret <4 x float> %res2
; Two gather3div4.si calls, both with scale 4, whose results are added: the
; first uses an all-true <2 x i1> mask, the second is masked by elements 0-1
; of %x3 (as <8 x i1>). The CHECK lines expect the all-true gather first
; (kxnorw mask, vpxor-zeroed %xmm2), then the masked gather into %xmm0.
381 define <4 x i32> @test_int_x86_avx512_mask_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
382 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_si:
384 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
385 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
386 ; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k1}
387 ; CHECK-NEXT: kmovd %esi, %k1
388 ; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
389 ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
391 %res = call <4 x i32> @llvm.x86.avx512.mask.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> <i1 true, i1 true>, i32 4)
392 %1 = bitcast i8 %x3 to <8 x i1>
393 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
394 %res1 = call <4 x i32> @llvm.x86.avx512.mask.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, <2 x i1> %extract, i32 4)
395 %res2 = add <4 x i32> %res, %res1
; Two gather3div8.sf calls (<4 x float> data, <4 x i64> indices) summed with
; fadd: the first masked by elements 0-3 of %x3 (as <8 x i1>) with scale 4,
; the second with an all-true mask and scale 2. The CHECK lines expect ymm
; indices with xmm data, and kxnorw + vxorps ahead of the second gather.
399 define <4 x float> @test_int_x86_avx512_mask_gather3div8_sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
400 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3div8_sf:
402 ; CHECK-NEXT: kmovd %esi, %k1
403 ; CHECK-NEXT: vgatherqps (%rdi,%ymm1,4), %xmm0 {%k1}
404 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
405 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
406 ; CHECK-NEXT: vgatherqps (%rdi,%ymm1,2), %xmm2 {%k1}
407 ; CHECK-NEXT: vaddps %xmm2, %xmm0, %xmm0
408 ; CHECK-NEXT: vzeroupper
410 %1 = bitcast i8 %x3 to <8 x i1>
411 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
412 %res = call <4 x float> @llvm.x86.avx512.mask.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract, i32 4)
413 %res1 = call <4 x float> @llvm.x86.avx512.mask.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
414 %res2 = fadd <4 x float> %res, %res1
415 ret <4 x float> %res2
; Two gather3div8.si calls that share the same mask derivation (elements 0-3
; of %x3 as <8 x i1>) but differ in scale (4 then 2); results are added. The
; CHECK lines expect %xmm0 copied to %xmm2 (vmovdqa) and %k1 copied to %k2
; (kmovq) so that each gather has its own destination and mask register.
418 define <4 x i32> @test_int_x86_avx512_mask_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
419 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3div8_si:
421 ; CHECK-NEXT: kmovd %esi, %k1
422 ; CHECK-NEXT: vmovdqa %xmm0, %xmm2
423 ; CHECK-NEXT: kmovq %k1, %k2
424 ; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
425 ; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
426 ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
427 ; CHECK-NEXT: vzeroupper
429 %1 = bitcast i8 %x3 to <8 x i1>
430 %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
431 %res = call <4 x i32> @llvm.x86.avx512.mask.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract1, i32 4)
432 %2 = bitcast i8 %x3 to <8 x i1>
433 %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
434 %res1 = call <4 x i32> @llvm.x86.avx512.mask.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, <4 x i1> %extract, i32 2)
435 %res2 = add <4 x i32> %res, %res1
; Two gather3siv2.df calls (<2 x double> data, <4 x i32> indices) summed with
; fadd: the first masked by elements 0-1 of %x3 (as <8 x i1>) with scale 4,
; the second with an all-true mask and scale 2. The CHECK lines expect
; vgatherdpd twice, the second with a kxnorw mask and vxorpd-zeroed
; destination.
439 define <2 x double> @test_int_x86_avx512_mask_gather3siv2_df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
440 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv2_df:
442 ; CHECK-NEXT: kmovd %esi, %k1
443 ; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %xmm0 {%k1}
444 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
445 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
446 ; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,2), %xmm2 {%k1}
447 ; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0
449 %1 = bitcast i8 %x3 to <8 x i1>
450 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
451 %res = call <2 x double> @llvm.x86.avx512.mask.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, <2 x i1> %extract, i32 4)
452 %res1 = call <2 x double> @llvm.x86.avx512.mask.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, <2 x i1> <i1 true, i1 true>, i32 2)
453 %res2 = fadd <2 x double> %res, %res1
454 ret <2 x double> %res2
; Two gather3siv2.di calls with identical operands (same mask derivation
; from %x3, scale 8), whose results are added. The CHECK lines expect a
; single vpgatherdq followed by vpaddq of the result with itself.
457 define <2 x i64> @test_int_x86_avx512_mask_gather3siv2_di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
458 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv2_di:
460 ; CHECK-NEXT: kmovd %esi, %k1
461 ; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
462 ; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm0
464 %1 = bitcast i8 %x3 to <8 x i1>
465 %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
466 %res = call <2 x i64> @llvm.x86.avx512.mask.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, <2 x i1> %extract1, i32 8)
467 %2 = bitcast i8 %x3 to <8 x i1>
468 %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
469 %res1 = call <2 x i64> @llvm.x86.avx512.mask.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, <2 x i1> %extract, i32 8)
470 %res2 = add <2 x i64> %res, %res1
; Two gather3siv4.df calls (<4 x double> data, <4 x i32> indices) summed with
; fadd: the first masked by elements 0-3 of %x3 (as <8 x i1>) with scale 4,
; the second with an all-true mask and scale 2. The CHECK lines expect xmm
; indices with ymm data, and kxnorw + vxorpd ahead of the second gather.
474 define <4 x double> @test_int_x86_avx512_mask_gather3siv4_df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
475 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_df:
477 ; CHECK-NEXT: kmovd %esi, %k1
478 ; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %ymm0 {%k1}
479 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
480 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
481 ; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,2), %ymm2 {%k1}
482 ; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0
484 %1 = bitcast i8 %x3 to <8 x i1>
485 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
486 %res = call <4 x double> @llvm.x86.avx512.mask.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract, i32 4)
487 %res1 = call <4 x double> @llvm.x86.avx512.mask.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
488 %res2 = fadd <4 x double> %res, %res1
489 ret <4 x double> %res2
; Two gather3siv4.di calls with identical operands (same mask derivation
; from %x3, scale 8), whose results are added. The CHECK lines expect a
; single vpgatherdq followed by vpaddq of the result with itself.
492 define <4 x i64> @test_int_x86_avx512_mask_gather3siv4_di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
493 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_di:
495 ; CHECK-NEXT: kmovd %esi, %k1
496 ; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
497 ; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm0
499 %1 = bitcast i8 %x3 to <8 x i1>
500 %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
501 %res = call <4 x i64> @llvm.x86.avx512.mask.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract1, i32 8)
502 %2 = bitcast i8 %x3 to <8 x i1>
503 %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
504 %res1 = call <4 x i64> @llvm.x86.avx512.mask.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract, i32 8)
505 %res2 = add <4 x i64> %res, %res1
; Two gather3siv4.sf calls (<4 x float> data, <4 x i32> indices) summed with
; fadd: the first masked by elements 0-3 of %x3 (as <8 x i1>) with scale 4,
; the second with an all-true mask and scale 2. The CHECK lines expect
; vgatherdps twice, the second with a kxnorw mask and vxorps-zeroed
; destination.
509 define <4 x float> @test_int_x86_avx512_mask_gather3siv4_sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
510 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_sf:
512 ; CHECK-NEXT: kmovd %esi, %k1
513 ; CHECK-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
514 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
515 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
516 ; CHECK-NEXT: vgatherdps (%rdi,%xmm1,2), %xmm2 {%k1}
517 ; CHECK-NEXT: vaddps %xmm2, %xmm0, %xmm0
519 %1 = bitcast i8 %x3 to <8 x i1>
520 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
521 %res = call <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract, i32 4)
522 %res1 = call <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
523 %res2 = fadd <4 x float> %res, %res1
524 ret <4 x float> %res2
; Two gather3siv4.si calls whose results are added: the first uses an
; all-true <4 x i1> mask with scale 4, the second is masked by elements 0-3
; of %x3 (as <8 x i1>) with scale 2. The CHECK lines expect the all-true
; gather first (kxnorw mask, vpxor-zeroed %xmm2), then the masked gather
; into %xmm0.
527 define <4 x i32> @test_int_x86_avx512_mask_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
528 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_si:
530 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
531 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
532 ; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k1}
533 ; CHECK-NEXT: kmovd %esi, %k1
534 ; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
535 ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
537 %res = call <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
538 %1 = bitcast i8 %x3 to <8 x i1>
539 %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
540 %res1 = call <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, <4 x i1> %extract, i32 2)
541 %res2 = add <4 x i32> %res, %res1
; Two gather3siv8.sf calls (<8 x float> data, <8 x i32> indices) summed with
; fadd: the first masked by the full <8 x i1> bitcast of %x3 with scale 4,
; the second with an all-true mask and scale 2. The CHECK lines expect
; kxnorw and a vxorps-zeroed destination for the second gather.
545 define <8 x float> @test_int_x86_avx512_mask_gather3siv8_sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
546 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv8_sf:
548 ; CHECK-NEXT: kmovd %esi, %k1
549 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
550 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
551 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
552 ; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
553 ; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0
555 %1 = bitcast i8 %x3 to <8 x i1>
556 %res = call <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, <8 x i1> %1, i32 4)
557 %res1 = call <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 2)
558 %res2 = fadd <8 x float> %res, %res1
559 ret <8 x float> %res2
; Two gather3siv8.si calls masked by the same <8 x i1> bitcast of %x3 but
; with different scales (4 then 2); results are added. The CHECK lines
; expect %ymm0 copied to %ymm2 (vmovdqa) and %k1 copied to %k2 (kmovq) so
; that each gather has its own destination and mask register.
562 define <8 x i32> @test_int_x86_avx512_mask_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
563 ; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv8_si:
565 ; CHECK-NEXT: kmovd %esi, %k1
566 ; CHECK-NEXT: vmovdqa %ymm0, %ymm2
567 ; CHECK-NEXT: kmovq %k1, %k2
568 ; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
569 ; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,2), %ymm0 {%k1}
570 ; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
572 %1 = bitcast i8 %x3 to <8 x i1>
573 %res = call <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, <8 x i1> %1, i32 4)
574 %2 = bitcast i8 %x3 to <8 x i1>
575 %res1 = call <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, <8 x i1> %2, i32 2)
576 %res2 = add <8 x i32> %res, %res1
; Two scatterdiv2.df calls of the same data and indices: first with an
; all-true <2 x i1> mask and scale 2, then masked by elements 0-1 of %x1 (as
; <8 x i1>) with scale 4. The CHECK lines expect the argument mask in %k1,
; the all-true mask built with kxnorw in %k2, and the scale-2 scatter first.
580 define dso_local void@test_int_x86_avx512_scatterdiv2_df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) {
581 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df:
583 ; CHECK-NEXT: kmovd %esi, %k1
584 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
585 ; CHECK-NEXT: vscatterqpd %xmm1, (%rdi,%xmm0,2) {%k2}
586 ; CHECK-NEXT: vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1}
588 %1 = bitcast i8 %x1 to <8 x i1>
589 %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
590 call void @llvm.x86.avx512.mask.scatterdiv2.df(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <2 x double> %x3, i32 2)
591 call void @llvm.x86.avx512.mask.scatterdiv2.df(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <2 x double> %x3, i32 4)
; Two scatterdiv2.di calls of the same data and indices: first masked by
; elements 0-1 of %x1 (as <8 x i1>) with scale 2, then with an all-true mask
; and scale 4. The CHECK lines expect %k1 to be reused: loaded from %esi for
; the first scatter, then rebuilt with kxnorw for the second.
595 define dso_local void@test_int_x86_avx512_scatterdiv2_di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) {
596 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di:
598 ; CHECK-NEXT: kmovd %esi, %k1
599 ; CHECK-NEXT: vpscatterqq %xmm1, (%rdi,%xmm0,2) {%k1}
600 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
601 ; CHECK-NEXT: vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1}
603 %1 = bitcast i8 %x1 to <8 x i1>
604 %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
605 call void @llvm.x86.avx512.mask.scatterdiv2.di(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <2 x i64> %x3, i32 2)
606 call void @llvm.x86.avx512.mask.scatterdiv2.di(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <2 x i64> %x3, i32 4)
; Two scatterdiv4.df calls of the same data and indices: first masked by
; elements 0-3 of %x1 (as <8 x i1>) with scale 2, then with an all-true mask
; and scale 4. The CHECK lines expect %k1 loaded from %esi for the first
; vscatterqpd and rebuilt with kxnorw for the second.
610 define dso_local void@test_int_x86_avx512_scatterdiv4_df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) {
611 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df:
613 ; CHECK-NEXT: kmovd %esi, %k1
614 ; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1}
615 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
616 ; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
617 ; CHECK-NEXT: vzeroupper
619 %1 = bitcast i8 %x1 to <8 x i1>
620 %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
621 call void @llvm.x86.avx512.mask.scatterdiv4.df(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x double> %x3, i32 2)
622 call void @llvm.x86.avx512.mask.scatterdiv4.df(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x double> %x3, i32 4)
; Two scatterdiv4.di calls of the same data and indices: first masked by
; elements 0-3 of %x1 (as <8 x i1>) with scale 2, then with an all-true mask
; and scale 4. The CHECK lines expect %k1 loaded from %esi for the first
; vpscatterqq and rebuilt with kxnorw for the second.
626 define dso_local void@test_int_x86_avx512_scatterdiv4_di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) {
627 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di:
629 ; CHECK-NEXT: kmovd %esi, %k1
630 ; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1}
631 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
632 ; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1}
633 ; CHECK-NEXT: vzeroupper
635 %1 = bitcast i8 %x1 to <8 x i1>
636 %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
637 call void @llvm.x86.avx512.mask.scatterdiv4.di(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x i64> %x3, i32 2)
638 call void @llvm.x86.avx512.mask.scatterdiv4.di(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x i64> %x3, i32 4)
; Two scatterdiv4.sf calls (<4 x float> data, <2 x i64> indices, <2 x i1>
; mask): first masked by elements 0-1 of %x1 (as <8 x i1>) with scale 2,
; then with an all-true mask and scale 4. The CHECK lines expect %k1 loaded
; from %esi for the first vscatterqps and rebuilt with kxnorw for the second.
642 define dso_local void@test_int_x86_avx512_scatterdiv4_sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) {
643 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf:
645 ; CHECK-NEXT: kmovd %esi, %k1
646 ; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%xmm0,2) {%k1}
647 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
648 ; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1}
650 %1 = bitcast i8 %x1 to <8 x i1>
651 %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
652 call void @llvm.x86.avx512.mask.scatterdiv4.sf(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <4 x float> %x3, i32 2)
653 call void @llvm.x86.avx512.mask.scatterdiv4.sf(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <4 x float> %x3, i32 4)
; Two scatterdiv4.si calls (<4 x i32> data, <2 x i64> indices, <2 x i1>
; mask): first with an all-true mask and scale 2, then masked by elements
; 0-1 of %x1 (as <8 x i1>) with scale 4. The CHECK lines expect the argument
; mask in %k1, the all-true mask built with kxnorw in %k2, and the scale-2
; scatter first.
657 define dso_local void@test_int_x86_avx512_scatterdiv4_si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) {
658 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si:
660 ; CHECK-NEXT: kmovd %esi, %k1
661 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
662 ; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,2) {%k2}
663 ; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1}
665 %1 = bitcast i8 %x1 to <8 x i1>
666 %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
667 call void @llvm.x86.avx512.mask.scatterdiv4.si(i8* %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <4 x i32> %x3, i32 2)
668 call void @llvm.x86.avx512.mask.scatterdiv4.si(i8* %x0, <2 x i1> %2, <2 x i64> %x2, <4 x i32> %x3, i32 4)
; Two scatterdiv8.sf calls (<4 x float> data, <4 x i64> indices): first
; masked by elements 0-3 of %x1 (as <8 x i1>) with scale 2, then with an
; all-true mask and scale 4. The CHECK lines expect xmm data with ymm
; indices, and %k1 rebuilt with kxnorw between the two scatters.
672 define dso_local void@test_int_x86_avx512_scatterdiv8_sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) {
673 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf:
675 ; CHECK-NEXT: kmovd %esi, %k1
676 ; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1}
677 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
678 ; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1}
679 ; CHECK-NEXT: vzeroupper
681 %1 = bitcast i8 %x1 to <8 x i1>
682 %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
683 call void @llvm.x86.avx512.mask.scatterdiv8.sf(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x float> %x3, i32 2)
684 call void @llvm.x86.avx512.mask.scatterdiv8.sf(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x float> %x3, i32 4)
; Integer counterpart of the scatterdiv8.sf test: two scatterdiv8.si calls
; with <4 x i64> indices and <4 x i32> data, scale 2 under elements 0-3 of
; %x1 (bitcast to <8 x i1>) and scale 4 under an all-true mask. The expected
; asm selects vpscatterqd and reloads %k1 with kxnorw for the second call.
688 define dso_local void@test_int_x86_avx512_scatterdiv8_si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) {
689 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si:
691 ; CHECK-NEXT: kmovd %esi, %k1
692 ; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1}
693 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
694 ; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1}
695 ; CHECK-NEXT: vzeroupper
697 %1 = bitcast i8 %x1 to <8 x i1>
698 %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
699 call void @llvm.x86.avx512.mask.scatterdiv8.si(i8* %x0, <4 x i1> %2, <4 x i64> %x2, <4 x i32> %x3, i32 2)
700 call void @llvm.x86.avx512.mask.scatterdiv8.si(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x i32> %x3, i32 4)
; Calls the scattersiv2.df intrinsic twice with <4 x i32> indices and
; <2 x double> data: scale 2 under an all-true mask first, then scale 4 under
; elements 0-1 of %x1 (bitcast to <8 x i1>). The expected asm prepares both
; masks (%k1 from %esi, %k2 via kxnorw) ahead of the two vscatterdpd.
704 define dso_local void@test_int_x86_avx512_scattersiv2_df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) {
705 ; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df:
707 ; CHECK-NEXT: kmovd %esi, %k1
708 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
709 ; CHECK-NEXT: vscatterdpd %xmm1, (%rdi,%xmm0,2) {%k2}
710 ; CHECK-NEXT: vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1}
712 %1 = bitcast i8 %x1 to <8 x i1>
713 %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
714 call void @llvm.x86.avx512.mask.scattersiv2.df(i8* %x0, <2 x i1> <i1 true, i1 true>, <4 x i32> %x2, <2 x double> %x3, i32 2)
715 call void @llvm.x86.avx512.mask.scattersiv2.df(i8* %x0, <2 x i1> %2, <4 x i32> %x2, <2 x double> %x3, i32 4)
; Integer counterpart of the scattersiv2.df test: two scattersiv2.di calls
; with <4 x i32> indices and <2 x i64> data, scale 2 under an all-true mask
; and scale 4 under elements 0-1 of %x1 (bitcast to <8 x i1>). The expected
; asm selects vpscatterdq with %k2 (kxnorw) then %k1 (from %esi).
719 define dso_local void@test_int_x86_avx512_scattersiv2_di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) {
720 ; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di:
722 ; CHECK-NEXT: kmovd %esi, %k1
723 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
724 ; CHECK-NEXT: vpscatterdq %xmm1, (%rdi,%xmm0,2) {%k2}
725 ; CHECK-NEXT: vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1}
727 %1 = bitcast i8 %x1 to <8 x i1>
728 %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
729 call void @llvm.x86.avx512.mask.scattersiv2.di(i8* %x0, <2 x i1> <i1 true, i1 true>, <4 x i32> %x2, <2 x i64> %x3, i32 2)
730 call void @llvm.x86.avx512.mask.scattersiv2.di(i8* %x0, <2 x i1> %2, <4 x i32> %x2, <2 x i64> %x3, i32 4)
; Calls the scattersiv4.df intrinsic twice with <4 x i32> indices and
; <4 x double> data: scale 2 under elements 0-3 of %x1 (bitcast to <8 x i1>),
; then scale 4 under an all-true mask. The expected asm uses ymm data with an
; xmm index and ends with vzeroupper.
734 define dso_local void@test_int_x86_avx512_scattersiv4_df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) {
735 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df:
737 ; CHECK-NEXT: kmovd %esi, %k1
738 ; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1}
739 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
740 ; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1}
741 ; CHECK-NEXT: vzeroupper
743 %1 = bitcast i8 %x1 to <8 x i1>
744 %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
745 call void @llvm.x86.avx512.mask.scattersiv4.df(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x double> %x3, i32 2)
746 call void @llvm.x86.avx512.mask.scattersiv4.df(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x double> %x3, i32 4)
; Calls the scattersiv4.di intrinsic twice with <4 x i32> indices and
; <4 x i64> data: scale 2 under an all-true mask first, then scale 4 under
; elements 0-3 of %x1 (bitcast to <8 x i1>). The expected asm prepares %k1
; (from %esi) and %k2 (kxnorw) before the two vpscatterdq.
750 define dso_local void@test_int_x86_avx512_scattersiv4_di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) {
751 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di:
753 ; CHECK-NEXT: kmovd %esi, %k1
754 ; CHECK-NEXT: kxnorw %k0, %k0, %k2
755 ; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2}
756 ; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1}
757 ; CHECK-NEXT: vzeroupper
759 %1 = bitcast i8 %x1 to <8 x i1>
760 %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
761 call void @llvm.x86.avx512.mask.scattersiv4.di(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x i64> %x3, i32 2)
762 call void @llvm.x86.avx512.mask.scattersiv4.di(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x i64> %x3, i32 4)
; Calls the scattersiv4.sf intrinsic twice with <4 x i32> indices and
; <4 x float> data: scale 2 under elements 0-3 of %x1 (bitcast to <8 x i1>),
; then scale 4 under an all-true mask rebuilt with kxnorw in the expected asm.
766 define dso_local void@test_int_x86_avx512_scattersiv4_sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) {
767 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf:
769 ; CHECK-NEXT: kmovd %esi, %k1
770 ; CHECK-NEXT: vscatterdps %xmm1, (%rdi,%xmm0,2) {%k1}
771 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
772 ; CHECK-NEXT: vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1}
774 %1 = bitcast i8 %x1 to <8 x i1>
775 %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
776 call void @llvm.x86.avx512.mask.scattersiv4.sf(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x float> %x3, i32 2)
777 call void @llvm.x86.avx512.mask.scattersiv4.sf(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x float> %x3, i32 4)
; Integer counterpart of the scattersiv4.sf test: two scattersiv4.si calls
; with <4 x i32> indices and data, scale 2 under elements 0-3 of %x1 (bitcast
; to <8 x i1>) and scale 4 under an all-true mask. The expected asm selects
; vpscatterdd.
781 define dso_local void@test_int_x86_avx512_scattersiv4_si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) {
782 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si:
784 ; CHECK-NEXT: kmovd %esi, %k1
785 ; CHECK-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,2) {%k1}
786 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
787 ; CHECK-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
789 %1 = bitcast i8 %x1 to <8 x i1>
790 %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
791 call void @llvm.x86.avx512.mask.scattersiv4.si(i8* %x0, <4 x i1> %2, <4 x i32> %x2, <4 x i32> %x3, i32 2)
792 call void @llvm.x86.avx512.mask.scattersiv4.si(i8* %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x i32> %x3, i32 4)
; Calls the scattersiv8.sf intrinsic twice with <8 x i32> indices and
; <8 x float> data. The i8 %x1 is bitcast directly to the <8 x i1> mask (no
; shufflevector needed) for the scale-2 call; the scale-4 call uses an
; all-true mask. The expected asm ends with vzeroupper.
796 define dso_local void@test_int_x86_avx512_scattersiv8_sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) {
797 ; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf:
799 ; CHECK-NEXT: kmovd %esi, %k1
800 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
801 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
802 ; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
803 ; CHECK-NEXT: vzeroupper
805 %1 = bitcast i8 %x1 to <8 x i1>
806 call void @llvm.x86.avx512.mask.scattersiv8.sf(i8* %x0, <8 x i1> %1, <8 x i32> %x2, <8 x float> %x3, i32 2)
807 call void @llvm.x86.avx512.mask.scattersiv8.sf(i8* %x0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %x2, <8 x float> %x3, i32 4)
; Integer counterpart of the scattersiv8.sf test: two scattersiv8.si calls
; with <8 x i32> indices and data, scale 2 under %x1 bitcast to <8 x i1> and
; scale 4 under an all-true mask. The expected asm selects vpscatterdd on ymm
; registers and ends with vzeroupper.
811 define dso_local void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) {
812 ; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si:
814 ; CHECK-NEXT: kmovd %esi, %k1
815 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
816 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
817 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
818 ; CHECK-NEXT: vzeroupper
820 %1 = bitcast i8 %x1 to <8 x i1>
821 call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> %1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
822 call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %x2, <8 x i32> %x3, i32 4)
; Exercises scattersiv8.si with four constant masks, in order: all-true,
; zeroinitializer, the bits of i8 1, and the bits of i8 96. The expected asm
; shows how each constant is materialized into %k1: kxnorw for all-ones,
; kxorw for all-zeros, and movb-immediate plus kmovd for the two literal
; masks. Note that the all-zeros scatter is still emitted rather than removed.
826 define dso_local void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) {
827 ; CHECK-LABEL: scatter_mask_test:
829 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
830 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
831 ; CHECK-NEXT: kxorw %k0, %k0, %k1
832 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
833 ; CHECK-NEXT: movb $1, %al
834 ; CHECK-NEXT: kmovd %eax, %k1
835 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
836 ; CHECK-NEXT: movb $96, %al
837 ; CHECK-NEXT: kmovd %eax, %k1
838 ; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
839 ; CHECK-NEXT: vzeroupper
841 call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %x2, <8 x i32> %x3, i32 2)
842 call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> zeroinitializer, <8 x i32> %x2, <8 x i32> %x3, i32 4)
843 call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> bitcast (<1 x i8> <i8 1> to <8 x i1>), <8 x i32> %x2, <8 x i32> %x3, i32 2)
844 call void @llvm.x86.avx512.mask.scattersiv8.si(i8* %x0, <8 x i1> bitcast (<1 x i8> <i8 96> to <8 x i1>), <8 x i32> %x2, <8 x i32> %x3, i32 4)
; Exercises gather.dps.512 with four constant masks, in order: all-true,
; zeroinitializer, the bits of i16 1, and the bits of i16 220. All four
; gathers share %src, %base and %ind, and their results are summed with fadd
; so that none of them is dead.
; In the expected asm the all-true gather writes into a register zeroed with
; vxorps, while the other gathers write into %zmm1 or a vmovaps copy of it
; (%zmm1 presumably holds %src under the calling convention - confirm).
848 define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %base) {
849 ; CHECK-LABEL: gather_mask_test:
851 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
852 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
853 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
854 ; CHECK-NEXT: kxorw %k0, %k0, %k1
855 ; CHECK-NEXT: vmovaps %zmm1, %zmm3
856 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
857 ; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2
858 ; CHECK-NEXT: movw $1, %ax
859 ; CHECK-NEXT: kmovd %eax, %k1
860 ; CHECK-NEXT: vmovaps %zmm1, %zmm3
861 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
862 ; CHECK-NEXT: movw $220, %ax
863 ; CHECK-NEXT: kmovd %eax, %k1
864 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
865 ; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0
866 ; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
868 %res = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
869 %res1 = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> zeroinitializer, i32 4)
870 %res2 = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> bitcast (<1 x i16> <i16 1> to <16 x i1>), i32 4)
871 %res3 = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, i8* %base, <16 x i32> %ind, <16 x i1> bitcast (<1 x i16> <i16 220> to <16 x i1>), i32 4)
872 %res4 = fadd <16 x float> %res, %res1
873 %res5 = fadd <16 x float> %res3, %res2
874 %res6 = fadd <16 x float> %res5, %res4
875 ret <16 x float> %res6
; Global array used as the gather base address by gather_global below.
878 @x = dso_local global [1024 x float] zeroinitializer, align 16
; Gathers 8 floats via gather.qps.512 using the address of @x as the base,
; a zeroinitializer pass-through vector, an all-true mask and scale 4. The
; expected asm folds the symbol into the memory operand as a displacement
; (x(,%zmm0,4)) with no base register. The second (i32*) parameter is unnamed
; and is not referenced by the visible body.
880 define <8 x float> @gather_global(<8 x i64>, i32* nocapture readnone) {
881 ; CHECK-LABEL: gather_global:
883 ; CHECK-NEXT: kxnorw %k0, %k0, %k1
884 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
885 ; CHECK-NEXT: vgatherqps x(,%zmm0,4), %ymm1 {%k1}
886 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
888 %3 = tail call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> zeroinitializer, i8* bitcast ([1024 x float]* @x to i8*), <8 x i64> %0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
; Declarations of the masked gather intrinsics used by the tests in this file.
; Operand order: pass-through/source vector, base pointer, index vector,
; <N x i1> mask, i32 scale.
892 declare <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float>, i8*, <16 x i32>, <16 x i1>, i32)
893 declare <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double>, i8*, <8 x i32>, <8 x i1>, i32)
894 declare <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float>, i8*, <8 x i64>, <8 x i1>, i32)
895 declare <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double>, i8*, <8 x i64>, <8 x i1>, i32)
896 declare <16 x i32> @llvm.x86.avx512.mask.gather.dpi.512(<16 x i32>, i8*, <16 x i32>, <16 x i1>, i32)
897 declare <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512(<8 x i64>, i8*, <8 x i32>, <8 x i1>, i32)
898 declare <8 x i32> @llvm.x86.avx512.mask.gather.qpi.512(<8 x i32>, i8*, <8 x i64>, <8 x i1>, i32)
899 declare <8 x i64> @llvm.x86.avx512.mask.gather.qpq.512(<8 x i64>, i8*, <8 x i64>, <8 x i1>, i32)
900 declare <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double>, i8*, <2 x i64>, <2 x i1>, i32)
901 declare <2 x i64> @llvm.x86.avx512.mask.gather3div2.di(<2 x i64>, i8*, <2 x i64>, <2 x i1>, i32)
902 declare <4 x double> @llvm.x86.avx512.mask.gather3div4.df(<4 x double>, i8*, <4 x i64>, <4 x i1>, i32)
903 declare <4 x i64> @llvm.x86.avx512.mask.gather3div4.di(<4 x i64>, i8*, <4 x i64>, <4 x i1>, i32)
904 declare <4 x float> @llvm.x86.avx512.mask.gather3div4.sf(<4 x float>, i8*, <2 x i64>, <2 x i1>, i32)
905 declare <4 x i32> @llvm.x86.avx512.mask.gather3div4.si(<4 x i32>, i8*, <2 x i64>, <2 x i1>, i32)
906 declare <4 x float> @llvm.x86.avx512.mask.gather3div8.sf(<4 x float>, i8*, <4 x i64>, <4 x i1>, i32)
907 declare <4 x i32> @llvm.x86.avx512.mask.gather3div8.si(<4 x i32>, i8*, <4 x i64>, <4 x i1>, i32)
908 declare <2 x double> @llvm.x86.avx512.mask.gather3siv2.df(<2 x double>, i8*, <4 x i32>, <2 x i1>, i32)
909 declare <2 x i64> @llvm.x86.avx512.mask.gather3siv2.di(<2 x i64>, i8*, <4 x i32>, <2 x i1>, i32)
910 declare <4 x double> @llvm.x86.avx512.mask.gather3siv4.df(<4 x double>, i8*, <4 x i32>, <4 x i1>, i32)
911 declare <4 x i64> @llvm.x86.avx512.mask.gather3siv4.di(<4 x i64>, i8*, <4 x i32>, <4 x i1>, i32)
912 declare <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float>, i8*, <4 x i32>, <4 x i1>, i32)
913 declare <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, <4 x i1>, i32)
914 declare <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float>, i8*, <8 x i32>, <8 x i1>, i32)
915 declare <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, <8 x i1>, i32)
; Declarations of the masked scatter intrinsics used by the tests in this file.
; Operand order: base pointer, <N x i1> mask, index vector, data vector,
; i32 scale.
916 declare void @llvm.x86.avx512.mask.scatter.dps.512(i8*, <16 x i1>, <16 x i32>, <16 x float>, i32)
917 declare void @llvm.x86.avx512.mask.scatter.dpd.512(i8*, <8 x i1>, <8 x i32>, <8 x double>, i32)
918 declare void @llvm.x86.avx512.mask.scatter.qps.512(i8*, <8 x i1>, <8 x i64>, <8 x float>, i32)
919 declare void @llvm.x86.avx512.mask.scatter.qpd.512(i8*, <8 x i1>, <8 x i64>, <8 x double>, i32)
920 declare void @llvm.x86.avx512.mask.scatter.dpi.512(i8*, <16 x i1>, <16 x i32>, <16 x i32>, i32)
921 declare void @llvm.x86.avx512.mask.scatter.dpq.512(i8*, <8 x i1>, <8 x i32>, <8 x i64>, i32)
922 declare void @llvm.x86.avx512.mask.scatter.qpi.512(i8*, <8 x i1>, <8 x i64>, <8 x i32>, i32)
923 declare void @llvm.x86.avx512.mask.scatter.qpq.512(i8*, <8 x i1>, <8 x i64>, <8 x i64>, i32)
924 declare void @llvm.x86.avx512.mask.scatterdiv2.df(i8*, <2 x i1>, <2 x i64>, <2 x double>, i32)
925 declare void @llvm.x86.avx512.mask.scatterdiv2.di(i8*, <2 x i1>, <2 x i64>, <2 x i64>, i32)
926 declare void @llvm.x86.avx512.mask.scatterdiv4.df(i8*, <4 x i1>, <4 x i64>, <4 x double>, i32)
927 declare void @llvm.x86.avx512.mask.scatterdiv4.di(i8*, <4 x i1>, <4 x i64>, <4 x i64>, i32)
928 declare void @llvm.x86.avx512.mask.scatterdiv4.sf(i8*, <2 x i1>, <2 x i64>, <4 x float>, i32)
929 declare void @llvm.x86.avx512.mask.scatterdiv4.si(i8*, <2 x i1>, <2 x i64>, <4 x i32>, i32)
930 declare void @llvm.x86.avx512.mask.scatterdiv8.sf(i8*, <4 x i1>, <4 x i64>, <4 x float>, i32)
931 declare void @llvm.x86.avx512.mask.scatterdiv8.si(i8*, <4 x i1>, <4 x i64>, <4 x i32>, i32)
932 declare void @llvm.x86.avx512.mask.scattersiv2.df(i8*, <2 x i1>, <4 x i32>, <2 x double>, i32)
933 declare void @llvm.x86.avx512.mask.scattersiv2.di(i8*, <2 x i1>, <4 x i32>, <2 x i64>, i32)
934 declare void @llvm.x86.avx512.mask.scattersiv4.df(i8*, <4 x i1>, <4 x i32>, <4 x double>, i32)
935 declare void @llvm.x86.avx512.mask.scattersiv4.di(i8*, <4 x i1>, <4 x i32>, <4 x i64>, i32)
936 declare void @llvm.x86.avx512.mask.scattersiv4.sf(i8*, <4 x i1>, <4 x i32>, <4 x float>, i32)
937 declare void @llvm.x86.avx512.mask.scattersiv4.si(i8*, <4 x i1>, <4 x i32>, <4 x i32>, i32)
938 declare void @llvm.x86.avx512.mask.scattersiv8.sf(i8*, <8 x i1>, <8 x i32>, <8 x float>, i32)
939 declare void @llvm.x86.avx512.mask.scattersiv8.si(i8*, <8 x i1>, <8 x i32>, <8 x i32>, i32)