1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_64
3 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32
4 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_SMALL
5 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq -code-model=large < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_LARGE
6 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32
7 ; RUN: opt -mtriple=x86_64-apple-darwin -scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
8 ; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null
10 @glob_array = internal unnamed_addr constant [16 x i32] [i32 1, i32 1, i32 2, i32 3, i32 5, i32 8, i32 13, i32 21, i32 34, i32 55, i32 89, i32 144, i32 233, i32 377, i32 610, i32 987], align 16
13 ; SCALAR: extractelement <16 x float*>
14 ; SCALAR-NEXT: load float
15 ; SCALAR-NEXT: insertelement <16 x float>
16 ; SCALAR-NEXT: extractelement <16 x float*>
17 ; SCALAR-NEXT: load float
19 define <16 x float> @test1(float* %base, <16 x i32> %ind) {
20 ; KNL_64-LABEL: test1:
22 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
23 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
24 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
27 ; KNL_32-LABEL: test1:
29 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
30 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
31 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
32 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
37 ; SKX-NEXT: kxnorw %k0, %k0, %k1
38 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
39 ; SKX-NEXT: vmovaps %zmm1, %zmm0
42 ; SKX_32-LABEL: test1:
44 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
45 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
46 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
47 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
50 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
51 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
53 %sext_ind = sext <16 x i32> %ind to <16 x i64>
54 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
56 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
60 declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
61 declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
62 declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
66 ; SCALAR: extractelement <16 x float*>
67 ; SCALAR-NEXT: load float
68 ; SCALAR-NEXT: insertelement <16 x float>
69 ; SCALAR-NEXT: br label %else
71 ; SCALAR-NEXT: %res.phi.else = phi
72 ; SCALAR-NEXT: and i16 %{{.*}}, 2
73 ; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0
74 ; SCALAR-NEXT: br i1 %{{.*}}, label %cond.load1, label %else2
76 define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
77 ; KNL_64-LABEL: test2:
79 ; KNL_64-NEXT: kmovw %esi, %k1
80 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
81 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
84 ; KNL_32-LABEL: test2:
86 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
87 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
88 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
89 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
94 ; SKX-NEXT: kmovw %esi, %k1
95 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
96 ; SKX-NEXT: vmovaps %zmm1, %zmm0
99 ; SKX_32-LABEL: test2:
101 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
102 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
103 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
104 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
107 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
108 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
110 %sext_ind = sext <16 x i32> %ind to <16 x i64>
111 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
112 %imask = bitcast i16 %mask to <16 x i1>
113 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
114 ret <16 x float> %res
117 define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
118 ; KNL_64-LABEL: test3:
120 ; KNL_64-NEXT: kmovw %esi, %k1
121 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
122 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0
125 ; KNL_32-LABEL: test3:
127 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
128 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
129 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
130 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
135 ; SKX-NEXT: kmovw %esi, %k1
136 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
137 ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
140 ; SKX_32-LABEL: test3:
142 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
143 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
144 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
145 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
148 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
149 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
151 %sext_ind = sext <16 x i32> %ind to <16 x i64>
152 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind
153 %imask = bitcast i16 %mask to <16 x i1>
154 %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
159 define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
160 ; KNL_64-LABEL: test4:
162 ; KNL_64-NEXT: kmovw %esi, %k1
163 ; KNL_64-NEXT: kmovw %k1, %k2
164 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
165 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
166 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
167 ; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0
170 ; KNL_32-LABEL: test4:
172 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
173 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
174 ; KNL_32-NEXT: kmovw %k1, %k2
175 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
176 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
177 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
178 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
183 ; SKX-NEXT: kmovw %esi, %k1
184 ; SKX-NEXT: kmovw %k1, %k2
185 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
186 ; SKX-NEXT: vmovdqa64 %zmm1, %zmm2
187 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
188 ; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0
191 ; SKX_32-LABEL: test4:
193 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
194 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
195 ; SKX_32-NEXT: kmovw %k1, %k2
196 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
197 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
198 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
199 ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
202 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
203 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
205 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
206 %imask = bitcast i16 %mask to <16 x i1>
207 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
208 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
209 %res = add <16 x i32> %gt1, %gt2
214 ; SCALAR-LABEL: test5
215 ; SCALAR: and i16 %scalar_mask, 1
216 ; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0
217 ; SCALAR-NEXT: br i1 %{{.*}}, label %cond.store, label %else
218 ; SCALAR: cond.store:
219 ; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> %val, i64 0
220 ; SCALAR-NEXT: %Ptr0 = extractelement <16 x i32*> %gep.random, i64 0
221 ; SCALAR-NEXT: store i32 %Elt0, i32* %Ptr0, align 4
222 ; SCALAR-NEXT: br label %else
224 ; SCALAR-NEXT: and i16 %scalar_mask, 2
225 ; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0
226 ; SCALAR-NEXT: br i1 %{{.*}}, label %cond.store1, label %else2
228 define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
229 ; KNL_64-LABEL: test5:
231 ; KNL_64-NEXT: kmovw %esi, %k1
232 ; KNL_64-NEXT: kmovw %k1, %k2
233 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
234 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
235 ; KNL_64-NEXT: vzeroupper
238 ; KNL_32-LABEL: test5:
240 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
241 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
242 ; KNL_32-NEXT: kmovw %k1, %k2
243 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
244 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
245 ; KNL_32-NEXT: vzeroupper
250 ; SKX-NEXT: kmovw %esi, %k1
251 ; SKX-NEXT: kmovw %k1, %k2
252 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
253 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
254 ; SKX-NEXT: vzeroupper
257 ; SKX_32-LABEL: test5:
259 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
260 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
261 ; SKX_32-NEXT: kmovw %k1, %k2
262 ; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
263 ; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
264 ; SKX_32-NEXT: vzeroupper
267 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
268 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
270 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
271 %imask = bitcast i16 %mask to <16 x i1>
272 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
273 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
277 declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
278 declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )
281 ; SCALAR-LABEL: test6
282 ; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4
283 ; SCALAR-NEXT: %Elt1 = extractelement <8 x i32> %a1, i64 1
284 ; SCALAR-NEXT: %Ptr12 = extractelement <8 x i32*> %ptr, i64 1
285 ; SCALAR-NEXT: store i32 %Elt1, i32* %Ptr12, align 4
286 ; SCALAR-NEXT: %Elt2 = extractelement <8 x i32> %a1, i64 2
287 ; SCALAR-NEXT: %Ptr23 = extractelement <8 x i32*> %ptr, i64 2
288 ; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4
290 define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
291 ; KNL_64-LABEL: test6:
293 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
294 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
295 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
296 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
297 ; KNL_64-NEXT: vmovdqa %ymm2, %ymm0
300 ; KNL_32-LABEL: test6:
302 ; KNL_32-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
303 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
304 ; KNL_32-NEXT: movw $255, %ax
305 ; KNL_32-NEXT: kmovw %eax, %k1
306 ; KNL_32-NEXT: kmovw %k1, %k2
307 ; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm2 {%k2}
308 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
309 ; KNL_32-NEXT: vmovdqa %ymm2, %ymm0
314 ; SKX-NEXT: kxnorw %k0, %k0, %k1
315 ; SKX-NEXT: kxnorw %k0, %k0, %k2
316 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
317 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
318 ; SKX-NEXT: vmovdqa %ymm2, %ymm0
321 ; SKX_32-LABEL: test6:
323 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
324 ; SKX_32-NEXT: kxnorw %k0, %k0, %k2
325 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2}
326 ; SKX_32-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1}
327 ; SKX_32-NEXT: vmovdqa %ymm2, %ymm0
330 %a = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
332 call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
336 define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
338 ; KNL_64-LABEL: test7:
340 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
341 ; KNL_64-NEXT: kmovw %esi, %k0
342 ; KNL_64-NEXT: kshiftlw $8, %k0, %k0
343 ; KNL_64-NEXT: kshiftrw $8, %k0, %k1
344 ; KNL_64-NEXT: kmovw %k1, %k2
345 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
346 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
347 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
348 ; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0
351 ; KNL_32-LABEL: test7:
353 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
354 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
355 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
356 ; KNL_32-NEXT: kmovw %ecx, %k0
357 ; KNL_32-NEXT: kshiftlw $8, %k0, %k0
358 ; KNL_32-NEXT: kshiftrw $8, %k0, %k1
359 ; KNL_32-NEXT: kmovw %k1, %k2
360 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
361 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
362 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
363 ; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
368 ; SKX-NEXT: kmovw %esi, %k1
369 ; SKX-NEXT: kmovw %k1, %k2
370 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
371 ; SKX-NEXT: vmovdqa %ymm1, %ymm2
372 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
373 ; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0
376 ; SKX_32-LABEL: test7:
378 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
379 ; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
380 ; SKX_32-NEXT: kmovw %k1, %k2
381 ; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm1 {%k2}
382 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm2
383 ; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm2 {%k1}
384 ; SKX_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
387 %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
388 %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer
390 %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind
391 %imask = bitcast i8 %mask to <8 x i1>
392 %gt1 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
393 %gt2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
394 %res = add <8 x i32> %gt1, %gt2
398 ; No uniform base in this case, index <8 x i64> contains addresses,
399 ; each gather call will be split into two
400 define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
401 ; KNL_64-LABEL: test8:
403 ; KNL_64-NEXT: kmovw %edi, %k1
404 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
405 ; KNL_64-NEXT: kmovw %k2, %k3
406 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
407 ; KNL_64-NEXT: kmovw %k1, %k3
408 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
409 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
410 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
411 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
412 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
413 ; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0
416 ; KNL_32-LABEL: test8:
418 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
419 ; KNL_32-NEXT: kmovw %k1, %k2
420 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
421 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
422 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
423 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
428 ; SKX-NEXT: kmovw %edi, %k1
429 ; SKX-NEXT: kshiftrw $8, %k1, %k2
430 ; SKX-NEXT: kmovw %k2, %k3
431 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
432 ; SKX-NEXT: kmovw %k1, %k3
433 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
434 ; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
435 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
436 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
437 ; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
438 ; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
441 ; SKX_32-LABEL: test8:
443 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
444 ; SKX_32-NEXT: kmovw %k1, %k2
445 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
446 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
447 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
448 ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
451 %imask = bitcast i16 %mask to <16 x i1>
452 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
453 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
454 %res = add <16 x i32> %gt1, %gt2
458 %struct.RT = type { i8, [10 x [20 x i32]], i8 }
459 %struct.ST = type { i32, double, %struct.RT }
461 ; Masked gather for aggregate types
462 ; Test9 and Test10 should give the same result (scalar and vector indices in GEP)
465 define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
466 ; KNL_64-LABEL: test9:
467 ; KNL_64: # %bb.0: # %entry
468 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
469 ; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
470 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
471 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
472 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
473 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
474 ; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
475 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
476 ; KNL_64-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
477 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
478 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
479 ; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
480 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
481 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
484 ; KNL_32-LABEL: test9:
485 ; KNL_32: # %bb.0: # %entry
486 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
487 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
488 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
489 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
490 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
491 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
492 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
493 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
494 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
495 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1
496 ; KNL_32-NEXT: movw $255, %ax
497 ; KNL_32-NEXT: kmovw %eax, %k1
498 ; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm0 {%k1}
499 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
502 ; SKX_SMALL-LABEL: test9:
503 ; SKX_SMALL: # %bb.0: # %entry
504 ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
505 ; SKX_SMALL-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
506 ; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
507 ; SKX_SMALL-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
508 ; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
509 ; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
510 ; SKX_SMALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
511 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
512 ; SKX_SMALL-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
513 ; SKX_SMALL-NEXT: retq
515 ; SKX_LARGE-LABEL: test9:
516 ; SKX_LARGE: # %bb.0: # %entry
517 ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2
518 ; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
519 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
520 ; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1
521 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
522 ; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
523 ; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
524 ; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
525 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
526 ; SKX_LARGE-NEXT: vpaddq (%rax){1to8}, %zmm0, %zmm1
527 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
528 ; SKX_LARGE-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
529 ; SKX_LARGE-NEXT: retq
531 ; SKX_32-LABEL: test9:
532 ; SKX_32: # %bb.0: # %entry
533 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1
534 ; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
535 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
536 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
537 ; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
538 ; SKX_32-NEXT: vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1
539 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
540 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1}
543 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
544 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
546 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
547 %res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
551 define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
552 ; KNL_64-LABEL: test10:
553 ; KNL_64: # %bb.0: # %entry
554 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
555 ; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
556 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
557 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
558 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
559 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
560 ; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
561 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
562 ; KNL_64-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
563 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
564 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
565 ; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
566 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
567 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
570 ; KNL_32-LABEL: test10:
571 ; KNL_32: # %bb.0: # %entry
572 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
573 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
574 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
575 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
576 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
577 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
578 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
579 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
580 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
581 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1
582 ; KNL_32-NEXT: movw $255, %ax
583 ; KNL_32-NEXT: kmovw %eax, %k1
584 ; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm0 {%k1}
585 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
588 ; SKX_SMALL-LABEL: test10:
589 ; SKX_SMALL: # %bb.0: # %entry
590 ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
591 ; SKX_SMALL-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
592 ; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
593 ; SKX_SMALL-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
594 ; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
595 ; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
596 ; SKX_SMALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
597 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
598 ; SKX_SMALL-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
599 ; SKX_SMALL-NEXT: retq
601 ; SKX_LARGE-LABEL: test10:
602 ; SKX_LARGE: # %bb.0: # %entry
603 ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2
604 ; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
605 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
606 ; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1
607 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
608 ; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
609 ; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
610 ; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
611 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
612 ; SKX_LARGE-NEXT: vpaddq (%rax){1to8}, %zmm0, %zmm1
613 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
614 ; SKX_LARGE-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
615 ; SKX_LARGE-NEXT: retq
617 ; SKX_32-LABEL: test10:
618 ; SKX_32: # %bb.0: # %entry
619 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1
620 ; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
621 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
622 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
623 ; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
624 ; SKX_32-NEXT: vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1
625 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
626 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1}
629 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
630 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
632 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
633 %res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
637 ; Splat index in GEP, requires broadcast
638 define <16 x float> @test11(float* %base, i32 %ind) {
639 ; KNL_64-LABEL: test11:
641 ; KNL_64-NEXT: vpbroadcastd %esi, %zmm1
642 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
643 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
646 ; KNL_32-LABEL: test11:
648 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
649 ; KNL_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
650 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
651 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
656 ; SKX-NEXT: vpbroadcastd %esi, %zmm1
657 ; SKX-NEXT: kxnorw %k0, %k0, %k1
658 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
661 ; SKX_32-LABEL: test11:
663 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
664 ; SKX_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
665 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
666 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
669 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
670 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
672 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
674 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
678 ; We are checking the uniform base here. It is taken directly from input to vgatherdps
679 define <16 x float> @test12(float* %base, <16 x i32> %ind) {
680 ; KNL_64-LABEL: test12:
682 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
683 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
684 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
687 ; KNL_32-LABEL: test12:
689 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
690 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
691 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
692 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
697 ; SKX-NEXT: kxnorw %k0, %k0, %k1
698 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
699 ; SKX-NEXT: vmovaps %zmm1, %zmm0
702 ; SKX_32-LABEL: test12:
704 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
705 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
706 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
707 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
710 %sext_ind = sext <16 x i32> %ind to <16 x i64>
711 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
713 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
717 ; The same as the previous, but the mask is undefined
718 define <16 x float> @test13(float* %base, <16 x i32> %ind) {
719 ; KNL_64-LABEL: test13:
721 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
722 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
723 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
726 ; KNL_32-LABEL: test13:
728 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
729 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
730 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
731 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
736 ; SKX-NEXT: kxnorw %k0, %k0, %k1
737 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
738 ; SKX-NEXT: vmovaps %zmm1, %zmm0
741 ; SKX_32-LABEL: test13:
743 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
744 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
745 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
746 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
749 %sext_ind = sext <16 x i32> %ind to <16 x i64>
750 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
752 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
756 ; The base pointer is not splat, can't find uniform base
757 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
758 ; KNL_64-LABEL: test14:
760 ; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0
761 ; KNL_64-NEXT: vmovd %esi, %xmm1
762 ; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1
763 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
764 ; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1
765 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
766 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
767 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
768 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
771 ; KNL_32-LABEL: test14:
773 ; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0
774 ; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
775 ; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
776 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
777 ; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
782 ; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
783 ; SKX-NEXT: vpbroadcastd %esi, %ymm1
784 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
785 ; SKX-NEXT: vpsllq $2, %zmm1, %zmm1
786 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
787 ; SKX-NEXT: kxnorw %k0, %k0, %k1
788 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
789 ; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
792 ; SKX_32-LABEL: test14:
794 ; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0
795 ; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
796 ; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
797 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
798 ; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
801 %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
802 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
804 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
806 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
810 declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
811 declare <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
812 declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
; test15 - masked v4f32 gather, i32 indices sign-extended to i64 for the GEP.
; KNL checks expect the gather widened to a zmm vgatherdps with the 4-bit mask
; moved into a k-reg (vpslld $31 + vptestmd) and trimmed by the kshift pair;
; SKX checks expect vpmovd2m and the native xmm-width vgatherdps instead.
814 ; Gather smaller than existing instruction
815 define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
816 ; KNL_64-LABEL: test15:
818 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
819 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
820 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
821 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
822 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
823 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
824 ; KNL_64-NEXT: vmovaps %xmm1, %xmm0
825 ; KNL_64-NEXT: vzeroupper
828 ; KNL_32-LABEL: test15:
830 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
831 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
832 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
833 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
834 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
835 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
836 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
837 ; KNL_32-NEXT: vmovaps %xmm1, %xmm0
838 ; KNL_32-NEXT: vzeroupper
843 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
844 ; SKX-NEXT: vpmovd2m %xmm1, %k1
845 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
846 ; SKX-NEXT: vmovaps %xmm1, %xmm0
849 ; SKX_32-LABEL: test15:
851 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
852 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
853 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
854 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
855 ; SKX_32-NEXT: vmovaps %xmm1, %xmm0
858 %sext_ind = sext <4 x i32> %ind to <4 x i64>
859 %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
860 %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
; test16 - masked v4f64 gather with an explicit passthru vector (%src0),
; which the checks show arriving in ymm2 and being returned through ymm0.
; KNL widens to a zmm vgatherdpd with a kshift-trimmed mask; SKX uses the
; ymm-width gather directly.
864 ; Gather smaller than existing instruction
865 define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
866 ; KNL_64-LABEL: test16:
868 ; KNL_64-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
869 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
870 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
871 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
872 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
873 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
874 ; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1}
875 ; KNL_64-NEXT: vmovapd %ymm2, %ymm0
878 ; KNL_32-LABEL: test16:
880 ; KNL_32-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
881 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
882 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
883 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
884 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
885 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
886 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
887 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1}
888 ; KNL_32-NEXT: vmovapd %ymm2, %ymm0
893 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
894 ; SKX-NEXT: vpmovd2m %xmm1, %k1
895 ; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
896 ; SKX-NEXT: vmovapd %ymm2, %ymm0
899 ; SKX_32-LABEL: test16:
901 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
902 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
903 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
904 ; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1}
905 ; SKX_32-NEXT: vmovapd %ymm2, %ymm0
908 %sext_ind = sext <4 x i32> %ind to <4 x i64>
909 %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
910 %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
; test17 - masked v2f64 gather with passthru. The 2-element i1 mask lives in
; a v2i64 lane pattern, so the checks use vpsllq $63 + vptestmq (KNL) or
; vpmovq2m (SKX) to form the k-mask; KNL trims it with kshift by 14 (16-2).
914 define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
915 ; KNL_64-LABEL: test17:
917 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
918 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
919 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
920 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
921 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
922 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
923 ; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1}
924 ; KNL_64-NEXT: vmovapd %xmm2, %xmm0
925 ; KNL_64-NEXT: vzeroupper
928 ; KNL_32-LABEL: test17:
930 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
931 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
932 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
933 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
934 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
935 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
936 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
937 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1}
938 ; KNL_32-NEXT: vmovapd %xmm2, %xmm0
939 ; KNL_32-NEXT: vzeroupper
944 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
945 ; SKX-NEXT: vpmovq2m %xmm1, %k1
946 ; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %xmm2 {%k1}
947 ; SKX-NEXT: vmovapd %xmm2, %xmm0
950 ; SKX_32-LABEL: test17:
952 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
953 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
954 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
955 ; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %xmm2 {%k1}
956 ; SKX_32-NEXT: vmovapd %xmm2, %xmm0
959 %sext_ind = sext <2 x i32> %ind to <2 x i64>
960 %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
961 %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
965 declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
966 declare void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
967 declare void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
968 declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
969 declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
; test18 - masked v4i32 scatter through a vector of pointers. On 64-bit
; targets the pointers are 64-bit (vpscatterqd with a q-index); on 32-bit
; targets they are 32-bit (vpscatterdd with a d-index).
971 define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
972 ; KNL_64-LABEL: test18:
974 ; KNL_64-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
975 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
976 ; KNL_64-NEXT: vpslld $31, %xmm2, %xmm2
977 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k0
978 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
979 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
980 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
981 ; KNL_64-NEXT: vzeroupper
984 ; KNL_32-LABEL: test18:
986 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
987 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
988 ; KNL_32-NEXT: vpslld $31, %xmm2, %xmm2
989 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k0
990 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
991 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
992 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
993 ; KNL_32-NEXT: vzeroupper
998 ; SKX-NEXT: vpslld $31, %xmm2, %xmm2
999 ; SKX-NEXT: vpmovd2m %xmm2, %k1
1000 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
1001 ; SKX-NEXT: vzeroupper
1004 ; SKX_32-LABEL: test18:
1006 ; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
1007 ; SKX_32-NEXT: vpmovd2m %xmm2, %k1
1008 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
1010 call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
; test19 - masked v4f64 scatter addressed as base + 64-bit index vector,
; so the checks expect vscatterqpd with a (base,index,8) addressing mode
; rather than a raw pointer-vector scatter.
1014 define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
1015 ; KNL_64-LABEL: test19:
1017 ; KNL_64-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
1018 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1019 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
1020 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
1021 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
1022 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
1023 ; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
1024 ; KNL_64-NEXT: vzeroupper
1027 ; KNL_32-LABEL: test19:
1029 ; KNL_32-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
1030 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1031 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
1032 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
1033 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
1034 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
1035 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1036 ; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
1037 ; KNL_32-NEXT: vzeroupper
1040 ; SKX-LABEL: test19:
1042 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
1043 ; SKX-NEXT: vpmovd2m %xmm1, %k1
1044 ; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
1045 ; SKX-NEXT: vzeroupper
1048 ; SKX_32-LABEL: test19:
1050 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
1051 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
1052 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1053 ; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
1054 ; SKX_32-NEXT: vzeroupper
1056 %gep = getelementptr double, double* %ptr, <4 x i64> %ind
1057 call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
; test20 - v2f32 scatter: the 2-element data type must be widened to a legal
; vector width, so the checks expect xmm/ymm/zmm scatters guarded by a
; 2-bit k-mask (kshift by 14 on KNL, vpmovq2m on SKX).
1061 ; Data type requires widening
1062 define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
1063 ; KNL_64-LABEL: test20:
1065 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1066 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1067 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
1068 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
1069 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1070 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1071 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
1072 ; KNL_64-NEXT: vzeroupper
1075 ; KNL_32-LABEL: test20:
1077 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1078 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1079 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
1080 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
1081 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1082 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1083 ; KNL_32-NEXT: vscatterdps %zmm0, (,%zmm1) {%k1}
1084 ; KNL_32-NEXT: vzeroupper
1087 ; SKX-LABEL: test20:
1089 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
1090 ; SKX-NEXT: vpmovq2m %xmm2, %k1
1091 ; SKX-NEXT: vscatterqps %xmm0, (,%xmm1) {%k1}
1094 ; SKX_32-LABEL: test20:
1096 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
1097 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1
1098 ; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1}
1100 call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
; test21 - v2i32 scatter: the element type must be promoted before a legal
; scatter can be formed (vpscatterqd on 64-bit targets, vpscatterdd on
; 32-bit targets).
1104 ; Data type requires promotion
1105 define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
1106 ; KNL_64-LABEL: test21:
1108 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1109 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1110 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
1111 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
1112 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1113 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1114 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
1115 ; KNL_64-NEXT: vzeroupper
1118 ; KNL_32-LABEL: test21:
1120 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1121 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1122 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
1123 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
1124 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1125 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1126 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
1127 ; KNL_32-NEXT: vzeroupper
1130 ; SKX-LABEL: test21:
1132 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
1133 ; SKX-NEXT: vpmovq2m %xmm2, %k1
1134 ; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1}
1137 ; SKX_32-LABEL: test21:
1139 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
1140 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1
1141 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
1143 call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
; test22 - v2f32 masked gather with passthru; the 2-element result must be
; widened to a legal vector type. Indices are sign-extended i32, so a
; d-index gather (vgatherdps) is expected on all configs.
1147 ; The result type requires widening
1148 declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
1150 define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
1151 ; KNL_64-LABEL: test22:
1153 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1154 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1155 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1156 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1157 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1158 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1159 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
1160 ; KNL_64-NEXT: vmovaps %xmm2, %xmm0
1161 ; KNL_64-NEXT: vzeroupper
1164 ; KNL_32-LABEL: test22:
1166 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1167 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1168 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1169 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1170 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1171 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1172 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1173 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm2 {%k1}
1174 ; KNL_32-NEXT: vmovaps %xmm2, %xmm0
1175 ; KNL_32-NEXT: vzeroupper
1178 ; SKX-LABEL: test22:
1180 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1181 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1182 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
1183 ; SKX-NEXT: vmovaps %xmm2, %xmm0
1186 ; SKX_32-LABEL: test22:
1188 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1189 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1190 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1191 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
1192 ; SKX_32-NEXT: vmovaps %xmm2, %xmm0
1194 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1195 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1196 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
; test22a - same as test22 but the indices are already i64 (no sext in the
; IR), so the checks expect a q-index gather (vgatherqps) instead of the
; d-index form.
1200 define <2 x float> @test22a(float* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x float> %src0) {
1201 ; KNL_64-LABEL: test22a:
1203 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
1204 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1205 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1206 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1207 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1208 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1209 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
1210 ; KNL_64-NEXT: vmovaps %xmm2, %xmm0
1211 ; KNL_64-NEXT: vzeroupper
1214 ; KNL_32-LABEL: test22a:
1216 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
1217 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1218 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1219 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1220 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1221 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1222 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1223 ; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
1224 ; KNL_32-NEXT: vmovaps %xmm2, %xmm0
1225 ; KNL_32-NEXT: vzeroupper
1228 ; SKX-LABEL: test22a:
1230 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1231 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1232 ; SKX-NEXT: vgatherqps (%rdi,%xmm0,4), %xmm2 {%k1}
1233 ; SKX-NEXT: vmovaps %xmm2, %xmm0
1236 ; SKX_32-LABEL: test22a:
1238 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1239 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1240 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1241 ; SKX_32-NEXT: vgatherqps (%eax,%xmm0,4), %xmm2 {%k1}
1242 ; SKX_32-NEXT: vmovaps %xmm2, %xmm0
1244 %gep.random = getelementptr float, float* %base, <2 x i64> %ind
1245 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
; test23 - v2i32 masked gather with passthru, sext'd i32 indices; integer
; counterpart of test22 (vpgatherdd instead of vgatherdps).
1249 declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
1250 declare <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
1252 define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
1253 ; KNL_64-LABEL: test23:
1255 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1256 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1257 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1258 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1259 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1260 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1261 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
1262 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1263 ; KNL_64-NEXT: vzeroupper
1266 ; KNL_32-LABEL: test23:
1268 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1269 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1270 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1271 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1272 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1273 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1274 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1275 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
1276 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1277 ; KNL_32-NEXT: vzeroupper
1280 ; SKX-LABEL: test23:
1282 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1283 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1284 ; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm2 {%k1}
1285 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1288 ; SKX_32-LABEL: test23:
1290 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1291 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1292 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1293 ; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm2 {%k1}
1294 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
1296 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1297 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1298 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
; test23b - like test23 but with i64 indices straight from the argument, so
; the q-index form (vpgatherqd) is expected.
1302 define <2 x i32> @test23b(i32* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %src0) {
1303 ; KNL_64-LABEL: test23b:
1305 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
1306 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1307 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1308 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1309 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1310 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1311 ; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
1312 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1313 ; KNL_64-NEXT: vzeroupper
1316 ; KNL_32-LABEL: test23b:
1318 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
1319 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1320 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1321 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1322 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1323 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1324 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1325 ; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
1326 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1327 ; KNL_32-NEXT: vzeroupper
1330 ; SKX-LABEL: test23b:
1332 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1333 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1334 ; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm2 {%k1}
1335 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1338 ; SKX_32-LABEL: test23b:
1340 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1341 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1342 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1343 ; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm2 {%k1}
1344 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
1346 %gep.random = getelementptr i32, i32* %base, <2 x i64> %ind
1347 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
; test24 - v2i32 gather with a constant all-true mask: the checks expect the
; mask materialized as the immediate 3 (two low bits set) moved into a k-reg
; rather than computed from a vector.
1351 define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
1352 ; KNL_64-LABEL: test24:
1354 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1355 ; KNL_64-NEXT: movw $3, %ax
1356 ; KNL_64-NEXT: kmovw %eax, %k1
1357 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
1358 ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
1359 ; KNL_64-NEXT: vzeroupper
1362 ; KNL_32-LABEL: test24:
1364 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1365 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1366 ; KNL_32-NEXT: movw $3, %cx
1367 ; KNL_32-NEXT: kmovw %ecx, %k1
1368 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
1369 ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
1370 ; KNL_32-NEXT: vzeroupper
1373 ; SKX-LABEL: test24:
1375 ; SKX-NEXT: movb $3, %al
1376 ; SKX-NEXT: kmovw %eax, %k1
1377 ; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
1378 ; SKX-NEXT: vmovdqa %xmm1, %xmm0
1381 ; SKX_32-LABEL: test24:
1383 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1384 ; SKX_32-NEXT: movb $3, %cl
1385 ; SKX_32-NEXT: kmovw %ecx, %k1
1386 ; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1}
1387 ; SKX_32-NEXT: vmovdqa %xmm1, %xmm0
1389 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1390 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1391 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
; test25 - v2i64 masked gather with passthru; 64-bit elements with 32-bit
; indices give the dq form (vpgatherdq).
1395 define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
1396 ; KNL_64-LABEL: test25:
1398 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1399 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1400 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1401 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1402 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1403 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1404 ; KNL_64-NEXT: vpgatherdq (%rdi,%ymm0,8), %zmm2 {%k1}
1405 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1406 ; KNL_64-NEXT: vzeroupper
1409 ; KNL_32-LABEL: test25:
1411 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1412 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1413 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1414 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1415 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1416 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1417 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1418 ; KNL_32-NEXT: vpgatherdq (%eax,%ymm0,8), %zmm2 {%k1}
1419 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1420 ; KNL_32-NEXT: vzeroupper
1423 ; SKX-LABEL: test25:
1425 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1426 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1427 ; SKX-NEXT: vpgatherdq (%rdi,%xmm0,8), %xmm2 {%k1}
1428 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1431 ; SKX_32-LABEL: test25:
1433 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1434 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1435 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1436 ; SKX_32-NEXT: vpgatherdq (%eax,%xmm0,8), %xmm2 {%k1}
1437 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
1439 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1440 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
1441 %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
; test26 - like test25 but the mask is constant all-true: KNL materializes
; the immediate 3 into a k-reg, while SKX can use kxnorw (all-ones mask)
; because the native 128-bit gather uses exactly two lanes.
1445 define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
1446 ; KNL_64-LABEL: test26:
1448 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1449 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1450 ; KNL_64-NEXT: movb $3, %al
1451 ; KNL_64-NEXT: kmovw %eax, %k1
1452 ; KNL_64-NEXT: vpgatherdq (%rdi,%ymm0,8), %zmm1 {%k1}
1453 ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
1454 ; KNL_64-NEXT: vzeroupper
1457 ; KNL_32-LABEL: test26:
1459 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1460 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1461 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1462 ; KNL_32-NEXT: movb $3, %cl
1463 ; KNL_32-NEXT: kmovw %ecx, %k1
1464 ; KNL_32-NEXT: vpgatherdq (%eax,%ymm0,8), %zmm1 {%k1}
1465 ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
1466 ; KNL_32-NEXT: vzeroupper
1469 ; SKX-LABEL: test26:
1471 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1472 ; SKX-NEXT: vpgatherdq (%rdi,%xmm0,8), %xmm1 {%k1}
1473 ; SKX-NEXT: vmovdqa %xmm1, %xmm0
1476 ; SKX_32-LABEL: test26:
1478 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1479 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
1480 ; SKX_32-NEXT: vpgatherdq (%eax,%xmm0,8), %xmm1 {%k1}
1481 ; SKX_32-NEXT: vmovdqa %xmm1, %xmm0
1483 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1484 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
1485 %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
; test27 - v2f32 gather, all-true constant mask; result needs widening, so
; even SKX keeps a 2-bit immediate mask (movb $3) instead of kxnorw.
1489 ; Result type requires widening; all-ones mask
1490 define <2 x float> @test27(float* %base, <2 x i32> %ind) {
1491 ; KNL_64-LABEL: test27:
1493 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1494 ; KNL_64-NEXT: movw $3, %ax
1495 ; KNL_64-NEXT: kmovw %eax, %k1
1496 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1497 ; KNL_64-NEXT: vmovaps %xmm1, %xmm0
1498 ; KNL_64-NEXT: vzeroupper
1501 ; KNL_32-LABEL: test27:
1503 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1504 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1505 ; KNL_32-NEXT: movw $3, %cx
1506 ; KNL_32-NEXT: kmovw %ecx, %k1
1507 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1508 ; KNL_32-NEXT: vmovaps %xmm1, %xmm0
1509 ; KNL_32-NEXT: vzeroupper
1512 ; SKX-LABEL: test27:
1514 ; SKX-NEXT: movb $3, %al
1515 ; SKX-NEXT: kmovw %eax, %k1
1516 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
1517 ; SKX-NEXT: vmovaps %xmm1, %xmm0
1520 ; SKX_32-LABEL: test27:
1522 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1523 ; SKX_32-NEXT: movb $3, %cl
1524 ; SKX_32-NEXT: kmovw %ecx, %k1
1525 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
1526 ; SKX_32-NEXT: vmovaps %xmm1, %xmm0
1528 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1529 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1530 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
; test28 - v2i32 scatter with a constant all-true mask; the data type needs
; promotion, and the per-config mask choice (kxnorw vs movb/movw immediate)
; matches how much of the widened scatter is live.
1534 ; Data type requires promotion, mask is all-ones
1535 define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
1536 ; KNL_64-LABEL: test28:
1538 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1539 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1540 ; KNL_64-NEXT: movb $3, %al
1541 ; KNL_64-NEXT: kmovw %eax, %k1
1542 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
1543 ; KNL_64-NEXT: vzeroupper
1546 ; KNL_32-LABEL: test28:
1548 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1549 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1550 ; KNL_32-NEXT: movw $3, %ax
1551 ; KNL_32-NEXT: kmovw %eax, %k1
1552 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
1553 ; KNL_32-NEXT: vzeroupper
1556 ; SKX-LABEL: test28:
1558 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1559 ; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1}
1562 ; SKX_32-LABEL: test28:
1564 ; SKX_32-NEXT: movb $3, %al
1565 ; SKX_32-NEXT: kmovw %eax, %k1
1566 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
1568 call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
; test29 - v16f32 gather from a splatted base with a constant partial mask
; <0,0,1,1,0,1,0,...> = bits 2,3,5 = 44, which the checks expect folded to
; the immediate 44 moved into a k-reg. The SCALAR prefix checks that the
; scalarize-masked-mem-intrin pass expands the same gather element-wise.
1572 ; SCALAR-LABEL: test29
1573 ; SCALAR: extractelement <16 x float*>
1574 ; SCALAR-NEXT: load float
1575 ; SCALAR-NEXT: insertelement <16 x float>
1576 ; SCALAR-NEXT: extractelement <16 x float*>
1577 ; SCALAR-NEXT: load float
1579 define <16 x float> @test29(float* %base, <16 x i32> %ind) {
1580 ; KNL_64-LABEL: test29:
1582 ; KNL_64-NEXT: movw $44, %ax
1583 ; KNL_64-NEXT: kmovw %eax, %k1
1584 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1585 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
1588 ; KNL_32-LABEL: test29:
1590 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1591 ; KNL_32-NEXT: movw $44, %cx
1592 ; KNL_32-NEXT: kmovw %ecx, %k1
1593 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1594 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
1597 ; SKX-LABEL: test29:
1599 ; SKX-NEXT: movw $44, %ax
1600 ; SKX-NEXT: kmovw %eax, %k1
1601 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1602 ; SKX-NEXT: vmovaps %zmm1, %zmm0
1605 ; SKX_32-LABEL: test29:
1607 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1608 ; SKX_32-NEXT: movw $44, %cx
1609 ; SKX_32-NEXT: kmovw %ecx, %k1
1610 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1611 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
1614 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
1615 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
1617 %sext_ind = sext <16 x i32> %ind to <16 x i64>
1618 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
1620 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
1621 ret <16 x float>%res
1624 ; Check non-power-of-2 case. It should be scalarized.
1625 declare <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
1626 define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
1627 ; KNL_64-LABEL: test30:
1629 ; KNL_64-NEXT: vpslld $31, %xmm2, %xmm2
1630 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k0
1631 ; KNL_64-NEXT: kshiftrw $1, %k0, %k1
1632 ; KNL_64-NEXT: kshiftrw $2, %k0, %k2
1633 ; KNL_64-NEXT: kmovw %k0, %eax
1634 ; KNL_64-NEXT: andb $1, %al
1635 ; KNL_64-NEXT: kmovw %k1, %ecx
1636 ; KNL_64-NEXT: andb $1, %cl
1637 ; KNL_64-NEXT: addb %cl, %cl
1638 ; KNL_64-NEXT: orb %al, %cl
1639 ; KNL_64-NEXT: kmovw %k2, %eax
1640 ; KNL_64-NEXT: andb $1, %al
1641 ; KNL_64-NEXT: shlb $2, %al
1642 ; KNL_64-NEXT: orb %cl, %al
1643 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
1644 ; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
1645 ; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
1646 ; KNL_64-NEXT: testb $1, %al
1647 ; KNL_64-NEXT: jne .LBB31_1
1648 ; KNL_64-NEXT: # %bb.2: # %else
1649 ; KNL_64-NEXT: testb $2, %al
1650 ; KNL_64-NEXT: jne .LBB31_3
1651 ; KNL_64-NEXT: .LBB31_4: # %else2
1652 ; KNL_64-NEXT: testb $4, %al
1653 ; KNL_64-NEXT: jne .LBB31_5
1654 ; KNL_64-NEXT: .LBB31_6: # %else5
1655 ; KNL_64-NEXT: vmovdqa %xmm3, %xmm0
1656 ; KNL_64-NEXT: vzeroupper
1658 ; KNL_64-NEXT: .LBB31_1: # %cond.load
1659 ; KNL_64-NEXT: vmovq %xmm0, %rcx
1660 ; KNL_64-NEXT: vpinsrd $0, (%rcx), %xmm3, %xmm3
1661 ; KNL_64-NEXT: testb $2, %al
1662 ; KNL_64-NEXT: je .LBB31_4
1663 ; KNL_64-NEXT: .LBB31_3: # %cond.load1
1664 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
1665 ; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm3, %xmm3
1666 ; KNL_64-NEXT: testb $4, %al
1667 ; KNL_64-NEXT: je .LBB31_6
1668 ; KNL_64-NEXT: .LBB31_5: # %cond.load4
1669 ; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
1670 ; KNL_64-NEXT: vmovq %xmm0, %rax
1671 ; KNL_64-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
1672 ; KNL_64-NEXT: vmovdqa %xmm3, %xmm0
1673 ; KNL_64-NEXT: vzeroupper
1676 ; KNL_32-LABEL: test30:
1678 ; KNL_32-NEXT: subl $12, %esp
1679 ; KNL_32-NEXT: .cfi_def_cfa_offset 16
1680 ; KNL_32-NEXT: vmovdqa %xmm0, %xmm3
1681 ; KNL_32-NEXT: vpslld $31, %xmm2, %xmm0
1682 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k0
1683 ; KNL_32-NEXT: kshiftrw $1, %k0, %k1
1684 ; KNL_32-NEXT: kshiftrw $2, %k0, %k2
1685 ; KNL_32-NEXT: kmovw %k0, %eax
1686 ; KNL_32-NEXT: andb $1, %al
1687 ; KNL_32-NEXT: kmovw %k1, %ecx
1688 ; KNL_32-NEXT: andb $1, %cl
1689 ; KNL_32-NEXT: addb %cl, %cl
1690 ; KNL_32-NEXT: orb %al, %cl
1691 ; KNL_32-NEXT: kmovw %k2, %eax
1692 ; KNL_32-NEXT: andb $1, %al
1693 ; KNL_32-NEXT: shlb $2, %al
1694 ; KNL_32-NEXT: orb %cl, %al
1695 ; KNL_32-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0
1696 ; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
1697 ; KNL_32-NEXT: vpaddd %xmm1, %xmm3, %xmm1
1698 ; KNL_32-NEXT: testb $1, %al
1699 ; KNL_32-NEXT: jne .LBB31_1
1700 ; KNL_32-NEXT: # %bb.2: # %else
1701 ; KNL_32-NEXT: testb $2, %al
1702 ; KNL_32-NEXT: jne .LBB31_3
1703 ; KNL_32-NEXT: .LBB31_4: # %else2
1704 ; KNL_32-NEXT: testb $4, %al
1705 ; KNL_32-NEXT: jne .LBB31_5
1706 ; KNL_32-NEXT: .LBB31_6: # %else5
1707 ; KNL_32-NEXT: addl $12, %esp
1708 ; KNL_32-NEXT: .cfi_def_cfa_offset 4
1709 ; KNL_32-NEXT: vzeroupper
1711 ; KNL_32-NEXT: .LBB31_1: # %cond.load
1712 ; KNL_32-NEXT: .cfi_def_cfa_offset 16
1713 ; KNL_32-NEXT: vmovd %xmm1, %ecx
1714 ; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm0, %xmm0
1715 ; KNL_32-NEXT: testb $2, %al
1716 ; KNL_32-NEXT: je .LBB31_4
1717 ; KNL_32-NEXT: .LBB31_3: # %cond.load1
1718 ; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
1719 ; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
1720 ; KNL_32-NEXT: testb $4, %al
1721 ; KNL_32-NEXT: je .LBB31_6
1722 ; KNL_32-NEXT: .LBB31_5: # %cond.load4
1723 ; KNL_32-NEXT: vpextrd $2, %xmm1, %eax
1724 ; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
1725 ; KNL_32-NEXT: addl $12, %esp
1726 ; KNL_32-NEXT: .cfi_def_cfa_offset 4
1727 ; KNL_32-NEXT: vzeroupper
1730 ; SKX-LABEL: test30:
1732 ; SKX-NEXT: vpslld $31, %xmm2, %xmm2
1733 ; SKX-NEXT: vpmovd2m %xmm2, %k0
1734 ; SKX-NEXT: kshiftrb $1, %k0, %k1
1735 ; SKX-NEXT: kshiftrb $2, %k0, %k2
1736 ; SKX-NEXT: kmovw %k0, %eax
1737 ; SKX-NEXT: andb $1, %al
1738 ; SKX-NEXT: kmovw %k1, %ecx
1739 ; SKX-NEXT: andb $1, %cl
1740 ; SKX-NEXT: addb %cl, %cl
1741 ; SKX-NEXT: orb %al, %cl
1742 ; SKX-NEXT: kmovw %k2, %eax
1743 ; SKX-NEXT: andb $1, %al
1744 ; SKX-NEXT: shlb $2, %al
1745 ; SKX-NEXT: orb %cl, %al
1746 ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
1747 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
1748 ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
1749 ; SKX-NEXT: testb $1, %al
1750 ; SKX-NEXT: jne .LBB31_1
1751 ; SKX-NEXT: # %bb.2: # %else
1752 ; SKX-NEXT: testb $2, %al
1753 ; SKX-NEXT: jne .LBB31_3
1754 ; SKX-NEXT: .LBB31_4: # %else2
1755 ; SKX-NEXT: testb $4, %al
1756 ; SKX-NEXT: jne .LBB31_5
1757 ; SKX-NEXT: .LBB31_6: # %else5
1758 ; SKX-NEXT: vmovdqa %xmm3, %xmm0
1759 ; SKX-NEXT: vzeroupper
1761 ; SKX-NEXT: .LBB31_1: # %cond.load
1762 ; SKX-NEXT: vmovq %xmm0, %rcx
1763 ; SKX-NEXT: vpinsrd $0, (%rcx), %xmm3, %xmm3
1764 ; SKX-NEXT: testb $2, %al
1765 ; SKX-NEXT: je .LBB31_4
1766 ; SKX-NEXT: .LBB31_3: # %cond.load1
1767 ; SKX-NEXT: vpextrq $1, %xmm0, %rcx
1768 ; SKX-NEXT: vpinsrd $1, (%rcx), %xmm3, %xmm3
1769 ; SKX-NEXT: testb $4, %al
1770 ; SKX-NEXT: je .LBB31_6
1771 ; SKX-NEXT: .LBB31_5: # %cond.load4
1772 ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
1773 ; SKX-NEXT: vmovq %xmm0, %rax
1774 ; SKX-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
1775 ; SKX-NEXT: vmovdqa %xmm3, %xmm0
1776 ; SKX-NEXT: vzeroupper
1779 ; SKX_32-LABEL: test30:
1781 ; SKX_32-NEXT: subl $12, %esp
1782 ; SKX_32-NEXT: .cfi_def_cfa_offset 16
1783 ; SKX_32-NEXT: vmovdqa %xmm0, %xmm3
1784 ; SKX_32-NEXT: vpslld $31, %xmm2, %xmm0
1785 ; SKX_32-NEXT: vpmovd2m %xmm0, %k0
1786 ; SKX_32-NEXT: kshiftrb $1, %k0, %k1
1787 ; SKX_32-NEXT: kshiftrb $2, %k0, %k2
1788 ; SKX_32-NEXT: kmovw %k0, %eax
1789 ; SKX_32-NEXT: andb $1, %al
1790 ; SKX_32-NEXT: kmovw %k1, %ecx
1791 ; SKX_32-NEXT: andb $1, %cl
1792 ; SKX_32-NEXT: addb %cl, %cl
1793 ; SKX_32-NEXT: orb %al, %cl
1794 ; SKX_32-NEXT: kmovw %k2, %eax
1795 ; SKX_32-NEXT: andb $1, %al
1796 ; SKX_32-NEXT: shlb $2, %al
1797 ; SKX_32-NEXT: orb %cl, %al
1798 ; SKX_32-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0
1799 ; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
1800 ; SKX_32-NEXT: vpaddd %xmm1, %xmm3, %xmm1
1801 ; SKX_32-NEXT: testb $1, %al
1802 ; SKX_32-NEXT: jne .LBB31_1
1803 ; SKX_32-NEXT: # %bb.2: # %else
1804 ; SKX_32-NEXT: testb $2, %al
1805 ; SKX_32-NEXT: jne .LBB31_3
1806 ; SKX_32-NEXT: .LBB31_4: # %else2
1807 ; SKX_32-NEXT: testb $4, %al
1808 ; SKX_32-NEXT: jne .LBB31_5
1809 ; SKX_32-NEXT: .LBB31_6: # %else5
1810 ; SKX_32-NEXT: addl $12, %esp
1811 ; SKX_32-NEXT: .cfi_def_cfa_offset 4
1813 ; SKX_32-NEXT: .LBB31_1: # %cond.load
1814 ; SKX_32-NEXT: .cfi_def_cfa_offset 16
1815 ; SKX_32-NEXT: vmovd %xmm1, %ecx
1816 ; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm0, %xmm0
1817 ; SKX_32-NEXT: testb $2, %al
1818 ; SKX_32-NEXT: je .LBB31_4
1819 ; SKX_32-NEXT: .LBB31_3: # %cond.load1
1820 ; SKX_32-NEXT: vpextrd $1, %xmm1, %ecx
1821 ; SKX_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
1822 ; SKX_32-NEXT: testb $4, %al
1823 ; SKX_32-NEXT: je .LBB31_6
1824 ; SKX_32-NEXT: .LBB31_5: # %cond.load4
1825 ; SKX_32-NEXT: vpextrd $2, %xmm1, %eax
1826 ; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
1827 ; SKX_32-NEXT: addl $12, %esp
1828 ; SKX_32-NEXT: .cfi_def_cfa_offset 4
1831 %sext_ind = sext <3 x i32> %ind to <3 x i64>
1832 %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
1833 %res = call <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
1837 declare <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
1838 define <16 x float*> @test31(<16 x float**> %ptrs) {
1839 ; KNL_64-LABEL: test31:
1841 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
1842 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
1843 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
1844 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
1845 ; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0
1846 ; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm1
1849 ; KNL_32-LABEL: test31:
1851 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
1852 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
1853 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
1856 ; SKX-LABEL: test31:
1858 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1859 ; SKX-NEXT: kxnorw %k0, %k0, %k2
1860 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
1861 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
1862 ; SKX-NEXT: vmovdqa64 %zmm2, %zmm0
1863 ; SKX-NEXT: vmovdqa64 %zmm3, %zmm1
1866 ; SKX_32-LABEL: test31:
1868 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
1869 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
1870 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
1873 %res = call <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
1874 ret <16 x float*>%res
1877 define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
1878 ; KNL_64-LABEL: test_gather_16i32:
1880 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1881 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1882 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1883 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2
1884 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1885 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
1886 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
1887 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
1890 ; KNL_32-LABEL: test_gather_16i32:
1892 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1893 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1894 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1895 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
1896 ; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
1899 ; SKX-LABEL: test_gather_16i32:
1901 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1902 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1903 ; SKX-NEXT: vpmovd2m %zmm2, %k1
1904 ; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm2
1905 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1906 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
1907 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
1908 ; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
1911 ; SKX_32-LABEL: test_gather_16i32:
1913 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1914 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1915 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
1916 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
1917 ; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
1919 %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
1922 define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
1923 ; KNL_64-LABEL: test_gather_16i64:
1925 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1926 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1927 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1928 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1929 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
1930 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
1931 ; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm0
1932 ; KNL_64-NEXT: vmovdqa64 %zmm4, %zmm1
1935 ; KNL_32-LABEL: test_gather_16i64:
1937 ; KNL_32-NEXT: pushl %ebp
1938 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1939 ; KNL_32-NEXT: .cfi_offset %ebp, -8
1940 ; KNL_32-NEXT: movl %esp, %ebp
1941 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
1942 ; KNL_32-NEXT: andl $-64, %esp
1943 ; KNL_32-NEXT: subl $64, %esp
1944 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1945 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1946 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1947 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1948 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
1949 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
1950 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1951 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
1952 ; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
1953 ; KNL_32-NEXT: movl %ebp, %esp
1954 ; KNL_32-NEXT: popl %ebp
1955 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
1958 ; SKX-LABEL: test_gather_16i64:
1960 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1961 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1962 ; SKX-NEXT: vpmovd2m %zmm2, %k1
1963 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1964 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
1965 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
1966 ; SKX-NEXT: vmovdqa64 %zmm3, %zmm0
1967 ; SKX-NEXT: vmovdqa64 %zmm4, %zmm1
1970 ; SKX_32-LABEL: test_gather_16i64:
1972 ; SKX_32-NEXT: pushl %ebp
1973 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
1974 ; SKX_32-NEXT: .cfi_offset %ebp, -8
1975 ; SKX_32-NEXT: movl %esp, %ebp
1976 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
1977 ; SKX_32-NEXT: andl $-64, %esp
1978 ; SKX_32-NEXT: subl $64, %esp
1979 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1980 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1981 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
1982 ; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1983 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
1984 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
1985 ; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1986 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
1987 ; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
1988 ; SKX_32-NEXT: movl %ebp, %esp
1989 ; SKX_32-NEXT: popl %ebp
1990 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
1992 %res = call <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
1995 declare <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
1996 define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
1997 ; KNL_64-LABEL: test_gather_16f32:
1999 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2000 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2001 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2002 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2
2003 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2004 ; KNL_64-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
2005 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
2006 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
2009 ; KNL_32-LABEL: test_gather_16f32:
2011 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2012 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2013 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2014 ; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
2015 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
2018 ; SKX-LABEL: test_gather_16f32:
2020 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2021 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2022 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2023 ; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm2
2024 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2025 ; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
2026 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
2027 ; SKX-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
2030 ; SKX_32-LABEL: test_gather_16f32:
2032 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2033 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2034 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2035 ; SKX_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
2036 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0
2038 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
2039 ret <16 x float> %res
2041 define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
2042 ; KNL_64-LABEL: test_gather_16f64:
2044 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2045 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2046 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2047 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2048 ; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
2049 ; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
2050 ; KNL_64-NEXT: vmovapd %zmm3, %zmm0
2051 ; KNL_64-NEXT: vmovapd %zmm4, %zmm1
2054 ; KNL_32-LABEL: test_gather_16f64:
2056 ; KNL_32-NEXT: pushl %ebp
2057 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2058 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2059 ; KNL_32-NEXT: movl %esp, %ebp
2060 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2061 ; KNL_32-NEXT: andl $-64, %esp
2062 ; KNL_32-NEXT: subl $64, %esp
2063 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2064 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2065 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2066 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
2067 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
2068 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
2069 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2070 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
2071 ; KNL_32-NEXT: vmovapd %zmm2, %zmm0
2072 ; KNL_32-NEXT: movl %ebp, %esp
2073 ; KNL_32-NEXT: popl %ebp
2074 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2077 ; SKX-LABEL: test_gather_16f64:
2079 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2080 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2081 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2082 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2083 ; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
2084 ; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
2085 ; SKX-NEXT: vmovapd %zmm3, %zmm0
2086 ; SKX-NEXT: vmovapd %zmm4, %zmm1
2089 ; SKX_32-LABEL: test_gather_16f64:
2091 ; SKX_32-NEXT: pushl %ebp
2092 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2093 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2094 ; SKX_32-NEXT: movl %esp, %ebp
2095 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2096 ; SKX_32-NEXT: andl $-64, %esp
2097 ; SKX_32-NEXT: subl $64, %esp
2098 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2099 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2100 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2101 ; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
2102 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
2103 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
2104 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2105 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
2106 ; SKX_32-NEXT: vmovapd %zmm2, %zmm0
2107 ; SKX_32-NEXT: movl %ebp, %esp
2108 ; SKX_32-NEXT: popl %ebp
2109 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2111 %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
2112 ret <16 x double> %res
2114 declare <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
2115 define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
2116 ; KNL_64-LABEL: test_scatter_16i32:
2118 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2119 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2120 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2121 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2122 ; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
2123 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
2124 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
2125 ; KNL_64-NEXT: vzeroupper
2128 ; KNL_32-LABEL: test_scatter_16i32:
2130 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2131 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2132 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2133 ; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
2134 ; KNL_32-NEXT: vzeroupper
2137 ; SKX-LABEL: test_scatter_16i32:
2139 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2140 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2141 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2142 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2143 ; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
2144 ; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm0
2145 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
2146 ; SKX-NEXT: vzeroupper
2149 ; SKX_32-LABEL: test_scatter_16i32:
2151 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2152 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2153 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2154 ; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
2155 ; SKX_32-NEXT: vzeroupper
2157 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
2160 define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
2161 ; KNL_64-LABEL: test_scatter_16i64:
2163 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2164 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2165 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2166 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2167 ; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
2168 ; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
2169 ; KNL_64-NEXT: vzeroupper
2172 ; KNL_32-LABEL: test_scatter_16i64:
2174 ; KNL_32-NEXT: pushl %ebp
2175 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2176 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2177 ; KNL_32-NEXT: movl %esp, %ebp
2178 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2179 ; KNL_32-NEXT: andl $-64, %esp
2180 ; KNL_32-NEXT: subl $64, %esp
2181 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2182 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2183 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2184 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
2185 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
2186 ; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
2187 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2188 ; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
2189 ; KNL_32-NEXT: movl %ebp, %esp
2190 ; KNL_32-NEXT: popl %ebp
2191 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2192 ; KNL_32-NEXT: vzeroupper
2195 ; SKX-LABEL: test_scatter_16i64:
2197 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2198 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2199 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2200 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2201 ; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
2202 ; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
2203 ; SKX-NEXT: vzeroupper
2206 ; SKX_32-LABEL: test_scatter_16i64:
2208 ; SKX_32-NEXT: pushl %ebp
2209 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2210 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2211 ; SKX_32-NEXT: movl %esp, %ebp
2212 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2213 ; SKX_32-NEXT: andl $-64, %esp
2214 ; SKX_32-NEXT: subl $64, %esp
2215 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2216 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2217 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2218 ; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
2219 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
2220 ; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
2221 ; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2222 ; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
2223 ; SKX_32-NEXT: movl %ebp, %esp
2224 ; SKX_32-NEXT: popl %ebp
2225 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2226 ; SKX_32-NEXT: vzeroupper
2228 call void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
2231 declare void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
2232 define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
2233 ; KNL_64-LABEL: test_scatter_16f32:
2235 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2236 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2237 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2238 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2239 ; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
2240 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0
2241 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
2242 ; KNL_64-NEXT: vzeroupper
2245 ; KNL_32-LABEL: test_scatter_16f32:
2247 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2248 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2249 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2250 ; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
2251 ; KNL_32-NEXT: vzeroupper
2254 ; SKX-LABEL: test_scatter_16f32:
2256 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2257 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2258 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2259 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2260 ; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
2261 ; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm0
2262 ; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
2263 ; SKX-NEXT: vzeroupper
2266 ; SKX_32-LABEL: test_scatter_16f32:
2268 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2269 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2270 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2271 ; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
2272 ; SKX_32-NEXT: vzeroupper
2274 call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
2277 declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
2278 define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
2279 ; KNL_64-LABEL: test_scatter_16f64:
2281 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2282 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2283 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2284 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2285 ; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
2286 ; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
2287 ; KNL_64-NEXT: vzeroupper
2290 ; KNL_32-LABEL: test_scatter_16f64:
2292 ; KNL_32-NEXT: pushl %ebp
2293 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2294 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2295 ; KNL_32-NEXT: movl %esp, %ebp
2296 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2297 ; KNL_32-NEXT: andl $-64, %esp
2298 ; KNL_32-NEXT: subl $64, %esp
2299 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2300 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2301 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2302 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
2303 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
2304 ; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
2305 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2306 ; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
2307 ; KNL_32-NEXT: movl %ebp, %esp
2308 ; KNL_32-NEXT: popl %ebp
2309 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2310 ; KNL_32-NEXT: vzeroupper
2313 ; SKX-LABEL: test_scatter_16f64:
2315 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2316 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2317 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2318 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2319 ; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
2320 ; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
2321 ; SKX-NEXT: vzeroupper
2324 ; SKX_32-LABEL: test_scatter_16f64:
2326 ; SKX_32-NEXT: pushl %ebp
2327 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2328 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2329 ; SKX_32-NEXT: movl %esp, %ebp
2330 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2331 ; SKX_32-NEXT: andl $-64, %esp
2332 ; SKX_32-NEXT: subl $64, %esp
2333 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2334 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2335 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2336 ; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
2337 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
2338 ; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
2339 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2340 ; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
2341 ; SKX_32-NEXT: movl %ebp, %esp
2342 ; SKX_32-NEXT: popl %ebp
2343 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2344 ; SKX_32-NEXT: vzeroupper
2346 call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
2349 declare void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
2351 define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64> %d) {
2352 ; KNL_64-LABEL: test_pr28312:
2354 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2355 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
2356 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
2357 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
2358 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
2359 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm1 {%k1}
2360 ; KNL_64-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2361 ; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2364 ; KNL_32-LABEL: test_pr28312:
2366 ; KNL_32-NEXT: pushl %ebp
2367 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2368 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2369 ; KNL_32-NEXT: movl %esp, %ebp
2370 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2371 ; KNL_32-NEXT: andl $-32, %esp
2372 ; KNL_32-NEXT: subl $32, %esp
2373 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
2374 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
2375 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
2376 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
2377 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
2378 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k1}
2379 ; KNL_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2380 ; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2381 ; KNL_32-NEXT: movl %ebp, %esp
2382 ; KNL_32-NEXT: popl %ebp
2383 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2386 ; SKX-LABEL: test_pr28312:
2388 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
2389 ; SKX-NEXT: vpmovd2m %xmm1, %k1
2390 ; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1}
2391 ; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2392 ; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2395 ; SKX_32-LABEL: test_pr28312:
2397 ; SKX_32-NEXT: pushl %ebp
2398 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2399 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2400 ; SKX_32-NEXT: movl %esp, %ebp
2401 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2402 ; SKX_32-NEXT: andl $-32, %esp
2403 ; SKX_32-NEXT: subl $32, %esp
2404 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
2405 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
2406 ; SKX_32-NEXT: vpgatherdq (,%xmm0), %ymm1 {%k1}
2407 ; SKX_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2408 ; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2409 ; SKX_32-NEXT: movl %ebp, %esp
2410 ; SKX_32-NEXT: popl %ebp
2411 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2413 %g1 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
2414 %g2 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
2415 %g3 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
2416 %a = add <4 x i64> %g1, %g2
2417 %b = add <4 x i64> %a, %g3
2420 declare <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>)
2422 define <8 x i32> @test_global_array(<8 x i64> %indxs) {
2423 ; KNL_64-LABEL: test_global_array:
2425 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
2426 ; KNL_64-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2427 ; KNL_64-NEXT: vmovdqa %ymm1, %ymm0
2430 ; KNL_32-LABEL: test_global_array:
2432 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
2433 ; KNL_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2434 ; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
2437 ; SKX_SMALL-LABEL: test_global_array:
2438 ; SKX_SMALL: # %bb.0:
2439 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
2440 ; SKX_SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2441 ; SKX_SMALL-NEXT: vmovdqa %ymm1, %ymm0
2442 ; SKX_SMALL-NEXT: retq
2444 ; SKX_LARGE-LABEL: test_global_array:
2445 ; SKX_LARGE: # %bb.0:
2446 ; SKX_LARGE-NEXT: movabsq $glob_array, %rax
2447 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
2448 ; SKX_LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
2449 ; SKX_LARGE-NEXT: vmovdqa %ymm1, %ymm0
2450 ; SKX_LARGE-NEXT: retq
2452 ; SKX_32-LABEL: test_global_array:
2454 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2455 ; SKX_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2456 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm0
2458 %p = getelementptr inbounds [16 x i32], [16 x i32]* @glob_array, i64 0, <8 x i64> %indxs
2459 %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
2463 define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) {
2464 ; KNL_64-LABEL: test_global_array_zeroinitializer_index:
2466 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
2467 ; KNL_64-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2468 ; KNL_64-NEXT: vmovdqa %ymm1, %ymm0
2471 ; KNL_32-LABEL: test_global_array_zeroinitializer_index:
2473 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
2474 ; KNL_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2475 ; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
2478 ; SKX_SMALL-LABEL: test_global_array_zeroinitializer_index:
2479 ; SKX_SMALL: # %bb.0:
2480 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
2481 ; SKX_SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2482 ; SKX_SMALL-NEXT: vmovdqa %ymm1, %ymm0
2483 ; SKX_SMALL-NEXT: retq
2485 ; SKX_LARGE-LABEL: test_global_array_zeroinitializer_index:
2486 ; SKX_LARGE: # %bb.0:
2487 ; SKX_LARGE-NEXT: movabsq $glob_array, %rax
2488 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
2489 ; SKX_LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
2490 ; SKX_LARGE-NEXT: vmovdqa %ymm1, %ymm0
2491 ; SKX_LARGE-NEXT: retq
2493 ; SKX_32-LABEL: test_global_array_zeroinitializer_index:
2495 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2496 ; SKX_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2497 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm0
2499 %p = getelementptr inbounds [16 x i32], [16 x i32]* @glob_array, <8 x i64> zeroinitializer, <8 x i64> %indxs
2500 %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
2504 define void @v1_scatter(<1 x i32>%a1, <1 x i32*> %ptr, <1 x i1> %mask) {
2505 ; KNL_64-LABEL: v1_scatter:
2507 ; KNL_64-NEXT: testb $1, %dl
2508 ; KNL_64-NEXT: je .LBB44_2
2509 ; KNL_64-NEXT: # %bb.1: # %cond.store
2510 ; KNL_64-NEXT: movl %edi, (%rsi)
2511 ; KNL_64-NEXT: .LBB44_2: # %else
2514 ; KNL_32-LABEL: v1_scatter:
2516 ; KNL_32-NEXT: testb $1, {{[0-9]+}}(%esp)
2517 ; KNL_32-NEXT: je .LBB44_2
2518 ; KNL_32-NEXT: # %bb.1: # %cond.store
2519 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2520 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
2521 ; KNL_32-NEXT: movl %ecx, (%eax)
2522 ; KNL_32-NEXT: .LBB44_2: # %else
2525 ; SKX-LABEL: v1_scatter:
2527 ; SKX-NEXT: testb $1, %dl
2528 ; SKX-NEXT: je .LBB44_2
2529 ; SKX-NEXT: # %bb.1: # %cond.store
2530 ; SKX-NEXT: movl %edi, (%rsi)
2531 ; SKX-NEXT: .LBB44_2: # %else
2534 ; SKX_32-LABEL: v1_scatter:
2536 ; SKX_32-NEXT: testb $1, {{[0-9]+}}(%esp)
2537 ; SKX_32-NEXT: je .LBB44_2
2538 ; SKX_32-NEXT: # %bb.1: # %cond.store
2539 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2540 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
2541 ; SKX_32-NEXT: movl %ecx, (%eax)
2542 ; SKX_32-NEXT: .LBB44_2: # %else
2544 call void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32> %a1, <1 x i32*> %ptr, i32 4, <1 x i1> %mask)
2547 declare void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32>, <1 x i32*>, i32, <1 x i1>)
2549 define <1 x i32> @v1_gather(<1 x i32*> %ptr, <1 x i1> %mask, <1 x i32> %src0) {
2550 ; KNL_64-LABEL: v1_gather:
2552 ; KNL_64-NEXT: movl (%rdi), %eax
2555 ; KNL_32-LABEL: v1_gather:
2557 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2558 ; KNL_32-NEXT: movl (%eax), %eax
2561 ; SKX-LABEL: v1_gather:
2563 ; SKX-NEXT: movl (%rdi), %eax
2566 ; SKX_32-LABEL: v1_gather:
2568 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2569 ; SKX_32-NEXT: movl (%eax), %eax
2571 %res = call <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*> %ptr, i32 4, <1 x i1> <i1 true>, <1 x i32> %src0)
2574 declare <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*>, i32, <1 x i1>, <1 x i32>)
2576 ; Make sure we don't crash when the index element type is larger than i64 and we need to widen the result
2577 ; This experienced a bad interaction when we widened and then tried to split.
2578 define <2 x float> @large_index(float* %base, <2 x i128> %ind, <2 x i1> %mask, <2 x float> %src0) {
2579 ; KNL_64-LABEL: large_index:
2581 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
2582 ; KNL_64-NEXT: vpsllq $63, %xmm0, %xmm0
2583 ; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k0
2584 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
2585 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
2586 ; KNL_64-NEXT: vmovq %rcx, %xmm0
2587 ; KNL_64-NEXT: vmovq %rsi, %xmm2
2588 ; KNL_64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
2589 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k1}
2590 ; KNL_64-NEXT: vmovaps %xmm1, %xmm0
2591 ; KNL_64-NEXT: vzeroupper
2594 ; KNL_32-LABEL: large_index:
2596 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
2597 ; KNL_32-NEXT: vpsllq $63, %xmm0, %xmm0
2598 ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0
2599 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
2600 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
2601 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2602 ; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2603 ; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2604 ; KNL_32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
2605 ; KNL_32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
2606 ; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm1 {%k1}
2607 ; KNL_32-NEXT: vmovaps %xmm1, %xmm0
2608 ; KNL_32-NEXT: vzeroupper
2611 ; SKX-LABEL: large_index:
2613 ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
2614 ; SKX-NEXT: vpmovq2m %xmm0, %k1
2615 ; SKX-NEXT: vmovq %rcx, %xmm0
2616 ; SKX-NEXT: vmovq %rsi, %xmm2
2617 ; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
2618 ; SKX-NEXT: vgatherqps (%rdi,%xmm0,4), %xmm1 {%k1}
2619 ; SKX-NEXT: vmovaps %xmm1, %xmm0
2622 ; SKX_32-LABEL: large_index:
2624 ; SKX_32-NEXT: vpsllq $63, %xmm0, %xmm0
2625 ; SKX_32-NEXT: vpmovq2m %xmm0, %k1
2626 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2627 ; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2628 ; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2629 ; SKX_32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
2630 ; SKX_32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
2631 ; SKX_32-NEXT: vgatherqps (%eax,%xmm0,4), %xmm1 {%k1}
2632 ; SKX_32-NEXT: vmovaps %xmm1, %xmm0
2634 %gep.random = getelementptr float, float* %base, <2 x i128> %ind
2635 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
2639 ; Make sure we allow the index to be sign extended from an element type smaller than i32.
2640 define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {
2641 ; KNL_64-LABEL: sext_i8_index:
2643 ; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm1
2644 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
2645 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2648 ; KNL_32-LABEL: sext_i8_index:
2650 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2651 ; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm1
2652 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
2653 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2656 ; SKX-LABEL: sext_i8_index:
2658 ; SKX-NEXT: vpmovsxbd %xmm0, %zmm1
2659 ; SKX-NEXT: kxnorw %k0, %k0, %k1
2660 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2663 ; SKX_32-LABEL: sext_i8_index:
2665 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2666 ; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm1
2667 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2668 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2671 %sext_ind = sext <16 x i8> %ind to <16 x i64>
2672 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
2674 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
2675 ret <16 x float>%res
2678 ; Make sure we allow the index to be sign extended from an element type smaller than i32.
2679 define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
2680 ; KNL_64-LABEL: sext_v8i8_index:
2682 ; KNL_64-NEXT: vpmovsxbd %xmm0, %ymm1
2683 ; KNL_64-NEXT: movw $255, %ax
2684 ; KNL_64-NEXT: kmovw %eax, %k1
2685 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2686 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2689 ; KNL_32-LABEL: sext_v8i8_index:
2691 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2692 ; KNL_32-NEXT: vpmovsxbd %xmm0, %ymm1
2693 ; KNL_32-NEXT: movw $255, %cx
2694 ; KNL_32-NEXT: kmovw %ecx, %k1
2695 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2696 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2699 ; SKX-LABEL: sext_v8i8_index:
2701 ; SKX-NEXT: vpmovsxbd %xmm0, %ymm1
2702 ; SKX-NEXT: kxnorw %k0, %k0, %k1
2703 ; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
2706 ; SKX_32-LABEL: sext_v8i8_index:
2708 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2709 ; SKX_32-NEXT: vpmovsxbd %xmm0, %ymm1
2710 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2711 ; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
2714 %sext_ind = sext <8 x i8> %ind to <8 x i64>
2715 %gep.random = getelementptr float, float *%base, <8 x i64> %sext_ind
2717 %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %gep.random, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
2720 declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>) ; masked-gather intrinsic used by the <8 x float> gather tests above (args: pointers, alignment, mask, passthru)
2722 ; Index requires promotion
2723 define void @test_scatter_2i32_index(<2 x double> %a1, double* %base, <2 x i32> %ind, <2 x i1> %mask) {
2724 ; KNL_64-LABEL: test_scatter_2i32_index:
2726 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
2727 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2728 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
2729 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
2730 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
2731 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
2732 ; KNL_64-NEXT: vscatterdpd %zmm0, (%rdi,%ymm1,8) {%k1}
2733 ; KNL_64-NEXT: vzeroupper
2736 ; KNL_32-LABEL: test_scatter_2i32_index:
2738 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
2739 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2740 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
2741 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
2742 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
2743 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
2744 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2745 ; KNL_32-NEXT: vscatterdpd %zmm0, (%eax,%ymm1,8) {%k1}
2746 ; KNL_32-NEXT: vzeroupper
2749 ; SKX-LABEL: test_scatter_2i32_index:
2751 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
2752 ; SKX-NEXT: vpmovq2m %xmm2, %k1
2753 ; SKX-NEXT: vscatterdpd %xmm0, (%rdi,%xmm1,8) {%k1}
2756 ; SKX_32-LABEL: test_scatter_2i32_index:
2758 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
2759 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1
2760 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2761 ; SKX_32-NEXT: vscatterdpd %xmm0, (%eax,%xmm1,8) {%k1}
2763 %gep = getelementptr double, double *%base, <2 x i32> %ind
2764 call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask)
2767 declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double>, <2 x double*>, i32, <2 x i1>) ; masked-scatter intrinsic used by the <2 x double> scatter tests (args: values, pointers, alignment, mask)
2769 define <16 x float> @zext_index(float* %base, <16 x i32> %ind) {
2770 ; KNL_64-LABEL: zext_index:
2772 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
2773 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
2774 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2777 ; KNL_32-LABEL: zext_index:
2779 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2780 ; KNL_32-NEXT: vpandd {{\.LCPI.*}}{1to16}, %zmm0, %zmm1
2781 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
2782 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2785 ; SKX_SMALL-LABEL: zext_index:
2786 ; SKX_SMALL: # %bb.0:
2787 ; SKX_SMALL-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm1
2788 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
2789 ; SKX_SMALL-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2790 ; SKX_SMALL-NEXT: retq
2792 ; SKX_LARGE-LABEL: zext_index:
2793 ; SKX_LARGE: # %bb.0:
2794 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
2795 ; SKX_LARGE-NEXT: vandps (%rax){1to16}, %zmm0, %zmm1
2796 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
2797 ; SKX_LARGE-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2798 ; SKX_LARGE-NEXT: retq
2800 ; SKX_32-LABEL: zext_index:
2802 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2803 ; SKX_32-NEXT: vandps {{\.LCPI.*}}{1to16}, %zmm0, %zmm1
2804 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2805 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2807 %ind_masked = and <16 x i32> %ind, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
2808 %sext_ind = zext <16 x i32> %ind_masked to <16 x i64>
2809 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
2811 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
2812 ret <16 x float>%res
2815 define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %passthru) {
2816 ; KNL_64-LABEL: test_gather_setcc_split:
2818 ; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm4
2819 ; KNL_64-NEXT: vptestnmd %zmm4, %zmm4, %k1
2820 ; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k2
2821 ; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
2822 ; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2823 ; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm3 {%k1}
2824 ; KNL_64-NEXT: vmovapd %zmm2, %zmm0
2825 ; KNL_64-NEXT: vmovapd %zmm3, %zmm1
2828 ; KNL_32-LABEL: test_gather_setcc_split:
2830 ; KNL_32-NEXT: pushl %ebp
2831 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2832 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2833 ; KNL_32-NEXT: movl %esp, %ebp
2834 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2835 ; KNL_32-NEXT: andl $-64, %esp
2836 ; KNL_32-NEXT: subl $64, %esp
2837 ; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3
2838 ; KNL_32-NEXT: movl 8(%ebp), %eax
2839 ; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
2840 ; KNL_32-NEXT: vptestnmd %zmm4, %zmm4, %k1
2841 ; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2
2842 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
2843 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2844 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm3 {%k1}
2845 ; KNL_32-NEXT: vmovapd %zmm2, %zmm0
2846 ; KNL_32-NEXT: vmovapd %zmm3, %zmm1
2847 ; KNL_32-NEXT: movl %ebp, %esp
2848 ; KNL_32-NEXT: popl %ebp
2849 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2852 ; SKX-LABEL: test_gather_setcc_split:
2854 ; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm4
2855 ; SKX-NEXT: vptestnmd %ymm4, %ymm4, %k1
2856 ; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2
2857 ; SKX-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
2858 ; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2859 ; SKX-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm3 {%k1}
2860 ; SKX-NEXT: vmovapd %zmm2, %zmm0
2861 ; SKX-NEXT: vmovapd %zmm3, %zmm1
2864 ; SKX_32-LABEL: test_gather_setcc_split:
2866 ; SKX_32-NEXT: pushl %ebp
2867 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2868 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2869 ; SKX_32-NEXT: movl %esp, %ebp
2870 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2871 ; SKX_32-NEXT: andl $-64, %esp
2872 ; SKX_32-NEXT: subl $64, %esp
2873 ; SKX_32-NEXT: vmovapd 72(%ebp), %zmm3
2874 ; SKX_32-NEXT: movl 8(%ebp), %eax
2875 ; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
2876 ; SKX_32-NEXT: vptestnmd %ymm4, %ymm4, %k1
2877 ; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2
2878 ; SKX_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
2879 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2880 ; SKX_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm3 {%k1}
2881 ; SKX_32-NEXT: vmovapd %zmm2, %zmm0
2882 ; SKX_32-NEXT: vmovapd %zmm3, %zmm1
2883 ; SKX_32-NEXT: movl %ebp, %esp
2884 ; SKX_32-NEXT: popl %ebp
2885 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2887 %sext_ind = sext <16 x i32> %ind to <16 x i64>
2888 %gep.random = getelementptr double, double *%base, <16 x i64> %sext_ind
2890 %mask = icmp eq <16 x i32> %cmp, zeroinitializer
2891 %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %gep.random, i32 4, <16 x i1> %mask, <16 x double> %passthru)
2892 ret <16 x double>%res
2895 define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %src0) {
2896 ; KNL_64-LABEL: test_scatter_setcc_split:
2898 ; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm4
2899 ; KNL_64-NEXT: vptestnmd %zmm4, %zmm4, %k1
2900 ; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k2
2901 ; KNL_64-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
2902 ; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2903 ; KNL_64-NEXT: vscatterdpd %zmm3, (%rdi,%ymm0,8) {%k1}
2904 ; KNL_64-NEXT: vzeroupper
2907 ; KNL_32-LABEL: test_scatter_setcc_split:
2909 ; KNL_32-NEXT: pushl %ebp
2910 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2911 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2912 ; KNL_32-NEXT: movl %esp, %ebp
2913 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2914 ; KNL_32-NEXT: andl $-64, %esp
2915 ; KNL_32-NEXT: subl $64, %esp
2916 ; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3
2917 ; KNL_32-NEXT: movl 8(%ebp), %eax
2918 ; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
2919 ; KNL_32-NEXT: vptestnmd %zmm4, %zmm4, %k1
2920 ; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2
2921 ; KNL_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
2922 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2923 ; KNL_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
2924 ; KNL_32-NEXT: movl %ebp, %esp
2925 ; KNL_32-NEXT: popl %ebp
2926 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2927 ; KNL_32-NEXT: vzeroupper
2930 ; SKX-LABEL: test_scatter_setcc_split:
2932 ; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm4
2933 ; SKX-NEXT: vptestnmd %ymm4, %ymm4, %k1
2934 ; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2
2935 ; SKX-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
2936 ; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2937 ; SKX-NEXT: vscatterdpd %zmm3, (%rdi,%ymm0,8) {%k1}
2938 ; SKX-NEXT: vzeroupper
2941 ; SKX_32-LABEL: test_scatter_setcc_split:
2943 ; SKX_32-NEXT: pushl %ebp
2944 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2945 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2946 ; SKX_32-NEXT: movl %esp, %ebp
2947 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2948 ; SKX_32-NEXT: andl $-64, %esp
2949 ; SKX_32-NEXT: subl $64, %esp
2950 ; SKX_32-NEXT: vmovapd 72(%ebp), %zmm3
2951 ; SKX_32-NEXT: movl 8(%ebp), %eax
2952 ; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
2953 ; SKX_32-NEXT: vptestnmd %ymm4, %ymm4, %k1
2954 ; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2
2955 ; SKX_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
2956 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2957 ; SKX_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
2958 ; SKX_32-NEXT: movl %ebp, %esp
2959 ; SKX_32-NEXT: popl %ebp
2960 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2961 ; SKX_32-NEXT: vzeroupper
2963 %sext_ind = sext <16 x i32> %ind to <16 x i64>
2964 %gep.random = getelementptr double, double *%base, <16 x i64> %sext_ind
2966 %mask = icmp eq <16 x i32> %cmp, zeroinitializer
2967 call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %gep.random, i32 4, <16 x i1> %mask)
2971 ; This test case previously triggered an infinite loop when the two gathers became identical after DAG combine removed the sign extend.
2972 define <16 x float> @test_sext_cse(float* %base, <16 x i32> %ind, <16 x i32>* %foo) {
2973 ; KNL_64-LABEL: test_sext_cse:
2975 ; KNL_64-NEXT: vmovaps %zmm0, (%rsi)
2976 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
2977 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
2978 ; KNL_64-NEXT: vaddps %zmm1, %zmm1, %zmm0
2981 ; KNL_32-LABEL: test_sext_cse:
2983 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2984 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
2985 ; KNL_32-NEXT: vmovaps %zmm0, (%ecx)
2986 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
2987 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
2988 ; KNL_32-NEXT: vaddps %zmm1, %zmm1, %zmm0
2991 ; SKX-LABEL: test_sext_cse:
2993 ; SKX-NEXT: vmovaps %zmm0, (%rsi)
2994 ; SKX-NEXT: kxnorw %k0, %k0, %k1
2995 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
2996 ; SKX-NEXT: vaddps %zmm1, %zmm1, %zmm0
2999 ; SKX_32-LABEL: test_sext_cse:
3001 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3002 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
3003 ; SKX_32-NEXT: vmovaps %zmm0, (%ecx)
3004 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
3005 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
3006 ; SKX_32-NEXT: vaddps %zmm1, %zmm1, %zmm0
3008 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
3009 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
3011 %sext_ind = sext <16 x i32> %ind to <16 x i64>
3012 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
3014 store <16 x i32> %ind, <16 x i32>* %foo
3015 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
3016 %gep.random2 = getelementptr float, <16 x float*> %broadcast.splat, <16 x i32> %ind
3017 %res2 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random2, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
3018 %res3 = fadd <16 x float> %res2, %res
3019 ret <16 x float>%res3
3022 define void @zero_mask(<2 x double>%a1, <2 x double*> %ptr) {
3023 ; ALL-LABEL: zero_mask:
3025 ; ALL-NEXT: ret{{[l|q]}}
3026 call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %ptr, i32 4, <2 x i1> zeroinitializer)