1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_64
3 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32
4 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_SMALL
5 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq -code-model=large < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_LARGE
6 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32
7 ; RUN: opt -mtriple=x86_64-apple-darwin -scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
8 ; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null
10 @glob_array = internal unnamed_addr constant [16 x i32] [i32 1, i32 1, i32 2, i32 3, i32 5, i32 8, i32 13, i32 21, i32 34, i32 55, i32 89, i32 144, i32 233, i32 377, i32 610, i32 987], align 16
13 ; SCALAR: extractelement <16 x float*>
14 ; SCALAR-NEXT: load float
15 ; SCALAR-NEXT: insertelement <16 x float>
16 ; SCALAR-NEXT: extractelement <16 x float*>
17 ; SCALAR-NEXT: load float
19 define <16 x float> @test1(float* %base, <16 x i32> %ind) {
20 ; KNL_64-LABEL: test1:
22 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
23 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
24 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
27 ; KNL_32-LABEL: test1:
29 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
30 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
31 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
32 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
37 ; SKX-NEXT: kxnorw %k0, %k0, %k1
38 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
39 ; SKX-NEXT: vmovaps %zmm1, %zmm0
42 ; SKX_32-LABEL: test1:
44 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
45 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
46 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
47 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
50 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
51 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
53 %sext_ind = sext <16 x i32> %ind to <16 x i64>
54 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
56 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
; Declarations of the masked-gather intrinsics exercised by the tests in this
; file (512-bit i32/f32 variants and the 256-bit i32 variant). Signature:
; (pointer vector, alignment, mask, passthru) -> gathered vector.
60 declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
61 declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
62 declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
66 ; SCALAR: extractelement <16 x float*>
67 ; SCALAR-NEXT: load float
68 ; SCALAR-NEXT: insertelement <16 x float>
69 ; SCALAR-NEXT: br label %else
71 ; SCALAR-NEXT: %res.phi.else = phi
72 ; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
73 ; SCALAR-NEXT: br i1 %Mask1, label %cond.load1, label %else2
75 define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
76 ; KNL_64-LABEL: test2:
78 ; KNL_64-NEXT: kmovw %esi, %k1
79 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
80 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
83 ; KNL_32-LABEL: test2:
85 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
86 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
87 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
88 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
93 ; SKX-NEXT: kmovw %esi, %k1
94 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
95 ; SKX-NEXT: vmovaps %zmm1, %zmm0
98 ; SKX_32-LABEL: test2:
100 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
101 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
102 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
103 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
106 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
107 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
109 %sext_ind = sext <16 x i32> %ind to <16 x i64>
110 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
111 %imask = bitcast i16 %mask to <16 x i1>
112 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
113 ret <16 x float> %res
116 define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
117 ; KNL_64-LABEL: test3:
119 ; KNL_64-NEXT: kmovw %esi, %k1
120 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
121 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0
124 ; KNL_32-LABEL: test3:
126 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
127 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
128 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
129 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
134 ; SKX-NEXT: kmovw %esi, %k1
135 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
136 ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
139 ; SKX_32-LABEL: test3:
141 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
142 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
143 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
144 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
147 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
148 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
150 %sext_ind = sext <16 x i32> %ind to <16 x i64>
151 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind
152 %imask = bitcast i16 %mask to <16 x i1>
153 %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
158 define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
159 ; KNL_64-LABEL: test4:
161 ; KNL_64-NEXT: kmovw %esi, %k1
162 ; KNL_64-NEXT: kmovw %k1, %k2
163 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
164 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
165 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
166 ; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0
169 ; KNL_32-LABEL: test4:
171 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
172 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
173 ; KNL_32-NEXT: kmovw %k1, %k2
174 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
175 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
176 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
177 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
182 ; SKX-NEXT: kmovw %esi, %k1
183 ; SKX-NEXT: kmovw %k1, %k2
184 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
185 ; SKX-NEXT: vmovdqa64 %zmm1, %zmm2
186 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
187 ; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0
190 ; SKX_32-LABEL: test4:
192 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
193 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
194 ; SKX_32-NEXT: kmovw %k1, %k2
195 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
196 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
197 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
198 ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
201 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
202 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
204 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
205 %imask = bitcast i16 %mask to <16 x i1>
206 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
207 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
208 %res = add <16 x i32> %gt1, %gt2
213 ; SCALAR-LABEL: test5
214 ; SCALAR: %Mask0 = extractelement <16 x i1> %imask, i32 0
215 ; SCALAR-NEXT: br i1 %Mask0, label %cond.store, label %else
216 ; SCALAR: cond.store:
217 ; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> %val, i32 0
218 ; SCALAR-NEXT: %Ptr0 = extractelement <16 x i32*> %gep.random, i32 0
219 ; SCALAR-NEXT: store i32 %Elt0, i32* %Ptr0, align 4
220 ; SCALAR-NEXT: br label %else
222 ; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
223 ; SCALAR-NEXT: br i1 %Mask1, label %cond.store1, label %else2
225 define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
226 ; KNL_64-LABEL: test5:
228 ; KNL_64-NEXT: kmovw %esi, %k1
229 ; KNL_64-NEXT: kmovw %k1, %k2
230 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
231 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
232 ; KNL_64-NEXT: vzeroupper
235 ; KNL_32-LABEL: test5:
237 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
238 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
239 ; KNL_32-NEXT: kmovw %k1, %k2
240 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
241 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
242 ; KNL_32-NEXT: vzeroupper
247 ; SKX-NEXT: kmovw %esi, %k1
248 ; SKX-NEXT: kmovw %k1, %k2
249 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
250 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
251 ; SKX-NEXT: vzeroupper
254 ; SKX_32-LABEL: test5:
256 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
257 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
258 ; SKX_32-NEXT: kmovw %k1, %k2
259 ; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
260 ; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
261 ; SKX_32-NEXT: vzeroupper
264 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
265 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
267 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
268 %imask = bitcast i16 %mask to <16 x i1>
269 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
270 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
; Declarations of the masked-scatter intrinsics used by the scatter tests.
; Signature: (value vector, pointer vector, alignment, mask) -> void.
274 declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
275 declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )
278 ; SCALAR-LABEL: test6
279 ; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4
280 ; SCALAR-NEXT: %Elt1 = extractelement <8 x i32> %a1, i32 1
281 ; SCALAR-NEXT: %Ptr12 = extractelement <8 x i32*> %ptr, i32 1
282 ; SCALAR-NEXT: store i32 %Elt1, i32* %Ptr12, align 4
283 ; SCALAR-NEXT: %Elt2 = extractelement <8 x i32> %a1, i32 2
284 ; SCALAR-NEXT: %Ptr23 = extractelement <8 x i32*> %ptr, i32 2
285 ; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4
287 define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
288 ; KNL_64-LABEL: test6:
290 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
291 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
292 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
293 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
294 ; KNL_64-NEXT: vmovdqa %ymm2, %ymm0
297 ; KNL_32-LABEL: test6:
299 ; KNL_32-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
300 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
301 ; KNL_32-NEXT: movw $255, %ax
302 ; KNL_32-NEXT: kmovw %eax, %k1
303 ; KNL_32-NEXT: kmovw %k1, %k2
304 ; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm2 {%k2}
305 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
306 ; KNL_32-NEXT: vmovdqa %ymm2, %ymm0
311 ; SKX-NEXT: kxnorw %k0, %k0, %k1
312 ; SKX-NEXT: kxnorw %k0, %k0, %k2
313 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
314 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
315 ; SKX-NEXT: vmovdqa %ymm2, %ymm0
318 ; SKX_32-LABEL: test6:
320 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
321 ; SKX_32-NEXT: kxnorw %k0, %k0, %k2
322 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2}
323 ; SKX_32-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1}
324 ; SKX_32-NEXT: vmovdqa %ymm2, %ymm0
327 %a = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
329 call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
333 define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
335 ; KNL_64-LABEL: test7:
337 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
338 ; KNL_64-NEXT: kmovw %esi, %k0
339 ; KNL_64-NEXT: kshiftlw $8, %k0, %k0
340 ; KNL_64-NEXT: kshiftrw $8, %k0, %k1
341 ; KNL_64-NEXT: kmovw %k1, %k2
342 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
343 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
344 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
345 ; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0
348 ; KNL_32-LABEL: test7:
350 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
351 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
352 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
353 ; KNL_32-NEXT: kmovw %ecx, %k0
354 ; KNL_32-NEXT: kshiftlw $8, %k0, %k0
355 ; KNL_32-NEXT: kshiftrw $8, %k0, %k1
356 ; KNL_32-NEXT: kmovw %k1, %k2
357 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
358 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
359 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
360 ; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
365 ; SKX-NEXT: kmovw %esi, %k1
366 ; SKX-NEXT: kmovw %k1, %k2
367 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
368 ; SKX-NEXT: vmovdqa %ymm1, %ymm2
369 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
370 ; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0
373 ; SKX_32-LABEL: test7:
375 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
376 ; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
377 ; SKX_32-NEXT: kmovw %k1, %k2
378 ; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm1 {%k2}
379 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm2
380 ; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm2 {%k1}
381 ; SKX_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
384 %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
385 %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer
387 %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind
388 %imask = bitcast i8 %mask to <8 x i1>
389 %gt1 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
390 %gt2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
391 %res = add <8 x i32> %gt1, %gt2
395 ; No uniform base in this case, index <8 x i64> contains addresses,
396 ; each gather call will be split into two
397 define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
398 ; KNL_64-LABEL: test8:
400 ; KNL_64-NEXT: kmovw %edi, %k1
401 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
402 ; KNL_64-NEXT: kmovw %k2, %k3
403 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
404 ; KNL_64-NEXT: kmovw %k1, %k3
405 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
406 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
407 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
408 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
409 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
410 ; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0
413 ; KNL_32-LABEL: test8:
415 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
416 ; KNL_32-NEXT: kmovw %k1, %k2
417 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
418 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
419 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
420 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
425 ; SKX-NEXT: kmovw %edi, %k1
426 ; SKX-NEXT: kshiftrw $8, %k1, %k2
427 ; SKX-NEXT: kmovw %k2, %k3
428 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
429 ; SKX-NEXT: kmovw %k1, %k3
430 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
431 ; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
432 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
433 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
434 ; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
435 ; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
438 ; SKX_32-LABEL: test8:
440 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
441 ; SKX_32-NEXT: kmovw %k1, %k2
442 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
443 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
444 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
445 ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
448 %imask = bitcast i16 %mask to <16 x i1>
449 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
450 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
451 %res = add <16 x i32> %gt1, %gt2
455 %struct.RT = type { i8, [10 x [20 x i32]], i8 }
456 %struct.ST = type { i32, double, %struct.RT }
458 ; Masked gather for aggregate types
459 ; Test9 and Test10 should give the same result (scalar and vector indices in GEP)
462 define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
463 ; KNL_64-LABEL: test9:
464 ; KNL_64: # %bb.0: # %entry
465 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
466 ; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
467 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
468 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
469 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
470 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
471 ; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
472 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
473 ; KNL_64-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
474 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
475 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
476 ; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
477 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
478 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
481 ; KNL_32-LABEL: test9:
482 ; KNL_32: # %bb.0: # %entry
483 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
484 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
485 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
486 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
487 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
488 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
489 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
490 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
491 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
492 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1
493 ; KNL_32-NEXT: movw $255, %ax
494 ; KNL_32-NEXT: kmovw %eax, %k1
495 ; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm0 {%k1}
496 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
499 ; SKX_SMALL-LABEL: test9:
500 ; SKX_SMALL: # %bb.0: # %entry
501 ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
502 ; SKX_SMALL-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
503 ; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
504 ; SKX_SMALL-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
505 ; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
506 ; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
507 ; SKX_SMALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
508 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
509 ; SKX_SMALL-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
510 ; SKX_SMALL-NEXT: retq
512 ; SKX_LARGE-LABEL: test9:
513 ; SKX_LARGE: # %bb.0: # %entry
514 ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2
515 ; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
516 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
517 ; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1
518 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
519 ; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
520 ; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
521 ; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
522 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
523 ; SKX_LARGE-NEXT: vpaddq (%rax){1to8}, %zmm0, %zmm1
524 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
525 ; SKX_LARGE-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
526 ; SKX_LARGE-NEXT: retq
528 ; SKX_32-LABEL: test9:
529 ; SKX_32: # %bb.0: # %entry
530 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1
531 ; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
532 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
533 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
534 ; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
535 ; SKX_32-NEXT: vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1
536 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
537 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1}
540 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
541 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
543 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
544 %res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
548 define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
549 ; KNL_64-LABEL: test10:
550 ; KNL_64: # %bb.0: # %entry
551 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
552 ; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
553 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
554 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
555 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
556 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
557 ; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
558 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
559 ; KNL_64-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
560 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
561 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
562 ; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
563 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
564 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
567 ; KNL_32-LABEL: test10:
568 ; KNL_32: # %bb.0: # %entry
569 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
570 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
571 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
572 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
573 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
574 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
575 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
576 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
577 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
578 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1
579 ; KNL_32-NEXT: movw $255, %ax
580 ; KNL_32-NEXT: kmovw %eax, %k1
581 ; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm0 {%k1}
582 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
585 ; SKX_SMALL-LABEL: test10:
586 ; SKX_SMALL: # %bb.0: # %entry
587 ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
588 ; SKX_SMALL-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
589 ; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
590 ; SKX_SMALL-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
591 ; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
592 ; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
593 ; SKX_SMALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
594 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
595 ; SKX_SMALL-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
596 ; SKX_SMALL-NEXT: retq
598 ; SKX_LARGE-LABEL: test10:
599 ; SKX_LARGE: # %bb.0: # %entry
600 ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2
601 ; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
602 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
603 ; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1
604 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
605 ; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
606 ; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
607 ; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
608 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
609 ; SKX_LARGE-NEXT: vpaddq (%rax){1to8}, %zmm0, %zmm1
610 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
611 ; SKX_LARGE-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
612 ; SKX_LARGE-NEXT: retq
614 ; SKX_32-LABEL: test10:
615 ; SKX_32: # %bb.0: # %entry
616 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1
617 ; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
618 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
619 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
620 ; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
621 ; SKX_32-NEXT: vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1
622 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
623 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1}
626 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
627 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
629 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
630 %res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
634 ; Splat index in GEP, requires broadcast
635 define <16 x float> @test11(float* %base, i32 %ind) {
636 ; KNL_64-LABEL: test11:
638 ; KNL_64-NEXT: vpbroadcastd %esi, %zmm1
639 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
640 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
643 ; KNL_32-LABEL: test11:
645 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
646 ; KNL_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
647 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
648 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
653 ; SKX-NEXT: vpbroadcastd %esi, %zmm1
654 ; SKX-NEXT: kxnorw %k0, %k0, %k1
655 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
658 ; SKX_32-LABEL: test11:
660 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
661 ; SKX_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
662 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
663 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
666 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
667 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
669 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
671 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
675 ; We are checking the uniform base here. It is taken directly from input to vgatherdps
676 define <16 x float> @test12(float* %base, <16 x i32> %ind) {
677 ; KNL_64-LABEL: test12:
679 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
680 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
681 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
684 ; KNL_32-LABEL: test12:
686 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
687 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
688 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
689 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
694 ; SKX-NEXT: kxnorw %k0, %k0, %k1
695 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
696 ; SKX-NEXT: vmovaps %zmm1, %zmm0
699 ; SKX_32-LABEL: test12:
701 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
702 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
703 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
704 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
707 %sext_ind = sext <16 x i32> %ind to <16 x i64>
708 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
710 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
714 ; The same as the previous, but the mask is undefined
715 define <16 x float> @test13(float* %base, <16 x i32> %ind) {
716 ; KNL_64-LABEL: test13:
718 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
719 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
720 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
723 ; KNL_32-LABEL: test13:
725 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
726 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
727 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
728 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
733 ; SKX-NEXT: kxnorw %k0, %k0, %k1
734 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
735 ; SKX-NEXT: vmovaps %zmm1, %zmm0
738 ; SKX_32-LABEL: test13:
740 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
741 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
742 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
743 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
746 %sext_ind = sext <16 x i32> %ind to <16 x i64>
747 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
749 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
753 ; The base pointer is not splat, can't find a uniform base
754 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
755 ; KNL_64-LABEL: test14:
757 ; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0
758 ; KNL_64-NEXT: vmovd %esi, %xmm1
759 ; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1
760 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
761 ; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1
762 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
763 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
764 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
765 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
768 ; KNL_32-LABEL: test14:
770 ; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0
771 ; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
772 ; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
773 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
774 ; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
779 ; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
780 ; SKX-NEXT: vpbroadcastd %esi, %ymm1
781 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
782 ; SKX-NEXT: vpsllq $2, %zmm1, %zmm1
783 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
784 ; SKX-NEXT: kxnorw %k0, %k0, %k1
785 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
786 ; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
789 ; SKX_32-LABEL: test14:
791 ; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0
792 ; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
793 ; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
794 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
795 ; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
798 %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
799 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
801 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
803 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
; Gather intrinsic declarations for vectors narrower than the native 512-bit
; gather instruction; used by the "gather smaller than existing instruction"
; tests below (test15 and following).
807 declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
808 declare <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
809 declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
811 ; Gather smaller than existing instruction
812 define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
813 ; KNL_64-LABEL: test15:
815 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
816 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
817 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
818 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
819 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
820 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
821 ; KNL_64-NEXT: vmovaps %xmm1, %xmm0
822 ; KNL_64-NEXT: vzeroupper
825 ; KNL_32-LABEL: test15:
827 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
828 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
829 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
830 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
831 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
832 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
833 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
834 ; KNL_32-NEXT: vmovaps %xmm1, %xmm0
835 ; KNL_32-NEXT: vzeroupper
840 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
841 ; SKX-NEXT: vpmovd2m %xmm1, %k1
842 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
843 ; SKX-NEXT: vmovaps %xmm1, %xmm0
846 ; SKX_32-LABEL: test15:
848 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
849 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
850 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
851 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
852 ; SKX_32-NEXT: vmovaps %xmm1, %xmm0
855 %sext_ind = sext <4 x i32> %ind to <4 x i64>
856 %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
857 %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
861 ; Gather smaller than existing instruction
862 define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
863 ; KNL_64-LABEL: test16:
865 ; KNL_64-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
866 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
867 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
868 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
869 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
870 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
871 ; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1}
872 ; KNL_64-NEXT: vmovapd %ymm2, %ymm0
875 ; KNL_32-LABEL: test16:
877 ; KNL_32-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
878 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
879 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
880 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
881 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
882 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
883 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
884 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1}
885 ; KNL_32-NEXT: vmovapd %ymm2, %ymm0
890 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
891 ; SKX-NEXT: vpmovd2m %xmm1, %k1
892 ; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
893 ; SKX-NEXT: vmovapd %ymm2, %ymm0
896 ; SKX_32-LABEL: test16:
898 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
899 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
900 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
901 ; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1}
902 ; SKX_32-NEXT: vmovapd %ymm2, %ymm0
905 %sext_ind = sext <4 x i32> %ind to <4 x i64>
906 %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
907 %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
911 define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
912 ; KNL_64-LABEL: test17:
914 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
915 ; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0
916 ; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0
917 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
918 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
919 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
920 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
921 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
922 ; KNL_64-NEXT: vmovapd %xmm2, %xmm0
923 ; KNL_64-NEXT: vzeroupper
926 ; KNL_32-LABEL: test17:
928 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
929 ; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0
930 ; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0
931 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
932 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
933 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
934 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
935 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
936 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
937 ; KNL_32-NEXT: vmovapd %xmm2, %xmm0
938 ; KNL_32-NEXT: vzeroupper
943 ; SKX-NEXT: vpsllq $32, %xmm0, %xmm0
944 ; SKX-NEXT: vpsraq $32, %xmm0, %xmm0
945 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
946 ; SKX-NEXT: vpmovq2m %xmm1, %k1
947 ; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
948 ; SKX-NEXT: vmovapd %xmm2, %xmm0
951 ; SKX_32-LABEL: test17:
953 ; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0
954 ; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0
955 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
956 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
957 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
958 ; SKX_32-NEXT: vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1}
959 ; SKX_32-NEXT: vmovapd %xmm2, %xmm0
962 %sext_ind = sext <2 x i32> %ind to <2 x i64>
963 %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
964 %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
968 declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
969 declare void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
970 declare void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
971 declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
972 declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
974 define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
975 ; KNL_64-LABEL: test18:
977 ; KNL_64-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
978 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
979 ; KNL_64-NEXT: vpslld $31, %xmm2, %xmm2
980 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k0
981 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
982 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
983 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
984 ; KNL_64-NEXT: vzeroupper
987 ; KNL_32-LABEL: test18:
989 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
990 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
991 ; KNL_32-NEXT: vpslld $31, %xmm2, %xmm2
992 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k0
993 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
994 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
995 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
996 ; KNL_32-NEXT: vzeroupper
1001 ; SKX-NEXT: vpslld $31, %xmm2, %xmm2
1002 ; SKX-NEXT: vpmovd2m %xmm2, %k1
1003 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
1004 ; SKX-NEXT: vzeroupper
1007 ; SKX_32-LABEL: test18:
1009 ; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
1010 ; SKX_32-NEXT: vpmovd2m %xmm2, %k1
1011 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
1013 call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
1017 define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
1018 ; KNL_64-LABEL: test19:
1020 ; KNL_64-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
1021 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1022 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
1023 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
1024 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
1025 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
1026 ; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
1027 ; KNL_64-NEXT: vzeroupper
1030 ; KNL_32-LABEL: test19:
1032 ; KNL_32-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
1033 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1034 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
1035 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
1036 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
1037 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
1038 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1039 ; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
1040 ; KNL_32-NEXT: vzeroupper
1043 ; SKX-LABEL: test19:
1045 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
1046 ; SKX-NEXT: vpmovd2m %xmm1, %k1
1047 ; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
1048 ; SKX-NEXT: vzeroupper
1051 ; SKX_32-LABEL: test19:
1053 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
1054 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
1055 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1056 ; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
1057 ; SKX_32-NEXT: vzeroupper
1059 %gep = getelementptr double, double* %ptr, <4 x i64> %ind
1060 call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
1064 ; Data type requires widening
1065 define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
1066 ; KNL_64-LABEL: test20:
1068 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1069 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1070 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
1071 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
1072 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1073 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1074 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
1075 ; KNL_64-NEXT: vzeroupper
1078 ; KNL_32-LABEL: test20:
1080 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1081 ; KNL_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
1082 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
1083 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
1084 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1085 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1086 ; KNL_32-NEXT: vscatterdps %zmm0, (,%zmm1) {%k1}
1087 ; KNL_32-NEXT: vzeroupper
1090 ; SKX-LABEL: test20:
1092 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
1093 ; SKX-NEXT: vpmovq2m %xmm2, %k1
1094 ; SKX-NEXT: vscatterqps %xmm0, (,%xmm1) {%k1}
1097 ; SKX_32-LABEL: test20:
1099 ; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
1100 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
1101 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1
1102 ; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1}
1104 call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
1108 ; Data type requires promotion
1109 define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
1110 ; KNL_64-LABEL: test21:
1112 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1113 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
1114 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
1115 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1116 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1117 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1118 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
1119 ; KNL_64-NEXT: vzeroupper
1122 ; KNL_32-LABEL: test21:
1124 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
1125 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
1126 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1127 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1128 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1129 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1130 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
1131 ; KNL_32-NEXT: vzeroupper
1134 ; SKX-LABEL: test21:
1136 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
1137 ; SKX-NEXT: vpmovq2m %xmm2, %k1
1138 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1139 ; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1}
1142 ; SKX_32-LABEL: test21:
1144 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
1145 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1
1146 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1147 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1148 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
1150 call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
1154 ; The result type requires widening
1155 declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
1157 define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
1158 ; KNL_64-LABEL: test22:
1160 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1161 ; KNL_64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
1162 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1163 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1164 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1165 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1166 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
1167 ; KNL_64-NEXT: vmovaps %xmm2, %xmm0
1168 ; KNL_64-NEXT: vzeroupper
1171 ; KNL_32-LABEL: test22:
1173 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1174 ; KNL_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
1175 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1176 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1177 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1178 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1179 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1180 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm2 {%k1}
1181 ; KNL_32-NEXT: vmovaps %xmm2, %xmm0
1182 ; KNL_32-NEXT: vzeroupper
1185 ; SKX-LABEL: test22:
1187 ; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
1188 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1189 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1190 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
1191 ; SKX-NEXT: vmovaps %xmm2, %xmm0
1194 ; SKX_32-LABEL: test22:
1196 ; SKX_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
1197 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1198 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1199 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1200 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
1201 ; SKX_32-NEXT: vmovaps %xmm2, %xmm0
1203 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1204 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1205 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
1209 define <2 x float> @test22a(float* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x float> %src0) {
1210 ; KNL_64-LABEL: test22a:
1212 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
1213 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1214 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1215 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1216 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1217 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1218 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
1219 ; KNL_64-NEXT: vmovaps %xmm2, %xmm0
1220 ; KNL_64-NEXT: vzeroupper
1223 ; KNL_32-LABEL: test22a:
1225 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
1226 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1227 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1228 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1229 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1230 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1231 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1232 ; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
1233 ; KNL_32-NEXT: vmovaps %xmm2, %xmm0
1234 ; KNL_32-NEXT: vzeroupper
1237 ; SKX-LABEL: test22a:
1239 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1240 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1241 ; SKX-NEXT: vgatherqps (%rdi,%xmm0,4), %xmm2 {%k1}
1242 ; SKX-NEXT: vmovaps %xmm2, %xmm0
1245 ; SKX_32-LABEL: test22a:
1247 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1248 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1249 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1250 ; SKX_32-NEXT: vgatherqps (%eax,%xmm0,4), %xmm2 {%k1}
1251 ; SKX_32-NEXT: vmovaps %xmm2, %xmm0
1253 %gep.random = getelementptr float, float* %base, <2 x i64> %ind
1254 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
1258 declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
1259 declare <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
1261 define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
1262 ; KNL_64-LABEL: test23:
1264 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1265 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1266 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1267 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1268 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1269 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1270 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
1271 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1272 ; KNL_64-NEXT: vzeroupper
1275 ; KNL_32-LABEL: test23:
1277 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1278 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1279 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1280 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1281 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1282 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1283 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1284 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
1285 ; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1286 ; KNL_32-NEXT: vzeroupper
1289 ; SKX-LABEL: test23:
1291 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1292 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1293 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1294 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1295 ; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
1296 ; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1299 ; SKX_32-LABEL: test23:
1301 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1302 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1303 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1304 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1305 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1306 ; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1}
1307 ; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1309 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1310 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1311 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
1315 define <2 x i32> @test23b(i32* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %src0) {
1316 ; KNL_64-LABEL: test23b:
1318 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1319 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1320 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1321 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1322 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1323 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1324 ; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k1}
1325 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1326 ; KNL_64-NEXT: vzeroupper
1329 ; KNL_32-LABEL: test23b:
1331 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1332 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1333 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1334 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1335 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1336 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1337 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1338 ; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k1}
1339 ; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1340 ; KNL_32-NEXT: vzeroupper
1343 ; SKX-LABEL: test23b:
1345 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1346 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1347 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1348 ; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm1 {%k1}
1349 ; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1352 ; SKX_32-LABEL: test23b:
1354 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1355 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1356 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1357 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1358 ; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm1 {%k1}
1359 ; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1361 %gep.random = getelementptr i32, i32* %base, <2 x i64> %ind
1362 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
1366 define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
1367 ; KNL_64-LABEL: test24:
1369 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1370 ; KNL_64-NEXT: movw $3, %ax
1371 ; KNL_64-NEXT: kmovw %eax, %k1
1372 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
1373 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1374 ; KNL_64-NEXT: vzeroupper
1377 ; KNL_32-LABEL: test24:
1379 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1380 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1381 ; KNL_32-NEXT: movw $3, %cx
1382 ; KNL_32-NEXT: kmovw %ecx, %k1
1383 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
1384 ; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1385 ; KNL_32-NEXT: vzeroupper
1388 ; SKX-LABEL: test24:
1390 ; SKX-NEXT: movb $3, %al
1391 ; SKX-NEXT: kmovw %eax, %k1
1392 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1393 ; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
1394 ; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1397 ; SKX_32-LABEL: test24:
1399 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1400 ; SKX_32-NEXT: movb $3, %cl
1401 ; SKX_32-NEXT: kmovw %ecx, %k1
1402 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1403 ; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1}
1404 ; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1406 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1407 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1408 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1412 define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
1413 ; KNL_64-LABEL: test25:
1415 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1416 ; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0
1417 ; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0
1418 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1419 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1420 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1421 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1422 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
1423 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1424 ; KNL_64-NEXT: vzeroupper
1427 ; KNL_32-LABEL: test25:
1429 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1430 ; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0
1431 ; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0
1432 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1433 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1434 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1435 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1436 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1437 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
1438 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1439 ; KNL_32-NEXT: vzeroupper
1442 ; SKX-LABEL: test25:
1444 ; SKX-NEXT: vpsllq $32, %xmm0, %xmm0
1445 ; SKX-NEXT: vpsraq $32, %xmm0, %xmm0
1446 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1447 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1448 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
1449 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1452 ; SKX_32-LABEL: test25:
1454 ; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0
1455 ; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0
1456 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1457 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1458 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1459 ; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
1460 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
1462 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1463 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
1464 %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
1468 define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
1469 ; KNL_64-LABEL: test26:
1471 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1472 ; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0
1473 ; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0
1474 ; KNL_64-NEXT: movb $3, %al
1475 ; KNL_64-NEXT: kmovw %eax, %k1
1476 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
1477 ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
1478 ; KNL_64-NEXT: vzeroupper
1481 ; KNL_32-LABEL: test26:
1483 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1484 ; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0
1485 ; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0
1486 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1487 ; KNL_32-NEXT: movb $3, %cl
1488 ; KNL_32-NEXT: kmovw %ecx, %k1
1489 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
1490 ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
1491 ; KNL_32-NEXT: vzeroupper
1494 ; SKX-LABEL: test26:
1496 ; SKX-NEXT: vpsllq $32, %xmm0, %xmm0
1497 ; SKX-NEXT: vpsraq $32, %xmm0, %xmm0
1498 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1499 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
1500 ; SKX-NEXT: vmovdqa %xmm1, %xmm0
1503 ; SKX_32-LABEL: test26:
1505 ; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0
1506 ; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0
1507 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1508 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
1509 ; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
1510 ; SKX_32-NEXT: vmovdqa %xmm1, %xmm0
1512 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1513 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
1514 %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
1518 ; Result type requires widening; all-ones mask
1519 define <2 x float> @test27(float* %base, <2 x i32> %ind) {
1520 ; KNL_64-LABEL: test27:
1522 ; KNL_64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
1523 ; KNL_64-NEXT: movw $3, %ax
1524 ; KNL_64-NEXT: kmovw %eax, %k1
1525 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
1526 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1527 ; KNL_64-NEXT: vzeroupper
1530 ; KNL_32-LABEL: test27:
1532 ; KNL_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
1533 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1534 ; KNL_32-NEXT: movw $3, %cx
1535 ; KNL_32-NEXT: kmovw %ecx, %k1
1536 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
1537 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1538 ; KNL_32-NEXT: vzeroupper
1541 ; SKX-LABEL: test27:
1543 ; SKX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
1544 ; SKX-NEXT: movb $3, %al
1545 ; SKX-NEXT: kmovw %eax, %k1
1546 ; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
1549 ; SKX_32-LABEL: test27:
1551 ; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
1552 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1553 ; SKX_32-NEXT: movb $3, %cl
1554 ; SKX_32-NEXT: kmovw %ecx, %k1
1555 ; SKX_32-NEXT: vgatherdps (%eax,%xmm1,4), %xmm0 {%k1}
1557 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1558 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1559 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
1563 ; Data type requires promotion, mask is all-ones
1564 define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
1565 ; KNL_64-LABEL: test28:
1567 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1568 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1569 ; KNL_64-NEXT: movb $3, %al
1570 ; KNL_64-NEXT: kmovw %eax, %k1
1571 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
1572 ; KNL_64-NEXT: vzeroupper
1575 ; KNL_32-LABEL: test28:
1577 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1578 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1579 ; KNL_32-NEXT: movw $3, %ax
1580 ; KNL_32-NEXT: kmovw %eax, %k1
1581 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
1582 ; KNL_32-NEXT: vzeroupper
1585 ; SKX-LABEL: test28:
1587 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1588 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1589 ; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1}
1592 ; SKX_32-LABEL: test28:
1594 ; SKX_32-NEXT: movb $3, %al
1595 ; SKX_32-NEXT: kmovw %eax, %k1
1596 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1597 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1598 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
1600 call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
1604 ; SCALAR-LABEL: test29
1605 ; SCALAR: extractelement <16 x float*>
1606 ; SCALAR-NEXT: load float
1607 ; SCALAR-NEXT: insertelement <16 x float>
1608 ; SCALAR-NEXT: extractelement <16 x float*>
1609 ; SCALAR-NEXT: load float
1611 define <16 x float> @test29(float* %base, <16 x i32> %ind) {
1612 ; KNL_64-LABEL: test29:
1614 ; KNL_64-NEXT: movw $44, %ax
1615 ; KNL_64-NEXT: kmovw %eax, %k1
1616 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1617 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
1620 ; KNL_32-LABEL: test29:
1622 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1623 ; KNL_32-NEXT: movw $44, %cx
1624 ; KNL_32-NEXT: kmovw %ecx, %k1
1625 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1626 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
1629 ; SKX-LABEL: test29:
1631 ; SKX-NEXT: movw $44, %ax
1632 ; SKX-NEXT: kmovw %eax, %k1
1633 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1634 ; SKX-NEXT: vmovaps %zmm1, %zmm0
1637 ; SKX_32-LABEL: test29:
1639 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1640 ; SKX_32-NEXT: movw $44, %cx
1641 ; SKX_32-NEXT: kmovw %ecx, %k1
1642 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1643 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
1646 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
1647 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
1649 %sext_ind = sext <16 x i32> %ind to <16 x i64>
1650 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
1652 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
1653 ret <16 x float>%res
1656 ; Check non-power-of-2 case. It should be scalarized.
1657 declare <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
1658 define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
1659 ; KNL_64-LABEL: test30:
1661 ; KNL_64-NEXT: vpslld $31, %xmm2, %xmm2
1662 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k0
1663 ; KNL_64-NEXT: kmovw %k0, %eax
1664 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
1665 ; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
1666 ; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
1667 ; KNL_64-NEXT: testb $1, %al
1668 ; KNL_64-NEXT: je .LBB31_2
1669 ; KNL_64-NEXT: # %bb.1: # %cond.load
1670 ; KNL_64-NEXT: vmovq %xmm0, %rax
1671 ; KNL_64-NEXT: vpinsrd $0, (%rax), %xmm3, %xmm3
1672 ; KNL_64-NEXT: .LBB31_2: # %else
1673 ; KNL_64-NEXT: kshiftrw $1, %k0, %k1
1674 ; KNL_64-NEXT: kmovw %k1, %eax
1675 ; KNL_64-NEXT: testb $1, %al
1676 ; KNL_64-NEXT: je .LBB31_4
1677 ; KNL_64-NEXT: # %bb.3: # %cond.load1
1678 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
1679 ; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm3, %xmm3
1680 ; KNL_64-NEXT: .LBB31_4: # %else2
1681 ; KNL_64-NEXT: kshiftrw $2, %k0, %k0
1682 ; KNL_64-NEXT: kmovw %k0, %eax
1683 ; KNL_64-NEXT: testb $1, %al
1684 ; KNL_64-NEXT: je .LBB31_6
1685 ; KNL_64-NEXT: # %bb.5: # %cond.load4
1686 ; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
1687 ; KNL_64-NEXT: vmovq %xmm0, %rax
1688 ; KNL_64-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
1689 ; KNL_64-NEXT: .LBB31_6: # %else5
1690 ; KNL_64-NEXT: vmovdqa %xmm3, %xmm0
1691 ; KNL_64-NEXT: vzeroupper
1694 ; KNL_32-LABEL: test30:
1696 ; KNL_32-NEXT: subl $12, %esp
1697 ; KNL_32-NEXT: .cfi_def_cfa_offset 16
1698 ; KNL_32-NEXT: vmovdqa %xmm0, %xmm3
1699 ; KNL_32-NEXT: vpslld $31, %xmm2, %xmm0
1700 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k0
1701 ; KNL_32-NEXT: kmovw %k0, %eax
1702 ; KNL_32-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0
1703 ; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
1704 ; KNL_32-NEXT: vpaddd %xmm1, %xmm3, %xmm1
1705 ; KNL_32-NEXT: testb $1, %al
1706 ; KNL_32-NEXT: je .LBB31_2
1707 ; KNL_32-NEXT: # %bb.1: # %cond.load
1708 ; KNL_32-NEXT: vmovd %xmm1, %eax
1709 ; KNL_32-NEXT: vpinsrd $0, (%eax), %xmm0, %xmm0
1710 ; KNL_32-NEXT: .LBB31_2: # %else
1711 ; KNL_32-NEXT: kshiftrw $1, %k0, %k1
1712 ; KNL_32-NEXT: kmovw %k1, %eax
1713 ; KNL_32-NEXT: testb $1, %al
1714 ; KNL_32-NEXT: je .LBB31_4
1715 ; KNL_32-NEXT: # %bb.3: # %cond.load1
1716 ; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
1717 ; KNL_32-NEXT: vpinsrd $1, (%eax), %xmm0, %xmm0
1718 ; KNL_32-NEXT: .LBB31_4: # %else2
1719 ; KNL_32-NEXT: kshiftrw $2, %k0, %k0
1720 ; KNL_32-NEXT: kmovw %k0, %eax
1721 ; KNL_32-NEXT: testb $1, %al
1722 ; KNL_32-NEXT: je .LBB31_6
1723 ; KNL_32-NEXT: # %bb.5: # %cond.load4
1724 ; KNL_32-NEXT: vpextrd $2, %xmm1, %eax
1725 ; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
1726 ; KNL_32-NEXT: .LBB31_6: # %else5
1727 ; KNL_32-NEXT: addl $12, %esp
1728 ; KNL_32-NEXT: .cfi_def_cfa_offset 4
1729 ; KNL_32-NEXT: vzeroupper
1732 ; SKX-LABEL: test30:
1734 ; SKX-NEXT: vpslld $31, %xmm2, %xmm2
1735 ; SKX-NEXT: vpmovd2m %xmm2, %k0
1736 ; SKX-NEXT: kmovw %k0, %eax
1737 ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
1738 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
1739 ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
1740 ; SKX-NEXT: testb $1, %al
1741 ; SKX-NEXT: je .LBB31_2
1742 ; SKX-NEXT: # %bb.1: # %cond.load
1743 ; SKX-NEXT: vmovq %xmm0, %rax
1744 ; SKX-NEXT: vpinsrd $0, (%rax), %xmm3, %xmm3
1745 ; SKX-NEXT: .LBB31_2: # %else
1746 ; SKX-NEXT: kshiftrb $1, %k0, %k1
1747 ; SKX-NEXT: kmovw %k1, %eax
1748 ; SKX-NEXT: testb $1, %al
1749 ; SKX-NEXT: je .LBB31_4
1750 ; SKX-NEXT: # %bb.3: # %cond.load1
1751 ; SKX-NEXT: vpextrq $1, %xmm0, %rax
1752 ; SKX-NEXT: vpinsrd $1, (%rax), %xmm3, %xmm3
1753 ; SKX-NEXT: .LBB31_4: # %else2
1754 ; SKX-NEXT: kshiftrb $2, %k0, %k0
1755 ; SKX-NEXT: kmovw %k0, %eax
1756 ; SKX-NEXT: testb $1, %al
1757 ; SKX-NEXT: je .LBB31_6
1758 ; SKX-NEXT: # %bb.5: # %cond.load4
1759 ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
1760 ; SKX-NEXT: vmovq %xmm0, %rax
1761 ; SKX-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
1762 ; SKX-NEXT: .LBB31_6: # %else5
1763 ; SKX-NEXT: vmovdqa %xmm3, %xmm0
1764 ; SKX-NEXT: vzeroupper
1767 ; SKX_32-LABEL: test30:
1769 ; SKX_32-NEXT: subl $12, %esp
1770 ; SKX_32-NEXT: .cfi_def_cfa_offset 16
1771 ; SKX_32-NEXT: vmovdqa %xmm0, %xmm3
1772 ; SKX_32-NEXT: vpslld $31, %xmm2, %xmm0
1773 ; SKX_32-NEXT: vpmovd2m %xmm0, %k0
1774 ; SKX_32-NEXT: kmovw %k0, %eax
1775 ; SKX_32-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0
1776 ; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
1777 ; SKX_32-NEXT: vpaddd %xmm1, %xmm3, %xmm1
1778 ; SKX_32-NEXT: testb $1, %al
1779 ; SKX_32-NEXT: je .LBB31_2
1780 ; SKX_32-NEXT: # %bb.1: # %cond.load
1781 ; SKX_32-NEXT: vmovd %xmm1, %eax
1782 ; SKX_32-NEXT: vpinsrd $0, (%eax), %xmm0, %xmm0
1783 ; SKX_32-NEXT: .LBB31_2: # %else
1784 ; SKX_32-NEXT: kshiftrb $1, %k0, %k1
1785 ; SKX_32-NEXT: kmovw %k1, %eax
1786 ; SKX_32-NEXT: testb $1, %al
1787 ; SKX_32-NEXT: je .LBB31_4
1788 ; SKX_32-NEXT: # %bb.3: # %cond.load1
1789 ; SKX_32-NEXT: vpextrd $1, %xmm1, %eax
1790 ; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm0, %xmm0
1791 ; SKX_32-NEXT: .LBB31_4: # %else2
1792 ; SKX_32-NEXT: kshiftrb $2, %k0, %k0
1793 ; SKX_32-NEXT: kmovw %k0, %eax
1794 ; SKX_32-NEXT: testb $1, %al
1795 ; SKX_32-NEXT: je .LBB31_6
1796 ; SKX_32-NEXT: # %bb.5: # %cond.load4
1797 ; SKX_32-NEXT: vpextrd $2, %xmm1, %eax
1798 ; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
1799 ; SKX_32-NEXT: .LBB31_6: # %else5
1800 ; SKX_32-NEXT: addl $12, %esp
1801 ; SKX_32-NEXT: .cfi_def_cfa_offset 4
1804 %sext_ind = sext <3 x i32> %ind to <3 x i64>
1805 %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
1806 %res = call <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
1810 declare <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
1811 define <16 x float*> @test31(<16 x float**> %ptrs) {
1812 ; KNL_64-LABEL: test31:
1814 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
1815 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
1816 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
1817 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
1818 ; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0
1819 ; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm1
1822 ; KNL_32-LABEL: test31:
1824 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
1825 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
1826 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
1829 ; SKX-LABEL: test31:
1831 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1832 ; SKX-NEXT: kxnorw %k0, %k0, %k2
1833 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
1834 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
1835 ; SKX-NEXT: vmovdqa64 %zmm2, %zmm0
1836 ; SKX-NEXT: vmovdqa64 %zmm3, %zmm1
1839 ; SKX_32-LABEL: test31:
1841 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
1842 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
1843 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
1846 %res = call <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
1847 ret <16 x float*>%res
1850 define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
1851 ; KNL_64-LABEL: test_gather_16i32:
1853 ; KNL_64-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
1854 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1855 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1856 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2
1857 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1858 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
1859 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
1860 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
1863 ; KNL_32-LABEL: test_gather_16i32:
1865 ; KNL_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1866 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1867 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1868 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
1869 ; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
1872 ; SKX-LABEL: test_gather_16i32:
1874 ; SKX-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
1875 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1876 ; SKX-NEXT: vpmovd2m %zmm2, %k1
1877 ; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm2
1878 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1879 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
1880 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
1881 ; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
1884 ; SKX_32-LABEL: test_gather_16i32:
1886 ; SKX_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1887 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1888 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
1889 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
1890 ; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
1892 %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
1895 define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
1896 ; KNL_64-LABEL: test_gather_16i64:
1898 ; KNL_64-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
1899 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1900 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1901 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1902 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
1903 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
1904 ; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm0
1905 ; KNL_64-NEXT: vmovdqa64 %zmm4, %zmm1
1908 ; KNL_32-LABEL: test_gather_16i64:
1910 ; KNL_32-NEXT: pushl %ebp
1911 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1912 ; KNL_32-NEXT: .cfi_offset %ebp, -8
1913 ; KNL_32-NEXT: movl %esp, %ebp
1914 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
1915 ; KNL_32-NEXT: andl $-64, %esp
1916 ; KNL_32-NEXT: subl $64, %esp
1917 ; KNL_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1918 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1919 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1920 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1921 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
1922 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
1923 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1924 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
1925 ; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
1926 ; KNL_32-NEXT: movl %ebp, %esp
1927 ; KNL_32-NEXT: popl %ebp
1928 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
1931 ; SKX-LABEL: test_gather_16i64:
1933 ; SKX-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
1934 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1935 ; SKX-NEXT: vpmovd2m %zmm2, %k1
1936 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1937 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
1938 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
1939 ; SKX-NEXT: vmovdqa64 %zmm3, %zmm0
1940 ; SKX-NEXT: vmovdqa64 %zmm4, %zmm1
1943 ; SKX_32-LABEL: test_gather_16i64:
1945 ; SKX_32-NEXT: pushl %ebp
1946 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
1947 ; SKX_32-NEXT: .cfi_offset %ebp, -8
1948 ; SKX_32-NEXT: movl %esp, %ebp
1949 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
1950 ; SKX_32-NEXT: andl $-64, %esp
1951 ; SKX_32-NEXT: subl $64, %esp
1952 ; SKX_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1953 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1954 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
1955 ; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1956 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
1957 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
1958 ; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1959 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
1960 ; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
1961 ; SKX_32-NEXT: movl %ebp, %esp
1962 ; SKX_32-NEXT: popl %ebp
1963 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
1965 %res = call <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
1968 declare <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
1969 define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
1970 ; KNL_64-LABEL: test_gather_16f32:
1972 ; KNL_64-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
1973 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1974 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1975 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2
1976 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1977 ; KNL_64-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
1978 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
1979 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
1982 ; KNL_32-LABEL: test_gather_16f32:
1984 ; KNL_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1985 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1986 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1987 ; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
1988 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1991 ; SKX-LABEL: test_gather_16f32:
1993 ; SKX-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
1994 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1995 ; SKX-NEXT: vpmovd2m %zmm2, %k1
1996 ; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm2
1997 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1998 ; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
1999 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
2000 ; SKX-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
2003 ; SKX_32-LABEL: test_gather_16f32:
2005 ; SKX_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2006 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2007 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2008 ; SKX_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
2009 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0
2011 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
2012 ret <16 x float> %res
2014 define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
2015 ; KNL_64-LABEL: test_gather_16f64:
2017 ; KNL_64-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
2018 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2019 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2020 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2021 ; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
2022 ; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
2023 ; KNL_64-NEXT: vmovapd %zmm3, %zmm0
2024 ; KNL_64-NEXT: vmovapd %zmm4, %zmm1
2027 ; KNL_32-LABEL: test_gather_16f64:
2029 ; KNL_32-NEXT: pushl %ebp
2030 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2031 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2032 ; KNL_32-NEXT: movl %esp, %ebp
2033 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2034 ; KNL_32-NEXT: andl $-64, %esp
2035 ; KNL_32-NEXT: subl $64, %esp
2036 ; KNL_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2037 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2038 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2039 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
2040 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
2041 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
2042 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2043 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
2044 ; KNL_32-NEXT: vmovapd %zmm2, %zmm0
2045 ; KNL_32-NEXT: movl %ebp, %esp
2046 ; KNL_32-NEXT: popl %ebp
2047 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2050 ; SKX-LABEL: test_gather_16f64:
2052 ; SKX-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
2053 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2054 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2055 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2056 ; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
2057 ; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
2058 ; SKX-NEXT: vmovapd %zmm3, %zmm0
2059 ; SKX-NEXT: vmovapd %zmm4, %zmm1
2062 ; SKX_32-LABEL: test_gather_16f64:
2064 ; SKX_32-NEXT: pushl %ebp
2065 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2066 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2067 ; SKX_32-NEXT: movl %esp, %ebp
2068 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2069 ; SKX_32-NEXT: andl $-64, %esp
2070 ; SKX_32-NEXT: subl $64, %esp
2071 ; SKX_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2072 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2073 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2074 ; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
2075 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
2076 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
2077 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2078 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
2079 ; SKX_32-NEXT: vmovapd %zmm2, %zmm0
2080 ; SKX_32-NEXT: movl %ebp, %esp
2081 ; SKX_32-NEXT: popl %ebp
2082 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2084 %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
2085 ret <16 x double> %res
2087 declare <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
2088 define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
2089 ; KNL_64-LABEL: test_scatter_16i32:
2091 ; KNL_64-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
2092 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2093 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2094 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2095 ; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
2096 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
2097 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
2098 ; KNL_64-NEXT: vzeroupper
2101 ; KNL_32-LABEL: test_scatter_16i32:
2103 ; KNL_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2104 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2105 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2106 ; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
2107 ; KNL_32-NEXT: vzeroupper
2110 ; SKX-LABEL: test_scatter_16i32:
2112 ; SKX-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
2113 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2114 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2115 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2116 ; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
2117 ; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm0
2118 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
2119 ; SKX-NEXT: vzeroupper
2122 ; SKX_32-LABEL: test_scatter_16i32:
2124 ; SKX_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2125 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2126 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2127 ; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
2128 ; SKX_32-NEXT: vzeroupper
2130 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
2133 define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
2134 ; KNL_64-LABEL: test_scatter_16i64:
2136 ; KNL_64-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
2137 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2138 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2139 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2140 ; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
2141 ; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
2142 ; KNL_64-NEXT: vzeroupper
2145 ; KNL_32-LABEL: test_scatter_16i64:
2147 ; KNL_32-NEXT: pushl %ebp
2148 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2149 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2150 ; KNL_32-NEXT: movl %esp, %ebp
2151 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2152 ; KNL_32-NEXT: andl $-64, %esp
2153 ; KNL_32-NEXT: subl $64, %esp
2154 ; KNL_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2155 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2156 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2157 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
2158 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
2159 ; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
2160 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2161 ; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
2162 ; KNL_32-NEXT: movl %ebp, %esp
2163 ; KNL_32-NEXT: popl %ebp
2164 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2165 ; KNL_32-NEXT: vzeroupper
2168 ; SKX-LABEL: test_scatter_16i64:
2170 ; SKX-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
2171 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2172 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2173 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2174 ; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
2175 ; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
2176 ; SKX-NEXT: vzeroupper
2179 ; SKX_32-LABEL: test_scatter_16i64:
2181 ; SKX_32-NEXT: pushl %ebp
2182 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2183 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2184 ; SKX_32-NEXT: movl %esp, %ebp
2185 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2186 ; SKX_32-NEXT: andl $-64, %esp
2187 ; SKX_32-NEXT: subl $64, %esp
2188 ; SKX_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2189 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2190 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2191 ; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
2192 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
2193 ; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
2194 ; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2195 ; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
2196 ; SKX_32-NEXT: movl %ebp, %esp
2197 ; SKX_32-NEXT: popl %ebp
2198 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2199 ; SKX_32-NEXT: vzeroupper
2201 call void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
2204 declare void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
2205 define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
2206 ; KNL_64-LABEL: test_scatter_16f32:
2208 ; KNL_64-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
2209 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2210 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2211 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2212 ; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
2213 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0
2214 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
2215 ; KNL_64-NEXT: vzeroupper
2218 ; KNL_32-LABEL: test_scatter_16f32:
2220 ; KNL_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2221 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2222 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2223 ; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
2224 ; KNL_32-NEXT: vzeroupper
2227 ; SKX-LABEL: test_scatter_16f32:
2229 ; SKX-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
2230 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2231 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2232 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2233 ; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
2234 ; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm0
2235 ; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
2236 ; SKX-NEXT: vzeroupper
2239 ; SKX_32-LABEL: test_scatter_16f32:
2241 ; SKX_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2242 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2243 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2244 ; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
2245 ; SKX_32-NEXT: vzeroupper
2247 call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
2250 declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
2251 define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
2252 ; KNL_64-LABEL: test_scatter_16f64:
2254 ; KNL_64-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
2255 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2256 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2257 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2258 ; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
2259 ; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
2260 ; KNL_64-NEXT: vzeroupper
2263 ; KNL_32-LABEL: test_scatter_16f64:
2265 ; KNL_32-NEXT: pushl %ebp
2266 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2267 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2268 ; KNL_32-NEXT: movl %esp, %ebp
2269 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2270 ; KNL_32-NEXT: andl $-64, %esp
2271 ; KNL_32-NEXT: subl $64, %esp
2272 ; KNL_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2273 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2274 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2275 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
2276 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
2277 ; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
2278 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2279 ; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
2280 ; KNL_32-NEXT: movl %ebp, %esp
2281 ; KNL_32-NEXT: popl %ebp
2282 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2283 ; KNL_32-NEXT: vzeroupper
2286 ; SKX-LABEL: test_scatter_16f64:
2288 ; SKX-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
2289 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2290 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2291 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2292 ; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
2293 ; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
2294 ; SKX-NEXT: vzeroupper
2297 ; SKX_32-LABEL: test_scatter_16f64:
2299 ; SKX_32-NEXT: pushl %ebp
2300 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2301 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2302 ; SKX_32-NEXT: movl %esp, %ebp
2303 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2304 ; SKX_32-NEXT: andl $-64, %esp
2305 ; SKX_32-NEXT: subl $64, %esp
2306 ; SKX_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2307 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2308 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2309 ; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
2310 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
2311 ; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
2312 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2313 ; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
2314 ; SKX_32-NEXT: movl %ebp, %esp
2315 ; SKX_32-NEXT: popl %ebp
2316 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2317 ; SKX_32-NEXT: vzeroupper
2319 call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
2322 declare void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
; PR28312: three byte-identical masked gathers of %p1 must be CSE'd into a
; single vpgatherqq/vpgatherdq; the two vpaddq ops then reuse that one result.
2324 define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64> %d) {
2325 ; KNL_64-LABEL: test_pr28312:
2327 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2328 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
2329 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
2330 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
2331 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
2332 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm1 {%k1}
2333 ; KNL_64-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2334 ; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2337 ; KNL_32-LABEL: test_pr28312:
2339 ; KNL_32-NEXT: pushl %ebp
2340 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2341 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2342 ; KNL_32-NEXT: movl %esp, %ebp
2343 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2344 ; KNL_32-NEXT: andl $-32, %esp
2345 ; KNL_32-NEXT: subl $32, %esp
2346 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
2347 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
2348 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
2349 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
2350 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
2351 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k1}
2352 ; KNL_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2353 ; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2354 ; KNL_32-NEXT: movl %ebp, %esp
2355 ; KNL_32-NEXT: popl %ebp
2356 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2359 ; SKX-LABEL: test_pr28312:
2361 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
2362 ; SKX-NEXT: vpmovd2m %xmm1, %k1
2363 ; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1}
2364 ; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2365 ; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2368 ; SKX_32-LABEL: test_pr28312:
2370 ; SKX_32-NEXT: pushl %ebp
2371 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2372 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2373 ; SKX_32-NEXT: movl %esp, %ebp
2374 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2375 ; SKX_32-NEXT: andl $-32, %esp
2376 ; SKX_32-NEXT: subl $32, %esp
2377 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
2378 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
2379 ; SKX_32-NEXT: vpgatherdq (,%xmm0), %ymm1 {%k1}
2380 ; SKX_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2381 ; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2382 ; SKX_32-NEXT: movl %ebp, %esp
2383 ; SKX_32-NEXT: popl %ebp
2384 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2386 %g1 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
2387 %g2 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
2388 %g3 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
2389 %a = add <4 x i64> %g1, %g2
2390 %b = add <4 x i64> %a, %g3
2393 declare <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>)
; Gather whose base is the internal global @glob_array. With the small code
; model the symbol is folded directly into the vpgatherqd addressing mode;
; with -code-model=large (SKX_LARGE) the address must first be materialized
; with movabsq into a register.
2395 define <8 x i32> @test_global_array(<8 x i64> %indxs) {
2396 ; KNL_64-LABEL: test_global_array:
2398 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
2399 ; KNL_64-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2400 ; KNL_64-NEXT: vmovdqa %ymm1, %ymm0
2403 ; KNL_32-LABEL: test_global_array:
2405 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
2406 ; KNL_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2407 ; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
2410 ; SKX_SMALL-LABEL: test_global_array:
2411 ; SKX_SMALL: # %bb.0:
2412 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
2413 ; SKX_SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2414 ; SKX_SMALL-NEXT: vmovdqa %ymm1, %ymm0
2415 ; SKX_SMALL-NEXT: retq
2417 ; SKX_LARGE-LABEL: test_global_array:
2418 ; SKX_LARGE: # %bb.0:
2419 ; SKX_LARGE-NEXT: movabsq $glob_array, %rax
2420 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
2421 ; SKX_LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
2422 ; SKX_LARGE-NEXT: vmovdqa %ymm1, %ymm0
2423 ; SKX_LARGE-NEXT: retq
2425 ; SKX_32-LABEL: test_global_array:
2427 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2428 ; SKX_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2429 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm0
2431 %p = getelementptr inbounds [16 x i32], [16 x i32]* @glob_array, i64 0, <8 x i64> %indxs
2432 %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
; A single-element (<1 x i32>) masked scatter has no hardware instruction; it
; is scalarized into a test of the mask bit, a conditional branch, and a plain
; 32-bit store in the %cond.store block.
2436 define void @v1_scatter(<1 x i32>%a1, <1 x i32*> %ptr, <1 x i1> %mask) {
2437 ; KNL_64-LABEL: v1_scatter:
2439 ; KNL_64-NEXT: testb $1, %dl
2440 ; KNL_64-NEXT: je .LBB43_2
2441 ; KNL_64-NEXT: # %bb.1: # %cond.store
2442 ; KNL_64-NEXT: movl %edi, (%rsi)
2443 ; KNL_64-NEXT: .LBB43_2: # %else
2446 ; KNL_32-LABEL: v1_scatter:
2448 ; KNL_32-NEXT: testb $1, {{[0-9]+}}(%esp)
2449 ; KNL_32-NEXT: je .LBB43_2
2450 ; KNL_32-NEXT: # %bb.1: # %cond.store
2451 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2452 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
2453 ; KNL_32-NEXT: movl %ecx, (%eax)
2454 ; KNL_32-NEXT: .LBB43_2: # %else
2457 ; SKX-LABEL: v1_scatter:
2459 ; SKX-NEXT: testb $1, %dl
2460 ; SKX-NEXT: je .LBB43_2
2461 ; SKX-NEXT: # %bb.1: # %cond.store
2462 ; SKX-NEXT: movl %edi, (%rsi)
2463 ; SKX-NEXT: .LBB43_2: # %else
2466 ; SKX_32-LABEL: v1_scatter:
2468 ; SKX_32-NEXT: testb $1, {{[0-9]+}}(%esp)
2469 ; SKX_32-NEXT: je .LBB43_2
2470 ; SKX_32-NEXT: # %bb.1: # %cond.store
2471 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2472 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
2473 ; SKX_32-NEXT: movl %ecx, (%eax)
2474 ; SKX_32-NEXT: .LBB43_2: # %else
2476 call void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32> %a1, <1 x i32*> %ptr, i32 4, <1 x i1> %mask)
2479 declare void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32>, <1 x i32*>, i32, <1 x i1>)
; A <1 x i32> gather whose mask at the call site is the constant <i1 true>
; (the %mask parameter is unused) folds to a plain scalar load.
2481 define <1 x i32> @v1_gather(<1 x i32*> %ptr, <1 x i1> %mask, <1 x i32> %src0) {
2482 ; KNL_64-LABEL: v1_gather:
2484 ; KNL_64-NEXT: movl (%rdi), %eax
2487 ; KNL_32-LABEL: v1_gather:
2489 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2490 ; KNL_32-NEXT: movl (%eax), %eax
2493 ; SKX-LABEL: v1_gather:
2495 ; SKX-NEXT: movl (%rdi), %eax
2498 ; SKX_32-LABEL: v1_gather:
2500 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2501 ; SKX_32-NEXT: movl (%eax), %eax
2503 %res = call <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*> %ptr, i32 4, <1 x i1> <i1 true>, <1 x i32> %src0)
2506 declare <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*>, i32, <1 x i1>, <1 x i32>)
2508 ; Make sure we don't crash when the index element type is larger than i64 and we need to widen the result
2509 ; This experienced a bad interaction when we widened and then tried to split.
; Note: on x86-64 only the low 64 bits of each i128 index are consumed
; (vmovq of %rsi/%rcx below); the gather itself uses 64-bit qword indices.
2510 define <2 x float> @large_index(float* %base, <2 x i128> %ind, <2 x i1> %mask, <2 x float> %src0) {
2511 ; KNL_64-LABEL: large_index:
2513 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
2514 ; KNL_64-NEXT: vpsllq $63, %xmm0, %xmm0
2515 ; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k0
2516 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
2517 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
2518 ; KNL_64-NEXT: vmovq %rcx, %xmm0
2519 ; KNL_64-NEXT: vmovq %rsi, %xmm2
2520 ; KNL_64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
2521 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k1}
2522 ; KNL_64-NEXT: vmovaps %xmm1, %xmm0
2523 ; KNL_64-NEXT: vzeroupper
2526 ; KNL_32-LABEL: large_index:
2528 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
2529 ; KNL_32-NEXT: vpsllq $63, %xmm0, %xmm0
2530 ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0
2531 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
2532 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
2533 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2534 ; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2535 ; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2536 ; KNL_32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
2537 ; KNL_32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
2538 ; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm1 {%k1}
2539 ; KNL_32-NEXT: vmovaps %xmm1, %xmm0
2540 ; KNL_32-NEXT: vzeroupper
2543 ; SKX-LABEL: large_index:
2545 ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
2546 ; SKX-NEXT: vpmovq2m %xmm0, %k1
2547 ; SKX-NEXT: vmovq %rcx, %xmm0
2548 ; SKX-NEXT: vmovq %rsi, %xmm2
2549 ; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
2550 ; SKX-NEXT: vgatherqps (%rdi,%xmm0,4), %xmm1 {%k1}
2551 ; SKX-NEXT: vmovaps %xmm1, %xmm0
2554 ; SKX_32-LABEL: large_index:
2556 ; SKX_32-NEXT: vpsllq $63, %xmm0, %xmm0
2557 ; SKX_32-NEXT: vpmovq2m %xmm0, %k1
2558 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2559 ; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2560 ; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2561 ; SKX_32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
2562 ; SKX_32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
2563 ; SKX_32-NEXT: vgatherqps (%eax,%xmm0,4), %xmm1 {%k1}
2564 ; SKX_32-NEXT: vmovaps %xmm1, %xmm0
2566 %gep.random = getelementptr float, float* %base, <2 x i128> %ind
2567 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
2571 ; Make sure we allow index to be sign extended from a smaller than i32 element size.
; The i8->i64 sext is shrunk to a single vpmovsxbd feeding a dword-indexed
; gather (vgatherdps) instead of materializing 64-bit indices.
2572 define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {
2573 ; KNL_64-LABEL: sext_i8_index:
2575 ; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm1
2576 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
2577 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2580 ; KNL_32-LABEL: sext_i8_index:
2582 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2583 ; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm1
2584 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
2585 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2588 ; SKX-LABEL: sext_i8_index:
2590 ; SKX-NEXT: vpmovsxbd %xmm0, %zmm1
2591 ; SKX-NEXT: kxnorw %k0, %k0, %k1
2592 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2595 ; SKX_32-LABEL: sext_i8_index:
2597 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2598 ; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm1
2599 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2600 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2603 %sext_ind = sext <16 x i8> %ind to <16 x i64>
2604 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
2606 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
2607 ret <16 x float>%res
2610 ; Make sure we allow index to be sign extended from a smaller than i32 element size.
; Here the v8i8 sign extension is done as zero-extend to dwords followed by
; shl 24 / sra 24 before the dword-indexed gather. On KNL (no AVX512VL) the
; 8-element gather is widened to zmm with an explicit 0xff mask (movw $255).
2611 define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
2612 ; KNL_64-LABEL: sext_v8i8_index:
2614 ; KNL_64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2615 ; KNL_64-NEXT: vpslld $24, %ymm0, %ymm0
2616 ; KNL_64-NEXT: vpsrad $24, %ymm0, %ymm1
2617 ; KNL_64-NEXT: movw $255, %ax
2618 ; KNL_64-NEXT: kmovw %eax, %k1
2619 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2620 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2623 ; KNL_32-LABEL: sext_v8i8_index:
2625 ; KNL_32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2626 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2627 ; KNL_32-NEXT: vpslld $24, %ymm0, %ymm0
2628 ; KNL_32-NEXT: vpsrad $24, %ymm0, %ymm1
2629 ; KNL_32-NEXT: movw $255, %cx
2630 ; KNL_32-NEXT: kmovw %ecx, %k1
2631 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2632 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2635 ; SKX-LABEL: sext_v8i8_index:
2637 ; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2638 ; SKX-NEXT: kxnorw %k0, %k0, %k1
2639 ; SKX-NEXT: vpslld $24, %ymm0, %ymm0
2640 ; SKX-NEXT: vpsrad $24, %ymm0, %ymm1
2641 ; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
2644 ; SKX_32-LABEL: sext_v8i8_index:
2646 ; SKX_32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2647 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2648 ; SKX_32-NEXT: vpslld $24, %ymm0, %ymm0
2649 ; SKX_32-NEXT: vpsrad $24, %ymm0, %ymm1
2650 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2651 ; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
2654 %sext_ind = sext <8 x i8> %ind to <8 x i64>
2655 %gep.random = getelementptr float, float *%base, <8 x i64> %sext_ind
2657 %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %gep.random, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
2660 declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)
2662 ; Index requires promotion
; The <2 x i32> index is promoted to i64 elements via the shl-32 / sar-32
; sign-extension idiom (vpsllq/vpsraq) before the qword-indexed scatter.
2663 define void @test_scatter_2i32_index(<2 x double> %a1, double* %base, <2 x i32> %ind, <2 x i1> %mask) {
2664 ; KNL_64-LABEL: test_scatter_2i32_index:
2666 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2667 ; KNL_64-NEXT: vpsllq $32, %xmm1, %xmm1
2668 ; KNL_64-NEXT: vpsraq $32, %zmm1, %zmm1
2669 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
2670 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
2671 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
2672 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
2673 ; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm1,8) {%k1}
2674 ; KNL_64-NEXT: vzeroupper
2677 ; KNL_32-LABEL: test_scatter_2i32_index:
2679 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2680 ; KNL_32-NEXT: vpsllq $32, %xmm1, %xmm1
2681 ; KNL_32-NEXT: vpsraq $32, %zmm1, %zmm1
2682 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
2683 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
2684 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
2685 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
2686 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2687 ; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm1,8) {%k1}
2688 ; KNL_32-NEXT: vzeroupper
2691 ; SKX-LABEL: test_scatter_2i32_index:
2693 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
2694 ; SKX-NEXT: vpmovq2m %xmm2, %k1
2695 ; SKX-NEXT: vpsllq $32, %xmm1, %xmm1
2696 ; SKX-NEXT: vpsraq $32, %xmm1, %xmm1
2697 ; SKX-NEXT: vscatterqpd %xmm0, (%rdi,%xmm1,8) {%k1}
2700 ; SKX_32-LABEL: test_scatter_2i32_index:
2702 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
2703 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1
2704 ; SKX_32-NEXT: vpsllq $32, %xmm1, %xmm1
2705 ; SKX_32-NEXT: vpsraq $32, %xmm1, %xmm1
2706 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2707 ; SKX_32-NEXT: vscatterqpd %xmm0, (%eax,%xmm1,8) {%k1}
2709 %gep = getelementptr double, double *%base, <2 x i32> %ind
2710 call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask)
2713 declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double>, <2 x double*>, i32, <2 x i1>)
; Index zero-extended from a masked i32. The and-with-15 is emitted as a
; broadcast-constant vpandd/vandps; since the masked values fit in 32 bits
; the gather keeps dword indices (vgatherdps). The large code model
; (SKX_LARGE) must materialize the constant-pool address with movabsq.
2715 define <16 x float> @zext_index(float* %base, <16 x i32> %ind) {
2716 ; KNL_64-LABEL: zext_index:
2718 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
2719 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
2720 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2723 ; KNL_32-LABEL: zext_index:
2725 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2726 ; KNL_32-NEXT: vpandd {{\.LCPI.*}}{1to16}, %zmm0, %zmm1
2727 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
2728 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2731 ; SKX_SMALL-LABEL: zext_index:
2732 ; SKX_SMALL: # %bb.0:
2733 ; SKX_SMALL-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm1
2734 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
2735 ; SKX_SMALL-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2736 ; SKX_SMALL-NEXT: retq
2738 ; SKX_LARGE-LABEL: zext_index:
2739 ; SKX_LARGE: # %bb.0:
2740 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
2741 ; SKX_LARGE-NEXT: vandps (%rax){1to16}, %zmm0, %zmm1
2742 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
2743 ; SKX_LARGE-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2744 ; SKX_LARGE-NEXT: retq
2746 ; SKX_32-LABEL: zext_index:
2748 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2749 ; SKX_32-NEXT: vandps {{\.LCPI.*}}{1to16}, %zmm0, %zmm1
2750 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2751 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2753 %ind_masked = and <16 x i32> %ind, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
2754 %sext_ind = zext <16 x i32> %ind_masked to <16 x i64>
2755 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
2757 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
2758 ret <16 x float>%res
; A 16 x f64 gather must be split into two 8-element halves. The icmp-eq-zero
; mask is formed directly with vptestnmd on each half of %cmp (no intermediate
; vector compare + extract), and each half gathers into its own passthru zmm.
2761 define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %passthru) {
2762 ; KNL_64-LABEL: test_gather_setcc_split:
2764 ; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm4
2765 ; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm5
2766 ; KNL_64-NEXT: vptestnmd %zmm5, %zmm5, %k1
2767 ; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k2
2768 ; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
2769 ; KNL_64-NEXT: vgatherdpd (%rdi,%ymm4,8), %zmm3 {%k1}
2770 ; KNL_64-NEXT: vmovapd %zmm2, %zmm0
2771 ; KNL_64-NEXT: vmovapd %zmm3, %zmm1
2774 ; KNL_32-LABEL: test_gather_setcc_split:
2776 ; KNL_32-NEXT: pushl %ebp
2777 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2778 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2779 ; KNL_32-NEXT: movl %esp, %ebp
2780 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2781 ; KNL_32-NEXT: andl $-64, %esp
2782 ; KNL_32-NEXT: subl $64, %esp
2783 ; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3
2784 ; KNL_32-NEXT: movl 8(%ebp), %eax
2785 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4
2786 ; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm5
2787 ; KNL_32-NEXT: vptestnmd %zmm5, %zmm5, %k1
2788 ; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2
2789 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
2790 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm4,8), %zmm3 {%k1}
2791 ; KNL_32-NEXT: vmovapd %zmm2, %zmm0
2792 ; KNL_32-NEXT: vmovapd %zmm3, %zmm1
2793 ; KNL_32-NEXT: movl %ebp, %esp
2794 ; KNL_32-NEXT: popl %ebp
2795 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2798 ; SKX-LABEL: test_gather_setcc_split:
2800 ; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm4
2801 ; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm5
2802 ; SKX-NEXT: vptestnmd %ymm5, %ymm5, %k1
2803 ; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2
2804 ; SKX-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
2805 ; SKX-NEXT: vgatherdpd (%rdi,%ymm4,8), %zmm3 {%k1}
2806 ; SKX-NEXT: vmovapd %zmm2, %zmm0
2807 ; SKX-NEXT: vmovapd %zmm3, %zmm1
2810 ; SKX_32-LABEL: test_gather_setcc_split:
2812 ; SKX_32-NEXT: pushl %ebp
2813 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2814 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2815 ; SKX_32-NEXT: movl %esp, %ebp
2816 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2817 ; SKX_32-NEXT: andl $-64, %esp
2818 ; SKX_32-NEXT: subl $64, %esp
2819 ; SKX_32-NEXT: vmovapd 72(%ebp), %zmm3
2820 ; SKX_32-NEXT: movl 8(%ebp), %eax
2821 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4
2822 ; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm5
2823 ; SKX_32-NEXT: vptestnmd %ymm5, %ymm5, %k1
2824 ; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2
2825 ; SKX_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
2826 ; SKX_32-NEXT: vgatherdpd (%eax,%ymm4,8), %zmm3 {%k1}
2827 ; SKX_32-NEXT: vmovapd %zmm2, %zmm0
2828 ; SKX_32-NEXT: vmovapd %zmm3, %zmm1
2829 ; SKX_32-NEXT: movl %ebp, %esp
2830 ; SKX_32-NEXT: popl %ebp
2831 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2833 %sext_ind = sext <16 x i32> %ind to <16 x i64>
2834 %gep.random = getelementptr double, double *%base, <16 x i64> %sext_ind
2836 %mask = icmp eq <16 x i32> %cmp, zeroinitializer
2837 %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %gep.random, i32 4, <16 x i1> %mask, <16 x double> %passthru)
2838 ret <16 x double>%res
; Scatter counterpart of test_gather_setcc_split: the 16 x f64 scatter is
; split into two 8-element vscatterdpd halves, with the icmp-eq-zero mask
; formed by vptestnmd on each half of %cmp.
2841 define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %src0) {
2842 ; KNL_64-LABEL: test_scatter_setcc_split:
2844 ; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm4
2845 ; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm5
2846 ; KNL_64-NEXT: vptestnmd %zmm5, %zmm5, %k1
2847 ; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k2
2848 ; KNL_64-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
2849 ; KNL_64-NEXT: vscatterdpd %zmm3, (%rdi,%ymm4,8) {%k1}
2850 ; KNL_64-NEXT: vzeroupper
2853 ; KNL_32-LABEL: test_scatter_setcc_split:
2855 ; KNL_32-NEXT: pushl %ebp
2856 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2857 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2858 ; KNL_32-NEXT: movl %esp, %ebp
2859 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2860 ; KNL_32-NEXT: andl $-64, %esp
2861 ; KNL_32-NEXT: subl $64, %esp
2862 ; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3
2863 ; KNL_32-NEXT: movl 8(%ebp), %eax
2864 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4
2865 ; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm5
2866 ; KNL_32-NEXT: vptestnmd %zmm5, %zmm5, %k1
2867 ; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2
2868 ; KNL_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
2869 ; KNL_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm4,8) {%k1}
2870 ; KNL_32-NEXT: movl %ebp, %esp
2871 ; KNL_32-NEXT: popl %ebp
2872 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2873 ; KNL_32-NEXT: vzeroupper
2876 ; SKX-LABEL: test_scatter_setcc_split:
2878 ; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm4
2879 ; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm5
2880 ; SKX-NEXT: vptestnmd %ymm5, %ymm5, %k1
2881 ; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2
2882 ; SKX-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
2883 ; SKX-NEXT: vscatterdpd %zmm3, (%rdi,%ymm4,8) {%k1}
2884 ; SKX-NEXT: vzeroupper
2887 ; SKX_32-LABEL: test_scatter_setcc_split:
2889 ; SKX_32-NEXT: pushl %ebp
2890 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2891 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2892 ; SKX_32-NEXT: movl %esp, %ebp
2893 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2894 ; SKX_32-NEXT: andl $-64, %esp
2895 ; SKX_32-NEXT: subl $64, %esp
2896 ; SKX_32-NEXT: vmovapd 72(%ebp), %zmm3
2897 ; SKX_32-NEXT: movl 8(%ebp), %eax
2898 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4
2899 ; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm5
2900 ; SKX_32-NEXT: vptestnmd %ymm5, %ymm5, %k1
2901 ; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2
2902 ; SKX_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
2903 ; SKX_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm4,8) {%k1}
2904 ; SKX_32-NEXT: movl %ebp, %esp
2905 ; SKX_32-NEXT: popl %ebp
2906 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2907 ; SKX_32-NEXT: vzeroupper
2909 %sext_ind = sext <16 x i32> %ind to <16 x i64>
2910 %gep.random = getelementptr double, double *%base, <16 x i64> %sext_ind
2912 %mask = icmp eq <16 x i32> %cmp, zeroinitializer
2913 call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %gep.random, i32 4, <16 x i1> %mask)
2917 ; This test case previously triggered an infinite loop when the two gathers became identical after DAG combine removed the sign extend.
2918 define <16 x float> @test_sext_cse(float* %base, <16 x i32> %ind, <16 x i32>* %foo) {
2919 ; KNL_64-LABEL: test_sext_cse:
2921 ; KNL_64-NEXT: vmovaps %zmm0, (%rsi)
2922 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
2923 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
2924 ; KNL_64-NEXT: vaddps %zmm1, %zmm1, %zmm0
2927 ; KNL_32-LABEL: test_sext_cse:
2929 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2930 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
2931 ; KNL_32-NEXT: vmovaps %zmm0, (%ecx)
2932 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
2933 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
2934 ; KNL_32-NEXT: vaddps %zmm1, %zmm1, %zmm0
2937 ; SKX-LABEL: test_sext_cse:
2939 ; SKX-NEXT: vmovaps %zmm0, (%rsi)
2940 ; SKX-NEXT: kxnorw %k0, %k0, %k1
2941 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
2942 ; SKX-NEXT: vaddps %zmm1, %zmm1, %zmm0
2945 ; SKX_32-LABEL: test_sext_cse:
2947 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2948 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
2949 ; SKX_32-NEXT: vmovaps %zmm0, (%ecx)
2950 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2951 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
2952 ; SKX_32-NEXT: vaddps %zmm1, %zmm1, %zmm0
2954 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
2955 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
2957 %sext_ind = sext <16 x i32> %ind to <16 x i64>
2958 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
2960 store <16 x i32> %ind, <16 x i32>* %foo
2961 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
2962 %gep.random2 = getelementptr float, <16 x float*> %broadcast.splat, <16 x i32> %ind
2963 %res2 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random2, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
2964 %res3 = fadd <16 x float> %res2, %res
2965 ret <16 x float>%res3