1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_64
3 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32
4 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_SMALL
5 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq -code-model=large < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_LARGE
6 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32
7 ; RUN: opt -mtriple=x86_64-apple-darwin -scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
8 ; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null
; Internal constant array of 16 i32 values (1, 1, 2, 3, 5, 8, ... 987). None of
; the functions shown in this part of the file reference it.
10 @glob_array = internal unnamed_addr constant [16 x i32] [i32 1, i32 1, i32 2, i32 3, i32 5, i32 8, i32 13, i32 21, i32 34, i32 55, i32 89, i32 144, i32 233, i32 377, i32 610, i32 987], align 16
; test1 - gather 16 floats with a constant all-true mask. The pointer vector is
; built by splatting %base (insertelement + shufflevector) and indexing it with
; the sign-extended 32-bit indices; the llc checks expect one vgatherdps that
; uses the scalar base register directly in its addressing mode.
13 ; SCALAR: extractelement <16 x float*>
14 ; SCALAR-NEXT: load float
15 ; SCALAR-NEXT: insertelement <16 x float>
16 ; SCALAR-NEXT: extractelement <16 x float*>
17 ; SCALAR-NEXT: load float
19 define <16 x float> @test1(float* %base, <16 x i32> %ind) {
20 ; KNL_64-LABEL: test1:
22 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
23 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
24 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
27 ; KNL_32-LABEL: test1:
29 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
30 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
31 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
32 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
37 ; SKX-NEXT: kxnorw %k0, %k0, %k1
38 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
39 ; SKX-NEXT: vmovaps %zmm1, %zmm0
42 ; SKX_32-LABEL: test1:
44 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
45 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
46 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
47 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
50 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
51 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
53 %sext_ind = sext <16 x i32> %ind to <16 x i64>
54 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
56 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
; Masked-gather intrinsic declarations used by the tests. Operands, in order:
; the vector of pointers, an i32 (the alignment, per the LLVM LangRef), the
; <N x i1> mask, and the pass-through vector.
60 declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
61 declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
62 declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
; test2 - same splat-base float gather as test1, but the mask is the i16
; argument bitcast to <16 x i1>. The opt (scalarizer) checks expect a per-lane
; mask-bit test followed by a conditional load.
66 ; SCALAR: extractelement <16 x float*>
67 ; SCALAR-NEXT: load float
68 ; SCALAR-NEXT: insertelement <16 x float>
69 ; SCALAR-NEXT: br label %else
71 ; SCALAR-NEXT: %res.phi.else = phi
72 ; SCALAR-NEXT: and i16 %{{.*}}, 2
73 ; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0
74 ; SCALAR-NEXT: br i1 %{{.*}}, label %cond.load1, label %else2
76 define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
77 ; KNL_64-LABEL: test2:
79 ; KNL_64-NEXT: kmovw %esi, %k1
80 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
81 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
84 ; KNL_32-LABEL: test2:
86 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
87 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
88 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
89 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
94 ; SKX-NEXT: kmovw %esi, %k1
95 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
96 ; SKX-NEXT: vmovaps %zmm1, %zmm0
99 ; SKX_32-LABEL: test2:
101 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
102 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
103 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
104 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
107 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
108 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
110 %sext_ind = sext <16 x i32> %ind to <16 x i64>
111 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
112 %imask = bitcast i16 %mask to <16 x i1>
113 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
114 ret <16 x float> %res
; test3 - i32 variant of test2 (splat base, sign-extended indices, i16-derived
; mask); the llc checks expect vpgatherdd instead of vgatherdps.
117 define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
118 ; KNL_64-LABEL: test3:
120 ; KNL_64-NEXT: kmovw %esi, %k1
121 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
122 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0
125 ; KNL_32-LABEL: test3:
127 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
128 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
129 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
130 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
135 ; SKX-NEXT: kmovw %esi, %k1
136 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
137 ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
140 ; SKX_32-LABEL: test3:
142 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
143 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
144 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
145 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
148 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
149 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
151 %sext_ind = sext <16 x i32> %ind to <16 x i64>
152 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind
153 %imask = bitcast i16 %mask to <16 x i1>
154 %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
; test4 - two gathers from the same pointer vector under the same mask. The
; second gather takes the first result (%gt1) as its pass-through operand, and
; the two results are added. The GEP indexes with the <16 x i32> vector directly
; (no explicit sext). The llc checks expect the mask to be copied into a second
; k-register so that each gather instruction has its own mask register.
159 define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
160 ; KNL_64-LABEL: test4:
162 ; KNL_64-NEXT: kmovw %esi, %k1
163 ; KNL_64-NEXT: kmovw %k1, %k2
164 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
165 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
166 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
167 ; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0
170 ; KNL_32-LABEL: test4:
172 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
173 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
174 ; KNL_32-NEXT: kmovw %k1, %k2
175 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
176 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
177 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
178 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
183 ; SKX-NEXT: kmovw %esi, %k1
184 ; SKX-NEXT: kmovw %k1, %k2
185 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
186 ; SKX-NEXT: vmovdqa64 %zmm1, %zmm2
187 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
188 ; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0
191 ; SKX_32-LABEL: test4:
193 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
194 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
195 ; SKX_32-NEXT: kmovw %k1, %k2
196 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
197 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
198 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
199 ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
202 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
203 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
205 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
206 %imask = bitcast i16 %mask to <16 x i1>
207 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
208 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
209 %res = add <16 x i32> %gt1, %gt2
; test5 - two identical masked scatters of %val through a splat-base GEP, with
; the mask taken from the i16 argument. The opt (scalarizer) checks expect a
; per-lane mask-bit test guarding each scalar store; the llc checks expect two
; vpscatterdd instructions, each with its own copy of the mask register.
214 ; SCALAR-LABEL: test5
215 ; SCALAR: and i16 %scalar_mask, 1
216 ; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0
217 ; SCALAR-NEXT: br i1 %{{.*}}, label %cond.store, label %else
218 ; SCALAR: cond.store:
219 ; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> %val, i64 0
220 ; SCALAR-NEXT: %Ptr0 = extractelement <16 x i32*> %gep.random, i64 0
221 ; SCALAR-NEXT: store i32 %Elt0, i32* %Ptr0, align 4
222 ; SCALAR-NEXT: br label %else
224 ; SCALAR-NEXT: and i16 %scalar_mask, 2
225 ; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0
226 ; SCALAR-NEXT: br i1 %{{.*}}, label %cond.store1, label %else2
228 define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
229 ; KNL_64-LABEL: test5:
231 ; KNL_64-NEXT: kmovw %esi, %k1
232 ; KNL_64-NEXT: kmovw %k1, %k2
233 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
234 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
235 ; KNL_64-NEXT: vzeroupper
238 ; KNL_32-LABEL: test5:
240 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
241 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
242 ; KNL_32-NEXT: kmovw %k1, %k2
243 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
244 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
245 ; KNL_32-NEXT: vzeroupper
250 ; SKX-NEXT: kmovw %esi, %k1
251 ; SKX-NEXT: kmovw %k1, %k2
252 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
253 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
254 ; SKX-NEXT: vzeroupper
257 ; SKX_32-LABEL: test5:
259 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
260 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
261 ; SKX_32-NEXT: kmovw %k1, %k2
262 ; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
263 ; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
264 ; SKX_32-NEXT: vzeroupper
267 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
268 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
270 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
271 %imask = bitcast i16 %mask to <16 x i1>
272 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
273 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
; Masked-scatter intrinsic declarations. Operands, in order: the value vector
; to store, the vector of pointers, an i32 (the alignment, per the LLVM
; LangRef), and the <N x i1> mask.
277 declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
278 declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )
; test6 - gather from, then scatter %a1 to, an arbitrary vector of pointers
; (%ptr is a function argument, so there is no scalar base), both with constant
; all-true masks. The llc checks expect an empty base in the addressing mode,
; e.g. (,%zmm1); the opt (scalarizer) checks expect unconditional per-lane
; stores.
281 ; SCALAR-LABEL: test6
282 ; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4
283 ; SCALAR-NEXT: %Elt1 = extractelement <8 x i32> %a1, i64 1
284 ; SCALAR-NEXT: %Ptr12 = extractelement <8 x i32*> %ptr, i64 1
285 ; SCALAR-NEXT: store i32 %Elt1, i32* %Ptr12, align 4
286 ; SCALAR-NEXT: %Elt2 = extractelement <8 x i32> %a1, i64 2
287 ; SCALAR-NEXT: %Ptr23 = extractelement <8 x i32*> %ptr, i64 2
288 ; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4
290 define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
291 ; KNL_64-LABEL: test6:
293 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
294 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
295 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
296 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
297 ; KNL_64-NEXT: vmovdqa %ymm2, %ymm0
300 ; KNL_32-LABEL: test6:
302 ; KNL_32-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
303 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
304 ; KNL_32-NEXT: movw $255, %ax
305 ; KNL_32-NEXT: kmovw %eax, %k1
306 ; KNL_32-NEXT: kmovw %k1, %k2
307 ; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm2 {%k2}
308 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
309 ; KNL_32-NEXT: vmovdqa %ymm2, %ymm0
314 ; SKX-NEXT: kxnorw %k0, %k0, %k1
315 ; SKX-NEXT: kxnorw %k0, %k0, %k2
316 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
317 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
318 ; SKX-NEXT: vmovdqa %ymm2, %ymm0
321 ; SKX_32-LABEL: test6:
323 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
324 ; SKX_32-NEXT: kxnorw %k0, %k0, %k2
325 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2}
326 ; SKX_32-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1}
327 ; SKX_32-NEXT: vmovdqa %ymm2, %ymm0
330 %a = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
332 call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
; test7 - 8-element version of test4 with an i8 mask: two gathers, the second
; using the first as pass-through, then an add. For the AVX512F-only runs the
; checks expect the upper 8 mask bits to be cleared (kshiftlw/kshiftrw $8) and
; the gathers widened to zmm; the avx512vl runs are checked for ymm gathers.
336 define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
338 ; KNL_64-LABEL: test7:
340 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
341 ; KNL_64-NEXT: kmovw %esi, %k0
342 ; KNL_64-NEXT: kshiftlw $8, %k0, %k0
343 ; KNL_64-NEXT: kshiftrw $8, %k0, %k1
344 ; KNL_64-NEXT: kmovw %k1, %k2
345 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
346 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
347 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
348 ; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0
351 ; KNL_32-LABEL: test7:
353 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
354 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
355 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
356 ; KNL_32-NEXT: kmovw %ecx, %k0
357 ; KNL_32-NEXT: kshiftlw $8, %k0, %k0
358 ; KNL_32-NEXT: kshiftrw $8, %k0, %k1
359 ; KNL_32-NEXT: kmovw %k1, %k2
360 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
361 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
362 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
363 ; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
368 ; SKX-NEXT: kmovw %esi, %k1
369 ; SKX-NEXT: kmovw %k1, %k2
370 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
371 ; SKX-NEXT: vmovdqa %ymm1, %ymm2
372 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
373 ; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0
376 ; SKX_32-LABEL: test7:
378 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
379 ; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
380 ; SKX_32-NEXT: kmovw %k1, %k2
381 ; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm1 {%k2}
382 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm2
383 ; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm2 {%k1}
384 ; SKX_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
387 %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
388 %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer
390 %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind
391 %imask = bitcast i8 %mask to <8 x i1>
392 %gt1 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
393 %gt2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
394 %res = add <8 x i32> %gt1, %gt2
398 ; No uniform base in this case, index <8 x i64> contains addresses,
399 ; each gather call will be split into two
; (The split is what the 64-bit checks below expect: two vpgatherqd halves per
; call, joined with vinserti64x4. The 32-bit checks expect one vpgatherdd per
; call. The %ind argument is not used by the IR shown for this function.)
400 define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
401 ; KNL_64-LABEL: test8:
403 ; KNL_64-NEXT: kmovw %edi, %k1
404 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
405 ; KNL_64-NEXT: kmovw %k2, %k3
406 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
407 ; KNL_64-NEXT: kmovw %k1, %k3
408 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
409 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
410 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
411 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
412 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
413 ; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0
416 ; KNL_32-LABEL: test8:
418 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
419 ; KNL_32-NEXT: kmovw %k1, %k2
420 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
421 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
422 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
423 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
428 ; SKX-NEXT: kmovw %edi, %k1
429 ; SKX-NEXT: kshiftrw $8, %k1, %k2
430 ; SKX-NEXT: kmovw %k2, %k3
431 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
432 ; SKX-NEXT: kmovw %k1, %k3
433 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
434 ; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
435 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
436 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
437 ; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
438 ; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
441 ; SKX_32-LABEL: test8:
443 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
444 ; SKX_32-NEXT: kmovw %k1, %k2
445 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
446 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
447 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
448 ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
451 %imask = bitcast i16 %mask to <16 x i1>
452 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
453 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
454 %res = add <16 x i32> %gt1, %gt2
; Nested aggregate types indexed by the multi-level GEPs in test9 and test10:
; %struct.ST embeds a %struct.RT, which in turn holds a 10 x 20 array of i32.
458 %struct.RT = type { i8, [10 x [20 x i32]], i8 }
459 %struct.ST = type { i32, double, %struct.RT }
461 ; Masked gather for aggregate types
462 ; Test9 and Test10 should give the same result (scalar and vector indices in GEP)
; test9 - gather through a multi-level GEP into %struct.ST in which every index
; is a vector: %ind1, splat constants 2 and 1, %ind5, and a splat constant 13,
; applied to the splatted %base pointer. The gather mask is a constant all-true
; vector. The llc checks expect the addresses to be computed with vector
; multiplies and adds, and the gather to use an empty scalar base, e.g.
; (,%zmm1).
465 define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
466 ; KNL_64-LABEL: test9:
467 ; KNL_64: # %bb.0: # %entry
468 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
469 ; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
470 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
471 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
472 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
473 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
474 ; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
475 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
476 ; KNL_64-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
477 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
478 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
479 ; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
480 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
481 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
484 ; KNL_32-LABEL: test9:
485 ; KNL_32: # %bb.0: # %entry
486 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
487 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
488 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
489 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
490 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
491 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
492 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
493 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
494 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
495 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1
496 ; KNL_32-NEXT: movw $255, %ax
497 ; KNL_32-NEXT: kmovw %eax, %k1
498 ; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm0 {%k1}
499 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
502 ; SKX_SMALL-LABEL: test9:
503 ; SKX_SMALL: # %bb.0: # %entry
504 ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
505 ; SKX_SMALL-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
506 ; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
507 ; SKX_SMALL-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
508 ; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
509 ; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
510 ; SKX_SMALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
511 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
512 ; SKX_SMALL-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
513 ; SKX_SMALL-NEXT: retq
515 ; SKX_LARGE-LABEL: test9:
516 ; SKX_LARGE: # %bb.0: # %entry
517 ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2
518 ; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
519 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
520 ; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1
521 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
522 ; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
523 ; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
524 ; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
525 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
526 ; SKX_LARGE-NEXT: vpaddq (%rax){1to8}, %zmm0, %zmm1
527 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
528 ; SKX_LARGE-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
529 ; SKX_LARGE-NEXT: retq
531 ; SKX_32-LABEL: test9:
532 ; SKX_32: # %bb.0: # %entry
533 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1
534 ; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
535 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
536 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
537 ; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
538 ; SKX_32-NEXT: vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1
539 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
540 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1}
543 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
544 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
546 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
547 %res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
; test10 - same address computation as test9, but the constant GEP indices
; (2, 1 and 13) are written as scalars and mixed with the vector indices %i1
; and %ind5. The check lines below mirror the instruction sequences checked for
; test9.
551 define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
552 ; KNL_64-LABEL: test10:
553 ; KNL_64: # %bb.0: # %entry
554 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
555 ; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
556 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
557 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
558 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
559 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
560 ; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
561 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
562 ; KNL_64-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
563 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
564 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
565 ; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
566 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
567 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
570 ; KNL_32-LABEL: test10:
571 ; KNL_32: # %bb.0: # %entry
572 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
573 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
574 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
575 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
576 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
577 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
578 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
579 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
580 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
581 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1
582 ; KNL_32-NEXT: movw $255, %ax
583 ; KNL_32-NEXT: kmovw %eax, %k1
584 ; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm0 {%k1}
585 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
588 ; SKX_SMALL-LABEL: test10:
589 ; SKX_SMALL: # %bb.0: # %entry
590 ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
591 ; SKX_SMALL-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
592 ; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
593 ; SKX_SMALL-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
594 ; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
595 ; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
596 ; SKX_SMALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
597 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
598 ; SKX_SMALL-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
599 ; SKX_SMALL-NEXT: retq
601 ; SKX_LARGE-LABEL: test10:
602 ; SKX_LARGE: # %bb.0: # %entry
603 ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2
604 ; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
605 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
606 ; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1
607 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
608 ; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
609 ; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
610 ; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
611 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
612 ; SKX_LARGE-NEXT: vpaddq (%rax){1to8}, %zmm0, %zmm1
613 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
614 ; SKX_LARGE-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
615 ; SKX_LARGE-NEXT: retq
617 ; SKX_32-LABEL: test10:
618 ; SKX_32: # %bb.0: # %entry
619 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1
620 ; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
621 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
622 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
623 ; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
624 ; SKX_32-NEXT: vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1
625 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
626 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1}
629 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
630 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
632 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
633 %res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
637 ; Splat index in GEP, requires broadcast
; (%base is splatted and indexed by the scalar %ind; the llc checks expect %ind
; to be broadcast into the index vector - vpbroadcastd on the 64-bit runs,
; vbroadcastss from the stack on the 32-bit runs - while the scalar base stays
; in the gather's addressing mode.)
638 define <16 x float> @test11(float* %base, i32 %ind) {
639 ; KNL_64-LABEL: test11:
641 ; KNL_64-NEXT: vpbroadcastd %esi, %zmm1
642 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
643 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
646 ; KNL_32-LABEL: test11:
648 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
649 ; KNL_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
650 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
651 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
656 ; SKX-NEXT: vpbroadcastd %esi, %zmm1
657 ; SKX-NEXT: kxnorw %k0, %k0, %k1
658 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
661 ; SKX_32-LABEL: test11:
663 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
664 ; SKX_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
665 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
666 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
669 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
670 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
672 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
674 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
678 ; We are checking the uniform base here. It is taken directly from input to vgatherdps
; (Unlike test1, the GEP here uses the scalar %base operand with a vector of
; sign-extended indices; there is no explicit insertelement/shufflevector
; splat. The mask is a constant all-true vector.)
679 define <16 x float> @test12(float* %base, <16 x i32> %ind) {
680 ; KNL_64-LABEL: test12:
682 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
683 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
684 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
687 ; KNL_32-LABEL: test12:
689 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
690 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
691 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
692 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
697 ; SKX-NEXT: kxnorw %k0, %k0, %k1
698 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
699 ; SKX-NEXT: vmovaps %zmm1, %zmm0
702 ; SKX_32-LABEL: test12:
704 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
705 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
706 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
707 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
710 %sext_ind = sext <16 x i32> %ind to <16 x i64>
711 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
713 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
717 ; The same as the previous test (test12).
; NOTE(review): this comment used to say "the mask is undefined", but the call
; below passes a constant all-true mask (and the checks expect kxnorw), so the
; IR shown here duplicates test12. Confirm whether an undef mask was intended.
718 define <16 x float> @test13(float* %base, <16 x i32> %ind) {
719 ; KNL_64-LABEL: test13:
721 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
722 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
723 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
726 ; KNL_32-LABEL: test13:
728 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
729 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
730 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
731 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
736 ; SKX-NEXT: kxnorw %k0, %k0, %k1
737 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
738 ; SKX-NEXT: vmovaps %zmm1, %zmm0
741 ; SKX_32-LABEL: test13:
743 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
744 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
745 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
746 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
749 %sext_ind = sext <16 x i32> %ind to <16 x i64>
750 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
752 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
756 ; The base pointer is not splat, can't find uniform base
; (%base is inserted into lane 1 of %vec, but the shufflevector broadcasts
; lane 0, so the splatted pointer is element 0 of %vec rather than %base. The
; llc checks expect the addresses to be formed with vector adds and the gather
; to use an empty scalar base.)
757 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
758 ; KNL_64-LABEL: test14:
760 ; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0
761 ; KNL_64-NEXT: vmovd %esi, %xmm1
762 ; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1
763 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
764 ; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1
765 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
766 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
767 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
768 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
771 ; KNL_32-LABEL: test14:
773 ; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0
774 ; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
775 ; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
776 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
777 ; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
782 ; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
783 ; SKX-NEXT: vpbroadcastd %esi, %ymm1
784 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
785 ; SKX-NEXT: vpsllq $2, %zmm1, %zmm1
786 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
787 ; SKX-NEXT: kxnorw %k0, %k0, %k1
788 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
789 ; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
792 ; SKX_32-LABEL: test14:
794 ; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0
795 ; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
796 ; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
797 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
798 ; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
801 %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
802 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
804 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
806 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
; Masked-gather declarations for the narrower vector types (4 x float,
; 4 x double, 2 x double); the operand order matches the wider gather
; declarations: pointers, i32, mask, pass-through.
810 declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
811 declare <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
812 declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
814 ; Gather smaller than existing instruction
; test15 - masked gather of <4 x float> from %base, indexed by <4 x i32> %ind
; (sign-extended to i64), with a variable <4 x i1> mask and an undef pass-through.
; Expected: the avx512f-only runs use the zmm form of vgatherdps and keep only the
; low 4 mask bits via the kshiftlw/kshiftrw $12 pair; the avx512vl runs gather
; straight into an xmm register.
815 define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
816 ; KNL_64-LABEL: test15:
818 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
819 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
820 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
821 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
822 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
823 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
824 ; KNL_64-NEXT: vmovaps %xmm1, %xmm0
825 ; KNL_64-NEXT: vzeroupper
828 ; KNL_32-LABEL: test15:
830 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
831 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
832 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
833 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
834 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
835 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
836 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
837 ; KNL_32-NEXT: vmovaps %xmm1, %xmm0
838 ; KNL_32-NEXT: vzeroupper
843 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
844 ; SKX-NEXT: vpmovd2m %xmm1, %k1
845 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
846 ; SKX-NEXT: vmovaps %xmm1, %xmm0
849 ; SKX_32-LABEL: test15:
851 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
852 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
853 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
854 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
855 ; SKX_32-NEXT: vmovaps %xmm1, %xmm0
; Form the four element addresses from the scalar base and the widened indices,
; then gather them under %mask.
858 %sext_ind = sext <4 x i32> %ind to <4 x i64>
859 %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
860 %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
864 ; Gather smaller than existing instruction
; test16 - masked gather of <4 x double> from %base, indexed by <4 x i32> %ind
; (sign-extended to i64), with a variable mask and %src0 as the pass-through.
; Expected: the avx512f-only runs widen to a zmm vgatherdpd with a ymm index and a
; mask trimmed to 4 bits; the avx512vl runs use a ymm destination with an xmm index.
865 define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
866 ; KNL_64-LABEL: test16:
868 ; KNL_64-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
869 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
870 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
871 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
872 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
873 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
874 ; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1}
875 ; KNL_64-NEXT: vmovapd %ymm2, %ymm0
878 ; KNL_32-LABEL: test16:
880 ; KNL_32-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
881 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
882 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
883 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
884 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
885 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
886 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
887 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1}
888 ; KNL_32-NEXT: vmovapd %ymm2, %ymm0
893 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
894 ; SKX-NEXT: vpmovd2m %xmm1, %k1
895 ; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
896 ; SKX-NEXT: vmovapd %ymm2, %ymm0
899 ; SKX_32-LABEL: test16:
901 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
902 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
903 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
904 ; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1}
905 ; SKX_32-NEXT: vmovapd %ymm2, %ymm0
; Masked-off lanes take their value from %src0, which is why the checks gather
; into the register that carries %src0 and then copy it to the return register.
908 %sext_ind = sext <4 x i32> %ind to <4 x i64>
909 %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
910 %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
; test17 - masked gather of <2 x double> from %base, indexed by <2 x i32> %ind
; (sign-extended to i64), with a variable <2 x i1> mask and %src0 as pass-through.
; Expected: the avx512f-only runs keep only the low 2 mask bits (kshiftlw/kshiftrw $14)
; and use the zmm vgatherdpd; the avx512vl runs use xmm registers throughout.
914 define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
915 ; KNL_64-LABEL: test17:
917 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
918 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
919 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
920 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
921 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
922 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
923 ; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1}
924 ; KNL_64-NEXT: vmovapd %xmm2, %xmm0
925 ; KNL_64-NEXT: vzeroupper
928 ; KNL_32-LABEL: test17:
930 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
931 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
932 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
933 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
934 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
935 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
936 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
937 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1}
938 ; KNL_32-NEXT: vmovapd %xmm2, %xmm0
939 ; KNL_32-NEXT: vzeroupper
944 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
945 ; SKX-NEXT: vpmovq2m %xmm1, %k1
946 ; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %xmm2 {%k1}
947 ; SKX-NEXT: vmovapd %xmm2, %xmm0
950 ; SKX_32-LABEL: test17:
952 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
953 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
954 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
955 ; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %xmm2 {%k1}
956 ; SKX_32-NEXT: vmovapd %xmm2, %xmm0
; Widen the two indices, address the doubles relative to %base, and gather
; them under %mask with %src0 filling the disabled lanes.
959 %sext_ind = sext <2 x i32> %ind to <2 x i64>
960 %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
961 %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
; Masked-scatter intrinsic declarations for the narrow scatter tests.
; Operand order, as seen at the call sites: value vector, vector of pointers,
; i32 alignment, i1 mask vector.
; NOTE(review): the v2i64 variant has no caller in the portion of the file
; reviewed here - confirm it is used elsewhere before removing it.
965 declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
966 declare void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
967 declare void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
968 declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
969 declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
; test18 - masked scatter of <4 x i32> %a1 to the four explicit pointers in %ptr,
; under a variable <4 x i1> mask.
; Expected: 64-bit targets use vpscatterqd and 32-bit targets use vpscatterdd.
; There is no scalar base, so the checks use the base-less (,%reg) addressing form.
971 define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
972 ; KNL_64-LABEL: test18:
974 ; KNL_64-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
975 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
976 ; KNL_64-NEXT: vpslld $31, %xmm2, %xmm2
977 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k0
978 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
979 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
980 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
981 ; KNL_64-NEXT: vzeroupper
984 ; KNL_32-LABEL: test18:
986 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
987 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
988 ; KNL_32-NEXT: vpslld $31, %xmm2, %xmm2
989 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k0
990 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
991 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
992 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
993 ; KNL_32-NEXT: vzeroupper
998 ; SKX-NEXT: vpslld $31, %xmm2, %xmm2
999 ; SKX-NEXT: vpmovd2m %xmm2, %k1
1000 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
1001 ; SKX-NEXT: vzeroupper
1004 ; SKX_32-LABEL: test18:
1006 ; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
1007 ; SKX_32-NEXT: vpmovd2m %xmm2, %k1
1008 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
; The pointer vector is passed to the intrinsic unchanged (no getelementptr).
1010 call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
; test19 - masked scatter of <4 x double> %a1 to %ptr indexed by <4 x i64> %ind,
; alignment 8, under a variable <4 x i1> mask.
; Expected: vscatterqpd with scale 8 off the scalar base; zmm operands on the
; avx512f-only runs (mask trimmed to 4 bits) and ymm operands on the avx512vl runs.
1014 define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
1015 ; KNL_64-LABEL: test19:
1017 ; KNL_64-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
1018 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1019 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
1020 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
1021 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
1022 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
1023 ; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
1024 ; KNL_64-NEXT: vzeroupper
1027 ; KNL_32-LABEL: test19:
1029 ; KNL_32-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
1030 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1031 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
1032 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
1033 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
1034 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
1035 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1036 ; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
1037 ; KNL_32-NEXT: vzeroupper
1040 ; SKX-LABEL: test19:
1042 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
1043 ; SKX-NEXT: vpmovd2m %xmm1, %k1
1044 ; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
1045 ; SKX-NEXT: vzeroupper
1048 ; SKX_32-LABEL: test19:
1050 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
1051 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
1052 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1053 ; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
1054 ; SKX_32-NEXT: vzeroupper
; The i64 indices are used as-is (no extension) to address doubles off %ptr.
1056 %gep = getelementptr double, double* %ptr, <4 x i64> %ind
1057 call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
1061 ; Data type requires widening
; test20 - masked scatter of <2 x float> %a1 to the two explicit pointers in %ptr,
; under a variable <2 x i1> mask.
; Expected: vscatterqps on 64-bit targets and vscatterdps on 32-bit targets; the
; avx512f-only runs keep only the low 2 mask bits (kshiftlw/kshiftrw $14).
1062 define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
1063 ; KNL_64-LABEL: test20:
1065 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1066 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1067 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
1068 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
1069 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1070 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1071 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
1072 ; KNL_64-NEXT: vzeroupper
1075 ; KNL_32-LABEL: test20:
1077 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1078 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1079 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
1080 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
1081 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1082 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1083 ; KNL_32-NEXT: vscatterdps %zmm0, (,%zmm1) {%k1}
1084 ; KNL_32-NEXT: vzeroupper
1087 ; SKX-LABEL: test20:
1089 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
1090 ; SKX-NEXT: vpmovq2m %xmm2, %k1
1091 ; SKX-NEXT: vscatterqps %xmm0, (,%xmm1) {%k1}
1094 ; SKX_32-LABEL: test20:
1096 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
1097 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1
1098 ; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1}
; The pointer vector is passed to the intrinsic unchanged (no getelementptr).
1100 call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
1104 ; Data type requires promotion
; test21 - integer counterpart of test20: masked scatter of <2 x i32> %a1 to the
; two explicit pointers in %ptr, under a variable <2 x i1> mask.
; Expected: vpscatterqd on 64-bit targets and vpscatterdd on 32-bit targets; the
; avx512f-only runs keep only the low 2 mask bits (kshiftlw/kshiftrw $14).
1105 define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
1106 ; KNL_64-LABEL: test21:
1108 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1109 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1110 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
1111 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
1112 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1113 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1114 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
1115 ; KNL_64-NEXT: vzeroupper
1118 ; KNL_32-LABEL: test21:
1120 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1121 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1122 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
1123 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
1124 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1125 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1126 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
1127 ; KNL_32-NEXT: vzeroupper
1130 ; SKX-LABEL: test21:
1132 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
1133 ; SKX-NEXT: vpmovq2m %xmm2, %k1
1134 ; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1}
1137 ; SKX_32-LABEL: test21:
1139 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
1140 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1
1141 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
; The pointer vector is passed to the intrinsic unchanged (no getelementptr).
1143 call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
1147 ; The result type requires widening
; Declaration of the <2 x float> masked gather called by test22, test22a and test27.
1148 declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
; test22 - masked gather of <2 x float> from %base, indexed by <2 x i32> %ind
; (sign-extended to i64), with a variable <2 x i1> mask and %src0 as pass-through.
; Expected: vgatherdps with dword indices; zmm operands on the avx512f-only runs
; (mask trimmed to 2 bits) and xmm operands on the avx512vl runs.
1150 define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
1151 ; KNL_64-LABEL: test22:
1153 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1154 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1155 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1156 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1157 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1158 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1159 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
1160 ; KNL_64-NEXT: vmovaps %xmm2, %xmm0
1161 ; KNL_64-NEXT: vzeroupper
1164 ; KNL_32-LABEL: test22:
1166 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1167 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1168 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1169 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1170 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1171 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1172 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1173 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm2 {%k1}
1174 ; KNL_32-NEXT: vmovaps %xmm2, %xmm0
1175 ; KNL_32-NEXT: vzeroupper
1178 ; SKX-LABEL: test22:
1180 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1181 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1182 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
1183 ; SKX-NEXT: vmovaps %xmm2, %xmm0
1186 ; SKX_32-LABEL: test22:
1188 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1189 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1190 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1191 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
1192 ; SKX_32-NEXT: vmovaps %xmm2, %xmm0
; The sext is present in the IR, yet every expected sequence above still uses
; the dword-index form of the gather.
1194 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1195 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1196 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
; test22a - variant of test22 whose indices are already <2 x i64>, so no
; extension is needed before the getelementptr.
; Expected: vgatherqps (qword indices); a zmm index with a ymm destination on the
; avx512f-only runs, xmm operands on the avx512vl runs.
1200 define <2 x float> @test22a(float* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x float> %src0) {
1201 ; KNL_64-LABEL: test22a:
1203 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
1204 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1205 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1206 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1207 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1208 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1209 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
1210 ; KNL_64-NEXT: vmovaps %xmm2, %xmm0
1211 ; KNL_64-NEXT: vzeroupper
1214 ; KNL_32-LABEL: test22a:
1216 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
1217 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1218 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1219 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1220 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1221 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1222 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1223 ; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
1224 ; KNL_32-NEXT: vmovaps %xmm2, %xmm0
1225 ; KNL_32-NEXT: vzeroupper
1228 ; SKX-LABEL: test22a:
1230 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1231 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1232 ; SKX-NEXT: vgatherqps (%rdi,%xmm0,4), %xmm2 {%k1}
1233 ; SKX-NEXT: vmovaps %xmm2, %xmm0
1236 ; SKX_32-LABEL: test22a:
1238 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1239 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1240 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1241 ; SKX_32-NEXT: vgatherqps (%eax,%xmm0,4), %xmm2 {%k1}
1242 ; SKX_32-NEXT: vmovaps %xmm2, %xmm0
; Address the floats directly with the 64-bit indices and gather under %mask.
1244 %gep.random = getelementptr float, float* %base, <2 x i64> %ind
1245 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
; Integer masked-gather declarations: the <2 x i32> form is called by test23,
; test23b and test24; the <2 x i64> form by test25 and test26.
1249 declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
1250 declare <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
; test23 - masked gather of <2 x i32> from %base, indexed by <2 x i32> %ind
; (sign-extended to i64), with a variable <2 x i1> mask and %src0 as pass-through.
; Expected: vpgatherdd with dword indices; zmm operands on the avx512f-only runs
; (mask trimmed to 2 bits) and xmm operands on the avx512vl runs.
1252 define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
1253 ; KNL_64-LABEL: test23:
1255 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1256 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1257 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1258 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1259 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1260 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1261 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
1262 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1263 ; KNL_64-NEXT: vzeroupper
1266 ; KNL_32-LABEL: test23:
1268 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1269 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1270 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1271 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1272 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1273 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1274 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1275 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
1276 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1277 ; KNL_32-NEXT: vzeroupper
1280 ; SKX-LABEL: test23:
1282 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1283 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1284 ; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm2 {%k1}
1285 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1288 ; SKX_32-LABEL: test23:
1290 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1291 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1292 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1293 ; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm2 {%k1}
1294 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
; Widen the two indices, address the i32 elements relative to %base, and
; gather them under %mask with %src0 filling the disabled lanes.
1296 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1297 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1298 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
; test23b - variant of test23 whose indices are already <2 x i64>, so no
; extension is needed before the getelementptr.
; Expected: vpgatherqd (qword indices); a zmm index with a ymm destination on the
; avx512f-only runs, xmm operands on the avx512vl runs.
1302 define <2 x i32> @test23b(i32* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %src0) {
1303 ; KNL_64-LABEL: test23b:
1305 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
1306 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1307 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1308 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1309 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1310 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1311 ; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
1312 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1313 ; KNL_64-NEXT: vzeroupper
1316 ; KNL_32-LABEL: test23b:
1318 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
1319 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1320 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1321 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1322 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1323 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1324 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1325 ; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
1326 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1327 ; KNL_32-NEXT: vzeroupper
1330 ; SKX-LABEL: test23b:
1332 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1333 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1334 ; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm2 {%k1}
1335 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1338 ; SKX_32-LABEL: test23b:
1340 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1341 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1342 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1343 ; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm2 {%k1}
1344 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
; Address the i32 elements directly with the 64-bit indices and gather under %mask.
1346 %gep.random = getelementptr i32, i32* %base, <2 x i64> %ind
1347 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
; test24 - gather of <2 x i32> with a constant all-true <2 x i1> mask and an
; undef pass-through; indices are <2 x i32> sign-extended to i64.
; Expected: every run materializes the two-lane mask as the immediate 3 in a
; general-purpose register and moves it into %k1 before a vpgatherdd.
1351 define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
1352 ; KNL_64-LABEL: test24:
1354 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1355 ; KNL_64-NEXT: movw $3, %ax
1356 ; KNL_64-NEXT: kmovw %eax, %k1
1357 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
1358 ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
1359 ; KNL_64-NEXT: vzeroupper
1362 ; KNL_32-LABEL: test24:
1364 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1365 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1366 ; KNL_32-NEXT: movw $3, %cx
1367 ; KNL_32-NEXT: kmovw %ecx, %k1
1368 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
1369 ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
1370 ; KNL_32-NEXT: vzeroupper
1373 ; SKX-LABEL: test24:
1375 ; SKX-NEXT: movb $3, %al
1376 ; SKX-NEXT: kmovw %eax, %k1
1377 ; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
1378 ; SKX-NEXT: vmovdqa %xmm1, %xmm0
1381 ; SKX_32-LABEL: test24:
1383 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1384 ; SKX_32-NEXT: movb $3, %cl
1385 ; SKX_32-NEXT: kmovw %ecx, %k1
1386 ; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1}
1387 ; SKX_32-NEXT: vmovdqa %xmm1, %xmm0
; The mask operand is the literal <i1 true, i1 true>, so no mask argument is
; passed to the function and the checks contain no mask-conversion sequence.
1389 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1390 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1391 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
; test25 - masked gather of <2 x i64> from %base, indexed by <2 x i32> %ind
; (sign-extended to i64), alignment 8, variable <2 x i1> mask, pass-through %src0.
; Expected: vpgatherdq with scale 8; a ymm index with a zmm destination on the
; avx512f-only runs (mask trimmed to 2 bits), xmm operands on the avx512vl runs.
1395 define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
1396 ; KNL_64-LABEL: test25:
1398 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1399 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1400 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1401 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1402 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1403 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1404 ; KNL_64-NEXT: vpgatherdq (%rdi,%ymm0,8), %zmm2 {%k1}
1405 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1406 ; KNL_64-NEXT: vzeroupper
1409 ; KNL_32-LABEL: test25:
1411 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1412 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1413 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1414 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1415 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1416 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1417 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1418 ; KNL_32-NEXT: vpgatherdq (%eax,%ymm0,8), %zmm2 {%k1}
1419 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1420 ; KNL_32-NEXT: vzeroupper
1423 ; SKX-LABEL: test25:
1425 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1426 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1427 ; SKX-NEXT: vpgatherdq (%rdi,%xmm0,8), %xmm2 {%k1}
1428 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1431 ; SKX_32-LABEL: test25:
1433 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1434 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1435 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1436 ; SKX_32-NEXT: vpgatherdq (%eax,%xmm0,8), %xmm2 {%k1}
1437 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
; Widen the two indices, address the i64 elements relative to %base, and
; gather them under %mask with %src0 filling the disabled lanes.
1439 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1440 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
1441 %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <2 x i1> %mask, <2 x i64> %src0)
; test26 - gather of <2 x i64> with a constant all-true mask but a real
; pass-through (%src0); indices are <2 x i32> sign-extended to i64.
; Expected: the avx512f-only runs load the two-lane mask as the immediate 3,
; while the avx512vl runs produce an all-ones mask with kxnorw.
1445 define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
1446 ; KNL_64-LABEL: test26:
1448 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1449 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1450 ; KNL_64-NEXT: movb $3, %al
1451 ; KNL_64-NEXT: kmovw %eax, %k1
1452 ; KNL_64-NEXT: vpgatherdq (%rdi,%ymm0,8), %zmm1 {%k1}
1453 ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
1454 ; KNL_64-NEXT: vzeroupper
1457 ; KNL_32-LABEL: test26:
1459 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1460 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1461 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1462 ; KNL_32-NEXT: movb $3, %cl
1463 ; KNL_32-NEXT: kmovw %ecx, %k1
1464 ; KNL_32-NEXT: vpgatherdq (%eax,%ymm0,8), %zmm1 {%k1}
1465 ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
1466 ; KNL_32-NEXT: vzeroupper
1469 ; SKX-LABEL: test26:
1471 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1472 ; SKX-NEXT: vpgatherdq (%rdi,%xmm0,8), %xmm1 {%k1}
1473 ; SKX-NEXT: vmovdqa %xmm1, %xmm0
1476 ; SKX_32-LABEL: test26:
1478 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1479 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
1480 ; SKX_32-NEXT: vpgatherdq (%eax,%xmm0,8), %xmm1 {%k1}
1481 ; SKX_32-NEXT: vmovdqa %xmm1, %xmm0
; The mask operand is the literal <i1 true, i1 true>; %src0 is still passed as
; the pass-through, and the checks gather into the register holding it.
1483 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1484 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
1485 %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
1489 ; Result type requires widening; all-ones mask
; test27 - floating-point counterpart of test24: gather of <2 x float> with a
; constant all-true mask and an undef pass-through.
; Expected: every run materializes the two-lane mask as the immediate 3 in a
; general-purpose register and moves it into %k1 before a vgatherdps.
1490 define <2 x float> @test27(float* %base, <2 x i32> %ind) {
1491 ; KNL_64-LABEL: test27:
1493 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1494 ; KNL_64-NEXT: movw $3, %ax
1495 ; KNL_64-NEXT: kmovw %eax, %k1
1496 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1497 ; KNL_64-NEXT: vmovaps %xmm1, %xmm0
1498 ; KNL_64-NEXT: vzeroupper
1501 ; KNL_32-LABEL: test27:
1503 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1504 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1505 ; KNL_32-NEXT: movw $3, %cx
1506 ; KNL_32-NEXT: kmovw %ecx, %k1
1507 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1508 ; KNL_32-NEXT: vmovaps %xmm1, %xmm0
1509 ; KNL_32-NEXT: vzeroupper
1512 ; SKX-LABEL: test27:
1514 ; SKX-NEXT: movb $3, %al
1515 ; SKX-NEXT: kmovw %eax, %k1
1516 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
1517 ; SKX-NEXT: vmovaps %xmm1, %xmm0
1520 ; SKX_32-LABEL: test27:
1522 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1523 ; SKX_32-NEXT: movb $3, %cl
1524 ; SKX_32-NEXT: kmovw %ecx, %k1
1525 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
1526 ; SKX_32-NEXT: vmovaps %xmm1, %xmm0
; The mask operand is the literal <i1 true, i1 true> and the pass-through is undef.
1528 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1529 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1530 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
1534 ; Data type requires promotion, mask is all-ones
; test28 - scatter of <2 x i32> %a1 to the two explicit pointers in %ptr with a
; constant all-true mask.
; Expected: the mask is loaded as the immediate 3 on every run except the 64-bit
; avx512vl one, which produces an all-ones mask with kxnorw; 64-bit targets use
; vpscatterqd and 32-bit targets use vpscatterdd.
1535 define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
1536 ; KNL_64-LABEL: test28:
1538 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1539 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1540 ; KNL_64-NEXT: movb $3, %al
1541 ; KNL_64-NEXT: kmovw %eax, %k1
1542 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
1543 ; KNL_64-NEXT: vzeroupper
1546 ; KNL_32-LABEL: test28:
1548 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1549 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1550 ; KNL_32-NEXT: movw $3, %ax
1551 ; KNL_32-NEXT: kmovw %eax, %k1
1552 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
1553 ; KNL_32-NEXT: vzeroupper
1556 ; SKX-LABEL: test28:
1558 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1559 ; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1}
1562 ; SKX_32-LABEL: test28:
1564 ; SKX_32-NEXT: movb $3, %al
1565 ; SKX_32-NEXT: kmovw %eax, %k1
1566 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
; The mask operand is the literal <i1 true, i1 true>; pointers are used unchanged.
1568 call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
1572 ; SCALAR-LABEL: test29
1573 ; SCALAR: extractelement <16 x float*>
1574 ; SCALAR-NEXT: load float
1575 ; SCALAR-NEXT: insertelement <16 x float>
1576 ; SCALAR-NEXT: extractelement <16 x float*>
1577 ; SCALAR-NEXT: load float
; test29 - 16-lane float gather from a splatted %base plus sign-extended
; <16 x i32> indices, with a constant mask in which only lanes 2, 3 and 5 are
; true and an undef pass-through.
; Expected: every run loads the mask as the immediate 44 (binary 101100, i.e.
; bits 2, 3 and 5) into %k1 and issues a single zmm vgatherdps off the scalar base.
1579 define <16 x float> @test29(float* %base, <16 x i32> %ind) {
1580 ; KNL_64-LABEL: test29:
1582 ; KNL_64-NEXT: movw $44, %ax
1583 ; KNL_64-NEXT: kmovw %eax, %k1
1584 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1585 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
1588 ; KNL_32-LABEL: test29:
1590 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1591 ; KNL_32-NEXT: movw $44, %cx
1592 ; KNL_32-NEXT: kmovw %ecx, %k1
1593 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1594 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
1597 ; SKX-LABEL: test29:
1599 ; SKX-NEXT: movw $44, %ax
1600 ; SKX-NEXT: kmovw %eax, %k1
1601 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1602 ; SKX-NEXT: vmovaps %zmm1, %zmm0
1605 ; SKX_32-LABEL: test29:
1607 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1608 ; SKX_32-NEXT: movw $44, %cx
1609 ; SKX_32-NEXT: kmovw %ecx, %k1
1610 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1611 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; Splat %base across all 16 lanes (insert into lane 0, then shuffle with an
; all-zero selector).
1614 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
1615 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
; Add the widened per-lane indices to the splatted base to get 16 addresses.
1617 %sext_ind = sext <16 x i32> %ind to <16 x i64>
1618 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
; Gather with the sparse constant mask described in the header comment.
1620 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
1621 ret <16 x float>%res
1624 ; Check non-power-of-2 case. It should be scalarized.
1625 declare <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
1626 define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
1627 ; KNL_64-LABEL: test30:
1629 ; KNL_64-NEXT: andb $1, %dil
1630 ; KNL_64-NEXT: andb $1, %sil
1631 ; KNL_64-NEXT: addb %sil, %sil
1632 ; KNL_64-NEXT: orb %dil, %sil
1633 ; KNL_64-NEXT: andb $1, %dl
1634 ; KNL_64-NEXT: shlb $2, %dl
1635 ; KNL_64-NEXT: orb %sil, %dl
1636 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
1637 ; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
1638 ; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
1639 ; KNL_64-NEXT: testb $1, %dl
1640 ; KNL_64-NEXT: jne .LBB31_1
1641 ; KNL_64-NEXT: # %bb.2: # %else
1642 ; KNL_64-NEXT: testb $2, %dl
1643 ; KNL_64-NEXT: jne .LBB31_3
1644 ; KNL_64-NEXT: .LBB31_4: # %else2
1645 ; KNL_64-NEXT: testb $4, %dl
1646 ; KNL_64-NEXT: jne .LBB31_5
1647 ; KNL_64-NEXT: .LBB31_6: # %else5
1648 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1649 ; KNL_64-NEXT: vzeroupper
1651 ; KNL_64-NEXT: .LBB31_1: # %cond.load
1652 ; KNL_64-NEXT: vmovq %xmm0, %rax
1653 ; KNL_64-NEXT: vpinsrd $0, (%rax), %xmm2, %xmm2
1654 ; KNL_64-NEXT: testb $2, %dl
1655 ; KNL_64-NEXT: je .LBB31_4
1656 ; KNL_64-NEXT: .LBB31_3: # %cond.load1
1657 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
1658 ; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
1659 ; KNL_64-NEXT: testb $4, %dl
1660 ; KNL_64-NEXT: je .LBB31_6
1661 ; KNL_64-NEXT: .LBB31_5: # %cond.load4
1662 ; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
1663 ; KNL_64-NEXT: vmovq %xmm0, %rax
1664 ; KNL_64-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm2
1665 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1666 ; KNL_64-NEXT: vzeroupper
1669 ; KNL_32-LABEL: test30:
1671 ; KNL_32-NEXT: pushl %eax
1672 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1673 ; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al
1674 ; KNL_32-NEXT: andb $1, %al
1675 ; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %cl
1676 ; KNL_32-NEXT: andb $1, %cl
1677 ; KNL_32-NEXT: addb %cl, %cl
1678 ; KNL_32-NEXT: orb %al, %cl
1679 ; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al
1680 ; KNL_32-NEXT: andb $1, %al
1681 ; KNL_32-NEXT: shlb $2, %al
1682 ; KNL_32-NEXT: orb %cl, %al
1683 ; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
1684 ; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1685 ; KNL_32-NEXT: testb $1, %al
1686 ; KNL_32-NEXT: jne .LBB31_1
1687 ; KNL_32-NEXT: # %bb.2: # %else
1688 ; KNL_32-NEXT: testb $2, %al
1689 ; KNL_32-NEXT: jne .LBB31_3
1690 ; KNL_32-NEXT: .LBB31_4: # %else2
1691 ; KNL_32-NEXT: testb $4, %al
1692 ; KNL_32-NEXT: je .LBB31_6
1693 ; KNL_32-NEXT: .LBB31_5: # %cond.load4
1694 ; KNL_32-NEXT: vpextrd $2, %xmm0, %eax
1695 ; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm2
1696 ; KNL_32-NEXT: .LBB31_6: # %else5
1697 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1698 ; KNL_32-NEXT: popl %eax
1699 ; KNL_32-NEXT: .cfi_def_cfa_offset 4
1701 ; KNL_32-NEXT: .LBB31_1: # %cond.load
1702 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1703 ; KNL_32-NEXT: vmovd %xmm0, %ecx
1704 ; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2
1705 ; KNL_32-NEXT: testb $2, %al
1706 ; KNL_32-NEXT: je .LBB31_4
1707 ; KNL_32-NEXT: .LBB31_3: # %cond.load1
1708 ; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
1709 ; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm2, %xmm2
1710 ; KNL_32-NEXT: testb $4, %al
1711 ; KNL_32-NEXT: jne .LBB31_5
1712 ; KNL_32-NEXT: jmp .LBB31_6
1714 ; SKX-LABEL: test30:
1716 ; SKX-NEXT: andb $1, %dil
1717 ; SKX-NEXT: andb $1, %sil
1718 ; SKX-NEXT: addb %sil, %sil
1719 ; SKX-NEXT: orb %dil, %sil
1720 ; SKX-NEXT: andb $1, %dl
1721 ; SKX-NEXT: shlb $2, %dl
1722 ; SKX-NEXT: orb %sil, %dl
1723 ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
1724 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
1725 ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
1726 ; SKX-NEXT: testb $1, %dl
1727 ; SKX-NEXT: jne .LBB31_1
1728 ; SKX-NEXT: # %bb.2: # %else
1729 ; SKX-NEXT: testb $2, %dl
1730 ; SKX-NEXT: jne .LBB31_3
1731 ; SKX-NEXT: .LBB31_4: # %else2
1732 ; SKX-NEXT: testb $4, %dl
1733 ; SKX-NEXT: jne .LBB31_5
1734 ; SKX-NEXT: .LBB31_6: # %else5
1735 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1736 ; SKX-NEXT: vzeroupper
1738 ; SKX-NEXT: .LBB31_1: # %cond.load
1739 ; SKX-NEXT: vmovq %xmm0, %rax
1740 ; SKX-NEXT: vpinsrd $0, (%rax), %xmm2, %xmm2
1741 ; SKX-NEXT: testb $2, %dl
1742 ; SKX-NEXT: je .LBB31_4
1743 ; SKX-NEXT: .LBB31_3: # %cond.load1
1744 ; SKX-NEXT: vpextrq $1, %xmm0, %rax
1745 ; SKX-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
1746 ; SKX-NEXT: testb $4, %dl
1747 ; SKX-NEXT: je .LBB31_6
1748 ; SKX-NEXT: .LBB31_5: # %cond.load4
1749 ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
1750 ; SKX-NEXT: vmovq %xmm0, %rax
1751 ; SKX-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm2
1752 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1753 ; SKX-NEXT: vzeroupper
1756 ; SKX_32-LABEL: test30:
1758 ; SKX_32-NEXT: pushl %eax
1759 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
1760 ; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
1761 ; SKX_32-NEXT: andb $1, %al
1762 ; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %cl
1763 ; SKX_32-NEXT: andb $1, %cl
1764 ; SKX_32-NEXT: addb %cl, %cl
1765 ; SKX_32-NEXT: orb %al, %cl
1766 ; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
1767 ; SKX_32-NEXT: andb $1, %al
1768 ; SKX_32-NEXT: shlb $2, %al
1769 ; SKX_32-NEXT: orb %cl, %al
1770 ; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
1771 ; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1772 ; SKX_32-NEXT: testb $1, %al
1773 ; SKX_32-NEXT: jne .LBB31_1
1774 ; SKX_32-NEXT: # %bb.2: # %else
1775 ; SKX_32-NEXT: testb $2, %al
1776 ; SKX_32-NEXT: jne .LBB31_3
1777 ; SKX_32-NEXT: .LBB31_4: # %else2
1778 ; SKX_32-NEXT: testb $4, %al
1779 ; SKX_32-NEXT: je .LBB31_6
1780 ; SKX_32-NEXT: .LBB31_5: # %cond.load4
1781 ; SKX_32-NEXT: vpextrd $2, %xmm0, %eax
1782 ; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm2
1783 ; SKX_32-NEXT: .LBB31_6: # %else5
1784 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
1785 ; SKX_32-NEXT: popl %eax
1786 ; SKX_32-NEXT: .cfi_def_cfa_offset 4
1788 ; SKX_32-NEXT: .LBB31_1: # %cond.load
1789 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
1790 ; SKX_32-NEXT: vmovd %xmm0, %ecx
1791 ; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2
1792 ; SKX_32-NEXT: testb $2, %al
1793 ; SKX_32-NEXT: je .LBB31_4
1794 ; SKX_32-NEXT: .LBB31_3: # %cond.load1
1795 ; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx
1796 ; SKX_32-NEXT: vpinsrd $1, (%ecx), %xmm2, %xmm2
1797 ; SKX_32-NEXT: testb $4, %al
1798 ; SKX_32-NEXT: jne .LBB31_5
1799 ; SKX_32-NEXT: jmp .LBB31_6
1801 %sext_ind = sext <3 x i32> %ind to <3 x i64>
1802 %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
1803 %res = call <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
; test31 gathers 16 pointer-typed elements (<16 x float*>) through a vector of
; 16 float** addresses, using a constant all-true mask and an undef passthru.
; Per the assertions below, the 64-bit runs expect two vpgatherqq instructions
; (each under its own all-ones kxnorw mask), while the 32-bit runs expect a
; single vpgatherdd.
1807 declare <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
1808 define <16 x float*> @test31(<16 x float**> %ptrs) {
1809 ; KNL_64-LABEL: test31:
1811 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
1812 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
1813 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
1814 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
1815 ; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0
1816 ; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm1
1819 ; KNL_32-LABEL: test31:
1821 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
1822 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
1823 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
1826 ; SKX-LABEL: test31:
1828 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1829 ; SKX-NEXT: kxnorw %k0, %k0, %k2
1830 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
1831 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
1832 ; SKX-NEXT: vmovdqa64 %zmm2, %zmm0
1833 ; SKX-NEXT: vmovdqa64 %zmm3, %zmm1
1836 ; SKX_32-LABEL: test31:
1838 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
1839 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
1840 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
1843 %res = call <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
1844 ret <16 x float*>%res
; test_gather_16i32 performs a masked gather of 16 x i32 through a vector of
; pointers, with a variable <16 x i1> mask and %src0 as the passthru value.
; Per the assertions below, the 64-bit runs expect the gather to be done as two
; vpgatherqd halves (the second mask obtained with kshiftrw $8) that are joined
; with vinserti64x4; the 32-bit runs expect a single vpgatherdd.
1847 define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
1848 ; KNL_64-LABEL: test_gather_16i32:
1850 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1851 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1852 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1853 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2
1854 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1855 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
1856 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
1857 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
1860 ; KNL_32-LABEL: test_gather_16i32:
1862 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1863 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1864 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1865 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
1866 ; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
1869 ; SKX-LABEL: test_gather_16i32:
1871 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1872 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1873 ; SKX-NEXT: vpmovd2m %zmm2, %k1
1874 ; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm2
1875 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1876 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
1877 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
1878 ; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
1881 ; SKX_32-LABEL: test_gather_16i32:
1883 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1884 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1885 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
1886 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
1887 ; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
1889 %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
; test_gather_16i64 performs a masked gather of 16 x i64 through a vector of
; pointers, with a variable <16 x i1> mask and %src0 as the passthru value.
; Per the assertions below, the 64-bit runs expect two vpgatherqq instructions
; (second mask via kshiftrw $8). The 32-bit runs expect two vpgatherdq
; instructions, a 64-byte realigned frame, and one zmm operand loaded from
; 8(%ebp) (presumably the half of %src0 that is passed in memory -- confirm).
1892 define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
1893 ; KNL_64-LABEL: test_gather_16i64:
1895 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1896 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1897 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1898 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1899 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
1900 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
1901 ; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm0
1902 ; KNL_64-NEXT: vmovdqa64 %zmm4, %zmm1
1905 ; KNL_32-LABEL: test_gather_16i64:
1907 ; KNL_32-NEXT: pushl %ebp
1908 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1909 ; KNL_32-NEXT: .cfi_offset %ebp, -8
1910 ; KNL_32-NEXT: movl %esp, %ebp
1911 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
1912 ; KNL_32-NEXT: andl $-64, %esp
1913 ; KNL_32-NEXT: subl $64, %esp
1914 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1915 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1916 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1917 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1918 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
1919 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
1920 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1921 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
1922 ; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
1923 ; KNL_32-NEXT: movl %ebp, %esp
1924 ; KNL_32-NEXT: popl %ebp
1925 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
1928 ; SKX-LABEL: test_gather_16i64:
1930 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1931 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1932 ; SKX-NEXT: vpmovd2m %zmm2, %k1
1933 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1934 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
1935 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
1936 ; SKX-NEXT: vmovdqa64 %zmm3, %zmm0
1937 ; SKX-NEXT: vmovdqa64 %zmm4, %zmm1
1940 ; SKX_32-LABEL: test_gather_16i64:
1942 ; SKX_32-NEXT: pushl %ebp
1943 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
1944 ; SKX_32-NEXT: .cfi_offset %ebp, -8
1945 ; SKX_32-NEXT: movl %esp, %ebp
1946 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
1947 ; SKX_32-NEXT: andl $-64, %esp
1948 ; SKX_32-NEXT: subl $64, %esp
1949 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1950 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1951 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
1952 ; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1953 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
1954 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
1955 ; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1956 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
1957 ; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
1958 ; SKX_32-NEXT: movl %ebp, %esp
1959 ; SKX_32-NEXT: popl %ebp
1960 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
1962 %res = call <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
1965 declare <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
; test_gather_16f32 is the float counterpart of the 16 x i32 gather test: a
; masked gather of 16 x float through a vector of pointers with a variable
; mask and %src0 as the passthru value.
; Per the assertions below, the 64-bit runs expect two vgatherqps halves
; (second mask via kshiftrw $8) joined with vinsertf64x4; the 32-bit runs
; expect a single vgatherdps.
1966 define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
1967 ; KNL_64-LABEL: test_gather_16f32:
1969 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1970 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1971 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1972 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2
1973 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1974 ; KNL_64-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
1975 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
1976 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
1979 ; KNL_32-LABEL: test_gather_16f32:
1981 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1982 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1983 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1984 ; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
1985 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1988 ; SKX-LABEL: test_gather_16f32:
1990 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1991 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1992 ; SKX-NEXT: vpmovd2m %zmm2, %k1
1993 ; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm2
1994 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1995 ; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
1996 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
1997 ; SKX-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
2000 ; SKX_32-LABEL: test_gather_16f32:
2002 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2003 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2004 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2005 ; SKX_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
2006 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0
2008 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
2009 ret <16 x float> %res
; test_gather_16f64 performs a masked gather of 16 x double through a vector
; of pointers, with a variable <16 x i1> mask and %src0 as the passthru value.
; Per the assertions below, the 64-bit runs expect two vgatherqpd instructions
; (second mask via kshiftrw $8). The 32-bit runs expect two vgatherdpd
; instructions, a 64-byte realigned frame, and one zmm operand loaded from
; 8(%ebp) (presumably the half of %src0 that is passed in memory -- confirm).
2011 define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
2012 ; KNL_64-LABEL: test_gather_16f64:
2014 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2015 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2016 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2017 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2018 ; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
2019 ; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
2020 ; KNL_64-NEXT: vmovapd %zmm3, %zmm0
2021 ; KNL_64-NEXT: vmovapd %zmm4, %zmm1
2024 ; KNL_32-LABEL: test_gather_16f64:
2026 ; KNL_32-NEXT: pushl %ebp
2027 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2028 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2029 ; KNL_32-NEXT: movl %esp, %ebp
2030 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2031 ; KNL_32-NEXT: andl $-64, %esp
2032 ; KNL_32-NEXT: subl $64, %esp
2033 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2034 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2035 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2036 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
2037 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
2038 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
2039 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2040 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
2041 ; KNL_32-NEXT: vmovapd %zmm2, %zmm0
2042 ; KNL_32-NEXT: movl %ebp, %esp
2043 ; KNL_32-NEXT: popl %ebp
2044 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2047 ; SKX-LABEL: test_gather_16f64:
2049 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2050 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2051 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2052 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2053 ; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
2054 ; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
2055 ; SKX-NEXT: vmovapd %zmm3, %zmm0
2056 ; SKX-NEXT: vmovapd %zmm4, %zmm1
2059 ; SKX_32-LABEL: test_gather_16f64:
2061 ; SKX_32-NEXT: pushl %ebp
2062 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2063 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2064 ; SKX_32-NEXT: movl %esp, %ebp
2065 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2066 ; SKX_32-NEXT: andl $-64, %esp
2067 ; SKX_32-NEXT: subl $64, %esp
2068 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2069 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2070 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2071 ; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
2072 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
2073 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
2074 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2075 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
2076 ; SKX_32-NEXT: vmovapd %zmm2, %zmm0
2077 ; SKX_32-NEXT: movl %ebp, %esp
2078 ; SKX_32-NEXT: popl %ebp
2079 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2081 %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
2082 ret <16 x double> %res
2084 declare <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
; test_scatter_16i32 performs a masked scatter of the 16 x i32 value %src0 to
; a vector of pointers under a variable <16 x i1> mask.
; Per the assertions below, the 64-bit runs expect two vpscatterqd stores
; (upper data half via vextracti64x4, second mask via kshiftrw $8); the 32-bit
; runs expect a single vpscatterdd.
2085 define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
2086 ; KNL_64-LABEL: test_scatter_16i32:
2088 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2089 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2090 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2091 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2092 ; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
2093 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
2094 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
2095 ; KNL_64-NEXT: vzeroupper
2098 ; KNL_32-LABEL: test_scatter_16i32:
2100 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2101 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2102 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2103 ; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
2104 ; KNL_32-NEXT: vzeroupper
2107 ; SKX-LABEL: test_scatter_16i32:
2109 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2110 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2111 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2112 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2113 ; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
2114 ; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm0
2115 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
2116 ; SKX-NEXT: vzeroupper
2119 ; SKX_32-LABEL: test_scatter_16i32:
2121 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2122 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2123 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2124 ; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
2125 ; SKX_32-NEXT: vzeroupper
2127 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
; test_scatter_16i64 performs a masked scatter of the 16 x i64 value %src0 to
; a vector of pointers under a variable <16 x i1> mask.
; Per the assertions below, the 64-bit runs expect two vpscatterqq stores
; (second mask via kshiftrw $8). The 32-bit runs expect two vpscatterdq
; stores, a 64-byte realigned frame, and one zmm operand loaded from 8(%ebp)
; (presumably the half of %src0 that is passed in memory -- confirm).
2130 define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
2131 ; KNL_64-LABEL: test_scatter_16i64:
2133 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2134 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2135 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2136 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2137 ; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
2138 ; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
2139 ; KNL_64-NEXT: vzeroupper
2142 ; KNL_32-LABEL: test_scatter_16i64:
2144 ; KNL_32-NEXT: pushl %ebp
2145 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2146 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2147 ; KNL_32-NEXT: movl %esp, %ebp
2148 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2149 ; KNL_32-NEXT: andl $-64, %esp
2150 ; KNL_32-NEXT: subl $64, %esp
2151 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2152 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2153 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2154 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
2155 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
2156 ; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
2157 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2158 ; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
2159 ; KNL_32-NEXT: movl %ebp, %esp
2160 ; KNL_32-NEXT: popl %ebp
2161 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2162 ; KNL_32-NEXT: vzeroupper
2165 ; SKX-LABEL: test_scatter_16i64:
2167 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2168 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2169 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2170 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2171 ; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
2172 ; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
2173 ; SKX-NEXT: vzeroupper
2176 ; SKX_32-LABEL: test_scatter_16i64:
2178 ; SKX_32-NEXT: pushl %ebp
2179 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2180 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2181 ; SKX_32-NEXT: movl %esp, %ebp
2182 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2183 ; SKX_32-NEXT: andl $-64, %esp
2184 ; SKX_32-NEXT: subl $64, %esp
2185 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2186 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2187 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2188 ; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
2189 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
2190 ; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
2191 ; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2192 ; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
2193 ; SKX_32-NEXT: movl %ebp, %esp
2194 ; SKX_32-NEXT: popl %ebp
2195 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2196 ; SKX_32-NEXT: vzeroupper
2198 call void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
2201 declare void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
; test_scatter_16f32 performs a masked scatter of the 16 x float value %src0
; to a vector of pointers under a variable <16 x i1> mask.
; Per the assertions below, the 64-bit runs expect two vscatterqps stores
; (upper data half via vextractf64x4, second mask via kshiftrw $8); the 32-bit
; runs expect a single vscatterdps.
2202 define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
2203 ; KNL_64-LABEL: test_scatter_16f32:
2205 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2206 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2207 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2208 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2209 ; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
2210 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0
2211 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
2212 ; KNL_64-NEXT: vzeroupper
2215 ; KNL_32-LABEL: test_scatter_16f32:
2217 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2218 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2219 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2220 ; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
2221 ; KNL_32-NEXT: vzeroupper
2224 ; SKX-LABEL: test_scatter_16f32:
2226 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2227 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2228 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2229 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2230 ; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
2231 ; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm0
2232 ; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
2233 ; SKX-NEXT: vzeroupper
2236 ; SKX_32-LABEL: test_scatter_16f32:
2238 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2239 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2240 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2241 ; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
2242 ; SKX_32-NEXT: vzeroupper
2244 call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
2247 declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
; test_scatter_16f64 performs a masked scatter of the 16 x double value %src0
; to a vector of pointers under a variable <16 x i1> mask.
; Per the assertions below, the 64-bit runs expect two vscatterqpd stores
; (second mask via kshiftrw $8). The 32-bit runs expect two vscatterdpd
; stores, a 64-byte realigned frame, and one zmm operand loaded from 8(%ebp)
; (presumably the half of %src0 that is passed in memory -- confirm).
2248 define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
2249 ; KNL_64-LABEL: test_scatter_16f64:
2251 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2252 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2253 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2254 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2255 ; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
2256 ; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
2257 ; KNL_64-NEXT: vzeroupper
2260 ; KNL_32-LABEL: test_scatter_16f64:
2262 ; KNL_32-NEXT: pushl %ebp
2263 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2264 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2265 ; KNL_32-NEXT: movl %esp, %ebp
2266 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2267 ; KNL_32-NEXT: andl $-64, %esp
2268 ; KNL_32-NEXT: subl $64, %esp
2269 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2270 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2271 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2272 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
2273 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
2274 ; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
2275 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2276 ; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
2277 ; KNL_32-NEXT: movl %ebp, %esp
2278 ; KNL_32-NEXT: popl %ebp
2279 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2280 ; KNL_32-NEXT: vzeroupper
2283 ; SKX-LABEL: test_scatter_16f64:
2285 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2286 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2287 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2288 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2289 ; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
2290 ; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
2291 ; SKX-NEXT: vzeroupper
2294 ; SKX_32-LABEL: test_scatter_16f64:
2296 ; SKX_32-NEXT: pushl %ebp
2297 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2298 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2299 ; SKX_32-NEXT: movl %esp, %ebp
2300 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2301 ; SKX_32-NEXT: andl $-64, %esp
2302 ; SKX_32-NEXT: subl $64, %esp
2303 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2304 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2305 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2306 ; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
2307 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
2308 ; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
2309 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2310 ; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
2311 ; SKX_32-NEXT: movl %ebp, %esp
2312 ; SKX_32-NEXT: popl %ebp
2313 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2314 ; SKX_32-NEXT: vzeroupper
2316 call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
2319 declare void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
; test_pr28312 issues three identical masked gathers (same pointers %p1, same
; mask %k, undef passthru) and sums the three results. The %k2 and %d
; parameters are not referenced by the calls or adds below.
; Per the assertions below, every run expects only one gather instruction
; followed by two vpaddq, i.e. the duplicate gathers are folded into one.
; NOTE(review): the name suggests a regression test for bug PR28312 -- confirm.
2321 define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64> %d) {
2322 ; KNL_64-LABEL: test_pr28312:
2324 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2325 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
2326 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
2327 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
2328 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
2329 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm1 {%k1}
2330 ; KNL_64-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2331 ; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2334 ; KNL_32-LABEL: test_pr28312:
2336 ; KNL_32-NEXT: pushl %ebp
2337 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2338 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2339 ; KNL_32-NEXT: movl %esp, %ebp
2340 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2341 ; KNL_32-NEXT: andl $-32, %esp
2342 ; KNL_32-NEXT: subl $32, %esp
2343 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
2344 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
2345 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
2346 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
2347 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
2348 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k1}
2349 ; KNL_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2350 ; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2351 ; KNL_32-NEXT: movl %ebp, %esp
2352 ; KNL_32-NEXT: popl %ebp
2353 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2356 ; SKX-LABEL: test_pr28312:
2358 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
2359 ; SKX-NEXT: vpmovd2m %xmm1, %k1
2360 ; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1}
2361 ; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2362 ; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2365 ; SKX_32-LABEL: test_pr28312:
2367 ; SKX_32-NEXT: pushl %ebp
2368 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2369 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2370 ; SKX_32-NEXT: movl %esp, %ebp
2371 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2372 ; SKX_32-NEXT: andl $-32, %esp
2373 ; SKX_32-NEXT: subl $32, %esp
2374 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
2375 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
2376 ; SKX_32-NEXT: vpgatherdq (,%xmm0), %ymm1 {%k1}
2377 ; SKX_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2378 ; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2379 ; SKX_32-NEXT: movl %ebp, %esp
2380 ; SKX_32-NEXT: popl %ebp
2381 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2383 %g1 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
2384 %g2 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
2385 %g3 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
2386 %a = add <4 x i64> %g1, %g2
2387 %b = add <4 x i64> %a, %g3
2390 declare <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>)
; test_global_array gathers 8 x i32 from @glob_array. The addresses come from a
; getelementptr whose first index is the scalar constant 0 and whose second
; index is the <8 x i64> vector %indxs; the gather mask is constant all-true.
; Per the assertions below, most runs expect the global to be folded into the
; gather as a displacement (glob_array(,%zmm0,4)). The large code model run
; (SKX_LARGE prefix) instead expects the address to be materialized with
; movabsq and used as the base register.
2392 define <8 x i32> @test_global_array(<8 x i64> %indxs) {
2393 ; KNL_64-LABEL: test_global_array:
2395 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
2396 ; KNL_64-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2397 ; KNL_64-NEXT: vmovdqa %ymm1, %ymm0
2400 ; KNL_32-LABEL: test_global_array:
2402 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
2403 ; KNL_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2404 ; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
2407 ; SKX_SMALL-LABEL: test_global_array:
2408 ; SKX_SMALL: # %bb.0:
2409 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
2410 ; SKX_SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2411 ; SKX_SMALL-NEXT: vmovdqa %ymm1, %ymm0
2412 ; SKX_SMALL-NEXT: retq
2414 ; SKX_LARGE-LABEL: test_global_array:
2415 ; SKX_LARGE: # %bb.0:
2416 ; SKX_LARGE-NEXT: movabsq $glob_array, %rax
2417 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
2418 ; SKX_LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
2419 ; SKX_LARGE-NEXT: vmovdqa %ymm1, %ymm0
2420 ; SKX_LARGE-NEXT: retq
2422 ; SKX_32-LABEL: test_global_array:
2424 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2425 ; SKX_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2426 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm0
2428 %p = getelementptr inbounds [16 x i32], [16 x i32]* @glob_array, i64 0, <8 x i64> %indxs
2429 %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
; test_global_array_zeroinitializer_index is the same gather from @glob_array
; as test_global_array, except that the first getelementptr index is a vector
; (<8 x i64> zeroinitializer) rather than the scalar constant 0.
; The assertions below expect the same instructions as for the scalar-index
; form, including the movabsq base for the large code model run.
2433 define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) {
2434 ; KNL_64-LABEL: test_global_array_zeroinitializer_index:
2436 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
2437 ; KNL_64-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2438 ; KNL_64-NEXT: vmovdqa %ymm1, %ymm0
2441 ; KNL_32-LABEL: test_global_array_zeroinitializer_index:
2443 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
2444 ; KNL_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2445 ; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
2448 ; SKX_SMALL-LABEL: test_global_array_zeroinitializer_index:
2449 ; SKX_SMALL: # %bb.0:
2450 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
2451 ; SKX_SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2452 ; SKX_SMALL-NEXT: vmovdqa %ymm1, %ymm0
2453 ; SKX_SMALL-NEXT: retq
2455 ; SKX_LARGE-LABEL: test_global_array_zeroinitializer_index:
2456 ; SKX_LARGE: # %bb.0:
2457 ; SKX_LARGE-NEXT: movabsq $glob_array, %rax
2458 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
2459 ; SKX_LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
2460 ; SKX_LARGE-NEXT: vmovdqa %ymm1, %ymm0
2461 ; SKX_LARGE-NEXT: retq
2463 ; SKX_32-LABEL: test_global_array_zeroinitializer_index:
2465 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2466 ; SKX_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2467 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm0
2469 %p = getelementptr inbounds [16 x i32], [16 x i32]* @glob_array, <8 x i64> zeroinitializer, <8 x i64> %indxs
2470 %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
; v1_scatter exercises a masked scatter on single-element vectors
; (<1 x i32> data, <1 x i32*> pointer, <1 x i1> mask).
; Per the assertions below, every run expects scalar code: test bit 0 of the
; mask, and perform a plain movl store only in the %cond.store block.
2474 define void @v1_scatter(<1 x i32>%a1, <1 x i32*> %ptr, <1 x i1> %mask) {
2475 ; KNL_64-LABEL: v1_scatter:
2477 ; KNL_64-NEXT: testb $1, %dl
2478 ; KNL_64-NEXT: je .LBB44_2
2479 ; KNL_64-NEXT: # %bb.1: # %cond.store
2480 ; KNL_64-NEXT: movl %edi, (%rsi)
2481 ; KNL_64-NEXT: .LBB44_2: # %else
2484 ; KNL_32-LABEL: v1_scatter:
2486 ; KNL_32-NEXT: testb $1, {{[0-9]+}}(%esp)
2487 ; KNL_32-NEXT: je .LBB44_2
2488 ; KNL_32-NEXT: # %bb.1: # %cond.store
2489 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2490 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
2491 ; KNL_32-NEXT: movl %ecx, (%eax)
2492 ; KNL_32-NEXT: .LBB44_2: # %else
2495 ; SKX-LABEL: v1_scatter:
2497 ; SKX-NEXT: testb $1, %dl
2498 ; SKX-NEXT: je .LBB44_2
2499 ; SKX-NEXT: # %bb.1: # %cond.store
2500 ; SKX-NEXT: movl %edi, (%rsi)
2501 ; SKX-NEXT: .LBB44_2: # %else
2504 ; SKX_32-LABEL: v1_scatter:
2506 ; SKX_32-NEXT: testb $1, {{[0-9]+}}(%esp)
2507 ; SKX_32-NEXT: je .LBB44_2
2508 ; SKX_32-NEXT: # %bb.1: # %cond.store
2509 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2510 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
2511 ; SKX_32-NEXT: movl %ecx, (%eax)
2512 ; SKX_32-NEXT: .LBB44_2: # %else
2514 call void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32> %a1, <1 x i32*> %ptr, i32 4, <1 x i1> %mask)
2517 declare void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32>, <1 x i32*>, i32, <1 x i1>)
; v1_gather exercises a masked gather on single-element vectors. The call
; passes a constant <i1 true> mask, so the %mask parameter is not used by it.
; Per the assertions below, every run expects a plain scalar movl load with
; no mask test or branch.
2519 define <1 x i32> @v1_gather(<1 x i32*> %ptr, <1 x i1> %mask, <1 x i32> %src0) {
2520 ; KNL_64-LABEL: v1_gather:
2522 ; KNL_64-NEXT: movl (%rdi), %eax
2525 ; KNL_32-LABEL: v1_gather:
2527 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2528 ; KNL_32-NEXT: movl (%eax), %eax
2531 ; SKX-LABEL: v1_gather:
2533 ; SKX-NEXT: movl (%rdi), %eax
2536 ; SKX_32-LABEL: v1_gather:
2538 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2539 ; SKX_32-NEXT: movl (%eax), %eax
2541 %res = call <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*> %ptr, i32 4, <1 x i1> <i1 true>, <1 x i32> %src0)
2544 declare <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*>, i32, <1 x i1>, <1 x i32>)
2546 ; Make sure we don't crash when the index element type is wider than i64 and the result needs to be widened.
2547 ; This case previously hit a bad interaction where the operation was widened and then split.
; The getelementptr below indexes %base with a <2 x i128> vector. Per the
; assertions, every run expects a qword-index vgatherqps. On the 64-bit runs
; the index vector is assembled from %rsi and %rcx (presumably the low 64 bits
; of each i128 index -- confirm against the calling convention).
2548 define <2 x float> @large_index(float* %base, <2 x i128> %ind, <2 x i1> %mask, <2 x float> %src0) {
2549 ; KNL_64-LABEL: large_index:
2551 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
2552 ; KNL_64-NEXT: vpsllq $63, %xmm0, %xmm0
2553 ; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k0
2554 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
2555 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
2556 ; KNL_64-NEXT: vmovq %rcx, %xmm0
2557 ; KNL_64-NEXT: vmovq %rsi, %xmm2
2558 ; KNL_64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
2559 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k1}
2560 ; KNL_64-NEXT: vmovaps %xmm1, %xmm0
2561 ; KNL_64-NEXT: vzeroupper
2564 ; KNL_32-LABEL: large_index:
2566 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
2567 ; KNL_32-NEXT: vpsllq $63, %xmm0, %xmm0
2568 ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0
2569 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
2570 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
2571 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2572 ; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2573 ; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2574 ; KNL_32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
2575 ; KNL_32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
2576 ; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm1 {%k1}
2577 ; KNL_32-NEXT: vmovaps %xmm1, %xmm0
2578 ; KNL_32-NEXT: vzeroupper
2581 ; SKX-LABEL: large_index:
2583 ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
2584 ; SKX-NEXT: vpmovq2m %xmm0, %k1
2585 ; SKX-NEXT: vmovq %rcx, %xmm0
2586 ; SKX-NEXT: vmovq %rsi, %xmm2
2587 ; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
2588 ; SKX-NEXT: vgatherqps (%rdi,%xmm0,4), %xmm1 {%k1}
2589 ; SKX-NEXT: vmovaps %xmm1, %xmm0
2592 ; SKX_32-LABEL: large_index:
2594 ; SKX_32-NEXT: vpsllq $63, %xmm0, %xmm0
2595 ; SKX_32-NEXT: vpmovq2m %xmm0, %k1
2596 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2597 ; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2598 ; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2599 ; SKX_32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
2600 ; SKX_32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
2601 ; SKX_32-NEXT: vgatherqps (%eax,%xmm0,4), %xmm1 {%k1}
2602 ; SKX_32-NEXT: vmovaps %xmm1, %xmm0
2604 %gep.random = getelementptr float, float* %base, <2 x i128> %ind
2605 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
2609 ; Make sure the index may be sign-extended from an element type narrower than i32.
; Here the <16 x i8> index is sign-extended to <16 x i64> in the IR. Per the
; assertions below, every run expects the index to be extended only to 32-bit
; lanes (vpmovsxbd) and used by a dword-index vgatherdps under an all-ones mask.
2610 define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {
2611 ; KNL_64-LABEL: sext_i8_index:
2613 ; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm1
2614 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
2615 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2618 ; KNL_32-LABEL: sext_i8_index:
2620 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2621 ; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm1
2622 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
2623 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2626 ; SKX-LABEL: sext_i8_index:
2628 ; SKX-NEXT: vpmovsxbd %xmm0, %zmm1
2629 ; SKX-NEXT: kxnorw %k0, %k0, %k1
2630 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2633 ; SKX_32-LABEL: sext_i8_index:
2635 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2636 ; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm1
2637 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2638 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2641 %sext_ind = sext <16 x i8> %ind to <16 x i64>
2642 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
2644 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
2645 ret <16 x float>%res
2648 ; Same check with an <8 x i8> index: make sure we allow the index to be sign extended from an element size smaller than i32.
; Gathers 8 floats from %base using an <8 x i8> index that the IR sign extends
; to <8 x i64>. All checks expect a vpmovsxbd into a ymm register. The KNL
; checks (run lines with only +avx512f) expect the gather on zmm registers with
; an explicit 8-lane mask (movw $255 / kmovw); the SKX checks (run lines with
; +avx512vl) expect a ymm vgatherdps with a kxnorw mask.
2649 define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
2650 ; KNL_64-LABEL: sext_v8i8_index:
2652 ; KNL_64-NEXT: vpmovsxbd %xmm0, %ymm1
2653 ; KNL_64-NEXT: movw $255, %ax
2654 ; KNL_64-NEXT: kmovw %eax, %k1
2655 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2656 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2659 ; KNL_32-LABEL: sext_v8i8_index:
2661 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2662 ; KNL_32-NEXT: vpmovsxbd %xmm0, %ymm1
2663 ; KNL_32-NEXT: movw $255, %cx
2664 ; KNL_32-NEXT: kmovw %ecx, %k1
2665 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2666 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2669 ; SKX-LABEL: sext_v8i8_index:
2671 ; SKX-NEXT: vpmovsxbd %xmm0, %ymm1
2672 ; SKX-NEXT: kxnorw %k0, %k0, %k1
2673 ; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
2676 ; SKX_32-LABEL: sext_v8i8_index:
2678 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2679 ; SKX_32-NEXT: vpmovsxbd %xmm0, %ymm1
2680 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2681 ; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
; The index is widened to i64 in the IR; the checks above show 32-bit indices.
2684 %sext_ind = sext <8 x i8> %ind to <8 x i64>
2685 %gep.random = getelementptr float, float *%base, <8 x i64> %sext_ind
; Constant all-true mask with an undef pass-through operand.
2687 %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %gep.random, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
; 8-lane float masked gather intrinsic, called by @sext_v8i8_index.
2690 declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)
2692 ; Index requires promotion
; Scatters a <2 x double> value through a <2 x i32> index under a variable
; <2 x i1> mask. The KNL checks expect the value and index registers to be
; treated as wider registers (the "kill" lines), the mask to be trimmed to its
; low two bits by the kshiftlw/kshiftrw $14 pair, and a zmm vscatterdpd. The
; SKX checks expect vpmovq2m for the mask and the xmm form of vscatterdpd.
2693 define void @test_scatter_2i32_index(<2 x double> %a1, double* %base, <2 x i32> %ind, <2 x i1> %mask) {
2694 ; KNL_64-LABEL: test_scatter_2i32_index:
2696 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
2697 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2698 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
2699 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
2700 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
2701 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
2702 ; KNL_64-NEXT: vscatterdpd %zmm0, (%rdi,%ymm1,8) {%k1}
2703 ; KNL_64-NEXT: vzeroupper
2706 ; KNL_32-LABEL: test_scatter_2i32_index:
2708 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
2709 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
2710 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
2711 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
2712 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
2713 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
2714 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2715 ; KNL_32-NEXT: vscatterdpd %zmm0, (%eax,%ymm1,8) {%k1}
2716 ; KNL_32-NEXT: vzeroupper
2719 ; SKX-LABEL: test_scatter_2i32_index:
2721 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
2722 ; SKX-NEXT: vpmovq2m %xmm2, %k1
2723 ; SKX-NEXT: vscatterdpd %xmm0, (%rdi,%xmm1,8) {%k1}
2726 ; SKX_32-LABEL: test_scatter_2i32_index:
2728 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
2729 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1
2730 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2731 ; SKX_32-NEXT: vscatterdpd %xmm0, (%eax,%xmm1,8) {%k1}
; The GEP is indexed directly by the <2 x i32> vector, with no explicit
; extension in the IR.
2733 %gep = getelementptr double, double *%base, <2 x i32> %ind
2734 call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask)
; 2-lane double masked scatter intrinsic, called by @test_scatter_2i32_index
; and @zero_mask.
2737 declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double>, <2 x double*>, i32, <2 x i1>)
; Gathers 16 floats using an index that is masked with 15 and then zero
; extended to i64. The checks expect the AND to be done on 32-bit elements with
; a broadcast constant ({1to16}) and the result to feed vgatherdps directly.
; The SKX checks are split into SKX_SMALL and SKX_LARGE because the constant is
; addressed differently: RIP-relative in the default run line, and through a
; movabsq of the constant-pool address in the -code-model=large run line.
2739 define <16 x float> @zext_index(float* %base, <16 x i32> %ind) {
2740 ; KNL_64-LABEL: zext_index:
2742 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
2743 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
2744 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2747 ; KNL_32-LABEL: zext_index:
2749 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2750 ; KNL_32-NEXT: vpandd {{\.LCPI.*}}{1to16}, %zmm0, %zmm1
2751 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
2752 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2755 ; SKX_SMALL-LABEL: zext_index:
2756 ; SKX_SMALL: # %bb.0:
2757 ; SKX_SMALL-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm1
2758 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
2759 ; SKX_SMALL-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2760 ; SKX_SMALL-NEXT: retq
2762 ; SKX_LARGE-LABEL: zext_index:
2763 ; SKX_LARGE: # %bb.0:
2764 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
2765 ; SKX_LARGE-NEXT: vandps (%rax){1to16}, %zmm0, %zmm1
2766 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
2767 ; SKX_LARGE-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2768 ; SKX_LARGE-NEXT: retq
2770 ; SKX_32-LABEL: zext_index:
2772 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2773 ; SKX_32-NEXT: vandps {{\.LCPI.*}}{1to16}, %zmm0, %zmm1
2774 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2775 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; Masking with 15 clears every bit above bit 3 of each index element.
2777 %ind_masked = and <16 x i32> %ind, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
; NOTE: despite its name, %sext_ind holds the result of a zext, not a sext.
2778 %sext_ind = zext <16 x i32> %ind_masked to <16 x i64>
2779 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
; Constant all-true mask with an undef pass-through operand.
2781 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
2782 ret <16 x float>%res
; Gathers <16 x double> through a sign-extended <16 x i32> index, masked by
; (%cmp == 0). The checks expect the operation to be emitted as two vgatherdpd
; instructions, each taking a ymm index and a zmm destination. The two masks
; come from vptestnmd on zmm1/ymm1 and on its upper half extracted with
; vextracti64x4; the second index half comes from vextractf64x4 on zmm0.
; The 32-bit checks additionally expect a frame aligned with andl $-64 and a
; load of zmm3 from 72(%ebp) -- presumably the second half of %passthru, which
; would be passed on the stack there (confirm against the calling convention).
2785 define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %passthru) {
2786 ; KNL_64-LABEL: test_gather_setcc_split:
2788 ; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm4
2789 ; KNL_64-NEXT: vptestnmd %zmm4, %zmm4, %k1
2790 ; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k2
2791 ; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
2792 ; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2793 ; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm3 {%k1}
2794 ; KNL_64-NEXT: vmovapd %zmm2, %zmm0
2795 ; KNL_64-NEXT: vmovapd %zmm3, %zmm1
2798 ; KNL_32-LABEL: test_gather_setcc_split:
2800 ; KNL_32-NEXT: pushl %ebp
2801 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2802 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2803 ; KNL_32-NEXT: movl %esp, %ebp
2804 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2805 ; KNL_32-NEXT: andl $-64, %esp
2806 ; KNL_32-NEXT: subl $64, %esp
2807 ; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3
2808 ; KNL_32-NEXT: movl 8(%ebp), %eax
2809 ; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
2810 ; KNL_32-NEXT: vptestnmd %zmm4, %zmm4, %k1
2811 ; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2
2812 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
2813 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2814 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm3 {%k1}
2815 ; KNL_32-NEXT: vmovapd %zmm2, %zmm0
2816 ; KNL_32-NEXT: vmovapd %zmm3, %zmm1
2817 ; KNL_32-NEXT: movl %ebp, %esp
2818 ; KNL_32-NEXT: popl %ebp
2819 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2822 ; SKX-LABEL: test_gather_setcc_split:
2824 ; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm4
2825 ; SKX-NEXT: vptestnmd %ymm4, %ymm4, %k1
2826 ; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2
2827 ; SKX-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
2828 ; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2829 ; SKX-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm3 {%k1}
2830 ; SKX-NEXT: vmovapd %zmm2, %zmm0
2831 ; SKX-NEXT: vmovapd %zmm3, %zmm1
2834 ; SKX_32-LABEL: test_gather_setcc_split:
2836 ; SKX_32-NEXT: pushl %ebp
2837 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2838 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2839 ; SKX_32-NEXT: movl %esp, %ebp
2840 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2841 ; SKX_32-NEXT: andl $-64, %esp
2842 ; SKX_32-NEXT: subl $64, %esp
2843 ; SKX_32-NEXT: vmovapd 72(%ebp), %zmm3
2844 ; SKX_32-NEXT: movl 8(%ebp), %eax
2845 ; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
2846 ; SKX_32-NEXT: vptestnmd %ymm4, %ymm4, %k1
2847 ; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2
2848 ; SKX_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
2849 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2850 ; SKX_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm3 {%k1}
2851 ; SKX_32-NEXT: vmovapd %zmm2, %zmm0
2852 ; SKX_32-NEXT: vmovapd %zmm3, %zmm1
2853 ; SKX_32-NEXT: movl %ebp, %esp
2854 ; SKX_32-NEXT: popl %ebp
2855 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2857 %sext_ind = sext <16 x i32> %ind to <16 x i64>
2858 %gep.random = getelementptr double, double *%base, <16 x i64> %sext_ind
; The mask is a vector compare (setcc) rather than a constant or an argument.
2860 %mask = icmp eq <16 x i32> %cmp, zeroinitializer
2861 %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %gep.random, i32 4, <16 x i1> %mask, <16 x double> %passthru)
2862 ret <16 x double>%res
; Scatter counterpart of @test_gather_setcc_split: stores <16 x double> %src0
; through a sign-extended <16 x i32> index, masked by (%cmp == 0). The checks
; expect two vscatterdpd instructions, each with a zmm source and a ymm index,
; using masks from vptestnmd on zmm1/ymm1 and on its extracted upper half.
; The 32-bit checks additionally expect a frame aligned with andl $-64 and a
; load of zmm3 from 72(%ebp) -- presumably the second half of %src0, which
; would be passed on the stack there (confirm against the calling convention).
2865 define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %src0) {
2866 ; KNL_64-LABEL: test_scatter_setcc_split:
2868 ; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm4
2869 ; KNL_64-NEXT: vptestnmd %zmm4, %zmm4, %k1
2870 ; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k2
2871 ; KNL_64-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
2872 ; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2873 ; KNL_64-NEXT: vscatterdpd %zmm3, (%rdi,%ymm0,8) {%k1}
2874 ; KNL_64-NEXT: vzeroupper
2877 ; KNL_32-LABEL: test_scatter_setcc_split:
2879 ; KNL_32-NEXT: pushl %ebp
2880 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2881 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2882 ; KNL_32-NEXT: movl %esp, %ebp
2883 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2884 ; KNL_32-NEXT: andl $-64, %esp
2885 ; KNL_32-NEXT: subl $64, %esp
2886 ; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3
2887 ; KNL_32-NEXT: movl 8(%ebp), %eax
2888 ; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
2889 ; KNL_32-NEXT: vptestnmd %zmm4, %zmm4, %k1
2890 ; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2
2891 ; KNL_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
2892 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2893 ; KNL_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
2894 ; KNL_32-NEXT: movl %ebp, %esp
2895 ; KNL_32-NEXT: popl %ebp
2896 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2897 ; KNL_32-NEXT: vzeroupper
2900 ; SKX-LABEL: test_scatter_setcc_split:
2902 ; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm4
2903 ; SKX-NEXT: vptestnmd %ymm4, %ymm4, %k1
2904 ; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2
2905 ; SKX-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
2906 ; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2907 ; SKX-NEXT: vscatterdpd %zmm3, (%rdi,%ymm0,8) {%k1}
2908 ; SKX-NEXT: vzeroupper
2911 ; SKX_32-LABEL: test_scatter_setcc_split:
2913 ; SKX_32-NEXT: pushl %ebp
2914 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2915 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2916 ; SKX_32-NEXT: movl %esp, %ebp
2917 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2918 ; SKX_32-NEXT: andl $-64, %esp
2919 ; SKX_32-NEXT: subl $64, %esp
2920 ; SKX_32-NEXT: vmovapd 72(%ebp), %zmm3
2921 ; SKX_32-NEXT: movl 8(%ebp), %eax
2922 ; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
2923 ; SKX_32-NEXT: vptestnmd %ymm4, %ymm4, %k1
2924 ; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2
2925 ; SKX_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
2926 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2927 ; SKX_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
2928 ; SKX_32-NEXT: movl %ebp, %esp
2929 ; SKX_32-NEXT: popl %ebp
2930 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2931 ; SKX_32-NEXT: vzeroupper
2933 %sext_ind = sext <16 x i32> %ind to <16 x i64>
2934 %gep.random = getelementptr double, double *%base, <16 x i64> %sext_ind
; The mask is a vector compare (setcc) rather than a constant or an argument.
2936 %mask = icmp eq <16 x i32> %cmp, zeroinitializer
2937 call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %gep.random, i32 4, <16 x i1> %mask)
2941 ; This test case previously triggered an infinite loop when the two gathers became identical after DAG combine removed the sign extend.
; Issues two all-true gathers over the same splatted base pointer: %res indexes
; with the i32 index sign extended to i64, %res2 indexes with the i32 index
; directly. %ind is also stored to %foo between the address computation and the
; gathers. The checks expect one store, a single vgatherdps, and the gathered
; value added to itself (vaddps %zmm1, %zmm1, %zmm0), i.e. the two gathers end
; up as one.
2942 define <16 x float> @test_sext_cse(float* %base, <16 x i32> %ind, <16 x i32>* %foo) {
2943 ; KNL_64-LABEL: test_sext_cse:
2945 ; KNL_64-NEXT: vmovaps %zmm0, (%rsi)
2946 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
2947 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
2948 ; KNL_64-NEXT: vaddps %zmm1, %zmm1, %zmm0
2951 ; KNL_32-LABEL: test_sext_cse:
2953 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2954 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
2955 ; KNL_32-NEXT: vmovaps %zmm0, (%ecx)
2956 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
2957 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
2958 ; KNL_32-NEXT: vaddps %zmm1, %zmm1, %zmm0
2961 ; SKX-LABEL: test_sext_cse:
2963 ; SKX-NEXT: vmovaps %zmm0, (%rsi)
2964 ; SKX-NEXT: kxnorw %k0, %k0, %k1
2965 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
2966 ; SKX-NEXT: vaddps %zmm1, %zmm1, %zmm0
2969 ; SKX_32-LABEL: test_sext_cse:
2971 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2972 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
2973 ; SKX_32-NEXT: vmovaps %zmm0, (%ecx)
2974 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2975 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
2976 ; SKX_32-NEXT: vaddps %zmm1, %zmm1, %zmm0
; Splat the scalar base pointer into all 16 lanes (insertelement + shuffle).
2978 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
2979 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
; First address vector: index explicitly sign extended to i64.
2981 %sext_ind = sext <16 x i32> %ind to <16 x i64>
2982 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
2984 store <16 x i32> %ind, <16 x i32>* %foo
2985 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
; Second address vector: same base, indexed by the unextended i32 vector.
2986 %gep.random2 = getelementptr float, <16 x float*> %broadcast.splat, <16 x i32> %ind
2987 %res2 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random2, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
2988 %res3 = fadd <16 x float> %res2, %res
2989 ret <16 x float>%res3
2992 define void @zero_mask(<2 x double>%a1, <2 x double*> %ptr) {
2993 ; ALL-LABEL: zero_mask:
2995 ; ALL-NEXT: ret{{[l|q]}}
2996 call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %ptr, i32 4, <2 x i1> zeroinitializer)