1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_64
3 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32
4 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_SMALL
5 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq -code-model=large < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_LARGE
6 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32
7 ; RUN: opt -mtriple=x86_64-apple-darwin -scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
8 ; RUN: opt -mtriple=x86_64-apple-darwin -passes=scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
9 ; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null
; 16-element constant array of Fibonacci numbers (1,1,2,3,5,...,987), 16-byte
; aligned; serves as read-only source data for gather tests in this file.
11 @glob_array = internal unnamed_addr constant [16 x i32] [i32 1, i32 1, i32 2, i32 3, i32 5, i32 8, i32 13, i32 21, i32 34, i32 55, i32 89, i32 144, i32 233, i32 377, i32 610, i32 987], align 16
14 ; SCALAR: extractelement <16 x float*>
15 ; SCALAR-NEXT: load float
16 ; SCALAR-NEXT: insertelement <16 x float>
17 ; SCALAR-NEXT: extractelement <16 x float*>
18 ; SCALAR-NEXT: load float
20 define <16 x float> @test1(float* %base, <16 x i32> %ind) {
21 ; KNL_64-LABEL: test1:
23 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
24 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
25 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
28 ; KNL_32-LABEL: test1:
30 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
31 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
32 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
33 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
38 ; SKX-NEXT: kxnorw %k0, %k0, %k1
39 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
40 ; SKX-NEXT: vmovaps %zmm1, %zmm0
43 ; SKX_32-LABEL: test1:
45 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
46 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
47 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
48 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
51 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
52 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
54 %sext_ind = sext <16 x i32> %ind to <16 x i64>
55 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
57 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
; Declarations of the masked-gather intrinsics exercised above (typed-pointer
; syntax): arguments are the pointer vector, the alignment (i32), the <N x i1>
; per-lane mask, and the passthru vector returned for masked-off lanes.
61 declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
62 declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
63 declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
67 ; SCALAR: extractelement <16 x float*>
68 ; SCALAR-NEXT: load float
69 ; SCALAR-NEXT: insertelement <16 x float>
70 ; SCALAR-NEXT: br label %else
72 ; SCALAR-NEXT: %res.phi.else = phi
73 ; SCALAR-NEXT: and i16 %{{.*}}, 2
74 ; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0
75 ; SCALAR-NEXT: br i1 %{{.*}}, label %cond.load1, label %else2
77 define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
78 ; KNL_64-LABEL: test2:
80 ; KNL_64-NEXT: kmovw %esi, %k1
81 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
82 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
85 ; KNL_32-LABEL: test2:
87 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
88 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
89 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
90 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
95 ; SKX-NEXT: kmovw %esi, %k1
96 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
97 ; SKX-NEXT: vmovaps %zmm1, %zmm0
100 ; SKX_32-LABEL: test2:
102 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
103 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
104 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
105 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
108 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
109 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
111 %sext_ind = sext <16 x i32> %ind to <16 x i64>
112 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
113 %imask = bitcast i16 %mask to <16 x i1>
114 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
115 ret <16 x float> %res
118 define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
119 ; KNL_64-LABEL: test3:
121 ; KNL_64-NEXT: kmovw %esi, %k1
122 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
123 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0
126 ; KNL_32-LABEL: test3:
128 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
129 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
130 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
131 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
136 ; SKX-NEXT: kmovw %esi, %k1
137 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
138 ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
141 ; SKX_32-LABEL: test3:
143 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
144 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
145 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
146 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
149 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
150 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
152 %sext_ind = sext <16 x i32> %ind to <16 x i64>
153 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind
154 %imask = bitcast i16 %mask to <16 x i1>
155 %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
160 define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
161 ; KNL_64-LABEL: test4:
163 ; KNL_64-NEXT: kmovw %esi, %k1
164 ; KNL_64-NEXT: kmovw %k1, %k2
165 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
166 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
167 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
168 ; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0
171 ; KNL_32-LABEL: test4:
173 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
174 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
175 ; KNL_32-NEXT: kmovw %k1, %k2
176 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
177 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
178 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
179 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
184 ; SKX-NEXT: kmovw %esi, %k1
185 ; SKX-NEXT: kmovw %k1, %k2
186 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
187 ; SKX-NEXT: vmovdqa64 %zmm1, %zmm2
188 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
189 ; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0
192 ; SKX_32-LABEL: test4:
194 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
195 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
196 ; SKX_32-NEXT: kmovw %k1, %k2
197 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
198 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
199 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
200 ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
203 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
204 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
206 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
207 %imask = bitcast i16 %mask to <16 x i1>
208 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
209 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
210 %res = add <16 x i32> %gt1, %gt2
215 ; SCALAR-LABEL: test5
216 ; SCALAR: and i16 %scalar_mask, 1
217 ; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0
218 ; SCALAR-NEXT: br i1 %{{.*}}, label %cond.store, label %else
219 ; SCALAR: cond.store:
220 ; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> %val, i64 0
221 ; SCALAR-NEXT: %Ptr0 = extractelement <16 x i32*> %gep.random, i64 0
222 ; SCALAR-NEXT: store i32 %Elt0, i32* %Ptr0, align 4
223 ; SCALAR-NEXT: br label %else
225 ; SCALAR-NEXT: and i16 %scalar_mask, 2
226 ; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0
227 ; SCALAR-NEXT: br i1 %{{.*}}, label %cond.store1, label %else2
229 define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
230 ; KNL_64-LABEL: test5:
232 ; KNL_64-NEXT: kmovw %esi, %k1
233 ; KNL_64-NEXT: kmovw %k1, %k2
234 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
235 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
236 ; KNL_64-NEXT: vzeroupper
239 ; KNL_32-LABEL: test5:
241 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
242 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
243 ; KNL_32-NEXT: kmovw %k1, %k2
244 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
245 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
246 ; KNL_32-NEXT: vzeroupper
251 ; SKX-NEXT: kmovw %esi, %k1
252 ; SKX-NEXT: kmovw %k1, %k2
253 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
254 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
255 ; SKX-NEXT: vzeroupper
258 ; SKX_32-LABEL: test5:
260 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
261 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
262 ; SKX_32-NEXT: kmovw %k1, %k2
263 ; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
264 ; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
265 ; SKX_32-NEXT: vzeroupper
268 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
269 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
271 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
272 %imask = bitcast i16 %mask to <16 x i1>
273 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
274 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
; Declarations of the masked-scatter intrinsics used above: arguments are the
; value vector to store, the pointer vector, the alignment (i32), and the
; <N x i1> per-lane mask.
278 declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
279 declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )
282 ; SCALAR-LABEL: test6
283 ; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4
284 ; SCALAR-NEXT: %Elt1 = extractelement <8 x i32> %a1, i64 1
285 ; SCALAR-NEXT: %Ptr12 = extractelement <8 x i32*> %ptr, i64 1
286 ; SCALAR-NEXT: store i32 %Elt1, i32* %Ptr12, align 4
287 ; SCALAR-NEXT: %Elt2 = extractelement <8 x i32> %a1, i64 2
288 ; SCALAR-NEXT: %Ptr23 = extractelement <8 x i32*> %ptr, i64 2
289 ; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4
291 define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
292 ; KNL_64-LABEL: test6:
294 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
295 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
296 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
297 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
298 ; KNL_64-NEXT: vmovdqa %ymm2, %ymm0
301 ; KNL_32-LABEL: test6:
303 ; KNL_32-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
304 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
305 ; KNL_32-NEXT: movw $255, %ax
306 ; KNL_32-NEXT: kmovw %eax, %k1
307 ; KNL_32-NEXT: kmovw %k1, %k2
308 ; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm2 {%k2}
309 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
310 ; KNL_32-NEXT: vmovdqa %ymm2, %ymm0
315 ; SKX-NEXT: kxnorw %k0, %k0, %k1
316 ; SKX-NEXT: kxnorw %k0, %k0, %k2
317 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
318 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
319 ; SKX-NEXT: vmovdqa %ymm2, %ymm0
322 ; SKX_32-LABEL: test6:
324 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
325 ; SKX_32-NEXT: kxnorw %k0, %k0, %k2
326 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2}
327 ; SKX_32-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1}
328 ; SKX_32-NEXT: vmovdqa %ymm2, %ymm0
331 %a = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
333 call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
337 define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
339 ; KNL_64-LABEL: test7:
341 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
342 ; KNL_64-NEXT: kmovw %esi, %k0
343 ; KNL_64-NEXT: kshiftlw $8, %k0, %k0
344 ; KNL_64-NEXT: kshiftrw $8, %k0, %k1
345 ; KNL_64-NEXT: kmovw %k1, %k2
346 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
347 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
348 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
349 ; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0
352 ; KNL_32-LABEL: test7:
354 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
355 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
356 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
357 ; KNL_32-NEXT: kmovw %ecx, %k0
358 ; KNL_32-NEXT: kshiftlw $8, %k0, %k0
359 ; KNL_32-NEXT: kshiftrw $8, %k0, %k1
360 ; KNL_32-NEXT: kmovw %k1, %k2
361 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
362 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
363 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
364 ; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
369 ; SKX-NEXT: kmovw %esi, %k1
370 ; SKX-NEXT: kmovw %k1, %k2
371 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
372 ; SKX-NEXT: vmovdqa %ymm1, %ymm2
373 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
374 ; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0
377 ; SKX_32-LABEL: test7:
379 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
380 ; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
381 ; SKX_32-NEXT: kmovw %k1, %k2
382 ; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm1 {%k2}
383 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm2
384 ; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm2 {%k1}
385 ; SKX_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
388 %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
389 %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer
391 %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind
392 %imask = bitcast i8 %mask to <8 x i1>
393 %gt1 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
394 %gt2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
395 %res = add <8 x i32> %gt1, %gt2
399 ; No uniform base in this case; the <16 x i32*> pointer operand already contains
400 ; full addresses, and on 64-bit targets each gather call will be split into two
401 define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
402 ; KNL_64-LABEL: test8:
404 ; KNL_64-NEXT: kmovw %edi, %k1
405 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
406 ; KNL_64-NEXT: kmovw %k2, %k3
407 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
408 ; KNL_64-NEXT: kmovw %k1, %k3
409 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
410 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
411 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
412 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
413 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
414 ; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0
417 ; KNL_32-LABEL: test8:
419 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
420 ; KNL_32-NEXT: kmovw %k1, %k2
421 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
422 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
423 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
424 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
429 ; SKX-NEXT: kmovw %edi, %k1
430 ; SKX-NEXT: kshiftrw $8, %k1, %k2
431 ; SKX-NEXT: kmovw %k2, %k3
432 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
433 ; SKX-NEXT: kmovw %k1, %k3
434 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
435 ; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
436 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
437 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
438 ; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
439 ; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
442 ; SKX_32-LABEL: test8:
444 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
445 ; SKX_32-NEXT: kmovw %k1, %k2
446 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
447 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
448 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
449 ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
452 %imask = bitcast i16 %mask to <16 x i1>
453 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
454 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
455 %res = add <16 x i32> %gt1, %gt2
; Aggregate types for the multi-index GEP gather tests (test9/test10):
; %struct.RT nests a 10x20 i32 matrix between two i8 fields, and %struct.ST
; embeds an RT after an i32 and a double, giving the GEPs several levels of
; struct/array indexing to fold into the gather address computation.
459 %struct.RT = type { i8, [10 x [20 x i32]], i8 }
460 %struct.ST = type { i32, double, %struct.RT }
462 ; Masked gather for aggregate types
463 ; Test9 and Test10 should give the same result (scalar and vector indices in GEP)
466 define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
467 ; KNL_64-LABEL: test9:
468 ; KNL_64: # %bb.0: # %entry
469 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
470 ; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
471 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
472 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
473 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
474 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
475 ; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
476 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
477 ; KNL_64-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
478 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
479 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
480 ; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
481 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
482 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
485 ; KNL_32-LABEL: test9:
486 ; KNL_32: # %bb.0: # %entry
487 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
488 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
489 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
490 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
491 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
492 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
493 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
494 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
495 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
496 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1
497 ; KNL_32-NEXT: movw $255, %ax
498 ; KNL_32-NEXT: kmovw %eax, %k1
499 ; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm0 {%k1}
500 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
503 ; SKX_SMALL-LABEL: test9:
504 ; SKX_SMALL: # %bb.0: # %entry
505 ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
506 ; SKX_SMALL-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
507 ; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
508 ; SKX_SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
509 ; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
510 ; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
511 ; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
512 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
513 ; SKX_SMALL-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
514 ; SKX_SMALL-NEXT: retq
516 ; SKX_LARGE-LABEL: test9:
517 ; SKX_LARGE: # %bb.0: # %entry
518 ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2
519 ; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
520 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
521 ; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1
522 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
523 ; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
524 ; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
525 ; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
526 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
527 ; SKX_LARGE-NEXT: vpaddq (%rax){1to8}, %zmm0, %zmm1
528 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
529 ; SKX_LARGE-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
530 ; SKX_LARGE-NEXT: retq
532 ; SKX_32-LABEL: test9:
533 ; SKX_32: # %bb.0: # %entry
534 ; SKX_32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm1, %ymm1
535 ; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
536 ; SKX_32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
537 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
538 ; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
539 ; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm1
540 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
541 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1}
544 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
545 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
547 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
548 %res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
552 define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
553 ; KNL_64-LABEL: test10:
554 ; KNL_64: # %bb.0: # %entry
555 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
556 ; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
557 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
558 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
559 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
560 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
561 ; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
562 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
563 ; KNL_64-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
564 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
565 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
566 ; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
567 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
568 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
571 ; KNL_32-LABEL: test10:
572 ; KNL_32: # %bb.0: # %entry
573 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
574 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
575 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
576 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
577 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
578 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
579 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
580 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
581 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
582 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1
583 ; KNL_32-NEXT: movw $255, %ax
584 ; KNL_32-NEXT: kmovw %eax, %k1
585 ; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm0 {%k1}
586 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
589 ; SKX_SMALL-LABEL: test10:
590 ; SKX_SMALL: # %bb.0: # %entry
591 ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
592 ; SKX_SMALL-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
593 ; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
594 ; SKX_SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
595 ; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
596 ; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
597 ; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
598 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
599 ; SKX_SMALL-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
600 ; SKX_SMALL-NEXT: retq
602 ; SKX_LARGE-LABEL: test10:
603 ; SKX_LARGE: # %bb.0: # %entry
604 ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2
605 ; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
606 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
607 ; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1
608 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
609 ; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
610 ; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
611 ; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
612 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
613 ; SKX_LARGE-NEXT: vpaddq (%rax){1to8}, %zmm0, %zmm1
614 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
615 ; SKX_LARGE-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
616 ; SKX_LARGE-NEXT: retq
618 ; SKX_32-LABEL: test10:
619 ; SKX_32: # %bb.0: # %entry
620 ; SKX_32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm1, %ymm1
621 ; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
622 ; SKX_32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
623 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
624 ; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
625 ; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm1
626 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
627 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1}
630 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
631 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
633 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
634 %res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
638 ; Splat index in GEP, requires broadcast
639 define <16 x float> @test11(float* %base, i32 %ind) {
640 ; KNL_64-LABEL: test11:
642 ; KNL_64-NEXT: movslq %esi, %rax
643 ; KNL_64-NEXT: leaq (%rdi,%rax,4), %rax
644 ; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
645 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
646 ; KNL_64-NEXT: vgatherdps (%rax,%zmm1,4), %zmm0 {%k1}
649 ; KNL_32-LABEL: test11:
651 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
652 ; KNL_32-NEXT: shll $2, %eax
653 ; KNL_32-NEXT: addl {{[0-9]+}}(%esp), %eax
654 ; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
655 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
656 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
661 ; SKX-NEXT: movslq %esi, %rax
662 ; SKX-NEXT: leaq (%rdi,%rax,4), %rax
663 ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
664 ; SKX-NEXT: kxnorw %k0, %k0, %k1
665 ; SKX-NEXT: vgatherdps (%rax,%zmm1,4), %zmm0 {%k1}
668 ; SKX_32-LABEL: test11:
670 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
671 ; SKX_32-NEXT: shll $2, %eax
672 ; SKX_32-NEXT: addl {{[0-9]+}}(%esp), %eax
673 ; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
674 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
675 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
678 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
679 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
681 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
683 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
687 ; We are checking the uniform base here. It is taken directly from input to vgatherdps
688 define <16 x float> @test12(float* %base, <16 x i32> %ind) {
689 ; KNL_64-LABEL: test12:
691 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
692 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
693 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
696 ; KNL_32-LABEL: test12:
698 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
699 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
700 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
701 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
706 ; SKX-NEXT: kxnorw %k0, %k0, %k1
707 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
708 ; SKX-NEXT: vmovaps %zmm1, %zmm0
711 ; SKX_32-LABEL: test12:
713 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
714 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
715 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
716 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
719 %sext_ind = sext <16 x i32> %ind to <16 x i64>
720 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
722 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
726 ; The same as the previous, but the mask is undefined
727 define <16 x float> @test13(float* %base, <16 x i32> %ind) {
728 ; KNL_64-LABEL: test13:
730 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
731 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
732 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
735 ; KNL_32-LABEL: test13:
737 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
738 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
739 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
740 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
745 ; SKX-NEXT: kxnorw %k0, %k0, %k1
746 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
747 ; SKX-NEXT: vmovaps %zmm1, %zmm0
750 ; SKX_32-LABEL: test13:
752 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
753 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
754 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
755 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
758 %sext_ind = sext <16 x i32> %ind to <16 x i64>
759 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
761 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
765 ; The base pointer is not splat, can't find uniform base
766 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
767 ; KNL_64-LABEL: test14:
769 ; KNL_64-NEXT: vmovq %xmm0, %rax
770 ; KNL_64-NEXT: vmovd %esi, %xmm0
771 ; KNL_64-NEXT: vpbroadcastd %xmm0, %ymm0
772 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
773 ; KNL_64-NEXT: vpsllq $2, %zmm0, %zmm0
774 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
775 ; KNL_64-NEXT: vgatherqps (%rax,%zmm0), %ymm1 {%k1}
776 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
779 ; KNL_32-LABEL: test14:
781 ; KNL_32-NEXT: vmovd %xmm0, %eax
782 ; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
783 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
784 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1), %zmm0 {%k1}
789 ; SKX-NEXT: vmovq %xmm0, %rax
790 ; SKX-NEXT: vpbroadcastd %esi, %ymm0
791 ; SKX-NEXT: vpmovsxdq %ymm0, %zmm0
792 ; SKX-NEXT: vpsllq $2, %zmm0, %zmm0
793 ; SKX-NEXT: kxnorw %k0, %k0, %k1
794 ; SKX-NEXT: vgatherqps (%rax,%zmm0), %ymm1 {%k1}
795 ; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
798 ; SKX_32-LABEL: test14:
800 ; SKX_32-NEXT: vmovd %xmm0, %eax
801 ; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
802 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
803 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1), %zmm0 {%k1}
806 %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
807 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
809 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
811 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
815 declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
816 declare <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
817 declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
819 ; Gather smaller than existing instruction
820 define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
821 ; KNL_64-LABEL: test15:
823 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
824 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
825 ; KNL_64-NEXT: vpmovsxdq %xmm0, %ymm0
826 ; KNL_64-NEXT: vpsllq $2, %ymm0, %ymm0
827 ; KNL_64-NEXT: vmovq %rdi, %xmm1
828 ; KNL_64-NEXT: vpbroadcastq %xmm1, %ymm1
829 ; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm1
830 ; KNL_64-NEXT: kmovw %k0, %eax
831 ; KNL_64-NEXT: testb $1, %al
832 ; KNL_64-NEXT: # implicit-def: $xmm0
833 ; KNL_64-NEXT: je .LBB14_2
834 ; KNL_64-NEXT: # %bb.1: # %cond.load
835 ; KNL_64-NEXT: vmovq %xmm1, %rcx
836 ; KNL_64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
837 ; KNL_64-NEXT: .LBB14_2: # %else
838 ; KNL_64-NEXT: testb $2, %al
839 ; KNL_64-NEXT: je .LBB14_4
840 ; KNL_64-NEXT: # %bb.3: # %cond.load1
841 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx
842 ; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
843 ; KNL_64-NEXT: .LBB14_4: # %else2
844 ; KNL_64-NEXT: testb $4, %al
845 ; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1
846 ; KNL_64-NEXT: jne .LBB14_5
847 ; KNL_64-NEXT: # %bb.6: # %else5
848 ; KNL_64-NEXT: testb $8, %al
849 ; KNL_64-NEXT: jne .LBB14_7
850 ; KNL_64-NEXT: .LBB14_8: # %else8
851 ; KNL_64-NEXT: vzeroupper
853 ; KNL_64-NEXT: .LBB14_5: # %cond.load4
854 ; KNL_64-NEXT: vmovq %xmm1, %rcx
855 ; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
856 ; KNL_64-NEXT: testb $8, %al
857 ; KNL_64-NEXT: je .LBB14_8
858 ; KNL_64-NEXT: .LBB14_7: # %cond.load7
859 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
860 ; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
861 ; KNL_64-NEXT: vzeroupper
864 ; KNL_32-LABEL: test15:
866 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
867 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
868 ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
869 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
870 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm1
871 ; KNL_32-NEXT: kmovw %k0, %eax
872 ; KNL_32-NEXT: testb $1, %al
873 ; KNL_32-NEXT: # implicit-def: $xmm0
874 ; KNL_32-NEXT: jne .LBB14_1
875 ; KNL_32-NEXT: # %bb.2: # %else
876 ; KNL_32-NEXT: testb $2, %al
877 ; KNL_32-NEXT: jne .LBB14_3
878 ; KNL_32-NEXT: .LBB14_4: # %else2
879 ; KNL_32-NEXT: testb $4, %al
880 ; KNL_32-NEXT: jne .LBB14_5
881 ; KNL_32-NEXT: .LBB14_6: # %else5
882 ; KNL_32-NEXT: testb $8, %al
883 ; KNL_32-NEXT: jne .LBB14_7
884 ; KNL_32-NEXT: .LBB14_8: # %else8
885 ; KNL_32-NEXT: vzeroupper
887 ; KNL_32-NEXT: .LBB14_1: # %cond.load
888 ; KNL_32-NEXT: vmovd %xmm1, %ecx
889 ; KNL_32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
890 ; KNL_32-NEXT: testb $2, %al
891 ; KNL_32-NEXT: je .LBB14_4
892 ; KNL_32-NEXT: .LBB14_3: # %cond.load1
893 ; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
894 ; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
895 ; KNL_32-NEXT: testb $4, %al
896 ; KNL_32-NEXT: je .LBB14_6
897 ; KNL_32-NEXT: .LBB14_5: # %cond.load4
898 ; KNL_32-NEXT: vpextrd $2, %xmm1, %ecx
899 ; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
900 ; KNL_32-NEXT: testb $8, %al
901 ; KNL_32-NEXT: je .LBB14_8
902 ; KNL_32-NEXT: .LBB14_7: # %cond.load7
903 ; KNL_32-NEXT: vpextrd $3, %xmm1, %eax
904 ; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
905 ; KNL_32-NEXT: vzeroupper
910 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
911 ; SKX-NEXT: vpmovd2m %xmm1, %k1
912 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
913 ; SKX-NEXT: vmovaps %xmm1, %xmm0
916 ; SKX_32-LABEL: test15:
918 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
919 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
920 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
921 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
922 ; SKX_32-NEXT: vmovaps %xmm1, %xmm0
925 %sext_ind = sext <4 x i32> %ind to <4 x i64>
926 %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
927 %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
931 ; Gather smaller than existing instruction
932 define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
933 ; KNL_64-LABEL: test16:
935 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
936 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
937 ; KNL_64-NEXT: vpmovsxdq %xmm0, %ymm0
938 ; KNL_64-NEXT: vpsllq $3, %ymm0, %ymm0
939 ; KNL_64-NEXT: vmovq %rdi, %xmm1
940 ; KNL_64-NEXT: vpbroadcastq %xmm1, %ymm1
941 ; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
942 ; KNL_64-NEXT: kmovw %k0, %eax
943 ; KNL_64-NEXT: testb $1, %al
944 ; KNL_64-NEXT: je .LBB15_2
945 ; KNL_64-NEXT: # %bb.1: # %cond.load
946 ; KNL_64-NEXT: vmovq %xmm0, %rcx
947 ; KNL_64-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
948 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7]
949 ; KNL_64-NEXT: .LBB15_2: # %else
950 ; KNL_64-NEXT: testb $2, %al
951 ; KNL_64-NEXT: je .LBB15_4
952 ; KNL_64-NEXT: # %bb.3: # %cond.load1
953 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
954 ; KNL_64-NEXT: vmovhps {{.*#+}} xmm1 = xmm2[0,1],mem[0,1]
955 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
956 ; KNL_64-NEXT: .LBB15_4: # %else2
957 ; KNL_64-NEXT: testb $4, %al
958 ; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
959 ; KNL_64-NEXT: jne .LBB15_5
960 ; KNL_64-NEXT: # %bb.6: # %else5
961 ; KNL_64-NEXT: testb $8, %al
962 ; KNL_64-NEXT: jne .LBB15_7
963 ; KNL_64-NEXT: .LBB15_8: # %else8
964 ; KNL_64-NEXT: vmovdqa %ymm2, %ymm0
966 ; KNL_64-NEXT: .LBB15_5: # %cond.load4
967 ; KNL_64-NEXT: vmovq %xmm0, %rcx
968 ; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm1
969 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
970 ; KNL_64-NEXT: testb $8, %al
971 ; KNL_64-NEXT: je .LBB15_8
972 ; KNL_64-NEXT: .LBB15_7: # %cond.load7
973 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
974 ; KNL_64-NEXT: vpbroadcastq (%rax), %ymm0
975 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7]
976 ; KNL_64-NEXT: vmovdqa %ymm2, %ymm0
979 ; KNL_32-LABEL: test16:
981 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
982 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
983 ; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0
984 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
985 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
986 ; KNL_32-NEXT: kmovw %k0, %eax
987 ; KNL_32-NEXT: testb $1, %al
988 ; KNL_32-NEXT: jne .LBB15_1
989 ; KNL_32-NEXT: # %bb.2: # %else
990 ; KNL_32-NEXT: testb $2, %al
991 ; KNL_32-NEXT: jne .LBB15_3
992 ; KNL_32-NEXT: .LBB15_4: # %else2
993 ; KNL_32-NEXT: testb $4, %al
994 ; KNL_32-NEXT: jne .LBB15_5
995 ; KNL_32-NEXT: .LBB15_6: # %else5
996 ; KNL_32-NEXT: testb $8, %al
997 ; KNL_32-NEXT: jne .LBB15_7
998 ; KNL_32-NEXT: .LBB15_8: # %else8
999 ; KNL_32-NEXT: vmovdqa %ymm2, %ymm0
1001 ; KNL_32-NEXT: .LBB15_1: # %cond.load
1002 ; KNL_32-NEXT: vmovd %xmm0, %ecx
1003 ; KNL_32-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1004 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1005 ; KNL_32-NEXT: testb $2, %al
1006 ; KNL_32-NEXT: je .LBB15_4
1007 ; KNL_32-NEXT: .LBB15_3: # %cond.load1
1008 ; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
1009 ; KNL_32-NEXT: vmovhps {{.*#+}} xmm1 = xmm2[0,1],mem[0,1]
1010 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
1011 ; KNL_32-NEXT: testb $4, %al
1012 ; KNL_32-NEXT: je .LBB15_6
1013 ; KNL_32-NEXT: .LBB15_5: # %cond.load4
1014 ; KNL_32-NEXT: vpextrd $2, %xmm0, %ecx
1015 ; KNL_32-NEXT: vpbroadcastq (%ecx), %ymm1
1016 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
1017 ; KNL_32-NEXT: testb $8, %al
1018 ; KNL_32-NEXT: je .LBB15_8
1019 ; KNL_32-NEXT: .LBB15_7: # %cond.load7
1020 ; KNL_32-NEXT: vpextrd $3, %xmm0, %eax
1021 ; KNL_32-NEXT: vpbroadcastq (%eax), %ymm0
1022 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7]
1023 ; KNL_32-NEXT: vmovdqa %ymm2, %ymm0
1026 ; SKX-LABEL: test16:
1028 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
1029 ; SKX-NEXT: vpmovd2m %xmm1, %k1
1030 ; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
1031 ; SKX-NEXT: vmovapd %ymm2, %ymm0
1034 ; SKX_32-LABEL: test16:
1036 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
1037 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
1038 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1039 ; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1}
1040 ; SKX_32-NEXT: vmovapd %ymm2, %ymm0
1043 %sext_ind = sext <4 x i32> %ind to <4 x i64>
1044 %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
1045 %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
1046 ret <4 x double>%res
1049 define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
1050 ; KNL_64-LABEL: test17:
1052 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1053 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1054 ; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0
1055 ; KNL_64-NEXT: vpsllq $3, %xmm0, %xmm0
1056 ; KNL_64-NEXT: vmovq %rdi, %xmm1
1057 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
1058 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1059 ; KNL_64-NEXT: kmovw %k0, %eax
1060 ; KNL_64-NEXT: testb $1, %al
1061 ; KNL_64-NEXT: jne .LBB16_1
1062 ; KNL_64-NEXT: # %bb.2: # %else
1063 ; KNL_64-NEXT: testb $2, %al
1064 ; KNL_64-NEXT: jne .LBB16_3
1065 ; KNL_64-NEXT: .LBB16_4: # %else2
1066 ; KNL_64-NEXT: vmovaps %xmm2, %xmm0
1067 ; KNL_64-NEXT: vzeroupper
1069 ; KNL_64-NEXT: .LBB16_1: # %cond.load
1070 ; KNL_64-NEXT: vmovq %xmm0, %rcx
1071 ; KNL_64-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
1072 ; KNL_64-NEXT: testb $2, %al
1073 ; KNL_64-NEXT: je .LBB16_4
1074 ; KNL_64-NEXT: .LBB16_3: # %cond.load1
1075 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
1076 ; KNL_64-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
1077 ; KNL_64-NEXT: vmovaps %xmm2, %xmm0
1078 ; KNL_64-NEXT: vzeroupper
1081 ; KNL_32-LABEL: test17:
1083 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1084 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1085 ; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0
1086 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
1087 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1088 ; KNL_32-NEXT: kmovw %k0, %eax
1089 ; KNL_32-NEXT: testb $1, %al
1090 ; KNL_32-NEXT: jne .LBB16_1
1091 ; KNL_32-NEXT: # %bb.2: # %else
1092 ; KNL_32-NEXT: testb $2, %al
1093 ; KNL_32-NEXT: jne .LBB16_3
1094 ; KNL_32-NEXT: .LBB16_4: # %else2
1095 ; KNL_32-NEXT: vmovaps %xmm2, %xmm0
1096 ; KNL_32-NEXT: vzeroupper
1098 ; KNL_32-NEXT: .LBB16_1: # %cond.load
1099 ; KNL_32-NEXT: vmovd %xmm0, %ecx
1100 ; KNL_32-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
1101 ; KNL_32-NEXT: testb $2, %al
1102 ; KNL_32-NEXT: je .LBB16_4
1103 ; KNL_32-NEXT: .LBB16_3: # %cond.load1
1104 ; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
1105 ; KNL_32-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
1106 ; KNL_32-NEXT: vmovaps %xmm2, %xmm0
1107 ; KNL_32-NEXT: vzeroupper
1110 ; SKX-LABEL: test17:
1112 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1113 ; SKX-NEXT: vpmovq2m %xmm1, %k0
1114 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
1115 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1
1116 ; SKX-NEXT: vpsllq $3, %xmm0, %xmm0
1117 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1118 ; SKX-NEXT: kmovw %k0, %eax
1119 ; SKX-NEXT: testb $1, %al
1120 ; SKX-NEXT: jne .LBB16_1
1121 ; SKX-NEXT: # %bb.2: # %else
1122 ; SKX-NEXT: testb $2, %al
1123 ; SKX-NEXT: jne .LBB16_3
1124 ; SKX-NEXT: .LBB16_4: # %else2
1125 ; SKX-NEXT: vmovaps %xmm2, %xmm0
1127 ; SKX-NEXT: .LBB16_1: # %cond.load
1128 ; SKX-NEXT: vmovq %xmm0, %rcx
1129 ; SKX-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
1130 ; SKX-NEXT: testb $2, %al
1131 ; SKX-NEXT: je .LBB16_4
1132 ; SKX-NEXT: .LBB16_3: # %cond.load1
1133 ; SKX-NEXT: vpextrq $1, %xmm0, %rax
1134 ; SKX-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
1135 ; SKX-NEXT: vmovaps %xmm2, %xmm0
1138 ; SKX_32-LABEL: test17:
1140 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1141 ; SKX_32-NEXT: vpmovq2m %xmm1, %k0
1142 ; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0
1143 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
1144 ; SKX_32-NEXT: kmovw %k0, %eax
1145 ; SKX_32-NEXT: testb $1, %al
1146 ; SKX_32-NEXT: jne .LBB16_1
1147 ; SKX_32-NEXT: # %bb.2: # %else
1148 ; SKX_32-NEXT: testb $2, %al
1149 ; SKX_32-NEXT: jne .LBB16_3
1150 ; SKX_32-NEXT: .LBB16_4: # %else2
1151 ; SKX_32-NEXT: vmovaps %xmm2, %xmm0
1153 ; SKX_32-NEXT: .LBB16_1: # %cond.load
1154 ; SKX_32-NEXT: vmovd %xmm0, %ecx
1155 ; SKX_32-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
1156 ; SKX_32-NEXT: testb $2, %al
1157 ; SKX_32-NEXT: je .LBB16_4
1158 ; SKX_32-NEXT: .LBB16_3: # %cond.load1
1159 ; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
1160 ; SKX_32-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
1161 ; SKX_32-NEXT: vmovaps %xmm2, %xmm0
1164 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1165 %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
1166 %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
1167 ret <2 x double>%res
1170 declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
1171 declare void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
1172 declare void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
1173 declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
1174 declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
1176 define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
1177 ; KNL_64-LABEL: test18:
1179 ; KNL_64-NEXT: vpslld $31, %xmm2, %xmm2
1180 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k0
1181 ; KNL_64-NEXT: kmovw %k0, %eax
1182 ; KNL_64-NEXT: testb $1, %al
1183 ; KNL_64-NEXT: je .LBB17_2
1184 ; KNL_64-NEXT: # %bb.1: # %cond.store
1185 ; KNL_64-NEXT: vmovq %xmm1, %rcx
1186 ; KNL_64-NEXT: vmovss %xmm0, (%rcx)
1187 ; KNL_64-NEXT: .LBB17_2: # %else
1188 ; KNL_64-NEXT: testb $2, %al
1189 ; KNL_64-NEXT: je .LBB17_4
1190 ; KNL_64-NEXT: # %bb.3: # %cond.store1
1191 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx
1192 ; KNL_64-NEXT: vextractps $1, %xmm0, (%rcx)
1193 ; KNL_64-NEXT: .LBB17_4: # %else2
1194 ; KNL_64-NEXT: testb $4, %al
1195 ; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1
1196 ; KNL_64-NEXT: jne .LBB17_5
1197 ; KNL_64-NEXT: # %bb.6: # %else4
1198 ; KNL_64-NEXT: testb $8, %al
1199 ; KNL_64-NEXT: jne .LBB17_7
1200 ; KNL_64-NEXT: .LBB17_8: # %else6
1201 ; KNL_64-NEXT: vzeroupper
1203 ; KNL_64-NEXT: .LBB17_5: # %cond.store3
1204 ; KNL_64-NEXT: vmovq %xmm1, %rcx
1205 ; KNL_64-NEXT: vextractps $2, %xmm0, (%rcx)
1206 ; KNL_64-NEXT: testb $8, %al
1207 ; KNL_64-NEXT: je .LBB17_8
1208 ; KNL_64-NEXT: .LBB17_7: # %cond.store5
1209 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
1210 ; KNL_64-NEXT: vextractps $3, %xmm0, (%rax)
1211 ; KNL_64-NEXT: vzeroupper
1214 ; KNL_32-LABEL: test18:
1216 ; KNL_32-NEXT: vpslld $31, %xmm2, %xmm2
1217 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k0
1218 ; KNL_32-NEXT: kmovw %k0, %eax
1219 ; KNL_32-NEXT: testb $1, %al
1220 ; KNL_32-NEXT: jne .LBB17_1
1221 ; KNL_32-NEXT: # %bb.2: # %else
1222 ; KNL_32-NEXT: testb $2, %al
1223 ; KNL_32-NEXT: jne .LBB17_3
1224 ; KNL_32-NEXT: .LBB17_4: # %else2
1225 ; KNL_32-NEXT: testb $4, %al
1226 ; KNL_32-NEXT: jne .LBB17_5
1227 ; KNL_32-NEXT: .LBB17_6: # %else4
1228 ; KNL_32-NEXT: testb $8, %al
1229 ; KNL_32-NEXT: jne .LBB17_7
1230 ; KNL_32-NEXT: .LBB17_8: # %else6
1231 ; KNL_32-NEXT: vzeroupper
1233 ; KNL_32-NEXT: .LBB17_1: # %cond.store
1234 ; KNL_32-NEXT: vmovd %xmm1, %ecx
1235 ; KNL_32-NEXT: vmovss %xmm0, (%ecx)
1236 ; KNL_32-NEXT: testb $2, %al
1237 ; KNL_32-NEXT: je .LBB17_4
1238 ; KNL_32-NEXT: .LBB17_3: # %cond.store1
1239 ; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
1240 ; KNL_32-NEXT: vextractps $1, %xmm0, (%ecx)
1241 ; KNL_32-NEXT: testb $4, %al
1242 ; KNL_32-NEXT: je .LBB17_6
1243 ; KNL_32-NEXT: .LBB17_5: # %cond.store3
1244 ; KNL_32-NEXT: vpextrd $2, %xmm1, %ecx
1245 ; KNL_32-NEXT: vextractps $2, %xmm0, (%ecx)
1246 ; KNL_32-NEXT: testb $8, %al
1247 ; KNL_32-NEXT: je .LBB17_8
1248 ; KNL_32-NEXT: .LBB17_7: # %cond.store5
1249 ; KNL_32-NEXT: vpextrd $3, %xmm1, %eax
1250 ; KNL_32-NEXT: vextractps $3, %xmm0, (%eax)
1251 ; KNL_32-NEXT: vzeroupper
1254 ; SKX-LABEL: test18:
1256 ; SKX-NEXT: vpslld $31, %xmm2, %xmm2
1257 ; SKX-NEXT: vpmovd2m %xmm2, %k1
1258 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
1259 ; SKX-NEXT: vzeroupper
1262 ; SKX_32-LABEL: test18:
1264 ; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
1265 ; SKX_32-NEXT: vpmovd2m %xmm2, %k1
1266 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
1268 call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
1272 define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
1273 ; KNL_64-LABEL: test19:
1275 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
1276 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
1277 ; KNL_64-NEXT: vpsllq $3, %ymm2, %ymm1
1278 ; KNL_64-NEXT: vmovq %rdi, %xmm2
1279 ; KNL_64-NEXT: vpbroadcastq %xmm2, %ymm2
1280 ; KNL_64-NEXT: vpaddq %ymm1, %ymm2, %ymm1
1281 ; KNL_64-NEXT: kmovw %k0, %eax
1282 ; KNL_64-NEXT: testb $1, %al
1283 ; KNL_64-NEXT: je .LBB18_2
1284 ; KNL_64-NEXT: # %bb.1: # %cond.store
1285 ; KNL_64-NEXT: vmovq %xmm1, %rcx
1286 ; KNL_64-NEXT: vmovlps %xmm0, (%rcx)
1287 ; KNL_64-NEXT: .LBB18_2: # %else
1288 ; KNL_64-NEXT: testb $2, %al
1289 ; KNL_64-NEXT: je .LBB18_4
1290 ; KNL_64-NEXT: # %bb.3: # %cond.store1
1291 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx
1292 ; KNL_64-NEXT: vmovhps %xmm0, (%rcx)
1293 ; KNL_64-NEXT: .LBB18_4: # %else2
1294 ; KNL_64-NEXT: testb $4, %al
1295 ; KNL_64-NEXT: vextractf128 $1, %ymm0, %xmm0
1296 ; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1
1297 ; KNL_64-NEXT: jne .LBB18_5
1298 ; KNL_64-NEXT: # %bb.6: # %else4
1299 ; KNL_64-NEXT: testb $8, %al
1300 ; KNL_64-NEXT: jne .LBB18_7
1301 ; KNL_64-NEXT: .LBB18_8: # %else6
1302 ; KNL_64-NEXT: vzeroupper
1304 ; KNL_64-NEXT: .LBB18_5: # %cond.store3
1305 ; KNL_64-NEXT: vmovq %xmm1, %rcx
1306 ; KNL_64-NEXT: vmovlps %xmm0, (%rcx)
1307 ; KNL_64-NEXT: testb $8, %al
1308 ; KNL_64-NEXT: je .LBB18_8
1309 ; KNL_64-NEXT: .LBB18_7: # %cond.store5
1310 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
1311 ; KNL_64-NEXT: vmovhps %xmm0, (%rax)
1312 ; KNL_64-NEXT: vzeroupper
1315 ; KNL_32-LABEL: test19:
1317 ; KNL_32-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
1318 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
1319 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
1320 ; KNL_32-NEXT: vpmovqd %zmm2, %ymm1
1321 ; KNL_32-NEXT: vpslld $3, %xmm1, %xmm1
1322 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2
1323 ; KNL_32-NEXT: vpaddd %xmm1, %xmm2, %xmm1
1324 ; KNL_32-NEXT: kmovw %k0, %eax
1325 ; KNL_32-NEXT: testb $1, %al
1326 ; KNL_32-NEXT: je .LBB18_2
1327 ; KNL_32-NEXT: # %bb.1: # %cond.store
1328 ; KNL_32-NEXT: vmovd %xmm1, %ecx
1329 ; KNL_32-NEXT: vmovlps %xmm0, (%ecx)
1330 ; KNL_32-NEXT: .LBB18_2: # %else
1331 ; KNL_32-NEXT: testb $2, %al
1332 ; KNL_32-NEXT: je .LBB18_4
1333 ; KNL_32-NEXT: # %bb.3: # %cond.store1
1334 ; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
1335 ; KNL_32-NEXT: vmovhps %xmm0, (%ecx)
1336 ; KNL_32-NEXT: .LBB18_4: # %else2
1337 ; KNL_32-NEXT: testb $4, %al
1338 ; KNL_32-NEXT: vextractf128 $1, %ymm0, %xmm0
1339 ; KNL_32-NEXT: jne .LBB18_5
1340 ; KNL_32-NEXT: # %bb.6: # %else4
1341 ; KNL_32-NEXT: testb $8, %al
1342 ; KNL_32-NEXT: jne .LBB18_7
1343 ; KNL_32-NEXT: .LBB18_8: # %else6
1344 ; KNL_32-NEXT: vzeroupper
1346 ; KNL_32-NEXT: .LBB18_5: # %cond.store3
1347 ; KNL_32-NEXT: vpextrd $2, %xmm1, %ecx
1348 ; KNL_32-NEXT: vmovlps %xmm0, (%ecx)
1349 ; KNL_32-NEXT: testb $8, %al
1350 ; KNL_32-NEXT: je .LBB18_8
1351 ; KNL_32-NEXT: .LBB18_7: # %cond.store5
1352 ; KNL_32-NEXT: vpextrd $3, %xmm1, %eax
1353 ; KNL_32-NEXT: vmovhps %xmm0, (%eax)
1354 ; KNL_32-NEXT: vzeroupper
1357 ; SKX-LABEL: test19:
1359 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
1360 ; SKX-NEXT: vpmovd2m %xmm1, %k1
1361 ; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
1362 ; SKX-NEXT: vzeroupper
1365 ; SKX_32-LABEL: test19:
1367 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
1368 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
1369 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1370 ; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
1371 ; SKX_32-NEXT: vzeroupper
1373 %gep = getelementptr double, double* %ptr, <4 x i64> %ind
1374 call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
1378 ; Data type requires widening
1379 define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
1380 ; KNL_64-LABEL: test20:
1382 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
1383 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
1384 ; KNL_64-NEXT: kmovw %k0, %eax
1385 ; KNL_64-NEXT: testb $1, %al
1386 ; KNL_64-NEXT: jne .LBB19_1
1387 ; KNL_64-NEXT: # %bb.2: # %else
1388 ; KNL_64-NEXT: testb $2, %al
1389 ; KNL_64-NEXT: jne .LBB19_3
1390 ; KNL_64-NEXT: .LBB19_4: # %else2
1391 ; KNL_64-NEXT: vzeroupper
1393 ; KNL_64-NEXT: .LBB19_1: # %cond.store
1394 ; KNL_64-NEXT: vmovq %xmm1, %rcx
1395 ; KNL_64-NEXT: vmovd %xmm0, (%rcx)
1396 ; KNL_64-NEXT: testb $2, %al
1397 ; KNL_64-NEXT: je .LBB19_4
1398 ; KNL_64-NEXT: .LBB19_3: # %cond.store1
1399 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
1400 ; KNL_64-NEXT: vextractps $1, %xmm0, (%rax)
1401 ; KNL_64-NEXT: vzeroupper
1404 ; KNL_32-LABEL: test20:
1406 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
1407 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
1408 ; KNL_32-NEXT: kmovw %k0, %eax
1409 ; KNL_32-NEXT: testb $1, %al
1410 ; KNL_32-NEXT: jne .LBB19_1
1411 ; KNL_32-NEXT: # %bb.2: # %else
1412 ; KNL_32-NEXT: testb $2, %al
1413 ; KNL_32-NEXT: jne .LBB19_3
1414 ; KNL_32-NEXT: .LBB19_4: # %else2
1415 ; KNL_32-NEXT: vzeroupper
1417 ; KNL_32-NEXT: .LBB19_1: # %cond.store
1418 ; KNL_32-NEXT: vmovd %xmm1, %ecx
1419 ; KNL_32-NEXT: vmovd %xmm0, (%ecx)
1420 ; KNL_32-NEXT: testb $2, %al
1421 ; KNL_32-NEXT: je .LBB19_4
1422 ; KNL_32-NEXT: .LBB19_3: # %cond.store1
1423 ; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
1424 ; KNL_32-NEXT: vextractps $1, %xmm0, (%eax)
1425 ; KNL_32-NEXT: vzeroupper
1428 ; SKX-LABEL: test20:
1430 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
1431 ; SKX-NEXT: vpmovq2m %xmm2, %k0
1432 ; SKX-NEXT: kmovw %k0, %eax
1433 ; SKX-NEXT: testb $1, %al
1434 ; SKX-NEXT: jne .LBB19_1
1435 ; SKX-NEXT: # %bb.2: # %else
1436 ; SKX-NEXT: testb $2, %al
1437 ; SKX-NEXT: jne .LBB19_3
1438 ; SKX-NEXT: .LBB19_4: # %else2
1440 ; SKX-NEXT: .LBB19_1: # %cond.store
1441 ; SKX-NEXT: vmovq %xmm1, %rcx
1442 ; SKX-NEXT: vmovd %xmm0, (%rcx)
1443 ; SKX-NEXT: testb $2, %al
1444 ; SKX-NEXT: je .LBB19_4
1445 ; SKX-NEXT: .LBB19_3: # %cond.store1
1446 ; SKX-NEXT: vpextrq $1, %xmm1, %rax
1447 ; SKX-NEXT: vextractps $1, %xmm0, (%rax)
1450 ; SKX_32-LABEL: test20:
1452 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
1453 ; SKX_32-NEXT: vpmovq2m %xmm2, %k0
1454 ; SKX_32-NEXT: kmovw %k0, %eax
1455 ; SKX_32-NEXT: testb $1, %al
1456 ; SKX_32-NEXT: jne .LBB19_1
1457 ; SKX_32-NEXT: # %bb.2: # %else
1458 ; SKX_32-NEXT: testb $2, %al
1459 ; SKX_32-NEXT: jne .LBB19_3
1460 ; SKX_32-NEXT: .LBB19_4: # %else2
1462 ; SKX_32-NEXT: .LBB19_1: # %cond.store
1463 ; SKX_32-NEXT: vmovd %xmm1, %ecx
1464 ; SKX_32-NEXT: vmovd %xmm0, (%ecx)
1465 ; SKX_32-NEXT: testb $2, %al
1466 ; SKX_32-NEXT: je .LBB19_4
1467 ; SKX_32-NEXT: .LBB19_3: # %cond.store1
1468 ; SKX_32-NEXT: vpextrd $1, %xmm1, %eax
1469 ; SKX_32-NEXT: vextractps $1, %xmm0, (%eax)
1471 call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
1475 ; Data type requires promotion
1476 define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
1477 ; KNL_64-LABEL: test21:
1479 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
1480 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
1481 ; KNL_64-NEXT: kmovw %k0, %eax
1482 ; KNL_64-NEXT: testb $1, %al
1483 ; KNL_64-NEXT: jne .LBB20_1
1484 ; KNL_64-NEXT: # %bb.2: # %else
1485 ; KNL_64-NEXT: testb $2, %al
1486 ; KNL_64-NEXT: jne .LBB20_3
1487 ; KNL_64-NEXT: .LBB20_4: # %else2
1488 ; KNL_64-NEXT: vzeroupper
1490 ; KNL_64-NEXT: .LBB20_1: # %cond.store
1491 ; KNL_64-NEXT: vmovq %xmm1, %rcx
1492 ; KNL_64-NEXT: vmovss %xmm0, (%rcx)
1493 ; KNL_64-NEXT: testb $2, %al
1494 ; KNL_64-NEXT: je .LBB20_4
1495 ; KNL_64-NEXT: .LBB20_3: # %cond.store1
1496 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
1497 ; KNL_64-NEXT: vextractps $1, %xmm0, (%rax)
1498 ; KNL_64-NEXT: vzeroupper
1501 ; KNL_32-LABEL: test21:
1503 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
1504 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
1505 ; KNL_32-NEXT: kmovw %k0, %eax
1506 ; KNL_32-NEXT: testb $1, %al
1507 ; KNL_32-NEXT: jne .LBB20_1
1508 ; KNL_32-NEXT: # %bb.2: # %else
1509 ; KNL_32-NEXT: testb $2, %al
1510 ; KNL_32-NEXT: jne .LBB20_3
1511 ; KNL_32-NEXT: .LBB20_4: # %else2
1512 ; KNL_32-NEXT: vzeroupper
1514 ; KNL_32-NEXT: .LBB20_1: # %cond.store
1515 ; KNL_32-NEXT: vmovd %xmm1, %ecx
1516 ; KNL_32-NEXT: vmovss %xmm0, (%ecx)
1517 ; KNL_32-NEXT: testb $2, %al
1518 ; KNL_32-NEXT: je .LBB20_4
1519 ; KNL_32-NEXT: .LBB20_3: # %cond.store1
1520 ; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
1521 ; KNL_32-NEXT: vextractps $1, %xmm0, (%eax)
1522 ; KNL_32-NEXT: vzeroupper
1525 ; SKX-LABEL: test21:
1527 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
1528 ; SKX-NEXT: vpmovq2m %xmm2, %k0
1529 ; SKX-NEXT: kmovw %k0, %eax
1530 ; SKX-NEXT: testb $1, %al
1531 ; SKX-NEXT: jne .LBB20_1
1532 ; SKX-NEXT: # %bb.2: # %else
1533 ; SKX-NEXT: testb $2, %al
1534 ; SKX-NEXT: jne .LBB20_3
1535 ; SKX-NEXT: .LBB20_4: # %else2
1537 ; SKX-NEXT: .LBB20_1: # %cond.store
1538 ; SKX-NEXT: vmovq %xmm1, %rcx
1539 ; SKX-NEXT: vmovss %xmm0, (%rcx)
1540 ; SKX-NEXT: testb $2, %al
1541 ; SKX-NEXT: je .LBB20_4
1542 ; SKX-NEXT: .LBB20_3: # %cond.store1
1543 ; SKX-NEXT: vpextrq $1, %xmm1, %rax
1544 ; SKX-NEXT: vextractps $1, %xmm0, (%rax)
1547 ; SKX_32-LABEL: test21:
1549 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
1550 ; SKX_32-NEXT: vpmovq2m %xmm2, %k0
1551 ; SKX_32-NEXT: kmovw %k0, %eax
1552 ; SKX_32-NEXT: testb $1, %al
1553 ; SKX_32-NEXT: jne .LBB20_1
1554 ; SKX_32-NEXT: # %bb.2: # %else
1555 ; SKX_32-NEXT: testb $2, %al
1556 ; SKX_32-NEXT: jne .LBB20_3
1557 ; SKX_32-NEXT: .LBB20_4: # %else2
1559 ; SKX_32-NEXT: .LBB20_1: # %cond.store
1560 ; SKX_32-NEXT: vmovd %xmm1, %ecx
1561 ; SKX_32-NEXT: vmovss %xmm0, (%ecx)
1562 ; SKX_32-NEXT: testb $2, %al
1563 ; SKX_32-NEXT: je .LBB20_4
1564 ; SKX_32-NEXT: .LBB20_3: # %cond.store1
1565 ; SKX_32-NEXT: vpextrd $1, %xmm1, %eax
1566 ; SKX_32-NEXT: vextractps $1, %xmm0, (%eax)
1568 call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
1572 ; The result type requires widening
1573 declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
1575 define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
1576 ; KNL_64-LABEL: test22:
1578 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1579 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1580 ; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0
1581 ; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0
1582 ; KNL_64-NEXT: vmovq %rdi, %xmm1
1583 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
1584 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1585 ; KNL_64-NEXT: kmovw %k0, %eax
1586 ; KNL_64-NEXT: testb $1, %al
1587 ; KNL_64-NEXT: jne .LBB21_1
1588 ; KNL_64-NEXT: # %bb.2: # %else
1589 ; KNL_64-NEXT: testb $2, %al
1590 ; KNL_64-NEXT: jne .LBB21_3
1591 ; KNL_64-NEXT: .LBB21_4: # %else2
1592 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1593 ; KNL_64-NEXT: vzeroupper
1595 ; KNL_64-NEXT: .LBB21_1: # %cond.load
1596 ; KNL_64-NEXT: vmovq %xmm0, %rcx
1597 ; KNL_64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1598 ; KNL_64-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1599 ; KNL_64-NEXT: testb $2, %al
1600 ; KNL_64-NEXT: je .LBB21_4
1601 ; KNL_64-NEXT: .LBB21_3: # %cond.load1
1602 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
1603 ; KNL_64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
1604 ; KNL_64-NEXT: vmovaps %xmm2, %xmm0
1605 ; KNL_64-NEXT: vzeroupper
1608 ; KNL_32-LABEL: test22:
1610 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1611 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1612 ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
1613 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
1614 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1615 ; KNL_32-NEXT: kmovw %k0, %eax
1616 ; KNL_32-NEXT: testb $1, %al
1617 ; KNL_32-NEXT: jne .LBB21_1
1618 ; KNL_32-NEXT: # %bb.2: # %else
1619 ; KNL_32-NEXT: testb $2, %al
1620 ; KNL_32-NEXT: jne .LBB21_3
1621 ; KNL_32-NEXT: .LBB21_4: # %else2
1622 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1623 ; KNL_32-NEXT: vzeroupper
1625 ; KNL_32-NEXT: .LBB21_1: # %cond.load
1626 ; KNL_32-NEXT: vmovd %xmm0, %ecx
1627 ; KNL_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1628 ; KNL_32-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1629 ; KNL_32-NEXT: testb $2, %al
1630 ; KNL_32-NEXT: je .LBB21_4
1631 ; KNL_32-NEXT: .LBB21_3: # %cond.load1
1632 ; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
1633 ; KNL_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
1634 ; KNL_32-NEXT: vmovaps %xmm2, %xmm0
1635 ; KNL_32-NEXT: vzeroupper
1638 ; SKX-LABEL: test22:
1640 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1641 ; SKX-NEXT: vpmovq2m %xmm1, %k0
1642 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
1643 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1
1644 ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
1645 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1646 ; SKX-NEXT: kmovw %k0, %eax
1647 ; SKX-NEXT: testb $1, %al
1648 ; SKX-NEXT: jne .LBB21_1
1649 ; SKX-NEXT: # %bb.2: # %else
1650 ; SKX-NEXT: testb $2, %al
1651 ; SKX-NEXT: jne .LBB21_3
1652 ; SKX-NEXT: .LBB21_4: # %else2
1653 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1655 ; SKX-NEXT: .LBB21_1: # %cond.load
1656 ; SKX-NEXT: vmovq %xmm0, %rcx
1657 ; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1658 ; SKX-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1659 ; SKX-NEXT: testb $2, %al
1660 ; SKX-NEXT: je .LBB21_4
1661 ; SKX-NEXT: .LBB21_3: # %cond.load1
1662 ; SKX-NEXT: vpextrq $1, %xmm0, %rax
1663 ; SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
1664 ; SKX-NEXT: vmovaps %xmm2, %xmm0
1667 ; SKX_32-LABEL: test22:
1669 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1670 ; SKX_32-NEXT: vpmovq2m %xmm1, %k0
1671 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
1672 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
1673 ; SKX_32-NEXT: kmovw %k0, %eax
1674 ; SKX_32-NEXT: testb $1, %al
1675 ; SKX_32-NEXT: jne .LBB21_1
1676 ; SKX_32-NEXT: # %bb.2: # %else
1677 ; SKX_32-NEXT: testb $2, %al
1678 ; SKX_32-NEXT: jne .LBB21_3
1679 ; SKX_32-NEXT: .LBB21_4: # %else2
1680 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
1682 ; SKX_32-NEXT: .LBB21_1: # %cond.load
1683 ; SKX_32-NEXT: vmovd %xmm0, %ecx
1684 ; SKX_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1685 ; SKX_32-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1686 ; SKX_32-NEXT: testb $2, %al
1687 ; SKX_32-NEXT: je .LBB21_4
1688 ; SKX_32-NEXT: .LBB21_3: # %cond.load1
1689 ; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
1690 ; SKX_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
1691 ; SKX_32-NEXT: vmovaps %xmm2, %xmm0
1693 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1694 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1695 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
1699 define <2 x float> @test22a(float* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x float> %src0) {
1700 ; KNL_64-LABEL: test22a:
1702 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1703 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1704 ; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0
1705 ; KNL_64-NEXT: vmovq %rdi, %xmm1
1706 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
1707 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1708 ; KNL_64-NEXT: kmovw %k0, %eax
1709 ; KNL_64-NEXT: testb $1, %al
1710 ; KNL_64-NEXT: jne .LBB22_1
1711 ; KNL_64-NEXT: # %bb.2: # %else
1712 ; KNL_64-NEXT: testb $2, %al
1713 ; KNL_64-NEXT: jne .LBB22_3
1714 ; KNL_64-NEXT: .LBB22_4: # %else2
1715 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1716 ; KNL_64-NEXT: vzeroupper
1718 ; KNL_64-NEXT: .LBB22_1: # %cond.load
1719 ; KNL_64-NEXT: vmovq %xmm0, %rcx
1720 ; KNL_64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1721 ; KNL_64-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1722 ; KNL_64-NEXT: testb $2, %al
1723 ; KNL_64-NEXT: je .LBB22_4
1724 ; KNL_64-NEXT: .LBB22_3: # %cond.load1
1725 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
1726 ; KNL_64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
1727 ; KNL_64-NEXT: vmovaps %xmm2, %xmm0
1728 ; KNL_64-NEXT: vzeroupper
1731 ; KNL_32-LABEL: test22a:
1733 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1734 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1735 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1736 ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
1737 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
1738 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1739 ; KNL_32-NEXT: kmovw %k0, %eax
1740 ; KNL_32-NEXT: testb $1, %al
1741 ; KNL_32-NEXT: jne .LBB22_1
1742 ; KNL_32-NEXT: # %bb.2: # %else
1743 ; KNL_32-NEXT: testb $2, %al
1744 ; KNL_32-NEXT: jne .LBB22_3
1745 ; KNL_32-NEXT: .LBB22_4: # %else2
1746 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1747 ; KNL_32-NEXT: vzeroupper
1749 ; KNL_32-NEXT: .LBB22_1: # %cond.load
1750 ; KNL_32-NEXT: vmovd %xmm0, %ecx
1751 ; KNL_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1752 ; KNL_32-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1753 ; KNL_32-NEXT: testb $2, %al
1754 ; KNL_32-NEXT: je .LBB22_4
1755 ; KNL_32-NEXT: .LBB22_3: # %cond.load1
1756 ; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
1757 ; KNL_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
1758 ; KNL_32-NEXT: vmovaps %xmm2, %xmm0
1759 ; KNL_32-NEXT: vzeroupper
1762 ; SKX-LABEL: test22a:
1764 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1765 ; SKX-NEXT: vpmovq2m %xmm1, %k0
1766 ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
1767 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1
1768 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1769 ; SKX-NEXT: kmovw %k0, %eax
1770 ; SKX-NEXT: testb $1, %al
1771 ; SKX-NEXT: jne .LBB22_1
1772 ; SKX-NEXT: # %bb.2: # %else
1773 ; SKX-NEXT: testb $2, %al
1774 ; SKX-NEXT: jne .LBB22_3
1775 ; SKX-NEXT: .LBB22_4: # %else2
1776 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1778 ; SKX-NEXT: .LBB22_1: # %cond.load
1779 ; SKX-NEXT: vmovq %xmm0, %rcx
1780 ; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1781 ; SKX-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1782 ; SKX-NEXT: testb $2, %al
1783 ; SKX-NEXT: je .LBB22_4
1784 ; SKX-NEXT: .LBB22_3: # %cond.load1
1785 ; SKX-NEXT: vpextrq $1, %xmm0, %rax
1786 ; SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
1787 ; SKX-NEXT: vmovaps %xmm2, %xmm0
1790 ; SKX_32-LABEL: test22a:
1792 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1793 ; SKX_32-NEXT: vpmovq2m %xmm1, %k0
1794 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1795 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
1796 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
1797 ; SKX_32-NEXT: kmovw %k0, %eax
1798 ; SKX_32-NEXT: testb $1, %al
1799 ; SKX_32-NEXT: jne .LBB22_1
1800 ; SKX_32-NEXT: # %bb.2: # %else
1801 ; SKX_32-NEXT: testb $2, %al
1802 ; SKX_32-NEXT: jne .LBB22_3
1803 ; SKX_32-NEXT: .LBB22_4: # %else2
1804 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
1806 ; SKX_32-NEXT: .LBB22_1: # %cond.load
1807 ; SKX_32-NEXT: vmovd %xmm0, %ecx
1808 ; SKX_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1809 ; SKX_32-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1810 ; SKX_32-NEXT: testb $2, %al
1811 ; SKX_32-NEXT: je .LBB22_4
1812 ; SKX_32-NEXT: .LBB22_3: # %cond.load1
1813 ; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
1814 ; SKX_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
1815 ; SKX_32-NEXT: vmovaps %xmm2, %xmm0
1817 %gep.random = getelementptr float, float* %base, <2 x i64> %ind
1818 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
1822 declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
1823 declare <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
1825 define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
1826 ; KNL_64-LABEL: test23:
1828 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1829 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1830 ; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0
1831 ; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0
1832 ; KNL_64-NEXT: vmovq %rdi, %xmm1
1833 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
1834 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1835 ; KNL_64-NEXT: kmovw %k0, %eax
1836 ; KNL_64-NEXT: testb $1, %al
1837 ; KNL_64-NEXT: jne .LBB23_1
1838 ; KNL_64-NEXT: # %bb.2: # %else
1839 ; KNL_64-NEXT: testb $2, %al
1840 ; KNL_64-NEXT: jne .LBB23_3
1841 ; KNL_64-NEXT: .LBB23_4: # %else2
1842 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1843 ; KNL_64-NEXT: vzeroupper
1845 ; KNL_64-NEXT: .LBB23_1: # %cond.load
1846 ; KNL_64-NEXT: vmovq %xmm0, %rcx
1847 ; KNL_64-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2
1848 ; KNL_64-NEXT: testb $2, %al
1849 ; KNL_64-NEXT: je .LBB23_4
1850 ; KNL_64-NEXT: .LBB23_3: # %cond.load1
1851 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
1852 ; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
1853 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1854 ; KNL_64-NEXT: vzeroupper
1857 ; KNL_32-LABEL: test23:
1859 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1860 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1861 ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
1862 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
1863 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1864 ; KNL_32-NEXT: kmovw %k0, %eax
1865 ; KNL_32-NEXT: testb $1, %al
1866 ; KNL_32-NEXT: jne .LBB23_1
1867 ; KNL_32-NEXT: # %bb.2: # %else
1868 ; KNL_32-NEXT: testb $2, %al
1869 ; KNL_32-NEXT: jne .LBB23_3
1870 ; KNL_32-NEXT: .LBB23_4: # %else2
1871 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1872 ; KNL_32-NEXT: vzeroupper
1874 ; KNL_32-NEXT: .LBB23_1: # %cond.load
1875 ; KNL_32-NEXT: vmovd %xmm0, %ecx
1876 ; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2
1877 ; KNL_32-NEXT: testb $2, %al
1878 ; KNL_32-NEXT: je .LBB23_4
1879 ; KNL_32-NEXT: .LBB23_3: # %cond.load1
1880 ; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
1881 ; KNL_32-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm2
1882 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1883 ; KNL_32-NEXT: vzeroupper
1886 ; SKX-LABEL: test23:
1888 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1889 ; SKX-NEXT: vpmovq2m %xmm1, %k0
1890 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
1891 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1
1892 ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
1893 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1894 ; SKX-NEXT: kmovw %k0, %eax
1895 ; SKX-NEXT: testb $1, %al
1896 ; SKX-NEXT: jne .LBB23_1
1897 ; SKX-NEXT: # %bb.2: # %else
1898 ; SKX-NEXT: testb $2, %al
1899 ; SKX-NEXT: jne .LBB23_3
1900 ; SKX-NEXT: .LBB23_4: # %else2
1901 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1903 ; SKX-NEXT: .LBB23_1: # %cond.load
1904 ; SKX-NEXT: vmovq %xmm0, %rcx
1905 ; SKX-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2
1906 ; SKX-NEXT: testb $2, %al
1907 ; SKX-NEXT: je .LBB23_4
1908 ; SKX-NEXT: .LBB23_3: # %cond.load1
1909 ; SKX-NEXT: vpextrq $1, %xmm0, %rax
1910 ; SKX-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
1911 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1914 ; SKX_32-LABEL: test23:
1916 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1917 ; SKX_32-NEXT: vpmovq2m %xmm1, %k0
1918 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
1919 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
1920 ; SKX_32-NEXT: kmovw %k0, %eax
1921 ; SKX_32-NEXT: testb $1, %al
1922 ; SKX_32-NEXT: jne .LBB23_1
1923 ; SKX_32-NEXT: # %bb.2: # %else
1924 ; SKX_32-NEXT: testb $2, %al
1925 ; SKX_32-NEXT: jne .LBB23_3
1926 ; SKX_32-NEXT: .LBB23_4: # %else2
1927 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
1929 ; SKX_32-NEXT: .LBB23_1: # %cond.load
1930 ; SKX_32-NEXT: vmovd %xmm0, %ecx
1931 ; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2
1932 ; SKX_32-NEXT: testb $2, %al
1933 ; SKX_32-NEXT: je .LBB23_4
1934 ; SKX_32-NEXT: .LBB23_3: # %cond.load1
1935 ; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
1936 ; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm2
1937 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
1939 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1940 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1941 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
1945 define <2 x i32> @test23b(i32* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %src0) {
1946 ; KNL_64-LABEL: test23b:
1948 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1949 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1950 ; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0
1951 ; KNL_64-NEXT: vmovq %rdi, %xmm1
1952 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
1953 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1954 ; KNL_64-NEXT: kmovw %k0, %eax
1955 ; KNL_64-NEXT: testb $1, %al
1956 ; KNL_64-NEXT: jne .LBB24_1
1957 ; KNL_64-NEXT: # %bb.2: # %else
1958 ; KNL_64-NEXT: testb $2, %al
1959 ; KNL_64-NEXT: jne .LBB24_3
1960 ; KNL_64-NEXT: .LBB24_4: # %else2
1961 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1962 ; KNL_64-NEXT: vzeroupper
1964 ; KNL_64-NEXT: .LBB24_1: # %cond.load
1965 ; KNL_64-NEXT: vmovq %xmm0, %rcx
1966 ; KNL_64-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2
1967 ; KNL_64-NEXT: testb $2, %al
1968 ; KNL_64-NEXT: je .LBB24_4
1969 ; KNL_64-NEXT: .LBB24_3: # %cond.load1
1970 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
1971 ; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
1972 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1973 ; KNL_64-NEXT: vzeroupper
1976 ; KNL_32-LABEL: test23b:
1978 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1979 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1980 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1981 ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
1982 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
1983 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1984 ; KNL_32-NEXT: kmovw %k0, %eax
1985 ; KNL_32-NEXT: testb $1, %al
1986 ; KNL_32-NEXT: jne .LBB24_1
1987 ; KNL_32-NEXT: # %bb.2: # %else
1988 ; KNL_32-NEXT: testb $2, %al
1989 ; KNL_32-NEXT: jne .LBB24_3
1990 ; KNL_32-NEXT: .LBB24_4: # %else2
1991 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1992 ; KNL_32-NEXT: vzeroupper
1994 ; KNL_32-NEXT: .LBB24_1: # %cond.load
1995 ; KNL_32-NEXT: vmovd %xmm0, %ecx
1996 ; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2
1997 ; KNL_32-NEXT: testb $2, %al
1998 ; KNL_32-NEXT: je .LBB24_4
1999 ; KNL_32-NEXT: .LBB24_3: # %cond.load1
2000 ; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
2001 ; KNL_32-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm2
2002 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
2003 ; KNL_32-NEXT: vzeroupper
2006 ; SKX-LABEL: test23b:
2008 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
2009 ; SKX-NEXT: vpmovq2m %xmm1, %k0
2010 ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
2011 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1
2012 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2013 ; SKX-NEXT: kmovw %k0, %eax
2014 ; SKX-NEXT: testb $1, %al
2015 ; SKX-NEXT: jne .LBB24_1
2016 ; SKX-NEXT: # %bb.2: # %else
2017 ; SKX-NEXT: testb $2, %al
2018 ; SKX-NEXT: jne .LBB24_3
2019 ; SKX-NEXT: .LBB24_4: # %else2
2020 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
2022 ; SKX-NEXT: .LBB24_1: # %cond.load
2023 ; SKX-NEXT: vmovq %xmm0, %rcx
2024 ; SKX-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2
2025 ; SKX-NEXT: testb $2, %al
2026 ; SKX-NEXT: je .LBB24_4
2027 ; SKX-NEXT: .LBB24_3: # %cond.load1
2028 ; SKX-NEXT: vpextrq $1, %xmm0, %rax
2029 ; SKX-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
2030 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
2033 ; SKX_32-LABEL: test23b:
2035 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
2036 ; SKX_32-NEXT: vpmovq2m %xmm1, %k0
2037 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2038 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
2039 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
2040 ; SKX_32-NEXT: kmovw %k0, %eax
2041 ; SKX_32-NEXT: testb $1, %al
2042 ; SKX_32-NEXT: jne .LBB24_1
2043 ; SKX_32-NEXT: # %bb.2: # %else
2044 ; SKX_32-NEXT: testb $2, %al
2045 ; SKX_32-NEXT: jne .LBB24_3
2046 ; SKX_32-NEXT: .LBB24_4: # %else2
2047 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
2049 ; SKX_32-NEXT: .LBB24_1: # %cond.load
2050 ; SKX_32-NEXT: vmovd %xmm0, %ecx
2051 ; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2
2052 ; SKX_32-NEXT: testb $2, %al
2053 ; SKX_32-NEXT: je .LBB24_4
2054 ; SKX_32-NEXT: .LBB24_3: # %cond.load1
2055 ; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
2056 ; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm2
2057 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
2059 %gep.random = getelementptr i32, i32* %base, <2 x i64> %ind
2060 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
2064 define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
2065 ; KNL_64-LABEL: test24:
2067 ; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0
2068 ; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0
2069 ; KNL_64-NEXT: vmovq %rdi, %xmm1
2070 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
2071 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2072 ; KNL_64-NEXT: vmovq %xmm0, %rax
2073 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
2074 ; KNL_64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2075 ; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0
2078 ; KNL_32-LABEL: test24:
2080 ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
2081 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
2082 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
2083 ; KNL_32-NEXT: vmovd %xmm0, %eax
2084 ; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
2085 ; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2086 ; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
2089 ; SKX-LABEL: test24:
2091 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
2092 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1
2093 ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
2094 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2095 ; SKX-NEXT: vmovq %xmm0, %rax
2096 ; SKX-NEXT: vpextrq $1, %xmm0, %rcx
2097 ; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2098 ; SKX-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0
2101 ; SKX_32-LABEL: test24:
2103 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
2104 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
2105 ; SKX_32-NEXT: vmovd %xmm0, %eax
2106 ; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx
2107 ; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2108 ; SKX_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
2110 %sext_ind = sext <2 x i32> %ind to <2 x i64>
2111 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
2112 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
2116 define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
2117 ; KNL_64-LABEL: test25:
2119 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
2120 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
2121 ; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0
2122 ; KNL_64-NEXT: vpsllq $3, %xmm0, %xmm0
2123 ; KNL_64-NEXT: vmovq %rdi, %xmm1
2124 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
2125 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2126 ; KNL_64-NEXT: kmovw %k0, %eax
2127 ; KNL_64-NEXT: testb $1, %al
2128 ; KNL_64-NEXT: jne .LBB26_1
2129 ; KNL_64-NEXT: # %bb.2: # %else
2130 ; KNL_64-NEXT: testb $2, %al
2131 ; KNL_64-NEXT: jne .LBB26_3
2132 ; KNL_64-NEXT: .LBB26_4: # %else2
2133 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
2134 ; KNL_64-NEXT: vzeroupper
2136 ; KNL_64-NEXT: .LBB26_1: # %cond.load
2137 ; KNL_64-NEXT: vmovq %xmm0, %rcx
2138 ; KNL_64-NEXT: vpinsrq $0, (%rcx), %xmm2, %xmm2
2139 ; KNL_64-NEXT: testb $2, %al
2140 ; KNL_64-NEXT: je .LBB26_4
2141 ; KNL_64-NEXT: .LBB26_3: # %cond.load1
2142 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
2143 ; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm2
2144 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
2145 ; KNL_64-NEXT: vzeroupper
2148 ; KNL_32-LABEL: test25:
2150 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
2151 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
2152 ; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0
2153 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
2154 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
2155 ; KNL_32-NEXT: kmovw %k0, %eax
2156 ; KNL_32-NEXT: testb $1, %al
2157 ; KNL_32-NEXT: jne .LBB26_1
2158 ; KNL_32-NEXT: # %bb.2: # %else
2159 ; KNL_32-NEXT: testb $2, %al
2160 ; KNL_32-NEXT: jne .LBB26_3
2161 ; KNL_32-NEXT: .LBB26_4: # %else2
2162 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
2163 ; KNL_32-NEXT: vzeroupper
2165 ; KNL_32-NEXT: .LBB26_1: # %cond.load
2166 ; KNL_32-NEXT: vmovd %xmm0, %ecx
2167 ; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm1
2168 ; KNL_32-NEXT: vpinsrd $1, 4(%ecx), %xmm1, %xmm2
2169 ; KNL_32-NEXT: testb $2, %al
2170 ; KNL_32-NEXT: je .LBB26_4
2171 ; KNL_32-NEXT: .LBB26_3: # %cond.load1
2172 ; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
2173 ; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm0
2174 ; KNL_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm2
2175 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
2176 ; KNL_32-NEXT: vzeroupper
2179 ; SKX-LABEL: test25:
2181 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
2182 ; SKX-NEXT: vpmovq2m %xmm1, %k0
2183 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
2184 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1
2185 ; SKX-NEXT: vpsllq $3, %xmm0, %xmm0
2186 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2187 ; SKX-NEXT: kmovw %k0, %eax
2188 ; SKX-NEXT: testb $1, %al
2189 ; SKX-NEXT: jne .LBB26_1
2190 ; SKX-NEXT: # %bb.2: # %else
2191 ; SKX-NEXT: testb $2, %al
2192 ; SKX-NEXT: jne .LBB26_3
2193 ; SKX-NEXT: .LBB26_4: # %else2
2194 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
2196 ; SKX-NEXT: .LBB26_1: # %cond.load
2197 ; SKX-NEXT: vmovq %xmm0, %rcx
2198 ; SKX-NEXT: vpinsrq $0, (%rcx), %xmm2, %xmm2
2199 ; SKX-NEXT: testb $2, %al
2200 ; SKX-NEXT: je .LBB26_4
2201 ; SKX-NEXT: .LBB26_3: # %cond.load1
2202 ; SKX-NEXT: vpextrq $1, %xmm0, %rax
2203 ; SKX-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm2
2204 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
2207 ; SKX_32-LABEL: test25:
2209 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
2210 ; SKX_32-NEXT: vpmovq2m %xmm1, %k0
2211 ; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0
2212 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
2213 ; SKX_32-NEXT: kmovw %k0, %eax
2214 ; SKX_32-NEXT: testb $1, %al
2215 ; SKX_32-NEXT: jne .LBB26_1
2216 ; SKX_32-NEXT: # %bb.2: # %else
2217 ; SKX_32-NEXT: testb $2, %al
2218 ; SKX_32-NEXT: jne .LBB26_3
2219 ; SKX_32-NEXT: .LBB26_4: # %else2
2220 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
2222 ; SKX_32-NEXT: .LBB26_1: # %cond.load
2223 ; SKX_32-NEXT: vmovd %xmm0, %ecx
2224 ; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm1
2225 ; SKX_32-NEXT: vpinsrd $1, 4(%ecx), %xmm1, %xmm2
2226 ; SKX_32-NEXT: testb $2, %al
2227 ; SKX_32-NEXT: je .LBB26_4
2228 ; SKX_32-NEXT: .LBB26_3: # %cond.load1
2229 ; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
2230 ; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm0
2231 ; SKX_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm2
2232 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
2234 %sext_ind = sext <2 x i32> %ind to <2 x i64>
2235 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
2236 %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
2240 define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
2241 ; KNL_64-LABEL: test26:
2243 ; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0
2244 ; KNL_64-NEXT: vpsllq $3, %xmm0, %xmm0
2245 ; KNL_64-NEXT: vmovq %rdi, %xmm2
2246 ; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2
2247 ; KNL_64-NEXT: vpaddq %xmm0, %xmm2, %xmm0
2248 ; KNL_64-NEXT: vmovq %xmm0, %rax
2249 ; KNL_64-NEXT: vpinsrq $0, (%rax), %xmm1, %xmm1
2250 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
2251 ; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm1, %xmm0
2254 ; KNL_32-LABEL: test26:
2256 ; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0
2257 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2
2258 ; KNL_32-NEXT: vpaddd %xmm0, %xmm2, %xmm0
2259 ; KNL_32-NEXT: vmovd %xmm0, %eax
2260 ; KNL_32-NEXT: vpinsrd $0, (%eax), %xmm1, %xmm1
2261 ; KNL_32-NEXT: vpinsrd $1, 4(%eax), %xmm1, %xmm1
2262 ; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
2263 ; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm0
2264 ; KNL_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0
2267 ; SKX-LABEL: test26:
2269 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
2270 ; SKX-NEXT: vpbroadcastq %rdi, %xmm2
2271 ; SKX-NEXT: vpsllq $3, %xmm0, %xmm0
2272 ; SKX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
2273 ; SKX-NEXT: vmovq %xmm0, %rax
2274 ; SKX-NEXT: vpinsrq $0, (%rax), %xmm1, %xmm1
2275 ; SKX-NEXT: vpextrq $1, %xmm0, %rax
2276 ; SKX-NEXT: vpinsrq $1, (%rax), %xmm1, %xmm0
2279 ; SKX_32-LABEL: test26:
2281 ; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0
2282 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
2283 ; SKX_32-NEXT: vmovd %xmm0, %eax
2284 ; SKX_32-NEXT: vpinsrd $0, (%eax), %xmm1, %xmm1
2285 ; SKX_32-NEXT: vpinsrd $1, 4(%eax), %xmm1, %xmm1
2286 ; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
2287 ; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm0
2288 ; SKX_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0
2290 %sext_ind = sext <2 x i32> %ind to <2 x i64>
2291 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
2292 %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
2296 ; Result type requires widening; all-ones mask
2297 define <2 x float> @test27(float* %base, <2 x i32> %ind) {
2298 ; KNL_64-LABEL: test27:
2300 ; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0
2301 ; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0
2302 ; KNL_64-NEXT: vmovq %rdi, %xmm1
2303 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
2304 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2305 ; KNL_64-NEXT: vmovq %xmm0, %rax
2306 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
2307 ; KNL_64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2308 ; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
2311 ; KNL_32-LABEL: test27:
2313 ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
2314 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
2315 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
2316 ; KNL_32-NEXT: vmovd %xmm0, %eax
2317 ; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
2318 ; KNL_32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2319 ; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
2322 ; SKX-LABEL: test27:
2324 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
2325 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1
2326 ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
2327 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2328 ; SKX-NEXT: vmovq %xmm0, %rax
2329 ; SKX-NEXT: vpextrq $1, %xmm0, %rcx
2330 ; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2331 ; SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
2334 ; SKX_32-LABEL: test27:
2336 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
2337 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
2338 ; SKX_32-NEXT: vmovd %xmm0, %eax
2339 ; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx
2340 ; SKX_32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2341 ; SKX_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
2343 %sext_ind = sext <2 x i32> %ind to <2 x i64>
2344 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
2345 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
2349 ; Data type requires promotion, mask is all-ones
2350 define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
2351 ; KNL_64-LABEL: test28:
2353 ; KNL_64-NEXT: vmovq %xmm1, %rax
2354 ; KNL_64-NEXT: vmovss %xmm0, (%rax)
2355 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
2356 ; KNL_64-NEXT: vextractps $1, %xmm0, (%rax)
2359 ; KNL_32-LABEL: test28:
2361 ; KNL_32-NEXT: vmovd %xmm1, %eax
2362 ; KNL_32-NEXT: vmovss %xmm0, (%eax)
2363 ; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
2364 ; KNL_32-NEXT: vextractps $1, %xmm0, (%eax)
2367 ; SKX-LABEL: test28:
2369 ; SKX-NEXT: vmovq %xmm1, %rax
2370 ; SKX-NEXT: vmovss %xmm0, (%rax)
2371 ; SKX-NEXT: vpextrq $1, %xmm1, %rax
2372 ; SKX-NEXT: vextractps $1, %xmm0, (%rax)
2375 ; SKX_32-LABEL: test28:
2377 ; SKX_32-NEXT: vmovd %xmm1, %eax
2378 ; SKX_32-NEXT: vmovss %xmm0, (%eax)
2379 ; SKX_32-NEXT: vpextrd $1, %xmm1, %eax
2380 ; SKX_32-NEXT: vextractps $1, %xmm0, (%eax)
2382 call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
2386 ; SCALAR-LABEL: test29
2387 ; SCALAR: extractelement <16 x float*>
2388 ; SCALAR-NEXT: load float
2389 ; SCALAR-NEXT: insertelement <16 x float>
2390 ; SCALAR-NEXT: extractelement <16 x float*>
2391 ; SCALAR-NEXT: load float
2393 define <16 x float> @test29(float* %base, <16 x i32> %ind) {
2394 ; KNL_64-LABEL: test29:
2396 ; KNL_64-NEXT: movw $44, %ax
2397 ; KNL_64-NEXT: kmovw %eax, %k1
2398 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
2399 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
2402 ; KNL_32-LABEL: test29:
2404 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2405 ; KNL_32-NEXT: movw $44, %cx
2406 ; KNL_32-NEXT: kmovw %ecx, %k1
2407 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
2408 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
2411 ; SKX-LABEL: test29:
2413 ; SKX-NEXT: movw $44, %ax
2414 ; SKX-NEXT: kmovw %eax, %k1
2415 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
2416 ; SKX-NEXT: vmovaps %zmm1, %zmm0
2419 ; SKX_32-LABEL: test29:
2421 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2422 ; SKX_32-NEXT: movw $44, %cx
2423 ; SKX_32-NEXT: kmovw %ecx, %k1
2424 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
2425 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
2428 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
2429 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
2431 %sext_ind = sext <16 x i32> %ind to <16 x i64>
2432 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
2434 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
2435 ret <16 x float>%res
2438 declare <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
2439 define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
2440 ; KNL_64-LABEL: test30:
2442 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
2443 ; KNL_64-NEXT: movw $-3, %ax
2444 ; KNL_64-NEXT: kmovw %eax, %k0
2445 ; KNL_64-NEXT: andl $1, %edi
2446 ; KNL_64-NEXT: kmovw %edi, %k1
2447 ; KNL_64-NEXT: kandw %k0, %k1, %k0
2448 ; KNL_64-NEXT: kmovw %esi, %k1
2449 ; KNL_64-NEXT: kshiftlw $15, %k1, %k1
2450 ; KNL_64-NEXT: kshiftrw $14, %k1, %k1
2451 ; KNL_64-NEXT: korw %k1, %k0, %k0
2452 ; KNL_64-NEXT: movw $-5, %ax
2453 ; KNL_64-NEXT: kmovw %eax, %k1
2454 ; KNL_64-NEXT: kandw %k1, %k0, %k0
2455 ; KNL_64-NEXT: kmovw %edx, %k1
2456 ; KNL_64-NEXT: kshiftlw $15, %k1, %k1
2457 ; KNL_64-NEXT: kshiftrw $13, %k1, %k1
2458 ; KNL_64-NEXT: korw %k1, %k0, %k0
2459 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
2460 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
2461 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
2462 ; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
2463 ; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
2464 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1}
2465 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
2466 ; KNL_64-NEXT: vzeroupper
2469 ; KNL_32-LABEL: test30:
2471 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
2472 ; KNL_32-NEXT: movw $-3, %ax
2473 ; KNL_32-NEXT: kmovw %eax, %k0
2474 ; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al
2475 ; KNL_32-NEXT: andl $1, %eax
2476 ; KNL_32-NEXT: kmovw %eax, %k1
2477 ; KNL_32-NEXT: kandw %k0, %k1, %k0
2478 ; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al
2479 ; KNL_32-NEXT: kmovw %eax, %k1
2480 ; KNL_32-NEXT: kshiftlw $15, %k1, %k1
2481 ; KNL_32-NEXT: kshiftrw $14, %k1, %k1
2482 ; KNL_32-NEXT: korw %k1, %k0, %k0
2483 ; KNL_32-NEXT: movw $-5, %ax
2484 ; KNL_32-NEXT: kmovw %eax, %k1
2485 ; KNL_32-NEXT: kandw %k1, %k0, %k0
2486 ; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al
2487 ; KNL_32-NEXT: kmovw %eax, %k1
2488 ; KNL_32-NEXT: kshiftlw $15, %k1, %k1
2489 ; KNL_32-NEXT: kshiftrw $13, %k1, %k1
2490 ; KNL_32-NEXT: korw %k1, %k0, %k0
2491 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
2492 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
2493 ; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
2494 ; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
2495 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
2496 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
2497 ; KNL_32-NEXT: vzeroupper
2500 ; SKX-LABEL: test30:
2502 ; SKX-NEXT: movb $-3, %al
2503 ; SKX-NEXT: kmovw %eax, %k0
2504 ; SKX-NEXT: kmovw %edi, %k1
2505 ; SKX-NEXT: kshiftlb $7, %k1, %k1
2506 ; SKX-NEXT: kshiftrb $7, %k1, %k1
2507 ; SKX-NEXT: kandw %k0, %k1, %k0
2508 ; SKX-NEXT: kmovw %esi, %k1
2509 ; SKX-NEXT: kshiftlb $7, %k1, %k1
2510 ; SKX-NEXT: kshiftrb $6, %k1, %k1
2511 ; SKX-NEXT: korw %k1, %k0, %k0
2512 ; SKX-NEXT: movb $-5, %al
2513 ; SKX-NEXT: kmovw %eax, %k1
2514 ; SKX-NEXT: kandw %k1, %k0, %k0
2515 ; SKX-NEXT: kmovw %edx, %k1
2516 ; SKX-NEXT: kshiftlb $7, %k1, %k1
2517 ; SKX-NEXT: kshiftrb $5, %k1, %k1
2518 ; SKX-NEXT: korw %k1, %k0, %k1
2519 ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
2520 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
2521 ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
2522 ; SKX-NEXT: vpgatherqd (,%ymm0), %xmm2 {%k1}
2523 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
2524 ; SKX-NEXT: vzeroupper
2527 ; SKX_32-LABEL: test30:
2529 ; SKX_32-NEXT: movb $-3, %al
2530 ; SKX_32-NEXT: kmovw %eax, %k0
2531 ; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
2532 ; SKX_32-NEXT: kmovw %eax, %k1
2533 ; SKX_32-NEXT: kshiftlb $7, %k1, %k1
2534 ; SKX_32-NEXT: kshiftrb $7, %k1, %k1
2535 ; SKX_32-NEXT: kandw %k0, %k1, %k0
2536 ; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
2537 ; SKX_32-NEXT: kmovw %eax, %k1
2538 ; SKX_32-NEXT: kshiftlb $7, %k1, %k1
2539 ; SKX_32-NEXT: kshiftrb $6, %k1, %k1
2540 ; SKX_32-NEXT: korw %k1, %k0, %k0
2541 ; SKX_32-NEXT: movb $-5, %al
2542 ; SKX_32-NEXT: kmovw %eax, %k1
2543 ; SKX_32-NEXT: kandw %k1, %k0, %k0
2544 ; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
2545 ; SKX_32-NEXT: kmovw %eax, %k1
2546 ; SKX_32-NEXT: kshiftlb $7, %k1, %k1
2547 ; SKX_32-NEXT: kshiftrb $5, %k1, %k1
2548 ; SKX_32-NEXT: korw %k1, %k0, %k1
2549 ; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
2550 ; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
2551 ; SKX_32-NEXT: vpgatherdd (,%xmm0), %xmm2 {%k1}
2552 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
2555 %sext_ind = sext <3 x i32> %ind to <3 x i64>
2556 %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
2557 %res = call <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
2561 ; Non-power of 2 scatter
2562 declare void @llvm.masked.scatter.v3i32.v3p0i32(<3 x i32>, <3 x i32*>, i32, <3 x i1>)
2563 define void @test30b(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
2564 ; KNL_64-LABEL: test30b:
2566 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
2567 ; KNL_64-NEXT: movw $-3, %ax
2568 ; KNL_64-NEXT: kmovw %eax, %k0
2569 ; KNL_64-NEXT: andl $1, %edi
2570 ; KNL_64-NEXT: kmovw %edi, %k1
2571 ; KNL_64-NEXT: kandw %k0, %k1, %k0
2572 ; KNL_64-NEXT: kmovw %esi, %k1
2573 ; KNL_64-NEXT: kshiftlw $15, %k1, %k1
2574 ; KNL_64-NEXT: kshiftrw $14, %k1, %k1
2575 ; KNL_64-NEXT: korw %k1, %k0, %k0
2576 ; KNL_64-NEXT: movw $-5, %ax
2577 ; KNL_64-NEXT: kmovw %eax, %k1
2578 ; KNL_64-NEXT: kandw %k1, %k0, %k0
2579 ; KNL_64-NEXT: kmovw %edx, %k1
2580 ; KNL_64-NEXT: kshiftlw $15, %k1, %k1
2581 ; KNL_64-NEXT: kshiftrw $13, %k1, %k1
2582 ; KNL_64-NEXT: korw %k1, %k0, %k0
2583 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
2584 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
2585 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
2586 ; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
2587 ; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
2588 ; KNL_64-NEXT: vpscatterqd %ymm2, (,%zmm0) {%k1}
2589 ; KNL_64-NEXT: vzeroupper
2592 ; KNL_32-LABEL: test30b:
2594 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
2595 ; KNL_32-NEXT: movw $-3, %ax
2596 ; KNL_32-NEXT: kmovw %eax, %k0
2597 ; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al
2598 ; KNL_32-NEXT: andl $1, %eax
2599 ; KNL_32-NEXT: kmovw %eax, %k1
2600 ; KNL_32-NEXT: kandw %k0, %k1, %k0
2601 ; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al
2602 ; KNL_32-NEXT: kmovw %eax, %k1
2603 ; KNL_32-NEXT: kshiftlw $15, %k1, %k1
2604 ; KNL_32-NEXT: kshiftrw $14, %k1, %k1
2605 ; KNL_32-NEXT: korw %k1, %k0, %k0
2606 ; KNL_32-NEXT: movw $-5, %ax
2607 ; KNL_32-NEXT: kmovw %eax, %k1
2608 ; KNL_32-NEXT: kandw %k1, %k0, %k0
2609 ; KNL_32-NEXT: movb {{[0-9]+}}(%esp), %al
2610 ; KNL_32-NEXT: kmovw %eax, %k1
2611 ; KNL_32-NEXT: kshiftlw $15, %k1, %k1
2612 ; KNL_32-NEXT: kshiftrw $13, %k1, %k1
2613 ; KNL_32-NEXT: korw %k1, %k0, %k0
2614 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
2615 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
2616 ; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
2617 ; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
2618 ; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
2619 ; KNL_32-NEXT: vzeroupper
2622 ; SKX-LABEL: test30b:
2624 ; SKX-NEXT: movb $-3, %al
2625 ; SKX-NEXT: kmovw %eax, %k0
2626 ; SKX-NEXT: kmovw %edi, %k1
2627 ; SKX-NEXT: kshiftlb $7, %k1, %k1
2628 ; SKX-NEXT: kshiftrb $7, %k1, %k1
2629 ; SKX-NEXT: kandw %k0, %k1, %k0
2630 ; SKX-NEXT: kmovw %esi, %k1
2631 ; SKX-NEXT: kshiftlb $7, %k1, %k1
2632 ; SKX-NEXT: kshiftrb $6, %k1, %k1
2633 ; SKX-NEXT: korw %k1, %k0, %k0
2634 ; SKX-NEXT: movb $-5, %al
2635 ; SKX-NEXT: kmovw %eax, %k1
2636 ; SKX-NEXT: kandw %k1, %k0, %k0
2637 ; SKX-NEXT: kmovw %edx, %k1
2638 ; SKX-NEXT: kshiftlb $7, %k1, %k1
2639 ; SKX-NEXT: kshiftrb $5, %k1, %k1
2640 ; SKX-NEXT: korw %k1, %k0, %k1
2641 ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
2642 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
2643 ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
2644 ; SKX-NEXT: vpscatterqd %xmm2, (,%ymm0) {%k1}
2645 ; SKX-NEXT: vzeroupper
2648 ; SKX_32-LABEL: test30b:
2650 ; SKX_32-NEXT: movb $-3, %al
2651 ; SKX_32-NEXT: kmovw %eax, %k0
2652 ; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
2653 ; SKX_32-NEXT: kmovw %eax, %k1
2654 ; SKX_32-NEXT: kshiftlb $7, %k1, %k1
2655 ; SKX_32-NEXT: kshiftrb $7, %k1, %k1
2656 ; SKX_32-NEXT: kandw %k0, %k1, %k0
2657 ; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
2658 ; SKX_32-NEXT: kmovw %eax, %k1
2659 ; SKX_32-NEXT: kshiftlb $7, %k1, %k1
2660 ; SKX_32-NEXT: kshiftrb $6, %k1, %k1
2661 ; SKX_32-NEXT: korw %k1, %k0, %k0
2662 ; SKX_32-NEXT: movb $-5, %al
2663 ; SKX_32-NEXT: kmovw %eax, %k1
2664 ; SKX_32-NEXT: kandw %k1, %k0, %k0
2665 ; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
2666 ; SKX_32-NEXT: kmovw %eax, %k1
2667 ; SKX_32-NEXT: kshiftlb $7, %k1, %k1
2668 ; SKX_32-NEXT: kshiftrb $5, %k1, %k1
2669 ; SKX_32-NEXT: korw %k1, %k0, %k1
2670 ; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
2671 ; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
2672 ; SKX_32-NEXT: vpscatterdd %xmm2, (,%xmm0) {%k1}
2674 %sext_ind = sext <3 x i32> %ind to <3 x i64>
2675 %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
2676 call void @llvm.masked.scatter.v3i32.v3p0i32(<3 x i32> %src0, <3 x i32*> %gep.random, i32 4, <3 x i1> %mask)
2680 declare <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
2681 define <16 x float*> @test31(<16 x float**> %ptrs) {
2682 ; KNL_64-LABEL: test31:
2684 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
2685 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
2686 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
2687 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
2688 ; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0
2689 ; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm1
2692 ; KNL_32-LABEL: test31:
2694 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
2695 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
2696 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
2699 ; SKX-LABEL: test31:
2701 ; SKX-NEXT: kxnorw %k0, %k0, %k1
2702 ; SKX-NEXT: kxnorw %k0, %k0, %k2
2703 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
2704 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
2705 ; SKX-NEXT: vmovdqa64 %zmm2, %zmm0
2706 ; SKX-NEXT: vmovdqa64 %zmm3, %zmm1
2709 ; SKX_32-LABEL: test31:
2711 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2712 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
2713 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
2716 %res = call <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
2717 ret <16 x float*>%res
2720 define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
2721 ; KNL_64-LABEL: test_gather_16i32:
2723 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2724 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2725 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2726 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2
2727 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2728 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
2729 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
2730 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
2733 ; KNL_32-LABEL: test_gather_16i32:
2735 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2736 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2737 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2738 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
2739 ; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
2742 ; SKX-LABEL: test_gather_16i32:
2744 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2745 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2746 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2747 ; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm2
2748 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2749 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
2750 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
2751 ; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
2754 ; SKX_32-LABEL: test_gather_16i32:
2756 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2757 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2758 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2759 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
2760 ; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
2762 %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
2765 define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
2766 ; KNL_64-LABEL: test_gather_16i64:
2768 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2769 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2770 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2771 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2772 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
2773 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
2774 ; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm0
2775 ; KNL_64-NEXT: vmovdqa64 %zmm4, %zmm1
2778 ; KNL_32-LABEL: test_gather_16i64:
2780 ; KNL_32-NEXT: pushl %ebp
2781 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2782 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2783 ; KNL_32-NEXT: movl %esp, %ebp
2784 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2785 ; KNL_32-NEXT: andl $-64, %esp
2786 ; KNL_32-NEXT: subl $64, %esp
2787 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2788 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2789 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2790 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
2791 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
2792 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
2793 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2794 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
2795 ; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
2796 ; KNL_32-NEXT: movl %ebp, %esp
2797 ; KNL_32-NEXT: popl %ebp
2798 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2801 ; SKX-LABEL: test_gather_16i64:
2803 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2804 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2805 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2806 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2807 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
2808 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
2809 ; SKX-NEXT: vmovdqa64 %zmm3, %zmm0
2810 ; SKX-NEXT: vmovdqa64 %zmm4, %zmm1
2813 ; SKX_32-LABEL: test_gather_16i64:
2815 ; SKX_32-NEXT: pushl %ebp
2816 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2817 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2818 ; SKX_32-NEXT: movl %esp, %ebp
2819 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2820 ; SKX_32-NEXT: andl $-64, %esp
2821 ; SKX_32-NEXT: subl $64, %esp
2822 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2823 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2824 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2825 ; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
2826 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
2827 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
2828 ; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2829 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
2830 ; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
2831 ; SKX_32-NEXT: movl %ebp, %esp
2832 ; SKX_32-NEXT: popl %ebp
2833 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2835 %res = call <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
2838 declare <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
2839 define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
2840 ; KNL_64-LABEL: test_gather_16f32:
2842 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2843 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2844 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2845 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2
2846 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2847 ; KNL_64-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
2848 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
2849 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
2852 ; KNL_32-LABEL: test_gather_16f32:
2854 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2855 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2856 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2857 ; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
2858 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
2861 ; SKX-LABEL: test_gather_16f32:
2863 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2864 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2865 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2866 ; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm2
2867 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2868 ; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
2869 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
2870 ; SKX-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
2873 ; SKX_32-LABEL: test_gather_16f32:
2875 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2876 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2877 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2878 ; SKX_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
2879 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0
2881 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
2882 ret <16 x float> %res
2884 define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
2885 ; KNL_64-LABEL: test_gather_16f64:
2887 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2888 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2889 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2890 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2891 ; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
2892 ; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
2893 ; KNL_64-NEXT: vmovapd %zmm3, %zmm0
2894 ; KNL_64-NEXT: vmovapd %zmm4, %zmm1
2897 ; KNL_32-LABEL: test_gather_16f64:
2899 ; KNL_32-NEXT: pushl %ebp
2900 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2901 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2902 ; KNL_32-NEXT: movl %esp, %ebp
2903 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2904 ; KNL_32-NEXT: andl $-64, %esp
2905 ; KNL_32-NEXT: subl $64, %esp
2906 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2907 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2908 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2909 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
2910 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
2911 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
2912 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2913 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
2914 ; KNL_32-NEXT: vmovapd %zmm2, %zmm0
2915 ; KNL_32-NEXT: movl %ebp, %esp
2916 ; KNL_32-NEXT: popl %ebp
2917 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2920 ; SKX-LABEL: test_gather_16f64:
2922 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2923 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2924 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2925 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2926 ; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
2927 ; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
2928 ; SKX-NEXT: vmovapd %zmm3, %zmm0
2929 ; SKX-NEXT: vmovapd %zmm4, %zmm1
2932 ; SKX_32-LABEL: test_gather_16f64:
2934 ; SKX_32-NEXT: pushl %ebp
2935 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2936 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2937 ; SKX_32-NEXT: movl %esp, %ebp
2938 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2939 ; SKX_32-NEXT: andl $-64, %esp
2940 ; SKX_32-NEXT: subl $64, %esp
2941 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2942 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2943 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2944 ; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
2945 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
2946 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
2947 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2948 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
2949 ; SKX_32-NEXT: vmovapd %zmm2, %zmm0
2950 ; SKX_32-NEXT: movl %ebp, %esp
2951 ; SKX_32-NEXT: popl %ebp
2952 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2954 %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
2955 ret <16 x double> %res
2957 declare <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
2958 define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
2959 ; KNL_64-LABEL: test_scatter_16i32:
2961 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2962 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2963 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2964 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2965 ; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
2966 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
2967 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
2968 ; KNL_64-NEXT: vzeroupper
2971 ; KNL_32-LABEL: test_scatter_16i32:
2973 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2974 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2975 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2976 ; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
2977 ; KNL_32-NEXT: vzeroupper
2980 ; SKX-LABEL: test_scatter_16i32:
2982 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2983 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2984 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2985 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2986 ; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
2987 ; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm0
2988 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
2989 ; SKX-NEXT: vzeroupper
2992 ; SKX_32-LABEL: test_scatter_16i32:
2994 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2995 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2996 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2997 ; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
2998 ; SKX_32-NEXT: vzeroupper
3000 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
3003 define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
3004 ; KNL_64-LABEL: test_scatter_16i64:
3006 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
3007 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
3008 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
3009 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
3010 ; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
3011 ; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
3012 ; KNL_64-NEXT: vzeroupper
3015 ; KNL_32-LABEL: test_scatter_16i64:
3017 ; KNL_32-NEXT: pushl %ebp
3018 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
3019 ; KNL_32-NEXT: .cfi_offset %ebp, -8
3020 ; KNL_32-NEXT: movl %esp, %ebp
3021 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
3022 ; KNL_32-NEXT: andl $-64, %esp
3023 ; KNL_32-NEXT: subl $64, %esp
3024 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
3025 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
3026 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
3027 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
3028 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
3029 ; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
3030 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
3031 ; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
3032 ; KNL_32-NEXT: movl %ebp, %esp
3033 ; KNL_32-NEXT: popl %ebp
3034 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
3035 ; KNL_32-NEXT: vzeroupper
3038 ; SKX-LABEL: test_scatter_16i64:
3040 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
3041 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
3042 ; SKX-NEXT: vpmovd2m %zmm2, %k1
3043 ; SKX-NEXT: kshiftrw $8, %k1, %k2
3044 ; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
3045 ; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
3046 ; SKX-NEXT: vzeroupper
3049 ; SKX_32-LABEL: test_scatter_16i64:
3051 ; SKX_32-NEXT: pushl %ebp
3052 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
3053 ; SKX_32-NEXT: .cfi_offset %ebp, -8
3054 ; SKX_32-NEXT: movl %esp, %ebp
3055 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
3056 ; SKX_32-NEXT: andl $-64, %esp
3057 ; SKX_32-NEXT: subl $64, %esp
3058 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
3059 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
3060 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
3061 ; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
3062 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
3063 ; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
3064 ; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
3065 ; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
3066 ; SKX_32-NEXT: movl %ebp, %esp
3067 ; SKX_32-NEXT: popl %ebp
3068 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
3069 ; SKX_32-NEXT: vzeroupper
3071 call void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
3074 declare void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
3075 define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
3076 ; KNL_64-LABEL: test_scatter_16f32:
3078 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
3079 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
3080 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
3081 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
3082 ; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
3083 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0
3084 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
3085 ; KNL_64-NEXT: vzeroupper
3088 ; KNL_32-LABEL: test_scatter_16f32:
3090 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
3091 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
3092 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
3093 ; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
3094 ; KNL_32-NEXT: vzeroupper
3097 ; SKX-LABEL: test_scatter_16f32:
3099 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
3100 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
3101 ; SKX-NEXT: vpmovd2m %zmm2, %k1
3102 ; SKX-NEXT: kshiftrw $8, %k1, %k2
3103 ; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
3104 ; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm0
3105 ; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
3106 ; SKX-NEXT: vzeroupper
3109 ; SKX_32-LABEL: test_scatter_16f32:
3111 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
3112 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
3113 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
3114 ; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
3115 ; SKX_32-NEXT: vzeroupper
3117 call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
3120 declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
3121 define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
3122 ; KNL_64-LABEL: test_scatter_16f64:
3124 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
3125 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
3126 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
3127 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
3128 ; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
3129 ; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
3130 ; KNL_64-NEXT: vzeroupper
3133 ; KNL_32-LABEL: test_scatter_16f64:
3135 ; KNL_32-NEXT: pushl %ebp
3136 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
3137 ; KNL_32-NEXT: .cfi_offset %ebp, -8
3138 ; KNL_32-NEXT: movl %esp, %ebp
3139 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
3140 ; KNL_32-NEXT: andl $-64, %esp
3141 ; KNL_32-NEXT: subl $64, %esp
3142 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
3143 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
3144 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
3145 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
3146 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
3147 ; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
3148 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
3149 ; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
3150 ; KNL_32-NEXT: movl %ebp, %esp
3151 ; KNL_32-NEXT: popl %ebp
3152 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
3153 ; KNL_32-NEXT: vzeroupper
3156 ; SKX-LABEL: test_scatter_16f64:
3158 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
3159 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
3160 ; SKX-NEXT: vpmovd2m %zmm2, %k1
3161 ; SKX-NEXT: kshiftrw $8, %k1, %k2
3162 ; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
3163 ; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
3164 ; SKX-NEXT: vzeroupper
3167 ; SKX_32-LABEL: test_scatter_16f64:
3169 ; SKX_32-NEXT: pushl %ebp
3170 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
3171 ; SKX_32-NEXT: .cfi_offset %ebp, -8
3172 ; SKX_32-NEXT: movl %esp, %ebp
3173 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
3174 ; SKX_32-NEXT: andl $-64, %esp
3175 ; SKX_32-NEXT: subl $64, %esp
3176 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
3177 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
3178 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
3179 ; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
3180 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
3181 ; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
3182 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
3183 ; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
3184 ; SKX_32-NEXT: movl %ebp, %esp
3185 ; SKX_32-NEXT: popl %ebp
3186 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
3187 ; SKX_32-NEXT: vzeroupper
3189 call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
3192 declare void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
3194 define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64> %d) {
3195 ; KNL_64-LABEL: test_pr28312:
3197 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
3198 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
3199 ; KNL_64-NEXT: kmovw %k0, %eax
3200 ; KNL_64-NEXT: testb $1, %al
3201 ; KNL_64-NEXT: # implicit-def: $ymm1
3202 ; KNL_64-NEXT: je .LBB42_2
3203 ; KNL_64-NEXT: # %bb.1: # %cond.load
3204 ; KNL_64-NEXT: vmovq %xmm0, %rcx
3205 ; KNL_64-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
3206 ; KNL_64-NEXT: .LBB42_2: # %else
3207 ; KNL_64-NEXT: testb $2, %al
3208 ; KNL_64-NEXT: je .LBB42_4
3209 ; KNL_64-NEXT: # %bb.3: # %cond.load1
3210 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
3211 ; KNL_64-NEXT: vpinsrq $1, (%rcx), %xmm1, %xmm2
3212 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
3213 ; KNL_64-NEXT: .LBB42_4: # %else2
3214 ; KNL_64-NEXT: testb $4, %al
3215 ; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm2
3216 ; KNL_64-NEXT: je .LBB42_6
3217 ; KNL_64-NEXT: # %bb.5: # %cond.load4
3218 ; KNL_64-NEXT: vmovq %xmm2, %rcx
3219 ; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm3
3220 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
3221 ; KNL_64-NEXT: .LBB42_6: # %else5
3222 ; KNL_64-NEXT: testb $8, %al
3223 ; KNL_64-NEXT: je .LBB42_8
3224 ; KNL_64-NEXT: # %bb.7: # %cond.load7
3225 ; KNL_64-NEXT: vpextrq $1, %xmm2, %rax
3226 ; KNL_64-NEXT: vpbroadcastq (%rax), %ymm3
3227 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
3228 ; KNL_64-NEXT: .LBB42_8: # %else8
3229 ; KNL_64-NEXT: kmovw %k0, %eax
3230 ; KNL_64-NEXT: testb $1, %al
3231 ; KNL_64-NEXT: # implicit-def: $ymm3
3232 ; KNL_64-NEXT: jne .LBB42_9
3233 ; KNL_64-NEXT: # %bb.10: # %else15
3234 ; KNL_64-NEXT: testb $2, %al
3235 ; KNL_64-NEXT: jne .LBB42_11
3236 ; KNL_64-NEXT: .LBB42_12: # %else21
3237 ; KNL_64-NEXT: testb $4, %al
3238 ; KNL_64-NEXT: jne .LBB42_13
3239 ; KNL_64-NEXT: .LBB42_14: # %else27
3240 ; KNL_64-NEXT: testb $8, %al
3241 ; KNL_64-NEXT: je .LBB42_16
3242 ; KNL_64-NEXT: .LBB42_15: # %cond.load29
3243 ; KNL_64-NEXT: vpextrq $1, %xmm2, %rax
3244 ; KNL_64-NEXT: vpbroadcastq (%rax), %ymm4
3245 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
3246 ; KNL_64-NEXT: .LBB42_16: # %else33
3247 ; KNL_64-NEXT: kmovw %k0, %eax
3248 ; KNL_64-NEXT: testb $1, %al
3249 ; KNL_64-NEXT: # implicit-def: $ymm4
3250 ; KNL_64-NEXT: jne .LBB42_17
3251 ; KNL_64-NEXT: # %bb.18: # %else40
3252 ; KNL_64-NEXT: testb $2, %al
3253 ; KNL_64-NEXT: jne .LBB42_19
3254 ; KNL_64-NEXT: .LBB42_20: # %else46
3255 ; KNL_64-NEXT: testb $4, %al
3256 ; KNL_64-NEXT: jne .LBB42_21
3257 ; KNL_64-NEXT: .LBB42_22: # %else52
3258 ; KNL_64-NEXT: testb $8, %al
3259 ; KNL_64-NEXT: je .LBB42_24
3260 ; KNL_64-NEXT: .LBB42_23: # %cond.load54
3261 ; KNL_64-NEXT: vpextrq $1, %xmm2, %rax
3262 ; KNL_64-NEXT: vpbroadcastq (%rax), %ymm0
3263 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm0[6,7]
3264 ; KNL_64-NEXT: .LBB42_24: # %else58
3265 ; KNL_64-NEXT: vpaddq %ymm3, %ymm1, %ymm0
3266 ; KNL_64-NEXT: vpaddq %ymm4, %ymm0, %ymm0
3268 ; KNL_64-NEXT: .LBB42_9: # %cond.load11
3269 ; KNL_64-NEXT: vmovq %xmm0, %rcx
3270 ; KNL_64-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
3271 ; KNL_64-NEXT: testb $2, %al
3272 ; KNL_64-NEXT: je .LBB42_12
3273 ; KNL_64-NEXT: .LBB42_11: # %cond.load17
3274 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
3275 ; KNL_64-NEXT: vpinsrq $1, (%rcx), %xmm3, %xmm4
3276 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3277 ; KNL_64-NEXT: testb $4, %al
3278 ; KNL_64-NEXT: je .LBB42_14
3279 ; KNL_64-NEXT: .LBB42_13: # %cond.load23
3280 ; KNL_64-NEXT: vmovq %xmm2, %rcx
3281 ; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm4
3282 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
3283 ; KNL_64-NEXT: testb $8, %al
3284 ; KNL_64-NEXT: jne .LBB42_15
3285 ; KNL_64-NEXT: jmp .LBB42_16
3286 ; KNL_64-NEXT: .LBB42_17: # %cond.load36
3287 ; KNL_64-NEXT: vmovq %xmm0, %rcx
3288 ; KNL_64-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
3289 ; KNL_64-NEXT: testb $2, %al
3290 ; KNL_64-NEXT: je .LBB42_20
3291 ; KNL_64-NEXT: .LBB42_19: # %cond.load42
3292 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
3293 ; KNL_64-NEXT: vpinsrq $1, (%rcx), %xmm4, %xmm0
3294 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm4[4,5,6,7]
3295 ; KNL_64-NEXT: testb $4, %al
3296 ; KNL_64-NEXT: je .LBB42_22
3297 ; KNL_64-NEXT: .LBB42_21: # %cond.load48
3298 ; KNL_64-NEXT: vmovq %xmm2, %rcx
3299 ; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm0
3300 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7]
3301 ; KNL_64-NEXT: testb $8, %al
3302 ; KNL_64-NEXT: jne .LBB42_23
3303 ; KNL_64-NEXT: jmp .LBB42_24
3305 ; KNL_32-LABEL: test_pr28312:
3307 ; KNL_32-NEXT: pushl %ebp
3308 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
3309 ; KNL_32-NEXT: .cfi_offset %ebp, -8
3310 ; KNL_32-NEXT: movl %esp, %ebp
3311 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
3312 ; KNL_32-NEXT: pushl %ebx
3313 ; KNL_32-NEXT: pushl %esi
3314 ; KNL_32-NEXT: andl $-32, %esp
3315 ; KNL_32-NEXT: subl $32, %esp
3316 ; KNL_32-NEXT: .cfi_offset %esi, -16
3317 ; KNL_32-NEXT: .cfi_offset %ebx, -12
3318 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
3319 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
3320 ; KNL_32-NEXT: kmovw %k0, %ebx
3321 ; KNL_32-NEXT: testb $1, %bl
3322 ; KNL_32-NEXT: vmovd %xmm0, %eax
3323 ; KNL_32-NEXT: # implicit-def: $ymm1
3324 ; KNL_32-NEXT: je .LBB42_2
3325 ; KNL_32-NEXT: # %bb.1: # %cond.load
3326 ; KNL_32-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
3327 ; KNL_32-NEXT: .LBB42_2: # %else
3328 ; KNL_32-NEXT: testb $2, %bl
3329 ; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
3330 ; KNL_32-NEXT: je .LBB42_4
3331 ; KNL_32-NEXT: # %bb.3: # %cond.load1
3332 ; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm1, %xmm2
3333 ; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm2, %xmm2
3334 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
3335 ; KNL_32-NEXT: .LBB42_4: # %else2
3336 ; KNL_32-NEXT: testb $4, %bl
3337 ; KNL_32-NEXT: vpextrd $2, %xmm0, %edx
3338 ; KNL_32-NEXT: je .LBB42_6
3339 ; KNL_32-NEXT: # %bb.5: # %cond.load4
3340 ; KNL_32-NEXT: vpbroadcastd (%edx), %ymm2
3341 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7]
3342 ; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm2
3343 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
3344 ; KNL_32-NEXT: .LBB42_6: # %else5
3345 ; KNL_32-NEXT: testb $8, %bl
3346 ; KNL_32-NEXT: vpextrd $3, %xmm0, %esi
3347 ; KNL_32-NEXT: je .LBB42_8
3348 ; KNL_32-NEXT: # %bb.7: # %cond.load7
3349 ; KNL_32-NEXT: vpbroadcastd (%esi), %ymm0
3350 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
3351 ; KNL_32-NEXT: vpbroadcastd 4(%esi), %ymm1
3352 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7]
3353 ; KNL_32-NEXT: .LBB42_8: # %else8
3354 ; KNL_32-NEXT: kmovw %k0, %ebx
3355 ; KNL_32-NEXT: testb $1, %bl
3356 ; KNL_32-NEXT: # implicit-def: $ymm0
3357 ; KNL_32-NEXT: jne .LBB42_9
3358 ; KNL_32-NEXT: # %bb.10: # %else15
3359 ; KNL_32-NEXT: testb $2, %bl
3360 ; KNL_32-NEXT: jne .LBB42_11
3361 ; KNL_32-NEXT: .LBB42_12: # %else21
3362 ; KNL_32-NEXT: testb $4, %bl
3363 ; KNL_32-NEXT: jne .LBB42_13
3364 ; KNL_32-NEXT: .LBB42_14: # %else27
3365 ; KNL_32-NEXT: testb $8, %bl
3366 ; KNL_32-NEXT: je .LBB42_16
3367 ; KNL_32-NEXT: .LBB42_15: # %cond.load29
3368 ; KNL_32-NEXT: vpbroadcastd (%esi), %ymm2
3369 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7]
3370 ; KNL_32-NEXT: vpbroadcastd 4(%esi), %ymm2
3371 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
3372 ; KNL_32-NEXT: .LBB42_16: # %else33
3373 ; KNL_32-NEXT: kmovw %k0, %ebx
3374 ; KNL_32-NEXT: testb $1, %bl
3375 ; KNL_32-NEXT: # implicit-def: $ymm2
3376 ; KNL_32-NEXT: jne .LBB42_17
3377 ; KNL_32-NEXT: # %bb.18: # %else40
3378 ; KNL_32-NEXT: testb $2, %bl
3379 ; KNL_32-NEXT: jne .LBB42_19
3380 ; KNL_32-NEXT: .LBB42_20: # %else46
3381 ; KNL_32-NEXT: testb $4, %bl
3382 ; KNL_32-NEXT: jne .LBB42_21
3383 ; KNL_32-NEXT: .LBB42_22: # %else52
3384 ; KNL_32-NEXT: testb $8, %bl
3385 ; KNL_32-NEXT: je .LBB42_24
3386 ; KNL_32-NEXT: .LBB42_23: # %cond.load54
3387 ; KNL_32-NEXT: vpbroadcastd (%esi), %ymm3
3388 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7]
3389 ; KNL_32-NEXT: vpbroadcastd 4(%esi), %ymm3
3390 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
3391 ; KNL_32-NEXT: .LBB42_24: # %else58
3392 ; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
3393 ; KNL_32-NEXT: vpaddq %ymm2, %ymm0, %ymm0
3394 ; KNL_32-NEXT: leal -8(%ebp), %esp
3395 ; KNL_32-NEXT: popl %esi
3396 ; KNL_32-NEXT: popl %ebx
3397 ; KNL_32-NEXT: popl %ebp
3398 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
3400 ; KNL_32-NEXT: .LBB42_9: # %cond.load11
3401 ; KNL_32-NEXT: .cfi_def_cfa %ebp, 8
3402 ; KNL_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
3403 ; KNL_32-NEXT: testb $2, %bl
3404 ; KNL_32-NEXT: je .LBB42_12
3405 ; KNL_32-NEXT: .LBB42_11: # %cond.load17
3406 ; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm0, %xmm2
3407 ; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm2, %xmm2
3408 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3409 ; KNL_32-NEXT: testb $4, %bl
3410 ; KNL_32-NEXT: je .LBB42_14
3411 ; KNL_32-NEXT: .LBB42_13: # %cond.load23
3412 ; KNL_32-NEXT: vpbroadcastd (%edx), %ymm2
3413 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7]
3414 ; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm2
3415 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
3416 ; KNL_32-NEXT: testb $8, %bl
3417 ; KNL_32-NEXT: jne .LBB42_15
3418 ; KNL_32-NEXT: jmp .LBB42_16
3419 ; KNL_32-NEXT: .LBB42_17: # %cond.load36
3420 ; KNL_32-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
3421 ; KNL_32-NEXT: testb $2, %bl
3422 ; KNL_32-NEXT: je .LBB42_20
3423 ; KNL_32-NEXT: .LBB42_19: # %cond.load42
3424 ; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm2, %xmm3
3425 ; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm3, %xmm3
3426 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
3427 ; KNL_32-NEXT: testb $4, %bl
3428 ; KNL_32-NEXT: je .LBB42_22
3429 ; KNL_32-NEXT: .LBB42_21: # %cond.load48
3430 ; KNL_32-NEXT: vpbroadcastd (%edx), %ymm3
3431 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7]
3432 ; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm3
3433 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
3434 ; KNL_32-NEXT: testb $8, %bl
3435 ; KNL_32-NEXT: jne .LBB42_23
3436 ; KNL_32-NEXT: jmp .LBB42_24
3438 ; SKX-LABEL: test_pr28312:
3440 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
3441 ; SKX-NEXT: vpmovd2m %xmm1, %k1
3442 ; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1}
3443 ; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
3444 ; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
3447 ; SKX_32-LABEL: test_pr28312:
3449 ; SKX_32-NEXT: pushl %ebp
3450 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
3451 ; SKX_32-NEXT: .cfi_offset %ebp, -8
3452 ; SKX_32-NEXT: movl %esp, %ebp
3453 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
3454 ; SKX_32-NEXT: andl $-32, %esp
3455 ; SKX_32-NEXT: subl $32, %esp
3456 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
3457 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
3458 ; SKX_32-NEXT: vpgatherdq (,%xmm0), %ymm1 {%k1}
3459 ; SKX_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
3460 ; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
3461 ; SKX_32-NEXT: movl %ebp, %esp
3462 ; SKX_32-NEXT: popl %ebp
3463 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
3465 %g1 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
3466 %g2 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
3467 %g3 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
3468 %a = add <4 x i64> %g1, %g2
3469 %b = add <4 x i64> %a, %g3
3472 declare <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>)
3474 define <8 x i32> @test_global_array(<8 x i64> %indxs) {
3475 ; KNL_64-LABEL: test_global_array:
3477 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
3478 ; KNL_64-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
3479 ; KNL_64-NEXT: vmovdqa %ymm1, %ymm0
3482 ; KNL_32-LABEL: test_global_array:
3484 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
3485 ; KNL_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
3486 ; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
3489 ; SKX_SMALL-LABEL: test_global_array:
3490 ; SKX_SMALL: # %bb.0:
3491 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
3492 ; SKX_SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
3493 ; SKX_SMALL-NEXT: vmovdqa %ymm1, %ymm0
3494 ; SKX_SMALL-NEXT: retq
3496 ; SKX_LARGE-LABEL: test_global_array:
3497 ; SKX_LARGE: # %bb.0:
3498 ; SKX_LARGE-NEXT: movabsq $glob_array, %rax
3499 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
3500 ; SKX_LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
3501 ; SKX_LARGE-NEXT: vmovdqa %ymm1, %ymm0
3502 ; SKX_LARGE-NEXT: retq
3504 ; SKX_32-LABEL: test_global_array:
3506 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
3507 ; SKX_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
3508 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm0
3510 %p = getelementptr inbounds [16 x i32], [16 x i32]* @glob_array, i64 0, <8 x i64> %indxs
3511 %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
3515 define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) {
3516 ; KNL_64-LABEL: test_global_array_zeroinitializer_index:
3518 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
3519 ; KNL_64-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
3520 ; KNL_64-NEXT: vmovdqa %ymm1, %ymm0
3523 ; KNL_32-LABEL: test_global_array_zeroinitializer_index:
3525 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
3526 ; KNL_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
3527 ; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
3530 ; SKX_SMALL-LABEL: test_global_array_zeroinitializer_index:
3531 ; SKX_SMALL: # %bb.0:
3532 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
3533 ; SKX_SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
3534 ; SKX_SMALL-NEXT: vmovdqa %ymm1, %ymm0
3535 ; SKX_SMALL-NEXT: retq
3537 ; SKX_LARGE-LABEL: test_global_array_zeroinitializer_index:
3538 ; SKX_LARGE: # %bb.0:
3539 ; SKX_LARGE-NEXT: movabsq $glob_array, %rax
3540 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
3541 ; SKX_LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
3542 ; SKX_LARGE-NEXT: vmovdqa %ymm1, %ymm0
3543 ; SKX_LARGE-NEXT: retq
3545 ; SKX_32-LABEL: test_global_array_zeroinitializer_index:
3547 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
3548 ; SKX_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
3549 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm0
3551 %p = getelementptr inbounds [16 x i32], [16 x i32]* @glob_array, <8 x i64> zeroinitializer, <8 x i64> %indxs
3552 %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
3556 define void @v1_scatter(<1 x i32>%a1, <1 x i32*> %ptr, <1 x i1> %mask) {
3557 ; KNL_64-LABEL: v1_scatter:
3559 ; KNL_64-NEXT: testb $1, %dl
3560 ; KNL_64-NEXT: je .LBB45_2
3561 ; KNL_64-NEXT: # %bb.1: # %cond.store
3562 ; KNL_64-NEXT: movl %edi, (%rsi)
3563 ; KNL_64-NEXT: .LBB45_2: # %else
3566 ; KNL_32-LABEL: v1_scatter:
3568 ; KNL_32-NEXT: testb $1, {{[0-9]+}}(%esp)
3569 ; KNL_32-NEXT: je .LBB45_2
3570 ; KNL_32-NEXT: # %bb.1: # %cond.store
3571 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3572 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
3573 ; KNL_32-NEXT: movl %ecx, (%eax)
3574 ; KNL_32-NEXT: .LBB45_2: # %else
3577 ; SKX-LABEL: v1_scatter:
3579 ; SKX-NEXT: testb $1, %dl
3580 ; SKX-NEXT: je .LBB45_2
3581 ; SKX-NEXT: # %bb.1: # %cond.store
3582 ; SKX-NEXT: movl %edi, (%rsi)
3583 ; SKX-NEXT: .LBB45_2: # %else
3586 ; SKX_32-LABEL: v1_scatter:
3588 ; SKX_32-NEXT: testb $1, {{[0-9]+}}(%esp)
3589 ; SKX_32-NEXT: je .LBB45_2
3590 ; SKX_32-NEXT: # %bb.1: # %cond.store
3591 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3592 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
3593 ; SKX_32-NEXT: movl %ecx, (%eax)
3594 ; SKX_32-NEXT: .LBB45_2: # %else
3596 call void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32> %a1, <1 x i32*> %ptr, i32 4, <1 x i1> %mask)
3599 declare void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32>, <1 x i32*>, i32, <1 x i1>)
3601 define <1 x i32> @v1_gather(<1 x i32*> %ptr, <1 x i1> %mask, <1 x i32> %src0) {
3602 ; KNL_64-LABEL: v1_gather:
3604 ; KNL_64-NEXT: movl (%rdi), %eax
3607 ; KNL_32-LABEL: v1_gather:
3609 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3610 ; KNL_32-NEXT: movl (%eax), %eax
3613 ; SKX-LABEL: v1_gather:
3615 ; SKX-NEXT: movl (%rdi), %eax
3618 ; SKX_32-LABEL: v1_gather:
3620 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3621 ; SKX_32-NEXT: movl (%eax), %eax
3623 %res = call <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*> %ptr, i32 4, <1 x i1> <i1 true>, <1 x i32> %src0)
3626 declare <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*>, i32, <1 x i1>, <1 x i32>)
3628 ; Make sure we don't crash when the index element type is larger than i64 and we need to widen the result
3629 ; This experienced a bad interaction when we widened and then tried to split.
3630 define <2 x float> @large_index(float* %base, <2 x i128> %ind, <2 x i1> %mask, <2 x float> %src0) {
3631 ; KNL_64-LABEL: large_index:
3633 ; KNL_64-NEXT: vpsllq $63, %xmm0, %xmm0
3634 ; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k0
3635 ; KNL_64-NEXT: vmovq %rcx, %xmm0
3636 ; KNL_64-NEXT: vmovq %rsi, %xmm2
3637 ; KNL_64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
3638 ; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0
3639 ; KNL_64-NEXT: vmovq %rdi, %xmm2
3640 ; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2
3641 ; KNL_64-NEXT: vpaddq %xmm0, %xmm2, %xmm0
3642 ; KNL_64-NEXT: kmovw %k0, %eax
3643 ; KNL_64-NEXT: testb $1, %al
3644 ; KNL_64-NEXT: jne .LBB47_1
3645 ; KNL_64-NEXT: # %bb.2: # %else
3646 ; KNL_64-NEXT: testb $2, %al
3647 ; KNL_64-NEXT: jne .LBB47_3
3648 ; KNL_64-NEXT: .LBB47_4: # %else2
3649 ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
3650 ; KNL_64-NEXT: vzeroupper
3652 ; KNL_64-NEXT: .LBB47_1: # %cond.load
3653 ; KNL_64-NEXT: vmovq %xmm0, %rcx
3654 ; KNL_64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
3655 ; KNL_64-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
3656 ; KNL_64-NEXT: testb $2, %al
3657 ; KNL_64-NEXT: je .LBB47_4
3658 ; KNL_64-NEXT: .LBB47_3: # %cond.load1
3659 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
3660 ; KNL_64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
3661 ; KNL_64-NEXT: vmovaps %xmm1, %xmm0
3662 ; KNL_64-NEXT: vzeroupper
3665 ; KNL_32-LABEL: large_index:
3667 ; KNL_32-NEXT: vpsllq $63, %xmm0, %xmm0
3668 ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0
3669 ; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3670 ; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
3671 ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
3672 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2
3673 ; KNL_32-NEXT: vpaddd %xmm0, %xmm2, %xmm0
3674 ; KNL_32-NEXT: kmovw %k0, %eax
3675 ; KNL_32-NEXT: testb $1, %al
3676 ; KNL_32-NEXT: jne .LBB47_1
3677 ; KNL_32-NEXT: # %bb.2: # %else
3678 ; KNL_32-NEXT: testb $2, %al
3679 ; KNL_32-NEXT: jne .LBB47_3
3680 ; KNL_32-NEXT: .LBB47_4: # %else2
3681 ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
3682 ; KNL_32-NEXT: vzeroupper
3684 ; KNL_32-NEXT: .LBB47_1: # %cond.load
3685 ; KNL_32-NEXT: vmovd %xmm0, %ecx
3686 ; KNL_32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
3687 ; KNL_32-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
3688 ; KNL_32-NEXT: testb $2, %al
3689 ; KNL_32-NEXT: je .LBB47_4
3690 ; KNL_32-NEXT: .LBB47_3: # %cond.load1
3691 ; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
3692 ; KNL_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
3693 ; KNL_32-NEXT: vmovaps %xmm1, %xmm0
3694 ; KNL_32-NEXT: vzeroupper
3697 ; SKX-LABEL: large_index:
3699 ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
3700 ; SKX-NEXT: vpmovq2m %xmm0, %k0
3701 ; SKX-NEXT: vmovq %rcx, %xmm0
3702 ; SKX-NEXT: vmovq %rsi, %xmm2
3703 ; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
3704 ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
3705 ; SKX-NEXT: vpbroadcastq %rdi, %xmm2
3706 ; SKX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
3707 ; SKX-NEXT: kmovw %k0, %eax
3708 ; SKX-NEXT: testb $1, %al
3709 ; SKX-NEXT: jne .LBB47_1
3710 ; SKX-NEXT: # %bb.2: # %else
3711 ; SKX-NEXT: testb $2, %al
3712 ; SKX-NEXT: jne .LBB47_3
3713 ; SKX-NEXT: .LBB47_4: # %else2
3714 ; SKX-NEXT: vmovdqa %xmm1, %xmm0
3716 ; SKX-NEXT: .LBB47_1: # %cond.load
3717 ; SKX-NEXT: vmovq %xmm0, %rcx
3718 ; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
3719 ; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
3720 ; SKX-NEXT: testb $2, %al
3721 ; SKX-NEXT: je .LBB47_4
3722 ; SKX-NEXT: .LBB47_3: # %cond.load1
3723 ; SKX-NEXT: vpextrq $1, %xmm0, %rax
3724 ; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
3725 ; SKX-NEXT: vmovaps %xmm1, %xmm0
3728 ; SKX_32-LABEL: large_index:
3730 ; SKX_32-NEXT: vpsllq $63, %xmm0, %xmm0
3731 ; SKX_32-NEXT: vpmovq2m %xmm0, %k0
3732 ; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3733 ; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
3734 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
3735 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
3736 ; SKX_32-NEXT: kmovw %k0, %eax
3737 ; SKX_32-NEXT: testb $1, %al
3738 ; SKX_32-NEXT: jne .LBB47_1
3739 ; SKX_32-NEXT: # %bb.2: # %else
3740 ; SKX_32-NEXT: testb $2, %al
3741 ; SKX_32-NEXT: jne .LBB47_3
3742 ; SKX_32-NEXT: .LBB47_4: # %else2
3743 ; SKX_32-NEXT: vmovaps %xmm1, %xmm0
3745 ; SKX_32-NEXT: .LBB47_1: # %cond.load
3746 ; SKX_32-NEXT: vmovd %xmm0, %ecx
3747 ; SKX_32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
3748 ; SKX_32-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
3749 ; SKX_32-NEXT: testb $2, %al
3750 ; SKX_32-NEXT: je .LBB47_4
3751 ; SKX_32-NEXT: .LBB47_3: # %cond.load1
3752 ; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
3753 ; SKX_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
3754 ; SKX_32-NEXT: vmovaps %xmm1, %xmm0
3756 %gep.random = getelementptr float, float* %base, <2 x i128> %ind
3757 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
3761 ; Make sure we allow index to be sign extended from a smaller than i32 element size.
3762 define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {
3763 ; KNL_64-LABEL: sext_i8_index:
3765 ; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm1
3766 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
3767 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
3770 ; KNL_32-LABEL: sext_i8_index:
3772 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3773 ; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm1
3774 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
3775 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
3778 ; SKX-LABEL: sext_i8_index:
3780 ; SKX-NEXT: vpmovsxbd %xmm0, %zmm1
3781 ; SKX-NEXT: kxnorw %k0, %k0, %k1
3782 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
3785 ; SKX_32-LABEL: sext_i8_index:
3787 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3788 ; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm1
3789 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
3790 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
3793 %sext_ind = sext <16 x i8> %ind to <16 x i64>
3794 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
3796 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
3797 ret <16 x float>%res
3800 ; Make sure we allow index to be sign extended from a smaller than i32 element size.
3801 define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
3802 ; KNL_64-LABEL: sext_v8i8_index:
3804 ; KNL_64-NEXT: vpmovsxbd %xmm0, %ymm1
3805 ; KNL_64-NEXT: movw $255, %ax
3806 ; KNL_64-NEXT: kmovw %eax, %k1
3807 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
3808 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3811 ; KNL_32-LABEL: sext_v8i8_index:
3813 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3814 ; KNL_32-NEXT: vpmovsxbd %xmm0, %ymm1
3815 ; KNL_32-NEXT: movw $255, %cx
3816 ; KNL_32-NEXT: kmovw %ecx, %k1
3817 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
3818 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3821 ; SKX-LABEL: sext_v8i8_index:
3823 ; SKX-NEXT: vpmovsxbd %xmm0, %ymm1
3824 ; SKX-NEXT: kxnorw %k0, %k0, %k1
3825 ; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
3828 ; SKX_32-LABEL: sext_v8i8_index:
3830 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3831 ; SKX_32-NEXT: vpmovsxbd %xmm0, %ymm1
3832 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
3833 ; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
3836 %sext_ind = sext <8 x i8> %ind to <8 x i64>
3837 %gep.random = getelementptr float, float *%base, <8 x i64> %sext_ind
3839 %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %gep.random, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
3842 declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)
3844 ; Make sure we also allow index to be zero extended from a smaller than i32 element size.
3845 define <16 x float> @zext_i8_index(float* %base, <16 x i8> %ind) {
3846 ; KNL_64-LABEL: zext_i8_index:
3848 ; KNL_64-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
3849 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
3850 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
3853 ; KNL_32-LABEL: zext_i8_index:
3855 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3856 ; KNL_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
3857 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
3858 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
3861 ; SKX-LABEL: zext_i8_index:
3863 ; SKX-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
3864 ; SKX-NEXT: kxnorw %k0, %k0, %k1
3865 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
3868 ; SKX_32-LABEL: zext_i8_index:
3870 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3871 ; SKX_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
3872 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
3873 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
3876 %zext_ind = zext <16 x i8> %ind to <16 x i64>
3877 %gep.random = getelementptr float, float *%base, <16 x i64> %zext_ind
3879 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
3880 ret <16 x float>%res
3883 ; Make sure we also allow index to be zero extended from a smaller than i32 element size.
3884 define <8 x float> @zext_v8i8_index(float* %base, <8 x i8> %ind) {
3885 ; KNL_64-LABEL: zext_v8i8_index:
3887 ; KNL_64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
3888 ; KNL_64-NEXT: movw $255, %ax
3889 ; KNL_64-NEXT: kmovw %eax, %k1
3890 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
3891 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3894 ; KNL_32-LABEL: zext_v8i8_index:
3896 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3897 ; KNL_32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
3898 ; KNL_32-NEXT: movw $255, %cx
3899 ; KNL_32-NEXT: kmovw %ecx, %k1
3900 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
3901 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3904 ; SKX-LABEL: zext_v8i8_index:
3906 ; SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
3907 ; SKX-NEXT: kxnorw %k0, %k0, %k1
3908 ; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
3911 ; SKX_32-LABEL: zext_v8i8_index:
3913 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3914 ; SKX_32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
3915 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
3916 ; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
3919 %zext_ind = zext <8 x i8> %ind to <8 x i64>
3920 %gep.random = getelementptr float, float *%base, <8 x i64> %zext_ind
3922 %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %gep.random, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
3926 ; Index requires promotion
3927 define void @test_scatter_2i32_index(<2 x double> %a1, double* %base, <2 x i32> %ind, <2 x i1> %mask) {
3928 ; KNL_64-LABEL: test_scatter_2i32_index:
3930 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
3931 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
3932 ; KNL_64-NEXT: vpmovsxdq %xmm1, %xmm1
3933 ; KNL_64-NEXT: vpsllq $3, %xmm1, %xmm1
3934 ; KNL_64-NEXT: vmovq %rdi, %xmm2
3935 ; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2
3936 ; KNL_64-NEXT: vpaddq %xmm1, %xmm2, %xmm1
3937 ; KNL_64-NEXT: kmovw %k0, %eax
3938 ; KNL_64-NEXT: testb $1, %al
3939 ; KNL_64-NEXT: jne .LBB52_1
3940 ; KNL_64-NEXT: # %bb.2: # %else
3941 ; KNL_64-NEXT: testb $2, %al
3942 ; KNL_64-NEXT: jne .LBB52_3
3943 ; KNL_64-NEXT: .LBB52_4: # %else2
3944 ; KNL_64-NEXT: vzeroupper
3946 ; KNL_64-NEXT: .LBB52_1: # %cond.store
3947 ; KNL_64-NEXT: vmovq %xmm1, %rcx
3948 ; KNL_64-NEXT: vmovlps %xmm0, (%rcx)
3949 ; KNL_64-NEXT: testb $2, %al
3950 ; KNL_64-NEXT: je .LBB52_4
3951 ; KNL_64-NEXT: .LBB52_3: # %cond.store1
3952 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
3953 ; KNL_64-NEXT: vmovhps %xmm0, (%rax)
3954 ; KNL_64-NEXT: vzeroupper
3957 ; KNL_32-LABEL: test_scatter_2i32_index:
3959 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
3960 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
3961 ; KNL_32-NEXT: vpslld $3, %xmm1, %xmm1
3962 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2
3963 ; KNL_32-NEXT: vpaddd %xmm1, %xmm2, %xmm1
3964 ; KNL_32-NEXT: kmovw %k0, %eax
3965 ; KNL_32-NEXT: testb $1, %al
3966 ; KNL_32-NEXT: jne .LBB52_1
3967 ; KNL_32-NEXT: # %bb.2: # %else
3968 ; KNL_32-NEXT: testb $2, %al
3969 ; KNL_32-NEXT: jne .LBB52_3
3970 ; KNL_32-NEXT: .LBB52_4: # %else2
3971 ; KNL_32-NEXT: vzeroupper
3973 ; KNL_32-NEXT: .LBB52_1: # %cond.store
3974 ; KNL_32-NEXT: vmovd %xmm1, %ecx
3975 ; KNL_32-NEXT: vmovlps %xmm0, (%ecx)
3976 ; KNL_32-NEXT: testb $2, %al
3977 ; KNL_32-NEXT: je .LBB52_4
3978 ; KNL_32-NEXT: .LBB52_3: # %cond.store1
3979 ; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
3980 ; KNL_32-NEXT: vmovhps %xmm0, (%eax)
3981 ; KNL_32-NEXT: vzeroupper
3984 ; SKX-LABEL: test_scatter_2i32_index:
3986 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
3987 ; SKX-NEXT: vpmovq2m %xmm2, %k0
3988 ; SKX-NEXT: vpbroadcastq %rdi, %xmm2
3989 ; SKX-NEXT: vpmovsxdq %xmm1, %xmm1
3990 ; SKX-NEXT: vpsllq $3, %xmm1, %xmm1
3991 ; SKX-NEXT: vpaddq %xmm1, %xmm2, %xmm1
3992 ; SKX-NEXT: kmovw %k0, %eax
3993 ; SKX-NEXT: testb $1, %al
3994 ; SKX-NEXT: jne .LBB52_1
3995 ; SKX-NEXT: # %bb.2: # %else
3996 ; SKX-NEXT: testb $2, %al
3997 ; SKX-NEXT: jne .LBB52_3
3998 ; SKX-NEXT: .LBB52_4: # %else2
4000 ; SKX-NEXT: .LBB52_1: # %cond.store
4001 ; SKX-NEXT: vmovq %xmm1, %rcx
4002 ; SKX-NEXT: vmovlps %xmm0, (%rcx)
4003 ; SKX-NEXT: testb $2, %al
4004 ; SKX-NEXT: je .LBB52_4
4005 ; SKX-NEXT: .LBB52_3: # %cond.store1
4006 ; SKX-NEXT: vpextrq $1, %xmm1, %rax
4007 ; SKX-NEXT: vmovhps %xmm0, (%rax)
4010 ; SKX_32-LABEL: test_scatter_2i32_index:
4012 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
4013 ; SKX_32-NEXT: vpmovq2m %xmm2, %k0
4014 ; SKX_32-NEXT: vpslld $3, %xmm1, %xmm1
4015 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm1, %xmm1
4016 ; SKX_32-NEXT: kmovw %k0, %eax
4017 ; SKX_32-NEXT: testb $1, %al
4018 ; SKX_32-NEXT: jne .LBB52_1
4019 ; SKX_32-NEXT: # %bb.2: # %else
4020 ; SKX_32-NEXT: testb $2, %al
4021 ; SKX_32-NEXT: jne .LBB52_3
4022 ; SKX_32-NEXT: .LBB52_4: # %else2
4024 ; SKX_32-NEXT: .LBB52_1: # %cond.store
4025 ; SKX_32-NEXT: vmovd %xmm1, %ecx
4026 ; SKX_32-NEXT: vmovlps %xmm0, (%ecx)
4027 ; SKX_32-NEXT: testb $2, %al
4028 ; SKX_32-NEXT: je .LBB52_4
4029 ; SKX_32-NEXT: .LBB52_3: # %cond.store1
4030 ; SKX_32-NEXT: vpextrd $1, %xmm1, %eax
4031 ; SKX_32-NEXT: vmovhps %xmm0, (%eax)
4033 %gep = getelementptr double, double *%base, <2 x i32> %ind
4034 call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask)
4037 declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double>, <2 x double*>, i32, <2 x i1>)
4039 define <16 x float> @zext_index(float* %base, <16 x i32> %ind) {
4040 ; KNL_64-LABEL: zext_index:
4042 ; KNL_64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1
4043 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
4044 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
4047 ; KNL_32-LABEL: zext_index:
4049 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
4050 ; KNL_32-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm1
4051 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
4052 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
4055 ; SKX_SMALL-LABEL: zext_index:
4056 ; SKX_SMALL: # %bb.0:
4057 ; SKX_SMALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1
4058 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
4059 ; SKX_SMALL-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
4060 ; SKX_SMALL-NEXT: retq
4062 ; SKX_LARGE-LABEL: zext_index:
4063 ; SKX_LARGE: # %bb.0:
4064 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
4065 ; SKX_LARGE-NEXT: vandps (%rax){1to16}, %zmm0, %zmm1
4066 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
4067 ; SKX_LARGE-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
4068 ; SKX_LARGE-NEXT: retq
4070 ; SKX_32-LABEL: zext_index:
4072 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
4073 ; SKX_32-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm1
4074 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
4075 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
4077 %ind_masked = and <16 x i32> %ind, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
4078 %sext_ind = zext <16 x i32> %ind_masked to <16 x i64>
4079 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
4081 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
4082 ret <16 x float>%res
4085 define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %passthru) {
4086 ; KNL_64-LABEL: test_gather_setcc_split:
4088 ; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm4
4089 ; KNL_64-NEXT: vptestnmd %zmm4, %zmm4, %k1
4090 ; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k2
4091 ; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
4092 ; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm0
4093 ; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm3 {%k1}
4094 ; KNL_64-NEXT: vmovapd %zmm2, %zmm0
4095 ; KNL_64-NEXT: vmovapd %zmm3, %zmm1
4098 ; KNL_32-LABEL: test_gather_setcc_split:
4100 ; KNL_32-NEXT: pushl %ebp
4101 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
4102 ; KNL_32-NEXT: .cfi_offset %ebp, -8
4103 ; KNL_32-NEXT: movl %esp, %ebp
4104 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
4105 ; KNL_32-NEXT: andl $-64, %esp
4106 ; KNL_32-NEXT: subl $64, %esp
4107 ; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3
4108 ; KNL_32-NEXT: movl 8(%ebp), %eax
4109 ; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
4110 ; KNL_32-NEXT: vptestnmd %zmm4, %zmm4, %k1
4111 ; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2
4112 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
4113 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
4114 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm3 {%k1}
4115 ; KNL_32-NEXT: vmovapd %zmm2, %zmm0
4116 ; KNL_32-NEXT: vmovapd %zmm3, %zmm1
4117 ; KNL_32-NEXT: movl %ebp, %esp
4118 ; KNL_32-NEXT: popl %ebp
4119 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
4122 ; SKX-LABEL: test_gather_setcc_split:
4124 ; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm4
4125 ; SKX-NEXT: vptestnmd %ymm4, %ymm4, %k1
4126 ; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2
4127 ; SKX-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
4128 ; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0
4129 ; SKX-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm3 {%k1}
4130 ; SKX-NEXT: vmovapd %zmm2, %zmm0
4131 ; SKX-NEXT: vmovapd %zmm3, %zmm1
4134 ; SKX_32-LABEL: test_gather_setcc_split:
4136 ; SKX_32-NEXT: pushl %ebp
4137 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
4138 ; SKX_32-NEXT: .cfi_offset %ebp, -8
4139 ; SKX_32-NEXT: movl %esp, %ebp
4140 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
4141 ; SKX_32-NEXT: andl $-64, %esp
4142 ; SKX_32-NEXT: subl $64, %esp
4143 ; SKX_32-NEXT: vmovapd 72(%ebp), %zmm3
4144 ; SKX_32-NEXT: movl 8(%ebp), %eax
4145 ; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
4146 ; SKX_32-NEXT: vptestnmd %ymm4, %ymm4, %k1
4147 ; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2
4148 ; SKX_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
4149 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
4150 ; SKX_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm3 {%k1}
4151 ; SKX_32-NEXT: vmovapd %zmm2, %zmm0
4152 ; SKX_32-NEXT: vmovapd %zmm3, %zmm1
4153 ; SKX_32-NEXT: movl %ebp, %esp
4154 ; SKX_32-NEXT: popl %ebp
4155 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
4157 %sext_ind = sext <16 x i32> %ind to <16 x i64>
4158 %gep.random = getelementptr double, double *%base, <16 x i64> %sext_ind
4160 %mask = icmp eq <16 x i32> %cmp, zeroinitializer
4161 %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %gep.random, i32 4, <16 x i1> %mask, <16 x double> %passthru)
4162 ret <16 x double>%res
4165 define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %src0) {
4166 ; KNL_64-LABEL: test_scatter_setcc_split:
4168 ; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm4
4169 ; KNL_64-NEXT: vptestnmd %zmm4, %zmm4, %k1
4170 ; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k2
4171 ; KNL_64-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
4172 ; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm0
4173 ; KNL_64-NEXT: vscatterdpd %zmm3, (%rdi,%ymm0,8) {%k1}
4174 ; KNL_64-NEXT: vzeroupper
4177 ; KNL_32-LABEL: test_scatter_setcc_split:
4179 ; KNL_32-NEXT: pushl %ebp
4180 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
4181 ; KNL_32-NEXT: .cfi_offset %ebp, -8
4182 ; KNL_32-NEXT: movl %esp, %ebp
4183 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
4184 ; KNL_32-NEXT: andl $-64, %esp
4185 ; KNL_32-NEXT: subl $64, %esp
4186 ; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3
4187 ; KNL_32-NEXT: movl 8(%ebp), %eax
4188 ; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
4189 ; KNL_32-NEXT: vptestnmd %zmm4, %zmm4, %k1
4190 ; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2
4191 ; KNL_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
4192 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
4193 ; KNL_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
4194 ; KNL_32-NEXT: movl %ebp, %esp
4195 ; KNL_32-NEXT: popl %ebp
4196 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
4197 ; KNL_32-NEXT: vzeroupper
4200 ; SKX-LABEL: test_scatter_setcc_split:
4202 ; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm4
4203 ; SKX-NEXT: vptestnmd %ymm4, %ymm4, %k1
4204 ; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2
4205 ; SKX-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
4206 ; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0
4207 ; SKX-NEXT: vscatterdpd %zmm3, (%rdi,%ymm0,8) {%k1}
4208 ; SKX-NEXT: vzeroupper
4211 ; SKX_32-LABEL: test_scatter_setcc_split:
4213 ; SKX_32-NEXT: pushl %ebp
4214 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
4215 ; SKX_32-NEXT: .cfi_offset %ebp, -8
4216 ; SKX_32-NEXT: movl %esp, %ebp
4217 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
4218 ; SKX_32-NEXT: andl $-64, %esp
4219 ; SKX_32-NEXT: subl $64, %esp
4220 ; SKX_32-NEXT: vmovapd 72(%ebp), %zmm3
4221 ; SKX_32-NEXT: movl 8(%ebp), %eax
4222 ; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
4223 ; SKX_32-NEXT: vptestnmd %ymm4, %ymm4, %k1
4224 ; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2
4225 ; SKX_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
4226 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
4227 ; SKX_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
4228 ; SKX_32-NEXT: movl %ebp, %esp
4229 ; SKX_32-NEXT: popl %ebp
4230 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
4231 ; SKX_32-NEXT: vzeroupper
4233 %sext_ind = sext <16 x i32> %ind to <16 x i64>
4234 %gep.random = getelementptr double, double *%base, <16 x i64> %sext_ind
4236 %mask = icmp eq <16 x i32> %cmp, zeroinitializer
4237 call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %gep.random, i32 4, <16 x i1> %mask)
4241 ; This test case previously triggered an infinite loop when the two gathers became identical after DAG combine removed the sign extend.
; The asm checks verify that only ONE vgatherdps is emitted and its result is
; added to itself (vaddps %zmm1, %zmm1) — i.e. the two IR gathers were CSE'd.
define <16 x float> @test_sext_cse(float* %base, <16 x i32> %ind, <16 x i32>* %foo) {
; KNL_64-LABEL: test_sext_cse:
; KNL_64-NEXT: vmovaps %zmm0, (%rsi)
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vaddps %zmm1, %zmm1, %zmm0
; KNL_32-LABEL: test_sext_cse:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; KNL_32-NEXT: vmovaps %zmm0, (%ecx)
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vaddps %zmm1, %zmm1, %zmm0
; SKX-LABEL: test_sext_cse:
; SKX-NEXT: vmovaps %zmm0, (%rsi)
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vaddps %zmm1, %zmm1, %zmm0
; SKX_32-LABEL: test_sext_cse:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; SKX_32-NEXT: vmovaps %zmm0, (%ecx)
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT: vaddps %zmm1, %zmm1, %zmm0
; One gather uses sign-extended i64 indices, the other the raw i32 indices;
; after DAG combine drops the sext they become identical.
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
store <16 x i32> %ind, <16 x i32>* %foo
%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
%gep.random2 = getelementptr float, <16 x float*> %broadcast.splat, <16 x i32> %ind
%res2 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random2, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
%res3 = fadd <16 x float> %res2, %res
ret <16 x float>%res3
; A scatter with a constant all-zero mask is dead: every configuration must
; fold it away and emit nothing but the return.
define void @zero_mask(<2 x double>%a1, <2 x double*> %ptr) {
; ALL-LABEL: zero_mask:
; ALL-NEXT: ret{{[l|q]}}
call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %ptr, i32 4, <2 x i1> zeroinitializer)
; A 2-element gather from constant indices <0, -2> is too narrow for a
; hardware gather: the addresses are built with a vector add of the constant
; index offsets, then each lane is conditionally loaded based on its mask bit.
define <2 x i64> @gather_2i64_constant_indices(i64* %ptr, <2 x i1> %mask) {
; KNL_64-LABEL: gather_2i64_constant_indices:
; KNL_64-NEXT: vpsllq $63, %xmm0, %xmm0
; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL_64-NEXT: vmovq %rdi, %xmm0
; KNL_64-NEXT: vpbroadcastq %xmm0, %xmm0
; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; KNL_64-NEXT: kmovw %k0, %eax
; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_64-NEXT: testb $1, %al
; KNL_64-NEXT: jne .LBB58_1
; KNL_64-NEXT: # %bb.2: # %else
; KNL_64-NEXT: testb $2, %al
; KNL_64-NEXT: jne .LBB58_3
; KNL_64-NEXT: .LBB58_4: # %else2
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: .LBB58_1: # %cond.load
; KNL_64-NEXT: vmovq %xmm1, %rcx
; KNL_64-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; KNL_64-NEXT: testb $2, %al
; KNL_64-NEXT: je .LBB58_4
; KNL_64-NEXT: .LBB58_3: # %cond.load1
; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_32-LABEL: gather_2i64_constant_indices:
; KNL_32-NEXT: vpsllq $63, %xmm0, %xmm0
; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
; KNL_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
; KNL_32-NEXT: kmovw %k0, %eax
; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT: testb $1, %al
; KNL_32-NEXT: jne .LBB58_1
; KNL_32-NEXT: # %bb.2: # %else
; KNL_32-NEXT: testb $2, %al
; KNL_32-NEXT: jne .LBB58_3
; KNL_32-NEXT: .LBB58_4: # %else2
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: .LBB58_1: # %cond.load
; KNL_32-NEXT: vmovd %xmm1, %ecx
; KNL_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; KNL_32-NEXT: testb $2, %al
; KNL_32-NEXT: je .LBB58_4
; KNL_32-NEXT: .LBB58_3: # %cond.load1
; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
; KNL_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0
; KNL_32-NEXT: vzeroupper
; SKX_SMALL-LABEL: gather_2i64_constant_indices:
; SKX_SMALL: # %bb.0:
; SKX_SMALL-NEXT: vpsllq $63, %xmm0, %xmm0
; SKX_SMALL-NEXT: vpmovq2m %xmm0, %k0
; SKX_SMALL-NEXT: vpbroadcastq %rdi, %xmm0
; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; SKX_SMALL-NEXT: kmovw %k0, %eax
; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_SMALL-NEXT: testb $1, %al
; SKX_SMALL-NEXT: jne .LBB58_1
; SKX_SMALL-NEXT: # %bb.2: # %else
; SKX_SMALL-NEXT: testb $2, %al
; SKX_SMALL-NEXT: jne .LBB58_3
; SKX_SMALL-NEXT: .LBB58_4: # %else2
; SKX_SMALL-NEXT: retq
; SKX_SMALL-NEXT: .LBB58_1: # %cond.load
; SKX_SMALL-NEXT: vmovq %xmm1, %rcx
; SKX_SMALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; SKX_SMALL-NEXT: testb $2, %al
; SKX_SMALL-NEXT: je .LBB58_4
; SKX_SMALL-NEXT: .LBB58_3: # %cond.load1
; SKX_SMALL-NEXT: vpextrq $1, %xmm1, %rax
; SKX_SMALL-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0
; SKX_SMALL-NEXT: retq
; SKX_LARGE-LABEL: gather_2i64_constant_indices:
; SKX_LARGE: # %bb.0:
; SKX_LARGE-NEXT: vpsllq $63, %xmm0, %xmm0
; SKX_LARGE-NEXT: vpmovq2m %xmm0, %k0
; SKX_LARGE-NEXT: vpbroadcastq %rdi, %xmm0
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vpaddq (%rax), %xmm0, %xmm1
; SKX_LARGE-NEXT: kmovw %k0, %eax
; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_LARGE-NEXT: testb $1, %al
; SKX_LARGE-NEXT: jne .LBB58_1
; SKX_LARGE-NEXT: # %bb.2: # %else
; SKX_LARGE-NEXT: testb $2, %al
; SKX_LARGE-NEXT: jne .LBB58_3
; SKX_LARGE-NEXT: .LBB58_4: # %else2
; SKX_LARGE-NEXT: retq
; SKX_LARGE-NEXT: .LBB58_1: # %cond.load
; SKX_LARGE-NEXT: vmovq %xmm1, %rcx
; SKX_LARGE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; SKX_LARGE-NEXT: testb $2, %al
; SKX_LARGE-NEXT: je .LBB58_4
; SKX_LARGE-NEXT: .LBB58_3: # %cond.load1
; SKX_LARGE-NEXT: vpextrq $1, %xmm1, %rax
; SKX_LARGE-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0
; SKX_LARGE-NEXT: retq
; SKX_32-LABEL: gather_2i64_constant_indices:
; SKX_32-NEXT: vpsllq $63, %xmm0, %xmm0
; SKX_32-NEXT: vpmovq2m %xmm0, %k0
; SKX_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
; SKX_32-NEXT: kmovw %k0, %eax
; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_32-NEXT: testb $1, %al
; SKX_32-NEXT: jne .LBB58_1
; SKX_32-NEXT: # %bb.2: # %else
; SKX_32-NEXT: testb $2, %al
; SKX_32-NEXT: jne .LBB58_3
; SKX_32-NEXT: .LBB58_4: # %else2
; SKX_32-NEXT: .LBB58_1: # %cond.load
; SKX_32-NEXT: vmovd %xmm1, %ecx
; SKX_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; SKX_32-NEXT: testb $2, %al
; SKX_32-NEXT: je .LBB58_4
; SKX_32-NEXT: .LBB58_3: # %cond.load1
; SKX_32-NEXT: vpextrd $1, %xmm1, %eax
; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
; SKX_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0
%gep = getelementptr i64, i64* %ptr, <2 x i64> <i64 0, i64 -2>
%res = tail call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep, i32 8, <2 x i1> %mask, <2 x i64> zeroinitializer) #1
; 16 constant i64 indices: the backend narrows them to a 32-bit index vector
; from the constant pool and emits a single masked vpgatherdd. In the large
; code model the constant-pool address is materialized via movabsq first.
define <16 x i32> @gather_16i64_constant_indices(i32* %ptr, <16 x i1> %mask) {
; KNL_64-LABEL: gather_16i64_constant_indices:
; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL_64-NEXT: vpslld $31, %zmm0, %zmm0
; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm1,4), %zmm0 {%k1}
; KNL_32-LABEL: gather_16i64_constant_indices:
; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL_32-NEXT: vpslld $31, %zmm0, %zmm0
; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT: vpgatherdd (%eax,%zmm1,4), %zmm0 {%k1}
; SKX_SMALL-LABEL: gather_16i64_constant_indices:
; SKX_SMALL: # %bb.0:
; SKX_SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; SKX_SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; SKX_SMALL-NEXT: vpmovd2m %zmm0, %k1
; SKX_SMALL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_SMALL-NEXT: vpgatherdd (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX_SMALL-NEXT: retq
; SKX_LARGE-LABEL: gather_16i64_constant_indices:
; SKX_LARGE: # %bb.0:
; SKX_LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; SKX_LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; SKX_LARGE-NEXT: vpmovd2m %zmm0, %k1
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vmovdqa64 (%rax), %zmm1
; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_LARGE-NEXT: vpgatherdd (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX_LARGE-NEXT: retq
; SKX_32-LABEL: gather_16i64_constant_indices:
; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm0
; SKX_32-NEXT: vpslld $31, %zmm0, %zmm0
; SKX_32-NEXT: vpmovd2m %zmm0, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_32-NEXT: vpgatherdd (%eax,%zmm1,4), %zmm0 {%k1}
%gep = getelementptr i32, i32* %ptr, <16 x i64> <i64 0, i64 -2, i64 1, i64 -8, i64 10, i64 20, i64 50, i64 65536, i64 16777215, i64 2147483647, i64 100, i64 -2000, i64 -2147483648, i64 76897723, i64 7, i64 -67897687>
%res = tail call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep, i32 4, <16 x i1> %mask, <16 x i32> zeroinitializer) #1
; Scatter counterpart of gather_2i64_constant_indices: 2 lanes with constant
; indices <0, -2> are scalarized into mask-bit tests plus conditional stores
; (vmovss for lane 0, vextractps for lane 1).
define void @scatter_2i64_constant_indices(i32* %ptr, <2 x i1> %mask, <2 x i32> %src0) {
; KNL_64-LABEL: scatter_2i64_constant_indices:
; KNL_64-NEXT: vpsllq $63, %xmm0, %xmm0
; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL_64-NEXT: vmovq %rdi, %xmm0
; KNL_64-NEXT: vpbroadcastq %xmm0, %xmm0
; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; KNL_64-NEXT: kmovw %k0, %eax
; KNL_64-NEXT: testb $1, %al
; KNL_64-NEXT: jne .LBB60_1
; KNL_64-NEXT: # %bb.2: # %else
; KNL_64-NEXT: testb $2, %al
; KNL_64-NEXT: jne .LBB60_3
; KNL_64-NEXT: .LBB60_4: # %else2
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: .LBB60_1: # %cond.store
; KNL_64-NEXT: vmovq %xmm0, %rcx
; KNL_64-NEXT: vmovss %xmm1, (%rcx)
; KNL_64-NEXT: testb $2, %al
; KNL_64-NEXT: je .LBB60_4
; KNL_64-NEXT: .LBB60_3: # %cond.store1
; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
; KNL_64-NEXT: vextractps $1, %xmm1, (%rax)
; KNL_64-NEXT: vzeroupper
; KNL_32-LABEL: scatter_2i64_constant_indices:
; KNL_32-NEXT: vpsllq $63, %xmm0, %xmm0
; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
; KNL_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; KNL_32-NEXT: kmovw %k0, %eax
; KNL_32-NEXT: testb $1, %al
; KNL_32-NEXT: jne .LBB60_1
; KNL_32-NEXT: # %bb.2: # %else
; KNL_32-NEXT: testb $2, %al
; KNL_32-NEXT: jne .LBB60_3
; KNL_32-NEXT: .LBB60_4: # %else2
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: .LBB60_1: # %cond.store
; KNL_32-NEXT: vmovd %xmm0, %ecx
; KNL_32-NEXT: vmovss %xmm1, (%ecx)
; KNL_32-NEXT: testb $2, %al
; KNL_32-NEXT: je .LBB60_4
; KNL_32-NEXT: .LBB60_3: # %cond.store1
; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
; KNL_32-NEXT: vextractps $1, %xmm1, (%eax)
; KNL_32-NEXT: vzeroupper
; SKX_SMALL-LABEL: scatter_2i64_constant_indices:
; SKX_SMALL: # %bb.0:
; SKX_SMALL-NEXT: vpsllq $63, %xmm0, %xmm0
; SKX_SMALL-NEXT: vpmovq2m %xmm0, %k0
; SKX_SMALL-NEXT: vpbroadcastq %rdi, %xmm0
; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; SKX_SMALL-NEXT: kmovw %k0, %eax
; SKX_SMALL-NEXT: testb $1, %al
; SKX_SMALL-NEXT: jne .LBB60_1
; SKX_SMALL-NEXT: # %bb.2: # %else
; SKX_SMALL-NEXT: testb $2, %al
; SKX_SMALL-NEXT: jne .LBB60_3
; SKX_SMALL-NEXT: .LBB60_4: # %else2
; SKX_SMALL-NEXT: retq
; SKX_SMALL-NEXT: .LBB60_1: # %cond.store
; SKX_SMALL-NEXT: vmovq %xmm0, %rcx
; SKX_SMALL-NEXT: vmovss %xmm1, (%rcx)
; SKX_SMALL-NEXT: testb $2, %al
; SKX_SMALL-NEXT: je .LBB60_4
; SKX_SMALL-NEXT: .LBB60_3: # %cond.store1
; SKX_SMALL-NEXT: vpextrq $1, %xmm0, %rax
; SKX_SMALL-NEXT: vextractps $1, %xmm1, (%rax)
; SKX_SMALL-NEXT: retq
; SKX_LARGE-LABEL: scatter_2i64_constant_indices:
; SKX_LARGE: # %bb.0:
; SKX_LARGE-NEXT: vpsllq $63, %xmm0, %xmm0
; SKX_LARGE-NEXT: vpmovq2m %xmm0, %k0
; SKX_LARGE-NEXT: vpbroadcastq %rdi, %xmm0
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vpaddq (%rax), %xmm0, %xmm0
; SKX_LARGE-NEXT: kmovw %k0, %eax
; SKX_LARGE-NEXT: testb $1, %al
; SKX_LARGE-NEXT: jne .LBB60_1
; SKX_LARGE-NEXT: # %bb.2: # %else
; SKX_LARGE-NEXT: testb $2, %al
; SKX_LARGE-NEXT: jne .LBB60_3
; SKX_LARGE-NEXT: .LBB60_4: # %else2
; SKX_LARGE-NEXT: retq
; SKX_LARGE-NEXT: .LBB60_1: # %cond.store
; SKX_LARGE-NEXT: vmovq %xmm0, %rcx
; SKX_LARGE-NEXT: vmovss %xmm1, (%rcx)
; SKX_LARGE-NEXT: testb $2, %al
; SKX_LARGE-NEXT: je .LBB60_4
; SKX_LARGE-NEXT: .LBB60_3: # %cond.store1
; SKX_LARGE-NEXT: vpextrq $1, %xmm0, %rax
; SKX_LARGE-NEXT: vextractps $1, %xmm1, (%rax)
; SKX_LARGE-NEXT: retq
; SKX_32-LABEL: scatter_2i64_constant_indices:
; SKX_32-NEXT: vpsllq $63, %xmm0, %xmm0
; SKX_32-NEXT: vpmovq2m %xmm0, %k0
; SKX_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; SKX_32-NEXT: kmovw %k0, %eax
; SKX_32-NEXT: testb $1, %al
; SKX_32-NEXT: jne .LBB60_1
; SKX_32-NEXT: # %bb.2: # %else
; SKX_32-NEXT: testb $2, %al
; SKX_32-NEXT: jne .LBB60_3
; SKX_32-NEXT: .LBB60_4: # %else2
; SKX_32-NEXT: .LBB60_1: # %cond.store
; SKX_32-NEXT: vmovd %xmm0, %ecx
; SKX_32-NEXT: vmovss %xmm1, (%ecx)
; SKX_32-NEXT: testb $2, %al
; SKX_32-NEXT: je .LBB60_4
; SKX_32-NEXT: .LBB60_3: # %cond.store1
; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
; SKX_32-NEXT: vextractps $1, %xmm1, (%eax)
%gep = getelementptr i32, i32* %ptr, <2 x i64> <i64 0, i64 -2>
call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %src0, <2 x i32*> %gep, i32 4, <2 x i1> %mask)
; 16-lane scatter with constant i64 indices: like the gather variant, the
; indices are narrowed to a 32-bit constant-pool vector and a single masked
; vpscatterdd is emitted.
define void @scatter_16i64_constant_indices(i32* %ptr, <16 x i1> %mask, <16 x i32> %src0) {
; KNL_64-LABEL: scatter_16i64_constant_indices:
; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL_64-NEXT: vpslld $31, %zmm0, %zmm0
; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; KNL_64-NEXT: vzeroupper
; KNL_32-LABEL: scatter_16i64_constant_indices:
; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL_32-NEXT: vpslld $31, %zmm0, %zmm0
; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
; KNL_32-NEXT: vzeroupper
; SKX_SMALL-LABEL: scatter_16i64_constant_indices:
; SKX_SMALL: # %bb.0:
; SKX_SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; SKX_SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; SKX_SMALL-NEXT: vpmovd2m %zmm0, %k1
; SKX_SMALL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; SKX_SMALL-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; SKX_SMALL-NEXT: vzeroupper
; SKX_SMALL-NEXT: retq
; SKX_LARGE-LABEL: scatter_16i64_constant_indices:
; SKX_LARGE: # %bb.0:
; SKX_LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; SKX_LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; SKX_LARGE-NEXT: vpmovd2m %zmm0, %k1
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vmovdqa64 (%rax), %zmm0
; SKX_LARGE-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; SKX_LARGE-NEXT: vzeroupper
; SKX_LARGE-NEXT: retq
; SKX_32-LABEL: scatter_16i64_constant_indices:
; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm0
; SKX_32-NEXT: vpslld $31, %zmm0, %zmm0
; SKX_32-NEXT: vpmovd2m %zmm0, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
; SKX_32-NEXT: vzeroupper
%gep = getelementptr i32, i32* %ptr, <16 x i64> <i64 0, i64 -2, i64 1, i64 -8, i64 10, i64 20, i64 50, i64 65536, i64 16777215, i64 2147483647, i64 100, i64 -2000, i64 -2147483648, i64 76897723, i64 7, i64 -67897687>
call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %src0, <16 x i32*> %gep, i32 4, <16 x i1> %mask)
; Gather through a splatted pointer (all four lanes load from the same base
; address). SKX folds the splat back into a vpgatherdd with a zero index
; vector; KNL lacks the 128-bit gather and scalarizes into conditional loads.
define <4 x i32> @splat_ptr_gather(i32* %ptr, <4 x i1> %mask, <4 x i32> %passthru) {
; KNL_64-LABEL: splat_ptr_gather:
; KNL_64-NEXT: vpslld $31, %xmm0, %xmm0
; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL_64-NEXT: vmovq %rdi, %xmm0
; KNL_64-NEXT: vpbroadcastq %xmm0, %ymm0
; KNL_64-NEXT: kmovw %k0, %eax
; KNL_64-NEXT: testb $1, %al
; KNL_64-NEXT: je .LBB62_2
; KNL_64-NEXT: # %bb.1: # %cond.load
; KNL_64-NEXT: vmovq %xmm0, %rcx
; KNL_64-NEXT: vpinsrd $0, (%rcx), %xmm1, %xmm1
; KNL_64-NEXT: .LBB62_2: # %else
; KNL_64-NEXT: testb $2, %al
; KNL_64-NEXT: je .LBB62_4
; KNL_64-NEXT: # %bb.3: # %cond.load1
; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm1
; KNL_64-NEXT: .LBB62_4: # %else2
; KNL_64-NEXT: testb $4, %al
; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL_64-NEXT: jne .LBB62_5
; KNL_64-NEXT: # %bb.6: # %else5
; KNL_64-NEXT: testb $8, %al
; KNL_64-NEXT: jne .LBB62_7
; KNL_64-NEXT: .LBB62_8: # %else8
; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: .LBB62_5: # %cond.load4
; KNL_64-NEXT: vmovq %xmm0, %rcx
; KNL_64-NEXT: vpinsrd $2, (%rcx), %xmm1, %xmm1
; KNL_64-NEXT: testb $8, %al
; KNL_64-NEXT: je .LBB62_8
; KNL_64-NEXT: .LBB62_7: # %cond.load7
; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
; KNL_64-NEXT: vpinsrd $3, (%rax), %xmm1, %xmm1
; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
; KNL_64-NEXT: vzeroupper
; KNL_32-LABEL: splat_ptr_gather:
; KNL_32-NEXT: vpslld $31, %xmm0, %xmm0
; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
; KNL_32-NEXT: kmovw %k0, %eax
; KNL_32-NEXT: testb $1, %al
; KNL_32-NEXT: jne .LBB62_1
; KNL_32-NEXT: # %bb.2: # %else
; KNL_32-NEXT: testb $2, %al
; KNL_32-NEXT: jne .LBB62_3
; KNL_32-NEXT: .LBB62_4: # %else2
; KNL_32-NEXT: testb $4, %al
; KNL_32-NEXT: jne .LBB62_5
; KNL_32-NEXT: .LBB62_6: # %else5
; KNL_32-NEXT: testb $8, %al
; KNL_32-NEXT: jne .LBB62_7
; KNL_32-NEXT: .LBB62_8: # %else8
; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: .LBB62_1: # %cond.load
; KNL_32-NEXT: vmovd %xmm0, %ecx
; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm1, %xmm1
; KNL_32-NEXT: testb $2, %al
; KNL_32-NEXT: je .LBB62_4
; KNL_32-NEXT: .LBB62_3: # %cond.load1
; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm1, %xmm1
; KNL_32-NEXT: testb $4, %al
; KNL_32-NEXT: je .LBB62_6
; KNL_32-NEXT: .LBB62_5: # %cond.load4
; KNL_32-NEXT: vpextrd $2, %xmm0, %ecx
; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm1, %xmm1
; KNL_32-NEXT: testb $8, %al
; KNL_32-NEXT: je .LBB62_8
; KNL_32-NEXT: .LBB62_7: # %cond.load7
; KNL_32-NEXT: vpextrd $3, %xmm0, %eax
; KNL_32-NEXT: vpinsrd $3, (%eax), %xmm1, %xmm1
; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
; KNL_32-NEXT: vzeroupper
; SKX-LABEL: splat_ptr_gather:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vpmovd2m %xmm0, %k1
; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
; SKX-NEXT: vmovdqa %xmm1, %xmm0
; SKX_32-LABEL: splat_ptr_gather:
; SKX_32-NEXT: vpslld $31, %xmm0, %xmm0
; SKX_32-NEXT: vpmovd2m %xmm0, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1}
; SKX_32-NEXT: vmovdqa %xmm1, %xmm0
%1 = insertelement <4 x i32*> undef, i32* %ptr, i32 0
%2 = shufflevector <4 x i32*> %1, <4 x i32*> undef, <4 x i32> zeroinitializer
%3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
4793 declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
; Scatter counterpart of splat_ptr_gather: all four lanes store to the same
; splatted base pointer. SKX emits a vpscatterdd with a zero index vector;
; KNL scalarizes into mask-bit tests plus conditional stores.
define void @splat_ptr_scatter(i32* %ptr, <4 x i1> %mask, <4 x i32> %val) {
; KNL_64-LABEL: splat_ptr_scatter:
; KNL_64-NEXT: vpslld $31, %xmm0, %xmm0
; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL_64-NEXT: vmovq %rdi, %xmm0
; KNL_64-NEXT: vpbroadcastq %xmm0, %ymm0
; KNL_64-NEXT: kmovw %k0, %eax
; KNL_64-NEXT: testb $1, %al
; KNL_64-NEXT: je .LBB63_2
; KNL_64-NEXT: # %bb.1: # %cond.store
; KNL_64-NEXT: vmovq %xmm0, %rcx
; KNL_64-NEXT: vmovss %xmm1, (%rcx)
; KNL_64-NEXT: .LBB63_2: # %else
; KNL_64-NEXT: testb $2, %al
; KNL_64-NEXT: je .LBB63_4
; KNL_64-NEXT: # %bb.3: # %cond.store1
; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
; KNL_64-NEXT: vextractps $1, %xmm1, (%rcx)
; KNL_64-NEXT: .LBB63_4: # %else2
; KNL_64-NEXT: testb $4, %al
; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL_64-NEXT: jne .LBB63_5
; KNL_64-NEXT: # %bb.6: # %else4
; KNL_64-NEXT: testb $8, %al
; KNL_64-NEXT: jne .LBB63_7
; KNL_64-NEXT: .LBB63_8: # %else6
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: .LBB63_5: # %cond.store3
; KNL_64-NEXT: vmovq %xmm0, %rcx
; KNL_64-NEXT: vextractps $2, %xmm1, (%rcx)
; KNL_64-NEXT: testb $8, %al
; KNL_64-NEXT: je .LBB63_8
; KNL_64-NEXT: .LBB63_7: # %cond.store5
; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
; KNL_64-NEXT: vextractps $3, %xmm1, (%rax)
; KNL_64-NEXT: vzeroupper
; KNL_32-LABEL: splat_ptr_scatter:
; KNL_32-NEXT: vpslld $31, %xmm0, %xmm0
; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
; KNL_32-NEXT: kmovw %k0, %eax
; KNL_32-NEXT: testb $1, %al
; KNL_32-NEXT: jne .LBB63_1
; KNL_32-NEXT: # %bb.2: # %else
; KNL_32-NEXT: testb $2, %al
; KNL_32-NEXT: jne .LBB63_3
; KNL_32-NEXT: .LBB63_4: # %else2
; KNL_32-NEXT: testb $4, %al
; KNL_32-NEXT: jne .LBB63_5
; KNL_32-NEXT: .LBB63_6: # %else4
; KNL_32-NEXT: testb $8, %al
; KNL_32-NEXT: jne .LBB63_7
; KNL_32-NEXT: .LBB63_8: # %else6
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: .LBB63_1: # %cond.store
; KNL_32-NEXT: vmovd %xmm0, %ecx
; KNL_32-NEXT: vmovss %xmm1, (%ecx)
; KNL_32-NEXT: testb $2, %al
; KNL_32-NEXT: je .LBB63_4
; KNL_32-NEXT: .LBB63_3: # %cond.store1
; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
; KNL_32-NEXT: vextractps $1, %xmm1, (%ecx)
; KNL_32-NEXT: testb $4, %al
; KNL_32-NEXT: je .LBB63_6
; KNL_32-NEXT: .LBB63_5: # %cond.store3
; KNL_32-NEXT: vpextrd $2, %xmm0, %ecx
; KNL_32-NEXT: vextractps $2, %xmm1, (%ecx)
; KNL_32-NEXT: testb $8, %al
; KNL_32-NEXT: je .LBB63_8
; KNL_32-NEXT: .LBB63_7: # %cond.store5
; KNL_32-NEXT: vpextrd $3, %xmm0, %eax
; KNL_32-NEXT: vextractps $3, %xmm1, (%eax)
; KNL_32-NEXT: vzeroupper
; SKX-LABEL: splat_ptr_scatter:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vpmovd2m %xmm0, %k1
; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
; SKX_32-LABEL: splat_ptr_scatter:
; SKX_32-NEXT: vpslld $31, %xmm0, %xmm0
; SKX_32-NEXT: vpmovd2m %xmm0, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_32-NEXT: vpscatterdd %xmm1, (%eax,%xmm0,4) {%k1}
%1 = insertelement <4 x i32*> undef, i32* %ptr, i32 0
%2 = shufflevector <4 x i32*> %1, <4 x i32*> undef, <4 x i32> zeroinitializer
call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %2, i32 4, <4 x i1> %mask)
4898 %struct.foo = type { i8*, i64, i16, i16, i32 }
4900 ; This used to cause fast-isel to generate bad copy instructions that would
4901 ; cause an error in copyPhysReg.
; The GEP to field 1 of %struct.foo lowers to a vector add of the field
; offset onto the pointer vector, followed by an all-ones-mask qword gather
; (vpgatherqq on 64-bit, vpgatherdq on 32-bit).
define <8 x i64> @pr45906(<8 x %struct.foo*> %ptr) {
; KNL_64-LABEL: pr45906:
; KNL_64: # %bb.0: # %bb
; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm0 {%k1}
; KNL_32-LABEL: pr45906:
; KNL_32: # %bb.0: # %bb
; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpgatherdq (,%ymm1), %zmm0 {%k1}
; SKX_SMALL-LABEL: pr45906:
; SKX_SMALL: # %bb.0: # %bb
; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
; SKX_SMALL-NEXT: vpgatherqq (,%zmm1), %zmm0 {%k1}
; SKX_SMALL-NEXT: retq
; SKX_LARGE-LABEL: pr45906:
; SKX_LARGE: # %bb.0: # %bb
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vpaddq (%rax){1to8}, %zmm0, %zmm1
; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
; SKX_LARGE-NEXT: vpgatherqq (,%zmm1), %zmm0 {%k1}
; SKX_LARGE-NEXT: retq
; SKX_32-LABEL: pr45906:
; SKX_32: # %bb.0: # %bb
; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpgatherdq (,%ymm1), %zmm0 {%k1}
%tmp = getelementptr inbounds %struct.foo, <8 x %struct.foo*> %ptr, i64 0, i32 1
%tmp1 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %tmp, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i64> undef)
4944 declare <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*>, i32, <8 x i1>, <8 x i64>)