1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_64
3 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32
4 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_SMALL
5 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq -code-model=large < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_LARGE
6 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32
7 ; RUN: opt -mtriple=x86_64-apple-darwin -passes=scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
9 ; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null
11 @glob_array = internal unnamed_addr constant [16 x i32] [i32 1, i32 1, i32 2, i32 3, i32 5, i32 8, i32 13, i32 21, i32 34, i32 55, i32 89, i32 144, i32 233, i32 377, i32 610, i32 987], align 16
14 ; SCALAR: extractelement <16 x ptr>
15 ; SCALAR-NEXT: load float
16 ; SCALAR-NEXT: insertelement <16 x float>
17 ; SCALAR-NEXT: extractelement <16 x ptr>
18 ; SCALAR-NEXT: load float
; test1: gather 16 floats through a splatted base pointer with an all-true
; mask; every target should select a single masked vgatherdps.
20 define <16 x float> @test1(ptr %base, <16 x i32> %ind) {
21 ; KNL_64-LABEL: test1:
23 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
24 ; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
25 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
26 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
29 ; KNL_32-LABEL: test1:
31 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
32 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
33 ; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
34 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
35 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
40 ; SKX-NEXT: kxnorw %k0, %k0, %k1
41 ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
42 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
43 ; SKX-NEXT: vmovaps %zmm1, %zmm0
46 ; SKX_32-LABEL: test1:
48 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
49 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
50 ; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
51 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
52 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; Splat %base into all 16 lanes, then index with the sign-extended %ind.
55 %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
56 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
58 %sext_ind = sext <16 x i32> %ind to <16 x i64>
59 %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
; All-true mask, undef passthrough.
61 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
65 declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>)
66 declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
67 declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> , i32, <8 x i1> , <8 x i32> )
71 ; SCALAR: extractelement <16 x ptr>
72 ; SCALAR-NEXT: load float
73 ; SCALAR-NEXT: insertelement <16 x float>
74 ; SCALAR-NEXT: br label %else
76 ; SCALAR-NEXT: %res.phi.else = phi
77 ; SCALAR-NEXT: and i16 %{{.*}}, 2
78 ; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0
79 ; SCALAR-NEXT: br i1 %{{.*}}, label %cond.load1, label %else2
; test2: same splat-base float gather as test1, but with a variable i16 mask
; (bitcast to <16 x i1>) that should be moved straight into a k-register.
81 define <16 x float> @test2(ptr %base, <16 x i32> %ind, i16 %mask) {
82 ; KNL_64-LABEL: test2:
84 ; KNL_64-NEXT: kmovw %esi, %k1
85 ; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
86 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
87 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
90 ; KNL_32-LABEL: test2:
92 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
93 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
94 ; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
95 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
96 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
101 ; SKX-NEXT: kmovw %esi, %k1
102 ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
103 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
104 ; SKX-NEXT: vmovaps %zmm1, %zmm0
107 ; SKX_32-LABEL: test2:
109 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
110 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
111 ; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
112 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
113 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; Splat the base pointer and index with the sign-extended indices.
116 %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
117 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
119 %sext_ind = sext <16 x i32> %ind to <16 x i64>
120 %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
; The scalar i16 mask becomes one mask bit per lane.
121 %imask = bitcast i16 %mask to <16 x i1>
122 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
123 ret <16 x float> %res
; test3: integer variant of test2 — masked gather of 16 x i32 through a
; splatted base; should select vpgatherdd with the i16 mask in %k1.
126 define <16 x i32> @test3(ptr %base, <16 x i32> %ind, i16 %mask) {
127 ; KNL_64-LABEL: test3:
129 ; KNL_64-NEXT: kmovw %esi, %k1
130 ; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
131 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
132 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0
135 ; KNL_32-LABEL: test3:
137 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
138 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
139 ; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
140 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
141 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
146 ; SKX-NEXT: kmovw %esi, %k1
147 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
148 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
149 ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
152 ; SKX_32-LABEL: test3:
154 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
155 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
156 ; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
157 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
158 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
; Splat base, sign-extend indices, gather i32 elements under %mask.
161 %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
162 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
164 %sext_ind = sext <16 x i32> %ind to <16 x i64>
165 %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
166 %imask = bitcast i16 %mask to <16 x i1>
167 %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
; test4: two masked gathers from the same addresses, where the second uses
; the first's result as passthrough; checks both gathers are emitted (the
; second with a copied mask register) and their results are added.
172 define <16 x i32> @test4(ptr %base, <16 x i32> %ind, i16 %mask) {
173 ; KNL_64-LABEL: test4:
175 ; KNL_64-NEXT: kmovw %esi, %k1
176 ; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
177 ; KNL_64-NEXT: kmovw %k1, %k2
178 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
179 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
180 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
181 ; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0
184 ; KNL_32-LABEL: test4:
186 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
187 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
188 ; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
189 ; KNL_32-NEXT: kmovw %k1, %k2
190 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
191 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
192 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
193 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
198 ; SKX-NEXT: kmovw %esi, %k1
199 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
200 ; SKX-NEXT: kmovw %k1, %k2
201 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
202 ; SKX-NEXT: vmovdqa64 %zmm1, %zmm2
203 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
204 ; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0
207 ; SKX_32-LABEL: test4:
209 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
210 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
211 ; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
212 ; SKX_32-NEXT: kmovw %k1, %k2
213 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
214 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
215 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
216 ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; Splat base pointer; note the GEP indexes directly with i32 indices here.
219 %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
220 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
222 %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
223 %imask = bitcast i16 %mask to <16 x i1>
; %gt1 feeds %gt2's passthrough, so the two gathers cannot be merged.
224 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
225 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
226 %res = add <16 x i32> %gt1, %gt2
231 ; SCALAR-LABEL: test5
232 ; SCALAR: and i16 %scalar_mask, 1
233 ; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0
234 ; SCALAR-NEXT: br i1 %{{.*}}, label %cond.store, label %else
235 ; SCALAR: cond.store:
236 ; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> %val, i64 0
237 ; SCALAR-NEXT: %Ptr0 = extractelement <16 x ptr> %gep.random, i64 0
238 ; SCALAR-NEXT: store i32 %Elt0, ptr %Ptr0, align 4
239 ; SCALAR-NEXT: br label %else
241 ; SCALAR-NEXT: and i16 %scalar_mask, 2
242 ; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0
243 ; SCALAR-NEXT: br i1 %{{.*}}, label %cond.store1, label %else2
; test5: two identical masked scatters of %val to the same addresses; both
; vpscatterdd instructions must be kept (scatters have side effects and the
; second store may overwrite the first — they are not CSE'd away).
245 define void @test5(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
246 ; KNL_64-LABEL: test5:
248 ; KNL_64-NEXT: kmovw %esi, %k1
249 ; KNL_64-NEXT: kmovw %k1, %k2
250 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
251 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
252 ; KNL_64-NEXT: vzeroupper
255 ; KNL_32-LABEL: test5:
257 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
258 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
259 ; KNL_32-NEXT: kmovw %k1, %k2
260 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
261 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
262 ; KNL_32-NEXT: vzeroupper
267 ; SKX-NEXT: kmovw %esi, %k1
268 ; SKX-NEXT: kmovw %k1, %k2
269 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
270 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
271 ; SKX-NEXT: vzeroupper
274 ; SKX_32-LABEL: test5:
276 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
277 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
278 ; SKX_32-NEXT: kmovw %k1, %k2
279 ; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
280 ; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
281 ; SKX_32-NEXT: vzeroupper
; Splat base, compute per-lane addresses, then scatter twice under %mask.
284 %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
285 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
287 %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind
288 %imask = bitcast i16 %mask to <16 x i1>
289 call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>%val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
290 call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>%val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask)
294 declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> , <8 x ptr> , i32 , <8 x i1> )
295 declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> , <16 x ptr> , i32 , <16 x i1> )
298 ; SCALAR-LABEL: test6
299 ; SCALAR: store i32 %Elt0, ptr %Ptr01, align 4
300 ; SCALAR-NEXT: %Elt1 = extractelement <8 x i32> %a1, i64 1
301 ; SCALAR-NEXT: %Ptr12 = extractelement <8 x ptr> %ptr, i64 1
302 ; SCALAR-NEXT: store i32 %Elt1, ptr %Ptr12, align 4
303 ; SCALAR-NEXT: %Elt2 = extractelement <8 x i32> %a1, i64 2
304 ; SCALAR-NEXT: %Ptr23 = extractelement <8 x ptr> %ptr, i64 2
305 ; SCALAR-NEXT: store i32 %Elt2, ptr %Ptr23, align 4
; test6: an 8-element all-true gather followed by an all-true scatter through
; the same <8 x ptr> vector of addresses (no scalar base register is used —
; note the "(,%zmm1)" addressing with no base).
307 define <8 x i32> @test6(<8 x i32>%a1, <8 x ptr> %ptr) {
308 ; KNL_64-LABEL: test6:
310 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
311 ; KNL_64-NEXT: vpxor %xmm2, %xmm2, %xmm2
312 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
313 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
314 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
315 ; KNL_64-NEXT: vmovdqa %ymm2, %ymm0
318 ; KNL_32-LABEL: test6:
320 ; KNL_32-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
321 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
322 ; KNL_32-NEXT: vpxor %xmm2, %xmm2, %xmm2
323 ; KNL_32-NEXT: movw $255, %ax
324 ; KNL_32-NEXT: kmovw %eax, %k1
325 ; KNL_32-NEXT: kmovw %k1, %k2
326 ; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm2 {%k2}
327 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
328 ; KNL_32-NEXT: vmovdqa %ymm2, %ymm0
333 ; SKX-NEXT: kxnorw %k0, %k0, %k1
334 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
335 ; SKX-NEXT: kxnorw %k0, %k0, %k2
336 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
337 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
338 ; SKX-NEXT: vmovdqa %ymm2, %ymm0
341 ; SKX_32-LABEL: test6:
343 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
344 ; SKX_32-NEXT: vpxor %xmm2, %xmm2, %xmm2
345 ; SKX_32-NEXT: kxnorw %k0, %k0, %k2
346 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2}
347 ; SKX_32-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1}
348 ; SKX_32-NEXT: vmovdqa %ymm2, %ymm0
; Gather old values from %ptr, then scatter %a1 to the same locations.
351 %a = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
353 call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
; test7: 8-element gather with an i8 mask. KNL (no AVX512VL) must widen to a
; zmm gather and zero the upper mask bits via kshiftl/kshiftr; SKX can use a
; native ymm gather with the 8-bit mask directly.
357 define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) {
359 ; KNL_64-LABEL: test7:
361 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
362 ; KNL_64-NEXT: kmovw %esi, %k0
363 ; KNL_64-NEXT: kshiftlw $8, %k0, %k0
364 ; KNL_64-NEXT: kshiftrw $8, %k0, %k1
365 ; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
366 ; KNL_64-NEXT: kmovw %k1, %k2
367 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
368 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
369 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
370 ; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0
373 ; KNL_32-LABEL: test7:
375 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
376 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
377 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
378 ; KNL_32-NEXT: kmovw %ecx, %k0
379 ; KNL_32-NEXT: kshiftlw $8, %k0, %k0
380 ; KNL_32-NEXT: kshiftrw $8, %k0, %k1
381 ; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
382 ; KNL_32-NEXT: kmovw %k1, %k2
383 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
384 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
385 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
386 ; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
391 ; SKX-NEXT: kmovw %esi, %k1
392 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
393 ; SKX-NEXT: kmovw %k1, %k2
394 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
395 ; SKX-NEXT: vmovdqa %ymm1, %ymm2
396 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
397 ; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0
400 ; SKX_32-LABEL: test7:
402 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
403 ; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
404 ; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
405 ; SKX_32-NEXT: kmovw %k1, %k2
406 ; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm1 {%k2}
407 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm2
408 ; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm2 {%k1}
409 ; SKX_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
; Same two-gather-plus-add pattern as test4, at 8-element width.
412 %broadcast.splatinsert = insertelement <8 x ptr> undef, ptr %base, i32 0
413 %broadcast.splat = shufflevector <8 x ptr> %broadcast.splatinsert, <8 x ptr> undef, <8 x i32> zeroinitializer
415 %gep.random = getelementptr i32, <8 x ptr> %broadcast.splat, <8 x i32> %ind
416 %imask = bitcast i8 %mask to <8 x i1>
417 %gt1 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
418 %gt2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
419 %res = add <8 x i32> %gt1, %gt2
423 ; No uniform base in this case; the <16 x ptr> operand supplies the addresses,
424 ; and on 64-bit targets each gather call will be split into two
; test8: gather through an arbitrary <16 x ptr> (no uniform base). On 64-bit
; targets each v16 gather splits into two vpgatherqd halves recombined with
; vinserti64x4; on 32-bit targets (32-bit pointers) one vpgatherdd suffices.
425 define <16 x i32> @test8(<16 x ptr> %ptr.random, <16 x i32> %ind, i16 %mask) {
426 ; KNL_64-LABEL: test8:
428 ; KNL_64-NEXT: kmovw %edi, %k1
429 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
430 ; KNL_64-NEXT: vpxor %xmm2, %xmm2, %xmm2
431 ; KNL_64-NEXT: kmovw %k2, %k3
432 ; KNL_64-NEXT: vpxor %xmm3, %xmm3, %xmm3
433 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm3 {%k3}
434 ; KNL_64-NEXT: kmovw %k1, %k3
435 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k3}
436 ; KNL_64-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm4
437 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm3 {%k2}
438 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1}
439 ; KNL_64-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
440 ; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0
443 ; KNL_32-LABEL: test8:
445 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
446 ; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
447 ; KNL_32-NEXT: kmovw %k1, %k2
448 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
449 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
450 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
451 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
456 ; SKX-NEXT: kmovw %edi, %k1
457 ; SKX-NEXT: kshiftrw $8, %k1, %k2
458 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
459 ; SKX-NEXT: kmovw %k2, %k3
460 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3
461 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm3 {%k3}
462 ; SKX-NEXT: kmovw %k1, %k3
463 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k3}
464 ; SKX-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm4
465 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm3 {%k2}
466 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1}
467 ; SKX-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
468 ; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
471 ; SKX_32-LABEL: test8:
473 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
474 ; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
475 ; SKX_32-NEXT: kmovw %k1, %k2
476 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
477 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
478 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
479 ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; Two gathers from the same pointer vector; %gt1 is %gt2's passthrough.
482 %imask = bitcast i16 %mask to <16 x i1>
483 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
484 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
485 %res = add <16 x i32> %gt1, %gt2
489 %struct.RT = type { i8, [10 x [20 x i32]], i8 }
490 %struct.ST = type { i32, double, %struct.RT }
492 ; Masked gather for aggregate types
493 ; Test9 and Test10 should give the same result (scalar and vector indices in GEP)
; test9: gather through a multi-index GEP into the aggregate %struct.ST
; (all-vector GEP operands, splat constants); the struct-offset arithmetic
; must be folded into vector address computation plus a constant displacement
; on the gather (72 on 64-bit, 68 on 32-bit targets).
496 define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) {
497 ; KNL_64-LABEL: test9:
498 ; KNL_64: # %bb.0: # %entry
499 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
500 ; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
501 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
502 ; KNL_64-NEXT: vpaddq %zmm4, %zmm2, %zmm2
503 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
504 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
505 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
506 ; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
507 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
508 ; KNL_64-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
509 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm1
510 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
511 ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
512 ; KNL_64-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
515 ; KNL_32-LABEL: test9:
516 ; KNL_32: # %bb.0: # %entry
517 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
518 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
519 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
520 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
521 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
522 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
523 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
524 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1
525 ; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
526 ; KNL_32-NEXT: movw $255, %ax
527 ; KNL_32-NEXT: kmovw %eax, %k1
528 ; KNL_32-NEXT: vpgatherdd 68(,%zmm1), %zmm0 {%k1}
529 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
532 ; SKX_SMALL-LABEL: test9:
533 ; SKX_SMALL: # %bb.0: # %entry
534 ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
535 ; SKX_SMALL-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
536 ; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
537 ; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
538 ; SKX_SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
539 ; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
540 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
541 ; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
542 ; SKX_SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
543 ; SKX_SMALL-NEXT: retq
545 ; SKX_LARGE-LABEL: test9:
546 ; SKX_LARGE: # %bb.0: # %entry
547 ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2
548 ; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
549 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
550 ; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1
551 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
552 ; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
553 ; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
554 ; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1
555 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
556 ; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
557 ; SKX_LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
558 ; SKX_LARGE-NEXT: retq
560 ; SKX_32-LABEL: test9:
561 ; SKX_32: # %bb.0: # %entry
562 ; SKX_32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm1, %ymm1
563 ; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
564 ; SKX_32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
565 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
566 ; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1
567 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
568 ; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
569 ; SKX_32-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1}
; All GEP indices are vectors here (splat constants for the struct fields);
; test10 expresses the same addressing with scalar struct-field indices.
572 %broadcast.splatinsert = insertelement <8 x ptr> undef, ptr %base, i32 0
573 %broadcast.splat = shufflevector <8 x ptr> %broadcast.splatinsert, <8 x ptr> undef, <8 x i32> zeroinitializer
575 %arrayidx = getelementptr %struct.ST, <8 x ptr> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
576 %res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0(<8 x ptr>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
; test10: same addressing as test9 but with scalar (non-vector) struct-field
; GEP indices; per the comment above test9 both functions must lower to
; identical code, so the expected assembly matches test9's exactly.
580 define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) {
581 ; KNL_64-LABEL: test10:
582 ; KNL_64: # %bb.0: # %entry
583 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
584 ; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
585 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
586 ; KNL_64-NEXT: vpaddq %zmm4, %zmm2, %zmm2
587 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
588 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
589 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
590 ; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
591 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
592 ; KNL_64-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
593 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm1
594 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
595 ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
596 ; KNL_64-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
599 ; KNL_32-LABEL: test10:
600 ; KNL_32: # %bb.0: # %entry
601 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
602 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
603 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
604 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
605 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
606 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
607 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
608 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1
609 ; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
610 ; KNL_32-NEXT: movw $255, %ax
611 ; KNL_32-NEXT: kmovw %eax, %k1
612 ; KNL_32-NEXT: vpgatherdd 68(,%zmm1), %zmm0 {%k1}
613 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
616 ; SKX_SMALL-LABEL: test10:
617 ; SKX_SMALL: # %bb.0: # %entry
618 ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
619 ; SKX_SMALL-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
620 ; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
621 ; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
622 ; SKX_SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
623 ; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
624 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
625 ; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
626 ; SKX_SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
627 ; SKX_SMALL-NEXT: retq
629 ; SKX_LARGE-LABEL: test10:
630 ; SKX_LARGE: # %bb.0: # %entry
631 ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2
632 ; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
633 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
634 ; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1
635 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
636 ; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
637 ; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
638 ; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1
639 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
640 ; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
641 ; SKX_LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
642 ; SKX_LARGE-NEXT: retq
644 ; SKX_32-LABEL: test10:
645 ; SKX_32: # %bb.0: # %entry
646 ; SKX_32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm1, %ymm1
647 ; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
648 ; SKX_32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
649 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
650 ; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1
651 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
652 ; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
653 ; SKX_32-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1}
; Scalar i32/i64 struct-field indices mixed with vector indices %i1/%ind5.
656 %broadcast.splatinsert = insertelement <8 x ptr> undef, ptr %base, i32 0
657 %broadcast.splat = shufflevector <8 x ptr> %broadcast.splatinsert, <8 x ptr> undef, <8 x i32> zeroinitializer
659 %arrayidx = getelementptr %struct.ST, <8 x ptr> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
660 %res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0(<8 x ptr>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
664 ; Splat index in GEP, requires broadcast
; test11: splat (scalar) index into a splat base — every lane loads the same
; address; the address is computed once in a scalar register (lea/shll+addl)
; and the gather runs with a zero index vector.
665 define <16 x float> @test11(ptr %base, i32 %ind) {
666 ; KNL_64-LABEL: test11:
668 ; KNL_64-NEXT: movslq %esi, %rax
669 ; KNL_64-NEXT: leaq (%rdi,%rax,4), %rax
670 ; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
671 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
672 ; KNL_64-NEXT: vxorps %xmm0, %xmm0, %xmm0
673 ; KNL_64-NEXT: vgatherdps (%rax,%zmm1,4), %zmm0 {%k1}
676 ; KNL_32-LABEL: test11:
678 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
679 ; KNL_32-NEXT: shll $2, %eax
680 ; KNL_32-NEXT: addl {{[0-9]+}}(%esp), %eax
681 ; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
682 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
683 ; KNL_32-NEXT: vxorps %xmm0, %xmm0, %xmm0
684 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
689 ; SKX-NEXT: movslq %esi, %rax
690 ; SKX-NEXT: leaq (%rdi,%rax,4), %rax
691 ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
692 ; SKX-NEXT: kxnorw %k0, %k0, %k1
693 ; SKX-NEXT: vxorps %xmm0, %xmm0, %xmm0
694 ; SKX-NEXT: vgatherdps (%rax,%zmm1,4), %zmm0 {%k1}
697 ; SKX_32-LABEL: test11:
699 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
700 ; SKX_32-NEXT: shll $2, %eax
701 ; SKX_32-NEXT: addl {{[0-9]+}}(%esp), %eax
702 ; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
703 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
704 ; SKX_32-NEXT: vxorps %xmm0, %xmm0, %xmm0
705 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; GEP of a vector of pointers by a single scalar index (implicit splat).
708 %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
709 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
711 %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, i32 %ind
713 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
717 ; We are checking the uniform base here. It is taken directly from input to vgatherdps
; test12: scalar base GEP'd by a vector of indices — the uniform base must be
; recognized and placed directly in the vgatherdps base operand.
718 define <16 x float> @test12(ptr %base, <16 x i32> %ind) {
719 ; KNL_64-LABEL: test12:
721 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
722 ; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
723 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
724 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
727 ; KNL_32-LABEL: test12:
729 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
730 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
731 ; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
732 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
733 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
738 ; SKX-NEXT: kxnorw %k0, %k0, %k1
739 ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
740 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
741 ; SKX-NEXT: vmovaps %zmm1, %zmm0
744 ; SKX_32-LABEL: test12:
746 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
747 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
748 ; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
749 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
750 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; Scalar %base with vector offsets yields a uniform-base vector GEP.
753 %sext_ind = sext <16 x i32> %ind to <16 x i64>
754 %gep.random = getelementptr float, ptr%base, <16 x i64> %sext_ind
756 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
760 ; The same as the previous, but the mask is undefined
; test13: same IR shape as test12 (per the comment above, "the same as the
; previous"); expected lowering is identical — a single uniform-base gather.
761 define <16 x float> @test13(ptr %base, <16 x i32> %ind) {
762 ; KNL_64-LABEL: test13:
764 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
765 ; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
766 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
767 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
770 ; KNL_32-LABEL: test13:
772 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
773 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
774 ; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
775 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
776 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
781 ; SKX-NEXT: kxnorw %k0, %k0, %k1
782 ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
783 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
784 ; SKX-NEXT: vmovaps %zmm1, %zmm0
787 ; SKX_32-LABEL: test13:
789 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
790 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
791 ; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
792 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
793 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; Uniform scalar base, vector indices, all-true constant mask.
796 %sext_ind = sext <16 x i32> %ind to <16 x i64>
797 %gep.random = getelementptr float, ptr%base, <16 x i64> %sext_ind
799 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
803 ; The base pointer is not splat, can't find uniform base
; test14: the pointer vector is built by inserting %base at lane 1 of %vec and
; then splatting lane 0, so the splatted value comes from %vec, not %base —
; there is no provable uniform base; the lowering extracts lane 0 into a GPR.
804 define <16 x float> @test14(ptr %base, i32 %ind, <16 x ptr> %vec) {
805 ; KNL_64-LABEL: test14:
807 ; KNL_64-NEXT: vmovq %xmm0, %rax
808 ; KNL_64-NEXT: vmovd %esi, %xmm0
809 ; KNL_64-NEXT: vpbroadcastd %xmm0, %ymm0
810 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
811 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
812 ; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
813 ; KNL_64-NEXT: vgatherqps (%rax,%zmm0,4), %ymm1 {%k1}
814 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
817 ; KNL_32-LABEL: test14:
819 ; KNL_32-NEXT: vmovd %xmm0, %eax
820 ; KNL_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
821 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
822 ; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
823 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
828 ; SKX-NEXT: vmovq %xmm0, %rax
829 ; SKX-NEXT: vpbroadcastd %esi, %ymm0
830 ; SKX-NEXT: vpmovsxdq %ymm0, %zmm0
831 ; SKX-NEXT: kxnorw %k0, %k0, %k1
832 ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
833 ; SKX-NEXT: vgatherqps (%rax,%zmm0,4), %ymm1 {%k1}
834 ; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
837 ; SKX_32-LABEL: test14:
839 ; SKX_32-NEXT: vmovd %xmm0, %eax
840 ; SKX_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
841 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
842 ; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
843 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; Insert at lane 1 (not 0) so the shuffle splats %vec's lane 0, not %base.
846 %broadcast.splatinsert = insertelement <16 x ptr> %vec, ptr %base, i32 1
847 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
849 %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, i32 %ind
851 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
855 declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>)
856 declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x double>)
857 declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>)
859 ; Gather smaller than existing instruction
860 define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) {
861 ; KNL_64-LABEL: test15:
863 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
864 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
865 ; KNL_64-NEXT: vpmovsxdq %xmm0, %ymm0
866 ; KNL_64-NEXT: vpsllq $2, %ymm0, %ymm0
867 ; KNL_64-NEXT: vmovq %rdi, %xmm1
868 ; KNL_64-NEXT: vpbroadcastq %xmm1, %ymm1
869 ; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm1
870 ; KNL_64-NEXT: kmovw %k0, %eax
871 ; KNL_64-NEXT: testb $1, %al
872 ; KNL_64-NEXT: # implicit-def: $xmm0
873 ; KNL_64-NEXT: je .LBB14_2
874 ; KNL_64-NEXT: # %bb.1: # %cond.load
875 ; KNL_64-NEXT: vmovq %xmm1, %rcx
876 ; KNL_64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
877 ; KNL_64-NEXT: .LBB14_2: # %else
878 ; KNL_64-NEXT: testb $2, %al
879 ; KNL_64-NEXT: je .LBB14_4
880 ; KNL_64-NEXT: # %bb.3: # %cond.load1
881 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx
882 ; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
883 ; KNL_64-NEXT: .LBB14_4: # %else2
884 ; KNL_64-NEXT: testb $4, %al
885 ; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1
886 ; KNL_64-NEXT: jne .LBB14_5
887 ; KNL_64-NEXT: # %bb.6: # %else5
888 ; KNL_64-NEXT: testb $8, %al
889 ; KNL_64-NEXT: jne .LBB14_7
890 ; KNL_64-NEXT: .LBB14_8: # %else8
891 ; KNL_64-NEXT: vzeroupper
893 ; KNL_64-NEXT: .LBB14_5: # %cond.load4
894 ; KNL_64-NEXT: vmovq %xmm1, %rcx
895 ; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
896 ; KNL_64-NEXT: testb $8, %al
897 ; KNL_64-NEXT: je .LBB14_8
898 ; KNL_64-NEXT: .LBB14_7: # %cond.load7
899 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
900 ; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
901 ; KNL_64-NEXT: vzeroupper
904 ; KNL_32-LABEL: test15:
906 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
907 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
908 ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
909 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
910 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm1
911 ; KNL_32-NEXT: kmovw %k0, %eax
912 ; KNL_32-NEXT: testb $1, %al
913 ; KNL_32-NEXT: # implicit-def: $xmm0
914 ; KNL_32-NEXT: jne .LBB14_1
915 ; KNL_32-NEXT: # %bb.2: # %else
916 ; KNL_32-NEXT: testb $2, %al
917 ; KNL_32-NEXT: jne .LBB14_3
918 ; KNL_32-NEXT: .LBB14_4: # %else2
919 ; KNL_32-NEXT: testb $4, %al
920 ; KNL_32-NEXT: jne .LBB14_5
921 ; KNL_32-NEXT: .LBB14_6: # %else5
922 ; KNL_32-NEXT: testb $8, %al
923 ; KNL_32-NEXT: jne .LBB14_7
924 ; KNL_32-NEXT: .LBB14_8: # %else8
925 ; KNL_32-NEXT: vzeroupper
927 ; KNL_32-NEXT: .LBB14_1: # %cond.load
928 ; KNL_32-NEXT: vmovd %xmm1, %ecx
929 ; KNL_32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
930 ; KNL_32-NEXT: testb $2, %al
931 ; KNL_32-NEXT: je .LBB14_4
932 ; KNL_32-NEXT: .LBB14_3: # %cond.load1
933 ; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
934 ; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
935 ; KNL_32-NEXT: testb $4, %al
936 ; KNL_32-NEXT: je .LBB14_6
937 ; KNL_32-NEXT: .LBB14_5: # %cond.load4
938 ; KNL_32-NEXT: vpextrd $2, %xmm1, %ecx
939 ; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
940 ; KNL_32-NEXT: testb $8, %al
941 ; KNL_32-NEXT: je .LBB14_8
942 ; KNL_32-NEXT: .LBB14_7: # %cond.load7
943 ; KNL_32-NEXT: vpextrd $3, %xmm1, %eax
944 ; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
945 ; KNL_32-NEXT: vzeroupper
950 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
951 ; SKX-NEXT: vpmovd2m %xmm1, %k1
952 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
953 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
954 ; SKX-NEXT: vmovaps %xmm1, %xmm0
957 ; SKX_32-LABEL: test15:
959 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
960 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
961 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
962 ; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
963 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
964 ; SKX_32-NEXT: vmovaps %xmm1, %xmm0
967 %sext_ind = sext <4 x i32> %ind to <4 x i64>
968 %gep.random = getelementptr float, ptr %base, <4 x i64> %sext_ind
969 %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
973 ; Gather smaller than existing instruction
974 define <4 x double> @test16(ptr %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
975 ; KNL_64-LABEL: test16:
977 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
978 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
979 ; KNL_64-NEXT: vpmovsxdq %xmm0, %ymm0
980 ; KNL_64-NEXT: vpsllq $3, %ymm0, %ymm0
981 ; KNL_64-NEXT: vmovq %rdi, %xmm1
982 ; KNL_64-NEXT: vpbroadcastq %xmm1, %ymm1
983 ; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
984 ; KNL_64-NEXT: kmovw %k0, %eax
985 ; KNL_64-NEXT: testb $1, %al
986 ; KNL_64-NEXT: je .LBB15_2
987 ; KNL_64-NEXT: # %bb.1: # %cond.load
988 ; KNL_64-NEXT: vmovq %xmm0, %rcx
989 ; KNL_64-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
990 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7]
991 ; KNL_64-NEXT: .LBB15_2: # %else
992 ; KNL_64-NEXT: testb $2, %al
993 ; KNL_64-NEXT: je .LBB15_4
994 ; KNL_64-NEXT: # %bb.3: # %cond.load1
995 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
996 ; KNL_64-NEXT: vmovhps {{.*#+}} xmm1 = xmm2[0,1],mem[0,1]
997 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
998 ; KNL_64-NEXT: .LBB15_4: # %else2
999 ; KNL_64-NEXT: testb $4, %al
1000 ; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
1001 ; KNL_64-NEXT: jne .LBB15_5
1002 ; KNL_64-NEXT: # %bb.6: # %else5
1003 ; KNL_64-NEXT: testb $8, %al
1004 ; KNL_64-NEXT: jne .LBB15_7
1005 ; KNL_64-NEXT: .LBB15_8: # %else8
1006 ; KNL_64-NEXT: vmovdqa %ymm2, %ymm0
1008 ; KNL_64-NEXT: .LBB15_5: # %cond.load4
1009 ; KNL_64-NEXT: vmovq %xmm0, %rcx
1010 ; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm1
1011 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
1012 ; KNL_64-NEXT: testb $8, %al
1013 ; KNL_64-NEXT: je .LBB15_8
1014 ; KNL_64-NEXT: .LBB15_7: # %cond.load7
1015 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
1016 ; KNL_64-NEXT: vpbroadcastq (%rax), %ymm0
1017 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7]
1018 ; KNL_64-NEXT: vmovdqa %ymm2, %ymm0
1021 ; KNL_32-LABEL: test16:
1023 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
1024 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
1025 ; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0
1026 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
1027 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1028 ; KNL_32-NEXT: kmovw %k0, %eax
1029 ; KNL_32-NEXT: testb $1, %al
1030 ; KNL_32-NEXT: jne .LBB15_1
1031 ; KNL_32-NEXT: # %bb.2: # %else
1032 ; KNL_32-NEXT: testb $2, %al
1033 ; KNL_32-NEXT: jne .LBB15_3
1034 ; KNL_32-NEXT: .LBB15_4: # %else2
1035 ; KNL_32-NEXT: testb $4, %al
1036 ; KNL_32-NEXT: jne .LBB15_5
1037 ; KNL_32-NEXT: .LBB15_6: # %else5
1038 ; KNL_32-NEXT: testb $8, %al
1039 ; KNL_32-NEXT: jne .LBB15_7
1040 ; KNL_32-NEXT: .LBB15_8: # %else8
1041 ; KNL_32-NEXT: vmovdqa %ymm2, %ymm0
1043 ; KNL_32-NEXT: .LBB15_1: # %cond.load
1044 ; KNL_32-NEXT: vmovd %xmm0, %ecx
1045 ; KNL_32-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
1046 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1047 ; KNL_32-NEXT: testb $2, %al
1048 ; KNL_32-NEXT: je .LBB15_4
1049 ; KNL_32-NEXT: .LBB15_3: # %cond.load1
1050 ; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
1051 ; KNL_32-NEXT: vmovhps {{.*#+}} xmm1 = xmm2[0,1],mem[0,1]
1052 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
1053 ; KNL_32-NEXT: testb $4, %al
1054 ; KNL_32-NEXT: je .LBB15_6
1055 ; KNL_32-NEXT: .LBB15_5: # %cond.load4
1056 ; KNL_32-NEXT: vpextrd $2, %xmm0, %ecx
1057 ; KNL_32-NEXT: vpbroadcastq (%ecx), %ymm1
1058 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
1059 ; KNL_32-NEXT: testb $8, %al
1060 ; KNL_32-NEXT: je .LBB15_8
1061 ; KNL_32-NEXT: .LBB15_7: # %cond.load7
1062 ; KNL_32-NEXT: vpextrd $3, %xmm0, %eax
1063 ; KNL_32-NEXT: vpbroadcastq (%eax), %ymm0
1064 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7]
1065 ; KNL_32-NEXT: vmovdqa %ymm2, %ymm0
1068 ; SKX-LABEL: test16:
1070 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
1071 ; SKX-NEXT: vpmovd2m %xmm1, %k1
1072 ; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
1073 ; SKX-NEXT: vmovapd %ymm2, %ymm0
1076 ; SKX_32-LABEL: test16:
1078 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
1079 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
1080 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1081 ; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1}
1082 ; SKX_32-NEXT: vmovapd %ymm2, %ymm0
1085 %sext_ind = sext <4 x i32> %ind to <4 x i64>
1086 %gep.random = getelementptr double, ptr %base, <4 x i64> %sext_ind
1087 %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
1088 ret <4 x double>%res
1091 define <2 x double> @test17(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
1092 ; KNL_64-LABEL: test17:
1094 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1095 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1096 ; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0
1097 ; KNL_64-NEXT: vpsllq $3, %xmm0, %xmm0
1098 ; KNL_64-NEXT: vmovq %rdi, %xmm1
1099 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
1100 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1101 ; KNL_64-NEXT: kmovw %k0, %eax
1102 ; KNL_64-NEXT: testb $1, %al
1103 ; KNL_64-NEXT: jne .LBB16_1
1104 ; KNL_64-NEXT: # %bb.2: # %else
1105 ; KNL_64-NEXT: testb $2, %al
1106 ; KNL_64-NEXT: jne .LBB16_3
1107 ; KNL_64-NEXT: .LBB16_4: # %else2
1108 ; KNL_64-NEXT: vmovaps %xmm2, %xmm0
1109 ; KNL_64-NEXT: vzeroupper
1111 ; KNL_64-NEXT: .LBB16_1: # %cond.load
1112 ; KNL_64-NEXT: vmovq %xmm0, %rcx
1113 ; KNL_64-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
1114 ; KNL_64-NEXT: testb $2, %al
1115 ; KNL_64-NEXT: je .LBB16_4
1116 ; KNL_64-NEXT: .LBB16_3: # %cond.load1
1117 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
1118 ; KNL_64-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
1119 ; KNL_64-NEXT: vmovaps %xmm2, %xmm0
1120 ; KNL_64-NEXT: vzeroupper
1123 ; KNL_32-LABEL: test17:
1125 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1126 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1127 ; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0
1128 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
1129 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1130 ; KNL_32-NEXT: kmovw %k0, %eax
1131 ; KNL_32-NEXT: testb $1, %al
1132 ; KNL_32-NEXT: jne .LBB16_1
1133 ; KNL_32-NEXT: # %bb.2: # %else
1134 ; KNL_32-NEXT: testb $2, %al
1135 ; KNL_32-NEXT: jne .LBB16_3
1136 ; KNL_32-NEXT: .LBB16_4: # %else2
1137 ; KNL_32-NEXT: vmovaps %xmm2, %xmm0
1138 ; KNL_32-NEXT: vzeroupper
1140 ; KNL_32-NEXT: .LBB16_1: # %cond.load
1141 ; KNL_32-NEXT: vmovd %xmm0, %ecx
1142 ; KNL_32-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
1143 ; KNL_32-NEXT: testb $2, %al
1144 ; KNL_32-NEXT: je .LBB16_4
1145 ; KNL_32-NEXT: .LBB16_3: # %cond.load1
1146 ; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
1147 ; KNL_32-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
1148 ; KNL_32-NEXT: vmovaps %xmm2, %xmm0
1149 ; KNL_32-NEXT: vzeroupper
1152 ; SKX-LABEL: test17:
1154 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1155 ; SKX-NEXT: vpmovq2m %xmm1, %k0
1156 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
1157 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1
1158 ; SKX-NEXT: vpsllq $3, %xmm0, %xmm0
1159 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1160 ; SKX-NEXT: kmovw %k0, %eax
1161 ; SKX-NEXT: testb $1, %al
1162 ; SKX-NEXT: jne .LBB16_1
1163 ; SKX-NEXT: # %bb.2: # %else
1164 ; SKX-NEXT: testb $2, %al
1165 ; SKX-NEXT: jne .LBB16_3
1166 ; SKX-NEXT: .LBB16_4: # %else2
1167 ; SKX-NEXT: vmovaps %xmm2, %xmm0
1169 ; SKX-NEXT: .LBB16_1: # %cond.load
1170 ; SKX-NEXT: vmovq %xmm0, %rcx
1171 ; SKX-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
1172 ; SKX-NEXT: testb $2, %al
1173 ; SKX-NEXT: je .LBB16_4
1174 ; SKX-NEXT: .LBB16_3: # %cond.load1
1175 ; SKX-NEXT: vpextrq $1, %xmm0, %rax
1176 ; SKX-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
1177 ; SKX-NEXT: vmovaps %xmm2, %xmm0
1180 ; SKX_32-LABEL: test17:
1182 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1183 ; SKX_32-NEXT: vpmovq2m %xmm1, %k0
1184 ; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0
1185 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
1186 ; SKX_32-NEXT: kmovw %k0, %eax
1187 ; SKX_32-NEXT: testb $1, %al
1188 ; SKX_32-NEXT: jne .LBB16_1
1189 ; SKX_32-NEXT: # %bb.2: # %else
1190 ; SKX_32-NEXT: testb $2, %al
1191 ; SKX_32-NEXT: jne .LBB16_3
1192 ; SKX_32-NEXT: .LBB16_4: # %else2
1193 ; SKX_32-NEXT: vmovaps %xmm2, %xmm0
1195 ; SKX_32-NEXT: .LBB16_1: # %cond.load
1196 ; SKX_32-NEXT: vmovd %xmm0, %ecx
1197 ; SKX_32-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
1198 ; SKX_32-NEXT: testb $2, %al
1199 ; SKX_32-NEXT: je .LBB16_4
1200 ; SKX_32-NEXT: .LBB16_3: # %cond.load1
1201 ; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
1202 ; SKX_32-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
1203 ; SKX_32-NEXT: vmovaps %xmm2, %xmm0
1206 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1207 %gep.random = getelementptr double, ptr %base, <2 x i64> %sext_ind
1208 %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
1209 ret <2 x double>%res
1212 declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> , <4 x ptr> , i32 , <4 x i1> )
1213 declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double> , <4 x ptr> , i32 , <4 x i1> )
1214 declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> , <2 x ptr> , i32 , <2 x i1> )
1215 declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> , <2 x ptr> , i32 , <2 x i1> )
1216 declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float> , <2 x ptr> , i32 , <2 x i1> )
1218 define void @test18(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
1219 ; KNL_64-LABEL: test18:
1221 ; KNL_64-NEXT: vpslld $31, %xmm2, %xmm2
1222 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k0
1223 ; KNL_64-NEXT: kmovw %k0, %eax
1224 ; KNL_64-NEXT: testb $1, %al
1225 ; KNL_64-NEXT: je .LBB17_2
1226 ; KNL_64-NEXT: # %bb.1: # %cond.store
1227 ; KNL_64-NEXT: vmovq %xmm1, %rcx
1228 ; KNL_64-NEXT: vmovss %xmm0, (%rcx)
1229 ; KNL_64-NEXT: .LBB17_2: # %else
1230 ; KNL_64-NEXT: testb $2, %al
1231 ; KNL_64-NEXT: je .LBB17_4
1232 ; KNL_64-NEXT: # %bb.3: # %cond.store1
1233 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx
1234 ; KNL_64-NEXT: vextractps $1, %xmm0, (%rcx)
1235 ; KNL_64-NEXT: .LBB17_4: # %else2
1236 ; KNL_64-NEXT: testb $4, %al
1237 ; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1
1238 ; KNL_64-NEXT: jne .LBB17_5
1239 ; KNL_64-NEXT: # %bb.6: # %else4
1240 ; KNL_64-NEXT: testb $8, %al
1241 ; KNL_64-NEXT: jne .LBB17_7
1242 ; KNL_64-NEXT: .LBB17_8: # %else6
1243 ; KNL_64-NEXT: vzeroupper
1245 ; KNL_64-NEXT: .LBB17_5: # %cond.store3
1246 ; KNL_64-NEXT: vmovq %xmm1, %rcx
1247 ; KNL_64-NEXT: vextractps $2, %xmm0, (%rcx)
1248 ; KNL_64-NEXT: testb $8, %al
1249 ; KNL_64-NEXT: je .LBB17_8
1250 ; KNL_64-NEXT: .LBB17_7: # %cond.store5
1251 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
1252 ; KNL_64-NEXT: vextractps $3, %xmm0, (%rax)
1253 ; KNL_64-NEXT: vzeroupper
1256 ; KNL_32-LABEL: test18:
1258 ; KNL_32-NEXT: vpslld $31, %xmm2, %xmm2
1259 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k0
1260 ; KNL_32-NEXT: kmovw %k0, %eax
1261 ; KNL_32-NEXT: testb $1, %al
1262 ; KNL_32-NEXT: jne .LBB17_1
1263 ; KNL_32-NEXT: # %bb.2: # %else
1264 ; KNL_32-NEXT: testb $2, %al
1265 ; KNL_32-NEXT: jne .LBB17_3
1266 ; KNL_32-NEXT: .LBB17_4: # %else2
1267 ; KNL_32-NEXT: testb $4, %al
1268 ; KNL_32-NEXT: jne .LBB17_5
1269 ; KNL_32-NEXT: .LBB17_6: # %else4
1270 ; KNL_32-NEXT: testb $8, %al
1271 ; KNL_32-NEXT: jne .LBB17_7
1272 ; KNL_32-NEXT: .LBB17_8: # %else6
1273 ; KNL_32-NEXT: vzeroupper
1275 ; KNL_32-NEXT: .LBB17_1: # %cond.store
1276 ; KNL_32-NEXT: vmovd %xmm1, %ecx
1277 ; KNL_32-NEXT: vmovss %xmm0, (%ecx)
1278 ; KNL_32-NEXT: testb $2, %al
1279 ; KNL_32-NEXT: je .LBB17_4
1280 ; KNL_32-NEXT: .LBB17_3: # %cond.store1
1281 ; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
1282 ; KNL_32-NEXT: vextractps $1, %xmm0, (%ecx)
1283 ; KNL_32-NEXT: testb $4, %al
1284 ; KNL_32-NEXT: je .LBB17_6
1285 ; KNL_32-NEXT: .LBB17_5: # %cond.store3
1286 ; KNL_32-NEXT: vpextrd $2, %xmm1, %ecx
1287 ; KNL_32-NEXT: vextractps $2, %xmm0, (%ecx)
1288 ; KNL_32-NEXT: testb $8, %al
1289 ; KNL_32-NEXT: je .LBB17_8
1290 ; KNL_32-NEXT: .LBB17_7: # %cond.store5
1291 ; KNL_32-NEXT: vpextrd $3, %xmm1, %eax
1292 ; KNL_32-NEXT: vextractps $3, %xmm0, (%eax)
1293 ; KNL_32-NEXT: vzeroupper
1296 ; SKX-LABEL: test18:
1298 ; SKX-NEXT: vpslld $31, %xmm2, %xmm2
1299 ; SKX-NEXT: vpmovd2m %xmm2, %k1
1300 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
1301 ; SKX-NEXT: vzeroupper
1304 ; SKX_32-LABEL: test18:
1306 ; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
1307 ; SKX_32-NEXT: vpmovd2m %xmm2, %k1
1308 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
1310 call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask)
1314 define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
1315 ; KNL_64-LABEL: test19:
1317 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
1318 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
1319 ; KNL_64-NEXT: vpsllq $3, %ymm2, %ymm1
1320 ; KNL_64-NEXT: vmovq %rdi, %xmm2
1321 ; KNL_64-NEXT: vpbroadcastq %xmm2, %ymm2
1322 ; KNL_64-NEXT: vpaddq %ymm1, %ymm2, %ymm1
1323 ; KNL_64-NEXT: kmovw %k0, %eax
1324 ; KNL_64-NEXT: testb $1, %al
1325 ; KNL_64-NEXT: je .LBB18_2
1326 ; KNL_64-NEXT: # %bb.1: # %cond.store
1327 ; KNL_64-NEXT: vmovq %xmm1, %rcx
1328 ; KNL_64-NEXT: vmovlps %xmm0, (%rcx)
1329 ; KNL_64-NEXT: .LBB18_2: # %else
1330 ; KNL_64-NEXT: testb $2, %al
1331 ; KNL_64-NEXT: je .LBB18_4
1332 ; KNL_64-NEXT: # %bb.3: # %cond.store1
1333 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx
1334 ; KNL_64-NEXT: vmovhps %xmm0, (%rcx)
1335 ; KNL_64-NEXT: .LBB18_4: # %else2
1336 ; KNL_64-NEXT: testb $4, %al
1337 ; KNL_64-NEXT: vextractf128 $1, %ymm0, %xmm0
1338 ; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1
1339 ; KNL_64-NEXT: jne .LBB18_5
1340 ; KNL_64-NEXT: # %bb.6: # %else4
1341 ; KNL_64-NEXT: testb $8, %al
1342 ; KNL_64-NEXT: jne .LBB18_7
1343 ; KNL_64-NEXT: .LBB18_8: # %else6
1344 ; KNL_64-NEXT: vzeroupper
1346 ; KNL_64-NEXT: .LBB18_5: # %cond.store3
1347 ; KNL_64-NEXT: vmovq %xmm1, %rcx
1348 ; KNL_64-NEXT: vmovlps %xmm0, (%rcx)
1349 ; KNL_64-NEXT: testb $8, %al
1350 ; KNL_64-NEXT: je .LBB18_8
1351 ; KNL_64-NEXT: .LBB18_7: # %cond.store5
1352 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
1353 ; KNL_64-NEXT: vmovhps %xmm0, (%rax)
1354 ; KNL_64-NEXT: vzeroupper
1357 ; KNL_32-LABEL: test19:
1359 ; KNL_32-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
1360 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
1361 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
1362 ; KNL_32-NEXT: vpmovqd %zmm2, %ymm1
1363 ; KNL_32-NEXT: vpslld $3, %xmm1, %xmm1
1364 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2
1365 ; KNL_32-NEXT: vpaddd %xmm1, %xmm2, %xmm1
1366 ; KNL_32-NEXT: kmovw %k0, %eax
1367 ; KNL_32-NEXT: testb $1, %al
1368 ; KNL_32-NEXT: je .LBB18_2
1369 ; KNL_32-NEXT: # %bb.1: # %cond.store
1370 ; KNL_32-NEXT: vmovd %xmm1, %ecx
1371 ; KNL_32-NEXT: vmovlps %xmm0, (%ecx)
1372 ; KNL_32-NEXT: .LBB18_2: # %else
1373 ; KNL_32-NEXT: testb $2, %al
1374 ; KNL_32-NEXT: je .LBB18_4
1375 ; KNL_32-NEXT: # %bb.3: # %cond.store1
1376 ; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
1377 ; KNL_32-NEXT: vmovhps %xmm0, (%ecx)
1378 ; KNL_32-NEXT: .LBB18_4: # %else2
1379 ; KNL_32-NEXT: testb $4, %al
1380 ; KNL_32-NEXT: vextractf128 $1, %ymm0, %xmm0
1381 ; KNL_32-NEXT: jne .LBB18_5
1382 ; KNL_32-NEXT: # %bb.6: # %else4
1383 ; KNL_32-NEXT: testb $8, %al
1384 ; KNL_32-NEXT: jne .LBB18_7
1385 ; KNL_32-NEXT: .LBB18_8: # %else6
1386 ; KNL_32-NEXT: vzeroupper
1388 ; KNL_32-NEXT: .LBB18_5: # %cond.store3
1389 ; KNL_32-NEXT: vpextrd $2, %xmm1, %ecx
1390 ; KNL_32-NEXT: vmovlps %xmm0, (%ecx)
1391 ; KNL_32-NEXT: testb $8, %al
1392 ; KNL_32-NEXT: je .LBB18_8
1393 ; KNL_32-NEXT: .LBB18_7: # %cond.store5
1394 ; KNL_32-NEXT: vpextrd $3, %xmm1, %eax
1395 ; KNL_32-NEXT: vmovhps %xmm0, (%eax)
1396 ; KNL_32-NEXT: vzeroupper
1399 ; SKX-LABEL: test19:
1401 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
1402 ; SKX-NEXT: vpmovd2m %xmm1, %k1
1403 ; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
1404 ; SKX-NEXT: vzeroupper
1407 ; SKX_32-LABEL: test19:
1409 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
1410 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
1411 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1412 ; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
1413 ; SKX_32-NEXT: vzeroupper
1415 %gep = getelementptr double, ptr %ptr, <4 x i64> %ind
1416 call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %a1, <4 x ptr> %gep, i32 8, <4 x i1> %mask)
1420 ; Data type requires widening
1421 define void @test20(<2 x float>%a1, <2 x ptr> %ptr, <2 x i1> %mask) {
1422 ; KNL_64-LABEL: test20:
1424 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
1425 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
1426 ; KNL_64-NEXT: kmovw %k0, %eax
1427 ; KNL_64-NEXT: testb $1, %al
1428 ; KNL_64-NEXT: jne .LBB19_1
1429 ; KNL_64-NEXT: # %bb.2: # %else
1430 ; KNL_64-NEXT: testb $2, %al
1431 ; KNL_64-NEXT: jne .LBB19_3
1432 ; KNL_64-NEXT: .LBB19_4: # %else2
1433 ; KNL_64-NEXT: vzeroupper
1435 ; KNL_64-NEXT: .LBB19_1: # %cond.store
1436 ; KNL_64-NEXT: vmovq %xmm1, %rcx
1437 ; KNL_64-NEXT: vmovd %xmm0, (%rcx)
1438 ; KNL_64-NEXT: testb $2, %al
1439 ; KNL_64-NEXT: je .LBB19_4
1440 ; KNL_64-NEXT: .LBB19_3: # %cond.store1
1441 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
1442 ; KNL_64-NEXT: vextractps $1, %xmm0, (%rax)
1443 ; KNL_64-NEXT: vzeroupper
1446 ; KNL_32-LABEL: test20:
1448 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
1449 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
1450 ; KNL_32-NEXT: kmovw %k0, %eax
1451 ; KNL_32-NEXT: testb $1, %al
1452 ; KNL_32-NEXT: jne .LBB19_1
1453 ; KNL_32-NEXT: # %bb.2: # %else
1454 ; KNL_32-NEXT: testb $2, %al
1455 ; KNL_32-NEXT: jne .LBB19_3
1456 ; KNL_32-NEXT: .LBB19_4: # %else2
1457 ; KNL_32-NEXT: vzeroupper
1459 ; KNL_32-NEXT: .LBB19_1: # %cond.store
1460 ; KNL_32-NEXT: vmovd %xmm1, %ecx
1461 ; KNL_32-NEXT: vmovd %xmm0, (%ecx)
1462 ; KNL_32-NEXT: testb $2, %al
1463 ; KNL_32-NEXT: je .LBB19_4
1464 ; KNL_32-NEXT: .LBB19_3: # %cond.store1
1465 ; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
1466 ; KNL_32-NEXT: vextractps $1, %xmm0, (%eax)
1467 ; KNL_32-NEXT: vzeroupper
1470 ; SKX-LABEL: test20:
1472 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
1473 ; SKX-NEXT: vpmovq2m %xmm2, %k0
1474 ; SKX-NEXT: kmovw %k0, %eax
1475 ; SKX-NEXT: testb $1, %al
1476 ; SKX-NEXT: jne .LBB19_1
1477 ; SKX-NEXT: # %bb.2: # %else
1478 ; SKX-NEXT: testb $2, %al
1479 ; SKX-NEXT: jne .LBB19_3
1480 ; SKX-NEXT: .LBB19_4: # %else2
1482 ; SKX-NEXT: .LBB19_1: # %cond.store
1483 ; SKX-NEXT: vmovq %xmm1, %rcx
1484 ; SKX-NEXT: vmovd %xmm0, (%rcx)
1485 ; SKX-NEXT: testb $2, %al
1486 ; SKX-NEXT: je .LBB19_4
1487 ; SKX-NEXT: .LBB19_3: # %cond.store1
1488 ; SKX-NEXT: vpextrq $1, %xmm1, %rax
1489 ; SKX-NEXT: vextractps $1, %xmm0, (%rax)
1492 ; SKX_32-LABEL: test20:
1494 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
1495 ; SKX_32-NEXT: vpmovq2m %xmm2, %k0
1496 ; SKX_32-NEXT: kmovw %k0, %eax
1497 ; SKX_32-NEXT: testb $1, %al
1498 ; SKX_32-NEXT: jne .LBB19_1
1499 ; SKX_32-NEXT: # %bb.2: # %else
1500 ; SKX_32-NEXT: testb $2, %al
1501 ; SKX_32-NEXT: jne .LBB19_3
1502 ; SKX_32-NEXT: .LBB19_4: # %else2
1504 ; SKX_32-NEXT: .LBB19_1: # %cond.store
1505 ; SKX_32-NEXT: vmovd %xmm1, %ecx
1506 ; SKX_32-NEXT: vmovd %xmm0, (%ecx)
1507 ; SKX_32-NEXT: testb $2, %al
1508 ; SKX_32-NEXT: je .LBB19_4
1509 ; SKX_32-NEXT: .LBB19_3: # %cond.store1
1510 ; SKX_32-NEXT: vpextrd $1, %xmm1, %eax
1511 ; SKX_32-NEXT: vextractps $1, %xmm0, (%eax)
1513 call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> %a1, <2 x ptr> %ptr, i32 4, <2 x i1> %mask)
1517 ; Data type requires promotion
1518 define void @test21(<2 x i32>%a1, <2 x ptr> %ptr, <2 x i1>%mask) {
1519 ; KNL_64-LABEL: test21:
1521 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
1522 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
1523 ; KNL_64-NEXT: kmovw %k0, %eax
1524 ; KNL_64-NEXT: testb $1, %al
1525 ; KNL_64-NEXT: jne .LBB20_1
1526 ; KNL_64-NEXT: # %bb.2: # %else
1527 ; KNL_64-NEXT: testb $2, %al
1528 ; KNL_64-NEXT: jne .LBB20_3
1529 ; KNL_64-NEXT: .LBB20_4: # %else2
1530 ; KNL_64-NEXT: vzeroupper
1532 ; KNL_64-NEXT: .LBB20_1: # %cond.store
1533 ; KNL_64-NEXT: vmovq %xmm1, %rcx
1534 ; KNL_64-NEXT: vmovss %xmm0, (%rcx)
1535 ; KNL_64-NEXT: testb $2, %al
1536 ; KNL_64-NEXT: je .LBB20_4
1537 ; KNL_64-NEXT: .LBB20_3: # %cond.store1
1538 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
1539 ; KNL_64-NEXT: vextractps $1, %xmm0, (%rax)
1540 ; KNL_64-NEXT: vzeroupper
1543 ; KNL_32-LABEL: test21:
1545 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
1546 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
1547 ; KNL_32-NEXT: kmovw %k0, %eax
1548 ; KNL_32-NEXT: testb $1, %al
1549 ; KNL_32-NEXT: jne .LBB20_1
1550 ; KNL_32-NEXT: # %bb.2: # %else
1551 ; KNL_32-NEXT: testb $2, %al
1552 ; KNL_32-NEXT: jne .LBB20_3
1553 ; KNL_32-NEXT: .LBB20_4: # %else2
1554 ; KNL_32-NEXT: vzeroupper
1556 ; KNL_32-NEXT: .LBB20_1: # %cond.store
1557 ; KNL_32-NEXT: vmovd %xmm1, %ecx
1558 ; KNL_32-NEXT: vmovss %xmm0, (%ecx)
1559 ; KNL_32-NEXT: testb $2, %al
1560 ; KNL_32-NEXT: je .LBB20_4
1561 ; KNL_32-NEXT: .LBB20_3: # %cond.store1
1562 ; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
1563 ; KNL_32-NEXT: vextractps $1, %xmm0, (%eax)
1564 ; KNL_32-NEXT: vzeroupper
1567 ; SKX-LABEL: test21:
1569 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
1570 ; SKX-NEXT: vpmovq2m %xmm2, %k0
1571 ; SKX-NEXT: kmovw %k0, %eax
1572 ; SKX-NEXT: testb $1, %al
1573 ; SKX-NEXT: jne .LBB20_1
1574 ; SKX-NEXT: # %bb.2: # %else
1575 ; SKX-NEXT: testb $2, %al
1576 ; SKX-NEXT: jne .LBB20_3
1577 ; SKX-NEXT: .LBB20_4: # %else2
1579 ; SKX-NEXT: .LBB20_1: # %cond.store
1580 ; SKX-NEXT: vmovq %xmm1, %rcx
1581 ; SKX-NEXT: vmovss %xmm0, (%rcx)
1582 ; SKX-NEXT: testb $2, %al
1583 ; SKX-NEXT: je .LBB20_4
1584 ; SKX-NEXT: .LBB20_3: # %cond.store1
1585 ; SKX-NEXT: vpextrq $1, %xmm1, %rax
1586 ; SKX-NEXT: vextractps $1, %xmm0, (%rax)
1589 ; SKX_32-LABEL: test21:
1591 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
1592 ; SKX_32-NEXT: vpmovq2m %xmm2, %k0
1593 ; SKX_32-NEXT: kmovw %k0, %eax
1594 ; SKX_32-NEXT: testb $1, %al
1595 ; SKX_32-NEXT: jne .LBB20_1
1596 ; SKX_32-NEXT: # %bb.2: # %else
1597 ; SKX_32-NEXT: testb $2, %al
1598 ; SKX_32-NEXT: jne .LBB20_3
1599 ; SKX_32-NEXT: .LBB20_4: # %else2
1601 ; SKX_32-NEXT: .LBB20_1: # %cond.store
1602 ; SKX_32-NEXT: vmovd %xmm1, %ecx
1603 ; SKX_32-NEXT: vmovss %xmm0, (%ecx)
1604 ; SKX_32-NEXT: testb $2, %al
1605 ; SKX_32-NEXT: je .LBB20_4
1606 ; SKX_32-NEXT: .LBB20_3: # %cond.store1
1607 ; SKX_32-NEXT: vpextrd $1, %xmm1, %eax
1608 ; SKX_32-NEXT: vextractps $1, %xmm0, (%eax)
1610 call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> %a1, <2 x ptr> %ptr, i32 4, <2 x i1> %mask)
1614 ; The result type requires widening
1615 declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>)
1617 define <2 x float> @test22(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
1618 ; KNL_64-LABEL: test22:
1620 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1621 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1622 ; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0
1623 ; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0
1624 ; KNL_64-NEXT: vmovq %rdi, %xmm1
1625 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
1626 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1627 ; KNL_64-NEXT: kmovw %k0, %eax
1628 ; KNL_64-NEXT: testb $1, %al
1629 ; KNL_64-NEXT: jne .LBB21_1
1630 ; KNL_64-NEXT: # %bb.2: # %else
1631 ; KNL_64-NEXT: testb $2, %al
1632 ; KNL_64-NEXT: jne .LBB21_3
1633 ; KNL_64-NEXT: .LBB21_4: # %else2
1634 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1635 ; KNL_64-NEXT: vzeroupper
1637 ; KNL_64-NEXT: .LBB21_1: # %cond.load
1638 ; KNL_64-NEXT: vmovq %xmm0, %rcx
1639 ; KNL_64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1640 ; KNL_64-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1641 ; KNL_64-NEXT: testb $2, %al
1642 ; KNL_64-NEXT: je .LBB21_4
1643 ; KNL_64-NEXT: .LBB21_3: # %cond.load1
1644 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
1645 ; KNL_64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
1646 ; KNL_64-NEXT: vmovaps %xmm2, %xmm0
1647 ; KNL_64-NEXT: vzeroupper
1650 ; KNL_32-LABEL: test22:
1652 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1653 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1654 ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
1655 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
1656 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1657 ; KNL_32-NEXT: kmovw %k0, %eax
1658 ; KNL_32-NEXT: testb $1, %al
1659 ; KNL_32-NEXT: jne .LBB21_1
1660 ; KNL_32-NEXT: # %bb.2: # %else
1661 ; KNL_32-NEXT: testb $2, %al
1662 ; KNL_32-NEXT: jne .LBB21_3
1663 ; KNL_32-NEXT: .LBB21_4: # %else2
1664 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1665 ; KNL_32-NEXT: vzeroupper
1667 ; KNL_32-NEXT: .LBB21_1: # %cond.load
1668 ; KNL_32-NEXT: vmovd %xmm0, %ecx
1669 ; KNL_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1670 ; KNL_32-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1671 ; KNL_32-NEXT: testb $2, %al
1672 ; KNL_32-NEXT: je .LBB21_4
1673 ; KNL_32-NEXT: .LBB21_3: # %cond.load1
1674 ; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
1675 ; KNL_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
1676 ; KNL_32-NEXT: vmovaps %xmm2, %xmm0
1677 ; KNL_32-NEXT: vzeroupper
1680 ; SKX-LABEL: test22:
1682 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1683 ; SKX-NEXT: vpmovq2m %xmm1, %k0
1684 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
1685 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1
1686 ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
1687 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1688 ; SKX-NEXT: kmovw %k0, %eax
1689 ; SKX-NEXT: testb $1, %al
1690 ; SKX-NEXT: jne .LBB21_1
1691 ; SKX-NEXT: # %bb.2: # %else
1692 ; SKX-NEXT: testb $2, %al
1693 ; SKX-NEXT: jne .LBB21_3
1694 ; SKX-NEXT: .LBB21_4: # %else2
1695 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1697 ; SKX-NEXT: .LBB21_1: # %cond.load
1698 ; SKX-NEXT: vmovq %xmm0, %rcx
1699 ; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1700 ; SKX-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1701 ; SKX-NEXT: testb $2, %al
1702 ; SKX-NEXT: je .LBB21_4
1703 ; SKX-NEXT: .LBB21_3: # %cond.load1
1704 ; SKX-NEXT: vpextrq $1, %xmm0, %rax
1705 ; SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
1706 ; SKX-NEXT: vmovaps %xmm2, %xmm0
1709 ; SKX_32-LABEL: test22:
1711 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1712 ; SKX_32-NEXT: vpmovq2m %xmm1, %k0
1713 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
1714 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
1715 ; SKX_32-NEXT: kmovw %k0, %eax
1716 ; SKX_32-NEXT: testb $1, %al
1717 ; SKX_32-NEXT: jne .LBB21_1
1718 ; SKX_32-NEXT: # %bb.2: # %else
1719 ; SKX_32-NEXT: testb $2, %al
1720 ; SKX_32-NEXT: jne .LBB21_3
1721 ; SKX_32-NEXT: .LBB21_4: # %else2
1722 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
1724 ; SKX_32-NEXT: .LBB21_1: # %cond.load
1725 ; SKX_32-NEXT: vmovd %xmm0, %ecx
1726 ; SKX_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1727 ; SKX_32-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1728 ; SKX_32-NEXT: testb $2, %al
1729 ; SKX_32-NEXT: je .LBB21_4
1730 ; SKX_32-NEXT: .LBB21_3: # %cond.load1
1731 ; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
1732 ; SKX_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
1733 ; SKX_32-NEXT: vmovaps %xmm2, %xmm0
1735 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1736 %gep.random = getelementptr float, ptr %base, <2 x i64> %sext_ind
1737 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
1741 define <2 x float> @test22a(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x float> %src0) {
1742 ; KNL_64-LABEL: test22a:
1744 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1745 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1746 ; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0
1747 ; KNL_64-NEXT: vmovq %rdi, %xmm1
1748 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
1749 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1750 ; KNL_64-NEXT: kmovw %k0, %eax
1751 ; KNL_64-NEXT: testb $1, %al
1752 ; KNL_64-NEXT: jne .LBB22_1
1753 ; KNL_64-NEXT: # %bb.2: # %else
1754 ; KNL_64-NEXT: testb $2, %al
1755 ; KNL_64-NEXT: jne .LBB22_3
1756 ; KNL_64-NEXT: .LBB22_4: # %else2
1757 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1758 ; KNL_64-NEXT: vzeroupper
1760 ; KNL_64-NEXT: .LBB22_1: # %cond.load
1761 ; KNL_64-NEXT: vmovq %xmm0, %rcx
1762 ; KNL_64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1763 ; KNL_64-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1764 ; KNL_64-NEXT: testb $2, %al
1765 ; KNL_64-NEXT: je .LBB22_4
1766 ; KNL_64-NEXT: .LBB22_3: # %cond.load1
1767 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
1768 ; KNL_64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
1769 ; KNL_64-NEXT: vmovaps %xmm2, %xmm0
1770 ; KNL_64-NEXT: vzeroupper
1773 ; KNL_32-LABEL: test22a:
1775 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1776 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1777 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1778 ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
1779 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
1780 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1781 ; KNL_32-NEXT: kmovw %k0, %eax
1782 ; KNL_32-NEXT: testb $1, %al
1783 ; KNL_32-NEXT: jne .LBB22_1
1784 ; KNL_32-NEXT: # %bb.2: # %else
1785 ; KNL_32-NEXT: testb $2, %al
1786 ; KNL_32-NEXT: jne .LBB22_3
1787 ; KNL_32-NEXT: .LBB22_4: # %else2
1788 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1789 ; KNL_32-NEXT: vzeroupper
1791 ; KNL_32-NEXT: .LBB22_1: # %cond.load
1792 ; KNL_32-NEXT: vmovd %xmm0, %ecx
1793 ; KNL_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1794 ; KNL_32-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1795 ; KNL_32-NEXT: testb $2, %al
1796 ; KNL_32-NEXT: je .LBB22_4
1797 ; KNL_32-NEXT: .LBB22_3: # %cond.load1
1798 ; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
1799 ; KNL_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
1800 ; KNL_32-NEXT: vmovaps %xmm2, %xmm0
1801 ; KNL_32-NEXT: vzeroupper
1804 ; SKX-LABEL: test22a:
1806 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1807 ; SKX-NEXT: vpmovq2m %xmm1, %k0
1808 ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
1809 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1
1810 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1811 ; SKX-NEXT: kmovw %k0, %eax
1812 ; SKX-NEXT: testb $1, %al
1813 ; SKX-NEXT: jne .LBB22_1
1814 ; SKX-NEXT: # %bb.2: # %else
1815 ; SKX-NEXT: testb $2, %al
1816 ; SKX-NEXT: jne .LBB22_3
1817 ; SKX-NEXT: .LBB22_4: # %else2
1818 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1820 ; SKX-NEXT: .LBB22_1: # %cond.load
1821 ; SKX-NEXT: vmovq %xmm0, %rcx
1822 ; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1823 ; SKX-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1824 ; SKX-NEXT: testb $2, %al
1825 ; SKX-NEXT: je .LBB22_4
1826 ; SKX-NEXT: .LBB22_3: # %cond.load1
1827 ; SKX-NEXT: vpextrq $1, %xmm0, %rax
1828 ; SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
1829 ; SKX-NEXT: vmovaps %xmm2, %xmm0
1832 ; SKX_32-LABEL: test22a:
1834 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1835 ; SKX_32-NEXT: vpmovq2m %xmm1, %k0
1836 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1837 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
1838 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
1839 ; SKX_32-NEXT: kmovw %k0, %eax
1840 ; SKX_32-NEXT: testb $1, %al
1841 ; SKX_32-NEXT: jne .LBB22_1
1842 ; SKX_32-NEXT: # %bb.2: # %else
1843 ; SKX_32-NEXT: testb $2, %al
1844 ; SKX_32-NEXT: jne .LBB22_3
1845 ; SKX_32-NEXT: .LBB22_4: # %else2
1846 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
1848 ; SKX_32-NEXT: .LBB22_1: # %cond.load
1849 ; SKX_32-NEXT: vmovd %xmm0, %ecx
1850 ; SKX_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1851 ; SKX_32-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1852 ; SKX_32-NEXT: testb $2, %al
1853 ; SKX_32-NEXT: je .LBB22_4
1854 ; SKX_32-NEXT: .LBB22_3: # %cond.load1
1855 ; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
1856 ; SKX_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
1857 ; SKX_32-NEXT: vmovaps %xmm2, %xmm0
1859 %gep.random = getelementptr float, ptr %base, <2 x i64> %ind
1860 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
1864 declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>)
1865 declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>)
1867 define <2 x i32> @test23(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
1868 ; KNL_64-LABEL: test23:
1870 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1871 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1872 ; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0
1873 ; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0
1874 ; KNL_64-NEXT: vmovq %rdi, %xmm1
1875 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
1876 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1877 ; KNL_64-NEXT: kmovw %k0, %eax
1878 ; KNL_64-NEXT: testb $1, %al
1879 ; KNL_64-NEXT: jne .LBB23_1
1880 ; KNL_64-NEXT: # %bb.2: # %else
1881 ; KNL_64-NEXT: testb $2, %al
1882 ; KNL_64-NEXT: jne .LBB23_3
1883 ; KNL_64-NEXT: .LBB23_4: # %else2
1884 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1885 ; KNL_64-NEXT: vzeroupper
1887 ; KNL_64-NEXT: .LBB23_1: # %cond.load
1888 ; KNL_64-NEXT: vmovq %xmm0, %rcx
1889 ; KNL_64-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2
1890 ; KNL_64-NEXT: testb $2, %al
1891 ; KNL_64-NEXT: je .LBB23_4
1892 ; KNL_64-NEXT: .LBB23_3: # %cond.load1
1893 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
1894 ; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
1895 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1896 ; KNL_64-NEXT: vzeroupper
1899 ; KNL_32-LABEL: test23:
1901 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1902 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1903 ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
1904 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
1905 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1906 ; KNL_32-NEXT: kmovw %k0, %eax
1907 ; KNL_32-NEXT: testb $1, %al
1908 ; KNL_32-NEXT: jne .LBB23_1
1909 ; KNL_32-NEXT: # %bb.2: # %else
1910 ; KNL_32-NEXT: testb $2, %al
1911 ; KNL_32-NEXT: jne .LBB23_3
1912 ; KNL_32-NEXT: .LBB23_4: # %else2
1913 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1914 ; KNL_32-NEXT: vzeroupper
1916 ; KNL_32-NEXT: .LBB23_1: # %cond.load
1917 ; KNL_32-NEXT: vmovd %xmm0, %ecx
1918 ; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2
1919 ; KNL_32-NEXT: testb $2, %al
1920 ; KNL_32-NEXT: je .LBB23_4
1921 ; KNL_32-NEXT: .LBB23_3: # %cond.load1
1922 ; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
1923 ; KNL_32-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm2
1924 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1925 ; KNL_32-NEXT: vzeroupper
1928 ; SKX-LABEL: test23:
1930 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1931 ; SKX-NEXT: vpmovq2m %xmm1, %k0
1932 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
1933 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1
1934 ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
1935 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1936 ; SKX-NEXT: kmovw %k0, %eax
1937 ; SKX-NEXT: testb $1, %al
1938 ; SKX-NEXT: jne .LBB23_1
1939 ; SKX-NEXT: # %bb.2: # %else
1940 ; SKX-NEXT: testb $2, %al
1941 ; SKX-NEXT: jne .LBB23_3
1942 ; SKX-NEXT: .LBB23_4: # %else2
1943 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1945 ; SKX-NEXT: .LBB23_1: # %cond.load
1946 ; SKX-NEXT: vmovq %xmm0, %rcx
1947 ; SKX-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2
1948 ; SKX-NEXT: testb $2, %al
1949 ; SKX-NEXT: je .LBB23_4
1950 ; SKX-NEXT: .LBB23_3: # %cond.load1
1951 ; SKX-NEXT: vpextrq $1, %xmm0, %rax
1952 ; SKX-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
1953 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1956 ; SKX_32-LABEL: test23:
1958 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1959 ; SKX_32-NEXT: vpmovq2m %xmm1, %k0
1960 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
1961 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
1962 ; SKX_32-NEXT: kmovw %k0, %eax
1963 ; SKX_32-NEXT: testb $1, %al
1964 ; SKX_32-NEXT: jne .LBB23_1
1965 ; SKX_32-NEXT: # %bb.2: # %else
1966 ; SKX_32-NEXT: testb $2, %al
1967 ; SKX_32-NEXT: jne .LBB23_3
1968 ; SKX_32-NEXT: .LBB23_4: # %else2
1969 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
1971 ; SKX_32-NEXT: .LBB23_1: # %cond.load
1972 ; SKX_32-NEXT: vmovd %xmm0, %ecx
1973 ; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2
1974 ; SKX_32-NEXT: testb $2, %al
1975 ; SKX_32-NEXT: je .LBB23_4
1976 ; SKX_32-NEXT: .LBB23_3: # %cond.load1
1977 ; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
1978 ; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm2
1979 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
1981 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1982 %gep.random = getelementptr i32, ptr %base, <2 x i64> %sext_ind
1983 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
1987 define <2 x i32> @test23b(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %src0) {
1988 ; KNL_64-LABEL: test23b:
1990 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1991 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1992 ; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0
1993 ; KNL_64-NEXT: vmovq %rdi, %xmm1
1994 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
1995 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
1996 ; KNL_64-NEXT: kmovw %k0, %eax
1997 ; KNL_64-NEXT: testb $1, %al
1998 ; KNL_64-NEXT: jne .LBB24_1
1999 ; KNL_64-NEXT: # %bb.2: # %else
2000 ; KNL_64-NEXT: testb $2, %al
2001 ; KNL_64-NEXT: jne .LBB24_3
2002 ; KNL_64-NEXT: .LBB24_4: # %else2
2003 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
2004 ; KNL_64-NEXT: vzeroupper
2006 ; KNL_64-NEXT: .LBB24_1: # %cond.load
2007 ; KNL_64-NEXT: vmovq %xmm0, %rcx
2008 ; KNL_64-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2
2009 ; KNL_64-NEXT: testb $2, %al
2010 ; KNL_64-NEXT: je .LBB24_4
2011 ; KNL_64-NEXT: .LBB24_3: # %cond.load1
2012 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
2013 ; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
2014 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
2015 ; KNL_64-NEXT: vzeroupper
2018 ; KNL_32-LABEL: test23b:
2020 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
2021 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
2022 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2023 ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
2024 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
2025 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
2026 ; KNL_32-NEXT: kmovw %k0, %eax
2027 ; KNL_32-NEXT: testb $1, %al
2028 ; KNL_32-NEXT: jne .LBB24_1
2029 ; KNL_32-NEXT: # %bb.2: # %else
2030 ; KNL_32-NEXT: testb $2, %al
2031 ; KNL_32-NEXT: jne .LBB24_3
2032 ; KNL_32-NEXT: .LBB24_4: # %else2
2033 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
2034 ; KNL_32-NEXT: vzeroupper
2036 ; KNL_32-NEXT: .LBB24_1: # %cond.load
2037 ; KNL_32-NEXT: vmovd %xmm0, %ecx
2038 ; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2
2039 ; KNL_32-NEXT: testb $2, %al
2040 ; KNL_32-NEXT: je .LBB24_4
2041 ; KNL_32-NEXT: .LBB24_3: # %cond.load1
2042 ; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
2043 ; KNL_32-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm2
2044 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
2045 ; KNL_32-NEXT: vzeroupper
2048 ; SKX-LABEL: test23b:
2050 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
2051 ; SKX-NEXT: vpmovq2m %xmm1, %k0
2052 ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
2053 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1
2054 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2055 ; SKX-NEXT: kmovw %k0, %eax
2056 ; SKX-NEXT: testb $1, %al
2057 ; SKX-NEXT: jne .LBB24_1
2058 ; SKX-NEXT: # %bb.2: # %else
2059 ; SKX-NEXT: testb $2, %al
2060 ; SKX-NEXT: jne .LBB24_3
2061 ; SKX-NEXT: .LBB24_4: # %else2
2062 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
2064 ; SKX-NEXT: .LBB24_1: # %cond.load
2065 ; SKX-NEXT: vmovq %xmm0, %rcx
2066 ; SKX-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2
2067 ; SKX-NEXT: testb $2, %al
2068 ; SKX-NEXT: je .LBB24_4
2069 ; SKX-NEXT: .LBB24_3: # %cond.load1
2070 ; SKX-NEXT: vpextrq $1, %xmm0, %rax
2071 ; SKX-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
2072 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
2075 ; SKX_32-LABEL: test23b:
2077 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
2078 ; SKX_32-NEXT: vpmovq2m %xmm1, %k0
2079 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2080 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
2081 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
2082 ; SKX_32-NEXT: kmovw %k0, %eax
2083 ; SKX_32-NEXT: testb $1, %al
2084 ; SKX_32-NEXT: jne .LBB24_1
2085 ; SKX_32-NEXT: # %bb.2: # %else
2086 ; SKX_32-NEXT: testb $2, %al
2087 ; SKX_32-NEXT: jne .LBB24_3
2088 ; SKX_32-NEXT: .LBB24_4: # %else2
2089 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
2091 ; SKX_32-NEXT: .LBB24_1: # %cond.load
2092 ; SKX_32-NEXT: vmovd %xmm0, %ecx
2093 ; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2
2094 ; SKX_32-NEXT: testb $2, %al
2095 ; SKX_32-NEXT: je .LBB24_4
2096 ; SKX_32-NEXT: .LBB24_3: # %cond.load1
2097 ; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
2098 ; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm2
2099 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
2101 %gep.random = getelementptr i32, ptr %base, <2 x i64> %ind
2102 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
2106 define <2 x i32> @test24(ptr %base, <2 x i32> %ind) {
2107 ; KNL_64-LABEL: test24:
2109 ; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0
2110 ; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0
2111 ; KNL_64-NEXT: vmovq %rdi, %xmm1
2112 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
2113 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2114 ; KNL_64-NEXT: vmovq %xmm0, %rax
2115 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
2116 ; KNL_64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2117 ; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0
2120 ; KNL_32-LABEL: test24:
2122 ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
2123 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
2124 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
2125 ; KNL_32-NEXT: vmovd %xmm0, %eax
2126 ; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
2127 ; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2128 ; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
2131 ; SKX-LABEL: test24:
2133 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
2134 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1
2135 ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
2136 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2137 ; SKX-NEXT: vmovq %xmm0, %rax
2138 ; SKX-NEXT: vpextrq $1, %xmm0, %rcx
2139 ; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2140 ; SKX-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0
2143 ; SKX_32-LABEL: test24:
2145 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
2146 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
2147 ; SKX_32-NEXT: vmovd %xmm0, %eax
2148 ; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx
2149 ; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2150 ; SKX_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
2152 %sext_ind = sext <2 x i32> %ind to <2 x i64>
2153 %gep.random = getelementptr i32, ptr %base, <2 x i64> %sext_ind
2154 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
2158 define <2 x i64> @test25(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
2159 ; KNL_64-LABEL: test25:
2161 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
2162 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
2163 ; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0
2164 ; KNL_64-NEXT: vpsllq $3, %xmm0, %xmm0
2165 ; KNL_64-NEXT: vmovq %rdi, %xmm1
2166 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
2167 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2168 ; KNL_64-NEXT: kmovw %k0, %eax
2169 ; KNL_64-NEXT: testb $1, %al
2170 ; KNL_64-NEXT: jne .LBB26_1
2171 ; KNL_64-NEXT: # %bb.2: # %else
2172 ; KNL_64-NEXT: testb $2, %al
2173 ; KNL_64-NEXT: jne .LBB26_3
2174 ; KNL_64-NEXT: .LBB26_4: # %else2
2175 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
2176 ; KNL_64-NEXT: vzeroupper
2178 ; KNL_64-NEXT: .LBB26_1: # %cond.load
2179 ; KNL_64-NEXT: vmovq %xmm0, %rcx
2180 ; KNL_64-NEXT: vpinsrq $0, (%rcx), %xmm2, %xmm2
2181 ; KNL_64-NEXT: testb $2, %al
2182 ; KNL_64-NEXT: je .LBB26_4
2183 ; KNL_64-NEXT: .LBB26_3: # %cond.load1
2184 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
2185 ; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm2
2186 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
2187 ; KNL_64-NEXT: vzeroupper
2190 ; KNL_32-LABEL: test25:
2192 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
2193 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
2194 ; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0
2195 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
2196 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
2197 ; KNL_32-NEXT: kmovw %k0, %eax
2198 ; KNL_32-NEXT: testb $1, %al
2199 ; KNL_32-NEXT: jne .LBB26_1
2200 ; KNL_32-NEXT: # %bb.2: # %else
2201 ; KNL_32-NEXT: testb $2, %al
2202 ; KNL_32-NEXT: jne .LBB26_3
2203 ; KNL_32-NEXT: .LBB26_4: # %else2
2204 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
2205 ; KNL_32-NEXT: vzeroupper
2207 ; KNL_32-NEXT: .LBB26_1: # %cond.load
2208 ; KNL_32-NEXT: vmovd %xmm0, %ecx
2209 ; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm1
2210 ; KNL_32-NEXT: vpinsrd $1, 4(%ecx), %xmm1, %xmm2
2211 ; KNL_32-NEXT: testb $2, %al
2212 ; KNL_32-NEXT: je .LBB26_4
2213 ; KNL_32-NEXT: .LBB26_3: # %cond.load1
2214 ; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
2215 ; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm0
2216 ; KNL_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm2
2217 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
2218 ; KNL_32-NEXT: vzeroupper
2221 ; SKX-LABEL: test25:
2223 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
2224 ; SKX-NEXT: vpmovq2m %xmm1, %k0
2225 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
2226 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1
2227 ; SKX-NEXT: vpsllq $3, %xmm0, %xmm0
2228 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2229 ; SKX-NEXT: kmovw %k0, %eax
2230 ; SKX-NEXT: testb $1, %al
2231 ; SKX-NEXT: jne .LBB26_1
2232 ; SKX-NEXT: # %bb.2: # %else
2233 ; SKX-NEXT: testb $2, %al
2234 ; SKX-NEXT: jne .LBB26_3
2235 ; SKX-NEXT: .LBB26_4: # %else2
2236 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
2238 ; SKX-NEXT: .LBB26_1: # %cond.load
2239 ; SKX-NEXT: vmovq %xmm0, %rcx
2240 ; SKX-NEXT: vpinsrq $0, (%rcx), %xmm2, %xmm2
2241 ; SKX-NEXT: testb $2, %al
2242 ; SKX-NEXT: je .LBB26_4
2243 ; SKX-NEXT: .LBB26_3: # %cond.load1
2244 ; SKX-NEXT: vpextrq $1, %xmm0, %rax
2245 ; SKX-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm2
2246 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
2249 ; SKX_32-LABEL: test25:
2251 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
2252 ; SKX_32-NEXT: vpmovq2m %xmm1, %k0
2253 ; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0
2254 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
2255 ; SKX_32-NEXT: kmovw %k0, %eax
2256 ; SKX_32-NEXT: testb $1, %al
2257 ; SKX_32-NEXT: jne .LBB26_1
2258 ; SKX_32-NEXT: # %bb.2: # %else
2259 ; SKX_32-NEXT: testb $2, %al
2260 ; SKX_32-NEXT: jne .LBB26_3
2261 ; SKX_32-NEXT: .LBB26_4: # %else2
2262 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
2264 ; SKX_32-NEXT: .LBB26_1: # %cond.load
2265 ; SKX_32-NEXT: vmovd %xmm0, %ecx
2266 ; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm1
2267 ; SKX_32-NEXT: vpinsrd $1, 4(%ecx), %xmm1, %xmm2
2268 ; SKX_32-NEXT: testb $2, %al
2269 ; SKX_32-NEXT: je .LBB26_4
2270 ; SKX_32-NEXT: .LBB26_3: # %cond.load1
2271 ; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
2272 ; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm0
2273 ; SKX_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm2
2274 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
2276 %sext_ind = sext <2 x i32> %ind to <2 x i64>
2277 %gep.random = getelementptr i64, ptr %base, <2 x i64> %sext_ind
2278 %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
2282 define <2 x i64> @test26(ptr %base, <2 x i32> %ind, <2 x i64> %src0) {
2283 ; KNL_64-LABEL: test26:
2285 ; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0
2286 ; KNL_64-NEXT: vpsllq $3, %xmm0, %xmm0
2287 ; KNL_64-NEXT: vmovq %rdi, %xmm1
2288 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
2289 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2290 ; KNL_64-NEXT: vmovq %xmm0, %rax
2291 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
2292 ; KNL_64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
2293 ; KNL_64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
2294 ; KNL_64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2297 ; KNL_32-LABEL: test26:
2299 ; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0
2300 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
2301 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
2302 ; KNL_32-NEXT: vmovd %xmm0, %eax
2303 ; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
2304 ; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2305 ; KNL_32-NEXT: vpinsrd $1, 4(%eax), %xmm0, %xmm0
2306 ; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm0, %xmm0
2307 ; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm0, %xmm0
2310 ; SKX-LABEL: test26:
2312 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
2313 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1
2314 ; SKX-NEXT: vpsllq $3, %xmm0, %xmm0
2315 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2316 ; SKX-NEXT: vmovq %xmm0, %rax
2317 ; SKX-NEXT: vpextrq $1, %xmm0, %rcx
2318 ; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
2319 ; SKX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
2320 ; SKX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2323 ; SKX_32-LABEL: test26:
2325 ; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0
2326 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
2327 ; SKX_32-NEXT: vmovd %xmm0, %eax
2328 ; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx
2329 ; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2330 ; SKX_32-NEXT: vpinsrd $1, 4(%eax), %xmm0, %xmm0
2331 ; SKX_32-NEXT: vpinsrd $2, (%ecx), %xmm0, %xmm0
2332 ; SKX_32-NEXT: vpinsrd $3, 4(%ecx), %xmm0, %xmm0
2334 %sext_ind = sext <2 x i32> %ind to <2 x i64>
2335 %gep.random = getelementptr i64, ptr %base, <2 x i64> %sext_ind
2336 %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
2340 ; Result type requires widening; all-ones mask
2341 define <2 x float> @test27(ptr %base, <2 x i32> %ind) {
2342 ; KNL_64-LABEL: test27:
2344 ; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0
2345 ; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0
2346 ; KNL_64-NEXT: vmovq %rdi, %xmm1
2347 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
2348 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2349 ; KNL_64-NEXT: vmovq %xmm0, %rax
2350 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
2351 ; KNL_64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2352 ; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
2355 ; KNL_32-LABEL: test27:
2357 ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
2358 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
2359 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
2360 ; KNL_32-NEXT: vmovd %xmm0, %eax
2361 ; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
2362 ; KNL_32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2363 ; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
2366 ; SKX-LABEL: test27:
2368 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
2369 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1
2370 ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
2371 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2372 ; SKX-NEXT: vmovq %xmm0, %rax
2373 ; SKX-NEXT: vpextrq $1, %xmm0, %rcx
2374 ; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2375 ; SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
2378 ; SKX_32-LABEL: test27:
2380 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
2381 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
2382 ; SKX_32-NEXT: vmovd %xmm0, %eax
2383 ; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx
2384 ; SKX_32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2385 ; SKX_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
2387 %sext_ind = sext <2 x i32> %ind to <2 x i64>
2388 %gep.random = getelementptr float, ptr %base, <2 x i64> %sext_ind
2389 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
2393 ; Data type requires promotion, mask is all-ones
2394 define void @test28(<2 x i32>%a1, <2 x ptr> %ptr) {
2395 ; KNL_64-LABEL: test28:
2397 ; KNL_64-NEXT: vmovq %xmm1, %rax
2398 ; KNL_64-NEXT: vmovss %xmm0, (%rax)
2399 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
2400 ; KNL_64-NEXT: vextractps $1, %xmm0, (%rax)
2403 ; KNL_32-LABEL: test28:
2405 ; KNL_32-NEXT: vmovd %xmm1, %eax
2406 ; KNL_32-NEXT: vmovss %xmm0, (%eax)
2407 ; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
2408 ; KNL_32-NEXT: vextractps $1, %xmm0, (%eax)
2411 ; SKX-LABEL: test28:
2413 ; SKX-NEXT: vmovq %xmm1, %rax
2414 ; SKX-NEXT: vmovss %xmm0, (%rax)
2415 ; SKX-NEXT: vpextrq $1, %xmm1, %rax
2416 ; SKX-NEXT: vextractps $1, %xmm0, (%rax)
2419 ; SKX_32-LABEL: test28:
2421 ; SKX_32-NEXT: vmovd %xmm1, %eax
2422 ; SKX_32-NEXT: vmovss %xmm0, (%eax)
2423 ; SKX_32-NEXT: vpextrd $1, %xmm1, %eax
2424 ; SKX_32-NEXT: vextractps $1, %xmm0, (%eax)
2426 call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> %a1, <2 x ptr> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
2430 ; SCALAR-LABEL: test29
2431 ; SCALAR: extractelement <16 x ptr>
2432 ; SCALAR-NEXT: load float
2433 ; SCALAR-NEXT: insertelement <16 x float>
2434 ; SCALAR-NEXT: extractelement <16 x ptr>
2435 ; SCALAR-NEXT: load float
2437 define <16 x float> @test29(ptr %base, <16 x i32> %ind) {
2438 ; KNL_64-LABEL: test29:
2440 ; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
2441 ; KNL_64-NEXT: movw $44, %ax
2442 ; KNL_64-NEXT: kmovw %eax, %k1
2443 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
2444 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
2447 ; KNL_32-LABEL: test29:
2449 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2450 ; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
2451 ; KNL_32-NEXT: movw $44, %cx
2452 ; KNL_32-NEXT: kmovw %ecx, %k1
2453 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
2454 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
2457 ; SKX-LABEL: test29:
2459 ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
2460 ; SKX-NEXT: movw $44, %ax
2461 ; SKX-NEXT: kmovw %eax, %k1
2462 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
2463 ; SKX-NEXT: vmovaps %zmm1, %zmm0
2466 ; SKX_32-LABEL: test29:
2468 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2469 ; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
2470 ; SKX_32-NEXT: movw $44, %cx
2471 ; SKX_32-NEXT: kmovw %ecx, %k1
2472 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
2473 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
2476 %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
2477 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
2479 %sext_ind = sext <16 x i32> %ind to <16 x i64>
2480 %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
2482 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
2483 ret <16 x float>%res
2486 declare <3 x i32> @llvm.masked.gather.v3i32.v3p0(<3 x ptr>, i32, <3 x i1>, <3 x i32>)
2487 define <3 x i32> @test30(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
2488 ; KNL_64-LABEL: test30:
2490 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
2491 ; KNL_64-NEXT: andl $1, %edi
2492 ; KNL_64-NEXT: kmovw %edi, %k0
2493 ; KNL_64-NEXT: kmovw %esi, %k1
2494 ; KNL_64-NEXT: kshiftlw $15, %k1, %k1
2495 ; KNL_64-NEXT: kshiftrw $14, %k1, %k1
2496 ; KNL_64-NEXT: korw %k1, %k0, %k0
2497 ; KNL_64-NEXT: movw $-5, %ax
2498 ; KNL_64-NEXT: kmovw %eax, %k1
2499 ; KNL_64-NEXT: kandw %k1, %k0, %k0
2500 ; KNL_64-NEXT: kmovw %edx, %k1
2501 ; KNL_64-NEXT: kshiftlw $15, %k1, %k1
2502 ; KNL_64-NEXT: kshiftrw $13, %k1, %k1
2503 ; KNL_64-NEXT: korw %k1, %k0, %k0
2504 ; KNL_64-NEXT: movb $7, %al
2505 ; KNL_64-NEXT: kmovw %eax, %k1
2506 ; KNL_64-NEXT: kandw %k1, %k0, %k0
2507 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
2508 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
2509 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
2510 ; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
2511 ; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
2512 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1}
2513 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
2514 ; KNL_64-NEXT: vzeroupper
2517 ; KNL_32-LABEL: test30:
2519 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
2520 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2521 ; KNL_32-NEXT: andl $1, %eax
2522 ; KNL_32-NEXT: kmovw %eax, %k0
2523 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2524 ; KNL_32-NEXT: kmovw %eax, %k1
2525 ; KNL_32-NEXT: kshiftlw $15, %k1, %k1
2526 ; KNL_32-NEXT: kshiftrw $14, %k1, %k1
2527 ; KNL_32-NEXT: korw %k1, %k0, %k0
2528 ; KNL_32-NEXT: movw $-5, %ax
2529 ; KNL_32-NEXT: kmovw %eax, %k1
2530 ; KNL_32-NEXT: kandw %k1, %k0, %k0
2531 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2532 ; KNL_32-NEXT: kmovw %eax, %k1
2533 ; KNL_32-NEXT: kshiftlw $15, %k1, %k1
2534 ; KNL_32-NEXT: kshiftrw $13, %k1, %k1
2535 ; KNL_32-NEXT: korw %k1, %k0, %k0
2536 ; KNL_32-NEXT: movb $7, %al
2537 ; KNL_32-NEXT: kmovw %eax, %k1
2538 ; KNL_32-NEXT: kandw %k1, %k0, %k0
2539 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
2540 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
2541 ; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
2542 ; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
2543 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
2544 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
2545 ; KNL_32-NEXT: vzeroupper
2548 ; SKX-LABEL: test30:
2550 ; SKX-NEXT: kmovw %esi, %k0
2551 ; SKX-NEXT: kshiftlb $7, %k0, %k0
2552 ; SKX-NEXT: kshiftrb $6, %k0, %k0
2553 ; SKX-NEXT: kmovw %edi, %k1
2554 ; SKX-NEXT: kshiftlb $7, %k1, %k1
2555 ; SKX-NEXT: kshiftrb $7, %k1, %k1
2556 ; SKX-NEXT: korw %k0, %k1, %k0
2557 ; SKX-NEXT: movb $-5, %al
2558 ; SKX-NEXT: kmovw %eax, %k1
2559 ; SKX-NEXT: kandw %k1, %k0, %k0
2560 ; SKX-NEXT: kmovw %edx, %k1
2561 ; SKX-NEXT: kshiftlb $7, %k1, %k1
2562 ; SKX-NEXT: kshiftrb $5, %k1, %k1
2563 ; SKX-NEXT: korw %k1, %k0, %k0
2564 ; SKX-NEXT: movb $7, %al
2565 ; SKX-NEXT: kmovw %eax, %k1
2566 ; SKX-NEXT: kandw %k1, %k0, %k1
2567 ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
2568 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
2569 ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
2570 ; SKX-NEXT: vpgatherqd (,%ymm0), %xmm2 {%k1}
2571 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
2572 ; SKX-NEXT: vzeroupper
2575 ; SKX_32-LABEL: test30:
2577 ; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2578 ; SKX_32-NEXT: kmovw %eax, %k0
2579 ; SKX_32-NEXT: kshiftlb $7, %k0, %k0
2580 ; SKX_32-NEXT: kshiftrb $6, %k0, %k0
2581 ; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2582 ; SKX_32-NEXT: kmovw %eax, %k1
2583 ; SKX_32-NEXT: kshiftlb $7, %k1, %k1
2584 ; SKX_32-NEXT: kshiftrb $7, %k1, %k1
2585 ; SKX_32-NEXT: korw %k0, %k1, %k0
2586 ; SKX_32-NEXT: movb $-5, %al
2587 ; SKX_32-NEXT: kmovw %eax, %k1
2588 ; SKX_32-NEXT: kandw %k1, %k0, %k0
2589 ; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2590 ; SKX_32-NEXT: kmovw %eax, %k1
2591 ; SKX_32-NEXT: kshiftlb $7, %k1, %k1
2592 ; SKX_32-NEXT: kshiftrb $5, %k1, %k1
2593 ; SKX_32-NEXT: korw %k1, %k0, %k0
2594 ; SKX_32-NEXT: movb $7, %al
2595 ; SKX_32-NEXT: kmovw %eax, %k1
2596 ; SKX_32-NEXT: kandw %k1, %k0, %k1
2597 ; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
2598 ; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
2599 ; SKX_32-NEXT: vpgatherdd (,%xmm0), %xmm2 {%k1}
2600 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
2603 %sext_ind = sext <3 x i32> %ind to <3 x i64>
2604 %gep.random = getelementptr i32, <3 x ptr> %base, <3 x i64> %sext_ind
2605 %res = call <3 x i32> @llvm.masked.gather.v3i32.v3p0(<3 x ptr> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
2609 ; Non-power of 2 scatter
2610 declare void @llvm.masked.scatter.v3i32.v3p0(<3 x i32>, <3 x ptr>, i32, <3 x i1>)
2611 define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
2612 ; KNL_64-LABEL: test30b:
2614 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
2615 ; KNL_64-NEXT: andl $1, %edi
2616 ; KNL_64-NEXT: kmovw %edi, %k0
2617 ; KNL_64-NEXT: kmovw %esi, %k1
2618 ; KNL_64-NEXT: kshiftlw $15, %k1, %k1
2619 ; KNL_64-NEXT: kshiftrw $14, %k1, %k1
2620 ; KNL_64-NEXT: korw %k1, %k0, %k0
2621 ; KNL_64-NEXT: movw $-5, %ax
2622 ; KNL_64-NEXT: kmovw %eax, %k1
2623 ; KNL_64-NEXT: kandw %k1, %k0, %k0
2624 ; KNL_64-NEXT: kmovw %edx, %k1
2625 ; KNL_64-NEXT: kshiftlw $15, %k1, %k1
2626 ; KNL_64-NEXT: kshiftrw $13, %k1, %k1
2627 ; KNL_64-NEXT: korw %k1, %k0, %k0
2628 ; KNL_64-NEXT: movb $7, %al
2629 ; KNL_64-NEXT: kmovw %eax, %k1
2630 ; KNL_64-NEXT: kandw %k1, %k0, %k0
2631 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
2632 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
2633 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
2634 ; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
2635 ; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
2636 ; KNL_64-NEXT: vpscatterqd %ymm2, (,%zmm0) {%k1}
2637 ; KNL_64-NEXT: vzeroupper
2640 ; KNL_32-LABEL: test30b:
2642 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
2643 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2644 ; KNL_32-NEXT: andl $1, %eax
2645 ; KNL_32-NEXT: kmovw %eax, %k0
2646 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2647 ; KNL_32-NEXT: kmovw %eax, %k1
2648 ; KNL_32-NEXT: kshiftlw $15, %k1, %k1
2649 ; KNL_32-NEXT: kshiftrw $14, %k1, %k1
2650 ; KNL_32-NEXT: korw %k1, %k0, %k0
2651 ; KNL_32-NEXT: movw $-5, %ax
2652 ; KNL_32-NEXT: kmovw %eax, %k1
2653 ; KNL_32-NEXT: kandw %k1, %k0, %k0
2654 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2655 ; KNL_32-NEXT: kmovw %eax, %k1
2656 ; KNL_32-NEXT: kshiftlw $15, %k1, %k1
2657 ; KNL_32-NEXT: kshiftrw $13, %k1, %k1
2658 ; KNL_32-NEXT: korw %k1, %k0, %k0
2659 ; KNL_32-NEXT: movb $7, %al
2660 ; KNL_32-NEXT: kmovw %eax, %k1
2661 ; KNL_32-NEXT: kandw %k1, %k0, %k0
2662 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
2663 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
2664 ; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
2665 ; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
2666 ; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
2667 ; KNL_32-NEXT: vzeroupper
2670 ; SKX-LABEL: test30b:
2672 ; SKX-NEXT: kmovw %esi, %k0
2673 ; SKX-NEXT: kshiftlb $7, %k0, %k0
2674 ; SKX-NEXT: kshiftrb $6, %k0, %k0
2675 ; SKX-NEXT: kmovw %edi, %k1
2676 ; SKX-NEXT: kshiftlb $7, %k1, %k1
2677 ; SKX-NEXT: kshiftrb $7, %k1, %k1
2678 ; SKX-NEXT: korw %k0, %k1, %k0
2679 ; SKX-NEXT: movb $-5, %al
2680 ; SKX-NEXT: kmovw %eax, %k1
2681 ; SKX-NEXT: kandw %k1, %k0, %k0
2682 ; SKX-NEXT: kmovw %edx, %k1
2683 ; SKX-NEXT: kshiftlb $7, %k1, %k1
2684 ; SKX-NEXT: kshiftrb $5, %k1, %k1
2685 ; SKX-NEXT: korw %k1, %k0, %k0
2686 ; SKX-NEXT: movb $7, %al
2687 ; SKX-NEXT: kmovw %eax, %k1
2688 ; SKX-NEXT: kandw %k1, %k0, %k1
2689 ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
2690 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
2691 ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
2692 ; SKX-NEXT: vpscatterqd %xmm2, (,%ymm0) {%k1}
2693 ; SKX-NEXT: vzeroupper
2696 ; SKX_32-LABEL: test30b:
2698 ; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2699 ; SKX_32-NEXT: kmovw %eax, %k0
2700 ; SKX_32-NEXT: kshiftlb $7, %k0, %k0
2701 ; SKX_32-NEXT: kshiftrb $6, %k0, %k0
2702 ; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2703 ; SKX_32-NEXT: kmovw %eax, %k1
2704 ; SKX_32-NEXT: kshiftlb $7, %k1, %k1
2705 ; SKX_32-NEXT: kshiftrb $7, %k1, %k1
2706 ; SKX_32-NEXT: korw %k0, %k1, %k0
2707 ; SKX_32-NEXT: movb $-5, %al
2708 ; SKX_32-NEXT: kmovw %eax, %k1
2709 ; SKX_32-NEXT: kandw %k1, %k0, %k0
2710 ; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2711 ; SKX_32-NEXT: kmovw %eax, %k1
2712 ; SKX_32-NEXT: kshiftlb $7, %k1, %k1
2713 ; SKX_32-NEXT: kshiftrb $5, %k1, %k1
2714 ; SKX_32-NEXT: korw %k1, %k0, %k0
2715 ; SKX_32-NEXT: movb $7, %al
2716 ; SKX_32-NEXT: kmovw %eax, %k1
2717 ; SKX_32-NEXT: kandw %k1, %k0, %k1
2718 ; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
2719 ; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
2720 ; SKX_32-NEXT: vpscatterdd %xmm2, (,%xmm0) {%k1}
2722 %sext_ind = sext <3 x i32> %ind to <3 x i64>
2723 %gep.random = getelementptr i32, <3 x ptr> %base, <3 x i64> %sext_ind
2724 call void @llvm.masked.scatter.v3i32.v3p0(<3 x i32> %src0, <3 x ptr> %gep.random, i32 4, <3 x i1> %mask)
2728 declare <16 x ptr> @llvm.masked.gather.v16p0.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x ptr>)
2729 define <16 x ptr> @test31(<16 x ptr> %ptrs) {
2730 ; KNL_64-LABEL: test31:
2732 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
2733 ; KNL_64-NEXT: vpxor %xmm2, %xmm2, %xmm2
2734 ; KNL_64-NEXT: vpxor %xmm3, %xmm3, %xmm3
2735 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
2736 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2}
2737 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1}
2738 ; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm0
2739 ; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm1
2742 ; KNL_32-LABEL: test31:
2744 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
2745 ; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
2746 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
2747 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
2750 ; SKX-LABEL: test31:
2752 ; SKX-NEXT: kxnorw %k0, %k0, %k1
2753 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
2754 ; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3
2755 ; SKX-NEXT: kxnorw %k0, %k0, %k2
2756 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2}
2757 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1}
2758 ; SKX-NEXT: vmovdqa64 %zmm3, %zmm0
2759 ; SKX-NEXT: vmovdqa64 %zmm2, %zmm1
2762 ; SKX_32-LABEL: test31:
2764 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2765 ; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
2766 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
2767 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
2770 %res = call <16 x ptr> @llvm.masked.gather.v16p0.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x ptr> undef)
2774 define <16 x i32> @test_gather_16i32(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
2775 ; KNL_64-LABEL: test_gather_16i32:
2777 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2778 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2779 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2780 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2
2781 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2782 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
2783 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
2784 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
2787 ; KNL_32-LABEL: test_gather_16i32:
2789 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2790 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2791 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2792 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
2793 ; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
2796 ; SKX-LABEL: test_gather_16i32:
2798 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2799 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2800 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2801 ; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm2
2802 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2803 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
2804 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
2805 ; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
2808 ; SKX_32-LABEL: test_gather_16i32:
2810 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2811 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2812 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2813 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
2814 ; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
2816 %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
2819 define <16 x i64> @test_gather_16i64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
2820 ; KNL_64-LABEL: test_gather_16i64:
2822 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2823 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2824 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2825 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2826 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
2827 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
2828 ; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm0
2829 ; KNL_64-NEXT: vmovdqa64 %zmm4, %zmm1
2832 ; KNL_32-LABEL: test_gather_16i64:
2834 ; KNL_32-NEXT: pushl %ebp
2835 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2836 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2837 ; KNL_32-NEXT: movl %esp, %ebp
2838 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2839 ; KNL_32-NEXT: andl $-64, %esp
2840 ; KNL_32-NEXT: subl $64, %esp
2841 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2842 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2843 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2844 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
2845 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
2846 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
2847 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2848 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
2849 ; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
2850 ; KNL_32-NEXT: movl %ebp, %esp
2851 ; KNL_32-NEXT: popl %ebp
2852 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2855 ; SKX-LABEL: test_gather_16i64:
2857 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2858 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2859 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2860 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2861 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
2862 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
2863 ; SKX-NEXT: vmovdqa64 %zmm3, %zmm0
2864 ; SKX-NEXT: vmovdqa64 %zmm4, %zmm1
2867 ; SKX_32-LABEL: test_gather_16i64:
2869 ; SKX_32-NEXT: pushl %ebp
2870 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2871 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2872 ; SKX_32-NEXT: movl %esp, %ebp
2873 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2874 ; SKX_32-NEXT: andl $-64, %esp
2875 ; SKX_32-NEXT: subl $64, %esp
2876 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2877 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2878 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2879 ; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
2880 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
2881 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
2882 ; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2883 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
2884 ; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
2885 ; SKX_32-NEXT: movl %ebp, %esp
2886 ; SKX_32-NEXT: popl %ebp
2887 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2889 %res = call <16 x i64> @llvm.masked.gather.v16i64.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
2892 declare <16 x i64> @llvm.masked.gather.v16i64.v16p0(<16 x ptr> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
2893 define <16 x float> @test_gather_16f32(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
2894 ; KNL_64-LABEL: test_gather_16f32:
2896 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2897 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2898 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2899 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2
2900 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2901 ; KNL_64-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
2902 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
2903 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
2906 ; KNL_32-LABEL: test_gather_16f32:
2908 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2909 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2910 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2911 ; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
2912 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
2915 ; SKX-LABEL: test_gather_16f32:
2917 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2918 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2919 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2920 ; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm2
2921 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2922 ; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
2923 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
2924 ; SKX-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
2927 ; SKX_32-LABEL: test_gather_16f32:
2929 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2930 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2931 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2932 ; SKX_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
2933 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0
2935 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
2936 ret <16 x float> %res
2938 define <16 x double> @test_gather_16f64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
2939 ; KNL_64-LABEL: test_gather_16f64:
2941 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2942 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2943 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2944 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2945 ; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
2946 ; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
2947 ; KNL_64-NEXT: vmovapd %zmm3, %zmm0
2948 ; KNL_64-NEXT: vmovapd %zmm4, %zmm1
2951 ; KNL_32-LABEL: test_gather_16f64:
2953 ; KNL_32-NEXT: pushl %ebp
2954 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2955 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2956 ; KNL_32-NEXT: movl %esp, %ebp
2957 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2958 ; KNL_32-NEXT: andl $-64, %esp
2959 ; KNL_32-NEXT: subl $64, %esp
2960 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2961 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2962 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2963 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
2964 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
2965 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
2966 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2967 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
2968 ; KNL_32-NEXT: vmovapd %zmm2, %zmm0
2969 ; KNL_32-NEXT: movl %ebp, %esp
2970 ; KNL_32-NEXT: popl %ebp
2971 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2974 ; SKX-LABEL: test_gather_16f64:
2976 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2977 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2978 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2979 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2980 ; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
2981 ; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
2982 ; SKX-NEXT: vmovapd %zmm3, %zmm0
2983 ; SKX-NEXT: vmovapd %zmm4, %zmm1
2986 ; SKX_32-LABEL: test_gather_16f64:
2988 ; SKX_32-NEXT: pushl %ebp
2989 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2990 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2991 ; SKX_32-NEXT: movl %esp, %ebp
2992 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2993 ; SKX_32-NEXT: andl $-64, %esp
2994 ; SKX_32-NEXT: subl $64, %esp
2995 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2996 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2997 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2998 ; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
2999 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
3000 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
3001 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
3002 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
3003 ; SKX_32-NEXT: vmovapd %zmm2, %zmm0
3004 ; SKX_32-NEXT: movl %ebp, %esp
3005 ; SKX_32-NEXT: popl %ebp
3006 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
3008 %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
3009 ret <16 x double> %res
3011 declare <16 x double> @llvm.masked.gather.v16f64.v16p0(<16 x ptr> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
3012 define void @test_scatter_16i32(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
3013 ; KNL_64-LABEL: test_scatter_16i32:
3015 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
3016 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
3017 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
3018 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
3019 ; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
3020 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
3021 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
3022 ; KNL_64-NEXT: vzeroupper
3025 ; KNL_32-LABEL: test_scatter_16i32:
3027 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
3028 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
3029 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
3030 ; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
3031 ; KNL_32-NEXT: vzeroupper
3034 ; SKX-LABEL: test_scatter_16i32:
3036 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
3037 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
3038 ; SKX-NEXT: vpmovd2m %zmm2, %k1
3039 ; SKX-NEXT: kshiftrw $8, %k1, %k2
3040 ; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
3041 ; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm0
3042 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
3043 ; SKX-NEXT: vzeroupper
3046 ; SKX_32-LABEL: test_scatter_16i32:
3048 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
3049 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
3050 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
3051 ; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
3052 ; SKX_32-NEXT: vzeroupper
3054 call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %src0, <16 x ptr> %ptrs, i32 4, <16 x i1> %mask)
3057 define void @test_scatter_16i64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
3058 ; KNL_64-LABEL: test_scatter_16i64:
3060 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
3061 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
3062 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
3063 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
3064 ; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
3065 ; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
3066 ; KNL_64-NEXT: vzeroupper
3069 ; KNL_32-LABEL: test_scatter_16i64:
3071 ; KNL_32-NEXT: pushl %ebp
3072 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
3073 ; KNL_32-NEXT: .cfi_offset %ebp, -8
3074 ; KNL_32-NEXT: movl %esp, %ebp
3075 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
3076 ; KNL_32-NEXT: andl $-64, %esp
3077 ; KNL_32-NEXT: subl $64, %esp
3078 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
3079 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
3080 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
3081 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
3082 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
3083 ; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
3084 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
3085 ; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
3086 ; KNL_32-NEXT: movl %ebp, %esp
3087 ; KNL_32-NEXT: popl %ebp
3088 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
3089 ; KNL_32-NEXT: vzeroupper
3092 ; SKX-LABEL: test_scatter_16i64:
3094 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
3095 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
3096 ; SKX-NEXT: vpmovd2m %zmm2, %k1
3097 ; SKX-NEXT: kshiftrw $8, %k1, %k2
3098 ; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
3099 ; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
3100 ; SKX-NEXT: vzeroupper
3103 ; SKX_32-LABEL: test_scatter_16i64:
3105 ; SKX_32-NEXT: pushl %ebp
3106 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
3107 ; SKX_32-NEXT: .cfi_offset %ebp, -8
3108 ; SKX_32-NEXT: movl %esp, %ebp
3109 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
3110 ; SKX_32-NEXT: andl $-64, %esp
3111 ; SKX_32-NEXT: subl $64, %esp
3112 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
3113 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
3114 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
3115 ; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
3116 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
3117 ; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
3118 ; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
3119 ; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
3120 ; SKX_32-NEXT: movl %ebp, %esp
3121 ; SKX_32-NEXT: popl %ebp
3122 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
3123 ; SKX_32-NEXT: vzeroupper
3125 call void @llvm.masked.scatter.v16i64.v16p0(<16 x i64> %src0, <16 x ptr> %ptrs, i32 4, <16 x i1> %mask)
3128 declare void @llvm.masked.scatter.v16i64.v16p0(<16 x i64> %src0, <16 x ptr> %ptrs, i32, <16 x i1> %mask)
3129 define void @test_scatter_16f32(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
3130 ; KNL_64-LABEL: test_scatter_16f32:
3132 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
3133 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
3134 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
3135 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
3136 ; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
3137 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0
3138 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
3139 ; KNL_64-NEXT: vzeroupper
3142 ; KNL_32-LABEL: test_scatter_16f32:
3144 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
3145 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
3146 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
3147 ; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
3148 ; KNL_32-NEXT: vzeroupper
3151 ; SKX-LABEL: test_scatter_16f32:
3153 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
3154 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
3155 ; SKX-NEXT: vpmovd2m %zmm2, %k1
3156 ; SKX-NEXT: kshiftrw $8, %k1, %k2
3157 ; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
3158 ; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm0
3159 ; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
3160 ; SKX-NEXT: vzeroupper
3163 ; SKX_32-LABEL: test_scatter_16f32:
3165 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
3166 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
3167 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
3168 ; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
3169 ; SKX_32-NEXT: vzeroupper
3171 call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %src0, <16 x ptr> %ptrs, i32 4, <16 x i1> %mask)
3174 declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %src0, <16 x ptr> %ptrs, i32, <16 x i1> %mask)
3175 define void @test_scatter_16f64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
3176 ; KNL_64-LABEL: test_scatter_16f64:
3178 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
3179 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
3180 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
3181 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
3182 ; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
3183 ; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
3184 ; KNL_64-NEXT: vzeroupper
3187 ; KNL_32-LABEL: test_scatter_16f64:
3189 ; KNL_32-NEXT: pushl %ebp
3190 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
3191 ; KNL_32-NEXT: .cfi_offset %ebp, -8
3192 ; KNL_32-NEXT: movl %esp, %ebp
3193 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
3194 ; KNL_32-NEXT: andl $-64, %esp
3195 ; KNL_32-NEXT: subl $64, %esp
3196 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
3197 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
3198 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
3199 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
3200 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
3201 ; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
3202 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
3203 ; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
3204 ; KNL_32-NEXT: movl %ebp, %esp
3205 ; KNL_32-NEXT: popl %ebp
3206 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
3207 ; KNL_32-NEXT: vzeroupper
3210 ; SKX-LABEL: test_scatter_16f64:
3212 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
3213 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
3214 ; SKX-NEXT: vpmovd2m %zmm2, %k1
3215 ; SKX-NEXT: kshiftrw $8, %k1, %k2
3216 ; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
3217 ; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
3218 ; SKX-NEXT: vzeroupper
3221 ; SKX_32-LABEL: test_scatter_16f64:
3223 ; SKX_32-NEXT: pushl %ebp
3224 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
3225 ; SKX_32-NEXT: .cfi_offset %ebp, -8
3226 ; SKX_32-NEXT: movl %esp, %ebp
3227 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
3228 ; SKX_32-NEXT: andl $-64, %esp
3229 ; SKX_32-NEXT: subl $64, %esp
3230 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
3231 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
3232 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
3233 ; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
3234 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
3235 ; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
3236 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
3237 ; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
3238 ; SKX_32-NEXT: movl %ebp, %esp
3239 ; SKX_32-NEXT: popl %ebp
3240 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
3241 ; SKX_32-NEXT: vzeroupper
3243 call void @llvm.masked.scatter.v16f64.v16p0(<16 x double> %src0, <16 x ptr> %ptrs, i32 4, <16 x i1> %mask)
3246 declare void @llvm.masked.scatter.v16f64.v16p0(<16 x double> %src0, <16 x ptr> %ptrs, i32, <16 x i1> %mask)
3248 define <4 x i64> @test_pr28312(<4 x ptr> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64> %d) {
3249 ; KNL_64-LABEL: test_pr28312:
3251 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
3252 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
3253 ; KNL_64-NEXT: kmovw %k0, %eax
3254 ; KNL_64-NEXT: testb $1, %al
3255 ; KNL_64-NEXT: # implicit-def: $ymm1
3256 ; KNL_64-NEXT: je .LBB42_2
3257 ; KNL_64-NEXT: # %bb.1: # %cond.load
3258 ; KNL_64-NEXT: vmovq %xmm0, %rcx
3259 ; KNL_64-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
3260 ; KNL_64-NEXT: .LBB42_2: # %else
3261 ; KNL_64-NEXT: testb $2, %al
3262 ; KNL_64-NEXT: je .LBB42_4
3263 ; KNL_64-NEXT: # %bb.3: # %cond.load1
3264 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
3265 ; KNL_64-NEXT: vpinsrq $1, (%rcx), %xmm1, %xmm2
3266 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
3267 ; KNL_64-NEXT: .LBB42_4: # %else2
3268 ; KNL_64-NEXT: testb $4, %al
3269 ; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm2
3270 ; KNL_64-NEXT: je .LBB42_6
3271 ; KNL_64-NEXT: # %bb.5: # %cond.load4
3272 ; KNL_64-NEXT: vmovq %xmm2, %rcx
3273 ; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm3
3274 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
3275 ; KNL_64-NEXT: .LBB42_6: # %else5
3276 ; KNL_64-NEXT: testb $8, %al
3277 ; KNL_64-NEXT: je .LBB42_8
3278 ; KNL_64-NEXT: # %bb.7: # %cond.load7
3279 ; KNL_64-NEXT: vpextrq $1, %xmm2, %rax
3280 ; KNL_64-NEXT: vpbroadcastq (%rax), %ymm3
3281 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
3282 ; KNL_64-NEXT: .LBB42_8: # %else8
3283 ; KNL_64-NEXT: kmovw %k0, %eax
3284 ; KNL_64-NEXT: testb $1, %al
3285 ; KNL_64-NEXT: # implicit-def: $ymm3
3286 ; KNL_64-NEXT: jne .LBB42_9
3287 ; KNL_64-NEXT: # %bb.10: # %else15
3288 ; KNL_64-NEXT: testb $2, %al
3289 ; KNL_64-NEXT: jne .LBB42_11
3290 ; KNL_64-NEXT: .LBB42_12: # %else21
3291 ; KNL_64-NEXT: testb $4, %al
3292 ; KNL_64-NEXT: jne .LBB42_13
3293 ; KNL_64-NEXT: .LBB42_14: # %else27
3294 ; KNL_64-NEXT: testb $8, %al
3295 ; KNL_64-NEXT: je .LBB42_16
3296 ; KNL_64-NEXT: .LBB42_15: # %cond.load29
3297 ; KNL_64-NEXT: vpextrq $1, %xmm2, %rax
3298 ; KNL_64-NEXT: vpbroadcastq (%rax), %ymm4
3299 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
3300 ; KNL_64-NEXT: .LBB42_16: # %else33
3301 ; KNL_64-NEXT: kmovw %k0, %eax
3302 ; KNL_64-NEXT: testb $1, %al
3303 ; KNL_64-NEXT: # implicit-def: $ymm4
3304 ; KNL_64-NEXT: jne .LBB42_17
3305 ; KNL_64-NEXT: # %bb.18: # %else40
3306 ; KNL_64-NEXT: testb $2, %al
3307 ; KNL_64-NEXT: jne .LBB42_19
3308 ; KNL_64-NEXT: .LBB42_20: # %else46
3309 ; KNL_64-NEXT: testb $4, %al
3310 ; KNL_64-NEXT: jne .LBB42_21
3311 ; KNL_64-NEXT: .LBB42_22: # %else52
3312 ; KNL_64-NEXT: testb $8, %al
3313 ; KNL_64-NEXT: je .LBB42_24
3314 ; KNL_64-NEXT: .LBB42_23: # %cond.load54
3315 ; KNL_64-NEXT: vpextrq $1, %xmm2, %rax
3316 ; KNL_64-NEXT: vpbroadcastq (%rax), %ymm0
3317 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm0[6,7]
3318 ; KNL_64-NEXT: .LBB42_24: # %else58
3319 ; KNL_64-NEXT: vpaddq %ymm3, %ymm1, %ymm0
3320 ; KNL_64-NEXT: vpaddq %ymm4, %ymm0, %ymm0
3322 ; KNL_64-NEXT: .LBB42_9: # %cond.load11
3323 ; KNL_64-NEXT: vmovq %xmm0, %rcx
3324 ; KNL_64-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
3325 ; KNL_64-NEXT: testb $2, %al
3326 ; KNL_64-NEXT: je .LBB42_12
3327 ; KNL_64-NEXT: .LBB42_11: # %cond.load17
3328 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
3329 ; KNL_64-NEXT: vpinsrq $1, (%rcx), %xmm3, %xmm4
3330 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3331 ; KNL_64-NEXT: testb $4, %al
3332 ; KNL_64-NEXT: je .LBB42_14
3333 ; KNL_64-NEXT: .LBB42_13: # %cond.load23
3334 ; KNL_64-NEXT: vmovq %xmm2, %rcx
3335 ; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm4
3336 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
3337 ; KNL_64-NEXT: testb $8, %al
3338 ; KNL_64-NEXT: jne .LBB42_15
3339 ; KNL_64-NEXT: jmp .LBB42_16
3340 ; KNL_64-NEXT: .LBB42_17: # %cond.load36
3341 ; KNL_64-NEXT: vmovq %xmm0, %rcx
3342 ; KNL_64-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
3343 ; KNL_64-NEXT: testb $2, %al
3344 ; KNL_64-NEXT: je .LBB42_20
3345 ; KNL_64-NEXT: .LBB42_19: # %cond.load42
3346 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
3347 ; KNL_64-NEXT: vpinsrq $1, (%rcx), %xmm4, %xmm0
3348 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm4[4,5,6,7]
3349 ; KNL_64-NEXT: testb $4, %al
3350 ; KNL_64-NEXT: je .LBB42_22
3351 ; KNL_64-NEXT: .LBB42_21: # %cond.load48
3352 ; KNL_64-NEXT: vmovq %xmm2, %rcx
3353 ; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm0
3354 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7]
3355 ; KNL_64-NEXT: testb $8, %al
3356 ; KNL_64-NEXT: jne .LBB42_23
3357 ; KNL_64-NEXT: jmp .LBB42_24
3359 ; KNL_32-LABEL: test_pr28312:
3361 ; KNL_32-NEXT: pushl %ebp
3362 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
3363 ; KNL_32-NEXT: .cfi_offset %ebp, -8
3364 ; KNL_32-NEXT: movl %esp, %ebp
3365 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
3366 ; KNL_32-NEXT: pushl %ebx
3367 ; KNL_32-NEXT: pushl %esi
3368 ; KNL_32-NEXT: andl $-32, %esp
3369 ; KNL_32-NEXT: subl $32, %esp
3370 ; KNL_32-NEXT: .cfi_offset %esi, -16
3371 ; KNL_32-NEXT: .cfi_offset %ebx, -12
3372 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
3373 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
3374 ; KNL_32-NEXT: kmovw %k0, %ebx
3375 ; KNL_32-NEXT: testb $1, %bl
3376 ; KNL_32-NEXT: vmovd %xmm0, %eax
3377 ; KNL_32-NEXT: # implicit-def: $ymm1
3378 ; KNL_32-NEXT: je .LBB42_2
3379 ; KNL_32-NEXT: # %bb.1: # %cond.load
3380 ; KNL_32-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
3381 ; KNL_32-NEXT: .LBB42_2: # %else
3382 ; KNL_32-NEXT: testb $2, %bl
3383 ; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
3384 ; KNL_32-NEXT: je .LBB42_4
3385 ; KNL_32-NEXT: # %bb.3: # %cond.load1
3386 ; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm1, %xmm2
3387 ; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm2, %xmm2
3388 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
3389 ; KNL_32-NEXT: .LBB42_4: # %else2
3390 ; KNL_32-NEXT: testb $4, %bl
3391 ; KNL_32-NEXT: vpextrd $2, %xmm0, %edx
3392 ; KNL_32-NEXT: je .LBB42_6
3393 ; KNL_32-NEXT: # %bb.5: # %cond.load4
3394 ; KNL_32-NEXT: vpbroadcastd (%edx), %ymm2
3395 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7]
3396 ; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm2
3397 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
3398 ; KNL_32-NEXT: .LBB42_6: # %else5
3399 ; KNL_32-NEXT: testb $8, %bl
3400 ; KNL_32-NEXT: vpextrd $3, %xmm0, %esi
3401 ; KNL_32-NEXT: je .LBB42_8
3402 ; KNL_32-NEXT: # %bb.7: # %cond.load7
3403 ; KNL_32-NEXT: vpbroadcastd (%esi), %ymm0
3404 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
3405 ; KNL_32-NEXT: vpbroadcastd 4(%esi), %ymm1
3406 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7]
3407 ; KNL_32-NEXT: .LBB42_8: # %else8
3408 ; KNL_32-NEXT: kmovw %k0, %ebx
3409 ; KNL_32-NEXT: testb $1, %bl
3410 ; KNL_32-NEXT: # implicit-def: $ymm0
3411 ; KNL_32-NEXT: jne .LBB42_9
3412 ; KNL_32-NEXT: # %bb.10: # %else15
3413 ; KNL_32-NEXT: testb $2, %bl
3414 ; KNL_32-NEXT: jne .LBB42_11
3415 ; KNL_32-NEXT: .LBB42_12: # %else21
3416 ; KNL_32-NEXT: testb $4, %bl
3417 ; KNL_32-NEXT: jne .LBB42_13
3418 ; KNL_32-NEXT: .LBB42_14: # %else27
3419 ; KNL_32-NEXT: testb $8, %bl
3420 ; KNL_32-NEXT: je .LBB42_16
3421 ; KNL_32-NEXT: .LBB42_15: # %cond.load29
3422 ; KNL_32-NEXT: vpbroadcastd (%esi), %ymm2
3423 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7]
3424 ; KNL_32-NEXT: vpbroadcastd 4(%esi), %ymm2
3425 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
3426 ; KNL_32-NEXT: .LBB42_16: # %else33
3427 ; KNL_32-NEXT: kmovw %k0, %ebx
3428 ; KNL_32-NEXT: testb $1, %bl
3429 ; KNL_32-NEXT: # implicit-def: $ymm2
3430 ; KNL_32-NEXT: jne .LBB42_17
3431 ; KNL_32-NEXT: # %bb.18: # %else40
3432 ; KNL_32-NEXT: testb $2, %bl
3433 ; KNL_32-NEXT: jne .LBB42_19
3434 ; KNL_32-NEXT: .LBB42_20: # %else46
3435 ; KNL_32-NEXT: testb $4, %bl
3436 ; KNL_32-NEXT: jne .LBB42_21
3437 ; KNL_32-NEXT: .LBB42_22: # %else52
3438 ; KNL_32-NEXT: testb $8, %bl
3439 ; KNL_32-NEXT: je .LBB42_24
3440 ; KNL_32-NEXT: .LBB42_23: # %cond.load54
3441 ; KNL_32-NEXT: vpbroadcastd (%esi), %ymm3
3442 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7]
3443 ; KNL_32-NEXT: vpbroadcastd 4(%esi), %ymm3
3444 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
3445 ; KNL_32-NEXT: .LBB42_24: # %else58
3446 ; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
3447 ; KNL_32-NEXT: vpaddq %ymm2, %ymm0, %ymm0
3448 ; KNL_32-NEXT: leal -8(%ebp), %esp
3449 ; KNL_32-NEXT: popl %esi
3450 ; KNL_32-NEXT: popl %ebx
3451 ; KNL_32-NEXT: popl %ebp
3452 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
3454 ; KNL_32-NEXT: .LBB42_9: # %cond.load11
3455 ; KNL_32-NEXT: .cfi_def_cfa %ebp, 8
3456 ; KNL_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
3457 ; KNL_32-NEXT: testb $2, %bl
3458 ; KNL_32-NEXT: je .LBB42_12
3459 ; KNL_32-NEXT: .LBB42_11: # %cond.load17
3460 ; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm0, %xmm2
3461 ; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm2, %xmm2
3462 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3463 ; KNL_32-NEXT: testb $4, %bl
3464 ; KNL_32-NEXT: je .LBB42_14
3465 ; KNL_32-NEXT: .LBB42_13: # %cond.load23
3466 ; KNL_32-NEXT: vpbroadcastd (%edx), %ymm2
3467 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7]
3468 ; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm2
3469 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
3470 ; KNL_32-NEXT: testb $8, %bl
3471 ; KNL_32-NEXT: jne .LBB42_15
3472 ; KNL_32-NEXT: jmp .LBB42_16
3473 ; KNL_32-NEXT: .LBB42_17: # %cond.load36
3474 ; KNL_32-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
3475 ; KNL_32-NEXT: testb $2, %bl
3476 ; KNL_32-NEXT: je .LBB42_20
3477 ; KNL_32-NEXT: .LBB42_19: # %cond.load42
3478 ; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm2, %xmm3
3479 ; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm3, %xmm3
3480 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
3481 ; KNL_32-NEXT: testb $4, %bl
3482 ; KNL_32-NEXT: je .LBB42_22
3483 ; KNL_32-NEXT: .LBB42_21: # %cond.load48
3484 ; KNL_32-NEXT: vpbroadcastd (%edx), %ymm3
3485 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7]
3486 ; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm3
3487 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
3488 ; KNL_32-NEXT: testb $8, %bl
3489 ; KNL_32-NEXT: jne .LBB42_23
3490 ; KNL_32-NEXT: jmp .LBB42_24
3492 ; SKX-LABEL: test_pr28312:
3494 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
3495 ; SKX-NEXT: vpmovd2m %xmm1, %k1
3496 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
3497 ; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1}
3498 ; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
3499 ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
3502 ; SKX_32-LABEL: test_pr28312:
3504 ; SKX_32-NEXT: pushl %ebp
3505 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
3506 ; SKX_32-NEXT: .cfi_offset %ebp, -8
3507 ; SKX_32-NEXT: movl %esp, %ebp
3508 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
3509 ; SKX_32-NEXT: andl $-32, %esp
3510 ; SKX_32-NEXT: subl $32, %esp
3511 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
3512 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
3513 ; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
3514 ; SKX_32-NEXT: vpgatherdq (,%xmm0), %ymm1 {%k1}
3515 ; SKX_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
3516 ; SKX_32-NEXT: vpaddq %ymm1, %ymm0, %ymm0
3517 ; SKX_32-NEXT: movl %ebp, %esp
3518 ; SKX_32-NEXT: popl %ebp
3519 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
3521 %g1 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
3522 %g2 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
3523 %g3 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
3524 %a = add <4 x i64> %g1, %g2
3525 %b = add <4 x i64> %a, %g3
3528 declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i64>)
3530 define <8 x i32> @test_global_array(<8 x i64> %indxs) {
3531 ; KNL_64-LABEL: test_global_array:
3533 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
3534 ; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
3535 ; KNL_64-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
3536 ; KNL_64-NEXT: vmovdqa %ymm1, %ymm0
3539 ; KNL_32-LABEL: test_global_array:
3541 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
3542 ; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
3543 ; KNL_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
3544 ; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
3547 ; SKX_SMALL-LABEL: test_global_array:
3548 ; SKX_SMALL: # %bb.0:
3549 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
3550 ; SKX_SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
3551 ; SKX_SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
3552 ; SKX_SMALL-NEXT: vmovdqa %ymm1, %ymm0
3553 ; SKX_SMALL-NEXT: retq
3555 ; SKX_LARGE-LABEL: test_global_array:
3556 ; SKX_LARGE: # %bb.0:
3557 ; SKX_LARGE-NEXT: movabsq $glob_array, %rax
3558 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
3559 ; SKX_LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1
3560 ; SKX_LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
3561 ; SKX_LARGE-NEXT: vmovdqa %ymm1, %ymm0
3562 ; SKX_LARGE-NEXT: retq
3564 ; SKX_32-LABEL: test_global_array:
3566 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
3567 ; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
3568 ; SKX_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
3569 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm0
3571 %p = getelementptr inbounds [16 x i32], ptr @glob_array, i64 0, <8 x i64> %indxs
3572 %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
3576 define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) {
3577 ; KNL_64-LABEL: test_global_array_zeroinitializer_index:
3579 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
3580 ; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
3581 ; KNL_64-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
3582 ; KNL_64-NEXT: vmovdqa %ymm1, %ymm0
3585 ; KNL_32-LABEL: test_global_array_zeroinitializer_index:
3587 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
3588 ; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
3589 ; KNL_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
3590 ; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
3593 ; SKX_SMALL-LABEL: test_global_array_zeroinitializer_index:
3594 ; SKX_SMALL: # %bb.0:
3595 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
3596 ; SKX_SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
3597 ; SKX_SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
3598 ; SKX_SMALL-NEXT: vmovdqa %ymm1, %ymm0
3599 ; SKX_SMALL-NEXT: retq
3601 ; SKX_LARGE-LABEL: test_global_array_zeroinitializer_index:
3602 ; SKX_LARGE: # %bb.0:
3603 ; SKX_LARGE-NEXT: movabsq $glob_array, %rax
3604 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
3605 ; SKX_LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1
3606 ; SKX_LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
3607 ; SKX_LARGE-NEXT: vmovdqa %ymm1, %ymm0
3608 ; SKX_LARGE-NEXT: retq
3610 ; SKX_32-LABEL: test_global_array_zeroinitializer_index:
3612 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
3613 ; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
3614 ; SKX_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
3615 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm0
3617 %p = getelementptr inbounds [16 x i32], ptr @glob_array, <8 x i64> zeroinitializer, <8 x i64> %indxs
3618 %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
3622 define void @v1_scatter(<1 x i32>%a1, <1 x ptr> %ptr, <1 x i1> %mask) {
3623 ; KNL_64-LABEL: v1_scatter:
3625 ; KNL_64-NEXT: testb $1, %dl
3626 ; KNL_64-NEXT: je .LBB45_2
3627 ; KNL_64-NEXT: # %bb.1: # %cond.store
3628 ; KNL_64-NEXT: movl %edi, (%rsi)
3629 ; KNL_64-NEXT: .LBB45_2: # %else
3632 ; KNL_32-LABEL: v1_scatter:
3634 ; KNL_32-NEXT: testb $1, {{[0-9]+}}(%esp)
3635 ; KNL_32-NEXT: je .LBB45_2
3636 ; KNL_32-NEXT: # %bb.1: # %cond.store
3637 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3638 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
3639 ; KNL_32-NEXT: movl %ecx, (%eax)
3640 ; KNL_32-NEXT: .LBB45_2: # %else
3643 ; SKX-LABEL: v1_scatter:
3645 ; SKX-NEXT: testb $1, %dl
3646 ; SKX-NEXT: je .LBB45_2
3647 ; SKX-NEXT: # %bb.1: # %cond.store
3648 ; SKX-NEXT: movl %edi, (%rsi)
3649 ; SKX-NEXT: .LBB45_2: # %else
3652 ; SKX_32-LABEL: v1_scatter:
3654 ; SKX_32-NEXT: testb $1, {{[0-9]+}}(%esp)
3655 ; SKX_32-NEXT: je .LBB45_2
3656 ; SKX_32-NEXT: # %bb.1: # %cond.store
3657 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3658 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
3659 ; SKX_32-NEXT: movl %ecx, (%eax)
3660 ; SKX_32-NEXT: .LBB45_2: # %else
3662 call void @llvm.masked.scatter.v1i32.v1p0(<1 x i32> %a1, <1 x ptr> %ptr, i32 4, <1 x i1> %mask)
3665 declare void @llvm.masked.scatter.v1i32.v1p0(<1 x i32>, <1 x ptr>, i32, <1 x i1>)
3667 define <1 x i32> @v1_gather(<1 x ptr> %ptr, <1 x i1> %mask, <1 x i32> %src0) {
3668 ; KNL_64-LABEL: v1_gather:
3670 ; KNL_64-NEXT: movl (%rdi), %eax
3673 ; KNL_32-LABEL: v1_gather:
3675 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3676 ; KNL_32-NEXT: movl (%eax), %eax
3679 ; SKX-LABEL: v1_gather:
3681 ; SKX-NEXT: movl (%rdi), %eax
3684 ; SKX_32-LABEL: v1_gather:
3686 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3687 ; SKX_32-NEXT: movl (%eax), %eax
3689 %res = call <1 x i32> @llvm.masked.gather.v1i32.v1p0(<1 x ptr> %ptr, i32 4, <1 x i1> <i1 true>, <1 x i32> %src0)
3692 declare <1 x i32> @llvm.masked.gather.v1i32.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i32>)
3694 ; Make sure we don't crash when the index element type is larger than i64 and we need to widen the result
3695 ; This experienced a bad interaction when we widened and then tried to split.
3696 define <2 x float> @large_index(ptr %base, <2 x i128> %ind, <2 x i1> %mask, <2 x float> %src0) {
3697 ; KNL_64-LABEL: large_index:
3699 ; KNL_64-NEXT: vpsllq $63, %xmm0, %xmm0
3700 ; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k0
3701 ; KNL_64-NEXT: vmovq %rcx, %xmm0
3702 ; KNL_64-NEXT: vmovq %rsi, %xmm2
3703 ; KNL_64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
3704 ; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0
3705 ; KNL_64-NEXT: vmovq %rdi, %xmm2
3706 ; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2
3707 ; KNL_64-NEXT: vpaddq %xmm0, %xmm2, %xmm0
3708 ; KNL_64-NEXT: kmovw %k0, %eax
3709 ; KNL_64-NEXT: testb $1, %al
3710 ; KNL_64-NEXT: jne .LBB47_1
3711 ; KNL_64-NEXT: # %bb.2: # %else
3712 ; KNL_64-NEXT: testb $2, %al
3713 ; KNL_64-NEXT: jne .LBB47_3
3714 ; KNL_64-NEXT: .LBB47_4: # %else2
3715 ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
3716 ; KNL_64-NEXT: vzeroupper
3718 ; KNL_64-NEXT: .LBB47_1: # %cond.load
3719 ; KNL_64-NEXT: vmovq %xmm0, %rcx
3720 ; KNL_64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
3721 ; KNL_64-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
3722 ; KNL_64-NEXT: testb $2, %al
3723 ; KNL_64-NEXT: je .LBB47_4
3724 ; KNL_64-NEXT: .LBB47_3: # %cond.load1
3725 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
3726 ; KNL_64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
3727 ; KNL_64-NEXT: vmovaps %xmm1, %xmm0
3728 ; KNL_64-NEXT: vzeroupper
3731 ; KNL_32-LABEL: large_index:
3733 ; KNL_32-NEXT: vpsllq $63, %xmm0, %xmm0
3734 ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0
3735 ; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3736 ; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
3737 ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
3738 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2
3739 ; KNL_32-NEXT: vpaddd %xmm0, %xmm2, %xmm0
3740 ; KNL_32-NEXT: kmovw %k0, %eax
3741 ; KNL_32-NEXT: testb $1, %al
3742 ; KNL_32-NEXT: jne .LBB47_1
3743 ; KNL_32-NEXT: # %bb.2: # %else
3744 ; KNL_32-NEXT: testb $2, %al
3745 ; KNL_32-NEXT: jne .LBB47_3
3746 ; KNL_32-NEXT: .LBB47_4: # %else2
3747 ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
3748 ; KNL_32-NEXT: vzeroupper
3750 ; KNL_32-NEXT: .LBB47_1: # %cond.load
3751 ; KNL_32-NEXT: vmovd %xmm0, %ecx
3752 ; KNL_32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
3753 ; KNL_32-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
3754 ; KNL_32-NEXT: testb $2, %al
3755 ; KNL_32-NEXT: je .LBB47_4
3756 ; KNL_32-NEXT: .LBB47_3: # %cond.load1
3757 ; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
3758 ; KNL_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
3759 ; KNL_32-NEXT: vmovaps %xmm1, %xmm0
3760 ; KNL_32-NEXT: vzeroupper
3763 ; SKX-LABEL: large_index:
3765 ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
3766 ; SKX-NEXT: vpmovq2m %xmm0, %k0
3767 ; SKX-NEXT: vmovq %rcx, %xmm0
3768 ; SKX-NEXT: vmovq %rsi, %xmm2
3769 ; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
3770 ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
3771 ; SKX-NEXT: vpbroadcastq %rdi, %xmm2
3772 ; SKX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
3773 ; SKX-NEXT: kmovw %k0, %eax
3774 ; SKX-NEXT: testb $1, %al
3775 ; SKX-NEXT: jne .LBB47_1
3776 ; SKX-NEXT: # %bb.2: # %else
3777 ; SKX-NEXT: testb $2, %al
3778 ; SKX-NEXT: jne .LBB47_3
3779 ; SKX-NEXT: .LBB47_4: # %else2
3780 ; SKX-NEXT: vmovdqa %xmm1, %xmm0
3782 ; SKX-NEXT: .LBB47_1: # %cond.load
3783 ; SKX-NEXT: vmovq %xmm0, %rcx
3784 ; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
3785 ; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
3786 ; SKX-NEXT: testb $2, %al
3787 ; SKX-NEXT: je .LBB47_4
3788 ; SKX-NEXT: .LBB47_3: # %cond.load1
3789 ; SKX-NEXT: vpextrq $1, %xmm0, %rax
3790 ; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
3791 ; SKX-NEXT: vmovaps %xmm1, %xmm0
3794 ; SKX_32-LABEL: large_index:
3796 ; SKX_32-NEXT: vpsllq $63, %xmm0, %xmm0
3797 ; SKX_32-NEXT: vpmovq2m %xmm0, %k0
3798 ; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
3799 ; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
3800 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
3801 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
3802 ; SKX_32-NEXT: kmovw %k0, %eax
3803 ; SKX_32-NEXT: testb $1, %al
3804 ; SKX_32-NEXT: jne .LBB47_1
3805 ; SKX_32-NEXT: # %bb.2: # %else
3806 ; SKX_32-NEXT: testb $2, %al
3807 ; SKX_32-NEXT: jne .LBB47_3
3808 ; SKX_32-NEXT: .LBB47_4: # %else2
3809 ; SKX_32-NEXT: vmovaps %xmm1, %xmm0
3811 ; SKX_32-NEXT: .LBB47_1: # %cond.load
3812 ; SKX_32-NEXT: vmovd %xmm0, %ecx
3813 ; SKX_32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
3814 ; SKX_32-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
3815 ; SKX_32-NEXT: testb $2, %al
3816 ; SKX_32-NEXT: je .LBB47_4
3817 ; SKX_32-NEXT: .LBB47_3: # %cond.load1
3818 ; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
3819 ; SKX_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
3820 ; SKX_32-NEXT: vmovaps %xmm1, %xmm0
3822 %gep.random = getelementptr float, ptr %base, <2 x i128> %ind
3823 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
3827 ; Make sure we allow index to be sign extended from a smaller than i32 element size.
3828 define <16 x float> @sext_i8_index(ptr %base, <16 x i8> %ind) {
3829 ; KNL_64-LABEL: sext_i8_index:
3831 ; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm1
3832 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
3833 ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
3834 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
3837 ; KNL_32-LABEL: sext_i8_index:
3839 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3840 ; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm1
3841 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
3842 ; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
3843 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
3846 ; SKX-LABEL: sext_i8_index:
3848 ; SKX-NEXT: vpmovsxbd %xmm0, %zmm1
3849 ; SKX-NEXT: kxnorw %k0, %k0, %k1
3850 ; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
3851 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
3854 ; SKX_32-LABEL: sext_i8_index:
3856 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3857 ; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm1
3858 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
3859 ; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
3860 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
3863 %sext_ind = sext <16 x i8> %ind to <16 x i64>
3864 %gep.random = getelementptr float, ptr%base, <16 x i64> %sext_ind
3866 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
3867 ret <16 x float>%res
3870 ; Make sure we allow index to be sign extended from a smaller than i32 element size.
3871 define <8 x float> @sext_v8i8_index(ptr %base, <8 x i8> %ind) {
3872 ; KNL_64-LABEL: sext_v8i8_index:
3874 ; KNL_64-NEXT: vpmovsxbd %xmm0, %ymm1
3875 ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
3876 ; KNL_64-NEXT: movw $255, %ax
3877 ; KNL_64-NEXT: kmovw %eax, %k1
3878 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
3879 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3882 ; KNL_32-LABEL: sext_v8i8_index:
3884 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3885 ; KNL_32-NEXT: vpmovsxbd %xmm0, %ymm1
3886 ; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
3887 ; KNL_32-NEXT: movw $255, %cx
3888 ; KNL_32-NEXT: kmovw %ecx, %k1
3889 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
3890 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3893 ; SKX-LABEL: sext_v8i8_index:
3895 ; SKX-NEXT: vpmovsxbd %xmm0, %ymm1
3896 ; SKX-NEXT: kxnorw %k0, %k0, %k1
3897 ; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
3898 ; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
3901 ; SKX_32-LABEL: sext_v8i8_index:
3903 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3904 ; SKX_32-NEXT: vpmovsxbd %xmm0, %ymm1
3905 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
3906 ; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
3907 ; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
3910 %sext_ind = sext <8 x i8> %ind to <8 x i64>
3911 %gep.random = getelementptr float, ptr%base, <8 x i64> %sext_ind
3913 %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %gep.random, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
3916 declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>)
3918 ; Make sure we also allow index to be zero extended from a smaller than i32 element size.
3919 define <16 x float> @zext_i8_index(ptr %base, <16 x i8> %ind) {
3920 ; KNL_64-LABEL: zext_i8_index:
3922 ; KNL_64-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
3923 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
3924 ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
3925 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
3928 ; KNL_32-LABEL: zext_i8_index:
3930 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3931 ; KNL_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
3932 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
3933 ; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
3934 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
3937 ; SKX-LABEL: zext_i8_index:
3939 ; SKX-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
3940 ; SKX-NEXT: kxnorw %k0, %k0, %k1
3941 ; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
3942 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
3945 ; SKX_32-LABEL: zext_i8_index:
3947 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3948 ; SKX_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
3949 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
3950 ; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
3951 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
3954 %zext_ind = zext <16 x i8> %ind to <16 x i64>
3955 %gep.random = getelementptr float, ptr%base, <16 x i64> %zext_ind
3957 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
3958 ret <16 x float>%res
3961 ; Make sure we also allow index to be zero extended from a smaller than i32 element size.
3962 define <8 x float> @zext_v8i8_index(ptr %base, <8 x i8> %ind) {
3963 ; KNL_64-LABEL: zext_v8i8_index:
3965 ; KNL_64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
3966 ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
3967 ; KNL_64-NEXT: movw $255, %ax
3968 ; KNL_64-NEXT: kmovw %eax, %k1
3969 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
3970 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3973 ; KNL_32-LABEL: zext_v8i8_index:
3975 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3976 ; KNL_32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
3977 ; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
3978 ; KNL_32-NEXT: movw $255, %cx
3979 ; KNL_32-NEXT: kmovw %ecx, %k1
3980 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
3981 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3984 ; SKX-LABEL: zext_v8i8_index:
3986 ; SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
3987 ; SKX-NEXT: kxnorw %k0, %k0, %k1
3988 ; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
3989 ; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
3992 ; SKX_32-LABEL: zext_v8i8_index:
3994 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
3995 ; SKX_32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
3996 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
3997 ; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
3998 ; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
4001 %zext_ind = zext <8 x i8> %ind to <8 x i64>
4002 %gep.random = getelementptr float, ptr%base, <8 x i64> %zext_ind
4004 %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %gep.random, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
4008 ; Index requires promotion
4009 define void @test_scatter_2i32_index(<2 x double> %a1, ptr %base, <2 x i32> %ind, <2 x i1> %mask) {
4010 ; KNL_64-LABEL: test_scatter_2i32_index:
4012 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
4013 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
4014 ; KNL_64-NEXT: vpmovsxdq %xmm1, %xmm1
4015 ; KNL_64-NEXT: vpsllq $3, %xmm1, %xmm1
4016 ; KNL_64-NEXT: vmovq %rdi, %xmm2
4017 ; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2
4018 ; KNL_64-NEXT: vpaddq %xmm1, %xmm2, %xmm1
4019 ; KNL_64-NEXT: kmovw %k0, %eax
4020 ; KNL_64-NEXT: testb $1, %al
4021 ; KNL_64-NEXT: jne .LBB52_1
4022 ; KNL_64-NEXT: # %bb.2: # %else
4023 ; KNL_64-NEXT: testb $2, %al
4024 ; KNL_64-NEXT: jne .LBB52_3
4025 ; KNL_64-NEXT: .LBB52_4: # %else2
4026 ; KNL_64-NEXT: vzeroupper
4028 ; KNL_64-NEXT: .LBB52_1: # %cond.store
4029 ; KNL_64-NEXT: vmovq %xmm1, %rcx
4030 ; KNL_64-NEXT: vmovlps %xmm0, (%rcx)
4031 ; KNL_64-NEXT: testb $2, %al
4032 ; KNL_64-NEXT: je .LBB52_4
4033 ; KNL_64-NEXT: .LBB52_3: # %cond.store1
4034 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
4035 ; KNL_64-NEXT: vmovhps %xmm0, (%rax)
4036 ; KNL_64-NEXT: vzeroupper
4039 ; KNL_32-LABEL: test_scatter_2i32_index:
4041 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
4042 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
4043 ; KNL_32-NEXT: vpslld $3, %xmm1, %xmm1
4044 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2
4045 ; KNL_32-NEXT: vpaddd %xmm1, %xmm2, %xmm1
4046 ; KNL_32-NEXT: kmovw %k0, %eax
4047 ; KNL_32-NEXT: testb $1, %al
4048 ; KNL_32-NEXT: jne .LBB52_1
4049 ; KNL_32-NEXT: # %bb.2: # %else
4050 ; KNL_32-NEXT: testb $2, %al
4051 ; KNL_32-NEXT: jne .LBB52_3
4052 ; KNL_32-NEXT: .LBB52_4: # %else2
4053 ; KNL_32-NEXT: vzeroupper
4055 ; KNL_32-NEXT: .LBB52_1: # %cond.store
4056 ; KNL_32-NEXT: vmovd %xmm1, %ecx
4057 ; KNL_32-NEXT: vmovlps %xmm0, (%ecx)
4058 ; KNL_32-NEXT: testb $2, %al
4059 ; KNL_32-NEXT: je .LBB52_4
4060 ; KNL_32-NEXT: .LBB52_3: # %cond.store1
4061 ; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
4062 ; KNL_32-NEXT: vmovhps %xmm0, (%eax)
4063 ; KNL_32-NEXT: vzeroupper
4066 ; SKX-LABEL: test_scatter_2i32_index:
4068 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
4069 ; SKX-NEXT: vpmovq2m %xmm2, %k0
4070 ; SKX-NEXT: vpbroadcastq %rdi, %xmm2
4071 ; SKX-NEXT: vpmovsxdq %xmm1, %xmm1
4072 ; SKX-NEXT: vpsllq $3, %xmm1, %xmm1
4073 ; SKX-NEXT: vpaddq %xmm1, %xmm2, %xmm1
4074 ; SKX-NEXT: kmovw %k0, %eax
4075 ; SKX-NEXT: testb $1, %al
4076 ; SKX-NEXT: jne .LBB52_1
4077 ; SKX-NEXT: # %bb.2: # %else
4078 ; SKX-NEXT: testb $2, %al
4079 ; SKX-NEXT: jne .LBB52_3
4080 ; SKX-NEXT: .LBB52_4: # %else2
4082 ; SKX-NEXT: .LBB52_1: # %cond.store
4083 ; SKX-NEXT: vmovq %xmm1, %rcx
4084 ; SKX-NEXT: vmovlps %xmm0, (%rcx)
4085 ; SKX-NEXT: testb $2, %al
4086 ; SKX-NEXT: je .LBB52_4
4087 ; SKX-NEXT: .LBB52_3: # %cond.store1
4088 ; SKX-NEXT: vpextrq $1, %xmm1, %rax
4089 ; SKX-NEXT: vmovhps %xmm0, (%rax)
4092 ; SKX_32-LABEL: test_scatter_2i32_index:
4094 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
4095 ; SKX_32-NEXT: vpmovq2m %xmm2, %k0
4096 ; SKX_32-NEXT: vpslld $3, %xmm1, %xmm1
4097 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm1, %xmm1
4098 ; SKX_32-NEXT: kmovw %k0, %eax
4099 ; SKX_32-NEXT: testb $1, %al
4100 ; SKX_32-NEXT: jne .LBB52_1
4101 ; SKX_32-NEXT: # %bb.2: # %else
4102 ; SKX_32-NEXT: testb $2, %al
4103 ; SKX_32-NEXT: jne .LBB52_3
4104 ; SKX_32-NEXT: .LBB52_4: # %else2
4106 ; SKX_32-NEXT: .LBB52_1: # %cond.store
4107 ; SKX_32-NEXT: vmovd %xmm1, %ecx
4108 ; SKX_32-NEXT: vmovlps %xmm0, (%ecx)
4109 ; SKX_32-NEXT: testb $2, %al
4110 ; SKX_32-NEXT: je .LBB52_4
4111 ; SKX_32-NEXT: .LBB52_3: # %cond.store1
4112 ; SKX_32-NEXT: vpextrd $1, %xmm1, %eax
4113 ; SKX_32-NEXT: vmovhps %xmm0, (%eax)
4115 %gep = getelementptr double, ptr%base, <2 x i32> %ind
4116 call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> %a1, <2 x ptr> %gep, i32 4, <2 x i1> %mask)
4119 declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>)
4121 define <16 x float> @zext_index(ptr %base, <16 x i32> %ind) {
4122 ; KNL_64-LABEL: zext_index:
4124 ; KNL_64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1
4125 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
4126 ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
4127 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
4130 ; KNL_32-LABEL: zext_index:
4132 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
4133 ; KNL_32-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm1
4134 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
4135 ; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
4136 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
4139 ; SKX_SMALL-LABEL: zext_index:
4140 ; SKX_SMALL: # %bb.0:
4141 ; SKX_SMALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1
4142 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
4143 ; SKX_SMALL-NEXT: vxorps %xmm0, %xmm0, %xmm0
4144 ; SKX_SMALL-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
4145 ; SKX_SMALL-NEXT: retq
4147 ; SKX_LARGE-LABEL: zext_index:
4148 ; SKX_LARGE: # %bb.0:
4149 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
4150 ; SKX_LARGE-NEXT: vandps (%rax){1to16}, %zmm0, %zmm1
4151 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
4152 ; SKX_LARGE-NEXT: vxorps %xmm0, %xmm0, %xmm0
4153 ; SKX_LARGE-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
4154 ; SKX_LARGE-NEXT: retq
4156 ; SKX_32-LABEL: zext_index:
4158 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
4159 ; SKX_32-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm1
4160 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
4161 ; SKX_32-NEXT: vxorps %xmm0, %xmm0, %xmm0
4162 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
4164 %ind_masked = and <16 x i32> %ind, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
4165 %sext_ind = zext <16 x i32> %ind_masked to <16 x i64>
4166 %gep.random = getelementptr float, ptr%base, <16 x i64> %sext_ind
4168 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
4169 ret <16 x float>%res
4172 define <16 x double> @test_gather_setcc_split(ptr %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %passthru) {
4173 ; KNL_64-LABEL: test_gather_setcc_split:
4175 ; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm4
4176 ; KNL_64-NEXT: vptestnmd %zmm4, %zmm4, %k1
4177 ; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k2
4178 ; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
4179 ; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm0
4180 ; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm3 {%k1}
4181 ; KNL_64-NEXT: vmovapd %zmm2, %zmm0
4182 ; KNL_64-NEXT: vmovapd %zmm3, %zmm1
4185 ; KNL_32-LABEL: test_gather_setcc_split:
4187 ; KNL_32-NEXT: pushl %ebp
4188 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
4189 ; KNL_32-NEXT: .cfi_offset %ebp, -8
4190 ; KNL_32-NEXT: movl %esp, %ebp
4191 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
4192 ; KNL_32-NEXT: andl $-64, %esp
4193 ; KNL_32-NEXT: subl $64, %esp
4194 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm3
4195 ; KNL_32-NEXT: vmovapd 72(%ebp), %zmm1
4196 ; KNL_32-NEXT: movl 8(%ebp), %eax
4197 ; KNL_32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
4198 ; KNL_32-NEXT: vptestnmd %zmm4, %zmm4, %k1
4199 ; KNL_32-NEXT: vptestnmd %zmm3, %zmm3, %k2
4200 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
4201 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
4202 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm1 {%k1}
4203 ; KNL_32-NEXT: vmovapd %zmm2, %zmm0
4204 ; KNL_32-NEXT: movl %ebp, %esp
4205 ; KNL_32-NEXT: popl %ebp
4206 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
4209 ; SKX-LABEL: test_gather_setcc_split:
4211 ; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm4
4212 ; SKX-NEXT: vptestnmd %ymm4, %ymm4, %k1
4213 ; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2
4214 ; SKX-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
4215 ; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0
4216 ; SKX-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm3 {%k1}
4217 ; SKX-NEXT: vmovapd %zmm2, %zmm0
4218 ; SKX-NEXT: vmovapd %zmm3, %zmm1
4221 ; SKX_32-LABEL: test_gather_setcc_split:
4223 ; SKX_32-NEXT: pushl %ebp
4224 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
4225 ; SKX_32-NEXT: .cfi_offset %ebp, -8
4226 ; SKX_32-NEXT: movl %esp, %ebp
4227 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
4228 ; SKX_32-NEXT: andl $-64, %esp
4229 ; SKX_32-NEXT: subl $64, %esp
4230 ; SKX_32-NEXT: vmovapd 72(%ebp), %zmm3
4231 ; SKX_32-NEXT: movl 8(%ebp), %eax
4232 ; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
4233 ; SKX_32-NEXT: vptestnmd %ymm4, %ymm4, %k1
4234 ; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2
4235 ; SKX_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
4236 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
4237 ; SKX_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm3 {%k1}
4238 ; SKX_32-NEXT: vmovapd %zmm2, %zmm0
4239 ; SKX_32-NEXT: vmovapd %zmm3, %zmm1
4240 ; SKX_32-NEXT: movl %ebp, %esp
4241 ; SKX_32-NEXT: popl %ebp
4242 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
4244 %sext_ind = sext <16 x i32> %ind to <16 x i64>
4245 %gep.random = getelementptr double, ptr%base, <16 x i64> %sext_ind
4247 %mask = icmp eq <16 x i32> %cmp, zeroinitializer
4248 %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> %mask, <16 x double> %passthru)
4249 ret <16 x double>%res
4252 define void @test_scatter_setcc_split(ptr %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %src0) {
4253 ; KNL_64-LABEL: test_scatter_setcc_split:
4255 ; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm4
4256 ; KNL_64-NEXT: vptestnmd %zmm4, %zmm4, %k1
4257 ; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k2
4258 ; KNL_64-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
4259 ; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm0
4260 ; KNL_64-NEXT: vscatterdpd %zmm3, (%rdi,%ymm0,8) {%k1}
4261 ; KNL_64-NEXT: vzeroupper
4264 ; KNL_32-LABEL: test_scatter_setcc_split:
4266 ; KNL_32-NEXT: pushl %ebp
4267 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
4268 ; KNL_32-NEXT: .cfi_offset %ebp, -8
4269 ; KNL_32-NEXT: movl %esp, %ebp
4270 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
4271 ; KNL_32-NEXT: andl $-64, %esp
4272 ; KNL_32-NEXT: subl $64, %esp
4273 ; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3
4274 ; KNL_32-NEXT: movl 8(%ebp), %eax
4275 ; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
4276 ; KNL_32-NEXT: vptestnmd %zmm4, %zmm4, %k1
4277 ; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2
4278 ; KNL_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
4279 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
4280 ; KNL_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
4281 ; KNL_32-NEXT: movl %ebp, %esp
4282 ; KNL_32-NEXT: popl %ebp
4283 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
4284 ; KNL_32-NEXT: vzeroupper
4287 ; SKX-LABEL: test_scatter_setcc_split:
4289 ; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm4
4290 ; SKX-NEXT: vptestnmd %ymm4, %ymm4, %k1
4291 ; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2
4292 ; SKX-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
4293 ; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0
4294 ; SKX-NEXT: vscatterdpd %zmm3, (%rdi,%ymm0,8) {%k1}
4295 ; SKX-NEXT: vzeroupper
4298 ; SKX_32-LABEL: test_scatter_setcc_split:
4300 ; SKX_32-NEXT: pushl %ebp
4301 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
4302 ; SKX_32-NEXT: .cfi_offset %ebp, -8
4303 ; SKX_32-NEXT: movl %esp, %ebp
4304 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
4305 ; SKX_32-NEXT: andl $-64, %esp
4306 ; SKX_32-NEXT: subl $64, %esp
4307 ; SKX_32-NEXT: vmovapd 72(%ebp), %zmm3
4308 ; SKX_32-NEXT: movl 8(%ebp), %eax
4309 ; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
4310 ; SKX_32-NEXT: vptestnmd %ymm4, %ymm4, %k1
4311 ; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2
4312 ; SKX_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
4313 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
4314 ; SKX_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
4315 ; SKX_32-NEXT: movl %ebp, %esp
4316 ; SKX_32-NEXT: popl %ebp
4317 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
4318 ; SKX_32-NEXT: vzeroupper
4320 %sext_ind = sext <16 x i32> %ind to <16 x i64>
4321 %gep.random = getelementptr double, ptr%base, <16 x i64> %sext_ind
4323 %mask = icmp eq <16 x i32> %cmp, zeroinitializer
4324 call void @llvm.masked.scatter.v16f64.v16p0(<16 x double> %src0, <16 x ptr> %gep.random, i32 4, <16 x i1> %mask)
4328 ; This test case previously triggered an infinite loop when the two gathers became identical after DAG combine removed the sign extend.
4329 define <16 x float> @test_sext_cse(ptr %base, <16 x i32> %ind, ptr %foo) {
4330 ; KNL_64-LABEL: test_sext_cse:
4332 ; KNL_64-NEXT: vmovaps %zmm0, (%rsi)
4333 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
4334 ; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1
4335 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
4336 ; KNL_64-NEXT: vaddps %zmm1, %zmm1, %zmm0
4339 ; KNL_32-LABEL: test_sext_cse:
4341 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
4342 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
4343 ; KNL_32-NEXT: vmovaps %zmm0, (%ecx)
4344 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
4345 ; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
4346 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
4347 ; KNL_32-NEXT: vaddps %zmm1, %zmm1, %zmm0
4350 ; SKX-LABEL: test_sext_cse:
4352 ; SKX-NEXT: vmovaps %zmm0, (%rsi)
4353 ; SKX-NEXT: kxnorw %k0, %k0, %k1
4354 ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
4355 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
4356 ; SKX-NEXT: vaddps %zmm1, %zmm1, %zmm0
4359 ; SKX_32-LABEL: test_sext_cse:
4361 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
4362 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
4363 ; SKX_32-NEXT: vmovaps %zmm0, (%ecx)
4364 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
4365 ; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
4366 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
4367 ; SKX_32-NEXT: vaddps %zmm1, %zmm1, %zmm0
4369 %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
4370 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
4372 %sext_ind = sext <16 x i32> %ind to <16 x i64>
4373 %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
4375 store <16 x i32> %ind, ptr %foo
4376 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
4377 %gep.random2 = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i32> %ind
4378 %res2 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random2, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
4379 %res3 = fadd <16 x float> %res2, %res
4380 ret <16 x float>%res3
4383 define void @zero_mask(<2 x double>%a1, <2 x ptr> %ptr) {
4384 ; ALL-LABEL: zero_mask:
4386 ; ALL-NEXT: ret{{[l|q]}}
4387 call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> %a1, <2 x ptr> %ptr, i32 4, <2 x i1> zeroinitializer)
4391 define <2 x i64> @gather_2i64_constant_indices(ptr %ptr, <2 x i1> %mask) {
4392 ; KNL_64-LABEL: gather_2i64_constant_indices:
4394 ; KNL_64-NEXT: vpsllq $63, %xmm0, %xmm0
4395 ; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k0
4396 ; KNL_64-NEXT: vmovq %rdi, %xmm0
4397 ; KNL_64-NEXT: vpbroadcastq %xmm0, %xmm0
4398 ; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
4399 ; KNL_64-NEXT: kmovw %k0, %eax
4400 ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
4401 ; KNL_64-NEXT: testb $1, %al
4402 ; KNL_64-NEXT: jne .LBB58_1
4403 ; KNL_64-NEXT: # %bb.2: # %else
4404 ; KNL_64-NEXT: testb $2, %al
4405 ; KNL_64-NEXT: jne .LBB58_3
4406 ; KNL_64-NEXT: .LBB58_4: # %else2
4407 ; KNL_64-NEXT: vzeroupper
4409 ; KNL_64-NEXT: .LBB58_1: # %cond.load
4410 ; KNL_64-NEXT: vmovq %xmm1, %rcx
4411 ; KNL_64-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
4412 ; KNL_64-NEXT: testb $2, %al
4413 ; KNL_64-NEXT: je .LBB58_4
4414 ; KNL_64-NEXT: .LBB58_3: # %cond.load1
4415 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
4416 ; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0
4417 ; KNL_64-NEXT: vzeroupper
4420 ; KNL_32-LABEL: gather_2i64_constant_indices:
4422 ; KNL_32-NEXT: vpsllq $63, %xmm0, %xmm0
4423 ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0
4424 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
4425 ; KNL_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
4426 ; KNL_32-NEXT: kmovw %k0, %eax
4427 ; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
4428 ; KNL_32-NEXT: testb $1, %al
4429 ; KNL_32-NEXT: jne .LBB58_1
4430 ; KNL_32-NEXT: # %bb.2: # %else
4431 ; KNL_32-NEXT: testb $2, %al
4432 ; KNL_32-NEXT: jne .LBB58_3
4433 ; KNL_32-NEXT: .LBB58_4: # %else2
4434 ; KNL_32-NEXT: vzeroupper
4436 ; KNL_32-NEXT: .LBB58_1: # %cond.load
4437 ; KNL_32-NEXT: vmovd %xmm1, %ecx
4438 ; KNL_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
4439 ; KNL_32-NEXT: testb $2, %al
4440 ; KNL_32-NEXT: je .LBB58_4
4441 ; KNL_32-NEXT: .LBB58_3: # %cond.load1
4442 ; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
4443 ; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
4444 ; KNL_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0
4445 ; KNL_32-NEXT: vzeroupper
4448 ; SKX_SMALL-LABEL: gather_2i64_constant_indices:
4449 ; SKX_SMALL: # %bb.0:
4450 ; SKX_SMALL-NEXT: vpsllq $63, %xmm0, %xmm0
4451 ; SKX_SMALL-NEXT: vpmovq2m %xmm0, %k0
4452 ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %xmm0
4453 ; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
4454 ; SKX_SMALL-NEXT: kmovw %k0, %eax
4455 ; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
4456 ; SKX_SMALL-NEXT: testb $1, %al
4457 ; SKX_SMALL-NEXT: jne .LBB58_1
4458 ; SKX_SMALL-NEXT: # %bb.2: # %else
4459 ; SKX_SMALL-NEXT: testb $2, %al
4460 ; SKX_SMALL-NEXT: jne .LBB58_3
4461 ; SKX_SMALL-NEXT: .LBB58_4: # %else2
4462 ; SKX_SMALL-NEXT: retq
4463 ; SKX_SMALL-NEXT: .LBB58_1: # %cond.load
4464 ; SKX_SMALL-NEXT: vmovq %xmm1, %rcx
4465 ; SKX_SMALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
4466 ; SKX_SMALL-NEXT: testb $2, %al
4467 ; SKX_SMALL-NEXT: je .LBB58_4
4468 ; SKX_SMALL-NEXT: .LBB58_3: # %cond.load1
4469 ; SKX_SMALL-NEXT: vpextrq $1, %xmm1, %rax
4470 ; SKX_SMALL-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0
4471 ; SKX_SMALL-NEXT: retq
4473 ; SKX_LARGE-LABEL: gather_2i64_constant_indices:
4474 ; SKX_LARGE: # %bb.0:
4475 ; SKX_LARGE-NEXT: vpsllq $63, %xmm0, %xmm0
4476 ; SKX_LARGE-NEXT: vpmovq2m %xmm0, %k0
4477 ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %xmm0
4478 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
4479 ; SKX_LARGE-NEXT: vpaddq (%rax), %xmm0, %xmm1
4480 ; SKX_LARGE-NEXT: kmovw %k0, %eax
4481 ; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
4482 ; SKX_LARGE-NEXT: testb $1, %al
4483 ; SKX_LARGE-NEXT: jne .LBB58_1
4484 ; SKX_LARGE-NEXT: # %bb.2: # %else
4485 ; SKX_LARGE-NEXT: testb $2, %al
4486 ; SKX_LARGE-NEXT: jne .LBB58_3
4487 ; SKX_LARGE-NEXT: .LBB58_4: # %else2
4488 ; SKX_LARGE-NEXT: retq
4489 ; SKX_LARGE-NEXT: .LBB58_1: # %cond.load
4490 ; SKX_LARGE-NEXT: vmovq %xmm1, %rcx
4491 ; SKX_LARGE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
4492 ; SKX_LARGE-NEXT: testb $2, %al
4493 ; SKX_LARGE-NEXT: je .LBB58_4
4494 ; SKX_LARGE-NEXT: .LBB58_3: # %cond.load1
4495 ; SKX_LARGE-NEXT: vpextrq $1, %xmm1, %rax
4496 ; SKX_LARGE-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0
4497 ; SKX_LARGE-NEXT: retq
4499 ; SKX_32-LABEL: gather_2i64_constant_indices:
4501 ; SKX_32-NEXT: vpsllq $63, %xmm0, %xmm0
4502 ; SKX_32-NEXT: vpmovq2m %xmm0, %k0
4503 ; SKX_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
4504 ; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
4505 ; SKX_32-NEXT: kmovw %k0, %eax
4506 ; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
4507 ; SKX_32-NEXT: testb $1, %al
4508 ; SKX_32-NEXT: jne .LBB58_1
4509 ; SKX_32-NEXT: # %bb.2: # %else
4510 ; SKX_32-NEXT: testb $2, %al
4511 ; SKX_32-NEXT: jne .LBB58_3
4512 ; SKX_32-NEXT: .LBB58_4: # %else2
4514 ; SKX_32-NEXT: .LBB58_1: # %cond.load
4515 ; SKX_32-NEXT: vmovd %xmm1, %ecx
4516 ; SKX_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
4517 ; SKX_32-NEXT: testb $2, %al
4518 ; SKX_32-NEXT: je .LBB58_4
4519 ; SKX_32-NEXT: .LBB58_3: # %cond.load1
4520 ; SKX_32-NEXT: vpextrd $1, %xmm1, %eax
4521 ; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
4522 ; SKX_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0
4524 %gep = getelementptr i64, ptr %ptr, <2 x i64> <i64 0, i64 -2>
4525 %res = tail call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> %gep, i32 8, <2 x i1> %mask, <2 x i64> zeroinitializer) #1
4529 define <16 x i32> @gather_16i64_constant_indices(ptr %ptr, <16 x i1> %mask) {
4530 ; KNL_64-LABEL: gather_16i64_constant_indices:
4532 ; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm0
4533 ; KNL_64-NEXT: vpslld $31, %zmm0, %zmm0
4534 ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
4535 ; KNL_64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
4536 ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
4537 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm1,4), %zmm0 {%k1}
4540 ; KNL_32-LABEL: gather_16i64_constant_indices:
4542 ; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm0
4543 ; KNL_32-NEXT: vpslld $31, %zmm0, %zmm0
4544 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
4545 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
4546 ; KNL_32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
4547 ; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
4548 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm1,4), %zmm0 {%k1}
4551 ; SKX_SMALL-LABEL: gather_16i64_constant_indices:
4552 ; SKX_SMALL: # %bb.0:
4553 ; SKX_SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
4554 ; SKX_SMALL-NEXT: vpslld $31, %zmm0, %zmm0
4555 ; SKX_SMALL-NEXT: vpmovd2m %zmm0, %k1
4556 ; SKX_SMALL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
4557 ; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
4558 ; SKX_SMALL-NEXT: vpgatherdd (%rdi,%zmm1,4), %zmm0 {%k1}
4559 ; SKX_SMALL-NEXT: retq
4561 ; SKX_LARGE-LABEL: gather_16i64_constant_indices:
4562 ; SKX_LARGE: # %bb.0:
4563 ; SKX_LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
4564 ; SKX_LARGE-NEXT: vpslld $31, %zmm0, %zmm0
4565 ; SKX_LARGE-NEXT: vpmovd2m %zmm0, %k1
4566 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
4567 ; SKX_LARGE-NEXT: vmovdqa64 (%rax), %zmm1
4568 ; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
4569 ; SKX_LARGE-NEXT: vpgatherdd (%rdi,%zmm1,4), %zmm0 {%k1}
4570 ; SKX_LARGE-NEXT: retq
4572 ; SKX_32-LABEL: gather_16i64_constant_indices:
4574 ; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm0
4575 ; SKX_32-NEXT: vpslld $31, %zmm0, %zmm0
4576 ; SKX_32-NEXT: vpmovd2m %zmm0, %k1
4577 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
4578 ; SKX_32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
4579 ; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
4580 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm1,4), %zmm0 {%k1}
4582 %gep = getelementptr i32, ptr %ptr, <16 x i64> <i64 0, i64 -2, i64 1, i64 -8, i64 10, i64 20, i64 50, i64 65536, i64 16777215, i64 2147483647, i64 100, i64 -2000, i64 -2147483648, i64 76897723, i64 7, i64 -67897687>
4583 %res = tail call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %gep, i32 4, <16 x i1> %mask, <16 x i32> zeroinitializer) #1
4587 define void @scatter_2i64_constant_indices(ptr %ptr, <2 x i1> %mask, <2 x i32> %src0) {
4588 ; KNL_64-LABEL: scatter_2i64_constant_indices:
4590 ; KNL_64-NEXT: vpsllq $63, %xmm0, %xmm0
4591 ; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k0
4592 ; KNL_64-NEXT: vmovq %rdi, %xmm0
4593 ; KNL_64-NEXT: vpbroadcastq %xmm0, %xmm0
4594 ; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4595 ; KNL_64-NEXT: kmovw %k0, %eax
4596 ; KNL_64-NEXT: testb $1, %al
4597 ; KNL_64-NEXT: jne .LBB60_1
4598 ; KNL_64-NEXT: # %bb.2: # %else
4599 ; KNL_64-NEXT: testb $2, %al
4600 ; KNL_64-NEXT: jne .LBB60_3
4601 ; KNL_64-NEXT: .LBB60_4: # %else2
4602 ; KNL_64-NEXT: vzeroupper
4604 ; KNL_64-NEXT: .LBB60_1: # %cond.store
4605 ; KNL_64-NEXT: vmovq %xmm0, %rcx
4606 ; KNL_64-NEXT: vmovss %xmm1, (%rcx)
4607 ; KNL_64-NEXT: testb $2, %al
4608 ; KNL_64-NEXT: je .LBB60_4
4609 ; KNL_64-NEXT: .LBB60_3: # %cond.store1
4610 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
4611 ; KNL_64-NEXT: vextractps $1, %xmm1, (%rax)
4612 ; KNL_64-NEXT: vzeroupper
4615 ; KNL_32-LABEL: scatter_2i64_constant_indices:
4617 ; KNL_32-NEXT: vpsllq $63, %xmm0, %xmm0
4618 ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0
4619 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
4620 ; KNL_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
4621 ; KNL_32-NEXT: kmovw %k0, %eax
4622 ; KNL_32-NEXT: testb $1, %al
4623 ; KNL_32-NEXT: jne .LBB60_1
4624 ; KNL_32-NEXT: # %bb.2: # %else
4625 ; KNL_32-NEXT: testb $2, %al
4626 ; KNL_32-NEXT: jne .LBB60_3
4627 ; KNL_32-NEXT: .LBB60_4: # %else2
4628 ; KNL_32-NEXT: vzeroupper
4630 ; KNL_32-NEXT: .LBB60_1: # %cond.store
4631 ; KNL_32-NEXT: vmovd %xmm0, %ecx
4632 ; KNL_32-NEXT: vmovss %xmm1, (%ecx)
4633 ; KNL_32-NEXT: testb $2, %al
4634 ; KNL_32-NEXT: je .LBB60_4
4635 ; KNL_32-NEXT: .LBB60_3: # %cond.store1
4636 ; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
4637 ; KNL_32-NEXT: vextractps $1, %xmm1, (%eax)
4638 ; KNL_32-NEXT: vzeroupper
4641 ; SKX_SMALL-LABEL: scatter_2i64_constant_indices:
4642 ; SKX_SMALL: # %bb.0:
4643 ; SKX_SMALL-NEXT: vpsllq $63, %xmm0, %xmm0
4644 ; SKX_SMALL-NEXT: vpmovq2m %xmm0, %k0
4645 ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %xmm0
4646 ; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
4647 ; SKX_SMALL-NEXT: kmovw %k0, %eax
4648 ; SKX_SMALL-NEXT: testb $1, %al
4649 ; SKX_SMALL-NEXT: jne .LBB60_1
4650 ; SKX_SMALL-NEXT: # %bb.2: # %else
4651 ; SKX_SMALL-NEXT: testb $2, %al
4652 ; SKX_SMALL-NEXT: jne .LBB60_3
4653 ; SKX_SMALL-NEXT: .LBB60_4: # %else2
4654 ; SKX_SMALL-NEXT: retq
4655 ; SKX_SMALL-NEXT: .LBB60_1: # %cond.store
4656 ; SKX_SMALL-NEXT: vmovq %xmm0, %rcx
4657 ; SKX_SMALL-NEXT: vmovss %xmm1, (%rcx)
4658 ; SKX_SMALL-NEXT: testb $2, %al
4659 ; SKX_SMALL-NEXT: je .LBB60_4
4660 ; SKX_SMALL-NEXT: .LBB60_3: # %cond.store1
4661 ; SKX_SMALL-NEXT: vpextrq $1, %xmm0, %rax
4662 ; SKX_SMALL-NEXT: vextractps $1, %xmm1, (%rax)
4663 ; SKX_SMALL-NEXT: retq
4665 ; SKX_LARGE-LABEL: scatter_2i64_constant_indices:
4666 ; SKX_LARGE: # %bb.0:
4667 ; SKX_LARGE-NEXT: vpsllq $63, %xmm0, %xmm0
4668 ; SKX_LARGE-NEXT: vpmovq2m %xmm0, %k0
4669 ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %xmm0
4670 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
4671 ; SKX_LARGE-NEXT: vpaddq (%rax), %xmm0, %xmm0
4672 ; SKX_LARGE-NEXT: kmovw %k0, %eax
4673 ; SKX_LARGE-NEXT: testb $1, %al
4674 ; SKX_LARGE-NEXT: jne .LBB60_1
4675 ; SKX_LARGE-NEXT: # %bb.2: # %else
4676 ; SKX_LARGE-NEXT: testb $2, %al
4677 ; SKX_LARGE-NEXT: jne .LBB60_3
4678 ; SKX_LARGE-NEXT: .LBB60_4: # %else2
4679 ; SKX_LARGE-NEXT: retq
4680 ; SKX_LARGE-NEXT: .LBB60_1: # %cond.store
4681 ; SKX_LARGE-NEXT: vmovq %xmm0, %rcx
4682 ; SKX_LARGE-NEXT: vmovss %xmm1, (%rcx)
4683 ; SKX_LARGE-NEXT: testb $2, %al
4684 ; SKX_LARGE-NEXT: je .LBB60_4
4685 ; SKX_LARGE-NEXT: .LBB60_3: # %cond.store1
4686 ; SKX_LARGE-NEXT: vpextrq $1, %xmm0, %rax
4687 ; SKX_LARGE-NEXT: vextractps $1, %xmm1, (%rax)
4688 ; SKX_LARGE-NEXT: retq
4690 ; SKX_32-LABEL: scatter_2i64_constant_indices:
4692 ; SKX_32-NEXT: vpsllq $63, %xmm0, %xmm0
4693 ; SKX_32-NEXT: vpmovq2m %xmm0, %k0
4694 ; SKX_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
4695 ; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
4696 ; SKX_32-NEXT: kmovw %k0, %eax
4697 ; SKX_32-NEXT: testb $1, %al
4698 ; SKX_32-NEXT: jne .LBB60_1
4699 ; SKX_32-NEXT: # %bb.2: # %else
4700 ; SKX_32-NEXT: testb $2, %al
4701 ; SKX_32-NEXT: jne .LBB60_3
4702 ; SKX_32-NEXT: .LBB60_4: # %else2
4704 ; SKX_32-NEXT: .LBB60_1: # %cond.store
4705 ; SKX_32-NEXT: vmovd %xmm0, %ecx
4706 ; SKX_32-NEXT: vmovss %xmm1, (%ecx)
4707 ; SKX_32-NEXT: testb $2, %al
4708 ; SKX_32-NEXT: je .LBB60_4
4709 ; SKX_32-NEXT: .LBB60_3: # %cond.store1
4710 ; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
4711 ; SKX_32-NEXT: vextractps $1, %xmm1, (%eax)
4713 %gep = getelementptr i32, ptr %ptr, <2 x i64> <i64 0, i64 -2>
4714 call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> %src0, <2 x ptr> %gep, i32 4, <2 x i1> %mask)
4718 define void @scatter_16i64_constant_indices(ptr %ptr, <16 x i1> %mask, <16 x i32> %src0) {
4719 ; KNL_64-LABEL: scatter_16i64_constant_indices:
4721 ; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm0
4722 ; KNL_64-NEXT: vpslld $31, %zmm0, %zmm0
4723 ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
4724 ; KNL_64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
4725 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
4726 ; KNL_64-NEXT: vzeroupper
4729 ; KNL_32-LABEL: scatter_16i64_constant_indices:
4731 ; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm0
4732 ; KNL_32-NEXT: vpslld $31, %zmm0, %zmm0
4733 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
4734 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
4735 ; KNL_32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
4736 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
4737 ; KNL_32-NEXT: vzeroupper
4740 ; SKX_SMALL-LABEL: scatter_16i64_constant_indices:
4741 ; SKX_SMALL: # %bb.0:
4742 ; SKX_SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
4743 ; SKX_SMALL-NEXT: vpslld $31, %zmm0, %zmm0
4744 ; SKX_SMALL-NEXT: vpmovd2m %zmm0, %k1
4745 ; SKX_SMALL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
4746 ; SKX_SMALL-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
4747 ; SKX_SMALL-NEXT: vzeroupper
4748 ; SKX_SMALL-NEXT: retq
4750 ; SKX_LARGE-LABEL: scatter_16i64_constant_indices:
4751 ; SKX_LARGE: # %bb.0:
4752 ; SKX_LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
4753 ; SKX_LARGE-NEXT: vpslld $31, %zmm0, %zmm0
4754 ; SKX_LARGE-NEXT: vpmovd2m %zmm0, %k1
4755 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
4756 ; SKX_LARGE-NEXT: vmovdqa64 (%rax), %zmm0
4757 ; SKX_LARGE-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
4758 ; SKX_LARGE-NEXT: vzeroupper
4759 ; SKX_LARGE-NEXT: retq
4761 ; SKX_32-LABEL: scatter_16i64_constant_indices:
4763 ; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm0
4764 ; SKX_32-NEXT: vpslld $31, %zmm0, %zmm0
4765 ; SKX_32-NEXT: vpmovd2m %zmm0, %k1
4766 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
4767 ; SKX_32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
4768 ; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
4769 ; SKX_32-NEXT: vzeroupper
4771 %gep = getelementptr i32, ptr %ptr, <16 x i64> <i64 0, i64 -2, i64 1, i64 -8, i64 10, i64 20, i64 50, i64 65536, i64 16777215, i64 2147483647, i64 100, i64 -2000, i64 -2147483648, i64 76897723, i64 7, i64 -67897687>
4772 call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %src0, <16 x ptr> %gep, i32 4, <16 x i1> %mask)
4776 define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru) {
4777 ; KNL_64-LABEL: splat_ptr_gather:
4779 ; KNL_64-NEXT: vpslld $31, %xmm0, %xmm0
4780 ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k0
4781 ; KNL_64-NEXT: vmovq %rdi, %xmm0
4782 ; KNL_64-NEXT: vpbroadcastq %xmm0, %ymm0
4783 ; KNL_64-NEXT: kmovw %k0, %eax
4784 ; KNL_64-NEXT: testb $1, %al
4785 ; KNL_64-NEXT: je .LBB62_2
4786 ; KNL_64-NEXT: # %bb.1: # %cond.load
4787 ; KNL_64-NEXT: vmovq %xmm0, %rcx
4788 ; KNL_64-NEXT: vpinsrd $0, (%rcx), %xmm1, %xmm1
4789 ; KNL_64-NEXT: .LBB62_2: # %else
4790 ; KNL_64-NEXT: testb $2, %al
4791 ; KNL_64-NEXT: je .LBB62_4
4792 ; KNL_64-NEXT: # %bb.3: # %cond.load1
4793 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
4794 ; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm1
4795 ; KNL_64-NEXT: .LBB62_4: # %else2
4796 ; KNL_64-NEXT: testb $4, %al
4797 ; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
4798 ; KNL_64-NEXT: jne .LBB62_5
4799 ; KNL_64-NEXT: # %bb.6: # %else5
4800 ; KNL_64-NEXT: testb $8, %al
4801 ; KNL_64-NEXT: jne .LBB62_7
4802 ; KNL_64-NEXT: .LBB62_8: # %else8
4803 ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
4804 ; KNL_64-NEXT: vzeroupper
4806 ; KNL_64-NEXT: .LBB62_5: # %cond.load4
4807 ; KNL_64-NEXT: vmovq %xmm0, %rcx
4808 ; KNL_64-NEXT: vpinsrd $2, (%rcx), %xmm1, %xmm1
4809 ; KNL_64-NEXT: testb $8, %al
4810 ; KNL_64-NEXT: je .LBB62_8
4811 ; KNL_64-NEXT: .LBB62_7: # %cond.load7
4812 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
4813 ; KNL_64-NEXT: vpinsrd $3, (%rax), %xmm1, %xmm1
4814 ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
4815 ; KNL_64-NEXT: vzeroupper
4818 ; KNL_32-LABEL: splat_ptr_gather:
4820 ; KNL_32-NEXT: vpslld $31, %xmm0, %xmm0
4821 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k0
4822 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
4823 ; KNL_32-NEXT: kmovw %k0, %eax
4824 ; KNL_32-NEXT: testb $1, %al
4825 ; KNL_32-NEXT: jne .LBB62_1
4826 ; KNL_32-NEXT: # %bb.2: # %else
4827 ; KNL_32-NEXT: testb $2, %al
4828 ; KNL_32-NEXT: jne .LBB62_3
4829 ; KNL_32-NEXT: .LBB62_4: # %else2
4830 ; KNL_32-NEXT: testb $4, %al
4831 ; KNL_32-NEXT: jne .LBB62_5
4832 ; KNL_32-NEXT: .LBB62_6: # %else5
4833 ; KNL_32-NEXT: testb $8, %al
4834 ; KNL_32-NEXT: jne .LBB62_7
4835 ; KNL_32-NEXT: .LBB62_8: # %else8
4836 ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
4837 ; KNL_32-NEXT: vzeroupper
4839 ; KNL_32-NEXT: .LBB62_1: # %cond.load
4840 ; KNL_32-NEXT: vmovd %xmm0, %ecx
4841 ; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm1, %xmm1
4842 ; KNL_32-NEXT: testb $2, %al
4843 ; KNL_32-NEXT: je .LBB62_4
4844 ; KNL_32-NEXT: .LBB62_3: # %cond.load1
4845 ; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
4846 ; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm1, %xmm1
4847 ; KNL_32-NEXT: testb $4, %al
4848 ; KNL_32-NEXT: je .LBB62_6
4849 ; KNL_32-NEXT: .LBB62_5: # %cond.load4
4850 ; KNL_32-NEXT: vpextrd $2, %xmm0, %ecx
4851 ; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm1, %xmm1
4852 ; KNL_32-NEXT: testb $8, %al
4853 ; KNL_32-NEXT: je .LBB62_8
4854 ; KNL_32-NEXT: .LBB62_7: # %cond.load7
4855 ; KNL_32-NEXT: vpextrd $3, %xmm0, %eax
4856 ; KNL_32-NEXT: vpinsrd $3, (%eax), %xmm1, %xmm1
4857 ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
4858 ; KNL_32-NEXT: vzeroupper
4861 ; SKX-LABEL: splat_ptr_gather:
4863 ; SKX-NEXT: vpslld $31, %xmm0, %xmm0
4864 ; SKX-NEXT: vpmovd2m %xmm0, %k1
4865 ; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
4866 ; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
4867 ; SKX-NEXT: vmovdqa %xmm1, %xmm0
4870 ; SKX_32-LABEL: splat_ptr_gather:
4872 ; SKX_32-NEXT: vpslld $31, %xmm0, %xmm0
4873 ; SKX_32-NEXT: vpmovd2m %xmm0, %k1
4874 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
4875 ; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
4876 ; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1}
4877 ; SKX_32-NEXT: vmovdqa %xmm1, %xmm0
4879 %1 = insertelement <4 x ptr> undef, ptr %ptr, i32 0
4880 %2 = shufflevector <4 x ptr> %1, <4 x ptr> undef, <4 x i32> zeroinitializer
4881 %3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %2, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
4884 declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
4886 define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) {
4887 ; KNL_64-LABEL: splat_ptr_scatter:
4889 ; KNL_64-NEXT: vpslld $31, %xmm0, %xmm0
4890 ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k0
4891 ; KNL_64-NEXT: vmovq %rdi, %xmm0
4892 ; KNL_64-NEXT: vpbroadcastq %xmm0, %ymm0
4893 ; KNL_64-NEXT: kmovw %k0, %eax
4894 ; KNL_64-NEXT: testb $1, %al
4895 ; KNL_64-NEXT: je .LBB63_2
4896 ; KNL_64-NEXT: # %bb.1: # %cond.store
4897 ; KNL_64-NEXT: vmovq %xmm0, %rcx
4898 ; KNL_64-NEXT: vmovss %xmm1, (%rcx)
4899 ; KNL_64-NEXT: .LBB63_2: # %else
4900 ; KNL_64-NEXT: testb $2, %al
4901 ; KNL_64-NEXT: je .LBB63_4
4902 ; KNL_64-NEXT: # %bb.3: # %cond.store1
4903 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
4904 ; KNL_64-NEXT: vextractps $1, %xmm1, (%rcx)
4905 ; KNL_64-NEXT: .LBB63_4: # %else2
4906 ; KNL_64-NEXT: testb $4, %al
4907 ; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
4908 ; KNL_64-NEXT: jne .LBB63_5
4909 ; KNL_64-NEXT: # %bb.6: # %else4
4910 ; KNL_64-NEXT: testb $8, %al
4911 ; KNL_64-NEXT: jne .LBB63_7
4912 ; KNL_64-NEXT: .LBB63_8: # %else6
4913 ; KNL_64-NEXT: vzeroupper
4915 ; KNL_64-NEXT: .LBB63_5: # %cond.store3
4916 ; KNL_64-NEXT: vmovq %xmm0, %rcx
4917 ; KNL_64-NEXT: vextractps $2, %xmm1, (%rcx)
4918 ; KNL_64-NEXT: testb $8, %al
4919 ; KNL_64-NEXT: je .LBB63_8
4920 ; KNL_64-NEXT: .LBB63_7: # %cond.store5
4921 ; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
4922 ; KNL_64-NEXT: vextractps $3, %xmm1, (%rax)
4923 ; KNL_64-NEXT: vzeroupper
4926 ; KNL_32-LABEL: splat_ptr_scatter:
4928 ; KNL_32-NEXT: vpslld $31, %xmm0, %xmm0
4929 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k0
4930 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
4931 ; KNL_32-NEXT: kmovw %k0, %eax
4932 ; KNL_32-NEXT: testb $1, %al
4933 ; KNL_32-NEXT: jne .LBB63_1
4934 ; KNL_32-NEXT: # %bb.2: # %else
4935 ; KNL_32-NEXT: testb $2, %al
4936 ; KNL_32-NEXT: jne .LBB63_3
4937 ; KNL_32-NEXT: .LBB63_4: # %else2
4938 ; KNL_32-NEXT: testb $4, %al
4939 ; KNL_32-NEXT: jne .LBB63_5
4940 ; KNL_32-NEXT: .LBB63_6: # %else4
4941 ; KNL_32-NEXT: testb $8, %al
4942 ; KNL_32-NEXT: jne .LBB63_7
4943 ; KNL_32-NEXT: .LBB63_8: # %else6
4944 ; KNL_32-NEXT: vzeroupper
4946 ; KNL_32-NEXT: .LBB63_1: # %cond.store
4947 ; KNL_32-NEXT: vmovd %xmm0, %ecx
4948 ; KNL_32-NEXT: vmovss %xmm1, (%ecx)
4949 ; KNL_32-NEXT: testb $2, %al
4950 ; KNL_32-NEXT: je .LBB63_4
4951 ; KNL_32-NEXT: .LBB63_3: # %cond.store1
4952 ; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
4953 ; KNL_32-NEXT: vextractps $1, %xmm1, (%ecx)
4954 ; KNL_32-NEXT: testb $4, %al
4955 ; KNL_32-NEXT: je .LBB63_6
4956 ; KNL_32-NEXT: .LBB63_5: # %cond.store3
4957 ; KNL_32-NEXT: vpextrd $2, %xmm0, %ecx
4958 ; KNL_32-NEXT: vextractps $2, %xmm1, (%ecx)
4959 ; KNL_32-NEXT: testb $8, %al
4960 ; KNL_32-NEXT: je .LBB63_8
4961 ; KNL_32-NEXT: .LBB63_7: # %cond.store5
4962 ; KNL_32-NEXT: vpextrd $3, %xmm0, %eax
4963 ; KNL_32-NEXT: vextractps $3, %xmm1, (%eax)
4964 ; KNL_32-NEXT: vzeroupper
4967 ; SKX-LABEL: splat_ptr_scatter:
4969 ; SKX-NEXT: vpslld $31, %xmm0, %xmm0
4970 ; SKX-NEXT: vpmovd2m %xmm0, %k1
4971 ; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
4972 ; SKX-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
4975 ; SKX_32-LABEL: splat_ptr_scatter:
4977 ; SKX_32-NEXT: vpslld $31, %xmm0, %xmm0
4978 ; SKX_32-NEXT: vpmovd2m %xmm0, %k1
4979 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
4980 ; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
4981 ; SKX_32-NEXT: vpscatterdd %xmm1, (%eax,%xmm0,4) {%k1}
4983 %1 = insertelement <4 x ptr> undef, ptr %ptr, i32 0
4984 %2 = shufflevector <4 x ptr> %1, <4 x ptr> undef, <4 x i32> zeroinitializer
4985 call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %2, i32 4, <4 x i1> %mask)
4991 ; Failure to fold scaled-index into gather/scatter scale operand.
4994 define <8 x float> @scaleidx_x86gather(ptr %base, <8 x i32> %index, <8 x i32> %imask) nounwind {
4995 ; KNL_64-LABEL: scaleidx_x86gather:
4997 ; KNL_64-NEXT: vpslld $2, %ymm0, %ymm2
4998 ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
4999 ; KNL_64-NEXT: vgatherdps %ymm1, (%rdi,%ymm2), %ymm0
5002 ; KNL_32-LABEL: scaleidx_x86gather:
5004 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
5005 ; KNL_32-NEXT: vxorps %xmm2, %xmm2, %xmm2
5006 ; KNL_32-NEXT: vgatherdps %ymm1, (%eax,%ymm0,4), %ymm2
5007 ; KNL_32-NEXT: vmovaps %ymm2, %ymm0
5010 ; SKX-LABEL: scaleidx_x86gather:
5012 ; SKX-NEXT: vpslld $2, %ymm0, %ymm2
5013 ; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
5014 ; SKX-NEXT: vgatherdps %ymm1, (%rdi,%ymm2), %ymm0
5017 ; SKX_32-LABEL: scaleidx_x86gather:
5019 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
5020 ; SKX_32-NEXT: vxorps %xmm2, %xmm2, %xmm2
5021 ; SKX_32-NEXT: vgatherdps %ymm1, (%eax,%ymm0,4), %ymm2
5022 ; SKX_32-NEXT: vmovaps %ymm2, %ymm0
5024 %mask = bitcast <8 x i32> %imask to <8 x float>
5025 %scaledindex = mul <8 x i32> %index, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
5026 %v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, ptr %base, <8 x i32> %scaledindex, <8 x float> %mask, i8 1) nounwind
5030 define <8 x float> @scaleidx_x86gather_outofrange(ptr %base, <8 x i32> %index, <8 x i32> %imask) nounwind {
5031 ; KNL_64-LABEL: scaleidx_x86gather_outofrange:
5033 ; KNL_64-NEXT: vpslld $2, %ymm0, %ymm2
5034 ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0
5035 ; KNL_64-NEXT: vgatherdps %ymm1, (%rdi,%ymm2,4), %ymm0
5038 ; KNL_32-LABEL: scaleidx_x86gather_outofrange:
5040 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
5041 ; KNL_32-NEXT: vpslld $2, %ymm0, %ymm2
5042 ; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
5043 ; KNL_32-NEXT: vgatherdps %ymm1, (%eax,%ymm2,4), %ymm0
5046 ; SKX-LABEL: scaleidx_x86gather_outofrange:
5048 ; SKX-NEXT: vpslld $2, %ymm0, %ymm2
5049 ; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
5050 ; SKX-NEXT: vgatherdps %ymm1, (%rdi,%ymm2,4), %ymm0
5053 ; SKX_32-LABEL: scaleidx_x86gather_outofrange:
5055 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
5056 ; SKX_32-NEXT: vpslld $2, %ymm0, %ymm2
5057 ; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
5058 ; SKX_32-NEXT: vgatherdps %ymm1, (%eax,%ymm2,4), %ymm0
5060 %mask = bitcast <8 x i32> %imask to <8 x float>
5061 %scaledindex = mul <8 x i32> %index, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
5062 %v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, ptr %base, <8 x i32> %scaledindex, <8 x float> %mask, i8 4) nounwind
5065 declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, ptr, <8 x i32>, <8 x float>, i8) nounwind readonly
; X86-specific AVX-512 scatter intrinsic: the IR shifts the index left by 1
; (*2) and the intrinsic's scale operand is 2 (combined *4, within the max
; SIB scale of 8).  Per the checks, the 32-bit runs fold the shift into a
; single addressing scale of 4, while the 64-bit runs keep a vpaddd and use
; scale 2.
5067 define void @scaleidx_x86scatter(<16 x float> %value, ptr %base, <16 x i32> %index, i16 %imask) nounwind {
5068 ; KNL_64-LABEL: scaleidx_x86scatter:
5070 ; KNL_64-NEXT: kmovw %esi, %k1
5071 ; KNL_64-NEXT: vpaddd %zmm1, %zmm1, %zmm1
5072 ; KNL_64-NEXT: vscatterdps %zmm0, (%rdi,%zmm1,2) {%k1}
5073 ; KNL_64-NEXT: vzeroupper
5076 ; KNL_32-LABEL: scaleidx_x86scatter:
5078 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
5079 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
5080 ; KNL_32-NEXT: vscatterdps %zmm0, (%eax,%zmm1,4) {%k1}
5081 ; KNL_32-NEXT: vzeroupper
5084 ; SKX-LABEL: scaleidx_x86scatter:
5086 ; SKX-NEXT: kmovw %esi, %k1
5087 ; SKX-NEXT: vpaddd %zmm1, %zmm1, %zmm1
5088 ; SKX-NEXT: vscatterdps %zmm0, (%rdi,%zmm1,2) {%k1}
5089 ; SKX-NEXT: vzeroupper
5092 ; SKX_32-LABEL: scaleidx_x86scatter:
5094 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
5095 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
5096 ; SKX_32-NEXT: vscatterdps %zmm0, (%eax,%zmm1,4) {%k1}
5097 ; SKX_32-NEXT: vzeroupper
; IR: index shifted left by 1, then scattered with intrinsic scale 2 (i32 2).
5099 %mask = bitcast i16 %imask to <16 x i1>
5100 %scaledindex = shl <16 x i32> %index, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
5101 call void @llvm.x86.avx512.mask.scatter.dps.512(ptr %base, <16 x i1> %mask, <16 x i32> %scaledindex, <16 x float> %value, i32 2)
5104 declare void @llvm.x86.avx512.mask.scatter.dps.512(ptr, <16 x i1>, <16 x i32>, <16 x float>, i32)
; Generic masked scatter with a pre-scaled index: the IR multiplies the index
; by 2 and then GEPs over float (4-byte elements), for a combined *8 byte
; stride.  Per the checks, SKX_32 folds everything into a single addressing
; scale of 8, while the other targets keep a vpaddd (the *2) and use scale 4.
; On KNL (no avx512vl) the v8 operation is widened to zmm and the 8-bit mask
; is legalized via kshiftlw/kshiftrw into %k1.
5106 define void @scaleidx_scatter(<8 x float> %value, ptr %base, <8 x i32> %index, i8 %imask) nounwind {
5107 ; KNL_64-LABEL: scaleidx_scatter:
5109 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
5110 ; KNL_64-NEXT: vpaddd %ymm1, %ymm1, %ymm1
5111 ; KNL_64-NEXT: kmovw %esi, %k0
5112 ; KNL_64-NEXT: kshiftlw $8, %k0, %k0
5113 ; KNL_64-NEXT: kshiftrw $8, %k0, %k1
5114 ; KNL_64-NEXT: vscatterdps %zmm0, (%rdi,%zmm1,4) {%k1}
5115 ; KNL_64-NEXT: vzeroupper
5118 ; KNL_32-LABEL: scaleidx_scatter:
5120 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
5121 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
5122 ; KNL_32-NEXT: vpaddd %ymm1, %ymm1, %ymm1
5123 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
5124 ; KNL_32-NEXT: kmovw %ecx, %k0
5125 ; KNL_32-NEXT: kshiftlw $8, %k0, %k0
5126 ; KNL_32-NEXT: kshiftrw $8, %k0, %k1
5127 ; KNL_32-NEXT: vscatterdps %zmm0, (%eax,%zmm1,4) {%k1}
5128 ; KNL_32-NEXT: vzeroupper
5131 ; SKX-LABEL: scaleidx_scatter:
5133 ; SKX-NEXT: vpaddd %ymm1, %ymm1, %ymm1
5134 ; SKX-NEXT: kmovw %esi, %k1
5135 ; SKX-NEXT: vscatterdps %ymm0, (%rdi,%ymm1,4) {%k1}
5136 ; SKX-NEXT: vzeroupper
5139 ; SKX_32-LABEL: scaleidx_scatter:
5141 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
5142 ; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
5143 ; SKX_32-NEXT: vscatterdps %ymm0, (%eax,%ymm1,8) {%k1}
5144 ; SKX_32-NEXT: vzeroupper
; IR: index * 2, GEP over float, then a generic masked scatter (align 1).
5146 %scaledindex = mul <8 x i32> %index, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
5147 %ptrs = getelementptr float, ptr %base, <8 x i32> %scaledindex
5148 %mask = bitcast i8 %imask to <8 x i1>
5149 call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> %value, <8 x ptr> %ptrs, i32 1, <8 x i1> %mask)
; Same as scaleidx_scatter but the pre-scale is out of range: index * 4 over
; float (4-byte elements) would need an addressing scale of 16, which exceeds
; the maximum x86 SIB scale of 8.  Per the checks, every target keeps the
; multiply as an explicit vpslld $2 and scatters with scale 4.
5153 define void @scaleidx_scatter_outofrange(<8 x float> %value, ptr %base, <8 x i32> %index, i8 %imask) nounwind {
5154 ; KNL_64-LABEL: scaleidx_scatter_outofrange:
5156 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
5157 ; KNL_64-NEXT: vpslld $2, %ymm1, %ymm1
5158 ; KNL_64-NEXT: kmovw %esi, %k0
5159 ; KNL_64-NEXT: kshiftlw $8, %k0, %k0
5160 ; KNL_64-NEXT: kshiftrw $8, %k0, %k1
5161 ; KNL_64-NEXT: vscatterdps %zmm0, (%rdi,%zmm1,4) {%k1}
5162 ; KNL_64-NEXT: vzeroupper
5165 ; KNL_32-LABEL: scaleidx_scatter_outofrange:
5167 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
5168 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
5169 ; KNL_32-NEXT: vpslld $2, %ymm1, %ymm1
5170 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
5171 ; KNL_32-NEXT: kmovw %ecx, %k0
5172 ; KNL_32-NEXT: kshiftlw $8, %k0, %k0
5173 ; KNL_32-NEXT: kshiftrw $8, %k0, %k1
5174 ; KNL_32-NEXT: vscatterdps %zmm0, (%eax,%zmm1,4) {%k1}
5175 ; KNL_32-NEXT: vzeroupper
5178 ; SKX-LABEL: scaleidx_scatter_outofrange:
5180 ; SKX-NEXT: vpslld $2, %ymm1, %ymm1
5181 ; SKX-NEXT: kmovw %esi, %k1
5182 ; SKX-NEXT: vscatterdps %ymm0, (%rdi,%ymm1,4) {%k1}
5183 ; SKX-NEXT: vzeroupper
5186 ; SKX_32-LABEL: scaleidx_scatter_outofrange:
5188 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
5189 ; SKX_32-NEXT: vpslld $2, %ymm1, %ymm1
5190 ; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
5191 ; SKX_32-NEXT: vscatterdps %ymm0, (%eax,%ymm1,4) {%k1}
5192 ; SKX_32-NEXT: vzeroupper
; IR: index * 4, GEP over float, then a generic masked scatter (align 2).
5194 %scaledindex = mul <8 x i32> %index, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
5195 %ptrs = getelementptr float, ptr %base, <8 x i32> %scaledindex
5196 %mask = bitcast i8 %imask to <8 x i1>
5197 call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> %value, <8 x ptr> %ptrs, i32 2, <8 x i1> %mask)
5200 declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32 immarg, <8 x i1>)
5204 ; This used to cause fast-isel to generate bad copy instructions that would
5205 ; cause an error in copyPhysReg.
5208 %struct.foo = type { ptr, i64, i16, i16, i32 }
; PR45906 regression test (see comment above): a gather of field 1 of
; %struct.foo from a vector of struct pointers.  The field offset becomes a
; constant displacement on the gather with no base register: 8(,%zmm0) with
; 64-bit pointers (qq gather), 4(,%ymm0) with 32-bit pointers (dq gather).
5210 define <8 x i64> @pr45906(<8 x ptr> %ptr) {
5211 ; KNL_64-LABEL: pr45906:
5212 ; KNL_64: # %bb.0: # %bb
5213 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
5214 ; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
5215 ; KNL_64-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1}
5216 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0
5219 ; KNL_32-LABEL: pr45906:
5220 ; KNL_32: # %bb.0: # %bb
5221 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
5222 ; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
5223 ; KNL_32-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1}
5224 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
5227 ; SKX-LABEL: pr45906:
5228 ; SKX: # %bb.0: # %bb
5229 ; SKX-NEXT: kxnorw %k0, %k0, %k1
5230 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
5231 ; SKX-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1}
5232 ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
5235 ; SKX_32-LABEL: pr45906:
5236 ; SKX_32: # %bb.0: # %bb
5237 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
5238 ; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
5239 ; SKX_32-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1}
5240 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
; IR: GEP to the i64 field (index 1) of each struct, all-ones mask gather.
5243 %tmp = getelementptr inbounds %struct.foo, <8 x ptr> %ptr, i64 0, i32 1
5244 %tmp1 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> %tmp, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i64> undef)
5247 declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>)