1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck --check-prefix=CHECK --check-prefix=KNL %s
3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_ONLY %s
4 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+avx512vbmi | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_VBMI %s
; test1: insert a float loaded from %br into lane 1 and the scalar %y into
; lane 14 of a <16 x float>. The expected code edits 128-bit quarter 0 and
; quarter 3 separately and reassembles the 512-bit result.
6 define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
9 ; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
10 ; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
11 ; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0
12 ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
13 ; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
15 %rrr = load float, float* %br
16 %rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
17 %rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
18 ret <16 x float> %rrr3
; test2: insert a double loaded from %br into lane 1 and the scalar %y into
; lane 6 of an <8 x double>. Lane 6 is the low element of 128-bit quarter 3,
; which the expected code extracts, blends and re-inserts.
21 define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
24 ; CHECK-NEXT: vmovhps {{.*#+}} xmm2 = xmm0[0,1],mem[0,1]
25 ; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
26 ; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0
27 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
28 ; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
30 %rrr = load double, double* %br
31 %rrr2 = insertelement <8 x double> %x, double %rrr, i32 1
32 %rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6
33 ret <8 x double> %rrr3
; test3: copy lane 4 of a <16 x float> into lane 1 of the same vector.
; Lane 4 is element 0 of the second 128-bit quarter, hence the vextractf128.
36 define <16 x float> @test3(<16 x float> %x) nounwind {
39 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
40 ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
41 ; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
43 %eee = extractelement <16 x float> %x, i32 4
44 %rrr2 = insertelement <16 x float> %x, float %eee, i32 1
45 ret <16 x float> %rrr2
; test4: copy lane 4 of an <8 x i64> into lane 1 of the same vector.
; The expected code moves the element through a GPR (vmovq / vpinsrq).
48 define <8 x i64> @test4(<8 x i64> %x) nounwind {
51 ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm1
52 ; CHECK-NEXT: vmovq %xmm1, %rax
53 ; CHECK-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1
54 ; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
56 %eee = extractelement <8 x i64> %x, i32 4
57 %rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1
; test5: extract lane 3 of a <4 x float> and reinterpret its bits as i32.
; Expected to fold into a single vextractps to a general-purpose register.
61 define i32 @test5(<4 x float> %x) nounwind {
64 ; CHECK-NEXT: vextractps $3, %xmm0, %eax
66 %ef = extractelement <4 x float> %x, i32 3
67 %ei = bitcast float %ef to i32
; test6: extract lane 3 of a <4 x float> and store it to %out.
; Expected to fold into a single vextractps with a memory destination.
71 define void @test6(<4 x float> %x, float* %out) nounwind {
74 ; CHECK-NEXT: vextractps $3, %xmm0, (%rdi)
76 %ef = extractelement <4 x float> %x, i32 3
77 store float %ef, float* %out, align 4
; test7: extract from a <16 x float> at a run-time index %ind.
; Expected lowering spills the vector to a 64-byte aligned stack slot, masks
; the index with 15 and reloads the selected element as a scalar.
81 define float @test7(<16 x float> %x, i32 %ind) nounwind {
84 ; CHECK-NEXT: pushq %rbp
85 ; CHECK-NEXT: movq %rsp, %rbp
86 ; CHECK-NEXT: andq $-64, %rsp
87 ; CHECK-NEXT: subq $128, %rsp
88 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
89 ; CHECK-NEXT: vmovaps %zmm0, (%rsp)
90 ; CHECK-NEXT: andl $15, %edi
91 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
92 ; CHECK-NEXT: movq %rbp, %rsp
93 ; CHECK-NEXT: popq %rbp
94 ; CHECK-NEXT: vzeroupper
96 %e = extractelement <16 x float> %x, i32 %ind
; test8: extract from an <8 x double> at a run-time index %ind.
; Expected lowering spills the vector to a 64-byte aligned stack slot, masks
; the index with 7 and reloads the selected element as a scalar.
100 define double @test8(<8 x double> %x, i32 %ind) nounwind {
101 ; CHECK-LABEL: test8:
103 ; CHECK-NEXT: pushq %rbp
104 ; CHECK-NEXT: movq %rsp, %rbp
105 ; CHECK-NEXT: andq $-64, %rsp
106 ; CHECK-NEXT: subq $128, %rsp
107 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
108 ; CHECK-NEXT: vmovaps %zmm0, (%rsp)
109 ; CHECK-NEXT: andl $7, %edi
110 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
111 ; CHECK-NEXT: movq %rbp, %rsp
112 ; CHECK-NEXT: popq %rbp
113 ; CHECK-NEXT: vzeroupper
115 %e = extractelement <8 x double> %x, i32 %ind
; test9: extract from an <8 x float> (256-bit) at a run-time index %ind.
; Expected lowering spills the vector to a 32-byte aligned stack slot, masks
; the index with 7 and reloads the selected element as a scalar.
119 define float @test9(<8 x float> %x, i32 %ind) nounwind {
120 ; CHECK-LABEL: test9:
122 ; CHECK-NEXT: pushq %rbp
123 ; CHECK-NEXT: movq %rsp, %rbp
124 ; CHECK-NEXT: andq $-32, %rsp
125 ; CHECK-NEXT: subq $64, %rsp
126 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
127 ; CHECK-NEXT: vmovaps %ymm0, (%rsp)
128 ; CHECK-NEXT: andl $7, %edi
129 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
130 ; CHECK-NEXT: movq %rbp, %rsp
131 ; CHECK-NEXT: popq %rbp
132 ; CHECK-NEXT: vzeroupper
134 %e = extractelement <8 x float> %x, i32 %ind
; test10: extract from a <16 x i32> at a run-time index %ind.
; Expected lowering spills the vector to a 64-byte aligned stack slot, masks
; the index with 15 and loads the element with a scaled-index movl.
138 define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
139 ; CHECK-LABEL: test10:
141 ; CHECK-NEXT: pushq %rbp
142 ; CHECK-NEXT: movq %rsp, %rbp
143 ; CHECK-NEXT: andq $-64, %rsp
144 ; CHECK-NEXT: subq $128, %rsp
145 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
146 ; CHECK-NEXT: vmovaps %zmm0, (%rsp)
147 ; CHECK-NEXT: andl $15, %edi
148 ; CHECK-NEXT: movl (%rsp,%rdi,4), %eax
149 ; CHECK-NEXT: movq %rbp, %rsp
150 ; CHECK-NEXT: popq %rbp
151 ; CHECK-NEXT: vzeroupper
153 %e = extractelement <16 x i32> %x, i32 %ind
; test11: branch on lane 4 of an unsigned-less-than compare of two <16 x i32>.
; Expected code shifts the compare mask right by 4, moves it to a GPR and
; tests bit 0 before branching between blocks %A and %B.
157 define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
160 ; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0
161 ; KNL-NEXT: kshiftrw $4, %k0, %k0
162 ; KNL-NEXT: kmovw %k0, %eax
163 ; KNL-NEXT: testb $1, %al
164 ; KNL-NEXT: je LBB10_2
165 ; KNL-NEXT: ## %bb.1: ## %A
166 ; KNL-NEXT: vmovdqa64 %zmm1, %zmm0
168 ; KNL-NEXT: LBB10_2: ## %B
169 ; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0
174 ; SKX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
175 ; SKX-NEXT: kshiftrw $4, %k0, %k0
176 ; SKX-NEXT: kmovd %k0, %eax
177 ; SKX-NEXT: testb $1, %al
178 ; SKX-NEXT: je LBB10_2
179 ; SKX-NEXT: ## %bb.1: ## %A
180 ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
182 ; SKX-NEXT: LBB10_2: ## %B
183 ; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0
185 %cmp_res = icmp ult <16 x i32> %a, %b
186 %ia = extractelement <16 x i1> %cmp_res, i32 4
187 br i1 %ia, label %A, label %B
191 %c = add <16 x i32>%b, %a
; test12: select between %a1 and %b1 using lane 0 of a signed-less-than
; compare of two <16 x i64>. Only lane 0 is used, so the expected code
; compares a single zmm pair (zmm0 vs zmm2) and tests bit 0 of the mask.
195 define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) {
198 ; KNL-NEXT: movq %rdi, %rax
199 ; KNL-NEXT: vpcmpgtq %zmm0, %zmm2, %k0
200 ; KNL-NEXT: kmovw %k0, %ecx
201 ; KNL-NEXT: testb $1, %cl
202 ; KNL-NEXT: cmoveq %rsi, %rax
203 ; KNL-NEXT: vzeroupper
208 ; SKX-NEXT: movq %rdi, %rax
209 ; SKX-NEXT: vpcmpgtq %zmm0, %zmm2, %k0
210 ; SKX-NEXT: kmovd %k0, %ecx
211 ; SKX-NEXT: testb $1, %cl
212 ; SKX-NEXT: cmoveq %rsi, %rax
213 ; SKX-NEXT: vzeroupper
215 %cmpvector_func.i = icmp slt <16 x i64> %a, %b
216 %extract24vector_func.i = extractelement <16 x i1> %cmpvector_func.i, i32 0
217 %res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
; test13: insert the scalar result of (%a ult %b) into lane 0 of a constant
; <16 x i1> and bitcast the mask to i16. The expected code materialises the
; constant as -4, clears bit 0 with a shift right/left pair and ORs in the
; compare bit.
221 define i16 @test13(i32 %a, i32 %b) {
224 ; KNL-NEXT: cmpl %esi, %edi
226 ; KNL-NEXT: movw $-4, %cx
227 ; KNL-NEXT: kmovw %ecx, %k0
228 ; KNL-NEXT: kshiftrw $1, %k0, %k0
229 ; KNL-NEXT: kshiftlw $1, %k0, %k0
230 ; KNL-NEXT: andl $1, %eax
231 ; KNL-NEXT: kmovw %eax, %k1
232 ; KNL-NEXT: korw %k1, %k0, %k0
233 ; KNL-NEXT: kmovw %k0, %eax
234 ; KNL-NEXT: ## kill: def $ax killed $ax killed $eax
239 ; SKX-NEXT: cmpl %esi, %edi
241 ; SKX-NEXT: movw $-4, %cx
242 ; SKX-NEXT: kmovd %ecx, %k0
243 ; SKX-NEXT: kshiftrw $1, %k0, %k0
244 ; SKX-NEXT: kshiftlw $1, %k0, %k0
245 ; SKX-NEXT: andl $1, %eax
246 ; SKX-NEXT: kmovw %eax, %k1
247 ; SKX-NEXT: korw %k1, %k0, %k0
248 ; SKX-NEXT: kmovd %k0, %eax
249 ; SKX-NEXT: ## kill: def $ax killed $ax killed $eax
251 %cmp_res = icmp ult i32 %a, %b
252 %maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %cmp_res, i32 0
253 %res = bitcast <16 x i1> %maskv to i16
; test14: select between %a1 and %b1 using lane 4 of a signed-less-than
; compare of two <8 x i64>. The two run configurations differ only in the
; mask-shift width (kshiftrw with AVX512F only, kshiftrb with AVX512DQ).
257 define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
260 ; KNL-NEXT: movq %rdi, %rax
261 ; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
262 ; KNL-NEXT: kshiftrw $4, %k0, %k0
263 ; KNL-NEXT: kmovw %k0, %ecx
264 ; KNL-NEXT: testb $1, %cl
265 ; KNL-NEXT: cmoveq %rsi, %rax
266 ; KNL-NEXT: vzeroupper
271 ; SKX-NEXT: movq %rdi, %rax
272 ; SKX-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
273 ; SKX-NEXT: kshiftrb $4, %k0, %k0
274 ; SKX-NEXT: kmovd %k0, %ecx
275 ; SKX-NEXT: testb $1, %cl
276 ; SKX-NEXT: cmoveq %rsi, %rax
277 ; SKX-NEXT: vzeroupper
279 %cmpvector_func.i = icmp slt <8 x i64> %a, %b
280 %extract24vector_func.i = extractelement <8 x i1> %cmpvector_func.i, i32 4
281 %res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
; test15: insert a loaded i1 into lane 10 of an undef <16 x i1> and bitcast
; the mask to i16. Every other lane is undef; the expected code produces 0
; when the loaded byte is zero and 65535 otherwise, with no mask registers.
285 define i16 @test15(i1 *%addr) {
286 ; CHECK-LABEL: test15:
288 ; CHECK-NEXT: xorl %ecx, %ecx
289 ; CHECK-NEXT: cmpb $0, (%rdi)
290 ; CHECK-NEXT: movl $65535, %eax ## imm = 0xFFFF
291 ; CHECK-NEXT: cmovel %ecx, %eax
292 ; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
294 %x = load i1 , i1 * %addr, align 1
295 %x1 = insertelement <16 x i1> undef, i1 %x, i32 10
296 %x2 = bitcast <16 x i1>%x1 to i16
; test16: insert a loaded i1 into lane 10 of the <16 x i1> mask obtained by
; bitcasting %a, then bitcast back to i16. Expected sequence: shift the mask
; right by 10, XOR with the new bit, isolate that bit at position 10
; (shift left 15, right 5) and XOR it into the original mask.
300 define i16 @test16(i1 *%addr, i16 %a) {
303 ; KNL-NEXT: movb (%rdi), %al
304 ; KNL-NEXT: kmovw %esi, %k0
305 ; KNL-NEXT: kmovw %eax, %k1
306 ; KNL-NEXT: kshiftrw $10, %k0, %k2
307 ; KNL-NEXT: kxorw %k1, %k2, %k1
308 ; KNL-NEXT: kshiftlw $15, %k1, %k1
309 ; KNL-NEXT: kshiftrw $5, %k1, %k1
310 ; KNL-NEXT: kxorw %k1, %k0, %k0
311 ; KNL-NEXT: kmovw %k0, %eax
312 ; KNL-NEXT: ## kill: def $ax killed $ax killed $eax
317 ; SKX-NEXT: kmovb (%rdi), %k0
318 ; SKX-NEXT: kmovd %esi, %k1
319 ; SKX-NEXT: kshiftrw $10, %k1, %k2
320 ; SKX-NEXT: kxorw %k0, %k2, %k0
321 ; SKX-NEXT: kshiftlw $15, %k0, %k0
322 ; SKX-NEXT: kshiftrw $5, %k0, %k0
323 ; SKX-NEXT: kxorw %k0, %k1, %k0
324 ; SKX-NEXT: kmovd %k0, %eax
325 ; SKX-NEXT: ## kill: def $ax killed $ax killed $eax
327 %x = load i1 , i1 * %addr, align 128
328 %a1 = bitcast i16 %a to <16 x i1>
329 %x1 = insertelement <16 x i1> %a1, i1 %x, i32 10
330 %x2 = bitcast <16 x i1>%x1 to i16
; test17: insert a loaded i1 into lane 4 of the <8 x i1> mask obtained by
; bitcasting %a, then bitcast back to i8. Same XOR/shift pattern as the
; 16-lane variant; the AVX512F-only run uses 16-bit mask ops (shifts 15/11)
; while the AVX512DQ runs use byte mask ops (shifts 7/3).
334 define i8 @test17(i1 *%addr, i8 %a) {
337 ; KNL-NEXT: movb (%rdi), %al
338 ; KNL-NEXT: kmovw %esi, %k0
339 ; KNL-NEXT: kmovw %eax, %k1
340 ; KNL-NEXT: kshiftrw $4, %k0, %k2
341 ; KNL-NEXT: kxorw %k1, %k2, %k1
342 ; KNL-NEXT: kshiftlw $15, %k1, %k1
343 ; KNL-NEXT: kshiftrw $11, %k1, %k1
344 ; KNL-NEXT: kxorw %k1, %k0, %k0
345 ; KNL-NEXT: kmovw %k0, %eax
346 ; KNL-NEXT: ## kill: def $al killed $al killed $eax
351 ; SKX-NEXT: kmovb (%rdi), %k0
352 ; SKX-NEXT: kmovd %esi, %k1
353 ; SKX-NEXT: kshiftrb $4, %k1, %k2
354 ; SKX-NEXT: kxorb %k0, %k2, %k0
355 ; SKX-NEXT: kshiftlb $7, %k0, %k0
356 ; SKX-NEXT: kshiftrb $3, %k0, %k0
357 ; SKX-NEXT: kxorb %k0, %k1, %k0
358 ; SKX-NEXT: kmovd %k0, %eax
359 ; SKX-NEXT: ## kill: def $al killed $al killed $eax
361 %x = load i1 , i1 * %addr, align 128
362 %a1 = bitcast i8 %a to <8 x i1>
363 %x1 = insertelement <8 x i1> %a1, i1 %x, i32 4
364 %x2 = bitcast <8 x i1>%x1 to i8
; extract_v8i64: extract lanes 1 and 3 of an <8 x i64>; lane 3 is stored to
; %dst with align 1. Lane 3 is element 1 of the upper 128 bits of ymm0, and
; the expected store is a vpextrq straight to memory.
368 define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) {
369 ; CHECK-LABEL: extract_v8i64:
371 ; CHECK-NEXT: vpextrq $1, %xmm0, %rax
372 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
373 ; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi)
374 ; CHECK-NEXT: vzeroupper
376 %r1 = extractelement <8 x i64> %x, i32 1
377 %r2 = extractelement <8 x i64> %x, i32 3
378 store i64 %r2, i64* %dst, align 1
; extract_v4i64: extract lanes 1 and 3 of a <4 x i64>; lane 3 is stored to
; %dst with align 1. Expected code matches the 512-bit variant because both
; lanes live in the low 256 bits.
382 define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) {
383 ; CHECK-LABEL: extract_v4i64:
385 ; CHECK-NEXT: vpextrq $1, %xmm0, %rax
386 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
387 ; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi)
388 ; CHECK-NEXT: vzeroupper
390 %r1 = extractelement <4 x i64> %x, i32 1
391 %r2 = extractelement <4 x i64> %x, i32 3
392 store i64 %r2, i64* %dst, align 1
; extract_v2i64: extract lanes 0 and 1 of a <2 x i64>; lane 1 is stored to
; %dst with align 1. Lane 0 is expected as a plain vmovq to a GPR.
396 define i64 @extract_v2i64(<2 x i64> %x, i64* %dst) {
397 ; CHECK-LABEL: extract_v2i64:
399 ; CHECK-NEXT: vmovq %xmm0, %rax
400 ; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi)
402 %r1 = extractelement <2 x i64> %x, i32 0
403 %r2 = extractelement <2 x i64> %x, i32 1
404 store i64 %r2, i64* %dst, align 1
; extract_v16i32: extract lanes 1 and 5 of a <16 x i32>; lane 5 is stored to
; %dst with align 1. Lane 5 is element 1 of the upper 128 bits of ymm0.
408 define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) {
409 ; CHECK-LABEL: extract_v16i32:
411 ; CHECK-NEXT: vextractps $1, %xmm0, %eax
412 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
413 ; CHECK-NEXT: vextractps $1, %xmm0, (%rdi)
414 ; CHECK-NEXT: vzeroupper
416 %r1 = extractelement <16 x i32> %x, i32 1
417 %r2 = extractelement <16 x i32> %x, i32 5
418 store i32 %r2, i32* %dst, align 1
; extract_v8i32: extract lanes 1 and 5 of an <8 x i32>; lane 5 is stored to
; %dst with align 1. Lane 5 is element 1 of the upper 128-bit half.
422 define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) {
423 ; CHECK-LABEL: extract_v8i32:
425 ; CHECK-NEXT: vextractps $1, %xmm0, %eax
426 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
427 ; CHECK-NEXT: vextractps $1, %xmm0, (%rdi)
428 ; CHECK-NEXT: vzeroupper
430 %r1 = extractelement <8 x i32> %x, i32 1
431 %r2 = extractelement <8 x i32> %x, i32 5
432 store i32 %r2, i32* %dst, align 1
; extract_v4i32: extract lanes 1 and 3 of a <4 x i32>; lane 3 is stored to
; %dst with align 1. Both extracts are expected as single vextractps.
436 define i32 @extract_v4i32(<4 x i32> %x, i32* %dst) {
437 ; CHECK-LABEL: extract_v4i32:
439 ; CHECK-NEXT: vextractps $1, %xmm0, %eax
440 ; CHECK-NEXT: vextractps $3, %xmm0, (%rdi)
442 %r1 = extractelement <4 x i32> %x, i32 1
443 %r2 = extractelement <4 x i32> %x, i32 3
444 store i32 %r2, i32* %dst, align 1
; extract_v32i16: extract lanes 1 and 9 of a <32 x i16>; lane 9 is stored to
; %dst with align 1. Lane 9 is element 1 of the upper 128 bits of ymm0.
448 define i16 @extract_v32i16(<32 x i16> %x, i16* %dst) {
449 ; CHECK-LABEL: extract_v32i16:
451 ; CHECK-NEXT: vpextrw $1, %xmm0, %eax
452 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
453 ; CHECK-NEXT: vpextrw $1, %xmm0, (%rdi)
454 ; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
455 ; CHECK-NEXT: vzeroupper
457 %r1 = extractelement <32 x i16> %x, i32 1
458 %r2 = extractelement <32 x i16> %x, i32 9
459 store i16 %r2, i16* %dst, align 1
; extract_v16i16: extract lanes 1 and 9 of a <16 x i16>; lane 9 is stored to
; %dst with align 1. Lane 9 is element 1 of the upper 128-bit half.
463 define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) {
464 ; CHECK-LABEL: extract_v16i16:
466 ; CHECK-NEXT: vpextrw $1, %xmm0, %eax
467 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
468 ; CHECK-NEXT: vpextrw $1, %xmm0, (%rdi)
469 ; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
470 ; CHECK-NEXT: vzeroupper
472 %r1 = extractelement <16 x i16> %x, i32 1
473 %r2 = extractelement <16 x i16> %x, i32 9
474 store i16 %r2, i16* %dst, align 1
; extract_v8i16: extract lanes 1 and 3 of an <8 x i16>; lane 3 is stored to
; %dst with align 1. Both extracts are expected as single vpextrw.
478 define i16 @extract_v8i16(<8 x i16> %x, i16* %dst) {
479 ; CHECK-LABEL: extract_v8i16:
481 ; CHECK-NEXT: vpextrw $1, %xmm0, %eax
482 ; CHECK-NEXT: vpextrw $3, %xmm0, (%rdi)
483 ; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax
485 %r1 = extractelement <8 x i16> %x, i32 1
486 %r2 = extractelement <8 x i16> %x, i32 3
487 store i16 %r2, i16* %dst, align 1
; extract_v64i8: extract lanes 1 and 17 of a <64 x i8>; lane 17 is stored to
; %dst. Lane 17 is element 1 of the upper 128 bits of ymm0.
491 define i8 @extract_v64i8(<64 x i8> %x, i8* %dst) {
492 ; CHECK-LABEL: extract_v64i8:
494 ; CHECK-NEXT: vpextrb $1, %xmm0, %eax
495 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
496 ; CHECK-NEXT: vpextrb $1, %xmm0, (%rdi)
497 ; CHECK-NEXT: ## kill: def $al killed $al killed $eax
498 ; CHECK-NEXT: vzeroupper
500 %r1 = extractelement <64 x i8> %x, i32 1
501 %r2 = extractelement <64 x i8> %x, i32 17
502 store i8 %r2, i8* %dst, align 1
; extract_v32i8: extract lanes 1 and 17 of a <32 x i8>; lane 17 is stored to
; %dst. Lane 17 is element 1 of the upper 128-bit half.
506 define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) {
507 ; CHECK-LABEL: extract_v32i8:
509 ; CHECK-NEXT: vpextrb $1, %xmm0, %eax
510 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
511 ; CHECK-NEXT: vpextrb $1, %xmm0, (%rdi)
512 ; CHECK-NEXT: ## kill: def $al killed $al killed $eax
513 ; CHECK-NEXT: vzeroupper
515 %r1 = extractelement <32 x i8> %x, i32 1
516 %r2 = extractelement <32 x i8> %x, i32 17
517 store i8 %r2, i8* %dst, align 1
; extract_v16i8: extract lanes 1 and 3 of a <16 x i8>; lane 3 is stored to
; %dst. Both extracts are expected as single vpextrb.
521 define i8 @extract_v16i8(<16 x i8> %x, i8* %dst) {
522 ; CHECK-LABEL: extract_v16i8:
524 ; CHECK-NEXT: vpextrb $1, %xmm0, %eax
525 ; CHECK-NEXT: vpextrb $3, %xmm0, (%rdi)
526 ; CHECK-NEXT: ## kill: def $al killed $al killed $eax
528 %r1 = extractelement <16 x i8> %x, i32 1
529 %r2 = extractelement <16 x i8> %x, i32 3
530 store i8 %r2, i8* %dst, align 1
; insert_v8i64: insert an i64 loaded from %ptr into lane 1 and the register
; argument %y into lane 3 of an <8 x i64>. The load is expected to fold into
; vpinsrq; lane 3 is handled in 128-bit quarter 1.
534 define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) {
535 ; CHECK-LABEL: insert_v8i64:
537 ; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
538 ; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
539 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
540 ; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
541 ; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
543 %val = load i64, i64* %ptr
544 %r1 = insertelement <8 x i64> %x, i64 %val, i32 1
545 %r2 = insertelement <8 x i64> %r1, i64 %y, i32 3
; insert_v4i64: insert an i64 loaded from %ptr into lane 1 and %y into lane 3
; of a <4 x i64>. The low half is merged back with a vpblendd, the high half
; with vinserti128.
549 define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) {
550 ; CHECK-LABEL: insert_v4i64:
552 ; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
553 ; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
554 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
555 ; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
556 ; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
558 %val = load i64, i64* %ptr
559 %r1 = insertelement <4 x i64> %x, i64 %val, i32 1
560 %r2 = insertelement <4 x i64> %r1, i64 %y, i32 0
; insert_v2i64: insert an i64 loaded from %ptr into lane 1 and %y into lane 0
; of a <2 x i64>. Both lanes are replaced, each by one vpinsrq.
564 define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) {
565 ; CHECK-LABEL: insert_v2i64:
567 ; CHECK-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
568 ; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm0
570 %val = load i64, i64* %ptr
571 %r1 = insertelement <2 x i64> %x, i64 %val, i32 1
572 %r2 = insertelement <2 x i64> %r1, i64 %y, i32 0
; insert_v16i32: insert an i32 loaded from %ptr into lane 1 and %y into
; lane 5 of a <16 x i32>. Lane 5 is element 1 of 128-bit quarter 1.
576 define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) {
577 ; CHECK-LABEL: insert_v16i32:
579 ; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
580 ; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
581 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
582 ; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
583 ; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
585 %val = load i32, i32* %ptr
586 %r1 = insertelement <16 x i32> %x, i32 %val, i32 1
587 %r2 = insertelement <16 x i32> %r1, i32 %y, i32 5
; insert_v8i32: insert an i32 loaded from %ptr into lane 1 and %y into lane 5
; of an <8 x i32>. Lane 5 is element 1 of the upper 128-bit half.
591 define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) {
592 ; CHECK-LABEL: insert_v8i32:
594 ; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
595 ; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
596 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
597 ; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
598 ; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
600 %val = load i32, i32* %ptr
601 %r1 = insertelement <8 x i32> %x, i32 %val, i32 1
602 %r2 = insertelement <8 x i32> %r1, i32 %y, i32 5
; insert_v4i32: insert an i32 loaded from %ptr into lane 1 and %y into lane 3
; of a <4 x i32>. Each insert is expected as one vpinsrd.
606 define <4 x i32> @insert_v4i32(<4 x i32> %x, i32 %y, i32* %ptr) {
607 ; CHECK-LABEL: insert_v4i32:
609 ; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm0
610 ; CHECK-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
612 %val = load i32, i32* %ptr
613 %r1 = insertelement <4 x i32> %x, i32 %val, i32 1
614 %r2 = insertelement <4 x i32> %r1, i32 %y, i32 3
; insert_v32i16: insert an i16 loaded from %ptr into lane 1 and %y into
; lane 9 of a <32 x i16>. The AVX512F-only expectation works on ymm
; registers, whereas the AVX512BW expectation works on the full zmm register.
618 define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) {
619 ; KNL-LABEL: insert_v32i16:
621 ; KNL-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm2
622 ; KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
623 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
624 ; KNL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
625 ; KNL-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
628 ; SKX-LABEL: insert_v32i16:
630 ; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
631 ; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
632 ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
633 ; SKX-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
634 ; SKX-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
636 %val = load i16, i16* %ptr
637 %r1 = insertelement <32 x i16> %x, i16 %val, i32 1
638 %r2 = insertelement <32 x i16> %r1, i16 %y, i32 9
; insert_v16i16: insert an i16 loaded from %ptr into lane 1 and %y into
; lane 9 of a <16 x i16>. Lane 9 is element 1 of the upper 128-bit half.
642 define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) {
643 ; CHECK-LABEL: insert_v16i16:
645 ; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
646 ; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
647 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
648 ; CHECK-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
649 ; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
651 %val = load i16, i16* %ptr
652 %r1 = insertelement <16 x i16> %x, i16 %val, i32 1
653 %r2 = insertelement <16 x i16> %r1, i16 %y, i32 9
; insert_v8i16: insert an i16 loaded from %ptr into lane 1 and %y into lane 5
; of an <8 x i16>. Each insert is expected as one vpinsrw.
657 define <8 x i16> @insert_v8i16(<8 x i16> %x, i16 %y, i16* %ptr) {
658 ; CHECK-LABEL: insert_v8i16:
660 ; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm0
661 ; CHECK-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0
663 %val = load i16, i16* %ptr
664 %r1 = insertelement <8 x i16> %x, i16 %val, i32 1
665 %r2 = insertelement <8 x i16> %r1, i16 %y, i32 5
; insert_v64i8: insert an i8 loaded from %ptr into lane 1 and %y into lane 50
; of a <64 x i8>. Lane 50 is byte 2 of the last 128-bit quarter: the
; AVX512F-only expectation edits the upper half of ymm1, the AVX512BW
; expectation edits quarter 3 of zmm0.
669 define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) {
670 ; KNL-LABEL: insert_v64i8:
672 ; KNL-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm2
673 ; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
674 ; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
675 ; KNL-NEXT: vpinsrb $2, %edi, %xmm2, %xmm2
676 ; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
679 ; SKX-LABEL: insert_v64i8:
681 ; SKX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
682 ; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
683 ; SKX-NEXT: vextracti32x4 $3, %zmm0, %xmm0
684 ; SKX-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0
685 ; SKX-NEXT: vinserti32x4 $3, %xmm0, %zmm1, %zmm0
687 %val = load i8, i8* %ptr
688 %r1 = insertelement <64 x i8> %x, i8 %val, i32 1
689 %r2 = insertelement <64 x i8> %r1, i8 %y, i32 50
; insert_v32i8: insert an i8 loaded from %ptr into lane 1 and %y into lane 17
; of a <32 x i8>. Lane 17 is byte 1 of the upper 128-bit half.
693 define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) {
694 ; CHECK-LABEL: insert_v32i8:
696 ; CHECK-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
697 ; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
698 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
699 ; CHECK-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0
700 ; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
702 %val = load i8, i8* %ptr
703 %r1 = insertelement <32 x i8> %x, i8 %val, i32 1
704 %r2 = insertelement <32 x i8> %r1, i8 %y, i32 17
; insert_v16i8: insert an i8 loaded from %ptr into lane 3 and %y into lane 10
; of a <16 x i8>. Each insert is expected as one vpinsrb.
708 define <16 x i8> @insert_v16i8(<16 x i8> %x, i8 %y, i8* %ptr) {
709 ; CHECK-LABEL: insert_v16i8:
711 ; CHECK-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0
712 ; CHECK-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
714 %val = load i8, i8* %ptr
715 %r1 = insertelement <16 x i8> %x, i8 %val, i32 3
716 %r2 = insertelement <16 x i8> %r1, i8 %y, i32 10
; test_insert_128_v8i64: insert %y into lane 1 of an <8 x i64>. The lane is
; in the low 128 bits, so only quarter 0 of the zmm register is rewritten.
720 define <8 x i64> @test_insert_128_v8i64(<8 x i64> %x, i64 %y) {
721 ; CHECK-LABEL: test_insert_128_v8i64:
723 ; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
724 ; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
726 %r = insertelement <8 x i64> %x, i64 %y, i32 1
; test_insert_128_v16i32: insert %y into lane 1 of a <16 x i32>. The lane is
; in the low 128 bits, so only quarter 0 of the zmm register is rewritten.
730 define <16 x i32> @test_insert_128_v16i32(<16 x i32> %x, i32 %y) {
731 ; CHECK-LABEL: test_insert_128_v16i32:
733 ; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm1
734 ; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
736 %r = insertelement <16 x i32> %x, i32 %y, i32 1
; test_insert_128_v8f64: insert the double %y into lane 1 of an <8 x double>.
; Expected as a vmovlhps on the low 128 bits followed by a quarter-0 insert.
740 define <8 x double> @test_insert_128_v8f64(<8 x double> %x, double %y) {
741 ; CHECK-LABEL: test_insert_128_v8f64:
743 ; CHECK-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0]
744 ; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
746 %r = insertelement <8 x double> %x, double %y, i32 1
; test_insert_128_v16f32: insert the float %y into lane 1 of a <16 x float>.
; Expected as a vinsertps on the low 128 bits followed by a quarter-0 insert.
750 define <16 x float> @test_insert_128_v16f32(<16 x float> %x, float %y) {
751 ; CHECK-LABEL: test_insert_128_v16f32:
753 ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
754 ; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
756 %r = insertelement <16 x float> %x, float %y, i32 1
; test_insert_128_v16i16: insert %y into lane 10 of a <16 x i16>. Lane 10 is
; word 2 of the upper 128-bit half, which is extracted, edited and re-inserted.
760 define <16 x i16> @test_insert_128_v16i16(<16 x i16> %x, i16 %y) {
761 ; CHECK-LABEL: test_insert_128_v16i16:
763 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
764 ; CHECK-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1
765 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
767 %r = insertelement <16 x i16> %x, i16 %y, i32 10
; test_insert_128_v32i8: insert %y into lane 20 of a <32 x i8>. Lane 20 is
; byte 4 of the upper 128-bit half, which is extracted, edited and re-inserted.
771 define <32 x i8> @test_insert_128_v32i8(<32 x i8> %x, i8 %y) {
772 ; CHECK-LABEL: test_insert_128_v32i8:
774 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
775 ; CHECK-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
776 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
778 %r = insertelement <32 x i8> %x, i8 %y, i32 20
; test_insertelement_v32i1: insert the scalar result of (%a ult %b) into
; lane 4 of a <32 x i1> vector compare and bitcast the mask to i32.
; The AVX512F-only expectation builds the 32-bit result from two 16-bit masks
; (shll $16 / orl); the AVX512BW expectation joins them with kunpckwd and
; uses 32-bit mask operations.
782 define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> %y) {
783 ; KNL-LABEL: test_insertelement_v32i1:
785 ; KNL-NEXT: cmpl %esi, %edi
787 ; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k0
788 ; KNL-NEXT: kmovw %k0, %ecx
789 ; KNL-NEXT: shll $16, %ecx
790 ; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k0
791 ; KNL-NEXT: kshiftrw $4, %k0, %k1
792 ; KNL-NEXT: kmovw %eax, %k2
793 ; KNL-NEXT: kxorw %k2, %k1, %k1
794 ; KNL-NEXT: kshiftlw $15, %k1, %k1
795 ; KNL-NEXT: kshiftrw $11, %k1, %k1
796 ; KNL-NEXT: kxorw %k1, %k0, %k0
797 ; KNL-NEXT: kmovw %k0, %eax
798 ; KNL-NEXT: orl %ecx, %eax
799 ; KNL-NEXT: vzeroupper
802 ; SKX-LABEL: test_insertelement_v32i1:
804 ; SKX-NEXT: cmpl %esi, %edi
806 ; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k0
807 ; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k1
808 ; SKX-NEXT: kunpckwd %k0, %k1, %k0
809 ; SKX-NEXT: kshiftrd $4, %k0, %k1
810 ; SKX-NEXT: kmovd %eax, %k2
811 ; SKX-NEXT: kxord %k2, %k1, %k1
812 ; SKX-NEXT: kshiftld $31, %k1, %k1
813 ; SKX-NEXT: kshiftrd $27, %k1, %k1
814 ; SKX-NEXT: kxord %k1, %k0, %k0
815 ; SKX-NEXT: kmovd %k0, %eax
816 ; SKX-NEXT: vzeroupper
818 %cmp_res_i1 = icmp ult i32 %a, %b
819 %cmp_cmp_vec = icmp ult <32 x i32> %x, %y
820 %maskv = insertelement <32 x i1> %cmp_cmp_vec, i1 %cmp_res_i1, i32 4
821 %res = bitcast <32 x i1> %maskv to i32
; test_iinsertelement_v4i1: insert the scalar result of (%a ult %b) into
; lane 2 of a <4 x i1> vector compare, widen to <8 x i1> with a shuffle and
; bitcast to i8. The AVX512F-only expectation widens the xmm operands to zmm
; for the compare; the AVX512VL expectation compares on xmm directly.
825 define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) {
826 ; KNL-LABEL: test_iinsertelement_v4i1:
828 ; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
829 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
830 ; KNL-NEXT: cmpl %esi, %edi
832 ; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0
833 ; KNL-NEXT: kshiftrw $2, %k0, %k1
834 ; KNL-NEXT: kmovw %eax, %k2
835 ; KNL-NEXT: kxorw %k2, %k1, %k1
836 ; KNL-NEXT: kshiftlw $15, %k1, %k1
837 ; KNL-NEXT: kshiftrw $13, %k1, %k1
838 ; KNL-NEXT: kxorw %k1, %k0, %k0
839 ; KNL-NEXT: kmovw %k0, %eax
840 ; KNL-NEXT: ## kill: def $al killed $al killed $eax
841 ; KNL-NEXT: vzeroupper
844 ; SKX-LABEL: test_iinsertelement_v4i1:
846 ; SKX-NEXT: cmpl %esi, %edi
848 ; SKX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
849 ; SKX-NEXT: kshiftrb $2, %k0, %k1
850 ; SKX-NEXT: kmovd %eax, %k2
851 ; SKX-NEXT: kxorb %k2, %k1, %k1
852 ; SKX-NEXT: kshiftlb $7, %k1, %k1
853 ; SKX-NEXT: kshiftrb $5, %k1, %k1
854 ; SKX-NEXT: kxorw %k1, %k0, %k0
855 ; SKX-NEXT: kmovd %k0, %eax
856 ; SKX-NEXT: ## kill: def $al killed $al killed $eax
858 %cmp_res_i1 = icmp ult i32 %a, %b
859 %cmp_cmp_vec = icmp ult <4 x i32> %x, %y
860 %maskv = insertelement <4 x i1> %cmp_cmp_vec, i1 %cmp_res_i1, i32 2
861 %res0 = shufflevector <4 x i1> %maskv, <4 x i1> undef , <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
862 %res = bitcast <8 x i1> %res0 to i8
; test_iinsertelement_v2i1: insert the scalar result of (%a ult %b) into
; lane 1 of a <2 x i1> vector compare, widen to <8 x i1> with a shuffle and
; bitcast to i8. Lane 1 is the top lane, so the expected code keeps only
; bit 0 of the compare mask and ORs in the new bit shifted left by 1.
866 define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y) {
867 ; KNL-LABEL: test_iinsertelement_v2i1:
869 ; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
870 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
871 ; KNL-NEXT: cmpl %esi, %edi
873 ; KNL-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
874 ; KNL-NEXT: kshiftlw $15, %k0, %k0
875 ; KNL-NEXT: kshiftrw $15, %k0, %k0
876 ; KNL-NEXT: kmovw %eax, %k1
877 ; KNL-NEXT: kshiftlw $1, %k1, %k1
878 ; KNL-NEXT: korw %k1, %k0, %k0
879 ; KNL-NEXT: kmovw %k0, %eax
880 ; KNL-NEXT: ## kill: def $al killed $al killed $eax
881 ; KNL-NEXT: vzeroupper
884 ; SKX-LABEL: test_iinsertelement_v2i1:
886 ; SKX-NEXT: cmpl %esi, %edi
888 ; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
889 ; SKX-NEXT: kshiftlb $7, %k0, %k0
890 ; SKX-NEXT: kshiftrb $7, %k0, %k0
891 ; SKX-NEXT: kmovd %eax, %k1
892 ; SKX-NEXT: kshiftlb $1, %k1, %k1
893 ; SKX-NEXT: korw %k1, %k0, %k0
894 ; SKX-NEXT: kmovd %k0, %eax
895 ; SKX-NEXT: ## kill: def $al killed $al killed $eax
897 %cmp_res_i1 = icmp ult i32 %a, %b
898 %cmp_cmp_vec = icmp ult <2 x i64> %x, %y
899 %maskv = insertelement <2 x i1> %cmp_cmp_vec, i1 %cmp_res_i1, i32 1
900 %res0 = shufflevector <2 x i1> %maskv, <2 x i1> undef , <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
901 %res = bitcast <8 x i1> %res0 to i8
; test_extractelement_v2i1: select 3 or 4 based on lane 0 of an
; unsigned-greater-than compare of two <2 x i64>. The expected code computes
; the result branch-free as 4 minus the extracted bit.
905 define zeroext i8 @test_extractelement_v2i1(<2 x i64> %a, <2 x i64> %b) {
906 ; KNL-LABEL: test_extractelement_v2i1:
908 ; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
909 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
910 ; KNL-NEXT: vpcmpnleuq %zmm1, %zmm0, %k0
911 ; KNL-NEXT: kmovw %k0, %ecx
912 ; KNL-NEXT: andl $1, %ecx
913 ; KNL-NEXT: movl $4, %eax
914 ; KNL-NEXT: subl %ecx, %eax
915 ; KNL-NEXT: vzeroupper
918 ; SKX-LABEL: test_extractelement_v2i1:
920 ; SKX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0
921 ; SKX-NEXT: kmovd %k0, %ecx
922 ; SKX-NEXT: andl $1, %ecx
923 ; SKX-NEXT: movl $4, %eax
924 ; SKX-NEXT: subl %ecx, %eax
926 %t1 = icmp ugt <2 x i64> %a, %b
927 %t2 = extractelement <2 x i1> %t1, i32 0
928 %res = select i1 %t2, i8 3, i8 4
; extractelement_v2i1_alt: same compare and lane as the select-based variant,
; but the result is written as sext(i1) + 4. The expected code does the
; subtraction in 8-bit registers and zero-extends the result.
932 define zeroext i8 @extractelement_v2i1_alt(<2 x i64> %a, <2 x i64> %b) {
933 ; KNL-LABEL: extractelement_v2i1_alt:
935 ; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
936 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
937 ; KNL-NEXT: vpcmpnleuq %zmm1, %zmm0, %k0
938 ; KNL-NEXT: kmovw %k0, %eax
939 ; KNL-NEXT: andb $1, %al
940 ; KNL-NEXT: movb $4, %cl
941 ; KNL-NEXT: subb %al, %cl
942 ; KNL-NEXT: movzbl %cl, %eax
943 ; KNL-NEXT: vzeroupper
946 ; SKX-LABEL: extractelement_v2i1_alt:
948 ; SKX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0
949 ; SKX-NEXT: kmovd %k0, %eax
950 ; SKX-NEXT: andb $1, %al
951 ; SKX-NEXT: movb $4, %cl
952 ; SKX-NEXT: subb %al, %cl
953 ; SKX-NEXT: movzbl %cl, %eax
955 %t1 = icmp ugt <2 x i64> %a, %b
956 %t2 = extractelement <2 x i1> %t1, i32 0
957 %sext = sext i1 %t2 to i8
958 %res = add i8 %sext, 4
; test_extractelement_v4i1: zero-extend lane 3 of an unsigned-greater-than
; compare of two <4 x i32> to i8. Expected: shift the mask right by 3, move
; it to a GPR and keep bit 0.
962 define zeroext i8 @test_extractelement_v4i1(<4 x i32> %a, <4 x i32> %b) {
963 ; KNL-LABEL: test_extractelement_v4i1:
965 ; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
966 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
967 ; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
968 ; KNL-NEXT: kshiftrw $3, %k0, %k0
969 ; KNL-NEXT: kmovw %k0, %eax
970 ; KNL-NEXT: andl $1, %eax
971 ; KNL-NEXT: vzeroupper
974 ; SKX-LABEL: test_extractelement_v4i1:
976 ; SKX-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
977 ; SKX-NEXT: kshiftrb $3, %k0, %k0
978 ; SKX-NEXT: kmovd %k0, %eax
979 ; SKX-NEXT: andl $1, %eax
981 %t1 = icmp ugt <4 x i32> %a, %b
982 %t2 = extractelement <4 x i1> %t1, i32 3
983 %res = zext i1 %t2 to i8
; test_extractelement_v32i1: zero-extend lane 2 of an unsigned-greater-than
; compare of two <32 x i8> to i8. The AVX512F-only expectation emulates the
; byte compare on the low 128 bits (vpminub + vpcmpeqb, then inversion) and
; converts to a mask via vpmovsxbd/vptestmd; the AVX512BW expectation
; compares bytes directly into a mask register.
987 define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) {
988 ; KNL-LABEL: test_extractelement_v32i1:
990 ; KNL-NEXT: vpminub %xmm1, %xmm0, %xmm1
991 ; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
992 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
993 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
994 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
995 ; KNL-NEXT: kshiftrw $2, %k0, %k0
996 ; KNL-NEXT: kmovw %k0, %eax
997 ; KNL-NEXT: andl $1, %eax
998 ; KNL-NEXT: vzeroupper
1001 ; SKX-LABEL: test_extractelement_v32i1:
1003 ; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
1004 ; SKX-NEXT: kshiftrd $2, %k0, %k0
1005 ; SKX-NEXT: kmovd %k0, %eax
1006 ; SKX-NEXT: andl $1, %eax
1007 ; SKX-NEXT: vzeroupper
1009 %t1 = icmp ugt <32 x i8> %a, %b
1010 %t2 = extractelement <32 x i1> %t1, i32 2
1011 %res = zext i1 %t2 to i8
; test_extractelement_v64i1: select 3 or 4 based on lane 63 of an
; unsigned-greater-than compare of two <64 x i8>. The AVX512F-only
; expectation compares only the top 128 bits (upper halves of ymm1 and ymm3)
; and reads mask bit 15; the AVX512BW expectation shifts a 64-bit mask
; right by 63.
1015 define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
1016 ; KNL-LABEL: test_extractelement_v64i1:
1018 ; KNL-NEXT: vextracti128 $1, %ymm3, %xmm0
1019 ; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
1020 ; KNL-NEXT: vpminub %xmm0, %xmm1, %xmm0
1021 ; KNL-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0
1022 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
1023 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
1024 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
1025 ; KNL-NEXT: kshiftrw $15, %k0, %k0
1026 ; KNL-NEXT: kmovw %k0, %ecx
1027 ; KNL-NEXT: andl $1, %ecx
1028 ; KNL-NEXT: movl $4, %eax
1029 ; KNL-NEXT: subl %ecx, %eax
1030 ; KNL-NEXT: vzeroupper
1033 ; SKX-LABEL: test_extractelement_v64i1:
1035 ; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
1036 ; SKX-NEXT: kshiftrq $63, %k0, %k0
1037 ; SKX-NEXT: kmovd %k0, %ecx
1038 ; SKX-NEXT: andl $1, %ecx
1039 ; SKX-NEXT: movl $4, %eax
1040 ; SKX-NEXT: subl %ecx, %eax
1041 ; SKX-NEXT: vzeroupper
1043 %t1 = icmp ugt <64 x i8> %a, %b
1044 %t2 = extractelement <64 x i1> %t1, i32 63
1045 %res = select i1 %t2, i8 3, i8 4
; extractelement_v64i1_alt: same compare and lane (63) as the select-based
; variant, but the result is written as sext(i1) + 4. The expected code does
; the subtraction in 8-bit registers and zero-extends the result.
1049 define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
1050 ; KNL-LABEL: extractelement_v64i1_alt:
1052 ; KNL-NEXT: vextracti128 $1, %ymm3, %xmm0
1053 ; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
1054 ; KNL-NEXT: vpminub %xmm0, %xmm1, %xmm0
1055 ; KNL-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0
1056 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
1057 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
1058 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
1059 ; KNL-NEXT: kshiftrw $15, %k0, %k0
1060 ; KNL-NEXT: kmovw %k0, %eax
1061 ; KNL-NEXT: andb $1, %al
1062 ; KNL-NEXT: movb $4, %cl
1063 ; KNL-NEXT: subb %al, %cl
1064 ; KNL-NEXT: movzbl %cl, %eax
1065 ; KNL-NEXT: vzeroupper
1068 ; SKX-LABEL: extractelement_v64i1_alt:
1070 ; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
1071 ; SKX-NEXT: kshiftrq $63, %k0, %k0
1072 ; SKX-NEXT: kmovd %k0, %eax
1073 ; SKX-NEXT: andb $1, %al
1074 ; SKX-NEXT: movb $4, %cl
1075 ; SKX-NEXT: subb %al, %cl
1076 ; SKX-NEXT: movzbl %cl, %eax
1077 ; SKX-NEXT: vzeroupper
1079 %t1 = icmp ugt <64 x i8> %a, %b
1080 %t2 = extractelement <64 x i1> %t1, i32 63
1081 %sext = sext i1 %t2 to i8
1082 %res = add i8 %sext, 4
; Variable-index extractelement from <2 x i64>.
; The checks expect the vector to be stored to a slot addressed below %rsp
; (no frame is set up), the index to be masked with 1, and the element to be
; read back with an 8-byte load scaled by 8.
1086 define i64 @test_extractelement_variable_v2i64(<2 x i64> %t1, i32 %index) {
1087 ; CHECK-LABEL: test_extractelement_variable_v2i64:
1089 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
1090 ; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1091 ; CHECK-NEXT: andl $1, %edi
1092 ; CHECK-NEXT: movq -24(%rsp,%rdi,8), %rax
1094 %t2 = extractelement <2 x i64> %t1, i32 %index
; Variable-index extractelement from <4 x i64>.
; The checks expect a frame-pointer prologue that realigns %rsp to 32 bytes,
; a store of %ymm0 to the aligned slot, the index masked with 3, and an
; 8-byte load scaled by 8 before the frame is torn down.
1098 define i64 @test_extractelement_variable_v4i64(<4 x i64> %t1, i32 %index) {
1099 ; CHECK-LABEL: test_extractelement_variable_v4i64:
1101 ; CHECK-NEXT: pushq %rbp
1102 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1103 ; CHECK-NEXT: .cfi_offset %rbp, -16
1104 ; CHECK-NEXT: movq %rsp, %rbp
1105 ; CHECK-NEXT: .cfi_def_cfa_register %rbp
1106 ; CHECK-NEXT: andq $-32, %rsp
1107 ; CHECK-NEXT: subq $64, %rsp
1108 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
1109 ; CHECK-NEXT: vmovaps %ymm0, (%rsp)
1110 ; CHECK-NEXT: andl $3, %edi
1111 ; CHECK-NEXT: movq (%rsp,%rdi,8), %rax
1112 ; CHECK-NEXT: movq %rbp, %rsp
1113 ; CHECK-NEXT: popq %rbp
1114 ; CHECK-NEXT: vzeroupper
1116 %t2 = extractelement <4 x i64> %t1, i32 %index
; Variable-index extractelement from <8 x i64>.
; The checks expect a frame-pointer prologue that realigns %rsp to 64 bytes,
; a store of %zmm0 to the aligned slot, the index masked with 7, and an
; 8-byte load scaled by 8.
1120 define i64 @test_extractelement_variable_v8i64(<8 x i64> %t1, i32 %index) {
1121 ; CHECK-LABEL: test_extractelement_variable_v8i64:
1123 ; CHECK-NEXT: pushq %rbp
1124 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1125 ; CHECK-NEXT: .cfi_offset %rbp, -16
1126 ; CHECK-NEXT: movq %rsp, %rbp
1127 ; CHECK-NEXT: .cfi_def_cfa_register %rbp
1128 ; CHECK-NEXT: andq $-64, %rsp
1129 ; CHECK-NEXT: subq $128, %rsp
1130 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
1131 ; CHECK-NEXT: vmovaps %zmm0, (%rsp)
1132 ; CHECK-NEXT: andl $7, %edi
1133 ; CHECK-NEXT: movq (%rsp,%rdi,8), %rax
1134 ; CHECK-NEXT: movq %rbp, %rsp
1135 ; CHECK-NEXT: popq %rbp
1136 ; CHECK-NEXT: vzeroupper
1138 %t2 = extractelement <8 x i64> %t1, i32 %index
; Variable-index extractelement from <2 x double>.
; The checks expect the vector to be stored to a slot addressed below %rsp,
; the index to be masked with 1, and the element to be reloaded from memory
; with a scalar vmovsd (the address itself is hidden behind a regex).
1142 define double @test_extractelement_variable_v2f64(<2 x double> %t1, i32 %index) {
1143 ; CHECK-LABEL: test_extractelement_variable_v2f64:
1145 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
1146 ; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1147 ; CHECK-NEXT: andl $1, %edi
1148 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1150 %t2 = extractelement <2 x double> %t1, i32 %index
; Variable-index extractelement from <4 x double>.
; The checks expect a 32-byte realigned frame, a store of %ymm0 to the stack,
; the index masked with 3, and a scalar vmovsd reload from memory.
1154 define double @test_extractelement_variable_v4f64(<4 x double> %t1, i32 %index) {
1155 ; CHECK-LABEL: test_extractelement_variable_v4f64:
1157 ; CHECK-NEXT: pushq %rbp
1158 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1159 ; CHECK-NEXT: .cfi_offset %rbp, -16
1160 ; CHECK-NEXT: movq %rsp, %rbp
1161 ; CHECK-NEXT: .cfi_def_cfa_register %rbp
1162 ; CHECK-NEXT: andq $-32, %rsp
1163 ; CHECK-NEXT: subq $64, %rsp
1164 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
1165 ; CHECK-NEXT: vmovaps %ymm0, (%rsp)
1166 ; CHECK-NEXT: andl $3, %edi
1167 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1168 ; CHECK-NEXT: movq %rbp, %rsp
1169 ; CHECK-NEXT: popq %rbp
1170 ; CHECK-NEXT: vzeroupper
1172 %t2 = extractelement <4 x double> %t1, i32 %index
; Variable-index extractelement from <8 x double>.
; The checks expect a 64-byte realigned frame, a store of %zmm0 to the stack,
; the index masked with 7, and a scalar vmovsd reload from memory.
1176 define double @test_extractelement_variable_v8f64(<8 x double> %t1, i32 %index) {
1177 ; CHECK-LABEL: test_extractelement_variable_v8f64:
1179 ; CHECK-NEXT: pushq %rbp
1180 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1181 ; CHECK-NEXT: .cfi_offset %rbp, -16
1182 ; CHECK-NEXT: movq %rsp, %rbp
1183 ; CHECK-NEXT: .cfi_def_cfa_register %rbp
1184 ; CHECK-NEXT: andq $-64, %rsp
1185 ; CHECK-NEXT: subq $128, %rsp
1186 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
1187 ; CHECK-NEXT: vmovaps %zmm0, (%rsp)
1188 ; CHECK-NEXT: andl $7, %edi
1189 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1190 ; CHECK-NEXT: movq %rbp, %rsp
1191 ; CHECK-NEXT: popq %rbp
1192 ; CHECK-NEXT: vzeroupper
1194 %t2 = extractelement <8 x double> %t1, i32 %index
; Variable-index extractelement from <4 x i32>.
; The checks expect the vector to be stored to a slot addressed below %rsp,
; the index to be masked with 3, and a 4-byte load scaled by 4.
1198 define i32 @test_extractelement_variable_v4i32(<4 x i32> %t1, i32 %index) {
1199 ; CHECK-LABEL: test_extractelement_variable_v4i32:
1201 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
1202 ; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1203 ; CHECK-NEXT: andl $3, %edi
1204 ; CHECK-NEXT: movl -24(%rsp,%rdi,4), %eax
1206 %t2 = extractelement <4 x i32> %t1, i32 %index
; Variable-index extractelement from <8 x i32>.
; The checks expect a 32-byte realigned frame, a store of %ymm0 to the stack,
; the index masked with 7, and a 4-byte load scaled by 4.
1210 define i32 @test_extractelement_variable_v8i32(<8 x i32> %t1, i32 %index) {
1211 ; CHECK-LABEL: test_extractelement_variable_v8i32:
1213 ; CHECK-NEXT: pushq %rbp
1214 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1215 ; CHECK-NEXT: .cfi_offset %rbp, -16
1216 ; CHECK-NEXT: movq %rsp, %rbp
1217 ; CHECK-NEXT: .cfi_def_cfa_register %rbp
1218 ; CHECK-NEXT: andq $-32, %rsp
1219 ; CHECK-NEXT: subq $64, %rsp
1220 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
1221 ; CHECK-NEXT: vmovaps %ymm0, (%rsp)
1222 ; CHECK-NEXT: andl $7, %edi
1223 ; CHECK-NEXT: movl (%rsp,%rdi,4), %eax
1224 ; CHECK-NEXT: movq %rbp, %rsp
1225 ; CHECK-NEXT: popq %rbp
1226 ; CHECK-NEXT: vzeroupper
1228 %t2 = extractelement <8 x i32> %t1, i32 %index
; Variable-index extractelement from <16 x i32>.
; The checks expect a 64-byte realigned frame, a store of %zmm0 to the stack,
; the index masked with 15, and a 4-byte load scaled by 4.
1232 define i32 @test_extractelement_variable_v16i32(<16 x i32> %t1, i32 %index) {
1233 ; CHECK-LABEL: test_extractelement_variable_v16i32:
1235 ; CHECK-NEXT: pushq %rbp
1236 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1237 ; CHECK-NEXT: .cfi_offset %rbp, -16
1238 ; CHECK-NEXT: movq %rsp, %rbp
1239 ; CHECK-NEXT: .cfi_def_cfa_register %rbp
1240 ; CHECK-NEXT: andq $-64, %rsp
1241 ; CHECK-NEXT: subq $128, %rsp
1242 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
1243 ; CHECK-NEXT: vmovaps %zmm0, (%rsp)
1244 ; CHECK-NEXT: andl $15, %edi
1245 ; CHECK-NEXT: movl (%rsp,%rdi,4), %eax
1246 ; CHECK-NEXT: movq %rbp, %rsp
1247 ; CHECK-NEXT: popq %rbp
1248 ; CHECK-NEXT: vzeroupper
1250 %t2 = extractelement <16 x i32> %t1, i32 %index
; Variable-index extractelement from <4 x float>.
; The checks expect the vector to be stored to a slot addressed below %rsp,
; the index to be masked with 3, and a scalar vmovss reload from memory.
1254 define float @test_extractelement_variable_v4f32(<4 x float> %t1, i32 %index) {
1255 ; CHECK-LABEL: test_extractelement_variable_v4f32:
1257 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
1258 ; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1259 ; CHECK-NEXT: andl $3, %edi
1260 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1262 %t2 = extractelement <4 x float> %t1, i32 %index
; Variable-index extractelement from <8 x float>.
; The checks expect a 32-byte realigned frame, a store of %ymm0 to the stack,
; the index masked with 7, and a scalar vmovss reload from memory.
1266 define float @test_extractelement_variable_v8f32(<8 x float> %t1, i32 %index) {
1267 ; CHECK-LABEL: test_extractelement_variable_v8f32:
1269 ; CHECK-NEXT: pushq %rbp
1270 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1271 ; CHECK-NEXT: .cfi_offset %rbp, -16
1272 ; CHECK-NEXT: movq %rsp, %rbp
1273 ; CHECK-NEXT: .cfi_def_cfa_register %rbp
1274 ; CHECK-NEXT: andq $-32, %rsp
1275 ; CHECK-NEXT: subq $64, %rsp
1276 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
1277 ; CHECK-NEXT: vmovaps %ymm0, (%rsp)
1278 ; CHECK-NEXT: andl $7, %edi
1279 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1280 ; CHECK-NEXT: movq %rbp, %rsp
1281 ; CHECK-NEXT: popq %rbp
1282 ; CHECK-NEXT: vzeroupper
1284 %t2 = extractelement <8 x float> %t1, i32 %index
; Variable-index extractelement from <16 x float>.
; The checks expect a 64-byte realigned frame, a store of %zmm0 to the stack,
; the index masked with 15, and a scalar vmovss reload from memory.
1288 define float @test_extractelement_variable_v16f32(<16 x float> %t1, i32 %index) {
1289 ; CHECK-LABEL: test_extractelement_variable_v16f32:
1291 ; CHECK-NEXT: pushq %rbp
1292 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1293 ; CHECK-NEXT: .cfi_offset %rbp, -16
1294 ; CHECK-NEXT: movq %rsp, %rbp
1295 ; CHECK-NEXT: .cfi_def_cfa_register %rbp
1296 ; CHECK-NEXT: andq $-64, %rsp
1297 ; CHECK-NEXT: subq $128, %rsp
1298 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
1299 ; CHECK-NEXT: vmovaps %zmm0, (%rsp)
1300 ; CHECK-NEXT: andl $15, %edi
1301 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1302 ; CHECK-NEXT: movq %rbp, %rsp
1303 ; CHECK-NEXT: popq %rbp
1304 ; CHECK-NEXT: vzeroupper
1306 %t2 = extractelement <16 x float> %t1, i32 %index
; Variable-index extractelement from <8 x i16>.
; The checks expect the vector to be stored to a slot addressed below %rsp,
; the index to be masked with 7, and a zero-extending 2-byte load (movzwl)
; scaled by 2.
1310 define i16 @test_extractelement_variable_v8i16(<8 x i16> %t1, i32 %index) {
1311 ; CHECK-LABEL: test_extractelement_variable_v8i16:
1313 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
1314 ; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1315 ; CHECK-NEXT: andl $7, %edi
1316 ; CHECK-NEXT: movzwl -24(%rsp,%rdi,2), %eax
1318 %t2 = extractelement <8 x i16> %t1, i32 %index
; Variable-index extractelement from <16 x i16>.
; The checks expect a 32-byte realigned frame, a store of %ymm0 to the stack,
; the index masked with 15, and a zero-extending 2-byte load scaled by 2.
1322 define i16 @test_extractelement_variable_v16i16(<16 x i16> %t1, i32 %index) {
1323 ; CHECK-LABEL: test_extractelement_variable_v16i16:
1325 ; CHECK-NEXT: pushq %rbp
1326 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1327 ; CHECK-NEXT: .cfi_offset %rbp, -16
1328 ; CHECK-NEXT: movq %rsp, %rbp
1329 ; CHECK-NEXT: .cfi_def_cfa_register %rbp
1330 ; CHECK-NEXT: andq $-32, %rsp
1331 ; CHECK-NEXT: subq $64, %rsp
1332 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
1333 ; CHECK-NEXT: vmovaps %ymm0, (%rsp)
1334 ; CHECK-NEXT: andl $15, %edi
1335 ; CHECK-NEXT: movzwl (%rsp,%rdi,2), %eax
1336 ; CHECK-NEXT: movq %rbp, %rsp
1337 ; CHECK-NEXT: popq %rbp
1338 ; CHECK-NEXT: vzeroupper
1340 %t2 = extractelement <16 x i16> %t1, i32 %index
; Variable-index extractelement from <32 x i16>.
; Both configurations use a 64-byte realigned frame, mask the index with 31
; and reload the element with movzwl scaled by 2. They differ in the spill:
; the KNL run stores the vector as two ymm halves (%ymm0 at the slot base,
; %ymm1 at a higher offset), while the SKX runs store a single %zmm0.
1344 define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) {
1345 ; KNL-LABEL: test_extractelement_variable_v32i16:
1347 ; KNL-NEXT: pushq %rbp
1348 ; KNL-NEXT: .cfi_def_cfa_offset 16
1349 ; KNL-NEXT: .cfi_offset %rbp, -16
1350 ; KNL-NEXT: movq %rsp, %rbp
1351 ; KNL-NEXT: .cfi_def_cfa_register %rbp
1352 ; KNL-NEXT: andq $-64, %rsp
1353 ; KNL-NEXT: subq $128, %rsp
1354 ; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
1355 ; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
1356 ; KNL-NEXT: vmovaps %ymm0, (%rsp)
1357 ; KNL-NEXT: andl $31, %edi
1358 ; KNL-NEXT: movzwl (%rsp,%rdi,2), %eax
1359 ; KNL-NEXT: movq %rbp, %rsp
1360 ; KNL-NEXT: popq %rbp
1361 ; KNL-NEXT: vzeroupper
1364 ; SKX-LABEL: test_extractelement_variable_v32i16:
1366 ; SKX-NEXT: pushq %rbp
1367 ; SKX-NEXT: .cfi_def_cfa_offset 16
1368 ; SKX-NEXT: .cfi_offset %rbp, -16
1369 ; SKX-NEXT: movq %rsp, %rbp
1370 ; SKX-NEXT: .cfi_def_cfa_register %rbp
1371 ; SKX-NEXT: andq $-64, %rsp
1372 ; SKX-NEXT: subq $128, %rsp
1373 ; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
1374 ; SKX-NEXT: vmovaps %zmm0, (%rsp)
1375 ; SKX-NEXT: andl $31, %edi
1376 ; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax
1377 ; SKX-NEXT: movq %rbp, %rsp
1378 ; SKX-NEXT: popq %rbp
1379 ; SKX-NEXT: vzeroupper
1381 %t2 = extractelement <32 x i16> %t1, i32 %index
; Variable-index extractelement from <16 x i8>.
; The checks expect the vector to be stored to a slot addressed below %rsp,
; the index to be masked with 15, and an unscaled 1-byte load (movb).
1385 define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
1386 ; CHECK-LABEL: test_extractelement_variable_v16i8:
1388 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
1389 ; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1390 ; CHECK-NEXT: andl $15, %edi
1391 ; CHECK-NEXT: movb -24(%rsp,%rdi), %al
1393 %t2 = extractelement <16 x i8> %t1, i32 %index
; Variable-index extractelement from <32 x i8>.
; The checks expect a 32-byte realigned frame, a store of %ymm0 to the stack,
; the index masked with 31, and an unscaled 1-byte load (movb).
1397 define i8 @test_extractelement_variable_v32i8(<32 x i8> %t1, i32 %index) {
1398 ; CHECK-LABEL: test_extractelement_variable_v32i8:
1400 ; CHECK-NEXT: pushq %rbp
1401 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1402 ; CHECK-NEXT: .cfi_offset %rbp, -16
1403 ; CHECK-NEXT: movq %rsp, %rbp
1404 ; CHECK-NEXT: .cfi_def_cfa_register %rbp
1405 ; CHECK-NEXT: andq $-32, %rsp
1406 ; CHECK-NEXT: subq $64, %rsp
1407 ; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi
1408 ; CHECK-NEXT: vmovaps %ymm0, (%rsp)
1409 ; CHECK-NEXT: andl $31, %edi
1410 ; CHECK-NEXT: movb (%rsp,%rdi), %al
1411 ; CHECK-NEXT: movq %rbp, %rsp
1412 ; CHECK-NEXT: popq %rbp
1413 ; CHECK-NEXT: vzeroupper
1416 %t2 = extractelement <32 x i8> %t1, i32 %index
; Variable-index extractelement from <64 x i8>.
; Both configurations use a 64-byte realigned frame, mask the index with 63
; and reload the element with an unscaled movb. The KNL run spills the vector
; as two ymm halves (%ymm0 at the slot base, %ymm1 at a higher offset); the
; SKX runs spill a single %zmm0.
1420 define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
1421 ; KNL-LABEL: test_extractelement_variable_v64i8:
1423 ; KNL-NEXT: pushq %rbp
1424 ; KNL-NEXT: .cfi_def_cfa_offset 16
1425 ; KNL-NEXT: .cfi_offset %rbp, -16
1426 ; KNL-NEXT: movq %rsp, %rbp
1427 ; KNL-NEXT: .cfi_def_cfa_register %rbp
1428 ; KNL-NEXT: andq $-64, %rsp
1429 ; KNL-NEXT: subq $128, %rsp
1430 ; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
1431 ; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
1432 ; KNL-NEXT: vmovaps %ymm0, (%rsp)
1433 ; KNL-NEXT: andl $63, %edi
1434 ; KNL-NEXT: movb (%rsp,%rdi), %al
1435 ; KNL-NEXT: movq %rbp, %rsp
1436 ; KNL-NEXT: popq %rbp
1437 ; KNL-NEXT: vzeroupper
1440 ; SKX-LABEL: test_extractelement_variable_v64i8:
1442 ; SKX-NEXT: pushq %rbp
1443 ; SKX-NEXT: .cfi_def_cfa_offset 16
1444 ; SKX-NEXT: .cfi_offset %rbp, -16
1445 ; SKX-NEXT: movq %rsp, %rbp
1446 ; SKX-NEXT: .cfi_def_cfa_register %rbp
1447 ; SKX-NEXT: andq $-64, %rsp
1448 ; SKX-NEXT: subq $128, %rsp
1449 ; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
1450 ; SKX-NEXT: vmovaps %zmm0, (%rsp)
1451 ; SKX-NEXT: andl $63, %edi
1452 ; SKX-NEXT: movb (%rsp,%rdi), %al
1453 ; SKX-NEXT: movq %rbp, %rsp
1454 ; SKX-NEXT: popq %rbp
1455 ; SKX-NEXT: vzeroupper
1458 %t2 = extractelement <64 x i8> %t1, i32 %index
; Variable-index extractelement from <64 x i8> where the index has type i8.
; The index is computed in the IR as %index + %index, so it is not a plain
; argument. The expected code doubles the byte with addb, zero-extends it with
; movzbl, masks it with 63 and then reads the element from the spilled vector
; (two ymm halves for the KNL run, one zmm for the SKX runs).
1462 define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index) {
1463 ; KNL-LABEL: test_extractelement_variable_v64i8_indexi8:
1465 ; KNL-NEXT: pushq %rbp
1466 ; KNL-NEXT: .cfi_def_cfa_offset 16
1467 ; KNL-NEXT: .cfi_offset %rbp, -16
1468 ; KNL-NEXT: movq %rsp, %rbp
1469 ; KNL-NEXT: .cfi_def_cfa_register %rbp
1470 ; KNL-NEXT: andq $-64, %rsp
1471 ; KNL-NEXT: subq $128, %rsp
1472 ; KNL-NEXT: addb %dil, %dil
1473 ; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
1474 ; KNL-NEXT: vmovaps %ymm0, (%rsp)
1475 ; KNL-NEXT: movzbl %dil, %eax
1476 ; KNL-NEXT: andl $63, %eax
1477 ; KNL-NEXT: movb (%rsp,%rax), %al
1478 ; KNL-NEXT: movq %rbp, %rsp
1479 ; KNL-NEXT: popq %rbp
1480 ; KNL-NEXT: vzeroupper
1483 ; SKX-LABEL: test_extractelement_variable_v64i8_indexi8:
1485 ; SKX-NEXT: pushq %rbp
1486 ; SKX-NEXT: .cfi_def_cfa_offset 16
1487 ; SKX-NEXT: .cfi_offset %rbp, -16
1488 ; SKX-NEXT: movq %rsp, %rbp
1489 ; SKX-NEXT: .cfi_def_cfa_register %rbp
1490 ; SKX-NEXT: andq $-64, %rsp
1491 ; SKX-NEXT: subq $128, %rsp
1492 ; SKX-NEXT: addb %dil, %dil
1493 ; SKX-NEXT: vmovaps %zmm0, (%rsp)
1494 ; SKX-NEXT: movzbl %dil, %eax
1495 ; SKX-NEXT: andl $63, %eax
1496 ; SKX-NEXT: movb (%rsp,%rax), %al
1497 ; SKX-NEXT: movq %rbp, %rsp
1498 ; SKX-NEXT: popq %rbp
1499 ; SKX-NEXT: vzeroupper
1502 %i = add i8 %index, %index
1503 %t2 = extractelement <64 x i8> %t1, i8 %i
; Variable-index extract from a <2 x i1> compare result ("varible" in the
; function name is the existing spelling and is kept so the symbol is unchanged).
; The IR compares %a and %b with unsigned greater-than, extracts the mask bit
; selected by %index and zero-extends it to i8.
; The expected code expands the mask into a 64-bit-element vector (masked
; vpternlogq on widened zmm registers for the KNL run, vpmovm2q for the SKX
; runs), stores it below %rsp, masks the index with 1, loads one byte at
; index*8 and keeps only bit 0.
1507 define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b, i32 %index) {
1508 ; KNL-LABEL: test_extractelement_varible_v2i1:
1510 ; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
1511 ; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
1512 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
1513 ; KNL-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
1514 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1515 ; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1516 ; KNL-NEXT: andl $1, %edi
1517 ; KNL-NEXT: movzbl -24(%rsp,%rdi,8), %eax
1518 ; KNL-NEXT: andl $1, %eax
1519 ; KNL-NEXT: vzeroupper
1522 ; SKX-LABEL: test_extractelement_varible_v2i1:
1524 ; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
1525 ; SKX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0
1526 ; SKX-NEXT: vpmovm2q %k0, %xmm0
1527 ; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1528 ; SKX-NEXT: andl $1, %edi
1529 ; SKX-NEXT: movzbl -24(%rsp,%rdi,8), %eax
1530 ; SKX-NEXT: andl $1, %eax
1532 %t1 = icmp ugt <2 x i64> %a, %b
1533 %t2 = extractelement <2 x i1> %t1, i32 %index
1534 %res = zext i1 %t2 to i8
; Variable-index extract from a <4 x i1> compare result.
; The IR compares %a and %b with unsigned greater-than, extracts the mask bit
; selected by %index and zero-extends it to i8.
; The expected code expands the mask into a 32-bit-element vector (masked
; vpternlogd for the KNL run, vpmovm2d for the SKX runs), stores it below
; %rsp, masks the index with 3, loads one byte at index*4 and keeps bit 0.
1538 define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b, i32 %index) {
1539 ; KNL-LABEL: test_extractelement_varible_v4i1:
1541 ; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
1542 ; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
1543 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
1544 ; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
1545 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1546 ; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1547 ; KNL-NEXT: andl $3, %edi
1548 ; KNL-NEXT: movzbl -24(%rsp,%rdi,4), %eax
1549 ; KNL-NEXT: andl $1, %eax
1550 ; KNL-NEXT: vzeroupper
1553 ; SKX-LABEL: test_extractelement_varible_v4i1:
1555 ; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
1556 ; SKX-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
1557 ; SKX-NEXT: vpmovm2d %k0, %xmm0
1558 ; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1559 ; SKX-NEXT: andl $3, %edi
1560 ; SKX-NEXT: movzbl -24(%rsp,%rdi,4), %eax
1561 ; SKX-NEXT: andl $1, %eax
1563 %t1 = icmp ugt <4 x i32> %a, %b
1564 %t2 = extractelement <4 x i1> %t1, i32 %index
1565 %res = zext i1 %t2 to i8
; Variable-index extract from an <8 x i1> compare result.
; The IR compares %a and %b with unsigned greater-than, extracts the mask bit
; selected by %index and zero-extends it to i8.
; The expected code materializes the mask as eight 16-bit elements (masked
; vpternlogd followed by vpmovdw for the KNL run, vpmovm2w for the SKX runs),
; stores them below %rsp, masks the index with 7, loads one byte at index*2
; and keeps bit 0.
1569 define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b, i32 %index) {
1570 ; KNL-LABEL: test_extractelement_varible_v8i1:
1572 ; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
1573 ; KNL-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1
1574 ; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
1575 ; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
1576 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1577 ; KNL-NEXT: vpmovdw %zmm0, %ymm0
1578 ; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1579 ; KNL-NEXT: andl $7, %edi
1580 ; KNL-NEXT: movzbl -24(%rsp,%rdi,2), %eax
1581 ; KNL-NEXT: andl $1, %eax
1582 ; KNL-NEXT: vzeroupper
1585 ; SKX-LABEL: test_extractelement_varible_v8i1:
1587 ; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
1588 ; SKX-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
1589 ; SKX-NEXT: vpmovm2w %k0, %xmm0
1590 ; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1591 ; SKX-NEXT: andl $7, %edi
1592 ; SKX-NEXT: movzbl -24(%rsp,%rdi,2), %eax
1593 ; SKX-NEXT: andl $1, %eax
1594 ; SKX-NEXT: vzeroupper
1596 %t1 = icmp ugt <8 x i32> %a, %b
1597 %t2 = extractelement <8 x i1> %t1, i32 %index
1598 %res = zext i1 %t2 to i8
; Variable-index extract from a <16 x i1> compare result.
; The IR compares %a and %b with unsigned greater-than, extracts the mask bit
; selected by %index and zero-extends it to i8.
; The expected code materializes the mask as sixteen bytes on the stack (for
; the KNL run vpmovdb truncates and stores directly to memory; the SKX runs
; use vpmovm2b followed by a vector store), masks the index with 15, loads
; the selected byte and keeps bit 0.
1602 define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %b, i32 %index) {
1603 ; KNL-LABEL: test_extractelement_varible_v16i1:
1605 ; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
1606 ; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
1607 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1608 ; KNL-NEXT: vpmovdb %zmm0, -{{[0-9]+}}(%rsp)
1609 ; KNL-NEXT: andl $15, %edi
1610 ; KNL-NEXT: movzbl -24(%rsp,%rdi), %eax
1611 ; KNL-NEXT: andl $1, %eax
1612 ; KNL-NEXT: vzeroupper
1615 ; SKX-LABEL: test_extractelement_varible_v16i1:
1617 ; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
1618 ; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
1619 ; SKX-NEXT: vpmovm2b %k0, %xmm0
1620 ; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1621 ; SKX-NEXT: andl $15, %edi
1622 ; SKX-NEXT: movzbl -24(%rsp,%rdi), %eax
1623 ; SKX-NEXT: andl $1, %eax
1624 ; SKX-NEXT: vzeroupper
1626 %t1 = icmp ugt <16 x i32> %a, %b
1627 %t2 = extractelement <16 x i1> %t1, i32 %index
1628 %res = zext i1 %t2 to i8
; Variable-index extract from a <32 x i1> compare result.
; The IR compares two <32 x i8> vectors with unsigned greater-than, extracts
; the mask bit selected by %index and zero-extends it to i8.
; Both configurations use a 32-byte realigned frame, store the mask as 32
; bytes, mask the index with 31, load the selected byte and keep bit 0. The
; KNL run builds the byte mask with vpminub + vpcmpeqb inverted by
; vpternlogq $15; the SKX runs use vpcmpnleub into a mask register and expand
; it with vpmovm2b.
1632 define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b, i32 %index) {
1633 ; KNL-LABEL: test_extractelement_varible_v32i1:
1635 ; KNL-NEXT: pushq %rbp
1636 ; KNL-NEXT: .cfi_def_cfa_offset 16
1637 ; KNL-NEXT: .cfi_offset %rbp, -16
1638 ; KNL-NEXT: movq %rsp, %rbp
1639 ; KNL-NEXT: .cfi_def_cfa_register %rbp
1640 ; KNL-NEXT: andq $-32, %rsp
1641 ; KNL-NEXT: subq $64, %rsp
1642 ; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
1643 ; KNL-NEXT: vpminub %ymm1, %ymm0, %ymm1
1644 ; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
1645 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
1646 ; KNL-NEXT: vmovdqa %ymm0, (%rsp)
1647 ; KNL-NEXT: andl $31, %edi
1648 ; KNL-NEXT: movzbl (%rsp,%rdi), %eax
1649 ; KNL-NEXT: andl $1, %eax
1650 ; KNL-NEXT: movq %rbp, %rsp
1651 ; KNL-NEXT: popq %rbp
1652 ; KNL-NEXT: vzeroupper
1655 ; SKX-LABEL: test_extractelement_varible_v32i1:
1657 ; SKX-NEXT: pushq %rbp
1658 ; SKX-NEXT: .cfi_def_cfa_offset 16
1659 ; SKX-NEXT: .cfi_offset %rbp, -16
1660 ; SKX-NEXT: movq %rsp, %rbp
1661 ; SKX-NEXT: .cfi_def_cfa_register %rbp
1662 ; SKX-NEXT: andq $-32, %rsp
1663 ; SKX-NEXT: subq $64, %rsp
1664 ; SKX-NEXT: ## kill: def $edi killed $edi def $rdi
1665 ; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
1666 ; SKX-NEXT: vpmovm2b %k0, %ymm0
1667 ; SKX-NEXT: vmovdqa %ymm0, (%rsp)
1668 ; SKX-NEXT: andl $31, %edi
1669 ; SKX-NEXT: movzbl (%rsp,%rdi), %eax
1670 ; SKX-NEXT: andl $1, %eax
1671 ; SKX-NEXT: movq %rbp, %rsp
1672 ; SKX-NEXT: popq %rbp
1673 ; SKX-NEXT: vzeroupper
1675 %t1 = icmp ugt <32 x i8> %a, %b
1676 %t2 = extractelement <32 x i1> %t1, i32 %index
1677 %res = zext i1 %t2 to i8
; Places a <2 x i64> value into an otherwise zero <8 x i64> vector.
; %b appends two zero elements to %a, %d widens that to eight elements (the
; upper four come from the undef operand), and %e selects four zeros followed
; by the low four elements of %d, giving <0, 0, 0, 0, a0, a1, 0, 0>.
; The checks expect this to fold to zeroing a register and one vinsertf32x4
; that writes %xmm0 into 128-bit lane 2 of the zero vector.
1681 define <8 x i64> @insert_double_zero(<2 x i64> %a) nounwind {
1682 ; CHECK-LABEL: insert_double_zero:
1684 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
1685 ; CHECK-NEXT: vinsertf32x4 $2, %xmm0, %zmm1, %zmm0
1687 %b = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1688 %d = shufflevector <4 x i64> %b, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1689 %e = shufflevector <8 x i64> %d, <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
; Variable-index insertelement into a <32 x i1> mask.
; The IR builds the mask from %a (unsigned > 0, i.e. non-zero bytes), builds
; the inserted bit from %b the same way, inserts it at %index and bitcasts
; the 32-bit mask to i32.
; The expected code stores the mask as 32 bytes in a 32-byte aligned stack
; slot, masks the index with 31 and overwrites the selected byte with setne
; (flags come from testb on the %b register). The mask is then rebuilt from
; memory: the KNL run converts two 16-byte halves separately
; (vpmovsxbd/vpslld/vptestmd) and joins them with shll $16 / orl, while the
; SKX runs use vpsllw $7 + vpmovb2m + kmovd.
1693 define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {
1694 ; KNL-LABEL: test_insertelement_variable_v32i1:
1696 ; KNL-NEXT: pushq %rbp
1697 ; KNL-NEXT: .cfi_def_cfa_offset 16
1698 ; KNL-NEXT: .cfi_offset %rbp, -16
1699 ; KNL-NEXT: movq %rsp, %rbp
1700 ; KNL-NEXT: .cfi_def_cfa_register %rbp
1701 ; KNL-NEXT: andq $-32, %rsp
1702 ; KNL-NEXT: subq $64, %rsp
1703 ; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
1704 ; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
1705 ; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
1706 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
1707 ; KNL-NEXT: andl $31, %esi
1708 ; KNL-NEXT: testb %dil, %dil
1709 ; KNL-NEXT: vmovdqa %ymm0, (%rsp)
1710 ; KNL-NEXT: setne (%rsp,%rsi)
1711 ; KNL-NEXT: vpmovsxbd (%rsp), %zmm0
1712 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
1713 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
1714 ; KNL-NEXT: kmovw %k0, %ecx
1715 ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1716 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
1717 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
1718 ; KNL-NEXT: kmovw %k0, %eax
1719 ; KNL-NEXT: shll $16, %eax
1720 ; KNL-NEXT: orl %ecx, %eax
1721 ; KNL-NEXT: movq %rbp, %rsp
1722 ; KNL-NEXT: popq %rbp
1723 ; KNL-NEXT: vzeroupper
1726 ; SKX-LABEL: test_insertelement_variable_v32i1:
1728 ; SKX-NEXT: pushq %rbp
1729 ; SKX-NEXT: .cfi_def_cfa_offset 16
1730 ; SKX-NEXT: .cfi_offset %rbp, -16
1731 ; SKX-NEXT: movq %rsp, %rbp
1732 ; SKX-NEXT: .cfi_def_cfa_register %rbp
1733 ; SKX-NEXT: andq $-32, %rsp
1734 ; SKX-NEXT: subq $64, %rsp
1735 ; SKX-NEXT: ## kill: def $esi killed $esi def $rsi
1736 ; SKX-NEXT: vptestmb %ymm0, %ymm0, %k0
1737 ; SKX-NEXT: andl $31, %esi
1738 ; SKX-NEXT: testb %dil, %dil
1739 ; SKX-NEXT: vpmovm2b %k0, %ymm0
1740 ; SKX-NEXT: vmovdqa %ymm0, (%rsp)
1741 ; SKX-NEXT: setne (%rsp,%rsi)
1742 ; SKX-NEXT: vpsllw $7, (%rsp), %ymm0
1743 ; SKX-NEXT: vpmovb2m %ymm0, %k0
1744 ; SKX-NEXT: kmovd %k0, %eax
1745 ; SKX-NEXT: movq %rbp, %rsp
1746 ; SKX-NEXT: popq %rbp
1747 ; SKX-NEXT: vzeroupper
1749 %t1 = icmp ugt <32 x i8> %a, zeroinitializer
1750 %t2 = icmp ugt i8 %b, 0
1751 %t3 = insertelement <32 x i1> %t1, i1 %t2, i32 %index
1752 %t4 = bitcast <32 x i1> %t3 to i32
; Variable-index insertelement into a <64 x i1> mask.
; The IR builds the mask from %a (unsigned > 0, i.e. non-zero bytes), builds
; the inserted bit from %b the same way, inserts it at %index and bitcasts
; the 64-bit mask to i64.
; The expected code stores the mask as 64 bytes in a 64-byte aligned stack
; slot, masks the index with 63 and overwrites the selected byte with setne.
; The KNL run handles the vector as two ymm halves and rebuilds the result
; from four 16-bit mask pieces (shll $16 / orl per pair, then shlq $32 / orq);
; the SKX runs use vpsllw $7 + vpmovb2m + kmovq on the whole zmm.
1756 define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {
1757 ; KNL-LABEL: test_insertelement_variable_v64i1:
1759 ; KNL-NEXT: pushq %rbp
1760 ; KNL-NEXT: .cfi_def_cfa_offset 16
1761 ; KNL-NEXT: .cfi_offset %rbp, -16
1762 ; KNL-NEXT: movq %rsp, %rbp
1763 ; KNL-NEXT: .cfi_def_cfa_register %rbp
1764 ; KNL-NEXT: andq $-64, %rsp
1765 ; KNL-NEXT: subq $128, %rsp
1766 ; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
1767 ; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
1768 ; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
1769 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
1770 ; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
1771 ; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
1772 ; KNL-NEXT: andl $63, %esi
1773 ; KNL-NEXT: testb %dil, %dil
1774 ; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
1775 ; KNL-NEXT: vmovdqa %ymm0, (%rsp)
1776 ; KNL-NEXT: setne (%rsp,%rsi)
1777 ; KNL-NEXT: vpmovsxbd (%rsp), %zmm0
1778 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
1779 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
1780 ; KNL-NEXT: kmovw %k0, %eax
1781 ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1782 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
1783 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
1784 ; KNL-NEXT: kmovw %k0, %ecx
1785 ; KNL-NEXT: shll $16, %ecx
1786 ; KNL-NEXT: orl %eax, %ecx
1787 ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1788 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
1789 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
1790 ; KNL-NEXT: kmovw %k0, %edx
1791 ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1792 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
1793 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
1794 ; KNL-NEXT: kmovw %k0, %eax
1795 ; KNL-NEXT: shll $16, %eax
1796 ; KNL-NEXT: orl %edx, %eax
1797 ; KNL-NEXT: shlq $32, %rax
1798 ; KNL-NEXT: orq %rcx, %rax
1799 ; KNL-NEXT: movq %rbp, %rsp
1800 ; KNL-NEXT: popq %rbp
1801 ; KNL-NEXT: vzeroupper
1804 ; SKX-LABEL: test_insertelement_variable_v64i1:
1806 ; SKX-NEXT: pushq %rbp
1807 ; SKX-NEXT: .cfi_def_cfa_offset 16
1808 ; SKX-NEXT: .cfi_offset %rbp, -16
1809 ; SKX-NEXT: movq %rsp, %rbp
1810 ; SKX-NEXT: .cfi_def_cfa_register %rbp
1811 ; SKX-NEXT: andq $-64, %rsp
1812 ; SKX-NEXT: subq $128, %rsp
1813 ; SKX-NEXT: ## kill: def $esi killed $esi def $rsi
1814 ; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0
1815 ; SKX-NEXT: andl $63, %esi
1816 ; SKX-NEXT: testb %dil, %dil
1817 ; SKX-NEXT: vpmovm2b %k0, %zmm0
1818 ; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)
1819 ; SKX-NEXT: setne (%rsp,%rsi)
1820 ; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
1821 ; SKX-NEXT: vpmovb2m %zmm0, %k0
1822 ; SKX-NEXT: kmovq %k0, %rax
1823 ; SKX-NEXT: movq %rbp, %rsp
1824 ; SKX-NEXT: popq %rbp
1825 ; SKX-NEXT: vzeroupper
1827 %t1 = icmp ugt <64 x i8> %a, zeroinitializer
1828 %t2 = icmp ugt i8 %b, 0
1829 %t3 = insertelement <64 x i1> %t1, i1 %t2, i32 %index
1830 %t4 = bitcast <64 x i1> %t3 to i64
1834 define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
1835 ; KNL-LABEL: test_insertelement_variable_v96i1:
1837 ; KNL-NEXT: pushq %rbp
1838 ; KNL-NEXT: .cfi_def_cfa_offset 16
1839 ; KNL-NEXT: .cfi_offset %rbp, -16
1840 ; KNL-NEXT: movq %rsp, %rbp
1841 ; KNL-NEXT: .cfi_def_cfa_register %rbp
1842 ; KNL-NEXT: andq $-128, %rsp
1843 ; KNL-NEXT: subq $256, %rsp ## imm = 0x100
1844 ; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1845 ; KNL-NEXT: vpinsrb $1, 232(%rbp), %xmm0, %xmm0
1846 ; KNL-NEXT: vpinsrb $2, 240(%rbp), %xmm0, %xmm0
1847 ; KNL-NEXT: vpinsrb $3, 248(%rbp), %xmm0, %xmm0
1848 ; KNL-NEXT: vpinsrb $4, 256(%rbp), %xmm0, %xmm0
1849 ; KNL-NEXT: vpinsrb $5, 264(%rbp), %xmm0, %xmm0
1850 ; KNL-NEXT: vpinsrb $6, 272(%rbp), %xmm0, %xmm0
1851 ; KNL-NEXT: vpinsrb $7, 280(%rbp), %xmm0, %xmm0
1852 ; KNL-NEXT: vpinsrb $8, 288(%rbp), %xmm0, %xmm0
1853 ; KNL-NEXT: vpinsrb $9, 296(%rbp), %xmm0, %xmm0
1854 ; KNL-NEXT: vpinsrb $10, 304(%rbp), %xmm0, %xmm0
1855 ; KNL-NEXT: vpinsrb $11, 312(%rbp), %xmm0, %xmm0
1856 ; KNL-NEXT: vpinsrb $12, 320(%rbp), %xmm0, %xmm0
1857 ; KNL-NEXT: vpinsrb $13, 328(%rbp), %xmm0, %xmm0
1858 ; KNL-NEXT: vpinsrb $14, 336(%rbp), %xmm0, %xmm0
1859 ; KNL-NEXT: vpinsrb $15, 344(%rbp), %xmm0, %xmm0
1860 ; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1861 ; KNL-NEXT: vpinsrb $1, 360(%rbp), %xmm1, %xmm1
1862 ; KNL-NEXT: vpinsrb $2, 368(%rbp), %xmm1, %xmm1
1863 ; KNL-NEXT: vpinsrb $3, 376(%rbp), %xmm1, %xmm1
1864 ; KNL-NEXT: vpinsrb $4, 384(%rbp), %xmm1, %xmm1
1865 ; KNL-NEXT: vpinsrb $5, 392(%rbp), %xmm1, %xmm1
1866 ; KNL-NEXT: vpinsrb $6, 400(%rbp), %xmm1, %xmm1
1867 ; KNL-NEXT: vpinsrb $7, 408(%rbp), %xmm1, %xmm1
1868 ; KNL-NEXT: vpinsrb $8, 416(%rbp), %xmm1, %xmm1
1869 ; KNL-NEXT: vpinsrb $9, 424(%rbp), %xmm1, %xmm1
1870 ; KNL-NEXT: vpinsrb $10, 432(%rbp), %xmm1, %xmm1
1871 ; KNL-NEXT: vpinsrb $11, 440(%rbp), %xmm1, %xmm1
1872 ; KNL-NEXT: vpinsrb $12, 448(%rbp), %xmm1, %xmm1
1873 ; KNL-NEXT: vpinsrb $13, 456(%rbp), %xmm1, %xmm1
1874 ; KNL-NEXT: vpinsrb $14, 464(%rbp), %xmm1, %xmm1
1875 ; KNL-NEXT: vpinsrb $15, 472(%rbp), %xmm1, %xmm1
1876 ; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1877 ; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1878 ; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm1, %xmm1
1879 ; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm1, %xmm1
1880 ; KNL-NEXT: vpinsrb $3, 504(%rbp), %xmm1, %xmm1
1881 ; KNL-NEXT: vpinsrb $4, 512(%rbp), %xmm1, %xmm1
1882 ; KNL-NEXT: vpinsrb $5, 520(%rbp), %xmm1, %xmm1
1883 ; KNL-NEXT: vpinsrb $6, 528(%rbp), %xmm1, %xmm1
1884 ; KNL-NEXT: vpinsrb $7, 536(%rbp), %xmm1, %xmm1
1885 ; KNL-NEXT: vpinsrb $8, 544(%rbp), %xmm1, %xmm1
1886 ; KNL-NEXT: vpinsrb $9, 552(%rbp), %xmm1, %xmm1
1887 ; KNL-NEXT: vpinsrb $10, 560(%rbp), %xmm1, %xmm1
1888 ; KNL-NEXT: vpinsrb $11, 568(%rbp), %xmm1, %xmm1
1889 ; KNL-NEXT: vpinsrb $12, 576(%rbp), %xmm1, %xmm1
1890 ; KNL-NEXT: vpinsrb $13, 584(%rbp), %xmm1, %xmm1
1891 ; KNL-NEXT: vpinsrb $14, 592(%rbp), %xmm1, %xmm1
1892 ; KNL-NEXT: vpinsrb $15, 600(%rbp), %xmm1, %xmm1
1893 ; KNL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
1894 ; KNL-NEXT: vpinsrb $1, 616(%rbp), %xmm2, %xmm2
1895 ; KNL-NEXT: vpinsrb $2, 624(%rbp), %xmm2, %xmm2
1896 ; KNL-NEXT: vpinsrb $3, 632(%rbp), %xmm2, %xmm2
1897 ; KNL-NEXT: vpinsrb $4, 640(%rbp), %xmm2, %xmm2
1898 ; KNL-NEXT: vpinsrb $5, 648(%rbp), %xmm2, %xmm2
1899 ; KNL-NEXT: vpinsrb $6, 656(%rbp), %xmm2, %xmm2
1900 ; KNL-NEXT: vpinsrb $7, 664(%rbp), %xmm2, %xmm2
1901 ; KNL-NEXT: vpinsrb $8, 672(%rbp), %xmm2, %xmm2
1902 ; KNL-NEXT: vpinsrb $9, 680(%rbp), %xmm2, %xmm2
1903 ; KNL-NEXT: vpinsrb $10, 688(%rbp), %xmm2, %xmm2
1904 ; KNL-NEXT: vpinsrb $11, 696(%rbp), %xmm2, %xmm2
1905 ; KNL-NEXT: vpinsrb $12, 704(%rbp), %xmm2, %xmm2
1906 ; KNL-NEXT: vpinsrb $13, 712(%rbp), %xmm2, %xmm2
1907 ; KNL-NEXT: vpinsrb $14, 720(%rbp), %xmm2, %xmm2
1908 ; KNL-NEXT: vpinsrb $15, 728(%rbp), %xmm2, %xmm2
1909 ; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
1910 ; KNL-NEXT: vmovd %edi, %xmm2
1911 ; KNL-NEXT: vpinsrb $1, %esi, %xmm2, %xmm2
1912 ; KNL-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2
1913 ; KNL-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
1914 ; KNL-NEXT: vpinsrb $4, %r8d, %xmm2, %xmm2
1915 ; KNL-NEXT: vpinsrb $5, %r9d, %xmm2, %xmm2
1916 ; KNL-NEXT: vpinsrb $6, 16(%rbp), %xmm2, %xmm2
1917 ; KNL-NEXT: vpinsrb $7, 24(%rbp), %xmm2, %xmm2
1918 ; KNL-NEXT: vpinsrb $8, 32(%rbp), %xmm2, %xmm2
1919 ; KNL-NEXT: vpinsrb $9, 40(%rbp), %xmm2, %xmm2
1920 ; KNL-NEXT: vpinsrb $10, 48(%rbp), %xmm2, %xmm2
1921 ; KNL-NEXT: vpinsrb $11, 56(%rbp), %xmm2, %xmm2
1922 ; KNL-NEXT: vpinsrb $12, 64(%rbp), %xmm2, %xmm2
1923 ; KNL-NEXT: vpinsrb $13, 72(%rbp), %xmm2, %xmm2
1924 ; KNL-NEXT: vpinsrb $14, 80(%rbp), %xmm2, %xmm2
1925 ; KNL-NEXT: vpinsrb $15, 88(%rbp), %xmm2, %xmm2
1926 ; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
1927 ; KNL-NEXT: vpinsrb $1, 104(%rbp), %xmm3, %xmm3
1928 ; KNL-NEXT: vpinsrb $2, 112(%rbp), %xmm3, %xmm3
1929 ; KNL-NEXT: vpinsrb $3, 120(%rbp), %xmm3, %xmm3
1930 ; KNL-NEXT: vpinsrb $4, 128(%rbp), %xmm3, %xmm3
1931 ; KNL-NEXT: vpinsrb $5, 136(%rbp), %xmm3, %xmm3
1932 ; KNL-NEXT: vpinsrb $6, 144(%rbp), %xmm3, %xmm3
1933 ; KNL-NEXT: vpinsrb $7, 152(%rbp), %xmm3, %xmm3
1934 ; KNL-NEXT: vpinsrb $8, 160(%rbp), %xmm3, %xmm3
1935 ; KNL-NEXT: vpinsrb $9, 168(%rbp), %xmm3, %xmm3
1936 ; KNL-NEXT: vpinsrb $10, 176(%rbp), %xmm3, %xmm3
1937 ; KNL-NEXT: vpinsrb $11, 184(%rbp), %xmm3, %xmm3
1938 ; KNL-NEXT: vpinsrb $12, 192(%rbp), %xmm3, %xmm3
1939 ; KNL-NEXT: vpinsrb $13, 200(%rbp), %xmm3, %xmm3
1940 ; KNL-NEXT: vpinsrb $14, 208(%rbp), %xmm3, %xmm3
1941 ; KNL-NEXT: vpinsrb $15, 216(%rbp), %xmm3, %xmm3
1942 ; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
1943 ; KNL-NEXT: movl 744(%rbp), %eax
1944 ; KNL-NEXT: andl $127, %eax
1945 ; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
1946 ; KNL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
1947 ; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
1948 ; KNL-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
1949 ; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
1950 ; KNL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
1951 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
1952 ; KNL-NEXT: cmpb $0, 736(%rbp)
1953 ; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
1954 ; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
1955 ; KNL-NEXT: vmovdqa %ymm2, (%rsp)
1956 ; KNL-NEXT: setne (%rsp,%rax)
1957 ; KNL-NEXT: vpmovsxbd (%rsp), %zmm0
1958 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
1959 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
1960 ; KNL-NEXT: kmovw %k0, %eax
1961 ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1962 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
1963 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
1964 ; KNL-NEXT: kmovw %k0, %ecx
1965 ; KNL-NEXT: shll $16, %ecx
1966 ; KNL-NEXT: orl %eax, %ecx
1967 ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1968 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
1969 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
1970 ; KNL-NEXT: kmovw %k0, %edx
1971 ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1972 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
1973 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
1974 ; KNL-NEXT: kmovw %k0, %eax
1975 ; KNL-NEXT: shll $16, %eax
1976 ; KNL-NEXT: orl %edx, %eax
1977 ; KNL-NEXT: shlq $32, %rax
1978 ; KNL-NEXT: orq %rcx, %rax
1979 ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1980 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
1981 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
1982 ; KNL-NEXT: kmovw %k0, %ecx
1983 ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1984 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
1985 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
1986 ; KNL-NEXT: kmovw %k0, %esi
1987 ; KNL-NEXT: shll $16, %esi
1988 ; KNL-NEXT: orl %ecx, %esi
1989 ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1990 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
1991 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
1992 ; KNL-NEXT: kmovw %k0, %ecx
1993 ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1994 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
1995 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
1996 ; KNL-NEXT: kmovw %k0, %edx
1997 ; KNL-NEXT: shll $16, %edx
1998 ; KNL-NEXT: orl %ecx, %edx
1999 ; KNL-NEXT: shlq $32, %rdx
2000 ; KNL-NEXT: orq %rsi, %rdx
2001 ; KNL-NEXT: movq %rbp, %rsp
2002 ; KNL-NEXT: popq %rbp
2003 ; KNL-NEXT: vzeroupper
2006 ; SKX-LABEL: test_insertelement_variable_v96i1:
2008 ; SKX-NEXT: pushq %rbp
2009 ; SKX-NEXT: .cfi_def_cfa_offset 16
2010 ; SKX-NEXT: .cfi_offset %rbp, -16
2011 ; SKX-NEXT: movq %rsp, %rbp
2012 ; SKX-NEXT: .cfi_def_cfa_register %rbp
2013 ; SKX-NEXT: andq $-128, %rsp
2014 ; SKX-NEXT: subq $256, %rsp ## imm = 0x100
2015 ; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2016 ; SKX-NEXT: vpinsrb $1, 232(%rbp), %xmm0, %xmm0
2017 ; SKX-NEXT: vpinsrb $2, 240(%rbp), %xmm0, %xmm0
2018 ; SKX-NEXT: vpinsrb $3, 248(%rbp), %xmm0, %xmm0
2019 ; SKX-NEXT: vpinsrb $4, 256(%rbp), %xmm0, %xmm0
2020 ; SKX-NEXT: vpinsrb $5, 264(%rbp), %xmm0, %xmm0
2021 ; SKX-NEXT: vpinsrb $6, 272(%rbp), %xmm0, %xmm0
2022 ; SKX-NEXT: vpinsrb $7, 280(%rbp), %xmm0, %xmm0
2023 ; SKX-NEXT: vpinsrb $8, 288(%rbp), %xmm0, %xmm0
2024 ; SKX-NEXT: vpinsrb $9, 296(%rbp), %xmm0, %xmm0
2025 ; SKX-NEXT: vpinsrb $10, 304(%rbp), %xmm0, %xmm0
2026 ; SKX-NEXT: vpinsrb $11, 312(%rbp), %xmm0, %xmm0
2027 ; SKX-NEXT: vpinsrb $12, 320(%rbp), %xmm0, %xmm0
2028 ; SKX-NEXT: vpinsrb $13, 328(%rbp), %xmm0, %xmm0
2029 ; SKX-NEXT: vpinsrb $14, 336(%rbp), %xmm0, %xmm0
2030 ; SKX-NEXT: vpinsrb $15, 344(%rbp), %xmm0, %xmm0
2031 ; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2032 ; SKX-NEXT: vpinsrb $1, 360(%rbp), %xmm1, %xmm1
2033 ; SKX-NEXT: vpinsrb $2, 368(%rbp), %xmm1, %xmm1
2034 ; SKX-NEXT: vpinsrb $3, 376(%rbp), %xmm1, %xmm1
2035 ; SKX-NEXT: vpinsrb $4, 384(%rbp), %xmm1, %xmm1
2036 ; SKX-NEXT: vpinsrb $5, 392(%rbp), %xmm1, %xmm1
2037 ; SKX-NEXT: vpinsrb $6, 400(%rbp), %xmm1, %xmm1
2038 ; SKX-NEXT: vpinsrb $7, 408(%rbp), %xmm1, %xmm1
2039 ; SKX-NEXT: vpinsrb $8, 416(%rbp), %xmm1, %xmm1
2040 ; SKX-NEXT: vpinsrb $9, 424(%rbp), %xmm1, %xmm1
2041 ; SKX-NEXT: vpinsrb $10, 432(%rbp), %xmm1, %xmm1
2042 ; SKX-NEXT: vpinsrb $11, 440(%rbp), %xmm1, %xmm1
2043 ; SKX-NEXT: vpinsrb $12, 448(%rbp), %xmm1, %xmm1
2044 ; SKX-NEXT: vpinsrb $13, 456(%rbp), %xmm1, %xmm1
2045 ; SKX-NEXT: vpinsrb $14, 464(%rbp), %xmm1, %xmm1
2046 ; SKX-NEXT: vpinsrb $15, 472(%rbp), %xmm1, %xmm1
2047 ; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2048 ; SKX-NEXT: vmovd %edi, %xmm1
2049 ; SKX-NEXT: vpinsrb $1, %esi, %xmm1, %xmm1
2050 ; SKX-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1
2051 ; SKX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
2052 ; SKX-NEXT: vpinsrb $4, %r8d, %xmm1, %xmm1
2053 ; SKX-NEXT: vpinsrb $5, %r9d, %xmm1, %xmm1
2054 ; SKX-NEXT: vpinsrb $6, 16(%rbp), %xmm1, %xmm1
2055 ; SKX-NEXT: vpinsrb $7, 24(%rbp), %xmm1, %xmm1
2056 ; SKX-NEXT: vpinsrb $8, 32(%rbp), %xmm1, %xmm1
2057 ; SKX-NEXT: vpinsrb $9, 40(%rbp), %xmm1, %xmm1
2058 ; SKX-NEXT: vpinsrb $10, 48(%rbp), %xmm1, %xmm1
2059 ; SKX-NEXT: vpinsrb $11, 56(%rbp), %xmm1, %xmm1
2060 ; SKX-NEXT: vpinsrb $12, 64(%rbp), %xmm1, %xmm1
2061 ; SKX-NEXT: vpinsrb $13, 72(%rbp), %xmm1, %xmm1
2062 ; SKX-NEXT: vpinsrb $14, 80(%rbp), %xmm1, %xmm1
2063 ; SKX-NEXT: vpinsrb $15, 88(%rbp), %xmm1, %xmm1
2064 ; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
2065 ; SKX-NEXT: vpinsrb $1, 104(%rbp), %xmm2, %xmm2
2066 ; SKX-NEXT: vpinsrb $2, 112(%rbp), %xmm2, %xmm2
2067 ; SKX-NEXT: vpinsrb $3, 120(%rbp), %xmm2, %xmm2
2068 ; SKX-NEXT: vpinsrb $4, 128(%rbp), %xmm2, %xmm2
2069 ; SKX-NEXT: vpinsrb $5, 136(%rbp), %xmm2, %xmm2
2070 ; SKX-NEXT: vpinsrb $6, 144(%rbp), %xmm2, %xmm2
2071 ; SKX-NEXT: vpinsrb $7, 152(%rbp), %xmm2, %xmm2
2072 ; SKX-NEXT: vpinsrb $8, 160(%rbp), %xmm2, %xmm2
2073 ; SKX-NEXT: vpinsrb $9, 168(%rbp), %xmm2, %xmm2
2074 ; SKX-NEXT: vpinsrb $10, 176(%rbp), %xmm2, %xmm2
2075 ; SKX-NEXT: vpinsrb $11, 184(%rbp), %xmm2, %xmm2
2076 ; SKX-NEXT: vpinsrb $12, 192(%rbp), %xmm2, %xmm2
2077 ; SKX-NEXT: vpinsrb $13, 200(%rbp), %xmm2, %xmm2
2078 ; SKX-NEXT: vpinsrb $14, 208(%rbp), %xmm2, %xmm2
2079 ; SKX-NEXT: vpinsrb $15, 216(%rbp), %xmm2, %xmm2
2080 ; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2081 ; SKX-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
2082 ; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2083 ; SKX-NEXT: vpinsrb $1, 488(%rbp), %xmm1, %xmm1
2084 ; SKX-NEXT: vpinsrb $2, 496(%rbp), %xmm1, %xmm1
2085 ; SKX-NEXT: vpinsrb $3, 504(%rbp), %xmm1, %xmm1
2086 ; SKX-NEXT: vpinsrb $4, 512(%rbp), %xmm1, %xmm1
2087 ; SKX-NEXT: vpinsrb $5, 520(%rbp), %xmm1, %xmm1
2088 ; SKX-NEXT: vpinsrb $6, 528(%rbp), %xmm1, %xmm1
2089 ; SKX-NEXT: vpinsrb $7, 536(%rbp), %xmm1, %xmm1
2090 ; SKX-NEXT: vpinsrb $8, 544(%rbp), %xmm1, %xmm1
2091 ; SKX-NEXT: vpinsrb $9, 552(%rbp), %xmm1, %xmm1
2092 ; SKX-NEXT: vpinsrb $10, 560(%rbp), %xmm1, %xmm1
2093 ; SKX-NEXT: vpinsrb $11, 568(%rbp), %xmm1, %xmm1
2094 ; SKX-NEXT: vpinsrb $12, 576(%rbp), %xmm1, %xmm1
2095 ; SKX-NEXT: vpinsrb $13, 584(%rbp), %xmm1, %xmm1
2096 ; SKX-NEXT: vpinsrb $14, 592(%rbp), %xmm1, %xmm1
2097 ; SKX-NEXT: vpinsrb $15, 600(%rbp), %xmm1, %xmm1
2098 ; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
2099 ; SKX-NEXT: vpinsrb $1, 616(%rbp), %xmm2, %xmm2
2100 ; SKX-NEXT: vpinsrb $2, 624(%rbp), %xmm2, %xmm2
2101 ; SKX-NEXT: vpinsrb $3, 632(%rbp), %xmm2, %xmm2
2102 ; SKX-NEXT: vpinsrb $4, 640(%rbp), %xmm2, %xmm2
2103 ; SKX-NEXT: vpinsrb $5, 648(%rbp), %xmm2, %xmm2
2104 ; SKX-NEXT: vpinsrb $6, 656(%rbp), %xmm2, %xmm2
2105 ; SKX-NEXT: vpinsrb $7, 664(%rbp), %xmm2, %xmm2
2106 ; SKX-NEXT: vpinsrb $8, 672(%rbp), %xmm2, %xmm2
2107 ; SKX-NEXT: vpinsrb $9, 680(%rbp), %xmm2, %xmm2
2108 ; SKX-NEXT: vpinsrb $10, 688(%rbp), %xmm2, %xmm2
2109 ; SKX-NEXT: vpinsrb $11, 696(%rbp), %xmm2, %xmm2
2110 ; SKX-NEXT: vpinsrb $12, 704(%rbp), %xmm2, %xmm2
2111 ; SKX-NEXT: vpinsrb $13, 712(%rbp), %xmm2, %xmm2
2112 ; SKX-NEXT: vpinsrb $14, 720(%rbp), %xmm2, %xmm2
2113 ; SKX-NEXT: vpinsrb $15, 728(%rbp), %xmm2, %xmm2
2114 ; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2115 ; SKX-NEXT: movl 744(%rbp), %eax
2116 ; SKX-NEXT: andl $127, %eax
2117 ; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0
2118 ; SKX-NEXT: vptestmb %zmm1, %zmm1, %k1
2119 ; SKX-NEXT: cmpb $0, 736(%rbp)
2120 ; SKX-NEXT: vpmovm2b %k1, %zmm0
2121 ; SKX-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
2122 ; SKX-NEXT: vpmovm2b %k0, %zmm0
2123 ; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)
2124 ; SKX-NEXT: setne (%rsp,%rax)
2125 ; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
2126 ; SKX-NEXT: vpmovb2m %zmm0, %k0
2127 ; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
2128 ; SKX-NEXT: vpmovb2m %zmm0, %k1
2129 ; SKX-NEXT: kmovq %k1, %rax
2130 ; SKX-NEXT: kmovq %k0, %rdx
2131 ; SKX-NEXT: movq %rbp, %rsp
2132 ; SKX-NEXT: popq %rbp
2133 ; SKX-NEXT: vzeroupper
2135 %t1 = icmp ugt <96 x i8> %a, zeroinitializer
2136 %t2 = icmp ugt i8 %b, 0
2137 %t3 = insertelement <96 x i1> %t1, i1 %t2, i32 %index
2138 %t4 = bitcast <96 x i1> %t3 to i96
; Variable-index insertelement into a <128 x i1> mask.
;
;   %a     - <128 x i8>; every lane is compared (unsigned) against zero to
;            form the initial 128-bit mask.
;   %b     - i8; the bit that gets inserted is (%b ugt 0).
;   %index - i32 lane number that is only known at run time.
;
; The resulting mask is bitcast to i128; the function is declared to return
; i128 and both expected sequences finish by filling %rax and %rdx
; (NOTE(review): the ret line itself is not visible in this excerpt).
;
; Both expected sequences perform the insert through memory: the mask is
; widened to one byte per lane and stored to a 128-byte aligned stack area
; (andq $-128, %rsp), the index is masked with 127, a single setne overwrites
; the selected byte, and the bytes are then converted back into mask bits.
;   * On the KNL run (AVX512F only, see the RUN lines) the bytes are produced
;     with vpcmpeqb + vpternlogq on four ymm registers and read back sixteen
;     lanes at a time with vpmovsxbd / vpslld / vptestmd / kmovw, then merged
;     with shifts and ors.
;   * On the SKX runs (BW/VL/DQ enabled) vptestmb / vpmovm2b create the bytes
;     in two zmm registers and vpsllw $7 + vpmovb2m + kmovq read them back.
2142 define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index) {
2143 ; KNL-LABEL: test_insertelement_variable_v128i1:
2145 ; KNL-NEXT: pushq %rbp
2146 ; KNL-NEXT: .cfi_def_cfa_offset 16
2147 ; KNL-NEXT: .cfi_offset %rbp, -16
2148 ; KNL-NEXT: movq %rsp, %rbp
2149 ; KNL-NEXT: .cfi_def_cfa_register %rbp
2150 ; KNL-NEXT: andq $-128, %rsp
2151 ; KNL-NEXT: subq $256, %rsp ## imm = 0x100
2152 ; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
2153 ; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4
2154 ; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
2155 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
2156 ; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1
2157 ; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
2158 ; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2
2159 ; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
2160 ; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3
2161 ; KNL-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
2162 ; KNL-NEXT: andl $127, %esi
2163 ; KNL-NEXT: testb %dil, %dil
2164 ; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp)
2165 ; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
2166 ; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
2167 ; KNL-NEXT: vmovdqa %ymm0, (%rsp)
2168 ; KNL-NEXT: setne (%rsp,%rsi)
2169 ; KNL-NEXT: vpmovsxbd (%rsp), %zmm0
2170 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
2171 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
2172 ; KNL-NEXT: kmovw %k0, %eax
2173 ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
2174 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
2175 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
2176 ; KNL-NEXT: kmovw %k0, %ecx
2177 ; KNL-NEXT: shll $16, %ecx
2178 ; KNL-NEXT: orl %eax, %ecx
2179 ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
2180 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
2181 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
2182 ; KNL-NEXT: kmovw %k0, %edx
2183 ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
2184 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
2185 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
2186 ; KNL-NEXT: kmovw %k0, %eax
2187 ; KNL-NEXT: shll $16, %eax
2188 ; KNL-NEXT: orl %edx, %eax
2189 ; KNL-NEXT: shlq $32, %rax
2190 ; KNL-NEXT: orq %rcx, %rax
2191 ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
2192 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
2193 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
2194 ; KNL-NEXT: kmovw %k0, %ecx
2195 ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
2196 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
2197 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
2198 ; KNL-NEXT: kmovw %k0, %esi
2199 ; KNL-NEXT: shll $16, %esi
2200 ; KNL-NEXT: orl %ecx, %esi
2201 ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
2202 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
2203 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
2204 ; KNL-NEXT: kmovw %k0, %ecx
2205 ; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
2206 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0
2207 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
2208 ; KNL-NEXT: kmovw %k0, %edx
2209 ; KNL-NEXT: shll $16, %edx
2210 ; KNL-NEXT: orl %ecx, %edx
2211 ; KNL-NEXT: shlq $32, %rdx
2212 ; KNL-NEXT: orq %rsi, %rdx
2213 ; KNL-NEXT: movq %rbp, %rsp
2214 ; KNL-NEXT: popq %rbp
2215 ; KNL-NEXT: vzeroupper
2218 ; SKX-LABEL: test_insertelement_variable_v128i1:
2220 ; SKX-NEXT: pushq %rbp
2221 ; SKX-NEXT: .cfi_def_cfa_offset 16
2222 ; SKX-NEXT: .cfi_offset %rbp, -16
2223 ; SKX-NEXT: movq %rsp, %rbp
2224 ; SKX-NEXT: .cfi_def_cfa_register %rbp
2225 ; SKX-NEXT: andq $-128, %rsp
2226 ; SKX-NEXT: subq $256, %rsp ## imm = 0x100
2227 ; SKX-NEXT: ## kill: def $esi killed $esi def $rsi
2228 ; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0
2229 ; SKX-NEXT: vptestmb %zmm1, %zmm1, %k1
2230 ; SKX-NEXT: andl $127, %esi
2231 ; SKX-NEXT: testb %dil, %dil
2232 ; SKX-NEXT: vpmovm2b %k1, %zmm0
2233 ; SKX-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
2234 ; SKX-NEXT: vpmovm2b %k0, %zmm0
2235 ; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)
2236 ; SKX-NEXT: setne (%rsp,%rsi)
2237 ; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
2238 ; SKX-NEXT: vpmovb2m %zmm0, %k0
2239 ; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
2240 ; SKX-NEXT: vpmovb2m %zmm0, %k1
2241 ; SKX-NEXT: kmovq %k1, %rax
2242 ; SKX-NEXT: kmovq %k0, %rdx
2243 ; SKX-NEXT: movq %rbp, %rsp
2244 ; SKX-NEXT: popq %rbp
2245 ; SKX-NEXT: vzeroupper
; Lane i of %t1 is set when byte i of %a is non-zero (unsigned greater than 0);
; %t2 is the same test applied to the scalar %b.
2247 %t1 = icmp ugt <128 x i8> %a, zeroinitializer
2248 %t2 = icmp ugt i8 %b, 0
; %index is a function argument, so the destination lane is not a constant.
2249 %t3 = insertelement <128 x i1> %t1, i1 %t2, i32 %index
2250 %t4 = bitcast <128 x i1> %t3 to i128
2254 define void @test_concat_v2i1(<2 x half>* %arg, <2 x half>* %arg1, <2 x half>* %arg2) {
2255 ; KNL-LABEL: test_concat_v2i1:
2257 ; KNL-NEXT: movswl (%rdi), %eax
2258 ; KNL-NEXT: vmovd %eax, %xmm0
2259 ; KNL-NEXT: vcvtph2ps %xmm0, %xmm0
2260 ; KNL-NEXT: movswl 2(%rdi), %eax
2261 ; KNL-NEXT: vmovd %eax, %xmm1
2262 ; KNL-NEXT: vcvtph2ps %xmm1, %xmm1
2263 ; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2264 ; KNL-NEXT: vucomiss %xmm2, %xmm1
2265 ; KNL-NEXT: setb %al
2266 ; KNL-NEXT: kmovw %eax, %k0
2267 ; KNL-NEXT: kshiftlw $1, %k0, %k0
2268 ; KNL-NEXT: vucomiss %xmm2, %xmm0
2269 ; KNL-NEXT: setb %al
2270 ; KNL-NEXT: andl $1, %eax
2271 ; KNL-NEXT: kmovw %eax, %k1
2272 ; KNL-NEXT: korw %k0, %k1, %k0
2273 ; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2
2274 ; KNL-NEXT: vucomiss %xmm2, %xmm1
2275 ; KNL-NEXT: seta %al
2276 ; KNL-NEXT: kmovw %eax, %k1
2277 ; KNL-NEXT: kshiftlw $1, %k1, %k1
2278 ; KNL-NEXT: vucomiss %xmm2, %xmm0
2279 ; KNL-NEXT: seta %al
2280 ; KNL-NEXT: andl $1, %eax
2281 ; KNL-NEXT: kmovw %eax, %k2
2282 ; KNL-NEXT: korw %k1, %k2, %k1
2283 ; KNL-NEXT: kandw %k1, %k0, %k1
2284 ; KNL-NEXT: kshiftrw $1, %k1, %k2
2285 ; KNL-NEXT: movswl (%rsi), %eax
2286 ; KNL-NEXT: vmovd %eax, %xmm0
2287 ; KNL-NEXT: vcvtph2ps %xmm0, %xmm0
2288 ; KNL-NEXT: movswl 2(%rsi), %eax
2289 ; KNL-NEXT: vmovd %eax, %xmm1
2290 ; KNL-NEXT: vcvtph2ps %xmm1, %xmm1
2291 ; KNL-NEXT: vmovss %xmm1, %xmm0, %xmm1 {%k2} {z}
2292 ; KNL-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k1} {z}
2293 ; KNL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2294 ; KNL-NEXT: vmovd %xmm0, %eax
2295 ; KNL-NEXT: movw %ax, (%rdx)
2296 ; KNL-NEXT: vcvtps2ph $4, %xmm1, %xmm0
2297 ; KNL-NEXT: vmovd %xmm0, %eax
2298 ; KNL-NEXT: movw %ax, 2(%rdx)
2301 ; SKX-LABEL: test_concat_v2i1:
2303 ; SKX-NEXT: movswl (%rdi), %eax
2304 ; SKX-NEXT: vmovd %eax, %xmm0
2305 ; SKX-NEXT: vcvtph2ps %xmm0, %xmm0
2306 ; SKX-NEXT: movswl 2(%rdi), %eax
2307 ; SKX-NEXT: vmovd %eax, %xmm1
2308 ; SKX-NEXT: vcvtph2ps %xmm1, %xmm1
2309 ; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2310 ; SKX-NEXT: vucomiss %xmm2, %xmm1
2311 ; SKX-NEXT: setb %al
2312 ; SKX-NEXT: kmovd %eax, %k0
2313 ; SKX-NEXT: kshiftlb $1, %k0, %k0
2314 ; SKX-NEXT: vucomiss %xmm2, %xmm0
2315 ; SKX-NEXT: setb %al
2316 ; SKX-NEXT: kmovd %eax, %k1
2317 ; SKX-NEXT: kshiftlb $7, %k1, %k1
2318 ; SKX-NEXT: kshiftrb $7, %k1, %k1
2319 ; SKX-NEXT: korw %k0, %k1, %k0
2320 ; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2
2321 ; SKX-NEXT: vucomiss %xmm2, %xmm1
2322 ; SKX-NEXT: seta %al
2323 ; SKX-NEXT: kmovd %eax, %k1
2324 ; SKX-NEXT: kshiftlb $1, %k1, %k1
2325 ; SKX-NEXT: vucomiss %xmm2, %xmm0
2326 ; SKX-NEXT: seta %al
2327 ; SKX-NEXT: kmovd %eax, %k2
2328 ; SKX-NEXT: kshiftlb $7, %k2, %k2
2329 ; SKX-NEXT: kshiftrb $7, %k2, %k2
2330 ; SKX-NEXT: korw %k1, %k2, %k1
2331 ; SKX-NEXT: kandw %k1, %k0, %k1
2332 ; SKX-NEXT: kshiftrb $1, %k1, %k2
2333 ; SKX-NEXT: movswl (%rsi), %eax
2334 ; SKX-NEXT: vmovd %eax, %xmm0
2335 ; SKX-NEXT: vcvtph2ps %xmm0, %xmm0
2336 ; SKX-NEXT: movswl 2(%rsi), %eax
2337 ; SKX-NEXT: vmovd %eax, %xmm1
2338 ; SKX-NEXT: vcvtph2ps %xmm1, %xmm1
2339 ; SKX-NEXT: vmovss %xmm1, %xmm0, %xmm1 {%k2} {z}
2340 ; SKX-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k1} {z}
2341 ; SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2342 ; SKX-NEXT: vmovd %xmm0, %eax
2343 ; SKX-NEXT: movw %ax, (%rdx)
2344 ; SKX-NEXT: vcvtps2ph $4, %xmm1, %xmm0
2345 ; SKX-NEXT: vmovd %xmm0, %eax
2346 ; SKX-NEXT: movw %ax, 2(%rdx)
2348 %tmp = load <2 x half>, <2 x half>* %arg, align 8
2349 %tmp3 = fcmp fast olt <2 x half> %tmp, <half 0xH4600, half 0xH4600>
2350 %tmp4 = fcmp fast ogt <2 x half> %tmp, zeroinitializer
2351 %tmp5 = and <2 x i1> %tmp3, %tmp4
2352 %tmp6 = load <2 x half>, <2 x half>* %arg1, align 8
2353 %tmp7 = select <2 x i1> %tmp5, <2 x half> %tmp6, <2 x half> zeroinitializer
2354 store <2 x half> %tmp7, <2 x half>* %arg2, align 8