1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
4 ; Make sure CPUs default to prefer-256-bit. avx512vnni isn't interesting as it just adds an isel peephole for vpmaddwd+vpaddd
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cascadelake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cooperlake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=cannonlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-client | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
13 ; This file primarily contains tests for specific places in X86ISelLowering.cpp that needed to be made aware of the legalizer not allowing 512-bit vectors due to prefer-256-bit even though AVX512 is enabled.
; add256: loads a <16 x i32> vector from each of %a and %b, adds them, and
; stores the sum to %c. The function is tagged "min-legal-vector-width"="256";
; the assertions below expect the add to be emitted as two 256-bit (ymm)
; halves rather than one 512-bit operation.
15 define dso_local void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "min-legal-vector-width"="256" {
16 ; CHECK-LABEL: add256:
18 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
19 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
20 ; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1
21 ; CHECK-NEXT: vpaddd (%rsi), %ymm0, %ymm0
22 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
23 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
24 ; CHECK-NEXT: vzeroupper
26 %d = load <16 x i32>, <16 x i32>* %a
27 %e = load <16 x i32>, <16 x i32>* %b
28 %f = add <16 x i32> %d, %e
29 store <16 x i32> %f, <16 x i32>* %c
; add512: same IR body as add256 (load two <16 x i32> vectors, add, store to
; %c) but tagged "min-legal-vector-width"="512"; the assertions below expect a
; single 512-bit (zmm) vpaddd.
33 define dso_local void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "min-legal-vector-width"="512" {
34 ; CHECK-LABEL: add512:
36 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
37 ; CHECK-NEXT: vpaddd (%rsi), %zmm0, %zmm0
38 ; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx)
39 ; CHECK-NEXT: vzeroupper
41 %d = load <16 x i32>, <16 x i32>* %a
42 %e = load <16 x i32>, <16 x i32>* %b
43 %f = add <16 x i32> %d, %e
44 store <16 x i32> %f, <16 x i32>* %c
48 define dso_local void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="256" {
49 ; CHECK-LABEL: avg_v64i8_256:
51 ; CHECK-NEXT: vmovdqa (%rsi), %ymm0
52 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1
53 ; CHECK-NEXT: vpavgb (%rdi), %ymm0, %ymm0
54 ; CHECK-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
55 ; CHECK-NEXT: vmovdqu %ymm1, (%rax)
56 ; CHECK-NEXT: vmovdqu %ymm0, (%rax)
57 ; CHECK-NEXT: vzeroupper
59 %1 = load <64 x i8>, <64 x i8>* %a
60 %2 = load <64 x i8>, <64 x i8>* %b
61 %3 = zext <64 x i8> %1 to <64 x i32>
62 %4 = zext <64 x i8> %2 to <64 x i32>
63 %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
64 %6 = add nuw nsw <64 x i32> %5, %4
65 %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
66 %8 = trunc <64 x i32> %7 to <64 x i8>
67 store <64 x i8> %8, <64 x i8>* undef, align 4
72 define dso_local void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="512" {
73 ; CHECK-LABEL: avg_v64i8_512:
75 ; CHECK-NEXT: vmovdqa64 (%rsi), %zmm0
76 ; CHECK-NEXT: vpavgb (%rdi), %zmm0, %zmm0
77 ; CHECK-NEXT: vmovdqu64 %zmm0, (%rax)
78 ; CHECK-NEXT: vzeroupper
80 %1 = load <64 x i8>, <64 x i8>* %a
81 %2 = load <64 x i8>, <64 x i8>* %b
82 %3 = zext <64 x i8> %1 to <64 x i32>
83 %4 = zext <64 x i8> %2 to <64 x i32>
84 %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
85 %6 = add nuw nsw <64 x i32> %5, %4
86 %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
87 %8 = trunc <64 x i32> %7 to <64 x i8>
88 store <64 x i8> %8, <64 x i8>* undef, align 4
; pmaddwd_32_256: multiply-add pattern. Two <32 x i16> inputs are
; sign-extended to i32 and multiplied; the even-indexed and odd-indexed
; products are then extracted with shuffles and added pairwise, and the
; <16 x i32> result is stored to %CPtr.
; Note: the value named %odd selects lanes 0,2,4,... and %even selects lanes
; 1,3,5,...; the names are swapped relative to the lane indices, which does
; not matter because the two are only added together.
; With "min-legal-vector-width"="256" the assertions below expect two 256-bit
; (ymm) vpmaddwd instructions.
92 define dso_local void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "min-legal-vector-width"="256" {
93 ; CHECK-LABEL: pmaddwd_32_256:
95 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
96 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
97 ; CHECK-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1
98 ; CHECK-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0
99 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
100 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
101 ; CHECK-NEXT: vzeroupper
103 %A = load <32 x i16>, <32 x i16>* %APtr
104 %B = load <32 x i16>, <32 x i16>* %BPtr
105 %a = sext <32 x i16> %A to <32 x i32>
106 %b = sext <32 x i16> %B to <32 x i32>
107 %m = mul nsw <32 x i32> %a, %b
108 %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
109 %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
110 %ret = add <16 x i32> %odd, %even
111 store <16 x i32> %ret, <16 x i32>* %CPtr
; pmaddwd_32_512: same multiply-add IR as pmaddwd_32_256 (sext i16 to i32,
; multiply, add the even-lane and odd-lane products, store to %CPtr) but
; tagged "min-legal-vector-width"="512"; the assertions below expect a single
; 512-bit (zmm) vpmaddwd.
115 define dso_local void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "min-legal-vector-width"="512" {
116 ; CHECK-LABEL: pmaddwd_32_512:
118 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
119 ; CHECK-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0
120 ; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx)
121 ; CHECK-NEXT: vzeroupper
123 %A = load <32 x i16>, <32 x i16>* %APtr
124 %B = load <32 x i16>, <32 x i16>* %BPtr
125 %a = sext <32 x i16> %A to <32 x i32>
126 %b = sext <32 x i16> %B to <32 x i32>
127 %m = mul nsw <32 x i32> %a, %b
128 %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
129 %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
130 %ret = add <16 x i32> %odd, %even
131 store <16 x i32> %ret, <16 x i32>* %CPtr
; psubus_64i8_max_256: unsigned saturating-subtract pattern on <64 x i8>.
; An unsigned compare plus select picks the larger of %x and %y per lane, and
; %y is then subtracted from that maximum; the result is stored to %zptr.
; With "min-legal-vector-width"="256" the assertions below expect two 256-bit
; (ymm) vpsubusb instructions.
135 define dso_local void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "min-legal-vector-width"="256" {
136 ; CHECK-LABEL: psubus_64i8_max_256:
138 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
139 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
140 ; CHECK-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1
141 ; CHECK-NEXT: vpsubusb (%rsi), %ymm0, %ymm0
142 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
143 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
144 ; CHECK-NEXT: vzeroupper
146 %x = load <64 x i8>, <64 x i8>* %xptr
147 %y = load <64 x i8>, <64 x i8>* %yptr
148 %cmp = icmp ult <64 x i8> %x, %y
149 %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
150 %res = sub <64 x i8> %max, %y
151 store <64 x i8> %res, <64 x i8>* %zptr
; psubus_64i8_max_512: same IR as psubus_64i8_max_256 (unsigned max of %x
; and %y via icmp ult + select, minus %y, stored to %zptr) but tagged
; "min-legal-vector-width"="512"; the assertions below expect a single 512-bit
; (zmm) vpsubusb.
155 define dso_local void @psubus_64i8_max_512(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "min-legal-vector-width"="512" {
156 ; CHECK-LABEL: psubus_64i8_max_512:
158 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
159 ; CHECK-NEXT: vpsubusb (%rsi), %zmm0, %zmm0
160 ; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx)
161 ; CHECK-NEXT: vzeroupper
163 %x = load <64 x i8>, <64 x i8>* %xptr
164 %y = load <64 x i8>, <64 x i8>* %yptr
165 %cmp = icmp ult <64 x i8> %x, %y
166 %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
167 %res = sub <64 x i8> %max, %y
168 store <64 x i8> %res, <64 x i8>* %zptr
172 define dso_local i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) "min-legal-vector-width"="256" {
173 ; CHECK-LABEL: _Z9test_charPcS_i_256:
174 ; CHECK: # %bb.0: # %entry
175 ; CHECK-NEXT: movl %edx, %eax
176 ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
177 ; CHECK-NEXT: xorl %ecx, %ecx
178 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
179 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
180 ; CHECK-NEXT: .p2align 4, 0x90
181 ; CHECK-NEXT: .LBB8_1: # %vector.body
182 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
183 ; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
184 ; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
185 ; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
186 ; CHECK-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
187 ; CHECK-NEXT: vpaddd %ymm2, %ymm3, %ymm2
188 ; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
189 ; CHECK-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
190 ; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm1
191 ; CHECK-NEXT: addq $32, %rcx
192 ; CHECK-NEXT: cmpq %rcx, %rax
193 ; CHECK-NEXT: jne .LBB8_1
194 ; CHECK-NEXT: # %bb.2: # %middle.block
195 ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm1
196 ; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
197 ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
198 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
199 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
200 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
201 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
202 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
203 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
204 ; CHECK-NEXT: vmovd %xmm0, %eax
205 ; CHECK-NEXT: vzeroupper
208 %3 = zext i32 %2 to i64
209 br label %vector.body
212 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
213 %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
214 %4 = getelementptr inbounds i8, i8* %0, i64 %index
215 %5 = bitcast i8* %4 to <32 x i8>*
216 %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
217 %6 = sext <32 x i8> %wide.load to <32 x i32>
218 %7 = getelementptr inbounds i8, i8* %1, i64 %index
219 %8 = bitcast i8* %7 to <32 x i8>*
220 %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
221 %9 = sext <32 x i8> %wide.load14 to <32 x i32>
222 %10 = mul nsw <32 x i32> %9, %6
223 %11 = add nsw <32 x i32> %10, %vec.phi
224 %index.next = add i64 %index, 32
225 %12 = icmp eq i64 %index.next, %3
226 br i1 %12, label %middle.block, label %vector.body
229 %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
230 %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
231 %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
232 %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
233 %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
234 %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
235 %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
236 %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
237 %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
238 %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
239 %13 = extractelement <32 x i32> %bin.rdx20, i32 0
243 define dso_local i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) "min-legal-vector-width"="512" {
244 ; CHECK-LABEL: _Z9test_charPcS_i_512:
245 ; CHECK: # %bb.0: # %entry
246 ; CHECK-NEXT: movl %edx, %eax
247 ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
248 ; CHECK-NEXT: xorl %ecx, %ecx
249 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
250 ; CHECK-NEXT: .p2align 4, 0x90
251 ; CHECK-NEXT: .LBB9_1: # %vector.body
252 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
253 ; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2
254 ; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3
255 ; CHECK-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2
256 ; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
257 ; CHECK-NEXT: addq $32, %rcx
258 ; CHECK-NEXT: cmpq %rcx, %rax
259 ; CHECK-NEXT: jne .LBB9_1
260 ; CHECK-NEXT: # %bb.2: # %middle.block
261 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
262 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
263 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
264 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
265 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
266 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
267 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
268 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
269 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
270 ; CHECK-NEXT: vmovd %xmm0, %eax
271 ; CHECK-NEXT: vzeroupper
274 %3 = zext i32 %2 to i64
275 br label %vector.body
278 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
279 %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
280 %4 = getelementptr inbounds i8, i8* %0, i64 %index
281 %5 = bitcast i8* %4 to <32 x i8>*
282 %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
283 %6 = sext <32 x i8> %wide.load to <32 x i32>
284 %7 = getelementptr inbounds i8, i8* %1, i64 %index
285 %8 = bitcast i8* %7 to <32 x i8>*
286 %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
287 %9 = sext <32 x i8> %wide.load14 to <32 x i32>
288 %10 = mul nsw <32 x i32> %9, %6
289 %11 = add nsw <32 x i32> %10, %vec.phi
290 %index.next = add i64 %index, 32
291 %12 = icmp eq i64 %index.next, %3
292 br i1 %12, label %middle.block, label %vector.body
295 %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
296 %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
297 %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
298 %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
299 %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
300 %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
301 %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
302 %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
303 %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
304 %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
305 %13 = extractelement <32 x i32> %bin.rdx20, i32 0
; Zero-initialized 1024-byte global buffers, 16-byte aligned. They are the
; two input arrays read by the sad_16i8_256 and sad_16i8_512 loops.
309 @a = dso_local global [1024 x i8] zeroinitializer, align 16
310 @b = dso_local global [1024 x i8] zeroinitializer, align 16
; sad_16i8_256: sum-of-absolute-differences reduction loop over the globals
; @a and @b. Each iteration loads <16 x i8> from both arrays at %index,
; zero-extends to <16 x i32>, subtracts, takes the absolute value via
; icmp sgt -1 + negate + select, and accumulates into %vec.phi. %index
; advances by 4 per iteration until it reaches 1024. After the loop, a
; shuffle/add ladder (8, 4, 2, 1 lanes) folds the accumulator and lane 0 is
; extracted.
; With "min-legal-vector-width"="256" the assertions below expect vpsadbw on
; xmm registers with the running sum kept in a 256-bit (ymm) register.
312 define dso_local i32 @sad_16i8_256() "min-legal-vector-width"="256" {
313 ; CHECK-LABEL: sad_16i8_256:
314 ; CHECK: # %bb.0: # %entry
315 ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
316 ; CHECK-NEXT: movq $-1024, %rax # imm = 0xFC00
317 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
318 ; CHECK-NEXT: .p2align 4, 0x90
319 ; CHECK-NEXT: .LBB10_1: # %vector.body
320 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
321 ; CHECK-NEXT: vmovdqu a+1024(%rax), %xmm2
322 ; CHECK-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
323 ; CHECK-NEXT: vpaddd %ymm1, %ymm2, %ymm1
324 ; CHECK-NEXT: addq $4, %rax
325 ; CHECK-NEXT: jne .LBB10_1
326 ; CHECK-NEXT: # %bb.2: # %middle.block
327 ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
328 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
329 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
330 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
331 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
332 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
333 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
334 ; CHECK-NEXT: vmovd %xmm0, %eax
335 ; CHECK-NEXT: vzeroupper
338 br label %vector.body
341 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
342 %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
343 %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
344 %1 = bitcast i8* %0 to <16 x i8>*
345 %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
346 %2 = zext <16 x i8> %wide.load to <16 x i32>
347 %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
348 %4 = bitcast i8* %3 to <16 x i8>*
349 %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
350 %5 = zext <16 x i8> %wide.load1 to <16 x i32>
351 %6 = sub nsw <16 x i32> %2, %5
352 %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
353 %8 = sub nsw <16 x i32> zeroinitializer, %6
354 %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
355 %10 = add nsw <16 x i32> %9, %vec.phi
356 %index.next = add i64 %index, 4
357 %11 = icmp eq i64 %index.next, 1024
358 br i1 %11, label %middle.block, label %vector.body
361 %rdx.shuf = shufflevector <16 x i32> %10, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
362 %bin.rdx = add <16 x i32> %10, %rdx.shuf
363 %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
364 %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
365 %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
366 %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
367 %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
368 %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
369 %12 = extractelement <16 x i32> %bin.rdx4, i32 0
; sad_16i8_512: same sum-of-absolute-differences IR as sad_16i8_256 (load
; <16 x i8> from @a and @b, zext, subtract, absolute value via select,
; accumulate, then a shuffle/add ladder and extract of lane 0) but tagged
; "min-legal-vector-width"="512"; the assertions below expect vpsadbw on xmm
; registers with the running sum kept in a 512-bit (zmm) register.
373 define dso_local i32 @sad_16i8_512() "min-legal-vector-width"="512" {
374 ; CHECK-LABEL: sad_16i8_512:
375 ; CHECK: # %bb.0: # %entry
376 ; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
377 ; CHECK-NEXT: movq $-1024, %rax # imm = 0xFC00
378 ; CHECK-NEXT: .p2align 4, 0x90
379 ; CHECK-NEXT: .LBB11_1: # %vector.body
380 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
381 ; CHECK-NEXT: vmovdqu a+1024(%rax), %xmm1
382 ; CHECK-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
383 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
384 ; CHECK-NEXT: addq $4, %rax
385 ; CHECK-NEXT: jne .LBB11_1
386 ; CHECK-NEXT: # %bb.2: # %middle.block
387 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
388 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
389 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
390 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
391 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
392 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
393 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
394 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
395 ; CHECK-NEXT: vmovd %xmm0, %eax
396 ; CHECK-NEXT: vzeroupper
399 br label %vector.body
402 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
403 %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
404 %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
405 %1 = bitcast i8* %0 to <16 x i8>*
406 %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
407 %2 = zext <16 x i8> %wide.load to <16 x i32>
408 %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
409 %4 = bitcast i8* %3 to <16 x i8>*
410 %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
411 %5 = zext <16 x i8> %wide.load1 to <16 x i32>
412 %6 = sub nsw <16 x i32> %2, %5
413 %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
414 %8 = sub nsw <16 x i32> zeroinitializer, %6
415 %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
416 %10 = add nsw <16 x i32> %9, %vec.phi
417 %index.next = add i64 %index, 4
418 %11 = icmp eq i64 %index.next, 1024
419 br i1 %11, label %middle.block, label %vector.body
422 %rdx.shuf = shufflevector <16 x i32> %10, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
423 %bin.rdx = add <16 x i32> %10, %rdx.shuf
424 %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
425 %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
426 %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
427 %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
428 %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
429 %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
430 %12 = extractelement <16 x i32> %bin.rdx4, i32 0
; sbto16f32_256: signed i1-to-float conversion. Builds a <16 x i1> mask from
; "%a < 0", converts it with sitofp to <16 x float> and stores it to %res.
; With "min-legal-vector-width"="256" the assertions below expect the mask to
; be split with kshiftrw and converted as two 256-bit (ymm) halves.
434 define dso_local void @sbto16f32_256(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="256" {
435 ; CHECK-LABEL: sbto16f32_256:
437 ; CHECK-NEXT: vpmovw2m %ymm0, %k0
438 ; CHECK-NEXT: kshiftrw $8, %k0, %k1
439 ; CHECK-NEXT: vpmovm2d %k1, %ymm0
440 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
441 ; CHECK-NEXT: vpmovm2d %k0, %ymm1
442 ; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
443 ; CHECK-NEXT: vmovaps %ymm1, (%rdi)
444 ; CHECK-NEXT: vmovaps %ymm0, 32(%rdi)
445 ; CHECK-NEXT: vzeroupper
447 %mask = icmp slt <16 x i16> %a, zeroinitializer
448 %1 = sitofp <16 x i1> %mask to <16 x float>
449 store <16 x float> %1, <16 x float>* %res
; sbto16f32_512: same IR as sbto16f32_256 (mask from "%a < 0", sitofp to
; <16 x float>, store to %res) but tagged "min-legal-vector-width"="512"; the
; assertions below expect a single 512-bit (zmm) vpmovm2d + vcvtdq2ps.
453 define dso_local void @sbto16f32_512(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="512" {
454 ; CHECK-LABEL: sbto16f32_512:
456 ; CHECK-NEXT: vpmovw2m %ymm0, %k0
457 ; CHECK-NEXT: vpmovm2d %k0, %zmm0
458 ; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0
459 ; CHECK-NEXT: vmovaps %zmm0, (%rdi)
460 ; CHECK-NEXT: vzeroupper
462 %mask = icmp slt <16 x i16> %a, zeroinitializer
463 %1 = sitofp <16 x i1> %mask to <16 x float>
464 store <16 x float> %1, <16 x float>* %res
; sbto16f64_256: signed i1-to-double conversion. Builds a <16 x i1> mask from
; "%a < 0", converts it with sitofp to <16 x double> and stores it to %res.
; With "min-legal-vector-width"="256" the assertions below expect four
; xmm-to-ymm vcvtdq2pd conversions and four 256-bit stores.
468 define dso_local void @sbto16f64_256(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="256" {
469 ; CHECK-LABEL: sbto16f64_256:
471 ; CHECK-NEXT: vpmovw2m %ymm0, %k0
472 ; CHECK-NEXT: kshiftrw $8, %k0, %k1
473 ; CHECK-NEXT: vpmovm2d %k1, %ymm0
474 ; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1
475 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
476 ; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
477 ; CHECK-NEXT: vpmovm2d %k0, %ymm2
478 ; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3
479 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
480 ; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2
481 ; CHECK-NEXT: vmovaps %ymm2, 32(%rdi)
482 ; CHECK-NEXT: vmovaps %ymm3, (%rdi)
483 ; CHECK-NEXT: vmovaps %ymm0, 96(%rdi)
484 ; CHECK-NEXT: vmovaps %ymm1, 64(%rdi)
485 ; CHECK-NEXT: vzeroupper
487 %mask = icmp slt <16 x i16> %a, zeroinitializer
488 %1 = sitofp <16 x i1> %mask to <16 x double>
489 store <16 x double> %1, <16 x double>* %res
; sbto16f64_512: same IR as sbto16f64_256 (mask from "%a < 0", sitofp to
; <16 x double>, store to %res) but tagged "min-legal-vector-width"="512"; the
; assertions below expect two ymm-to-zmm vcvtdq2pd conversions and two 512-bit
; stores.
493 define dso_local void @sbto16f64_512(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="512" {
494 ; CHECK-LABEL: sbto16f64_512:
496 ; CHECK-NEXT: vpmovw2m %ymm0, %k0
497 ; CHECK-NEXT: vpmovm2d %k0, %zmm0
498 ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1
499 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
500 ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0
501 ; CHECK-NEXT: vmovaps %zmm0, 64(%rdi)
502 ; CHECK-NEXT: vmovaps %zmm1, (%rdi)
503 ; CHECK-NEXT: vzeroupper
505 %mask = icmp slt <16 x i16> %a, zeroinitializer
506 %1 = sitofp <16 x i1> %mask to <16 x double>
507 store <16 x double> %1, <16 x double>* %res
; ubto16f32_256: unsigned i1-to-float conversion. Builds a <16 x i1> mask
; from "%a < 0", converts it with uitofp to <16 x float> and stores it to
; %res. With "min-legal-vector-width"="256" the assertions below expect two
; 256-bit (ymm) halves, each with a vpsrld $31 before the vcvtdq2ps.
511 define dso_local void @ubto16f32_256(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="256" {
512 ; CHECK-LABEL: ubto16f32_256:
514 ; CHECK-NEXT: vpmovw2m %ymm0, %k0
515 ; CHECK-NEXT: kshiftrw $8, %k0, %k1
516 ; CHECK-NEXT: vpmovm2d %k1, %ymm0
517 ; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0
518 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
519 ; CHECK-NEXT: vpmovm2d %k0, %ymm1
520 ; CHECK-NEXT: vpsrld $31, %ymm1, %ymm1
521 ; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
522 ; CHECK-NEXT: vmovaps %ymm1, (%rdi)
523 ; CHECK-NEXT: vmovaps %ymm0, 32(%rdi)
524 ; CHECK-NEXT: vzeroupper
526 %mask = icmp slt <16 x i16> %a, zeroinitializer
527 %1 = uitofp <16 x i1> %mask to <16 x float>
528 store <16 x float> %1, <16 x float>* %res
; ubto16f32_512: same IR as ubto16f32_256 (mask from "%a < 0", uitofp to
; <16 x float>, store to %res) but tagged "min-legal-vector-width"="512"; the
; assertions below expect a single 512-bit (zmm) vpmovm2d, vpsrld $31 and
; vcvtdq2ps.
532 define dso_local void @ubto16f32_512(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="512" {
533 ; CHECK-LABEL: ubto16f32_512:
535 ; CHECK-NEXT: vpmovw2m %ymm0, %k0
536 ; CHECK-NEXT: vpmovm2d %k0, %zmm0
537 ; CHECK-NEXT: vpsrld $31, %zmm0, %zmm0
538 ; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0
539 ; CHECK-NEXT: vmovaps %zmm0, (%rdi)
540 ; CHECK-NEXT: vzeroupper
542 %mask = icmp slt <16 x i16> %a, zeroinitializer
543 %1 = uitofp <16 x i1> %mask to <16 x float>
544 store <16 x float> %1, <16 x float>* %res
; ubto16f64_256: unsigned i1-to-double conversion. Builds a <16 x i1> mask
; from "%a < 0", converts it with uitofp to <16 x double> and stores it to
; %res. With "min-legal-vector-width"="256" the assertions below expect a
; vpsrld $31 on each 256-bit mask half, followed by four xmm-to-ymm vcvtdq2pd
; conversions and four 256-bit stores.
548 define dso_local void @ubto16f64_256(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="256" {
549 ; CHECK-LABEL: ubto16f64_256:
551 ; CHECK-NEXT: vpmovw2m %ymm0, %k0
552 ; CHECK-NEXT: kshiftrw $8, %k0, %k1
553 ; CHECK-NEXT: vpmovm2d %k1, %ymm0
554 ; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0
555 ; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1
556 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
557 ; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
558 ; CHECK-NEXT: vpmovm2d %k0, %ymm2
559 ; CHECK-NEXT: vpsrld $31, %ymm2, %ymm2
560 ; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3
561 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
562 ; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2
563 ; CHECK-NEXT: vmovaps %ymm2, 32(%rdi)
564 ; CHECK-NEXT: vmovaps %ymm3, (%rdi)
565 ; CHECK-NEXT: vmovaps %ymm0, 96(%rdi)
566 ; CHECK-NEXT: vmovaps %ymm1, 64(%rdi)
567 ; CHECK-NEXT: vzeroupper
569 %mask = icmp slt <16 x i16> %a, zeroinitializer
570 %1 = uitofp <16 x i1> %mask to <16 x double>
571 store <16 x double> %1, <16 x double>* %res
; ubto16f64_512: same IR as ubto16f64_256 (mask from "%a < 0", uitofp to
; <16 x double>, store to %res) but tagged "min-legal-vector-width"="512"; the
; assertions below expect one zmm vpsrld $31, two ymm-to-zmm vcvtdq2pd
; conversions and two 512-bit stores.
575 define dso_local void @ubto16f64_512(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="512" {
576 ; CHECK-LABEL: ubto16f64_512:
578 ; CHECK-NEXT: vpmovw2m %ymm0, %k0
579 ; CHECK-NEXT: vpmovm2d %k0, %zmm0
580 ; CHECK-NEXT: vpsrld $31, %zmm0, %zmm0
581 ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1
582 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
583 ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0
584 ; CHECK-NEXT: vmovaps %zmm0, 64(%rdi)
585 ; CHECK-NEXT: vmovaps %zmm1, (%rdi)
586 ; CHECK-NEXT: vzeroupper
588 %mask = icmp slt <16 x i16> %a, zeroinitializer
589 %1 = uitofp <16 x i1> %mask to <16 x double>
590 store <16 x double> %1, <16 x double>* %res
; test_16f32toub_256: float-to-unsigned-i1 conversion used as a select mask.
; Loads <16 x float> from %ptr, converts it with fptoui to <16 x i1>, and
; returns %passthru in the lanes where the mask is set and zero elsewhere.
; With "min-legal-vector-width"="256" the assertions below expect two 256-bit
; (ymm) vcvttps2dq / vpslld $31 / vpmovd2m sequences whose masks are joined
; with kunpckbw before a zero-masked vmovdqu16.
594 define <16 x i16> @test_16f32toub_256(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="256" {
595 ; CHECK-LABEL: test_16f32toub_256:
597 ; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1
598 ; CHECK-NEXT: vpslld $31, %ymm1, %ymm1
599 ; CHECK-NEXT: vpmovd2m %ymm1, %k0
600 ; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm1
601 ; CHECK-NEXT: vpslld $31, %ymm1, %ymm1
602 ; CHECK-NEXT: vpmovd2m %ymm1, %k1
603 ; CHECK-NEXT: kunpckbw %k0, %k1, %k1
604 ; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
606 %a = load <16 x float>, <16 x float>* %ptr
607 %mask = fptoui <16 x float> %a to <16 x i1>
608 %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
609 ret <16 x i16> %select
; test_16f32toub_512: same IR as test_16f32toub_256 (load <16 x float>,
; fptoui to <16 x i1>, select %passthru or zero) but tagged
; "min-legal-vector-width"="512"; the assertions below expect a single 512-bit
; (zmm) vcvttps2dq / vpslld $31 / vpmovd2m sequence feeding the zero-masked
; vmovdqu16.
612 define <16 x i16> @test_16f32toub_512(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="512" {
613 ; CHECK-LABEL: test_16f32toub_512:
615 ; CHECK-NEXT: vcvttps2dq (%rdi), %zmm1
616 ; CHECK-NEXT: vpslld $31, %zmm1, %zmm1
617 ; CHECK-NEXT: vpmovd2m %zmm1, %k1
618 ; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
620 %a = load <16 x float>, <16 x float>* %ptr
621 %mask = fptoui <16 x float> %a to <16 x i1>
622 %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
623 ret <16 x i16> %select
; fptosi <16 x float> -> <16 x i1>, used as the condition of a select
; between %passthru and zero, with "min-legal-vector-width"="256".
; The checks expect two ymm conversions, each fed directly to vpmovd2m
; (no shift), with the two k-masks joined by kunpckbw.
626 define <16 x i16> @test_16f32tosb_256(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="256" {
627 ; CHECK-LABEL: test_16f32tosb_256:
629 ; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1
630 ; CHECK-NEXT: vpmovd2m %ymm1, %k0
631 ; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm1
632 ; CHECK-NEXT: vpmovd2m %ymm1, %k1
633 ; CHECK-NEXT: kunpckbw %k0, %k1, %k1
634 ; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
636 %a = load <16 x float>, <16 x float>* %ptr
637 %mask = fptosi <16 x float> %a to <16 x i1>
638 %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
639 ret <16 x i16> %select
; fptosi <16 x float> -> <16 x i1>, used as the condition of a select
; between %passthru and zero, with "min-legal-vector-width"="512".
; The checks expect a single zmm conversion fed directly to vpmovd2m.
642 define <16 x i16> @test_16f32tosb_512(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="512" {
643 ; CHECK-LABEL: test_16f32tosb_512:
645 ; CHECK-NEXT: vcvttps2dq (%rdi), %zmm1
646 ; CHECK-NEXT: vpmovd2m %zmm1, %k1
647 ; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
649 %a = load <16 x float>, <16 x float>* %ptr
650 %mask = fptosi <16 x float> %a to <16 x i1>
651 %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
652 ret <16 x i16> %select
; <64 x i8> multiply of the vectors loaded from %a and %b, stored to %c,
; with "min-legal-vector-width"="256". Both prefixes expect the work to be
; done on ymm halves: bytes are unpacked to words and multiplied with
; vpmullw. CHECK-AVX512 repacks with vpand + vpackuswb, while CHECK-VBMI
; repacks with vpermt2b.
655 define dso_local void @mul256(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-legal-vector-width"="256" {
656 ; CHECK-AVX512-LABEL: mul256:
657 ; CHECK-AVX512: # %bb.0:
658 ; CHECK-AVX512-NEXT: vmovdqa (%rdi), %ymm0
659 ; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
660 ; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2
661 ; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3
662 ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
663 ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
664 ; CHECK-AVX512-NEXT: vpmullw %ymm4, %ymm5, %ymm4
665 ; CHECK-AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
666 ; CHECK-AVX512-NEXT: vpand %ymm5, %ymm4, %ymm4
667 ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
668 ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
669 ; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm1, %ymm1
670 ; CHECK-AVX512-NEXT: vpand %ymm5, %ymm1, %ymm1
671 ; CHECK-AVX512-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
672 ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
673 ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
674 ; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm4, %ymm3
675 ; CHECK-AVX512-NEXT: vpand %ymm5, %ymm3, %ymm3
676 ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
677 ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
678 ; CHECK-AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0
679 ; CHECK-AVX512-NEXT: vpand %ymm5, %ymm0, %ymm0
680 ; CHECK-AVX512-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
681 ; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx)
682 ; CHECK-AVX512-NEXT: vmovdqa %ymm1, 32(%rdx)
683 ; CHECK-AVX512-NEXT: vzeroupper
684 ; CHECK-AVX512-NEXT: retq
686 ; CHECK-VBMI-LABEL: mul256:
687 ; CHECK-VBMI: # %bb.0:
688 ; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm0
689 ; CHECK-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
690 ; CHECK-VBMI-NEXT: vmovdqa (%rsi), %ymm2
691 ; CHECK-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
692 ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
693 ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
694 ; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4
695 ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
696 ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
697 ; CHECK-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1
698 ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
699 ; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm1
700 ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
701 ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
702 ; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4
703 ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
704 ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
705 ; CHECK-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm0
706 ; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm0
707 ; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx)
708 ; CHECK-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
709 ; CHECK-VBMI-NEXT: vzeroupper
710 ; CHECK-VBMI-NEXT: retq
711 %d = load <64 x i8>, <64 x i8>* %a
712 %e = load <64 x i8>, <64 x i8>* %b
713 %f = mul <64 x i8> %d, %e
714 store <64 x i8> %f, <64 x i8>* %c
; <64 x i8> multiply of the vectors loaded from %a and %b, stored to %c,
; with "min-legal-vector-width"="512". Both prefixes expect full zmm
; operations: bytes are unpacked to words and multiplied with vpmullw.
; CHECK-AVX512 repacks with vpandq + vpackuswb, while CHECK-VBMI repacks
; with a single vpermi2b.
718 define dso_local void @mul512(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-legal-vector-width"="512" {
719 ; CHECK-AVX512-LABEL: mul512:
720 ; CHECK-AVX512: # %bb.0:
721 ; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
722 ; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
723 ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
724 ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
725 ; CHECK-AVX512-NEXT: vpmullw %zmm2, %zmm3, %zmm2
726 ; CHECK-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
727 ; CHECK-AVX512-NEXT: vpandq %zmm3, %zmm2, %zmm2
728 ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
729 ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
730 ; CHECK-AVX512-NEXT: vpmullw %zmm1, %zmm0, %zmm0
731 ; CHECK-AVX512-NEXT: vpandq %zmm3, %zmm0, %zmm0
732 ; CHECK-AVX512-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
733 ; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
734 ; CHECK-AVX512-NEXT: vzeroupper
735 ; CHECK-AVX512-NEXT: retq
737 ; CHECK-VBMI-LABEL: mul512:
738 ; CHECK-VBMI: # %bb.0:
739 ; CHECK-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
740 ; CHECK-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
741 ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
742 ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
743 ; CHECK-VBMI-NEXT: vpmullw %zmm2, %zmm3, %zmm2
744 ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
745 ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
746 ; CHECK-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm0
747 ; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
748 ; CHECK-VBMI-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
749 ; CHECK-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx)
750 ; CHECK-VBMI-NEXT: vzeroupper
751 ; CHECK-VBMI-NEXT: retq
752 %d = load <64 x i8>, <64 x i8>* %a
753 %e = load <64 x i8>, <64 x i8>* %b
754 %f = mul <64 x i8> %d, %e
755 store <64 x i8> %f, <64 x i8>* %c
; Masked load of <4 x i32> from %addr with pass-through %dst; the mask is
; icmp eq %trigger, 0. The checks expect vptestnmd to build the k-mask and
; a masked vpblendmd with a memory operand.
759 ; This threw an assertion at one point.
760 define <4 x i32> @mload_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) "min-legal-vector-width"="256" {
761 ; CHECK-LABEL: mload_v4i32:
763 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
764 ; CHECK-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
766 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
767 %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
; Declaration of the masked-load intrinsic called by @mload_v4i32.
770 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
; trunc of a loaded <16 x i64> to <16 x i32> with
; "min-legal-vector-width"="256". The checks expect four ymm loads, one
; vpmovqd per load, and vinserti128 to assemble the two ymm results.
772 define <16 x i32> @trunc_v16i64_v16i32(<16 x i64>* %x) nounwind "min-legal-vector-width"="256" {
773 ; CHECK-LABEL: trunc_v16i64_v16i32:
775 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
776 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
777 ; CHECK-NEXT: vmovdqa 64(%rdi), %ymm2
778 ; CHECK-NEXT: vmovdqa 96(%rdi), %ymm3
779 ; CHECK-NEXT: vpmovqd %ymm0, %xmm0
780 ; CHECK-NEXT: vpmovqd %ymm1, %xmm1
781 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
782 ; CHECK-NEXT: vpmovqd %ymm2, %xmm1
783 ; CHECK-NEXT: vpmovqd %ymm3, %xmm2
784 ; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
786 %a = load <16 x i64>, <16 x i64>* %x
787 %b = trunc <16 x i64> %a to <16 x i32>
; trunc of a loaded <16 x i64> to <16 x i8> with
; "min-legal-vector-width"="256". The checks expect one vpmovqb per ymm
; load, with the pieces merged by vpunpckldq and vpunpcklqdq.
791 define <16 x i8> @trunc_v16i64_v16i8(<16 x i64>* %x) nounwind "min-legal-vector-width"="256" {
792 ; CHECK-LABEL: trunc_v16i64_v16i8:
794 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
795 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
796 ; CHECK-NEXT: vmovdqa 64(%rdi), %ymm2
797 ; CHECK-NEXT: vmovdqa 96(%rdi), %ymm3
798 ; CHECK-NEXT: vpmovqb %ymm3, %xmm3
799 ; CHECK-NEXT: vpmovqb %ymm2, %xmm2
800 ; CHECK-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
801 ; CHECK-NEXT: vpmovqb %ymm1, %xmm1
802 ; CHECK-NEXT: vpmovqb %ymm0, %xmm0
803 ; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
804 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
805 ; CHECK-NEXT: vzeroupper
807 %a = load <16 x i64>, <16 x i64>* %x
808 %b = trunc <16 x i64> %a to <16 x i8>
; trunc of a loaded <16 x i32> to <16 x i8> with
; "min-legal-vector-width"="256". The checks expect two vpmovdb on ymm
; halves joined by vpunpcklqdq.
812 define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
813 ; CHECK-LABEL: trunc_v16i32_v16i8:
815 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
816 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
817 ; CHECK-NEXT: vpmovdb %ymm1, %xmm1
818 ; CHECK-NEXT: vpmovdb %ymm0, %xmm0
819 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
820 ; CHECK-NEXT: vzeroupper
822 %a = load <16 x i32>, <16 x i32>* %x
823 %b = trunc <16 x i32> %a to <16 x i8>
; trunc of a loaded <8 x i64> to <8 x i8> with
; "min-legal-vector-width"="256". The checks expect two vpmovqb on ymm
; halves joined by vpunpckldq.
827 define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
828 ; CHECK-LABEL: trunc_v8i64_v8i8:
830 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
831 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
832 ; CHECK-NEXT: vpmovqb %ymm1, %xmm1
833 ; CHECK-NEXT: vpmovqb %ymm0, %xmm0
834 ; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
835 ; CHECK-NEXT: vzeroupper
837 %a = load <8 x i64>, <8 x i64>* %x
838 %b = trunc <8 x i64> %a to <8 x i8>
; trunc of a loaded <8 x i64> to <8 x i16> with
; "min-legal-vector-width"="256". The checks expect two vpmovqw on ymm
; halves joined by vpunpcklqdq.
842 define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
843 ; CHECK-LABEL: trunc_v8i64_v8i16:
845 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
846 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
847 ; CHECK-NEXT: vpmovqw %ymm1, %xmm1
848 ; CHECK-NEXT: vpmovqw %ymm0, %xmm0
849 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
850 ; CHECK-NEXT: vzeroupper
852 %a = load <8 x i64>, <8 x i64>* %x
853 %b = trunc <8 x i64> %a to <8 x i32>
; lshr by 48 followed by trunc <8 x i64> -> <8 x i32>, with
; "min-legal-vector-width"="256". The logical shift leaves the upper 48
; bits of every element zero; the checks expect vpsrlq on each ymm half,
; then vpackusdw and a vpermq to restore element order.
857 define <8 x i32> @trunc_v8i64_v8i32_zeroes(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
858 ; CHECK-LABEL: trunc_v8i64_v8i32_zeroes:
860 ; CHECK-NEXT: vpsrlq $48, 32(%rdi), %ymm0
861 ; CHECK-NEXT: vpsrlq $48, (%rdi), %ymm1
862 ; CHECK-NEXT: vpackusdw %ymm0, %ymm1, %ymm0
863 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
865 %a = load <8 x i64>, <8 x i64>* %x
866 %b = lshr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
867 %c = trunc <8 x i64> %b to <8 x i32>
; lshr by 16 followed by trunc <16 x i32> -> <16 x i16>, with
; "min-legal-vector-width"="256". The checks contain no shift instruction:
; a single vpermi2w selecting the odd word indices is expected.
871 define <16 x i16> @trunc_v16i32_v16i16_zeroes(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
872 ; CHECK-LABEL: trunc_v16i32_v16i16_zeroes:
874 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
875 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
876 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
878 %a = load <16 x i32>, <16 x i32>* %x
879 %b = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
880 %c = trunc <16 x i32> %b to <16 x i16>
; lshr by 8 followed by trunc <32 x i16> -> <32 x i8>, with
; "min-legal-vector-width"="256". CHECK-AVX512 expects vpsrlw on each ymm
; half, then vpackuswb and vpermq; CHECK-VBMI expects a single vpermi2b
; selecting the odd byte indices.
884 define <32 x i8> @trunc_v32i16_v32i8_zeroes(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" {
885 ; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_zeroes:
886 ; CHECK-AVX512: # %bb.0:
887 ; CHECK-AVX512-NEXT: vpsrlw $8, 32(%rdi), %ymm0
888 ; CHECK-AVX512-NEXT: vpsrlw $8, (%rdi), %ymm1
889 ; CHECK-AVX512-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
890 ; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
891 ; CHECK-AVX512-NEXT: retq
893 ; CHECK-VBMI-LABEL: trunc_v32i16_v32i8_zeroes:
894 ; CHECK-VBMI: # %bb.0:
895 ; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1
896 ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
897 ; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
898 ; CHECK-VBMI-NEXT: retq
899 %a = load <32 x i16>, <32 x i16>* %x
900 %b = lshr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
901 %c = trunc <32 x i16> %b to <32 x i8>
; ashr by 48 followed by trunc <8 x i64> -> <8 x i32>, with
; "min-legal-vector-width"="256". The checks expect vpsraq on each ymm
; half, a vpmovqd per half, and vinserti128 to form the ymm result.
905 define <8 x i32> @trunc_v8i64_v8i32_sign(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
906 ; CHECK-LABEL: trunc_v8i64_v8i32_sign:
908 ; CHECK-NEXT: vpsraq $48, 32(%rdi), %ymm0
909 ; CHECK-NEXT: vpsraq $48, (%rdi), %ymm1
910 ; CHECK-NEXT: vpmovqd %ymm1, %xmm1
911 ; CHECK-NEXT: vpmovqd %ymm0, %xmm0
912 ; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
914 %a = load <8 x i64>, <8 x i64>* %x
915 %b = ashr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
916 %c = trunc <8 x i64> %b to <8 x i32>
; ashr by 16 followed by trunc <16 x i32> -> <16 x i16>, with
; "min-legal-vector-width"="256". The checks contain no shift instruction:
; a single vpermi2w selecting the odd word indices is expected.
920 define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
921 ; CHECK-LABEL: trunc_v16i32_v16i16_sign:
923 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
924 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
925 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
927 %a = load <16 x i32>, <16 x i32>* %x
928 %b = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
929 %c = trunc <16 x i32> %b to <16 x i16>
; ashr by 8 followed by trunc <32 x i16> -> <32 x i8>, with
; "min-legal-vector-width"="256". CHECK-AVX512 expects a logical vpsrlw on
; each ymm half (only the low 8 bits of each shifted word survive the
; trunc), then vpackuswb and vpermq; CHECK-VBMI expects a single vpermi2b
; selecting the odd byte indices.
933 define <32 x i8> @trunc_v32i16_v32i8_sign(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" {
934 ; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_sign:
935 ; CHECK-AVX512: # %bb.0:
936 ; CHECK-AVX512-NEXT: vpsrlw $8, 32(%rdi), %ymm0
937 ; CHECK-AVX512-NEXT: vpsrlw $8, (%rdi), %ymm1
938 ; CHECK-AVX512-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
939 ; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
940 ; CHECK-AVX512-NEXT: retq
942 ; CHECK-VBMI-LABEL: trunc_v32i16_v32i8_sign:
943 ; CHECK-VBMI: # %bb.0:
944 ; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1
945 ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
946 ; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
947 ; CHECK-VBMI-NEXT: retq
948 %a = load <32 x i16>, <32 x i16>* %x
949 %b = ashr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
950 %c = trunc <32 x i16> %b to <32 x i8>
; zext <16 x i8> -> <16 x i64> stored to %y, with
; "min-legal-vector-width"="256". The checks expect the extension to be
; built from ymm vpmovzx* instructions and written with four 32-byte
; stores.
954 define dso_local void @zext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal-vector-width"="256" {
955 ; CHECK-LABEL: zext_v16i8_v16i64:
957 ; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
958 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
959 ; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
960 ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
961 ; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
962 ; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
963 ; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
964 ; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
965 ; CHECK-NEXT: vmovdqa %ymm0, (%rdi)
966 ; CHECK-NEXT: vmovdqa %ymm1, 64(%rdi)
967 ; CHECK-NEXT: vmovdqa %ymm3, 96(%rdi)
968 ; CHECK-NEXT: vmovdqa %ymm2, 32(%rdi)
969 ; CHECK-NEXT: vzeroupper
971 %a = zext <16 x i8> %x to <16 x i64>
972 store <16 x i64> %a, <16 x i64>* %y
; sext <16 x i8> -> <16 x i64> stored to %y, with
; "min-legal-vector-width"="256". The checks expect the extension to be
; built from ymm vpmovsx* instructions and written with four 32-byte
; stores.
976 define dso_local void @sext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal-vector-width"="256" {
977 ; CHECK-LABEL: sext_v16i8_v16i64:
979 ; CHECK-NEXT: vpmovsxbw %xmm0, %ymm1
980 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
981 ; CHECK-NEXT: vpmovsxwq %xmm2, %ymm2
982 ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
983 ; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
984 ; CHECK-NEXT: vpmovsxwq %xmm3, %ymm3
985 ; CHECK-NEXT: vpmovsxwq %xmm1, %ymm1
986 ; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0
987 ; CHECK-NEXT: vmovdqa %ymm0, (%rdi)
988 ; CHECK-NEXT: vmovdqa %ymm1, 64(%rdi)
989 ; CHECK-NEXT: vmovdqa %ymm3, 96(%rdi)
990 ; CHECK-NEXT: vmovdqa %ymm2, 32(%rdi)
991 ; CHECK-NEXT: vzeroupper
993 %a = sext <16 x i8> %x to <16 x i64>
994 store <16 x i64> %a, <16 x i64>* %y
; select between two loaded <8 x i64> vectors (%p, %q) on an
; icmp eq <8 x i16> mask, stored to %r, with
; "min-legal-vector-width"="256". The checks expect one xmm compare into
; %k1, kshiftrb $4 to derive the mask for the upper half, and two masked
; ymm loads.
998 define dso_local void @vselect_split_v8i16_setcc(<8 x i16> %s, <8 x i16> %t, <8 x i64>* %p, <8 x i64>* %q, <8 x i64>* %r) "min-legal-vector-width"="256" {
999 ; CHECK-LABEL: vselect_split_v8i16_setcc:
1001 ; CHECK-NEXT: vmovdqa (%rsi), %ymm2
1002 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3
1003 ; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
1004 ; CHECK-NEXT: kshiftrb $4, %k1, %k2
1005 ; CHECK-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2}
1006 ; CHECK-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1}
1007 ; CHECK-NEXT: vmovdqa %ymm2, (%rdx)
1008 ; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx)
1009 ; CHECK-NEXT: vzeroupper
1011 %x = load <8 x i64>, <8 x i64>* %p
1012 %y = load <8 x i64>, <8 x i64>* %q
1013 %a = icmp eq <8 x i16> %s, %t
1014 %b = select <8 x i1> %a, <8 x i64> %x, <8 x i64> %y
1015 store <8 x i64> %b, <8 x i64>* %r
; select between two loaded <8 x i64> vectors (%p, %q) on an
; icmp eq <8 x i32> mask, stored to %r, with
; "min-legal-vector-width"="256". The checks expect one ymm compare into
; %k1, kshiftrb $4 to derive the mask for the upper half, and two masked
; ymm loads.
1019 define dso_local void @vselect_split_v8i32_setcc(<8 x i32> %s, <8 x i32> %t, <8 x i64>* %p, <8 x i64>* %q, <8 x i64>* %r) "min-legal-vector-width"="256" {
1020 ; CHECK-LABEL: vselect_split_v8i32_setcc:
1022 ; CHECK-NEXT: vmovdqa (%rsi), %ymm2
1023 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3
1024 ; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
1025 ; CHECK-NEXT: kshiftrb $4, %k1, %k2
1026 ; CHECK-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2}
1027 ; CHECK-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1}
1028 ; CHECK-NEXT: vmovdqa %ymm2, (%rdx)
1029 ; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx)
1030 ; CHECK-NEXT: vzeroupper
1032 %x = load <8 x i64>, <8 x i64>* %p
1033 %y = load <8 x i64>, <8 x i64>* %q
1034 %a = icmp eq <8 x i32> %s, %t
1035 %b = select <8 x i1> %a, <8 x i64> %x, <8 x i64> %y
1036 store <8 x i64> %b, <8 x i64>* %r
; select between two loaded <16 x i32> vectors (%p, %q) on an
; icmp eq <16 x i8> mask, stored to %r, with
; "min-legal-vector-width"="256". The checks expect one xmm compare into
; %k1, kshiftrw $8 to derive the mask for the upper half, and two masked
; ymm loads.
1040 define dso_local void @vselect_split_v16i8_setcc(<16 x i8> %s, <16 x i8> %t, <16 x i32>* %p, <16 x i32>* %q, <16 x i32>* %r) "min-legal-vector-width"="256" {
1041 ; CHECK-LABEL: vselect_split_v16i8_setcc:
1043 ; CHECK-NEXT: vmovdqa (%rsi), %ymm2
1044 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3
1045 ; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
1046 ; CHECK-NEXT: kshiftrw $8, %k1, %k2
1047 ; CHECK-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2}
1048 ; CHECK-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1}
1049 ; CHECK-NEXT: vmovdqa %ymm2, (%rdx)
1050 ; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx)
1051 ; CHECK-NEXT: vzeroupper
1053 %x = load <16 x i32>, <16 x i32>* %p
1054 %y = load <16 x i32>, <16 x i32>* %q
1055 %a = icmp eq <16 x i8> %s, %t
1056 %b = select <16 x i1> %a, <16 x i32> %x, <16 x i32> %y
1057 store <16 x i32> %b, <16 x i32>* %r
; select between two loaded <16 x i32> vectors (%p, %q) on an
; icmp eq <16 x i16> mask, stored to %r, with
; "min-legal-vector-width"="256". The checks expect one ymm compare into
; %k1, kshiftrw $8 to derive the mask for the upper half, and two masked
; ymm loads.
1061 define dso_local void @vselect_split_v16i16_setcc(<16 x i16> %s, <16 x i16> %t, <16 x i32>* %p, <16 x i32>* %q, <16 x i32>* %r) "min-legal-vector-width"="256" {
1062 ; CHECK-LABEL: vselect_split_v16i16_setcc:
1064 ; CHECK-NEXT: vmovdqa (%rsi), %ymm2
1065 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3
1066 ; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
1067 ; CHECK-NEXT: kshiftrw $8, %k1, %k2
1068 ; CHECK-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2}
1069 ; CHECK-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1}
1070 ; CHECK-NEXT: vmovdqa %ymm2, (%rdx)
1071 ; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx)
1072 ; CHECK-NEXT: vzeroupper
1074 %x = load <16 x i32>, <16 x i32>* %p
1075 %y = load <16 x i32>, <16 x i32>* %q
1076 %a = icmp eq <16 x i16> %s, %t
1077 %b = select <16 x i1> %a, <16 x i32> %x, <16 x i32> %y
1078 store <16 x i32> %b, <16 x i32>* %r
; Clamp of a loaded <16 x i32> to [0, 255] (icmp slt/select against 255,
; then icmp sgt/select against zero) followed by trunc to <16 x i8>, with
; "min-legal-vector-width"="256". The checks expect vpackusdw on the two
; ymm halves, a vpermq, and a final vpmovuswb to an xmm register.
1082 define <16 x i8> @trunc_packus_v16i32_v16i8(<16 x i32>* %p) "min-legal-vector-width"="256" {
1083 ; CHECK-LABEL: trunc_packus_v16i32_v16i8:
1085 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
1086 ; CHECK-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0
1087 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1088 ; CHECK-NEXT: vpmovuswb %ymm0, %xmm0
1089 ; CHECK-NEXT: vzeroupper
1091 %a = load <16 x i32>, <16 x i32>* %p
1092 %b = icmp slt <16 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
1093 %c = select <16 x i1> %b, <16 x i32> %a, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
1094 %d = icmp sgt <16 x i32> %c, zeroinitializer
1095 %e = select <16 x i1> %d, <16 x i32> %c, <16 x i32> zeroinitializer
1096 %f = trunc <16 x i32> %e to <16 x i8>
; Clamp of a loaded <16 x i32> to [0, 255] (icmp slt/select against 255,
; then icmp sgt/select against zero) followed by trunc to <16 x i8> and a
; store to %q, with "min-legal-vector-width"="256". The checks expect
; vpackusdw, a vpermq, and a vpmovuswb that writes directly to (%rsi).
1100 define dso_local void @trunc_packus_v16i32_v16i8_store(<16 x i32>* %p, <16 x i8>* %q) "min-legal-vector-width"="256" {
1101 ; CHECK-LABEL: trunc_packus_v16i32_v16i8_store:
1103 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
1104 ; CHECK-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0
1105 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1106 ; CHECK-NEXT: vpmovuswb %ymm0, (%rsi)
1107 ; CHECK-NEXT: vzeroupper
1109 %a = load <16 x i32>, <16 x i32>* %p
1110 %b = icmp slt <16 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
1111 %c = select <16 x i1> %b, <16 x i32> %a, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
1112 %d = icmp sgt <16 x i32> %c, zeroinitializer
1113 %e = select <16 x i1> %d, <16 x i32> %c, <16 x i32> zeroinitializer
1114 %f = trunc <16 x i32> %e to <16 x i8>
1115 store <16 x i8> %f, <16 x i8>* %q
; Takes a <64 x i1> argument and has a <64 x i1> return type, with
; "min-legal-vector-width"="256".
; NOTE(review): presumably exercises passing/returning a 64-bit mask
; vector when 512-bit vectors are not legal; confirm against the body.
1119 define <64 x i1> @v64i1_argument_return(<64 x i1> %x) "min-legal-vector-width"="256" {
1120 ; CHECK-LABEL: v64i1_argument_return:
1126 define dso_local void @v64i1_shuffle(<64 x i8>* %x, <64 x i8>* %y) "min-legal-vector-width"="256" {
1127 ; CHECK-LABEL: v64i1_shuffle:
1128 ; CHECK: # %bb.0: # %entry
1129 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1130 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0
1131 ; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k0
1132 ; CHECK-NEXT: kshiftrd $1, %k0, %k1
1133 ; CHECK-NEXT: movq $-3, %rax
1134 ; CHECK-NEXT: kmovq %rax, %k2
1135 ; CHECK-NEXT: kandq %k2, %k1, %k1
1136 ; CHECK-NEXT: kshiftlq $63, %k0, %k2
1137 ; CHECK-NEXT: kshiftrq $62, %k2, %k2
1138 ; CHECK-NEXT: korq %k2, %k1, %k1
1139 ; CHECK-NEXT: movq $-5, %rax
1140 ; CHECK-NEXT: kmovq %rax, %k2
1141 ; CHECK-NEXT: kandq %k2, %k1, %k1
1142 ; CHECK-NEXT: kshiftrd $3, %k0, %k2
1143 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1144 ; CHECK-NEXT: kshiftrq $61, %k2, %k2
1145 ; CHECK-NEXT: korq %k2, %k1, %k1
1146 ; CHECK-NEXT: movq $-9, %rax
1147 ; CHECK-NEXT: kmovq %rax, %k2
1148 ; CHECK-NEXT: kandq %k2, %k1, %k1
1149 ; CHECK-NEXT: kshiftrd $2, %k0, %k2
1150 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1151 ; CHECK-NEXT: kshiftrq $60, %k2, %k2
1152 ; CHECK-NEXT: korq %k2, %k1, %k1
1153 ; CHECK-NEXT: movq $-17, %rax
1154 ; CHECK-NEXT: kmovq %rax, %k2
1155 ; CHECK-NEXT: kandq %k2, %k1, %k1
1156 ; CHECK-NEXT: kshiftrd $5, %k0, %k2
1157 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1158 ; CHECK-NEXT: kshiftrq $59, %k2, %k2
1159 ; CHECK-NEXT: korq %k2, %k1, %k1
1160 ; CHECK-NEXT: movq $-33, %rax
1161 ; CHECK-NEXT: kmovq %rax, %k2
1162 ; CHECK-NEXT: kandq %k2, %k1, %k1
1163 ; CHECK-NEXT: kshiftrd $4, %k0, %k2
1164 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1165 ; CHECK-NEXT: kshiftrq $58, %k2, %k2
1166 ; CHECK-NEXT: korq %k2, %k1, %k1
1167 ; CHECK-NEXT: movq $-65, %rax
1168 ; CHECK-NEXT: kmovq %rax, %k2
1169 ; CHECK-NEXT: kandq %k2, %k1, %k1
1170 ; CHECK-NEXT: kshiftrd $7, %k0, %k2
1171 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1172 ; CHECK-NEXT: kshiftrq $57, %k2, %k2
1173 ; CHECK-NEXT: korq %k2, %k1, %k1
1174 ; CHECK-NEXT: movq $-129, %rax
1175 ; CHECK-NEXT: kmovq %rax, %k2
1176 ; CHECK-NEXT: kandq %k2, %k1, %k1
1177 ; CHECK-NEXT: kshiftrd $6, %k0, %k2
1178 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1179 ; CHECK-NEXT: kshiftrq $56, %k2, %k2
1180 ; CHECK-NEXT: korq %k2, %k1, %k1
1181 ; CHECK-NEXT: movq $-257, %rax # imm = 0xFEFF
1182 ; CHECK-NEXT: kmovq %rax, %k2
1183 ; CHECK-NEXT: kandq %k2, %k1, %k1
1184 ; CHECK-NEXT: kshiftrd $9, %k0, %k2
1185 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1186 ; CHECK-NEXT: kshiftrq $55, %k2, %k2
1187 ; CHECK-NEXT: korq %k2, %k1, %k1
1188 ; CHECK-NEXT: movq $-513, %rax # imm = 0xFDFF
1189 ; CHECK-NEXT: kmovq %rax, %k2
1190 ; CHECK-NEXT: kandq %k2, %k1, %k1
1191 ; CHECK-NEXT: kshiftrd $8, %k0, %k2
1192 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1193 ; CHECK-NEXT: kshiftrq $54, %k2, %k2
1194 ; CHECK-NEXT: korq %k2, %k1, %k1
1195 ; CHECK-NEXT: movq $-1025, %rax # imm = 0xFBFF
1196 ; CHECK-NEXT: kmovq %rax, %k2
1197 ; CHECK-NEXT: kandq %k2, %k1, %k1
1198 ; CHECK-NEXT: kshiftrd $11, %k0, %k2
1199 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1200 ; CHECK-NEXT: kshiftrq $53, %k2, %k2
1201 ; CHECK-NEXT: korq %k2, %k1, %k1
1202 ; CHECK-NEXT: movq $-2049, %rax # imm = 0xF7FF
1203 ; CHECK-NEXT: kmovq %rax, %k2
1204 ; CHECK-NEXT: kandq %k2, %k1, %k1
1205 ; CHECK-NEXT: kshiftrd $10, %k0, %k2
1206 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1207 ; CHECK-NEXT: kshiftrq $52, %k2, %k2
1208 ; CHECK-NEXT: korq %k2, %k1, %k1
1209 ; CHECK-NEXT: movq $-4097, %rax # imm = 0xEFFF
1210 ; CHECK-NEXT: kmovq %rax, %k2
1211 ; CHECK-NEXT: kandq %k2, %k1, %k1
1212 ; CHECK-NEXT: kshiftrd $13, %k0, %k2
1213 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1214 ; CHECK-NEXT: kshiftrq $51, %k2, %k2
1215 ; CHECK-NEXT: korq %k2, %k1, %k1
1216 ; CHECK-NEXT: movq $-8193, %rax # imm = 0xDFFF
1217 ; CHECK-NEXT: kmovq %rax, %k2
1218 ; CHECK-NEXT: kandq %k2, %k1, %k1
1219 ; CHECK-NEXT: kshiftrd $12, %k0, %k2
1220 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1221 ; CHECK-NEXT: kshiftrq $50, %k2, %k2
1222 ; CHECK-NEXT: korq %k2, %k1, %k1
1223 ; CHECK-NEXT: movq $-16385, %rax # imm = 0xBFFF
1224 ; CHECK-NEXT: kmovq %rax, %k2
1225 ; CHECK-NEXT: kandq %k2, %k1, %k1
1226 ; CHECK-NEXT: kshiftrd $15, %k0, %k2
1227 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1228 ; CHECK-NEXT: kshiftrq $49, %k2, %k2
1229 ; CHECK-NEXT: korq %k2, %k1, %k1
1230 ; CHECK-NEXT: movq $-32769, %rax # imm = 0xFFFF7FFF
1231 ; CHECK-NEXT: kmovq %rax, %k2
1232 ; CHECK-NEXT: kandq %k2, %k1, %k1
1233 ; CHECK-NEXT: kshiftrd $14, %k0, %k2
1234 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1235 ; CHECK-NEXT: kshiftrq $48, %k2, %k2
1236 ; CHECK-NEXT: korq %k2, %k1, %k1
1237 ; CHECK-NEXT: movq $-65537, %rax # imm = 0xFFFEFFFF
1238 ; CHECK-NEXT: kmovq %rax, %k2
1239 ; CHECK-NEXT: kandq %k2, %k1, %k1
1240 ; CHECK-NEXT: kshiftrd $17, %k0, %k2
1241 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1242 ; CHECK-NEXT: kshiftrq $47, %k2, %k2
1243 ; CHECK-NEXT: korq %k2, %k1, %k1
1244 ; CHECK-NEXT: movq $-131073, %rax # imm = 0xFFFDFFFF
1245 ; CHECK-NEXT: kmovq %rax, %k2
1246 ; CHECK-NEXT: kandq %k2, %k1, %k1
1247 ; CHECK-NEXT: kshiftrd $16, %k0, %k2
1248 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1249 ; CHECK-NEXT: kshiftrq $46, %k2, %k2
1250 ; CHECK-NEXT: korq %k2, %k1, %k1
1251 ; CHECK-NEXT: movq $-262145, %rax # imm = 0xFFFBFFFF
1252 ; CHECK-NEXT: kmovq %rax, %k2
1253 ; CHECK-NEXT: kandq %k2, %k1, %k1
1254 ; CHECK-NEXT: kshiftrd $19, %k0, %k2
1255 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1256 ; CHECK-NEXT: kshiftrq $45, %k2, %k2
1257 ; CHECK-NEXT: korq %k2, %k1, %k1
1258 ; CHECK-NEXT: movq $-524289, %rax # imm = 0xFFF7FFFF
1259 ; CHECK-NEXT: kmovq %rax, %k2
1260 ; CHECK-NEXT: kandq %k2, %k1, %k1
1261 ; CHECK-NEXT: kshiftrd $18, %k0, %k2
1262 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1263 ; CHECK-NEXT: kshiftrq $44, %k2, %k2
1264 ; CHECK-NEXT: korq %k2, %k1, %k1
1265 ; CHECK-NEXT: movq $-1048577, %rax # imm = 0xFFEFFFFF
1266 ; CHECK-NEXT: kmovq %rax, %k2
1267 ; CHECK-NEXT: kandq %k2, %k1, %k1
1268 ; CHECK-NEXT: kshiftrd $21, %k0, %k2
1269 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1270 ; CHECK-NEXT: kshiftrq $43, %k2, %k2
1271 ; CHECK-NEXT: korq %k2, %k1, %k1
1272 ; CHECK-NEXT: movq $-2097153, %rax # imm = 0xFFDFFFFF
1273 ; CHECK-NEXT: kmovq %rax, %k2
1274 ; CHECK-NEXT: kandq %k2, %k1, %k1
1275 ; CHECK-NEXT: kshiftrd $20, %k0, %k2
1276 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1277 ; CHECK-NEXT: kshiftrq $42, %k2, %k2
1278 ; CHECK-NEXT: korq %k2, %k1, %k1
1279 ; CHECK-NEXT: movq $-4194305, %rax # imm = 0xFFBFFFFF
1280 ; CHECK-NEXT: kmovq %rax, %k2
1281 ; CHECK-NEXT: kandq %k2, %k1, %k1
1282 ; CHECK-NEXT: kshiftrd $23, %k0, %k2
1283 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1284 ; CHECK-NEXT: kshiftrq $41, %k2, %k2
1285 ; CHECK-NEXT: korq %k2, %k1, %k1
1286 ; CHECK-NEXT: movq $-8388609, %rax # imm = 0xFF7FFFFF
1287 ; CHECK-NEXT: kmovq %rax, %k2
1288 ; CHECK-NEXT: kandq %k2, %k1, %k1
1289 ; CHECK-NEXT: kshiftrd $22, %k0, %k2
1290 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1291 ; CHECK-NEXT: kshiftrq $40, %k2, %k2
1292 ; CHECK-NEXT: korq %k2, %k1, %k1
1293 ; CHECK-NEXT: movq $-16777217, %rax # imm = 0xFEFFFFFF
1294 ; CHECK-NEXT: kmovq %rax, %k2
1295 ; CHECK-NEXT: kandq %k2, %k1, %k1
1296 ; CHECK-NEXT: kshiftrd $25, %k0, %k2
1297 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1298 ; CHECK-NEXT: kshiftrq $39, %k2, %k2
1299 ; CHECK-NEXT: korq %k2, %k1, %k1
1300 ; CHECK-NEXT: movq $-33554433, %rax # imm = 0xFDFFFFFF
1301 ; CHECK-NEXT: kmovq %rax, %k2
1302 ; CHECK-NEXT: kandq %k2, %k1, %k1
1303 ; CHECK-NEXT: kshiftrd $24, %k0, %k2
1304 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1305 ; CHECK-NEXT: kshiftrq $38, %k2, %k2
1306 ; CHECK-NEXT: korq %k2, %k1, %k1
1307 ; CHECK-NEXT: movq $-67108865, %rax # imm = 0xFBFFFFFF
1308 ; CHECK-NEXT: kmovq %rax, %k2
1309 ; CHECK-NEXT: kandq %k2, %k1, %k1
1310 ; CHECK-NEXT: kshiftrd $27, %k0, %k2
1311 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1312 ; CHECK-NEXT: kshiftrq $37, %k2, %k2
1313 ; CHECK-NEXT: korq %k2, %k1, %k1
1314 ; CHECK-NEXT: movq $-134217729, %rax # imm = 0xF7FFFFFF
1315 ; CHECK-NEXT: kmovq %rax, %k2
1316 ; CHECK-NEXT: kandq %k2, %k1, %k1
1317 ; CHECK-NEXT: kshiftrd $26, %k0, %k2
1318 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1319 ; CHECK-NEXT: kshiftrq $36, %k2, %k2
1320 ; CHECK-NEXT: korq %k2, %k1, %k1
1321 ; CHECK-NEXT: movq $-268435457, %rax # imm = 0xEFFFFFFF
1322 ; CHECK-NEXT: kmovq %rax, %k2
1323 ; CHECK-NEXT: kandq %k2, %k1, %k1
1324 ; CHECK-NEXT: kshiftrd $29, %k0, %k2
1325 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1326 ; CHECK-NEXT: kshiftrq $35, %k2, %k2
1327 ; CHECK-NEXT: korq %k2, %k1, %k1
1328 ; CHECK-NEXT: movq $-536870913, %rax # imm = 0xDFFFFFFF
1329 ; CHECK-NEXT: kmovq %rax, %k2
1330 ; CHECK-NEXT: kandq %k2, %k1, %k1
1331 ; CHECK-NEXT: kshiftrd $28, %k0, %k2
1332 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1333 ; CHECK-NEXT: kshiftrq $34, %k2, %k2
1334 ; CHECK-NEXT: korq %k2, %k1, %k1
1335 ; CHECK-NEXT: movq $-1073741825, %rax # imm = 0xBFFFFFFF
1336 ; CHECK-NEXT: kmovq %rax, %k2
1337 ; CHECK-NEXT: kandq %k2, %k1, %k1
1338 ; CHECK-NEXT: kshiftrd $31, %k0, %k2
1339 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1340 ; CHECK-NEXT: kshiftrq $33, %k2, %k2
1341 ; CHECK-NEXT: korq %k2, %k1, %k1
1342 ; CHECK-NEXT: movabsq $-2147483649, %rax # imm = 0xFFFFFFFF7FFFFFFF
1343 ; CHECK-NEXT: kmovq %rax, %k2
1344 ; CHECK-NEXT: kandq %k2, %k1, %k2
1345 ; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1
1346 ; CHECK-NEXT: kshiftrd $30, %k0, %k0
1347 ; CHECK-NEXT: kshiftlq $63, %k0, %k0
1348 ; CHECK-NEXT: kshiftrq $32, %k0, %k0
1349 ; CHECK-NEXT: korq %k0, %k2, %k0
1350 ; CHECK-NEXT: movabsq $-4294967297, %rax # imm = 0xFFFFFFFEFFFFFFFF
1351 ; CHECK-NEXT: kmovq %rax, %k2
1352 ; CHECK-NEXT: kandq %k2, %k0, %k0
1353 ; CHECK-NEXT: kshiftrd $1, %k1, %k2
1354 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1355 ; CHECK-NEXT: kshiftrq $31, %k2, %k2
1356 ; CHECK-NEXT: korq %k2, %k0, %k0
1357 ; CHECK-NEXT: movabsq $-8589934593, %rax # imm = 0xFFFFFFFDFFFFFFFF
1358 ; CHECK-NEXT: kmovq %rax, %k2
1359 ; CHECK-NEXT: kandq %k2, %k0, %k0
1360 ; CHECK-NEXT: kshiftlq $63, %k1, %k2
1361 ; CHECK-NEXT: kshiftrq $30, %k2, %k2
1362 ; CHECK-NEXT: korq %k2, %k0, %k0
1363 ; CHECK-NEXT: movabsq $-17179869185, %rax # imm = 0xFFFFFFFBFFFFFFFF
1364 ; CHECK-NEXT: kmovq %rax, %k2
1365 ; CHECK-NEXT: kandq %k2, %k0, %k0
1366 ; CHECK-NEXT: kshiftrd $3, %k1, %k2
1367 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1368 ; CHECK-NEXT: kshiftrq $29, %k2, %k2
1369 ; CHECK-NEXT: korq %k2, %k0, %k0
1370 ; CHECK-NEXT: movabsq $-34359738369, %rax # imm = 0xFFFFFFF7FFFFFFFF
1371 ; CHECK-NEXT: kmovq %rax, %k2
1372 ; CHECK-NEXT: kandq %k2, %k0, %k0
1373 ; CHECK-NEXT: kshiftrd $2, %k1, %k2
1374 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1375 ; CHECK-NEXT: kshiftrq $28, %k2, %k2
1376 ; CHECK-NEXT: korq %k2, %k0, %k0
1377 ; CHECK-NEXT: movabsq $-68719476737, %rax # imm = 0xFFFFFFEFFFFFFFFF
1378 ; CHECK-NEXT: kmovq %rax, %k2
1379 ; CHECK-NEXT: kandq %k2, %k0, %k0
1380 ; CHECK-NEXT: kshiftrd $5, %k1, %k2
1381 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1382 ; CHECK-NEXT: kshiftrq $27, %k2, %k2
1383 ; CHECK-NEXT: korq %k2, %k0, %k0
1384 ; CHECK-NEXT: movabsq $-137438953473, %rax # imm = 0xFFFFFFDFFFFFFFFF
1385 ; CHECK-NEXT: kmovq %rax, %k2
1386 ; CHECK-NEXT: kandq %k2, %k0, %k0
1387 ; CHECK-NEXT: kshiftrd $4, %k1, %k2
1388 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1389 ; CHECK-NEXT: kshiftrq $26, %k2, %k2
1390 ; CHECK-NEXT: korq %k2, %k0, %k0
1391 ; CHECK-NEXT: movabsq $-274877906945, %rax # imm = 0xFFFFFFBFFFFFFFFF
1392 ; CHECK-NEXT: kmovq %rax, %k2
1393 ; CHECK-NEXT: kandq %k2, %k0, %k0
1394 ; CHECK-NEXT: kshiftrd $7, %k1, %k2
1395 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1396 ; CHECK-NEXT: kshiftrq $25, %k2, %k2
1397 ; CHECK-NEXT: korq %k2, %k0, %k0
1398 ; CHECK-NEXT: movabsq $-549755813889, %rax # imm = 0xFFFFFF7FFFFFFFFF
1399 ; CHECK-NEXT: kmovq %rax, %k2
1400 ; CHECK-NEXT: kandq %k2, %k0, %k0
1401 ; CHECK-NEXT: kshiftrd $6, %k1, %k2
1402 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1403 ; CHECK-NEXT: kshiftrq $24, %k2, %k2
1404 ; CHECK-NEXT: korq %k2, %k0, %k0
1405 ; CHECK-NEXT: movabsq $-1099511627777, %rax # imm = 0xFFFFFEFFFFFFFFFF
1406 ; CHECK-NEXT: kmovq %rax, %k2
1407 ; CHECK-NEXT: kandq %k2, %k0, %k0
1408 ; CHECK-NEXT: kshiftrd $9, %k1, %k2
1409 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1410 ; CHECK-NEXT: kshiftrq $23, %k2, %k2
1411 ; CHECK-NEXT: korq %k2, %k0, %k0
1412 ; CHECK-NEXT: movabsq $-2199023255553, %rax # imm = 0xFFFFFDFFFFFFFFFF
1413 ; CHECK-NEXT: kmovq %rax, %k2
1414 ; CHECK-NEXT: kandq %k2, %k0, %k0
1415 ; CHECK-NEXT: kshiftrd $8, %k1, %k2
1416 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1417 ; CHECK-NEXT: kshiftrq $22, %k2, %k2
1418 ; CHECK-NEXT: korq %k2, %k0, %k0
1419 ; CHECK-NEXT: movabsq $-4398046511105, %rax # imm = 0xFFFFFBFFFFFFFFFF
1420 ; CHECK-NEXT: kmovq %rax, %k2
1421 ; CHECK-NEXT: kandq %k2, %k0, %k0
1422 ; CHECK-NEXT: kshiftrd $11, %k1, %k2
1423 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1424 ; CHECK-NEXT: kshiftrq $21, %k2, %k2
1425 ; CHECK-NEXT: korq %k2, %k0, %k0
1426 ; CHECK-NEXT: movabsq $-8796093022209, %rax # imm = 0xFFFFF7FFFFFFFFFF
1427 ; CHECK-NEXT: kmovq %rax, %k2
1428 ; CHECK-NEXT: kandq %k2, %k0, %k0
1429 ; CHECK-NEXT: kshiftrd $10, %k1, %k2
1430 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1431 ; CHECK-NEXT: kshiftrq $20, %k2, %k2
1432 ; CHECK-NEXT: korq %k2, %k0, %k0
1433 ; CHECK-NEXT: movabsq $-17592186044417, %rax # imm = 0xFFFFEFFFFFFFFFFF
1434 ; CHECK-NEXT: kmovq %rax, %k2
1435 ; CHECK-NEXT: kandq %k2, %k0, %k0
1436 ; CHECK-NEXT: kshiftrd $13, %k1, %k2
1437 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1438 ; CHECK-NEXT: kshiftrq $19, %k2, %k2
1439 ; CHECK-NEXT: korq %k2, %k0, %k0
1440 ; CHECK-NEXT: movabsq $-35184372088833, %rax # imm = 0xFFFFDFFFFFFFFFFF
1441 ; CHECK-NEXT: kmovq %rax, %k2
1442 ; CHECK-NEXT: kandq %k2, %k0, %k0
1443 ; CHECK-NEXT: kshiftrd $12, %k1, %k2
1444 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1445 ; CHECK-NEXT: kshiftrq $18, %k2, %k2
1446 ; CHECK-NEXT: korq %k2, %k0, %k0
1447 ; CHECK-NEXT: movabsq $-70368744177665, %rax # imm = 0xFFFFBFFFFFFFFFFF
1448 ; CHECK-NEXT: kmovq %rax, %k2
1449 ; CHECK-NEXT: kandq %k2, %k0, %k0
1450 ; CHECK-NEXT: kshiftrd $15, %k1, %k2
1451 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1452 ; CHECK-NEXT: kshiftrq $17, %k2, %k2
1453 ; CHECK-NEXT: korq %k2, %k0, %k0
1454 ; CHECK-NEXT: movabsq $-140737488355329, %rax # imm = 0xFFFF7FFFFFFFFFFF
1455 ; CHECK-NEXT: kmovq %rax, %k2
1456 ; CHECK-NEXT: kandq %k2, %k0, %k0
1457 ; CHECK-NEXT: kshiftrd $14, %k1, %k2
1458 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1459 ; CHECK-NEXT: kshiftrq $16, %k2, %k2
1460 ; CHECK-NEXT: korq %k2, %k0, %k0
1461 ; CHECK-NEXT: movabsq $-281474976710657, %rax # imm = 0xFFFEFFFFFFFFFFFF
1462 ; CHECK-NEXT: kmovq %rax, %k2
1463 ; CHECK-NEXT: kandq %k2, %k0, %k0
1464 ; CHECK-NEXT: kshiftrd $17, %k1, %k2
1465 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1466 ; CHECK-NEXT: kshiftrq $15, %k2, %k2
1467 ; CHECK-NEXT: korq %k2, %k0, %k0
1468 ; CHECK-NEXT: movabsq $-562949953421313, %rax # imm = 0xFFFDFFFFFFFFFFFF
1469 ; CHECK-NEXT: kmovq %rax, %k2
1470 ; CHECK-NEXT: kandq %k2, %k0, %k0
1471 ; CHECK-NEXT: kshiftrd $16, %k1, %k2
1472 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1473 ; CHECK-NEXT: kshiftrq $14, %k2, %k2
1474 ; CHECK-NEXT: korq %k2, %k0, %k0
1475 ; CHECK-NEXT: movabsq $-1125899906842625, %rax # imm = 0xFFFBFFFFFFFFFFFF
1476 ; CHECK-NEXT: kmovq %rax, %k2
1477 ; CHECK-NEXT: kandq %k2, %k0, %k0
1478 ; CHECK-NEXT: kshiftrd $19, %k1, %k2
1479 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1480 ; CHECK-NEXT: kshiftrq $13, %k2, %k2
1481 ; CHECK-NEXT: korq %k2, %k0, %k0
1482 ; CHECK-NEXT: movabsq $-2251799813685249, %rax # imm = 0xFFF7FFFFFFFFFFFF
1483 ; CHECK-NEXT: kmovq %rax, %k2
1484 ; CHECK-NEXT: kandq %k2, %k0, %k0
1485 ; CHECK-NEXT: kshiftrd $18, %k1, %k2
1486 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1487 ; CHECK-NEXT: kshiftrq $12, %k2, %k2
1488 ; CHECK-NEXT: korq %k2, %k0, %k0
1489 ; CHECK-NEXT: movabsq $-4503599627370497, %rax # imm = 0xFFEFFFFFFFFFFFFF
1490 ; CHECK-NEXT: kmovq %rax, %k2
1491 ; CHECK-NEXT: kandq %k2, %k0, %k0
1492 ; CHECK-NEXT: kshiftrd $21, %k1, %k2
1493 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1494 ; CHECK-NEXT: kshiftrq $11, %k2, %k2
1495 ; CHECK-NEXT: korq %k2, %k0, %k0
1496 ; CHECK-NEXT: movabsq $-9007199254740993, %rax # imm = 0xFFDFFFFFFFFFFFFF
1497 ; CHECK-NEXT: kmovq %rax, %k2
1498 ; CHECK-NEXT: kandq %k2, %k0, %k0
1499 ; CHECK-NEXT: kshiftrd $20, %k1, %k2
1500 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1501 ; CHECK-NEXT: kshiftrq $10, %k2, %k2
1502 ; CHECK-NEXT: korq %k2, %k0, %k0
1503 ; CHECK-NEXT: movabsq $-18014398509481985, %rax # imm = 0xFFBFFFFFFFFFFFFF
1504 ; CHECK-NEXT: kmovq %rax, %k2
1505 ; CHECK-NEXT: kandq %k2, %k0, %k0
1506 ; CHECK-NEXT: kshiftrd $23, %k1, %k2
1507 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1508 ; CHECK-NEXT: kshiftrq $9, %k2, %k2
1509 ; CHECK-NEXT: korq %k2, %k0, %k0
1510 ; CHECK-NEXT: movabsq $-36028797018963969, %rax # imm = 0xFF7FFFFFFFFFFFFF
1511 ; CHECK-NEXT: kmovq %rax, %k2
1512 ; CHECK-NEXT: kandq %k2, %k0, %k0
1513 ; CHECK-NEXT: kshiftrd $22, %k1, %k2
1514 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1515 ; CHECK-NEXT: kshiftrq $8, %k2, %k2
1516 ; CHECK-NEXT: korq %k2, %k0, %k0
1517 ; CHECK-NEXT: movabsq $-72057594037927937, %rax # imm = 0xFEFFFFFFFFFFFFFF
1518 ; CHECK-NEXT: kmovq %rax, %k2
1519 ; CHECK-NEXT: kandq %k2, %k0, %k0
1520 ; CHECK-NEXT: kshiftrd $25, %k1, %k2
1521 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1522 ; CHECK-NEXT: kshiftrq $7, %k2, %k2
1523 ; CHECK-NEXT: korq %k2, %k0, %k0
1524 ; CHECK-NEXT: movabsq $-144115188075855873, %rax # imm = 0xFDFFFFFFFFFFFFFF
1525 ; CHECK-NEXT: kmovq %rax, %k2
1526 ; CHECK-NEXT: kandq %k2, %k0, %k0
1527 ; CHECK-NEXT: kshiftrd $24, %k1, %k2
1528 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1529 ; CHECK-NEXT: kshiftrq $6, %k2, %k2
1530 ; CHECK-NEXT: korq %k2, %k0, %k0
1531 ; CHECK-NEXT: movabsq $-288230376151711745, %rax # imm = 0xFBFFFFFFFFFFFFFF
1532 ; CHECK-NEXT: kmovq %rax, %k2
1533 ; CHECK-NEXT: kandq %k2, %k0, %k0
1534 ; CHECK-NEXT: kshiftrd $27, %k1, %k2
1535 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1536 ; CHECK-NEXT: kshiftrq $5, %k2, %k2
1537 ; CHECK-NEXT: korq %k2, %k0, %k0
1538 ; CHECK-NEXT: movabsq $-576460752303423489, %rax # imm = 0xF7FFFFFFFFFFFFFF
1539 ; CHECK-NEXT: kmovq %rax, %k2
1540 ; CHECK-NEXT: kandq %k2, %k0, %k0
1541 ; CHECK-NEXT: kshiftrd $26, %k1, %k2
1542 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1543 ; CHECK-NEXT: kshiftrq $4, %k2, %k2
1544 ; CHECK-NEXT: korq %k2, %k0, %k0
1545 ; CHECK-NEXT: movabsq $-1152921504606846977, %rax # imm = 0xEFFFFFFFFFFFFFFF
1546 ; CHECK-NEXT: kmovq %rax, %k2
1547 ; CHECK-NEXT: kandq %k2, %k0, %k0
1548 ; CHECK-NEXT: kshiftrd $29, %k1, %k2
1549 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1550 ; CHECK-NEXT: kshiftrq $3, %k2, %k2
1551 ; CHECK-NEXT: korq %k2, %k0, %k0
1552 ; CHECK-NEXT: movabsq $-2305843009213693953, %rax # imm = 0xDFFFFFFFFFFFFFFF
1553 ; CHECK-NEXT: kmovq %rax, %k2
1554 ; CHECK-NEXT: kandq %k2, %k0, %k0
1555 ; CHECK-NEXT: kshiftrd $28, %k1, %k2
1556 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1557 ; CHECK-NEXT: kshiftrq $2, %k2, %k2
1558 ; CHECK-NEXT: korq %k2, %k0, %k0
1559 ; CHECK-NEXT: movabsq $-4611686018427387905, %rax # imm = 0xBFFFFFFFFFFFFFFF
1560 ; CHECK-NEXT: kmovq %rax, %k2
1561 ; CHECK-NEXT: kandq %k2, %k0, %k0
1562 ; CHECK-NEXT: kshiftrd $31, %k1, %k2
1563 ; CHECK-NEXT: kshiftlq $62, %k2, %k2
1564 ; CHECK-NEXT: korq %k2, %k0, %k0
1565 ; CHECK-NEXT: kshiftrd $30, %k1, %k1
1566 ; CHECK-NEXT: kshiftlq $1, %k0, %k0
1567 ; CHECK-NEXT: kshiftrq $1, %k0, %k0
1568 ; CHECK-NEXT: kshiftlq $63, %k1, %k1
1569 ; CHECK-NEXT: korq %k1, %k0, %k1
1570 ; CHECK-NEXT: vmovdqu8 %ymm1, (%rsi) {%k1}
1571 ; CHECK-NEXT: kshiftrq $32, %k1, %k1
1572 ; CHECK-NEXT: vmovdqu8 %ymm0, 32(%rsi) {%k1}
1573 ; CHECK-NEXT: vzeroupper
1576 %a = load <64 x i8>, <64 x i8>* %x
1577 %b = icmp eq <64 x i8> %a, zeroinitializer
1578 %shuf = shufflevector <64 x i1> %b, <64 x i1> undef, <64 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14, i32 17, i32 16, i32 19, i32 18, i32 21, i32 20, i32 23, i32 22, i32 25, i32 24, i32 27, i32 26, i32 29, i32 28, i32 31, i32 30, i32 33, i32 32, i32 35, i32 34, i32 37, i32 36, i32 39, i32 38, i32 41, i32 40, i32 43, i32 42, i32 45, i32 44, i32 47, i32 46, i32 49, i32 48, i32 51, i32 50, i32 53, i32 52, i32 55, i32 54, i32 57, i32 56, i32 59, i32 58, i32 61, i32 60, i32 63, i32 62>
1579 call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> %a, <64 x i8>* %y, i32 1, <64 x i1> %shuf)
; Declaration of the <64 x i8> masked-store intrinsic used by the call above,
; which passes the value to store, the destination pointer, an i32 of 1 and
; the <64 x i1> per-lane mask.
1582 declare void @llvm.masked.store.v64i8.p0v64i8(<64 x i8>, <64 x i8>*, i32, <64 x i1>)
; 64-bit globals read and written by @v64i1_inline_asm below.
1584 @mem64_dst = dso_local global i64 0, align 8
1585 @mem64_src = dso_local global i64 0, align 8
; Passes an i64 loaded from @mem64_src through an empty inline-asm statement
; whose input and output both use the "k" constraint, then stores the result
; to @mem64_dst. The checks expect the value to be moved with kmovq into and
; out of %k0, even though the function is limited to
; "min-legal-vector-width"="256".
1586 define dso_local i32 @v64i1_inline_asm() "min-legal-vector-width"="256" {
1587 ; CHECK-LABEL: v64i1_inline_asm:
1589 ; CHECK-NEXT: kmovq mem64_src(%rip), %k0
1591 ; CHECK-NEXT: #NO_APP
1592 ; CHECK-NEXT: kmovq %k0, mem64_dst(%rip)
1593 ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
1595 %1 = alloca i32, align 4
1596 %2 = load i64, i64* @mem64_src, align 8
; "=k" output / "k" input: the i64 is constrained to a mask register.
1597 %3 = call i64 asm "", "=k,k,~{dirflag},~{fpsr},~{flags}"(i64 %2)
1598 store i64 %3, i64* @mem64_dst, align 8
; NOTE(review): %1 is never stored to before this load, so the loaded value is
; undefined; the checks above only expect a movl from the stack slot to %eax.
1599 %4 = load i32, i32* %1, align 4
; Signed less-than compare of two <8 x i64> vectors whose <8 x i1> result is
; sign-extended back to <8 x i64> and stored to %zptr. With
; "min-legal-vector-width"="256" the checks expect the work to be done as two
; 256-bit halves: x < y is emitted as vpcmpgtq with y (from %rsi) in a register
; and x (from %rdi) as the memory operand, and the compare results are stored
; directly with no extra extension instruction.
1603 define dso_local void @cmp_v8i64_sext(<8 x i64>* %xptr, <8 x i64>* %yptr, <8 x i64>* %zptr) "min-legal-vector-width"="256" {
1604 ; CHECK-LABEL: cmp_v8i64_sext:
1606 ; CHECK-NEXT: vmovdqa (%rsi), %ymm0
1607 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1
1608 ; CHECK-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1
1609 ; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
1610 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
1611 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
1612 ; CHECK-NEXT: vzeroupper
1614 %x = load <8 x i64>, <8 x i64>* %xptr
1615 %y = load <8 x i64>, <8 x i64>* %yptr
1616 %cmp = icmp slt <8 x i64> %x, %y
1617 %ext = sext <8 x i1> %cmp to <8 x i64>
1618 store <8 x i64> %ext, <8 x i64>* %zptr
; Same compare as @cmp_v8i64_sext, but the <8 x i1> result is zero-extended to
; <8 x i64> before being stored to %zptr. The checks expect the same two
; 256-bit vpcmpgtq instructions, each followed by vpsrlq $63 so that every
; lane holds 0 or 1 instead of 0 or all-ones.
1622 define dso_local void @cmp_v8i64_zext(<8 x i64>* %xptr, <8 x i64>* %yptr, <8 x i64>* %zptr) "min-legal-vector-width"="256" {
1623 ; CHECK-LABEL: cmp_v8i64_zext:
1625 ; CHECK-NEXT: vmovdqa (%rsi), %ymm0
1626 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1
1627 ; CHECK-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1
1628 ; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
1629 ; CHECK-NEXT: vpsrlq $63, %ymm1, %ymm1
1630 ; CHECK-NEXT: vpsrlq $63, %ymm0, %ymm0
1631 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
1632 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
1633 ; CHECK-NEXT: vzeroupper
1635 %x = load <8 x i64>, <8 x i64>* %xptr
1636 %y = load <8 x i64>, <8 x i64>* %yptr
1637 %cmp = icmp slt <8 x i64> %x, %y
1638 %ext = zext <8 x i1> %cmp to <8 x i64>
1639 store <8 x i64> %ext, <8 x i64>* %zptr
; Per-lane variable rotate of <16 x i8>, written as (a << b) | (a >> (8 - b)).
; The checks expect the bytes to be zero-extended to 16-bit lanes in a ymm
; register (vpmovzxbw), shifted with the variable word shifts vpsllvw and
; vpsrlvw, combined with vpor and truncated back to bytes with vpmovwb.
1643 define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind "min-legal-vector-width"="256" {
1644 ; CHECK-LABEL: var_rotate_v16i8:
1646 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1647 ; CHECK-NEXT: vpsubb %xmm1, %xmm2, %xmm2
1648 ; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1649 ; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1650 ; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm1
1651 ; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
1652 ; CHECK-NEXT: vpsrlvw %ymm2, %ymm0, %ymm0
1653 ; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0
1654 ; CHECK-NEXT: vpmovwb %ymm0, %xmm0
1655 ; CHECK-NEXT: vzeroupper
; %b8 is the complementary shift amount, 8 - %b, for the right-shift half.
1657 %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
1658 %shl = shl <16 x i8> %a, %b
1659 %lshr = lshr <16 x i8> %a, %b8
1660 %or = or <16 x i8> %shl, %lshr
; Per-lane variable rotate of <32 x i8>, written as (a << b) | (a >> (8 - b)).
; The checks expect a three-stage sequence that stays in ymm registers: rotate
; candidates by 4, then 2, then 1 are formed from word shifts (or vpaddb for
; the shift by one) merged by vpternlogq with a constant-pool operand, and
; each stage is selected per byte with vpblendvb. The blend control in %ymm1
; starts as the amounts shifted left by 5 and is doubled with vpaddb between
; stages.
1664 define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-legal-vector-width"="256" {
1665 ; CHECK-LABEL: var_rotate_v32i8:
1667 ; CHECK-NEXT: vpsllw $4, %ymm0, %ymm2
1668 ; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm3
1669 ; CHECK-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
1670 ; CHECK-NEXT: vpsllw $5, %ymm1, %ymm1
1671 ; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
1672 ; CHECK-NEXT: vpsllw $2, %ymm0, %ymm2
1673 ; CHECK-NEXT: vpsrlw $6, %ymm0, %ymm3
1674 ; CHECK-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
1675 ; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
1676 ; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
1677 ; CHECK-NEXT: vpsrlw $7, %ymm0, %ymm2
1678 ; CHECK-NEXT: vpaddb %ymm0, %ymm0, %ymm3
1679 ; CHECK-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
1680 ; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
1681 ; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; %b8 is the complementary shift amount, 8 - %b, for the right-shift half.
1683 %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
1684 %shl = shl <32 x i8> %a, %b
1685 %lshr = lshr <32 x i8> %a, %b8
1686 %or = or <32 x i8> %shl, %lshr
; Rotate of <32 x i8> where every lane uses the same amount: element 0 of %b
; splatted across the vector, written as (a << splat) | (a >> (8 - splat)).
; This function has separate expectations per RUN configuration instead of
; the common CHECK prefix. Both sequences shift whole words by the scalar
; amount and mask the results with byte masks derived by shifting an all-ones
; register; they differ in how the right-shift mask byte is broadcast:
; CHECK-AVX512 expects vpsrlw $8 followed by vpbroadcastb, while CHECK-VBMI
; expects a vpermb with an all-ones index vector.
1690 define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-legal-vector-width"="256" {
1691 ; CHECK-AVX512-LABEL: splatvar_rotate_v32i8:
1692 ; CHECK-AVX512: # %bb.0:
1693 ; CHECK-AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1694 ; CHECK-AVX512-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1695 ; CHECK-AVX512-NEXT: vpsllw %xmm2, %ymm0, %ymm3
1696 ; CHECK-AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1697 ; CHECK-AVX512-NEXT: vpsubb %xmm1, %xmm4, %xmm1
1698 ; CHECK-AVX512-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
1699 ; CHECK-AVX512-NEXT: vpsllw %xmm2, %xmm4, %xmm2
1700 ; CHECK-AVX512-NEXT: vpbroadcastb %xmm2, %ymm2
1701 ; CHECK-AVX512-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1702 ; CHECK-AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm5
1703 ; CHECK-AVX512-NEXT: vpand %ymm2, %ymm3, %ymm2
1704 ; CHECK-AVX512-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
1705 ; CHECK-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0
1706 ; CHECK-AVX512-NEXT: vpbroadcastb %xmm0, %ymm0
1707 ; CHECK-AVX512-NEXT: vpternlogq $236, %ymm5, %ymm2, %ymm0
1708 ; CHECK-AVX512-NEXT: retq
1710 ; CHECK-VBMI-LABEL: splatvar_rotate_v32i8:
1711 ; CHECK-VBMI: # %bb.0:
1712 ; CHECK-VBMI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1713 ; CHECK-VBMI-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1714 ; CHECK-VBMI-NEXT: vpsllw %xmm2, %ymm0, %ymm3
1715 ; CHECK-VBMI-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
1716 ; CHECK-VBMI-NEXT: vpsllw %xmm2, %xmm4, %xmm2
1717 ; CHECK-VBMI-NEXT: vpbroadcastb %xmm2, %ymm2
1718 ; CHECK-VBMI-NEXT: vpand %ymm2, %ymm3, %ymm2
1719 ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1720 ; CHECK-VBMI-NEXT: vpsubb %xmm1, %xmm3, %xmm1
1721 ; CHECK-VBMI-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1722 ; CHECK-VBMI-NEXT: vpsrlw %xmm1, %ymm0, %ymm3
1723 ; CHECK-VBMI-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
1724 ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1725 ; CHECK-VBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
1726 ; CHECK-VBMI-NEXT: vpternlogq $236, %ymm3, %ymm2, %ymm0
1727 ; CHECK-VBMI-NEXT: retq
; An all-zero shuffle mask broadcasts element 0 of %b to every lane.
1728 %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
1729 %splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
1730 %shl = shl <32 x i8> %a, %splat
1731 %lshr = lshr <32 x i8> %a, %splat8
1732 %or = or <32 x i8> %shl, %lshr
; Rotate of <32 x i8> by a different constant amount in each lane: the shl
; amounts run 0..8 and back down to 1 (repeated for the upper 16 lanes) and
; the lshr amounts are chosen so that each lane's pair sums to 8. The checks
; expect the left-shift half to be built from staged byte shifts selected
; with vpblendvb, and the right-shift half to be built by unpacking bytes to
; words, applying vpsllvw with constant-pool amounts, taking the high byte
; (vpsrlw $8) and repacking with vpackuswb, before the halves are or'd.
; NOTE(review): lanes with an amount of 8 shift an i8 by its full width, which
; the LLVM LangRef defines as poison - presumably intentional for this test;
; confirm before changing the constants.
1736 define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
1737 ; CHECK-LABEL: constant_rotate_v32i8:
1739 ; CHECK-NEXT: vpsllw $4, %ymm0, %ymm1
1740 ; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1741 ; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1742 ; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
1743 ; CHECK-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1744 ; CHECK-NEXT: vpsllw $2, %ymm1, %ymm3
1745 ; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
1746 ; CHECK-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1747 ; CHECK-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1748 ; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm3
1749 ; CHECK-NEXT: vpaddb %ymm2, %ymm2, %ymm2
1750 ; CHECK-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1751 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
1752 ; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1753 ; CHECK-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
1754 ; CHECK-NEXT: vpsrlw $8, %ymm3, %ymm3
1755 ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1756 ; CHECK-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1757 ; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0
1758 ; CHECK-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1759 ; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0
1761 %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
1762 %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
1763 %or = or <32 x i8> %shl, %lshr
; Rotate of every i8 lane by the constant 4, written as (a << 4) | (a >> 4).
; The checks expect two 256-bit word shifts by 4 whose results are merged by
; a single vpternlogq with a constant-pool operand.
1767 define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
1768 ; CHECK-LABEL: splatconstant_rotate_v32i8:
1770 ; CHECK-NEXT: vpsllw $4, %ymm0, %ymm1
1771 ; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0
1772 ; CHECK-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1774 %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1775 %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1776 %or = or <32 x i8> %shl, %lshr
; Same rotate-by-4 pattern as @splatconstant_rotate_v32i8, except that each
; half is and-ed with its own splat constant (33 for the shl half, 55 for the
; lshr half) before the two are or'd together. The checks expect the plain
; rotate sequence followed by a single vpand with a constant-pool operand.
1780 define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
1781 ; CHECK-LABEL: splatconstant_rotate_mask_v32i8:
1783 ; CHECK-NEXT: vpsllw $4, %ymm0, %ymm1
1784 ; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0
1785 ; CHECK-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1786 ; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1788 %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1789 %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1790 %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
1791 %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
1792 %or = or <32 x i8> %lmask, %rmask