1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-SKX
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-SKX,CHECK-SKX-VBMI
4 ; Make sure CPUs default to prefer-256-bit. avx512vnni isn't interesting as it just adds an isel peephole for vpmaddwd+vpaddd
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cascadelake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cooperlake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=cannonlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI,CHECK-VBMI1
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-client | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI,CHECK-GFNI
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI,CHECK-GFNI
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI,CHECK-GFNI
13 ; This file primarily contains tests for specific places in X86ISelLowering.cpp that needed to be made aware of the legalizer not allowing 512-bit vectors due to prefer-256-bit even though AVX512 is enabled.
; add256: element-wise add of two <16 x i32> vectors loaded from %a and %b,
; with the sum stored to %c. The function carries
; "min-legal-vector-width"="256", and the expected assembly performs the add
; as two 256-bit (ymm) vpaddd instructions, one per 32-byte half.
15 define dso_local void @add256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="256" {
16 ; CHECK-LABEL: add256:
18 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
19 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
20 ; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1
21 ; CHECK-NEXT: vpaddd (%rsi), %ymm0, %ymm0
22 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
23 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
24 ; CHECK-NEXT: vzeroupper
26 %d = load <16 x i32>, ptr %a
27 %e = load <16 x i32>, ptr %b
28 %f = add <16 x i32> %d, %e
29 store <16 x i32> %f, ptr %c
; add512: same IR body as add256 (load two <16 x i32>, add, store), but the
; function carries "min-legal-vector-width"="512". The expected assembly uses
; a single 512-bit (zmm) vpaddd.
33 define dso_local void @add512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="512" {
34 ; CHECK-LABEL: add512:
36 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
37 ; CHECK-NEXT: vpaddd (%rsi), %zmm0, %zmm0
38 ; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx)
39 ; CHECK-NEXT: vzeroupper
41 %d = load <16 x i32>, ptr %a
42 %e = load <16 x i32>, ptr %b
43 %f = add <16 x i32> %d, %e
44 store <16 x i32> %f, ptr %c
48 define dso_local void @avg_v64i8_256(ptr %a, ptr %b) "min-legal-vector-width"="256" {
49 ; CHECK-LABEL: avg_v64i8_256:
51 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
52 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
53 ; CHECK-NEXT: vpavgb (%rsi), %ymm0, %ymm0
54 ; CHECK-NEXT: vpavgb 32(%rsi), %ymm1, %ymm1
55 ; CHECK-NEXT: vmovdqu %ymm1, (%rax)
56 ; CHECK-NEXT: vmovdqu %ymm0, (%rax)
57 ; CHECK-NEXT: vzeroupper
59 %1 = load <64 x i8>, ptr %a
60 %2 = load <64 x i8>, ptr %b
61 %3 = zext <64 x i8> %1 to <64 x i32>
62 %4 = zext <64 x i8> %2 to <64 x i32>
63 %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
64 %6 = add nuw nsw <64 x i32> %5, %4
65 %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
66 %8 = trunc <64 x i32> %7 to <64 x i8>
67 store <64 x i8> %8, ptr undef, align 4
72 define dso_local void @avg_v64i8_512(ptr %a, ptr %b) "min-legal-vector-width"="512" {
73 ; CHECK-LABEL: avg_v64i8_512:
75 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
76 ; CHECK-NEXT: vpavgb (%rsi), %zmm0, %zmm0
77 ; CHECK-NEXT: vmovdqu64 %zmm0, (%rax)
78 ; CHECK-NEXT: vzeroupper
80 %1 = load <64 x i8>, ptr %a
81 %2 = load <64 x i8>, ptr %b
82 %3 = zext <64 x i8> %1 to <64 x i32>
83 %4 = zext <64 x i8> %2 to <64 x i32>
84 %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
85 %6 = add nuw nsw <64 x i32> %5, %4
86 %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
87 %8 = trunc <64 x i32> %7 to <64 x i8>
88 store <64 x i8> %8, ptr undef, align 4
; pmaddwd_32_256: multiply-and-add-adjacent-pairs pattern. Two <32 x i16>
; inputs are sign-extended to i32 and multiplied; the products are then split
; into two <16 x i32> vectors by lane parity and added, so each result lane is
; the sum of two neighbouring products. With "min-legal-vector-width"="256"
; the expected assembly is two 256-bit (ymm) vpmaddwd instructions.
; Note: despite the value names, %odd selects lanes 0,2,4,... and %even
; selects lanes 1,3,5,...; the add is commutative so the result is unaffected.
92 define dso_local void @pmaddwd_32_256(ptr %APtr, ptr %BPtr, ptr %CPtr) "min-legal-vector-width"="256" {
93 ; CHECK-LABEL: pmaddwd_32_256:
95 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
96 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
97 ; CHECK-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1
98 ; CHECK-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0
99 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
100 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
101 ; CHECK-NEXT: vzeroupper
103 %A = load <32 x i16>, ptr %APtr
104 %B = load <32 x i16>, ptr %BPtr
105 %a = sext <32 x i16> %A to <32 x i32>
106 %b = sext <32 x i16> %B to <32 x i32>
107 %m = mul nsw <32 x i32> %a, %b
108 %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
109 %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
110 %ret = add <16 x i32> %odd, %even
111 store <16 x i32> %ret, ptr %CPtr
; pmaddwd_32_512: same IR as pmaddwd_32_256 (sext i16 -> i32, multiply, add
; neighbouring products), but with "min-legal-vector-width"="512". The
; expected assembly is a single 512-bit (zmm) vpmaddwd.
; Note: despite the value names, %odd selects lanes 0,2,4,... and %even
; selects lanes 1,3,5,...
115 define dso_local void @pmaddwd_32_512(ptr %APtr, ptr %BPtr, ptr %CPtr) "min-legal-vector-width"="512" {
116 ; CHECK-LABEL: pmaddwd_32_512:
118 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
119 ; CHECK-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0
120 ; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx)
121 ; CHECK-NEXT: vzeroupper
123 %A = load <32 x i16>, ptr %APtr
124 %B = load <32 x i16>, ptr %BPtr
125 %a = sext <32 x i16> %A to <32 x i32>
126 %b = sext <32 x i16> %B to <32 x i32>
127 %m = mul nsw <32 x i32> %a, %b
128 %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
129 %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
130 %ret = add <16 x i32> %odd, %even
131 store <16 x i32> %ret, ptr %CPtr
; psubus_64i8_max_256: unsigned saturating subtract expressed as
; max(x, y) - y on <64 x i8>. The icmp ult + select picks y when x < y
; (unsigned) and x otherwise; y is then subtracted from that maximum.
; With "min-legal-vector-width"="256" the expected assembly is two 256-bit
; (ymm) vpsubusb instructions.
135 define dso_local void @psubus_64i8_max_256(ptr %xptr, ptr %yptr, ptr %zptr) "min-legal-vector-width"="256" {
136 ; CHECK-LABEL: psubus_64i8_max_256:
138 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
139 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
140 ; CHECK-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1
141 ; CHECK-NEXT: vpsubusb (%rsi), %ymm0, %ymm0
142 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
143 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
144 ; CHECK-NEXT: vzeroupper
146 %x = load <64 x i8>, ptr %xptr
147 %y = load <64 x i8>, ptr %yptr
148 %cmp = icmp ult <64 x i8> %x, %y
149 %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
150 %res = sub <64 x i8> %max, %y
151 store <64 x i8> %res, ptr %zptr
; psubus_64i8_max_512: same max(x, y) - y IR as psubus_64i8_max_256, but with
; "min-legal-vector-width"="512". The expected assembly is a single 512-bit
; (zmm) vpsubusb.
155 define dso_local void @psubus_64i8_max_512(ptr %xptr, ptr %yptr, ptr %zptr) "min-legal-vector-width"="512" {
156 ; CHECK-LABEL: psubus_64i8_max_512:
158 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
159 ; CHECK-NEXT: vpsubusb (%rsi), %zmm0, %zmm0
160 ; CHECK-NEXT: vmovdqa64 %zmm0, (%rdx)
161 ; CHECK-NEXT: vzeroupper
163 %x = load <64 x i8>, ptr %xptr
164 %y = load <64 x i8>, ptr %yptr
165 %cmp = icmp ult <64 x i8> %x, %y
166 %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
167 %res = sub <64 x i8> %max, %y
168 store <64 x i8> %res, ptr %zptr
172 define dso_local i32 @_Z9test_charPcS_i_256(ptr nocapture readonly, ptr nocapture readonly, i32) "min-legal-vector-width"="256" {
173 ; CHECK-SKX-LABEL: _Z9test_charPcS_i_256:
174 ; CHECK-SKX: # %bb.0: # %entry
175 ; CHECK-SKX-NEXT: movl %edx, %eax
176 ; CHECK-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
177 ; CHECK-SKX-NEXT: xorl %ecx, %ecx
178 ; CHECK-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
179 ; CHECK-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
180 ; CHECK-SKX-NEXT: .p2align 4
181 ; CHECK-SKX-NEXT: .LBB8_1: # %vector.body
182 ; CHECK-SKX-NEXT: # =>This Inner Loop Header: Depth=1
183 ; CHECK-SKX-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
184 ; CHECK-SKX-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
185 ; CHECK-SKX-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
186 ; CHECK-SKX-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
187 ; CHECK-SKX-NEXT: vpaddd %ymm2, %ymm3, %ymm2
188 ; CHECK-SKX-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
189 ; CHECK-SKX-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
190 ; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm3, %ymm1
191 ; CHECK-SKX-NEXT: addq $32, %rcx
192 ; CHECK-SKX-NEXT: cmpq %rcx, %rax
193 ; CHECK-SKX-NEXT: jne .LBB8_1
194 ; CHECK-SKX-NEXT: # %bb.2: # %middle.block
195 ; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm1
196 ; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm2, %ymm0
197 ; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm0
198 ; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
199 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
200 ; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
201 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
202 ; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1
203 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
204 ; CHECK-SKX-NEXT: vmovd %xmm0, %eax
205 ; CHECK-SKX-NEXT: vzeroupper
206 ; CHECK-SKX-NEXT: retq
208 ; CHECK-AVX512-LABEL: _Z9test_charPcS_i_256:
209 ; CHECK-AVX512: # %bb.0: # %entry
210 ; CHECK-AVX512-NEXT: movl %edx, %eax
211 ; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
212 ; CHECK-AVX512-NEXT: xorl %ecx, %ecx
213 ; CHECK-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
214 ; CHECK-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
215 ; CHECK-AVX512-NEXT: .p2align 4
216 ; CHECK-AVX512-NEXT: .LBB8_1: # %vector.body
217 ; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1
218 ; CHECK-AVX512-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
219 ; CHECK-AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
220 ; CHECK-AVX512-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
221 ; CHECK-AVX512-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
222 ; CHECK-AVX512-NEXT: vpaddd %ymm2, %ymm3, %ymm2
223 ; CHECK-AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
224 ; CHECK-AVX512-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
225 ; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm3, %ymm1
226 ; CHECK-AVX512-NEXT: addq $32, %rcx
227 ; CHECK-AVX512-NEXT: cmpq %rcx, %rax
228 ; CHECK-AVX512-NEXT: jne .LBB8_1
229 ; CHECK-AVX512-NEXT: # %bb.2: # %middle.block
230 ; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm1
231 ; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0
232 ; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0
233 ; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
234 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
235 ; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
236 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
237 ; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1
238 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
239 ; CHECK-AVX512-NEXT: vmovd %xmm0, %eax
240 ; CHECK-AVX512-NEXT: vzeroupper
241 ; CHECK-AVX512-NEXT: retq
243 ; CHECK-VBMI-LABEL: _Z9test_charPcS_i_256:
244 ; CHECK-VBMI: # %bb.0: # %entry
245 ; CHECK-VBMI-NEXT: movl %edx, %eax
246 ; CHECK-VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0
247 ; CHECK-VBMI-NEXT: xorl %ecx, %ecx
248 ; CHECK-VBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
249 ; CHECK-VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
250 ; CHECK-VBMI-NEXT: .p2align 4
251 ; CHECK-VBMI-NEXT: .LBB8_1: # %vector.body
252 ; CHECK-VBMI-NEXT: # =>This Inner Loop Header: Depth=1
253 ; CHECK-VBMI-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
254 ; CHECK-VBMI-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
255 ; CHECK-VBMI-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
256 ; CHECK-VBMI-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
257 ; CHECK-VBMI-NEXT: vpaddd %ymm2, %ymm3, %ymm2
258 ; CHECK-VBMI-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
259 ; CHECK-VBMI-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
260 ; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm3, %ymm1
261 ; CHECK-VBMI-NEXT: addq $32, %rcx
262 ; CHECK-VBMI-NEXT: cmpq %rcx, %rax
263 ; CHECK-VBMI-NEXT: jne .LBB8_1
264 ; CHECK-VBMI-NEXT: # %bb.2: # %middle.block
265 ; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm1, %ymm1
266 ; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm2, %ymm0
267 ; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm1, %ymm0
268 ; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
269 ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
270 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
271 ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
272 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
273 ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
274 ; CHECK-VBMI-NEXT: vmovd %xmm0, %eax
275 ; CHECK-VBMI-NEXT: vzeroupper
276 ; CHECK-VBMI-NEXT: retq
278 %3 = zext i32 %2 to i64
279 br label %vector.body
282 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
283 %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
284 %4 = getelementptr inbounds i8, ptr %0, i64 %index
285 %5 = bitcast ptr %4 to ptr
286 %wide.load = load <32 x i8>, ptr %5, align 1
287 %6 = sext <32 x i8> %wide.load to <32 x i32>
288 %7 = getelementptr inbounds i8, ptr %1, i64 %index
289 %8 = bitcast ptr %7 to ptr
290 %wide.load14 = load <32 x i8>, ptr %8, align 1
291 %9 = sext <32 x i8> %wide.load14 to <32 x i32>
292 %10 = mul nsw <32 x i32> %9, %6
293 %11 = add nsw <32 x i32> %10, %vec.phi
294 %index.next = add i64 %index, 32
295 %12 = icmp eq i64 %index.next, %3
296 br i1 %12, label %middle.block, label %vector.body
299 %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
300 %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
301 %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
302 %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
303 %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
304 %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
305 %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
306 %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
307 %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
308 %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
309 %13 = extractelement <32 x i32> %bin.rdx20, i32 0
313 define dso_local i32 @_Z9test_charPcS_i_512(ptr nocapture readonly, ptr nocapture readonly, i32) "min-legal-vector-width"="512" {
314 ; CHECK-SKX-LABEL: _Z9test_charPcS_i_512:
315 ; CHECK-SKX: # %bb.0: # %entry
316 ; CHECK-SKX-NEXT: movl %edx, %eax
317 ; CHECK-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
318 ; CHECK-SKX-NEXT: xorl %ecx, %ecx
319 ; CHECK-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
320 ; CHECK-SKX-NEXT: .p2align 4
321 ; CHECK-SKX-NEXT: .LBB9_1: # %vector.body
322 ; CHECK-SKX-NEXT: # =>This Inner Loop Header: Depth=1
323 ; CHECK-SKX-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2
324 ; CHECK-SKX-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3
325 ; CHECK-SKX-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2
326 ; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm2, %zmm1
327 ; CHECK-SKX-NEXT: addq $32, %rcx
328 ; CHECK-SKX-NEXT: cmpq %rcx, %rax
329 ; CHECK-SKX-NEXT: jne .LBB9_1
330 ; CHECK-SKX-NEXT: # %bb.2: # %middle.block
331 ; CHECK-SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0
332 ; CHECK-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm1
333 ; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
334 ; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
335 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
336 ; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
337 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
338 ; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1
339 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
340 ; CHECK-SKX-NEXT: vmovd %xmm0, %eax
341 ; CHECK-SKX-NEXT: vzeroupper
342 ; CHECK-SKX-NEXT: retq
344 ; CHECK-AVX512-LABEL: _Z9test_charPcS_i_512:
345 ; CHECK-AVX512: # %bb.0: # %entry
346 ; CHECK-AVX512-NEXT: movl %edx, %eax
347 ; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
348 ; CHECK-AVX512-NEXT: xorl %ecx, %ecx
349 ; CHECK-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
350 ; CHECK-AVX512-NEXT: .p2align 4
351 ; CHECK-AVX512-NEXT: .LBB9_1: # %vector.body
352 ; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1
353 ; CHECK-AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2
354 ; CHECK-AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3
355 ; CHECK-AVX512-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2
356 ; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1
357 ; CHECK-AVX512-NEXT: addq $32, %rcx
358 ; CHECK-AVX512-NEXT: cmpq %rcx, %rax
359 ; CHECK-AVX512-NEXT: jne .LBB9_1
360 ; CHECK-AVX512-NEXT: # %bb.2: # %middle.block
361 ; CHECK-AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
362 ; CHECK-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
363 ; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
364 ; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
365 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
366 ; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
367 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
368 ; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1
369 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
370 ; CHECK-AVX512-NEXT: vmovd %xmm0, %eax
371 ; CHECK-AVX512-NEXT: vzeroupper
372 ; CHECK-AVX512-NEXT: retq
374 ; CHECK-VBMI-LABEL: _Z9test_charPcS_i_512:
375 ; CHECK-VBMI: # %bb.0: # %entry
376 ; CHECK-VBMI-NEXT: movl %edx, %eax
377 ; CHECK-VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0
378 ; CHECK-VBMI-NEXT: xorl %ecx, %ecx
379 ; CHECK-VBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
380 ; CHECK-VBMI-NEXT: .p2align 4
381 ; CHECK-VBMI-NEXT: .LBB9_1: # %vector.body
382 ; CHECK-VBMI-NEXT: # =>This Inner Loop Header: Depth=1
383 ; CHECK-VBMI-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2
384 ; CHECK-VBMI-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3
385 ; CHECK-VBMI-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2
386 ; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm2, %zmm1
387 ; CHECK-VBMI-NEXT: addq $32, %rcx
388 ; CHECK-VBMI-NEXT: cmpq %rcx, %rax
389 ; CHECK-VBMI-NEXT: jne .LBB9_1
390 ; CHECK-VBMI-NEXT: # %bb.2: # %middle.block
391 ; CHECK-VBMI-NEXT: vpaddd %zmm0, %zmm1, %zmm0
392 ; CHECK-VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm1
393 ; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm0, %zmm0
394 ; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
395 ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
396 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
397 ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
398 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
399 ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
400 ; CHECK-VBMI-NEXT: vmovd %xmm0, %eax
401 ; CHECK-VBMI-NEXT: vzeroupper
402 ; CHECK-VBMI-NEXT: retq
404 %3 = zext i32 %2 to i64
405 br label %vector.body
408 %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
409 %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
410 %4 = getelementptr inbounds i8, ptr %0, i64 %index
411 %5 = bitcast ptr %4 to ptr
412 %wide.load = load <32 x i8>, ptr %5, align 1
413 %6 = sext <32 x i8> %wide.load to <32 x i32>
414 %7 = getelementptr inbounds i8, ptr %1, i64 %index
415 %8 = bitcast ptr %7 to ptr
416 %wide.load14 = load <32 x i8>, ptr %8, align 1
417 %9 = sext <32 x i8> %wide.load14 to <32 x i32>
418 %10 = mul nsw <32 x i32> %9, %6
419 %11 = add nsw <32 x i32> %10, %vec.phi
420 %index.next = add i64 %index, 32
421 %12 = icmp eq i64 %index.next, %3
422 br i1 %12, label %middle.block, label %vector.body
425 %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
426 %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
427 %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
428 %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
429 %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
430 %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
431 %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
432 %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
433 %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
434 %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
435 %13 = extractelement <32 x i32> %bin.rdx20, i32 0
; Zero-initialized, 16-byte-aligned 1024-byte arrays used as the input
; buffers of the sad_16i8_256 and sad_16i8_512 tests.
439 @a = dso_local global [1024 x i8] zeroinitializer, align 16
440 @b = dso_local global [1024 x i8] zeroinitializer, align 16
; sad_16i8_256: sum-of-absolute-differences reduction loop over the globals
; @a and @b. Each iteration loads <16 x i8> from both arrays, zero-extends to
; i32, subtracts, takes the absolute value (icmp sgt -1 / negate / select) and
; accumulates into a <16 x i32> vector. After the loop the accumulator is
; folded with a shuffle+add tree and lane 0 is extracted.
; Note: the index advances by 4 per iteration while each load reads 16 bytes;
; the expected assembly reflects this with `addq $4`.
; With "min-legal-vector-width"="256" the expected loop uses a 128-bit (xmm)
; vpsadbw feeding a 256-bit (ymm) vpaddd accumulator.
; Three sets of expected output are kept (SKX, AVX512 and VBMI prefixes); the
; VBMI set differs from the other two only in the final reduction step
; (vpshufd with [1,1,1,1] instead of vpsrlq $32).
442 define dso_local i32 @sad_16i8_256() "min-legal-vector-width"="256" {
443 ; CHECK-SKX-LABEL: sad_16i8_256:
444 ; CHECK-SKX: # %bb.0: # %entry
445 ; CHECK-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
446 ; CHECK-SKX-NEXT: movq $-1024, %rax # imm = 0xFC00
447 ; CHECK-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
448 ; CHECK-SKX-NEXT: .p2align 4
449 ; CHECK-SKX-NEXT: .LBB10_1: # %vector.body
450 ; CHECK-SKX-NEXT: # =>This Inner Loop Header: Depth=1
451 ; CHECK-SKX-NEXT: vmovdqu a+1024(%rax), %xmm2
452 ; CHECK-SKX-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
453 ; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm2, %ymm1
454 ; CHECK-SKX-NEXT: addq $4, %rax
455 ; CHECK-SKX-NEXT: jne .LBB10_1
456 ; CHECK-SKX-NEXT: # %bb.2: # %middle.block
457 ; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm0
458 ; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
459 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
460 ; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
461 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
462 ; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1
463 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
464 ; CHECK-SKX-NEXT: vmovd %xmm0, %eax
465 ; CHECK-SKX-NEXT: vzeroupper
466 ; CHECK-SKX-NEXT: retq
468 ; CHECK-AVX512-LABEL: sad_16i8_256:
469 ; CHECK-AVX512: # %bb.0: # %entry
470 ; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
471 ; CHECK-AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00
472 ; CHECK-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
473 ; CHECK-AVX512-NEXT: .p2align 4
474 ; CHECK-AVX512-NEXT: .LBB10_1: # %vector.body
475 ; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1
476 ; CHECK-AVX512-NEXT: vmovdqu a+1024(%rax), %xmm2
477 ; CHECK-AVX512-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
478 ; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm2, %ymm1
479 ; CHECK-AVX512-NEXT: addq $4, %rax
480 ; CHECK-AVX512-NEXT: jne .LBB10_1
481 ; CHECK-AVX512-NEXT: # %bb.2: # %middle.block
482 ; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0
483 ; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
484 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
485 ; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
486 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
487 ; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1
488 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
489 ; CHECK-AVX512-NEXT: vmovd %xmm0, %eax
490 ; CHECK-AVX512-NEXT: vzeroupper
491 ; CHECK-AVX512-NEXT: retq
493 ; CHECK-VBMI-LABEL: sad_16i8_256:
494 ; CHECK-VBMI: # %bb.0: # %entry
495 ; CHECK-VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0
496 ; CHECK-VBMI-NEXT: movq $-1024, %rax # imm = 0xFC00
497 ; CHECK-VBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
498 ; CHECK-VBMI-NEXT: .p2align 4
499 ; CHECK-VBMI-NEXT: .LBB10_1: # %vector.body
500 ; CHECK-VBMI-NEXT: # =>This Inner Loop Header: Depth=1
501 ; CHECK-VBMI-NEXT: vmovdqu a+1024(%rax), %xmm2
502 ; CHECK-VBMI-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
503 ; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm2, %ymm1
504 ; CHECK-VBMI-NEXT: addq $4, %rax
505 ; CHECK-VBMI-NEXT: jne .LBB10_1
506 ; CHECK-VBMI-NEXT: # %bb.2: # %middle.block
507 ; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm1, %ymm0
508 ; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
509 ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
510 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
511 ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
512 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
513 ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
514 ; CHECK-VBMI-NEXT: vmovd %xmm0, %eax
515 ; CHECK-VBMI-NEXT: vzeroupper
516 ; CHECK-VBMI-NEXT: retq
518 br label %vector.body
521 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
522 %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
523 %0 = getelementptr inbounds [1024 x i8], ptr @a, i64 0, i64 %index
524 %1 = bitcast ptr %0 to ptr
525 %wide.load = load <16 x i8>, ptr %1, align 4
526 %2 = zext <16 x i8> %wide.load to <16 x i32>
527 %3 = getelementptr inbounds [1024 x i8], ptr @b, i64 0, i64 %index
528 %4 = bitcast ptr %3 to ptr
529 %wide.load1 = load <16 x i8>, ptr %4, align 4
530 %5 = zext <16 x i8> %wide.load1 to <16 x i32>
531 %6 = sub nsw <16 x i32> %2, %5
; Absolute value of the difference: keep %6 where it is non-negative,
; otherwise use its negation.
532 %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
533 %8 = sub nsw <16 x i32> zeroinitializer, %6
534 %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
535 %10 = add nsw <16 x i32> %9, %vec.phi
536 %index.next = add i64 %index, 4
537 %11 = icmp eq i64 %index.next, 1024
538 br i1 %11, label %middle.block, label %vector.body
; Horizontal reduction: each step adds the upper half of the still-live lanes
; onto the lower half (8, 4, 2, then 1 lane), leaving the total in lane 0.
541 %rdx.shuf = shufflevector <16 x i32> %10, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
542 %bin.rdx = add <16 x i32> %10, %rdx.shuf
543 %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
544 %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
545 %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
546 %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
547 %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
548 %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
549 %12 = extractelement <16 x i32> %bin.rdx4, i32 0
; sad_16i8_512: sum-of-absolute-differences reduction compiled with
; "min-legal-vector-width"="512".
; Each loop iteration loads <16 x i8> from @a and @b (globals declared
; outside this excerpt), zero-extends both to <16 x i32>, forms |a - b| with
; an icmp/sub/select sequence and accumulates into a <16 x i32> phi. The
; middle block folds the accumulator with a shuffle+add ladder and extracts
; lane 0.
; The expected assembly uses vpsadbw on xmm inputs with a zmm vpaddd
; accumulator. The three prefix groups differ only in the last reduction
; step: vpsrlq $32 in the first two groups, vpshufd [1,1,1,1] in the third.
553 define dso_local i32 @sad_16i8_512() "min-legal-vector-width"="512" {
554 ; CHECK-SKX-LABEL: sad_16i8_512:
555 ; CHECK-SKX: # %bb.0: # %entry
556 ; CHECK-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
557 ; CHECK-SKX-NEXT: movq $-1024, %rax # imm = 0xFC00
558 ; CHECK-SKX-NEXT: .p2align 4
559 ; CHECK-SKX-NEXT: .LBB11_1: # %vector.body
560 ; CHECK-SKX-NEXT: # =>This Inner Loop Header: Depth=1
561 ; CHECK-SKX-NEXT: vmovdqu a+1024(%rax), %xmm1
562 ; CHECK-SKX-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
563 ; CHECK-SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0
564 ; CHECK-SKX-NEXT: addq $4, %rax
565 ; CHECK-SKX-NEXT: jne .LBB11_1
566 ; CHECK-SKX-NEXT: # %bb.2: # %middle.block
567 ; CHECK-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm1
568 ; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
569 ; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
570 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
571 ; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
572 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
573 ; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1
574 ; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
575 ; CHECK-SKX-NEXT: vmovd %xmm0, %eax
576 ; CHECK-SKX-NEXT: vzeroupper
577 ; CHECK-SKX-NEXT: retq
579 ; CHECK-AVX512-LABEL: sad_16i8_512:
580 ; CHECK-AVX512: # %bb.0: # %entry
581 ; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
582 ; CHECK-AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00
583 ; CHECK-AVX512-NEXT: .p2align 4
584 ; CHECK-AVX512-NEXT: .LBB11_1: # %vector.body
585 ; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1
586 ; CHECK-AVX512-NEXT: vmovdqu a+1024(%rax), %xmm1
587 ; CHECK-AVX512-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
588 ; CHECK-AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
589 ; CHECK-AVX512-NEXT: addq $4, %rax
590 ; CHECK-AVX512-NEXT: jne .LBB11_1
591 ; CHECK-AVX512-NEXT: # %bb.2: # %middle.block
592 ; CHECK-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
593 ; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
594 ; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
595 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
596 ; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
597 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
598 ; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1
599 ; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
600 ; CHECK-AVX512-NEXT: vmovd %xmm0, %eax
601 ; CHECK-AVX512-NEXT: vzeroupper
602 ; CHECK-AVX512-NEXT: retq
604 ; CHECK-VBMI-LABEL: sad_16i8_512:
605 ; CHECK-VBMI: # %bb.0: # %entry
606 ; CHECK-VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0
607 ; CHECK-VBMI-NEXT: movq $-1024, %rax # imm = 0xFC00
608 ; CHECK-VBMI-NEXT: .p2align 4
609 ; CHECK-VBMI-NEXT: .LBB11_1: # %vector.body
610 ; CHECK-VBMI-NEXT: # =>This Inner Loop Header: Depth=1
611 ; CHECK-VBMI-NEXT: vmovdqu a+1024(%rax), %xmm1
612 ; CHECK-VBMI-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
613 ; CHECK-VBMI-NEXT: vpaddd %zmm0, %zmm1, %zmm0
614 ; CHECK-VBMI-NEXT: addq $4, %rax
615 ; CHECK-VBMI-NEXT: jne .LBB11_1
616 ; CHECK-VBMI-NEXT: # %bb.2: # %middle.block
617 ; CHECK-VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm1
618 ; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm0, %zmm0
619 ; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
620 ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
621 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
622 ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
623 ; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
624 ; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
625 ; CHECK-VBMI-NEXT: vmovd %xmm0, %eax
626 ; CHECK-VBMI-NEXT: vzeroupper
627 ; CHECK-VBMI-NEXT: retq
629 br label %vector.body
632 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
633 %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
634 %0 = getelementptr inbounds [1024 x i8], ptr @a, i64 0, i64 %index
; NOTE(review): the ptr-to-ptr bitcasts below are no-ops; presumably left
; over from a typed-pointer version of this test.
635 %1 = bitcast ptr %0 to ptr
636 %wide.load = load <16 x i8>, ptr %1, align 4
637 %2 = zext <16 x i8> %wide.load to <16 x i32>
638 %3 = getelementptr inbounds [1024 x i8], ptr @b, i64 0, i64 %index
639 %4 = bitcast ptr %3 to ptr
640 %wide.load1 = load <16 x i8>, ptr %4, align 4
641 %5 = zext <16 x i8> %wide.load1 to <16 x i32>
; Absolute difference: keep %6 when it is non-negative, otherwise negate it.
642 %6 = sub nsw <16 x i32> %2, %5
643 %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
644 %8 = sub nsw <16 x i32> zeroinitializer, %6
645 %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
646 %10 = add nsw <16 x i32> %9, %vec.phi
; The index advances by 4 although each load covers 16 bytes; the expected
; assembly above mirrors this with addq $4, so changing the step requires
; regenerating the assertions.
647 %index.next = add i64 %index, 4
648 %11 = icmp eq i64 %index.next, 1024
649 br i1 %11, label %middle.block, label %vector.body
; Horizontal reduction: add the upper 8, 4, 2 and 1 lanes onto the lower ones.
652 %rdx.shuf = shufflevector <16 x i32> %10, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
653 %bin.rdx = add <16 x i32> %10, %rdx.shuf
654 %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
655 %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
656 %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
657 %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
658 %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
659 %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
660 %12 = extractelement <16 x i32> %bin.rdx4, i32 0
; sbto16f32_256: signed conversion of a <16 x i1> compare mask to
; <16 x float> with "min-legal-vector-width"="256". The expected assembly
; splits the mask with kshiftrw $8 and converts two ymm halves separately.
664 define dso_local void @sbto16f32_256(<16 x i16> %a, ptr %res) "min-legal-vector-width"="256" {
665 ; CHECK-LABEL: sbto16f32_256:
667 ; CHECK-NEXT: vpmovw2m %ymm0, %k0
668 ; CHECK-NEXT: kshiftrw $8, %k0, %k1
669 ; CHECK-NEXT: vpmovm2d %k1, %ymm0
670 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
671 ; CHECK-NEXT: vpmovm2d %k0, %ymm1
672 ; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
673 ; CHECK-NEXT: vmovaps %ymm1, (%rdi)
674 ; CHECK-NEXT: vmovaps %ymm0, 32(%rdi)
675 ; CHECK-NEXT: vzeroupper
; The mask is the per-lane sign test of %a (slt 0).
677 %mask = icmp slt <16 x i16> %a, zeroinitializer
678 %1 = sitofp <16 x i1> %mask to <16 x float>
679 store <16 x float> %1, ptr %res
; sbto16f32_512: same IR as the 256-bit variant but with
; "min-legal-vector-width"="512". The expected assembly performs a single
; zmm-wide vpmovm2d + vcvtdq2ps with no mask split.
683 define dso_local void @sbto16f32_512(<16 x i16> %a, ptr %res) "min-legal-vector-width"="512" {
684 ; CHECK-LABEL: sbto16f32_512:
686 ; CHECK-NEXT: vpmovw2m %ymm0, %k0
687 ; CHECK-NEXT: vpmovm2d %k0, %zmm0
688 ; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0
689 ; CHECK-NEXT: vmovaps %zmm0, (%rdi)
690 ; CHECK-NEXT: vzeroupper
692 %mask = icmp slt <16 x i16> %a, zeroinitializer
693 %1 = sitofp <16 x i1> %mask to <16 x float>
694 store <16 x float> %1, ptr %res
; sbto16f64_256: signed conversion of a <16 x i1> compare mask to
; <16 x double> with "min-legal-vector-width"="256". The expected assembly
; produces four ymm results (vcvtdq2pd from xmm sources) stored at offsets
; 0, 32, 64 and 96.
698 define dso_local void @sbto16f64_256(<16 x i16> %a, ptr %res) "min-legal-vector-width"="256" {
699 ; CHECK-LABEL: sbto16f64_256:
701 ; CHECK-NEXT: vpmovw2m %ymm0, %k0
702 ; CHECK-NEXT: kshiftrw $8, %k0, %k1
703 ; CHECK-NEXT: vpmovm2d %k1, %ymm0
704 ; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1
705 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
706 ; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
707 ; CHECK-NEXT: vpmovm2d %k0, %ymm2
708 ; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3
709 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
710 ; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2
711 ; CHECK-NEXT: vmovaps %ymm2, 32(%rdi)
712 ; CHECK-NEXT: vmovaps %ymm3, (%rdi)
713 ; CHECK-NEXT: vmovaps %ymm0, 96(%rdi)
714 ; CHECK-NEXT: vmovaps %ymm1, 64(%rdi)
715 ; CHECK-NEXT: vzeroupper
717 %mask = icmp slt <16 x i16> %a, zeroinitializer
718 %1 = sitofp <16 x i1> %mask to <16 x double>
719 store <16 x double> %1, ptr %res
; sbto16f64_512: same IR as the 256-bit variant but with
; "min-legal-vector-width"="512". The expected assembly converts two ymm
; halves into two zmm results stored at offsets 0 and 64.
723 define dso_local void @sbto16f64_512(<16 x i16> %a, ptr %res) "min-legal-vector-width"="512" {
724 ; CHECK-LABEL: sbto16f64_512:
726 ; CHECK-NEXT: vpmovw2m %ymm0, %k0
727 ; CHECK-NEXT: vpmovm2d %k0, %zmm0
728 ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1
729 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
730 ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0
731 ; CHECK-NEXT: vmovaps %zmm0, 64(%rdi)
732 ; CHECK-NEXT: vmovaps %zmm1, (%rdi)
733 ; CHECK-NEXT: vzeroupper
735 %mask = icmp slt <16 x i16> %a, zeroinitializer
736 %1 = sitofp <16 x i1> %mask to <16 x double>
737 store <16 x double> %1, ptr %res
; ubto16f32_256: unsigned conversion of a <16 x i1> compare mask to
; <16 x float> with "min-legal-vector-width"="256". Compared with the signed
; variant, the expected assembly adds a vpsrld $31 on each ymm half before
; the vcvtdq2ps.
741 define dso_local void @ubto16f32_256(<16 x i16> %a, ptr %res) "min-legal-vector-width"="256" {
742 ; CHECK-LABEL: ubto16f32_256:
744 ; CHECK-NEXT: vpmovw2m %ymm0, %k0
745 ; CHECK-NEXT: kshiftrw $8, %k0, %k1
746 ; CHECK-NEXT: vpmovm2d %k1, %ymm0
747 ; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0
748 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
749 ; CHECK-NEXT: vpmovm2d %k0, %ymm1
750 ; CHECK-NEXT: vpsrld $31, %ymm1, %ymm1
751 ; CHECK-NEXT: vcvtdq2ps %ymm1, %ymm1
752 ; CHECK-NEXT: vmovaps %ymm1, (%rdi)
753 ; CHECK-NEXT: vmovaps %ymm0, 32(%rdi)
754 ; CHECK-NEXT: vzeroupper
756 %mask = icmp slt <16 x i16> %a, zeroinitializer
757 %1 = uitofp <16 x i1> %mask to <16 x float>
758 store <16 x float> %1, ptr %res
; ubto16f32_512: same IR as the 256-bit variant but with
; "min-legal-vector-width"="512". The expected assembly does the
; vpmovm2d / vpsrld $31 / vcvtdq2ps sequence once on a zmm register.
762 define dso_local void @ubto16f32_512(<16 x i16> %a, ptr %res) "min-legal-vector-width"="512" {
763 ; CHECK-LABEL: ubto16f32_512:
765 ; CHECK-NEXT: vpmovw2m %ymm0, %k0
766 ; CHECK-NEXT: vpmovm2d %k0, %zmm0
767 ; CHECK-NEXT: vpsrld $31, %zmm0, %zmm0
768 ; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0
769 ; CHECK-NEXT: vmovaps %zmm0, (%rdi)
770 ; CHECK-NEXT: vzeroupper
772 %mask = icmp slt <16 x i16> %a, zeroinitializer
773 %1 = uitofp <16 x i1> %mask to <16 x float>
774 store <16 x float> %1, ptr %res
; ubto16f64_256: unsigned conversion of a <16 x i1> compare mask to
; <16 x double> with "min-legal-vector-width"="256". The expected assembly
; applies vpsrld $31 to each ymm mask half, then produces four ymm results
; stored at offsets 0, 32, 64 and 96.
778 define dso_local void @ubto16f64_256(<16 x i16> %a, ptr %res) "min-legal-vector-width"="256" {
779 ; CHECK-LABEL: ubto16f64_256:
781 ; CHECK-NEXT: vpmovw2m %ymm0, %k0
782 ; CHECK-NEXT: kshiftrw $8, %k0, %k1
783 ; CHECK-NEXT: vpmovm2d %k1, %ymm0
784 ; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0
785 ; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1
786 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
787 ; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
788 ; CHECK-NEXT: vpmovm2d %k0, %ymm2
789 ; CHECK-NEXT: vpsrld $31, %ymm2, %ymm2
790 ; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm3
791 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
792 ; CHECK-NEXT: vcvtdq2pd %xmm2, %ymm2
793 ; CHECK-NEXT: vmovaps %ymm2, 32(%rdi)
794 ; CHECK-NEXT: vmovaps %ymm3, (%rdi)
795 ; CHECK-NEXT: vmovaps %ymm0, 96(%rdi)
796 ; CHECK-NEXT: vmovaps %ymm1, 64(%rdi)
797 ; CHECK-NEXT: vzeroupper
799 %mask = icmp slt <16 x i16> %a, zeroinitializer
800 %1 = uitofp <16 x i1> %mask to <16 x double>
801 store <16 x double> %1, ptr %res
; ubto16f64_512: same IR as the 256-bit variant but with
; "min-legal-vector-width"="512". The expected assembly applies vpsrld $31
; once on a zmm register and emits two zmm vcvtdq2pd results stored at
; offsets 0 and 64.
805 define dso_local void @ubto16f64_512(<16 x i16> %a, ptr %res) "min-legal-vector-width"="512" {
806 ; CHECK-LABEL: ubto16f64_512:
808 ; CHECK-NEXT: vpmovw2m %ymm0, %k0
809 ; CHECK-NEXT: vpmovm2d %k0, %zmm0
810 ; CHECK-NEXT: vpsrld $31, %zmm0, %zmm0
811 ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1
812 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
813 ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0
814 ; CHECK-NEXT: vmovaps %zmm0, 64(%rdi)
815 ; CHECK-NEXT: vmovaps %zmm1, (%rdi)
816 ; CHECK-NEXT: vzeroupper
818 %mask = icmp slt <16 x i16> %a, zeroinitializer
819 %1 = uitofp <16 x i1> %mask to <16 x double>
820 store <16 x double> %1, ptr %res
; test_16f32toub_256: fptoui of <16 x float> to <16 x i1>, used as the
; condition of a select between %passthru and zero, with
; "min-legal-vector-width"="256". The expected assembly converts two ymm
; halves (vcvttps2dq, vpslld $31, vpmovd2m), joins the two masks with
; kunpckbw and applies a zero-masked vmovdqu16.
824 define <16 x i16> @test_16f32toub_256(ptr %ptr, <16 x i16> %passthru) "min-legal-vector-width"="256" {
825 ; CHECK-LABEL: test_16f32toub_256:
827 ; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1
828 ; CHECK-NEXT: vpslld $31, %ymm1, %ymm1
829 ; CHECK-NEXT: vpmovd2m %ymm1, %k0
830 ; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm1
831 ; CHECK-NEXT: vpslld $31, %ymm1, %ymm1
832 ; CHECK-NEXT: vpmovd2m %ymm1, %k1
833 ; CHECK-NEXT: kunpckbw %k0, %k1, %k1
834 ; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
836 %a = load <16 x float>, ptr %ptr
837 %mask = fptoui <16 x float> %a to <16 x i1>
838 %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
839 ret <16 x i16> %select
; test_16f32toub_512: same IR as the 256-bit variant but with
; "min-legal-vector-width"="512". The expected assembly builds the mask from
; a single zmm conversion, so no kunpckbw is needed.
842 define <16 x i16> @test_16f32toub_512(ptr %ptr, <16 x i16> %passthru) "min-legal-vector-width"="512" {
843 ; CHECK-LABEL: test_16f32toub_512:
845 ; CHECK-NEXT: vcvttps2dq (%rdi), %zmm1
846 ; CHECK-NEXT: vpslld $31, %zmm1, %zmm1
847 ; CHECK-NEXT: vpmovd2m %zmm1, %k1
848 ; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
850 %a = load <16 x float>, ptr %ptr
851 %mask = fptoui <16 x float> %a to <16 x i1>
852 %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
853 ret <16 x i16> %select
; test_16f32tosb_256: fptosi of <16 x float> to <16 x i1>, used as the
; condition of a select between %passthru and zero, with
; "min-legal-vector-width"="256". Unlike the unsigned variant, the expected
; assembly has no vpslld $31 before vpmovd2m; the two ymm-derived masks are
; still joined with kunpckbw.
856 define <16 x i16> @test_16f32tosb_256(ptr %ptr, <16 x i16> %passthru) "min-legal-vector-width"="256" {
857 ; CHECK-LABEL: test_16f32tosb_256:
859 ; CHECK-NEXT: vcvttps2dq (%rdi), %ymm1
860 ; CHECK-NEXT: vpmovd2m %ymm1, %k0
861 ; CHECK-NEXT: vcvttps2dq 32(%rdi), %ymm1
862 ; CHECK-NEXT: vpmovd2m %ymm1, %k1
863 ; CHECK-NEXT: kunpckbw %k0, %k1, %k1
864 ; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
866 %a = load <16 x float>, ptr %ptr
867 %mask = fptosi <16 x float> %a to <16 x i1>
868 %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
869 ret <16 x i16> %select
; test_16f32tosb_512: same IR as the 256-bit variant but with
; "min-legal-vector-width"="512". The expected assembly uses one zmm
; vcvttps2dq + vpmovd2m to form the whole 16-lane mask.
872 define <16 x i16> @test_16f32tosb_512(ptr %ptr, <16 x i16> %passthru) "min-legal-vector-width"="512" {
873 ; CHECK-LABEL: test_16f32tosb_512:
875 ; CHECK-NEXT: vcvttps2dq (%rdi), %zmm1
876 ; CHECK-NEXT: vpmovd2m %zmm1, %k1
877 ; CHECK-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
879 %a = load <16 x float>, ptr %ptr
880 %mask = fptosi <16 x float> %a to <16 x i1>
881 %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
882 ret <16 x i16> %select
; mul256: <64 x i8> multiply (load from %a and %b, store to %c) with
; "min-legal-vector-width"="256". All expected assembly works on two ymm
; halves. The first and third prefix groups are identical and recombine the
; two vpmaddubsw products with vpermt2b; the middle group recombines them
; with vpsllw $8 + vpternlogq instead.
885 define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="256" {
886 ; CHECK-SKX-VBMI-LABEL: mul256:
887 ; CHECK-SKX-VBMI: # %bb.0:
888 ; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm0
889 ; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
890 ; CHECK-SKX-VBMI-NEXT: vmovdqa (%rsi), %ymm2
891 ; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
892 ; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
893 ; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
894 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
895 ; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3
896 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
897 ; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62]
898 ; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1
899 ; CHECK-SKX-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5
900 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
901 ; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2
902 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
903 ; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0
904 ; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx)
905 ; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
906 ; CHECK-SKX-VBMI-NEXT: vzeroupper
907 ; CHECK-SKX-VBMI-NEXT: retq
909 ; CHECK-AVX512-LABEL: mul256:
910 ; CHECK-AVX512: # %bb.0:
911 ; CHECK-AVX512-NEXT: vmovdqa (%rdi), %ymm0
912 ; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
913 ; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2
914 ; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3
915 ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
916 ; CHECK-AVX512-NEXT: vpand %ymm4, %ymm3, %ymm5
917 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
918 ; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3
919 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
920 ; CHECK-AVX512-NEXT: vpsllw $8, %ymm1, %ymm1
921 ; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4)
922 ; CHECK-AVX512-NEXT: vpand %ymm4, %ymm2, %ymm3
923 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
924 ; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm4, %ymm2
925 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
926 ; CHECK-AVX512-NEXT: vpsllw $8, %ymm0, %ymm0
927 ; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm4)
928 ; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx)
929 ; CHECK-AVX512-NEXT: vmovdqa %ymm1, 32(%rdx)
930 ; CHECK-AVX512-NEXT: vzeroupper
931 ; CHECK-AVX512-NEXT: retq
933 ; CHECK-VBMI-LABEL: mul256:
934 ; CHECK-VBMI: # %bb.0:
935 ; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm0
936 ; CHECK-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
937 ; CHECK-VBMI-NEXT: vmovdqa (%rsi), %ymm2
938 ; CHECK-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
939 ; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
940 ; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
941 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
942 ; CHECK-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3
943 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
944 ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62]
945 ; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1
946 ; CHECK-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5
947 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
948 ; CHECK-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2
949 ; CHECK-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
950 ; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0
951 ; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx)
952 ; CHECK-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
953 ; CHECK-VBMI-NEXT: vzeroupper
954 ; CHECK-VBMI-NEXT: retq
955 %d = load <64 x i8>, ptr %a
956 %e = load <64 x i8>, ptr %b
957 %f = mul <64 x i8> %d, %e
958 store <64 x i8> %f, ptr %c
; mul512: same <64 x i8> multiply as mul256 but with
; "min-legal-vector-width"="512". All expected assembly works on a single
; zmm value. The first and third prefix groups are identical and recombine
; the two vpmaddubsw products with vpermi2b; the middle group uses
; vpsllw $8 + vpternlogq instead.
962 define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="512" {
963 ; CHECK-SKX-VBMI-LABEL: mul512:
964 ; CHECK-SKX-VBMI: # %bb.0:
965 ; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
966 ; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
967 ; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
968 ; CHECK-SKX-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
969 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
970 ; CHECK-SKX-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1
971 ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
972 ; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126]
973 ; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1
974 ; CHECK-SKX-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx)
975 ; CHECK-SKX-VBMI-NEXT: vzeroupper
976 ; CHECK-SKX-VBMI-NEXT: retq
978 ; CHECK-AVX512-LABEL: mul512:
979 ; CHECK-AVX512: # %bb.0:
980 ; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
981 ; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
982 ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
983 ; CHECK-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm3
984 ; CHECK-AVX512-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
985 ; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm2, %zmm1
986 ; CHECK-AVX512-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
987 ; CHECK-AVX512-NEXT: vpsllw $8, %zmm0, %zmm0
988 ; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2)
989 ; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
990 ; CHECK-AVX512-NEXT: vzeroupper
991 ; CHECK-AVX512-NEXT: retq
993 ; CHECK-VBMI-LABEL: mul512:
994 ; CHECK-VBMI: # %bb.0:
995 ; CHECK-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
996 ; CHECK-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
997 ; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
998 ; CHECK-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
999 ; CHECK-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
1000 ; CHECK-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1
1001 ; CHECK-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
1002 ; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126]
1003 ; CHECK-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1
1004 ; CHECK-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx)
1005 ; CHECK-VBMI-NEXT: vzeroupper
1006 ; CHECK-VBMI-NEXT: retq
1007 %d = load <64 x i8>, ptr %a
1008 %e = load <64 x i8>, ptr %b
1009 %f = mul <64 x i8> %d, %e
1010 store <64 x i8> %f, ptr %c
; mload_v4i32: masked load of <4 x i32> (alignment 4, pass-through %dst)
; whose mask is "%trigger == 0", with "min-legal-vector-width"="256". The
; expected assembly is a vptestnmd that forms the mask followed by a masked
; vpblendmd from memory.
1014 ; This threw an assertion at one point.
1015 define <4 x i32> @mload_v4i32(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) "min-legal-vector-width"="256" {
1016 ; CHECK-LABEL: mload_v4i32:
1018 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1019 ; CHECK-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
1021 %mask = icmp eq <4 x i32> %trigger, zeroinitializer
1022 %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
; Declaration of the masked-load intrinsic called above.
1025 declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
; trunc_v16i64_v16i32: truncate a loaded <16 x i64> to <16 x i32> with
; "min-legal-vector-width"="256". The expected assembly truncates four ymm
; pieces with vpmovqd and pairs them with vinserti128 into ymm0 and ymm1.
1027 define <16 x i32> @trunc_v16i64_v16i32(ptr %x) nounwind "min-legal-vector-width"="256" {
1028 ; CHECK-LABEL: trunc_v16i64_v16i32:
1030 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
1031 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
1032 ; CHECK-NEXT: vmovdqa 64(%rdi), %ymm2
1033 ; CHECK-NEXT: vmovdqa 96(%rdi), %ymm3
1034 ; CHECK-NEXT: vpmovqd %ymm0, %xmm0
1035 ; CHECK-NEXT: vpmovqd %ymm1, %xmm1
1036 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1037 ; CHECK-NEXT: vpmovqd %ymm2, %xmm1
1038 ; CHECK-NEXT: vpmovqd %ymm3, %xmm2
1039 ; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
1041 %a = load <16 x i64>, ptr %x
1042 %b = trunc <16 x i64> %a to <16 x i32>
; trunc_v16i64_v16i8: truncate a loaded <16 x i64> to <16 x i8> with
; "min-legal-vector-width"="256". The expected assembly truncates four ymm
; pieces with vpmovqb and merges them with vpunpckldq / vpunpcklqdq.
1046 define <16 x i8> @trunc_v16i64_v16i8(ptr %x) nounwind "min-legal-vector-width"="256" {
1047 ; CHECK-LABEL: trunc_v16i64_v16i8:
1049 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
1050 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
1051 ; CHECK-NEXT: vmovdqa 64(%rdi), %ymm2
1052 ; CHECK-NEXT: vmovdqa 96(%rdi), %ymm3
1053 ; CHECK-NEXT: vpmovqb %ymm3, %xmm3
1054 ; CHECK-NEXT: vpmovqb %ymm2, %xmm2
1055 ; CHECK-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1056 ; CHECK-NEXT: vpmovqb %ymm1, %xmm1
1057 ; CHECK-NEXT: vpmovqb %ymm0, %xmm0
1058 ; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1059 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
1060 ; CHECK-NEXT: vzeroupper
1062 %a = load <16 x i64>, ptr %x
1063 %b = trunc <16 x i64> %a to <16 x i8>
; trunc_v16i32_v16i8: truncate a loaded <16 x i32> to <16 x i8> with
; "min-legal-vector-width"="256". The expected assembly truncates two ymm
; halves with vpmovdb and joins them with vpunpcklqdq.
1067 define <16 x i8> @trunc_v16i32_v16i8(ptr %x) nounwind "min-legal-vector-width"="256" {
1068 ; CHECK-LABEL: trunc_v16i32_v16i8:
1070 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
1071 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
1072 ; CHECK-NEXT: vpmovdb %ymm1, %xmm1
1073 ; CHECK-NEXT: vpmovdb %ymm0, %xmm0
1074 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1075 ; CHECK-NEXT: vzeroupper
1077 %a = load <16 x i32>, ptr %x
1078 %b = trunc <16 x i32> %a to <16 x i8>
; trunc_v8i64_v8i8: truncate a loaded <8 x i64> to <8 x i8> with
; "min-legal-vector-width"="256". The expected assembly truncates two ymm
; halves with vpmovqb and joins them with vpunpckldq.
1082 define <8 x i8> @trunc_v8i64_v8i8(ptr %x) nounwind "min-legal-vector-width"="256" {
1083 ; CHECK-LABEL: trunc_v8i64_v8i8:
1085 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
1086 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
1087 ; CHECK-NEXT: vpmovqb %ymm1, %xmm1
1088 ; CHECK-NEXT: vpmovqb %ymm0, %xmm0
1089 ; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1090 ; CHECK-NEXT: vzeroupper
1092 %a = load <8 x i64>, ptr %x
1093 %b = trunc <8 x i64> %a to <8 x i8>
; trunc_v8i64_v8i16: truncate a loaded <8 x i64> to <8 x i16> with
; "min-legal-vector-width"="256". The expected assembly truncates two ymm
; halves with vpmovqw and joins them with vpunpcklqdq.
1097 define <8 x i16> @trunc_v8i64_v8i16(ptr %x) nounwind "min-legal-vector-width"="256" {
1098 ; CHECK-LABEL: trunc_v8i64_v8i16:
1100 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
1101 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
1102 ; CHECK-NEXT: vpmovqw %ymm1, %xmm1
1103 ; CHECK-NEXT: vpmovqw %ymm0, %xmm0
1104 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1105 ; CHECK-NEXT: vzeroupper
1107 %a = load <8 x i64>, ptr %x
1108 %b = trunc <8 x i64> %a to <8 x i32>
; trunc_v8i64_v8i32_zeroes: logical shift right by 48 followed by a
; truncate to <8 x i32>, with "min-legal-vector-width"="256". The shift
; leaves the upper bits zero, and the expected assembly packs the two ymm
; halves with an unsigned-saturating vpackusdw followed by a vpermq.
1112 define <8 x i32> @trunc_v8i64_v8i32_zeroes(ptr %x) nounwind "min-legal-vector-width"="256" {
1113 ; CHECK-LABEL: trunc_v8i64_v8i32_zeroes:
1115 ; CHECK-NEXT: vpsrlq $48, 32(%rdi), %ymm0
1116 ; CHECK-NEXT: vpsrlq $48, (%rdi), %ymm1
1117 ; CHECK-NEXT: vpackusdw %ymm0, %ymm1, %ymm0
1118 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1120 %a = load <8 x i64>, ptr %x
1121 %b = lshr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
1122 %c = trunc <8 x i64> %b to <8 x i32>
; trunc_v16i32_v16i16_zeroes: logical shift right by 16 followed by a
; truncate to <16 x i16>, with "min-legal-vector-width"="256". The expected
; assembly selects the odd-indexed 16-bit words of both ymm halves with a
; single vpermi2w.
1126 define <16 x i16> @trunc_v16i32_v16i16_zeroes(ptr %x) nounwind "min-legal-vector-width"="256" {
1127 ; CHECK-LABEL: trunc_v16i32_v16i16_zeroes:
1129 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1130 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
1131 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
1133 %a = load <16 x i32>, ptr %x
1134 %b = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1135 %c = trunc <16 x i32> %b to <16 x i16>
; trunc_v32i16_v32i8_zeroes: logical shift right by 8 followed by a
; truncate to <32 x i8>, with "min-legal-vector-width"="256". The first and
; third prefix groups expect a single vpermi2b selecting the odd-indexed
; bytes; the middle group expects vpsrlw $8 + vpackuswb + vpermq.
1139 define <32 x i8> @trunc_v32i16_v32i8_zeroes(ptr %x) nounwind "min-legal-vector-width"="256" {
1140 ; CHECK-SKX-VBMI-LABEL: trunc_v32i16_v32i8_zeroes:
1141 ; CHECK-SKX-VBMI: # %bb.0:
1142 ; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm1
1143 ; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
1144 ; CHECK-SKX-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
1145 ; CHECK-SKX-VBMI-NEXT: retq
1147 ; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_zeroes:
1148 ; CHECK-AVX512: # %bb.0:
1149 ; CHECK-AVX512-NEXT: vpsrlw $8, 32(%rdi), %ymm0
1150 ; CHECK-AVX512-NEXT: vpsrlw $8, (%rdi), %ymm1
1151 ; CHECK-AVX512-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
1152 ; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1153 ; CHECK-AVX512-NEXT: retq
1155 ; CHECK-VBMI-LABEL: trunc_v32i16_v32i8_zeroes:
1156 ; CHECK-VBMI: # %bb.0:
1157 ; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1
1158 ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
1159 ; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
1160 ; CHECK-VBMI-NEXT: retq
1161 %a = load <32 x i16>, ptr %x
1162 %b = lshr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1163 %c = trunc <32 x i16> %b to <32 x i8>
; trunc_v8i64_v8i32_sign: arithmetic shift right by 48 followed by a
; truncate to <8 x i32>, with "min-legal-vector-width"="256". The expected
; assembly packs the two ymm halves with a signed-saturating vpackssdw
; followed by a vpermq.
1167 define <8 x i32> @trunc_v8i64_v8i32_sign(ptr %x) nounwind "min-legal-vector-width"="256" {
1168 ; CHECK-LABEL: trunc_v8i64_v8i32_sign:
1170 ; CHECK-NEXT: vpsraq $48, 32(%rdi), %ymm0
1171 ; CHECK-NEXT: vpsraq $48, (%rdi), %ymm1
1172 ; CHECK-NEXT: vpackssdw %ymm0, %ymm1, %ymm0
1173 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1175 %a = load <8 x i64>, ptr %x
1176 %b = ashr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
1177 %c = trunc <8 x i64> %b to <8 x i32>
; trunc_v16i32_v16i16_sign: arithmetic shift right by 16 followed by a
; truncate to <16 x i16>, with "min-legal-vector-width"="256". The expected
; assembly is the same odd-word vpermi2w selection as in the logical-shift
; variant of this test.
1181 define <16 x i16> @trunc_v16i32_v16i16_sign(ptr %x) nounwind "min-legal-vector-width"="256" {
1182 ; CHECK-LABEL: trunc_v16i32_v16i16_sign:
1184 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1185 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
1186 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
1188 %a = load <16 x i32>, ptr %x
1189 %b = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1190 %c = trunc <16 x i32> %b to <16 x i16>
; trunc_v32i16_v32i8_sign: arithmetic shift right by 8 followed by a
; truncate to <32 x i8>, with "min-legal-vector-width"="256". The expected
; assembly matches the logical-shift variant: vpermi2b of the odd-indexed
; bytes in the first and third prefix groups, and vpsrlw $8 + vpackuswb +
; vpermq in the middle group (a logical shift even though the IR uses ashr).
1194 define <32 x i8> @trunc_v32i16_v32i8_sign(ptr %x) nounwind "min-legal-vector-width"="256" {
1195 ; CHECK-SKX-VBMI-LABEL: trunc_v32i16_v32i8_sign:
1196 ; CHECK-SKX-VBMI: # %bb.0:
1197 ; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm1
1198 ; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
1199 ; CHECK-SKX-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
1200 ; CHECK-SKX-VBMI-NEXT: retq
1202 ; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_sign:
1203 ; CHECK-AVX512: # %bb.0:
1204 ; CHECK-AVX512-NEXT: vpsrlw $8, 32(%rdi), %ymm0
1205 ; CHECK-AVX512-NEXT: vpsrlw $8, (%rdi), %ymm1
1206 ; CHECK-AVX512-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
1207 ; CHECK-AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1208 ; CHECK-AVX512-NEXT: retq
1210 ; CHECK-VBMI-LABEL: trunc_v32i16_v32i8_sign:
1211 ; CHECK-VBMI: # %bb.0:
1212 ; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1
1213 ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
1214 ; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0
1215 ; CHECK-VBMI-NEXT: retq
1216 %a = load <32 x i16>, ptr %x
1217 %b = ashr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1218 %c = trunc <32 x i16> %b to <32 x i8>
; zext_v16i8_v16i64: zero-extend <16 x i8> to <16 x i64> and store it, with
; "min-legal-vector-width"="256". The expected assembly builds four ymm
; results (vpmovzxbw then vpmovzxwq for three of them, vpmovzxbq for the
; lowest) and stores them at offsets 0, 32, 64 and 96.
1222 define dso_local void @zext_v16i8_v16i64(<16 x i8> %x, ptr %y) nounwind "min-legal-vector-width"="256" {
1223 ; CHECK-LABEL: zext_v16i8_v16i64:
1225 ; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1226 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
1227 ; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
1228 ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
1229 ; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
1230 ; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
1231 ; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1232 ; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
1233 ; CHECK-NEXT: vmovdqa %ymm0, (%rdi)
1234 ; CHECK-NEXT: vmovdqa %ymm1, 64(%rdi)
1235 ; CHECK-NEXT: vmovdqa %ymm3, 96(%rdi)
1236 ; CHECK-NEXT: vmovdqa %ymm2, 32(%rdi)
1237 ; CHECK-NEXT: vzeroupper
1239 %a = zext <16 x i8> %x to <16 x i64>
1240 store <16 x i64> %a, ptr %y
; sext_v16i8_v16i64: sign-extend <16 x i8> to <16 x i64> and store it, with
; "min-legal-vector-width"="256". The expected assembly mirrors the
; zero-extend variant using vpmovsxbw / vpmovsxwq / vpmovsxbq, with four ymm
; stores at offsets 0, 32, 64 and 96.
1244 define dso_local void @sext_v16i8_v16i64(<16 x i8> %x, ptr %y) nounwind "min-legal-vector-width"="256" {
1245 ; CHECK-LABEL: sext_v16i8_v16i64:
1247 ; CHECK-NEXT: vpmovsxbw %xmm0, %ymm1
1248 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
1249 ; CHECK-NEXT: vpmovsxwq %xmm2, %ymm2
1250 ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
1251 ; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
1252 ; CHECK-NEXT: vpmovsxwq %xmm3, %ymm3
1253 ; CHECK-NEXT: vpmovsxwq %xmm1, %ymm1
1254 ; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0
1255 ; CHECK-NEXT: vmovdqa %ymm0, (%rdi)
1256 ; CHECK-NEXT: vmovdqa %ymm1, 64(%rdi)
1257 ; CHECK-NEXT: vmovdqa %ymm3, 96(%rdi)
1258 ; CHECK-NEXT: vmovdqa %ymm2, 32(%rdi)
1259 ; CHECK-NEXT: vzeroupper
1261 %a = sext <16 x i8> %x to <16 x i64>
1262 store <16 x i64> %a, ptr %y
; vselect_split_v8i16_setcc: select between two loaded <8 x i64> values
; using an <8 x i16> equality compare as the condition, with
; "min-legal-vector-width"="256". The expected assembly forms one mask with
; vpcmpeqw, derives the upper-half mask with kshiftrb $4, and performs the
; select as two masked ymm vmovdqa64 loads.
1266 define dso_local void @vselect_split_v8i16_setcc(<8 x i16> %s, <8 x i16> %t, ptr %p, ptr %q, ptr %r) "min-legal-vector-width"="256" {
1267 ; CHECK-LABEL: vselect_split_v8i16_setcc:
1269 ; CHECK-NEXT: vmovdqa (%rsi), %ymm2
1270 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3
1271 ; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
1272 ; CHECK-NEXT: kshiftrb $4, %k1, %k2
1273 ; CHECK-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2}
1274 ; CHECK-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1}
1275 ; CHECK-NEXT: vmovdqa %ymm2, (%rdx)
1276 ; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx)
1277 ; CHECK-NEXT: vzeroupper
1279 %x = load <8 x i64>, ptr %p
1280 %y = load <8 x i64>, ptr %q
1281 %a = icmp eq <8 x i16> %s, %t
1282 %b = select <8 x i1> %a, <8 x i64> %x, <8 x i64> %y
1283 store <8 x i64> %b, ptr %r
; vselect_split_v8i32_setcc: select between two loaded <8 x i64> values
; using an <8 x i32> equality compare as the condition, with
; "min-legal-vector-width"="256". The expected assembly forms the mask with
; a ymm vpcmpeqd, splits it with kshiftrb $4, and performs the select as two
; masked ymm vmovdqa64 loads.
1287 define dso_local void @vselect_split_v8i32_setcc(<8 x i32> %s, <8 x i32> %t, ptr %p, ptr %q, ptr %r) "min-legal-vector-width"="256" {
1288 ; CHECK-LABEL: vselect_split_v8i32_setcc:
1290 ; CHECK-NEXT: vmovdqa (%rsi), %ymm2
1291 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3
1292 ; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
1293 ; CHECK-NEXT: kshiftrb $4, %k1, %k2
1294 ; CHECK-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2}
1295 ; CHECK-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1}
1296 ; CHECK-NEXT: vmovdqa %ymm2, (%rdx)
1297 ; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx)
1298 ; CHECK-NEXT: vzeroupper
1300 %x = load <8 x i64>, ptr %p
1301 %y = load <8 x i64>, ptr %q
1302 %a = icmp eq <8 x i32> %s, %t
1303 %b = select <8 x i1> %a, <8 x i64> %x, <8 x i64> %y
1304 store <8 x i64> %b, ptr %r
; vselect_split_v16i8_setcc: select between two loaded <16 x i32> values
; using a <16 x i8> equality compare as the condition, with
; "min-legal-vector-width"="256". The expected assembly forms the mask with
; vpcmpeqb, splits it with kshiftrw $8, and performs the select as two
; masked ymm vmovdqa32 loads.
1308 define dso_local void @vselect_split_v16i8_setcc(<16 x i8> %s, <16 x i8> %t, ptr %p, ptr %q, ptr %r) "min-legal-vector-width"="256" {
1309 ; CHECK-LABEL: vselect_split_v16i8_setcc:
1311 ; CHECK-NEXT: vmovdqa (%rsi), %ymm2
1312 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3
1313 ; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
1314 ; CHECK-NEXT: kshiftrw $8, %k1, %k2
1315 ; CHECK-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2}
1316 ; CHECK-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1}
1317 ; CHECK-NEXT: vmovdqa %ymm2, (%rdx)
1318 ; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx)
1319 ; CHECK-NEXT: vzeroupper
1321 %x = load <16 x i32>, ptr %p
1322 %y = load <16 x i32>, ptr %q
1323 %a = icmp eq <16 x i8> %s, %t
1324 %b = select <16 x i1> %a, <16 x i32> %x, <16 x i32> %y
1325 store <16 x i32> %b, ptr %r
; vselect_split_v16i16_setcc: select between two loaded <16 x i32> values
; using a <16 x i16> equality compare as the condition, with
; "min-legal-vector-width"="256". The expected assembly forms the mask with
; a ymm vpcmpeqw, splits it with kshiftrw $8, and performs the select as two
; masked ymm vmovdqa32 loads.
1329 define dso_local void @vselect_split_v16i16_setcc(<16 x i16> %s, <16 x i16> %t, ptr %p, ptr %q, ptr %r) "min-legal-vector-width"="256" {
1330 ; CHECK-LABEL: vselect_split_v16i16_setcc:
1332 ; CHECK-NEXT: vmovdqa (%rsi), %ymm2
1333 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3
1334 ; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
1335 ; CHECK-NEXT: kshiftrw $8, %k1, %k2
1336 ; CHECK-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2}
1337 ; CHECK-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1}
1338 ; CHECK-NEXT: vmovdqa %ymm2, (%rdx)
1339 ; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx)
1340 ; CHECK-NEXT: vzeroupper
1342 %x = load <16 x i32>, ptr %p
1343 %y = load <16 x i32>, ptr %q
1344 %a = icmp eq <16 x i16> %s, %t
1345 %b = select <16 x i1> %a, <16 x i32> %x, <16 x i32> %y
1346 store <16 x i32> %b, ptr %r
; Load <16 x i32> from %p, clamp each lane to at most 255 (icmp slt + select)
; and then to at least 0 (icmp sgt + select), and truncate to <16 x i8>.
; The expected assembly packs the two 256-bit halves with an unsigned
; saturating pack (vpackusdw), fixes the lane order with vpermq, and finishes
; with a saturating word-to-byte truncation (vpmovuswb) into xmm0.
1350 define <16 x i8> @trunc_packus_v16i32_v16i8(ptr %p) "min-legal-vector-width"="256" {
1351 ; CHECK-LABEL: trunc_packus_v16i32_v16i8:
1353 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
1354 ; CHECK-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0
1355 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1356 ; CHECK-NEXT: vpmovuswb %ymm0, %xmm0
1357 ; CHECK-NEXT: vzeroupper
1359 %a = load <16 x i32>, ptr %p
1360 %b = icmp slt <16 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
1361 %c = select <16 x i1> %b, <16 x i32> %a, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
1362 %d = icmp sgt <16 x i32> %c, zeroinitializer
1363 %e = select <16 x i1> %d, <16 x i32> %c, <16 x i32> zeroinitializer
1364 %f = trunc <16 x i32> %e to <16 x i8>
; Store variant of the clamp-and-truncate pattern: load <16 x i32> from %p,
; clamp each lane to at most 255 and then to at least 0 (icmp + select pairs),
; truncate to <16 x i8>, and store the result to %q.
; The expected assembly matches the register form except that the final
; saturating truncation (vpmovuswb) writes directly to memory at (%rsi).
1368 define dso_local void @trunc_packus_v16i32_v16i8_store(ptr %p, ptr %q) "min-legal-vector-width"="256" {
1369 ; CHECK-LABEL: trunc_packus_v16i32_v16i8_store:
1371 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
1372 ; CHECK-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0
1373 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1374 ; CHECK-NEXT: vpmovuswb %ymm0, (%rsi)
1375 ; CHECK-NEXT: vzeroupper
1377 %a = load <16 x i32>, ptr %p
1378 %b = icmp slt <16 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
1379 %c = select <16 x i1> %b, <16 x i32> %a, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
1380 %d = icmp sgt <16 x i32> %c, zeroinitializer
1381 %e = select <16 x i1> %d, <16 x i32> %c, <16 x i32> zeroinitializer
1382 %f = trunc <16 x i32> %e to <16 x i8>
1383 store <16 x i8> %f, ptr %q
; Uses <64 x i1> as both the argument type and the return type while the
; function requests a 256-bit minimum legal vector width.
1387 define <64 x i1> @v64i1_argument_return(<64 x i1> %x) "min-legal-vector-width"="256" {
1388 ; CHECK-LABEL: v64i1_argument_return:
; Load <64 x i8> from %x, compare every byte with zero, swap each adjacent
; pair of bits in the resulting <64 x i1> mask with a shufflevector
; (indices 1,0,3,2,...,63,62), and use the swapped mask for a masked store of
; the loaded bytes to %y with alignment 1.
; The expected assembly tests the two 32-byte halves separately (vptestnmb on
; ymm registers), assembles the swapped 64-bit mask one bit at a time in
; k-registers (shift the source bit into place, OR it in, AND with a constant
; that clears the next position), and finishes with two masked vmovdqu8
; stores, the second one using the mask shifted right by 32.
1394 define dso_local void @v64i1_shuffle(ptr %x, ptr %y) "min-legal-vector-width"="256" {
1395 ; CHECK-LABEL: v64i1_shuffle:
1396 ; CHECK: # %bb.0: # %entry
1397 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1398 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0
1399 ; CHECK-NEXT: vptestnmb %ymm1, %ymm1, %k0
1400 ; CHECK-NEXT: kshiftrd $1, %k0, %k1
1401 ; CHECK-NEXT: kshiftlq $63, %k0, %k2
1402 ; CHECK-NEXT: kshiftrq $62, %k2, %k2
1403 ; CHECK-NEXT: kshiftlq $63, %k1, %k1
1404 ; CHECK-NEXT: kshiftrq $63, %k1, %k1
1405 ; CHECK-NEXT: korq %k2, %k1, %k1
1406 ; CHECK-NEXT: movq $-5, %rax
1407 ; CHECK-NEXT: kmovq %rax, %k2
1408 ; CHECK-NEXT: kandq %k2, %k1, %k1
1409 ; CHECK-NEXT: kshiftrd $3, %k0, %k2
1410 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1411 ; CHECK-NEXT: kshiftrq $61, %k2, %k2
1412 ; CHECK-NEXT: korq %k2, %k1, %k1
1413 ; CHECK-NEXT: movq $-9, %rax
1414 ; CHECK-NEXT: kmovq %rax, %k2
1415 ; CHECK-NEXT: kandq %k2, %k1, %k1
1416 ; CHECK-NEXT: kshiftrd $2, %k0, %k2
1417 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1418 ; CHECK-NEXT: kshiftrq $60, %k2, %k2
1419 ; CHECK-NEXT: korq %k2, %k1, %k1
1420 ; CHECK-NEXT: movq $-17, %rax
1421 ; CHECK-NEXT: kmovq %rax, %k2
1422 ; CHECK-NEXT: kandq %k2, %k1, %k1
1423 ; CHECK-NEXT: kshiftrd $5, %k0, %k2
1424 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1425 ; CHECK-NEXT: kshiftrq $59, %k2, %k2
1426 ; CHECK-NEXT: korq %k2, %k1, %k1
1427 ; CHECK-NEXT: movq $-33, %rax
1428 ; CHECK-NEXT: kmovq %rax, %k2
1429 ; CHECK-NEXT: kandq %k2, %k1, %k1
1430 ; CHECK-NEXT: kshiftrd $4, %k0, %k2
1431 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1432 ; CHECK-NEXT: kshiftrq $58, %k2, %k2
1433 ; CHECK-NEXT: korq %k2, %k1, %k1
1434 ; CHECK-NEXT: movq $-65, %rax
1435 ; CHECK-NEXT: kmovq %rax, %k2
1436 ; CHECK-NEXT: kandq %k2, %k1, %k1
1437 ; CHECK-NEXT: kshiftrd $7, %k0, %k2
1438 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1439 ; CHECK-NEXT: kshiftrq $57, %k2, %k2
1440 ; CHECK-NEXT: korq %k2, %k1, %k1
1441 ; CHECK-NEXT: movq $-129, %rax
1442 ; CHECK-NEXT: kmovq %rax, %k2
1443 ; CHECK-NEXT: kandq %k2, %k1, %k1
1444 ; CHECK-NEXT: kshiftrd $6, %k0, %k2
1445 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1446 ; CHECK-NEXT: kshiftrq $56, %k2, %k2
1447 ; CHECK-NEXT: korq %k2, %k1, %k1
1448 ; CHECK-NEXT: movq $-257, %rax # imm = 0xFEFF
1449 ; CHECK-NEXT: kmovq %rax, %k2
1450 ; CHECK-NEXT: kandq %k2, %k1, %k1
1451 ; CHECK-NEXT: kshiftrd $9, %k0, %k2
1452 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1453 ; CHECK-NEXT: kshiftrq $55, %k2, %k2
1454 ; CHECK-NEXT: korq %k2, %k1, %k1
1455 ; CHECK-NEXT: movq $-513, %rax # imm = 0xFDFF
1456 ; CHECK-NEXT: kmovq %rax, %k2
1457 ; CHECK-NEXT: kandq %k2, %k1, %k1
1458 ; CHECK-NEXT: kshiftrd $8, %k0, %k2
1459 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1460 ; CHECK-NEXT: kshiftrq $54, %k2, %k2
1461 ; CHECK-NEXT: korq %k2, %k1, %k1
1462 ; CHECK-NEXT: movq $-1025, %rax # imm = 0xFBFF
1463 ; CHECK-NEXT: kmovq %rax, %k2
1464 ; CHECK-NEXT: kandq %k2, %k1, %k1
1465 ; CHECK-NEXT: kshiftrd $11, %k0, %k2
1466 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1467 ; CHECK-NEXT: kshiftrq $53, %k2, %k2
1468 ; CHECK-NEXT: korq %k2, %k1, %k1
1469 ; CHECK-NEXT: movq $-2049, %rax # imm = 0xF7FF
1470 ; CHECK-NEXT: kmovq %rax, %k2
1471 ; CHECK-NEXT: kandq %k2, %k1, %k1
1472 ; CHECK-NEXT: kshiftrd $10, %k0, %k2
1473 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1474 ; CHECK-NEXT: kshiftrq $52, %k2, %k2
1475 ; CHECK-NEXT: korq %k2, %k1, %k1
1476 ; CHECK-NEXT: movq $-4097, %rax # imm = 0xEFFF
1477 ; CHECK-NEXT: kmovq %rax, %k2
1478 ; CHECK-NEXT: kandq %k2, %k1, %k1
1479 ; CHECK-NEXT: kshiftrd $13, %k0, %k2
1480 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1481 ; CHECK-NEXT: kshiftrq $51, %k2, %k2
1482 ; CHECK-NEXT: korq %k2, %k1, %k1
1483 ; CHECK-NEXT: movq $-8193, %rax # imm = 0xDFFF
1484 ; CHECK-NEXT: kmovq %rax, %k2
1485 ; CHECK-NEXT: kandq %k2, %k1, %k1
1486 ; CHECK-NEXT: kshiftrd $12, %k0, %k2
1487 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1488 ; CHECK-NEXT: kshiftrq $50, %k2, %k2
1489 ; CHECK-NEXT: korq %k2, %k1, %k1
1490 ; CHECK-NEXT: movq $-16385, %rax # imm = 0xBFFF
1491 ; CHECK-NEXT: kmovq %rax, %k2
1492 ; CHECK-NEXT: kandq %k2, %k1, %k1
1493 ; CHECK-NEXT: kshiftrd $15, %k0, %k2
1494 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1495 ; CHECK-NEXT: kshiftrq $49, %k2, %k2
1496 ; CHECK-NEXT: korq %k2, %k1, %k1
1497 ; CHECK-NEXT: movq $-32769, %rax # imm = 0xFFFF7FFF
1498 ; CHECK-NEXT: kmovq %rax, %k2
1499 ; CHECK-NEXT: kandq %k2, %k1, %k1
1500 ; CHECK-NEXT: kshiftrd $14, %k0, %k2
1501 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1502 ; CHECK-NEXT: kshiftrq $48, %k2, %k2
1503 ; CHECK-NEXT: korq %k2, %k1, %k1
1504 ; CHECK-NEXT: movq $-65537, %rax # imm = 0xFFFEFFFF
1505 ; CHECK-NEXT: kmovq %rax, %k2
1506 ; CHECK-NEXT: kandq %k2, %k1, %k1
1507 ; CHECK-NEXT: kshiftrd $17, %k0, %k2
1508 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1509 ; CHECK-NEXT: kshiftrq $47, %k2, %k2
1510 ; CHECK-NEXT: korq %k2, %k1, %k1
1511 ; CHECK-NEXT: movq $-131073, %rax # imm = 0xFFFDFFFF
1512 ; CHECK-NEXT: kmovq %rax, %k2
1513 ; CHECK-NEXT: kandq %k2, %k1, %k1
1514 ; CHECK-NEXT: kshiftrd $16, %k0, %k2
1515 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1516 ; CHECK-NEXT: kshiftrq $46, %k2, %k2
1517 ; CHECK-NEXT: korq %k2, %k1, %k1
1518 ; CHECK-NEXT: movq $-262145, %rax # imm = 0xFFFBFFFF
1519 ; CHECK-NEXT: kmovq %rax, %k2
1520 ; CHECK-NEXT: kandq %k2, %k1, %k1
1521 ; CHECK-NEXT: kshiftrd $19, %k0, %k2
1522 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1523 ; CHECK-NEXT: kshiftrq $45, %k2, %k2
1524 ; CHECK-NEXT: korq %k2, %k1, %k1
1525 ; CHECK-NEXT: movq $-524289, %rax # imm = 0xFFF7FFFF
1526 ; CHECK-NEXT: kmovq %rax, %k2
1527 ; CHECK-NEXT: kandq %k2, %k1, %k1
1528 ; CHECK-NEXT: kshiftrd $18, %k0, %k2
1529 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1530 ; CHECK-NEXT: kshiftrq $44, %k2, %k2
1531 ; CHECK-NEXT: korq %k2, %k1, %k1
1532 ; CHECK-NEXT: movq $-1048577, %rax # imm = 0xFFEFFFFF
1533 ; CHECK-NEXT: kmovq %rax, %k2
1534 ; CHECK-NEXT: kandq %k2, %k1, %k1
1535 ; CHECK-NEXT: kshiftrd $21, %k0, %k2
1536 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1537 ; CHECK-NEXT: kshiftrq $43, %k2, %k2
1538 ; CHECK-NEXT: korq %k2, %k1, %k1
1539 ; CHECK-NEXT: movq $-2097153, %rax # imm = 0xFFDFFFFF
1540 ; CHECK-NEXT: kmovq %rax, %k2
1541 ; CHECK-NEXT: kandq %k2, %k1, %k1
1542 ; CHECK-NEXT: kshiftrd $20, %k0, %k2
1543 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1544 ; CHECK-NEXT: kshiftrq $42, %k2, %k2
1545 ; CHECK-NEXT: korq %k2, %k1, %k1
1546 ; CHECK-NEXT: movq $-4194305, %rax # imm = 0xFFBFFFFF
1547 ; CHECK-NEXT: kmovq %rax, %k2
1548 ; CHECK-NEXT: kandq %k2, %k1, %k1
1549 ; CHECK-NEXT: kshiftrd $23, %k0, %k2
1550 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1551 ; CHECK-NEXT: kshiftrq $41, %k2, %k2
1552 ; CHECK-NEXT: korq %k2, %k1, %k1
1553 ; CHECK-NEXT: movq $-8388609, %rax # imm = 0xFF7FFFFF
1554 ; CHECK-NEXT: kmovq %rax, %k2
1555 ; CHECK-NEXT: kandq %k2, %k1, %k1
1556 ; CHECK-NEXT: kshiftrd $22, %k0, %k2
1557 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1558 ; CHECK-NEXT: kshiftrq $40, %k2, %k2
1559 ; CHECK-NEXT: korq %k2, %k1, %k1
1560 ; CHECK-NEXT: movq $-16777217, %rax # imm = 0xFEFFFFFF
1561 ; CHECK-NEXT: kmovq %rax, %k2
1562 ; CHECK-NEXT: kandq %k2, %k1, %k1
1563 ; CHECK-NEXT: kshiftrd $25, %k0, %k2
1564 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1565 ; CHECK-NEXT: kshiftrq $39, %k2, %k2
1566 ; CHECK-NEXT: korq %k2, %k1, %k1
1567 ; CHECK-NEXT: movq $-33554433, %rax # imm = 0xFDFFFFFF
1568 ; CHECK-NEXT: kmovq %rax, %k2
1569 ; CHECK-NEXT: kandq %k2, %k1, %k1
1570 ; CHECK-NEXT: kshiftrd $24, %k0, %k2
1571 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1572 ; CHECK-NEXT: kshiftrq $38, %k2, %k2
1573 ; CHECK-NEXT: korq %k2, %k1, %k1
1574 ; CHECK-NEXT: movq $-67108865, %rax # imm = 0xFBFFFFFF
1575 ; CHECK-NEXT: kmovq %rax, %k2
1576 ; CHECK-NEXT: kandq %k2, %k1, %k1
1577 ; CHECK-NEXT: kshiftrd $27, %k0, %k2
1578 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1579 ; CHECK-NEXT: kshiftrq $37, %k2, %k2
1580 ; CHECK-NEXT: korq %k2, %k1, %k1
1581 ; CHECK-NEXT: movq $-134217729, %rax # imm = 0xF7FFFFFF
1582 ; CHECK-NEXT: kmovq %rax, %k2
1583 ; CHECK-NEXT: kandq %k2, %k1, %k1
1584 ; CHECK-NEXT: kshiftrd $26, %k0, %k2
1585 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1586 ; CHECK-NEXT: kshiftrq $36, %k2, %k2
1587 ; CHECK-NEXT: korq %k2, %k1, %k1
1588 ; CHECK-NEXT: movq $-268435457, %rax # imm = 0xEFFFFFFF
1589 ; CHECK-NEXT: kmovq %rax, %k2
1590 ; CHECK-NEXT: kandq %k2, %k1, %k1
1591 ; CHECK-NEXT: kshiftrd $29, %k0, %k2
1592 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1593 ; CHECK-NEXT: kshiftrq $35, %k2, %k2
1594 ; CHECK-NEXT: korq %k2, %k1, %k1
1595 ; CHECK-NEXT: movq $-536870913, %rax # imm = 0xDFFFFFFF
1596 ; CHECK-NEXT: kmovq %rax, %k2
1597 ; CHECK-NEXT: kandq %k2, %k1, %k1
1598 ; CHECK-NEXT: kshiftrd $28, %k0, %k2
1599 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1600 ; CHECK-NEXT: kshiftrq $34, %k2, %k2
1601 ; CHECK-NEXT: korq %k2, %k1, %k1
1602 ; CHECK-NEXT: movq $-1073741825, %rax # imm = 0xBFFFFFFF
1603 ; CHECK-NEXT: kmovq %rax, %k2
1604 ; CHECK-NEXT: kandq %k2, %k1, %k1
1605 ; CHECK-NEXT: kshiftrd $31, %k0, %k2
1606 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1607 ; CHECK-NEXT: kshiftrq $33, %k2, %k2
1608 ; CHECK-NEXT: korq %k2, %k1, %k1
1609 ; CHECK-NEXT: movabsq $-2147483649, %rax # imm = 0xFFFFFFFF7FFFFFFF
1610 ; CHECK-NEXT: kmovq %rax, %k2
1611 ; CHECK-NEXT: kandq %k2, %k1, %k2
1612 ; CHECK-NEXT: vptestnmb %ymm0, %ymm0, %k1
1613 ; CHECK-NEXT: kshiftrd $30, %k0, %k0
1614 ; CHECK-NEXT: kshiftlq $63, %k0, %k0
1615 ; CHECK-NEXT: kshiftrq $32, %k0, %k0
1616 ; CHECK-NEXT: korq %k0, %k2, %k0
1617 ; CHECK-NEXT: movabsq $-4294967297, %rax # imm = 0xFFFFFFFEFFFFFFFF
1618 ; CHECK-NEXT: kmovq %rax, %k2
1619 ; CHECK-NEXT: kandq %k2, %k0, %k0
1620 ; CHECK-NEXT: kshiftrd $1, %k1, %k2
1621 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1622 ; CHECK-NEXT: kshiftrq $31, %k2, %k2
1623 ; CHECK-NEXT: korq %k2, %k0, %k0
1624 ; CHECK-NEXT: movabsq $-8589934593, %rax # imm = 0xFFFFFFFDFFFFFFFF
1625 ; CHECK-NEXT: kmovq %rax, %k2
1626 ; CHECK-NEXT: kandq %k2, %k0, %k0
1627 ; CHECK-NEXT: kshiftlq $63, %k1, %k2
1628 ; CHECK-NEXT: kshiftrq $30, %k2, %k2
1629 ; CHECK-NEXT: korq %k2, %k0, %k0
1630 ; CHECK-NEXT: movabsq $-17179869185, %rax # imm = 0xFFFFFFFBFFFFFFFF
1631 ; CHECK-NEXT: kmovq %rax, %k2
1632 ; CHECK-NEXT: kandq %k2, %k0, %k0
1633 ; CHECK-NEXT: kshiftrd $3, %k1, %k2
1634 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1635 ; CHECK-NEXT: kshiftrq $29, %k2, %k2
1636 ; CHECK-NEXT: korq %k2, %k0, %k0
1637 ; CHECK-NEXT: movabsq $-34359738369, %rax # imm = 0xFFFFFFF7FFFFFFFF
1638 ; CHECK-NEXT: kmovq %rax, %k2
1639 ; CHECK-NEXT: kandq %k2, %k0, %k0
1640 ; CHECK-NEXT: kshiftrd $2, %k1, %k2
1641 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1642 ; CHECK-NEXT: kshiftrq $28, %k2, %k2
1643 ; CHECK-NEXT: korq %k2, %k0, %k0
1644 ; CHECK-NEXT: movabsq $-68719476737, %rax # imm = 0xFFFFFFEFFFFFFFFF
1645 ; CHECK-NEXT: kmovq %rax, %k2
1646 ; CHECK-NEXT: kandq %k2, %k0, %k0
1647 ; CHECK-NEXT: kshiftrd $5, %k1, %k2
1648 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1649 ; CHECK-NEXT: kshiftrq $27, %k2, %k2
1650 ; CHECK-NEXT: korq %k2, %k0, %k0
1651 ; CHECK-NEXT: movabsq $-137438953473, %rax # imm = 0xFFFFFFDFFFFFFFFF
1652 ; CHECK-NEXT: kmovq %rax, %k2
1653 ; CHECK-NEXT: kandq %k2, %k0, %k0
1654 ; CHECK-NEXT: kshiftrd $4, %k1, %k2
1655 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1656 ; CHECK-NEXT: kshiftrq $26, %k2, %k2
1657 ; CHECK-NEXT: korq %k2, %k0, %k0
1658 ; CHECK-NEXT: movabsq $-274877906945, %rax # imm = 0xFFFFFFBFFFFFFFFF
1659 ; CHECK-NEXT: kmovq %rax, %k2
1660 ; CHECK-NEXT: kandq %k2, %k0, %k0
1661 ; CHECK-NEXT: kshiftrd $7, %k1, %k2
1662 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1663 ; CHECK-NEXT: kshiftrq $25, %k2, %k2
1664 ; CHECK-NEXT: korq %k2, %k0, %k0
1665 ; CHECK-NEXT: movabsq $-549755813889, %rax # imm = 0xFFFFFF7FFFFFFFFF
1666 ; CHECK-NEXT: kmovq %rax, %k2
1667 ; CHECK-NEXT: kandq %k2, %k0, %k0
1668 ; CHECK-NEXT: kshiftrd $6, %k1, %k2
1669 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1670 ; CHECK-NEXT: kshiftrq $24, %k2, %k2
1671 ; CHECK-NEXT: korq %k2, %k0, %k0
1672 ; CHECK-NEXT: movabsq $-1099511627777, %rax # imm = 0xFFFFFEFFFFFFFFFF
1673 ; CHECK-NEXT: kmovq %rax, %k2
1674 ; CHECK-NEXT: kandq %k2, %k0, %k0
1675 ; CHECK-NEXT: kshiftrd $9, %k1, %k2
1676 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1677 ; CHECK-NEXT: kshiftrq $23, %k2, %k2
1678 ; CHECK-NEXT: korq %k2, %k0, %k0
1679 ; CHECK-NEXT: movabsq $-2199023255553, %rax # imm = 0xFFFFFDFFFFFFFFFF
1680 ; CHECK-NEXT: kmovq %rax, %k2
1681 ; CHECK-NEXT: kandq %k2, %k0, %k0
1682 ; CHECK-NEXT: kshiftrd $8, %k1, %k2
1683 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1684 ; CHECK-NEXT: kshiftrq $22, %k2, %k2
1685 ; CHECK-NEXT: korq %k2, %k0, %k0
1686 ; CHECK-NEXT: movabsq $-4398046511105, %rax # imm = 0xFFFFFBFFFFFFFFFF
1687 ; CHECK-NEXT: kmovq %rax, %k2
1688 ; CHECK-NEXT: kandq %k2, %k0, %k0
1689 ; CHECK-NEXT: kshiftrd $11, %k1, %k2
1690 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1691 ; CHECK-NEXT: kshiftrq $21, %k2, %k2
1692 ; CHECK-NEXT: korq %k2, %k0, %k0
1693 ; CHECK-NEXT: movabsq $-8796093022209, %rax # imm = 0xFFFFF7FFFFFFFFFF
1694 ; CHECK-NEXT: kmovq %rax, %k2
1695 ; CHECK-NEXT: kandq %k2, %k0, %k0
1696 ; CHECK-NEXT: kshiftrd $10, %k1, %k2
1697 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1698 ; CHECK-NEXT: kshiftrq $20, %k2, %k2
1699 ; CHECK-NEXT: korq %k2, %k0, %k0
1700 ; CHECK-NEXT: movabsq $-17592186044417, %rax # imm = 0xFFFFEFFFFFFFFFFF
1701 ; CHECK-NEXT: kmovq %rax, %k2
1702 ; CHECK-NEXT: kandq %k2, %k0, %k0
1703 ; CHECK-NEXT: kshiftrd $13, %k1, %k2
1704 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1705 ; CHECK-NEXT: kshiftrq $19, %k2, %k2
1706 ; CHECK-NEXT: korq %k2, %k0, %k0
1707 ; CHECK-NEXT: movabsq $-35184372088833, %rax # imm = 0xFFFFDFFFFFFFFFFF
1708 ; CHECK-NEXT: kmovq %rax, %k2
1709 ; CHECK-NEXT: kandq %k2, %k0, %k0
1710 ; CHECK-NEXT: kshiftrd $12, %k1, %k2
1711 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1712 ; CHECK-NEXT: kshiftrq $18, %k2, %k2
1713 ; CHECK-NEXT: korq %k2, %k0, %k0
1714 ; CHECK-NEXT: movabsq $-70368744177665, %rax # imm = 0xFFFFBFFFFFFFFFFF
1715 ; CHECK-NEXT: kmovq %rax, %k2
1716 ; CHECK-NEXT: kandq %k2, %k0, %k0
1717 ; CHECK-NEXT: kshiftrd $15, %k1, %k2
1718 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1719 ; CHECK-NEXT: kshiftrq $17, %k2, %k2
1720 ; CHECK-NEXT: korq %k2, %k0, %k0
1721 ; CHECK-NEXT: movabsq $-140737488355329, %rax # imm = 0xFFFF7FFFFFFFFFFF
1722 ; CHECK-NEXT: kmovq %rax, %k2
1723 ; CHECK-NEXT: kandq %k2, %k0, %k0
1724 ; CHECK-NEXT: kshiftrd $14, %k1, %k2
1725 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1726 ; CHECK-NEXT: kshiftrq $16, %k2, %k2
1727 ; CHECK-NEXT: korq %k2, %k0, %k0
1728 ; CHECK-NEXT: movabsq $-281474976710657, %rax # imm = 0xFFFEFFFFFFFFFFFF
1729 ; CHECK-NEXT: kmovq %rax, %k2
1730 ; CHECK-NEXT: kandq %k2, %k0, %k0
1731 ; CHECK-NEXT: kshiftrd $17, %k1, %k2
1732 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1733 ; CHECK-NEXT: kshiftrq $15, %k2, %k2
1734 ; CHECK-NEXT: korq %k2, %k0, %k0
1735 ; CHECK-NEXT: movabsq $-562949953421313, %rax # imm = 0xFFFDFFFFFFFFFFFF
1736 ; CHECK-NEXT: kmovq %rax, %k2
1737 ; CHECK-NEXT: kandq %k2, %k0, %k0
1738 ; CHECK-NEXT: kshiftrd $16, %k1, %k2
1739 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1740 ; CHECK-NEXT: kshiftrq $14, %k2, %k2
1741 ; CHECK-NEXT: korq %k2, %k0, %k0
1742 ; CHECK-NEXT: movabsq $-1125899906842625, %rax # imm = 0xFFFBFFFFFFFFFFFF
1743 ; CHECK-NEXT: kmovq %rax, %k2
1744 ; CHECK-NEXT: kandq %k2, %k0, %k0
1745 ; CHECK-NEXT: kshiftrd $19, %k1, %k2
1746 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1747 ; CHECK-NEXT: kshiftrq $13, %k2, %k2
1748 ; CHECK-NEXT: korq %k2, %k0, %k0
1749 ; CHECK-NEXT: movabsq $-2251799813685249, %rax # imm = 0xFFF7FFFFFFFFFFFF
1750 ; CHECK-NEXT: kmovq %rax, %k2
1751 ; CHECK-NEXT: kandq %k2, %k0, %k0
1752 ; CHECK-NEXT: kshiftrd $18, %k1, %k2
1753 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1754 ; CHECK-NEXT: kshiftrq $12, %k2, %k2
1755 ; CHECK-NEXT: korq %k2, %k0, %k0
1756 ; CHECK-NEXT: movabsq $-4503599627370497, %rax # imm = 0xFFEFFFFFFFFFFFFF
1757 ; CHECK-NEXT: kmovq %rax, %k2
1758 ; CHECK-NEXT: kandq %k2, %k0, %k0
1759 ; CHECK-NEXT: kshiftrd $21, %k1, %k2
1760 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1761 ; CHECK-NEXT: kshiftrq $11, %k2, %k2
1762 ; CHECK-NEXT: korq %k2, %k0, %k0
1763 ; CHECK-NEXT: movabsq $-9007199254740993, %rax # imm = 0xFFDFFFFFFFFFFFFF
1764 ; CHECK-NEXT: kmovq %rax, %k2
1765 ; CHECK-NEXT: kandq %k2, %k0, %k0
1766 ; CHECK-NEXT: kshiftrd $20, %k1, %k2
1767 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1768 ; CHECK-NEXT: kshiftrq $10, %k2, %k2
1769 ; CHECK-NEXT: korq %k2, %k0, %k0
1770 ; CHECK-NEXT: movabsq $-18014398509481985, %rax # imm = 0xFFBFFFFFFFFFFFFF
1771 ; CHECK-NEXT: kmovq %rax, %k2
1772 ; CHECK-NEXT: kandq %k2, %k0, %k0
1773 ; CHECK-NEXT: kshiftrd $23, %k1, %k2
1774 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1775 ; CHECK-NEXT: kshiftrq $9, %k2, %k2
1776 ; CHECK-NEXT: korq %k2, %k0, %k0
1777 ; CHECK-NEXT: movabsq $-36028797018963969, %rax # imm = 0xFF7FFFFFFFFFFFFF
1778 ; CHECK-NEXT: kmovq %rax, %k2
1779 ; CHECK-NEXT: kandq %k2, %k0, %k0
1780 ; CHECK-NEXT: kshiftrd $22, %k1, %k2
1781 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1782 ; CHECK-NEXT: kshiftrq $8, %k2, %k2
1783 ; CHECK-NEXT: korq %k2, %k0, %k0
1784 ; CHECK-NEXT: movabsq $-72057594037927937, %rax # imm = 0xFEFFFFFFFFFFFFFF
1785 ; CHECK-NEXT: kmovq %rax, %k2
1786 ; CHECK-NEXT: kandq %k2, %k0, %k0
1787 ; CHECK-NEXT: kshiftrd $25, %k1, %k2
1788 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1789 ; CHECK-NEXT: kshiftrq $7, %k2, %k2
1790 ; CHECK-NEXT: korq %k2, %k0, %k0
1791 ; CHECK-NEXT: movabsq $-144115188075855873, %rax # imm = 0xFDFFFFFFFFFFFFFF
1792 ; CHECK-NEXT: kmovq %rax, %k2
1793 ; CHECK-NEXT: kandq %k2, %k0, %k0
1794 ; CHECK-NEXT: kshiftrd $24, %k1, %k2
1795 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1796 ; CHECK-NEXT: kshiftrq $6, %k2, %k2
1797 ; CHECK-NEXT: korq %k2, %k0, %k0
1798 ; CHECK-NEXT: movabsq $-288230376151711745, %rax # imm = 0xFBFFFFFFFFFFFFFF
1799 ; CHECK-NEXT: kmovq %rax, %k2
1800 ; CHECK-NEXT: kandq %k2, %k0, %k0
1801 ; CHECK-NEXT: kshiftrd $27, %k1, %k2
1802 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1803 ; CHECK-NEXT: kshiftrq $5, %k2, %k2
1804 ; CHECK-NEXT: korq %k2, %k0, %k0
1805 ; CHECK-NEXT: movabsq $-576460752303423489, %rax # imm = 0xF7FFFFFFFFFFFFFF
1806 ; CHECK-NEXT: kmovq %rax, %k2
1807 ; CHECK-NEXT: kandq %k2, %k0, %k0
1808 ; CHECK-NEXT: kshiftrd $26, %k1, %k2
1809 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1810 ; CHECK-NEXT: kshiftrq $4, %k2, %k2
1811 ; CHECK-NEXT: korq %k2, %k0, %k0
1812 ; CHECK-NEXT: movabsq $-1152921504606846977, %rax # imm = 0xEFFFFFFFFFFFFFFF
1813 ; CHECK-NEXT: kmovq %rax, %k2
1814 ; CHECK-NEXT: kandq %k2, %k0, %k0
1815 ; CHECK-NEXT: kshiftrd $29, %k1, %k2
1816 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1817 ; CHECK-NEXT: kshiftrq $3, %k2, %k2
1818 ; CHECK-NEXT: korq %k2, %k0, %k0
1819 ; CHECK-NEXT: movabsq $-2305843009213693953, %rax # imm = 0xDFFFFFFFFFFFFFFF
1820 ; CHECK-NEXT: kmovq %rax, %k2
1821 ; CHECK-NEXT: kandq %k2, %k0, %k0
1822 ; CHECK-NEXT: kshiftrd $28, %k1, %k2
1823 ; CHECK-NEXT: kshiftlq $63, %k2, %k2
1824 ; CHECK-NEXT: kshiftrq $2, %k2, %k2
1825 ; CHECK-NEXT: korq %k2, %k0, %k0
1826 ; CHECK-NEXT: movabsq $-4611686018427387905, %rax # imm = 0xBFFFFFFFFFFFFFFF
1827 ; CHECK-NEXT: kmovq %rax, %k2
1828 ; CHECK-NEXT: kandq %k2, %k0, %k0
1829 ; CHECK-NEXT: kshiftrd $31, %k1, %k2
1830 ; CHECK-NEXT: kshiftlq $62, %k2, %k2
1831 ; CHECK-NEXT: korq %k2, %k0, %k0
1832 ; CHECK-NEXT: kshiftrd $30, %k1, %k1
1833 ; CHECK-NEXT: kshiftlq $1, %k0, %k0
1834 ; CHECK-NEXT: kshiftrq $1, %k0, %k0
1835 ; CHECK-NEXT: kshiftlq $63, %k1, %k1
1836 ; CHECK-NEXT: korq %k1, %k0, %k1
1837 ; CHECK-NEXT: vmovdqu8 %ymm1, (%rsi) {%k1}
1838 ; CHECK-NEXT: kshiftrq $32, %k1, %k1
1839 ; CHECK-NEXT: vmovdqu8 %ymm0, 32(%rsi) {%k1}
1840 ; CHECK-NEXT: vzeroupper
1843 %a = load <64 x i8>, ptr %x
1844 %b = icmp eq <64 x i8> %a, zeroinitializer
1845 %shuf = shufflevector <64 x i1> %b, <64 x i1> undef, <64 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14, i32 17, i32 16, i32 19, i32 18, i32 21, i32 20, i32 23, i32 22, i32 25, i32 24, i32 27, i32 26, i32 29, i32 28, i32 31, i32 30, i32 33, i32 32, i32 35, i32 34, i32 37, i32 36, i32 39, i32 38, i32 41, i32 40, i32 43, i32 42, i32 45, i32 44, i32 47, i32 46, i32 49, i32 48, i32 51, i32 50, i32 53, i32 52, i32 55, i32 54, i32 57, i32 56, i32 59, i32 58, i32 61, i32 60, i32 63, i32 62>
1846 call void @llvm.masked.store.v64i8.p0(<64 x i8> %a, ptr %y, i32 1, <64 x i1> %shuf)
; Masked-store intrinsic called by @v64i1_shuffle; operands are the
; <64 x i8> value, the destination pointer, an i32 alignment, and the
; <64 x i1> mask.
1849 declare void @llvm.masked.store.v64i8.p0(<64 x i8>, ptr, i32, <64 x i1>)
; 8-byte-aligned i64 globals, zero-initialized; @v64i1_inline_asm loads its
; asm input from @mem64_src and stores the asm result to @mem64_dst.
1851 @mem64_dst = dso_local global i64 0, align 8
1852 @mem64_src = dso_local global i64 0, align 8
; Pass a 64-bit value through an empty inline-asm statement whose input and
; output both use the "k" constraint: load i64 from @mem64_src, run the asm,
; store the i64 result to @mem64_dst.
; The expected assembly moves the value to and from memory with kmovq on %k0,
; so the full 64-bit operand is kept in a mask register.
; %1 is an i32 stack slot that is loaded without any store to it in the lines
; shown. NOTE(review): presumably left over from the original reproducer;
; confirm before relying on the loaded value.
1853 define dso_local i32 @v64i1_inline_asm() "min-legal-vector-width"="256" {
1854 ; CHECK-LABEL: v64i1_inline_asm:
1856 ; CHECK-NEXT: kmovq mem64_src(%rip), %k0
1858 ; CHECK-NEXT: #NO_APP
1859 ; CHECK-NEXT: kmovq %k0, mem64_dst(%rip)
1860 ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
1862 %1 = alloca i32, align 4
1863 %2 = load i64, ptr @mem64_src, align 8
1864 %3 = call i64 asm "", "=k,k,~{dirflag},~{fpsr},~{flags}"(i64 %2)
1865 store i64 %3, ptr @mem64_dst, align 8
1866 %4 = load i32, ptr %1, align 4
; Load two <8 x i64> vectors from %xptr and %yptr, compare them with a signed
; less-than, sign-extend the <8 x i1> result back to <8 x i64>, and store it
; to %zptr.
; The expected assembly uses two ymm vpcmpgtq instructions with the operands
; swapped (y > x) and stores their vector results directly, with no mask
; register involved.
1870 define dso_local void @cmp_v8i64_sext(ptr %xptr, ptr %yptr, ptr %zptr) "min-legal-vector-width"="256" {
1871 ; CHECK-LABEL: cmp_v8i64_sext:
1873 ; CHECK-NEXT: vmovdqa (%rsi), %ymm0
1874 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1
1875 ; CHECK-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1
1876 ; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
1877 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
1878 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
1879 ; CHECK-NEXT: vzeroupper
1881 %x = load <8 x i64>, ptr %xptr
1882 %y = load <8 x i64>, ptr %yptr
1883 %cmp = icmp slt <8 x i64> %x, %y
1884 %ext = sext <8 x i1> %cmp to <8 x i64>
1885 store <8 x i64> %ext, ptr %zptr
; Zero-extending counterpart of the signed less-than compare: load two
; <8 x i64> vectors from %xptr and %yptr, compare with icmp slt, zero-extend
; the <8 x i1> result to <8 x i64>, and store it to %zptr.
; The expected assembly is the same pair of ymm vpcmpgtq instructions followed
; by a logical right shift by 63 (vpsrlq) on each half, turning the all-ones
; lanes into 1.
1889 define dso_local void @cmp_v8i64_zext(ptr %xptr, ptr %yptr, ptr %zptr) "min-legal-vector-width"="256" {
1890 ; CHECK-LABEL: cmp_v8i64_zext:
1892 ; CHECK-NEXT: vmovdqa (%rsi), %ymm0
1893 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1
1894 ; CHECK-NEXT: vpcmpgtq 32(%rdi), %ymm1, %ymm1
1895 ; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
1896 ; CHECK-NEXT: vpsrlq $63, %ymm1, %ymm1
1897 ; CHECK-NEXT: vpsrlq $63, %ymm0, %ymm0
1898 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
1899 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
1900 ; CHECK-NEXT: vzeroupper
1902 %x = load <8 x i64>, ptr %xptr
1903 %y = load <8 x i64>, ptr %yptr
1904 %cmp = icmp slt <8 x i64> %x, %y
1905 %ext = zext <8 x i1> %cmp to <8 x i64>
1906 store <8 x i64> %ext, ptr %zptr
; Per-lane variable rotate-left of <16 x i8> written as shift-or: %a shifted
; left by %b, OR-ed with %a shifted right by (8 - %b).
; The expected assembly masks the amounts, duplicates each byte of %a into a
; 16-bit lane (vpunpckhbw / vpunpcklbw), shifts left with vpsllvw on xmm
; registers, takes the high byte of each word with vpsrlw by 8, and repacks
; with vpackuswb.
1910 define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind "min-legal-vector-width"="256" {
1911 ; CHECK-LABEL: var_rotate_v16i8:
1913 ; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
1914 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
1915 ; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
1916 ; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1917 ; CHECK-NEXT: vpsllvw %xmm2, %xmm3, %xmm2
1918 ; CHECK-NEXT: vpsrlw $8, %xmm2, %xmm2
1919 ; CHECK-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1920 ; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1921 ; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
1922 ; CHECK-NEXT: vpsrlw $8, %xmm0, %xmm0
1923 ; CHECK-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
1925 %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
1926 %shl = shl <16 x i8> %a, %b
1927 %lshr = lshr <16 x i8> %a, %b8
1928 %or = or <16 x i8> %shl, %lshr
; Per-lane variable rotate-left of <32 x i8> written as shift-or: %a shifted
; left by %b, OR-ed with %a shifted right by (8 - %b).
; The expected assembly stays on ymm registers: it masks the amounts, widens
; bytes to 16-bit lanes with vpunpckhbw / vpunpcklbw, shifts with vpsllvw,
; extracts the high byte of each word with vpsrlw by 8, and repacks with
; vpackuswb.
1932 define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-legal-vector-width"="256" {
1933 ; CHECK-LABEL: var_rotate_v32i8:
1935 ; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
1936 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
1937 ; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
1938 ; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1939 ; CHECK-NEXT: vpsllvw %ymm3, %ymm4, %ymm3
1940 ; CHECK-NEXT: vpsrlw $8, %ymm3, %ymm3
1941 ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
1942 ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1943 ; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
1944 ; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0
1945 ; CHECK-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
1947 %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
1948 %shl = shl <32 x i8> %a, %b
1949 %lshr = lshr <32 x i8> %a, %b8
1950 %or = or <32 x i8> %shl, %lshr
; Rotate-left of <32 x i8> where every lane uses the same variable amount:
; lane 0 of %b is splatted, then %a is shifted left by the splat and OR-ed
; with %a shifted right by (8 - splat).
; Because the amount is uniform, the expected assembly uses the
; scalar-count shift vpsllw (count in xmm1) on byte-duplicated 16-bit lanes,
; then vpsrlw by 8 and vpackuswb to rebuild the bytes.
1954 define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-legal-vector-width"="256" {
1955 ; CHECK-LABEL: splatvar_rotate_v32i8:
1957 ; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1958 ; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1959 ; CHECK-NEXT: vpsllw %xmm1, %ymm2, %ymm2
1960 ; CHECK-NEXT: vpsrlw $8, %ymm2, %ymm2
1961 ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1962 ; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm0
1963 ; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0
1964 ; CHECK-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
1966 %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
1967 %splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
1968 %shl = shl <32 x i8> %a, %splat
1969 %lshr = lshr <32 x i8> %a, %splat8
1970 %or = or <32 x i8> %shl, %lshr
; Rotate-left of <32 x i8> by per-lane constant amounts: the shl amounts run
; 0..8 and back down to 1 in each 16-lane half, and the lshr amounts are
; chosen so that the two amounts for every lane add up to 8.
; The expected assembly widens bytes to 16-bit lanes, shifts them with vpsllvw
; using amounts loaded from the constant pool, takes the high byte of each
; word with vpsrlw by 8, and repacks with vpackuswb.
1974 define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
1975 ; CHECK-LABEL: constant_rotate_v32i8:
1977 ; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
1978 ; CHECK-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1979 ; CHECK-NEXT: vpsrlw $8, %ymm1, %ymm1
1980 ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
1981 ; CHECK-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1982 ; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0
1983 ; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
1985 %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
1986 %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
1987 %or = or <32 x i8> %shl, %lshr
; Rotate every byte of <32 x i8> by the constant 4 (shl by 4 OR-ed with lshr
; by 4), i.e. swap the two nibbles of each byte.
; The expectations are split per run configuration. Three of the four prefix
; groups expect a word shift left by 4, a word shift right by 4, and a
; vpternlogd that blends the two results through a memory mask. The group
; for GFNI-capable CPUs expects a single vgf2p8affineqb with a broadcast
; constant-pool operand.
1991 define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
1992 ; CHECK-SKX-LABEL: splatconstant_rotate_v32i8:
1993 ; CHECK-SKX: # %bb.0:
1994 ; CHECK-SKX-NEXT: vpsllw $4, %ymm0, %ymm1
1995 ; CHECK-SKX-NEXT: vpsrlw $4, %ymm0, %ymm0
1996 ; CHECK-SKX-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
1997 ; CHECK-SKX-NEXT: retq
1999 ; CHECK-AVX512-LABEL: splatconstant_rotate_v32i8:
2000 ; CHECK-AVX512: # %bb.0:
2001 ; CHECK-AVX512-NEXT: vpsllw $4, %ymm0, %ymm1
2002 ; CHECK-AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
2003 ; CHECK-AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
2004 ; CHECK-AVX512-NEXT: retq
2006 ; CHECK-VBMI1-LABEL: splatconstant_rotate_v32i8:
2007 ; CHECK-VBMI1: # %bb.0:
2008 ; CHECK-VBMI1-NEXT: vpsllw $4, %ymm0, %ymm1
2009 ; CHECK-VBMI1-NEXT: vpsrlw $4, %ymm0, %ymm0
2010 ; CHECK-VBMI1-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
2011 ; CHECK-VBMI1-NEXT: retq
2013 ; CHECK-GFNI-LABEL: splatconstant_rotate_v32i8:
2014 ; CHECK-GFNI: # %bb.0:
2015 ; CHECK-GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
2016 ; CHECK-GFNI-NEXT: retq
2017 %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
2018 %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
2019 %or = or <32 x i8> %shl, %lshr
2023 define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
2024 ; CHECK-SKX-LABEL: splatconstant_rotate_mask_v32i8:
2025 ; CHECK-SKX: # %bb.0:
2026 ; CHECK-SKX-NEXT: vpsllw $4, %ymm0, %ymm1
2027 ; CHECK-SKX-NEXT: vpsrlw $4, %ymm0, %ymm0
2028 ; CHECK-SKX-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
2029 ; CHECK-SKX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
2030 ; CHECK-SKX-NEXT: retq
2032 ; CHECK-AVX512-LABEL: splatconstant_rotate_mask_v32i8:
2033 ; CHECK-AVX512: # %bb.0:
2034 ; CHECK-AVX512-NEXT: vpsllw $4, %ymm0, %ymm1
2035 ; CHECK-AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
2036 ; CHECK-AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
2037 ; CHECK-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
2038 ; CHECK-AVX512-NEXT: retq
2040 ; CHECK-VBMI1-LABEL: splatconstant_rotate_mask_v32i8:
2041 ; CHECK-VBMI1: # %bb.0:
2042 ; CHECK-VBMI1-NEXT: vpsllw $4, %ymm0, %ymm1
2043 ; CHECK-VBMI1-NEXT: vpsrlw $4, %ymm0, %ymm0
2044 ; CHECK-VBMI1-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
2045 ; CHECK-VBMI1-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
2046 ; CHECK-VBMI1-NEXT: retq
2048 ; CHECK-GFNI-LABEL: splatconstant_rotate_mask_v32i8:
2049 ; CHECK-GFNI: # %bb.0:
2050 ; CHECK-GFNI-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
2051 ; CHECK-GFNI-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
2052 ; CHECK-GFNI-NEXT: retq
2053 %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
2054 %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
2055 %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
2056 %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
2057 %or = or <32 x i8> %lmask, %rmask