; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=fast-variable-shuffle,avx512vl,avx512bw,avx512dq,prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=fast-variable-shuffle,avx512vl,avx512bw,avx512dq,prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
; Make sure CPUs default to prefer-256-bit. avx512vnni isn't interesting as it just adds an isel peephole for vpmaddwd+vpaddd.
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cascadelake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cooperlake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=cannonlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-client | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI

; This file primarily contains tests for specific places in X86ISelLowering.cpp
; that needed to be made aware of the legalizer not allowing 512-bit vectors
; due to prefer-256-bit even though AVX512 is enabled.
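
; Most operations below are tested twice: once with "min-legal-vector-width"="256",
; where the legalizer must split illegal 512-bit operations into 256-bit (ymm)
; halves, and once with "min-legal-vector-width"="512", where 512-bit (zmm)
; operations remain legal.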

define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "min-legal-vector-width"="256" {
; CHECK-LABEL: add256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vpaddd 32(%rsi), %ymm1, %ymm1
; CHECK-NEXT:    vpaddd (%rsi), %ymm0, %ymm0
; CHECK-NEXT:    vmovdqa %ymm0, (%rdx)
; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %d = load <16 x i32>, <16 x i32>* %a
  %e = load <16 x i32>, <16 x i32>* %b
  %f = add <16 x i32> %d, %e
  store <16 x i32> %f, <16 x i32>* %c
  ret void
}

define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "min-legal-vector-width"="512" {
; CHECK-LABEL: add512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpaddd (%rsi), %zmm0, %zmm0
; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %d = load <16 x i32>, <16 x i32>* %a
  %e = load <16 x i32>, <16 x i32>* %b
  %f = add <16 x i32> %d, %e
  store <16 x i32> %f, <16 x i32>* %c
  ret void
}
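
; The avg tests use the usual rounding-average idiom, (a + b + 1) >> 1 computed
; in <64 x i32>; the vpavgb pattern match should still fire after the wide type
; is split to 256 bits.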

define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="256" {
; CHECK-LABEL: avg_v64i8_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rsi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rsi), %ymm1
; CHECK-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
; CHECK-NEXT:    vpavgb 32(%rdi), %ymm1, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, (%rax)
; CHECK-NEXT:    vmovdqu %ymm0, (%rax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <64 x i32> %5, %4
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}

define void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="512" {
; CHECK-LABEL: avg_v64i8_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm0
; CHECK-NEXT:    vpavgb (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, (%rax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <64 x i32> %5, %4
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}

define void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "min-legal-vector-width"="256" {
; CHECK-LABEL: pmaddwd_32_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vpmaddwd 32(%rsi), %ymm1, %ymm1
; CHECK-NEXT:    vpmaddwd (%rsi), %ymm0, %ymm0
; CHECK-NEXT:    vmovdqa %ymm0, (%rdx)
; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %A = load <32 x i16>, <32 x i16>* %APtr
  %B = load <32 x i16>, <32 x i16>* %BPtr
  %a = sext <32 x i16> %A to <32 x i32>
  %b = sext <32 x i16> %B to <32 x i32>
  %m = mul nsw <32 x i32> %a, %b
  %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %ret = add <16 x i32> %odd, %even
  store <16 x i32> %ret, <16 x i32>* %CPtr
  ret void
}

define void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "min-legal-vector-width"="512" {
; CHECK-LABEL: pmaddwd_32_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpmaddwd (%rsi), %zmm0, %zmm0
; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %A = load <32 x i16>, <32 x i16>* %APtr
  %B = load <32 x i16>, <32 x i16>* %BPtr
  %a = sext <32 x i16> %A to <32 x i32>
  %b = sext <32 x i16> %B to <32 x i32>
  %m = mul nsw <32 x i32> %a, %b
  %odd = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %even = shufflevector <32 x i32> %m, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %ret = add <16 x i32> %odd, %even
  store <16 x i32> %ret, <16 x i32>* %CPtr
  ret void
}

define void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "min-legal-vector-width"="256" {
; CHECK-LABEL: psubus_64i8_max_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vpsubusb 32(%rsi), %ymm1, %ymm1
; CHECK-NEXT:    vpsubusb (%rsi), %ymm0, %ymm0
; CHECK-NEXT:    vmovdqa %ymm0, (%rdx)
; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <64 x i8>, <64 x i8>* %xptr
  %y = load <64 x i8>, <64 x i8>* %yptr
  %cmp = icmp ult <64 x i8> %x, %y
  %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
  %res = sub <64 x i8> %max, %y
  store <64 x i8> %res, <64 x i8>* %zptr
  ret void
}

define void @psubus_64i8_max_512(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "min-legal-vector-width"="512" {
; CHECK-LABEL: psubus_64i8_max_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    vpsubusb (%rsi), %zmm0, %zmm0
; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <64 x i8>, <64 x i8>* %xptr
  %y = load <64 x i8>, <64 x i8>* %yptr
  %cmp = icmp ult <64 x i8> %x, %y
  %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
  %res = sub <64 x i8> %max, %y
  store <64 x i8> %res, <64 x i8>* %zptr
  ret void
}

define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) "min-legal-vector-width"="256" {
; CHECK-LABEL: _Z9test_charPcS_i_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movl %edx, %eax
; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    xorl %ecx, %ecx
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB8_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm3
; CHECK-NEXT:    vpmovsxbw 16(%rdi,%rcx), %ymm4
; CHECK-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm5
; CHECK-NEXT:    vpmaddwd %ymm3, %ymm5, %ymm3
; CHECK-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
; CHECK-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm3
; CHECK-NEXT:    vpmaddwd %ymm4, %ymm3, %ymm3
; CHECK-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
; CHECK-NEXT:    addq $32, %rcx
; CHECK-NEXT:    cmpq %rcx, %rax
; CHECK-NEXT:    jne .LBB8_1
; CHECK-NEXT:  # %bb.2: # %middle.block
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, i8* %0, i64 %index
  %5 = bitcast i8* %4 to <32 x i8>*
  %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
  %6 = sext <32 x i8> %wide.load to <32 x i32>
  %7 = getelementptr inbounds i8, i8* %1, i64 %index
  %8 = bitcast i8* %7 to <32 x i8>*
  %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
  %10 = mul nsw <32 x i32> %9, %6
  %11 = add nsw <32 x i32> %10, %vec.phi
  %index.next = add i64 %index, 32
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) "min-legal-vector-width"="512" {
; CHECK-LABEL: _Z9test_charPcS_i_512:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movl %edx, %eax
; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    xorl %ecx, %ecx
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB9_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmovsxbw (%rdi,%rcx), %zmm2
; CHECK-NEXT:    vpmovsxbw (%rsi,%rcx), %zmm3
; CHECK-NEXT:    vpmaddwd %zmm2, %zmm3, %zmm2
; CHECK-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
; CHECK-NEXT:    addq $32, %rcx
; CHECK-NEXT:    cmpq %rcx, %rax
; CHECK-NEXT:    jne .LBB9_1
; CHECK-NEXT:  # %bb.2: # %middle.block
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %3 = zext i32 %2 to i64
  br label %vector.body

vector.body:
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
  %vec.phi = phi <32 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
  %4 = getelementptr inbounds i8, i8* %0, i64 %index
  %5 = bitcast i8* %4 to <32 x i8>*
  %wide.load = load <32 x i8>, <32 x i8>* %5, align 1
  %6 = sext <32 x i8> %wide.load to <32 x i32>
  %7 = getelementptr inbounds i8, i8* %1, i64 %index
  %8 = bitcast i8* %7 to <32 x i8>*
  %wide.load14 = load <32 x i8>, <32 x i8>* %8, align 1
  %9 = sext <32 x i8> %wide.load14 to <32 x i32>
  %10 = mul nsw <32 x i32> %9, %6
  %11 = add nsw <32 x i32> %10, %vec.phi
  %index.next = add i64 %index, 32
  %12 = icmp eq i64 %index.next, %3
  br i1 %12, label %middle.block, label %vector.body

middle.block:
  %rdx.shuf1 = shufflevector <32 x i32> %11, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx1 = add <32 x i32> %11, %rdx.shuf1
  %rdx.shuf = shufflevector <32 x i32> %bin.rdx1, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <32 x i32> %bin.rdx1, %rdx.shuf
  %rdx.shuf15 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx32 = add <32 x i32> %bin.rdx, %rdx.shuf15
  %rdx.shuf17 = shufflevector <32 x i32> %bin.rdx32, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx18 = add <32 x i32> %bin.rdx32, %rdx.shuf17
  %rdx.shuf19 = shufflevector <32 x i32> %bin.rdx18, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx20 = add <32 x i32> %bin.rdx18, %rdx.shuf19
  %13 = extractelement <32 x i32> %bin.rdx20, i32 0
  ret i32 %13
}

@a = global [1024 x i8] zeroinitializer, align 16
@b = global [1024 x i8] zeroinitializer, align 16
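
; The sad tests reduce a sum of absolute differences over @a and @b; the
; vpsadbw idiom should still be recognized whether the accumulator type is
; split to 256 bits or kept at 512 bits.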

define i32 @sad_16i8_256() "min-legal-vector-width"="256" {
; CHECK-LABEL: sad_16i8_256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    movq $-1024, %rax # imm = 0xFC00
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB10_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu a+1024(%rax), %xmm2
; CHECK-NEXT:    vpsadbw b+1024(%rax), %xmm2, %xmm2
; CHECK-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
; CHECK-NEXT:    addq $4, %rax
; CHECK-NEXT:    jne .LBB10_1
; CHECK-NEXT:  # %bb.2: # %middle.block
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
  %2 = zext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
  %6 = sub nsw <16 x i32> %2, %5
  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <16 x i32> zeroinitializer, %6
  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
  %10 = add nsw <16 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %.lcssa = phi <16 x i32> [ %10, %vector.body ]
  %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf
  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
  ret i32 %12
}

define i32 @sad_16i8_512() "min-legal-vector-width"="512" {
; CHECK-LABEL: sad_16i8_512:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    movq $-1024, %rax # imm = 0xFC00
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB11_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu a+1024(%rax), %xmm1
; CHECK-NEXT:    vpsadbw b+1024(%rax), %xmm1, %xmm1
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    addq $4, %rax
; CHECK-NEXT:    jne .LBB11_1
; CHECK-NEXT:  # %bb.2: # %middle.block
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
  %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
  %2 = zext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
  %5 = zext <16 x i8> %wide.load1 to <16 x i32>
  %6 = sub nsw <16 x i32> %2, %5
  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %8 = sub nsw <16 x i32> zeroinitializer, %6
  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
  %10 = add nsw <16 x i32> %9, %vec.phi
  %index.next = add i64 %index, 4
  %11 = icmp eq i64 %index.next, 1024
  br i1 %11, label %middle.block, label %vector.body

middle.block:
  %.lcssa = phi <16 x i32> [ %10, %vector.body ]
  %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf
  %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
  %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
  %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
  %12 = extractelement <16 x i32> %bin.rdx4, i32 0
  ret i32 %12
}

define void @sbto16f32_256(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="256" {
; CHECK-LABEL: sbto16f32_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    kshiftrw $8, %k0, %k1
; CHECK-NEXT:    vpmovm2d %k1, %ymm0
; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT:    vpmovm2d %k0, %ymm1
; CHECK-NEXT:    vcvtdq2ps %ymm1, %ymm1
; CHECK-NEXT:    vmovaps %ymm1, (%rdi)
; CHECK-NEXT:    vmovaps %ymm0, 32(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, <16 x float>* %res
  ret void
}

define void @sbto16f32_512(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="512" {
; CHECK-LABEL: sbto16f32_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    vpmovm2d %k0, %zmm0
; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, <16 x float>* %res
  ret void
}

define void @sbto16f64_256(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="256" {
; CHECK-LABEL: sbto16f64_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    kshiftrw $8, %k0, %k1
; CHECK-NEXT:    vpmovm2d %k1, %ymm0
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm1
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
; CHECK-NEXT:    vpmovm2d %k0, %ymm2
; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm3
; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm2
; CHECK-NEXT:    vmovaps %ymm2, 32(%rdi)
; CHECK-NEXT:    vmovaps %ymm3, (%rdi)
; CHECK-NEXT:    vmovaps %ymm0, 96(%rdi)
; CHECK-NEXT:    vmovaps %ymm1, 64(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, <16 x double>* %res
  ret void
}

define void @sbto16f64_512(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="512" {
; CHECK-LABEL: sbto16f64_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    vpmovm2d %k0, %zmm0
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm1
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
; CHECK-NEXT:    vmovaps %zmm0, 64(%rdi)
; CHECK-NEXT:    vmovaps %zmm1, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = sitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, <16 x double>* %res
  ret void
}

define void @ubto16f32_256(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="256" {
; CHECK-LABEL: ubto16f32_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    kshiftrw $8, %k0, %k1
; CHECK-NEXT:    vpmovm2d %k1, %ymm0
; CHECK-NEXT:    vpsrld $31, %ymm0, %ymm0
; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT:    vpmovm2d %k0, %ymm1
; CHECK-NEXT:    vpsrld $31, %ymm1, %ymm1
; CHECK-NEXT:    vcvtdq2ps %ymm1, %ymm1
; CHECK-NEXT:    vmovaps %ymm1, (%rdi)
; CHECK-NEXT:    vmovaps %ymm0, 32(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, <16 x float>* %res
  ret void
}

define void @ubto16f32_512(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="512" {
; CHECK-LABEL: ubto16f32_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    vpmovm2d %k0, %zmm0
; CHECK-NEXT:    vpsrld $31, %zmm0, %zmm0
; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x float>
  store <16 x float> %1, <16 x float>* %res
  ret void
}

define void @ubto16f64_256(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="256" {
; CHECK-LABEL: ubto16f64_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    kshiftrw $8, %k0, %k1
; CHECK-NEXT:    vpmovm2d %k1, %ymm0
; CHECK-NEXT:    vpsrld $31, %ymm0, %ymm0
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm1
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
; CHECK-NEXT:    vpmovm2d %k0, %ymm2
; CHECK-NEXT:    vpsrld $31, %ymm2, %ymm2
; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm3
; CHECK-NEXT:    vextracti128 $1, %ymm2, %xmm2
; CHECK-NEXT:    vcvtdq2pd %xmm2, %ymm2
; CHECK-NEXT:    vmovaps %ymm2, 32(%rdi)
; CHECK-NEXT:    vmovaps %ymm3, (%rdi)
; CHECK-NEXT:    vmovaps %ymm0, 96(%rdi)
; CHECK-NEXT:    vmovaps %ymm1, 64(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, <16 x double>* %res
  ret void
}

define void @ubto16f64_512(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="512" {
; CHECK-LABEL: ubto16f64_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovw2m %ymm0, %k0
; CHECK-NEXT:    vpmovm2d %k0, %zmm0
; CHECK-NEXT:    vpsrld $31, %zmm0, %zmm0
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm1
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
; CHECK-NEXT:    vmovaps %zmm0, 64(%rdi)
; CHECK-NEXT:    vmovaps %zmm1, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %mask = icmp slt <16 x i16> %a, zeroinitializer
  %1 = uitofp <16 x i1> %mask to <16 x double>
  store <16 x double> %1, <16 x double>* %res
  ret void
}

define <16 x i16> @test_16f32toub_256(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="256" {
; CHECK-LABEL: test_16f32toub_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq (%rdi), %ymm1
; CHECK-NEXT:    vpslld $31, %ymm1, %ymm1
; CHECK-NEXT:    vpmovd2m %ymm1, %k0
; CHECK-NEXT:    vcvttps2dq 32(%rdi), %ymm1
; CHECK-NEXT:    vpslld $31, %ymm1, %ymm1
; CHECK-NEXT:    vpmovd2m %ymm1, %k1
; CHECK-NEXT:    kunpckbw %k0, %k1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = load <16 x float>, <16 x float>* %ptr
  %mask = fptoui <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}

define <16 x i16> @test_16f32toub_512(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="512" {
; CHECK-LABEL: test_16f32toub_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq (%rdi), %zmm1
; CHECK-NEXT:    vpslld $31, %zmm1, %zmm1
; CHECK-NEXT:    vpmovd2m %zmm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = load <16 x float>, <16 x float>* %ptr
  %mask = fptoui <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}

define <16 x i16> @test_16f32tosb_256(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="256" {
; CHECK-LABEL: test_16f32tosb_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq (%rdi), %ymm1
; CHECK-NEXT:    vpmovd2m %ymm1, %k0
; CHECK-NEXT:    vcvttps2dq 32(%rdi), %ymm1
; CHECK-NEXT:    vpmovd2m %ymm1, %k1
; CHECK-NEXT:    kunpckbw %k0, %k1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = load <16 x float>, <16 x float>* %ptr
  %mask = fptosi <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}

define <16 x i16> @test_16f32tosb_512(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="512" {
; CHECK-LABEL: test_16f32tosb_512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq (%rdi), %zmm1
; CHECK-NEXT:    vpmovd2m %zmm1, %k1
; CHECK-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT:    retq
  %a = load <16 x float>, <16 x float>* %ptr
  %mask = fptosi <16 x float> %a to <16 x i1>
  %select = select <16 x i1> %mask, <16 x i16> %passthru, <16 x i16> zeroinitializer
  ret <16 x i16> %select
}
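
; There is no byte multiply instruction, so the v64i8 multiplies unpack to
; words, multiply, and repack; with avx512vbmi the repack can use a single
; vpermt2b/vpermi2b byte permute instead of vpand+vpackuswb.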

define void @mul256(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-legal-vector-width"="256" {
; CHECK-AVX512-LABEL: mul256:
; CHECK-AVX512:       # %bb.0:
; CHECK-AVX512-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-AVX512-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-AVX512-NEXT:    vmovdqa (%rsi), %ymm2
; CHECK-AVX512-NEXT:    vmovdqa 32(%rsi), %ymm3
; CHECK-AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
; CHECK-AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; CHECK-AVX512-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
; CHECK-AVX512-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-AVX512-NEXT:    vpand %ymm5, %ymm4, %ymm4
; CHECK-AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; CHECK-AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; CHECK-AVX512-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; CHECK-AVX512-NEXT:    vpand %ymm5, %ymm1, %ymm1
; CHECK-AVX512-NEXT:    vpackuswb %ymm4, %ymm1, %ymm1
; CHECK-AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
; CHECK-AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-AVX512-NEXT:    vpmullw %ymm3, %ymm4, %ymm3
; CHECK-AVX512-NEXT:    vpand %ymm5, %ymm3, %ymm3
; CHECK-AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
; CHECK-AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-AVX512-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; CHECK-AVX512-NEXT:    vpand %ymm5, %ymm0, %ymm0
; CHECK-AVX512-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
; CHECK-AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
; CHECK-AVX512-NEXT:    vmovdqa %ymm1, 32(%rdx)
; CHECK-AVX512-NEXT:    vzeroupper
; CHECK-AVX512-NEXT:    retq
;
; CHECK-VBMI-LABEL: mul256:
; CHECK-VBMI:       # %bb.0:
; CHECK-VBMI-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-VBMI-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-VBMI-NEXT:    vmovdqa (%rsi), %ymm2
; CHECK-VBMI-NEXT:    vmovdqa 32(%rsi), %ymm3
; CHECK-VBMI-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
; CHECK-VBMI-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; CHECK-VBMI-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
; CHECK-VBMI-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; CHECK-VBMI-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; CHECK-VBMI-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; CHECK-VBMI-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
; CHECK-VBMI-NEXT:    vpermt2b %ymm4, %ymm3, %ymm1
; CHECK-VBMI-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
; CHECK-VBMI-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; CHECK-VBMI-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
; CHECK-VBMI-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
; CHECK-VBMI-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; CHECK-VBMI-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; CHECK-VBMI-NEXT:    vpermt2b %ymm4, %ymm3, %ymm0
; CHECK-VBMI-NEXT:    vmovdqa %ymm0, (%rdx)
; CHECK-VBMI-NEXT:    vmovdqa %ymm1, 32(%rdx)
; CHECK-VBMI-NEXT:    vzeroupper
; CHECK-VBMI-NEXT:    retq
  %d = load <64 x i8>, <64 x i8>* %a
  %e = load <64 x i8>, <64 x i8>* %b
  %f = mul <64 x i8> %d, %e
  store <64 x i8> %f, <64 x i8>* %c
  ret void
}

define void @mul512(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-legal-vector-width"="512" {
; CHECK-AVX512-LABEL: mul512:
; CHECK-AVX512:       # %bb.0:
; CHECK-AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-AVX512-NEXT:    vmovdqa64 (%rsi), %zmm1
; CHECK-AVX512-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
; CHECK-AVX512-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; CHECK-AVX512-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
; CHECK-AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-AVX512-NEXT:    vpandq %zmm3, %zmm2, %zmm2
; CHECK-AVX512-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
; CHECK-AVX512-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; CHECK-AVX512-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; CHECK-AVX512-NEXT:    vpandq %zmm3, %zmm0, %zmm0
; CHECK-AVX512-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
; CHECK-AVX512-NEXT:    vmovdqa64 %zmm0, (%rdx)
; CHECK-AVX512-NEXT:    vzeroupper
; CHECK-AVX512-NEXT:    retq
;
; CHECK-VBMI-LABEL: mul512:
; CHECK-VBMI:       # %bb.0:
; CHECK-VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-VBMI-NEXT:    vmovdqa64 (%rsi), %zmm1
; CHECK-VBMI-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
; CHECK-VBMI-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; CHECK-VBMI-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
; CHECK-VBMI-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
; CHECK-VBMI-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; CHECK-VBMI-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; CHECK-VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
; CHECK-VBMI-NEXT:    vpermi2b %zmm2, %zmm0, %zmm1
; CHECK-VBMI-NEXT:    vmovdqa64 %zmm1, (%rdx)
; CHECK-VBMI-NEXT:    vzeroupper
; CHECK-VBMI-NEXT:    retq
  %d = load <64 x i8>, <64 x i8>* %a
  %e = load <64 x i8>, <64 x i8>* %b
  %f = mul <64 x i8> %d, %e
  store <64 x i8> %f, <64 x i8>* %c
  ret void
}

; This threw an assertion at one point.
define <4 x i32> @mload_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) "min-legal-vector-width"="256" {
; CHECK-LABEL: mload_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT:    vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)

define <16 x i32> @trunc_v16i64_v16i32(<16 x i64>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v16i64_v16i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vmovdqa 64(%rdi), %ymm2
; CHECK-NEXT:    vmovdqa 96(%rdi), %ymm3
; CHECK-NEXT:    vpmovqd %ymm0, %xmm0
; CHECK-NEXT:    vpmovqd %ymm1, %xmm1
; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    vpmovqd %ymm2, %xmm1
; CHECK-NEXT:    vpmovqd %ymm3, %xmm2
; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; CHECK-NEXT:    retq
  %a = load <16 x i64>, <16 x i64>* %x
  %b = trunc <16 x i64> %a to <16 x i32>
  ret <16 x i32> %b
}

define <16 x i8> @trunc_v16i64_v16i8(<16 x i64>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v16i64_v16i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vmovdqa 64(%rdi), %ymm2
; CHECK-NEXT:    vmovdqa 96(%rdi), %ymm3
; CHECK-NEXT:    vpmovqb %ymm3, %xmm3
; CHECK-NEXT:    vpmovqb %ymm2, %xmm2
; CHECK-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; CHECK-NEXT:    vpmovqb %ymm1, %xmm1
; CHECK-NEXT:    vpmovqb %ymm0, %xmm0
; CHECK-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %a = load <16 x i64>, <16 x i64>* %x
  %b = trunc <16 x i64> %a to <16 x i8>
  ret <16 x i8> %b
}

define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v16i32_v16i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vpmovdb %ymm1, %xmm1
; CHECK-NEXT:    vpmovdb %ymm0, %xmm0
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %a = load <16 x i32>, <16 x i32>* %x
  %b = trunc <16 x i32> %a to <16 x i8>
  ret <16 x i8> %b
}

define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v8i64_v8i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vpmovqb %ymm1, %xmm1
; CHECK-NEXT:    vpmovqb %ymm0, %xmm0
; CHECK-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %a = load <8 x i64>, <8 x i64>* %x
  %b = trunc <8 x i64> %a to <8 x i8>
  ret <8 x i8> %b
}

define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v8i64_v8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT:    vpmovqw %ymm1, %xmm1
; CHECK-NEXT:    vpmovqw %ymm0, %xmm0
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %a = load <8 x i64>, <8 x i64>* %x
  %b = trunc <8 x i64> %a to <8 x i16>
  ret <8 x i16> %b
}

define <8 x i32> @trunc_v8i64_v8i32_zeroes(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v8i64_v8i32_zeroes:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $48, 32(%rdi), %ymm1
; CHECK-NEXT:    vpsrlq $48, (%rdi), %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; CHECK-NEXT:    vpermi2w %ymm1, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %a = load <8 x i64>, <8 x i64>* %x
  %b = lshr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
  %c = trunc <8 x i64> %b to <8 x i32>
  ret <8 x i32> %c
}

define <16 x i16> @trunc_v16i32_v16i16_zeroes(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v16i32_v16i16_zeroes:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %a = load <16 x i32>, <16 x i32>* %x
  %b = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %c = trunc <16 x i32> %b to <16 x i16>
  ret <16 x i16> %c
}

define <32 x i8> @trunc_v32i16_v32i8_zeroes(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_zeroes:
; CHECK-AVX512:       # %bb.0:
; CHECK-AVX512-NEXT:    vpsrlw $8, 32(%rdi), %ymm0
; CHECK-AVX512-NEXT:    vpsrlw $8, (%rdi), %ymm1
; CHECK-AVX512-NEXT:    vpackuswb %ymm0, %ymm1, %ymm0
; CHECK-AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-AVX512-NEXT:    retq
;
; CHECK-VBMI-LABEL: trunc_v32i16_v32i8_zeroes:
; CHECK-VBMI:       # %bb.0:
; CHECK-VBMI-NEXT:    vmovdqa (%rdi), %ymm1
; CHECK-VBMI-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
; CHECK-VBMI-NEXT:    vpermi2b 32(%rdi), %ymm1, %ymm0
; CHECK-VBMI-NEXT:    retq
  %a = load <32 x i16>, <32 x i16>* %x
  %b = lshr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %c = trunc <32 x i16> %b to <32 x i8>
  ret <32 x i8> %c
}

define <8 x i32> @trunc_v8i64_v8i32_sign(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v8i64_v8i32_sign:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraq $48, 32(%rdi), %ymm1
; CHECK-NEXT:    vpsraq $48, (%rdi), %ymm2
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
; CHECK-NEXT:    vpermi2w %ymm1, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %a = load <8 x i64>, <8 x i64>* %x
  %b = ashr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
  %c = trunc <8 x i64> %b to <8 x i32>
  ret <8 x i32> %c
}

define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v16i32_v16i16_sign:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT:    retq
  %a = load <16 x i32>, <16 x i32>* %x
  %b = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %c = trunc <16 x i32> %b to <16 x i16>
  ret <16 x i16> %c
}

define <32 x i8> @trunc_v32i16_v32i8_sign(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_sign:
; CHECK-AVX512:       # %bb.0:
; CHECK-AVX512-NEXT:    vpsraw $8, 32(%rdi), %ymm0
; CHECK-AVX512-NEXT:    vpsraw $8, (%rdi), %ymm1
; CHECK-AVX512-NEXT:    vpacksswb %ymm0, %ymm1, %ymm0
; CHECK-AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-AVX512-NEXT:    retq
;
; CHECK-VBMI-LABEL: trunc_v32i16_v32i8_sign:
; CHECK-VBMI:       # %bb.0:
; CHECK-VBMI-NEXT:    vmovdqa (%rdi), %ymm1
; CHECK-VBMI-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
; CHECK-VBMI-NEXT:    vpermi2b 32(%rdi), %ymm1, %ymm0
; CHECK-VBMI-NEXT:    retq
  %a = load <32 x i16>, <32 x i16>* %x
  %b = ashr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %c = trunc <32 x i16> %b to <32 x i8>
  ret <32 x i8> %c
}

define void @zext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: zext_v16i8_v16i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; CHECK-NEXT:    vextracti128 $1, %ymm1, %xmm1
; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    vmovdqa %ymm0, (%rdi)
; CHECK-NEXT:    vmovdqa %ymm1, 64(%rdi)
; CHECK-NEXT:    vmovdqa %ymm3, 96(%rdi)
; CHECK-NEXT:    vmovdqa %ymm2, 32(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %a = zext <16 x i8> %x to <16 x i64>
  store <16 x i64> %a, <16 x i64>* %y
  ret void
}

define void @sext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: sext_v16i8_v16i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpmovsxwq %xmm1, %ymm1
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
; CHECK-NEXT:    vpmovsxwq %xmm3, %ymm3
; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
; CHECK-NEXT:    vpmovsxwq %xmm2, %ymm2
; CHECK-NEXT:    vmovdqa %ymm2, 64(%rdi)
; CHECK-NEXT:    vmovdqa %ymm0, (%rdi)
; CHECK-NEXT:    vmovdqa %ymm3, 96(%rdi)
; CHECK-NEXT:    vmovdqa %ymm1, 32(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %a = sext <16 x i8> %x to <16 x i64>
  store <16 x i64> %a, <16 x i64>* %y
  ret void
}

define void @vselect_split_v8i16_setcc(<8 x i16> %s, <8 x i16> %t, <8 x i64>* %p, <8 x i64>* %q, <8 x i64>* %r) "min-legal-vector-width"="256" {
; CHECK-LABEL: vselect_split_v8i16_setcc:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rsi), %ymm2
; CHECK-NEXT:    vmovdqa 32(%rsi), %ymm3
; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
; CHECK-NEXT:    kshiftrb $4, %k1, %k2
; CHECK-NEXT:    vmovdqa64 32(%rdi), %ymm3 {%k2}
; CHECK-NEXT:    vmovdqa64 (%rdi), %ymm2 {%k1}
; CHECK-NEXT:    vmovdqa %ymm2, (%rdx)
; CHECK-NEXT:    vmovdqa %ymm3, 32(%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <8 x i64>, <8 x i64>* %p
  %y = load <8 x i64>, <8 x i64>* %q
  %a = icmp eq <8 x i16> %s, %t
  %b = select <8 x i1> %a, <8 x i64> %x, <8 x i64> %y
  store <8 x i64> %b, <8 x i64>* %r
  ret void
}

define void @vselect_split_v8i32_setcc(<8 x i32> %s, <8 x i32> %t, <8 x i64>* %p, <8 x i64>* %q, <8 x i64>* %r) "min-legal-vector-width"="256" {
; CHECK-LABEL: vselect_split_v8i32_setcc:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rsi), %ymm2
; CHECK-NEXT:    vmovdqa 32(%rsi), %ymm3
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
; CHECK-NEXT:    kshiftrb $4, %k1, %k2
; CHECK-NEXT:    vmovdqa64 32(%rdi), %ymm3 {%k2}
; CHECK-NEXT:    vmovdqa64 (%rdi), %ymm2 {%k1}
; CHECK-NEXT:    vmovdqa %ymm2, (%rdx)
; CHECK-NEXT:    vmovdqa %ymm3, 32(%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <8 x i64>, <8 x i64>* %p
  %y = load <8 x i64>, <8 x i64>* %q
  %a = icmp eq <8 x i32> %s, %t
  %b = select <8 x i1> %a, <8 x i64> %x, <8 x i64> %y
  store <8 x i64> %b, <8 x i64>* %r
  ret void
}

define void @vselect_split_v16i8_setcc(<16 x i8> %s, <16 x i8> %t, <16 x i32>* %p, <16 x i32>* %q, <16 x i32>* %r) "min-legal-vector-width"="256" {
; CHECK-LABEL: vselect_split_v16i8_setcc:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rsi), %ymm2
; CHECK-NEXT:    vmovdqa 32(%rsi), %ymm3
; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k1
; CHECK-NEXT:    kshiftrw $8, %k1, %k2
; CHECK-NEXT:    vmovdqa32 32(%rdi), %ymm3 {%k2}
; CHECK-NEXT:    vmovdqa32 (%rdi), %ymm2 {%k1}
; CHECK-NEXT:    vmovdqa %ymm2, (%rdx)
; CHECK-NEXT:    vmovdqa %ymm3, 32(%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <16 x i32>, <16 x i32>* %p
  %y = load <16 x i32>, <16 x i32>* %q
  %a = icmp eq <16 x i8> %s, %t
  %b = select <16 x i1> %a, <16 x i32> %x, <16 x i32> %y
  store <16 x i32> %b, <16 x i32>* %r
  ret void
}

define void @vselect_split_v16i16_setcc(<16 x i16> %s, <16 x i16> %t, <16 x i32>* %p, <16 x i32>* %q, <16 x i32>* %r) "min-legal-vector-width"="256" {
; CHECK-LABEL: vselect_split_v16i16_setcc:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rsi), %ymm2
; CHECK-NEXT:    vmovdqa 32(%rsi), %ymm3
; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
; CHECK-NEXT:    kshiftrw $8, %k1, %k2
; CHECK-NEXT:    vmovdqa32 32(%rdi), %ymm3 {%k2}
; CHECK-NEXT:    vmovdqa32 (%rdi), %ymm2 {%k1}
; CHECK-NEXT:    vmovdqa %ymm2, (%rdx)
; CHECK-NEXT:    vmovdqa %ymm3, 32(%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <16 x i32>, <16 x i32>* %p
  %y = load <16 x i32>, <16 x i32>* %q
  %a = icmp eq <16 x i16> %s, %t
  %b = select <16 x i1> %a, <16 x i32> %x, <16 x i32> %y
  store <16 x i32> %b, <16 x i32>* %r
  ret void
}
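
; Saturating truncation should still use vpackusdw plus vpmovuswb rather than
; scalarizing when limited to 256-bit vectors.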

define <16 x i8> @trunc_packus_v16i32_v16i8(<16 x i32>* %p) "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_packus_v16i32_v16i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vpackusdw 32(%rdi), %ymm0, %ymm0
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT:    vpmovuswb %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %a = load <16 x i32>, <16 x i32>* %p
  %b = icmp slt <16 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %c = select <16 x i1> %b, <16 x i32> %a, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %d = icmp sgt <16 x i32> %c, zeroinitializer
  %e = select <16 x i1> %d, <16 x i32> %c, <16 x i32> zeroinitializer
  %f = trunc <16 x i32> %e to <16 x i8>
  ret <16 x i8> %f
}

define void @trunc_packus_v16i32_v16i8_store(<16 x i32>* %p, <16 x i8>* %q) "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_packus_v16i32_v16i8_store:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
; CHECK-NEXT:    vpackusdw 32(%rdi), %ymm0, %ymm0
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT:    vpmovuswb %ymm0, (%rsi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %a = load <16 x i32>, <16 x i32>* %p
  %b = icmp slt <16 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %c = select <16 x i1> %b, <16 x i32> %a, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %d = icmp sgt <16 x i32> %c, zeroinitializer
  %e = select <16 x i1> %d, <16 x i32> %c, <16 x i32> zeroinitializer
  %f = trunc <16 x i32> %e to <16 x i8>
  store <16 x i8> %f, <16 x i8>* %q
  ret void
}