1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+prefer-256-bit | FileCheck %s --check-prefix=AVX256
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,-prefer-256-bit | FileCheck %s --check-prefix=AVX512VL
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+prefer-256-bit | FileCheck %s --check-prefix=AVX512F
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,-prefer-256-bit | FileCheck %s --check-prefix=AVX512F
; Test: load an <8 x i32> vector from %p, compare every lane against zero, and
; sign-extend the resulting <8 x i1> mask to <8 x i16> (a true lane becomes
; all-ones, a false lane becomes zero).
; The assertions pin the expected x86 code per FileCheck prefix. Here the
; AVX256 and AVX512VL expectations are identical and use only xmm/ymm vector
; registers, while the AVX512F expectation truncates through a zmm register.
7 define <8 x i16> @testv8i1_sext_v8i16(<8 x i32>* %p) {
8 ; AVX256-LABEL: testv8i1_sext_v8i16:
10 ; AVX256-NEXT: vmovdqa (%rdi), %ymm0
11 ; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
12 ; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
13 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
14 ; AVX256-NEXT: vpmovdw %ymm0, %xmm0
15 ; AVX256-NEXT: vzeroupper
18 ; AVX512VL-LABEL: testv8i1_sext_v8i16:
20 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
21 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
22 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
23 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
24 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
25 ; AVX512VL-NEXT: vzeroupper
28 ; AVX512F-LABEL: testv8i1_sext_v8i16:
30 ; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
31 ; AVX512F-NEXT: vpcmpeqd (%rdi), %ymm0, %ymm0
32 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
33 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
34 ; AVX512F-NEXT: vzeroupper
; IR under test: load, per-lane "== 0" compare producing an i1 mask, then sext.
36 %in = load <8 x i32>, <8 x i32>* %p
37 %cmp = icmp eq <8 x i32> %in, zeroinitializer
38 %ext = sext <8 x i1> %cmp to <8 x i16>
; Test: build two <8 x i1> masks by comparing the <8 x i32> vectors loaded from
; %p and %q against zero, concatenate them into one <16 x i1> mask, and
; sign-extend that mask to <16 x i8>.
; The AVX256 expectation handles the two halves separately in xmm/ymm registers
; and packs them with vpacksswb; the AVX512VL and AVX512F expectations merge
; the two mask registers with kunpckbw and materialize the result via a zmm
; register.
42 define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
43 ; AVX256-LABEL: testv16i1_sext_v16i8:
45 ; AVX256-NEXT: vmovdqa (%rdi), %ymm0
46 ; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
47 ; AVX256-NEXT: vmovdqa (%rsi), %ymm0
48 ; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
49 ; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
50 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
51 ; AVX256-NEXT: vpmovdw %ymm1, %xmm1
52 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
53 ; AVX256-NEXT: vpmovdw %ymm0, %xmm0
54 ; AVX256-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
55 ; AVX256-NEXT: vzeroupper
58 ; AVX512VL-LABEL: testv16i1_sext_v16i8:
60 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
61 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
62 ; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
63 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
64 ; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
65 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
66 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
67 ; AVX512VL-NEXT: vzeroupper
70 ; AVX512F-LABEL: testv16i1_sext_v16i8:
72 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
73 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
74 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
75 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
76 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
77 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
78 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
79 ; AVX512F-NEXT: vzeroupper
; IR under test: two independent load + "== 0" compares, one per pointer.
81 %in = load <8 x i32>, <8 x i32>* %p
82 %cmp = icmp eq <8 x i32> %in, zeroinitializer
83 %in2 = load <8 x i32>, <8 x i32>* %q
84 %cmp2 = icmp eq <8 x i32> %in2, zeroinitializer
; Shuffle indices 0-15 take all lanes of %cmp followed by all lanes of %cmp2,
; i.e. a plain concatenation of the two masks.
85 %concat = shufflevector <8 x i1> %cmp, <8 x i1> %cmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
86 %ext = sext <16 x i1> %concat to <16 x i8>
; Test: build two <8 x i1> masks by comparing the <8 x i32> vectors loaded from
; %p and %q against zero, concatenate them into one <16 x i1> mask, and
; sign-extend that mask to <16 x i16>.
; The AVX256 expectation extends each half on its own and joins the halves with
; vinserti128; the AVX512VL and AVX512F expectations merge the mask registers
; with kunpckbw and truncate from a zmm register with vpmovdw.
90 define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
91 ; AVX256-LABEL: testv16i1_sext_v16i16:
93 ; AVX256-NEXT: vmovdqa (%rdi), %ymm0
94 ; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
95 ; AVX256-NEXT: vmovdqa (%rsi), %ymm0
96 ; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
97 ; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
98 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
99 ; AVX256-NEXT: vpmovdw %ymm1, %xmm1
100 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k2} {z}
101 ; AVX256-NEXT: vpmovdw %ymm0, %xmm0
102 ; AVX256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
105 ; AVX512VL-LABEL: testv16i1_sext_v16i16:
107 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
108 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
109 ; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
110 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
111 ; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
112 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
113 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
114 ; AVX512VL-NEXT: retq
116 ; AVX512F-LABEL: testv16i1_sext_v16i16:
118 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
119 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
120 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
121 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
122 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
123 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
124 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; IR under test: two independent load + "== 0" compares, one per pointer.
126 %in = load <8 x i32>, <8 x i32>* %p
127 %cmp = icmp eq <8 x i32> %in, zeroinitializer
128 %in2 = load <8 x i32>, <8 x i32>* %q
129 %cmp2 = icmp eq <8 x i32> %in2, zeroinitializer
; Shuffle indices 0-15 take all lanes of %cmp followed by all lanes of %cmp2,
; i.e. a plain concatenation of the two masks.
130 %concat = shufflevector <8 x i1> %cmp, <8 x i1> %cmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
131 %ext = sext <16 x i1> %concat to <16 x i16>
; Test: load an <8 x i32> vector from %p, compare every lane against zero, and
; zero-extend the resulting <8 x i1> mask to <8 x i16> (a true lane becomes 1,
; a false lane becomes 0).
; Every prefix's expectation ends with a "vpsrlw $15" on the 16-bit lanes ahead
; of vzeroupper. The AVX256 and AVX512VL expectations stay in xmm/ymm
; registers; the AVX512F expectation truncates through a zmm register.
135 define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) {
136 ; AVX256-LABEL: testv8i1_zext_v8i16:
138 ; AVX256-NEXT: vmovdqa (%rdi), %ymm0
139 ; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
140 ; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
141 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
142 ; AVX256-NEXT: vpmovdw %ymm0, %xmm0
143 ; AVX256-NEXT: vpsrlw $15, %xmm0, %xmm0
144 ; AVX256-NEXT: vzeroupper
147 ; AVX512VL-LABEL: testv8i1_zext_v8i16:
149 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
150 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
151 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
152 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
153 ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
154 ; AVX512VL-NEXT: vpsrlw $15, %xmm0, %xmm0
155 ; AVX512VL-NEXT: vzeroupper
156 ; AVX512VL-NEXT: retq
158 ; AVX512F-LABEL: testv8i1_zext_v8i16:
160 ; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
161 ; AVX512F-NEXT: vpcmpeqd (%rdi), %ymm0, %ymm0
162 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
163 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
164 ; AVX512F-NEXT: vzeroupper
; IR under test: load, per-lane "== 0" compare producing an i1 mask, then zext.
166 %in = load <8 x i32>, <8 x i32>* %p
167 %cmp = icmp eq <8 x i32> %in, zeroinitializer
168 %ext = zext <8 x i1> %cmp to <8 x i16>
; Test: build two <8 x i1> masks by comparing the <8 x i32> vectors loaded from
; %p and %q against zero, concatenate them into one <16 x i1> mask, and
; zero-extend that mask to <16 x i8>.
; The AVX256 expectation extends each half separately, shifts each with
; "vpsrlw $15", and packs the halves with vpackuswb. The AVX512VL and AVX512F
; expectations merge the mask registers with kunpckbw and use a masked
; vpbroadcastd whose RIP-relative operand is matched by the
; {{\.?LCPI[0-9]+_[0-9]+}} FileCheck regex (presumably a constant-pool label;
; the constant's value is not checked here).
172 define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
173 ; AVX256-LABEL: testv16i1_zext_v16i8:
175 ; AVX256-NEXT: vmovdqa (%rdi), %ymm0
176 ; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
177 ; AVX256-NEXT: vmovdqa (%rsi), %ymm0
178 ; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
179 ; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
180 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
181 ; AVX256-NEXT: vpmovdw %ymm1, %xmm1
182 ; AVX256-NEXT: vpsrlw $15, %xmm1, %xmm1
183 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
184 ; AVX256-NEXT: vpmovdw %ymm0, %xmm0
185 ; AVX256-NEXT: vpsrlw $15, %xmm0, %xmm0
186 ; AVX256-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
187 ; AVX256-NEXT: vzeroupper
190 ; AVX512VL-LABEL: testv16i1_zext_v16i8:
192 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
193 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
194 ; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
195 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
196 ; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
197 ; AVX512VL-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k1} {z}
198 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
199 ; AVX512VL-NEXT: vzeroupper
200 ; AVX512VL-NEXT: retq
202 ; AVX512F-LABEL: testv16i1_zext_v16i8:
204 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
205 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
206 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
207 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
208 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
209 ; AVX512F-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k1} {z}
210 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
211 ; AVX512F-NEXT: vzeroupper
; IR under test: two independent load + "== 0" compares, one per pointer.
213 %in = load <8 x i32>, <8 x i32>* %p
214 %cmp = icmp eq <8 x i32> %in, zeroinitializer
215 %in2 = load <8 x i32>, <8 x i32>* %q
216 %cmp2 = icmp eq <8 x i32> %in2, zeroinitializer
; Shuffle indices 0-15 take all lanes of %cmp followed by all lanes of %cmp2,
; i.e. a plain concatenation of the two masks.
217 %concat = shufflevector <8 x i1> %cmp, <8 x i1> %cmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
218 %ext = zext <16 x i1> %concat to <16 x i8>
; Test: build two <8 x i1> masks by comparing the <8 x i32> vectors loaded from
; %p and %q against zero, concatenate them into one <16 x i1> mask, and
; zero-extend that mask to <16 x i16>.
; Every prefix's expectation ends the vector computation with a single
; "vpsrlw $15" on the full ymm result. The AVX256 expectation assembles that
; result from two halves with vinserti128; the AVX512VL and AVX512F
; expectations merge the mask registers with kunpckbw and truncate from a zmm
; register with vpmovdw.
222 define <16 x i16> @testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
223 ; AVX256-LABEL: testv16i1_zext_v16i16:
225 ; AVX256-NEXT: vmovdqa (%rdi), %ymm0
226 ; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
227 ; AVX256-NEXT: vmovdqa (%rsi), %ymm0
228 ; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
229 ; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
230 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
231 ; AVX256-NEXT: vpmovdw %ymm1, %xmm1
232 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k2} {z}
233 ; AVX256-NEXT: vpmovdw %ymm0, %xmm0
234 ; AVX256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
235 ; AVX256-NEXT: vpsrlw $15, %ymm0, %ymm0
238 ; AVX512VL-LABEL: testv16i1_zext_v16i16:
240 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
241 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
242 ; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
243 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
244 ; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
245 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
246 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
247 ; AVX512VL-NEXT: vpsrlw $15, %ymm0, %ymm0
248 ; AVX512VL-NEXT: retq
250 ; AVX512F-LABEL: testv16i1_zext_v16i16:
252 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
253 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
254 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
255 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
256 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
257 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
258 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
259 ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0
; IR under test: two independent load + "== 0" compares, one per pointer.
261 %in = load <8 x i32>, <8 x i32>* %p
262 %cmp = icmp eq <8 x i32> %in, zeroinitializer
263 %in2 = load <8 x i32>, <8 x i32>* %q
264 %cmp2 = icmp eq <8 x i32> %in2, zeroinitializer
; Shuffle indices 0-15 take all lanes of %cmp followed by all lanes of %cmp2,
; i.e. a plain concatenation of the two masks.
265 %concat = shufflevector <8 x i1> %cmp, <8 x i1> %cmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
266 %ext = zext <16 x i1> %concat to <16 x i16>