1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle %s -o - | FileCheck --check-prefixes=CHECK,CHECK-FAST %s
3 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-perlane-shuffle %s -o - | FileCheck --check-prefixes=CHECK,CHECK-FAST-PERLANE %s
5 ; FIXME: All cases here should be fixed by PR34380
; Unmasked 16 x i16 -> 8 x i16 partial permute of a register operand: picks
; lanes <8,6,12,4,7,9,14,8> of %vec. The expected code materializes the index
; vector with vpmovsxbw and performs a single 256-bit vpermw.
7 define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
8 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
10 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [8,6,12,4,7,9,14,8]
11 ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
12 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
13 ; CHECK-NEXT: vzeroupper
15 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
; Merge-masked form of the <8,6,12,4,7,9,14,8> permute: a result lane takes the
; shuffled element where the matching %mask lane is zero, otherwise the %vec2
; lane. Expected code: vpermw, then vptestnmw builds %k1 and vpblendmw merges.
18 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
19 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
21 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [8,6,12,4,7,9,14,8]
22 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
23 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
24 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
25 ; CHECK-NEXT: vzeroupper
27 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
28 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
29 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked form of the <8,6,12,4,7,9,14,8> permute: lanes whose %mask
; element is non-zero are replaced by zero. The expected code folds the mask
; into vpermw via {%k1} {z}, with %k1 produced by vptestnmw.
33 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
34 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
36 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [8,6,12,4,7,9,14,8]
37 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
38 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
39 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
40 ; CHECK-NEXT: vzeroupper
42 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
43 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
44 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 16 x i16 -> 8 x i16 permute with lanes <4,12,9,4,14,15,12,14>.
; Where %mask is zero the shuffled lane is kept; elsewhere the %vec2 lane is
; used. Expected code: ymm vpermw followed by a vpblendmw under %k1.
47 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
48 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
50 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [4,12,9,4,14,15,12,14]
51 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
52 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
53 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
54 ; CHECK-NEXT: vzeroupper
56 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
57 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
58 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 16 x i16 -> 8 x i16 permute with lanes <4,12,9,4,14,15,12,14>.
; Lanes with a non-zero %mask element are zeroed; the expected code is a
; vpermw carrying {%k1} {z}.
62 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
63 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
65 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [4,12,9,4,14,15,12,14]
66 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
67 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
68 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
69 ; CHECK-NEXT: vzeroupper
71 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
72 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
73 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 16 x i16 -> 8 x i16 permute with lanes <4,11,14,10,7,1,6,9>.
; The select keeps the shuffled lane where %mask is zero and the %vec2 lane
; otherwise. Expected code: vpermw, vptestnmw into %k1, vpblendmw.
76 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
77 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
79 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [4,11,14,10,7,1,6,9]
80 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
81 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
82 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
83 ; CHECK-NEXT: vzeroupper
85 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
86 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
87 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 16 x i16 -> 8 x i16 permute with lanes <4,11,14,10,7,1,6,9>.
; The select yields zero for every lane whose %mask element is non-zero; the
; expected code applies {%k1} {z} directly on vpermw.
91 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) {
92 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2:
94 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [4,11,14,10,7,1,6,9]
95 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
96 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
97 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
98 ; CHECK-NEXT: vzeroupper
100 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
101 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
102 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 16 x i16 -> 8 x i16 permute selecting lanes <14,15,7,13,4,12,8,0>
; of %vec. Expected code: index vector loaded with vpmovsxbw, then one ymm
; vpermw whose low 128 bits are the result.
105 define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
106 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask3:
108 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [14,15,7,13,4,12,8,0]
109 ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
110 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
111 ; CHECK-NEXT: vzeroupper
113 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
; Merge-masked 16 x i16 -> 8 x i16 permute with lanes <14,15,7,13,4,12,8,0>.
; Shuffled lanes are kept where %mask is zero; the remaining lanes come from
; %vec2. Expected code: vpermw followed by vpblendmw under %k1.
116 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
117 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3:
119 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [14,15,7,13,4,12,8,0]
120 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
121 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
122 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
123 ; CHECK-NEXT: vzeroupper
125 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
126 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
127 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 16 x i16 -> 8 x i16 permute with lanes <14,15,7,13,4,12,8,0>.
; Lanes with a non-zero %mask element are forced to zero; the expected code is
; vptestnmw into %k1 plus a zero-masking vpermw.
131 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %mask) {
132 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask3:
134 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [14,15,7,13,4,12,8,0]
135 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
136 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
137 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
138 ; CHECK-NEXT: vzeroupper
140 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
141 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
142 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 16 x i16 -> 8 x i16 permute of a vector loaded from %vp, selecting
; lanes <0,7,13,3,5,13,3,9>. The expected code works on the two 128-bit halves:
; the low half is loaded with vmovdqa and the high half (16(%rdi)) is folded
; into a two-source xmm vpermi2w.
145 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(ptr %vp) {
146 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
148 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
149 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9]
150 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0
152 %vec = load <16 x i16>, ptr %vp
153 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
; Merge-masked permute <0,7,13,3,5,13,3,9> of a vector loaded from %vp. Lanes
; where %mask is zero take the shuffled element, others take %vec2. Expected
; code: xmm vpermi2w with the high half folded from memory, then a
; merge-masked vmovdqu16 into %xmm0 under %k1.
156 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
157 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
159 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
160 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [0,7,13,3,5,13,3,9]
161 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
162 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
163 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
165 %vec = load <16 x i16>, ptr %vp
166 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
167 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
168 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked permute <0,7,13,3,5,13,3,9> of a vector loaded from %vp. Lanes
; with a non-zero %mask element become zero. The expected code applies
; {%k1} {z} on the xmm vpermi2w and copies its result into %xmm0.
172 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) {
173 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
175 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
176 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9]
177 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
178 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
179 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
181 %vec = load <16 x i16>, ptr %vp
182 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
183 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
184 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked permute <3,15,12,7,1,5,8,14> of a vector loaded from %vp; the
; select keeps the shuffled lane where %mask is zero and the %vec2 lane
; otherwise. Expected code: xmm vpermi2w (high half from 16(%rdi)) and a
; merge-masked vmovdqu16.
188 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
189 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
191 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
192 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [3,15,12,7,1,5,8,14]
193 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
194 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
195 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
197 %vec = load <16 x i16>, ptr %vp
198 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
199 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
200 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked permute <3,15,12,7,1,5,8,14> of a vector loaded from %vp; lanes
; whose %mask element is non-zero are zeroed. Expected code: zero-masking xmm
; vpermi2w with the high half folded from memory.
204 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) {
205 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
207 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
208 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14]
209 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
210 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
211 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
213 %vec = load <16 x i16>, ptr %vp
214 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
215 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
216 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked permute <1,8,11,8,13,8,15,9> of a vector loaded from %vp.
; In the expected code the operand roles are swapped relative to the other
; mem tests: the HIGH half (16(%rdi)) is loaded into a register and the low
; half is the folded memory operand, so the emitted index vector
; [9,0,3,0,5,0,7,1] is the IR mask with each index's half bit (8) flipped.
220 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
221 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
223 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
224 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [9,0,3,0,5,0,7,1]
225 ; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm3
226 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
227 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
229 %vec = load <16 x i16>, ptr %vp
230 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
231 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
232 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked permute <1,8,11,8,13,8,15,9> of a vector loaded from %vp; lanes
; with a non-zero %mask element are zeroed. As in the merge-masked variant of
; this mask, the expected code loads the high half into a register and folds
; the low half from memory, so the emitted indices are the IR mask with the
; half bit (8) flipped.
236 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) {
237 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
239 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
240 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [9,0,3,0,5,0,7,1]
241 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
242 ; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z}
243 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
245 %vec = load <16 x i16>, ptr %vp
246 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
247 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
248 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked permute <9,7,9,6,9,4,3,2> of a 16 x i16 vector loaded from %vp.
; Expected code: low half loaded with vmovdqa, high half folded from
; 16(%rdi) into an xmm vpermi2w whose index register also receives the result.
252 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(ptr %vp) {
253 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
255 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
256 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2]
257 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0
259 %vec = load <16 x i16>, ptr %vp
260 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
; Merge-masked permute <9,7,9,6,9,4,3,2> of a vector loaded from %vp: shuffled
; lanes where %mask is zero, %vec2 lanes elsewhere. Expected code: xmm
; vpermi2w with 16(%rdi) folded, then vmovdqu16 into %xmm0 under %k1.
263 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
264 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
266 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
267 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [9,7,9,6,9,4,3,2]
268 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
269 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
270 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
272 %vec = load <16 x i16>, ptr %vp
273 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
274 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
275 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked permute <9,7,9,6,9,4,3,2> of a vector loaded from %vp; lanes
; with a non-zero %mask element are zeroed. Expected code: vptestnmw into %k1
; and a zero-masking xmm vpermi2w with 16(%rdi) folded.
279 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) {
280 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
282 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
283 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2]
284 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
285 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
286 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
288 %vec = load <16 x i16>, ptr %vp
289 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
290 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
291 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 32 x i16 -> 16 x i16 partial permute of a register operand.
; The expected code extracts the upper 256 bits with vextracti64x4 and uses
; them as the first table of a ymm vpermi2w (lower half second), so each
; emitted index equals the IR mask index with bit 4 (value 16) flipped.
295 define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) {
296 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask0:
298 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
299 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
300 ; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1
301 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
303 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
; Merge-masked 32 x i16 -> 16 x i16 permute: a result lane takes the shuffled
; element where the matching %mask lane is zero, else the %vec2 lane.
; Expected code: vextracti64x4 of the upper half, ymm vpermi2w (indices are
; the IR mask with bit 4 flipped), vptestnmw into %k1, then vpblendmw.
306 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
307 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0:
309 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
310 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
311 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
312 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
313 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
315 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
316 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
317 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 x i16 -> 16 x i16 permute: lanes whose %mask element is
; non-zero are zeroed. Expected code: vextracti64x4 of the upper half and a
; zero-masking ymm vpermi2w whose result is then copied into %ymm0.
321 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %mask) {
322 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0:
324 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
325 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
326 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
327 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
328 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
330 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
331 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
332 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked 32 x i16 -> 16 x i16 permute using a second constant mask.
; Shuffled lanes are kept where %mask is zero, %vec2 lanes otherwise.
; Expected code: upper half via vextracti64x4, ymm vpermi2w with indices equal
; to the IR mask with bit 4 flipped, then vpblendmw under %k1.
335 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
336 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1:
338 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
339 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
340 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
341 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
342 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
344 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
345 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
346 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 x i16 -> 16 x i16 permute using the second constant mask;
; lanes with a non-zero %mask element become zero. Expected code: upper half
; via vextracti64x4 and a ymm vpermi2w carrying {%k1} {z}.
350 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %mask) {
351 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1:
353 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
354 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
355 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
356 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
357 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
359 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
360 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
361 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked 32 x i16 -> 16 x i16 permute using a third constant mask.
; The select picks the shuffled lane where %mask is zero and the %vec2 lane
; otherwise. Expected code: vextracti64x4, ymm vpermi2w (IR indices with
; bit 4 flipped), vptestnmw into %k1, vpblendmw.
364 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
365 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2:
367 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
368 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
369 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
370 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
371 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
373 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
374 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
375 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 x i16 -> 16 x i16 permute using the third constant mask;
; lanes whose %mask element is non-zero are zeroed. Expected code:
; vextracti64x4 plus a zero-masking ymm vpermi2w, result copied to %ymm0.
379 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %mask) {
380 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2:
382 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
383 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
384 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
385 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
386 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
388 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
389 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
390 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Unmasked 32 x i16 -> 16 x i16 permute. Unlike the mask0 variant, the
; expected code here is one 512-bit vpermw over the whole source with a
; ymm-sized index vector identical to the IR mask; no half extraction.
393 define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
394 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3:
396 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
397 ; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
398 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
400 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
; Merge-masked 32 x i16 -> 16 x i16 permute: shuffled lanes where %mask is
; zero, %vec2 lanes elsewhere. Expected code: a single zmm vpermw with the IR
; mask as index vector, then vptestnmw into %k1 and a ymm vpblendmw.
403 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
404 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3:
406 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
407 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm0
408 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
409 ; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
411 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
412 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
413 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 x i16 -> 16 x i16 permute: lanes with a non-zero %mask
; element are zeroed. Expected code: vptestnmw into %k1 and one zmm vpermw
; carrying {%k1} {z}.
417 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) {
418 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3:
420 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
421 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
422 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
423 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
425 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
426 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
427 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Unmasked 32 x i16 -> 8 x i16 partial permute selecting lanes
; <6,11,23,26,29,5,21,30>. Expected code: upper half extracted with
; vextracti64x4 and overwritten by a ymm vpermt2w; the emitted indices
; [22,27,7,10,13,21,5,14] are the IR mask with bit 4 (value 16) flipped.
430 define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) {
431 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0:
433 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14]
434 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
435 ; CHECK-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
436 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
437 ; CHECK-NEXT: vzeroupper
439 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
; Merge-masked 32 x i16 -> 8 x i16 permute <6,11,23,26,29,5,21,30>: shuffled
; lanes where %mask is zero, %vec2 lanes elsewhere. Expected code:
; vextracti64x4, ymm vpermt2w (indices are the IR mask with bit 4 flipped),
; vptestnmw into %k1, then an xmm vpblendmw.
442 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
443 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0:
445 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
446 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
447 ; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm4
448 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
449 ; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
450 ; CHECK-NEXT: vzeroupper
452 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
453 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
454 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32 x i16 -> 8 x i16 permute <6,11,23,26,29,5,21,30>: lanes with
; a non-zero %mask element are zeroed. Expected code: vextracti64x4 and a ymm
; vpermt2w carrying {%k1} {z}, whose low 128 bits are copied into %xmm0.
458 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) {
459 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0:
461 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
462 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
463 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
464 ; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm2 {%k1} {z}
465 ; CHECK-NEXT: vmovdqa %xmm2, %xmm0
466 ; CHECK-NEXT: vzeroupper
468 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
469 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
470 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 32 x i16 -> 8 x i16 permute <1,21,27,10,8,19,14,5>: shuffled
; lanes where %mask is zero, %vec2 lanes elsewhere. Expected code: one zmm
; vpermw with the IR mask as indices, then an xmm vpblendmw under %k1.
473 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
474 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1:
476 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,21,27,10,8,19,14,5]
477 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm0
478 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
479 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
480 ; CHECK-NEXT: vzeroupper
482 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
483 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
484 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32 x i16 -> 8 x i16 permute <1,21,27,10,8,19,14,5>: lanes with a
; non-zero %mask element are zeroed. Expected code: vptestnmw into %k1 and a
; zmm vpermw carrying {%k1} {z}.
488 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %mask) {
489 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1:
491 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,21,27,10,8,19,14,5]
492 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
493 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
494 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
495 ; CHECK-NEXT: vzeroupper
497 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
498 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
499 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 32 x i16 -> 8 x i16 permute <15,13,18,16,9,11,26,8>. The select
; keeps the shuffled lane where %mask is zero and the %vec2 lane otherwise.
; Expected code: zmm vpermw, vptestnmw into %k1, xmm vpblendmw.
502 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
503 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2:
505 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [15,13,18,16,9,11,26,8]
506 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm0
507 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
508 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
509 ; CHECK-NEXT: vzeroupper
511 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
512 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
513 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32 x i16 -> 8 x i16 permute <15,13,18,16,9,11,26,8>; the select
; produces zero for lanes whose %mask element is non-zero. Expected code: a
; zero-masking zmm vpermw with %k1 from vptestnmw.
517 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %mask) {
518 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2:
520 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [15,13,18,16,9,11,26,8]
521 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
522 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
523 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
524 ; CHECK-NEXT: vzeroupper
526 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
527 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
528 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 32 x i16 -> 8 x i16 permute selecting lanes <17,0,23,10,1,8,7,30>.
; Expected code: index vector loaded with vpmovsxbw and a single zmm vpermw
; whose low 128 bits are the result.
531 define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) {
532 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask3:
534 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [17,0,23,10,1,8,7,30]
535 ; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
536 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
537 ; CHECK-NEXT: vzeroupper
539 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
; Merge-masked 32 x i16 -> 8 x i16 permute <17,0,23,10,1,8,7,30>: shuffled
; lanes where %mask is zero, %vec2 lanes elsewhere. Expected code: zmm
; vpermw followed by an xmm vpblendmw under %k1.
542 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
543 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3:
545 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [17,0,23,10,1,8,7,30]
546 ; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm0
547 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
548 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
549 ; CHECK-NEXT: vzeroupper
551 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
552 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
553 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32 x i16 -> 8 x i16 permute <17,0,23,10,1,8,7,30>: lanes with a
; non-zero %mask element are zeroed. Expected code: vptestnmw into %k1 and a
; zmm vpermw carrying {%k1} {z}.
557 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %mask) {
558 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3:
560 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [17,0,23,10,1,8,7,30]
561 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
562 ; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
563 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
564 ; CHECK-NEXT: vzeroupper
566 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
567 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
568 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 32 x i16 -> 16 x i16 permute of a vector loaded from %vp.
; Expected code: the low 256 bits are loaded with vmovdqa and the high 256
; bits (32(%rdi)) are folded into a ymm vpermi2w; the index vector equals the
; IR mask.
571 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(ptr %vp) {
572 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0:
574 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
575 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
576 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
578 %vec = load <32 x i16>, ptr %vp
579 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
; Merge-masked 32 x i16 -> 16 x i16 permute of a vector loaded from %vp:
; shuffled lanes where %mask is zero, %vec2 lanes elsewhere. Expected code:
; ymm vpermi2w with 32(%rdi) folded, vptestnmw into %k1, then a merge-masked
; vmovdqu16 into %ymm0.
582 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
583 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0:
585 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
586 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
587 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
588 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
589 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
591 %vec = load <32 x i16>, ptr %vp
592 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
593 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
594 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 x i16 -> 16 x i16 permute of a vector loaded from %vp: lanes
; with a non-zero %mask element are zeroed. Expected code: zero-masking ymm
; vpermi2w with 32(%rdi) folded, result copied into %ymm0.
598 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %mask) {
599 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0:
601 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
602 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
603 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
604 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
605 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
607 %vec = load <32 x i16>, ptr %vp
608 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
609 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
610 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked 32 x i16 -> 16 x i16 permute (second constant mask) of a vector
; loaded from %vp. The select keeps shuffled lanes where %mask is zero and
; %vec2 lanes otherwise. Expected code: ymm vpermi2w with 32(%rdi) folded and
; a merge-masked vmovdqu16.
614 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
615 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1:
617 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
618 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
619 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
620 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
621 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
623 %vec = load <32 x i16>, ptr %vp
624 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
625 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
626 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 x i16 -> 16 x i16 permute (second constant mask) of a vector
; loaded from %vp; lanes whose %mask element is non-zero are zeroed. Expected
; code: vptestnmw into %k1 and a zero-masking ymm vpermi2w with 32(%rdi)
; folded.
630 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %mask) {
631 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1:
633 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
634 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
635 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
636 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
637 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
639 %vec = load <32 x i16>, ptr %vp
640 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
641 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
642 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked 32 x i16 -> 16 x i16 permute (third constant mask) of a vector
; loaded from %vp. In the expected code the operand roles are swapped: the
; HIGH 256 bits (32(%rdi)) are loaded into a register and the low 256 bits are
; the folded memory operand, so each emitted index is the IR mask index with
; bit 4 (value 16) flipped.
646 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
647 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2:
649 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
650 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
651 ; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3
652 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
653 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
655 %vec = load <32 x i16>, ptr %vp
656 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
657 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
658 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 x i16 -> 16 x i16 permute (third constant mask) of a vector
; loaded from %vp; lanes with a non-zero %mask element are zeroed. The
; expected code loads the high 256 bits into a register and folds the low 256
; bits from memory, so the emitted indices are the IR mask with bit 4 flipped.
662 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %mask) {
663 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2:
665 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
666 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
667 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
668 ; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm1 {%k1} {z}
669 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
671 %vec = load <32 x i16>, ptr %vp
672 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
673 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
674 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Unmasked memory test: loads <32 x i16> from %vp and picks 16 lanes with a
; constant shufflevector index list.
678 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(ptr %vp) {
679 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3:
681 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
682 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
683 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
685 %vec = load <32 x i16>, ptr %vp
686 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
; Merge-masked memory test: loads <32 x i16> from %vp and picks 16 lanes with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes take %vec2.
689 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
690 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3:
692 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
693 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
694 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
695 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
696 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
698 %vec = load <32 x i16>, ptr %vp
699 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
700 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
701 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked memory test: loads <32 x i16> from %vp and picks 16 lanes with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes are zero.
705 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %mask) {
706 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3:
708 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
709 ; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
710 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
711 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
712 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
714 %vec = load <32 x i16>, ptr %vp
715 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
716 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
717 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Unmasked memory test: loads <32 x i16> from %vp and picks 8 lanes with a
; constant shufflevector index list.
721 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(ptr %vp) {
722 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0:
724 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [16,17,5,1,14,14,13,17]
725 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0
726 ; CHECK-NEXT: vpermt2w (%rdi), %ymm1, %ymm0
727 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
728 ; CHECK-NEXT: vzeroupper
730 %vec = load <32 x i16>, ptr %vp
731 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
; Merge-masked memory test: loads <32 x i16> from %vp and picks 8 lanes with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes take %vec2.
734 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
735 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0:
737 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17]
738 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3
739 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3
740 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
741 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
742 ; CHECK-NEXT: vzeroupper
744 %vec = load <32 x i16>, ptr %vp
745 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
746 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
747 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked memory test: loads <32 x i16> from %vp and picks 8 lanes with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes are zero.
751 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) {
752 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0:
754 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17]
755 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
756 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
757 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z}
758 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
759 ; CHECK-NEXT: vzeroupper
761 %vec = load <32 x i16>, ptr %vp
762 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
763 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
764 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked memory test: loads <32 x i16> from %vp and picks 8 lanes with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes take %vec2.
768 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
769 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1:
771 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
772 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3
773 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3
774 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
775 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
776 ; CHECK-NEXT: vzeroupper
778 %vec = load <32 x i16>, ptr %vp
779 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
780 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
781 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked memory test: loads <32 x i16> from %vp and picks 8 lanes with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes are zero.
785 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) {
786 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1:
788 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
789 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
790 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
791 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z}
792 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
793 ; CHECK-NEXT: vzeroupper
795 %vec = load <32 x i16>, ptr %vp
796 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
797 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
798 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked memory test: loads <32 x i16> from %vp and picks 8 lanes with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes take %vec2.
802 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
803 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2:
805 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10]
806 ; CHECK-NEXT: vmovdqa (%rdi), %ymm3
807 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3
808 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
809 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
810 ; CHECK-NEXT: vzeroupper
812 %vec = load <32 x i16>, ptr %vp
813 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
814 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
815 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked memory test: loads <32 x i16> from %vp and picks 8 lanes with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes are zero.
819 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) {
820 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2:
822 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10]
823 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
824 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
825 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
826 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
827 ; CHECK-NEXT: vzeroupper
829 %vec = load <32 x i16>, ptr %vp
830 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
831 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
832 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked memory test: loads <32 x i16> from %vp and picks 8 lanes with a
; constant shufflevector index list.
836 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(ptr %vp) {
837 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3:
839 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9]
840 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
841 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm1, %ymm0
842 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
843 ; CHECK-NEXT: vzeroupper
845 %vec = load <32 x i16>, ptr %vp
846 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
; Merge-masked memory test: loads <32 x i16> from %vp and picks 8 lanes with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes take %vec2.
849 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
850 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3:
852 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9]
853 ; CHECK-NEXT: vmovdqa (%rdi), %ymm3
854 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3
855 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
856 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
857 ; CHECK-NEXT: vzeroupper
859 %vec = load <32 x i16>, ptr %vp
860 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
861 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
862 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked memory test: loads <32 x i16> from %vp and picks 8 lanes with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes are zero.
866 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) {
867 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3:
869 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9]
870 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
871 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
872 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
873 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
874 ; CHECK-NEXT: vzeroupper
876 %vec = load <32 x i16>, ptr %vp
877 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
878 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
879 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked register test: picks 8 lanes of the <16 x i16> argument %vec with the
; constant shufflevector index list <14,8,4,12,9,4,14,15>.
883 define <8 x i16> @test_16xi16_to_8xi16_E84C94EF(<16 x i16> %vec) {
884 ; CHECK-LABEL: test_16xi16_to_8xi16_E84C94EF:
886 ; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [14,8,4,12,9,4,14,15]
887 ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
888 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
889 ; CHECK-NEXT: vzeroupper
891 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15>
; Unmasked register test: picks 4 lanes of the <8 x i32> argument %vec with a
; constant shufflevector index list.
895 define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) {
896 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0:
898 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,3,2]
899 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
900 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
901 ; CHECK-NEXT: vzeroupper
903 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
; Merge-masked register test: picks 4 lanes of the <8 x i32> argument %vec with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes take %vec2.
906 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
907 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0:
909 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [4,0,3,2]
910 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
911 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
912 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
913 ; CHECK-NEXT: vzeroupper
915 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
916 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
917 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked register test: picks 4 lanes of the <8 x i32> argument %vec with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes are zero.
921 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %mask) {
922 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0:
924 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,0,3,2]
925 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
926 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
927 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
928 ; CHECK-NEXT: vzeroupper
930 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
931 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
932 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked register test: picks 4 lanes of the <8 x i32> argument %vec with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes take %vec2.
935 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
936 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1:
938 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,0,7,3]
939 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
940 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
941 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
942 ; CHECK-NEXT: vzeroupper
944 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
945 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
946 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked register test: picks 4 lanes of the <8 x i32> argument %vec with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes are zero.
950 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %mask) {
951 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1:
953 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,0,7,3]
954 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
955 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
956 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
957 ; CHECK-NEXT: vzeroupper
959 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
960 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
961 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked register test: picks lanes <6,7,2,3> of the <8 x i32> argument
; %vec. Lanes where %mask is zero take the shuffled element; the remaining lanes
; take %vec2. The index list moves adjacent i32 pairs, and the expected code
; below uses an immediate vpermq on 64-bit elements rather than a vpermd.
964 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
965 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask2:
967 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3]
968 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
969 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
970 ; CHECK-NEXT: vzeroupper
972 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
973 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
974 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked register test: picks lanes <6,7,2,3> of the <8 x i32> argument
; %vec. Lanes where %mask is zero take the shuffled element; the remaining lanes
; are zero. The expected code below shuffles with an immediate vpermq and then
; applies the mask with a separate zero-masked move.
978 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %mask) {
979 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask2:
981 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3]
982 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
983 ; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
984 ; CHECK-NEXT: vzeroupper
986 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
987 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
988 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked register test: picks 4 lanes of the <8 x i32> argument %vec with a
; constant shufflevector index list.
991 define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) {
992 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3:
994 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [5,3,2,5]
995 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
996 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
997 ; CHECK-NEXT: vzeroupper
999 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
; Merge-masked register test: picks 4 lanes of the <8 x i32> argument %vec with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes take %vec2.
1002 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1003 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3:
1005 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,3,2,5]
1006 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
1007 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1008 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1009 ; CHECK-NEXT: vzeroupper
1011 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
1012 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1013 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked register test: picks 4 lanes of the <8 x i32> argument %vec with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes are zero.
1017 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %mask) {
1018 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3:
1020 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,3,2,5]
1021 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1022 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
1023 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1024 ; CHECK-NEXT: vzeroupper
1026 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
1027 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1028 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked memory test: loads <8 x i32> from %vp and picks lanes <7,5,0,0>.
; The expected code below uses a two-source vshufps on the two 16-byte halves
; of the load instead of a variable permute.
1031 define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(ptr %vp) {
1032 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0:
1034 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm0
1035 ; CHECK-NEXT: vshufps $7, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[3,1],mem[0,0]
1037 %vec = load <8 x i32>, ptr %vp
1038 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
; Merge-masked memory test: loads <8 x i32> from %vp and picks lanes <7,5,0,0>.
; Lanes where %mask is zero take the shuffled element; the remaining lanes take
; %vec2.
1041 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1042 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0:
1044 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
1045 ; CHECK-NEXT: vshufps $7, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[3,1],mem[0,0]
1046 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1047 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
1049 %vec = load <8 x i32>, ptr %vp
1050 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
1051 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1052 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked memory test: loads <8 x i32> from %vp and picks lanes <7,5,0,0>.
; Lanes where %mask is zero take the shuffled element; the remaining lanes are
; zero.
1056 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) {
1057 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0:
1059 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
1060 ; CHECK-NEXT: vshufps $7, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[3,1],mem[0,0]
1061 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1062 ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
1064 %vec = load <8 x i32>, ptr %vp
1065 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
1066 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1067 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked memory test: loads <8 x i32> from %vp and picks 4 lanes with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes take %vec2.
1071 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1072 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1:
1074 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
1075 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,0,0,3]
1076 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3
1077 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1078 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1080 %vec = load <8 x i32>, ptr %vp
1081 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
1082 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1083 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked memory test: loads <8 x i32> from %vp and picks 4 lanes with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes are zero.
1087 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) {
1088 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1:
1090 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
1091 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [5,0,0,3]
1092 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1093 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z}
1094 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1096 %vec = load <8 x i32>, ptr %vp
1097 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
1098 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1099 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked memory test: loads <8 x i32> from %vp and picks 4 lanes with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes take %vec2.
1103 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1104 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2:
1106 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1107 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,7,7,0]
1108 ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3
1109 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1110 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1112 %vec = load <8 x i32>, ptr %vp
1113 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
1114 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1115 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked memory test: loads <8 x i32> from %vp and picks 4 lanes with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes are zero.
1119 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) {
1120 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2:
1122 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1123 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,7,7,0]
1124 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1125 ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
1126 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1128 %vec = load <8 x i32>, ptr %vp
1129 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
1130 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1131 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked memory test: loads <8 x i32> from %vp and picks 4 lanes with a
; constant shufflevector index list.
1135 define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(ptr %vp) {
1136 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3:
1138 ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm1
1139 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,1,2,7]
1140 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm1, %xmm0
1142 %vec = load <8 x i32>, ptr %vp
1143 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
; Merge-masked memory test: loads <8 x i32> from %vp and picks 4 lanes with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes take %vec2.
1146 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1147 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3:
1149 ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2
1150 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,1,2,7]
1151 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3
1152 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1153 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1155 %vec = load <8 x i32>, ptr %vp
1156 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
1157 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1158 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked memory test: loads <8 x i32> from %vp and picks 4 lanes with a
; constant shufflevector index list. Lanes where %mask is zero take the shuffled
; element; the remaining lanes are zero.
1162 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %mask) {
1163 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3:
1165 ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2
1166 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [5,1,2,7]
1167 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1168 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z}
1169 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1171 %vec = load <8 x i32>, ptr %vp
1172 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
1173 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1174 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked register test: picks 8 lanes of the <16 x i32> argument %vec with a
; constant shufflevector index list.
1178 define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) {
1179 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0:
1181 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [1,13,11,14,7,10,1,6]
1182 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
1183 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1185 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
; Merge-masked register test: picks 8 lanes of the <16 x i32> argument %vec with
; a constant shufflevector index list. Lanes where %mask is zero take the
; shuffled element; the remaining lanes take %vec2.
1188 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1189 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0:
1191 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,13,11,14,7,10,1,6]
1192 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1193 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1194 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1196 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
1197 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1198 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked register test: picks 8 lanes of the <16 x i32> argument %vec with
; a constant shufflevector index list. Lanes where %mask is zero take the
; shuffled element; the remaining lanes are zero.
1202 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %mask) {
1203 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask0:
1205 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,13,11,14,7,10,1,6]
1206 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1207 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1208 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1210 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
1211 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1212 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked register test: picks 8 lanes of the <16 x i32> argument %vec with
; a constant shufflevector index list. Lanes where %mask is zero take the
; shuffled element; the remaining lanes take %vec2.
1215 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1216 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1:
1218 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,0,15,3,2,3,6,8]
1219 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1220 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1221 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1223 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
1224 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1225 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked register test: picks 8 lanes of the <16 x i32> argument %vec with
; a constant shufflevector index list. Lanes where %mask is zero take the
; shuffled element; the remaining lanes are zero.
1229 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %mask) {
1230 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask1:
1232 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8]
1233 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1234 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1235 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1237 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
1238 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1239 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked register test: picks 8 lanes of the <16 x i32> argument %vec with
; a constant shufflevector index list. Lanes where %mask is zero take the
; shuffled element; the remaining lanes take %vec2.
1242 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1243 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2:
1245 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,15,15,2,6,10,14,7]
1246 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1247 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1248 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1250 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
1251 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1252 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked register test: picks 8 lanes of the <16 x i32> argument %vec with
; a constant shufflevector index list. Lanes where %mask is zero take the
; shuffled element; the remaining lanes are zero.
1256 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %mask) {
1257 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask2:
1259 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7]
1260 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1261 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1262 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1264 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
1265 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1266 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked register test: picks 8 lanes of the <16 x i32> argument %vec with a
; constant shufflevector index list.
1269 define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
1270 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3:
1272 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3]
1273 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
1274 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1276 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
; Merge-masked register test: picks 8 lanes of the <16 x i32> argument %vec with
; a constant shufflevector index list. Lanes where %mask is zero take the
; shuffled element; the remaining lanes take %vec2.
1279 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1280 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3:
1282 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [14,5,7,7,10,3,9,3]
1283 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1284 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1285 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1287 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
1288 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1289 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked register test: picks 8 lanes of the <16 x i32> argument %vec with
; a constant shufflevector index list. Lanes where %mask is zero take the
; shuffled element; the remaining lanes are zero.
1293 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %mask) {
1294 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask3:
1296 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3]
1297 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1298 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1299 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1301 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
1302 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1303 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked register test: picks lanes <0,2,4,12> of the <16 x i32> argument
; %vec. The expected index constant below is 8 elements wide; its first four
; entries match the IR index list.
1306 define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
1307 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0:
1309 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12]
1310 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
1311 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1312 ; CHECK-NEXT: vzeroupper
1314 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
; Merge-masked register test: picks lanes <0,2,4,12> of the <16 x i32> argument
; %vec. Lanes where %mask is zero take the shuffled element; the remaining lanes
; take %vec2.
1317 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1318 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0:
1320 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12]
1321 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1322 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1323 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1324 ; CHECK-NEXT: vzeroupper
1326 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
1327 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1328 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked register test: picks lanes <0,2,4,12> of the <16 x i32> argument
; %vec. Lanes where %mask is zero take the shuffled element; the remaining lanes
; are zero.
1332 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) {
1333 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0:
1335 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,12,4,6,4,12]
1336 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1337 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1338 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1339 ; CHECK-NEXT: vzeroupper
1341 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
1342 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1343 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked register test: picks lanes <13,9,11,12> of the <16 x i32>
; argument %vec. Lanes where %mask is zero take the shuffled element; the
; remaining lanes take %vec2. All indices are >= 8, and the expected code below
; extracts the upper 256 bits first and permutes with each index reduced by 8.
1346 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1347 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1:
1349 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,1,3,4]
1350 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1351 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
1352 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1353 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1354 ; CHECK-NEXT: vzeroupper
1356 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
1357 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1358 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked register test: picks lanes <13,9,11,12> of the <16 x i32>
; argument %vec. Lanes where %mask is zero take the shuffled element; the
; remaining lanes are zero. All indices are >= 8, and the expected code below
; extracts the upper 256 bits first and permutes with each index reduced by 8.
1362 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) {
1363 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1:
1365 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,1,3,4]
1366 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1367 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1368 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
1369 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1370 ; CHECK-NEXT: vzeroupper
1372 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
1373 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1374 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked register test: picks lanes <1,1,13,0> of the <16 x i32> argument
; %vec. Lanes where %mask is zero take the shuffled element; the remaining lanes
; take %vec2.
1377 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1378 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2:
1380 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,1,13,0]
1381 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1382 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1383 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1384 ; CHECK-NEXT: vzeroupper
1386 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
1387 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1388 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked register test: picks lanes <1,1,13,0> of the <16 x i32> argument
; %vec. Lanes where %mask is zero take the shuffled element; the remaining lanes
; are zero.
1392 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) {
1393 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask2:
1395 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,1,13,0]
1396 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1397 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1398 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1399 ; CHECK-NEXT: vzeroupper
1401 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
1402 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1403 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked register test: picks lanes <3,0,0,13> of the <16 x i32> argument
; %vec.
1406 define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) {
1407 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3:
1409 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,0,0,13]
1410 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
1411 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1412 ; CHECK-NEXT: vzeroupper
1414 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
; Merge-masked narrowing shuffle: each lane takes element 3, 0, 0, 13 of %vec where %mask is zero, and the matching lane of %vec2 elsewhere.
1417 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1418 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3:
1420 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,0,0,13]
1421 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1422 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1423 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1424 ; CHECK-NEXT: vzeroupper
1426 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
1427 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1428 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked narrowing shuffle: each lane takes element 3, 0, 0, 13 of %vec where %mask is zero, and 0 elsewhere.
1432 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) {
1433 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3:
1435 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,0,0,13]
1436 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1437 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1438 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1439 ; CHECK-NEXT: vzeroupper
1441 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
1442 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1443 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked narrowing shuffle of a loaded vector: builds a <8 x i32> from elements 15, 8, 14, 8, 9, 10, 12, 12 of the <16 x i32> at %vp.
; All selected elements are in the upper half, which is why the expected permute reads from 32(%rdi) with indices reduced by 8.
1446 define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(ptr %vp) {
1447 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask0:
1449 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4]
1450 ; CHECK-NEXT: vpermps 32(%rdi), %ymm0, %ymm0
1452 %vec = load <16 x i32>, ptr %vp
1453 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
; Merge-masked narrowing shuffle of a loaded vector: each lane takes element 15, 8, 14, 8, 9, 10, 12, 12 of the <16 x i32> at %vp
; where %mask is zero, and the matching lane of %vec2 elsewhere.
1456 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1457 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0:
1459 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4]
1460 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1461 ; CHECK-NEXT: vpermd 32(%rdi), %ymm2, %ymm0 {%k1}
1463 %vec = load <16 x i32>, ptr %vp
1464 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
1465 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1466 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked narrowing shuffle of a loaded vector: each lane takes element 15, 8, 14, 8, 9, 10, 12, 12 of the <16 x i32> at %vp
; where %mask is zero, and 0 elsewhere.
1470 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %mask) {
1471 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0:
1473 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4]
1474 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1475 ; CHECK-NEXT: vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z}
1477 %vec = load <16 x i32>, ptr %vp
1478 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
1479 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1480 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked narrowing shuffle of a loaded vector: each lane takes element 15, 11, 14, 3, 8, 9, 13, 7 of the <16 x i32> at %vp
; where %mask is zero, and the matching lane of %vec2 elsewhere.
; The expected two-source permute indexes the upper half (32(%rdi)) as 0-7 and the lower half ((%rdi)) as 8-15.
1484 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1485 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1:
1487 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1488 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15]
1489 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
1490 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1491 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
1493 %vec = load <16 x i32>, ptr %vp
1494 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
1495 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1496 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked narrowing shuffle of a loaded vector: each lane takes element 15, 11, 14, 3, 8, 9, 13, 7 of the <16 x i32> at %vp
; where %mask is zero, and 0 elsewhere.
1500 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %mask) {
1501 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1:
1503 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1504 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15]
1505 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1506 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1507 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1509 %vec = load <16 x i32>, ptr %vp
1510 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
1511 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1512 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked narrowing shuffle of a loaded vector: each lane takes element 12, 6, 9, 13, 12, 10, 0, 2 of the <16 x i32> at %vp
; where %mask is zero, and the matching lane of %vec2 elsewhere.
1516 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1517 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2:
1519 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1520 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10]
1521 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
1522 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1523 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
1525 %vec = load <16 x i32>, ptr %vp
1526 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
1527 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1528 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked narrowing shuffle of a loaded vector: each lane takes element 12, 6, 9, 13, 12, 10, 0, 2 of the <16 x i32> at %vp
; where %mask is zero, and 0 elsewhere.
1532 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %mask) {
1533 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2:
1535 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1536 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10]
1537 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1538 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1539 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1541 %vec = load <16 x i32>, ptr %vp
1542 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
1543 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1544 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked narrowing shuffle of a loaded vector: builds a <8 x i32> from elements 8, 4, 1, 13, 15, 4, 6, 12 of the <16 x i32> at %vp.
; Elements come from both halves, so the expected code is a two-source permute over (%rdi) and 32(%rdi).
1548 define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(ptr %vp) {
1549 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3:
1551 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1552 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12]
1553 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm0
1555 %vec = load <16 x i32>, ptr %vp
1556 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
; Merge-masked narrowing shuffle of a loaded vector: each lane takes element 8, 4, 1, 13, 15, 4, 6, 12 of the <16 x i32> at %vp
; where %mask is zero, and the matching lane of %vec2 elsewhere.
1559 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1560 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3:
1562 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
1563 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,4,1,13,15,4,6,12]
1564 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3
1565 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1566 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
1568 %vec = load <16 x i32>, ptr %vp
1569 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
1570 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1571 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked narrowing shuffle of a loaded vector: each lane takes element 8, 4, 1, 13, 15, 4, 6, 12 of the <16 x i32> at %vp
; where %mask is zero, and 0 elsewhere.
1575 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %mask) {
1576 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3:
1578 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
1579 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12]
1580 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1581 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
1582 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1584 %vec = load <16 x i32>, ptr %vp
1585 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
1586 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1587 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked narrowing shuffle of a loaded vector: builds a <4 x i32> from elements 13, 0, 0, 6 of the <16 x i32> at %vp.
1591 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) {
1592 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0:
1594 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [13,0,0,6]
1595 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
1596 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm1, %ymm0
1597 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1598 ; CHECK-NEXT: vzeroupper
1600 %vec = load <16 x i32>, ptr %vp
1601 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
; Merge-masked narrowing shuffle of a loaded vector: each lane takes element 13, 0, 0, 6 of the <16 x i32> at %vp
; where %mask is zero, and the matching lane of %vec2 elsewhere.
1604 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1605 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0:
1607 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,0,0,6]
1608 ; CHECK-NEXT: vmovdqa (%rdi), %ymm3
1609 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
1610 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1611 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1612 ; CHECK-NEXT: vzeroupper
1614 %vec = load <16 x i32>, ptr %vp
1615 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
1616 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1617 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked narrowing shuffle of a loaded vector: each lane takes element 13, 0, 0, 6 of the <16 x i32> at %vp
; where %mask is zero, and 0 elsewhere.
1621 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) {
1622 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0:
1624 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,0,0,6]
1625 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1626 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1627 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
1628 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1629 ; CHECK-NEXT: vzeroupper
1631 %vec = load <16 x i32>, ptr %vp
1632 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
1633 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1634 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked narrowing shuffle of a loaded vector: each lane takes element 7, 13, 11, 10 of the <16 x i32> at %vp
; where %mask is zero, and the matching lane of %vec2 elsewhere.
; The expected two-source permute indexes the upper half (32(%rdi)) as 0-7 and the lower half ((%rdi)) as 8-15.
1638 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1639 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
1641 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1642 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [15,5,3,2,0,0,0,0]
1643 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
1644 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1645 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1646 ; CHECK-NEXT: vzeroupper
1648 %vec = load <16 x i32>, ptr %vp
1649 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
1650 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1651 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked narrowing shuffle of a loaded vector: each lane takes element 7, 13, 11, 10 of the <16 x i32> at %vp
; where %mask is zero, and 0 elsewhere.
1655 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) {
1656 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
1658 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1659 ; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [15,5,3,2,0,0,0,0]
1660 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1661 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1662 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1663 ; CHECK-NEXT: vzeroupper
1665 %vec = load <16 x i32>, ptr %vp
1666 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
1667 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1668 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked narrowing shuffle of a loaded vector: each lane takes element 2, 15, 6, 9 of the <16 x i32> at %vp
; where %mask is zero, and the matching lane of %vec2 elsewhere.
1672 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1673 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2:
1675 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,15,6,9]
1676 ; CHECK-NEXT: vmovdqa (%rdi), %ymm3
1677 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
1678 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1679 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1680 ; CHECK-NEXT: vzeroupper
1682 %vec = load <16 x i32>, ptr %vp
1683 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
1684 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1685 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked narrowing shuffle of a loaded vector: each lane takes element 2, 15, 6, 9 of the <16 x i32> at %vp
; where %mask is zero, and 0 elsewhere.
1689 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) {
1690 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2:
1692 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,15,6,9]
1693 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1694 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1695 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
1696 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1697 ; CHECK-NEXT: vzeroupper
1699 %vec = load <16 x i32>, ptr %vp
1700 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
1701 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1702 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked narrowing shuffle of a loaded vector: builds a <4 x i32> from elements 6, 0, 7, 2 of the <16 x i32> at %vp.
; Only the first eight elements are referenced, so the expected code works on the two 128-bit pieces at (%rdi) and 16(%rdi).
1706 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(ptr %vp) {
1707 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3:
1709 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
1710 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,4,3,6]
1711 ; CHECK-NEXT: vpermi2d (%rdi), %xmm1, %xmm0
1713 %vec = load <16 x i32>, ptr %vp
1714 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
; Merge-masked narrowing shuffle of a loaded vector: each lane takes element 6, 0, 7, 2 of the <16 x i32> at %vp
; where %mask is zero, and the matching lane of %vec2 elsewhere.
1717 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1718 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3:
1720 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1721 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,4,3,6]
1722 ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3
1723 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1724 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1726 %vec = load <16 x i32>, ptr %vp
1727 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
1728 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1729 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked narrowing shuffle of a loaded vector: each lane takes element 6, 0, 7, 2 of the <16 x i32> at %vp
; where %mask is zero, and 0 elsewhere.
1733 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %mask) {
1734 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3:
1736 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1737 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,4,3,6]
1738 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1739 ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
1740 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1742 %vec = load <16 x i32>, ptr %vp
1743 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
1744 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1745 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked narrowing shuffle: builds a <4 x i32> from elements 12, 9, 4, 10 of %vec.
; The two RUN configurations have separate expectations: a single 512-bit permute in one,
; and a 256-bit permute of the upper half followed by a 128-bit two-source permute in the other.
1749 define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
1750 ; CHECK-FAST-LABEL: test_16xi32_to_4xi32_perm_mask9:
1751 ; CHECK-FAST: # %bb.0:
1752 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [12,9,4,10]
1753 ; CHECK-FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
1754 ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1755 ; CHECK-FAST-NEXT: vzeroupper
1756 ; CHECK-FAST-NEXT: retq
1758 ; CHECK-FAST-PERLANE-LABEL: test_16xi32_to_4xi32_perm_mask9:
1759 ; CHECK-FAST-PERLANE: # %bb.0:
1760 ; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,0,2]
1761 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1762 ; CHECK-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm1, %ymm1
1763 ; CHECK-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
1764 ; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,1,4,3]
1765 ; CHECK-FAST-PERLANE-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
1766 ; CHECK-FAST-PERLANE-NEXT: vzeroupper
1767 ; CHECK-FAST-PERLANE-NEXT: retq
1768 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 12, i32 9, i32 4, i32 10>
; Unmasked narrowing shuffle: builds a <2 x i64> from elements 2, 0 of the <4 x i64> argument %vec.
1772 define <2 x i64> @test_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec) {
1773 ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mask0:
1775 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
1776 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1777 ; CHECK-NEXT: vzeroupper
1779 %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
; Merge-masked narrowing shuffle: each lane takes element 2, 0 of %vec where %mask is zero, and the matching lane of %vec2 elsewhere.
1782 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
1783 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0:
1785 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,2,3]
1786 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
1787 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
1788 ; CHECK-NEXT: vzeroupper
1790 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
1791 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1792 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked narrowing shuffle: each lane takes element 2, 0 of %vec where %mask is zero, and 0 elsewhere.
1796 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %mask) {
1797 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask0:
1799 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1800 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3]
1801 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1802 ; CHECK-NEXT: vzeroupper
1804 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
1805 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1806 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Merge-masked narrowing shuffle: each lane takes element 2, 1 of %vec where %mask is zero, and the matching lane of %vec2 elsewhere.
1809 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
1810 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask1:
1812 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
1813 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
1814 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
1815 ; CHECK-NEXT: vzeroupper
1817 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1818 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1819 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked narrowing shuffle: each lane takes element 2, 1 of %vec where %mask is zero, and 0 elsewhere.
1823 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %mask) {
1824 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask1:
1826 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1827 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3]
1828 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1829 ; CHECK-NEXT: vzeroupper
1831 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1832 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1833 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked narrowing shuffle of a loaded vector: builds a <2 x i64> from elements 1, 3 of the <4 x i64> at %vp.
; The expected code takes the high quadword of each 128-bit half with a single unpack-high.
1836 define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
1837 ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0:
1839 ; CHECK-NEXT: vmovaps (%rdi), %xmm0
1840 ; CHECK-NEXT: vunpckhpd 16(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[1]
1842 %vec = load <4 x i64>, ptr %vp
1843 %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
; Merge-masked narrowing shuffle of a loaded vector: each lane takes element 1, 3 of the <4 x i64> at %vp
; where %mask is zero, and the matching lane of %vec2 elsewhere.
1846 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
1847 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask0:
1849 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
1850 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1851 ; CHECK-NEXT: vpunpckhqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[1]
1853 %vec = load <4 x i64>, ptr %vp
1854 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
1855 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1856 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked narrowing shuffle of a loaded vector: each lane takes element 1, 3 of the <4 x i64> at %vp
; where %mask is zero, and 0 elsewhere.
1860 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) {
1861 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask0:
1863 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
1864 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
1865 ; CHECK-NEXT: vpunpckhqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[1]
1867 %vec = load <4 x i64>, ptr %vp
1868 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
1869 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1870 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Merge-masked narrowing shuffle of a loaded vector: each lane takes element 2, 1 of the <4 x i64> at %vp
; where %mask is zero, and the matching lane of %vec2 elsewhere.
; The expected code forms the shuffle with a dword blend of the two 128-bit halves, then applies the mask with a separate masked move.
1874 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
1875 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1:
1877 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1878 ; CHECK-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
1879 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1880 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
1882 %vec = load <4 x i64>, ptr %vp
1883 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1884 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1885 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked narrowing shuffle of a loaded vector: each lane takes element 2, 1 of the <4 x i64> at %vp
; where %mask is zero, and 0 elsewhere.
1889 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) {
1890 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1:
1892 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
1893 ; CHECK-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
1894 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
1895 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
1897 %vec = load <4 x i64>, ptr %vp
1898 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1899 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1900 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked narrowing shuffle: builds a <4 x i64> from elements 6, 7, 6, 5 of the <8 x i64> argument %vec.
; All selected elements are in the upper half, so the expected code extracts it and permutes with indices reduced by 4.
1904 define <4 x i64> @test_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec) {
1905 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask0:
1907 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
1908 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,1]
1910 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
; Merge-masked narrowing shuffle: each lane takes element 6, 7, 6, 5 of %vec where %mask is zero, and the matching lane of %vec2 elsewhere.
1913 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1914 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask0:
1916 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1917 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
1918 ; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1]
1919 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1921 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
1922 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1923 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle: each lane takes element 6, 7, 6, 5 of %vec where %mask is zero, and 0 elsewhere.
1927 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %mask) {
1928 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0:
1930 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1931 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
1932 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1]
1934 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
1935 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1936 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked narrowing shuffle: each lane takes element 6, 4, 6, 1 of %vec where %mask is zero, and the matching lane of %vec2 elsewhere.
; The two RUN configurations have separate expectations: a 512-bit variable permute in one,
; and a blend of the two 256-bit halves followed by an immediate permute in the other.
1939 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1940 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
1941 ; CHECK-FAST: # %bb.0:
1942 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,4,6,1]
1943 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0
1944 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
1945 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
1946 ; CHECK-FAST-NEXT: retq
1948 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
1949 ; CHECK-FAST-PERLANE: # %bb.0:
1950 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1951 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
1952 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
1953 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,1]
1954 ; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0
1955 ; CHECK-FAST-PERLANE-NEXT: retq
1956 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
1957 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1958 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle: each lane takes element 6, 4, 6, 1 of %vec where %mask is zero, and 0 elsewhere.
; The two RUN configurations have separate expectations (512-bit variable permute vs. blend plus immediate permute).
1962 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) {
1963 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
1964 ; CHECK-FAST: # %bb.0:
1965 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,4,6,1]
1966 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
1967 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
1968 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1969 ; CHECK-FAST-NEXT: retq
1971 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
1972 ; CHECK-FAST-PERLANE: # %bb.0:
1973 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1974 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
1975 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
1976 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,1]
1977 ; CHECK-FAST-PERLANE-NEXT: retq
1978 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
1979 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1980 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked narrowing shuffle: each lane takes element 6, 3, 6, 3 of %vec where %mask is zero, and the matching lane of %vec2 elsewhere.
; The two RUN configurations have separate expectations (512-bit variable permute vs. blend plus immediate permute).
1983 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1984 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
1985 ; CHECK-FAST: # %bb.0:
1986 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,3,6,3]
1987 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0
1988 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
1989 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
1990 ; CHECK-FAST-NEXT: retq
1992 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
1993 ; CHECK-FAST-PERLANE: # %bb.0:
1994 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1995 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
1996 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
1997 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
1998 ; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0
1999 ; CHECK-FAST-PERLANE-NEXT: retq
2000 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
2001 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2002 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle: each lane takes element 6, 3, 6, 3 of %vec where %mask is zero, and 0 elsewhere.
; The two RUN configurations have separate expectations (512-bit variable permute vs. blend plus immediate permute).
2006 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) {
2007 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
2008 ; CHECK-FAST: # %bb.0:
2009 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,3,6,3]
2010 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2011 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2012 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2013 ; CHECK-FAST-NEXT: retq
2015 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
2016 ; CHECK-FAST-PERLANE: # %bb.0:
2017 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2018 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
2019 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2020 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
2021 ; CHECK-FAST-PERLANE-NEXT: retq
2022 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
2023 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2024 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked narrowing shuffle: builds a <4 x i64> from elements 6, 0, 0, 7 of %vec.
; The two RUN configurations have separate expectations (512-bit variable permute vs. blend plus immediate permute).
2027 define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
2028 ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask3:
2029 ; CHECK-FAST: # %bb.0:
2030 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [6,0,0,7]
2031 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
2032 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2033 ; CHECK-FAST-NEXT: retq
2035 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask3:
2036 ; CHECK-FAST-PERLANE: # %bb.0:
2037 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1
2038 ; CHECK-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2039 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,3]
2040 ; CHECK-FAST-PERLANE-NEXT: retq
2041 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
; Merge-masked narrowing shuffle: each lane takes element 6, 0, 0, 7 of %vec where %mask is zero, and the matching lane of %vec2 elsewhere.
; The two RUN configurations have separate expectations (512-bit variable permute vs. blend plus immediate permute).
2044 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2045 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
2046 ; CHECK-FAST: # %bb.0:
2047 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,0,0,7]
2048 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0
2049 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
2050 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2051 ; CHECK-FAST-NEXT: retq
2053 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
2054 ; CHECK-FAST-PERLANE: # %bb.0:
2055 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2056 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
2057 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
2058 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,0,3]
2059 ; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0
2060 ; CHECK-FAST-PERLANE-NEXT: retq
2061 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
2062 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2063 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle: each lane takes element 6, 0, 0, 7 of %vec where %mask is zero, and 0 elsewhere.
; The two RUN configurations have separate expectations (512-bit variable permute vs. blend plus immediate permute).
2067 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) {
2068 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
2069 ; CHECK-FAST: # %bb.0:
2070 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,0,0,7]
2071 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2072 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2073 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2074 ; CHECK-FAST-NEXT: retq
2076 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
2077 ; CHECK-FAST-PERLANE: # %bb.0:
2078 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2079 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2080 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2081 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,0,3]
2082 ; CHECK-FAST-PERLANE-NEXT: retq
2083 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
2084 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2085 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked narrowing shuffle: each lane takes element 3, 7, 7, 5 of %vec where %mask is zero, and the matching lane of %vec2 elsewhere.
; The two RUN configurations have separate expectations: a 512-bit variable permute in one,
; and an unpack-high of the two 256-bit halves followed by an immediate permute in the other.
2088 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2089 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
2090 ; CHECK-FAST: # %bb.0:
2091 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,7,7,5]
2092 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0
2093 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
2094 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2095 ; CHECK-FAST-NEXT: retq
2097 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
2098 ; CHECK-FAST-PERLANE: # %bb.0:
2099 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2100 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
2101 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
2102 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,3,1]
2103 ; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0
2104 ; CHECK-FAST-PERLANE-NEXT: retq
2105 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
2106 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2107 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle: each lane takes element 3, 7, 7, 5 of %vec where %mask is zero, and 0 elsewhere.
; The two RUN configurations have separate expectations (512-bit variable permute vs. unpack-high plus immediate permute).
2111 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {
2112 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
2113 ; CHECK-FAST: # %bb.0:
2114 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,7,7,5]
2115 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2116 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2117 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2118 ; CHECK-FAST-NEXT: retq
2120 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
2121 ; CHECK-FAST-PERLANE: # %bb.0:
2122 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2123 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
2124 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2125 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,3,1]
2126 ; CHECK-FAST-PERLANE-NEXT: retq
2127 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
2128 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2129 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Picks elements 4, 1, 0, 6 of %vec into a <4 x i64>. Lanes whose %mask
; element equals zero receive the shuffle result; the other lanes take the
; matching element of %vec2. One expected sequence is shared by both llc
; configurations.
2132 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2133 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5:
2135 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,1,0,6]
2136 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
2137 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2138 ; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2140 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
2141 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2142 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Picks elements 4, 1, 0, 6 of %vec into a <4 x i64>. Lanes whose %mask
; element equals zero receive the shuffle result; all other lanes are zero.
; One expected sequence is shared by both llc configurations.
2146 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) {
2147 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5:
2149 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,1,0,6]
2150 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2151 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2152 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2154 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
2155 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2156 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked narrowing shuffle: builds a <4 x i64> from elements 7, 6, 5, 3 of
; %vec. The two llc configurations have separate expected sequences.
2159 define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
2160 ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask6:
2161 ; CHECK-FAST: # %bb.0:
2162 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,3]
2163 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
2164 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2165 ; CHECK-FAST-NEXT: retq
2167 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask6:
2168 ; CHECK-FAST-PERLANE: # %bb.0:
2169 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1
2170 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,1,3]
2171 ; CHECK-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2172 ; CHECK-FAST-PERLANE-NEXT: retq
2173 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
; Picks elements 7, 6, 5, 3 of %vec into a <4 x i64>. Lanes whose %mask
; element equals zero receive the shuffle result; the other lanes take the
; matching element of %vec2. The two llc configurations have separate
; expected sequences.
2176 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2177 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
2178 ; CHECK-FAST: # %bb.0:
2179 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,6,5,3]
2180 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0
2181 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
2182 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2183 ; CHECK-FAST-NEXT: retq
2185 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
2186 ; CHECK-FAST-PERLANE: # %bb.0:
2187 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2188 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,2,1,3]
2189 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
2190 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
2191 ; CHECK-FAST-PERLANE-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2192 ; CHECK-FAST-PERLANE-NEXT: retq
2193 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
2194 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2195 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Picks elements 7, 6, 5, 3 of %vec into a <4 x i64>. Lanes whose %mask
; element equals zero receive the shuffle result; all other lanes are zero.
; The two llc configurations have separate expected sequences.
2199 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) {
2200 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
2201 ; CHECK-FAST: # %bb.0:
2202 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,6,5,3]
2203 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2204 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2205 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2206 ; CHECK-FAST-NEXT: retq
2208 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
2209 ; CHECK-FAST-PERLANE: # %bb.0:
2210 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2211 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,2,1,3]
2212 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
2213 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2214 ; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
2215 ; CHECK-FAST-PERLANE-NEXT: retq
2216 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
2217 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2218 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Picks elements 2, 0, 3, 4 of %vec into a <4 x i64>. Lanes whose %mask
; element equals zero receive the shuffle result; the other lanes take the
; matching element of %vec2. One expected sequence is shared by both llc
; configurations.
2221 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2222 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:
2224 ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3
2225 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm4 = [2,0,3,4]
2226 ; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
2227 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2228 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
2230 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
2231 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2232 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Picks elements 2, 0, 3, 4 of %vec into a <4 x i64>. Lanes whose %mask
; element equals zero receive the shuffle result; all other lanes are zero.
; One expected sequence is shared by both llc configurations.
2235 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
2236 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
2238 ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm2
2239 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,0,3,4]
2240 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2241 ; CHECK-NEXT: vpermt2q %ymm2, %ymm3, %ymm0 {%k1} {z}
2242 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2244 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
2245 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2246 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked narrowing shuffle: builds a <2 x i64> from elements 3 and 0 of
; %vec. One expected sequence is shared by both llc configurations.
2250 define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
2251 ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mask0:
2253 ; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
2254 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2255 ; CHECK-NEXT: vzeroupper
2257 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
; Picks elements 3 and 0 of %vec into a <2 x i64>. Lanes whose %mask element
; equals zero receive the shuffle result; the other lanes take the matching
; element of %vec2. One expected sequence is shared by both llc
; configurations.
2260 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
2261 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0:
2263 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
2264 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
2265 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
2266 ; CHECK-NEXT: vzeroupper
2268 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
2269 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2270 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Picks elements 3 and 0 of %vec into a <2 x i64>. Lanes whose %mask element
; equals zero receive the shuffle result; all other lanes are zero. One
; expected sequence is shared by both llc configurations.
2273 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) {
2274 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0:
2276 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2277 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,2,3,7,4,6,7]
2278 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2279 ; CHECK-NEXT: vzeroupper
2281 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
2282 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2283 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Picks elements 6 and 5 of %vec into a <2 x i64>. Lanes whose %mask element
; equals zero receive the shuffle result; the other lanes take the matching
; element of %vec2. One expected sequence is shared by both llc
; configurations.
2287 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
2288 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
2290 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2291 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
2292 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
2293 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
2294 ; CHECK-NEXT: vzeroupper
2296 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
2297 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2298 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Picks elements 6 and 5 of %vec into a <2 x i64>. Lanes whose %mask element
; equals zero receive the shuffle result; all other lanes are zero. One
; expected sequence is shared by both llc configurations.
2301 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %mask) {
2302 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
2304 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2305 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2306 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3]
2307 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2308 ; CHECK-NEXT: vzeroupper
2310 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
2311 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2312 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Memory-operand variant: loads an <8 x i64> from %vp and builds a <4 x i64>
; from its elements 0, 2, 0, 2. One expected sequence is shared by both llc
; configurations.
2316 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask0(ptr %vp) {
2317 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask0:
2319 ; CHECK-NEXT: vpermpd $136, (%rdi), %ymm0 # ymm0 = mem[0,2,0,2]
2321 %vec = load <8 x i64>, ptr %vp
2322 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
; Loads an <8 x i64> from %vp and picks its elements 0, 2, 0, 2. Lanes whose
; %mask element equals zero receive the shuffle result; the other lanes take
; the matching element of %vec2. One expected sequence is shared by both llc
; configurations.
2325 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2326 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask0:
2328 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2329 ; CHECK-NEXT: vpermq $136, (%rdi), %ymm0 {%k1} # ymm0 {%k1} = mem[0,2,0,2]
2331 %vec = load <8 x i64>, ptr %vp
2332 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2333 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2334 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Loads an <8 x i64> from %vp and picks its elements 0, 2, 0, 2. Lanes whose
; %mask element equals zero receive the shuffle result; all other lanes are
; zero. One expected sequence is shared by both llc configurations.
2337 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64> %mask) {
2338 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask0:
2340 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2341 ; CHECK-NEXT: vpermq $136, (%rdi), %ymm0 {%k1} {z} # ymm0 {%k1} {z} = mem[0,2,0,2]
2343 %vec = load <8 x i64>, ptr %vp
2344 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2345 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2346 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Loads an <8 x i64> from %vp and picks its elements 0, 7, 6, 0. Lanes whose
; %mask element equals zero receive the shuffle result; the other lanes take
; the matching element of %vec2. The two llc configurations have separate
; expected sequences.
2350 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2351 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
2352 ; CHECK-FAST: # %bb.0:
2353 ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
2354 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,3,2,4]
2355 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
2356 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2357 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2358 ; CHECK-FAST-NEXT: retq
2360 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
2361 ; CHECK-FAST-PERLANE: # %bb.0:
2362 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2
2363 ; CHECK-FAST-PERLANE-NEXT: vpblendd $15, (%rdi), %ymm2, %ymm2 # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
2364 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2365 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,3,2,0]
2366 ; CHECK-FAST-PERLANE-NEXT: retq
2367 %vec = load <8 x i64>, ptr %vp
2368 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
2369 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2370 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Loads an <8 x i64> from %vp and picks its elements 0, 7, 6, 0. Lanes whose
; %mask element equals zero receive the shuffle result; all other lanes are
; zero. The two llc configurations have separate expected sequences.
2374 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %mask) {
2375 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
2376 ; CHECK-FAST: # %bb.0:
2377 ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
2378 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,3,2,4]
2379 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
2380 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2381 ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
2382 ; CHECK-FAST-NEXT: retq
2384 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
2385 ; CHECK-FAST-PERLANE: # %bb.0:
2386 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
2387 ; CHECK-FAST-PERLANE-NEXT: vpblendd $15, (%rdi), %ymm1, %ymm1 # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
2388 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
2389 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,3,2,0]
2390 ; CHECK-FAST-PERLANE-NEXT: retq
2391 %vec = load <8 x i64>, ptr %vp
2392 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
2393 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2394 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Loads an <8 x i64> from %vp and picks its elements 7, 1, 1, 5. Lanes whose
; %mask element equals zero receive the shuffle result; the other lanes take
; the matching element of %vec2. The two llc configurations have separate
; expected sequences.
2398 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2399 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
2400 ; CHECK-FAST: # %bb.0:
2401 ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
2402 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,5,5,1]
2403 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
2404 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2405 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2406 ; CHECK-FAST-NEXT: retq
2408 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
2409 ; CHECK-FAST-PERLANE: # %bb.0:
2410 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2
2411 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
2412 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2413 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,0]
2414 ; CHECK-FAST-PERLANE-NEXT: retq
2415 %vec = load <8 x i64>, ptr %vp
2416 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
2417 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2418 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Loads an <8 x i64> from %vp and picks its elements 7, 1, 1, 5. Lanes whose
; %mask element equals zero receive the shuffle result; all other lanes are
; zero. The two llc configurations have separate expected sequences.
2422 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %mask) {
2423 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
2424 ; CHECK-FAST: # %bb.0:
2425 ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
2426 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [3,5,5,1]
2427 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
2428 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2429 ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
2430 ; CHECK-FAST-NEXT: retq
2432 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
2433 ; CHECK-FAST-PERLANE: # %bb.0:
2434 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
2435 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
2436 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
2437 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,0]
2438 ; CHECK-FAST-PERLANE-NEXT: retq
2439 %vec = load <8 x i64>, ptr %vp
2440 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
2441 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2442 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Memory-operand variant: loads an <8 x i64> from %vp and builds a <4 x i64>
; from its elements 7, 0, 0, 2. The two llc configurations have separate
; expected sequences.
2446 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(ptr %vp) {
2447 ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
2448 ; CHECK-FAST: # %bb.0:
2449 ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm1
2450 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [7,0,0,2]
2451 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0
2452 ; CHECK-FAST-NEXT: retq
2454 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
2455 ; CHECK-FAST-PERLANE: # %bb.0:
2456 ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
2457 ; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm0, %ymm0 # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
2458 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,1,3]
2459 ; CHECK-FAST-PERLANE-NEXT: retq
2460 %vec = load <8 x i64>, ptr %vp
2461 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
; Loads an <8 x i64> from %vp and picks its elements 7, 0, 0, 2. Lanes whose
; %mask element equals zero receive the shuffle result; the other lanes take
; the matching element of %vec2. The two llc configurations have separate
; expected sequences.
2464 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2465 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
2466 ; CHECK-FAST: # %bb.0:
2467 ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
2468 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,0,0,2]
2469 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2470 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2471 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2472 ; CHECK-FAST-NEXT: retq
2474 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
2475 ; CHECK-FAST-PERLANE: # %bb.0:
2476 ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2
2477 ; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm2, %ymm2 # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
2478 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2479 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,3]
2480 ; CHECK-FAST-PERLANE-NEXT: retq
2481 %vec = load <8 x i64>, ptr %vp
2482 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
2483 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2484 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Loads an <8 x i64> from %vp and picks its elements 7, 0, 0, 2. Lanes whose
; %mask element equals zero receive the shuffle result; all other lanes are
; zero. The two llc configurations have separate expected sequences.
2488 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %mask) {
2489 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
2490 ; CHECK-FAST: # %bb.0:
2491 ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
2492 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,0,0,2]
2493 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
2494 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2495 ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
2496 ; CHECK-FAST-NEXT: retq
2498 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
2499 ; CHECK-FAST-PERLANE: # %bb.0:
2500 ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1
2501 ; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm1, %ymm1 # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
2502 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
2503 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,3]
2504 ; CHECK-FAST-PERLANE-NEXT: retq
2505 %vec = load <8 x i64>, ptr %vp
2506 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
2507 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2508 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Loads an <8 x i64> from %vp and picks its elements 0, 4, 6, 1. Lanes whose
; %mask element equals zero receive the shuffle result; the other lanes take
; the matching element of %vec2. One expected sequence is shared by both llc
; configurations.
2512 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2513 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4:
2515 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2516 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,4,6,1]
2517 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2518 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2519 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2521 %vec = load <8 x i64>, ptr %vp
2522 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
2523 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2524 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Loads an <8 x i64> from %vp and picks its elements 0, 4, 6, 1. Lanes whose
; %mask element equals zero receive the shuffle result; all other lanes are
; zero. One expected sequence is shared by both llc configurations.
2528 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %mask) {
2529 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4:
2531 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2532 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,4,6,1]
2533 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2534 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2535 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2537 %vec = load <8 x i64>, ptr %vp
2538 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
2539 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2540 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Loads an <8 x i64> from %vp and picks its elements 0, 2, 7, 1. Lanes whose
; %mask element equals zero receive the shuffle result; the other lanes take
; the matching element of %vec2. The two llc configurations have separate
; expected sequences.
2544 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2545 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
2546 ; CHECK-FAST: # %bb.0:
2547 ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
2548 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,2,7,1]
2549 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2550 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2551 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2552 ; CHECK-FAST-NEXT: retq
2554 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
2555 ; CHECK-FAST-PERLANE: # %bb.0:
2556 ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2
2557 ; CHECK-FAST-PERLANE-NEXT: vpblendd $192, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
2558 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2559 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,2,3,1]
2560 ; CHECK-FAST-PERLANE-NEXT: retq
2561 %vec = load <8 x i64>, ptr %vp
2562 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
2563 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2564 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Loads an <8 x i64> from %vp and picks its elements 0, 2, 7, 1. Lanes whose
; %mask element equals zero receive the shuffle result; all other lanes are
; zero. The two llc configurations have separate expected sequences.
2568 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %mask) {
2569 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
2570 ; CHECK-FAST: # %bb.0:
2571 ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
2572 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,7,1]
2573 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
2574 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2575 ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
2576 ; CHECK-FAST-NEXT: retq
2578 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
2579 ; CHECK-FAST-PERLANE: # %bb.0:
2580 ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1
2581 ; CHECK-FAST-PERLANE-NEXT: vpblendd $192, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
2582 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
2583 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,2,3,1]
2584 ; CHECK-FAST-PERLANE-NEXT: retq
2585 %vec = load <8 x i64>, ptr %vp
2586 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
2587 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2588 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Memory-operand variant: loads an <8 x i64> from %vp and builds a <4 x i64>
; from its elements 7, 2, 3, 2. One expected sequence is shared by both llc
; configurations.
2592 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(ptr %vp) {
2593 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6:
2595 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
2596 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [7,2,3,2]
2597 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0
2599 %vec = load <8 x i64>, ptr %vp
2600 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
; Loads an <8 x i64> from %vp and picks its elements 7, 2, 3, 2. Lanes whose
; %mask element equals zero receive the shuffle result; the other lanes take
; the matching element of %vec2. One expected sequence is shared by both llc
; configurations.
2603 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2604 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6:
2606 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2607 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,2,3,2]
2608 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2609 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2610 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2612 %vec = load <8 x i64>, ptr %vp
2613 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
2614 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2615 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Loads an <8 x i64> from %vp and picks its elements 7, 2, 3, 2. Lanes whose
; %mask element equals zero receive the shuffle result; all other lanes are
; zero. One expected sequence is shared by both llc configurations.
2619 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %mask) {
2620 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:
2622 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2623 ; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,2,3,2]
2624 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2625 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2626 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2628 %vec = load <8 x i64>, ptr %vp
2629 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
2630 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2631 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Loads an <8 x i64> from %vp and picks its elements 7, 7, 5, 1. Lanes whose
; %mask element equals zero receive the shuffle result; the other lanes take
; the matching element of %vec2. The two llc configurations have separate
; expected sequences.
2635 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2636 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
2637 ; CHECK-FAST: # %bb.0:
2638 ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
2639 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,3,1,5]
2640 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
2641 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2642 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2643 ; CHECK-FAST-NEXT: retq
2645 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
2646 ; CHECK-FAST-PERLANE: # %bb.0:
2647 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2
2648 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
2649 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2650 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,2,0,1]
2651 ; CHECK-FAST-PERLANE-NEXT: retq
2652 %vec = load <8 x i64>, ptr %vp
2653 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
2654 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2655 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Loads an <8 x i64> from %vp and picks its elements 7, 7, 5, 1. Lanes whose
; %mask element equals zero receive the shuffle result; all other lanes are
; zero. The two llc configurations have separate expected sequences.
2659 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %mask) {
2660 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
2661 ; CHECK-FAST: # %bb.0:
2662 ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
2663 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [3,3,1,5]
2664 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
2665 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2666 ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
2667 ; CHECK-FAST-NEXT: retq
2669 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
2670 ; CHECK-FAST-PERLANE: # %bb.0:
2671 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
2672 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
2673 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
2674 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,2,0,1]
2675 ; CHECK-FAST-PERLANE-NEXT: retq
2676 %vec = load <8 x i64>, ptr %vp
2677 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
2678 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2679 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Memory-operand variant: loads an <8 x i64> from %vp and builds a <2 x i64>
; from its elements 4 and 1. The two llc configurations have separate
; expected sequences.
2683 define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
2684 ; CHECK-FAST-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
2685 ; CHECK-FAST: # %bb.0:
2686 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm0 = [4,1]
2687 ; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0
2688 ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2689 ; CHECK-FAST-NEXT: vzeroupper
2690 ; CHECK-FAST-NEXT: retq
2692 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
2693 ; CHECK-FAST-PERLANE: # %bb.0:
2694 ; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm0
2695 ; CHECK-FAST-PERLANE-NEXT: vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3]
2696 ; CHECK-FAST-PERLANE-NEXT: retq
2697 %vec = load <8 x i64>, ptr %vp
2698 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
; Loads an <8 x i64> from %vp and picks its elements 4 and 1. Lanes whose
; %mask element equals zero receive the shuffle result; the other lanes take
; the matching element of %vec2. The two llc configurations have separate
; expected sequences.
2701 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
2702 ; CHECK-FAST-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
2703 ; CHECK-FAST: # %bb.0:
2704 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,1]
2705 ; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm2
2706 ; CHECK-FAST-NEXT: vptestnmq %xmm1, %xmm1, %k1
2707 ; CHECK-FAST-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
2708 ; CHECK-FAST-NEXT: vzeroupper
2709 ; CHECK-FAST-NEXT: retq
2711 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
2712 ; CHECK-FAST-PERLANE: # %bb.0:
2713 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2
2714 ; CHECK-FAST-PERLANE-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
2715 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %xmm1, %xmm1, %k1
2716 ; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
2717 ; CHECK-FAST-PERLANE-NEXT: retq
2718 %vec = load <8 x i64>, ptr %vp
2719 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
2720 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2721 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Loads an <8 x i64> from %vp and picks its elements 4 and 1. Lanes whose
; %mask element equals zero receive the shuffle result; all other lanes are
; zero. The two llc configurations have separate expected sequences.
2725 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) {
2726 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
2727 ; CHECK-FAST: # %bb.0:
2728 ; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,1]
2729 ; CHECK-FAST-NEXT: vptestnmq %xmm0, %xmm0, %k1
2730 ; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
2731 ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2732 ; CHECK-FAST-NEXT: vzeroupper
2733 ; CHECK-FAST-NEXT: retq
2735 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
2736 ; CHECK-FAST-PERLANE: # %bb.0:
2737 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm1
2738 ; CHECK-FAST-PERLANE-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
2739 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %xmm0, %xmm0, %k1
2740 ; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
2741 ; CHECK-FAST-PERLANE-NEXT: retq
2742 %vec = load <8 x i64>, ptr %vp
2743 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
2744 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2745 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Loads an <8 x i64> from %vp and picks its elements 6 and 2. Lanes whose
; %mask element equals zero receive the shuffle result; the other lanes take
; the matching element of %vec2. One expected sequence is shared by both llc
; configurations.
2749 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
2750 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
2752 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
2753 ; CHECK-NEXT: vpunpcklqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
2754 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
2755 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2756 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
2757 ; CHECK-NEXT: vzeroupper
2759 %vec = load <8 x i64>, ptr %vp
2760 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
2761 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2762 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Loads an <8 x i64> from %vp and picks its elements 6 and 2. Lanes whose
; %mask element equals zero receive the shuffle result; all other lanes are
; zero. One expected sequence is shared by both llc configurations.
2766 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) {
2767 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
2769 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
2770 ; CHECK-NEXT: vpunpcklqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
2771 ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
2772 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
2773 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
2774 ; CHECK-NEXT: vzeroupper
2776 %vec = load <8 x i64>, ptr %vp
2777 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
2778 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2779 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked narrowing shuffle: builds a <4 x float> from elements 0, 3, 4, 5
; of the <8 x float> argument %vec and returns it. One expected sequence is
; shared by both llc configurations.
2783 define <4 x float> @test_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec) {
2784 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask0:
2786 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
2787 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,1]
2788 ; CHECK-NEXT: vzeroupper
2790 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2791 ret <4 x float> %res
; Picks elements 0, 3, 4, 5 of %vec into a <4 x float>. Lanes whose %mask
; element compares ordered-equal to zero (fcmp oeq) receive the shuffle
; result; the other lanes take the matching element of %vec2. One expected
; sequence is shared by both llc configurations.
2793 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2794 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0:
2796 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
2797 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
2798 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
2799 ; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,3],xmm3[0,1]
2800 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
2801 ; CHECK-NEXT: vzeroupper
2803 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2804 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2805 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2806 ret <4 x float> %res
; Zero-masked variant of shuffle mask <0, 3, 4, 5>: lanes where %mask compares
; ordered-equal to zero take the shuffled value; all other lanes are zero.
2809 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %mask) {
2810 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0:
2812 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
2813 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2814 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
2815 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3],xmm2[0,1]
2816 ; CHECK-NEXT: vzeroupper
2818 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2819 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2820 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2821 ret <4 x float> %res
; Merge-masked variant of shuffle mask <1, 3, 5, 0>: lanes where %mask compares
; ordered-equal to zero take the shuffled value; all other lanes keep %vec2.
2823 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2824 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1:
2826 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,0]
2827 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0
2828 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2829 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
2830 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
2831 ; CHECK-NEXT: vzeroupper
2833 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
2834 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2835 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2836 ret <4 x float> %res
; Zero-masked variant of shuffle mask <1, 3, 5, 0>: lanes where %mask compares
; ordered-equal to zero take the shuffled value; all other lanes are zero.
2839 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) {
2840 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1:
2842 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,0]
2843 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2844 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
2845 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
2846 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2847 ; CHECK-NEXT: vzeroupper
2849 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
2850 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2851 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2852 ret <4 x float> %res
; Merge-masked variant of shuffle mask <3, 2, 7, 0>: lanes where %mask compares
; ordered-equal to zero take the shuffled value; all other lanes keep %vec2.
2854 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2855 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2:
2857 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,2,7,0]
2858 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0
2859 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2860 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
2861 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
2862 ; CHECK-NEXT: vzeroupper
2864 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
2865 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2866 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2867 ret <4 x float> %res
; Zero-masked variant of shuffle mask <3, 2, 7, 0>: lanes where %mask compares
; ordered-equal to zero take the shuffled value; all other lanes are zero.
2870 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) {
2871 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2:
2873 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,2,7,0]
2874 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2875 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
2876 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
2877 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2878 ; CHECK-NEXT: vzeroupper
2880 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
2881 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2882 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2883 ret <4 x float> %res
; Unmasked case: extracts a <4 x float> from %vec with shuffle mask <3, 3, 5, 2>.
2885 define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) {
2886 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3:
2888 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,5,2]
2889 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
2890 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2891 ; CHECK-NEXT: vzeroupper
2893 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2894 ret <4 x float> %res
; Merge-masked variant of shuffle mask <3, 3, 5, 2>: lanes where %mask compares
; ordered-equal to zero take the shuffled value; all other lanes keep %vec2.
2896 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2897 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3:
2899 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,3,5,2]
2900 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0
2901 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2902 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
2903 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
2904 ; CHECK-NEXT: vzeroupper
2906 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2907 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2908 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2909 ret <4 x float> %res
; Zero-masked variant of shuffle mask <3, 3, 5, 2>: lanes where %mask compares
; ordered-equal to zero take the shuffled value; all other lanes are zero.
2912 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) {
2913 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3:
2915 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,5,2]
2916 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2917 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
2918 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
2919 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2920 ; CHECK-NEXT: vzeroupper
2922 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2923 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2924 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2925 ret <4 x float> %res
; Unmasked memory-operand case: loads <8 x float> from %vp and extracts a
; <4 x float> with shuffle mask <6, 2, 4, 5>.
2927 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
2928 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0:
2930 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
2931 ; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2,6,0,1]
2932 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm1, %xmm0
2934 %vec = load <8 x float>, ptr %vp
2935 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2936 ret <4 x float> %res
; Merge-masked memory-operand variant: loads <8 x float> from %vp and shuffles it
; with mask <6, 2, 4, 5>. Lanes where %mask compares ordered-equal to zero take the
; shuffled value; all other lanes keep %vec2.
2938 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
2939 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0:
2941 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2942 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,0,1]
2943 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3
2944 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
2945 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
2946 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
2948 %vec = load <8 x float>, ptr %vp
2949 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2950 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2951 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2952 ret <4 x float> %res
; Zero-masked memory-operand variant: loads <8 x float> from %vp and shuffles it
; with mask <6, 2, 4, 5>. Lanes where %mask compares ordered-equal to zero take the
; shuffled value; all other lanes are zero.
2955 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) {
2956 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0:
2958 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2959 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,6,0,1]
2960 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2961 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
2962 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
2963 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
2965 %vec = load <8 x float>, ptr %vp
2966 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2967 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2968 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2969 ret <4 x float> %res
; Merge-masked memory-operand variant: loads <8 x float> from %vp and shuffles it
; with mask <6, 3, 3, 6>. Lanes where %mask compares ordered-equal to zero take the
; shuffled value; all other lanes keep %vec2.
2972 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
2973 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
2975 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2976 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,7,7,2]
2977 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3
2978 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
2979 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
2980 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
2982 %vec = load <8 x float>, ptr %vp
2983 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
2984 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2985 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2986 ret <4 x float> %res
; Zero-masked memory-operand variant: loads <8 x float> from %vp and shuffles it
; with mask <6, 3, 3, 6>. Lanes where %mask compares ordered-equal to zero take the
; shuffled value; all other lanes are zero.
2989 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) {
2990 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
2992 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2993 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,7,7,2]
2994 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2995 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
2996 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
2997 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
2999 %vec = load <8 x float>, ptr %vp
3000 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
3001 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3002 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3003 ret <4 x float> %res
; Merge-masked memory-operand variant: loads <8 x float> from %vp and shuffles it
; with mask <3, 1, 3, 7>. Lanes where %mask compares ordered-equal to zero take the
; shuffled value; all other lanes keep %vec2.
3006 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
3007 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2:
3009 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
3010 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,1,3,7]
3011 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
3012 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3013 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3014 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3016 %vec = load <8 x float>, ptr %vp
3017 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
3018 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3019 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3020 ret <4 x float> %res
; Zero-masked memory-operand variant: loads <8 x float> from %vp and shuffles it
; with mask <3, 1, 3, 7>. Lanes where %mask compares ordered-equal to zero take the
; shuffled value; all other lanes are zero.
3023 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
3024 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2:
3026 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
3027 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,1,3,7]
3028 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3029 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3030 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
3031 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3033 %vec = load <8 x float>, ptr %vp
3034 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
3035 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3036 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3037 ret <4 x float> %res
; Unmasked memory-operand case: loads <8 x float> from %vp and extracts a
; <4 x float> with shuffle mask <1, 3, 5, 3>.
3040 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
3041 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3:
3043 ; CHECK-NEXT: vmovaps (%rdi), %xmm1
3044 ; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,3,5,3]
3045 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0
3047 %vec = load <8 x float>, ptr %vp
3048 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
3049 ret <4 x float> %res
; Merge-masked memory-operand variant: loads <8 x float> from %vp and shuffles it
; with mask <1, 3, 5, 3>. Lanes where %mask compares ordered-equal to zero take the
; shuffled value; all other lanes keep %vec2.
3051 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
3052 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3:
3054 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
3055 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,3]
3056 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
3057 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3058 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3059 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3061 %vec = load <8 x float>, ptr %vp
3062 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
3063 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3064 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3065 ret <4 x float> %res
; Zero-masked memory-operand variant: loads <8 x float> from %vp and shuffles it
; with mask <1, 3, 5, 3>. Lanes where %mask compares ordered-equal to zero take the
; shuffled value; all other lanes are zero.
3068 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) {
3069 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3:
3071 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
3072 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [1,3,5,3]
3073 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3074 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3075 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
3076 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3078 %vec = load <8 x float>, ptr %vp
3079 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
3080 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3081 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3082 ret <4 x float> %res
; Unmasked case: extracts an <8 x float> from the <16 x float> %vec with shuffle
; mask <0, 4, 12, 10, 8, 2, 11, 7>.
3085 define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) {
3086 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask0:
3088 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,12,10,8,2,11,7]
3089 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
3090 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3092 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
3093 ret <8 x float> %res
; Merge-masked variant of shuffle mask <0, 4, 12, 10, 8, 2, 11, 7>: lanes where
; %mask compares ordered-equal to zero take the shuffled value; others keep %vec2.
3095 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3096 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0:
3098 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,12,10,8,2,11,7]
3099 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3100 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3101 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
3102 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
3104 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
3105 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3106 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3107 ret <8 x float> %res
; Zero-masked variant of shuffle mask <0, 4, 12, 10, 8, 2, 11, 7>: lanes where
; %mask compares ordered-equal to zero take the shuffled value; others are zero.
3110 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %mask) {
3111 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask0:
3113 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7]
3114 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3115 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
3116 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3117 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3119 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
3120 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3121 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3122 ret <8 x float> %res
; Merge-masked variant of shuffle mask <10, 12, 3, 12, 4, 15, 1, 14>: lanes where
; %mask compares ordered-equal to zero take the shuffled value; others keep %vec2.
3124 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3125 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1:
3127 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [10,12,3,12,4,15,1,14]
3128 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3129 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3130 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
3131 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
3133 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
3134 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3135 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3136 ret <8 x float> %res
; Zero-masked variant of shuffle mask <10, 12, 3, 12, 4, 15, 1, 14>: lanes where
; %mask compares ordered-equal to zero take the shuffled value; others are zero.
3139 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %mask) {
3140 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask1:
3142 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [10,12,3,12,4,15,1,14]
3143 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3144 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
3145 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3146 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3148 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
3149 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3150 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3151 ret <8 x float> %res
; Merge-masked variant of shuffle mask <0, 4, 8, 9, 6, 1, 4, 4>: lanes where
; %mask compares ordered-equal to zero take the shuffled value; others keep %vec2.
3153 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3154 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2:
3156 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4]
3157 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3158 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3159 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
3160 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
3162 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
3163 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3164 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3165 ret <8 x float> %res
; Zero-masked variant of shuffle mask <0, 4, 8, 9, 6, 1, 4, 4>: lanes where
; %mask compares ordered-equal to zero take the shuffled value; others are zero.
3168 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) {
3169 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2:
3171 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4]
3172 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3173 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
3174 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3175 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3177 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
3178 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3179 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3180 ret <8 x float> %res
; Unmasked case: extracts an <8 x float> from the <16 x float> %vec with shuffle
; mask <12, 14, 9, 0, 12, 4, 5, 8>.
3182 define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) {
3183 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask3:
3185 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [12,14,9,0,12,4,5,8]
3186 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
3187 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3189 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3190 ret <8 x float> %res
; Merge-masked variant of shuffle mask <12, 14, 9, 0, 12, 4, 5, 8>: lanes where
; %mask compares ordered-equal to zero take the shuffled value; others keep %vec2.
3192 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3193 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3:
3195 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [12,14,9,0,12,4,5,8]
3196 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3197 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3198 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
3199 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
3201 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3202 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3203 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3204 ret <8 x float> %res
; Zero-masked variant of shuffle mask <12, 14, 9, 0, 12, 4, 5, 8>: lanes where
; %mask compares ordered-equal to zero take the shuffled value; others are zero.
3207 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %mask) {
3208 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask3:
3210 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [12,14,9,0,12,4,5,8]
3211 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3212 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
3213 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3214 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3216 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3217 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3218 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3219 ret <8 x float> %res
; Unmasked case: extracts a <4 x float> from the <16 x float> %vec with shuffle
; mask <4, 8, 9, 10>.
3221 define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) {
3222 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0:
3224 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,8,9,10]
3225 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
3226 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3227 ; CHECK-NEXT: vzeroupper
3229 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3230 ret <4 x float> %res
; Merge-masked variant of shuffle mask <4, 8, 9, 10>: lanes where %mask compares
; ordered-equal to zero take the shuffled value; all other lanes keep %vec2.
3232 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3233 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0:
3235 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [4,8,9,10]
3236 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3237 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3238 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3239 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3240 ; CHECK-NEXT: vzeroupper
3242 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3243 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3244 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3245 ret <4 x float> %res
; Zero-masked variant of shuffle mask <4, 8, 9, 10>: lanes where %mask compares
; ordered-equal to zero take the shuffled value; all other lanes are zero.
3248 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) {
3249 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0:
3251 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [4,8,9,10]
3252 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3253 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
3254 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3255 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3256 ; CHECK-NEXT: vzeroupper
3258 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3259 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3260 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3261 ret <4 x float> %res
; Merge-masked variant of shuffle mask <8, 6, 10, 6>: lanes where %mask compares
; ordered-equal to zero take the shuffled value; all other lanes keep %vec2.
; The expected code differs between the CHECK-FAST and CHECK-FAST-PERLANE prefixes,
; so each prefix has its own assertion sequence below.
3263 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3264 ; CHECK-FAST-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
3265 ; CHECK-FAST: # %bb.0:
3266 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm3 = [8,6,10,6]
3267 ; CHECK-FAST-NEXT: vpermps %zmm0, %zmm3, %zmm0
3268 ; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3
3269 ; CHECK-FAST-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3270 ; CHECK-FAST-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3271 ; CHECK-FAST-NEXT: vzeroupper
3272 ; CHECK-FAST-NEXT: retq
3274 ; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
3275 ; CHECK-FAST-PERLANE: # %bb.0:
3276 ; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm3
3277 ; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0
3278 ; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = [0,6,2,6]
3279 ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm0, %xmm3, %xmm4
3280 ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm0, %xmm0, %xmm0
3281 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm0, %xmm2, %k1
3282 ; CHECK-FAST-PERLANE-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1}
3283 ; CHECK-FAST-PERLANE-NEXT: vzeroupper
3284 ; CHECK-FAST-PERLANE-NEXT: retq
3285 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
3286 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3287 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3288 ret <4 x float> %res
; Zero-masked variant of shuffle mask <8, 6, 10, 6>: lanes where %mask compares
; ordered-equal to zero take the shuffled value; all other lanes are zero.
; The expected code differs between the CHECK-FAST and CHECK-FAST-PERLANE prefixes,
; so each prefix has its own assertion sequence below.
3291 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
3292 ; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
3293 ; CHECK-FAST: # %bb.0:
3294 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [8,6,10,6]
3295 ; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3
3296 ; CHECK-FAST-NEXT: vcmpeqps %xmm3, %xmm1, %k1
3297 ; CHECK-FAST-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3298 ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3299 ; CHECK-FAST-NEXT: vzeroupper
3300 ; CHECK-FAST-NEXT: retq
3302 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
3303 ; CHECK-FAST-PERLANE: # %bb.0:
3304 ; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm2
3305 ; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm3
3306 ; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,2,6]
3307 ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4
3308 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm4, %xmm1, %k1
3309 ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z}
3310 ; CHECK-FAST-PERLANE-NEXT: vzeroupper
3311 ; CHECK-FAST-PERLANE-NEXT: retq
3312 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
3313 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3314 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3315 ret <4 x float> %res
; Merge-masked variant of shuffle mask <12, 12, 4, 5>: lanes where %mask compares
; ordered-equal to zero take the shuffled value; all other lanes keep %vec2.
3317 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3318 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2:
3320 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3321 ; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[0,1],ymm3[4,4],ymm0[4,5]
3322 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3323 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3324 ; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1}
3325 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3326 ; CHECK-NEXT: vzeroupper
3328 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
3329 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3330 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3331 ret <4 x float> %res
; Zero-masked variant of shuffle mask <12, 12, 4, 5>: lanes where %mask compares
; ordered-equal to zero take the shuffled value; all other lanes are zero.
3334 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %mask) {
3335 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2:
3337 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
3338 ; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5]
3339 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3340 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3341 ; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
3342 ; CHECK-NEXT: vzeroupper
3344 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
3345 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3346 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3347 ret <4 x float> %res
; Unmasked case: extracts a <4 x float> from the <16 x float> %vec with shuffle
; mask <10, 2, 11, 6>.
3349 define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) {
3350 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3:
3352 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [10,2,11,6]
3353 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
3354 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3355 ; CHECK-NEXT: vzeroupper
3357 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3358 ret <4 x float> %res
; Merge-masked variant of shuffle mask <10, 2, 11, 6>: lanes where %mask compares
; ordered-equal to zero take the shuffled value; all other lanes keep %vec2.
3360 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3361 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3:
3363 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [10,2,11,6]
3364 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3365 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3366 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3367 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3368 ; CHECK-NEXT: vzeroupper
3370 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3371 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3372 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3373 ret <4 x float> %res
; Zero-masked variant of shuffle mask <10, 2, 11, 6>: lanes where %mask compares
; ordered-equal to zero take the shuffled value; all other lanes are zero.
3376 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) {
3377 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3:
3379 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [10,2,11,6]
3380 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3381 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
3382 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3383 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3384 ; CHECK-NEXT: vzeroupper
3386 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3387 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3388 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3389 ret <4 x float> %res
; Unmasked memory-operand case: loads <16 x float> from %vp and extracts an
; <8 x float> with shuffle mask <7, 6, 7, 11, 5, 10, 0, 4>.
3391 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp) {
3392 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0:
3394 ; CHECK-NEXT: vmovaps (%rdi), %ymm1
3395 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4]
3396 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm0
3398 %vec = load <16 x float>, ptr %vp
3399 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3400 ret <8 x float> %res
; Merge-masked memory-operand variant: loads <16 x float> from %vp and shuffles it
; with mask <7, 6, 7, 11, 5, 10, 0, 4>. Lanes where %mask compares ordered-equal to
; zero take the shuffled value; all other lanes keep %vec2.
3402 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
3403 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0:
3405 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3406 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,6,7,11,5,10,0,4]
3407 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
3408 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3409 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3410 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3412 %vec = load <16 x float>, ptr %vp
3413 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3414 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3415 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3416 ret <8 x float> %res
; Zero-masked memory-operand variant: loads <16 x float> from %vp and shuffles it
; with mask <7, 6, 7, 11, 5, 10, 0, 4>. Lanes where %mask compares ordered-equal to
; zero take the shuffled value; all other lanes are zero.
3419 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %mask) {
3420 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0:
3422 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3423 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4]
3424 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3425 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3426 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
3427 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
3429 %vec = load <16 x float>, ptr %vp
3430 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3431 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3432 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3433 ret <8 x float> %res
; Merge-masked memory-operand variant: loads <16 x float> from %vp and shuffles it
; with mask <11, 0, 9, 0, 7, 14, 0, 8>. Lanes where %mask compares ordered-equal to
; zero take the shuffled value; all other lanes keep %vec2.
3436 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
3437 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1:
3439 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3440 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [11,0,9,0,7,14,0,8]
3441 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
3442 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3443 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3444 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3446 %vec = load <16 x float>, ptr %vp
3447 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
3448 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3449 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3450 ret <8 x float> %res
; Zero-masked memory-operand variant: loads <16 x float> from %vp and shuffles it
; with mask <11, 0, 9, 0, 7, 14, 0, 8>. Lanes where %mask compares ordered-equal to
; zero take the shuffled value; all other lanes are zero.
3453 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %mask) {
3454 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1:
3456 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3457 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8]
3458 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3459 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3460 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
3461 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
3463 %vec = load <16 x float>, ptr %vp
3464 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
3465 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3466 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3467 ret <8 x float> %res
; Merge-masked memory-operand variant: loads <16 x float> from %vp and shuffles it
; with mask <1, 13, 10, 11, 10, 0, 0, 9>. Lanes where %mask compares ordered-equal
; to zero take the shuffled value; all other lanes keep %vec2.
; The expected code differs between the CHECK-FAST and CHECK-FAST-PERLANE prefixes,
; so each prefix has its own assertion sequence below.
3470 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
3471 ; CHECK-FAST-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
3472 ; CHECK-FAST: # %bb.0:
3473 ; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2
3474 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1]
3475 ; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
3476 ; CHECK-FAST-NEXT: vxorps %xmm2, %xmm2, %xmm2
3477 ; CHECK-FAST-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3478 ; CHECK-FAST-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3479 ; CHECK-FAST-NEXT: retq
3481 ; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
3482 ; CHECK-FAST-PERLANE: # %bb.0:
3483 ; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2
3484 ; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3
3485 ; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm4 = [9,5,2,3,2,8,8,1]
3486 ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4
3487 ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm2, %xmm2, %xmm2
3488 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3489 ; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm0 {%k1}
3490 ; CHECK-FAST-PERLANE-NEXT: retq
3491 %vec = load <16 x float>, ptr %vp
3492 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
3493 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3494 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3495 ret <8 x float> %res
; Zero-masked partial permute from memory: loads <16 x float> from %vp and
; picks lanes <1,13,10,11,10,0,0,9>. Result lane i is the picked lane where
; %mask[i] is ordered-equal to zero (fcmp oeq), otherwise 0.0.
; The two RUN configurations have separate expectations here: the FAST run
; folds the (%rdi) half into vpermi2ps, the FAST-PERLANE run loads it first.
3498 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %mask) {
3499 ; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
3500 ; CHECK-FAST: # %bb.0:
3501 ; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2
3502 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1]
3503 ; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3
3504 ; CHECK-FAST-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3505 ; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3506 ; CHECK-FAST-NEXT: vmovaps %ymm1, %ymm0
3507 ; CHECK-FAST-NEXT: retq
3509 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
3510 ; CHECK-FAST-PERLANE: # %bb.0:
3511 ; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2
3512 ; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3
3513 ; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1]
3514 ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4
3515 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm4, %ymm0, %k1
3516 ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z}
3517 ; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm0
3518 ; CHECK-FAST-PERLANE-NEXT: retq
3519 %vec = load <16 x float>, ptr %vp
3520 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
3521 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3522 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3523 ret <8 x float> %res
; Unmasked partial permute from memory: loads <16 x float> from %vp and
; returns lanes <15,13,11,11,3,12,4,1> as an <8 x float>.
3526 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp) {
3527 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3:
3529 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
3530 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9]
3531 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm0
3533 %vec = load <16 x float>, ptr %vp
3534 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3535 ret <8 x float> %res
; Merge-masked partial permute from memory: loads <16 x float> from %vp and
; picks lanes <15,13,11,11,3,12,4,1>. Result lane i is the picked lane where
; %mask[i] is ordered-equal to zero (fcmp oeq), otherwise %vec2[i].
3537 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
3538 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3:
3540 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3541 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9]
3542 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
3543 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3544 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3545 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3547 %vec = load <16 x float>, ptr %vp
3548 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3549 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3550 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3551 ret <8 x float> %res
; Zero-masked partial permute from memory: loads <16 x float> from %vp and
; picks lanes <15,13,11,11,3,12,4,1>. Result lane i is the picked lane where
; %mask[i] is ordered-equal to zero (fcmp oeq), otherwise 0.0.
3554 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %mask) {
3555 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3:
3557 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3558 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9]
3559 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3560 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3561 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3562 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
3564 %vec = load <16 x float>, ptr %vp
3565 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3566 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3567 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3568 ret <8 x float> %res
; Unmasked partial permute from memory: loads <16 x float> from %vp and
; returns lanes <14,6,7,11> as a <4 x float>.
3571 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
3572 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0:
3574 ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm1 # ymm1 = mem[3,1,2,3]
3575 ; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,7,3]
3576 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0
3577 ; CHECK-NEXT: vzeroupper
3579 %vec = load <16 x float>, ptr %vp
3580 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3581 ret <4 x float> %res
; Merge-masked partial permute from memory: loads <16 x float> from %vp and
; picks lanes <14,6,7,11>. Result lane i is the picked lane where %mask[i]
; is ordered-equal to zero (fcmp oeq), otherwise %vec2[i].
3583 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
3584 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0:
3586 ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3]
3587 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [0,6,7,3]
3588 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
3589 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3590 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3591 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3592 ; CHECK-NEXT: vzeroupper
3594 %vec = load <16 x float>, ptr %vp
3595 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3596 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3597 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3598 ret <4 x float> %res
; Zero-masked partial permute from memory: loads <16 x float> from %vp and
; picks lanes <14,6,7,11>. Result lane i is the picked lane where %mask[i]
; is ordered-equal to zero (fcmp oeq), otherwise 0.0.
3601 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) {
3602 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0:
3604 ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3]
3605 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,7,3]
3606 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3607 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3608 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
3609 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3610 ; CHECK-NEXT: vzeroupper
3612 %vec = load <16 x float>, ptr %vp
3613 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3614 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3615 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3616 ret <4 x float> %res
; Merge-masked partial permute from memory: loads <16 x float> from %vp and
; picks lanes <8,2,14,7>. Result lane i is the picked lane where %mask[i]
; is ordered-equal to zero (fcmp oeq), otherwise %vec2[i].
3619 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
3620 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1:
3622 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3623 ; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,10,6,15,0,10,6,15]
3624 ; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
3625 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
3626 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3627 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3628 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3629 ; CHECK-NEXT: vzeroupper
3631 %vec = load <16 x float>, ptr %vp
3632 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
3633 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3634 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3635 ret <4 x float> %res
; Zero-masked partial permute from memory: loads <16 x float> from %vp and
; picks lanes <8,2,14,7>. Result lane i is the picked lane where %mask[i]
; is ordered-equal to zero (fcmp oeq), otherwise 0.0.
3638 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) {
3639 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1:
3641 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3642 ; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,10,6,15,0,10,6,15]
3643 ; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
3644 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3645 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3646 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3647 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3648 ; CHECK-NEXT: vzeroupper
3650 %vec = load <16 x float>, ptr %vp
3651 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
3652 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3653 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3654 ret <4 x float> %res
; Merge-masked partial permute from memory: loads <16 x float> from %vp and
; picks lanes <12,6,12,6>. Result lane i is the picked lane where %mask[i]
; is ordered-equal to zero (fcmp oeq), otherwise %vec2[i].
3657 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
3658 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
3660 ; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [4,14,4,14]
3661 ; CHECK-NEXT: # xmm2 = mem[0,0]
3662 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm3
3663 ; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm3
3664 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3665 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3666 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3667 ; CHECK-NEXT: vzeroupper
3669 %vec = load <16 x float>, ptr %vp
3670 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
3671 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3672 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3673 ret <4 x float> %res
; Zero-masked partial permute from memory: loads <16 x float> from %vp and
; picks lanes <12,6,12,6>. Result lane i is the picked lane where %mask[i]
; is ordered-equal to zero (fcmp oeq), otherwise 0.0.
3676 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
3677 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
3679 ; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [4,14,4,14]
3680 ; CHECK-NEXT: # xmm2 = mem[0,0]
3681 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
3682 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3683 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3684 ; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3685 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3686 ; CHECK-NEXT: vzeroupper
3688 %vec = load <16 x float>, ptr %vp
3689 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
3690 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3691 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3692 ret <4 x float> %res
; Unmasked partial permute from memory: loads <16 x float> from %vp and
; returns lanes <3,3,15,9> as a <4 x float>.
3695 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
3696 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3:
3698 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,15,9]
3699 ; CHECK-NEXT: vmovaps (%rdi), %ymm0
3700 ; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm1, %ymm0
3701 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3702 ; CHECK-NEXT: vzeroupper
3704 %vec = load <16 x float>, ptr %vp
3705 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3706 ret <4 x float> %res
; Merge-masked partial permute from memory: loads <16 x float> from %vp and
; picks lanes <3,3,15,9>. Result lane i is the picked lane where %mask[i]
; is ordered-equal to zero (fcmp oeq), otherwise %vec2[i].
3708 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
3709 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
3711 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9]
3712 ; CHECK-NEXT: vmovaps (%rdi), %ymm3
3713 ; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm3
3714 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3715 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3716 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3717 ; CHECK-NEXT: vzeroupper
3719 %vec = load <16 x float>, ptr %vp
3720 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3721 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3722 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3723 ret <4 x float> %res
; Zero-masked partial permute from memory: loads <16 x float> from %vp and
; picks lanes <3,3,15,9>. Result lane i is the picked lane where %mask[i]
; is ordered-equal to zero (fcmp oeq), otherwise 0.0.
3726 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) {
3727 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
3729 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9]
3730 ; CHECK-NEXT: vmovaps (%rdi), %ymm1
3731 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3732 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3733 ; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
3734 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3735 ; CHECK-NEXT: vzeroupper
3737 %vec = load <16 x float>, ptr %vp
3738 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3739 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3740 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3741 ret <4 x float> %res
; Unmasked partial permute of a register operand: returns lanes <2,0> of the
; <4 x double> argument %vec as a <2 x double>.
3744 define <2 x double> @test_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec) {
3745 ; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mask0:
3747 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
3748 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3749 ; CHECK-NEXT: vzeroupper
3751 %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3752 ret <2 x double> %res
; Merge-masked partial permute of a register operand: picks lanes <2,0> of
; %vec. Result lane i is the picked lane where %mask[i] is ordered-equal to
; zero (fcmp oeq), otherwise %vec2[i].
3754 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3755 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask0:
3757 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
3758 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3759 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
3760 ; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
3761 ; CHECK-NEXT: vzeroupper
3763 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3764 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3765 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3766 ret <2 x double> %res
; Zero-masked partial permute of a register operand: picks lanes <2,0> of
; %vec. Result lane i is the picked lane where %mask[i] is ordered-equal to
; zero (fcmp oeq), otherwise 0.0.
3769 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %mask) {
3770 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask0:
3772 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3773 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
3774 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3]
3775 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3776 ; CHECK-NEXT: vzeroupper
3778 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3779 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3780 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3781 ret <2 x double> %res
; Merge-masked partial permute of a register operand: picks lanes <1,3> of
; %vec. Result lane i is the picked lane where %mask[i] is ordered-equal to
; zero (fcmp oeq), otherwise %vec2[i].
3783 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3784 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask1:
3786 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
3787 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3788 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
3789 ; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
3790 ; CHECK-NEXT: vzeroupper
3792 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
3793 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3794 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3795 ret <2 x double> %res
; Zero-masked partial permute of a register operand: picks lanes <1,3> of
; %vec. Result lane i is the picked lane where %mask[i] is ordered-equal to
; zero (fcmp oeq), otherwise 0.0.
3798 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %mask) {
3799 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask1:
3801 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3802 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
3803 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,2,3]
3804 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3805 ; CHECK-NEXT: vzeroupper
3807 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
3808 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3809 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3810 ret <2 x double> %res
; Unmasked partial permute from memory: loads <4 x double> from %vp and
; returns lanes <2,1> as a <2 x double>.
3812 define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
3813 ; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0:
3815 ; CHECK-NEXT: vmovaps (%rdi), %xmm0
3816 ; CHECK-NEXT: vblendps $3, 16(%rdi), %xmm0, %xmm0 # xmm0 = mem[0,1],xmm0[2,3]
3818 %vec = load <4 x double>, ptr %vp
3819 %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3820 ret <2 x double> %res
; Merge-masked partial permute from memory: loads <4 x double> from %vp and
; picks lanes <2,1>. Result lane i is the picked lane where %mask[i] is
; ordered-equal to zero (fcmp oeq), otherwise %vec2[i].
3822 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
3823 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask0:
3825 ; CHECK-NEXT: vmovapd (%rdi), %xmm2
3826 ; CHECK-NEXT: vblendpd $1, 16(%rdi), %xmm2, %xmm2 # xmm2 = mem[0],xmm2[1]
3827 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3828 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
3829 ; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1}
3831 %vec = load <4 x double>, ptr %vp
3832 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3833 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3834 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3835 ret <2 x double> %res
; Zero-masked partial permute from memory: loads <4 x double> from %vp and
; picks lanes <2,1>. Result lane i is the picked lane where %mask[i] is
; ordered-equal to zero (fcmp oeq), otherwise 0.0.
3838 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %mask) {
3839 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0:
3841 ; CHECK-NEXT: vmovapd (%rdi), %xmm1
3842 ; CHECK-NEXT: vblendpd $1, 16(%rdi), %xmm1, %xmm1 # xmm1 = mem[0],xmm1[1]
3843 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3844 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
3845 ; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z}
3847 %vec = load <4 x double>, ptr %vp
3848 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3849 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3850 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3851 ret <2 x double> %res
; Merge-masked partial permute from memory: loads <4 x double> from %vp and
; picks lanes <2,0>. Result lane i is the picked lane where %mask[i] is
; ordered-equal to zero (fcmp oeq), otherwise %vec2[i].
3854 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
3855 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1:
3857 ; CHECK-NEXT: vmovapd 16(%rdi), %xmm2
3858 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3859 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
3860 ; CHECK-NEXT: vunpcklpd (%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
3862 %vec = load <4 x double>, ptr %vp
3863 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3864 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3865 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3866 ret <2 x double> %res
; Zero-masked partial permute from memory: loads <4 x double> from %vp and
; picks lanes <2,0>. Result lane i is the picked lane where %mask[i] is
; ordered-equal to zero (fcmp oeq), otherwise 0.0.
3869 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) {
3870 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1:
3872 ; CHECK-NEXT: vmovapd 16(%rdi), %xmm1
3873 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3874 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
3875 ; CHECK-NEXT: vunpcklpd (%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
3877 %vec = load <4 x double>, ptr %vp
3878 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3879 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3880 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3881 ret <2 x double> %res
; Unmasked partial permute of a register operand: returns lanes <7,3,7,3> of
; the <8 x double> argument %vec as a <4 x double>.
3884 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
3885 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0:
3887 ; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,3,7,3]
3888 ; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
3889 ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
3890 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3892 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3893 ret <4 x double> %res
; Merge-masked partial permute of a register operand: picks lanes <7,3,7,3>
; of %vec. Result lane i is the picked lane where %mask[i] is ordered-equal
; to zero (fcmp oeq), otherwise %vec2[i].
3895 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3896 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
3898 ; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [7,3,7,3]
3899 ; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
3900 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
3901 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3902 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3903 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
3905 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3906 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3907 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3908 ret <4 x double> %res
; Zero-masked partial permute of a register operand: picks lanes <7,3,7,3>
; of %vec. Result lane i is the picked lane where %mask[i] is ordered-equal
; to zero (fcmp oeq), otherwise 0.0.
3911 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) {
3912 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
3914 ; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [7,3,7,3]
3915 ; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
3916 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3917 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
3918 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3919 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3921 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3922 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3923 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3924 ret <4 x double> %res
; Merge-masked partial permute of a register operand: picks lanes <2,0,7,6>
; of %vec. Result lane i is the picked lane where %mask[i] is ordered-equal
; to zero (fcmp oeq), otherwise %vec2[i].
3926 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3927 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1:
3929 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,0,7,6]
3930 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
3931 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3932 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3933 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
3935 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
3936 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3937 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3938 ret <4 x double> %res
; Zero-masked partial permute of a register operand: picks lanes <2,0,7,6>
; of %vec. Result lane i is the picked lane where %mask[i] is ordered-equal
; to zero (fcmp oeq), otherwise 0.0.
3941 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %mask) {
3942 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask1:
3944 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,0,7,6]
3945 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3946 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
3947 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3948 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3950 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
3951 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3952 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3953 ret <4 x double> %res
; Merge-masked partial permute of a register operand: picks lanes <2,3,2,0>
; of %vec. Result lane i is the picked lane where %mask[i] is ordered-equal
; to zero (fcmp oeq), otherwise %vec2[i].
; Every picked lane is below 4, and the expected code is a single masked
; immediate vpermpd on ymm0.
3955 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3956 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask2:
3958 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3959 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3960 ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,0]
3961 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
3963 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
3964 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3965 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3966 ret <4 x double> %res
; Zero-masked partial permute of a register operand: picks lanes <2,3,2,0>
; of %vec. Result lane i is the picked lane where %mask[i] is ordered-equal
; to zero (fcmp oeq), otherwise 0.0.
; Every picked lane is below 4, and the expected code is a single masked
; immediate vpermpd on ymm0.
3969 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %mask) {
3970 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask2:
3972 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3973 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
3974 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,0]
3976 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
3977 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3978 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3979 ret <4 x double> %res
; Unmasked partial permute of a register operand: returns lanes <0,2,1,4> of
; the <8 x double> argument %vec as a <4 x double>.
3981 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) {
3982 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask3:
3984 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,1,4]
3985 ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
3986 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3988 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
3989 ret <4 x double> %res
; Merge-masked partial permute of a register operand: picks lanes <0,2,1,4>
; of %vec. Result lane i is the picked lane where %mask[i] is ordered-equal
; to zero (fcmp oeq), otherwise %vec2[i].
3991 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3992 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3:
3994 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,2,1,4]
3995 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
3996 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3997 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3998 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
4000 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
4001 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4002 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4003 ret <4 x double> %res
; Zero-masked partial permute of a register operand: picks lanes <0,2,1,4>
; of %vec. Result lane i is the picked lane where %mask[i] is ordered-equal
; to zero (fcmp oeq), otherwise 0.0.
4006 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %mask) {
4007 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask3:
4009 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,1,4]
4010 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4011 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4012 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4013 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
4015 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
4016 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4017 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4018 ret <4 x double> %res
; Merge-masked partial permute of a register operand: picks lanes <1,1,5,5>
; of %vec. Result lane i is the picked lane where %mask[i] is ordered-equal
; to zero (fcmp oeq), otherwise %vec2[i].
; The two RUN configurations have separate expectations here: the FAST run
; gathers lanes 1 and 5 with a variable vpermpd, the FAST-PERLANE run uses
; an extract plus vunpckhpd instead.
4020 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
4021 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
4022 ; CHECK-FAST: # %bb.0:
4023 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm3 = [1,5]
4024 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm0
4025 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4026 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
4027 ; CHECK-FAST-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
4028 ; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
4029 ; CHECK-FAST-NEXT: retq
4031 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
4032 ; CHECK-FAST-PERLANE: # %bb.0:
4033 ; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm3
4034 ; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
4035 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4036 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
4037 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
4038 ; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0
4039 ; CHECK-FAST-PERLANE-NEXT: retq
4040 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
4041 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4042 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4043 ret <4 x double> %res
; Zero-masked partial permute of a register operand: picks lanes <1,1,5,5>
; of %vec. Result lane i is the picked lane where %mask[i] is ordered-equal
; to zero (fcmp oeq), otherwise 0.0.
; The two RUN configurations have separate expectations here: the FAST run
; gathers lanes 1 and 5 with a variable vpermpd, the FAST-PERLANE run uses
; an extract plus vunpckhpd instead.
4046 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
4047 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
4048 ; CHECK-FAST: # %bb.0:
4049 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm2 = [1,5]
4050 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0
4051 ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4052 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4053 ; CHECK-FAST-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
4054 ; CHECK-FAST-NEXT: retq
4056 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
4057 ; CHECK-FAST-PERLANE: # %bb.0:
4058 ; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm2
4059 ; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
4060 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4061 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4062 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
4063 ; CHECK-FAST-PERLANE-NEXT: retq
4064 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
4065 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4066 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4067 ret <4 x double> %res
; Merge-masked partial permute of a register operand: picks lanes <2,6,2,2>
; of %vec. Result lane i is the picked lane where %mask[i] is ordered-equal
; to zero (fcmp oeq), otherwise %vec2[i].
4069 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
4070 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5:
4072 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,6,2,2]
4073 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
4074 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4075 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
4076 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
4078 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
4079 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4080 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4081 ret <4 x double> %res
; Zero-masked partial permute of a register operand: picks lanes <2,6,2,2>
; of %vec. Result lane i is the picked lane where %mask[i] is ordered-equal
; to zero (fcmp oeq), otherwise 0.0.
4084 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) {
4085 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5:
4087 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,6,2,2]
4088 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4089 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4090 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4091 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
4093 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
4094 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4095 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4096 ret <4 x double> %res
; Unmasked partial permute of a register operand: returns lanes <5,0,7,0> of
; the <8 x double> argument %vec as a <4 x double>.
; The two RUN configurations have separate expectations here: the FAST run
; uses a variable vpermt2pd, the FAST-PERLANE run uses extract, broadcast
; and vunpckhpd instead.
4098 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
4099 ; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
4100 ; CHECK-FAST: # %bb.0:
4101 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [5,8,7,8]
4102 ; CHECK-FAST-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0
4103 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
4104 ; CHECK-FAST-NEXT: retq
4106 ; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
4107 ; CHECK-FAST-PERLANE: # %bb.0:
4108 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1
4109 ; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
4110 ; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
4111 ; CHECK-FAST-PERLANE-NEXT: retq
4112 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
4113 ret <4 x double> %res
; Merge-masked partial permute of a register operand: picks lanes <5,0,7,0>
; of %vec. Result lane i is the picked lane where %mask[i] is ordered-equal
; to zero (fcmp oeq), otherwise %vec2[i].
; The two RUN configurations have separate expectations here: the FAST run
; uses a variable vpermi2pd plus blend, the FAST-PERLANE run uses extract,
; broadcast and a masked vunpckhpd instead.
4115 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
4116 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
4117 ; CHECK-FAST: # %bb.0:
4118 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [5,8,7,8]
4119 ; CHECK-FAST-NEXT: vpermi2pd %zmm0, %zmm0, %zmm3
4120 ; CHECK-FAST-NEXT: vxorpd %xmm0, %xmm0, %xmm0
4121 ; CHECK-FAST-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
4122 ; CHECK-FAST-NEXT: vblendmpd %ymm3, %ymm1, %ymm0 {%k1}
4123 ; CHECK-FAST-NEXT: retq
4125 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
4126 ; CHECK-FAST-PERLANE: # %bb.0:
4127 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4128 ; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
4129 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
4130 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
4131 ; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
4132 ; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0
4133 ; CHECK-FAST-PERLANE-NEXT: retq
4134 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
4135 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4136 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4137 ret <4 x double> %res
; Zero-masked partial permute of a register operand: picks lanes <5,0,7,0>
; of %vec. Result lane i is the picked lane where %mask[i] is ordered-equal
; to zero (fcmp oeq), otherwise 0.0.
; The two RUN configurations have separate expectations here: the FAST run
; uses a masked variable vpermt2pd, the FAST-PERLANE run uses extract,
; broadcast and a masked vunpckhpd instead.
4140 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
4141 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
4142 ; CHECK-FAST: # %bb.0:
4143 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [5,8,7,8]
4144 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4145 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4146 ; CHECK-FAST-NEXT: vpermt2pd %zmm0, %zmm2, %zmm0 {%k1} {z}
4147 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
4148 ; CHECK-FAST-NEXT: retq
4150 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
4151 ; CHECK-FAST-PERLANE: # %bb.0:
4152 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4153 ; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
4154 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4155 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4156 ; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
4157 ; CHECK-FAST-PERLANE-NEXT: retq
4158 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
4159 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4160 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4161 ret <4 x double> %res
; Merge-masked partial permute: picks elements 3, 5, 0, 6 of %vec into a
; <4 x double>; lanes where %mask compares equal to 0.0 (fcmp oeq) take the
; permuted value, every other lane takes the matching element of %vec2.
; The CHECK-FAST lines come from the llc configuration that enables both
; fast-variable-crosslane-shuffle and fast-variable-perlane-shuffle; the
; CHECK-FAST-PERLANE lines come from the one that enables only the latter.
4163 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
4164 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
4165 ; CHECK-FAST: # %bb.0:
4166 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,5,0,6]
4167 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm0
4168 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4169 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
4170 ; CHECK-FAST-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
4171 ; CHECK-FAST-NEXT: retq
4173 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
4174 ; CHECK-FAST-PERLANE: # %bb.0:
4175 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4176 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,3]
4177 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
4178 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
4179 ; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1],ymm3[1],ymm0[2],ymm3[2]
4180 ; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0
4181 ; CHECK-FAST-PERLANE-NEXT: retq
4182 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
4183 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4184 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4185 ret <4 x double> %res
; Zero-masked partial permute: picks elements 3, 5, 0, 6 of %vec into a
; <4 x double>; lanes where %mask compares equal to 0.0 (fcmp oeq) take the
; permuted value, every other lane is zero.
; The CHECK-FAST lines come from the llc configuration that enables both
; fast-variable-crosslane-shuffle and fast-variable-perlane-shuffle; the
; CHECK-FAST-PERLANE lines come from the one that enables only the latter.
4188 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) {
4189 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
4190 ; CHECK-FAST: # %bb.0:
4191 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [3,5,0,6]
4192 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4193 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4194 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4195 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
4196 ; CHECK-FAST-NEXT: retq
4198 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
4199 ; CHECK-FAST-PERLANE: # %bb.0:
4200 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4201 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,3]
4202 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4203 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4204 ; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm2[1],ymm0[2],ymm2[2]
4205 ; CHECK-FAST-PERLANE-NEXT: retq
4206 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
4207 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4208 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4209 ret <4 x double> %res
; Unmasked partial permute: returns elements 0 and 6 of %vec as a
; <2 x double>. Both llc configurations in this file share the plain CHECK
; lines below.
4211 define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
4212 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
4214 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
4215 ; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
4216 ; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4217 ; CHECK-NEXT: vzeroupper
4219 %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
4220 ret <2 x double> %res
; Merge-masked partial permute: picks elements 0 and 6 of %vec into a
; <2 x double>; lanes where %mask compares equal to 0.0 (fcmp oeq) take the
; permuted value, the other lanes take the matching element of %vec2.
; Both llc configurations in this file share the plain CHECK lines below.
4222 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
4223 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
4225 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4226 ; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm3
4227 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
4228 ; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
4229 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
4230 ; CHECK-NEXT: vmovapd %xmm1, %xmm0
4231 ; CHECK-NEXT: vzeroupper
4233 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
4234 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4235 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4236 ret <2 x double> %res
; Zero-masked partial permute: picks elements 0 and 6 of %vec into a
; <2 x double>; lanes where %mask compares equal to 0.0 (fcmp oeq) take the
; permuted value, the other lanes are zero.
; Both llc configurations in this file share the plain CHECK lines below.
4239 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
4240 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
4242 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4243 ; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2
4244 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4245 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4246 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
4247 ; CHECK-NEXT: vzeroupper
4249 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
4250 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4251 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4252 ret <2 x double> %res
; Merge-masked partial permute: picks elements 3 and 7 of %vec into a
; <2 x double>; lanes where %mask compares equal to 0.0 (fcmp oeq) take the
; permuted value, the other lanes take the matching element of %vec2.
; Both llc configurations in this file share the plain CHECK lines below.
4254 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
4255 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask1:
4257 ; CHECK-NEXT: vmovapd {{.*#+}} xmm3 = [3,7]
4258 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
4259 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4260 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
4261 ; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
4262 ; CHECK-NEXT: vzeroupper
4264 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
4265 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4266 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4267 ret <2 x double> %res
; Zero-masked partial permute: picks elements 3 and 7 of %vec into a
; <2 x double>; lanes where %mask compares equal to 0.0 (fcmp oeq) take the
; permuted value, the other lanes are zero.
; Both llc configurations in this file share the plain CHECK lines below.
4270 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %mask) {
4271 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask1:
4273 ; CHECK-NEXT: vmovapd {{.*#+}} xmm2 = [3,7]
4274 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4275 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4276 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4277 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
4278 ; CHECK-NEXT: vzeroupper
4280 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
4281 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4282 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4283 ret <2 x double> %res
; Unmasked partial permute from memory: loads an <8 x double> through %vp and
; returns its elements 1, 6, 7, 2 as a <4 x double>.
; Both llc configurations in this file share the plain CHECK lines below.
4285 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp) {
4286 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0:
4288 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
4289 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [1,6,7,2]
4290 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
4292 %vec = load <8 x double>, ptr %vp
4293 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4294 ret <4 x double> %res
; Merge-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 1, 6, 7, 2; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others take the element of %vec2.
; Both llc configurations in this file share the plain CHECK lines below.
4296 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4297 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0:
4299 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4300 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,6,7,2]
4301 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
4302 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4303 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4304 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4306 %vec = load <8 x double>, ptr %vp
4307 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4308 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4309 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4310 ret <4 x double> %res
; Zero-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 1, 6, 7, 2; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others are zero.
; Both llc configurations in this file share the plain CHECK lines below.
4313 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 x double> %mask) {
4314 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0:
4316 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4317 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,6,7,2]
4318 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4319 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4320 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4321 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4323 %vec = load <8 x double>, ptr %vp
4324 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4325 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4326 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4327 ret <4 x double> %res
; Merge-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 3, 4, 2, 4; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others take the element of %vec2.
; The CHECK-FAST lines come from the llc configuration that enables both
; fast-variable-crosslane-shuffle and fast-variable-perlane-shuffle; the
; CHECK-FAST-PERLANE lines come from the one that enables only the latter.
; In the CHECK-FAST sequence the index table reads [3,4,2,6] rather than
; [3,4,2,4]: the second vpermi2pd source is a {1to4} broadcast of 32(%rdi)
; (element 4), so every index into that source yields element 4.
4330 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4331 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
4332 ; CHECK-FAST: # %bb.0:
4333 ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
4334 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,4,2,6]
4335 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3
4336 ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4337 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4338 ; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4339 ; CHECK-FAST-NEXT: retq
4341 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
4342 ; CHECK-FAST-PERLANE: # %bb.0:
4343 ; CHECK-FAST-PERLANE-NEXT: vpermpd $236, (%rdi), %ymm2 # ymm2 = mem[0,3,2,3]
4344 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4345 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4346 ; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 32(%rdi){1to4}, %ymm2, %ymm0 {%k1}
4347 ; CHECK-FAST-PERLANE-NEXT: retq
4348 %vec = load <8 x double>, ptr %vp
4349 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
4350 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4351 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4352 ret <4 x double> %res
; Zero-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 3, 4, 2, 4; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others are zero.
; The CHECK-FAST lines come from the llc configuration that enables both
; fast-variable-crosslane-shuffle and fast-variable-perlane-shuffle; the
; CHECK-FAST-PERLANE lines come from the one that enables only the latter.
; In the CHECK-FAST sequence the index table reads [3,4,2,6] rather than
; [3,4,2,4]: the second vpermi2pd source is a {1to4} broadcast of 32(%rdi)
; (element 4), so every index into that source yields element 4.
4355 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %mask) {
4356 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
4357 ; CHECK-FAST: # %bb.0:
4358 ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
4359 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [3,4,2,6]
4360 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4361 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4362 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}
4363 ; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
4364 ; CHECK-FAST-NEXT: retq
4366 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
4367 ; CHECK-FAST-PERLANE: # %bb.0:
4368 ; CHECK-FAST-PERLANE-NEXT: vpermpd $236, (%rdi), %ymm1 # ymm1 = mem[0,3,2,3]
4369 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4370 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
4371 ; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 32(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
4372 ; CHECK-FAST-PERLANE-NEXT: retq
4373 %vec = load <8 x double>, ptr %vp
4374 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
4375 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4376 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4377 ret <4 x double> %res
; Merge-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 1, 2, 3, 4; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others take the element of %vec2.
; The CHECK-FAST lines come from the llc configuration that enables both
; fast-variable-crosslane-shuffle and fast-variable-perlane-shuffle; the
; CHECK-FAST-PERLANE lines come from the one that enables only the latter.
4380 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4381 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
4382 ; CHECK-FAST: # %bb.0:
4383 ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
4384 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [1,2,3,4]
4385 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
4386 ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4387 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4388 ; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4389 ; CHECK-FAST-NEXT: retq
4391 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
4392 ; CHECK-FAST-PERLANE: # %bb.0:
4393 ; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %ymm2
4394 ; CHECK-FAST-PERLANE-NEXT: vperm2f128 $33, 32(%rdi), %ymm2, %ymm3 # ymm3 = ymm2[2,3],mem[0,1]
4395 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
4396 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
4397 ; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm2[1],ymm3[0],ymm2[3],ymm3[2]
4398 ; CHECK-FAST-PERLANE-NEXT: retq
4399 %vec = load <8 x double>, ptr %vp
4400 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
4401 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4402 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4403 ret <4 x double> %res
; Zero-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 1, 2, 3, 4; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others are zero.
; The CHECK-FAST lines come from the llc configuration that enables both
; fast-variable-crosslane-shuffle and fast-variable-perlane-shuffle; the
; CHECK-FAST-PERLANE lines come from the one that enables only the latter.
4406 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %mask) {
4407 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
4408 ; CHECK-FAST: # %bb.0:
4409 ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
4410 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [1,2,3,4]
4411 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4412 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4413 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4414 ; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
4415 ; CHECK-FAST-NEXT: retq
4417 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
4418 ; CHECK-FAST-PERLANE: # %bb.0:
4419 ; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %ymm1
4420 ; CHECK-FAST-PERLANE-NEXT: vperm2f128 $33, 32(%rdi), %ymm1, %ymm2 # ymm2 = ymm1[2,3],mem[0,1]
4421 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4422 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4423 ; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm1[1],ymm2[0],ymm1[3],ymm2[2]
4424 ; CHECK-FAST-PERLANE-NEXT: retq
4425 %vec = load <8 x double>, ptr %vp
4426 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
4427 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4428 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4429 ret <4 x double> %res
; Unmasked partial permute from memory: loads an <8 x double> through %vp and
; returns its elements 4, 2, 1, 0 as a <4 x double>.
; Both llc configurations in this file share the plain CHECK lines below.
4432 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) {
4433 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
4435 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
4436 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [4,2,1,0]
4437 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
4439 %vec = load <8 x double>, ptr %vp
4440 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4441 ret <4 x double> %res
; Merge-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 4, 2, 1, 0; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others take the element of %vec2.
; Both llc configurations in this file share the plain CHECK lines below.
4443 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4444 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
4446 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4447 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [4,2,1,0]
4448 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
4449 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4450 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4451 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4453 %vec = load <8 x double>, ptr %vp
4454 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4455 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4456 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4457 ret <4 x double> %res
; Zero-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 4, 2, 1, 0; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others are zero.
; Both llc configurations in this file share the plain CHECK lines below.
4460 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %mask) {
4461 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
4463 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4464 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [4,2,1,0]
4465 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4466 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4467 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4468 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4470 %vec = load <8 x double>, ptr %vp
4471 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4472 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4473 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4474 ret <4 x double> %res
; Merge-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 6, 0, 5, 1; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others take the element of %vec2.
; The expected index table is [2,4,1,5] because the upper half, 32(%rdi), is
; the first vpermi2pd source here and the lower half, (%rdi), the second.
; Both llc configurations in this file share the plain CHECK lines below.
4477 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4478 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4:
4480 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4481 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,4,1,5]
4482 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
4483 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4484 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4485 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4487 %vec = load <8 x double>, ptr %vp
4488 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
4489 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4490 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4491 ret <4 x double> %res
; Zero-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 6, 0, 5, 1; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others are zero.
; The expected index table is [2,4,1,5] because the upper half, 32(%rdi), is
; the first vpermi2pd source here and the lower half, (%rdi), the second.
; Both llc configurations in this file share the plain CHECK lines below.
4494 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 x double> %mask) {
4495 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4:
4497 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4498 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [2,4,1,5]
4499 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4500 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4501 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
4502 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4504 %vec = load <8 x double>, ptr %vp
4505 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
4506 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4507 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4508 ret <4 x double> %res
; Merge-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 2, 5, 5, 5; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others take the element of %vec2.
; Element 5 is expected as a {1to4} broadcast load from 40(%rdi).
; Both llc configurations in this file share the plain CHECK lines below.
4511 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4512 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
4514 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4515 ; CHECK-NEXT: vperm2f128 $33, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[2,3],mem[0,1]
4516 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4517 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4518 ; CHECK-NEXT: vshufpd $14, 40(%rdi){1to4}, %ymm2, %ymm0 {%k1}
4520 %vec = load <8 x double>, ptr %vp
4521 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
4522 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4523 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4524 ret <4 x double> %res
; Zero-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 2, 5, 5, 5; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others are zero.
; Element 5 is expected as a {1to4} broadcast load from 40(%rdi).
; Both llc configurations in this file share the plain CHECK lines below.
4527 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4 x double> %mask) {
4528 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
4530 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
4531 ; CHECK-NEXT: vperm2f128 $33, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[2,3],mem[0,1]
4532 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4533 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
4534 ; CHECK-NEXT: vshufpd $14, 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
4536 %vec = load <8 x double>, ptr %vp
4537 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
4538 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4539 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4540 ret <4 x double> %res
; Unmasked partial permute from memory: loads an <8 x double> through %vp and
; returns its elements 4, 6, 0, 5 as a <4 x double>.
; The expected index table is [0,2,4,1] because the upper half, 32(%rdi), is
; the first vpermi2pd source here and the lower half, (%rdi), the second.
; Both llc configurations in this file share the plain CHECK lines below.
4543 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp) {
4544 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
4546 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm1
4547 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [0,2,4,1]
4548 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm1, %ymm0
4550 %vec = load <8 x double>, ptr %vp
4551 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4552 ret <4 x double> %res
; Merge-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 4, 6, 0, 5; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others take the element of %vec2.
; The expected index table is [0,2,4,1] because the upper half, 32(%rdi), is
; the first vpermi2pd source here and the lower half, (%rdi), the second.
; Both llc configurations in this file share the plain CHECK lines below.
4554 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4555 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
4557 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4558 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,2,4,1]
4559 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
4560 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4561 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4562 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4564 %vec = load <8 x double>, ptr %vp
4565 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4566 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4567 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4568 ret <4 x double> %res
; Zero-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 4, 6, 0, 5; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others are zero.
; The expected index table is [0,2,4,1] because the upper half, 32(%rdi), is
; the first vpermi2pd source here and the lower half, (%rdi), the second.
; Both llc configurations in this file share the plain CHECK lines below.
4571 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %mask) {
4572 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
4574 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4575 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,1]
4576 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4577 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4578 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
4579 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4581 %vec = load <8 x double>, ptr %vp
4582 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4583 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4584 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4585 ret <4 x double> %res
; Merge-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 0, 5, 2, 5; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others take the element of %vec2.
; Element 5 is expected as a {1to4} broadcast load from 40(%rdi).
; Both llc configurations in this file share the plain CHECK lines below.
4588 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask7(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4589 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask7:
4591 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4592 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4593 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4594 ; CHECK-NEXT: vunpcklpd 40(%rdi){1to4}, %ymm2, %ymm0 {%k1}
4596 %vec = load <8 x double>, ptr %vp
4597 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
4598 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4599 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4600 ret <4 x double> %res
; Zero-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 0, 5, 2, 5; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others are zero.
; Element 5 is expected as a {1to4} broadcast load from 40(%rdi).
; Both llc configurations in this file share the plain CHECK lines below.
4603 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(ptr %vp, <4 x double> %mask) {
4604 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7:
4606 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
4607 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4608 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
4609 ; CHECK-NEXT: vunpcklpd 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
4611 %vec = load <8 x double>, ptr %vp
4612 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
4613 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4614 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4615 ret <4 x double> %res
; Unmasked partial permute from memory: loads an <8 x double> through %vp and
; returns its elements 1 and 6 as a <2 x double>.
; Both llc configurations in this file share the plain CHECK lines below.
4618 define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
4619 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
4621 ; CHECK-NEXT: vmovapd (%rdi), %xmm0
4622 ; CHECK-NEXT: vshufpd $1, 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[0]
4624 %vec = load <8 x double>, ptr %vp
4625 %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4626 ret <2 x double> %res
; Merge-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 1 and 6; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others take the element of %vec2.
; Both llc configurations in this file share the plain CHECK lines below.
4628 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
4629 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
4631 ; CHECK-NEXT: vmovapd (%rdi), %xmm2
4632 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4633 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4634 ; CHECK-NEXT: vshufpd $1, 48(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[0]
4636 %vec = load <8 x double>, ptr %vp
4637 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4638 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4639 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4640 ret <2 x double> %res
; Zero-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 1 and 6; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others are zero.
; Both llc configurations in this file share the plain CHECK lines below.
4643 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %mask) {
4644 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
4646 ; CHECK-NEXT: vmovapd (%rdi), %xmm1
4647 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4648 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
4649 ; CHECK-NEXT: vshufpd $1, 48(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[0]
4651 %vec = load <8 x double>, ptr %vp
4652 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4653 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4654 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4655 ret <2 x double> %res
; Merge-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 1 and 4; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others take the element of %vec2.
; Both llc configurations in this file share the plain CHECK lines below.
4658 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
4659 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
4661 ; CHECK-NEXT: vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0]
4662 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4663 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4664 ; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
4666 %vec = load <8 x double>, ptr %vp
4667 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
4668 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4669 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4670 ret <2 x double> %res
; Zero-masked partial permute from memory: loads an <8 x double> through %vp
; and picks its elements 1 and 4; lanes where %mask compares equal to 0.0
; (fcmp oeq) take the permuted value, the others are zero.
; Both llc configurations in this file share the plain CHECK lines below.
4673 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) {
4674 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
4676 ; CHECK-NEXT: vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0]
4677 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4678 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
4679 ; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
4681 %vec = load <8 x double>, ptr %vp
4682 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
4683 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4684 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4685 ret <2 x double> %res
4689 define void @test_zext_v8i8_to_v8i16(ptr %arg, ptr %arg1) {
4690 ; CHECK-LABEL: test_zext_v8i8_to_v8i16:
4692 ; CHECK-NEXT: vpmovzxbw (%rdi), %xmm0 # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
4693 ; CHECK-NEXT: vpsllw $8, %xmm0, %xmm0
4694 ; CHECK-NEXT: vmovdqa %xmm0, (%rsi)
4696 %tmp2 = load <8 x i8>, ptr %arg
4697 %tmp3 = extractelement <8 x i8> %tmp2, i32 0
4698 %tmp4 = zext i8 %tmp3 to i16
4699 %tmp5 = insertelement <8 x i16> undef, i16 %tmp4, i32 0
4700 %tmp6 = extractelement <8 x i8> %tmp2, i32 1
4701 %tmp7 = zext i8 %tmp6 to i16
4702 %tmp8 = insertelement <8 x i16> %tmp5, i16 %tmp7, i32 1
4703 %tmp9 = extractelement <8 x i8> %tmp2, i32 2
4704 %tmp10 = zext i8 %tmp9 to i16
4705 %tmp11 = insertelement <8 x i16> %tmp8, i16 %tmp10, i32 2
4706 %tmp12 = extractelement <8 x i8> %tmp2, i32 3
4707 %tmp13 = zext i8 %tmp12 to i16
4708 %tmp14 = insertelement <8 x i16> %tmp11, i16 %tmp13, i32 3
4709 %tmp15 = extractelement <8 x i8> %tmp2, i32 4
4710 %tmp16 = zext i8 %tmp15 to i16
4711 %tmp17 = insertelement <8 x i16> %tmp14, i16 %tmp16, i32 4
4712 %tmp18 = extractelement <8 x i8> %tmp2, i32 5
4713 %tmp19 = zext i8 %tmp18 to i16
4714 %tmp20 = insertelement <8 x i16> %tmp17, i16 %tmp19, i32 5
4715 %tmp21 = extractelement <8 x i8> %tmp2, i32 6
4716 %tmp22 = zext i8 %tmp21 to i16
4717 %tmp23 = insertelement <8 x i16> %tmp20, i16 %tmp22, i32 6
4718 %tmp24 = extractelement <8 x i8> %tmp2, i32 7
4719 %tmp25 = zext i8 %tmp24 to i16
4720 %tmp26 = insertelement <8 x i16> %tmp23, i16 %tmp25, i32 7
4721 %tmp27 = shl <8 x i16> %tmp26, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
4722 store <8 x i16> %tmp27, ptr %arg1