1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle %s -o - | FileCheck --check-prefixes=CHECK,CHECK-FAST %s
3 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-perlane-shuffle %s -o - | FileCheck --check-prefixes=CHECK,CHECK-FAST-PERLANE %s
5 ; FIXME: All cases here should be fixed by PR34380
; Unmasked narrowing shuffle: picks 8 lanes of the <16 x i16> argument %vec
; with a constant shuffle mask and produces an <8 x i16> value in %res.
; Expected assembly: the mask is materialized as a constant index vector and
; fed to a single vpermw on the full ymm register; the kill annotation records
; the narrowing of the result from ymm0 to xmm0.
7 define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
8 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
10 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [8,6,12,4,7,9,14,8]
11 ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
12 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
13 ; CHECK-NEXT: vzeroupper
15 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
; Merge-masked narrowing shuffle: lane i of %res is the shuffled lane of %vec
; when %mask[i] == 0, otherwise the corresponding lane of %vec2.
; Expected assembly: the vpermw runs unmasked, vptestnmw turns %mask into k1,
; and the select is done afterwards by vpblendmw under k1.
18 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
19 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
21 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [8,6,12,4,7,9,14,8]
22 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
23 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
24 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
25 ; CHECK-NEXT: vzeroupper
27 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
28 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
29 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked narrowing shuffle: lane i of %res is the shuffled lane of %vec
; when %mask[i] == 0, otherwise zero.
; Expected assembly: vptestnmw builds k1 from %mask and the zeroing is folded
; directly into the vpermw ({%k1} {z}), so no separate blend is emitted.
33 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
34 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
36 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [8,6,12,4,7,9,14,8]
37 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
38 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
39 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
40 ; CHECK-NEXT: vzeroupper
42 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
43 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
44 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 16 -> 8 lane shuffle with a second constant mask pattern.
; Lanes where %mask is zero take the shuffled value; the rest keep %vec2.
; Expected assembly: unmasked vpermw, then vptestnmw + vpblendmw under k1.
47 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
48 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
50 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,12,9,4,14,15,12,14]
51 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
52 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
53 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
54 ; CHECK-NEXT: vzeroupper
56 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
57 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
58 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 16 -> 8 lane shuffle with the second constant mask pattern.
; Lanes where %mask is zero take the shuffled value; the rest become zero.
; Expected assembly: the zero-masking is applied on the vpermw itself.
62 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
63 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
65 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,12,9,4,14,15,12,14]
66 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
67 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
68 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
69 ; CHECK-NEXT: vzeroupper
71 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
72 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
73 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 16 -> 8 lane shuffle with a third constant mask pattern.
; Lanes where %mask is zero take the shuffled value; the rest keep %vec2.
; Expected assembly: unmasked vpermw, then vptestnmw + vpblendmw under k1.
76 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
77 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
79 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11,14,10,7,1,6,9]
80 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
81 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
82 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
83 ; CHECK-NEXT: vzeroupper
85 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
86 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
87 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 16 -> 8 lane shuffle with the third constant mask pattern.
; Lanes where %mask is zero take the shuffled value; the rest become zero.
; Expected assembly: the zero-masking is applied on the vpermw itself.
91 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) {
92 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2:
94 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11,14,10,7,1,6,9]
95 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
96 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
97 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
98 ; CHECK-NEXT: vzeroupper
100 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
101 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
102 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 16 -> 8 lane shuffle with a fourth constant mask pattern.
; Expected assembly: constant index vector + one vpermw on the ymm register,
; with the result narrowed to xmm0 (see the kill annotation).
105 define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
106 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask3:
108 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [14,15,7,13,4,12,8,0]
109 ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
110 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
111 ; CHECK-NEXT: vzeroupper
113 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
; Merge-masked 16 -> 8 lane shuffle with the fourth constant mask pattern.
; Lanes where %mask is zero take the shuffled value; the rest keep %vec2.
; Expected assembly: unmasked vpermw, then vptestnmw + vpblendmw under k1.
116 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
117 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3:
119 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [14,15,7,13,4,12,8,0]
120 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
121 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
122 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
123 ; CHECK-NEXT: vzeroupper
125 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
126 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
127 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 16 -> 8 lane shuffle with the fourth constant mask pattern.
; Lanes where %mask is zero take the shuffled value; the rest become zero.
; Expected assembly: the zero-masking is applied on the vpermw itself.
131 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %mask) {
132 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask3:
134 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,7,13,4,12,8,0]
135 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
136 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
137 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
138 ; CHECK-NEXT: vzeroupper
140 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
141 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
142 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 16 -> 8 lane shuffle whose <16 x i16> source is loaded from %vp.
; Expected assembly: the two 128-bit halves are addressed separately ((%rdi) in
; a register, 16(%rdi) as a memory operand) and combined by a two-source
; vpermi2w on xmm registers, so no ymm register (and no vzeroupper) appears.
145 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(ptr %vp) {
146 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
148 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
149 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9]
150 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0
152 %vec = load <16 x i16>, ptr %vp
153 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
; Merge-masked 16 -> 8 lane shuffle of a vector loaded from %vp: lanes where
; %mask is zero take the shuffled value, the others keep %vec2.
; Expected assembly: unmasked two-source vpermi2w over the two 128-bit halves,
; followed by vptestnmw and a k1-masked vmovdqu16 that merges into xmm0.
156 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
157 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
159 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
160 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,13,3,5,13,3,9]
161 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
162 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
163 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
165 %vec = load <16 x i16>, ptr %vp
166 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
167 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
168 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 16 -> 8 lane shuffle of a vector loaded from %vp: lanes where
; %mask is zero take the shuffled value, the others become zero.
; Expected assembly: the zero-masking is folded into the vpermi2w; because that
; instruction writes its index register (xmm1), a final vmovdqa copies the
; result into xmm0.
172 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) {
173 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
175 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
176 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9]
177 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
178 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
179 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
181 %vec = load <16 x i16>, ptr %vp
182 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
183 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
184 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 16 -> 8 lane shuffle (second mask pattern) of a vector loaded
; from %vp; lanes where %mask is non-zero keep %vec2.
; Expected assembly: vpermi2w over the two 128-bit halves, then a k1-masked
; vmovdqu16 merges the permuted lanes into xmm0.
188 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
189 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
191 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
192 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,15,12,7,1,5,8,14]
193 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
194 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
195 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
197 %vec = load <16 x i16>, ptr %vp
198 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
199 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
200 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 16 -> 8 lane shuffle (second mask pattern) of a vector loaded
; from %vp; lanes where %mask is non-zero become zero.
; Expected assembly: zero-masked vpermi2w writing xmm1, then a copy to xmm0.
204 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) {
205 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
207 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
208 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14]
209 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
210 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
211 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
213 %vec = load <16 x i16>, ptr %vp
214 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
215 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
216 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 16 -> 8 lane shuffle (third mask pattern) of a vector loaded
; from %vp; lanes where %mask is non-zero keep %vec2.
; Expected assembly uses the two halves in swapped roles: 16(%rdi) is loaded
; into the register source and (%rdi) is the memory operand of vpermi2w.
; Accordingly every expected index equals the IR mask index with bit 3 toggled
; (i XOR 8), e.g. IR 1 -> 9 and IR 8 -> 0.
220 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
221 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
223 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
224 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [9,0,3,0,5,0,7,1]
225 ; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm3
226 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
227 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
229 %vec = load <16 x i16>, ptr %vp
230 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
231 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
232 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 16 -> 8 lane shuffle (third mask pattern) of a vector loaded
; from %vp; lanes where %mask is non-zero become zero.
; Expected assembly uses the two halves in swapped roles (16(%rdi) in a
; register, (%rdi) as the vpermi2w memory operand), so every expected index is
; the IR mask index with bit 3 toggled (i XOR 8).
236 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) {
237 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
239 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
240 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [9,0,3,0,5,0,7,1]
241 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
242 ; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z}
243 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
245 %vec = load <16 x i16>, ptr %vp
246 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
247 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
248 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 16 -> 8 lane shuffle (fourth mask pattern) of a vector loaded from
; %vp. Expected assembly: the low half goes into a register, the high half is
; the memory operand of a two-source vpermi2w on xmm registers.
252 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(ptr %vp) {
253 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
255 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
256 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2]
257 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0
259 %vec = load <16 x i16>, ptr %vp
260 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
; Merge-masked 16 -> 8 lane shuffle (fourth mask pattern) of a vector loaded
; from %vp; lanes where %mask is non-zero keep %vec2.
; Expected assembly: vpermi2w over the two 128-bit halves, then a k1-masked
; vmovdqu16 merges the permuted lanes into xmm0.
263 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
264 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
266 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
267 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [9,7,9,6,9,4,3,2]
268 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
269 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
270 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
272 %vec = load <16 x i16>, ptr %vp
273 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
274 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
275 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 16 -> 8 lane shuffle (fourth mask pattern) of a vector loaded
; from %vp; lanes where %mask is non-zero become zero.
; Expected assembly: zero-masked vpermi2w writing xmm1, then a copy to xmm0.
279 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) {
280 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
282 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
283 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2]
284 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
285 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
286 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
288 %vec = load <16 x i16>, ptr %vp
289 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
290 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
291 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked narrowing shuffle: picks 16 lanes of the <32 x i16> argument %vec
; with a constant mask and produces a <16 x i16> value in %res.
; Expected assembly: vextracti64x4 splits off the upper 256 bits, which become
; the first vpermi2w source with the lower half (ymm0) as the second. Because
; of that ordering every expected index is the IR mask index with bit 4
; toggled (i XOR 16), e.g. IR 24 -> 8 and IR 12 -> 28.
295 define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) {
296 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask0:
298 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
299 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
300 ; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1
301 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
303 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
; Merge-masked 32 -> 16 lane shuffle: lanes where %mask is zero take the
; shuffled value, the others keep %vec2.
; Expected assembly: upper half extracted with vextracti64x4, unmasked
; vpermi2w (upper half first, so expected indices are IR indices XOR 16),
; then vptestnmw + vpblendmw under k1 perform the select.
306 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
307 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0:
309 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
310 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
311 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
312 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
313 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
315 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
316 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
317 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 -> 16 lane shuffle: lanes where %mask is zero take the
; shuffled value, the others become zero.
; Expected assembly: the zero-masking is folded into vpermi2w (upper half
; first, so expected indices are IR indices XOR 16); the result lands in the
; index register ymm2 and is then copied to ymm0.
321 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %mask) {
322 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0:
324 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
325 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
326 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
327 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
328 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
330 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
331 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
332 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked 32 -> 16 lane shuffle with a second constant mask pattern;
; lanes where %mask is non-zero keep %vec2.
; Expected assembly: vextracti64x4 + unmasked vpermi2w with the upper half as
; first source (expected indices are IR indices XOR 16), then vpblendmw.
335 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
336 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1:
338 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
339 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
340 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
341 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
342 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
344 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
345 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
346 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 -> 16 lane shuffle with the second constant mask pattern;
; lanes where %mask is non-zero become zero.
; Expected assembly: zero-masked vpermi2w with the upper half as first source
; (expected indices are IR indices XOR 16), then a copy from ymm2 to ymm0.
350 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %mask) {
351 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1:
353 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
354 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
355 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
356 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
357 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
359 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
360 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
361 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked 32 -> 16 lane shuffle with a third constant mask pattern;
; lanes where %mask is non-zero keep %vec2.
; Expected assembly: vextracti64x4 + unmasked vpermi2w with the upper half as
; first source (expected indices are IR indices XOR 16), then vpblendmw.
364 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
365 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2:
367 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
368 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
369 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
370 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
371 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
373 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
374 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
375 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 -> 16 lane shuffle with the third constant mask pattern;
; lanes where %mask is non-zero become zero.
; Expected assembly: zero-masked vpermi2w with the upper half as first source
; (expected indices are IR indices XOR 16), then a copy from ymm2 to ymm0.
379 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %mask) {
380 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2:
382 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
383 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
384 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
385 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
386 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
388 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
389 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
390 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Unmasked 32 -> 16 lane shuffle with a fourth constant mask pattern.
; Expected assembly: the upper half is extracted and a vpermt2w overwrites
; ymm0 (the lower half) in place, so the expected index vector is identical to
; the IR mask and no trailing register copy is needed.
393 define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
394 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3:
396 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
397 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
398 ; CHECK-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
399 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
401 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
; Merge-masked 32 -> 16 lane shuffle with the fourth constant mask pattern;
; lanes where %mask is non-zero keep %vec2.
; Expected assembly: unmasked vpermi2w with the lower half as first source
; (expected indices equal the IR mask), then vptestnmw + vpblendmw under k1.
404 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
405 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3:
407 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
408 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
409 ; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
410 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
411 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
413 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
414 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
415 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 -> 16 lane shuffle with the fourth constant mask pattern;
; lanes where %mask is non-zero become zero.
; Expected assembly: zero-masked vpermt2w writing ymm0 in place (expected
; indices equal the IR mask), so no trailing register copy is emitted.
419 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) {
420 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3:
422 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
423 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
424 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
425 ; CHECK-NEXT: vpermt2w %ymm2, %ymm3, %ymm0 {%k1} {z}
426 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
428 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
429 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
430 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Unmasked narrowing shuffle: picks 8 lanes of the <32 x i16> argument %vec
; with a constant mask and produces an <8 x i16> value in %res.
; Expected assembly: the upper 256 bits are extracted into ymm1 and vpermt2w
; overwrites that register with the lower half (ymm0) as second source, so
; every expected index is the IR mask index with bit 4 toggled (i XOR 16);
; the low 128 bits are then copied into xmm0.
433 define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) {
434 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0:
436 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14]
437 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
438 ; CHECK-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
439 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
440 ; CHECK-NEXT: vzeroupper
442 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
; Merge-masked 32 -> 8 lane shuffle: lanes where %mask is zero take the
; shuffled value, the others keep %vec2.
; Expected assembly: unmasked vpermt2w into the extracted upper half (expected
; indices are IR indices XOR 16), then vptestnmw + vpblendmw on xmm registers.
445 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
446 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0:
448 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
449 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
450 ; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm4
451 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
452 ; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
453 ; CHECK-NEXT: vzeroupper
455 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
456 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
457 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32 -> 8 lane shuffle: lanes where %mask is zero take the
; shuffled value, the others become zero.
; Expected assembly: zero-masked vpermt2w into the extracted upper half
; (expected indices are IR indices XOR 16), then the low 128 bits are copied
; into xmm0.
461 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) {
462 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0:
464 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
465 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
466 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
467 ; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm2 {%k1} {z}
468 ; CHECK-NEXT: vmovdqa %xmm2, %xmm0
469 ; CHECK-NEXT: vzeroupper
471 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
472 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
473 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 32 -> 8 lane shuffle with a second constant mask pattern;
; lanes where %mask is non-zero keep %vec2.
; Expected assembly: vpermt2w writes ymm0 in place with the extracted upper
; half as second source (expected indices equal the IR mask), followed by
; vptestnmw + vpblendmw on xmm registers.
476 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
477 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1:
479 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,21,27,10,8,19,14,5]
480 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
481 ; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0
482 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
483 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
484 ; CHECK-NEXT: vzeroupper
486 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
487 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
488 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32 -> 8 lane shuffle with the second constant mask pattern;
; lanes where %mask is non-zero become zero.
; Expected assembly: zero-masked vpermt2w writing ymm0 in place (expected
; indices equal the IR mask); the kill annotation records narrowing to xmm0.
492 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %mask) {
493 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1:
495 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,21,27,10,8,19,14,5]
496 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
497 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
498 ; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
499 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
500 ; CHECK-NEXT: vzeroupper
502 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
503 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
504 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 32 -> 8 lane shuffle with a third constant mask pattern;
; lanes where %mask is non-zero keep %vec2.
; Expected assembly: in-place vpermt2w on ymm0 (expected indices equal the IR
; mask), followed by vptestnmw + vpblendmw on xmm registers.
507 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
508 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2:
510 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [15,13,18,16,9,11,26,8]
511 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
512 ; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0
513 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
514 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
515 ; CHECK-NEXT: vzeroupper
517 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
518 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
519 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32 -> 8 lane shuffle with the third constant mask pattern;
; lanes where %mask is non-zero become zero.
; Expected assembly: zero-masked vpermt2w writing ymm0 in place (expected
; indices equal the IR mask); the kill annotation records narrowing to xmm0.
523 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %mask) {
524 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2:
526 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [15,13,18,16,9,11,26,8]
527 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
528 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
529 ; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
530 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
531 ; CHECK-NEXT: vzeroupper
533 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
534 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
535 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 32 -> 8 lane shuffle with a fourth constant mask pattern.
; Expected assembly: the upper half is extracted and vpermt2w overwrites ymm0
; in place (expected indices equal the IR mask); the kill annotation records
; narrowing of the result to xmm0.
538 define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) {
539 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask3:
541 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [17,0,23,10,1,8,7,30]
542 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
543 ; CHECK-NEXT: vpermt2w %ymm2, %ymm1, %ymm0
544 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
545 ; CHECK-NEXT: vzeroupper
547 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
; Merge-masked 32 -> 8 lane shuffle with the fourth constant mask pattern;
; lanes where %mask is non-zero keep %vec2.
; Expected assembly: in-place vpermt2w on ymm0 (expected indices equal the IR
; mask), followed by vptestnmw + vpblendmw on xmm registers.
550 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
551 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3:
553 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [17,0,23,10,1,8,7,30]
554 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
555 ; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0
556 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
557 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
558 ; CHECK-NEXT: vzeroupper
560 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
561 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
562 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32 -> 8 lane shuffle with the fourth constant mask pattern;
; lanes where %mask is non-zero become zero.
; Expected assembly: zero-masked vpermt2w writing ymm0 in place (expected
; indices equal the IR mask); the kill annotation records narrowing to xmm0.
566 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %mask) {
567 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3:
569 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [17,0,23,10,1,8,7,30]
570 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
571 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
572 ; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
573 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
574 ; CHECK-NEXT: vzeroupper
576 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
577 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
578 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 32 -> 16 lane shuffle whose <32 x i16> source is loaded from %vp.
; Expected assembly: the lower 256 bits are loaded into a register and the
; upper 256 bits (32(%rdi)) are used as the memory operand of a two-source
; vpermi2w; the expected index vector equals the IR mask.
581 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(ptr %vp) {
582 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0:
584 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
585 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
586 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
588 %vec = load <32 x i16>, ptr %vp
589 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
; Merge-masked 32 -> 16 lane shuffle of a vector loaded from %vp: lanes where
; %mask is zero take the shuffled value, the others keep %vec2.
; Expected assembly: unmasked vpermi2w over the two 256-bit halves, then
; vptestnmw and a k1-masked vmovdqu16 that merges into ymm0.
592 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
593 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0:
595 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
596 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
597 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
598 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
599 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
601 %vec = load <32 x i16>, ptr %vp
602 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
603 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
604 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 -> 16 lane shuffle of a vector loaded from %vp: lanes where
; %mask is zero take the shuffled value, the others become zero.
; Expected assembly: zero-masking is folded into vpermi2w, which writes the
; index register ymm1; a final vmovdqa copies the result into ymm0.
608 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %mask) {
609 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0:
611 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
612 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
613 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
614 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
615 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
617 %vec = load <32 x i16>, ptr %vp
618 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
619 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
620 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked 32 -> 16 lane shuffle (second mask pattern) of a vector loaded
; from %vp; lanes where %mask is non-zero keep %vec2.
; Expected assembly: vpermi2w over the two 256-bit halves (expected indices
; equal the IR mask), then a k1-masked vmovdqu16 merges into ymm0.
624 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
625 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1:
627 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
628 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
629 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
630 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
631 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
633 %vec = load <32 x i16>, ptr %vp
634 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
635 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
636 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 -> 16 lane shuffle (second mask pattern) of a vector loaded
; from %vp; lanes where %mask is non-zero become zero.
; Expected assembly: zero-masked vpermi2w writing ymm1, then a copy to ymm0.
640 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %mask) {
641 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1:
643 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
644 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
645 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
646 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
647 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
649 %vec = load <32 x i16>, ptr %vp
650 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
651 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
652 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked 32 -> 16 lane shuffle (third mask pattern) of a vector loaded
; from %vp; lanes where %mask is non-zero keep %vec2.
; Expected assembly uses the two halves in swapped roles: 32(%rdi) is loaded
; into the register source and (%rdi) is the memory operand of vpermi2w.
; Accordingly every expected index equals the IR mask index with bit 4 toggled
; (i XOR 16), e.g. IR 28 -> 12 and IR 6 -> 22.
656 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
657 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2:
659 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
660 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
661 ; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3
662 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
663 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
665 %vec = load <32 x i16>, ptr %vp
666 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
667 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
668 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 -> 16 lane shuffle (third mask pattern) of a vector loaded
; from %vp; lanes where %mask is non-zero become zero.
; Expected assembly uses the two halves in swapped roles (32(%rdi) in a
; register, (%rdi) as the vpermi2w memory operand), so every expected index is
; the IR mask index with bit 4 toggled (i XOR 16).
672 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %mask) {
673 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2:
675 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
676 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
677 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
678 ; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm1 {%k1} {z}
679 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
681 %vec = load <32 x i16>, ptr %vp
682 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
683 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
684 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Unmasked baseline: 32xi16 -> 16xi16 constant shuffle of a vector loaded from %vp.
688 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(ptr %vp) {
689 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3:
691 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
692 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
693 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
695 %vec = load <32 x i16>, ptr %vp
696 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
; Merge-masked 32xi16 -> 16xi16 shuffle of a vector loaded from %vp: lanes where
; %mask == 0 take the shuffled element, every other lane keeps %vec2.
699 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) {
700 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3:
702 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
703 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
704 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
705 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
706 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
708 %vec = load <32 x i16>, ptr %vp
709 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
710 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
711 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32xi16 -> 16xi16 shuffle of a vector loaded from %vp: lanes where
; %mask == 0 take the shuffled element, every other lane becomes zero.
715 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %mask) {
716 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3:
718 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
719 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
720 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
721 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
722 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
724 %vec = load <32 x i16>, ptr %vp
725 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
726 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
727 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Unmasked baseline: 32xi16 -> 8xi16 constant shuffle of a vector loaded from %vp.
; The expected index constant is the IR mask with bit 4 flipped, because the
; upper half (32(%rdi)) is loaded as the first permute table.
731 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(ptr %vp) {
732 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0:
734 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [16,17,5,1,14,14,13,17]
735 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0
736 ; CHECK-NEXT: vpermt2w (%rdi), %ymm1, %ymm0
737 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
738 ; CHECK-NEXT: vzeroupper
740 %vec = load <32 x i16>, ptr %vp
741 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
; Merge-masked 32xi16 -> 8xi16 shuffle of a vector loaded from %vp: lanes where
; %mask == 0 take the shuffled element, every other lane keeps %vec2.
; The expected index constant is the IR mask with bit 4 flipped, because the
; upper half (32(%rdi)) is loaded as the first permute table.
744 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
745 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0:
747 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17]
748 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3
749 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3
750 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
751 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
752 ; CHECK-NEXT: vzeroupper
754 %vec = load <32 x i16>, ptr %vp
755 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
756 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
757 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32xi16 -> 8xi16 shuffle of a vector loaded from %vp: lanes where
; %mask == 0 take the shuffled element, every other lane becomes zero.
; The expected index constant is the IR mask with bit 4 flipped, because the
; upper half (32(%rdi)) is loaded as the first permute table.
761 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) {
762 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0:
764 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17]
765 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
766 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
767 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z}
768 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
769 ; CHECK-NEXT: vzeroupper
771 %vec = load <32 x i16>, ptr %vp
772 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
773 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
774 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 32xi16 -> 8xi16 shuffle of a vector loaded from %vp: lanes where
; %mask == 0 take the shuffled element, every other lane keeps %vec2.
; The expected index constant is the IR mask with bit 4 flipped, because the
; upper half (32(%rdi)) is loaded as the first permute table.
778 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
779 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1:
781 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
782 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3
783 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3
784 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
785 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
786 ; CHECK-NEXT: vzeroupper
788 %vec = load <32 x i16>, ptr %vp
789 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
790 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
791 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32xi16 -> 8xi16 shuffle of a vector loaded from %vp: lanes where
; %mask == 0 take the shuffled element, every other lane becomes zero.
; The expected index constant is the IR mask with bit 4 flipped, because the
; upper half (32(%rdi)) is loaded as the first permute table.
795 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) {
796 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1:
798 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
799 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
800 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
801 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z}
802 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
803 ; CHECK-NEXT: vzeroupper
805 %vec = load <32 x i16>, ptr %vp
806 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
807 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
808 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 32xi16 -> 8xi16 shuffle of a vector loaded from %vp: lanes where
; %mask == 0 take the shuffled element, every other lane keeps %vec2.
812 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
813 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2:
815 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10]
816 ; CHECK-NEXT: vmovdqa (%rdi), %ymm3
817 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3
818 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
819 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
820 ; CHECK-NEXT: vzeroupper
822 %vec = load <32 x i16>, ptr %vp
823 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
824 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
825 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32xi16 -> 8xi16 shuffle of a vector loaded from %vp: lanes where
; %mask == 0 take the shuffled element, every other lane becomes zero.
829 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) {
830 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2:
832 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10]
833 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
834 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
835 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
836 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
837 ; CHECK-NEXT: vzeroupper
839 %vec = load <32 x i16>, ptr %vp
840 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
841 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
842 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked baseline: 32xi16 -> 8xi16 constant shuffle of a vector loaded from %vp.
846 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(ptr %vp) {
847 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3:
849 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9]
850 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
851 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm1, %ymm0
852 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
853 ; CHECK-NEXT: vzeroupper
855 %vec = load <32 x i16>, ptr %vp
856 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
; Merge-masked 32xi16 -> 8xi16 shuffle of a vector loaded from %vp: lanes where
; %mask == 0 take the shuffled element, every other lane keeps %vec2.
859 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) {
860 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3:
862 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9]
863 ; CHECK-NEXT: vmovdqa (%rdi), %ymm3
864 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3
865 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
866 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
867 ; CHECK-NEXT: vzeroupper
869 %vec = load <32 x i16>, ptr %vp
870 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
871 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
872 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32xi16 -> 8xi16 shuffle of a vector loaded from %vp: lanes where
; %mask == 0 take the shuffled element, every other lane becomes zero.
876 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) {
877 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3:
879 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9]
880 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
881 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
882 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
883 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
884 ; CHECK-NEXT: vzeroupper
886 %vec = load <32 x i16>, ptr %vp
887 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
888 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
889 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 16xi16 -> 8xi16 constant shuffle of the register argument %vec.
; The hex digits in the function name spell the shuffle mask (E,8,4,C,9,4,E,F).
893 define <8 x i16> @test_16xi16_to_8xi16_E84C94EF(<16 x i16> %vec) {
894 ; CHECK-LABEL: test_16xi16_to_8xi16_E84C94EF:
896 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [14,8,4,12,9,4,14,15]
897 ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
898 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
899 ; CHECK-NEXT: vzeroupper
901 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15>
; Unmasked baseline: 8xi32 -> 4xi32 constant shuffle of the register argument %vec.
905 define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) {
906 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0:
908 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,3,2]
909 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
910 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
911 ; CHECK-NEXT: vzeroupper
913 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
; Merge-masked 8xi32 -> 4xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane keeps %vec2.
916 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
917 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0:
919 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,0,3,2]
920 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
921 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
922 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
923 ; CHECK-NEXT: vzeroupper
925 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
926 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
927 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane becomes zero.
931 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %mask) {
932 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0:
934 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,3,2]
935 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
936 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
937 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
938 ; CHECK-NEXT: vzeroupper
940 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
941 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
942 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 8xi32 -> 4xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane keeps %vec2.
945 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
946 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1:
948 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,7,3]
949 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
950 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
951 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
952 ; CHECK-NEXT: vzeroupper
954 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
955 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
956 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane becomes zero.
960 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %mask) {
961 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1:
963 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,7,3]
964 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
965 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
966 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
967 ; CHECK-NEXT: vzeroupper
969 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
970 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
971 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 8xi32 -> 4xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane keeps %vec2.
; The mask <6,7,2,3> moves adjacent dword pairs, so the expected lowering is an
; immediate qword permute followed by a separate masked blend.
974 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
975 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask2:
977 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3]
978 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
979 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
980 ; CHECK-NEXT: vzeroupper
982 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
983 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
984 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane becomes zero.
; The mask <6,7,2,3> moves adjacent dword pairs, so the expected lowering is an
; immediate qword permute followed by a separate zero-masking move.
988 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %mask) {
989 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask2:
991 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3]
992 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
993 ; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
994 ; CHECK-NEXT: vzeroupper
996 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
997 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
998 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked baseline: 8xi32 -> 4xi32 constant shuffle of the register argument %vec.
1001 define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) {
1002 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3:
1004 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [5,3,2,5]
1005 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
1006 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1007 ; CHECK-NEXT: vzeroupper
1009 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
; Merge-masked 8xi32 -> 4xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane keeps %vec2.
1012 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1013 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3:
1015 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,3,2,5]
1016 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
1017 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1018 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1019 ; CHECK-NEXT: vzeroupper
1021 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
1022 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1023 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane becomes zero.
1027 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %mask) {
1028 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3:
1030 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,3,2,5]
1031 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1032 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
1033 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1034 ; CHECK-NEXT: vzeroupper
1036 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
1037 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1038 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked baseline: 8xi32 -> 4xi32 constant shuffle of a vector loaded from %vp.
; Mask <7,5,0,0> takes two lanes from the upper 128 bits and two from the lower,
; so the expected lowering is a two-source immediate shuffle, not a variable permute.
1041 define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(ptr %vp) {
1042 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0:
1044 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm0
1045 ; CHECK-NEXT: vshufps $7, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[3,1],mem[0,0]
1047 %vec = load <8 x i32>, ptr %vp
1048 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
; Merge-masked 8xi32 -> 4xi32 shuffle of a vector loaded from %vp: lanes where
; %mask == 0 take the shuffled element, every other lane keeps %vec2.
; Mask <7,5,0,0> is expected to lower to a two-source immediate shuffle followed
; by a separate masked move.
1051 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1052 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0:
1054 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
1055 ; CHECK-NEXT: vshufps $7, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[3,1],mem[0,0]
1056 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1057 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
1059 %vec = load <8 x i32>, ptr %vp
1060 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
1061 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1062 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle of a vector loaded from %vp: lanes where
; %mask == 0 take the shuffled element, every other lane becomes zero.
; Mask <7,5,0,0> is expected to lower to a two-source immediate shuffle followed
; by a separate zero-masking move.
1066 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) {
1067 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0:
1069 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
1070 ; CHECK-NEXT: vshufps $7, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[3,1],mem[0,0]
1071 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1072 ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
1074 %vec = load <8 x i32>, ptr %vp
1075 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
1076 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1077 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 8xi32 -> 4xi32 shuffle of a vector loaded from %vp: lanes where
; %mask == 0 take the shuffled element, every other lane keeps %vec2.
1081 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1082 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1:
1084 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
1085 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,0,0,3]
1086 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3
1087 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1088 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1090 %vec = load <8 x i32>, ptr %vp
1091 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
1092 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1093 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle of a vector loaded from %vp: lanes where
; %mask == 0 take the shuffled element, every other lane becomes zero.
1097 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) {
1098 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1:
1100 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
1101 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,0,0,3]
1102 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1103 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z}
1104 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1106 %vec = load <8 x i32>, ptr %vp
1107 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
1108 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1109 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 8xi32 -> 4xi32 shuffle of a vector loaded from %vp: lanes where
; %mask == 0 take the shuffled element, every other lane keeps %vec2.
; The expected index constant is the IR mask with bit 2 flipped, because the
; upper 128 bits (16(%rdi)) are loaded as the first permute table.
1113 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1114 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2:
1116 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1117 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,7,0]
1118 ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3
1119 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1120 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1122 %vec = load <8 x i32>, ptr %vp
1123 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
1124 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1125 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle of a vector loaded from %vp: lanes where
; %mask == 0 take the shuffled element, every other lane becomes zero.
; The expected index constant is the IR mask with bit 2 flipped, because the
; upper 128 bits (16(%rdi)) are loaded as the first permute table.
1129 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) {
1130 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2:
1132 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1133 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [0,7,7,0]
1134 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1135 ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
1136 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1138 %vec = load <8 x i32>, ptr %vp
1139 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
1140 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1141 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked baseline: 8xi32 -> 4xi32 constant shuffle of a vector loaded from %vp.
; Only elements 2 and 3 are needed from the lower 128 bits, so the expected code
; broadcasts the qword at 8(%rdi) as the first table; index 1 then selects element 3.
1145 define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(ptr %vp) {
1146 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3:
1148 ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm1
1149 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [5,1,2,7]
1150 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm1, %xmm0
1152 %vec = load <8 x i32>, ptr %vp
1153 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
; Merge-masked 8xi32 -> 4xi32 shuffle of a vector loaded from %vp: lanes where
; %mask == 0 take the shuffled element, every other lane keeps %vec2.
; Only elements 2 and 3 are needed from the lower 128 bits, so the expected code
; broadcasts the qword at 8(%rdi) as the first table; index 1 then selects element 3.
1156 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1157 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3:
1159 ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2
1160 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,2,7]
1161 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3
1162 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1163 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1165 %vec = load <8 x i32>, ptr %vp
1166 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
1167 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1168 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle of a vector loaded from %vp: lanes where
; %mask == 0 take the shuffled element, every other lane becomes zero.
; Only elements 2 and 3 are needed from the lower 128 bits, so the expected code
; broadcasts the qword at 8(%rdi) as the first table; index 1 then selects element 3.
1172 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %mask) {
1173 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3:
1175 ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2
1176 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,1,2,7]
1177 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1178 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z}
1179 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1181 %vec = load <8 x i32>, ptr %vp
1182 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
1183 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1184 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked baseline: 16xi32 -> 8xi32 constant shuffle of the register argument %vec.
1188 define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) {
1189 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0:
1191 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [1,13,11,14,7,10,1,6]
1192 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
1193 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1195 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
; Merge-masked 16xi32 -> 8xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane keeps %vec2.
1198 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1199 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0:
1201 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,13,11,14,7,10,1,6]
1202 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1203 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1204 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1206 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
1207 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1208 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16xi32 -> 8xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane becomes zero.
1212 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %mask) {
1213 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask0:
1215 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,13,11,14,7,10,1,6]
1216 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1217 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1218 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1220 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
1221 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1222 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked 16xi32 -> 8xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane keeps %vec2.
1225 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1226 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1:
1228 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,15,3,2,3,6,8]
1229 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1230 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1231 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1233 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
1234 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1235 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16xi32 -> 8xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane becomes zero.
1239 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %mask) {
1240 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask1:
1242 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8]
1243 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1244 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1245 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1247 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
1248 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1249 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked 16xi32 -> 8xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane keeps %vec2.
1252 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1253 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2:
1255 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [2,15,15,2,6,10,14,7]
1256 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1257 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1258 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1260 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
1261 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1262 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16xi32 -> 8xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane becomes zero.
1266 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %mask) {
1267 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask2:
1269 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7]
1270 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1271 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1272 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1274 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
1275 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1276 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked baseline: 16xi32 -> 8xi32 constant shuffle of the register argument %vec.
1279 define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
1280 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3:
1282 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3]
1283 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
1284 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1286 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
; Merge-masked 16xi32 -> 8xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane keeps %vec2.
1289 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1290 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3:
1292 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [14,5,7,7,10,3,9,3]
1293 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1294 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1295 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1297 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
1298 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1299 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16xi32 -> 8xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane becomes zero.
1303 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %mask) {
1304 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask3:
1306 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3]
1307 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1308 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1309 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1311 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
1312 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1313 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked baseline: 16xi32 -> 4xi32 constant shuffle of the register argument %vec.
; The expected index constant has eight entries; only its first four match the
; four-element IR mask <0,2,4,12>.
1316 define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
1317 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0:
1319 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12]
1320 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
1321 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1322 ; CHECK-NEXT: vzeroupper
1324 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
; Merge-masked 16xi32 -> 4xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane keeps %vec2.
; The expected index constant has eight entries; only its first four match the
; four-element IR mask <0,2,4,12>.
1327 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1328 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0:
1330 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12]
1331 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1332 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1333 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1334 ; CHECK-NEXT: vzeroupper
1336 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
1337 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1338 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 16xi32 -> 4xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane becomes zero.
; The expected index constant has eight entries; only its first four match the
; four-element IR mask <0,2,4,12>.
1342 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) {
1343 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0:
1345 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,12,4,6,4,12]
1346 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1347 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1348 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1349 ; CHECK-NEXT: vzeroupper
1351 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
1352 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1353 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 16xi32 -> 4xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane keeps %vec2.
; Every mask index is >= 8, so the expected code extracts the upper 256 bits
; first and permutes with indices reduced by 8.
1356 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1357 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1:
1359 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,3,4]
1360 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1361 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
1362 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1363 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1364 ; CHECK-NEXT: vzeroupper
1366 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
1367 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1368 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 16xi32 -> 4xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane becomes zero.
; Every mask index is >= 8, so the expected code extracts the upper 256 bits
; first and permutes with indices reduced by 8.
1372 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) {
1373 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1:
1375 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,1,3,4]
1376 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1377 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1378 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
1379 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1380 ; CHECK-NEXT: vzeroupper
1382 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
1383 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1384 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 16xi32 -> 4xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane keeps %vec2.
1387 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1388 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2:
1390 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,13,0]
1391 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1392 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1393 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1394 ; CHECK-NEXT: vzeroupper
1396 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
1397 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1398 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 16xi32 -> 4xi32 shuffle of %vec: lanes where %mask == 0 take the
; shuffled element, every other lane becomes zero.
1402 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) {
1403 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask2:
1405 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,13,0]
1406 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1407 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1408 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1409 ; CHECK-NEXT: vzeroupper
1411 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
1412 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1413 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked case: picks elements <3,0,0,13> of the <16 x i32> argument into a
; <4 x i32> result.
1416 define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) {
1417 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3:
1419 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,0,0,13]
1420 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
1421 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1422 ; CHECK-NEXT: vzeroupper
1424 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
; Merge-masked case: each result lane takes the <3,0,0,13> shuffle of %vec
; where the matching %mask element equals zero, and the %vec2 lane otherwise.
1427 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1428 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3:
1430 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,0,13]
1431 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1432 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1433 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1434 ; CHECK-NEXT: vzeroupper
1436 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
1437 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1438 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked case: each result lane takes the <3,0,0,13> shuffle of %vec
; where the matching %mask element equals zero, and is zero otherwise.
1442 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) {
1443 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3:
1445 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,0,13]
1446 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1447 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1448 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1449 ; CHECK-NEXT: vzeroupper
1451 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
1452 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1453 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked memory-operand case: loads a <16 x i32> from %vp and picks elements
; <15,8,14,8,9,10,12,12> into a <8 x i32> result. Every index is >= 8, so only
; the upper half of the loaded vector is referenced.
1456 define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(ptr %vp) {
1457 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask0:
1459 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4]
1460 ; CHECK-NEXT: vpermps 32(%rdi), %ymm0, %ymm0
1462 %vec = load <16 x i32>, ptr %vp
1463 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
; Merge-masked memory-operand case: loads a <16 x i32> from %vp; each result
; lane takes the <15,8,14,8,9,10,12,12> shuffle of the loaded vector where the
; matching %mask element equals zero, and the %vec2 lane otherwise.
1466 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1467 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0:
1469 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4]
1470 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1471 ; CHECK-NEXT: vpermd 32(%rdi), %ymm2, %ymm0 {%k1}
1473 %vec = load <16 x i32>, ptr %vp
1474 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
1475 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1476 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked memory-operand case: loads a <16 x i32> from %vp; each result
; lane takes the <15,8,14,8,9,10,12,12> shuffle of the loaded vector where the
; matching %mask element equals zero, and is zero otherwise.
1480 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %mask) {
1481 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0:
1483 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4]
1484 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1485 ; CHECK-NEXT: vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z}
1487 %vec = load <16 x i32>, ptr %vp
1488 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
1489 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1490 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked memory-operand case: loads a <16 x i32> from %vp; each result
; lane takes the <15,11,14,3,8,9,13,7> shuffle of the loaded vector where the
; matching %mask element equals zero, and the %vec2 lane otherwise. The shuffle
; indices reference both halves of the loaded vector.
1494 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1495 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1:
1497 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1498 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15]
1499 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
1500 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1501 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
1503 %vec = load <16 x i32>, ptr %vp
1504 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
1505 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1506 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked memory-operand case: loads a <16 x i32> from %vp; each result
; lane takes the <15,11,14,3,8,9,13,7> shuffle of the loaded vector where the
; matching %mask element equals zero, and is zero otherwise.
1510 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %mask) {
1511 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1:
1513 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1514 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15]
1515 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1516 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1517 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1519 %vec = load <16 x i32>, ptr %vp
1520 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
1521 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1522 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked memory-operand case: loads a <16 x i32> from %vp; each result
; lane takes the <12,6,9,13,12,10,0,2> shuffle of the loaded vector where the
; matching %mask element equals zero, and the %vec2 lane otherwise.
1526 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1527 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2:
1529 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1530 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10]
1531 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
1532 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1533 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
1535 %vec = load <16 x i32>, ptr %vp
1536 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
1537 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1538 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked memory-operand case: loads a <16 x i32> from %vp; each result
; lane takes the <12,6,9,13,12,10,0,2> shuffle of the loaded vector where the
; matching %mask element equals zero, and is zero otherwise.
1542 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %mask) {
1543 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2:
1545 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1546 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10]
1547 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1548 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1549 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1551 %vec = load <16 x i32>, ptr %vp
1552 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
1553 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1554 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked memory-operand case: loads a <16 x i32> from %vp and picks elements
; <8,4,1,13,15,4,6,12> into a <8 x i32> result; the indices reference both
; halves of the loaded vector.
1558 define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(ptr %vp) {
1559 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3:
1561 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1562 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12]
1563 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm0
1565 %vec = load <16 x i32>, ptr %vp
1566 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
; Merge-masked memory-operand case: loads a <16 x i32> from %vp; each result
; lane takes the <8,4,1,13,15,4,6,12> shuffle of the loaded vector where the
; matching %mask element equals zero, and the %vec2 lane otherwise.
1569 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1570 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3:
1572 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
1573 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [8,4,1,13,15,4,6,12]
1574 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3
1575 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1576 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
1578 %vec = load <16 x i32>, ptr %vp
1579 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
1580 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1581 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked memory-operand case: loads a <16 x i32> from %vp; each result
; lane takes the <8,4,1,13,15,4,6,12> shuffle of the loaded vector where the
; matching %mask element equals zero, and is zero otherwise.
1585 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %mask) {
1586 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3:
1588 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
1589 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12]
1590 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1591 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
1592 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1594 %vec = load <16 x i32>, ptr %vp
1595 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
1596 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1597 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked memory-operand case: loads a <16 x i32> from %vp and picks elements
; <13,0,0,6> into a <4 x i32> result.
1601 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) {
1602 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0:
1604 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [13,0,0,6]
1605 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
1606 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm1, %ymm0
1607 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1608 ; CHECK-NEXT: vzeroupper
1610 %vec = load <16 x i32>, ptr %vp
1611 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
; Merge-masked memory-operand case: loads a <16 x i32> from %vp; each result
; lane takes the <13,0,0,6> shuffle of the loaded vector where the matching
; %mask element equals zero, and the %vec2 lane otherwise.
1614 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1615 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0:
1617 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [13,0,0,6]
1618 ; CHECK-NEXT: vmovdqa (%rdi), %ymm3
1619 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
1620 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1621 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1622 ; CHECK-NEXT: vzeroupper
1624 %vec = load <16 x i32>, ptr %vp
1625 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
1626 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1627 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked memory-operand case: loads a <16 x i32> from %vp; each result
; lane takes the <13,0,0,6> shuffle of the loaded vector where the matching
; %mask element equals zero, and is zero otherwise.
1631 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) {
1632 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0:
1634 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [13,0,0,6]
1635 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1636 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1637 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
1638 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1639 ; CHECK-NEXT: vzeroupper
1641 %vec = load <16 x i32>, ptr %vp
1642 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
1643 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1644 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked memory-operand case: loads a <16 x i32> from %vp; each result
; lane takes the <7,13,11,10> shuffle of the loaded vector where the matching
; %mask element equals zero, and the %vec2 lane otherwise.
1648 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1649 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
1651 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1652 ; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [15,5,3,2,15,5,3,2]
1653 ; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
1654 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
1655 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1656 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1657 ; CHECK-NEXT: vzeroupper
1659 %vec = load <16 x i32>, ptr %vp
1660 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
1661 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1662 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked memory-operand case: loads a <16 x i32> from %vp; each result
; lane takes the <7,13,11,10> shuffle of the loaded vector where the matching
; %mask element equals zero, and is zero otherwise.
1666 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) {
1667 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
1669 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1670 ; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [15,5,3,2,15,5,3,2]
1671 ; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
1672 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1673 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1674 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1675 ; CHECK-NEXT: vzeroupper
1677 %vec = load <16 x i32>, ptr %vp
1678 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
1679 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1680 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked memory-operand case: loads a <16 x i32> from %vp; each result
; lane takes the <2,15,6,9> shuffle of the loaded vector where the matching
; %mask element equals zero, and the %vec2 lane otherwise.
1684 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1685 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2:
1687 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [2,15,6,9]
1688 ; CHECK-NEXT: vmovdqa (%rdi), %ymm3
1689 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
1690 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1691 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1692 ; CHECK-NEXT: vzeroupper
1694 %vec = load <16 x i32>, ptr %vp
1695 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
1696 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1697 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked memory-operand case: loads a <16 x i32> from %vp; each result
; lane takes the <2,15,6,9> shuffle of the loaded vector where the matching
; %mask element equals zero, and is zero otherwise.
1701 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) {
1702 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2:
1704 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [2,15,6,9]
1705 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1706 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1707 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
1708 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1709 ; CHECK-NEXT: vzeroupper
1711 %vec = load <16 x i32>, ptr %vp
1712 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
1713 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1714 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked memory-operand case: loads a <16 x i32> from %vp and picks elements
; <6,0,7,2> into a <4 x i32> result. Every index is below 8, so only the first
; eight loaded elements are referenced.
1718 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(ptr %vp) {
1719 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3:
1721 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
1722 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,4,3,6]
1723 ; CHECK-NEXT: vpermi2d (%rdi), %xmm1, %xmm0
1725 %vec = load <16 x i32>, ptr %vp
1726 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
; Merge-masked memory-operand case: loads a <16 x i32> from %vp; each result
; lane takes the <6,0,7,2> shuffle of the loaded vector where the matching
; %mask element equals zero, and the %vec2 lane otherwise.
1729 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1730 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3:
1732 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1733 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [2,4,3,6]
1734 ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3
1735 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1736 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1738 %vec = load <16 x i32>, ptr %vp
1739 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
1740 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1741 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked memory-operand case: loads a <16 x i32> from %vp; each result
; lane takes the <6,0,7,2> shuffle of the loaded vector where the matching
; %mask element equals zero, and is zero otherwise.
1745 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %mask) {
1746 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3:
1748 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1749 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,4,3,6]
1750 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1751 ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
1752 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1754 %vec = load <16 x i32>, ptr %vp
1755 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
1756 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1757 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked case: picks elements <12,9,4,10> of the <16 x i32> argument into a
; <4 x i32> result. The two RUN configurations generate different code for
; this shuffle, so the expected assembly is recorded once per configuration.
1761 define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
1762 ; CHECK-FAST-LABEL: test_16xi32_to_4xi32_perm_mask9:
1763 ; CHECK-FAST: # %bb.0:
1764 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [12,9,4,10]
1765 ; CHECK-FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
1766 ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1767 ; CHECK-FAST-NEXT: vzeroupper
1768 ; CHECK-FAST-NEXT: retq
1770 ; CHECK-FAST-PERLANE-LABEL: test_16xi32_to_4xi32_perm_mask9:
1771 ; CHECK-FAST-PERLANE: # %bb.0:
1772 ; CHECK-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <4,1,u,2>
1773 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1774 ; CHECK-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm1, %ymm1
1775 ; CHECK-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
1776 ; CHECK-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,3]
1777 ; CHECK-FAST-PERLANE-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
1778 ; CHECK-FAST-PERLANE-NEXT: vzeroupper
1779 ; CHECK-FAST-PERLANE-NEXT: retq
1780 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 12, i32 9, i32 4, i32 10>
; Unmasked case: picks elements <2,0> of the <4 x i64> argument into a
; <2 x i64> result.
1784 define <2 x i64> @test_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec) {
1785 ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mask0:
1787 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
1788 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1789 ; CHECK-NEXT: vzeroupper
1791 %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
; Merge-masked case: each result lane takes the <2,0> shuffle of %vec where
; the matching %mask element equals zero, and the %vec2 lane otherwise.
1794 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
1795 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0:
1797 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,2,3]
1798 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
1799 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
1800 ; CHECK-NEXT: vzeroupper
1802 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
1803 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1804 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked case: each result lane takes the <2,0> shuffle of %vec where
; the matching %mask element equals zero, and is zero otherwise.
1808 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %mask) {
1809 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask0:
1811 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1812 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3]
1813 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1814 ; CHECK-NEXT: vzeroupper
1816 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
1817 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1818 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Merge-masked case: each result lane takes the <2,1> shuffle of %vec where
; the matching %mask element equals zero, and the %vec2 lane otherwise.
1821 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
1822 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask1:
1824 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
1825 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
1826 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
1827 ; CHECK-NEXT: vzeroupper
1829 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1830 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1831 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked case: each result lane takes the <2,1> shuffle of %vec where
; the matching %mask element equals zero, and is zero otherwise.
1835 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %mask) {
1836 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask1:
1838 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1839 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3]
1840 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1841 ; CHECK-NEXT: vzeroupper
1843 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1844 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1845 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked memory-operand case: loads a <4 x i64> from %vp and picks elements
; <1,3> (the odd-indexed elements) into a <2 x i64> result.
1848 define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
1849 ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0:
1851 ; CHECK-NEXT: vmovaps (%rdi), %xmm0
1852 ; CHECK-NEXT: vunpckhpd 16(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[1]
1854 %vec = load <4 x i64>, ptr %vp
1855 %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
; Merge-masked memory-operand case: loads a <4 x i64> from %vp; each result
; lane takes the <1,3> shuffle of the loaded vector where the matching %mask
; element equals zero, and the %vec2 lane otherwise.
1858 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
1859 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask0:
1861 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
1862 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1863 ; CHECK-NEXT: vpunpckhqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[1]
1865 %vec = load <4 x i64>, ptr %vp
1866 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
1867 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1868 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked memory-operand case: loads a <4 x i64> from %vp; each result
; lane takes the <1,3> shuffle of the loaded vector where the matching %mask
; element equals zero, and is zero otherwise.
1872 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) {
1873 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask0:
1875 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
1876 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
1877 ; CHECK-NEXT: vpunpckhqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[1]
1879 %vec = load <4 x i64>, ptr %vp
1880 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
1881 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1882 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Merge-masked memory-operand case: loads a <4 x i64> from %vp; each result
; lane takes the <2,1> shuffle of the loaded vector where the matching %mask
; element equals zero, and the %vec2 lane otherwise.
1886 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
1887 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1:
1889 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1890 ; CHECK-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
1891 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1892 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
1894 %vec = load <4 x i64>, ptr %vp
1895 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1896 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1897 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked memory-operand case: loads a <4 x i64> from %vp; each result
; lane takes the <2,1> shuffle of the loaded vector where the matching %mask
; element equals zero, and is zero otherwise.
1901 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) {
1902 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1:
1904 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
1905 ; CHECK-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
1906 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
1907 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
1909 %vec = load <4 x i64>, ptr %vp
1910 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1911 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1912 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked case: picks elements <6,7,6,5> of the <8 x i64> argument into a
; <4 x i64> result. Every index is >= 4, so only the upper half of %vec is
; referenced.
1916 define <4 x i64> @test_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec) {
1917 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask0:
1919 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
1920 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,1]
1922 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
; Merge-masked case: each result lane takes the <6,7,6,5> shuffle of %vec
; where the matching %mask element equals zero, and the %vec2 lane otherwise.
1925 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1926 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask0:
1928 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1929 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
1930 ; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1]
1931 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1933 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
1934 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1935 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked case: each result lane takes the <6,7,6,5> shuffle of %vec
; where the matching %mask element equals zero, and is zero otherwise.
1939 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %mask) {
1940 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0:
1942 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1943 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
1944 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1]
1946 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
1947 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1948 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked case: each result lane takes the <6,4,6,1> shuffle of %vec
; where the matching %mask element equals zero, and the %vec2 lane otherwise.
; The two RUN configurations generate different code for this shuffle, so the
; expected assembly is recorded once per configuration.
1951 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1952 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
1953 ; CHECK-FAST: # %bb.0:
1954 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,4,6,1]
1955 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0
1956 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
1957 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
1958 ; CHECK-FAST-NEXT: retq
1960 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
1961 ; CHECK-FAST-PERLANE: # %bb.0:
1962 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1963 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
1964 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
1965 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,1]
1966 ; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0
1967 ; CHECK-FAST-PERLANE-NEXT: retq
1968 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
1969 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1970 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked case: each result lane takes the <6,4,6,1> shuffle of %vec
; where the matching %mask element equals zero, and is zero otherwise.
; The two RUN configurations generate different code for this shuffle, so the
; expected assembly is recorded once per configuration.
1974 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) {
1975 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
1976 ; CHECK-FAST: # %bb.0:
1977 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1]
1978 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
1979 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
1980 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1981 ; CHECK-FAST-NEXT: retq
1983 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
1984 ; CHECK-FAST-PERLANE: # %bb.0:
1985 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1986 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
1987 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
1988 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,1]
1989 ; CHECK-FAST-PERLANE-NEXT: retq
1990 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
1991 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1992 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked case: each result lane takes the <6,3,6,3> shuffle of %vec
; where the matching %mask element equals zero, and the %vec2 lane otherwise.
; The two RUN configurations generate different code for this shuffle, so the
; expected assembly is recorded once per configuration.
1995 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1996 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
1997 ; CHECK-FAST: # %bb.0:
1998 ; CHECK-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,3,6,3]
1999 ; CHECK-FAST-NEXT: # ymm3 = mem[0,1,0,1]
2000 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0
2001 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
2002 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2003 ; CHECK-FAST-NEXT: retq
2005 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
2006 ; CHECK-FAST-PERLANE: # %bb.0:
2007 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2008 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
2009 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
2010 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
2011 ; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0
2012 ; CHECK-FAST-PERLANE-NEXT: retq
2013 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
2014 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2015 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked case: each result lane takes the <6,3,6,3> shuffle of %vec
; where the matching %mask element equals zero, and is zero otherwise.
; The two RUN configurations generate different code for this shuffle, so the
; expected assembly is recorded once per configuration.
2019 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) {
2020 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
2021 ; CHECK-FAST: # %bb.0:
2022 ; CHECK-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,3,6,3]
2023 ; CHECK-FAST-NEXT: # ymm2 = mem[0,1,0,1]
2024 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2025 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2026 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2027 ; CHECK-FAST-NEXT: retq
2029 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
2030 ; CHECK-FAST-PERLANE: # %bb.0:
2031 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2032 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
2033 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2034 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
2035 ; CHECK-FAST-PERLANE-NEXT: retq
2036 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
2037 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2038 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked case: picks elements <6,0,0,7> of the <8 x i64> argument into a
; <4 x i64> result. The two RUN configurations generate different code for
; this shuffle, so the expected assembly is recorded once per configuration.
2041 define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
2042 ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask3:
2043 ; CHECK-FAST: # %bb.0:
2044 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [6,0,0,7]
2045 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
2046 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2047 ; CHECK-FAST-NEXT: retq
2049 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask3:
2050 ; CHECK-FAST-PERLANE: # %bb.0:
2051 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1
2052 ; CHECK-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2053 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,3]
2054 ; CHECK-FAST-PERLANE-NEXT: retq
2055 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
; Merge-masked case: each result lane takes the <6,0,0,7> shuffle of %vec
; where the matching %mask element equals zero, and the %vec2 lane otherwise.
; The two RUN configurations generate different code for this shuffle, so the
; expected assembly is recorded once per configuration.
2058 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2059 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
2060 ; CHECK-FAST: # %bb.0:
2061 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,0,0,7]
2062 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0
2063 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
2064 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2065 ; CHECK-FAST-NEXT: retq
2067 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
2068 ; CHECK-FAST-PERLANE: # %bb.0:
2069 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2070 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
2071 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
2072 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,0,3]
2073 ; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0
2074 ; CHECK-FAST-PERLANE-NEXT: retq
2075 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
2076 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2077 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked case: each result lane takes the <6,0,0,7> shuffle of %vec
; where the matching %mask element equals zero, and is zero otherwise.
; The two RUN configurations generate different code for this shuffle, so the
; expected assembly is recorded once per configuration.
2081 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) {
2082 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
2083 ; CHECK-FAST: # %bb.0:
2084 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7]
2085 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2086 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2087 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2088 ; CHECK-FAST-NEXT: retq
2090 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
2091 ; CHECK-FAST-PERLANE: # %bb.0:
2092 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2093 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2094 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2095 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,0,3]
2096 ; CHECK-FAST-PERLANE-NEXT: retq
2097 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
2098 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2099 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked case: each result lane takes the <3,7,7,5> shuffle of %vec
; where the matching %mask element equals zero, and the %vec2 lane otherwise.
; The two RUN configurations generate different code for this shuffle, so the
; expected assembly is recorded once per configuration.
2102 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2103 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
2104 ; CHECK-FAST: # %bb.0:
2105 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,7,7,5]
2106 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0
2107 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
2108 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2109 ; CHECK-FAST-NEXT: retq
2111 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
2112 ; CHECK-FAST-PERLANE: # %bb.0:
2113 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2114 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
2115 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
2116 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,3,1]
2117 ; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0
2118 ; CHECK-FAST-PERLANE-NEXT: retq
2119 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
2120 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2121 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked case: each result lane takes the <3,7,7,5> shuffle of %vec
; where the matching %mask element equals zero, and is zero otherwise.
; The two RUN configurations generate different code for this shuffle, so the
; expected assembly is recorded once per configuration.
2125 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {
2126 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
2127 ; CHECK-FAST: # %bb.0:
2128 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,7,5]
2129 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2130 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2131 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2132 ; CHECK-FAST-NEXT: retq
2134 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
2135 ; CHECK-FAST-PERLANE: # %bb.0:
2136 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2137 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
2138 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2139 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,3,1]
2140 ; CHECK-FAST-PERLANE-NEXT: retq
2141 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
2142 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2143 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Masked partial permute: picks elements <4,1,0,6> of %vec into a <4 x i64>;
; lanes where %mask is zero take the shuffle result, the others take %vec2.
; Both RUN lines share the CHECK expectations.
2146 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2147 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5:
2149 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,1,0,6]
2150 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
2151 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2152 ; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2154 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
2155 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2156 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked partial permute: picks elements <4,1,0,6> of %vec into a
; <4 x i64>; lanes where %mask is zero take the shuffle result, the others are
; zeroed. Both RUN lines share the CHECK expectations.
2160 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) {
2161 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5:
2163 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,1,0,6]
2164 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2165 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2166 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2168 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
2169 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2170 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked partial permute: picks elements <7,6,5,3> of %vec into a <4 x i64>.
; Expected code differs per RUN line (CHECK-FAST vs CHECK-FAST-PERLANE).
2173 define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
2174 ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask6:
2175 ; CHECK-FAST: # %bb.0:
2176 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,3]
2177 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
2178 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2179 ; CHECK-FAST-NEXT: retq
2181 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask6:
2182 ; CHECK-FAST-PERLANE: # %bb.0:
2183 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1
2184 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,1,3]
2185 ; CHECK-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2186 ; CHECK-FAST-PERLANE-NEXT: retq
2187 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
; Masked partial permute: picks elements <7,6,5,3> of %vec into a <4 x i64>;
; lanes where %mask is zero take the shuffle result, the others take %vec2.
; Expected code differs per RUN line (CHECK-FAST vs CHECK-FAST-PERLANE).
2190 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2191 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
2192 ; CHECK-FAST: # %bb.0:
2193 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [7,6,5,3]
2194 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0
2195 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
2196 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2197 ; CHECK-FAST-NEXT: retq
2199 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
2200 ; CHECK-FAST-PERLANE: # %bb.0:
2201 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2202 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,2,1,3]
2203 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
2204 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
2205 ; CHECK-FAST-PERLANE-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2206 ; CHECK-FAST-PERLANE-NEXT: retq
2207 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
2208 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2209 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked partial permute: picks elements <7,6,5,3> of %vec into a
; <4 x i64>; lanes where %mask is zero take the shuffle result, the others are
; zeroed. Expected code differs per RUN line (CHECK-FAST vs CHECK-FAST-PERLANE).
2213 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) {
2214 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
2215 ; CHECK-FAST: # %bb.0:
2216 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,3]
2217 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2218 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2219 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2220 ; CHECK-FAST-NEXT: retq
2222 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
2223 ; CHECK-FAST-PERLANE: # %bb.0:
2224 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2225 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,2,1,3]
2226 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
2227 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2228 ; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
2229 ; CHECK-FAST-PERLANE-NEXT: retq
2230 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
2231 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2232 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Masked partial permute: picks elements <2,0,3,4> of %vec into a <4 x i64>;
; lanes where %mask is zero take the shuffle result, the others take %vec2.
; Both RUN lines share the CHECK expectations.
2235 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2236 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:
2238 ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3
2239 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,4]
2240 ; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
2241 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2242 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
2244 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
2245 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2246 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked partial permute: picks elements <2,0,3,4> of %vec into a
; <4 x i64>; lanes where %mask is zero take the shuffle result, the others are
; zeroed. Both RUN lines share the CHECK expectations.
2249 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
2250 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
2252 ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm2
2253 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [2,0,3,4]
2254 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2255 ; CHECK-NEXT: vpermt2q %ymm2, %ymm3, %ymm0 {%k1} {z}
2256 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2258 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
2259 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2260 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked partial permute: picks elements <3,0> of %vec into a <2 x i64>.
; Both RUN lines share the CHECK expectations.
2264 define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
2265 ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mask0:
2267 ; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
2268 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2269 ; CHECK-NEXT: vzeroupper
2271 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
; Masked partial permute: picks elements <3,0> of %vec into a <2 x i64>;
; lanes where %mask is zero take the shuffle result, the others take %vec2.
; Both RUN lines share the CHECK expectations.
2274 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
2275 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0:
2277 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
2278 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
2279 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
2280 ; CHECK-NEXT: vzeroupper
2282 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
2283 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2284 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked partial permute: picks elements <3,0> of %vec into a <2 x i64>;
; lanes where %mask is zero take the shuffle result, the others are zeroed.
; Both RUN lines share the CHECK expectations.
2287 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) {
2288 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0:
2290 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2291 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,2,3,7,4,6,7]
2292 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2293 ; CHECK-NEXT: vzeroupper
2295 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
2296 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2297 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Masked partial permute: picks elements <6,5> of %vec into a <2 x i64>;
; lanes where %mask is zero take the shuffle result, the others take %vec2.
; Both RUN lines share the CHECK expectations.
2301 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
2302 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
2304 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2305 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
2306 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
2307 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
2308 ; CHECK-NEXT: vzeroupper
2310 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
2311 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2312 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked partial permute: picks elements <6,5> of %vec into a <2 x i64>;
; lanes where %mask is zero take the shuffle result, the others are zeroed.
; Both RUN lines share the CHECK expectations.
2315 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %mask) {
2316 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
2318 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2319 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2320 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3]
2321 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2322 ; CHECK-NEXT: vzeroupper
2324 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
2325 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2326 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked partial permute of a memory operand: loads an <8 x i64> from %vp and
; picks elements <0,2,0,2> into a <4 x i64>.
; Both RUN lines share the CHECK expectations.
2330 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask0(ptr %vp) {
2331 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask0:
2333 ; CHECK-NEXT: vpermpd $136, (%rdi), %ymm0 # ymm0 = mem[0,2,0,2]
2335 %vec = load <8 x i64>, ptr %vp
2336 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
; Masked partial permute of a memory operand: loads an <8 x i64> from %vp and
; picks elements <0,2,0,2>; lanes where %mask is zero take the shuffle result,
; the others take %vec2. Both RUN lines share the CHECK expectations.
2339 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2340 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask0:
2342 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2343 ; CHECK-NEXT: vpermq $136, (%rdi), %ymm0 {%k1} # ymm0 {%k1} = mem[0,2,0,2]
2345 %vec = load <8 x i64>, ptr %vp
2346 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2347 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2348 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked partial permute of a memory operand: loads an <8 x i64> from %vp
; and picks elements <0,2,0,2>; lanes where %mask is zero take the shuffle
; result, the others are zeroed. Both RUN lines share the CHECK expectations.
2351 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64> %mask) {
2352 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask0:
2354 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2355 ; CHECK-NEXT: vpermq $136, (%rdi), %ymm0 {%k1} {z} # ymm0 {%k1} {z} = mem[0,2,0,2]
2357 %vec = load <8 x i64>, ptr %vp
2358 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2359 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2360 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Masked partial permute of a memory operand: loads an <8 x i64> from %vp and
; picks elements <0,7,6,0>; lanes where %mask is zero take the shuffle result,
; the others take %vec2.
; Expected code differs per RUN line (CHECK-FAST vs CHECK-FAST-PERLANE).
2364 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2365 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
2366 ; CHECK-FAST: # %bb.0:
2367 ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
2368 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,4]
2369 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
2370 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2371 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2372 ; CHECK-FAST-NEXT: retq
2374 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
2375 ; CHECK-FAST-PERLANE: # %bb.0:
2376 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2
2377 ; CHECK-FAST-PERLANE-NEXT: vpblendd $15, (%rdi), %ymm2, %ymm2 # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
2378 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2379 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,3,2,0]
2380 ; CHECK-FAST-PERLANE-NEXT: retq
2381 %vec = load <8 x i64>, ptr %vp
2382 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
2383 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2384 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked partial permute of a memory operand: loads an <8 x i64> from %vp
; and picks elements <0,7,6,0>; lanes where %mask is zero take the shuffle
; result, the others are zeroed.
; Expected code differs per RUN line (CHECK-FAST vs CHECK-FAST-PERLANE).
2388 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %mask) {
2389 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
2390 ; CHECK-FAST: # %bb.0:
2391 ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
2392 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,4]
2393 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
2394 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2395 ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
2396 ; CHECK-FAST-NEXT: retq
2398 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
2399 ; CHECK-FAST-PERLANE: # %bb.0:
2400 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
2401 ; CHECK-FAST-PERLANE-NEXT: vpblendd $15, (%rdi), %ymm1, %ymm1 # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
2402 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
2403 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,3,2,0]
2404 ; CHECK-FAST-PERLANE-NEXT: retq
2405 %vec = load <8 x i64>, ptr %vp
2406 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
2407 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2408 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Masked partial permute of a memory operand: loads an <8 x i64> from %vp and
; picks elements <7,1,1,5>; lanes where %mask is zero take the shuffle result,
; the others take %vec2.
; Expected code differs per RUN line (CHECK-FAST vs CHECK-FAST-PERLANE).
2412 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2413 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
2414 ; CHECK-FAST: # %bb.0:
2415 ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
2416 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,5,5,1]
2417 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
2418 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2419 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2420 ; CHECK-FAST-NEXT: retq
2422 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
2423 ; CHECK-FAST-PERLANE: # %bb.0:
2424 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2
2425 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
2426 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2427 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,0]
2428 ; CHECK-FAST-PERLANE-NEXT: retq
2429 %vec = load <8 x i64>, ptr %vp
2430 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
2431 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2432 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked partial permute of a memory operand: loads an <8 x i64> from %vp
; and picks elements <7,1,1,5>; lanes where %mask is zero take the shuffle
; result, the others are zeroed.
; Expected code differs per RUN line (CHECK-FAST vs CHECK-FAST-PERLANE).
2436 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %mask) {
2437 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
2438 ; CHECK-FAST: # %bb.0:
2439 ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
2440 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [3,5,5,1]
2441 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
2442 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2443 ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
2444 ; CHECK-FAST-NEXT: retq
2446 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
2447 ; CHECK-FAST-PERLANE: # %bb.0:
2448 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
2449 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
2450 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
2451 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,0]
2452 ; CHECK-FAST-PERLANE-NEXT: retq
2453 %vec = load <8 x i64>, ptr %vp
2454 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
2455 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2456 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked partial permute of a memory operand: loads an <8 x i64> from %vp and
; picks elements <7,0,0,2> into a <4 x i64>.
; Expected code differs per RUN line (CHECK-FAST vs CHECK-FAST-PERLANE).
2460 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(ptr %vp) {
2461 ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
2462 ; CHECK-FAST: # %bb.0:
2463 ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm1
2464 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,0,2]
2465 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0
2466 ; CHECK-FAST-NEXT: retq
2468 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
2469 ; CHECK-FAST-PERLANE: # %bb.0:
2470 ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
2471 ; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm0, %ymm0 # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
2472 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,1,3]
2473 ; CHECK-FAST-PERLANE-NEXT: retq
2474 %vec = load <8 x i64>, ptr %vp
2475 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
; Masked partial permute of a memory operand: loads an <8 x i64> from %vp and
; picks elements <7,0,0,2>; lanes where %mask is zero take the shuffle result,
; the others take %vec2.
; Expected code differs per RUN line (CHECK-FAST vs CHECK-FAST-PERLANE).
2478 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2479 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
2480 ; CHECK-FAST: # %bb.0:
2481 ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
2482 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [7,0,0,2]
2483 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2484 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2485 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2486 ; CHECK-FAST-NEXT: retq
2488 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
2489 ; CHECK-FAST-PERLANE: # %bb.0:
2490 ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2
2491 ; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm2, %ymm2 # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
2492 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2493 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,3]
2494 ; CHECK-FAST-PERLANE-NEXT: retq
2495 %vec = load <8 x i64>, ptr %vp
2496 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
2497 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2498 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked partial permute of a memory operand: loads an <8 x i64> from %vp
; and picks elements <7,0,0,2>; lanes where %mask is zero take the shuffle
; result, the others are zeroed.
; Expected code differs per RUN line (CHECK-FAST vs CHECK-FAST-PERLANE).
2502 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %mask) {
2503 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
2504 ; CHECK-FAST: # %bb.0:
2505 ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
2506 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,0,2]
2507 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
2508 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2509 ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
2510 ; CHECK-FAST-NEXT: retq
2512 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
2513 ; CHECK-FAST-PERLANE: # %bb.0:
2514 ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1
2515 ; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm1, %ymm1 # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
2516 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
2517 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,3]
2518 ; CHECK-FAST-PERLANE-NEXT: retq
2519 %vec = load <8 x i64>, ptr %vp
2520 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
2521 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2522 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Masked partial permute of a memory operand: loads an <8 x i64> from %vp and
; picks elements <0,4,6,1>; lanes where %mask is zero take the shuffle result,
; the others take %vec2. Both RUN lines share the CHECK expectations.
2526 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2527 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4:
2529 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2530 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,6,1]
2531 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2532 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2533 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2535 %vec = load <8 x i64>, ptr %vp
2536 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
2537 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2538 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked partial permute of a memory operand: loads an <8 x i64> from %vp
; and picks elements <0,4,6,1>; lanes where %mask is zero take the shuffle
; result, the others are zeroed. Both RUN lines share the CHECK expectations.
2542 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %mask) {
2543 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4:
2545 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2546 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,6,1]
2547 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2548 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2549 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2551 %vec = load <8 x i64>, ptr %vp
2552 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
2553 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2554 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Masked partial permute of a memory operand: loads an <8 x i64> from %vp and
; picks elements <0,2,7,1>; lanes where %mask is zero take the shuffle result,
; the others take %vec2.
; Expected code differs per RUN line (CHECK-FAST vs CHECK-FAST-PERLANE).
2558 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2559 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
2560 ; CHECK-FAST: # %bb.0:
2561 ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
2562 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,7,1]
2563 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2564 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2565 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2566 ; CHECK-FAST-NEXT: retq
2568 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
2569 ; CHECK-FAST-PERLANE: # %bb.0:
2570 ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2
2571 ; CHECK-FAST-PERLANE-NEXT: vpblendd $192, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
2572 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2573 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,2,3,1]
2574 ; CHECK-FAST-PERLANE-NEXT: retq
2575 %vec = load <8 x i64>, ptr %vp
2576 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
2577 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2578 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked partial permute of a memory operand: loads an <8 x i64> from %vp
; and picks elements <0,2,7,1>; lanes where %mask is zero take the shuffle
; result, the others are zeroed.
; Expected code differs per RUN line (CHECK-FAST vs CHECK-FAST-PERLANE).
2582 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %mask) {
2583 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
2584 ; CHECK-FAST: # %bb.0:
2585 ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
2586 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,7,1]
2587 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
2588 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2589 ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
2590 ; CHECK-FAST-NEXT: retq
2592 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
2593 ; CHECK-FAST-PERLANE: # %bb.0:
2594 ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1
2595 ; CHECK-FAST-PERLANE-NEXT: vpblendd $192, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
2596 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
2597 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,2,3,1]
2598 ; CHECK-FAST-PERLANE-NEXT: retq
2599 %vec = load <8 x i64>, ptr %vp
2600 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
2601 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2602 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked partial permute of a memory operand: loads an <8 x i64> from %vp and
; picks elements <7,2,3,2> into a <4 x i64>.
; Both RUN lines share the CHECK expectations.
2606 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(ptr %vp) {
2607 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6:
2609 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
2610 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,2,3,2]
2611 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0
2613 %vec = load <8 x i64>, ptr %vp
2614 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
; Masked partial permute of a memory operand: loads an <8 x i64> from %vp and
; picks elements <7,2,3,2>; lanes where %mask is zero take the shuffle result,
; the others take %vec2. Both RUN lines share the CHECK expectations.
2617 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2618 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6:
2620 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2621 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,2,3,2]
2622 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2623 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2624 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2626 %vec = load <8 x i64>, ptr %vp
2627 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
2628 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2629 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked partial permute of a memory operand: loads an <8 x i64> from %vp
; and picks elements <7,2,3,2>; lanes where %mask is zero take the shuffle
; result, the others are zeroed. Both RUN lines share the CHECK expectations.
2633 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %mask) {
2634 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:
2636 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2637 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,2,3,2]
2638 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2639 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2640 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2642 %vec = load <8 x i64>, ptr %vp
2643 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
2644 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2645 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Masked partial permute of a memory operand: loads an <8 x i64> from %vp and
; picks elements <7,7,5,1>; lanes where %mask is zero take the shuffle result,
; the others take %vec2.
; Expected code differs per RUN line (CHECK-FAST vs CHECK-FAST-PERLANE).
2649 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2650 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
2651 ; CHECK-FAST: # %bb.0:
2652 ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
2653 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,1,5]
2654 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
2655 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2656 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2657 ; CHECK-FAST-NEXT: retq
2659 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
2660 ; CHECK-FAST-PERLANE: # %bb.0:
2661 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2
2662 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
2663 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2664 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,2,0,1]
2665 ; CHECK-FAST-PERLANE-NEXT: retq
2666 %vec = load <8 x i64>, ptr %vp
2667 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
2668 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2669 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked partial permute of a memory operand: loads an <8 x i64> from %vp
; and picks elements <7,7,5,1>; lanes where %mask is zero take the shuffle
; result, the others are zeroed.
; Expected code differs per RUN line (CHECK-FAST vs CHECK-FAST-PERLANE).
2673 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %mask) {
2674 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
2675 ; CHECK-FAST: # %bb.0:
2676 ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
2677 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,1,5]
2678 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
2679 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2680 ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
2681 ; CHECK-FAST-NEXT: retq
2683 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
2684 ; CHECK-FAST-PERLANE: # %bb.0:
2685 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
2686 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
2687 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
2688 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,2,0,1]
2689 ; CHECK-FAST-PERLANE-NEXT: retq
2690 %vec = load <8 x i64>, ptr %vp
2691 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
2692 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2693 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked partial permute of a memory operand: loads an <8 x i64> from %vp and
; picks elements <4,1> into a <2 x i64>.
; Expected code differs per RUN line (CHECK-FAST vs CHECK-FAST-PERLANE).
2697 define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
2698 ; CHECK-FAST-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
2699 ; CHECK-FAST: # %bb.0:
2700 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm0 = [4,1]
2701 ; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0
2702 ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2703 ; CHECK-FAST-NEXT: vzeroupper
2704 ; CHECK-FAST-NEXT: retq
2706 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
2707 ; CHECK-FAST-PERLANE: # %bb.0:
2708 ; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm0
2709 ; CHECK-FAST-PERLANE-NEXT: vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3]
2710 ; CHECK-FAST-PERLANE-NEXT: retq
2711 %vec = load <8 x i64>, ptr %vp
2712 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
; Masked partial permute of a memory operand: loads an <8 x i64> from %vp and
; picks elements <4,1>; lanes where %mask is zero take the shuffle result, the
; others take %vec2.
; Expected code differs per RUN line (CHECK-FAST vs CHECK-FAST-PERLANE).
2715 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
2716 ; CHECK-FAST-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
2717 ; CHECK-FAST: # %bb.0:
2718 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,1]
2719 ; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm2
2720 ; CHECK-FAST-NEXT: vptestnmq %xmm1, %xmm1, %k1
2721 ; CHECK-FAST-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
2722 ; CHECK-FAST-NEXT: vzeroupper
2723 ; CHECK-FAST-NEXT: retq
2725 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
2726 ; CHECK-FAST-PERLANE: # %bb.0:
2727 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2
2728 ; CHECK-FAST-PERLANE-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
2729 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %xmm1, %xmm1, %k1
2730 ; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
2731 ; CHECK-FAST-PERLANE-NEXT: retq
2732 %vec = load <8 x i64>, ptr %vp
2733 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
2734 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2735 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked partial permute of a memory operand: loads an <8 x i64> from %vp
; and picks elements <4,1>; lanes where %mask is zero take the shuffle result,
; the others are zeroed.
; Expected code differs per RUN line (CHECK-FAST vs CHECK-FAST-PERLANE).
2739 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) {
2740 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
2741 ; CHECK-FAST: # %bb.0:
2742 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1]
2743 ; CHECK-FAST-NEXT: vptestnmq %xmm0, %xmm0, %k1
2744 ; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
2745 ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2746 ; CHECK-FAST-NEXT: vzeroupper
2747 ; CHECK-FAST-NEXT: retq
2749 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
2750 ; CHECK-FAST-PERLANE: # %bb.0:
2751 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm1
2752 ; CHECK-FAST-PERLANE-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
2753 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %xmm0, %xmm0, %k1
2754 ; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
2755 ; CHECK-FAST-PERLANE-NEXT: retq
2756 %vec = load <8 x i64>, ptr %vp
2757 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
2758 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2759 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Masked partial permute of a memory operand: loads an <8 x i64> from %vp and
; picks elements <6,2>; lanes where %mask is zero take the shuffle result, the
; others take %vec2. Both RUN lines share the CHECK expectations.
2763 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
2764 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
2766 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
2767 ; CHECK-NEXT: vpunpcklqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
2768 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
2769 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2770 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
2771 ; CHECK-NEXT: vzeroupper
2773 %vec = load <8 x i64>, ptr %vp
2774 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
2775 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2776 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked partial permute of a memory operand: loads an <8 x i64> from %vp
; and picks elements <6,2>; lanes where %mask is zero take the shuffle result,
; the others are zeroed. Both RUN lines share the CHECK expectations.
2780 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) {
2781 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
2783 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
2784 ; CHECK-NEXT: vpunpcklqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
2785 ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
2786 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
2787 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
2788 ; CHECK-NEXT: vzeroupper
2790 %vec = load <8 x i64>, ptr %vp
2791 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
2792 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2793 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked partial permute: picks elements <0,3,4,5> of the <8 x float> %vec
; and returns them as a <4 x float>.
; Both RUN lines share the CHECK expectations.
2797 define <4 x float> @test_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec) {
2798 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask0:
2800 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
2801 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,1]
2802 ; CHECK-NEXT: vzeroupper
2804 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2805 ret <4 x float> %res
; Merge-masked variant of the <0, 3, 4, 5> narrowing shuffle: a lane takes
; the shuffled value where %mask compares ordered-equal to 0.0, else %vec2.
; Expected assembly: the select is folded into a merge-masked vshufps that
; writes over %vec2 (xmm1), which is then copied to the return register.
2807 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2808 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0:
2810 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
2811 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
2812 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
2813 ; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,3],xmm3[0,1]
2814 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
2815 ; CHECK-NEXT: vzeroupper
2817 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2818 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2819 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2820 ret <4 x float> %res
; Zero-masked variant of the <0, 3, 4, 5> narrowing shuffle: a lane takes the
; shuffled value where %mask compares ordered-equal to 0.0, else zero.
; Expected assembly: the select is folded into a zero-masked ({z}) vshufps.
2823 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %mask) {
2824 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0:
2826 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
2827 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2828 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
2829 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3],xmm2[0,1]
2830 ; CHECK-NEXT: vzeroupper
2832 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2833 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2834 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2835 ret <4 x float> %res
; Merge-masked narrowing shuffle selecting elements <1, 3, 5, 0> of an
; <8 x float>; lanes where %mask is not ordered-equal to 0.0 come from %vec2.
; Expected assembly: a 256-bit vpermps with index constant [1,3,5,0], then a
; separate vblendmps under the compare mask (the blend is not folded).
2837 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2838 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1:
2840 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,0]
2841 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0
2842 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2843 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
2844 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
2845 ; CHECK-NEXT: vzeroupper
2847 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
2848 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2849 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2850 ret <4 x float> %res
; Zero-masked narrowing shuffle selecting elements <1, 3, 5, 0> of an
; <8 x float>; lanes where %mask is not ordered-equal to 0.0 become zero.
; Expected assembly: the select is folded into a zero-masked 256-bit vpermps
; with index constant [1,3,5,0]; the result is read from the low xmm half.
2853 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) {
2854 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1:
2856 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,0]
2857 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2858 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
2859 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
2860 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2861 ; CHECK-NEXT: vzeroupper
2863 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
2864 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2865 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2866 ret <4 x float> %res
; Merge-masked narrowing shuffle selecting elements <3, 2, 7, 0> of an
; <8 x float>; lanes where %mask is not ordered-equal to 0.0 come from %vec2.
; Expected assembly: 256-bit vpermps with index constant [3,2,7,0], then a
; vblendmps under the compare mask.
2868 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2869 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2:
2871 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,2,7,0]
2872 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0
2873 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2874 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
2875 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
2876 ; CHECK-NEXT: vzeroupper
2878 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
2879 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2880 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2881 ret <4 x float> %res
; Zero-masked narrowing shuffle selecting elements <3, 2, 7, 0> of an
; <8 x float>; lanes where %mask is not ordered-equal to 0.0 become zero.
; Expected assembly: zero-masked 256-bit vpermps with index constant
; [3,2,7,0]; the result is read from the low xmm half.
2884 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) {
2885 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2:
2887 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,2,7,0]
2888 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2889 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
2890 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
2891 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2892 ; CHECK-NEXT: vzeroupper
2894 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
2895 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2896 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2897 ret <4 x float> %res
; Unmasked baseline: narrows an <8 x float> argument to <4 x float> by
; selecting elements <3, 3, 5, 2>.
; Expected assembly: one 256-bit vpermps with index constant [3,3,5,2]; the
; result is read from the low xmm half.
2899 define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) {
2900 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3:
2902 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,5,2]
2903 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
2904 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2905 ; CHECK-NEXT: vzeroupper
2907 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2908 ret <4 x float> %res
; Merge-masked narrowing shuffle selecting elements <3, 3, 5, 2> of an
; <8 x float>; lanes where %mask is not ordered-equal to 0.0 come from %vec2.
; Expected assembly: 256-bit vpermps with index constant [3,3,5,2], then a
; vblendmps under the compare mask.
2910 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2911 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3:
2913 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,3,5,2]
2914 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0
2915 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2916 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
2917 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
2918 ; CHECK-NEXT: vzeroupper
2920 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2921 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2922 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2923 ret <4 x float> %res
; Zero-masked narrowing shuffle selecting elements <3, 3, 5, 2> of an
; <8 x float>; lanes where %mask is not ordered-equal to 0.0 become zero.
; Expected assembly: zero-masked 256-bit vpermps with index constant
; [3,3,5,2]; the result is read from the low xmm half.
2926 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) {
2927 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3:
2929 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,5,2]
2930 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2931 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
2932 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
2933 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2934 ; CHECK-NEXT: vzeroupper
2936 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2937 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2938 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2939 ret <4 x float> %res
; Unmasked baseline, memory source: selects elements <6, 2, 4, 5> of the
; <8 x float> loaded from %vp.
; Expected assembly: the upper 128 bits (offset 16) are loaded into a
; register and the lower 128 bits are folded as the memory operand of a
; 128-bit vpermi2ps, so the index constant is rebased to [2,6,0,1].
2941 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
2942 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0:
2944 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
2945 ; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2,6,0,1]
2946 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm1, %xmm0
2948 %vec = load <8 x float>, ptr %vp
2949 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2950 ret <4 x float> %res
; Merge-masked, memory source: selects elements <6, 2, 4, 5> of the
; <8 x float> loaded from %vp; lanes where %mask is not ordered-equal to 0.0
; come from %vec2.
; Expected assembly: 128-bit vpermi2ps over the two halves (rebased index
; constant [2,6,0,1]), then a merge-masked vmovaps into xmm0.
2952 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
2953 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0:
2955 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2956 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,0,1]
2957 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3
2958 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
2959 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
2960 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
2962 %vec = load <8 x float>, ptr %vp
2963 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2964 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2965 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2966 ret <4 x float> %res
; Zero-masked, memory source: selects elements <6, 2, 4, 5> of the
; <8 x float> loaded from %vp; lanes where %mask is not ordered-equal to 0.0
; become zero.
; Expected assembly: the select is folded into a zero-masked 128-bit
; vpermi2ps (rebased index constant [2,6,0,1]); the result is then copied
; from the index register to xmm0.
2969 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) {
2970 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0:
2972 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2973 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,6,0,1]
2974 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2975 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
2976 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
2977 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
2979 %vec = load <8 x float>, ptr %vp
2980 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2981 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2982 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2983 ret <4 x float> %res
; Merge-masked, memory source: selects elements <6, 3, 3, 6> of the
; <8 x float> loaded from %vp; lanes where %mask is not ordered-equal to 0.0
; come from %vec2.
; Expected assembly: upper half loaded to a register, lower half folded into
; a 128-bit vpermi2ps (rebased index constant [2,7,7,2]), then a merge-masked
; vmovaps into xmm0.
2986 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
2987 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
2989 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2990 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,7,7,2]
2991 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3
2992 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
2993 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
2994 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
2996 %vec = load <8 x float>, ptr %vp
2997 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
2998 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2999 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3000 ret <4 x float> %res
; Zero-masked, memory source: selects elements <6, 3, 3, 6> of the
; <8 x float> loaded from %vp; lanes where %mask is not ordered-equal to 0.0
; become zero.
; Expected assembly: zero-masked 128-bit vpermi2ps (rebased index constant
; [2,7,7,2]) followed by a copy of the result to xmm0.
3003 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) {
3004 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
3006 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
3007 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,7,7,2]
3008 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3009 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3010 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
3011 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3013 %vec = load <8 x float>, ptr %vp
3014 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
3015 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3016 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3017 ret <4 x float> %res
; Merge-masked, memory source: selects elements <3, 1, 3, 7> of the
; <8 x float> loaded from %vp; lanes where %mask is not ordered-equal to 0.0
; come from %vec2.
; Expected assembly: lower half loaded to a register, upper half (offset 16)
; folded into a 128-bit vpermi2ps with index constant [3,1,3,7], then a
; merge-masked vmovaps into xmm0.
3020 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
3021 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2:
3023 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
3024 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,1,3,7]
3025 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
3026 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3027 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3028 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3030 %vec = load <8 x float>, ptr %vp
3031 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
3032 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3033 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3034 ret <4 x float> %res
; Zero-masked, memory source: selects elements <3, 1, 3, 7> of the
; <8 x float> loaded from %vp; lanes where %mask is not ordered-equal to 0.0
; become zero.
; Expected assembly: zero-masked 128-bit vpermi2ps with index constant
; [3,1,3,7] and the upper half as memory operand, then a copy to xmm0.
3037 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
3038 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2:
3040 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
3041 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,1,3,7]
3042 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3043 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3044 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
3045 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3047 %vec = load <8 x float>, ptr %vp
3048 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
3049 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3050 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3051 ret <4 x float> %res
; Unmasked baseline, memory source: selects elements <1, 3, 5, 3> of the
; <8 x float> loaded from %vp.
; Expected assembly: lower half loaded to a register, upper half (offset 16)
; folded into a 128-bit vpermi2ps with index constant [1,3,5,3].
3054 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
3055 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3:
3057 ; CHECK-NEXT: vmovaps (%rdi), %xmm1
3058 ; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,3,5,3]
3059 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0
3061 %vec = load <8 x float>, ptr %vp
3062 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
3063 ret <4 x float> %res
; Merge-masked, memory source: selects elements <1, 3, 5, 3> of the
; <8 x float> loaded from %vp; lanes where %mask is not ordered-equal to 0.0
; come from %vec2.
; Expected assembly: 128-bit vpermi2ps with index constant [1,3,5,3] and the
; upper half as memory operand, then a merge-masked vmovaps into xmm0.
3065 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
3066 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3:
3068 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
3069 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,3]
3070 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
3071 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3072 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3073 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3075 %vec = load <8 x float>, ptr %vp
3076 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
3077 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3078 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3079 ret <4 x float> %res
; Zero-masked, memory source: selects elements <1, 3, 5, 3> of the
; <8 x float> loaded from %vp; lanes where %mask is not ordered-equal to 0.0
; become zero.
; Expected assembly: zero-masked 128-bit vpermi2ps with index constant
; [1,3,5,3] and the upper half as memory operand, then a copy to xmm0.
3082 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) {
3083 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3:
3085 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
3086 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [1,3,5,3]
3087 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3088 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3089 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
3090 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3092 %vec = load <8 x float>, ptr %vp
3093 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
3094 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3095 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3096 ret <4 x float> %res
; Unmasked baseline: narrows a <16 x float> argument to <8 x float> by
; selecting elements <0, 4, 12, 10, 8, 2, 11, 7>.
; Expected assembly: one 512-bit vpermps with a ymm-sized index constant
; equal to the shuffle mask; the result is read from the low ymm half.
3099 define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) {
3100 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask0:
3102 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,12,10,8,2,11,7]
3103 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
3104 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3106 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
3107 ret <8 x float> %res
; Merge-masked narrowing shuffle selecting elements
; <0, 4, 12, 10, 8, 2, 11, 7> of a <16 x float>; lanes where %mask is not
; ordered-equal to 0.0 come from %vec2.
; Expected assembly: 512-bit vpermps, then a 256-bit vblendmps under the
; compare mask.
3109 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3110 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0:
3112 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,12,10,8,2,11,7]
3113 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3114 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3115 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
3116 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
3118 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
3119 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3120 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3121 ret <8 x float> %res
; Zero-masked narrowing shuffle selecting elements
; <0, 4, 12, 10, 8, 2, 11, 7> of a <16 x float>; lanes where %mask is not
; ordered-equal to 0.0 become zero.
; Expected assembly: the select is folded into a zero-masked 512-bit vpermps;
; the result is read from the low ymm half.
3124 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %mask) {
3125 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask0:
3127 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7]
3128 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3129 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
3130 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3131 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3133 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
3134 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3135 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3136 ret <8 x float> %res
; Merge-masked narrowing shuffle selecting elements
; <10, 12, 3, 12, 4, 15, 1, 14> of a <16 x float>; lanes where %mask is not
; ordered-equal to 0.0 come from %vec2.
; Expected assembly: 512-bit vpermps, then a 256-bit vblendmps under the
; compare mask.
3138 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3139 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1:
3141 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [10,12,3,12,4,15,1,14]
3142 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3143 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3144 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
3145 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
3147 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
3148 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3149 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3150 ret <8 x float> %res
; Zero-masked narrowing shuffle selecting elements
; <10, 12, 3, 12, 4, 15, 1, 14> of a <16 x float>; lanes where %mask is not
; ordered-equal to 0.0 become zero.
; Expected assembly: zero-masked 512-bit vpermps; the result is read from the
; low ymm half.
3153 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %mask) {
3154 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask1:
3156 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [10,12,3,12,4,15,1,14]
3157 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3158 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
3159 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3160 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3162 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
3163 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3164 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3165 ret <8 x float> %res
; Merge-masked narrowing shuffle selecting elements
; <0, 4, 8, 9, 6, 1, 4, 4> of a <16 x float>; lanes where %mask is not
; ordered-equal to 0.0 come from %vec2.
; Expected assembly: 512-bit vpermps, then a 256-bit vblendmps under the
; compare mask.
3167 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3168 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2:
3170 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4]
3171 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3172 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3173 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
3174 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
3176 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
3177 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3178 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3179 ret <8 x float> %res
; Zero-masked narrowing shuffle selecting elements
; <0, 4, 8, 9, 6, 1, 4, 4> of a <16 x float>; lanes where %mask is not
; ordered-equal to 0.0 become zero.
; Expected assembly: zero-masked 512-bit vpermps; the result is read from the
; low ymm half.
3182 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) {
3183 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2:
3185 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4]
3186 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3187 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
3188 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3189 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3191 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
3192 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3193 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3194 ret <8 x float> %res
; Unmasked baseline: narrows a <16 x float> argument to <8 x float> by
; selecting elements <12, 14, 9, 0, 12, 4, 5, 8>.
; Expected assembly: one 512-bit vpermps with a ymm-sized index constant
; equal to the shuffle mask; the result is read from the low ymm half.
3196 define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) {
3197 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask3:
3199 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [12,14,9,0,12,4,5,8]
3200 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
3201 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3203 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3204 ret <8 x float> %res
; Merge-masked narrowing shuffle selecting elements
; <12, 14, 9, 0, 12, 4, 5, 8> of a <16 x float>; lanes where %mask is not
; ordered-equal to 0.0 come from %vec2.
; Expected assembly: 512-bit vpermps, then a 256-bit vblendmps under the
; compare mask.
3206 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3207 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3:
3209 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [12,14,9,0,12,4,5,8]
3210 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3211 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3212 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
3213 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
3215 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3216 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3217 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3218 ret <8 x float> %res
; Zero-masked narrowing shuffle selecting elements
; <12, 14, 9, 0, 12, 4, 5, 8> of a <16 x float>; lanes where %mask is not
; ordered-equal to 0.0 become zero.
; Expected assembly: zero-masked 512-bit vpermps; the result is read from the
; low ymm half.
3221 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %mask) {
3222 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask3:
3224 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [12,14,9,0,12,4,5,8]
3225 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3226 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
3227 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3228 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3230 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3231 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3232 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3233 ret <8 x float> %res
; Unmasked baseline: narrows a <16 x float> argument to <4 x float> by
; selecting elements <4, 8, 9, 10>.
; Expected assembly: one 512-bit vpermps with an xmm-sized index constant
; [4,8,9,10]; the result is read from the low xmm part.
3235 define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) {
3236 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0:
3238 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,8,9,10]
3239 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
3240 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3241 ; CHECK-NEXT: vzeroupper
3243 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3244 ret <4 x float> %res
; Merge-masked narrowing shuffle selecting elements <4, 8, 9, 10> of a
; <16 x float>; lanes where %mask is not ordered-equal to 0.0 come from %vec2.
; Expected assembly: 512-bit vpermps, then a 128-bit vblendmps under the
; compare mask.
3246 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3247 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0:
3249 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [4,8,9,10]
3250 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3251 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3252 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3253 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3254 ; CHECK-NEXT: vzeroupper
3256 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3257 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3258 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3259 ret <4 x float> %res
; Zero-masked narrowing shuffle selecting elements <4, 8, 9, 10> of a
; <16 x float>; lanes where %mask is not ordered-equal to 0.0 become zero.
; Expected assembly: zero-masked 512-bit vpermps; the result is read from the
; low xmm part.
3262 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) {
3263 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0:
3265 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [4,8,9,10]
3266 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3267 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
3268 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3269 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3270 ; CHECK-NEXT: vzeroupper
3272 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3273 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3274 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3275 ret <4 x float> %res
; Merge-masked narrowing shuffle selecting elements <8, 6, 10, 6> of a
; <16 x float>; lanes where %mask is not ordered-equal to 0.0 come from %vec2.
; The two RUN configurations are expected to differ here:
;  - with fast cross-lane shuffles enabled, a single 512-bit vpermps with
;    index constant [8,6,10,6] is used;
;  - with only fast per-lane shuffles, the two needed 128-bit pieces are
;    extracted and combined by a 128-bit vpermi2ps with the rebased index
;    constant [0,6,2,6].
; Both finish with a vblendmps under the compare mask.
3277 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3278 ; CHECK-FAST-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
3279 ; CHECK-FAST: # %bb.0:
3280 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm3 = [8,6,10,6]
3281 ; CHECK-FAST-NEXT: vpermps %zmm0, %zmm3, %zmm0
3282 ; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3
3283 ; CHECK-FAST-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3284 ; CHECK-FAST-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3285 ; CHECK-FAST-NEXT: vzeroupper
3286 ; CHECK-FAST-NEXT: retq
3288 ; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
3289 ; CHECK-FAST-PERLANE: # %bb.0:
3290 ; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm3
3291 ; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0
3292 ; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = [0,6,2,6]
3293 ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm0, %xmm3, %xmm4
3294 ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm0, %xmm0, %xmm0
3295 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm0, %xmm2, %k1
3296 ; CHECK-FAST-PERLANE-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1}
3297 ; CHECK-FAST-PERLANE-NEXT: vzeroupper
3298 ; CHECK-FAST-PERLANE-NEXT: retq
3299 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
3300 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3301 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3302 ret <4 x float> %res
; Zero-masked narrowing shuffle selecting elements <8, 6, 10, 6> of a
; <16 x float>; lanes where %mask is not ordered-equal to 0.0 become zero.
; The two RUN configurations are expected to differ here:
;  - with fast cross-lane shuffles enabled, a zero-masked 512-bit vpermps
;    with index constant [8,6,10,6] is used;
;  - with only fast per-lane shuffles, the two needed 128-bit pieces are
;    extracted and combined by a zero-masked 128-bit vpermi2ps with the
;    rebased index constant [0,6,2,6].
3305 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
3306 ; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
3307 ; CHECK-FAST: # %bb.0:
3308 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [8,6,10,6]
3309 ; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3
3310 ; CHECK-FAST-NEXT: vcmpeqps %xmm3, %xmm1, %k1
3311 ; CHECK-FAST-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3312 ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3313 ; CHECK-FAST-NEXT: vzeroupper
3314 ; CHECK-FAST-NEXT: retq
3316 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
3317 ; CHECK-FAST-PERLANE: # %bb.0:
3318 ; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm2
3319 ; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm3
3320 ; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,2,6]
3321 ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4
3322 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm4, %xmm1, %k1
3323 ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z}
3324 ; CHECK-FAST-PERLANE-NEXT: vzeroupper
3325 ; CHECK-FAST-PERLANE-NEXT: retq
3326 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
3327 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3328 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3329 ret <4 x float> %res
; Merge-masked narrowing shuffle selecting elements <12, 12, 4, 5> of a
; <16 x float>; lanes where %mask is not ordered-equal to 0.0 come from %vec2.
; Expected assembly: no variable permute; the upper 256 bits are extracted,
; combined with the lower 256 bits by an immediate vshufps, and the select is
; folded into a merge-masked vextractf32x4 of the upper 128-bit lane.
3331 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3332 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2:
3334 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3335 ; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[0,1],ymm3[4,4],ymm0[4,5]
3336 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3337 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3338 ; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1}
3339 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3340 ; CHECK-NEXT: vzeroupper
3342 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
3343 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3344 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3345 ret <4 x float> %res
; Zero-masked narrowing shuffle selecting elements <12, 12, 4, 5> of a
; <16 x float>; lanes where %mask is not ordered-equal to 0.0 become zero.
; Expected assembly: upper-256 extract plus immediate vshufps, with the
; select folded into a zero-masked vextractf32x4 of the upper 128-bit lane.
3348 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %mask) {
3349 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2:
3351 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
3352 ; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5]
3353 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3354 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3355 ; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
3356 ; CHECK-NEXT: vzeroupper
3358 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
3359 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3360 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3361 ret <4 x float> %res
; Unmasked baseline: narrows a <16 x float> argument to <4 x float> by
; selecting elements <10, 2, 11, 6>.
; Expected assembly: one 512-bit vpermps with index constant [10,2,11,6]; the
; result is read from the low xmm part.
3363 define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) {
3364 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3:
3366 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [10,2,11,6]
3367 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
3368 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3369 ; CHECK-NEXT: vzeroupper
3371 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3372 ret <4 x float> %res
; Merge-masked narrowing shuffle selecting elements <10, 2, 11, 6> of a
; <16 x float>; lanes where %mask is not ordered-equal to 0.0 come from %vec2.
; Expected assembly: 512-bit vpermps, then a 128-bit vblendmps under the
; compare mask.
3374 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3375 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3:
3377 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [10,2,11,6]
3378 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3379 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3380 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3381 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3382 ; CHECK-NEXT: vzeroupper
3384 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3385 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3386 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3387 ret <4 x float> %res
; Zero-masked narrowing shuffle selecting elements <10, 2, 11, 6> of a
; <16 x float>; lanes where %mask is not ordered-equal to 0.0 become zero.
; Expected assembly: zero-masked 512-bit vpermps; the result is read from the
; low xmm part.
3390 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) {
3391 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3:
3393 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [10,2,11,6]
3394 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3395 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
3396 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3397 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3398 ; CHECK-NEXT: vzeroupper
3400 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3401 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3402 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3403 ret <4 x float> %res
; Unmasked baseline, memory source: selects elements
; <7, 6, 7, 11, 5, 10, 0, 4> of the <16 x float> loaded from %vp.
; Expected assembly: lower 256 bits loaded to a register, upper 256 bits
; (offset 32) folded into a 256-bit vpermi2ps whose index constant equals the
; shuffle mask.
3405 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp) {
3406 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0:
3408 ; CHECK-NEXT: vmovaps (%rdi), %ymm1
3409 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4]
3410 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm0
3412 %vec = load <16 x float>, ptr %vp
3413 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3414 ret <8 x float> %res
; Merge-masked, memory source: selects elements
; <7, 6, 7, 11, 5, 10, 0, 4> of the <16 x float> loaded from %vp; lanes where
; %mask is not ordered-equal to 0.0 come from %vec2.
; Expected assembly: 256-bit vpermi2ps with the upper half as memory operand,
; then a merge-masked vmovaps into ymm0.
3416 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
3417 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0:
3419 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3420 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,6,7,11,5,10,0,4]
3421 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
3422 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3423 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3424 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3426 %vec = load <16 x float>, ptr %vp
3427 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3428 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3429 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3430 ret <8 x float> %res
; Zero-masked, memory source: selects elements
; <7, 6, 7, 11, 5, 10, 0, 4> of the <16 x float> loaded from %vp; lanes where
; %mask is not ordered-equal to 0.0 become zero.
; Expected assembly: zero-masked 256-bit vpermi2ps with the upper half as
; memory operand, then a copy of the result to ymm0.
3433 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %mask) {
3434 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0:
3436 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3437 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4]
3438 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3439 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3440 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
3441 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
3443 %vec = load <16 x float>, ptr %vp
3444 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3445 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3446 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3447 ret <8 x float> %res
; Merge-masked, memory source: selects elements
; <11, 0, 9, 0, 7, 14, 0, 8> of the <16 x float> loaded from %vp; lanes where
; %mask is not ordered-equal to 0.0 come from %vec2.
; Expected assembly: 256-bit vpermi2ps with the upper half as memory operand,
; then a merge-masked vmovaps into ymm0.
3450 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
3451 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1:
3453 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3454 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [11,0,9,0,7,14,0,8]
3455 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
3456 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3457 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3458 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3460 %vec = load <16 x float>, ptr %vp
3461 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
3462 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3463 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3464 ret <8 x float> %res
; Zero-masked, memory source: selects elements
; <11, 0, 9, 0, 7, 14, 0, 8> of the <16 x float> loaded from %vp; lanes where
; %mask is not ordered-equal to 0.0 become zero.
; Expected assembly: zero-masked 256-bit vpermi2ps with the upper half as
; memory operand, then a copy of the result to ymm0.
3467 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %mask) {
3468 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1:
3470 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3471 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8]
3472 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3473 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3474 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
3475 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
3477 %vec = load <16 x float>, ptr %vp
3478 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
3479 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3480 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3481 ret <8 x float> %res
; Merge-masked, memory source: lanes <1,13,10,11,10,0,0,9> of the <16 x float>
; loaded from %vp are selected where %mask compares oeq to zero; the remaining
; result lanes are taken from %vec2.
; The expected code differs between the two RUN configurations, so it is
; checked under the separate CHECK-FAST and CHECK-FAST-PERLANE prefixes.
3484 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
3485 ; CHECK-FAST-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
3486 ; CHECK-FAST: # %bb.0:
3487 ; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2
3488 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1]
3489 ; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
3490 ; CHECK-FAST-NEXT: vxorps %xmm2, %xmm2, %xmm2
3491 ; CHECK-FAST-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3492 ; CHECK-FAST-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3493 ; CHECK-FAST-NEXT: retq
3495 ; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
3496 ; CHECK-FAST-PERLANE: # %bb.0:
3497 ; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2
3498 ; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3
3499 ; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm4 = [9,5,2,3,2,8,8,1]
3500 ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4
3501 ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm2, %xmm2, %xmm2
3502 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3503 ; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm0 {%k1}
3504 ; CHECK-FAST-PERLANE-NEXT: retq
3505 %vec = load <16 x float>, ptr %vp
3506 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
3507 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3508 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3509 ret <8 x float> %res
; Zero-masked, memory source: lanes <1,13,10,11,10,0,0,9> of the <16 x float>
; loaded from %vp are selected where %mask compares oeq to zero; the remaining
; result lanes are zero.
; The expected code differs between the two RUN configurations, so it is
; checked under the separate CHECK-FAST and CHECK-FAST-PERLANE prefixes.
3512 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %mask) {
3513 ; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
3514 ; CHECK-FAST: # %bb.0:
3515 ; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2
3516 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1]
3517 ; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3
3518 ; CHECK-FAST-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3519 ; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3520 ; CHECK-FAST-NEXT: vmovaps %ymm1, %ymm0
3521 ; CHECK-FAST-NEXT: retq
3523 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
3524 ; CHECK-FAST-PERLANE: # %bb.0:
3525 ; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2
3526 ; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3
3527 ; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1]
3528 ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4
3529 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm4, %ymm0, %k1
3530 ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z}
3531 ; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm0
3532 ; CHECK-FAST-PERLANE-NEXT: retq
3533 %vec = load <16 x float>, ptr %vp
3534 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
3535 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3536 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3537 ret <8 x float> %res
; Unmasked, memory source: returns lanes <15,13,11,11,3,12,4,1> of the
; <16 x float> loaded from %vp as an <8 x float>.
3540 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp) {
3541 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3:
3543 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
3544 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9]
3545 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm0
3547 %vec = load <16 x float>, ptr %vp
3548 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3549 ret <8 x float> %res
; Merge-masked, memory source: lanes <15,13,11,11,3,12,4,1> of the <16 x float>
; loaded from %vp are selected where %mask compares oeq to zero; the remaining
; result lanes are taken from %vec2.
3551 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %vec2, <8 x float> %mask) {
3552 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3:
3554 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3555 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9]
3556 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
3557 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3558 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3559 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3561 %vec = load <16 x float>, ptr %vp
3562 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3563 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3564 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3565 ret <8 x float> %res
; Zero-masked, memory source: lanes <15,13,11,11,3,12,4,1> of the <16 x float>
; loaded from %vp are selected where %mask compares oeq to zero; the remaining
; result lanes are zero.
3568 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %mask) {
3569 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3:
3571 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3572 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9]
3573 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3574 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3575 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3576 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
3578 %vec = load <16 x float>, ptr %vp
3579 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3580 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3581 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3582 ret <8 x float> %res
; Unmasked, memory source: returns lanes <14,6,7,11> of the <16 x float>
; loaded from %vp as a <4 x float>.
3585 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
3586 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0:
3588 ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm1 # ymm1 = mem[3,1,2,3]
3589 ; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,7,3]
3590 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0
3591 ; CHECK-NEXT: vzeroupper
3593 %vec = load <16 x float>, ptr %vp
3594 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3595 ret <4 x float> %res
; Merge-masked, memory source: lanes <14,6,7,11> of the <16 x float> loaded
; from %vp are selected where %mask compares oeq to zero; the remaining result
; lanes are taken from %vec2.
3597 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
3598 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0:
3600 ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3]
3601 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [0,6,7,3]
3602 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
3603 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3604 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3605 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3606 ; CHECK-NEXT: vzeroupper
3608 %vec = load <16 x float>, ptr %vp
3609 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3610 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3611 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3612 ret <4 x float> %res
; Zero-masked, memory source: lanes <14,6,7,11> of the <16 x float> loaded
; from %vp are selected where %mask compares oeq to zero; the remaining result
; lanes are zero.
3615 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) {
3616 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0:
3618 ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3]
3619 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,7,3]
3620 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3621 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3622 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
3623 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3624 ; CHECK-NEXT: vzeroupper
3626 %vec = load <16 x float>, ptr %vp
3627 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3628 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3629 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3630 ret <4 x float> %res
; Merge-masked, memory source: lanes <8,2,14,7> of the <16 x float> loaded
; from %vp are selected where %mask compares oeq to zero; the remaining result
; lanes are taken from %vec2.
3633 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
3634 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1:
3636 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3637 ; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,10,6,15,0,10,6,15]
3638 ; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
3639 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
3640 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3641 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3642 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3643 ; CHECK-NEXT: vzeroupper
3645 %vec = load <16 x float>, ptr %vp
3646 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
3647 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3648 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3649 ret <4 x float> %res
; Zero-masked, memory source: lanes <8,2,14,7> of the <16 x float> loaded
; from %vp are selected where %mask compares oeq to zero; the remaining result
; lanes are zero.
3652 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) {
3653 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1:
3655 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3656 ; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,10,6,15,0,10,6,15]
3657 ; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
3658 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3659 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3660 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3661 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3662 ; CHECK-NEXT: vzeroupper
3664 %vec = load <16 x float>, ptr %vp
3665 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
3666 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3667 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3668 ret <4 x float> %res
; Merge-masked, memory source: lanes <12,6,12,6> of the <16 x float> loaded
; from %vp are selected where %mask compares oeq to zero; the remaining result
; lanes are taken from %vec2.
3671 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
3672 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
3674 ; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [4,14,4,14]
3675 ; CHECK-NEXT: # xmm2 = mem[0,0]
3676 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm3
3677 ; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm3
3678 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3679 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3680 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3681 ; CHECK-NEXT: vzeroupper
3683 %vec = load <16 x float>, ptr %vp
3684 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
3685 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3686 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3687 ret <4 x float> %res
; Zero-masked, memory source: lanes <12,6,12,6> of the <16 x float> loaded
; from %vp are selected where %mask compares oeq to zero; the remaining result
; lanes are zero.
3690 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
3691 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
3693 ; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [4,14,4,14]
3694 ; CHECK-NEXT: # xmm2 = mem[0,0]
3695 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
3696 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3697 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3698 ; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3699 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3700 ; CHECK-NEXT: vzeroupper
3702 %vec = load <16 x float>, ptr %vp
3703 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
3704 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3705 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3706 ret <4 x float> %res
; Unmasked, memory source: returns lanes <3,3,15,9> of the <16 x float>
; loaded from %vp as a <4 x float>.
3709 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
3710 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3:
3712 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,15,9]
3713 ; CHECK-NEXT: vmovaps (%rdi), %ymm0
3714 ; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm1, %ymm0
3715 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3716 ; CHECK-NEXT: vzeroupper
3718 %vec = load <16 x float>, ptr %vp
3719 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3720 ret <4 x float> %res
; Merge-masked, memory source: lanes <3,3,15,9> of the <16 x float> loaded
; from %vp are selected where %mask compares oeq to zero; the remaining result
; lanes are taken from %vec2.
3722 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
3723 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
3725 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9]
3726 ; CHECK-NEXT: vmovaps (%rdi), %ymm3
3727 ; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm3
3728 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3729 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3730 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3731 ; CHECK-NEXT: vzeroupper
3733 %vec = load <16 x float>, ptr %vp
3734 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3735 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3736 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3737 ret <4 x float> %res
; Zero-masked, memory source: lanes <3,3,15,9> of the <16 x float> loaded
; from %vp are selected where %mask compares oeq to zero; the remaining result
; lanes are zero.
3740 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) {
3741 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
3743 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9]
3744 ; CHECK-NEXT: vmovaps (%rdi), %ymm1
3745 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3746 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3747 ; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
3748 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3749 ; CHECK-NEXT: vzeroupper
3751 %vec = load <16 x float>, ptr %vp
3752 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3753 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3754 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3755 ret <4 x float> %res
; Unmasked, register source: returns lanes <2,0> of the <4 x double> argument
; %vec as a <2 x double>.
3758 define <2 x double> @test_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec) {
3759 ; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mask0:
3761 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
3762 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3763 ; CHECK-NEXT: vzeroupper
3765 %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3766 ret <2 x double> %res
; Merge-masked, register source: lanes <2,0> of %vec are selected where %mask
; compares oeq to zero; the remaining result lanes are taken from %vec2.
3768 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3769 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask0:
3771 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
3772 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3773 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
3774 ; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
3775 ; CHECK-NEXT: vzeroupper
3777 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3778 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3779 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3780 ret <2 x double> %res
; Zero-masked, register source: lanes <2,0> of %vec are selected where %mask
; compares oeq to zero; the remaining result lanes are zero.
3783 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %mask) {
3784 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask0:
3786 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3787 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
3788 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3]
3789 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3790 ; CHECK-NEXT: vzeroupper
3792 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3793 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3794 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3795 ret <2 x double> %res
; Merge-masked, register source: lanes <1,3> of %vec are selected where %mask
; compares oeq to zero; the remaining result lanes are taken from %vec2.
3797 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3798 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask1:
3800 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
3801 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3802 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
3803 ; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
3804 ; CHECK-NEXT: vzeroupper
3806 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
3807 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3808 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3809 ret <2 x double> %res
; Zero-masked, register source: lanes <1,3> of %vec are selected where %mask
; compares oeq to zero; the remaining result lanes are zero.
3812 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %mask) {
3813 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask1:
3815 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3816 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
3817 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,2,3]
3818 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3819 ; CHECK-NEXT: vzeroupper
3821 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
3822 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3823 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3824 ret <2 x double> %res
; Unmasked, memory source: returns lanes <2,1> of the <4 x double> loaded
; from %vp as a <2 x double>.
3826 define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
3827 ; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0:
3829 ; CHECK-NEXT: vmovaps (%rdi), %xmm0
3830 ; CHECK-NEXT: vblendps $3, 16(%rdi), %xmm0, %xmm0 # xmm0 = mem[0,1],xmm0[2,3]
3832 %vec = load <4 x double>, ptr %vp
3833 %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3834 ret <2 x double> %res
; Merge-masked, memory source: lanes <2,1> of the <4 x double> loaded from
; %vp are selected where %mask compares oeq to zero; the remaining result
; lanes are taken from %vec2.
3836 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
3837 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask0:
3839 ; CHECK-NEXT: vmovapd (%rdi), %xmm2
3840 ; CHECK-NEXT: vblendpd $1, 16(%rdi), %xmm2, %xmm2 # xmm2 = mem[0],xmm2[1]
3841 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3842 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
3843 ; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1}
3845 %vec = load <4 x double>, ptr %vp
3846 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3847 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3848 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3849 ret <2 x double> %res
; Zero-masked, memory source: lanes <2,1> of the <4 x double> loaded from
; %vp are selected where %mask compares oeq to zero; the remaining result
; lanes are zero.
3852 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %mask) {
3853 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0:
3855 ; CHECK-NEXT: vmovapd (%rdi), %xmm1
3856 ; CHECK-NEXT: vblendpd $1, 16(%rdi), %xmm1, %xmm1 # xmm1 = mem[0],xmm1[1]
3857 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3858 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
3859 ; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z}
3861 %vec = load <4 x double>, ptr %vp
3862 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3863 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3864 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3865 ret <2 x double> %res
; Merge-masked, memory source: lanes <2,0> of the <4 x double> loaded from
; %vp are selected where %mask compares oeq to zero; the remaining result
; lanes are taken from %vec2.
3868 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
3869 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1:
3871 ; CHECK-NEXT: vmovapd 16(%rdi), %xmm2
3872 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3873 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
3874 ; CHECK-NEXT: vunpcklpd (%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
3876 %vec = load <4 x double>, ptr %vp
3877 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3878 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3879 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3880 ret <2 x double> %res
; Zero-masked, memory source: lanes <2,0> of the <4 x double> loaded from
; %vp are selected where %mask compares oeq to zero; the remaining result
; lanes are zero.
3883 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) {
3884 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1:
3886 ; CHECK-NEXT: vmovapd 16(%rdi), %xmm1
3887 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3888 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
3889 ; CHECK-NEXT: vunpcklpd (%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
3891 %vec = load <4 x double>, ptr %vp
3892 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3893 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3894 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3895 ret <2 x double> %res
; Unmasked, register source: returns lanes <7,3,7,3> of the <8 x double>
; argument %vec as a <4 x double>.
3898 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
3899 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0:
3901 ; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,3,7,3]
3902 ; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
3903 ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
3904 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3906 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3907 ret <4 x double> %res
; Merge-masked, register source: lanes <7,3,7,3> of %vec are selected where
; %mask compares oeq to zero; the remaining result lanes are taken from %vec2.
3909 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3910 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
3912 ; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [7,3,7,3]
3913 ; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
3914 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
3915 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3916 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3917 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
3919 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3920 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3921 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3922 ret <4 x double> %res
; Zero-masked, register source: lanes <7,3,7,3> of %vec are selected where
; %mask compares oeq to zero; the remaining result lanes are zero.
3925 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) {
3926 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
3928 ; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [7,3,7,3]
3929 ; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
3930 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3931 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
3932 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3933 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3935 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3936 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3937 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3938 ret <4 x double> %res
; Merge-masked, register source: lanes <2,0,7,6> of %vec are selected where
; %mask compares oeq to zero; the remaining result lanes are taken from %vec2.
3940 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3941 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1:
3943 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,0,7,6]
3944 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
3945 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3946 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3947 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
3949 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
3950 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3951 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3952 ret <4 x double> %res
; Zero-masked, register source: lanes <2,0,7,6> of %vec are selected where
; %mask compares oeq to zero; the remaining result lanes are zero.
3955 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %mask) {
3956 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask1:
3958 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,0,7,6]
3959 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3960 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
3961 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3962 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3964 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
3965 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3966 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3967 ret <4 x double> %res
; Merge-masked, register source: lanes <2,3,2,0> of %vec (all from its low
; four elements) are selected where %mask compares oeq to zero; the remaining
; result lanes are taken from %vec2.
3969 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3970 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask2:
3972 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3973 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3974 ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,0]
3975 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
3977 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
3978 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3979 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3980 ret <4 x double> %res
; Zero-masked, register source: lanes <2,3,2,0> of %vec (all from its low
; four elements) are selected where %mask compares oeq to zero; the remaining
; result lanes are zero.
3983 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %mask) {
3984 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask2:
3986 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3987 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
3988 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,0]
3990 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
3991 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3992 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3993 ret <4 x double> %res
; Unmasked, register source: returns lanes <0,2,1,4> of the <8 x double>
; argument %vec as a <4 x double>.
3995 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) {
3996 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask3:
3998 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,1,4]
3999 ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
4000 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
4002 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
4003 ret <4 x double> %res
; Merge-masked, register source: lanes <0,2,1,4> of %vec are selected where
; %mask compares oeq to zero; the remaining result lanes are taken from %vec2.
4005 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
4006 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3:
4008 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,2,1,4]
4009 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
4010 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4011 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
4012 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
4014 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
4015 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4016 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4017 ret <4 x double> %res
; Zero-masked, register source: lanes <0,2,1,4> of %vec are selected where
; %mask compares oeq to zero; the remaining result lanes are zero.
4020 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %mask) {
4021 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask3:
4023 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,1,4]
4024 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4025 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4026 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4027 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
4029 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
4030 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4031 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4032 ret <4 x double> %res
; Merge-masked, register source: lanes <1,1,5,5> of %vec are selected where
; %mask compares oeq to zero; the remaining result lanes are taken from %vec2.
; The expected code differs between the two RUN configurations, so it is
; checked under the separate CHECK-FAST and CHECK-FAST-PERLANE prefixes.
4034 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
4035 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
4036 ; CHECK-FAST: # %bb.0:
4037 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm3 = [1,5]
4038 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm0
4039 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4040 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
4041 ; CHECK-FAST-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
4042 ; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
4043 ; CHECK-FAST-NEXT: retq
4045 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
4046 ; CHECK-FAST-PERLANE: # %bb.0:
4047 ; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm3
4048 ; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
4049 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4050 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
4051 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
4052 ; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0
4053 ; CHECK-FAST-PERLANE-NEXT: retq
4054 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
4055 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4056 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4057 ret <4 x double> %res
; Zero-masked, register source: lanes <1,1,5,5> of %vec are selected where
; %mask compares oeq to zero; the remaining result lanes are zero.
; The expected code differs between the two RUN configurations, so it is
; checked under the separate CHECK-FAST and CHECK-FAST-PERLANE prefixes.
4060 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
4061 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
4062 ; CHECK-FAST: # %bb.0:
4063 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm2 = [1,5]
4064 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0
4065 ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4066 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4067 ; CHECK-FAST-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
4068 ; CHECK-FAST-NEXT: retq
4070 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
4071 ; CHECK-FAST-PERLANE: # %bb.0:
4072 ; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm2
4073 ; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
4074 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4075 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4076 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
4077 ; CHECK-FAST-PERLANE-NEXT: retq
4078 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
4079 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4080 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4081 ret <4 x double> %res
; Merge-masked, register source: lanes <2,6,2,2> of %vec are selected where
; %mask compares oeq to zero; the remaining result lanes are taken from %vec2.
4083 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
4084 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5:
4086 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,6,2,2]
4087 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
4088 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4089 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
4090 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
4092 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
4093 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4094 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4095 ret <4 x double> %res
; Zero-masked, register source: lanes <2,6,2,2> of %vec are selected where
; %mask compares oeq to zero; the remaining result lanes are zero.
4098 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) {
4099 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5:
4101 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,6,2,2]
4102 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4103 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4104 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4105 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
4107 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
4108 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4109 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4110 ret <4 x double> %res
; Unmasked, register source: returns lanes <5,0,7,0> of the <8 x double>
; argument %vec as a <4 x double>.
; The expected code differs between the two RUN configurations, so it is
; checked under the separate CHECK-FAST and CHECK-FAST-PERLANE prefixes.
4112 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
4113 ; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
4114 ; CHECK-FAST: # %bb.0:
4115 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [5,8,7,8]
4116 ; CHECK-FAST-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0
4117 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
4118 ; CHECK-FAST-NEXT: retq
4120 ; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
4121 ; CHECK-FAST-PERLANE: # %bb.0:
4122 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1
4123 ; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
4124 ; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
4125 ; CHECK-FAST-PERLANE-NEXT: retq
4126 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
4127 ret <4 x double> %res
; Merge-masked, register source: lanes <5,0,7,0> of %vec are selected where
; %mask compares oeq to zero; the remaining result lanes are taken from %vec2.
; The expected code differs between the two RUN configurations, so it is
; checked under the separate CHECK-FAST and CHECK-FAST-PERLANE prefixes.
4129 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
4130 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
4131 ; CHECK-FAST: # %bb.0:
4132 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [5,8,7,8]
4133 ; CHECK-FAST-NEXT: vpermi2pd %zmm0, %zmm0, %zmm3
4134 ; CHECK-FAST-NEXT: vxorpd %xmm0, %xmm0, %xmm0
4135 ; CHECK-FAST-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
4136 ; CHECK-FAST-NEXT: vblendmpd %ymm3, %ymm1, %ymm0 {%k1}
4137 ; CHECK-FAST-NEXT: retq
4139 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
4140 ; CHECK-FAST-PERLANE: # %bb.0:
4141 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4142 ; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
4143 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
4144 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
4145 ; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
4146 ; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0
4147 ; CHECK-FAST-PERLANE-NEXT: retq
4148 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
4149 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4150 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4151 ret <4 x double> %res
; Zero-masked form of the <5, 0, 7, 0> partial permute: lanes where %mask
; compares ordered-equal to 0.0 receive the permuted value, all other lanes
; are zero. Both runs expect the {z} mask folded into the shuffle instruction.
4154 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
4155 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
4156 ; CHECK-FAST: # %bb.0:
4157 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [5,8,7,8]
4158 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4159 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4160 ; CHECK-FAST-NEXT: vpermt2pd %zmm0, %zmm2, %zmm0 {%k1} {z}
4161 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
4162 ; CHECK-FAST-NEXT: retq
4164 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
4165 ; CHECK-FAST-PERLANE: # %bb.0:
4166 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4167 ; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
4168 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4169 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4170 ; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
4171 ; CHECK-FAST-PERLANE-NEXT: retq
4172 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
4173 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4174 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4175 ret <4 x double> %res
; Merge-masked partial permute selecting elements 3, 5, 0, 6 of the
; <8 x double> argument; unselected lanes keep %vec2.
; The cross-lane-fast run expects a single-source variable vpermpd followed by
; a blend. The per-lane-only run expects the lower half rearranged to
; [0,3,0,3] and combined with the extracted upper half by a masked vshufpd.
4177 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
4178 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
4179 ; CHECK-FAST: # %bb.0:
4180 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,5,0,6]
4181 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm0
4182 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4183 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
4184 ; CHECK-FAST-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
4185 ; CHECK-FAST-NEXT: retq
4187 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
4188 ; CHECK-FAST-PERLANE: # %bb.0:
4189 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4190 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,3]
4191 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
4192 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
4193 ; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1],ymm3[1],ymm0[2],ymm3[2]
4194 ; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0
4195 ; CHECK-FAST-PERLANE-NEXT: retq
4196 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
4197 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4198 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4199 ret <4 x double> %res
; Zero-masked partial permute selecting elements 3, 5, 0, 6 of the
; <8 x double> argument; lanes whose %mask element is not ordered-equal to 0.0
; become zero. Both runs expect the {z} mask folded into the last shuffle.
4202 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) {
4203 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
4204 ; CHECK-FAST: # %bb.0:
4205 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [3,5,0,6]
4206 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4207 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4208 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4209 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
4210 ; CHECK-FAST-NEXT: retq
4212 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
4213 ; CHECK-FAST-PERLANE: # %bb.0:
4214 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4215 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,3]
4216 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4217 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4218 ; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm2[1],ymm0[2],ymm2[2]
4219 ; CHECK-FAST-PERLANE-NEXT: retq
4220 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
4221 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4222 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4223 ret <4 x double> %res
; Unmasked partial permute to <2 x double>: picks elements 0 and 6 of the
; <8 x double> argument. Both runs share one expectation: the top 128 bits are
; reached with two extracts and joined with the low element via vmovlhps.
4225 define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
4226 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
4228 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
4229 ; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
4230 ; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4231 ; CHECK-NEXT: vzeroupper
4233 %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
4234 ret <2 x double> %res
; Merge-masked <0, 6> partial permute to <2 x double>: lanes where %mask is
; ordered-equal to 0.0 take the permuted value, others keep %vec2.
; The write mask is expected to be folded into vunpcklpd, which writes into the
; register holding %vec2 before the result is copied to xmm0.
4236 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
4237 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
4239 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4240 ; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm3
4241 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
4242 ; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
4243 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
4244 ; CHECK-NEXT: vmovapd %xmm1, %xmm0
4245 ; CHECK-NEXT: vzeroupper
4247 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
4248 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4249 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4250 ret <2 x double> %res
; Zero-masked <0, 6> partial permute to <2 x double>: lanes where %mask is not
; ordered-equal to 0.0 become zero. The {z} mask is expected on vunpcklpd.
4253 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
4254 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
4256 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4257 ; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2
4258 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4259 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4260 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
4261 ; CHECK-NEXT: vzeroupper
4263 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
4264 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4265 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4266 ret <2 x double> %res
; Merge-masked partial permute to <2 x double> picking elements 3 and 7;
; unselected lanes keep %vec2. Both runs expect a 512-bit variable vpermpd with
; the index vector [3,7], followed by a 128-bit masked blend.
4268 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
4269 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask1:
4271 ; CHECK-NEXT: vmovapd {{.*#+}} xmm3 = [3,7]
4272 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
4273 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4274 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
4275 ; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
4276 ; CHECK-NEXT: vzeroupper
4278 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
4279 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4280 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4281 ret <2 x double> %res
; Zero-masked partial permute to <2 x double> picking elements 3 and 7;
; unselected lanes become zero. The {z} mask is expected directly on the
; 512-bit vpermpd, whose low 128 bits form the result.
4284 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %mask) {
4285 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask1:
4287 ; CHECK-NEXT: vmovapd {{.*#+}} xmm2 = [3,7]
4288 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4289 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4290 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4291 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
4292 ; CHECK-NEXT: vzeroupper
4294 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
4295 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4296 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4297 ret <2 x double> %res
; Unmasked partial permute of a loaded <8 x double>: selects elements 1, 6, 7, 2.
; Expected code loads the low 256 bits into a register and feeds the high
; 256 bits (32(%rdi)) to vpermi2pd as the second table, so the index constant
; equals the IR shuffle mask.
4299 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp) {
4300 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0:
4302 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
4303 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [1,6,7,2]
4304 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
4306 %vec = load <8 x double>, ptr %vp
4307 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4308 ret <4 x double> %res
; Merge-masked <1, 6, 7, 2> partial permute of a loaded <8 x double>; lanes
; where %mask is not ordered-equal to 0.0 keep %vec2.
; The permute is expected unmasked into a scratch register, then merged into
; the register holding %vec2 (ymm0) with a masked vmovapd.
4310 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4311 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0:
4313 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4314 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,6,7,2]
4315 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
4316 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4317 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4318 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4320 %vec = load <8 x double>, ptr %vp
4321 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4322 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4323 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4324 ret <4 x double> %res
; Zero-masked <1, 6, 7, 2> partial permute of a loaded <8 x double>; lanes
; where %mask is not ordered-equal to 0.0 become zero.
; The {z} mask is expected on vpermi2pd itself; because that instruction
; overwrites its index register, a final vmovapd copies the result to ymm0.
4327 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 x double> %mask) {
4328 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0:
4330 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4331 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,6,7,2]
4332 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4333 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4334 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4335 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4337 %vec = load <8 x double>, ptr %vp
4338 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4339 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4340 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4341 ret <4 x double> %res
; Merge-masked partial permute of a loaded <8 x double> selecting elements
; 3, 4, 2, 4; unselected lanes keep %vec2.
; In the cross-lane-fast run the second table is 32(%rdi){1to4}, i.e. element 4
; broadcast to every slot, so the index constant [3,4,2,6] still yields
; element 4 in lanes 1 and 3 although it differs textually from the IR mask.
; The per-lane-only run expects an immediate vpermpd from memory plus a masked
; vshufpd against the same broadcast operand.
4344 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4345 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
4346 ; CHECK-FAST: # %bb.0:
4347 ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
4348 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,4,2,6]
4349 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3
4350 ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4351 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4352 ; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4353 ; CHECK-FAST-NEXT: retq
4355 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
4356 ; CHECK-FAST-PERLANE: # %bb.0:
4357 ; CHECK-FAST-PERLANE-NEXT: vpermpd $236, (%rdi), %ymm2 # ymm2 = mem[0,3,2,3]
4358 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4359 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4360 ; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 32(%rdi){1to4}, %ymm2, %ymm0 {%k1}
4361 ; CHECK-FAST-PERLANE-NEXT: retq
4362 %vec = load <8 x double>, ptr %vp
4363 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
4364 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4365 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4366 ret <4 x double> %res
; Zero-masked partial permute of a loaded <8 x double> selecting elements
; 3, 4, 2, 4; unselected lanes become zero.
; As in the merge-masked variant, the cross-lane-fast index constant [3,4,2,6]
; is paired with a {1to4} broadcast of element 4, so indices 4 and 6 read the
; same value. Both runs expect the {z} mask on the final shuffle.
4369 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %mask) {
4370 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
4371 ; CHECK-FAST: # %bb.0:
4372 ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
4373 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [3,4,2,6]
4374 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4375 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4376 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}
4377 ; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
4378 ; CHECK-FAST-NEXT: retq
4380 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
4381 ; CHECK-FAST-PERLANE: # %bb.0:
4382 ; CHECK-FAST-PERLANE-NEXT: vpermpd $236, (%rdi), %ymm1 # ymm1 = mem[0,3,2,3]
4383 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4384 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
4385 ; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 32(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
4386 ; CHECK-FAST-PERLANE-NEXT: retq
4387 %vec = load <8 x double>, ptr %vp
4388 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
4389 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4390 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4391 ret <4 x double> %res
; Merge-masked partial permute of a loaded <8 x double> selecting the
; consecutive elements 1, 2, 3, 4; unselected lanes keep %vec2.
; The cross-lane-fast run expects vpermi2pd with the high half as a memory
; table. The per-lane-only run expects vperm2f128 to form elements 2..5, then a
; masked vshufpd that interleaves it with the low half.
4394 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4395 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
4396 ; CHECK-FAST: # %bb.0:
4397 ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
4398 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [1,2,3,4]
4399 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
4400 ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4401 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4402 ; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4403 ; CHECK-FAST-NEXT: retq
4405 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
4406 ; CHECK-FAST-PERLANE: # %bb.0:
4407 ; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %ymm2
4408 ; CHECK-FAST-PERLANE-NEXT: vperm2f128 $33, 32(%rdi), %ymm2, %ymm3 # ymm3 = ymm2[2,3],mem[0,1]
4409 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
4410 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
4411 ; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm2[1],ymm3[0],ymm2[3],ymm3[2]
4412 ; CHECK-FAST-PERLANE-NEXT: retq
4413 %vec = load <8 x double>, ptr %vp
4414 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
4415 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4416 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4417 ret <4 x double> %res
; Zero-masked partial permute of a loaded <8 x double> selecting the
; consecutive elements 1, 2, 3, 4; unselected lanes become zero.
; Both runs expect the {z} mask folded into the final shuffle instruction.
4420 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %mask) {
4421 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
4422 ; CHECK-FAST: # %bb.0:
4423 ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
4424 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [1,2,3,4]
4425 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4426 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4427 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4428 ; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
4429 ; CHECK-FAST-NEXT: retq
4431 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
4432 ; CHECK-FAST-PERLANE: # %bb.0:
4433 ; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %ymm1
4434 ; CHECK-FAST-PERLANE-NEXT: vperm2f128 $33, 32(%rdi), %ymm1, %ymm2 # ymm2 = ymm1[2,3],mem[0,1]
4435 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4436 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4437 ; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm1[1],ymm2[0],ymm1[3],ymm2[2]
4438 ; CHECK-FAST-PERLANE-NEXT: retq
4439 %vec = load <8 x double>, ptr %vp
4440 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
4441 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4442 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4443 ret <4 x double> %res
; Unmasked partial permute of a loaded <8 x double> selecting elements
; 4, 2, 1, 0. Both runs expect the low half in a register, the high half as the
; vpermi2pd memory table, and an index constant equal to the IR shuffle mask.
4446 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) {
4447 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
4449 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
4450 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [4,2,1,0]
4451 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
4453 %vec = load <8 x double>, ptr %vp
4454 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4455 ret <4 x double> %res
; Merge-masked <4, 2, 1, 0> partial permute of a loaded <8 x double>; lanes
; where %mask is not ordered-equal to 0.0 keep %vec2. The permute result is
; expected to be merged into ymm0 (holding %vec2) by a masked vmovapd.
4457 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4458 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
4460 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4461 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [4,2,1,0]
4462 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
4463 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4464 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4465 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4467 %vec = load <8 x double>, ptr %vp
4468 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4469 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4470 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4471 ret <4 x double> %res
; Zero-masked <4, 2, 1, 0> partial permute of a loaded <8 x double>; lanes
; where %mask is not ordered-equal to 0.0 become zero. The {z} mask is expected
; on vpermi2pd, followed by a copy of the index/destination register to ymm0.
4474 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %mask) {
4475 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
4477 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4478 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [4,2,1,0]
4479 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4480 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4481 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4482 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4484 %vec = load <8 x double>, ptr %vp
4485 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4486 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4487 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4488 ret <4 x double> %res
; Merge-masked partial permute of a loaded <8 x double> selecting elements
; 6, 0, 5, 1; unselected lanes keep %vec2.
; The expected code commutes the tables: the high half (32(%rdi)) is loaded as
; the first table and the low half is the memory table, so the index constant
; [2,4,1,5] is the IR mask with the two halves swapped.
4491 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4492 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4:
4494 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4495 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,4,1,5]
4496 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
4497 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4498 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4499 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4501 %vec = load <8 x double>, ptr %vp
4502 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
4503 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4504 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4505 ret <4 x double> %res
; Zero-masked partial permute of a loaded <8 x double> selecting elements
; 6, 0, 5, 1; unselected lanes become zero.
; As in the merge-masked variant the tables are commuted (high half first), so
; the index constant [2,4,1,5] addresses the same elements as the IR mask.
4508 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 x double> %mask) {
4509 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4:
4511 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4512 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [2,4,1,5]
4513 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4514 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4515 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
4516 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4518 %vec = load <8 x double>, ptr %vp
4519 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
4520 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4521 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4522 ret <4 x double> %res
; Merge-masked partial permute of a loaded <8 x double> selecting elements
; 2, 5, 5, 5; unselected lanes keep %vec2.
; Both runs expect vperm2f128 to form elements 2..5, then a masked vshufpd
; whose second source is element 5 (byte offset 40) broadcast with {1to4}.
4525 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4526 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
4528 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4529 ; CHECK-NEXT: vperm2f128 $33, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[2,3],mem[0,1]
4530 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4531 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4532 ; CHECK-NEXT: vshufpd $14, 40(%rdi){1to4}, %ymm2, %ymm0 {%k1}
4534 %vec = load <8 x double>, ptr %vp
4535 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
4536 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4537 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4538 ret <4 x double> %res
; Zero-masked partial permute of a loaded <8 x double> selecting elements
; 2, 5, 5, 5; unselected lanes become zero. Same instruction sequence as the
; merge-masked variant, with {z} added to the final vshufpd.
4541 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4 x double> %mask) {
4542 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
4544 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
4545 ; CHECK-NEXT: vperm2f128 $33, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[2,3],mem[0,1]
4546 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4547 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
4548 ; CHECK-NEXT: vshufpd $14, 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
4550 %vec = load <8 x double>, ptr %vp
4551 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
4552 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4553 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4554 ret <4 x double> %res
; Unmasked partial permute of a loaded <8 x double> selecting elements
; 4, 6, 0, 5. The expected code loads the high half as the first vpermi2pd
; table and uses the low half as the memory table, so the index constant
; [0,2,4,1] is the IR mask with the two halves swapped.
4557 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp) {
4558 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
4560 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm1
4561 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [0,2,4,1]
4562 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm1, %ymm0
4564 %vec = load <8 x double>, ptr %vp
4565 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4566 ret <4 x double> %res
; Merge-masked <4, 6, 0, 5> partial permute of a loaded <8 x double>; lanes
; where %mask is not ordered-equal to 0.0 keep %vec2. Tables are commuted as in
; the unmasked variant; the result is merged into ymm0 by a masked vmovapd.
4568 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4569 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
4571 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4572 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,2,4,1]
4573 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
4574 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4575 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4576 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4578 %vec = load <8 x double>, ptr %vp
4579 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4580 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4581 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4582 ret <4 x double> %res
; Zero-masked <4, 6, 0, 5> partial permute of a loaded <8 x double>; lanes
; where %mask is not ordered-equal to 0.0 become zero. The {z} mask is expected
; on vpermi2pd, followed by a copy of its destination register to ymm0.
4585 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %mask) {
4586 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
4588 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4589 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,1]
4590 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4591 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4592 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
4593 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4595 %vec = load <8 x double>, ptr %vp
4596 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4597 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4598 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4599 ret <4 x double> %res
; Merge-masked partial permute of a loaded <8 x double> selecting elements
; 0, 5, 2, 5; unselected lanes keep %vec2.
; Both runs expect a masked vunpcklpd that interleaves the even elements of the
; low half with element 5 (byte offset 40) broadcast via {1to4}.
4602 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask7(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
4603 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask7:
4605 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4606 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4607 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4608 ; CHECK-NEXT: vunpcklpd 40(%rdi){1to4}, %ymm2, %ymm0 {%k1}
4610 %vec = load <8 x double>, ptr %vp
4611 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
4612 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4613 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4614 ret <4 x double> %res
; Zero-masked partial permute of a loaded <8 x double> selecting elements
; 0, 5, 2, 5; unselected lanes become zero. Same sequence as the merge-masked
; variant, with {z} added to the vunpcklpd against the broadcast operand.
4617 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(ptr %vp, <4 x double> %mask) {
4618 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7:
4620 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
4621 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4622 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
4623 ; CHECK-NEXT: vunpcklpd 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
4625 %vec = load <8 x double>, ptr %vp
4626 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
4627 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4628 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4629 ret <4 x double> %res
; Unmasked partial permute of a loaded <8 x double> to <2 x double>, selecting
; elements 1 and 6. Both runs expect only the first 128 bits to be loaded and a
; vshufpd that takes element 6 straight from memory at byte offset 48.
4632 define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
4633 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
4635 ; CHECK-NEXT: vmovapd (%rdi), %xmm0
4636 ; CHECK-NEXT: vshufpd $1, 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[0]
4638 %vec = load <8 x double>, ptr %vp
4639 %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4640 ret <2 x double> %res
; Merge-masked <1, 6> partial permute of a loaded <8 x double>; lanes where
; %mask is not ordered-equal to 0.0 keep %vec2. The write mask is expected on
; the vshufpd, which writes into xmm0 (the register holding %vec2).
4642 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
4643 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
4645 ; CHECK-NEXT: vmovapd (%rdi), %xmm2
4646 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4647 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4648 ; CHECK-NEXT: vshufpd $1, 48(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[0]
4650 %vec = load <8 x double>, ptr %vp
4651 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4652 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4653 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4654 ret <2 x double> %res
; Zero-masked <1, 6> partial permute of a loaded <8 x double>; lanes where
; %mask is not ordered-equal to 0.0 become zero. The {z} mask is expected on
; the vshufpd that reads element 6 from memory.
4657 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %mask) {
4658 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
4660 ; CHECK-NEXT: vmovapd (%rdi), %xmm1
4661 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4662 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
4663 ; CHECK-NEXT: vshufpd $1, 48(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[0]
4665 %vec = load <8 x double>, ptr %vp
4666 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4667 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4668 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4669 ret <2 x double> %res
; Merge-masked partial permute of a loaded <8 x double> to <2 x double>,
; selecting elements 1 and 4; unselected lanes keep %vec2.
; Expected code duplicates element 1 (byte offset 8) with vmovddup and pairs it
; with element 4 (byte offset 32) through a masked vunpcklpd into xmm0.
4672 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
4673 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
4675 ; CHECK-NEXT: vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0]
4676 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4677 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4678 ; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
4680 %vec = load <8 x double>, ptr %vp
4681 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
4682 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4683 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4684 ret <2 x double> %res
; Zero-masked partial permute of a loaded <8 x double> to <2 x double>,
; selecting elements 1 and 4; unselected lanes become zero. Same sequence as
; the merge-masked variant, with {z} added to the vunpcklpd.
4687 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) {
4688 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
4690 ; CHECK-NEXT: vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0]
4691 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4692 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
4693 ; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
4695 %vec = load <8 x double>, ptr %vp
4696 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
4697 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4698 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4699 ret <2 x double> %res
4703 define void @test_zext_v8i8_to_v8i16(ptr %arg, ptr %arg1) {
4704 ; CHECK-LABEL: test_zext_v8i8_to_v8i16:
4706 ; CHECK-NEXT: vpmovzxbw (%rdi), %xmm0 # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
4707 ; CHECK-NEXT: vpsllw $8, %xmm0, %xmm0
4708 ; CHECK-NEXT: vmovdqa %xmm0, (%rsi)
4710 %tmp2 = load <8 x i8>, ptr %arg
4711 %tmp3 = extractelement <8 x i8> %tmp2, i32 0
4712 %tmp4 = zext i8 %tmp3 to i16
4713 %tmp5 = insertelement <8 x i16> undef, i16 %tmp4, i32 0
4714 %tmp6 = extractelement <8 x i8> %tmp2, i32 1
4715 %tmp7 = zext i8 %tmp6 to i16
4716 %tmp8 = insertelement <8 x i16> %tmp5, i16 %tmp7, i32 1
4717 %tmp9 = extractelement <8 x i8> %tmp2, i32 2
4718 %tmp10 = zext i8 %tmp9 to i16
4719 %tmp11 = insertelement <8 x i16> %tmp8, i16 %tmp10, i32 2
4720 %tmp12 = extractelement <8 x i8> %tmp2, i32 3
4721 %tmp13 = zext i8 %tmp12 to i16
4722 %tmp14 = insertelement <8 x i16> %tmp11, i16 %tmp13, i32 3
4723 %tmp15 = extractelement <8 x i8> %tmp2, i32 4
4724 %tmp16 = zext i8 %tmp15 to i16
4725 %tmp17 = insertelement <8 x i16> %tmp14, i16 %tmp16, i32 4
4726 %tmp18 = extractelement <8 x i8> %tmp2, i32 5
4727 %tmp19 = zext i8 %tmp18 to i16
4728 %tmp20 = insertelement <8 x i16> %tmp17, i16 %tmp19, i32 5
4729 %tmp21 = extractelement <8 x i8> %tmp2, i32 6
4730 %tmp22 = zext i8 %tmp21 to i16
4731 %tmp23 = insertelement <8 x i16> %tmp20, i16 %tmp22, i32 6
4732 %tmp24 = extractelement <8 x i8> %tmp2, i32 7
4733 %tmp25 = zext i8 %tmp24 to i16
4734 %tmp26 = insertelement <8 x i16> %tmp23, i16 %tmp25, i32 7
4735 %tmp27 = shl <8 x i16> %tmp26, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
4736 store <8 x i16> %tmp27, ptr %arg1