1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle %s -o - | FileCheck --check-prefixes=CHECK,CHECK-FAST %s
3 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-perlane-shuffle %s -o - | FileCheck --check-prefixes=CHECK,CHECK-FAST-PERLANE %s
5 ; FIXME: All cases here should be fixed by PR34380
; Unmasked narrowing shuffle: selects 8 elements of the <16 x i16> argument
; %vec with constant indices drawn from both 128-bit halves. The assertions
; expect a single vpermw on ymm registers driven by a constant index vector.
7 define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
8 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
10 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [8,6,12,4,7,9,14,8]
11 ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
12 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
13 ; CHECK-NEXT: vzeroupper
15 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
; Merge-masked narrowing shuffle: result lanes where %mask is zero take the
; shuffled element of %vec, every other lane keeps the matching lane of %vec2.
; The expected code runs vpermw unmasked and then blends with %vec2 under the
; k-mask produced by vptestnmw.
18 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
19 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
21 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [8,6,12,4,7,9,14,8]
22 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
23 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
24 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
25 ; CHECK-NEXT: vzeroupper
27 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
28 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
29 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked narrowing shuffle: result lanes where %mask is zero take the
; shuffled element of %vec, every other lane is zero. The expected code folds
; the zeroing into the vpermw itself via {%k1} {z}.
33 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
34 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
36 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [8,6,12,4,7,9,14,8]
37 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
38 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
39 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
40 ; CHECK-NEXT: vzeroupper
42 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
43 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
44 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 16 x i16 -> 8 x i16 shuffle with a second index pattern:
; lanes where %mask is zero receive the shuffled element of %vec, the rest
; come from %vec2. Expected lowering: vpermw followed by a k-masked vpblendmw.
47 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
48 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
50 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,12,9,4,14,15,12,14]
51 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
52 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
53 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
54 ; CHECK-NEXT: vzeroupper
56 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
57 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
58 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 16 x i16 -> 8 x i16 shuffle with a second index pattern:
; lanes where %mask is zero receive the shuffled element of %vec, the rest
; are zero. Expected lowering: one vpermw with {%k1} {z}.
62 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
63 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
65 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,12,9,4,14,15,12,14]
66 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
67 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
68 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
69 ; CHECK-NEXT: vzeroupper
71 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
72 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
73 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 16 x i16 -> 8 x i16 shuffle with a third index pattern:
; lanes where %mask is zero receive the shuffled element of %vec, the rest
; come from %vec2. Expected lowering: vpermw followed by a k-masked vpblendmw.
76 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
77 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
79 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11,14,10,7,1,6,9]
80 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
81 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
82 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
83 ; CHECK-NEXT: vzeroupper
85 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
86 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
87 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 16 x i16 -> 8 x i16 shuffle with a third index pattern:
; lanes where %mask is zero receive the shuffled element of %vec, the rest
; are zero. Expected lowering: one vpermw with {%k1} {z}.
91 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) {
92 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2:
94 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11,14,10,7,1,6,9]
95 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
96 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
97 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
98 ; CHECK-NEXT: vzeroupper
100 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
101 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
102 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 16 x i16 -> 8 x i16 shuffle with a fourth index pattern drawn from
; both 128-bit halves of %vec. The assertions expect a single vpermw whose
; index constant is identical to the IR shuffle mask.
105 define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
106 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask3:
108 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [14,15,7,13,4,12,8,0]
109 ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
110 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
111 ; CHECK-NEXT: vzeroupper
113 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
; Merge-masked 16 x i16 -> 8 x i16 shuffle with a fourth index pattern:
; lanes where %mask is zero receive the shuffled element of %vec, the rest
; come from %vec2. Expected lowering: vpermw followed by a k-masked vpblendmw.
116 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
117 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3:
119 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [14,15,7,13,4,12,8,0]
120 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
121 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
122 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
123 ; CHECK-NEXT: vzeroupper
125 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
126 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
127 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 16 x i16 -> 8 x i16 shuffle with a fourth index pattern:
; lanes where %mask is zero receive the shuffled element of %vec, the rest
; are zero. Expected lowering: one vpermw with {%k1} {z}.
131 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %mask) {
132 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask3:
134 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,7,13,4,12,8,0]
135 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
136 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
137 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
138 ; CHECK-NEXT: vzeroupper
140 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
141 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
142 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked narrowing shuffle of a <16 x i16> vector loaded from %vp. The
; expected code loads the lower 16 bytes into a register and uses a two-source
; vpermi2w with the upper 16 bytes (16(%rdi)) as a folded memory operand, so
; only xmm registers are touched.
145 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp) {
146 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
148 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
149 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9]
150 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0
152 %vec = load <16 x i16>, <16 x i16>* %vp
153 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
; Merge-masked narrowing shuffle of a <16 x i16> vector loaded from %vp:
; lanes where %mask is zero take the shuffled element, the rest keep %vec2.
; The expected code shuffles with an unmasked vpermi2w and then merges the
; result into %vec2's register with a k-masked vmovdqu16.
156 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
157 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
159 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
160 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,13,3,5,13,3,9]
161 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
162 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
163 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
165 %vec = load <16 x i16>, <16 x i16>* %vp
166 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
167 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
168 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked narrowing shuffle of a <16 x i16> vector loaded from %vp:
; lanes where %mask is zero take the shuffled element, the rest are zero.
; The expected code applies {%k1} {z} directly on the vpermi2w and then copies
; the result into xmm0.
172 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %mask) {
173 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
175 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
176 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9]
177 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
178 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
179 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
181 %vec = load <16 x i16>, <16 x i16>* %vp
182 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
183 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
184 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked shuffle of a <16 x i16> vector loaded from %vp, second index
; pattern: lanes where %mask is zero take the shuffled element, the rest keep
; %vec2. Expected lowering: vpermi2w with a folded load of the upper half,
; then a k-masked vmovdqu16 into %vec2's register.
188 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
189 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
191 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
192 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,15,12,7,1,5,8,14]
193 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
194 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
195 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
197 %vec = load <16 x i16>, <16 x i16>* %vp
198 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
199 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
200 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked shuffle of a <16 x i16> vector loaded from %vp, second index
; pattern: lanes where %mask is zero take the shuffled element, the rest are
; zero. Expected lowering: vpermi2w with {%k1} {z} and a folded load of the
; upper half.
204 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %mask) {
205 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
207 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
208 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14]
209 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
210 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
211 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
213 %vec = load <16 x i16>, <16 x i16>* %vp
214 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
215 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
216 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked shuffle of a <16 x i16> vector loaded from %vp, third index
; pattern: lanes where %mask is zero take the shuffled element, the rest keep
; %vec2. Here the expected code loads the UPPER half (16(%rdi)) into the
; register and folds the lower half ((%rdi)) from memory, so the asserted index
; constant is the IR shuffle mask with every index XOR 8.
220 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
221 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
223 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
224 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [9,0,3,0,5,0,7,1]
225 ; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm3
226 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
227 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
229 %vec = load <16 x i16>, <16 x i16>* %vp
230 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
231 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
232 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked shuffle of a <16 x i16> vector loaded from %vp, third index
; pattern: lanes where %mask is zero take the shuffled element, the rest are
; zero. The expected code loads the UPPER half (16(%rdi)) into the register and
; folds the lower half ((%rdi)) from memory, so the asserted index constant is
; the IR shuffle mask with every index XOR 8.
236 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %mask) {
237 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
239 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
240 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [9,0,3,0,5,0,7,1]
241 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
242 ; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z}
243 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
245 %vec = load <16 x i16>, <16 x i16>* %vp
246 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
247 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
248 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked shuffle of a <16 x i16> vector loaded from %vp, fourth index
; pattern. Expected lowering: lower half loaded into a register, upper half
; folded from 16(%rdi) into a vpermi2w whose index constant equals the IR mask.
252 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp) {
253 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
255 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
256 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2]
257 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0
259 %vec = load <16 x i16>, <16 x i16>* %vp
260 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
; Merge-masked shuffle of a <16 x i16> vector loaded from %vp, fourth index
; pattern: lanes where %mask is zero take the shuffled element, the rest keep
; %vec2. Expected lowering: vpermi2w with a folded load of the upper half,
; then a k-masked vmovdqu16 into %vec2's register.
263 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
264 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
266 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
267 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [9,7,9,6,9,4,3,2]
268 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
269 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
270 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
272 %vec = load <16 x i16>, <16 x i16>* %vp
273 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
274 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
275 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked shuffle of a <16 x i16> vector loaded from %vp, fourth index
; pattern: lanes where %mask is zero take the shuffled element, the rest are
; zero. Expected lowering: vpermi2w with {%k1} {z} and a folded load of the
; upper half.
279 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %mask) {
280 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
282 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
283 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2]
284 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
285 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
286 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
288 %vec = load <16 x i16>, <16 x i16>* %vp
289 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
290 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
291 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked narrowing shuffle: selects 16 elements of the <32 x i16> argument
; %vec. The expected code extracts the upper 256 bits (vextracti64x4 $1) and
; feeds both halves to a ymm vpermi2w. The asserted index constant is the IR
; mask with every index XOR 16, i.e. the upper half is addressed as elements
; 0-15 and the lower half as 16-31.
295 define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) {
296 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask0:
298 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
299 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
300 ; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1
301 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
303 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
; Merge-masked 32 x i16 -> 16 x i16 shuffle: lanes where %mask is zero take
; the shuffled element of %vec, the rest keep %vec2. Expected lowering:
; vextracti64x4 + unmasked vpermi2w, then a k-masked vpblendmw with %vec2.
; The asserted index constant is the IR mask with every index XOR 16.
306 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
307 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0:
309 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
310 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
311 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
312 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
313 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
315 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
316 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
317 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 x i16 -> 16 x i16 shuffle: lanes where %mask is zero take
; the shuffled element of %vec, the rest are zero. Expected lowering:
; vextracti64x4 followed by vpermi2w with {%k1} {z}. The asserted index
; constant is the IR mask with every index XOR 16.
321 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %mask) {
322 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0:
324 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
325 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
326 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
327 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
328 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
330 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
331 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
332 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked 32 x i16 -> 16 x i16 shuffle, second index pattern: lanes where
; %mask is zero take the shuffled element of %vec, the rest keep %vec2.
; Expected lowering: vextracti64x4 + vpermi2w, then a k-masked vpblendmw.
; The asserted index constant is the IR mask with every index XOR 16.
335 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
336 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1:
338 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
339 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
340 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
341 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
342 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
344 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
345 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
346 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 x i16 -> 16 x i16 shuffle, second index pattern: lanes where
; %mask is zero take the shuffled element of %vec, the rest are zero.
; Expected lowering: vextracti64x4 followed by vpermi2w with {%k1} {z}.
; The asserted index constant is the IR mask with every index XOR 16.
350 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %mask) {
351 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1:
353 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
354 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
355 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
356 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
357 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
359 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
360 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
361 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked 32 x i16 -> 16 x i16 shuffle, third index pattern: lanes where
; %mask is zero take the shuffled element of %vec, the rest keep %vec2.
; Expected lowering: vextracti64x4 + vpermi2w, then a k-masked vpblendmw.
; The asserted index constant is the IR mask with every index XOR 16.
364 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
365 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2:
367 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
368 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
369 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
370 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
371 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
373 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
374 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
375 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 x i16 -> 16 x i16 shuffle, third index pattern: lanes where
; %mask is zero take the shuffled element of %vec, the rest are zero.
; Expected lowering: vextracti64x4 followed by vpermi2w with {%k1} {z}.
; The asserted index constant is the IR mask with every index XOR 16.
379 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %mask) {
380 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2:
382 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
383 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
384 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
385 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
386 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
388 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
389 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
390 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Unmasked 32 x i16 -> 16 x i16 shuffle, fourth index pattern. Expected
; lowering: vextracti64x4 of the upper half and a ymm vpermi2w. In this case
; the asserted index constant is identical to the IR shuffle mask (lower half
; addressed as elements 0-15, upper half as 16-31).
393 define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
394 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3:
396 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
397 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
398 ; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1
399 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
401 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
; Merge-masked 32 x i16 -> 16 x i16 shuffle, fourth index pattern: lanes where
; %mask is zero take the shuffled element of %vec, the rest keep %vec2.
; Expected lowering: vextracti64x4 + vpermi2w (index constant equal to the IR
; mask), then a k-masked vpblendmw with %vec2.
404 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
405 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3:
407 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
408 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
409 ; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
410 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
411 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
413 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
414 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
415 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 x i16 -> 16 x i16 shuffle, fourth index pattern: lanes where
; %mask is zero take the shuffled element of %vec, the rest are zero.
; Expected lowering: vextracti64x4 followed by vpermi2w with {%k1} {z}; the
; index constant equals the IR mask.
419 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) {
420 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3:
422 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
423 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
424 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
425 ; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm2 {%k1} {z}
426 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
428 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
429 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
430 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Unmasked narrowing shuffle: selects 8 elements of the <32 x i16> argument
; %vec. The expected code extracts the upper 256 bits and uses a ymm vpermt2w
; that overwrites the extracted half. The asserted index constant is the IR
; mask with every index XOR 16 (upper half addressed as 0-15, lower as 16-31).
433 define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) {
434 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0:
436 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14]
437 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
438 ; CHECK-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
439 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
440 ; CHECK-NEXT: vzeroupper
442 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
; Merge-masked 32 x i16 -> 8 x i16 shuffle: lanes where %mask is zero take the
; shuffled element of %vec, the rest keep %vec2. Expected lowering:
; vextracti64x4 + unmasked vpermt2w, then a k-masked xmm vpblendmw with %vec2.
; The asserted index constant is the IR mask with every index XOR 16.
445 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
446 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0:
448 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
449 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
450 ; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm4
451 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
452 ; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
453 ; CHECK-NEXT: vzeroupper
455 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
456 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
457 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32 x i16 -> 8 x i16 shuffle: lanes where %mask is zero take the
; shuffled element of %vec, the rest are zero. Expected lowering:
; vextracti64x4 followed by vpermt2w with {%k1} {z}, then a copy to xmm0.
; The asserted index constant is the IR mask with every index XOR 16.
461 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) {
462 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0:
464 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
465 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
466 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
467 ; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm2 {%k1} {z}
468 ; CHECK-NEXT: vmovdqa %xmm2, %xmm0
469 ; CHECK-NEXT: vzeroupper
471 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
472 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
473 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 32 x i16 -> 8 x i16 shuffle, second index pattern: lanes where
; %mask is zero take the shuffled element of %vec, the rest keep %vec2.
; Expected lowering: vextracti64x4 + vpermt2w overwriting ymm0 (index constant
; equal to the IR mask), then a k-masked xmm vpblendmw with %vec2.
476 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
477 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1:
479 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,21,27,10,8,19,14,5]
480 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
481 ; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0
482 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
483 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
484 ; CHECK-NEXT: vzeroupper
486 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
487 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
488 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32 x i16 -> 8 x i16 shuffle, second index pattern: lanes where
; %mask is zero take the shuffled element of %vec, the rest are zero.
; Expected lowering: vextracti64x4 followed by vpermt2w with {%k1} {z} that
; writes ymm0 directly; the index constant equals the IR mask.
492 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %mask) {
493 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1:
495 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,21,27,10,8,19,14,5]
496 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
497 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
498 ; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
499 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
500 ; CHECK-NEXT: vzeroupper
502 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
503 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
504 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 32 x i16 -> 8 x i16 shuffle, third index pattern: lanes where
; %mask is zero take the shuffled element of %vec, the rest keep %vec2.
; Expected lowering: vextracti64x4 + vpermt2w overwriting ymm0 (index constant
; equal to the IR mask), then a k-masked xmm vpblendmw with %vec2.
507 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
508 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2:
510 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [15,13,18,16,9,11,26,8]
511 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
512 ; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0
513 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
514 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
515 ; CHECK-NEXT: vzeroupper
517 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
518 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
519 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32 x i16 -> 8 x i16 shuffle, third index pattern: lanes where
; %mask is zero take the shuffled element of %vec, the rest are zero.
; Expected lowering: vextracti64x4 followed by vpermt2w with {%k1} {z} that
; writes ymm0 directly; the index constant equals the IR mask.
523 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %mask) {
524 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2:
526 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [15,13,18,16,9,11,26,8]
527 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
528 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
529 ; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
530 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
531 ; CHECK-NEXT: vzeroupper
533 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
534 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
535 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 32 x i16 -> 8 x i16 shuffle, fourth index pattern. Expected
; lowering: vextracti64x4 of the upper half and a ymm vpermt2w that overwrites
; ymm0; the asserted index constant equals the IR shuffle mask.
538 define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) {
539 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask3:
541 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [17,0,23,10,1,8,7,30]
542 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
543 ; CHECK-NEXT: vpermt2w %ymm2, %ymm1, %ymm0
544 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
545 ; CHECK-NEXT: vzeroupper
547 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
; Merge-masked 32 x i16 -> 8 x i16 shuffle, fourth index pattern: lanes where
; %mask is zero take the shuffled element of %vec, the rest keep %vec2.
; Expected lowering: vextracti64x4 + vpermt2w overwriting ymm0 (index constant
; equal to the IR mask), then a k-masked xmm vpblendmw with %vec2.
550 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
551 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3:
553 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [17,0,23,10,1,8,7,30]
554 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
555 ; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0
556 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
557 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
558 ; CHECK-NEXT: vzeroupper
560 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
561 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
562 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32 x i16 -> 8 x i16 shuffle, fourth index pattern: lanes where
; %mask is zero take the shuffled element of %vec, the rest are zero.
; Expected lowering: vextracti64x4 followed by vpermt2w with {%k1} {z} that
; writes ymm0 directly; the index constant equals the IR mask.
566 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %mask) {
567 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3:
569 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [17,0,23,10,1,8,7,30]
570 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
571 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
572 ; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
573 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
574 ; CHECK-NEXT: vzeroupper
576 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
577 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
578 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked narrowing shuffle of a <32 x i16> vector loaded from %vp down to
; 16 elements. The expected code loads the lower 32 bytes into a register and
; folds the upper 32 bytes (32(%rdi)) into a ymm vpermi2w; the index constant
; equals the IR shuffle mask.
581 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp) {
582 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0:
584 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
585 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
586 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
588 %vec = load <32 x i16>, <32 x i16>* %vp
589 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
; Merge-masked shuffle of a <32 x i16> vector loaded from %vp down to 16
; elements: lanes where %mask is zero take the shuffled element, the rest keep
; %vec2. Expected lowering: vpermi2w with a folded load of the upper half,
; then a k-masked vmovdqu16 into %vec2's register.
592 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
593 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0:
595 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
596 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
597 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
598 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
599 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
601 %vec = load <32 x i16>, <32 x i16>* %vp
602 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
603 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
604 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked shuffle of a <32 x i16> vector loaded from %vp down to 16
; elements: lanes where %mask is zero take the shuffled element, the rest are
; zero. Expected lowering: vpermi2w with {%k1} {z} and a folded load of the
; upper half, then a copy into ymm0.
608 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %mask) {
609 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0:
611 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
612 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
613 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
614 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
615 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
617 %vec = load <32 x i16>, <32 x i16>* %vp
618 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
619 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
620 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked shuffle of a <32 x i16> vector loaded from %vp down to 16
; elements, second index pattern: lanes where %mask is zero take the shuffled
; element, the rest keep %vec2. Expected lowering: vpermi2w with a folded load
; of the upper half, then a k-masked vmovdqu16 into %vec2's register.
624 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
625 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1:
627 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
628 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
629 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
630 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
631 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
633 %vec = load <32 x i16>, <32 x i16>* %vp
634 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
635 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
636 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked shuffle of a <32 x i16> vector loaded from %vp down to 16
; elements, second index pattern: lanes where %mask is zero take the shuffled
; element, the rest are zero. Expected lowering: vpermi2w with {%k1} {z} and a
; folded load of the upper half.
640 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %mask) {
641 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1:
643 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
644 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
645 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
646 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
647 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
649 %vec = load <32 x i16>, <32 x i16>* %vp
650 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
651 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
652 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked shuffle of a <32 x i16> vector loaded from %vp down to 16
; elements, third index pattern: lanes where %mask is zero take the shuffled
; element, the rest keep %vec2. Here the expected code loads the UPPER half
; (32(%rdi)) into the register and folds the lower half ((%rdi)) from memory,
; so the asserted index constant is the IR mask with every index XOR 16.
656 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
657 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2:
659 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
660 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
661 ; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3
662 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
663 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
665 %vec = load <32 x i16>, <32 x i16>* %vp
666 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
667 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
668 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 x i16 -> 16 x i16 shuffle of a vector loaded from %vp: result
; lanes whose %mask lane equals zero take the shuffled element, all others are 0.
; The expected code loads the HIGH 256 bits into a register and folds the low
; 256 bits from (%rdi), so each constant index is the IR index with bit 4
; flipped; zeroing is applied directly on vpermi2w via {%k1} {z}.
672 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %mask) {
673 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2:
675 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
676 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
677 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
678 ; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm1 {%k1} {z}
679 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
681 %vec = load <32 x i16>, <32 x i16>* %vp
682 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
683 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
684 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Unmasked 32 x i16 -> 16 x i16 shuffle of a vector loaded from %vp.
; The expected code keeps the low 256 bits in a register and folds the high
; 256 bits from 32(%rdi) into vpermi2w; the index constant equals the IR mask.
688 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp) {
689 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3:
691 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
692 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
693 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
695 %vec = load <32 x i16>, <32 x i16>* %vp
696 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
; Merge-masked 32 x i16 -> 16 x i16 shuffle of a vector loaded from %vp: result
; lanes whose %mask lane equals zero take the shuffled element, others keep %vec2.
; The expected code permutes into a temporary with vpermi2w (high half folded
; from 32(%rdi)) and then merges it into ymm0 under k1 with vmovdqu16.
699 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
700 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3:
702 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
703 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
704 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
705 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
706 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
708 %vec = load <32 x i16>, <32 x i16>* %vp
709 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
710 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
711 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32 x i16 -> 16 x i16 shuffle of a vector loaded from %vp: result
; lanes whose %mask lane equals zero take the shuffled element, all others are 0.
; The expected code applies the zeroing directly on vpermi2w via {%k1} {z},
; with the high 256 bits folded from 32(%rdi).
715 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %mask) {
716 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3:
718 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
719 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
720 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
721 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
722 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
724 %vec = load <32 x i16>, <32 x i16>* %vp
725 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
726 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
727 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Unmasked 32 x i16 -> 8 x i16 shuffle of a vector loaded from %vp.
; The expected code loads the HIGH 256 bits into ymm0 and folds the low 256
; bits from (%rdi) into vpermt2w, so each constant index is the IR index with
; bit 4 flipped (e.g. 0 -> 16, 21 -> 5). Only xmm0 of the result is returned.
731 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp) {
732 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0:
734 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [16,17,5,1,14,14,13,17]
735 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0
736 ; CHECK-NEXT: vpermt2w (%rdi), %ymm1, %ymm0
737 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
738 ; CHECK-NEXT: vzeroupper
740 %vec = load <32 x i16>, <32 x i16>* %vp
741 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
; Merge-masked 32 x i16 -> 8 x i16 shuffle of a vector loaded from %vp: result
; lanes whose %mask lane equals zero take the shuffled element, others keep %vec2.
; The expected code loads the HIGH 256 bits into a register and folds the low
; 256 bits from (%rdi) into vpermt2w (constant indices have bit 4 flipped
; relative to the IR mask), then merges into xmm0 under k1 with vmovdqu16.
744 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
745 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0:
747 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17]
748 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3
749 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3
750 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
751 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
752 ; CHECK-NEXT: vzeroupper
754 %vec = load <32 x i16>, <32 x i16>* %vp
755 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
756 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
757 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32 x i16 -> 8 x i16 shuffle of a vector loaded from %vp: result
; lanes whose %mask lane equals zero take the shuffled element, all others are 0.
; The expected code loads the HIGH 256 bits into a register, folds the low 256
; bits from (%rdi) into vpermt2w (constant indices have bit 4 flipped relative
; to the IR mask), and applies the zeroing via {%k1} {z}.
761 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %mask) {
762 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0:
764 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17]
765 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
766 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
767 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z}
768 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
769 ; CHECK-NEXT: vzeroupper
771 %vec = load <32 x i16>, <32 x i16>* %vp
772 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
773 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
774 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 32 x i16 -> 8 x i16 shuffle of a vector loaded from %vp: result
; lanes whose %mask lane equals zero take the shuffled element, others keep %vec2.
; The expected code loads the HIGH 256 bits into a register and folds the low
; 256 bits from (%rdi) into vpermt2w, so each constant index is the IR index
; with bit 4 flipped (e.g. 23 -> 7, 11 -> 27); the merge uses vmovdqu16 {%k1}.
778 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
779 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1:
781 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
782 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3
783 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3
784 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
785 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
786 ; CHECK-NEXT: vzeroupper
788 %vec = load <32 x i16>, <32 x i16>* %vp
789 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
790 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
791 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32 x i16 -> 8 x i16 shuffle of a vector loaded from %vp: result
; lanes whose %mask lane equals zero take the shuffled element, all others are 0.
; The expected code loads the HIGH 256 bits into a register, folds the low 256
; bits from (%rdi) into vpermt2w (constant indices have bit 4 flipped relative
; to the IR mask), and applies the zeroing via {%k1} {z}.
795 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %mask) {
796 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1:
798 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
799 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
800 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
801 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z}
802 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
803 ; CHECK-NEXT: vzeroupper
805 %vec = load <32 x i16>, <32 x i16>* %vp
806 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
807 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
808 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 32 x i16 -> 8 x i16 shuffle of a vector loaded from %vp: result
; lanes whose %mask lane equals zero take the shuffled element, others keep %vec2.
; The expected code loads the low 256 bits into a register and folds the high
; 256 bits from 32(%rdi) into vpermt2w, so the index constant equals the IR
; mask; the result is merged into xmm0 under k1 with vmovdqu16.
812 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
813 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2:
815 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10]
816 ; CHECK-NEXT: vmovdqa (%rdi), %ymm3
817 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3
818 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
819 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
820 ; CHECK-NEXT: vzeroupper
822 %vec = load <32 x i16>, <32 x i16>* %vp
823 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
824 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
825 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32 x i16 -> 8 x i16 shuffle of a vector loaded from %vp: result
; lanes whose %mask lane equals zero take the shuffled element, all others are 0.
; The expected code loads the low 256 bits into a register, folds the high 256
; bits from 32(%rdi) into vpermt2w, and applies the zeroing via {%k1} {z}.
829 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %mask) {
830 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2:
832 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10]
833 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
834 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
835 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
836 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
837 ; CHECK-NEXT: vzeroupper
839 %vec = load <32 x i16>, <32 x i16>* %vp
840 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
841 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
842 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 32 x i16 -> 8 x i16 shuffle of a vector loaded from %vp.
; The expected code loads the low 256 bits into ymm0 and folds the high 256
; bits from 32(%rdi) into vpermt2w; the index constant equals the IR mask and
; only xmm0 of the result is returned.
846 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp) {
847 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3:
849 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9]
850 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
851 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm1, %ymm0
852 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
853 ; CHECK-NEXT: vzeroupper
855 %vec = load <32 x i16>, <32 x i16>* %vp
856 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
; Merge-masked 32 x i16 -> 8 x i16 shuffle of a vector loaded from %vp: result
; lanes whose %mask lane equals zero take the shuffled element, others keep %vec2.
; The expected code permutes into a temporary with vpermt2w (high 256 bits
; folded from 32(%rdi)) and then merges into xmm0 under k1 with vmovdqu16.
859 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
860 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3:
862 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9]
863 ; CHECK-NEXT: vmovdqa (%rdi), %ymm3
864 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3
865 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
866 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
867 ; CHECK-NEXT: vzeroupper
869 %vec = load <32 x i16>, <32 x i16>* %vp
870 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
871 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
872 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32 x i16 -> 8 x i16 shuffle of a vector loaded from %vp: result
; lanes whose %mask lane equals zero take the shuffled element, all others are 0.
; The expected code applies the zeroing directly on vpermt2w via {%k1} {z},
; with the high 256 bits folded from 32(%rdi).
876 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %mask) {
877 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3:
879 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9]
880 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
881 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
882 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
883 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
884 ; CHECK-NEXT: vzeroupper
886 %vec = load <32 x i16>, <32 x i16>* %vp
887 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
888 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
889 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 16 x i16 -> 8 x i16 shuffle of a register operand. The suffix of the
; function name spells the shuffle indices as hex digits (E,8,4,C,9,4,E,F =
; 14,8,4,12,9,4,14,15). The expected code is a single vpermw on the ymm source
; with only xmm0 of the result returned.
893 define <8 x i16> @test_16xi16_to_8xi16_E84C94EF(<16 x i16> %vec) {
894 ; CHECK-LABEL: test_16xi16_to_8xi16_E84C94EF:
896 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [14,8,4,12,9,4,14,15]
897 ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
898 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
899 ; CHECK-NEXT: vzeroupper
901 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15>
; Unmasked 8 x i32 -> 4 x i32 shuffle of a register operand.
; The expected code uses vmovaps/vpermps on the ymm source for this unmasked
; form and returns only xmm0 of the result.
905 define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) {
906 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0:
908 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,3,2]
909 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
910 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
911 ; CHECK-NEXT: vzeroupper
913 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
; Merge-masked 8 x i32 -> 4 x i32 shuffle of a register operand: result lanes
; whose %mask lane equals zero take the shuffled element, others keep %vec2.
; The expected code runs an unmasked vpermd and then selects between the
; permute result and xmm1 with vpblendmd under k1.
916 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
917 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0:
919 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,0,3,2]
920 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
921 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
922 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
923 ; CHECK-NEXT: vzeroupper
925 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
926 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
927 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8 x i32 -> 4 x i32 shuffle of a register operand: result lanes
; whose %mask lane equals zero take the shuffled element, all others are 0.
; The expected code applies the zeroing directly on vpermd via {%k1} {z}.
931 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %mask) {
932 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0:
934 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,3,2]
935 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
936 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
937 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
938 ; CHECK-NEXT: vzeroupper
940 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
941 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
942 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 8 x i32 -> 4 x i32 shuffle (indices 3,0,7,3) of a register
; operand: result lanes whose %mask lane equals zero take the shuffled element,
; others keep %vec2. The expected code runs an unmasked vpermd and then selects
; between the permute result and xmm1 with vpblendmd under k1.
945 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
946 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1:
948 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,7,3]
949 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
950 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
951 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
952 ; CHECK-NEXT: vzeroupper
954 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
955 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
956 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8 x i32 -> 4 x i32 shuffle (indices 3,0,7,3) of a register
; operand: result lanes whose %mask lane equals zero take the shuffled element,
; all others are 0. The expected code applies the zeroing directly on vpermd
; via {%k1} {z}.
960 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %mask) {
961 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1:
963 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,7,3]
964 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
965 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
966 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
967 ; CHECK-NEXT: vzeroupper
969 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
970 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
971 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 8 x i32 -> 4 x i32 shuffle with indices 6,7,2,3, i.e. the upper
; 64 bits of each 128-bit half with the halves swapped. Result lanes whose
; %mask lane equals zero take the shuffled element, others keep %vec2.
; The expected code needs no index constant: it extracts the upper 128 bits,
; combines the two high quadwords with vpunpckhqdq, then blends under k1.
974 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
975 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask2:
977 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
978 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm3[1],xmm0[1]
979 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
980 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
981 ; CHECK-NEXT: vzeroupper
983 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
984 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
985 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8 x i32 -> 4 x i32 shuffle with indices 6,7,2,3. Result lanes
; whose %mask lane equals zero take the shuffled element, all others are 0.
; The expected code extracts the upper 128 bits, combines the two high
; quadwords with vpunpckhqdq, and then zeroes lanes with a separate
; vmovdqa32 {%k1} {z} (the 64-bit unpack is not itself masked per i32 lane).
989 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %mask) {
990 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask2:
992 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
993 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1]
994 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
995 ; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
996 ; CHECK-NEXT: vzeroupper
998 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
999 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1000 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked 8 x i32 -> 4 x i32 shuffle (indices 5,3,2,5) of a register operand.
; The expected code uses vmovaps/vpermps on the ymm source for this unmasked
; form and returns only xmm0 of the result.
1003 define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) {
1004 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3:
1006 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [5,3,2,5]
1007 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
1008 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1009 ; CHECK-NEXT: vzeroupper
1011 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
; Merge-masked 8 x i32 -> 4 x i32 shuffle (indices 5,3,2,5) of a register
; operand: result lanes whose %mask lane equals zero take the shuffled element,
; others keep %vec2. The expected code runs an unmasked vpermd and then selects
; between the permute result and xmm1 with vpblendmd under k1.
1014 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1015 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3:
1017 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,3,2,5]
1018 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
1019 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1020 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1021 ; CHECK-NEXT: vzeroupper
1023 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
1024 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1025 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8 x i32 -> 4 x i32 shuffle (indices 5,3,2,5) of a register
; operand: result lanes whose %mask lane equals zero take the shuffled element,
; all others are 0. The expected code applies the zeroing directly on vpermd
; via {%k1} {z}.
1029 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %mask) {
1030 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3:
1032 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,3,2,5]
1033 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1034 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
1035 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1036 ; CHECK-NEXT: vzeroupper
1038 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
1039 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1040 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked 8 x i32 -> 4 x i32 shuffle (indices 7,5,0,0) of a vector loaded
; from %vp. The expected code needs no index constant: it loads the upper 128
; bits from 16(%rdi) and uses an immediate vshufps that takes lanes 3 and 1 of
; that register and lane 0 (twice) of the lower 128 bits read from (%rdi).
1043 define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp) {
1044 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0:
1046 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm0
1047 ; CHECK-NEXT: vshufps $7, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[3,1],mem[0,0]
1049 %vec = load <8 x i32>, <8 x i32>* %vp
1050 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
; Merge-masked 8 x i32 -> 4 x i32 shuffle (indices 7,5,0,0) of a vector loaded
; from %vp: result lanes whose %mask lane equals zero take the shuffled
; element, others keep %vec2. The expected code builds the shuffle with an
; immediate vshufps into a temporary and merges it into xmm0 under k1 with
; vmovdqa32.
1053 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1054 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0:
1056 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
1057 ; CHECK-NEXT: vshufps $7, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[3,1],mem[0,0]
1058 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1059 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
1061 %vec = load <8 x i32>, <8 x i32>* %vp
1062 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
1063 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1064 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8 x i32 -> 4 x i32 shuffle (indices 7,5,0,0) of a vector loaded
; from %vp: result lanes whose %mask lane equals zero take the shuffled
; element, all others are 0. The expected code builds the shuffle with an
; immediate vshufps and zeroes lanes with a separate vmovdqa32 {%k1} {z}.
1068 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %mask) {
1069 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0:
1071 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
1072 ; CHECK-NEXT: vshufps $7, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[3,1],mem[0,0]
1073 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1074 ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
1076 %vec = load <8 x i32>, <8 x i32>* %vp
1077 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
1078 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1079 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 8 x i32 -> 4 x i32 shuffle (indices 5,0,0,3) of a vector loaded
; from %vp: result lanes whose %mask lane equals zero take the shuffled
; element, others keep %vec2. The expected code loads the low 128 bits into a
; register and folds the high 128 bits from 16(%rdi) into vpermi2d, so the
; index constant equals the IR mask; the merge uses vmovdqa32 {%k1}.
1083 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1084 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1:
1086 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
1087 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,0,0,3]
1088 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3
1089 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1090 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1092 %vec = load <8 x i32>, <8 x i32>* %vp
1093 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
1094 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1095 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8 x i32 -> 4 x i32 shuffle (indices 5,0,0,3) of a vector loaded
; from %vp: result lanes whose %mask lane equals zero take the shuffled
; element, all others are 0. The expected code loads the low 128 bits into a
; register, folds the high 128 bits from 16(%rdi) into vpermi2d, and applies
; the zeroing via {%k1} {z}.
1099 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %mask) {
1100 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1:
1102 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
1103 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,0,0,3]
1104 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1105 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z}
1106 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1108 %vec = load <8 x i32>, <8 x i32>* %vp
1109 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
1110 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1111 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 8 x i32 -> 4 x i32 shuffle (indices 4,3,3,4) of a vector loaded
; from %vp: result lanes whose %mask lane equals zero take the shuffled
; element, others keep %vec2. The expected code loads the HIGH 128 bits into a
; register and folds the low 128 bits from (%rdi) into vpermi2d, so each
; constant index is the IR index with bit 2 flipped (4 -> 0, 3 -> 7).
1115 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1116 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2:
1118 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1119 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,7,0]
1120 ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3
1121 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1122 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1124 %vec = load <8 x i32>, <8 x i32>* %vp
1125 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
1126 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1127 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8 x i32 -> 4 x i32 shuffle (indices 4,3,3,4) of a vector loaded
; from %vp: result lanes whose %mask lane equals zero take the shuffled
; element, all others are 0. The expected code loads the HIGH 128 bits into a
; register and folds the low 128 bits from (%rdi) into vpermi2d (constant
; indices have bit 2 flipped relative to the IR mask), zeroing via {%k1} {z}.
1131 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %mask) {
1132 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2:
1134 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1135 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [0,7,7,0]
1136 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1137 ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
1138 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1140 %vec = load <8 x i32>, <8 x i32>* %vp
1141 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
1142 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1143 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked 8 x i32 -> 4 x i32 shuffle (indices 5,3,2,7) of a vector loaded
; from %vp. Only elements 2 and 3 are needed from the low half, so the expected
; code broadcasts that quadword from 8(%rdi) instead of loading the full low
; 128 bits; the constant [5,1,2,7] addresses that broadcast register (1 and 2)
; and the high 128 bits folded from 16(%rdi) (5 and 7).
1147 define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp) {
1148 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3:
1150 ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm1
1151 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [5,1,2,7]
1152 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm1, %xmm0
1154 %vec = load <8 x i32>, <8 x i32>* %vp
1155 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
; Merge-masked 8 x i32 -> 4 x i32 shuffle (indices 5,3,2,7) of a vector loaded
; from %vp: result lanes whose %mask lane equals zero take the shuffled
; element, others keep %vec2. The expected code broadcasts the quadword at
; 8(%rdi) (elements 2 and 3), permutes it against the high 128 bits folded
; from 16(%rdi) with vpermi2d, and merges into xmm0 under k1 with vmovdqa32.
1158 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1159 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3:
1161 ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2
1162 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,2,7]
1163 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3
1164 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1165 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1167 %vec = load <8 x i32>, <8 x i32>* %vp
1168 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
1169 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1170 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8 x i32 -> 4 x i32 shuffle (indices 5,3,2,7) of a vector loaded
; from %vp: result lanes whose %mask lane equals zero take the shuffled
; element, all others are 0. The expected code broadcasts the quadword at
; 8(%rdi) (elements 2 and 3), permutes it against the high 128 bits folded
; from 16(%rdi) with vpermi2d, and applies the zeroing via {%k1} {z}.
1174 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %mask) {
1175 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3:
1177 ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2
1178 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,1,2,7]
1179 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1180 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z}
1181 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1183 %vec = load <8 x i32>, <8 x i32>* %vp
1184 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
1185 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1186 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked 16 x i32 -> 8 x i32 shuffle of a register operand.
; The expected code uses vmovaps/vpermps on the full zmm source for this
; unmasked form and returns only ymm0 of the result.
1190 define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) {
1191 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0:
1193 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [1,13,11,14,7,10,1,6]
1194 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
1195 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1197 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
; Merge-masked 16 x i32 -> 8 x i32 shuffle of a register operand: result lanes
; whose %mask lane equals zero take the shuffled element, others keep %vec2.
; The expected code runs an unmasked zmm vpermd and then selects between the
; permute result and ymm1 with vpblendmd under k1.
1200 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1201 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0:
1203 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,13,11,14,7,10,1,6]
1204 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1205 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1206 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1208 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
1209 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1210 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16 x i32 -> 8 x i32 shuffle of a register operand: result lanes
; whose %mask lane equals zero take the shuffled element, all others are 0.
; The expected code applies the zeroing directly on the zmm vpermd via
; {%k1} {z}.
1214 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %mask) {
1215 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask0:
1217 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,13,11,14,7,10,1,6]
1218 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1219 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1220 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1222 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
1223 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1224 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked 16 x i32 -> 8 x i32 shuffle (indices 3,0,15,3,2,3,6,8) of a
; register operand: result lanes whose %mask lane equals zero take the shuffled
; element, others keep %vec2. The expected code runs an unmasked zmm vpermd and
; then selects between the permute result and ymm1 with vpblendmd under k1.
1227 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1228 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1:
1230 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,15,3,2,3,6,8]
1231 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1232 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1233 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1235 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
1236 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1237 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16 x i32 -> 8 x i32 shuffle (indices 3,0,15,3,2,3,6,8) of a
; register operand: result lanes whose %mask lane equals zero take the shuffled
; element, all others are 0. The expected code applies the zeroing directly on
; the zmm vpermd via {%k1} {z}.
1241 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %mask) {
1242 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask1:
1244 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8]
1245 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1246 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1247 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1249 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
1250 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1251 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked 16 x i32 -> 8 x i32 shuffle (indices 2,15,15,2,6,10,14,7) of a
; register operand: result lanes whose %mask lane equals zero take the shuffled
; element, others keep %vec2. The expected code runs an unmasked zmm vpermd and
; then selects between the permute result and ymm1 with vpblendmd under k1.
1254 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1255 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2:
1257 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [2,15,15,2,6,10,14,7]
1258 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1259 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1260 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1262 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
1263 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1264 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16 x i32 -> 8 x i32 shuffle (indices 2,15,15,2,6,10,14,7) of a
; register operand: result lanes whose %mask lane equals zero take the shuffled
; element, all others are 0. The expected code applies the zeroing directly on
; the zmm vpermd via {%k1} {z}.
1268 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %mask) {
1269 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask2:
1271 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7]
1272 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1273 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1274 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1276 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
1277 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1278 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked 16 x i32 -> 8 x i32 shuffle (indices 14,5,7,7,10,3,9,3) of a
; register operand. The expected code uses vmovaps/vpermps on the full zmm
; source for this unmasked form and returns only ymm0 of the result.
1281 define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
1282 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3:
1284 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3]
1285 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
1286 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1288 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
; Merge-masked 16 x i32 -> 8 x i32 shuffle (indices 14,5,7,7,10,3,9,3) of a
; register operand: result lanes whose %mask lane equals zero take the shuffled
; element, others keep %vec2. The expected code runs an unmasked zmm vpermd and
; then selects between the permute result and ymm1 with vpblendmd under k1.
1291 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1292 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3:
1294 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [14,5,7,7,10,3,9,3]
1295 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1296 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1297 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1299 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
1300 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1301 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16 x i32 -> 8 x i32 shuffle (indices 14,5,7,7,10,3,9,3) of a
; register operand: result lanes whose %mask lane equals zero take the shuffled
; element, all others are 0. The expected code applies the zeroing directly on
; the zmm vpermd via {%k1} {z}.
1305 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %mask) {
1306 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask3:
1308 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3]
1309 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1310 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1311 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1313 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
1314 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1315 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked 16 x i32 -> 4 x i32 shuffle (indices 0,2,4,12) of a register operand.
; The expected index constant is an 8-entry ymm vector whose first four entries
; equal the IR mask; only xmm0 of the zmm vpermps result is returned, so the
; remaining four entries do not affect the returned value.
1318 define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
1319 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0:
1321 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12]
1322 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
1323 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1324 ; CHECK-NEXT: vzeroupper
1326 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
; Merge-masked 16 x i32 -> 4 x i32 shuffle (indices 0,2,4,12) of a register
; operand: result lanes whose %mask lane equals zero take the shuffled element,
; others keep %vec2. The expected index constant is an 8-entry ymm vector whose
; first four entries equal the IR mask; after the zmm vpermd only the xmm part
; is blended with xmm1 under k1.
1329 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1330 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0:
1332 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12]
1333 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1334 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1335 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1336 ; CHECK-NEXT: vzeroupper
1338 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
1339 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1340 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 16 x i32 -> 4 x i32 shuffle (indices 0,2,4,12) of a register
; operand: result lanes whose %mask lane equals zero take the shuffled element,
; all others are 0. The expected index constant is an 8-entry ymm vector whose
; first four entries equal the IR mask; zeroing is applied on the zmm vpermd
; via {%k1} {z} and only xmm0 is returned.
1344 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) {
1345 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0:
1347 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,12,4,6,4,12]
1348 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1349 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1350 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1351 ; CHECK-NEXT: vzeroupper
1353 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
1354 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1355 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 16 x i32 -> 4 x i32 shuffle (indices 13,9,11,12) of a register
; operand: result lanes whose %mask lane equals zero take the shuffled element,
; others keep %vec2. Every index is >= 8, so the expected code first extracts
; the upper 256 bits and permutes within that ymm using the indices reduced by
; 8 ([5,1,3,4]), then blends with xmm1 under k1.
1358 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1359 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1:
1361 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,3,4]
1362 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1363 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
1364 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1365 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1366 ; CHECK-NEXT: vzeroupper
1368 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
1369 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1370 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 16 x i32 -> 4 x i32 shuffle (indices 13,9,11,12) of a register
; operand: result lanes whose %mask lane equals zero take the shuffled element,
; all others are 0. Every index is >= 8, so the expected code first extracts
; the upper 256 bits and permutes within that ymm using the indices reduced by
; 8 ([5,1,3,4]), with zeroing applied on vpermd via {%k1} {z}.
1374 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) {
1375 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1:
1377 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,1,3,4]
1378 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1379 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1380 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
1381 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1382 ; CHECK-NEXT: vzeroupper
1384 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
1385 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1386 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 16 x i32 -> 4 x i32 shuffle (indices 1,1,13,0) of a register
; operand: result lanes whose %mask lane equals zero take the shuffled element,
; others keep %vec2. The indices span both 256-bit halves, so the expected code
; permutes the full zmm with vpermd and then blends the xmm part with xmm1
; under k1.
1389 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1390 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2:
1392 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,13,0]
1393 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1394 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1395 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1396 ; CHECK-NEXT: vzeroupper
1398 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
1399 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1400 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked narrowing shuffle: picks elements <1, 1, 13, 0> of %vec. Lanes whose
; %mask element equals zero take the shuffled value; the remaining lanes are zero.
1404 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) {
1405 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask2:
1407 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,13,0]
1408 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1409 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1410 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1411 ; CHECK-NEXT: vzeroupper
1413 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
1414 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1415 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked narrowing shuffle: picks elements <3, 0, 0, 13> of %vec into a 4 x i32 value.
; With no mask the expected output uses the float-domain forms (vmovaps / vpermps).
1418 define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) {
1419 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3:
1421 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,0,0,13]
1422 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
1423 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1424 ; CHECK-NEXT: vzeroupper
1426 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
; Merge-masked narrowing shuffle: picks elements <3, 0, 0, 13> of %vec. Lanes whose
; %mask element equals zero take the shuffled value; the remaining lanes take %vec2.
1429 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1430 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3:
1432 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,0,13]
1433 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1434 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1435 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1436 ; CHECK-NEXT: vzeroupper
1438 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
1439 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1440 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked narrowing shuffle: picks elements <3, 0, 0, 13> of %vec. Lanes whose
; %mask element equals zero take the shuffled value; the remaining lanes are zero.
1444 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) {
1445 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3:
1447 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,0,13]
1448 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1449 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1450 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1451 ; CHECK-NEXT: vzeroupper
1453 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
1454 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1455 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked shuffle of a vector loaded from %vp: picks elements
; <15, 8, 14, 8, 9, 10, 12, 12>. All of them are in the upper half, so the expected
; permute reads 32(%rdi) directly with indices rebased by 8.
1458 define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp) {
1459 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask0:
1461 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4]
1462 ; CHECK-NEXT: vpermps 32(%rdi), %ymm0, %ymm0
1464 %vec = load <16 x i32>, <16 x i32>* %vp
1465 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
; Merge-masked shuffle of a vector loaded from %vp: picks elements
; <15, 8, 14, 8, 9, 10, 12, 12>. Lanes whose %mask element equals zero take the
; shuffled value; the remaining lanes take %vec2.
1468 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1469 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0:
1471 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4]
1472 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1473 ; CHECK-NEXT: vpermd 32(%rdi), %ymm2, %ymm0 {%k1}
1475 %vec = load <16 x i32>, <16 x i32>* %vp
1476 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
1477 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1478 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked shuffle of a vector loaded from %vp: picks elements
; <15, 8, 14, 8, 9, 10, 12, 12>. Lanes whose %mask element equals zero take the
; shuffled value; the remaining lanes are zero.
1482 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %mask) {
1483 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0:
1485 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4]
1486 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1487 ; CHECK-NEXT: vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z}
1489 %vec = load <16 x i32>, <16 x i32>* %vp
1490 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
1491 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1492 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked shuffle of a vector loaded from %vp: picks elements
; <15, 11, 14, 3, 8, 9, 13, 7>, which come from both 256-bit halves. Lanes whose
; %mask element equals zero take the shuffled value; the remaining lanes take %vec2.
; The expected two-source permute uses the upper half (32(%rdi)) as the first table,
; so upper-half elements appear as indices 0-7 and lower-half elements as 8-15.
1496 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1497 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1:
1499 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1500 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15]
1501 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
1502 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1503 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
1505 %vec = load <16 x i32>, <16 x i32>* %vp
1506 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
1507 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1508 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked shuffle of a vector loaded from %vp: picks elements
; <15, 11, 14, 3, 8, 9, 13, 7>. Lanes whose %mask element equals zero take the
; shuffled value; the remaining lanes are zero.
1512 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %mask) {
1513 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1:
1515 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1516 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15]
1517 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1518 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1519 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1521 %vec = load <16 x i32>, <16 x i32>* %vp
1522 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
1523 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1524 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked shuffle of a vector loaded from %vp: picks elements
; <12, 6, 9, 13, 12, 10, 0, 2>. Lanes whose %mask element equals zero take the
; shuffled value; the remaining lanes take %vec2.
1528 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1529 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2:
1531 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1532 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10]
1533 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
1534 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1535 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
1537 %vec = load <16 x i32>, <16 x i32>* %vp
1538 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
1539 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1540 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked shuffle of a vector loaded from %vp: picks elements
; <12, 6, 9, 13, 12, 10, 0, 2>. Lanes whose %mask element equals zero take the
; shuffled value; the remaining lanes are zero.
1544 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %mask) {
1545 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2:
1547 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1548 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10]
1549 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1550 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1551 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1553 %vec = load <16 x i32>, <16 x i32>* %vp
1554 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
1555 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1556 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked shuffle of a vector loaded from %vp: picks elements
; <8, 4, 1, 13, 15, 4, 6, 12>. Here the lower half is the first permute table and
; 32(%rdi) the second, so the expected index vector equals the IR mask unchanged.
1560 define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp) {
1561 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3:
1563 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1564 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12]
1565 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm0
1567 %vec = load <16 x i32>, <16 x i32>* %vp
1568 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
; Merge-masked shuffle of a vector loaded from %vp: picks elements
; <8, 4, 1, 13, 15, 4, 6, 12>. Lanes whose %mask element equals zero take the
; shuffled value; the remaining lanes take %vec2.
1571 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1572 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3:
1574 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
1575 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [8,4,1,13,15,4,6,12]
1576 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3
1577 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1578 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
1580 %vec = load <16 x i32>, <16 x i32>* %vp
1581 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
1582 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1583 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked shuffle of a vector loaded from %vp: picks elements
; <8, 4, 1, 13, 15, 4, 6, 12>. Lanes whose %mask element equals zero take the
; shuffled value; the remaining lanes are zero.
1587 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %mask) {
1588 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3:
1590 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
1591 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12]
1592 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1593 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
1594 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1596 %vec = load <16 x i32>, <16 x i32>* %vp
1597 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
1598 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1599 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked narrowing shuffle of a vector loaded from %vp: picks elements <13, 0, 0, 6>.
; The expected code is a two-table ymm permute over (%rdi) and 32(%rdi).
1603 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp) {
1604 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0:
1606 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [13,0,0,6]
1607 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
1608 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm1, %ymm0
1609 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1610 ; CHECK-NEXT: vzeroupper
1612 %vec = load <16 x i32>, <16 x i32>* %vp
1613 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
; Merge-masked narrowing shuffle of a vector loaded from %vp: picks elements
; <13, 0, 0, 6>. Lanes whose %mask element equals zero take the shuffled value;
; the remaining lanes take %vec2.
1616 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1617 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0:
1619 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [13,0,0,6]
1620 ; CHECK-NEXT: vmovdqa (%rdi), %ymm3
1621 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
1622 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1623 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1624 ; CHECK-NEXT: vzeroupper
1626 %vec = load <16 x i32>, <16 x i32>* %vp
1627 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
1628 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1629 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked narrowing shuffle of a vector loaded from %vp: picks elements
; <13, 0, 0, 6>. Lanes whose %mask element equals zero take the shuffled value;
; the remaining lanes are zero.
1633 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %mask) {
1634 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0:
1636 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [13,0,0,6]
1637 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1638 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1639 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
1640 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1641 ; CHECK-NEXT: vzeroupper
1643 %vec = load <16 x i32>, <16 x i32>* %vp
1644 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
1645 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1646 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked narrowing shuffle of a vector loaded from %vp: picks elements
; <7, 13, 11, 10>. Lanes whose %mask element equals zero take the shuffled value;
; the remaining lanes take %vec2.
; The expected permute uses the upper half as the first table, so element 7 becomes
; index 15 and the upper-half elements are rebased to 5, 3, 2; the unused high
; indices are undefined (u).
1650 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1651 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
1653 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1654 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <15,5,3,2,u,u,u,u>
1655 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
1656 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1657 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1658 ; CHECK-NEXT: vzeroupper
1660 %vec = load <16 x i32>, <16 x i32>* %vp
1661 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
1662 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1663 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked narrowing shuffle of a vector loaded from %vp: picks elements
; <7, 13, 11, 10>. Lanes whose %mask element equals zero take the shuffled value;
; the remaining lanes are zero.
1667 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %mask) {
1668 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
1670 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1671 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <15,5,3,2,u,u,u,u>
1672 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1673 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1674 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1675 ; CHECK-NEXT: vzeroupper
1677 %vec = load <16 x i32>, <16 x i32>* %vp
1678 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
1679 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1680 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked narrowing shuffle of a vector loaded from %vp: picks elements
; <2, 15, 6, 9>. Lanes whose %mask element equals zero take the shuffled value;
; the remaining lanes take %vec2.
1684 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1685 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2:
1687 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [2,15,6,9]
1688 ; CHECK-NEXT: vmovdqa (%rdi), %ymm3
1689 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
1690 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1691 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1692 ; CHECK-NEXT: vzeroupper
1694 %vec = load <16 x i32>, <16 x i32>* %vp
1695 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
1696 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1697 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked narrowing shuffle of a vector loaded from %vp: picks elements
; <2, 15, 6, 9>. Lanes whose %mask element equals zero take the shuffled value;
; the remaining lanes are zero.
1701 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %mask) {
1702 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2:
1704 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [2,15,6,9]
1705 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1706 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1707 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
1708 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1709 ; CHECK-NEXT: vzeroupper
1711 %vec = load <16 x i32>, <16 x i32>* %vp
1712 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
1713 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1714 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked narrowing shuffle of a vector loaded from %vp: picks elements <6, 0, 7, 2>.
; Only the first eight elements are used, so a 128-bit two-table permute is expected:
; 16(%rdi) (elements 4-7) is the first table and (%rdi) (elements 0-3) the second,
; which gives the index vector [2,4,3,6].
1718 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp) {
1719 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3:
1721 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
1722 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,4,3,6]
1723 ; CHECK-NEXT: vpermi2d (%rdi), %xmm1, %xmm0
1725 %vec = load <16 x i32>, <16 x i32>* %vp
1726 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
; Merge-masked narrowing shuffle of a vector loaded from %vp: picks elements
; <6, 0, 7, 2>. Lanes whose %mask element equals zero take the shuffled value;
; the remaining lanes take %vec2.
1729 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1730 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3:
1732 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1733 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [2,4,3,6]
1734 ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3
1735 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1736 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1738 %vec = load <16 x i32>, <16 x i32>* %vp
1739 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
1740 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1741 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked narrowing shuffle of a vector loaded from %vp: picks elements
; <6, 0, 7, 2>. Lanes whose %mask element equals zero take the shuffled value;
; the remaining lanes are zero.
1745 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %mask) {
1746 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3:
1748 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1749 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,4,3,6]
1750 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1751 ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
1752 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1754 %vec = load <16 x i32>, <16 x i32>* %vp
1755 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
1756 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1757 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked narrowing shuffle: picks elements <12, 9, 4, 10> of %vec.
; The two RUN configurations have separate expectations. With
; +fast-variable-crosslane-shuffle a single zmm vpermps is expected. Without it,
; elements 12, 9 and 10 are first gathered from the upper 256 bits with a ymm vpermd,
; then element 4 is merged in from the second 128-bit lane with an xmm vpermi2d.
1761 define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
1762 ; CHECK-FAST-LABEL: test_16xi32_to_4xi32_perm_mask9:
1763 ; CHECK-FAST: # %bb.0:
1764 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [12,9,4,10]
1765 ; CHECK-FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
1766 ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1767 ; CHECK-FAST-NEXT: vzeroupper
1768 ; CHECK-FAST-NEXT: retq
1770 ; CHECK-FAST-PERLANE-LABEL: test_16xi32_to_4xi32_perm_mask9:
1771 ; CHECK-FAST-PERLANE: # %bb.0:
1772 ; CHECK-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <4,1,u,2>
1773 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1774 ; CHECK-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm1, %ymm1
1775 ; CHECK-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
1776 ; CHECK-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,3]
1777 ; CHECK-FAST-PERLANE-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
1778 ; CHECK-FAST-PERLANE-NEXT: vzeroupper
1779 ; CHECK-FAST-PERLANE-NEXT: retq
1780 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 12, i32 9, i32 4, i32 10>
; Unmasked narrowing shuffle: picks elements <2, 0> of the 4 x i64 %vec.
; A single ymm permute is expected, with the result taken from the low 128 bits.
1784 define <2 x i64> @test_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec) {
1785 ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mask0:
1787 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
1788 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1789 ; CHECK-NEXT: vzeroupper
1791 %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
; Merge-masked narrowing shuffle: picks elements <2, 0> of %vec. Lanes whose %mask
; element equals zero take the shuffled value; the remaining lanes take %vec2.
1794 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
1795 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0:
1797 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,2,3]
1798 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
1799 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
1800 ; CHECK-NEXT: vzeroupper
1802 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
1803 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1804 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked narrowing shuffle: picks elements <2, 0> of %vec. Lanes whose %mask
; element equals zero take the shuffled value; the remaining lanes are zero.
1808 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %mask) {
1809 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask0:
1811 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1812 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3]
1813 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1814 ; CHECK-NEXT: vzeroupper
1816 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
1817 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1818 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Merge-masked narrowing shuffle: picks elements <2, 1> of %vec. Lanes whose %mask
; element equals zero take the shuffled value; the remaining lanes take %vec2.
1821 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
1822 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask1:
1824 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
1825 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
1826 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
1827 ; CHECK-NEXT: vzeroupper
1829 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1830 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1831 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked narrowing shuffle: picks elements <2, 1> of %vec. Lanes whose %mask
; element equals zero take the shuffled value; the remaining lanes are zero.
1835 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %mask) {
1836 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask1:
1838 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1839 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3]
1840 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1841 ; CHECK-NEXT: vzeroupper
1843 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1844 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1845 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked narrowing shuffle of a 4 x i64 vector loaded from %vp: picks elements <1, 3>.
; These are the high quadwords of the two 128-bit halves, so an unpack-high of
; (%rdi) with 16(%rdi) is expected.
1848 define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp) {
1849 ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0:
1851 ; CHECK-NEXT: vmovaps (%rdi), %xmm0
1852 ; CHECK-NEXT: vunpckhpd 16(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[1]
1854 %vec = load <4 x i64>, <4 x i64>* %vp
1855 %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
; Merge-masked narrowing shuffle of a vector loaded from %vp: picks elements <1, 3>.
; Lanes whose %mask element equals zero take the shuffled value; the remaining lanes
; take %vec2. The expected unpack-high carries the merge mask itself.
1858 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
1859 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask0:
1861 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
1862 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1863 ; CHECK-NEXT: vpunpckhqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[1]
1865 %vec = load <4 x i64>, <4 x i64>* %vp
1866 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
1867 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1868 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked narrowing shuffle of a vector loaded from %vp: picks elements <1, 3>.
; Lanes whose %mask element equals zero take the shuffled value; the remaining lanes
; are zero.
1872 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %mask) {
1873 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask0:
1875 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
1876 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
1877 ; CHECK-NEXT: vpunpckhqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[1]
1879 %vec = load <4 x i64>, <4 x i64>* %vp
1880 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
1881 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1882 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Merge-masked narrowing shuffle of a vector loaded from %vp: picks elements <2, 1>.
; Lanes whose %mask element equals zero take the shuffled value; the remaining lanes
; take %vec2. The expected code builds the shuffle with a dword blend (low qword of
; 16(%rdi), high qword of (%rdi)) and then applies a masked move.
1886 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
1887 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1:
1889 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1890 ; CHECK-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
1891 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1892 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
1894 %vec = load <4 x i64>, <4 x i64>* %vp
1895 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1896 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1897 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked narrowing shuffle of a vector loaded from %vp: picks elements <2, 1>.
; Lanes whose %mask element equals zero take the shuffled value; the remaining lanes
; are zero.
1901 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %mask) {
1902 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1:
1904 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
1905 ; CHECK-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
1906 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
1907 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
1909 %vec = load <4 x i64>, <4 x i64>* %vp
1910 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1911 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1912 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked narrowing shuffle: picks elements <6, 7, 6, 5> of the 8 x i64 %vec.
; All of them are in the upper 256 bits, so an extract followed by a ymm permute with
; indices rebased by 4 is expected.
1916 define <4 x i64> @test_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec) {
1917 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask0:
1919 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
1920 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,1]
1922 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
; Merge-masked narrowing shuffle: picks elements <6, 7, 6, 5> of %vec. Lanes whose
; %mask element equals zero take the shuffled value; the remaining lanes take %vec2.
1925 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1926 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask0:
1928 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1929 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
1930 ; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1]
1931 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1933 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
1934 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1935 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle: picks elements <6, 7, 6, 5> of %vec. Lanes whose
; %mask element equals zero take the shuffled value; the remaining lanes are zero.
1939 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %mask) {
1940 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0:
1942 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1943 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
1944 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1]
1946 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
1947 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1948 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked narrowing shuffle: picks elements <6, 4, 6, 1> of %vec. Lanes whose
; %mask element equals zero take the shuffled value; the remaining lanes take %vec2.
; The two RUN configurations have separate expectations: with
; +fast-variable-crosslane-shuffle a zmm vpermq driven by a loaded index vector,
; without it vextracti64x4 + vpblendd + a ymm vpermq.
1951 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1952 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
1953 ; CHECK-FAST: # %bb.0:
1954 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,4,6,1]
1955 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0
1956 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
1957 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
1958 ; CHECK-FAST-NEXT: retq
1960 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
1961 ; CHECK-FAST-PERLANE: # %bb.0:
1962 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1963 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
1964 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
1965 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,1]
1966 ; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0
1967 ; CHECK-FAST-PERLANE-NEXT: retq
1968 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
1969 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1970 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle: picks elements <6, 4, 6, 1> of %vec. Lanes whose
; %mask element equals zero take the shuffled value; the remaining lanes are zero.
; Expectations differ between the RUN configurations with and without
; +fast-variable-crosslane-shuffle.
1974 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) {
1975 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
1976 ; CHECK-FAST: # %bb.0:
1977 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1]
1978 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
1979 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
1980 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1981 ; CHECK-FAST-NEXT: retq
1983 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
1984 ; CHECK-FAST-PERLANE: # %bb.0:
1985 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1986 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
1987 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
1988 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,1]
1989 ; CHECK-FAST-PERLANE-NEXT: retq
1990 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
1991 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1992 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked narrowing shuffle: picks elements <6, 3, 6, 3> of %vec. Lanes whose
; %mask element equals zero take the shuffled value; the remaining lanes take %vec2.
; Expectations differ between the RUN configurations with and without
; +fast-variable-crosslane-shuffle.
1995 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1996 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
1997 ; CHECK-FAST: # %bb.0:
1998 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,3,6,3]
1999 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0
2000 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
2001 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2002 ; CHECK-FAST-NEXT: retq
2004 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
2005 ; CHECK-FAST-PERLANE: # %bb.0:
2006 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2007 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
2008 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
2009 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
2010 ; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0
2011 ; CHECK-FAST-PERLANE-NEXT: retq
2012 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
2013 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2014 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle: picks elements <6, 3, 6, 3> of %vec. Lanes whose
; %mask element equals zero take the shuffled value; the remaining lanes are zero.
; Expectations differ between the RUN configurations with and without
; +fast-variable-crosslane-shuffle.
2018 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) {
2019 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
2020 ; CHECK-FAST: # %bb.0:
2021 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,3,6,3]
2022 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2023 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2024 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2025 ; CHECK-FAST-NEXT: retq
2027 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
2028 ; CHECK-FAST-PERLANE: # %bb.0:
2029 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2030 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
2031 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2032 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
2033 ; CHECK-FAST-PERLANE-NEXT: retq
2034 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
2035 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2036 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked narrowing shuffle: picks elements <6, 0, 0, 7> of %vec.
; With +fast-variable-crosslane-shuffle a single zmm vpermpd is expected; without it,
; the upper half is extracted, element 0 is blended in, and a ymm vpermpd reorders.
2039 define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
2040 ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask3:
2041 ; CHECK-FAST: # %bb.0:
2042 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [6,0,0,7]
2043 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
2044 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2045 ; CHECK-FAST-NEXT: retq
2047 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask3:
2048 ; CHECK-FAST-PERLANE: # %bb.0:
2049 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1
2050 ; CHECK-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
2051 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,3]
2052 ; CHECK-FAST-PERLANE-NEXT: retq
2053 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
; Merge-masked narrowing shuffle: picks elements <6, 0, 0, 7> of %vec. Lanes whose
; %mask element equals zero take the shuffled value; the remaining lanes take %vec2.
; Expectations differ between the RUN configurations with and without
; +fast-variable-crosslane-shuffle.
2056 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2057 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
2058 ; CHECK-FAST: # %bb.0:
2059 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,0,0,7]
2060 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0
2061 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
2062 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2063 ; CHECK-FAST-NEXT: retq
2065 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
2066 ; CHECK-FAST-PERLANE: # %bb.0:
2067 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2068 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
2069 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
2070 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,0,3]
2071 ; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0
2072 ; CHECK-FAST-PERLANE-NEXT: retq
2073 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
2074 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2075 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle: picks elements <6, 0, 0, 7> of %vec. Lanes whose
; %mask element equals zero take the shuffled value; the remaining lanes are zero.
; Expectations differ between the RUN configurations with and without
; +fast-variable-crosslane-shuffle.
2079 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) {
2080 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
2081 ; CHECK-FAST: # %bb.0:
2082 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7]
2083 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2084 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2085 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2086 ; CHECK-FAST-NEXT: retq
2088 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
2089 ; CHECK-FAST-PERLANE: # %bb.0:
2090 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2091 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
2092 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2093 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,0,3]
2094 ; CHECK-FAST-PERLANE-NEXT: retq
2095 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
2096 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2097 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked narrowing shuffle: picks elements <3, 7, 7, 5> of %vec. Lanes whose
; %mask element equals zero take the shuffled value; the remaining lanes take %vec2.
; Only odd elements are selected, so without +fast-variable-crosslane-shuffle the
; expected code interleaves the high qwords of both halves (vpunpckhqdq) before the
; ymm vpermq; with the feature a single zmm vpermq is expected.
2100 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2101 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
2102 ; CHECK-FAST: # %bb.0:
2103 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,7,7,5]
2104 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0
2105 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
2106 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2107 ; CHECK-FAST-NEXT: retq
2109 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
2110 ; CHECK-FAST-PERLANE: # %bb.0:
2111 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2112 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
2113 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
2114 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,3,1]
2115 ; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0
2116 ; CHECK-FAST-PERLANE-NEXT: retq
2117 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
2118 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2119 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle: %shuf takes lanes <3,7,7,5> of the 8 x i64 %vec;
; %res keeps a %shuf lane where the matching %mask lane is zero and is 0 elsewhere.
; Expected codegen is checked per RUN configuration (CHECK-FAST / CHECK-FAST-PERLANE).
2123 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {
2124 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
2125 ; CHECK-FAST: # %bb.0:
2126 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,7,5]
2127 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2128 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2129 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2130 ; CHECK-FAST-NEXT: retq
2132 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
2133 ; CHECK-FAST-PERLANE: # %bb.0:
2134 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2135 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
2136 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2137 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,3,1]
2138 ; CHECK-FAST-PERLANE-NEXT: retq
2139 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
2140 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2141 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked narrowing shuffle: %shuf takes lanes <4,1,0,6> of the 8 x i64 %vec;
; %res keeps a %shuf lane where the matching %mask lane is zero, else the %vec2 lane.
; Both RUN configurations are checked with the shared CHECK prefix.
2144 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2145 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5:
2147 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,1,0,6]
2148 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
2149 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2150 ; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2152 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
2153 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2154 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle: %shuf takes lanes <4,1,0,6> of the 8 x i64 %vec;
; %res keeps a %shuf lane where the matching %mask lane is zero and is 0 elsewhere.
; Both RUN configurations are checked with the shared CHECK prefix.
2158 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) {
2159 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5:
2161 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,1,0,6]
2162 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2163 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2164 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2166 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
2167 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2168 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked narrowing shuffle: %res takes lanes <7,6,5,3> of the 8 x i64 %vec.
; Expected codegen is checked per RUN configuration (CHECK-FAST / CHECK-FAST-PERLANE).
2171 define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
2172 ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask6:
2173 ; CHECK-FAST: # %bb.0:
2174 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,3]
2175 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
2176 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2177 ; CHECK-FAST-NEXT: retq
2179 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask6:
2180 ; CHECK-FAST-PERLANE: # %bb.0:
2181 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1
2182 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,1,3]
2183 ; CHECK-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2184 ; CHECK-FAST-PERLANE-NEXT: retq
2185 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
; Merge-masked narrowing shuffle: %shuf takes lanes <7,6,5,3> of the 8 x i64 %vec;
; %res keeps a %shuf lane where the matching %mask lane is zero, else the %vec2 lane.
; Expected codegen is checked per RUN configuration (CHECK-FAST / CHECK-FAST-PERLANE).
2188 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2189 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
2190 ; CHECK-FAST: # %bb.0:
2191 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [7,6,5,3]
2192 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0
2193 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
2194 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2195 ; CHECK-FAST-NEXT: retq
2197 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
2198 ; CHECK-FAST-PERLANE: # %bb.0:
2199 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2200 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,2,1,3]
2201 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
2202 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
2203 ; CHECK-FAST-PERLANE-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2204 ; CHECK-FAST-PERLANE-NEXT: retq
2205 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
2206 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2207 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle: %shuf takes lanes <7,6,5,3> of the 8 x i64 %vec;
; %res keeps a %shuf lane where the matching %mask lane is zero and is 0 elsewhere.
; Expected codegen is checked per RUN configuration (CHECK-FAST / CHECK-FAST-PERLANE).
2211 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) {
2212 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
2213 ; CHECK-FAST: # %bb.0:
2214 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,3]
2215 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2216 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2217 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2218 ; CHECK-FAST-NEXT: retq
2220 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
2221 ; CHECK-FAST-PERLANE: # %bb.0:
2222 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2223 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,2,1,3]
2224 ; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
2225 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2226 ; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
2227 ; CHECK-FAST-PERLANE-NEXT: retq
2228 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
2229 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2230 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked narrowing shuffle: %shuf takes lanes <2,0,3,4> of the 8 x i64 %vec;
; %res keeps a %shuf lane where the matching %mask lane is zero, else the %vec2 lane.
; Both RUN configurations are checked with the shared CHECK prefix.
2233 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2234 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:
2236 ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3
2237 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,4]
2238 ; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
2239 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2240 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
2242 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
2243 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2244 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle: %shuf takes lanes <2,0,3,4> of the 8 x i64 %vec;
; %res keeps a %shuf lane where the matching %mask lane is zero and is 0 elsewhere.
; Both RUN configurations are checked with the shared CHECK prefix.
2248 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
2249 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
2251 ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3
2252 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,4]
2253 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2254 ; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
2255 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
2257 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
2258 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2259 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked narrowing shuffle: %res takes lanes <3,0> of the 8 x i64 %vec (2 x i64 result).
; Both RUN configurations are checked with the shared CHECK prefix.
2262 define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
2263 ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mask0:
2265 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
2266 ; CHECK-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
2267 ; CHECK-NEXT: vzeroupper
2269 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
; Merge-masked narrowing shuffle: %shuf takes lanes <3,0> of the 8 x i64 %vec;
; %res keeps a %shuf lane where the matching %mask lane is zero, else the %vec2 lane.
; Both RUN configurations are checked with the shared CHECK prefix.
2272 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
2273 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0:
2275 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
2276 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
2277 ; CHECK-NEXT: valignq {{.*#+}} xmm1 {%k1} = xmm3[1],xmm0[0]
2278 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
2279 ; CHECK-NEXT: vzeroupper
2281 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
2282 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2283 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked narrowing shuffle: %shuf takes lanes <3,0> of the 8 x i64 %vec;
; %res keeps a %shuf lane where the matching %mask lane is zero and is 0 elsewhere.
; Both RUN configurations are checked with the shared CHECK prefix.
2287 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) {
2288 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0:
2290 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
2291 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2292 ; CHECK-NEXT: valignq {{.*#+}} xmm0 {%k1} {z} = xmm2[1],xmm0[0]
2293 ; CHECK-NEXT: vzeroupper
2295 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
2296 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2297 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Merge-masked narrowing shuffle: %shuf takes lanes <6,5> of the 8 x i64 %vec;
; %res keeps a %shuf lane where the matching %mask lane is zero, else the %vec2 lane.
; Both RUN configurations are checked with the shared CHECK prefix.
2300 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
2301 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
2303 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2304 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
2305 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
2306 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
2307 ; CHECK-NEXT: vzeroupper
2309 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
2310 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2311 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked narrowing shuffle: %shuf takes lanes <6,5> of the 8 x i64 %vec;
; %res keeps a %shuf lane where the matching %mask lane is zero and is 0 elsewhere.
; Both RUN configurations are checked with the shared CHECK prefix.
2315 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %mask) {
2316 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
2318 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2319 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2320 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3]
2321 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2322 ; CHECK-NEXT: vzeroupper
2324 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
2325 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2326 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp and
; %res takes lanes <0,2,0,2>. Both RUN configurations share the CHECK prefix.
2329 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp) {
2330 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask0:
2332 ; CHECK-NEXT: vpermpd $136, (%rdi), %ymm0 # ymm0 = mem[0,2,0,2]
2334 %vec = load <8 x i64>, <8 x i64>* %vp
2335 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
; Merge-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <0,2,0,2>; %res keeps a %shuf lane where the %mask lane is zero, else the %vec2 lane.
; Both RUN configurations are checked with the shared CHECK prefix.
2338 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2339 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask0:
2341 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2342 ; CHECK-NEXT: vpermq $136, (%rdi), %ymm0 {%k1} # ymm0 {%k1} = mem[0,2,0,2]
2344 %vec = load <8 x i64>, <8 x i64>* %vp
2345 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2346 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2347 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <0,2,0,2>; %res keeps a %shuf lane where the %mask lane is zero and is 0 elsewhere.
; Both RUN configurations are checked with the shared CHECK prefix.
2351 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %mask) {
2352 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask0:
2354 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2355 ; CHECK-NEXT: vpermq $136, (%rdi), %ymm0 {%k1} {z} # ymm0 {%k1} {z} = mem[0,2,0,2]
2357 %vec = load <8 x i64>, <8 x i64>* %vp
2358 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2359 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2360 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <0,7,6,0>; %res keeps a %shuf lane where the %mask lane is zero, else the %vec2 lane.
; Expected codegen is checked per RUN configuration (CHECK-FAST / CHECK-FAST-PERLANE).
2364 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2365 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
2366 ; CHECK-FAST: # %bb.0:
2367 ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
2368 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,4]
2369 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
2370 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2371 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2372 ; CHECK-FAST-NEXT: retq
2374 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
2375 ; CHECK-FAST-PERLANE: # %bb.0:
2376 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2
2377 ; CHECK-FAST-PERLANE-NEXT: vpblendd $3, (%rdi), %ymm2, %ymm2 # ymm2 = mem[0,1],ymm2[2,3,4,5,6,7]
2378 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2379 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,3,2,0]
2380 ; CHECK-FAST-PERLANE-NEXT: retq
2381 %vec = load <8 x i64>, <8 x i64>* %vp
2382 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
2383 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2384 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <0,7,6,0>; %res keeps a %shuf lane where the %mask lane is zero and is 0 elsewhere.
; Expected codegen is checked per RUN configuration (CHECK-FAST / CHECK-FAST-PERLANE).
2388 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %mask) {
2389 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
2390 ; CHECK-FAST: # %bb.0:
2391 ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
2392 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,4]
2393 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
2394 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2395 ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
2396 ; CHECK-FAST-NEXT: retq
2398 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
2399 ; CHECK-FAST-PERLANE: # %bb.0:
2400 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
2401 ; CHECK-FAST-PERLANE-NEXT: vpblendd $3, (%rdi), %ymm1, %ymm1 # ymm1 = mem[0,1],ymm1[2,3,4,5,6,7]
2402 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
2403 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,3,2,0]
2404 ; CHECK-FAST-PERLANE-NEXT: retq
2405 %vec = load <8 x i64>, <8 x i64>* %vp
2406 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
2407 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2408 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <7,1,1,5>; %res keeps a %shuf lane where the %mask lane is zero, else the %vec2 lane.
; Expected codegen is checked per RUN configuration (CHECK-FAST / CHECK-FAST-PERLANE).
2412 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2413 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
2414 ; CHECK-FAST: # %bb.0:
2415 ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
2416 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,5,5,1]
2417 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
2418 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2419 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2420 ; CHECK-FAST-NEXT: retq
2422 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
2423 ; CHECK-FAST-PERLANE: # %bb.0:
2424 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2
2425 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
2426 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2427 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,0]
2428 ; CHECK-FAST-PERLANE-NEXT: retq
2429 %vec = load <8 x i64>, <8 x i64>* %vp
2430 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
2431 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2432 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <7,1,1,5>; %res keeps a %shuf lane where the %mask lane is zero and is 0 elsewhere.
; Expected codegen is checked per RUN configuration (CHECK-FAST / CHECK-FAST-PERLANE).
2436 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %mask) {
2437 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
2438 ; CHECK-FAST: # %bb.0:
2439 ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
2440 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [3,5,5,1]
2441 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
2442 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2443 ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
2444 ; CHECK-FAST-NEXT: retq
2446 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
2447 ; CHECK-FAST-PERLANE: # %bb.0:
2448 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
2449 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
2450 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
2451 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,0]
2452 ; CHECK-FAST-PERLANE-NEXT: retq
2453 %vec = load <8 x i64>, <8 x i64>* %vp
2454 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
2455 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2456 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp and
; %res takes lanes <7,0,0,2>.
; Expected codegen is checked per RUN configuration (CHECK-FAST / CHECK-FAST-PERLANE).
2460 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp) {
2461 ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
2462 ; CHECK-FAST: # %bb.0:
2463 ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm1
2464 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,0,2]
2465 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0
2466 ; CHECK-FAST-NEXT: retq
2468 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
2469 ; CHECK-FAST-PERLANE: # %bb.0:
2470 ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
2471 ; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm0, %ymm0 # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
2472 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,1,3]
2473 ; CHECK-FAST-PERLANE-NEXT: retq
2474 %vec = load <8 x i64>, <8 x i64>* %vp
2475 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
; Merge-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <7,0,0,2>; %res keeps a %shuf lane where the %mask lane is zero, else the %vec2 lane.
; Expected codegen is checked per RUN configuration (CHECK-FAST / CHECK-FAST-PERLANE).
2478 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2479 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
2480 ; CHECK-FAST: # %bb.0:
2481 ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
2482 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [7,0,0,2]
2483 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2484 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2485 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2486 ; CHECK-FAST-NEXT: retq
2488 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
2489 ; CHECK-FAST-PERLANE: # %bb.0:
2490 ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2
2491 ; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm2, %ymm2 # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
2492 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2493 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,3]
2494 ; CHECK-FAST-PERLANE-NEXT: retq
2495 %vec = load <8 x i64>, <8 x i64>* %vp
2496 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
2497 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2498 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <7,0,0,2>; %res keeps a %shuf lane where the %mask lane is zero and is 0 elsewhere.
; Expected codegen is checked per RUN configuration (CHECK-FAST / CHECK-FAST-PERLANE).
2502 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %mask) {
2503 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
2504 ; CHECK-FAST: # %bb.0:
2505 ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
2506 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,0,2]
2507 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
2508 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2509 ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
2510 ; CHECK-FAST-NEXT: retq
2512 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
2513 ; CHECK-FAST-PERLANE: # %bb.0:
2514 ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1
2515 ; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm1, %ymm1 # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
2516 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
2517 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,3]
2518 ; CHECK-FAST-PERLANE-NEXT: retq
2519 %vec = load <8 x i64>, <8 x i64>* %vp
2520 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
2521 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2522 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <0,4,6,1>; %res keeps a %shuf lane where the %mask lane is zero, else the %vec2 lane.
; Both RUN configurations are checked with the shared CHECK prefix.
2526 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2527 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4:
2529 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2530 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,6,1]
2531 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2532 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2533 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2535 %vec = load <8 x i64>, <8 x i64>* %vp
2536 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
2537 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2538 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <0,4,6,1>; %res keeps a %shuf lane where the %mask lane is zero and is 0 elsewhere.
; Both RUN configurations are checked with the shared CHECK prefix.
2542 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %mask) {
2543 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4:
2545 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2546 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,6,1]
2547 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2548 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2549 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2551 %vec = load <8 x i64>, <8 x i64>* %vp
2552 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
2553 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2554 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <0,2,7,1>; %res keeps a %shuf lane where the %mask lane is zero, else the %vec2 lane.
; Expected codegen is checked per RUN configuration (CHECK-FAST / CHECK-FAST-PERLANE).
2558 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2559 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
2560 ; CHECK-FAST: # %bb.0:
2561 ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
2562 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,7,1]
2563 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2564 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2565 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2566 ; CHECK-FAST-NEXT: retq
2568 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
2569 ; CHECK-FAST-PERLANE: # %bb.0:
2570 ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2
2571 ; CHECK-FAST-PERLANE-NEXT: vpblendd $192, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
2572 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2573 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,2,3,1]
2574 ; CHECK-FAST-PERLANE-NEXT: retq
2575 %vec = load <8 x i64>, <8 x i64>* %vp
2576 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
2577 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2578 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <0,2,7,1>; %res keeps a %shuf lane where the %mask lane is zero and is 0 elsewhere.
; Expected codegen is checked per RUN configuration (CHECK-FAST / CHECK-FAST-PERLANE).
2582 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %mask) {
2583 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
2584 ; CHECK-FAST: # %bb.0:
2585 ; CHECK-FAST-NEXT: vmovdqa (%rdi), %ymm2
2586 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,7,1]
2587 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
2588 ; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2589 ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
2590 ; CHECK-FAST-NEXT: retq
2592 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
2593 ; CHECK-FAST-PERLANE: # %bb.0:
2594 ; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1
2595 ; CHECK-FAST-PERLANE-NEXT: vpblendd $192, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
2596 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
2597 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,2,3,1]
2598 ; CHECK-FAST-PERLANE-NEXT: retq
2599 %vec = load <8 x i64>, <8 x i64>* %vp
2600 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
2601 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2602 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp and
; %res takes lanes <7,2,3,2>. Both RUN configurations share the CHECK prefix.
2606 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp) {
2607 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6:
2609 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
2610 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,2,3,2]
2611 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0
2613 %vec = load <8 x i64>, <8 x i64>* %vp
2614 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
; Merge-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <7,2,3,2>; %res keeps a %shuf lane where the %mask lane is zero, else the %vec2 lane.
; Both RUN configurations are checked with the shared CHECK prefix.
2617 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2618 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6:
2620 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2621 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,2,3,2]
2622 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2623 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2624 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2626 %vec = load <8 x i64>, <8 x i64>* %vp
2627 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
2628 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2629 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <7,2,3,2>; %res keeps a %shuf lane where the %mask lane is zero and is 0 elsewhere.
; Both RUN configurations are checked with the shared CHECK prefix.
2633 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %mask) {
2634 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:
2636 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2637 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,2,3,2]
2638 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2639 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2640 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2642 %vec = load <8 x i64>, <8 x i64>* %vp
2643 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
2644 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2645 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <7,7,5,1>; %res keeps a %shuf lane where the %mask lane is zero, else the %vec2 lane.
; Expected codegen is checked per RUN configuration (CHECK-FAST / CHECK-FAST-PERLANE).
2649 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2650 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
2651 ; CHECK-FAST: # %bb.0:
2652 ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
2653 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,1,5]
2654 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
2655 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
2656 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2657 ; CHECK-FAST-NEXT: retq
2659 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
2660 ; CHECK-FAST-PERLANE: # %bb.0:
2661 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2
2662 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
2663 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
2664 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,2,0,1]
2665 ; CHECK-FAST-PERLANE-NEXT: retq
2666 %vec = load <8 x i64>, <8 x i64>* %vp
2667 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
2668 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2669 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <7,7,5,1>; %res keeps a %shuf lane where the %mask lane is zero and is 0 elsewhere.
; Expected codegen is checked per RUN configuration (CHECK-FAST / CHECK-FAST-PERLANE).
2673 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %mask) {
2674 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
2675 ; CHECK-FAST: # %bb.0:
2676 ; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
2677 ; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,1,5]
2678 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
2679 ; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2680 ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
2681 ; CHECK-FAST-NEXT: retq
2683 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
2684 ; CHECK-FAST-PERLANE: # %bb.0:
2685 ; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
2686 ; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
2687 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
2688 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,2,0,1]
2689 ; CHECK-FAST-PERLANE-NEXT: retq
2690 %vec = load <8 x i64>, <8 x i64>* %vp
2691 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
2692 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2693 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp and
; %res takes lanes <4,1> (2 x i64 result). Both RUN configurations share the CHECK prefix.
2697 define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp) {
2698 ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
2700 ; CHECK-NEXT: vmovaps 32(%rdi), %xmm0
2701 ; CHECK-NEXT: vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3]
2703 %vec = load <8 x i64>, <8 x i64>* %vp
2704 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
; Merge-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <4,1>; %res keeps a %shuf lane where the %mask lane is zero, else the %vec2 lane.
; Both RUN configurations are checked with the shared CHECK prefix.
2707 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
2708 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
2710 ; CHECK-NEXT: vmovdqa 32(%rdi), %xmm2
2711 ; CHECK-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
2712 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2713 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
2715 %vec = load <8 x i64>, <8 x i64>* %vp
2716 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
2717 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2718 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <4,1>; %res keeps a %shuf lane where the %mask lane is zero and is 0 elsewhere.
; Both RUN configurations are checked with the shared CHECK prefix.
2722 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %mask) {
2723 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
2725 ; CHECK-NEXT: vmovdqa 32(%rdi), %xmm1
2726 ; CHECK-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
2727 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
2728 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
2730 %vec = load <8 x i64>, <8 x i64>* %vp
2731 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
2732 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2733 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Merge-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <6,2>; %res keeps a %shuf lane where the %mask lane is zero, else the %vec2 lane.
; Both RUN configurations are checked with the shared CHECK prefix.
2737 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
2738 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
2740 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
2741 ; CHECK-NEXT: vpunpcklqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
2742 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
2743 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2744 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
2745 ; CHECK-NEXT: vzeroupper
2747 %vec = load <8 x i64>, <8 x i64>* %vp
2748 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
2749 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2750 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked narrowing shuffle of a loaded vector: loads 8 x i64 from %vp, %shuf takes
; lanes <6,2>; %res keeps a %shuf lane where the %mask lane is zero and is 0 elsewhere.
; Both RUN configurations are checked with the shared CHECK prefix.
2754 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %mask) {
2755 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
2757 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
2758 ; CHECK-NEXT: vpunpcklqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
2759 ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
2760 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
2761 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
2762 ; CHECK-NEXT: vzeroupper
2764 %vec = load <8 x i64>, <8 x i64>* %vp
2765 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
2766 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2767 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked narrowing shuffle: returns lanes <0,3,4,5> of the 8 x float %vec as 4 x float.
; Both RUN configurations are checked with the shared CHECK prefix.
2771 define <4 x float> @test_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec) {
2772 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask0:
2774 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
2775 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,1]
2776 ; CHECK-NEXT: vzeroupper
2778 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2779 ret <4 x float> %res
; Merge-masked narrowing shuffle: %shuf takes lanes <0,3,4,5> of the 8 x float %vec;
; the result keeps a %shuf lane where the %mask lane compares `fcmp oeq` to zero,
; else the %vec2 lane. Both RUN configurations share the CHECK prefix.
2781 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2782 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0:
2784 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
2785 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
2786 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
2787 ; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,3],xmm3[0,1]
2788 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
2789 ; CHECK-NEXT: vzeroupper
2791 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2792 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2793 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2794 ret <4 x float> %res
; Zero-masked case: picks elements 0, 3, 4, 5 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes become zero.
2797 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %mask) {
2798 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0:
2800 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
2801 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2802 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
2803 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3],xmm2[0,1]
2804 ; CHECK-NEXT: vzeroupper
2806 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2807 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2808 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2809 ret <4 x float> %res
; Merge-masked case: picks elements 1, 3, 5, 0 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes take %vec2.
2811 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2812 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1:
2814 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,0]
2815 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0
2816 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2817 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
2818 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
2819 ; CHECK-NEXT: vzeroupper
2821 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
2822 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2823 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2824 ret <4 x float> %res
; Zero-masked case: picks elements 1, 3, 5, 0 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes become zero.
2827 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) {
2828 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1:
2830 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,0]
2831 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2832 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
2833 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
2834 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2835 ; CHECK-NEXT: vzeroupper
2837 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
2838 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2839 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2840 ret <4 x float> %res
; Merge-masked case: picks elements 3, 2, 7, 0 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes take %vec2.
2842 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2843 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2:
2845 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,2,7,0]
2846 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0
2847 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2848 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
2849 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
2850 ; CHECK-NEXT: vzeroupper
2852 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
2853 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2854 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2855 ret <4 x float> %res
; Zero-masked case: picks elements 3, 2, 7, 0 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes become zero.
2858 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) {
2859 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2:
2861 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,2,7,0]
2862 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2863 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
2864 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
2865 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2866 ; CHECK-NEXT: vzeroupper
2868 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
2869 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2870 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2871 ret <4 x float> %res
; Unmasked case: narrows the <8 x float> %vec to <4 x float> by picking elements 3, 3, 5, 2.
; The expected code is a single variable vpermps on the ymm register.
2873 define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) {
2874 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3:
2876 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,5,2]
2877 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
2878 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2879 ; CHECK-NEXT: vzeroupper
2881 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2882 ret <4 x float> %res
; Merge-masked case: picks elements 3, 3, 5, 2 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes take %vec2.
2884 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2885 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3:
2887 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,3,5,2]
2888 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0
2889 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2890 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
2891 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
2892 ; CHECK-NEXT: vzeroupper
2894 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2895 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2896 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2897 ret <4 x float> %res
; Zero-masked case: picks elements 3, 3, 5, 2 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes become zero.
2900 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) {
2901 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3:
2903 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,5,2]
2904 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2905 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
2906 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
2907 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2908 ; CHECK-NEXT: vzeroupper
2910 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2911 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2912 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2913 ret <4 x float> %res
; Unmasked, memory-source case: loads an <8 x float> from %vp and picks elements 6, 2, 4, 5.
; The expected code loads the upper 16 bytes into a register and folds the lower half as the memory
; operand of vpermi2ps, so the index constant [2,6,0,1] is relative to that operand order.
2915 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp) {
2916 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0:
2918 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
2919 ; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2,6,0,1]
2920 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm1, %xmm0
2922 %vec = load <8 x float>, <8 x float>* %vp
2923 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2924 ret <4 x float> %res
; Merge-masked, memory-source case: loads an <8 x float> from %vp and picks elements 6, 2, 4, 5.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes take %vec2.
2926 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
2927 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0:
2929 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2930 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,0,1]
2931 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3
2932 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
2933 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
2934 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
2936 %vec = load <8 x float>, <8 x float>* %vp
2937 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2938 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2939 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2940 ret <4 x float> %res
; Zero-masked, memory-source case: loads an <8 x float> from %vp and picks elements 6, 2, 4, 5.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes become zero.
2943 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %mask) {
2944 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0:
2946 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2947 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,6,0,1]
2948 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2949 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
2950 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
2951 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
2953 %vec = load <8 x float>, <8 x float>* %vp
2954 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2955 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2956 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2957 ret <4 x float> %res
; Merge-masked, memory-source case: loads an <8 x float> from %vp and picks elements 6, 3, 3, 6.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes take %vec2.
2960 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
2961 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
2963 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2964 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,7,7,2]
2965 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3
2966 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
2967 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
2968 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
2970 %vec = load <8 x float>, <8 x float>* %vp
2971 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
2972 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2973 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2974 ret <4 x float> %res
; Zero-masked, memory-source case: loads an <8 x float> from %vp and picks elements 6, 3, 3, 6.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes become zero.
2977 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %mask) {
2978 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
2980 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2981 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,7,7,2]
2982 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2983 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
2984 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
2985 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
2987 %vec = load <8 x float>, <8 x float>* %vp
2988 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
2989 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2990 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2991 ret <4 x float> %res
; Merge-masked, memory-source case: loads an <8 x float> from %vp and picks elements 3, 1, 3, 7.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes take %vec2.
2994 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
2995 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2:
2997 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
2998 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,1,3,7]
2999 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
3000 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3001 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3002 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3004 %vec = load <8 x float>, <8 x float>* %vp
3005 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
3006 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3007 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3008 ret <4 x float> %res
; Zero-masked, memory-source case: loads an <8 x float> from %vp and picks elements 3, 1, 3, 7.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes become zero.
3011 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %mask) {
3012 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2:
3014 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
3015 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,1,3,7]
3016 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3017 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3018 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
3019 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3021 %vec = load <8 x float>, <8 x float>* %vp
3022 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
3023 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3024 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3025 ret <4 x float> %res
; Unmasked, memory-source case: loads an <8 x float> from %vp and picks elements 1, 3, 5, 3.
; The expected code loads the lower 16 bytes into a register and folds the upper half into vpermi2ps.
3028 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp) {
3029 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3:
3031 ; CHECK-NEXT: vmovaps (%rdi), %xmm1
3032 ; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,3,5,3]
3033 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0
3035 %vec = load <8 x float>, <8 x float>* %vp
3036 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
3037 ret <4 x float> %res
; Merge-masked, memory-source case: loads an <8 x float> from %vp and picks elements 1, 3, 5, 3.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes take %vec2.
3039 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3040 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3:
3042 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
3043 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,3]
3044 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
3045 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3046 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3047 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3049 %vec = load <8 x float>, <8 x float>* %vp
3050 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
3051 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3052 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3053 ret <4 x float> %res
; Zero-masked, memory-source case: loads an <8 x float> from %vp and picks elements 1, 3, 5, 3.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes become zero.
3056 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %mask) {
3057 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3:
3059 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
3060 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [1,3,5,3]
3061 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3062 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3063 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
3064 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3066 %vec = load <8 x float>, <8 x float>* %vp
3067 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
3068 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3069 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3070 ret <4 x float> %res
; Unmasked case: narrows the <16 x float> %vec to <8 x float> by picking elements 0, 4, 12, 10, 8, 2, 11, 7.
; The expected code is a single variable vpermps on the zmm register.
3073 define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) {
3074 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask0:
3076 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,12,10,8,2,11,7]
3077 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
3078 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3080 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
3081 ret <8 x float> %res
; Merge-masked case: picks elements 0, 4, 12, 10, 8, 2, 11, 7 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes take %vec2.
3083 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3084 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0:
3086 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,12,10,8,2,11,7]
3087 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3088 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3089 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
3090 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
3092 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
3093 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3094 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3095 ret <8 x float> %res
; Zero-masked case: picks elements 0, 4, 12, 10, 8, 2, 11, 7 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes become zero.
3098 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %mask) {
3099 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask0:
3101 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7]
3102 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3103 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
3104 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3105 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3107 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
3108 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3109 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3110 ret <8 x float> %res
; Merge-masked case: picks elements 10, 12, 3, 12, 4, 15, 1, 14 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes take %vec2.
3112 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3113 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1:
3115 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [10,12,3,12,4,15,1,14]
3116 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3117 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3118 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
3119 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
3121 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
3122 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3123 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3124 ret <8 x float> %res
; Zero-masked case: picks elements 10, 12, 3, 12, 4, 15, 1, 14 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes become zero.
3127 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %mask) {
3128 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask1:
3130 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [10,12,3,12,4,15,1,14]
3131 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3132 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
3133 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3134 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3136 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
3137 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3138 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3139 ret <8 x float> %res
; Merge-masked case: picks elements 0, 4, 8, 9, 6, 1, 4, 4 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes take %vec2.
3141 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3142 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2:
3144 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4]
3145 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3146 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3147 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
3148 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
3150 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
3151 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3152 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3153 ret <8 x float> %res
; Zero-masked case: picks elements 0, 4, 8, 9, 6, 1, 4, 4 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes become zero.
3156 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) {
3157 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2:
3159 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4]
3160 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3161 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
3162 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3163 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3165 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
3166 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3167 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3168 ret <8 x float> %res
; Unmasked case: narrows the <16 x float> %vec to <8 x float> by picking elements 12, 14, 9, 0, 12, 4, 5, 8.
; The expected code is a single variable vpermps on the zmm register.
3170 define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) {
3171 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask3:
3173 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [12,14,9,0,12,4,5,8]
3174 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
3175 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3177 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3178 ret <8 x float> %res
; Merge-masked case: picks elements 12, 14, 9, 0, 12, 4, 5, 8 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes take %vec2.
3180 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3181 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3:
3183 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [12,14,9,0,12,4,5,8]
3184 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3185 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3186 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
3187 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
3189 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3190 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3191 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3192 ret <8 x float> %res
; Zero-masked case: picks elements 12, 14, 9, 0, 12, 4, 5, 8 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes become zero.
3195 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %mask) {
3196 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask3:
3198 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [12,14,9,0,12,4,5,8]
3199 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3200 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
3201 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3202 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3204 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3205 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3206 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3207 ret <8 x float> %res
; Unmasked case: narrows the <16 x float> %vec to <4 x float> by picking elements 4, 8, 9, 10.
; The expected code is a single variable vpermps on the zmm register.
3209 define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) {
3210 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0:
3212 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,8,9,10]
3213 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
3214 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3215 ; CHECK-NEXT: vzeroupper
3217 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3218 ret <4 x float> %res
; Merge-masked case: picks elements 4, 8, 9, 10 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes take %vec2.
3220 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3221 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0:
3223 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [4,8,9,10]
3224 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3225 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3226 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3227 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3228 ; CHECK-NEXT: vzeroupper
3230 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3231 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3232 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3233 ret <4 x float> %res
; Zero-masked case: picks elements 4, 8, 9, 10 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes become zero.
3236 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) {
3237 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0:
3239 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [4,8,9,10]
3240 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3241 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
3242 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3243 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3244 ; CHECK-NEXT: vzeroupper
3246 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3247 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3248 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3249 ret <4 x float> %res
; Merge-masked case: picks elements 8, 6, 10, 6 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes take %vec2.
; The two run configurations expect different code. With the crosslane-shuffle feature enabled, a single
; zmm vpermps is expected. With only the perlane feature, two 128-bit extracts feed a vpermi2ps whose
; index constant [0,6,2,6] is relative to the two extracted halves.
3251 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3252 ; CHECK-FAST-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
3253 ; CHECK-FAST: # %bb.0:
3254 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm3 = [8,6,10,6]
3255 ; CHECK-FAST-NEXT: vpermps %zmm0, %zmm3, %zmm0
3256 ; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3
3257 ; CHECK-FAST-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3258 ; CHECK-FAST-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3259 ; CHECK-FAST-NEXT: vzeroupper
3260 ; CHECK-FAST-NEXT: retq
3262 ; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
3263 ; CHECK-FAST-PERLANE: # %bb.0:
3264 ; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm3
3265 ; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0
3266 ; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = [0,6,2,6]
3267 ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm0, %xmm3, %xmm4
3268 ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm0, %xmm0, %xmm0
3269 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm0, %xmm2, %k1
3270 ; CHECK-FAST-PERLANE-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1}
3271 ; CHECK-FAST-PERLANE-NEXT: vzeroupper
3272 ; CHECK-FAST-PERLANE-NEXT: retq
3273 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
3274 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3275 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3276 ret <4 x float> %res
; Zero-masked case: picks elements 8, 6, 10, 6 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes become zero.
; The two run configurations expect different code. With the crosslane-shuffle feature enabled, a single
; zero-masked zmm vpermps is expected. With only the perlane feature, two 128-bit extracts feed a
; zero-masked vpermi2ps.
3279 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
3280 ; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
3281 ; CHECK-FAST: # %bb.0:
3282 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [8,6,10,6]
3283 ; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3
3284 ; CHECK-FAST-NEXT: vcmpeqps %xmm3, %xmm1, %k1
3285 ; CHECK-FAST-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3286 ; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3287 ; CHECK-FAST-NEXT: vzeroupper
3288 ; CHECK-FAST-NEXT: retq
3290 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
3291 ; CHECK-FAST-PERLANE: # %bb.0:
3292 ; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm2
3293 ; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm3
3294 ; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,2,6]
3295 ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4
3296 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm4, %xmm1, %k1
3297 ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z}
3298 ; CHECK-FAST-PERLANE-NEXT: vzeroupper
3299 ; CHECK-FAST-PERLANE-NEXT: retq
3300 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
3301 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3302 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3303 ret <4 x float> %res
; Merge-masked case: picks elements 12, 12, 4, 5 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes take %vec2.
; The expected code builds the result in the upper 128 bits of a ymm vshufps and applies the mask on
; the final vextractf32x4.
3305 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3306 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2:
3308 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3309 ; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[0,1],ymm3[4,4],ymm0[4,5]
3310 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3311 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3312 ; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1}
3313 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3314 ; CHECK-NEXT: vzeroupper
3316 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
3317 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3318 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3319 ret <4 x float> %res
; Zero-masked case: picks elements 12, 12, 4, 5 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes become zero.
; The expected code applies the zeroing mask on the final vextractf32x4.
3322 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %mask) {
3323 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2:
3325 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
3326 ; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5]
3327 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3328 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3329 ; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
3330 ; CHECK-NEXT: vzeroupper
3332 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
3333 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3334 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3335 ret <4 x float> %res
; Unmasked case: narrows the <16 x float> %vec to <4 x float> by picking elements 10, 2, 11, 6.
; The expected code is a single variable vpermps on the zmm register.
3337 define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) {
3338 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3:
3340 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [10,2,11,6]
3341 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
3342 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3343 ; CHECK-NEXT: vzeroupper
3345 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3346 ret <4 x float> %res
; Merge-masked case: picks elements 10, 2, 11, 6 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes take %vec2.
3348 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3349 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3:
3351 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [10,2,11,6]
3352 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3353 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3354 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3355 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3356 ; CHECK-NEXT: vzeroupper
3358 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3359 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3360 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3361 ret <4 x float> %res
; Zero-masked case: picks elements 10, 2, 11, 6 of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes become zero.
3364 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) {
3365 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3:
3367 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [10,2,11,6]
3368 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3369 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
3370 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3371 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3372 ; CHECK-NEXT: vzeroupper
3374 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3375 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3376 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3377 ret <4 x float> %res
; Unmasked, memory-source case: loads a <16 x float> from %vp and picks elements 7, 6, 7, 11, 5, 10, 0, 4.
; The expected code loads the lower 32 bytes into a register and folds the upper half into vpermi2ps.
3379 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp) {
3380 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0:
3382 ; CHECK-NEXT: vmovaps (%rdi), %ymm1
3383 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4]
3384 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm0
3386 %vec = load <16 x float>, <16 x float>* %vp
3387 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3388 ret <8 x float> %res
; Merge-masked, memory-source case: loads a <16 x float> from %vp and picks elements 7, 6, 7, 11, 5, 10, 0, 4.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes take %vec2.
3390 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
3391 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0:
3393 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3394 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,6,7,11,5,10,0,4]
3395 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
3396 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3397 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3398 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3400 %vec = load <16 x float>, <16 x float>* %vp
3401 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3402 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3403 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3404 ret <8 x float> %res
; Zero-masked, memory-source case: loads a <16 x float> from %vp and picks elements 7, 6, 7, 11, 5, 10, 0, 4.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes become zero.
3407 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %mask) {
3408 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0:
3410 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3411 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4]
3412 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3413 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3414 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
3415 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
3417 %vec = load <16 x float>, <16 x float>* %vp
3418 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3419 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3420 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3421 ret <8 x float> %res
; Merge-masked, memory-source case: loads a <16 x float> from %vp and picks elements 11, 0, 9, 0, 7, 14, 0, 8.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes take %vec2.
3424 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
3425 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1:
3427 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3428 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [11,0,9,0,7,14,0,8]
3429 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
3430 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3431 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3432 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3434 %vec = load <16 x float>, <16 x float>* %vp
3435 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
3436 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3437 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3438 ret <8 x float> %res
; Zero-masked, memory-source case: loads a <16 x float> from %vp and picks elements 11, 0, 9, 0, 7, 14, 0, 8.
; Lanes where %mask compares ordered-equal to zero take the shuffled value; the other lanes become zero.
3441 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %mask) {
3442 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1:
3444 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3445 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8]
3446 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3447 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3448 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
3449 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
3451 %vec = load <16 x float>, <16 x float>* %vp
3452 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
3453 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3454 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3455 ret <8 x float> %res
; Loads <16 x float> from %vp and extracts elements <1, 13, 10, 11, 10, 0, 0, 9>
; with a single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes keep %vec2.
; Expectations are split per RUN configuration (CHECK-FAST vs
; CHECK-FAST-PERLANE) because the generated code differs between them.
3458 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
3459 ; CHECK-FAST-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
3460 ; CHECK-FAST: # %bb.0:
3461 ; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2
3462 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1]
3463 ; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
3464 ; CHECK-FAST-NEXT: vxorps %xmm2, %xmm2, %xmm2
3465 ; CHECK-FAST-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3466 ; CHECK-FAST-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3467 ; CHECK-FAST-NEXT: retq
3469 ; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
3470 ; CHECK-FAST-PERLANE: # %bb.0:
3471 ; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2
3472 ; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3
3473 ; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm4 = [9,5,2,3,2,8,8,1]
3474 ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4
3475 ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm2, %xmm2, %xmm2
3476 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3477 ; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm0 {%k1}
3478 ; CHECK-FAST-PERLANE-NEXT: retq
3479 %vec = load <16 x float>, <16 x float>* %vp
3480 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
3481 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3482 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3483 ret <8 x float> %res
; Loads <16 x float> from %vp and extracts elements <1, 13, 10, 11, 10, 0, 0, 9>
; with a single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes are zeroed.
; Expectations are split per RUN configuration (CHECK-FAST vs
; CHECK-FAST-PERLANE) because the generated code differs between them.
3486 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %mask) {
3487 ; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
3488 ; CHECK-FAST: # %bb.0:
3489 ; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2
3490 ; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1]
3491 ; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3
3492 ; CHECK-FAST-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3493 ; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3494 ; CHECK-FAST-NEXT: vmovaps %ymm1, %ymm0
3495 ; CHECK-FAST-NEXT: retq
3497 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
3498 ; CHECK-FAST-PERLANE: # %bb.0:
3499 ; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2
3500 ; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3
3501 ; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1]
3502 ; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4
3503 ; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm4, %ymm0, %k1
3504 ; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z}
3505 ; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm0
3506 ; CHECK-FAST-PERLANE-NEXT: retq
3507 %vec = load <16 x float>, <16 x float>* %vp
3508 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
3509 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3510 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3511 ret <8 x float> %res
; Loads <16 x float> from %vp and returns elements
; <15, 13, 11, 11, 3, 12, 4, 1> via a single-source shufflevector (unmasked).
3514 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp) {
3515 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3:
3517 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
3518 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9]
3519 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm0
3521 %vec = load <16 x float>, <16 x float>* %vp
3522 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3523 ret <8 x float> %res
; Loads <16 x float> from %vp and extracts elements
; <15, 13, 11, 11, 3, 12, 4, 1> with a single-source shufflevector. Lanes where
; %mask compares ordered-equal (fcmp oeq) to 0.0 take the shuffled element;
; all other lanes keep %vec2.
3525 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
3526 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3:
3528 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3529 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9]
3530 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
3531 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3532 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3533 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3535 %vec = load <16 x float>, <16 x float>* %vp
3536 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3537 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3538 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3539 ret <8 x float> %res
; Loads <16 x float> from %vp and extracts elements
; <15, 13, 11, 11, 3, 12, 4, 1> with a single-source shufflevector. Lanes where
; %mask compares ordered-equal (fcmp oeq) to 0.0 take the shuffled element;
; all other lanes are zeroed.
3542 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %mask) {
3543 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3:
3545 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3546 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9]
3547 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3548 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3549 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3550 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
3552 %vec = load <16 x float>, <16 x float>* %vp
3553 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3554 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3555 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3556 ret <8 x float> %res
; Loads <16 x float> from %vp and returns elements <14, 6, 7, 11> via a
; single-source shufflevector (unmasked).
3559 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp) {
3560 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0:
3562 ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm1 # ymm1 = mem[3,1,2,3]
3563 ; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,7,3]
3564 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0
3565 ; CHECK-NEXT: vzeroupper
3567 %vec = load <16 x float>, <16 x float>* %vp
3568 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3569 ret <4 x float> %res
; Loads <16 x float> from %vp and extracts elements <14, 6, 7, 11> with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes keep %vec2.
3571 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3572 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0:
3574 ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3]
3575 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [0,6,7,3]
3576 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
3577 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3578 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3579 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3580 ; CHECK-NEXT: vzeroupper
3582 %vec = load <16 x float>, <16 x float>* %vp
3583 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3584 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3585 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3586 ret <4 x float> %res
; Loads <16 x float> from %vp and extracts elements <14, 6, 7, 11> with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes are zeroed.
3589 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %mask) {
3590 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0:
3592 ; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3]
3593 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,7,3]
3594 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3595 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3596 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
3597 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3598 ; CHECK-NEXT: vzeroupper
3600 %vec = load <16 x float>, <16 x float>* %vp
3601 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3602 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3603 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3604 ret <4 x float> %res
; Loads <16 x float> from %vp and extracts elements <8, 2, 14, 7> with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes keep %vec2.
3607 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3608 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1:
3610 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3611 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <0,10,6,15,u,u,u,u>
3612 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
3613 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3614 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3615 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3616 ; CHECK-NEXT: vzeroupper
3618 %vec = load <16 x float>, <16 x float>* %vp
3619 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
3620 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3621 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3622 ret <4 x float> %res
; Loads <16 x float> from %vp and extracts elements <8, 2, 14, 7> with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes are zeroed.
3625 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %mask) {
3626 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1:
3628 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3629 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <0,10,6,15,u,u,u,u>
3630 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3631 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3632 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3633 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3634 ; CHECK-NEXT: vzeroupper
3636 %vec = load <16 x float>, <16 x float>* %vp
3637 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
3638 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3639 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3640 ret <4 x float> %res
; Loads <16 x float> from %vp and extracts elements <12, 6, 12, 6> (a repeated
; pair) with a single-source shufflevector. Lanes where %mask compares
; ordered-equal (fcmp oeq) to 0.0 take the shuffled element; all other lanes
; keep %vec2.
3643 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3644 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
3646 ; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [60129542148,60129542148]
3647 ; CHECK-NEXT: # xmm2 = mem[0,0]
3648 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm3
3649 ; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm3
3650 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3651 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3652 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3653 ; CHECK-NEXT: vzeroupper
3655 %vec = load <16 x float>, <16 x float>* %vp
3656 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
3657 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3658 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3659 ret <4 x float> %res
; Loads <16 x float> from %vp and extracts elements <12, 6, 12, 6> (a repeated
; pair) with a single-source shufflevector. Lanes where %mask compares
; ordered-equal (fcmp oeq) to 0.0 take the shuffled element; all other lanes
; are zeroed.
3662 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %mask) {
3663 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
3665 ; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [60129542148,60129542148]
3666 ; CHECK-NEXT: # xmm2 = mem[0,0]
3667 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
3668 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3669 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3670 ; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3671 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3672 ; CHECK-NEXT: vzeroupper
3674 %vec = load <16 x float>, <16 x float>* %vp
3675 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
3676 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3677 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3678 ret <4 x float> %res
; Loads <16 x float> from %vp and returns elements <3, 3, 15, 9> via a
; single-source shufflevector (unmasked).
3681 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp) {
3682 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3:
3684 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,15,9]
3685 ; CHECK-NEXT: vmovaps (%rdi), %ymm0
3686 ; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm1, %ymm0
3687 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3688 ; CHECK-NEXT: vzeroupper
3690 %vec = load <16 x float>, <16 x float>* %vp
3691 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3692 ret <4 x float> %res
; Loads <16 x float> from %vp and extracts elements <3, 3, 15, 9> with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes keep %vec2.
3694 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3695 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
3697 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9]
3698 ; CHECK-NEXT: vmovaps (%rdi), %ymm3
3699 ; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm3
3700 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3701 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3702 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3703 ; CHECK-NEXT: vzeroupper
3705 %vec = load <16 x float>, <16 x float>* %vp
3706 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3707 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3708 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3709 ret <4 x float> %res
; Loads <16 x float> from %vp and extracts elements <3, 3, 15, 9> with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes are zeroed.
3712 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %mask) {
3713 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
3715 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9]
3716 ; CHECK-NEXT: vmovaps (%rdi), %ymm1
3717 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3718 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3719 ; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
3720 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3721 ; CHECK-NEXT: vzeroupper
3723 %vec = load <16 x float>, <16 x float>* %vp
3724 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3725 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3726 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3727 ret <4 x float> %res
; Returns elements <2, 0> of the <4 x double> argument %vec via a
; single-source shufflevector (unmasked, register operand).
3730 define <2 x double> @test_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec) {
3731 ; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mask0:
3733 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
3734 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3735 ; CHECK-NEXT: vzeroupper
3737 %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3738 ret <2 x double> %res
; Extracts elements <2, 0> of the <4 x double> argument %vec with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes keep %vec2.
3740 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3741 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask0:
3743 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
3744 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3745 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
3746 ; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
3747 ; CHECK-NEXT: vzeroupper
3749 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3750 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3751 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3752 ret <2 x double> %res
; Extracts elements <2, 0> of the <4 x double> argument %vec with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes are zeroed.
3755 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %mask) {
3756 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask0:
3758 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3759 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
3760 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3]
3761 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3762 ; CHECK-NEXT: vzeroupper
3764 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3765 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3766 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3767 ret <2 x double> %res
; Extracts elements <1, 3> of the <4 x double> argument %vec with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes keep %vec2.
3769 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3770 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask1:
3772 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
3773 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3774 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
3775 ; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
3776 ; CHECK-NEXT: vzeroupper
3778 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
3779 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3780 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3781 ret <2 x double> %res
; Extracts elements <1, 3> of the <4 x double> argument %vec with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes are zeroed.
3784 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %mask) {
3785 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask1:
3787 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3788 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
3789 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,2,3]
3790 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3791 ; CHECK-NEXT: vzeroupper
3793 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
3794 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3795 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3796 ret <2 x double> %res
; Loads <4 x double> from %vp and returns elements <2, 1> via a
; single-source shufflevector (unmasked).
3798 define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp) {
3799 ; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0:
3801 ; CHECK-NEXT: vmovaps (%rdi), %xmm0
3802 ; CHECK-NEXT: vblendps $3, 16(%rdi), %xmm0, %xmm0 # xmm0 = mem[0,1],xmm0[2,3]
3804 %vec = load <4 x double>, <4 x double>* %vp
3805 %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3806 ret <2 x double> %res
; Loads <4 x double> from %vp and extracts elements <2, 1> with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes keep %vec2.
3808 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
3809 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask0:
3811 ; CHECK-NEXT: vmovapd (%rdi), %xmm2
3812 ; CHECK-NEXT: vblendpd $1, 16(%rdi), %xmm2, %xmm2 # xmm2 = mem[0],xmm2[1]
3813 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3814 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
3815 ; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1}
3817 %vec = load <4 x double>, <4 x double>* %vp
3818 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3819 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3820 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3821 ret <2 x double> %res
; Loads <4 x double> from %vp and extracts elements <2, 1> with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes are zeroed.
3824 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %mask) {
3825 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0:
3827 ; CHECK-NEXT: vmovapd (%rdi), %xmm1
3828 ; CHECK-NEXT: vblendpd $1, 16(%rdi), %xmm1, %xmm1 # xmm1 = mem[0],xmm1[1]
3829 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3830 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
3831 ; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z}
3833 %vec = load <4 x double>, <4 x double>* %vp
3834 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3835 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3836 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3837 ret <2 x double> %res
; Loads <4 x double> from %vp and extracts elements <2, 0> with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes keep %vec2.
3840 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
3841 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1:
3843 ; CHECK-NEXT: vmovapd 16(%rdi), %xmm2
3844 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3845 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
3846 ; CHECK-NEXT: vunpcklpd (%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
3848 %vec = load <4 x double>, <4 x double>* %vp
3849 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3850 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3851 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3852 ret <2 x double> %res
; Loads <4 x double> from %vp and extracts elements <2, 0> with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes are zeroed.
3855 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %mask) {
3856 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1:
3858 ; CHECK-NEXT: vmovapd 16(%rdi), %xmm1
3859 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3860 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
3861 ; CHECK-NEXT: vunpcklpd (%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
3863 %vec = load <4 x double>, <4 x double>* %vp
3864 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3865 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3866 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3867 ret <2 x double> %res
; Returns elements <7, 3, 7, 3> of the <8 x double> argument %vec via a
; single-source shufflevector (unmasked, register operand).
3870 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
3871 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0:
3873 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,3,7,3]
3874 ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
3875 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3877 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3878 ret <4 x double> %res
; Extracts elements <7, 3, 7, 3> of the <8 x double> argument %vec with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes keep %vec2.
3880 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3881 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
3883 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [7,3,7,3]
3884 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
3885 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3886 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3887 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
3889 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3890 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3891 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3892 ret <4 x double> %res
; Extracts elements <7, 3, 7, 3> of the <8 x double> argument %vec with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes are zeroed.
3895 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) {
3896 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
3898 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [7,3,7,3]
3899 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3900 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
3901 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3902 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3904 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3905 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3906 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3907 ret <4 x double> %res
; Extracts elements <2, 0, 7, 6> of the <8 x double> argument %vec with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes keep %vec2.
3909 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3910 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1:
3912 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,0,7,6]
3913 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
3914 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3915 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3916 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
3918 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
3919 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3920 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3921 ret <4 x double> %res
; Extracts elements <2, 0, 7, 6> of the <8 x double> argument %vec with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes are zeroed.
3924 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %mask) {
3925 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask1:
3927 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,0,7,6]
3928 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3929 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
3930 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3931 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3933 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
3934 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3935 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3936 ret <4 x double> %res
; Extracts elements <2, 3, 2, 0> of the <8 x double> argument %vec (all
; indices fall in the low four elements) with a single-source shufflevector.
; Lanes where %mask compares ordered-equal (fcmp oeq) to 0.0 take the shuffled
; element; all other lanes keep %vec2.
3938 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3939 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask2:
3941 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3942 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3943 ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,0]
3944 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
3946 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
3947 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3948 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3949 ret <4 x double> %res
; Extracts elements <2, 3, 2, 0> of the <8 x double> argument %vec (all
; indices fall in the low four elements) with a single-source shufflevector.
; Lanes where %mask compares ordered-equal (fcmp oeq) to 0.0 take the shuffled
; element; all other lanes are zeroed.
3952 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %mask) {
3953 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask2:
3955 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3956 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
3957 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,0]
3959 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
3960 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3961 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3962 ret <4 x double> %res
; Returns elements <0, 2, 1, 4> of the <8 x double> argument %vec via a
; single-source shufflevector (unmasked, register operand).
3964 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) {
3965 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask3:
3967 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,1,4]
3968 ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
3969 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3971 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
3972 ret <4 x double> %res
; Extracts elements <0, 2, 1, 4> of the <8 x double> argument %vec with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes keep %vec2.
3974 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3975 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3:
3977 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,2,1,4]
3978 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
3979 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3980 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3981 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
3983 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
3984 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3985 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3986 ret <4 x double> %res
; Extracts elements <0, 2, 1, 4> of the <8 x double> argument %vec with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes are zeroed.
3989 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %mask) {
3990 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask3:
3992 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,1,4]
3993 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3994 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
3995 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3996 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3998 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
3999 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4000 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4001 ret <4 x double> %res
; Extracts elements <1, 1, 5, 5> of the <8 x double> argument %vec with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes keep %vec2.
; Expectations are split per RUN configuration (CHECK-FAST vs
; CHECK-FAST-PERLANE) because the generated code differs between them.
4003 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
4004 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
4005 ; CHECK-FAST: # %bb.0:
4006 ; CHECK-FAST-NEXT: vextractf32x4 $2, %zmm0, %xmm3
4007 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm4 = [1,1,5,5]
4008 ; CHECK-FAST-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
4009 ; CHECK-FAST-NEXT: vxorpd %xmm0, %xmm0, %xmm0
4010 ; CHECK-FAST-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
4011 ; CHECK-FAST-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
4012 ; CHECK-FAST-NEXT: retq
4014 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
4015 ; CHECK-FAST-PERLANE: # %bb.0:
4016 ; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm3
4017 ; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
4018 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4019 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
4020 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
4021 ; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0
4022 ; CHECK-FAST-PERLANE-NEXT: retq
4023 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
4024 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4025 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4026 ret <4 x double> %res
; Extracts elements <1, 1, 5, 5> of the <8 x double> argument %vec with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes are zeroed.
; Expectations are split per RUN configuration (CHECK-FAST vs
; CHECK-FAST-PERLANE) because the generated code differs between them.
4029 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
4030 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
4031 ; CHECK-FAST: # %bb.0:
4032 ; CHECK-FAST-NEXT: vextractf32x4 $2, %zmm0, %xmm3
4033 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [1,1,5,5]
4034 ; CHECK-FAST-NEXT: vxorpd %xmm4, %xmm4, %xmm4
4035 ; CHECK-FAST-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
4036 ; CHECK-FAST-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
4037 ; CHECK-FAST-NEXT: vmovapd %ymm2, %ymm0
4038 ; CHECK-FAST-NEXT: retq
4040 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
4041 ; CHECK-FAST-PERLANE: # %bb.0:
4042 ; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm2
4043 ; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
4044 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4045 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4046 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
4047 ; CHECK-FAST-PERLANE-NEXT: retq
4048 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
4049 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4050 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4051 ret <4 x double> %res
; Extracts elements <2, 6, 2, 2> of the <8 x double> argument %vec with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes keep %vec2.
4053 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
4054 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5:
4056 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,6,2,2]
4057 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
4058 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4059 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
4060 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
4062 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
4063 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4064 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4065 ret <4 x double> %res
; Extracts elements <2, 6, 2, 2> of the <8 x double> argument %vec with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes are zeroed.
4068 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) {
4069 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5:
4071 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,6,2,2]
4072 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4073 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4074 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4075 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
4077 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
4078 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4079 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4080 ret <4 x double> %res
; Returns elements <5, 0, 7, 0> of the <8 x double> argument %vec via a
; single-source shufflevector (unmasked, register operand).
; Expectations are split per RUN configuration (CHECK-FAST vs
; CHECK-FAST-PERLANE) because the generated code differs between them.
4082 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
4083 ; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
4084 ; CHECK-FAST: # %bb.0:
4085 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [5,8,7,8]
4086 ; CHECK-FAST-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0
4087 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
4088 ; CHECK-FAST-NEXT: retq
4090 ; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
4091 ; CHECK-FAST-PERLANE: # %bb.0:
4092 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1
4093 ; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
4094 ; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
4095 ; CHECK-FAST-PERLANE-NEXT: retq
4096 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
4097 ret <4 x double> %res
; Extracts elements <5, 0, 7, 0> of the <8 x double> argument %vec with a
; single-source shufflevector. Lanes where %mask compares ordered-equal
; (fcmp oeq) to 0.0 take the shuffled element; all other lanes keep %vec2.
; Expectations are split per RUN configuration (CHECK-FAST vs
; CHECK-FAST-PERLANE) because the generated code differs between them.
4099 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
4100 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
4101 ; CHECK-FAST: # %bb.0:
4102 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [5,8,7,8]
4103 ; CHECK-FAST-NEXT: vpermi2pd %zmm0, %zmm0, %zmm3
4104 ; CHECK-FAST-NEXT: vxorpd %xmm0, %xmm0, %xmm0
4105 ; CHECK-FAST-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
4106 ; CHECK-FAST-NEXT: vblendmpd %ymm3, %ymm1, %ymm0 {%k1}
4107 ; CHECK-FAST-NEXT: retq
4109 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
4110 ; CHECK-FAST-PERLANE: # %bb.0:
4111 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4112 ; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
4113 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
4114 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
4115 ; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
4116 ; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0
4117 ; CHECK-FAST-PERLANE-NEXT: retq
4118 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
4119 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4120 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4121 ret <4 x double> %res
; Zero-masked narrowing shuffle: picks elements 5, 0, 7, 0 of %vec, then per
; lane keeps the shuffled value where %mask compares ordered-equal (fcmp oeq)
; to zero and produces zero elsewhere. The expected assembly differs between
; the two llc configurations.
4124 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
4125 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
4126 ; CHECK-FAST: # %bb.0:
4127 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [5,8,7,8]
4128 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4129 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4130 ; CHECK-FAST-NEXT: vpermt2pd %zmm0, %zmm2, %zmm0 {%k1} {z}
4131 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
4132 ; CHECK-FAST-NEXT: retq
4134 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
4135 ; CHECK-FAST-PERLANE: # %bb.0:
4136 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4137 ; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
4138 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4139 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4140 ; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
4141 ; CHECK-FAST-PERLANE-NEXT: retq
4142 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
4143 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4144 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4145 ret <4 x double> %res
; Merge-masked narrowing shuffle: picks elements 3, 5, 0, 6 of %vec, then per
; lane keeps the shuffled value where %mask compares ordered-equal (fcmp oeq)
; to zero and takes the matching lane of %vec2 elsewhere. The expected
; assembly differs between the two llc configurations.
4147 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
4148 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
4149 ; CHECK-FAST: # %bb.0:
4150 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,5,0,6]
4151 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm0
4152 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4153 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
4154 ; CHECK-FAST-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
4155 ; CHECK-FAST-NEXT: retq
4157 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
4158 ; CHECK-FAST-PERLANE: # %bb.0:
4159 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4160 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,3]
4161 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
4162 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
4163 ; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1],ymm3[1],ymm0[2],ymm3[2]
4164 ; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0
4165 ; CHECK-FAST-PERLANE-NEXT: retq
4166 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
4167 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4168 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4169 ret <4 x double> %res
; Zero-masked narrowing shuffle: picks elements 3, 5, 0, 6 of %vec, then per
; lane keeps the shuffled value where %mask compares ordered-equal (fcmp oeq)
; to zero and produces zero elsewhere. The expected assembly differs between
; the two llc configurations.
4172 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) {
4173 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
4174 ; CHECK-FAST: # %bb.0:
4175 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [3,5,0,6]
4176 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4177 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4178 ; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4179 ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
4180 ; CHECK-FAST-NEXT: retq
4182 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
4183 ; CHECK-FAST-PERLANE: # %bb.0:
4184 ; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4185 ; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,3]
4186 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4187 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4188 ; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm2[1],ymm0[2],ymm2[2]
4189 ; CHECK-FAST-PERLANE-NEXT: retq
4190 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
4191 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4192 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4193 ret <4 x double> %res
; Narrowing shuffle: builds a <2 x double> from elements 0 and 6 of the
; <8 x double> argument %vec, with no masking.
4195 define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
4196 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
4198 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
4199 ; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
4200 ; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4201 ; CHECK-NEXT: vzeroupper
4203 %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
4204 ret <2 x double> %res
; Merge-masked narrowing shuffle: picks elements 0 and 6 of %vec, then per
; lane keeps the shuffled value where %mask compares ordered-equal (fcmp oeq)
; to zero and takes the matching lane of %vec2 elsewhere.
4206 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
4207 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
4209 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4210 ; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm3
4211 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
4212 ; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
4213 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
4214 ; CHECK-NEXT: vmovapd %xmm1, %xmm0
4215 ; CHECK-NEXT: vzeroupper
4217 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
4218 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4219 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4220 ret <2 x double> %res
; Zero-masked narrowing shuffle: picks elements 0 and 6 of %vec, then per
; lane keeps the shuffled value where %mask compares ordered-equal (fcmp oeq)
; to zero and produces zero elsewhere.
4223 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
4224 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
4226 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4227 ; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2
4228 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4229 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4230 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
4231 ; CHECK-NEXT: vzeroupper
4233 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
4234 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4235 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4236 ret <2 x double> %res
; Merge-masked narrowing shuffle: picks elements 3 and 7 of %vec, then per
; lane keeps the shuffled value where %mask compares ordered-equal (fcmp oeq)
; to zero and takes the matching lane of %vec2 elsewhere.
4238 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
4239 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask1:
4241 ; CHECK-NEXT: vmovapd {{.*#+}} xmm3 = [3,7]
4242 ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
4243 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4244 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
4245 ; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
4246 ; CHECK-NEXT: vzeroupper
4248 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
4249 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4250 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4251 ret <2 x double> %res
; Zero-masked narrowing shuffle: picks elements 3 and 7 of %vec, then per
; lane keeps the shuffled value where %mask compares ordered-equal (fcmp oeq)
; to zero and produces zero elsewhere.
4254 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %mask) {
4255 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask1:
4257 ; CHECK-NEXT: vmovapd {{.*#+}} xmm2 = [3,7]
4258 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4259 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4260 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
4261 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
4262 ; CHECK-NEXT: vzeroupper
4264 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
4265 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4266 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4267 ret <2 x double> %res
; Narrowing shuffle with a memory source: loads an <8 x double> from %vp and
; builds a <4 x double> from its elements 1, 6, 7, 2, with no masking.
4269 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp) {
4270 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0:
4272 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
4273 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [1,6,7,2]
4274 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
4276 %vec = load <8 x double>, <8 x double>* %vp
4277 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4278 ret <4 x double> %res
; Merge-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 1, 6, 7, 2, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and takes the matching
; lane of %vec2 elsewhere.
4280 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4281 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0:
4283 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4284 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,6,7,2]
4285 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
4286 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4287 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4288 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4290 %vec = load <8 x double>, <8 x double>* %vp
4291 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4292 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4293 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4294 ret <4 x double> %res
; Zero-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 1, 6, 7, 2, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and produces zero
; elsewhere.
4297 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %mask) {
4298 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0:
4300 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4301 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,6,7,2]
4302 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4303 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4304 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4305 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4307 %vec = load <8 x double>, <8 x double>* %vp
4308 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4309 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4310 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4311 ret <4 x double> %res
; Merge-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 3, 4, 2, 4, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and takes the matching
; lane of %vec2 elsewhere. The expected assembly differs between the two llc
; configurations.
4314 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4315 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
4316 ; CHECK-FAST: # %bb.0:
4317 ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
4318 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [3,4,2,6]
4319 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3
4320 ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4321 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4322 ; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4323 ; CHECK-FAST-NEXT: retq
4325 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
4326 ; CHECK-FAST-PERLANE: # %bb.0:
4327 ; CHECK-FAST-PERLANE-NEXT: vpermpd $236, (%rdi), %ymm2 # ymm2 = mem[0,3,2,3]
4328 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4329 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4330 ; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 32(%rdi){1to4}, %ymm2, %ymm0 {%k1}
4331 ; CHECK-FAST-PERLANE-NEXT: retq
4332 %vec = load <8 x double>, <8 x double>* %vp
4333 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
4334 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4335 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4336 ret <4 x double> %res
; Zero-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 3, 4, 2, 4, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and produces zero
; elsewhere. The expected assembly differs between the two llc configurations.
4339 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %mask) {
4340 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
4341 ; CHECK-FAST: # %bb.0:
4342 ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
4343 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [3,4,2,6]
4344 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4345 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4346 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}
4347 ; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
4348 ; CHECK-FAST-NEXT: retq
4350 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
4351 ; CHECK-FAST-PERLANE: # %bb.0:
4352 ; CHECK-FAST-PERLANE-NEXT: vpermpd $236, (%rdi), %ymm1 # ymm1 = mem[0,3,2,3]
4353 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4354 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
4355 ; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 32(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
4356 ; CHECK-FAST-PERLANE-NEXT: retq
4357 %vec = load <8 x double>, <8 x double>* %vp
4358 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
4359 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4360 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4361 ret <4 x double> %res
; Merge-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 1, 2, 3, 4, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and takes the matching
; lane of %vec2 elsewhere. The expected assembly differs between the two llc
; configurations.
4364 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4365 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
4366 ; CHECK-FAST: # %bb.0:
4367 ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
4368 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [1,2,3,4]
4369 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
4370 ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4371 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4372 ; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4373 ; CHECK-FAST-NEXT: retq
4375 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
4376 ; CHECK-FAST-PERLANE: # %bb.0:
4377 ; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %ymm2
4378 ; CHECK-FAST-PERLANE-NEXT: vperm2f128 $33, 32(%rdi), %ymm2, %ymm3 # ymm3 = ymm2[2,3],mem[0,1]
4379 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
4380 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
4381 ; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm2[1],ymm3[0],ymm2[3],ymm3[2]
4382 ; CHECK-FAST-PERLANE-NEXT: retq
4383 %vec = load <8 x double>, <8 x double>* %vp
4384 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
4385 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4386 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4387 ret <4 x double> %res
; Zero-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 1, 2, 3, 4, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and produces zero
; elsewhere. The expected assembly differs between the two llc configurations.
4390 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %mask) {
4391 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
4392 ; CHECK-FAST: # %bb.0:
4393 ; CHECK-FAST-NEXT: vmovapd (%rdi), %ymm2
4394 ; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [1,2,3,4]
4395 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4396 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4397 ; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4398 ; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
4399 ; CHECK-FAST-NEXT: retq
4401 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
4402 ; CHECK-FAST-PERLANE: # %bb.0:
4403 ; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %ymm1
4404 ; CHECK-FAST-PERLANE-NEXT: vperm2f128 $33, 32(%rdi), %ymm1, %ymm2 # ymm2 = ymm1[2,3],mem[0,1]
4405 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4406 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4407 ; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm1[1],ymm2[0],ymm1[3],ymm2[2]
4408 ; CHECK-FAST-PERLANE-NEXT: retq
4409 %vec = load <8 x double>, <8 x double>* %vp
4410 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
4411 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4412 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4413 ret <4 x double> %res
; Narrowing shuffle with a memory source: loads an <8 x double> from %vp and
; builds a <4 x double> from its elements 4, 2, 1, 0, with no masking.
4416 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp) {
4417 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
4419 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
4420 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [4,2,1,0]
4421 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
4423 %vec = load <8 x double>, <8 x double>* %vp
4424 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4425 ret <4 x double> %res
; Merge-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 4, 2, 1, 0, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and takes the matching
; lane of %vec2 elsewhere.
4427 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4428 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
4430 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4431 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [4,2,1,0]
4432 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
4433 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4434 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4435 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4437 %vec = load <8 x double>, <8 x double>* %vp
4438 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4439 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4440 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4441 ret <4 x double> %res
; Zero-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 4, 2, 1, 0, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and produces zero
; elsewhere.
4444 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %mask) {
4445 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
4447 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4448 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [4,2,1,0]
4449 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4450 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4451 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4452 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4454 %vec = load <8 x double>, <8 x double>* %vp
4455 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4456 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4457 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4458 ret <4 x double> %res
; Merge-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 6, 0, 5, 1, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and takes the matching
; lane of %vec2 elsewhere.
4461 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4462 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4:
4464 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4465 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,4,1,5]
4466 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
4467 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4468 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4469 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4471 %vec = load <8 x double>, <8 x double>* %vp
4472 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
4473 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4474 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4475 ret <4 x double> %res
; Zero-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 6, 0, 5, 1, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and produces zero
; elsewhere.
4478 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %mask) {
4479 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4:
4481 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4482 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [2,4,1,5]
4483 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4484 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4485 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
4486 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4488 %vec = load <8 x double>, <8 x double>* %vp
4489 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
4490 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4491 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4492 ret <4 x double> %res
; Merge-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 2, 5, 5, 5, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and takes the matching
; lane of %vec2 elsewhere.
4495 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4496 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
4498 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4499 ; CHECK-NEXT: vperm2f128 $33, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[2,3],mem[0,1]
4500 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4501 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4502 ; CHECK-NEXT: vshufpd $14, 40(%rdi){1to4}, %ymm2, %ymm0 {%k1}
4504 %vec = load <8 x double>, <8 x double>* %vp
4505 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
4506 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4507 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4508 ret <4 x double> %res
; Zero-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 2, 5, 5, 5, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and produces zero
; elsewhere.
4511 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %mask) {
4512 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
4514 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
4515 ; CHECK-NEXT: vperm2f128 $33, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[2,3],mem[0,1]
4516 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4517 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
4518 ; CHECK-NEXT: vshufpd $14, 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
4520 %vec = load <8 x double>, <8 x double>* %vp
4521 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
4522 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4523 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4524 ret <4 x double> %res
; Narrowing shuffle with a memory source: loads an <8 x double> from %vp and
; builds a <4 x double> from its elements 4, 6, 0, 5, with no masking.
4527 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp) {
4528 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
4530 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm1
4531 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [0,2,4,1]
4532 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm1, %ymm0
4534 %vec = load <8 x double>, <8 x double>* %vp
4535 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4536 ret <4 x double> %res
; Merge-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 4, 6, 0, 5, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and takes the matching
; lane of %vec2 elsewhere.
4538 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4539 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
4541 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4542 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,2,4,1]
4543 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
4544 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4545 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4546 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4548 %vec = load <8 x double>, <8 x double>* %vp
4549 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4550 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4551 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4552 ret <4 x double> %res
; Zero-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 4, 6, 0, 5, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and produces zero
; elsewhere.
4555 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %mask) {
4556 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
4558 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4559 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,1]
4560 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4561 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4562 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
4563 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4565 %vec = load <8 x double>, <8 x double>* %vp
4566 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4567 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4568 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4569 ret <4 x double> %res
; Merge-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 0, 5, 2, 5, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and takes the matching
; lane of %vec2 elsewhere.
4572 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4573 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask7:
4575 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4576 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4577 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4578 ; CHECK-NEXT: vunpcklpd 40(%rdi){1to4}, %ymm2, %ymm0 {%k1}
4580 %vec = load <8 x double>, <8 x double>* %vp
4581 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
4582 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4583 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4584 ret <4 x double> %res
; Zero-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 0, 5, 2, 5, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and produces zero
; elsewhere.
4587 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %mask) {
4588 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7:
4590 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
4591 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4592 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
4593 ; CHECK-NEXT: vunpcklpd 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
4595 %vec = load <8 x double>, <8 x double>* %vp
4596 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
4597 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4598 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4599 ret <4 x double> %res
; Narrowing shuffle with a memory source: loads an <8 x double> from %vp and
; builds a <2 x double> from its elements 1 and 6, with no masking.
4602 define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp) {
4603 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
4605 ; CHECK-NEXT: vmovapd (%rdi), %xmm0
4606 ; CHECK-NEXT: vshufpd $1, 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[0]
4608 %vec = load <8 x double>, <8 x double>* %vp
4609 %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4610 ret <2 x double> %res
; Merge-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 1 and 6, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and takes the matching
; lane of %vec2 elsewhere.
4612 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
4613 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
4615 ; CHECK-NEXT: vmovapd (%rdi), %xmm2
4616 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4617 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4618 ; CHECK-NEXT: vshufpd $1, 48(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[0]
4620 %vec = load <8 x double>, <8 x double>* %vp
4621 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4622 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4623 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4624 ret <2 x double> %res
; Zero-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 1 and 6, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and produces zero
; elsewhere.
4627 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %mask) {
4628 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
4630 ; CHECK-NEXT: vmovapd (%rdi), %xmm1
4631 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4632 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
4633 ; CHECK-NEXT: vshufpd $1, 48(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[0]
4635 %vec = load <8 x double>, <8 x double>* %vp
4636 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4637 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4638 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4639 ret <2 x double> %res
; Merge-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 1 and 4, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and takes the matching
; lane of %vec2 elsewhere.
4642 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
4643 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
4645 ; CHECK-NEXT: vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0]
4646 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4647 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4648 ; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
4650 %vec = load <8 x double>, <8 x double>* %vp
4651 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
4652 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4653 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4654 ret <2 x double> %res
; Zero-masked narrowing shuffle with a memory source: loads an <8 x double>
; from %vp, picks elements 1 and 4, then per lane keeps the shuffled value
; where %mask compares ordered-equal (fcmp oeq) to zero and produces zero
; elsewhere.
4657 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %mask) {
4658 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
4660 ; CHECK-NEXT: vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0]
4661 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4662 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
4663 ; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
4665 %vec = load <8 x double>, <8 x double>* %vp
4666 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
4667 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4668 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4669 ret <2 x double> %res
4673 define void @test_zext_v8i8_to_v8i16(<8 x i8>* %arg, <8 x i16>* %arg1) {
4674 ; CHECK-LABEL: test_zext_v8i8_to_v8i16:
4676 ; CHECK-NEXT: vpmovzxbw (%rdi), %xmm0 # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
4677 ; CHECK-NEXT: vpsllw $8, %xmm0, %xmm0
4678 ; CHECK-NEXT: vmovdqa %xmm0, (%rsi)
4680 %tmp = getelementptr <8 x i8>, <8 x i8>* %arg, i32 0
4681 %tmp2 = load <8 x i8>, <8 x i8>* %tmp
4682 %tmp3 = extractelement <8 x i8> %tmp2, i32 0
4683 %tmp4 = zext i8 %tmp3 to i16
4684 %tmp5 = insertelement <8 x i16> undef, i16 %tmp4, i32 0
4685 %tmp6 = extractelement <8 x i8> %tmp2, i32 1
4686 %tmp7 = zext i8 %tmp6 to i16
4687 %tmp8 = insertelement <8 x i16> %tmp5, i16 %tmp7, i32 1
4688 %tmp9 = extractelement <8 x i8> %tmp2, i32 2
4689 %tmp10 = zext i8 %tmp9 to i16
4690 %tmp11 = insertelement <8 x i16> %tmp8, i16 %tmp10, i32 2
4691 %tmp12 = extractelement <8 x i8> %tmp2, i32 3
4692 %tmp13 = zext i8 %tmp12 to i16
4693 %tmp14 = insertelement <8 x i16> %tmp11, i16 %tmp13, i32 3
4694 %tmp15 = extractelement <8 x i8> %tmp2, i32 4
4695 %tmp16 = zext i8 %tmp15 to i16
4696 %tmp17 = insertelement <8 x i16> %tmp14, i16 %tmp16, i32 4
4697 %tmp18 = extractelement <8 x i8> %tmp2, i32 5
4698 %tmp19 = zext i8 %tmp18 to i16
4699 %tmp20 = insertelement <8 x i16> %tmp17, i16 %tmp19, i32 5
4700 %tmp21 = extractelement <8 x i8> %tmp2, i32 6
4701 %tmp22 = zext i8 %tmp21 to i16
4702 %tmp23 = insertelement <8 x i16> %tmp20, i16 %tmp22, i32 6
4703 %tmp24 = extractelement <8 x i8> %tmp2, i32 7
4704 %tmp25 = zext i8 %tmp24 to i16
4705 %tmp26 = insertelement <8 x i16> %tmp23, i16 %tmp25, i32 7
4706 %tmp27 = shl <8 x i16> %tmp26, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
4707 %tmp28 = getelementptr <8 x i16>, <8 x i16>* %arg1, i32 0
4708 store <8 x i16> %tmp27, <8 x i16>* %tmp28