1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle %s -o - | FileCheck %s
4 ; FIXME: All cases here should be fixed by PR34380
; Plain (unmasked) shuffle: builds an <8 x i16> from lanes of the <16 x i16> %vec.
; Expected code: one vpermw over the full ymm, with the index table loaded by vbroadcasti128.
6 define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
7 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
9 ; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,8,6,12,4,7,9,14,8]
10 ; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
11 ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
12 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
13 ; CHECK-NEXT: vzeroupper
15 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
; Merge-masked shuffle: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vpermw first, then vptestnmw builds %k1 and vpblendmw does the merge.
18 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
19 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
21 ; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,8,6,12,4,7,9,14,8]
22 ; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
23 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
24 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
25 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
26 ; CHECK-NEXT: vzeroupper
28 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
29 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
30 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked shuffle: lanes where %mask is zero take the shuffled value, the rest are zero.
; Expected code: vptestnmw builds %k1 and the zeroing is folded into vpermw via {%k1} {z}.
34 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
35 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
37 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [8,6,12,4,7,9,14,8]
38 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
39 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
40 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
41 ; CHECK-NEXT: vzeroupper
43 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
44 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
45 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked shuffle: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vpermw first, then vptestnmw builds %k1 and vpblendmw does the merge.
48 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
49 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
51 ; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,9,4,14,15,12,14,4,12,9,4,14,15,12,14]
52 ; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
53 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
54 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
55 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
56 ; CHECK-NEXT: vzeroupper
58 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
59 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
60 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked shuffle: lanes where %mask is zero take the shuffled value, the rest are zero.
; Expected code: vptestnmw builds %k1 and the zeroing is folded into vpermw via {%k1} {z}.
64 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
65 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
67 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,12,9,4,14,15,12,14]
68 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
69 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
70 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
71 ; CHECK-NEXT: vzeroupper
73 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
74 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
75 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked shuffle: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vpermw first, then vptestnmw builds %k1 and vpblendmw does the merge.
78 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
79 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
81 ; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,4,11,14,10,7,1,6,9]
82 ; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
83 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
84 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
85 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
86 ; CHECK-NEXT: vzeroupper
88 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
89 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
90 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked shuffle: lanes where %mask is zero take the shuffled value, the rest are zero.
; Expected code: vptestnmw builds %k1 and the zeroing is folded into vpermw via {%k1} {z}.
94 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) {
95 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2:
97 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11,14,10,7,1,6,9]
98 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
99 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
100 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
101 ; CHECK-NEXT: vzeroupper
103 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
104 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
105 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Plain (unmasked) shuffle: builds an <8 x i16> from lanes of the <16 x i16> %vec.
; Expected code: one vpermw over the full ymm, with the index table loaded by vbroadcasti128.
108 define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
109 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask3:
111 ; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [14,15,7,13,4,12,8,0,14,15,7,13,4,12,8,0]
112 ; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
113 ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
114 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
115 ; CHECK-NEXT: vzeroupper
117 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
; Merge-masked shuffle: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vpermw first, then vptestnmw builds %k1 and vpblendmw does the merge.
120 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
121 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3:
123 ; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [14,15,7,13,4,12,8,0,14,15,7,13,4,12,8,0]
124 ; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
125 ; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
126 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
127 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
128 ; CHECK-NEXT: vzeroupper
130 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
131 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
132 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked shuffle: lanes where %mask is zero take the shuffled value, the rest are zero.
; Expected code: vptestnmw builds %k1 and the zeroing is folded into vpermw via {%k1} {z}.
136 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %mask) {
137 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask3:
139 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,7,13,4,12,8,0]
140 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
141 ; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
142 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
143 ; CHECK-NEXT: vzeroupper
145 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
146 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
147 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Plain (unmasked) shuffle of a <16 x i16> loaded from %vp down to <8 x i16>.
; Expected code: vpermi2w over the two 128-bit halves, the upper one read directly from 16(%rdi).
150 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp) {
151 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
153 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
154 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9]
155 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0
157 %vec = load <16 x i16>, <16 x i16>* %vp
158 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
; Merge-masked shuffle of a loaded <16 x i16>: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vpermi2w with a memory operand, then a masked vmovdqu16 into %xmm0.
161 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
162 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
164 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
165 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,13,3,5,13,3,9]
166 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
167 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
168 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
170 %vec = load <16 x i16>, <16 x i16>* %vp
171 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
172 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
173 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked shuffle of a loaded <16 x i16>: lanes where %mask is zero take the shuffled value, the rest are zero.
; Expected code: the mask is folded into vpermi2w via {%k1} {z}, then the result is copied into %xmm0.
177 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %mask) {
178 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
180 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
181 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9]
182 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
183 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
184 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
186 %vec = load <16 x i16>, <16 x i16>* %vp
187 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
188 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
189 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked shuffle of a loaded <16 x i16>: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vpermi2w with a memory operand, then a masked vmovdqu16 into %xmm0.
193 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
194 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
196 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
197 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,15,12,7,1,5,8,14]
198 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
199 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
200 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
202 %vec = load <16 x i16>, <16 x i16>* %vp
203 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
204 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
205 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked shuffle of a loaded <16 x i16>: lanes where %mask is zero take the shuffled value, the rest are zero.
; Expected code: the mask is folded into vpermi2w via {%k1} {z}, then the result is copied into %xmm0.
209 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %mask) {
210 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
212 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
213 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14]
214 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
215 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
216 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
218 %vec = load <16 x i16>, <16 x i16>* %vp
219 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
220 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
221 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked shuffle of a loaded <16 x i16>: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vpermi2w with a memory operand, then a masked vmovdqu16 into %xmm0.
; The halves are passed in swapped order here (16(%rdi) in a register, (%rdi) as the memory
; operand), so each entry of the index table is the IR shuffle index XOR 8.
225 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
226 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
228 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
229 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [9,0,3,0,5,0,7,1]
230 ; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm3
231 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
232 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
234 %vec = load <16 x i16>, <16 x i16>* %vp
235 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
236 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
237 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked shuffle of a loaded <16 x i16>: lanes where %mask is zero take the shuffled value, the rest are zero.
; Expected code: the mask is folded into vpermi2w via {%k1} {z}, then the result is copied into %xmm0.
; The halves are passed in swapped order here (16(%rdi) in a register, (%rdi) as the memory
; operand), so each entry of the index table is the IR shuffle index XOR 8.
241 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %mask) {
242 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
244 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
245 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [9,0,3,0,5,0,7,1]
246 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
247 ; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z}
248 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
250 %vec = load <16 x i16>, <16 x i16>* %vp
251 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
252 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
253 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Plain (unmasked) shuffle of a <16 x i16> loaded from %vp down to <8 x i16>.
; Expected code: vpermi2w over the two 128-bit halves, the upper one read directly from 16(%rdi).
257 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp) {
258 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
260 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
261 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2]
262 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm1, %xmm0
264 %vec = load <16 x i16>, <16 x i16>* %vp
265 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
; Merge-masked shuffle of a loaded <16 x i16>: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vpermi2w with a memory operand, then a masked vmovdqu16 into %xmm0.
268 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
269 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
271 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
272 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [9,7,9,6,9,4,3,2]
273 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3
274 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
275 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
277 %vec = load <16 x i16>, <16 x i16>* %vp
278 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
279 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
280 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked shuffle of a loaded <16 x i16>: lanes where %mask is zero take the shuffled value, the rest are zero.
; Expected code: the mask is folded into vpermi2w via {%k1} {z}, then the result is copied into %xmm0.
284 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %mask) {
285 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
287 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
288 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2]
289 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
290 ; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z}
291 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
293 %vec = load <16 x i16>, <16 x i16>* %vp
294 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
295 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
296 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Plain (unmasked) shuffle: builds a <16 x i16> from lanes of the <32 x i16> %vec.
; Expected code: vextracti64x4 pulls out the upper 256 bits, then vpermi2w selects across both halves.
; The extracted upper half is the first table operand here, so each entry of the index
; table is the IR shuffle index XOR 16.
300 define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) {
301 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask0:
303 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
304 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
305 ; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1
306 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
308 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
; Merge-masked shuffle: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vextracti64x4 + vpermi2w, then vptestnmw builds %k1 and vpblendmw does the merge.
; The extracted upper half is the first table operand here, so each entry of the index
; table is the IR shuffle index XOR 16.
311 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
312 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0:
314 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
315 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
316 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
317 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
318 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
320 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
321 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
322 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked shuffle: lanes where %mask is zero take the shuffled value, the rest are zero.
; Expected code: vextracti64x4, then vpermi2w with the mask folded in via {%k1} {z}.
; The extracted upper half is the first table operand here, so each entry of the index
; table is the IR shuffle index XOR 16.
326 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %mask) {
327 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0:
329 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
330 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
331 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
332 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
333 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
335 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
336 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
337 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked shuffle: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vextracti64x4 + vpermi2w, then vptestnmw builds %k1 and vpblendmw does the merge.
; The extracted upper half is the first table operand here, so each entry of the index
; table is the IR shuffle index XOR 16.
340 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
341 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1:
343 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
344 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
345 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
346 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
347 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
349 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
350 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
351 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked shuffle: lanes where %mask is zero take the shuffled value, the rest are zero.
; Expected code: vextracti64x4, then vpermi2w with the mask folded in via {%k1} {z}.
; The extracted upper half is the first table operand here, so each entry of the index
; table is the IR shuffle index XOR 16.
355 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %mask) {
356 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1:
358 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
359 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
360 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
361 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
362 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
364 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
365 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
366 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked shuffle: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vextracti64x4 + vpermi2w, then vptestnmw builds %k1 and vpblendmw does the merge.
; The extracted upper half is the first table operand here, so each entry of the index
; table is the IR shuffle index XOR 16.
369 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
370 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2:
372 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
373 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
374 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
375 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
376 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
378 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
379 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
380 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked shuffle: lanes where %mask is zero take the shuffled value, the rest are zero.
; Expected code: vextracti64x4, then vpermi2w with the mask folded in via {%k1} {z}.
; The extracted upper half is the first table operand here, so each entry of the index
; table is the IR shuffle index XOR 16.
384 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %mask) {
385 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2:
387 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
388 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
389 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
390 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
391 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
393 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
394 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
395 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Plain (unmasked) shuffle: builds a <16 x i16> from lanes of the <32 x i16> %vec.
; Expected code: vextracti64x4 pulls out the upper 256 bits, then vpermi2w selects across both halves.
; The lower half is the first table operand here, so the index table matches the IR mask verbatim.
398 define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
399 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3:
401 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
402 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
403 ; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1
404 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
406 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
; Merge-masked shuffle: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vextracti64x4 + vpermi2w, then vptestnmw builds %k1 and vpblendmw does the merge.
; The lower half is the first table operand here, so the index table matches the IR mask verbatim.
409 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
410 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3:
412 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
413 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
414 ; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
415 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
416 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
418 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
419 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
420 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked shuffle: lanes where %mask is zero take the shuffled value, the rest are zero.
; Expected code: vextracti64x4, then vpermi2w with the mask folded in via {%k1} {z}.
; The lower half is the first table operand here, so the index table matches the IR mask verbatim.
424 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) {
425 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3:
427 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
428 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
429 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
430 ; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm2 {%k1} {z}
431 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
433 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
434 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
435 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Plain (unmasked) shuffle: builds an <8 x i16> from lanes of the <32 x i16> %vec.
; Expected code: vextracti64x4 pulls out the upper 256 bits, then vpermt2w selects across both halves.
; The extracted upper half is the overwritten table operand here, so each entry of the
; index table is the IR shuffle index XOR 16.
438 define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) {
439 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0:
441 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14]
442 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
443 ; CHECK-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
444 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
445 ; CHECK-NEXT: vzeroupper
447 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
; Merge-masked shuffle: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vextracti64x4 + vpermt2w, then vptestnmw builds %k1 and vpblendmw does the merge.
; The extracted upper half is the overwritten table operand here, so each entry of the
; index table is the IR shuffle index XOR 16.
450 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
451 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0:
453 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
454 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
455 ; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm4
456 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
457 ; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
458 ; CHECK-NEXT: vzeroupper
460 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
461 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
462 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked shuffle: lanes where %mask is zero take the shuffled value, the rest are zero.
; Expected code: vextracti64x4, then vpermt2w with the mask folded in via {%k1} {z}.
; The extracted upper half is the overwritten table operand here, so each entry of the
; index table is the IR shuffle index XOR 16.
466 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) {
467 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0:
469 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
470 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
471 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
472 ; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm2 {%k1} {z}
473 ; CHECK-NEXT: vmovdqa %xmm2, %xmm0
474 ; CHECK-NEXT: vzeroupper
476 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
477 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
478 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked shuffle: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vextracti64x4 + vpermt2w, then vptestnmw builds %k1 and vpblendmw does the merge.
; The index table matches the IR mask verbatim.
481 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
482 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1:
484 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,21,27,10,8,19,14,5]
485 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
486 ; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0
487 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
488 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
489 ; CHECK-NEXT: vzeroupper
491 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
492 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
493 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked shuffle: lanes where %mask is zero take the shuffled value, the rest are zero.
; Expected code: vextracti64x4, then vpermt2w with the mask folded in via {%k1} {z}.
; The index table matches the IR mask verbatim.
497 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %mask) {
498 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1:
500 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,21,27,10,8,19,14,5]
501 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
502 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
503 ; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
504 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
505 ; CHECK-NEXT: vzeroupper
507 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
508 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
509 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked shuffle: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vextracti64x4 + vpermt2w, then vptestnmw builds %k1 and vpblendmw does the merge.
; The index table matches the IR mask verbatim.
512 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
513 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2:
515 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [15,13,18,16,9,11,26,8]
516 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
517 ; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0
518 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
519 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
520 ; CHECK-NEXT: vzeroupper
522 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
523 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
524 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked shuffle: lanes where %mask is zero take the shuffled value, the rest are zero.
; Expected code: vextracti64x4, then vpermt2w with the mask folded in via {%k1} {z}.
; The index table matches the IR mask verbatim.
528 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %mask) {
529 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2:
531 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [15,13,18,16,9,11,26,8]
532 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
533 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
534 ; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
535 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
536 ; CHECK-NEXT: vzeroupper
538 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
539 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
540 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Plain (unmasked) shuffle: builds an <8 x i16> from lanes of the <32 x i16> %vec.
; Expected code: vextracti64x4 pulls out the upper 256 bits, then vpermt2w selects across both halves.
; The index table matches the IR mask verbatim.
543 define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) {
544 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask3:
546 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [17,0,23,10,1,8,7,30]
547 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
548 ; CHECK-NEXT: vpermt2w %ymm2, %ymm1, %ymm0
549 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
550 ; CHECK-NEXT: vzeroupper
552 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
; Merge-masked shuffle: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vextracti64x4 + vpermt2w, then vptestnmw builds %k1 and vpblendmw does the merge.
; The index table matches the IR mask verbatim.
555 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
556 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3:
558 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [17,0,23,10,1,8,7,30]
559 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
560 ; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0
561 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
562 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
563 ; CHECK-NEXT: vzeroupper
565 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
566 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
567 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked shuffle: lanes where %mask is zero take the shuffled value, the rest are zero.
; Expected code: vextracti64x4, then vpermt2w with the mask folded in via {%k1} {z}.
; The index table matches the IR mask verbatim.
571 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %mask) {
572 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3:
574 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [17,0,23,10,1,8,7,30]
575 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
576 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
577 ; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
578 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
579 ; CHECK-NEXT: vzeroupper
581 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
582 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
583 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Plain (unmasked) shuffle of a <32 x i16> loaded from %vp down to <16 x i16>.
; Expected code: vpermi2w over the two 256-bit halves, the upper one read directly from 32(%rdi).
586 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp) {
587 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0:
589 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
590 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
591 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
593 %vec = load <32 x i16>, <32 x i16>* %vp
594 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
; Merge-masked shuffle of a loaded <32 x i16>: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vpermi2w with a memory operand, then a masked vmovdqu16 into %ymm0.
597 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
598 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0:
600 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
601 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
602 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
603 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
604 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
606 %vec = load <32 x i16>, <32 x i16>* %vp
607 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
608 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
609 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked shuffle of a loaded <32 x i16>: lanes where %mask is zero take the shuffled value, the rest are zero.
; Expected code: the mask is folded into vpermi2w via {%k1} {z}, then the result is copied into %ymm0.
613 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %mask) {
614 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0:
616 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
617 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
618 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
619 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
620 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
622 %vec = load <32 x i16>, <32 x i16>* %vp
623 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
624 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
625 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked shuffle of a loaded <32 x i16>: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vpermi2w with a memory operand, then a masked vmovdqu16 into %ymm0.
629 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
630 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1:
632 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
633 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
634 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
635 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
636 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
638 %vec = load <32 x i16>, <32 x i16>* %vp
639 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
640 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
641 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked shuffle of a loaded <32 x i16>: lanes where %mask is zero take the shuffled value, the rest are zero.
; Expected code: the mask is folded into vpermi2w via {%k1} {z}, then the result is copied into %ymm0.
645 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %mask) {
646 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1:
648 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
649 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
650 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
651 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
652 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
654 %vec = load <32 x i16>, <32 x i16>* %vp
655 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
656 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
657 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked shuffle of a loaded <32 x i16>: lanes where %mask is zero take the shuffled value, the rest come from %vec2.
; Expected code: vpermi2w with a memory operand, then a masked vmovdqu16 into %ymm0.
; The halves are passed in swapped order here (32(%rdi) in a register, (%rdi) as the memory
; operand), so each entry of the index table is the IR shuffle index XOR 16.
661 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
662 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2:
664 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
665 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
666 ; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3
667 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
668 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
670 %vec = load <32 x i16>, <32 x i16>* %vp
671 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
672 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
673 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked form of the mem_mask2 shuffle: lanes whose %mask element equals 0 keep the
; shuffled value, all other lanes are 0.
; As in the merge-masked form, the assertion index constant is the IR mask with every index
; XOR 16 because 32(%rdi) is loaded as the register source and (%rdi) is the folded operand.
677 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %mask) {
678 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2:
680 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
681 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
682 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
683 ; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm1 {%k1} {z}
684 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
686 %vec = load <32 x i16>, <32 x i16>* %vp
687 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
688 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
689 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Unmasked 32xi16 -> 16xi16 shuffle of a vector loaded from %vp.
; The assertions expect one vpermi2w whose index constant is the IR mask unchanged: the low
; half at (%rdi) is the register source and the half at 32(%rdi) is folded from memory.
693 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp) {
694 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3:
696 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
697 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
698 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
700 %vec = load <32 x i16>, <32 x i16>* %vp
701 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
; Merge-masked form of the mem_mask3 shuffle: a result lane takes the shuffled element where
; the matching %mask lane equals 0 and the %vec2 lane otherwise.
; The assertions expect an unmasked vpermi2w followed by a masked vmovdqu16 into %ymm0.
704 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
705 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3:
707 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
708 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
709 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
710 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
711 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
713 %vec = load <32 x i16>, <32 x i16>* %vp
714 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
715 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
716 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked form of the mem_mask3 shuffle: lanes whose %mask element equals 0 keep the
; shuffled value, all other lanes are 0.
; The assertions expect the {%k1} {z} mask to be applied on vpermi2w directly.
720 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %mask) {
721 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3:
723 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
724 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
725 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
726 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
727 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
729 %vec = load <32 x i16>, <32 x i16>* %vp
730 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
731 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
732 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Unmasked 32xi16 -> 8xi16 shuffle of a vector loaded from %vp.
; The expected vpermt2w takes 32(%rdi) as its register table and folds (%rdi), so the
; assertion index constant is the IR mask with every index XOR 16. Only the low xmm part of
; the ymm result is returned.
736 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp) {
737 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0:
739 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [16,17,5,1,14,14,13,17]
740 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0
741 ; CHECK-NEXT: vpermt2w (%rdi), %ymm1, %ymm0
742 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
743 ; CHECK-NEXT: vzeroupper
745 %vec = load <32 x i16>, <32 x i16>* %vp
746 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
; Merge-masked form of the 8xi16 mem_mask0 shuffle: a result lane takes the shuffled element
; where the matching %mask lane equals 0 and the %vec2 lane otherwise.
; The assertions expect an unmasked vpermt2w (index constant = IR mask XOR 16, since
; 32(%rdi) is the register table) followed by a masked vmovdqu16 into %xmm0.
749 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
750 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0:
752 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17]
753 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3
754 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3
755 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
756 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
757 ; CHECK-NEXT: vzeroupper
759 %vec = load <32 x i16>, <32 x i16>* %vp
760 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
761 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
762 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked form of the 8xi16 mem_mask0 shuffle: lanes whose %mask element equals 0 keep
; the shuffled value, all other lanes are 0.
; The assertions expect the {%k1} {z} mask on vpermt2w itself, then a copy of the low xmm
; part into %xmm0.
766 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %mask) {
767 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0:
769 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17]
770 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
771 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
772 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z}
773 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
774 ; CHECK-NEXT: vzeroupper
776 %vec = load <32 x i16>, <32 x i16>* %vp
777 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
778 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
779 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 32xi16 -> 8xi16 shuffle of a vector loaded from %vp: a result lane takes the
; shuffled element where the matching %mask lane equals 0 and the %vec2 lane otherwise.
; 32(%rdi) is the register table of the expected vpermt2w, so the assertion index constant
; is the IR mask with every index XOR 16.
783 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
784 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1:
786 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
787 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3
788 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3
789 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
790 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
791 ; CHECK-NEXT: vzeroupper
793 %vec = load <32 x i16>, <32 x i16>* %vp
794 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
795 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
796 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked form of the 8xi16 mem_mask1 shuffle: lanes whose %mask element equals 0 keep
; the shuffled value, all other lanes are 0.
; The assertions expect the {%k1} {z} mask on vpermt2w, with the index constant equal to the
; IR mask XOR 16 (32(%rdi) is the register table, (%rdi) the folded operand).
800 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %mask) {
801 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1:
803 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
804 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
805 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
806 ; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z}
807 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
808 ; CHECK-NEXT: vzeroupper
810 %vec = load <32 x i16>, <32 x i16>* %vp
811 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
812 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
813 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 32xi16 -> 8xi16 shuffle of a vector loaded from %vp: a result lane takes the
; shuffled element where the matching %mask lane equals 0 and the %vec2 lane otherwise.
; Here (%rdi) is the register table and 32(%rdi) is folded, so the assertion index constant
; equals the IR mask.
817 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
818 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2:
820 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10]
821 ; CHECK-NEXT: vmovdqa (%rdi), %ymm3
822 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3
823 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
824 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
825 ; CHECK-NEXT: vzeroupper
827 %vec = load <32 x i16>, <32 x i16>* %vp
828 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
829 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
830 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked form of the 8xi16 mem_mask2 shuffle: lanes whose %mask element equals 0 keep
; the shuffled value, all other lanes are 0.
; The assertions expect the {%k1} {z} mask on vpermt2w with 32(%rdi) folded from memory.
834 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %mask) {
835 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2:
837 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10]
838 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
839 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
840 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
841 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
842 ; CHECK-NEXT: vzeroupper
844 %vec = load <32 x i16>, <32 x i16>* %vp
845 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
846 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
847 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 32xi16 -> 8xi16 shuffle of a vector loaded from %vp.
; The assertions expect a single vpermt2w with (%rdi) as the register table and 32(%rdi)
; folded from memory, so the index constant equals the IR mask; only the low xmm part of the
; result is returned.
851 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp) {
852 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3:
854 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9]
855 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
856 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm1, %ymm0
857 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
858 ; CHECK-NEXT: vzeroupper
860 %vec = load <32 x i16>, <32 x i16>* %vp
861 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
; Merge-masked form of the 8xi16 mem_mask3 shuffle: a result lane takes the shuffled element
; where the matching %mask lane equals 0 and the %vec2 lane otherwise.
; The assertions expect an unmasked vpermt2w followed by a masked vmovdqu16 into %xmm0.
864 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
865 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3:
867 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9]
868 ; CHECK-NEXT: vmovdqa (%rdi), %ymm3
869 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3
870 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
871 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
872 ; CHECK-NEXT: vzeroupper
874 %vec = load <32 x i16>, <32 x i16>* %vp
875 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
876 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
877 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked form of the 8xi16 mem_mask3 shuffle: lanes whose %mask element equals 0 keep
; the shuffled value, all other lanes are 0.
; The assertions expect the {%k1} {z} mask on vpermt2w, then a copy of the low xmm part
; into %xmm0.
881 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %mask) {
882 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3:
884 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9]
885 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
886 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
887 ; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
888 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
889 ; CHECK-NEXT: vzeroupper
891 %vec = load <32 x i16>, <32 x i16>* %vp
892 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
893 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
894 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 16xi16 -> 8xi16 shuffle of a register argument.
; The assertions expect the 8-entry index vector to be broadcast to both 128-bit halves of
; %ymm1 and used by a single vpermw; only the low xmm part of the result is returned.
898 define <8 x i16> @test_16xi16_to_8xi16_E84C94EF(<16 x i16> %vec) {
899 ; CHECK-LABEL: test_16xi16_to_8xi16_E84C94EF:
901 ; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [14,8,4,12,9,4,14,15,14,8,4,12,9,4,14,15]
902 ; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
903 ; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
904 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
905 ; CHECK-NEXT: vzeroupper
907 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15>
; Unmasked 8xi32 -> 4xi32 shuffle of a register argument, selecting lanes <4, 0, 3, 2>.
; The assertions expect a single vpermps on the ymm register, with only the low xmm part of
; the result returned.
911 define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) {
912 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0:
914 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,3,2]
915 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
916 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
917 ; CHECK-NEXT: vzeroupper
919 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
; Merge-masked 8xi32 -> 4xi32 shuffle <4, 0, 3, 2>: a result lane takes the shuffled element
; where the matching %mask lane equals 0 and the %vec2 lane otherwise.
; The assertions expect an unmasked vpermd followed by a vpblendmd under %k1.
922 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
923 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0:
925 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,0,3,2]
926 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
927 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
928 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
929 ; CHECK-NEXT: vzeroupper
931 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
932 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
933 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle <4, 0, 3, 2>: lanes whose %mask element equals 0 keep
; the shuffled value, all other lanes are 0.
; The assertions expect the {%k1} {z} mask to be applied on vpermd directly.
937 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %mask) {
938 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0:
940 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,3,2]
941 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
942 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
943 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
944 ; CHECK-NEXT: vzeroupper
946 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
947 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
948 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 8xi32 -> 4xi32 shuffle <3, 0, 7, 3>: a result lane takes the shuffled element
; where the matching %mask lane equals 0 and the %vec2 lane otherwise.
; The assertions expect an unmasked vpermd followed by a vpblendmd under %k1.
951 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
952 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1:
954 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,7,3]
955 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
956 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
957 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
958 ; CHECK-NEXT: vzeroupper
960 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
961 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
962 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle <3, 0, 7, 3>: lanes whose %mask element equals 0 keep
; the shuffled value, all other lanes are 0.
; The assertions expect the {%k1} {z} mask to be applied on vpermd directly.
966 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %mask) {
967 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1:
969 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,7,3]
970 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
971 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
972 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
973 ; CHECK-NEXT: vzeroupper
975 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
976 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
977 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 8xi32 -> 4xi32 shuffle <6, 7, 2, 3>: a result lane takes the shuffled element
; where the matching %mask lane equals 0 and the %vec2 lane otherwise.
; This mask moves whole 64-bit pairs, so the assertions expect vextracti128 + vpunpckhqdq
; instead of a variable permute, followed by a vpblendmd under %k1.
980 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
981 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask2:
983 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
984 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm3[1],xmm0[1]
985 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
986 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
987 ; CHECK-NEXT: vzeroupper
989 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
990 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
991 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle <6, 7, 2, 3>: lanes whose %mask element equals 0 keep
; the shuffled value, all other lanes are 0.
; The assertions expect vextracti128 + vpunpckhqdq for the shuffle, with the zeroing done
; by a separate vmovdqa32 under {%k1} {z}.
995 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %mask) {
996 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask2:
998 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
999 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1]
1000 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1001 ; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1002 ; CHECK-NEXT: vzeroupper
1004 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
1005 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1006 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked 8xi32 -> 4xi32 shuffle of a register argument, selecting lanes <5, 3, 2, 5>.
; The assertions expect a single vpermps on the ymm register, with only the low xmm part of
; the result returned.
1009 define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) {
1010 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3:
1012 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [5,3,2,5]
1013 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
1014 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1015 ; CHECK-NEXT: vzeroupper
1017 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
; Merge-masked 8xi32 -> 4xi32 shuffle <5, 3, 2, 5>: a result lane takes the shuffled element
; where the matching %mask lane equals 0 and the %vec2 lane otherwise.
; The assertions expect an unmasked vpermd followed by a vpblendmd under %k1.
1020 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1021 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3:
1023 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,3,2,5]
1024 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
1025 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1026 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1027 ; CHECK-NEXT: vzeroupper
1029 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
1030 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1031 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle <5, 3, 2, 5>: lanes whose %mask element equals 0 keep
; the shuffled value, all other lanes are 0.
; The assertions expect the {%k1} {z} mask to be applied on vpermd directly.
1035 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %mask) {
1036 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3:
1038 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,3,2,5]
1039 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1040 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
1041 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1042 ; CHECK-NEXT: vzeroupper
1044 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
1045 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1046 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked 8xi32 -> 4xi32 shuffle <7, 5, 0, 0> of a vector loaded from %vp.
; The assertions expect only the upper 128 bits (16(%rdi)) to be loaded into a register and
; a vshufps that takes element 0 from the memory operand for the last two lanes.
1049 define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp) {
1050 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0:
1052 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm0
1053 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1],mem[0,0]
1055 %vec = load <8 x i32>, <8 x i32>* %vp
1056 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
; Merge-masked 8xi32 -> 4xi32 shuffle <7, 5, 0, 0> of a vector loaded from %vp: a result
; lane takes the shuffled element where the matching %mask lane equals 0 and the %vec2 lane
; otherwise.
; The assertions expect vshufps for the shuffle, then a masked vmovdqa32 into %xmm0.
1059 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1060 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0:
1062 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
1063 ; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1],mem[0,0]
1064 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1065 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
1067 %vec = load <8 x i32>, <8 x i32>* %vp
1068 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
1069 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1070 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle <7, 5, 0, 0> of a vector loaded from %vp: lanes whose
; %mask element equals 0 keep the shuffled value, all other lanes are 0.
; The assertions expect vshufps for the shuffle, then a vmovdqa32 under {%k1} {z}.
1074 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %mask) {
1075 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0:
1077 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
1078 ; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1],mem[0,0]
1079 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1080 ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
1082 %vec = load <8 x i32>, <8 x i32>* %vp
1083 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
1084 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1085 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 8xi32 -> 4xi32 shuffle <5, 0, 0, 3> of a vector loaded from %vp: a result
; lane takes the shuffled element where the matching %mask lane equals 0 and the %vec2 lane
; otherwise.
; The assertions expect a 128-bit vpermi2d with (%rdi) in a register and 16(%rdi) folded,
; so the index constant equals the IR mask.
1089 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1090 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1:
1092 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
1093 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,0,0,3]
1094 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3
1095 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1096 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1098 %vec = load <8 x i32>, <8 x i32>* %vp
1099 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
1100 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1101 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle <5, 0, 0, 3> of a vector loaded from %vp: lanes whose
; %mask element equals 0 keep the shuffled value, all other lanes are 0.
; The assertions expect the {%k1} {z} mask on the 128-bit vpermi2d itself.
1105 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %mask) {
1106 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1:
1108 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
1109 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,0,0,3]
1110 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1111 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z}
1112 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1114 %vec = load <8 x i32>, <8 x i32>* %vp
1115 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
1116 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1117 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 8xi32 -> 4xi32 shuffle <4, 3, 3, 4> of a vector loaded from %vp: a result
; lane takes the shuffled element where the matching %mask lane equals 0 and the %vec2 lane
; otherwise.
; The expected vpermi2d has 16(%rdi) in the register source and folds (%rdi), so the
; assertion index constant is the IR mask with every index XOR 4.
1121 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1122 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2:
1124 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1125 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,7,0]
1126 ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3
1127 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1128 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1130 %vec = load <8 x i32>, <8 x i32>* %vp
1131 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
1132 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1133 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle <4, 3, 3, 4> of a vector loaded from %vp: lanes whose
; %mask element equals 0 keep the shuffled value, all other lanes are 0.
; The assertions expect the {%k1} {z} mask on vpermi2d; its index constant is the IR mask
; XOR 4 because 16(%rdi) is the register source and (%rdi) the folded operand.
1137 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %mask) {
1138 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2:
1140 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1141 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [0,7,7,0]
1142 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1143 ; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z}
1144 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1146 %vec = load <8 x i32>, <8 x i32>* %vp
1147 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
1148 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1149 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked 8xi32 -> 4xi32 shuffle <5, 3, 2, 7> of a vector loaded from %vp.
; The assertions expect elements 2 and 3 to be fetched by a vpbroadcastq of 8(%rdi); inside
; that broadcast register element 3 sits at index 1, which is why the vpermi2d index constant
; reads [5,1,2,7] rather than the IR mask.
1153 define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp) {
1154 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3:
1156 ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm1
1157 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [5,1,2,7]
1158 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm1, %xmm0
1160 %vec = load <8 x i32>, <8 x i32>* %vp
1161 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
; Merge-masked 8xi32 -> 4xi32 shuffle <5, 3, 2, 7> of a vector loaded from %vp: a result
; lane takes the shuffled element where the matching %mask lane equals 0 and the %vec2 lane
; otherwise.
; The assertions expect vpbroadcastq 8(%rdi) + vpermi2d (index constant [5,1,2,7], since
; element 3 is index 1 of the broadcast register), then a masked vmovdqa32 into %xmm0.
1164 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1165 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3:
1167 ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2
1168 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,2,7]
1169 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3
1170 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1171 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1173 %vec = load <8 x i32>, <8 x i32>* %vp
1174 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
1175 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1176 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle <5, 3, 2, 7> of a vector loaded from %vp: lanes whose
; %mask element equals 0 keep the shuffled value, all other lanes are 0.
; The assertions expect vpbroadcastq 8(%rdi) followed by vpermi2d under {%k1} {z}.
1180 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %mask) {
1181 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3:
1183 ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2
1184 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [5,1,2,7]
1185 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1186 ; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z}
1187 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1189 %vec = load <8 x i32>, <8 x i32>* %vp
1190 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
1191 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1192 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked 16xi32 -> 8xi32 shuffle of a register argument.
; The assertions expect the 8-entry index vector to be broadcast to both 256-bit halves of
; %zmm1 and used by a single vpermd; only the low ymm part of the result is returned.
1196 define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) {
1197 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0:
1199 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,13,11,14,7,10,1,6,1,13,11,14,7,10,1,6]
1200 ; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
1201 ; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0
1202 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1204 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
; Merge-masked 16xi32 -> 8xi32 shuffle: a result lane takes the shuffled element where the
; matching %mask lane equals 0 and the %vec2 lane otherwise.
; The assertions expect an unmasked 512-bit vpermd followed by a ymm vpblendmd under %k1.
1207 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1208 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0:
1210 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [1,13,11,14,7,10,1,6,1,13,11,14,7,10,1,6]
1211 ; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
1212 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1213 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1214 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1216 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
1217 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1218 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16xi32 -> 8xi32 shuffle: lanes whose %mask element equals 0 keep the shuffled
; value, all other lanes are 0.
; The assertions expect a plain 256-bit index load (no broadcast) and the {%k1} {z} mask
; applied on the 512-bit vpermd directly.
1222 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %mask) {
1223 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask0:
1225 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,13,11,14,7,10,1,6]
1226 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1227 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1228 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1230 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
1231 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1232 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked 16xi32 -> 8xi32 shuffle <3, 0, 15, 3, 2, 3, 6, 8>: a result lane takes the
; shuffled element where the matching %mask lane equals 0 and the %vec2 lane otherwise.
; The assertions expect an unmasked 512-bit vpermd followed by a ymm vpblendmd under %k1.
1235 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1236 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1:
1238 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,0,15,3,2,3,6,8,3,0,15,3,2,3,6,8]
1239 ; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
1240 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1241 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1242 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1244 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
1245 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1246 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16xi32 -> 8xi32 shuffle <3, 0, 15, 3, 2, 3, 6, 8>: lanes whose %mask element
; equals 0 keep the shuffled value, all other lanes are 0.
; The assertions expect the {%k1} {z} mask to be applied on the 512-bit vpermd directly.
1250 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %mask) {
1251 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask1:
1253 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8]
1254 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1255 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1256 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1258 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
1259 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1260 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked 16xi32 -> 8xi32 shuffle <2, 15, 15, 2, 6, 10, 14, 7>: a result lane takes the
; shuffled element where the matching %mask lane equals 0 and the %vec2 lane otherwise.
; The assertions expect an unmasked 512-bit vpermd followed by a ymm vpblendmd under %k1.
1263 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1264 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2:
1266 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,15,15,2,6,10,14,7,2,15,15,2,6,10,14,7]
1267 ; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
1268 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1269 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1270 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1272 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
1273 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1274 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16xi32 -> 8xi32 shuffle <2, 15, 15, 2, 6, 10, 14, 7>: lanes whose %mask element
; equals 0 keep the shuffled value, all other lanes are 0.
; The assertions expect the {%k1} {z} mask to be applied on the 512-bit vpermd directly.
1278 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %mask) {
1279 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask2:
1281 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7]
1282 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1283 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1284 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1286 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
1287 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1288 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked 16xi32 -> 8xi32 shuffle <14, 5, 7, 7, 10, 3, 9, 3> of a register argument.
; The assertions expect the index vector broadcast to both halves of %zmm1 and a single
; vpermd; only the low ymm part of the result is returned.
1291 define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
1292 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3:
1294 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [14,5,7,7,10,3,9,3,14,5,7,7,10,3,9,3]
1295 ; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
1296 ; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0
1297 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1299 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
; Merge-masked 16xi32 -> 8xi32 shuffle <14, 5, 7, 7, 10, 3, 9, 3>: a result lane takes the
; shuffled element where the matching %mask lane equals 0 and the %vec2 lane otherwise.
; The assertions expect an unmasked 512-bit vpermd followed by a ymm vpblendmd under %k1.
1302 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1303 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3:
1305 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [14,5,7,7,10,3,9,3,14,5,7,7,10,3,9,3]
1306 ; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
1307 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1308 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1309 ; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
1311 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
1312 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1313 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16xi32 -> 8xi32 shuffle <14, 5, 7, 7, 10, 3, 9, 3>: lanes whose %mask element
; equals 0 keep the shuffled value, all other lanes are 0.
; The assertions expect the {%k1} {z} mask to be applied on the 512-bit vpermd directly.
1317 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %mask) {
1318 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask3:
1320 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3]
1321 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1322 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1323 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1325 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
1326 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1327 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked 16xi32 -> 4xi32 shuffle <0, 2, 4, 12> of a register argument.
; The expected index constant is 8 entries wide; only its first four entries correspond to
; the IR mask, and only the low xmm part of the vpermps result is returned.
1330 define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
1331 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0:
1333 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12]
1334 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
1335 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1336 ; CHECK-NEXT: vzeroupper
1338 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
; Merge-masked 16xi32 -> 4xi32 shuffle <0, 2, 4, 12>: a result lane takes the shuffled
; element where the matching %mask lane equals 0 and the %vec2 lane otherwise.
; The assertions expect an unmasked 512-bit vpermd followed by an xmm vpblendmd under %k1.
1341 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1342 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0:
1344 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12]
1345 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1346 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1347 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1348 ; CHECK-NEXT: vzeroupper
1350 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
1351 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1352 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Zero-masked 16xi32 -> 4xi32 shuffle <0, 2, 4, 12>: lanes whose %mask element equals 0 keep
; the shuffled value, all other lanes are 0.
; The assertions expect the {%k1} {z} mask to be applied on the 512-bit vpermd directly.
1356 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) {
1357 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0:
1359 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,12,4,6,4,12]
1360 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1361 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1362 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1363 ; CHECK-NEXT: vzeroupper
1365 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
1366 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1367 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 16xi32 -> 4xi32 shuffle <13, 9, 11, 12>: a result lane takes the shuffled
; element where the matching %mask lane equals 0 and the %vec2 lane otherwise.
; All selected lanes come from the upper 256 bits, so the assertions expect vextracti64x4 $1
; first and a 256-bit vpermd whose indices are the IR mask minus 8.
1370 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1371 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1:
1373 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,3,4]
1374 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1375 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
1376 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1377 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1378 ; CHECK-NEXT: vzeroupper
1380 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
1381 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1382 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 16xi32 -> 4xi32 shuffle <13, 9, 11, 12>: lanes whose %mask element equals 0
; keep the shuffled value, all other lanes are 0.
; The assertions expect the upper 256 bits to be extracted first, then a 256-bit vpermd
; (indices = IR mask minus 8) under {%k1} {z}.
1386 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) {
1387 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1:
1389 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,1,3,4]
1390 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1391 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1392 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
1393 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1394 ; CHECK-NEXT: vzeroupper
1396 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
1397 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1398 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked case: %shuf is lanes <1, 1, 13, 0> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
1401 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1402 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2:
1404 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,13,0]
1405 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1406 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1407 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1408 ; CHECK-NEXT: vzeroupper
1410 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
1411 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1412 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked case: %shuf is lanes <1, 1, 13, 0> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and is zero in every other lane.
1416 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) {
1417 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask2:
1419 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,13,0]
1420 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1421 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1422 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1423 ; CHECK-NEXT: vzeroupper
1425 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
1426 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1427 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked case: %res is lanes <3, 0, 0, 13> of the <16 x i32> argument %vec.
1430 define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) {
1431 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3:
1433 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,0,0,13]
1434 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
1435 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1436 ; CHECK-NEXT: vzeroupper
1438 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
; Merge-masked case: %shuf is lanes <3, 0, 0, 13> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
1441 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1442 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3:
1444 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,0,13]
1445 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
1446 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1447 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1448 ; CHECK-NEXT: vzeroupper
1450 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
1451 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1452 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked case: %shuf is lanes <3, 0, 0, 13> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and is zero in every other lane.
1456 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) {
1457 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3:
1459 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,0,13]
1460 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1461 ; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
1462 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1463 ; CHECK-NEXT: vzeroupper
1465 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
1466 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1467 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked memory case: %res is lanes <15, 8, 14, 8, 9, 10, 12, 12> of the <16 x i32> loaded
; from %vp.
1470 define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp) {
1471 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask0:
1473 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4]
1474 ; CHECK-NEXT: vpermps 32(%rdi), %ymm0, %ymm0
1476 %vec = load <16 x i32>, <16 x i32>* %vp
1477 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
; Merge-masked memory case: %shuf is lanes <15, 8, 14, 8, 9, 10, 12, 12> of the vector loaded
; from %vp; %res keeps a %shuf lane where the %mask lane equals zero, else the %vec2 lane.
1480 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1481 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0:
1483 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4]
1484 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1485 ; CHECK-NEXT: vpermd 32(%rdi), %ymm2, %ymm0 {%k1}
1487 %vec = load <16 x i32>, <16 x i32>* %vp
1488 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
1489 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1490 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked memory case: %shuf is lanes <15, 8, 14, 8, 9, 10, 12, 12> of the vector loaded
; from %vp; %res keeps a %shuf lane where the %mask lane equals zero, else zero.
1494 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %mask) {
1495 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0:
1497 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4]
1498 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1499 ; CHECK-NEXT: vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z}
1501 %vec = load <16 x i32>, <16 x i32>* %vp
1502 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
1503 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1504 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked memory case: %shuf is lanes <15, 11, 14, 3, 8, 9, 13, 7> of the vector loaded
; from %vp; %res keeps a %shuf lane where the %mask lane equals zero, else the %vec2 lane.
1508 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1509 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1:
1511 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1512 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15]
1513 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
1514 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1515 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
1517 %vec = load <16 x i32>, <16 x i32>* %vp
1518 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
1519 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1520 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked memory case: %shuf is lanes <15, 11, 14, 3, 8, 9, 13, 7> of the vector loaded
; from %vp; %res keeps a %shuf lane where the %mask lane equals zero, else zero.
1524 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %mask) {
1525 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1:
1527 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1528 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15]
1529 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1530 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1531 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1533 %vec = load <16 x i32>, <16 x i32>* %vp
1534 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
1535 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1536 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked memory case: %shuf is lanes <12, 6, 9, 13, 12, 10, 0, 2> of the vector loaded
; from %vp; %res keeps a %shuf lane where the %mask lane equals zero, else the %vec2 lane.
1540 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1541 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2:
1543 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1544 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10]
1545 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
1546 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1547 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
1549 %vec = load <16 x i32>, <16 x i32>* %vp
1550 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
1551 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1552 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked memory case: %shuf is lanes <12, 6, 9, 13, 12, 10, 0, 2> of the vector loaded
; from %vp; %res keeps a %shuf lane where the %mask lane equals zero, else zero.
1556 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %mask) {
1557 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2:
1559 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1560 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10]
1561 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1562 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1563 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1565 %vec = load <16 x i32>, <16 x i32>* %vp
1566 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
1567 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1568 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked memory case: %res is lanes <8, 4, 1, 13, 15, 4, 6, 12> of the <16 x i32> loaded
; from %vp.
1572 define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp) {
1573 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3:
1575 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1576 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12]
1577 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm0
1579 %vec = load <16 x i32>, <16 x i32>* %vp
1580 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
; Merge-masked memory case: %shuf is lanes <8, 4, 1, 13, 15, 4, 6, 12> of the vector loaded
; from %vp; %res keeps a %shuf lane where the %mask lane equals zero, else the %vec2 lane.
1583 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1584 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3:
1586 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
1587 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [8,4,1,13,15,4,6,12]
1588 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3
1589 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1590 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
1592 %vec = load <16 x i32>, <16 x i32>* %vp
1593 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
1594 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1595 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked memory case: %shuf is lanes <8, 4, 1, 13, 15, 4, 6, 12> of the vector loaded
; from %vp; %res keeps a %shuf lane where the %mask lane equals zero, else zero.
1599 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %mask) {
1600 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3:
1602 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
1603 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12]
1604 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1605 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
1606 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1608 %vec = load <16 x i32>, <16 x i32>* %vp
1609 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
1610 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1611 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked memory case: %res is lanes <13, 0, 0, 6> of the <16 x i32> loaded from %vp.
1615 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp) {
1616 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0:
1618 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [13,0,0,6]
1619 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
1620 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm1, %ymm0
1621 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1622 ; CHECK-NEXT: vzeroupper
1624 %vec = load <16 x i32>, <16 x i32>* %vp
1625 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
; Merge-masked memory case: %shuf is lanes <13, 0, 0, 6> of the vector loaded from %vp;
; %res keeps a %shuf lane where the %mask lane equals zero, else the %vec2 lane.
1628 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1629 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0:
1631 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [13,0,0,6]
1632 ; CHECK-NEXT: vmovdqa (%rdi), %ymm3
1633 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
1634 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1635 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1636 ; CHECK-NEXT: vzeroupper
1638 %vec = load <16 x i32>, <16 x i32>* %vp
1639 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
1640 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1641 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked memory case: %shuf is lanes <13, 0, 0, 6> of the vector loaded from %vp;
; %res keeps a %shuf lane where the %mask lane equals zero, else zero.
1645 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %mask) {
1646 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0:
1648 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [13,0,0,6]
1649 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1650 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1651 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
1652 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1653 ; CHECK-NEXT: vzeroupper
1655 %vec = load <16 x i32>, <16 x i32>* %vp
1656 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
1657 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1658 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked memory case: %shuf is lanes <7, 13, 11, 10> of the vector loaded from %vp;
; %res keeps a %shuf lane where the %mask lane equals zero, else the %vec2 lane.
1662 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1663 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
1665 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1666 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [15,5,3,2,15,5,7,6]
1667 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
1668 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1669 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1670 ; CHECK-NEXT: vzeroupper
1672 %vec = load <16 x i32>, <16 x i32>* %vp
1673 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
1674 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1675 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked memory case: %shuf is lanes <7, 13, 11, 10> of the vector loaded from %vp;
; %res keeps a %shuf lane where the %mask lane equals zero, else zero.
1679 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %mask) {
1680 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
1682 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1683 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [15,5,3,2,15,5,7,6]
1684 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1685 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1686 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1687 ; CHECK-NEXT: vzeroupper
1689 %vec = load <16 x i32>, <16 x i32>* %vp
1690 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
1691 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1692 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked memory case: %shuf is lanes <2, 15, 6, 9> of the vector loaded from %vp;
; %res keeps a %shuf lane where the %mask lane equals zero, else the %vec2 lane.
1696 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1697 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2:
1699 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [2,15,6,9]
1700 ; CHECK-NEXT: vmovdqa (%rdi), %ymm3
1701 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
1702 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1703 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1704 ; CHECK-NEXT: vzeroupper
1706 %vec = load <16 x i32>, <16 x i32>* %vp
1707 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
1708 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1709 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked memory case: %shuf is lanes <2, 15, 6, 9> of the vector loaded from %vp;
; %res keeps a %shuf lane where the %mask lane equals zero, else zero.
1713 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %mask) {
1714 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2:
1716 ; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [2,15,6,9]
1717 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1718 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1719 ; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
1720 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1721 ; CHECK-NEXT: vzeroupper
1723 %vec = load <16 x i32>, <16 x i32>* %vp
1724 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
1725 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1726 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked memory case: %res is lanes <6, 0, 7, 2> of the <16 x i32> loaded from %vp.
; The recorded output builds the result with extract/insert steps rather than one permute.
1730 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp) {
1731 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3:
1733 ; CHECK-NEXT: vmovdqa (%rdi), %xmm0
1734 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
1735 ; CHECK-NEXT: vmovd %xmm0, %eax
1736 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
1737 ; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
1738 ; CHECK-NEXT: vpextrd $3, %xmm1, %eax
1739 ; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm1
1740 ; CHECK-NEXT: vpextrd $2, %xmm0, %eax
1741 ; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
1743 %vec = load <16 x i32>, <16 x i32>* %vp
1744 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
; Merge-masked memory case: %shuf is lanes <6, 0, 7, 2> of the vector loaded from %vp;
; %res keeps a %shuf lane where the %mask lane equals zero, else the %vec2 lane.
1747 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1748 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3:
1750 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
1751 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
1752 ; CHECK-NEXT: vmovd %xmm2, %eax
1753 ; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
1754 ; CHECK-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
1755 ; CHECK-NEXT: vpextrd $3, %xmm3, %eax
1756 ; CHECK-NEXT: vpinsrd $2, %eax, %xmm4, %xmm3
1757 ; CHECK-NEXT: vpextrd $2, %xmm2, %eax
1758 ; CHECK-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
1759 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1760 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
1762 %vec = load <16 x i32>, <16 x i32>* %vp
1763 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
1764 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1765 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked memory case: %shuf is lanes <6, 0, 7, 2> of the vector loaded from %vp;
; %res keeps a %shuf lane where the %mask lane equals zero, else zero.
1769 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %mask) {
1770 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3:
1772 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
1773 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1774 ; CHECK-NEXT: vmovd %xmm1, %eax
1775 ; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
1776 ; CHECK-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
1777 ; CHECK-NEXT: vpextrd $3, %xmm2, %eax
1778 ; CHECK-NEXT: vpinsrd $2, %eax, %xmm3, %xmm2
1779 ; CHECK-NEXT: vpextrd $2, %xmm1, %eax
1780 ; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
1781 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1782 ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
1784 %vec = load <16 x i32>, <16 x i32>* %vp
1785 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
1786 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1787 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked case: %res is lanes <12, 9, 4, 10> of the <16 x i32> argument %vec.
1791 define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
1792 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9:
1794 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [12,9,4,10]
1795 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
1796 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1797 ; CHECK-NEXT: vzeroupper
1799 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 12, i32 9, i32 4, i32 10>
; Unmasked case: %res is lanes <2, 0> of the <4 x i64> argument %vec.
1803 define <2 x i64> @test_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec) {
1804 ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mask0:
1806 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
1807 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1808 ; CHECK-NEXT: vzeroupper
1810 %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
; Merge-masked case: %shuf is lanes <2, 0> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
1813 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
1814 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0:
1816 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,2,3]
1817 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
1818 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
1819 ; CHECK-NEXT: vzeroupper
1821 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
1822 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1823 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked case: %shuf is lanes <2, 0> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and is zero in every other lane.
1827 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %mask) {
1828 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask0:
1830 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1831 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3]
1832 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1833 ; CHECK-NEXT: vzeroupper
1835 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
1836 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1837 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Merge-masked case: %shuf is lanes <2, 1> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
1840 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
1841 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask1:
1843 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
1844 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
1845 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
1846 ; CHECK-NEXT: vzeroupper
1848 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1849 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1850 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked case: %shuf is lanes <2, 1> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and is zero in every other lane.
1854 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %mask) {
1855 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask1:
1857 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1858 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3]
1859 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1860 ; CHECK-NEXT: vzeroupper
1862 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1863 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1864 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked memory case: %res is lanes <1, 3> of the <4 x i64> loaded from %vp.
1867 define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp) {
1868 ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0:
1870 ; CHECK-NEXT: vmovaps (%rdi), %xmm0
1871 ; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
1873 %vec = load <4 x i64>, <4 x i64>* %vp
1874 %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
; Merge-masked memory case: %shuf is lanes <1, 3> of the vector loaded from %vp;
; %res keeps a %shuf lane where the %mask lane equals zero, else the %vec2 lane.
1877 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
1878 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask0:
1880 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
1881 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1882 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} = xmm2[1],mem[1]
1884 %vec = load <4 x i64>, <4 x i64>* %vp
1885 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
1886 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1887 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked memory case: %shuf is lanes <1, 3> of the vector loaded from %vp;
; %res keeps a %shuf lane where the %mask lane equals zero, else zero.
1891 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %mask) {
1892 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask0:
1894 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
1895 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
1896 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm1[1],mem[1]
1898 %vec = load <4 x i64>, <4 x i64>* %vp
1899 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
1900 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1901 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Merge-masked memory case: %shuf is lanes <2, 1> of the vector loaded from %vp;
; %res keeps a %shuf lane where the %mask lane equals zero, else the %vec2 lane.
1905 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
1906 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1:
1908 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1909 ; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
1910 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1911 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
1913 %vec = load <4 x i64>, <4 x i64>* %vp
1914 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1915 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1916 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked memory case: %shuf is lanes <2, 1> of the vector loaded from %vp;
; %res keeps a %shuf lane where the %mask lane equals zero, else zero.
1920 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %mask) {
1921 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1:
1923 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
1924 ; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
1925 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
1926 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
1928 %vec = load <4 x i64>, <4 x i64>* %vp
1929 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1930 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1931 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked case: %res is lanes <6, 7, 6, 5> of the <8 x i64> argument %vec.
1935 define <4 x i64> @test_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec) {
1936 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask0:
1938 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
1939 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,1]
1941 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
; Merge-masked case: %shuf is lanes <6, 7, 6, 5> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
1944 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1945 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask0:
1947 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1948 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
1949 ; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1]
1950 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1952 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
1953 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1954 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked case: %shuf is lanes <6, 7, 6, 5> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and is zero in every other lane.
1958 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %mask) {
1959 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0:
1961 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1962 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
1963 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1]
1965 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
1966 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1967 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked case: %shuf is lanes <6, 4, 6, 1> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
1970 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1971 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
1973 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,4,6,1,6,4,6,1]
1974 ; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
1975 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
1976 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
1977 ; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
1979 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
1980 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1981 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked case: %shuf is lanes <6, 4, 6, 1> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and is zero in every other lane.
1985 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) {
1986 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
1988 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1]
1989 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
1990 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
1991 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
1993 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
1994 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1995 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked case: %shuf is lanes <6, 3, 6, 3> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
1998 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1999 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
2001 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,3,6,3,6,3,6,3]
2002 ; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2003 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
2004 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2005 ; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2007 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
2008 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2009 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked case: %shuf is lanes <6, 3, 6, 3> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and is zero in every other lane.
2013 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) {
2014 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
2016 ; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,3,6,3]
2017 ; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
2018 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2019 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2020 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2022 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
2023 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2024 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked case: %res is lanes <6, 0, 0, 7> of the <8 x i64> argument %vec.
2027 define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
2028 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask3:
2030 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,0,0,7,6,0,0,7]
2031 ; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
2032 ; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0
2033 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2035 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
; Merge-masked case: %shuf is lanes <6, 0, 0, 7> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
2038 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2039 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
2041 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,0,0,7,6,0,0,7]
2042 ; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
2043 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
2044 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2045 ; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2047 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
2048 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2049 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked case: %shuf is lanes <6, 0, 0, 7> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and is zero in every other lane.
2053 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) {
2054 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
2056 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7]
2057 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2058 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2059 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2061 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
2062 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2063 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked case: %shuf is lanes <3, 7, 7, 5> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
2066 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2067 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
2069 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,7,7,5,3,7,7,5]
2070 ; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
2071 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
2072 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2073 ; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2075 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
2076 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2077 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked case: %shuf is lanes <3, 7, 7, 5> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and is zero in every other lane.
2081 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {
2082 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
2084 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,7,5]
2085 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2086 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2087 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2089 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
2090 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2091 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked case: %shuf is lanes <4, 1, 0, 6> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
2094 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2095 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5:
2097 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,1,0,6,4,1,0,6]
2098 ; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
2099 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
2100 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2101 ; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2103 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
2104 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2105 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked case: %shuf is lanes <4, 1, 0, 6> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and is zero in every other lane.
2109 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) {
2110 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5:
2112 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,1,0,6]
2113 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2114 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2115 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2117 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
2118 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2119 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked case: %res is lanes <7, 6, 5, 3> of the <8 x i64> argument %vec.
2122 define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
2123 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask6:
2125 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [7,6,5,3,7,6,5,3]
2126 ; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
2127 ; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0
2128 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2130 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
; Merge-masked case: %shuf is lanes <7, 6, 5, 3> of %vec; %res keeps a %shuf lane where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
2133 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2134 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
2136 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [7,6,5,3,7,6,5,3]
2137 ; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
2138 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
2139 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2140 ; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
2142 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
2143 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2144 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked form: extract-shuffle of elements <7,6,5,3> from the <8 x i64> %vec.
; Lanes where %mask compares equal to zero take the shuffled value; the rest are zero.
2148 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) {
2149 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
2151 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,3]
2152 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2153 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
2154 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2156 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
2157 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2158 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked form: extract-shuffle of elements <2,0,3,4> from the <8 x i64> %vec.
; Lanes where %mask compares equal to zero take the shuffled value; the rest take %vec2.
2161 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2162 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:
2164 ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3
2165 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,4]
2166 ; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
2167 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2168 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
2170 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
2171 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2172 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked form: extract-shuffle of elements <2,0,3,4> from the <8 x i64> %vec.
; Lanes where %mask compares equal to zero take the shuffled value; the rest are zero.
2176 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
2177 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
2179 ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3
2180 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,4]
2181 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2182 ; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
2183 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
2185 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
2186 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2187 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked form: extract-shuffle of elements <3,0> from the <8 x i64> %vec
; into a <2 x i64>.
2190 define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
2191 ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mask0:
2193 ; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
2194 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2195 ; CHECK-NEXT: vzeroupper
2197 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
; Merge-masked form: extract-shuffle of elements <3,0> from the <8 x i64> %vec.
; Lanes where %mask compares equal to zero take the shuffled value; the rest take %vec2.
2200 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
2201 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0:
2203 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
2204 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
2205 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
2206 ; CHECK-NEXT: vzeroupper
2208 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
2209 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2210 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked form: extract-shuffle of elements <3,0> from the <8 x i64> %vec.
; Lanes where %mask compares equal to zero take the shuffled value; the rest are zero.
2214 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) {
2215 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0:
2217 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2218 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,2,3,7,4,6,7]
2219 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
2220 ; CHECK-NEXT: vzeroupper
2222 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
2223 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2224 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Merge-masked form: extract-shuffle of elements <6,5> from the <8 x i64> %vec.
; Lanes where %mask compares equal to zero take the shuffled value; the rest take %vec2.
2227 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
2228 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
2230 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2231 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
2232 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
2233 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
2234 ; CHECK-NEXT: vzeroupper
2236 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
2237 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2238 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked form: extract-shuffle of elements <6,5> from the <8 x i64> %vec.
; Lanes where %mask compares equal to zero take the shuffled value; the rest are zero.
2242 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %mask) {
2243 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
2245 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2246 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2247 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3]
2248 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2249 ; CHECK-NEXT: vzeroupper
2251 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
2252 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2253 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked memory form: loads an <8 x i64> from %vp and extract-shuffles
; elements <0,2,0,2> into a <4 x i64>.
2256 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp) {
2257 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask0:
2259 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,2,0,2]
2261 %vec = load <8 x i64>, <8 x i64>* %vp
2262 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
; Merge-masked memory form: extract-shuffle of elements <0,2,0,2> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest take %vec2.
2265 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2266 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask0:
2268 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2269 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,2,0,2]
2271 %vec = load <8 x i64>, <8 x i64>* %vp
2272 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2273 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2274 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked memory form: extract-shuffle of elements <0,2,0,2> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest are zero.
2278 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %mask) {
2279 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask0:
2281 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2282 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,2,0,2]
2284 %vec = load <8 x i64>, <8 x i64>* %vp
2285 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2286 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2287 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked memory form: extract-shuffle of elements <0,7,6,0> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest take %vec2.
2291 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2292 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
2294 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
2295 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,4]
2296 ; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
2297 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2298 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2300 %vec = load <8 x i64>, <8 x i64>* %vp
2301 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
2302 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2303 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked memory form: extract-shuffle of elements <0,7,6,0> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest are zero.
2307 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %mask) {
2308 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
2310 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
2311 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,4]
2312 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2313 ; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2314 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2316 %vec = load <8 x i64>, <8 x i64>* %vp
2317 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
2318 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2319 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked memory form: extract-shuffle of elements <7,1,1,5> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest take %vec2.
2323 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2324 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
2326 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
2327 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,5,5,1]
2328 ; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
2329 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2330 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2332 %vec = load <8 x i64>, <8 x i64>* %vp
2333 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
2334 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2335 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked memory form: extract-shuffle of elements <7,1,1,5> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest are zero.
2339 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %mask) {
2340 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
2342 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
2343 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,5,5,1]
2344 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2345 ; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2346 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2348 %vec = load <8 x i64>, <8 x i64>* %vp
2349 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
2350 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2351 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked memory form: loads an <8 x i64> from %vp and extract-shuffles
; elements <7,0,0,2> into a <4 x i64>.
2355 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp) {
2356 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
2358 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
2359 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,0,2]
2360 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0
2362 %vec = load <8 x i64>, <8 x i64>* %vp
2363 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
; Merge-masked memory form: extract-shuffle of elements <7,0,0,2> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest take %vec2.
2366 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2367 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
2369 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2370 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,0,0,2]
2371 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2372 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2373 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2375 %vec = load <8 x i64>, <8 x i64>* %vp
2376 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
2377 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2378 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked memory form: extract-shuffle of elements <7,0,0,2> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest are zero.
2382 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %mask) {
2383 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
2385 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2386 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,0,2]
2387 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2388 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2389 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2391 %vec = load <8 x i64>, <8 x i64>* %vp
2392 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
2393 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2394 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked memory form: extract-shuffle of elements <0,4,6,1> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest take %vec2.
2398 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2399 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4:
2401 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2402 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,6,1]
2403 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2404 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2405 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2407 %vec = load <8 x i64>, <8 x i64>* %vp
2408 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
2409 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2410 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked memory form: extract-shuffle of elements <0,4,6,1> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest are zero.
2414 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %mask) {
2415 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4:
2417 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2418 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,6,1]
2419 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2420 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2421 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2423 %vec = load <8 x i64>, <8 x i64>* %vp
2424 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
2425 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2426 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked memory form: extract-shuffle of elements <0,2,7,1> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest take %vec2.
2430 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2431 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
2433 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2434 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,7,1]
2435 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2436 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2437 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2439 %vec = load <8 x i64>, <8 x i64>* %vp
2440 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
2441 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2442 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked memory form: extract-shuffle of elements <0,2,7,1> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest are zero.
2446 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %mask) {
2447 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
2449 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2450 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,7,1]
2451 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2452 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2453 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2455 %vec = load <8 x i64>, <8 x i64>* %vp
2456 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
2457 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2458 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked memory form: loads an <8 x i64> from %vp and extract-shuffles
; elements <7,2,3,2> into a <4 x i64>.
2462 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp) {
2463 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6:
2465 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
2466 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,2,3,2]
2467 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0
2469 %vec = load <8 x i64>, <8 x i64>* %vp
2470 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
; Merge-masked memory form: extract-shuffle of elements <7,2,3,2> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest take %vec2.
2473 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2474 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6:
2476 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2477 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,2,3,2]
2478 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2479 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2480 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2482 %vec = load <8 x i64>, <8 x i64>* %vp
2483 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
2484 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2485 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked memory form: extract-shuffle of elements <7,2,3,2> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest are zero.
2489 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %mask) {
2490 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:
2492 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2493 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,2,3,2]
2494 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2495 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2496 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2498 %vec = load <8 x i64>, <8 x i64>* %vp
2499 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
2500 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2501 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked memory form: extract-shuffle of elements <7,7,5,1> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest take %vec2.
2505 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2506 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
2508 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
2509 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,1,5]
2510 ; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
2511 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2512 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2514 %vec = load <8 x i64>, <8 x i64>* %vp
2515 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
2516 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2517 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked memory form: extract-shuffle of elements <7,7,5,1> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest are zero.
2521 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %mask) {
2522 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
2524 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
2525 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,1,5]
2526 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2527 ; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2528 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2530 %vec = load <8 x i64>, <8 x i64>* %vp
2531 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
2532 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2533 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked memory form: loads an <8 x i64> from %vp and extract-shuffles
; elements <4,1> into a <2 x i64>.
2537 define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp) {
2538 ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
2540 ; CHECK-NEXT: vmovaps 32(%rdi), %xmm0
2541 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
2543 %vec = load <8 x i64>, <8 x i64>* %vp
2544 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
; Merge-masked memory form: extract-shuffle of elements <4,1> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest take %vec2.
2547 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
2548 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
2550 ; CHECK-NEXT: vmovdqa 32(%rdi), %xmm2
2551 ; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
2552 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2553 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
2555 %vec = load <8 x i64>, <8 x i64>* %vp
2556 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
2557 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2558 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked memory form: extract-shuffle of elements <4,1> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest are zero.
2562 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %mask) {
2563 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
2565 ; CHECK-NEXT: vmovdqa 32(%rdi), %xmm1
2566 ; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
2567 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
2568 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
2570 %vec = load <8 x i64>, <8 x i64>* %vp
2571 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
2572 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2573 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Merge-masked memory form: extract-shuffle of elements <6,2> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest take %vec2.
2577 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
2578 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
2580 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
2581 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
2582 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
2583 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2584 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
2585 ; CHECK-NEXT: vzeroupper
2587 %vec = load <8 x i64>, <8 x i64>* %vp
2588 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
2589 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2590 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked memory form: extract-shuffle of elements <6,2> from the <8 x i64>
; loaded from %vp. Lanes where %mask compares equal to zero take the shuffled value;
; the rest are zero.
2594 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %mask) {
2595 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
2597 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
2598 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
2599 ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
2600 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
2601 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
2602 ; CHECK-NEXT: vzeroupper
2604 %vec = load <8 x i64>, <8 x i64>* %vp
2605 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
2606 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2607 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked form: returns the extract-shuffle of elements <0,3,4,5> from the
; <8 x float> %vec as a <4 x float>.
2611 define <4 x float> @test_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec) {
2612 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask0:
2614 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
2615 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,1]
2616 ; CHECK-NEXT: vzeroupper
2618 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2619 ret <4 x float> %res
; Merge-masked form: extract-shuffle of elements <0,3,4,5> from the <8 x float> %vec.
; Lanes where %mask is ordered-equal (fcmp oeq) to zero take the shuffled value;
; the rest take %vec2.
2621 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2622 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0:
2624 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
2625 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
2626 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
2627 ; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,3],xmm3[0,1]
2628 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
2629 ; CHECK-NEXT: vzeroupper
2631 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2632 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2633 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2634 ret <4 x float> %res
; Zero-masked form: extract-shuffle of elements <0,3,4,5> from the <8 x float> %vec.
; Lanes where %mask is ordered-equal (fcmp oeq) to zero take the shuffled value;
; the rest are zero.
2637 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %mask) {
2638 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0:
2640 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
2641 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2642 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
2643 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3],xmm2[0,1]
2644 ; CHECK-NEXT: vzeroupper
2646 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2647 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2648 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2649 ret <4 x float> %res
; Merge-masked form: extract-shuffle of elements <1,3,5,0> from the <8 x float> %vec.
; Lanes where %mask is ordered-equal (fcmp oeq) to zero take the shuffled value;
; the rest take %vec2.
2651 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2652 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1:
2654 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,0]
2655 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0
2656 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2657 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
2658 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
2659 ; CHECK-NEXT: vzeroupper
2661 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
2662 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2663 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2664 ret <4 x float> %res
; Zero-masked form: extract-shuffle of elements <1,3,5,0> from the <8 x float> %vec.
; Lanes where %mask is ordered-equal (fcmp oeq) to zero take the shuffled value;
; the rest are zero.
2667 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) {
2668 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1:
2670 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,0]
2671 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2672 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
2673 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
2674 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2675 ; CHECK-NEXT: vzeroupper
2677 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
2678 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2679 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2680 ret <4 x float> %res
; Merge-masked form: extract-shuffle of elements <3,2,7,0> from the <8 x float> %vec.
; Lanes where %mask is ordered-equal (fcmp oeq) to zero take the shuffled value;
; the rest take %vec2.
2682 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2683 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2:
2685 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,2,7,0]
2686 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0
2687 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2688 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
2689 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
2690 ; CHECK-NEXT: vzeroupper
2692 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
2693 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2694 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2695 ret <4 x float> %res
; Zero-masked form: extract-shuffle of elements <3,2,7,0> from the <8 x float> %vec.
; Lanes where %mask is ordered-equal (fcmp oeq) to zero take the shuffled value;
; the rest are zero.
2698 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) {
2699 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2:
2701 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,2,7,0]
2702 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2703 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
2704 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
2705 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2706 ; CHECK-NEXT: vzeroupper
2708 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
2709 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2710 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2711 ret <4 x float> %res
; Unmasked form: returns the extract-shuffle of elements <3,3,5,2> from the
; <8 x float> %vec as a <4 x float>.
2713 define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) {
2714 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3:
2716 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,5,2]
2717 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
2718 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2719 ; CHECK-NEXT: vzeroupper
2721 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2722 ret <4 x float> %res
; Merge-masked form: extract-shuffle of elements <3,3,5,2> from the <8 x float> %vec.
; Lanes where %mask is ordered-equal (fcmp oeq) to zero take the shuffled value;
; the rest take %vec2.
2724 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2725 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3:
2727 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,3,5,2]
2728 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0
2729 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2730 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
2731 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
2732 ; CHECK-NEXT: vzeroupper
2734 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2735 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2736 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2737 ret <4 x float> %res
; Zero-masked form: extract-shuffle of elements <3,3,5,2> from the <8 x float> %vec.
; Lanes where %mask is ordered-equal (fcmp oeq) to zero take the shuffled value;
; the rest are zero.
2740 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) {
2741 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3:
2743 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,5,2]
2744 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2745 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
2746 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
2747 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2748 ; CHECK-NEXT: vzeroupper
2750 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2751 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2752 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2753 ret <4 x float> %res
; Unmasked memory form: loads an <8 x float> from %vp and returns the
; extract-shuffle of elements <6,2,4,5> as a <4 x float>.
2755 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp) {
2756 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0:
2758 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
2759 ; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [2,6,0,1]
2760 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm1, %xmm0
2762 %vec = load <8 x float>, <8 x float>* %vp
2763 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2764 ret <4 x float> %res
; Merge-masked memory form: extract-shuffle of elements <6,2,4,5> from the
; <8 x float> loaded from %vp. Lanes where %mask is ordered-equal (fcmp oeq) to
; zero take the shuffled value; the rest take %vec2.
2766 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
2767 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0:
2769 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2770 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,6,0,1]
2771 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3
2772 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
2773 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
2774 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
2776 %vec = load <8 x float>, <8 x float>* %vp
2777 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2778 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2779 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2780 ret <4 x float> %res
; Zero-masked memory form: extract-shuffle of elements <6,2,4,5> from the
; <8 x float> loaded from %vp. Lanes where %mask is ordered-equal (fcmp oeq) to
; zero take the shuffled value; the rest are zero.
2783 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %mask) {
2784 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0:
2786 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2787 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,6,0,1]
2788 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2789 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
2790 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
2791 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
2793 %vec = load <8 x float>, <8 x float>* %vp
2794 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2795 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2796 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2797 ret <4 x float> %res
; Merge-masked memory form: extract-shuffle of elements <6,3,3,6> from the
; <8 x float> loaded from %vp. Lanes where %mask is ordered-equal (fcmp oeq) to
; zero take the shuffled value; the rest take %vec2.
2800 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
2801 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
2803 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2804 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [2,7,7,2]
2805 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3
2806 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
2807 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
2808 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
2810 %vec = load <8 x float>, <8 x float>* %vp
2811 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
2812 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2813 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2814 ret <4 x float> %res
; Zero-masked memory form: extract-shuffle of elements <6,3,3,6> from the
; <8 x float> loaded from %vp. Lanes where %mask is ordered-equal (fcmp oeq) to
; zero take the shuffled value; the rest are zero.
2817 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %mask) {
2818 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
2820 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2821 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [2,7,7,2]
2822 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2823 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
2824 ; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
2825 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
2827 %vec = load <8 x float>, <8 x float>* %vp
2828 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
2829 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2830 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2831 ret <4 x float> %res
; Merge-masked memory form: extract-shuffle of elements <3,1,3,7> from the
; <8 x float> loaded from %vp. Lanes where %mask is ordered-equal (fcmp oeq) to
; zero take the shuffled value; the rest take %vec2.
2834 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
2835 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2:
2837 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
2838 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,1,3,7]
2839 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
2840 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
2841 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
2842 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
2844 %vec = load <8 x float>, <8 x float>* %vp
2845 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
2846 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2847 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2848 ret <4 x float> %res
; Zero-masked memory form: extract-shuffle of elements <3,1,3,7> from the
; <8 x float> loaded from %vp. Lanes where %mask is ordered-equal (fcmp oeq) to
; zero take the shuffled value; the rest are zero.
2851 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %mask) {
2852 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2:
2854 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
2855 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,1,3,7]
2856 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2857 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
2858 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
2859 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
2861 %vec = load <8 x float>, <8 x float>* %vp
2862 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
2863 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2864 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2865 ret <4 x float> %res
; Unmasked memory form: loads an <8 x float> from %vp and returns the
; extract-shuffle of elements <1,3,5,3> as a <4 x float>.
2868 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp) {
2869 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3:
2871 ; CHECK-NEXT: vmovaps (%rdi), %xmm1
2872 ; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,3,5,3]
2873 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0
2875 %vec = load <8 x float>, <8 x float>* %vp
2876 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
2877 ret <4 x float> %res
; Merge-masked variant, vector loaded from memory: shuffles elements <1,3,5,3>
; of the <8 x float> loaded from %vp into a <4 x float>, then keeps each shuffled
; lane where %mask is ordered-equal to 0.0 (fcmp oeq) and takes %vec2 otherwise.
2879 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
2880 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3:
2882 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
2883 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,3]
2884 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
2885 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
2886 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
2887 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
2889 %vec = load <8 x float>, <8 x float>* %vp
2890 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
2891 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2892 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2893 ret <4 x float> %res
; Zero-masked variant, vector loaded from memory: shuffles elements <1,3,5,3>
; of the <8 x float> loaded from %vp into a <4 x float>, then keeps each shuffled
; lane where %mask is ordered-equal to 0.0 (fcmp oeq) and yields 0.0 otherwise.
2896 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %mask) {
2897 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3:
2899 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
2900 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [1,3,5,3]
2901 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2902 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
2903 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
2904 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
2906 %vec = load <8 x float>, <8 x float>* %vp
2907 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
2908 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2909 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2910 ret <4 x float> %res
; Unmasked baseline: returns elements <0,4,12,10,8,2,11,7> of the <16 x float>
; argument as an <8 x float>.
; NOTE(review): the expected assembly performs this float shuffle with
; vbroadcasti64x4 + vpermd rather than a vpermps form.
2913 define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) {
2914 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask0:
2916 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,4,12,10,8,2,11,7,0,4,12,10,8,2,11,7]
2917 ; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
2918 ; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0
2919 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2921 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
2922 ret <8 x float> %res
; Merge-masked variant: shuffles elements <0,4,12,10,8,2,11,7> of %vec into an
; <8 x float>, then keeps each shuffled lane where %mask is ordered-equal to 0.0
; (fcmp oeq) and takes the lane from %vec2 otherwise.
; NOTE(review): the expected assembly uses vbroadcasti64x4 + vpermd + vpxor for
; the shuffle/zero setup and applies the mask with a separate vblendmps.
2924 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
2925 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0:
2927 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,4,12,10,8,2,11,7,0,4,12,10,8,2,11,7]
2928 ; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
2929 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
2930 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
2931 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
2932 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
2934 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
2935 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
2936 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
2937 ret <8 x float> %res
; Zero-masked variant: shuffles elements <0,4,12,10,8,2,11,7> of %vec into an
; <8 x float>, then keeps each shuffled lane where %mask is ordered-equal to 0.0
; (fcmp oeq) and yields 0.0 otherwise. The expected assembly folds the mask into
; a single zero-masked vpermps.
2940 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %mask) {
2941 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask0:
2943 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7]
2944 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2945 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
2946 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
2947 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2949 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
2950 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
2951 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
2952 ret <8 x float> %res
; Merge-masked variant: shuffles elements <10,12,3,12,4,15,1,14> of %vec into an
; <8 x float>, then keeps each shuffled lane where %mask is ordered-equal to 0.0
; (fcmp oeq) and takes the lane from %vec2 otherwise.
; NOTE(review): the expected assembly uses vbroadcasti64x4 + vpermd + vpxor for
; the shuffle/zero setup and applies the mask with a separate vblendmps.
2954 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
2955 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1:
2957 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [10,12,3,12,4,15,1,14,10,12,3,12,4,15,1,14]
2958 ; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
2959 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
2960 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
2961 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
2962 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
2964 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
2965 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
2966 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
2967 ret <8 x float> %res
; Zero-masked variant: shuffles elements <10,12,3,12,4,15,1,14> of %vec into an
; <8 x float>, then keeps each shuffled lane where %mask is ordered-equal to 0.0
; (fcmp oeq) and yields 0.0 otherwise. The expected assembly folds the mask into
; a single zero-masked vpermps.
2970 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %mask) {
2971 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask1:
2973 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [10,12,3,12,4,15,1,14]
2974 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2975 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
2976 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
2977 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
2979 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
2980 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
2981 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
2982 ret <8 x float> %res
; Merge-masked variant: shuffles elements <0,4,8,9,6,1,4,4> of %vec into an
; <8 x float>, then keeps each shuffled lane where %mask is ordered-equal to 0.0
; (fcmp oeq) and takes the lane from %vec2 otherwise.
2984 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
2985 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2:
2987 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4]
2988 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
2989 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2990 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
2991 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
2993 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
2994 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
2995 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
2996 ret <8 x float> %res
; Zero-masked variant: shuffles elements <0,4,8,9,6,1,4,4> of %vec into an
; <8 x float>, then keeps each shuffled lane where %mask is ordered-equal to 0.0
; (fcmp oeq) and yields 0.0 otherwise.
2999 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) {
3000 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2:
3002 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4]
3003 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3004 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
3005 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3006 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3008 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
3009 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3010 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3011 ret <8 x float> %res
; Unmasked baseline: returns elements <12,14,9,0,12,4,5,8> of the <16 x float>
; argument as an <8 x float>.
; NOTE(review): the expected assembly performs this float shuffle with
; vbroadcasti64x4 + vpermd rather than a vpermps form.
3013 define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) {
3014 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask3:
3016 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [12,14,9,0,12,4,5,8,12,14,9,0,12,4,5,8]
3017 ; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
3018 ; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0
3019 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3021 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3022 ret <8 x float> %res
; Merge-masked variant: shuffles elements <12,14,9,0,12,4,5,8> of %vec into an
; <8 x float>, then keeps each shuffled lane where %mask is ordered-equal to 0.0
; (fcmp oeq) and takes the lane from %vec2 otherwise.
; NOTE(review): the expected assembly uses vbroadcasti64x4 + vpermd + vpxor for
; the shuffle/zero setup and applies the mask with a separate vblendmps.
3024 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3025 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3:
3027 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,14,9,0,12,4,5,8,12,14,9,0,12,4,5,8]
3028 ; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
3029 ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
3030 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
3031 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
3032 ; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
3034 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3035 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3036 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3037 ret <8 x float> %res
; Zero-masked variant: shuffles elements <12,14,9,0,12,4,5,8> of %vec into an
; <8 x float>, then keeps each shuffled lane where %mask is ordered-equal to 0.0
; (fcmp oeq) and yields 0.0 otherwise. The expected assembly folds the mask into
; a single zero-masked vpermps.
3040 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %mask) {
3041 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask3:
3043 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [12,14,9,0,12,4,5,8]
3044 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3045 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
3046 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3047 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3049 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3050 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3051 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3052 ret <8 x float> %res
; Unmasked baseline: returns elements <4,8,9,10> of the <16 x float> argument as
; a <4 x float>.
3054 define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) {
3055 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0:
3057 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,8,9,10]
3058 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
3059 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3060 ; CHECK-NEXT: vzeroupper
3062 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3063 ret <4 x float> %res
; Merge-masked variant: shuffles elements <4,8,9,10> of %vec into a <4 x float>,
; then keeps each shuffled lane where %mask is ordered-equal to 0.0 (fcmp oeq)
; and takes the lane from %vec2 otherwise.
3065 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3066 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0:
3068 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [4,8,9,10]
3069 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3070 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3071 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3072 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3073 ; CHECK-NEXT: vzeroupper
3075 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3076 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3077 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3078 ret <4 x float> %res
; Zero-masked variant: shuffles elements <4,8,9,10> of %vec into a <4 x float>,
; then keeps each shuffled lane where %mask is ordered-equal to 0.0 (fcmp oeq)
; and yields 0.0 otherwise.
3081 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) {
3082 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0:
3084 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [4,8,9,10]
3085 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3086 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
3087 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3088 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3089 ; CHECK-NEXT: vzeroupper
3091 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3092 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3093 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3094 ret <4 x float> %res
; Merge-masked variant: shuffles elements <8,6,10,6> of %vec into a <4 x float>,
; then keeps each shuffled lane where %mask is ordered-equal to 0.0 (fcmp oeq)
; and takes the lane from %vec2 otherwise.
3096 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3097 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
3099 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [8,6,10,6]
3100 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3101 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3102 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3103 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3104 ; CHECK-NEXT: vzeroupper
3106 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
3107 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3108 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3109 ret <4 x float> %res
; Zero-masked variant: shuffles elements <8,6,10,6> of %vec into a <4 x float>,
; then keeps each shuffled lane where %mask is ordered-equal to 0.0 (fcmp oeq)
; and yields 0.0 otherwise.
3112 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
3113 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
3115 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [8,6,10,6]
3116 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3117 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
3118 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3119 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3120 ; CHECK-NEXT: vzeroupper
3122 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
3123 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3124 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3125 ret <4 x float> %res
; Merge-masked variant: shuffles elements <12,12,4,5> of %vec into a <4 x float>,
; then keeps each shuffled lane where %mask is ordered-equal to 0.0 (fcmp oeq)
; and takes the lane from %vec2 otherwise.
; The expected assembly builds the shuffle from vextractf64x4 + vshufps and
; applies the mask on the final vextractf32x4 instead of using a variable permute.
3127 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3128 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2:
3130 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3131 ; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[0,1],ymm3[4,4],ymm0[4,5]
3132 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3133 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3134 ; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1}
3135 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3136 ; CHECK-NEXT: vzeroupper
3138 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
3139 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3140 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3141 ret <4 x float> %res
; Zero-masked variant: shuffles elements <12,12,4,5> of %vec into a <4 x float>,
; then keeps each shuffled lane where %mask is ordered-equal to 0.0 (fcmp oeq)
; and yields 0.0 otherwise.
; The expected assembly builds the shuffle from vextractf64x4 + vshufps and
; applies the zero-mask on the final vextractf32x4.
3144 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %mask) {
3145 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2:
3147 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
3148 ; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5]
3149 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3150 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3151 ; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
3152 ; CHECK-NEXT: vzeroupper
3154 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
3155 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3156 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3157 ret <4 x float> %res
; Unmasked baseline: returns elements <10,2,11,6> of the <16 x float> argument as
; a <4 x float>.
3159 define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) {
3160 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3:
3162 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [10,2,11,6]
3163 ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
3164 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3165 ; CHECK-NEXT: vzeroupper
3167 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3168 ret <4 x float> %res
; Merge-masked variant: shuffles elements <10,2,11,6> of %vec into a <4 x float>,
; then keeps each shuffled lane where %mask is ordered-equal to 0.0 (fcmp oeq)
; and takes the lane from %vec2 otherwise.
3170 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3171 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3:
3173 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [10,2,11,6]
3174 ; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
3175 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3176 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3177 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3178 ; CHECK-NEXT: vzeroupper
3180 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3181 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3182 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3183 ret <4 x float> %res
; Zero-masked variant: shuffles elements <10,2,11,6> of %vec into a <4 x float>,
; then keeps each shuffled lane where %mask is ordered-equal to 0.0 (fcmp oeq)
; and yields 0.0 otherwise.
3186 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) {
3187 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3:
3189 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [10,2,11,6]
3190 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3191 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
3192 ; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
3193 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
3194 ; CHECK-NEXT: vzeroupper
3196 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3197 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3198 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3199 ret <4 x float> %res
; Unmasked baseline, vector loaded from memory: returns elements
; <7,6,7,11,5,10,0,4> of the <16 x float> loaded from %vp as an <8 x float>.
3201 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp) {
3202 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0:
3204 ; CHECK-NEXT: vmovaps (%rdi), %ymm1
3205 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4]
3206 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm0
3208 %vec = load <16 x float>, <16 x float>* %vp
3209 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3210 ret <8 x float> %res
; Merge-masked variant, vector loaded from memory: shuffles elements
; <7,6,7,11,5,10,0,4> of the <16 x float> loaded from %vp into an <8 x float>,
; then keeps each shuffled lane where %mask is ordered-equal to 0.0 (fcmp oeq)
; and takes the lane from %vec2 otherwise.
3212 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
3213 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0:
3215 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3216 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,6,7,11,5,10,0,4]
3217 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
3218 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3219 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3220 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3222 %vec = load <16 x float>, <16 x float>* %vp
3223 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3224 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3225 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3226 ret <8 x float> %res
; Zero-masked variant, vector loaded from memory: shuffles elements
; <7,6,7,11,5,10,0,4> of the <16 x float> loaded from %vp into an <8 x float>,
; then keeps each shuffled lane where %mask is ordered-equal to 0.0 (fcmp oeq)
; and yields 0.0 otherwise.
3229 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %mask) {
3230 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0:
3232 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3233 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4]
3234 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3235 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3236 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
3237 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
3239 %vec = load <16 x float>, <16 x float>* %vp
3240 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3241 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3242 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3243 ret <8 x float> %res
; Merge-masked variant, vector loaded from memory: shuffles elements
; <11,0,9,0,7,14,0,8> of the <16 x float> loaded from %vp into an <8 x float>,
; then keeps each shuffled lane where %mask is ordered-equal to 0.0 (fcmp oeq)
; and takes the lane from %vec2 otherwise.
3246 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
3247 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1:
3249 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3250 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [11,0,9,0,7,14,0,8]
3251 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
3252 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3253 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3254 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3256 %vec = load <16 x float>, <16 x float>* %vp
3257 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
3258 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3259 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3260 ret <8 x float> %res
; Zero-masked variant, vector loaded from memory: shuffles elements
; <11,0,9,0,7,14,0,8> of the <16 x float> loaded from %vp into an <8 x float>,
; then keeps each shuffled lane where %mask is ordered-equal to 0.0 (fcmp oeq)
; and yields 0.0 otherwise.
3263 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %mask) {
3264 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1:
3266 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3267 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8]
3268 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3269 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3270 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
3271 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
3273 %vec = load <16 x float>, <16 x float>* %vp
3274 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
3275 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3276 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3277 ret <8 x float> %res
; Merge-masked variant, vector loaded from memory: shuffles elements
; <1,13,10,11,10,0,0,9> of the <16 x float> loaded from %vp into an <8 x float>,
; then keeps each shuffled lane where %mask is ordered-equal to 0.0 (fcmp oeq)
; and takes the lane from %vec2 otherwise.
; The expected index constant [9,5,2,3,2,8,8,1] differs from the IR mask because
; the expected vpermi2ps has its sources swapped: the upper half (32(%rdi)) is
; loaded into a register and the lower half ((%rdi)) is the memory operand.
3280 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
3281 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
3283 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3284 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1]
3285 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
3286 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3287 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3288 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3290 %vec = load <16 x float>, <16 x float>* %vp
3291 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
3292 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3293 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3294 ret <8 x float> %res
; Zero-masked variant, vector loaded from memory: shuffles elements
; <1,13,10,11,10,0,0,9> of the <16 x float> loaded from %vp into an <8 x float>,
; then keeps each shuffled lane where %mask is ordered-equal to 0.0 (fcmp oeq)
; and yields 0.0 otherwise.
; The expected index constant [9,5,2,3,2,8,8,1] differs from the IR mask because
; the expected vpermi2ps has its sources swapped: the upper half (32(%rdi)) is
; loaded into a register and the lower half ((%rdi)) is the memory operand.
3297 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %mask) {
3298 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
3300 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3301 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1]
3302 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3303 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3304 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3305 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
3307 %vec = load <16 x float>, <16 x float>* %vp
3308 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
3309 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3310 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3311 ret <8 x float> %res
; Unmasked baseline, vector loaded from memory: returns elements
; <15,13,11,11,3,12,4,1> of the <16 x float> loaded from %vp as an <8 x float>.
; The expected index constant [7,5,3,3,11,4,12,9] differs from the IR mask because
; the expected vpermi2ps has its sources swapped: the upper half (32(%rdi)) is
; loaded into a register and the lower half ((%rdi)) is the memory operand.
3314 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp) {
3315 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3:
3317 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
3318 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9]
3319 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm0
3321 %vec = load <16 x float>, <16 x float>* %vp
3322 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3323 ret <8 x float> %res
; Merge-masked variant, vector loaded from memory: shuffles elements
; <15,13,11,11,3,12,4,1> of the <16 x float> loaded from %vp into an <8 x float>,
; then keeps each shuffled lane where %mask is ordered-equal to 0.0 (fcmp oeq)
; and takes the lane from %vec2 otherwise.
; The expected index constant [7,5,3,3,11,4,12,9] differs from the IR mask because
; the expected vpermi2ps has its sources swapped: the upper half (32(%rdi)) is
; loaded into a register and the lower half ((%rdi)) is the memory operand.
3325 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
3326 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3:
3328 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3329 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9]
3330 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
3331 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3332 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3333 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3335 %vec = load <16 x float>, <16 x float>* %vp
3336 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3337 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3338 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3339 ret <8 x float> %res
; Zero-masked variant, vector loaded from memory: shuffles elements
; <15,13,11,11,3,12,4,1> of the <16 x float> loaded from %vp into an <8 x float>,
; then keeps each shuffled lane where %mask is ordered-equal to 0.0 (fcmp oeq)
; and yields 0.0 otherwise.
; The expected index constant [7,5,3,3,11,4,12,9] differs from the IR mask because
; the expected vpermi2ps has its sources swapped: the upper half (32(%rdi)) is
; loaded into a register and the lower half ((%rdi)) is the memory operand.
3342 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %mask) {
3343 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3:
3345 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3346 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9]
3347 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3348 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3349 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3350 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
3352 %vec = load <16 x float>, <16 x float>* %vp
3353 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3354 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3355 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3356 ret <8 x float> %res
; Unmasked baseline, vector loaded from memory: returns elements <14,6,7,11> of
; the <16 x float> loaded from %vp as a <4 x float>.
; The expected assembly is a two-step sequence: a vpermpd from memory followed by
; a 128-bit vpermi2ps against 16(%rdi), so its index constant is not the IR mask.
3359 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp) {
3360 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0:
3362 ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = mem[3,1,2,3]
3363 ; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,7,3]
3364 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0
3365 ; CHECK-NEXT: vzeroupper
3367 %vec = load <16 x float>, <16 x float>* %vp
3368 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3369 ret <4 x float> %res
; Merge-masked variant, vector loaded from memory: shuffles elements <14,6,7,11>
; of the <16 x float> loaded from %vp into a <4 x float>, then keeps each shuffled
; lane where %mask is ordered-equal to 0.0 (fcmp oeq) and takes %vec2 otherwise.
; The expected assembly is a two-step sequence: a vpermpd from memory followed by
; a 128-bit vpermi2ps against 16(%rdi), so its index constant is not the IR mask.
3371 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3372 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0:
3374 ; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = mem[3,1,2,3]
3375 ; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [0,6,7,3]
3376 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
3377 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3378 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3379 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3380 ; CHECK-NEXT: vzeroupper
3382 %vec = load <16 x float>, <16 x float>* %vp
3383 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3384 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3385 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3386 ret <4 x float> %res
; Zero-masked variant, vector loaded from memory: shuffles elements <14,6,7,11>
; of the <16 x float> loaded from %vp into a <4 x float>, then keeps each shuffled
; lane where %mask is ordered-equal to 0.0 (fcmp oeq) and yields 0.0 otherwise.
; The expected assembly is a two-step sequence: a vpermpd from memory followed by
; a zero-masked 128-bit vpermi2ps against 16(%rdi).
3389 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %mask) {
3390 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0:
3392 ; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = mem[3,1,2,3]
3393 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,7,3]
3394 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3395 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3396 ; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
3397 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3398 ; CHECK-NEXT: vzeroupper
3400 %vec = load <16 x float>, <16 x float>* %vp
3401 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3402 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3403 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3404 ret <4 x float> %res
; Merge-masked variant, vector loaded from memory: shuffles elements <8,2,14,7>
; of the <16 x float> loaded from %vp into a <4 x float>, then keeps each shuffled
; lane where %mask is ordered-equal to 0.0 (fcmp oeq) and takes %vec2 otherwise.
; The expected assembly uses a 256-bit vpermi2ps with the upper half (32(%rdi)) in
; a register and the lower half as the memory operand; only the low four entries
; of its index constant ([0,10,6,15,...]) correspond to the IR mask.
3407 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3408 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1:
3410 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3411 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,10,6,15,4,14,6,15]
3412 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
3413 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3414 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3415 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3416 ; CHECK-NEXT: vzeroupper
3418 %vec = load <16 x float>, <16 x float>* %vp
3419 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
3420 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3421 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3422 ret <4 x float> %res
; Zero-masked variant, vector loaded from memory: shuffles elements <8,2,14,7>
; of the <16 x float> loaded from %vp into a <4 x float>, then keeps each shuffled
; lane where %mask is ordered-equal to 0.0 (fcmp oeq) and yields 0.0 otherwise.
; The expected assembly uses a zero-masked 256-bit vpermi2ps with the upper half
; (32(%rdi)) in a register and the lower half as the memory operand; only the low
; four entries of its index constant ([0,10,6,15,...]) correspond to the IR mask.
3425 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %mask) {
3426 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1:
3428 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3429 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,10,6,15,4,14,6,15]
3430 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3431 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3432 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3433 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3434 ; CHECK-NEXT: vzeroupper
3436 %vec = load <16 x float>, <16 x float>* %vp
3437 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
3438 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3439 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3440 ret <4 x float> %res
; Merge-masked variant, vector loaded from memory: shuffles elements <12,6,12,6>
; of the <16 x float> loaded from %vp into a <4 x float>, then keeps each shuffled
; lane where %mask is ordered-equal to 0.0 (fcmp oeq) and takes %vec2 otherwise.
; The expected assembly uses a 256-bit vpermi2ps with the upper half (32(%rdi)) in
; a register and the lower half as the memory operand; only the low four entries
; of its index constant ([4,14,4,14,...]) correspond to the IR mask.
3443 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3444 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
3446 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3447 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [4,14,4,14,4,14,6,7]
3448 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
3449 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3450 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3451 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3452 ; CHECK-NEXT: vzeroupper
3454 %vec = load <16 x float>, <16 x float>* %vp
3455 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
3456 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3457 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3458 ret <4 x float> %res
; Zero-masked variant, vector loaded from memory: shuffles elements <12,6,12,6>
; of the <16 x float> loaded from %vp into a <4 x float>, then keeps each shuffled
; lane where %mask is ordered-equal to 0.0 (fcmp oeq) and yields 0.0 otherwise.
; The expected assembly uses a zero-masked 256-bit vpermi2ps with the upper half
; (32(%rdi)) in a register and the lower half as the memory operand; only the low
; four entries of its index constant ([4,14,4,14,...]) correspond to the IR mask.
3461 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %mask) {
3462 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
3464 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3465 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,14,4,14,4,14,6,7]
3466 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3467 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3468 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3469 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3470 ; CHECK-NEXT: vzeroupper
3472 %vec = load <16 x float>, <16 x float>* %vp
3473 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
3474 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3475 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3476 ret <4 x float> %res
; Unmasked baseline, vector loaded from memory: returns elements <3,3,15,9> of the
; <16 x float> loaded from %vp as a <4 x float>.
; The expected assembly uses vpermt2ps (index register kept separate, table
; register overwritten) with the lower half in a register and 32(%rdi) in memory.
3479 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp) {
3480 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3:
3482 ; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,15,9]
3483 ; CHECK-NEXT: vmovaps (%rdi), %ymm0
3484 ; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm1, %ymm0
3485 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3486 ; CHECK-NEXT: vzeroupper
3488 %vec = load <16 x float>, <16 x float>* %vp
3489 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3490 ret <4 x float> %res
; Merge-masked variant, vector loaded from memory: shuffles elements <3,3,15,9>
; of the <16 x float> loaded from %vp into a <4 x float>, then keeps each shuffled
; lane where %mask is ordered-equal to 0.0 (fcmp oeq) and takes %vec2 otherwise.
3492 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3493 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
3495 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9]
3496 ; CHECK-NEXT: vmovaps (%rdi), %ymm3
3497 ; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm3
3498 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3499 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3500 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3501 ; CHECK-NEXT: vzeroupper
3503 %vec = load <16 x float>, <16 x float>* %vp
3504 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3505 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3506 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3507 ret <4 x float> %res
; Zero-masked variant, vector loaded from memory: shuffles elements <3,3,15,9>
; of the <16 x float> loaded from %vp into a <4 x float>, then keeps each shuffled
; lane where %mask is ordered-equal to 0.0 (fcmp oeq) and yields 0.0 otherwise.
3510 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %mask) {
3511 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
3513 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9]
3514 ; CHECK-NEXT: vmovaps (%rdi), %ymm1
3515 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3516 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
3517 ; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
3518 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3519 ; CHECK-NEXT: vzeroupper
3521 %vec = load <16 x float>, <16 x float>* %vp
3522 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3523 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3524 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3525 ret <4 x float> %res
; Unmasked baseline: returns elements <2,0> of the <4 x double> argument as a
; <2 x double>.
3528 define <2 x double> @test_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec) {
3529 ; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mask0:
3531 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
3532 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3533 ; CHECK-NEXT: vzeroupper
3535 %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3536 ret <2 x double> %res
; Merge-masked variant: shuffles elements <2,0> of %vec into a <2 x double>, then
; keeps each shuffled lane where %mask is ordered-equal to 0.0 (fcmp oeq) and
; takes the lane from %vec2 otherwise.
3538 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3539 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask0:
3541 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
3542 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3543 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
3544 ; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
3545 ; CHECK-NEXT: vzeroupper
3547 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3548 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3549 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3550 ret <2 x double> %res
; Zero-masked narrowing shuffle: picks elements <2, 0> of %vec. Lanes where
; %mask is ordered-equal to 0.0 take the shuffled value; all other lanes are
; zero.
3553 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %mask) {
3554 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask0:
3556 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3557 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
3558 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3]
3559 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3560 ; CHECK-NEXT: vzeroupper
3562 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3563 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3564 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3565 ret <2 x double> %res
; Merge-masked narrowing shuffle: picks elements <1, 3> of %vec. Lanes where
; %mask is ordered-equal to 0.0 take the shuffled value; all other lanes take
; the corresponding element of %vec2.
3567 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3568 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask1:
3570 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
3571 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3572 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
3573 ; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
3574 ; CHECK-NEXT: vzeroupper
3576 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
3577 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3578 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3579 ret <2 x double> %res
; Zero-masked narrowing shuffle: picks elements <1, 3> of %vec. Lanes where
; %mask is ordered-equal to 0.0 take the shuffled value; all other lanes are
; zero.
3582 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %mask) {
3583 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask1:
3585 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3586 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
3587 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,2,3]
3588 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3589 ; CHECK-NEXT: vzeroupper
3591 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
3592 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3593 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3594 ret <2 x double> %res
; Unmasked narrowing shuffle of a vector loaded from memory: picks elements
; <2, 1> of the <4 x double> loaded from %vp.
3596 define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp) {
3597 ; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0:
3599 ; CHECK-NEXT: vmovaps (%rdi), %xmm0
3600 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
3602 %vec = load <4 x double>, <4 x double>* %vp
3603 %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3604 ret <2 x double> %res
; Merge-masked narrowing shuffle of a vector loaded from memory: picks elements
; <2, 1> of the <4 x double> loaded from %vp. Lanes where %mask is
; ordered-equal to 0.0 take the shuffled value; all other lanes take %vec2.
3606 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
3607 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask0:
3609 ; CHECK-NEXT: vmovapd (%rdi), %xmm2
3610 ; CHECK-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm2[1]
3611 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3612 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
3613 ; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1}
3615 %vec = load <4 x double>, <4 x double>* %vp
3616 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3617 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3618 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3619 ret <2 x double> %res
; Zero-masked narrowing shuffle of a vector loaded from memory: picks elements
; <2, 1> of the <4 x double> loaded from %vp. Lanes where %mask is
; ordered-equal to 0.0 take the shuffled value; all other lanes are zero.
3622 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %mask) {
3623 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0:
3625 ; CHECK-NEXT: vmovapd (%rdi), %xmm1
3626 ; CHECK-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm1[1]
3627 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3628 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
3629 ; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z}
3631 %vec = load <4 x double>, <4 x double>* %vp
3632 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3633 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3634 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3635 ret <2 x double> %res
; Merge-masked narrowing shuffle of a vector loaded from memory: picks elements
; <2, 0> of the <4 x double> loaded from %vp. Lanes where %mask is
; ordered-equal to 0.0 take the shuffled value; all other lanes take %vec2.
3638 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
3639 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1:
3641 ; CHECK-NEXT: vmovapd 16(%rdi), %xmm2
3642 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3643 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
3644 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} = xmm2[0],mem[0]
3646 %vec = load <4 x double>, <4 x double>* %vp
3647 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3648 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3649 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3650 ret <2 x double> %res
; Zero-masked narrowing shuffle of a vector loaded from memory: picks elements
; <2, 0> of the <4 x double> loaded from %vp. Lanes where %mask is
; ordered-equal to 0.0 take the shuffled value; all other lanes are zero.
3653 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %mask) {
3654 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1:
3656 ; CHECK-NEXT: vmovapd 16(%rdi), %xmm1
3657 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3658 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
3659 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm1[0],mem[0]
3661 %vec = load <4 x double>, <4 x double>* %vp
3662 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3663 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3664 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3665 ret <2 x double> %res
; Unmasked narrowing shuffle: picks elements <7, 3, 7, 3> of the <8 x double>
; argument %vec and returns them as a <4 x double>.
3668 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
3669 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0:
3671 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,3,7,3,7,3,7,3]
3672 ; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3673 ; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0
3674 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3676 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3677 ret <4 x double> %res
; Merge-masked narrowing shuffle: picks elements <7, 3, 7, 3> of %vec. Lanes
; where %mask is ordered-equal to 0.0 take the shuffled value; all other lanes
; take the corresponding element of %vec2.
3679 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3680 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
3682 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,3,7,3,7,3,7,3]
3683 ; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3684 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
3685 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
3686 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3687 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
3689 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3690 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3691 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3692 ret <4 x double> %res
; Zero-masked narrowing shuffle: picks elements <7, 3, 7, 3> of %vec. Lanes
; where %mask is ordered-equal to 0.0 take the shuffled value; all other lanes
; are zero.
3695 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) {
3696 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
3698 ; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,3,7,3]
3699 ; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
3700 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3701 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
3702 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3703 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3705 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3706 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3707 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3708 ret <4 x double> %res
; Merge-masked narrowing shuffle: picks elements <2, 0, 7, 6> of %vec. Lanes
; where %mask is ordered-equal to 0.0 take the shuffled value; all other lanes
; take the corresponding element of %vec2.
3710 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3711 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1:
3713 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,0,7,6,2,0,7,6]
3714 ; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
3715 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
3716 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
3717 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3718 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
3720 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
3721 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3722 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3723 ret <4 x double> %res
; Zero-masked narrowing shuffle: picks elements <2, 0, 7, 6> of %vec. Lanes
; where %mask is ordered-equal to 0.0 take the shuffled value; all other lanes
; are zero.
3726 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %mask) {
3727 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask1:
3729 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,0,7,6]
3730 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3731 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
3732 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3733 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3735 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
3736 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3737 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3738 ret <4 x double> %res
; Merge-masked narrowing shuffle: picks elements <2, 3, 2, 0> of %vec (every
; index is below 4, so only the first four elements are read). Lanes where
; %mask is ordered-equal to 0.0 take the shuffled value; others take %vec2.
3740 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3741 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask2:
3743 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3744 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3745 ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,0]
3746 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
3748 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
3749 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3750 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3751 ret <4 x double> %res
; Zero-masked narrowing shuffle: picks elements <2, 3, 2, 0> of %vec (every
; index is below 4, so only the first four elements are read). Lanes where
; %mask is ordered-equal to 0.0 take the shuffled value; others are zero.
3754 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %mask) {
3755 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask2:
3757 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3758 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
3759 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,0]
3761 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
3762 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3763 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3764 ret <4 x double> %res
; Unmasked narrowing shuffle: picks elements <0, 2, 1, 4> of the <8 x double>
; argument %vec and returns them as a <4 x double>.
3766 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) {
3767 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask3:
3769 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm2
3770 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,1,4]
3771 ; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
3772 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
3774 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
3775 ret <4 x double> %res
; Merge-masked narrowing shuffle: picks elements <0, 2, 1, 4> of %vec. Lanes
; where %mask is ordered-equal to 0.0 take the shuffled value; all other lanes
; take the corresponding element of %vec2.
3777 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3778 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3:
3780 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3
3781 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,1,4]
3782 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
3783 ; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
3784 ; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
3785 ; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
3787 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
3788 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3789 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3790 ret <4 x double> %res
; Zero-masked narrowing shuffle: picks elements <0, 2, 1, 4> of %vec. Lanes
; where %mask is ordered-equal to 0.0 take the shuffled value; all other lanes
; are zero.
3793 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %mask) {
3794 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask3:
3796 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3
3797 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,1,4]
3798 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
3799 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
3800 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
3801 ; CHECK-NEXT: vmovapd %ymm2, %ymm0
3803 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
3804 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3805 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3806 ret <4 x double> %res
; Merge-masked narrowing shuffle: picks elements <1, 1, 5, 5> of %vec. Lanes
; where %mask is ordered-equal to 0.0 take the shuffled value; all other lanes
; take the corresponding element of %vec2.
3808 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3809 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
3811 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3
3812 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,1,5,5]
3813 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
3814 ; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
3815 ; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
3816 ; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
3818 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
3819 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3820 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3821 ret <4 x double> %res
; Zero-masked narrowing shuffle: picks elements <1, 1, 5, 5> of %vec. Lanes
; where %mask is ordered-equal to 0.0 take the shuffled value; all other lanes
; are zero.
3824 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
3825 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
3827 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3
3828 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [1,1,5,5]
3829 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
3830 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
3831 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
3832 ; CHECK-NEXT: vmovapd %ymm2, %ymm0
3834 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
3835 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3836 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3837 ret <4 x double> %res
; Merge-masked narrowing shuffle: picks elements <2, 6, 2, 2> of %vec. Lanes
; where %mask is ordered-equal to 0.0 take the shuffled value; all other lanes
; take the corresponding element of %vec2.
3839 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3840 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5:
3842 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,6,2,2,2,6,2,2]
3843 ; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
3844 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
3845 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
3846 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3847 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
3849 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
3850 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3851 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3852 ret <4 x double> %res
; Zero-masked narrowing shuffle: picks elements <2, 6, 2, 2> of %vec. Lanes
; where %mask is ordered-equal to 0.0 take the shuffled value; all other lanes
; are zero.
3855 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) {
3856 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5:
3858 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,6,2,2]
3859 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3860 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
3861 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3862 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3864 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
3865 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3866 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3867 ret <4 x double> %res
; Unmasked narrowing shuffle: picks elements <5, 0, 7, 0> of the <8 x double>
; argument %vec and returns them as a <4 x double>.
3869 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
3870 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
3872 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,0,7,0,5,0,7,0]
3873 ; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
3874 ; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0
3875 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3877 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
3878 ret <4 x double> %res
; Merge-masked narrowing shuffle: picks elements <5, 0, 7, 0> of %vec. Lanes
; where %mask is ordered-equal to 0.0 take the shuffled value; all other lanes
; take the corresponding element of %vec2.
3880 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3881 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
3883 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [5,0,7,0,5,0,7,0]
3884 ; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
3885 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
3886 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
3887 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3888 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
3890 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
3891 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3892 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3893 ret <4 x double> %res
; Zero-masked narrowing shuffle: picks elements <5, 0, 7, 0> of %vec. Lanes
; where %mask is ordered-equal to 0.0 take the shuffled value; all other lanes
; are zero.
3896 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
3897 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
3899 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [5,0,7,0]
3900 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3901 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
3902 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3903 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3905 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
3906 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3907 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3908 ret <4 x double> %res
; Merge-masked narrowing shuffle: picks elements <3, 5, 0, 6> of %vec. Lanes
; where %mask is ordered-equal to 0.0 take the shuffled value; all other lanes
; take the corresponding element of %vec2.
3910 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3911 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
3913 ; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,5,0,6,3,5,0,6]
3914 ; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
3915 ; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
3916 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
3917 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3918 ; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
3920 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
3921 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3922 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3923 ret <4 x double> %res
; Zero-masked narrowing shuffle: picks elements <3, 5, 0, 6> of %vec. Lanes
; where %mask is ordered-equal to 0.0 take the shuffled value; all other lanes
; are zero.
3926 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) {
3927 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
3929 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [3,5,0,6]
3930 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3931 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
3932 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
3933 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
3935 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
3936 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3937 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3938 ret <4 x double> %res
; Unmasked narrowing shuffle: picks elements <0, 6> of the <8 x double>
; argument %vec and returns them as a <2 x double>.
3940 define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
3941 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
3943 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
3944 ; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
3945 ; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3946 ; CHECK-NEXT: vzeroupper
3948 %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
3949 ret <2 x double> %res
; Merge-masked narrowing shuffle: picks elements <0, 6> of %vec. Lanes where
; %mask is ordered-equal to 0.0 take the shuffled value; all other lanes take
; the corresponding element of %vec2.
3951 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3952 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
3954 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3955 ; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm3
3956 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
3957 ; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
3958 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
3959 ; CHECK-NEXT: vmovapd %xmm1, %xmm0
3960 ; CHECK-NEXT: vzeroupper
3962 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
3963 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3964 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3965 ret <2 x double> %res
; Zero-masked narrowing shuffle: picks elements <0, 6> of %vec. Lanes where
; %mask is ordered-equal to 0.0 take the shuffled value; all other lanes are
; zero.
3968 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
3969 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
3971 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
3972 ; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2
3973 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3974 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
3975 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
3976 ; CHECK-NEXT: vzeroupper
3978 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
3979 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3980 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3981 ret <2 x double> %res
; Merge-masked narrowing shuffle: picks elements <3, 7> of %vec. Lanes where
; %mask is ordered-equal to 0.0 take the shuffled value; all other lanes take
; the corresponding element of %vec2.
3983 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3984 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask1:
3986 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3987 ; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
3988 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
3989 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3990 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
3991 ; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
3992 ; CHECK-NEXT: vzeroupper
3994 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
3995 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3996 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3997 ret <2 x double> %res
; Zero-masked narrowing shuffle: picks elements <3, 7> of %vec. Lanes where
; %mask is ordered-equal to 0.0 take the shuffled value; all other lanes are
; zero.
4000 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %mask) {
4001 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask1:
4003 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4004 ; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
4005 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
4006 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4007 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
4008 ; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
4009 ; CHECK-NEXT: vzeroupper
4011 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
4012 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4013 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4014 ret <2 x double> %res
; Unmasked narrowing shuffle of a vector loaded from memory: picks elements
; <1, 6, 7, 2> of the <8 x double> loaded from %vp.
4016 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp) {
4017 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0:
4019 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
4020 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [1,6,7,2]
4021 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
4023 %vec = load <8 x double>, <8 x double>* %vp
4024 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4025 ret <4 x double> %res
; Merge-masked narrowing shuffle of a vector loaded from memory: picks elements
; <1, 6, 7, 2> of the <8 x double> loaded from %vp. Lanes where %mask is
; ordered-equal to 0.0 take the shuffled value; all other lanes take %vec2.
4027 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4028 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0:
4030 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4031 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,6,7,2]
4032 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
4033 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4034 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4035 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4037 %vec = load <8 x double>, <8 x double>* %vp
4038 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4039 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4040 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4041 ret <4 x double> %res
; Zero-masked narrowing shuffle of a vector loaded from memory: picks elements
; <1, 6, 7, 2> of the <8 x double> loaded from %vp. Lanes where %mask is
; ordered-equal to 0.0 take the shuffled value; all other lanes are zero.
4044 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %mask) {
4045 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0:
4047 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4048 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,6,7,2]
4049 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4050 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4051 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4052 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4054 %vec = load <8 x double>, <8 x double>* %vp
4055 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4056 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4057 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4058 ret <4 x double> %res
; Merge-masked narrowing shuffle of a vector loaded from memory: picks elements
; <3, 4, 2, 4> of the <8 x double> loaded from %vp. Lanes where %mask is
; ordered-equal to 0.0 take the shuffled value; all other lanes take %vec2.
4061 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4062 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
4064 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4065 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [3,4,2,4]
4066 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
4067 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4068 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4069 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4071 %vec = load <8 x double>, <8 x double>* %vp
4072 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
4073 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4074 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4075 ret <4 x double> %res
; Zero-masked narrowing shuffle of a vector loaded from memory: picks elements
; <3, 4, 2, 4> of the <8 x double> loaded from %vp. Lanes where %mask is
; ordered-equal to 0.0 take the shuffled value; all other lanes are zero.
4078 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %mask) {
4079 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
4081 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4082 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [3,4,2,4]
4083 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4084 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4085 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4086 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4088 %vec = load <8 x double>, <8 x double>* %vp
4089 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
4090 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4091 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4092 ret <4 x double> %res
; Merge-masked narrowing shuffle of a vector loaded from memory: picks elements
; <1, 2, 3, 4> of the <8 x double> loaded from %vp. Lanes where %mask is
; ordered-equal to 0.0 take the shuffled value; all other lanes take %vec2.
4095 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4096 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
4098 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4099 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,2,3,4]
4100 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
4101 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4102 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4103 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4105 %vec = load <8 x double>, <8 x double>* %vp
4106 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
4107 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4108 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4109 ret <4 x double> %res
; Zero-masked narrowing shuffle of a vector loaded from memory: picks elements
; <1, 2, 3, 4> of the <8 x double> loaded from %vp. Lanes where %mask is
; ordered-equal to 0.0 take the shuffled value; all other lanes are zero.
4112 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %mask) {
4113 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
4115 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4116 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,2,3,4]
4117 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4118 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4119 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4120 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4122 %vec = load <8 x double>, <8 x double>* %vp
4123 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
4124 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4125 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4126 ret <4 x double> %res
; Unmasked narrowing shuffle of a vector loaded from memory: picks elements
; <4, 2, 1, 0> of the <8 x double> loaded from %vp.
4129 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp) {
4130 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
4132 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
4133 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [4,2,1,0]
4134 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
4136 %vec = load <8 x double>, <8 x double>* %vp
4137 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4138 ret <4 x double> %res
; Merge-masked narrowing shuffle of a vector loaded from memory: picks elements
; <4, 2, 1, 0> of the <8 x double> loaded from %vp. Lanes where %mask is
; ordered-equal to 0.0 take the shuffled value; all other lanes take %vec2.
4140 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4141 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
4143 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4144 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [4,2,1,0]
4145 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
4146 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4147 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4148 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4150 %vec = load <8 x double>, <8 x double>* %vp
4151 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4152 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4153 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4154 ret <4 x double> %res
; Zero-masked narrowing shuffle of a vector loaded from memory: picks elements
; <4, 2, 1, 0> of the <8 x double> loaded from %vp. Lanes where %mask is
; ordered-equal to 0.0 take the shuffled value; all other lanes are zero.
4157 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %mask) {
4158 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
4160 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4161 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [4,2,1,0]
4162 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4163 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4164 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4165 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4167 %vec = load <8 x double>, <8 x double>* %vp
4168 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4169 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4170 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4171 ret <4 x double> %res
; Merge-masked narrowing shuffle of a vector loaded from memory: picks elements
; <6, 0, 5, 1> of the <8 x double> loaded from %vp. Lanes where %mask is
; ordered-equal to 0.0 take the shuffled value; all other lanes take %vec2.
4174 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4175 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4:
4177 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4178 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,4,1,5]
4179 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
4180 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4181 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4182 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4184 %vec = load <8 x double>, <8 x double>* %vp
4185 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
4186 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4187 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4188 ret <4 x double> %res
; Zero-masked narrowing shuffle of a vector loaded from memory: picks elements
; <6, 0, 5, 1> of the <8 x double> loaded from %vp. Lanes where %mask is
; ordered-equal to 0.0 take the shuffled value; all other lanes are zero.
4191 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %mask) {
4192 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4:
4194 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4195 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [2,4,1,5]
4196 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4197 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4198 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
4199 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4201 %vec = load <8 x double>, <8 x double>* %vp
4202 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
4203 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4204 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4205 ret <4 x double> %res
; Merge-masked narrowing shuffle of a vector loaded from memory: picks elements
; <2, 5, 5, 5> of the <8 x double> loaded from %vp. Lanes where %mask is
; ordered-equal to 0.0 take the shuffled value; all other lanes take %vec2.
4208 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4209 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
4211 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4212 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [6,1,1,1]
4213 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
4214 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4215 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4216 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4218 %vec = load <8 x double>, <8 x double>* %vp
4219 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
4220 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4221 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4222 ret <4 x double> %res
; Zero-masked variant: loads an <8 x double> from %vp, extracts source lanes
; <2, 5, 5, 5> into a <4 x double>, and keeps that value only in lanes where
; %mask compares ordered-equal to zero; the remaining lanes are zeroed.
; The expected code folds the zeroing mask directly into vpermi2pd.
4225 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %mask) {
4226 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
4228 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4229 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [6,1,1,1]
4230 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4231 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4232 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
4233 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4235 %vec = load <8 x double>, <8 x double>* %vp
4236 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
; Per-lane predicate: true where the mask element is ordered-equal to zero.
4237 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
; True lanes take the shuffled value; false lanes become zero.
4238 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4239 ret <4 x double> %res
; Unmasked variant: loads an <8 x double> from %vp and returns source lanes
; <4, 6, 0, 5> as a <4 x double>.
; The expected code places the low 128 bits of the load into the upper half of
; a ymm register (vinsertf128) and combines it with the high 256 bits of the
; load using vpermi2pd.
4242 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp) {
4243 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
4245 ; CHECK-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm1
4246 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4247 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [0,2,6,1]
4248 ; CHECK-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0
4250 %vec = load <8 x double>, <8 x double>* %vp
4251 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4252 ret <4 x double> %res
; Merge-masked variant: loads an <8 x double> from %vp, extracts source lanes
; <4, 6, 0, 5> into a <4 x double>, and keeps that value only in lanes where
; %mask compares ordered-equal to zero; the remaining lanes take %vec2.
; The expected code builds the permute inputs with vinsertf128 plus a 256-bit
; load, permutes with vpermi2pd, then merges with a masked vmovapd.
4254 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4255 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
4257 ; CHECK-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm2
4258 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm3
4259 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,6,1]
4260 ; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm4
4261 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4262 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4263 ; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
4265 %vec = load <8 x double>, <8 x double>* %vp
4266 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
; Per-lane predicate: true where the mask element is ordered-equal to zero.
4267 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
; True lanes take the shuffled value; false lanes pass %vec2 through.
4268 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4269 ret <4 x double> %res
; Zero-masked variant: loads an <8 x double> from %vp, extracts source lanes
; <4, 6, 0, 5> into a <4 x double>, and keeps that value only in lanes where
; %mask compares ordered-equal to zero; the remaining lanes are zeroed.
; The expected code builds the permute inputs with vinsertf128 plus a 256-bit
; load and folds the zeroing mask directly into vpermi2pd.
4272 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %mask) {
4273 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
4275 ; CHECK-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm2
4276 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm3
4277 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,6,1]
4278 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
4279 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
4280 ; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm1 {%k1} {z}
4281 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4283 %vec = load <8 x double>, <8 x double>* %vp
4284 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
; Per-lane predicate: true where the mask element is ordered-equal to zero.
4285 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
; True lanes take the shuffled value; false lanes become zero.
4286 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4287 ret <4 x double> %res
; Merge-masked variant: loads an <8 x double> from %vp, extracts source lanes
; <0, 5, 2, 5> into a <4 x double>, and keeps that value only in lanes where
; %mask compares ordered-equal to zero; the remaining lanes take %vec2.
; The expected code needs no index vector here: it broadcasts the double at
; byte offset 40 and blends it with a memory operand, then merges with a
; masked vmovapd.
4290 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4291 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask7:
4293 ; CHECK-NEXT: vbroadcastsd 40(%rdi), %ymm2
4294 ; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = mem[0],ymm2[1],mem[2],ymm2[3]
4295 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4296 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
4297 ; CHECK-NEXT: vmovapd %ymm2, %ymm0 {%k1}
4299 %vec = load <8 x double>, <8 x double>* %vp
4300 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
; Per-lane predicate: true where the mask element is ordered-equal to zero.
4301 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
; True lanes take the shuffled value; false lanes pass %vec2 through.
4302 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4303 ret <4 x double> %res
; Zero-masked variant: loads an <8 x double> from %vp, extracts source lanes
; <0, 5, 2, 5> into a <4 x double>, and keeps that value only in lanes where
; %mask compares ordered-equal to zero; the remaining lanes are zeroed.
; The expected code broadcasts the double at byte offset 40, blends it with a
; memory operand, and applies the zeroing mask with a masked vmovapd.
4306 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %mask) {
4307 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7:
4309 ; CHECK-NEXT: vbroadcastsd 40(%rdi), %ymm1
4310 ; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3]
4311 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4312 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
4313 ; CHECK-NEXT: vmovapd %ymm1, %ymm0 {%k1} {z}
4315 %vec = load <8 x double>, <8 x double>* %vp
4316 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
; Per-lane predicate: true where the mask element is ordered-equal to zero.
4317 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
; True lanes take the shuffled value; false lanes become zero.
4318 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4319 ret <4 x double> %res
; Unmasked variant: loads an <8 x double> from %vp and returns source lanes
; <1, 6> as a <2 x double>.
; The expected code loads the first 128 bits and combines them with a second
; memory operand using a single vshufpd.
4322 define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp) {
4323 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
4325 ; CHECK-NEXT: vmovapd (%rdi), %xmm0
4326 ; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],mem[0]
4328 %vec = load <8 x double>, <8 x double>* %vp
4329 %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4330 ret <2 x double> %res
; Merge-masked variant: loads an <8 x double> from %vp, extracts source lanes
; <1, 6> into a <2 x double>, and keeps that value only in lanes where %mask
; compares ordered-equal to zero; the remaining lanes take %vec2.
; The expected code folds the merge mask directly into vshufpd, writing into
; the passthrough register.
4332 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
4333 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
4335 ; CHECK-NEXT: vmovapd (%rdi), %xmm2
4336 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4337 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4338 ; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} = xmm2[1],mem[0]
4340 %vec = load <8 x double>, <8 x double>* %vp
4341 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
; Per-lane predicate: true where the mask element is ordered-equal to zero.
4342 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
; True lanes take the shuffled value; false lanes pass %vec2 through.
4343 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4344 ret <2 x double> %res
; Zero-masked variant: loads an <8 x double> from %vp, extracts source lanes
; <1, 6> into a <2 x double>, and keeps that value only in lanes where %mask
; compares ordered-equal to zero; the remaining lanes are zeroed.
; The expected code folds the zeroing mask directly into vshufpd.
4347 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %mask) {
4348 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
4350 ; CHECK-NEXT: vmovapd (%rdi), %xmm1
4351 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4352 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
4353 ; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm1[1],mem[0]
4355 %vec = load <8 x double>, <8 x double>* %vp
4356 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
; Per-lane predicate: true where the mask element is ordered-equal to zero.
4357 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
; True lanes take the shuffled value; false lanes become zero.
4358 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4359 ret <2 x double> %res
4362 ; TODO - we'd be better off splitting the load to 2*xmm and performing a VSHUFPD.
; Merge-masked variant: loads an <8 x double> from %vp, extracts source lanes
; <1, 4> into a <2 x double>, and keeps that value only in lanes where %mask
; compares ordered-equal to zero; the remaining lanes take %vec2.
; The assertions below record the current output: a full 512-bit vpermq with
; a broadcast index vector, followed by a masked 128-bit vmovapd.
4363 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
4364 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
4366 ; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,4,1,4,1,4,1,4]
4367 ; CHECK-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4368 ; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm2
4369 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4370 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4371 ; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1}
4372 ; CHECK-NEXT: vzeroupper
4374 %vec = load <8 x double>, <8 x double>* %vp
4375 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
; Per-lane predicate: true where the mask element is ordered-equal to zero.
4376 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
; True lanes take the shuffled value; false lanes pass %vec2 through.
4377 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4378 ret <2 x double> %res
4381 ; TODO - we'd be better off splitting the load to 2*xmm and performing a VSHUFPD.
; Zero-masked variant: loads an <8 x double> from %vp, extracts source lanes
; <1, 4> into a <2 x double>, and keeps that value only in lanes where %mask
; compares ordered-equal to zero; the remaining lanes are zeroed.
; The assertions below record the current output: a 512-bit vpermpd from
; memory with the zeroing mask folded in, of which only xmm0 is returned.
4382 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %mask) {
4383 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
4385 ; CHECK-NEXT: vmovapd {{.*#+}} xmm1 = [1,4]
4386 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4387 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
4388 ; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
4389 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
4390 ; CHECK-NEXT: vzeroupper
4392 %vec = load <8 x double>, <8 x double>* %vp
4393 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
; Per-lane predicate: true where the mask element is ordered-equal to zero.
4394 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
; True lanes take the shuffled value; false lanes become zero.
4395 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4396 ret <2 x double> %res
4400 define void @test_zext_v8i8_to_v8i16(<8 x i8>* %arg, <8 x i16>* %arg1) {
4401 ; CHECK-LABEL: test_zext_v8i8_to_v8i16:
4403 ; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
4404 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
4405 ; CHECK-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
4406 ; CHECK-NEXT: vmovdqa %xmm0, (%rsi)
4408 %tmp = getelementptr <8 x i8>, <8 x i8>* %arg, i32 0
4409 %tmp2 = load <8 x i8>, <8 x i8>* %tmp
4410 %tmp3 = extractelement <8 x i8> %tmp2, i32 0
4411 %tmp4 = zext i8 %tmp3 to i16
4412 %tmp5 = insertelement <8 x i16> undef, i16 %tmp4, i32 0
4413 %tmp6 = extractelement <8 x i8> %tmp2, i32 1
4414 %tmp7 = zext i8 %tmp6 to i16
4415 %tmp8 = insertelement <8 x i16> %tmp5, i16 %tmp7, i32 1
4416 %tmp9 = extractelement <8 x i8> %tmp2, i32 2
4417 %tmp10 = zext i8 %tmp9 to i16
4418 %tmp11 = insertelement <8 x i16> %tmp8, i16 %tmp10, i32 2
4419 %tmp12 = extractelement <8 x i8> %tmp2, i32 3
4420 %tmp13 = zext i8 %tmp12 to i16
4421 %tmp14 = insertelement <8 x i16> %tmp11, i16 %tmp13, i32 3
4422 %tmp15 = extractelement <8 x i8> %tmp2, i32 4
4423 %tmp16 = zext i8 %tmp15 to i16
4424 %tmp17 = insertelement <8 x i16> %tmp14, i16 %tmp16, i32 4
4425 %tmp18 = extractelement <8 x i8> %tmp2, i32 5
4426 %tmp19 = zext i8 %tmp18 to i16
4427 %tmp20 = insertelement <8 x i16> %tmp17, i16 %tmp19, i32 5
4428 %tmp21 = extractelement <8 x i8> %tmp2, i32 6
4429 %tmp22 = zext i8 %tmp21 to i16
4430 %tmp23 = insertelement <8 x i16> %tmp20, i16 %tmp22, i32 6
4431 %tmp24 = extractelement <8 x i8> %tmp2, i32 7
4432 %tmp25 = zext i8 %tmp24 to i16
4433 %tmp26 = insertelement <8 x i16> %tmp23, i16 %tmp25, i32 7
4434 %tmp27 = shl <8 x i16> %tmp26, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
4435 %tmp28 = getelementptr <8 x i16>, <8 x i16>* %arg1, i32 0
4436 store <8 x i16> %tmp27, <8 x i16>* %tmp28