1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle %s -o - | FileCheck %s
4 ; FIXME: All cases here should be fixed by PR34380
; Plain (unmasked) narrowing shuffle: %res is built from <16 x i16> %vec by a
; constant-index shufflevector that yields <8 x i16>.
6 define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
7 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
9 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,12,13,12,13,8,9,14,15,12,13,12,13,8,9]
10 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
11 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
12 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4],xmm0[5,6,7]
13 ; CHECK-NEXT: vzeroupper
15 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
; Merge-masked narrowing shuffle: %shuf narrows <16 x i16> %vec to <8 x i16>
; with constant indices; each lane of %res keeps %shuf where the matching
; %mask lane equals zero and takes the %vec2 lane otherwise.
18 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
19 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
21 ; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[14,15,12,13,12,13,8,9,14,15,12,13,12,13,8,9]
22 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
23 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
24 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3,4],xmm0[5,6,7]
25 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
26 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
27 ; CHECK-NEXT: vzeroupper
29 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
30 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
31 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked narrowing shuffle: %shuf narrows <16 x i16> %vec to <8 x i16>
; with constant indices; each lane of %res keeps %shuf where the matching
; %mask lane equals zero and is zero otherwise.
35 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
36 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
38 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[14,15,12,13,12,13,8,9,14,15,12,13,12,13,8,9]
39 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
40 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
41 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4],xmm0[5,6,7]
42 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
43 ; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
44 ; CHECK-NEXT: vzeroupper
46 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
47 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
48 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked narrowing shuffle (second index pattern): %shuf narrows
; <16 x i16> %vec to <8 x i16>; each lane of %res keeps %shuf where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
51 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
52 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
54 ; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,12,13,14,15]
55 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
56 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
57 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
58 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
59 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
60 ; CHECK-NEXT: vzeroupper
62 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
63 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
64 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked narrowing shuffle (second index pattern): %shuf narrows
; <16 x i16> %vec to <8 x i16>; each lane of %res keeps %shuf where the
; matching %mask lane equals zero and is zero otherwise.
68 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
69 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
71 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,12,13,14,15]
72 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
73 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
74 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
75 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
76 ; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
77 ; CHECK-NEXT: vzeroupper
79 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
80 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
81 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked narrowing shuffle (third index pattern): %shuf narrows
; <16 x i16> %vec to <8 x i16>; each lane of %res keeps %shuf where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
84 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
85 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
87 ; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,10,11,4,5,6,7,14,15,2,3,12,13,14,15]
88 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
89 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
90 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3],xmm3[4,5,6],xmm0[7]
91 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
92 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
93 ; CHECK-NEXT: vzeroupper
95 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
96 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
97 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked narrowing shuffle (third index pattern): %shuf narrows
; <16 x i16> %vec to <8 x i16>; each lane of %res keeps %shuf where the
; matching %mask lane equals zero and is zero otherwise.
101 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) {
102 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2:
104 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,10,11,4,5,6,7,14,15,2,3,12,13,14,15]
105 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
106 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
107 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3],xmm2[4,5,6],xmm0[7]
108 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
109 ; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
110 ; CHECK-NEXT: vzeroupper
112 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
113 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
114 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Plain (unmasked) narrowing shuffle (fourth index pattern): %res is built
; from <16 x i16> %vec by a constant-index shufflevector yielding <8 x i16>.
117 define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
118 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask3:
120 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
121 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
122 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
123 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4],xmm0[5,6],xmm1[7]
124 ; CHECK-NEXT: vzeroupper
126 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
; Merge-masked narrowing shuffle (fourth index pattern): %shuf narrows
; <16 x i16> %vec to <8 x i16>; each lane of %res keeps %shuf where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
129 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
130 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3:
132 ; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
133 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
134 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
135 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4],xmm0[5,6],xmm3[7]
136 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
137 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
138 ; CHECK-NEXT: vzeroupper
140 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
141 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
142 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked narrowing shuffle (fourth index pattern): %shuf narrows
; <16 x i16> %vec to <8 x i16>; each lane of %res keeps %shuf where the
; matching %mask lane equals zero and is zero otherwise.
146 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %mask) {
147 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask3:
149 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
150 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
151 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
152 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4],xmm0[5,6],xmm2[7]
153 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
154 ; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
155 ; CHECK-NEXT: vzeroupper
157 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
158 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
159 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Memory-operand form of the plain narrowing shuffle: %vec is loaded through
; %vp as <16 x i16>, then %res is a constant-index shufflevector of it that
; yields <8 x i16>.
162 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp) {
163 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
165 ; CHECK-NEXT: vmovdqa (%rdi), %xmm0
166 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
167 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
168 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
169 ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
171 %vec = load <16 x i16>, <16 x i16>* %vp
172 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
; Merge-masked, memory-operand form: %vec is loaded through %vp, %shuf narrows
; it to <8 x i16> with constant indices, and each lane of %res keeps %shuf
; where the matching %mask lane equals zero and takes %vec2 otherwise.
175 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
176 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
178 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
179 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
180 ; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
181 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
182 ; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
183 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
184 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
186 %vec = load <16 x i16>, <16 x i16>* %vp
187 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
188 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
189 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked, memory-operand form: %vec is loaded through %vp, %shuf narrows
; it to <8 x i16> with constant indices, and each lane of %res keeps %shuf
; where the matching %mask lane equals zero and is zero otherwise.
193 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %mask) {
194 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
196 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
197 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
198 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
199 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
200 ; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6],xmm2[7]
201 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
202 ; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
204 %vec = load <16 x i16>, <16 x i16>* %vp
205 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
206 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
207 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked, memory-operand form (second index pattern): %vec is loaded
; through %vp, %shuf narrows it to <8 x i16>, and each lane of %res keeps
; %shuf where the matching %mask lane equals zero and takes %vec2 otherwise.
211 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
212 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
214 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
215 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
216 ; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
217 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
218 ; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4,5],xmm3[6,7]
219 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
220 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
222 %vec = load <16 x i16>, <16 x i16>* %vp
223 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
224 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
225 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked, memory-operand form (second index pattern): %vec is loaded
; through %vp, %shuf narrows it to <8 x i16>, and each lane of %res keeps
; %shuf where the matching %mask lane equals zero and is zero otherwise.
229 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %mask) {
230 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
232 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
233 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
234 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
235 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
236 ; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3,4,5],xmm2[6,7]
237 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
238 ; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
240 %vec = load <16 x i16>, <16 x i16>* %vp
241 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
242 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
243 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked, memory-operand form (third index pattern): %vec is loaded
; through %vp, %shuf narrows it to <8 x i16>, and each lane of %res keeps
; %shuf where the matching %mask lane equals zero and takes %vec2 otherwise.
247 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
248 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
250 ; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm2
251 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
252 ; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
253 ; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4,5,6,7]
254 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
255 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
257 %vec = load <16 x i16>, <16 x i16>* %vp
258 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
259 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
260 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked, memory-operand form (third index pattern): %vec is loaded
; through %vp, %shuf narrows it to <8 x i16>, and each lane of %res keeps
; %shuf where the matching %mask lane equals zero and is zero otherwise.
264 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %mask) {
265 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
267 ; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm1
268 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
269 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
270 ; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
271 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
272 ; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
274 %vec = load <16 x i16>, <16 x i16>* %vp
275 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
276 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
277 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Memory-operand form of the plain narrowing shuffle (fourth index pattern):
; %vec is loaded through %vp as <16 x i16>, then %res is a constant-index
; shufflevector of it that yields <8 x i16>.
281 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp) {
282 ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
284 ; CHECK-NEXT: vmovdqa (%rdi), %xmm0
285 ; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
286 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
288 %vec = load <16 x i16>, <16 x i16>* %vp
289 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
; Merge-masked, memory-operand form (fourth index pattern): %vec is loaded
; through %vp, %shuf narrows it to <8 x i16>, and each lane of %res keeps
; %shuf where the matching %mask lane equals zero and takes %vec2 otherwise.
292 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
293 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
295 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
296 ; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = mem[0],xmm2[1,2,3]
297 ; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
298 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
299 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
301 %vec = load <16 x i16>, <16 x i16>* %vp
302 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
303 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
304 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked, memory-operand form (fourth index pattern): %vec is loaded
; through %vp, %shuf narrows it to <8 x i16>, and each lane of %res keeps
; %shuf where the matching %mask lane equals zero and is zero otherwise.
308 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %mask) {
309 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
311 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
312 ; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1,2,3]
313 ; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
314 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
315 ; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
317 %vec = load <16 x i16>, <16 x i16>* %vp
318 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
319 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
320 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Plain (unmasked) narrowing shuffle: %res is built from <32 x i16> %vec by a
; constant-index shufflevector that yields <16 x i16>.
324 define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) {
325 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask0:
327 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
328 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
329 ; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1
330 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
332 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
; Merge-masked narrowing shuffle: %shuf narrows <32 x i16> %vec to <16 x i16>
; with constant indices; each lane of %res keeps %shuf where the matching
; %mask lane equals zero and takes the %vec2 lane otherwise.
335 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
336 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0:
338 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
339 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
340 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
341 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
342 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
344 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
345 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
346 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked narrowing shuffle: %shuf narrows <32 x i16> %vec to <16 x i16>
; with constant indices; each lane of %res keeps %shuf where the matching
; %mask lane equals zero and is zero otherwise.
350 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %mask) {
351 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0:
353 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
354 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
355 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
356 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
357 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
359 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
360 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
361 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked narrowing shuffle (second index pattern): %shuf narrows
; <32 x i16> %vec to <16 x i16>; each lane of %res keeps %shuf where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
364 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
365 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1:
367 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
368 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
369 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
370 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
371 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
373 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
374 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
375 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked narrowing shuffle (second index pattern): %shuf narrows
; <32 x i16> %vec to <16 x i16>; each lane of %res keeps %shuf where the
; matching %mask lane equals zero and is zero otherwise.
379 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %mask) {
380 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1:
382 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
383 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
384 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
385 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
386 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
388 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
389 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
390 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked narrowing shuffle (third index pattern): %shuf narrows
; <32 x i16> %vec to <16 x i16>; each lane of %res keeps %shuf where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
393 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
394 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2:
396 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
397 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
398 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
399 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
400 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
402 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
403 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
404 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked narrowing shuffle (third index pattern): %shuf narrows
; <32 x i16> %vec to <16 x i16>; each lane of %res keeps %shuf where the
; matching %mask lane equals zero and is zero otherwise.
408 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %mask) {
409 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2:
411 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
412 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
413 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
414 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
415 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
417 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
418 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
419 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Plain (unmasked) narrowing shuffle (fourth index pattern): %res is built
; from <32 x i16> %vec by a constant-index shufflevector yielding <16 x i16>.
422 define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
423 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3:
425 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
426 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
427 ; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1
428 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
430 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
; Merge-masked narrowing shuffle (fourth index pattern): %shuf narrows
; <32 x i16> %vec to <16 x i16>; each lane of %res keeps %shuf where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
433 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
434 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3:
436 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
437 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
438 ; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
439 ; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1
440 ; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
442 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
443 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
444 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked narrowing shuffle (fourth index pattern): %shuf narrows
; <32 x i16> %vec to <16 x i16>; each lane of %res keeps %shuf where the
; matching %mask lane equals zero and is zero otherwise.
448 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) {
449 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3:
451 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
452 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
453 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
454 ; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm2 {%k1} {z}
455 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
457 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
458 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
459 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Plain (unmasked) narrowing shuffle: %res is built from <32 x i16> %vec by a
; constant-index shufflevector that yields <8 x i16>.
462 define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) {
463 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0:
465 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
466 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
467 ; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1
468 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
469 ; CHECK-NEXT: vzeroupper
471 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
; Merge-masked narrowing shuffle: %shuf narrows <32 x i16> %vec to <8 x i16>
; with constant indices; each lane of %res keeps %shuf where the matching
; %mask lane equals zero and takes the %vec2 lane otherwise.
474 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
475 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0:
477 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
478 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
479 ; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
480 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
481 ; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
482 ; CHECK-NEXT: vzeroupper
484 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
485 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
486 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked narrowing shuffle: %shuf narrows <32 x i16> %vec to <8 x i16>
; with constant indices; each lane of %res keeps %shuf where the matching
; %mask lane equals zero and is zero otherwise.
490 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) {
491 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0:
493 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
494 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
495 ; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm3
496 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
497 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
498 ; CHECK-NEXT: vzeroupper
500 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
501 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
502 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked narrowing shuffle (second index pattern): %shuf narrows
; <32 x i16> %vec to <8 x i16>; each lane of %res keeps %shuf where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
505 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
506 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1:
508 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
509 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u>
510 ; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
511 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
512 ; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
513 ; CHECK-NEXT: vzeroupper
515 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
516 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
517 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked narrowing shuffle (second index pattern): %shuf narrows
; <32 x i16> %vec to <8 x i16>; each lane of %res keeps %shuf where the
; matching %mask lane equals zero and is zero otherwise.
521 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %mask) {
522 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1:
524 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
525 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u>
526 ; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
527 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
528 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
529 ; CHECK-NEXT: vzeroupper
531 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
532 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
533 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked narrowing shuffle (third index pattern): %shuf narrows
; <32 x i16> %vec to <8 x i16>; each lane of %res keeps %shuf where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
536 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
537 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2:
539 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
540 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u>
541 ; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
542 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
543 ; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
544 ; CHECK-NEXT: vzeroupper
546 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
547 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
548 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked narrowing shuffle (third index pattern): %shuf narrows
; <32 x i16> %vec to <8 x i16>; each lane of %res keeps %shuf where the
; matching %mask lane equals zero and is zero otherwise.
552 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %mask) {
553 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2:
555 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
556 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u>
557 ; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
558 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
559 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
560 ; CHECK-NEXT: vzeroupper
562 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
563 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
564 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Plain (unmasked) narrowing shuffle (fourth index pattern): %res is built
; from <32 x i16> %vec by a constant-index shufflevector yielding <8 x i16>.
567 define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) {
568 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask3:
570 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
571 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
572 ; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1
573 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
574 ; CHECK-NEXT: vzeroupper
576 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
; Merge-masked narrowing shuffle (fourth index pattern): %shuf narrows
; <32 x i16> %vec to <8 x i16>; each lane of %res keeps %shuf where the
; matching %mask lane equals zero and takes the %vec2 lane otherwise.
579 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
580 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3:
582 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
583 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
584 ; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
585 ; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
586 ; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
587 ; CHECK-NEXT: vzeroupper
589 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
590 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
591 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked narrowing shuffle (fourth index pattern): %shuf narrows
; <32 x i16> %vec to <8 x i16>; each lane of %res keeps %shuf where the
; matching %mask lane equals zero and is zero otherwise.
595 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %mask) {
596 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3:
598 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
599 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
600 ; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
601 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
602 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
603 ; CHECK-NEXT: vzeroupper
605 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
606 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
607 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Memory-operand form of the plain narrowing shuffle: %vec is loaded through
; %vp as <32 x i16>, then %res is a constant-index shufflevector of it that
; yields <16 x i16>.
610 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp) {
611 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0:
613 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
614 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
615 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
617 %vec = load <32 x i16>, <32 x i16>* %vp
618 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
; Merge-masked, memory-operand form: %vec is loaded through %vp, %shuf narrows
; it to <16 x i16> with constant indices, and each lane of %res keeps %shuf
; where the matching %mask lane equals zero and takes %vec2 otherwise.
621 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
622 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0:
624 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
625 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
626 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
627 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
628 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
630 %vec = load <32 x i16>, <32 x i16>* %vp
631 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
632 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
633 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked, memory-operand form: %vec is loaded through %vp, %shuf narrows
; it to <16 x i16> with constant indices, and each lane of %res keeps %shuf
; where the matching %mask lane equals zero and is zero otherwise.
637 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %mask) {
638 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0:
640 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
641 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
642 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
643 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
644 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
646 %vec = load <32 x i16>, <32 x i16>* %vp
647 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
648 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
649 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked 32xi16 -> 16xi16 shuffle of a loaded vector (second index
; pattern). Lanes where %mask == 0 receive the shuffled element; the others
; keep %vec2. The expected codegen is an unmasked vpermi2w (upper half folded
; from 32(%rdi)) followed by a vptestnmw / masked vmovdqu16 merge into %ymm0.
653 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
654 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1:
656 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
657 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
658 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
659 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
660 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
662 %vec = load <32 x i16>, <32 x i16>* %vp
663 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
664 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
665 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32xi16 -> 16xi16 shuffle of a loaded vector (second index
; pattern). Lanes where %mask != 0 are zeroed. The expected codegen folds the
; mask into vpermi2w via {%k1} {z} and then copies the result into %ymm0.
669 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %mask) {
670 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1:
672 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
673 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
674 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
675 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
676 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
678 %vec = load <32 x i16>, <32 x i16>* %vp
679 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
680 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
681 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Merge-masked 32xi16 -> 16xi16 shuffle of a loaded vector (third index
; pattern). In the expected codegen the upper half (32(%rdi)) is loaded into a
; register and the lower half ((%rdi)) is the folded memory operand, so every
; entry of the index constant is the IR shuffle index with bit 4 flipped
; (e.g. IR 28 -> 12, IR 6 -> 22). Lanes where %mask != 0 keep %vec2.
685 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
686 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2:
688 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
689 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
690 ; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3
691 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
692 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
694 %vec = load <32 x i16>, <32 x i16>* %vp
695 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
696 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
697 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32xi16 -> 16xi16 shuffle of a loaded vector (third index
; pattern). As the expected codegen loads the upper half into a register and
; folds the lower half from (%rdi), the index constant is the IR shuffle mask
; with bit 4 of every index flipped. Lanes where %mask != 0 are zeroed by the
; {%k1} {z} form of vpermi2w.
701 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %mask) {
702 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2:
704 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
705 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
706 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
707 ; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm1 {%k1} {z}
708 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
710 %vec = load <32 x i16>, <32 x i16>* %vp
711 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
712 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
713 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Unmasked 32xi16 -> 16xi16 shuffle of a loaded vector (fourth index pattern).
; The expected codegen is a single vpermi2w with the lower half in a register
; and the upper half folded from 32(%rdi); the index constant equals the IR
; shuffle mask.
717 define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp) {
718 ; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3:
720 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
721 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
722 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
724 %vec = load <32 x i16>, <32 x i16>* %vp
725 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
; Merge-masked 32xi16 -> 16xi16 shuffle of a loaded vector (fourth index
; pattern). Lanes where %mask == 0 receive the shuffled element; the others
; keep %vec2. The expected codegen is an unmasked vpermi2w followed by a
; vptestnmw / masked vmovdqu16 merge into %ymm0.
728 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
729 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3:
731 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
732 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
733 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
734 ; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1
735 ; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1}
737 %vec = load <32 x i16>, <32 x i16>* %vp
738 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
739 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
740 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
; Zero-masked 32xi16 -> 16xi16 shuffle of a loaded vector (fourth index
; pattern). Lanes where %mask != 0 are zeroed. The expected codegen folds the
; mask into vpermi2w via {%k1} {z} and then copies the result into %ymm0.
744 define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %mask) {
745 ; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3:
747 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
748 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
749 ; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1
750 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
751 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
753 %vec = load <32 x i16>, <32 x i16>* %vp
754 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
755 %cmp = icmp eq <16 x i16> %mask, zeroinitializer
756 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
; Unmasked 32xi16 -> 8xi16 shuffle of a vector loaded from %vp. The expected
; codegen still uses a 256-bit vpermi2w: only the low eight index entries are
; defined (the rest are 'u'), and only %xmm0 of the result is kept. The upper
; half is loaded into a register and the lower half is folded from (%rdi), so
; each index entry is the IR index with bit 4 flipped (IR 0 -> 16, 21 -> 5).
760 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp) {
761 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0:
763 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
764 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
765 ; CHECK-NEXT: vpermi2w (%rdi), %ymm1, %ymm0
766 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
767 ; CHECK-NEXT: vzeroupper
769 %vec = load <32 x i16>, <32 x i16>* %vp
770 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
; Merge-masked 32xi16 -> 8xi16 shuffle of a loaded vector. Lanes where
; %mask == 0 receive the shuffled element; the others keep %vec2. The expected
; codegen performs a 256-bit vpermi2w (upper half in a register, lower half
; folded from (%rdi), hence index = IR index with bit 4 flipped) and then
; merges only the low 128 bits into %xmm0 with a masked vmovdqu16.
773 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
774 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0:
776 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
777 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
778 ; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3
779 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
780 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
781 ; CHECK-NEXT: vzeroupper
783 %vec = load <32 x i16>, <32 x i16>* %vp
784 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
785 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
786 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32xi16 -> 8xi16 shuffle of a loaded vector. Lanes where
; %mask != 0 are zeroed. Unlike the 16-element variants, the expected codegen
; does not fold the mask into the permute: vpermi2w runs unmasked at 256 bits
; and a separate 128-bit vmovdqu16 {%k1} {z} applies the zeroing.
790 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %mask) {
791 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0:
793 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
794 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
795 ; CHECK-NEXT: vpermi2w (%rdi), %ymm1, %ymm2
796 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
797 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z}
798 ; CHECK-NEXT: vzeroupper
800 %vec = load <32 x i16>, <32 x i16>* %vp
801 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
802 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
803 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 32xi16 -> 8xi16 shuffle of a loaded vector (second index
; pattern, mostly upper-half elements). The expected codegen loads the upper
; half into a register and folds the lower half from (%rdi), so each index
; entry is the IR index with bit 4 flipped (IR 23 -> 7, IR 11 -> 27). Lanes
; where %mask != 0 keep %vec2 via the masked 128-bit vmovdqu16.
807 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
808 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1:
810 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
811 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
812 ; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3
813 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
814 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
815 ; CHECK-NEXT: vzeroupper
817 %vec = load <32 x i16>, <32 x i16>* %vp
818 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
819 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
820 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32xi16 -> 8xi16 shuffle of a loaded vector (second index
; pattern). The upper half is the register source and the lower half the
; folded memory source, so index entries are the IR indices with bit 4
; flipped. Lanes where %mask != 0 are zeroed by a separate vmovdqu16 {z}.
824 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %mask) {
825 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1:
827 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
828 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
829 ; CHECK-NEXT: vpermi2w (%rdi), %ymm1, %ymm2
830 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
831 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z}
832 ; CHECK-NEXT: vzeroupper
834 %vec = load <32 x i16>, <32 x i16>* %vp
835 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
836 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
837 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Merge-masked 32xi16 -> 8xi16 shuffle of a loaded vector (third index
; pattern). Here the lower half is the register source and the upper half is
; folded from 32(%rdi), so the defined index entries equal the IR shuffle
; mask. Lanes where %mask != 0 keep %vec2 via the masked 128-bit vmovdqu16.
841 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
842 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2:
844 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
845 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
846 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
847 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
848 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
849 ; CHECK-NEXT: vzeroupper
851 %vec = load <32 x i16>, <32 x i16>* %vp
852 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
853 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
854 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32xi16 -> 8xi16 shuffle of a loaded vector (third index
; pattern). The defined index entries equal the IR shuffle mask because the
; lower half is the register source. Lanes where %mask != 0 are zeroed by a
; separate 128-bit vmovdqu16 {%k1} {z} after the unmasked vpermi2w.
858 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %mask) {
859 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2:
861 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
862 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
863 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm2
864 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
865 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z}
866 ; CHECK-NEXT: vzeroupper
868 %vec = load <32 x i16>, <32 x i16>* %vp
869 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
870 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
871 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 32xi16 -> 8xi16 shuffle of a loaded vector (fourth index pattern).
; The expected codegen is a 256-bit vpermi2w with the lower half in a register
; and the upper half folded from 32(%rdi); the defined index entries equal the
; IR shuffle mask and only %xmm0 of the result is kept.
875 define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp) {
876 ; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3:
878 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
879 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
880 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
881 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
882 ; CHECK-NEXT: vzeroupper
884 %vec = load <32 x i16>, <32 x i16>* %vp
885 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
; Merge-masked 32xi16 -> 8xi16 shuffle of a loaded vector (fourth index
; pattern). Lanes where %mask == 0 receive the shuffled element; the others
; keep %vec2. The expected codegen is an unmasked 256-bit vpermi2w followed by
; a vptestnmw / masked 128-bit vmovdqu16 merge into %xmm0.
888 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
889 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3:
891 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
892 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
893 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
894 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
895 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
896 ; CHECK-NEXT: vzeroupper
898 %vec = load <32 x i16>, <32 x i16>* %vp
899 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
900 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
901 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
; Zero-masked 32xi16 -> 8xi16 shuffle of a loaded vector (fourth index
; pattern). Lanes where %mask != 0 are zeroed. The expected codegen runs
; vpermi2w unmasked at 256 bits, then applies the mask with a separate 128-bit
; vmovdqu16 {%k1} {z}.
905 define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %mask) {
906 ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3:
908 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
909 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
910 ; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm2
911 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
912 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z}
913 ; CHECK-NEXT: vzeroupper
915 %vec = load <32 x i16>, <32 x i16>* %vp
916 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
917 %cmp = icmp eq <8 x i16> %mask, zeroinitializer
918 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
; Unmasked 8xi32 -> 4xi32 shuffle of a register operand, taking elements from
; both 128-bit halves. The expected codegen is a single-source variable
; permute (vpermps) with a 256-bit index constant whose upper four entries are
; undefined; only %xmm0 of the result is kept.
922 define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) {
923 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0:
925 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <4,0,3,2,u,u,u,u>
926 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
927 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
928 ; CHECK-NEXT: vzeroupper
930 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
; Merge-masked 8xi32 -> 4xi32 shuffle of a register operand. Lanes where
; %mask == 0 receive the shuffled element; the others keep %vec2. The expected
; codegen uses the integer-domain vpermd for the permute, then vptestnmd and a
; masked vpblendmd to combine the result with %xmm1.
933 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
934 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0:
936 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <4,0,3,2,u,u,u,u>
937 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
938 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
939 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
940 ; CHECK-NEXT: vzeroupper
942 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
943 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
944 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle of a register operand. Lanes where
; %mask != 0 are zeroed. The expected codegen is an unmasked vpermd followed
; by vptestnmd and an in-place vmovdqa32 {%k1} {z} on %xmm0.
948 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %mask) {
949 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0:
951 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <4,0,3,2,u,u,u,u>
952 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
953 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
954 ; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
955 ; CHECK-NEXT: vzeroupper
957 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
958 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
959 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 8xi32 -> 4xi32 shuffle of a register operand (second index
; pattern). Lanes where %mask == 0 receive the shuffled element; the others
; keep %vec2. The expected codegen is vpermd followed by a vptestnmd /
; vpblendmd merge with %xmm1.
962 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
963 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1:
965 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <3,0,7,3,u,u,u,u>
966 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
967 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
968 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
969 ; CHECK-NEXT: vzeroupper
971 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
972 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
973 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle of a register operand (second index
; pattern). Lanes where %mask != 0 are zeroed. The expected codegen is vpermd
; followed by vptestnmd and an in-place vmovdqa32 {%k1} {z}.
977 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %mask) {
978 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1:
980 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <3,0,7,3,u,u,u,u>
981 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
982 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
983 ; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
984 ; CHECK-NEXT: vzeroupper
986 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
987 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
988 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 8xi32 -> 4xi32 shuffle with mask <6,7,2,3>, i.e. the high
; 64-bit pair of each 128-bit half. The expected codegen needs no index
; constant: it extracts the upper half with vextracti128 and combines the two
; high quadwords with vpunpckhqdq, then merges with %xmm1 via vpblendmd.
991 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
992 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask2:
994 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
995 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm3[1],xmm0[1]
996 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
997 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
998 ; CHECK-NEXT: vzeroupper
1000 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
1001 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1002 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle with mask <6,7,2,3> (high 64-bit pair of
; each 128-bit half). The expected codegen is vextracti128 + vpunpckhqdq,
; then vptestnmd and an in-place vmovdqa32 {%k1} {z} to zero lanes where
; %mask != 0.
1006 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %mask) {
1007 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask2:
1009 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
1010 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1]
1011 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1012 ; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1013 ; CHECK-NEXT: vzeroupper
1015 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
1016 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1017 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked 8xi32 -> 4xi32 shuffle of a register operand (fourth index
; pattern). The expected codegen is a single vpermps with a 256-bit index
; constant whose upper four entries are undefined; only %xmm0 is kept.
1020 define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) {
1021 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3:
1023 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <5,3,2,5,u,u,u,u>
1024 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
1025 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1026 ; CHECK-NEXT: vzeroupper
1028 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
; Merge-masked 8xi32 -> 4xi32 shuffle of a register operand (fourth index
; pattern). Lanes where %mask == 0 receive the shuffled element; the others
; keep %vec2. The expected codegen is vpermd followed by a vptestnmd /
; vpblendmd merge with %xmm1.
1031 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1032 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3:
1034 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <5,3,2,5,u,u,u,u>
1035 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
1036 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1037 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1038 ; CHECK-NEXT: vzeroupper
1040 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
1041 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1042 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle of a register operand (fourth index
; pattern). Lanes where %mask != 0 are zeroed. The expected codegen is vpermd
; followed by vptestnmd and an in-place vmovdqa32 {%k1} {z}.
1046 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %mask) {
1047 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3:
1049 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <5,3,2,5,u,u,u,u>
1050 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
1051 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1052 ; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1053 ; CHECK-NEXT: vzeroupper
1055 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
1056 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1057 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked 8xi32 -> 4xi32 shuffle <7,5,0,0> of a vector loaded from %vp. The
; expected codegen loads only the upper 128 bits (16(%rdi)) and uses vshufps
; with a folded memory operand to supply the two copies of element 0, so no
; 256-bit register or vzeroupper is involved.
; NOTE(review): the address of the folded 'mem' operand is hidden by the
; regex in the assertion; presumably it is (%rdi).
1060 define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp) {
1061 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0:
1063 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm0
1064 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1],mem[0,0]
1066 %vec = load <8 x i32>, <8 x i32>* %vp
1067 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
; Merge-masked 8xi32 -> 4xi32 shuffle <7,5,0,0> of a loaded vector. Lanes
; where %mask == 0 receive the shuffled element; the others keep %vec2. The
; expected codegen builds the shuffle with vmovaps + vshufps into a temporary,
; then merges into %xmm0 with vptestnmd and a masked vmovdqa32.
1070 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1071 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0:
1073 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
1074 ; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1],mem[0,0]
1075 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1076 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
1078 %vec = load <8 x i32>, <8 x i32>* %vp
1079 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
1080 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1081 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle <7,5,0,0> of a loaded vector. Lanes where
; %mask != 0 are zeroed. The expected codegen builds the shuffle with
; vmovaps + vshufps, then applies vptestnmd and vmovdqa32 {%k1} {z}.
1085 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %mask) {
1086 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0:
1088 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
1089 ; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1],mem[0,0]
1090 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1091 ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
1093 %vec = load <8 x i32>, <8 x i32>* %vp
1094 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
1095 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1096 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 8xi32 -> 4xi32 shuffle <5,0,0,3> of a loaded vector. The
; expected codegen loads the lower 128 bits, blends one dword from memory into
; lane 1 with vpblendd, and then performs the in-lane reorder [1,0,0,3] with a
; vpshufd that carries the merge mask itself (writing %xmm0 under %k1), so no
; separate blend/move is needed.
; NOTE(review): the folded 'mem' operand of vpblendd is presumably 16(%rdi)
; (supplying element 5); the regex in the assertion hides the address.
1100 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1101 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1:
1103 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
1104 ; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2,3]
1105 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1106 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[1,0,0,3]
1108 %vec = load <8 x i32>, <8 x i32>* %vp
1109 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
1110 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1111 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle <5,0,0,3> of a loaded vector. The
; expected codegen loads the lower 128 bits, blends one dword from memory into
; lane 1 with vpblendd, and finishes with a vpshufd [1,0,0,3] that carries
; {%k1} {z}, zeroing lanes where %mask != 0 in the same instruction.
1115 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %mask) {
1116 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1:
1118 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
1119 ; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2,3]
1120 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1121 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[1,0,0,3]
1123 %vec = load <8 x i32>, <8 x i32>* %vp
1124 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
1125 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1126 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 8xi32 -> 4xi32 shuffle <4,3,3,4> of a loaded vector. The
; expected codegen loads the upper 128 bits (16(%rdi)), blends the top two
; dwords from memory with vpblendd, and then applies a merge-masked vpshufd
; [0,3,3,0] that writes %xmm0 under %k1.
; NOTE(review): the folded 'mem' operand of vpblendd is presumably (%rdi)
; (supplying element 3); the regex in the assertion hides the address.
1130 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1131 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2:
1133 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1134 ; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
1135 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1136 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[0,3,3,0]
1138 %vec = load <8 x i32>, <8 x i32>* %vp
1139 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
1140 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1141 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle <4,3,3,4> of a loaded vector. The
; expected codegen loads the upper 128 bits, blends the top two dwords from
; memory with vpblendd, and finishes with a vpshufd [0,3,3,0] carrying
; {%k1} {z}, which zeroes lanes where %mask != 0.
1145 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %mask) {
1146 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2:
1148 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
1149 ; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
1150 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1151 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[0,3,3,0]
1153 %vec = load <8 x i32>, <8 x i32>* %vp
1154 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
1155 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1156 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked 8xi32 -> 4xi32 shuffle <5,3,2,7> of a loaded vector. The expected
; codegen is a three-instruction float-domain sequence: vpermilps of a memory
; operand (lanes [1,1,2,3]), vmovddup of a 64-bit memory operand, and a
; vblendps taking lanes 1 and 2 from the duplicated pair.
; NOTE(review): both 'mem' addresses are hidden by the regex; presumably
; 16(%rdi) for vpermilps and 8(%rdi) for vmovddup.
1160 define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp) {
1161 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3:
1163 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,2,3]
1164 ; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
1165 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1167 %vec = load <8 x i32>, <8 x i32>* %vp
1168 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
; Merge-masked 8xi32 -> 4xi32 shuffle <5,3,2,7> of a loaded vector. The
; expected codegen uses the integer-domain sequence vpshufd (from memory),
; vpbroadcastq 8(%rdi) (elements 2 and 3 replicated) and vpblendd to build the
; shuffle, then merges into %xmm0 with vptestnmd and a masked vmovdqa32.
1171 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1172 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3:
1174 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = mem[1,1,2,3]
1175 ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm3
1176 ; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
1177 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1178 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
1180 %vec = load <8 x i32>, <8 x i32>* %vp
1181 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
1182 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1183 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 8xi32 -> 4xi32 shuffle <5,3,2,7> of a loaded vector. The
; expected codegen builds the shuffle with vpshufd (from memory),
; vpbroadcastq 8(%rdi) and vpblendd, then zeroes lanes where %mask != 0 with
; vptestnmd and vmovdqa32 {%k1} {z}.
1187 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %mask) {
1188 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3:
1190 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = mem[1,1,2,3]
1191 ; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2
1192 ; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3]
1193 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1194 ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
1196 %vec = load <8 x i32>, <8 x i32>* %vp
1197 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
1198 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1199 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked 16xi32 -> 8xi32 shuffle of a register operand, mixing both 256-bit
; halves. The expected codegen extracts the upper half with vextracti64x4 and
; runs a two-source vpermi2d. The extracted upper half is the first table
; source here, so each index entry is the IR index with bit 3 flipped
; (IR 1 -> 9, IR 13 -> 5). The result is then copied into %ymm0.
1203 define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) {
1204 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0:
1206 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1207 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [9,5,3,6,15,2,9,14]
1208 ; CHECK-NEXT: vpermi2d %ymm0, %ymm2, %ymm1
1209 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1211 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
; Merge-masked 16xi32 -> 8xi32 shuffle of a register operand. Lanes where
; %mask == 0 receive the shuffled element; the others keep %vec2. The expected
; codegen extracts the upper half, runs vpermi2d with the upper half as first
; source (index = IR index with bit 3 flipped), and merges with %ymm1 through
; vptestnmd + masked vpblendmd.
1214 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1215 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0:
1217 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1218 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [9,5,3,6,15,2,9,14]
1219 ; CHECK-NEXT: vpermi2d %ymm0, %ymm3, %ymm4
1220 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1221 ; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
1223 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
1224 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1225 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16xi32 -> 8xi32 shuffle of a register operand. Lanes where
; %mask != 0 are zeroed. The expected codegen folds the mask into the permute:
; vptestnmd builds %k1 and vpermi2d runs with {%k1} {z} (upper half as first
; source, so index = IR index with bit 3 flipped), then copies into %ymm0.
1229 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %mask) {
1230 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask0:
1232 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1233 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,5,3,6,15,2,9,14]
1234 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1235 ; CHECK-NEXT: vpermi2d %ymm0, %ymm3, %ymm2 {%k1} {z}
1236 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
1238 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
1239 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1240 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked 16xi32 -> 8xi32 shuffle of a register operand (second index
; pattern). The lower half is the first vpermi2d source and the extracted
; upper half the second, so the index constant equals the IR shuffle mask.
; Lanes where %mask != 0 keep %vec2 via vptestnmd + masked vpblendmd.
1243 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1244 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1:
1246 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1247 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,0,15,3,2,3,6,8]
1248 ; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
1249 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1250 ; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
1252 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
1253 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1254 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16xi32 -> 8xi32 shuffle of a register operand (second index
; pattern). The index constant equals the IR shuffle mask. Lanes where
; %mask != 0 are zeroed by the {%k1} {z} form of vpermi2d, whose result is
; then copied into %ymm0.
1258 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %mask) {
1259 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask1:
1261 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1262 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8]
1263 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1264 ; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
1265 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
1267 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
1268 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1269 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked 16xi32 -> 8xi32 shuffle of a register operand (third index
; pattern). The index constant equals the IR shuffle mask (lower half is the
; first vpermi2d source). Lanes where %mask != 0 keep %vec2 via vptestnmd +
; masked vpblendmd.
1272 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1273 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2:
1275 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1276 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,15,15,2,6,10,14,7]
1277 ; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
1278 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1279 ; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
1281 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
1282 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1283 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16xi32 -> 8xi32 shuffle of a register operand (third index
; pattern). The index constant equals the IR shuffle mask. Lanes where
; %mask != 0 are zeroed by the {%k1} {z} form of vpermi2d, whose result is
; then copied into %ymm0.
1287 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %mask) {
1288 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask2:
1290 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1291 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7]
1292 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1293 ; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
1294 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
1296 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
1297 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1298 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked 16xi32 -> 8xi32 shuffle of a register operand (fourth index
; pattern). The expected codegen extracts the upper half with vextracti64x4
; and runs vpermi2d with the lower half as first source, so the index constant
; equals the IR shuffle mask; the result is then copied into %ymm0.
1301 define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
1302 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3:
1304 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1305 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3]
1306 ; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1
1307 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1309 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
; Merge-masked 16xi32 -> 8xi32 shuffle of a register operand (fourth index
; pattern). The index constant equals the IR shuffle mask. Lanes where
; %mask != 0 keep %vec2 via vptestnmd + masked vpblendmd after the unmasked
; vpermi2d.
1312 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
1313 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3:
1315 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1316 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [14,5,7,7,10,3,9,3]
1317 ; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
1318 ; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
1319 ; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
1321 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
1322 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1323 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16xi32 -> 8xi32 shuffle of a register operand (fourth index
; pattern). The index constant equals the IR shuffle mask. Lanes where
; %mask != 0 are zeroed by the {%k1} {z} form of vpermi2d, whose result is
; then copied into %ymm0.
1327 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %mask) {
1328 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask3:
1330 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1331 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3]
1332 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1333 ; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
1334 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
1336 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
1337 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1338 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked 16xi32 -> 4xi32 shuffle <0,2,4,12> of a register operand. The
; expected codegen still runs a 256-bit vpermi2d over the two halves; the
; index constant has eight entries, of which only the low four correspond to
; the IR shuffle mask, and only the low 128 bits of the result are copied to
; %xmm0.
1341 define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
1342 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0:
1344 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1345 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12]
1346 ; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1
1347 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1348 ; CHECK-NEXT: vzeroupper
1350 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
; Merge-masked 16xi32 -> 4xi32 shuffle, source indices <0,2,4,12>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2.
1353 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1354 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0:
1356 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1357 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,12,4,6,4,12]
1358 ; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
1359 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1360 ; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
1361 ; CHECK-NEXT: vzeroupper
1363 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
1364 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1365 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 16xi32 -> 4xi32 shuffle, source indices <0,2,4,12>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero. Here the expected assembly permutes unmasked and applies the
; zeroing in a separate masked move.
1369 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) {
1370 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0:
1372 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1373 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12]
1374 ; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
1375 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1376 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
1377 ; CHECK-NEXT: vzeroupper
1379 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
1380 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1381 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 16xi32 -> 4xi32 shuffle, source indices <13,9,11,12>.
; All four indices fall in the upper 256 bits of the input, so the expected
; assembly extracts that half and uses a single-source vpermd whose index
; vector <5,1,3,4> is the IR mask minus 8.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2.
1384 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1385 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1:
1387 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1388 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <5,1,3,4,u,u,u,u>
1389 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
1390 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1391 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
1392 ; CHECK-NEXT: vzeroupper
1394 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
1395 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1396 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 16xi32 -> 4xi32 shuffle, source indices <13,9,11,12>.
; All indices come from the upper 256 bits, so the expected assembly is a
; single-source vpermd on the extracted half (index vector <5,1,3,4>).
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
1400 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) {
1401 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1:
1403 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1404 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <5,1,3,4,u,u,u,u>
1405 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
1406 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1407 ; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1408 ; CHECK-NEXT: vzeroupper
1410 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
1411 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1412 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 16xi32 -> 4xi32 shuffle, source indices <1,1,13,0>.
; Index 13 comes from the upper 256 bits and the rest from the lower, hence
; the two-source permute in the expected assembly.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2.
1415 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1416 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2:
1418 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1419 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <1,1,13,0,u,u,u,u>
1420 ; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
1421 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1422 ; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
1423 ; CHECK-NEXT: vzeroupper
1425 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
1426 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1427 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 16xi32 -> 4xi32 shuffle, source indices <1,1,13,0>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
1431 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) {
1432 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask2:
1434 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1435 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <1,1,13,0,u,u,u,u>
1436 ; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
1437 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1438 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
1439 ; CHECK-NEXT: vzeroupper
1441 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
1442 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1443 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked 16xi32 -> 4xi32 shuffle, source indices <3,0,0,13>.
; The expected assembly uses a two-source permute over the low half and the
; extracted high half, then returns the low 128 bits of the result.
1446 define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) {
1447 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3:
1449 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1450 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <3,0,0,13,u,u,u,u>
1451 ; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1
1452 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
1453 ; CHECK-NEXT: vzeroupper
1455 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
; Merge-masked 16xi32 -> 4xi32 shuffle, source indices <3,0,0,13>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2.
1458 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
1459 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3:
1461 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1462 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <3,0,0,13,u,u,u,u>
1463 ; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
1464 ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
1465 ; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
1466 ; CHECK-NEXT: vzeroupper
1468 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
1469 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1470 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 16xi32 -> 4xi32 shuffle, source indices <3,0,0,13>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
1474 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) {
1475 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3:
1477 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
1478 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <3,0,0,13,u,u,u,u>
1479 ; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
1480 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1481 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
1482 ; CHECK-NEXT: vzeroupper
1484 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
1485 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1486 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked 16xi32 -> 8xi32 shuffle of a vector loaded from %vp, source
; indices <15,8,14,8,9,10,12,12>.
; Every index is >= 8, so the expected assembly reads only the upper 32 bytes
; (32(%rdi)) as a folded memory operand, with the index vector rebased by 8.
1489 define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp) {
1490 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask0:
1492 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4]
1493 ; CHECK-NEXT: vpermps 32(%rdi), %ymm0, %ymm0
1495 %vec = load <16 x i32>, <16 x i32>* %vp
1496 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
; Merge-masked 16xi32 -> 8xi32 shuffle of a vector loaded from %vp, source
; indices <15,8,14,8,9,10,12,12> (all in the upper half of the load).
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2. The expected assembly merges
; directly into %ymm0 with a masked vpermd.
1499 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1500 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0:
1502 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4]
1503 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1504 ; CHECK-NEXT: vpermd 32(%rdi), %ymm2, %ymm0 {%k1}
1506 %vec = load <16 x i32>, <16 x i32>* %vp
1507 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
1508 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1509 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16xi32 -> 8xi32 shuffle of a vector loaded from %vp, source
; indices <15,8,14,8,9,10,12,12> (all in the upper half of the load).
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
1513 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %mask) {
1514 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0:
1516 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4]
1517 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1518 ; CHECK-NEXT: vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z}
1520 %vec = load <16 x i32>, <16 x i32>* %vp
1521 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
1522 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1523 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked 16xi32 -> 8xi32 shuffle of a vector loaded from %vp, source
; indices <15,11,14,3,8,9,13,7>.
; In the expected assembly the upper half (32(%rdi)) is the first permute
; source and the lower half ((%rdi)) the second, so the index table is the IR
; mask with the two halves swapped: [7,3,6,11,0,1,5,15].
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2.
1527 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1528 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1:
1530 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1531 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15]
1532 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
1533 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1534 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
1536 %vec = load <16 x i32>, <16 x i32>* %vp
1537 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
1538 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1539 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16xi32 -> 8xi32 shuffle of a vector loaded from %vp, source
; indices <15,11,14,3,8,9,13,7>.
; As in the merge-masked variant, the expected index table [7,3,6,11,0,1,5,15]
; treats 32(%rdi) as the first source and (%rdi) as the second.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
1543 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %mask) {
1544 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1:
1546 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1547 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15]
1548 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1549 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1550 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1552 %vec = load <16 x i32>, <16 x i32>* %vp
1553 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
1554 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1555 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Merge-masked 16xi32 -> 8xi32 shuffle of a vector loaded from %vp, source
; indices <12,6,9,13,12,10,0,2>.
; The expected assembly uses 32(%rdi) as the first permute source and (%rdi)
; as the second, giving the index table [4,14,1,5,4,2,8,10].
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2.
1559 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1560 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2:
1562 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1563 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10]
1564 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
1565 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1566 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
1568 %vec = load <16 x i32>, <16 x i32>* %vp
1569 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
1570 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1571 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16xi32 -> 8xi32 shuffle of a vector loaded from %vp, source
; indices <12,6,9,13,12,10,0,2>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
1575 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %mask) {
1576 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2:
1578 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1579 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10]
1580 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1581 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
1582 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1584 %vec = load <16 x i32>, <16 x i32>* %vp
1585 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
1586 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1587 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked 16xi32 -> 8xi32 shuffle of a vector loaded from %vp, source
; indices <8,4,1,13,15,4,6,12>.
; The expected assembly keeps the halves in natural order ((%rdi) first,
; 32(%rdi) second), so the index table equals the IR mask.
1591 define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp) {
1592 ; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3:
1594 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1595 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12]
1596 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm0
1598 %vec = load <16 x i32>, <16 x i32>* %vp
1599 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
; Merge-masked 16xi32 -> 8xi32 shuffle of a vector loaded from %vp, source
; indices <8,4,1,13,15,4,6,12>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2.
1602 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1603 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3:
1605 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
1606 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [8,4,1,13,15,4,6,12]
1607 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3
1608 ; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1
1609 ; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1}
1611 %vec = load <16 x i32>, <16 x i32>* %vp
1612 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
1613 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1614 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
; Zero-masked 16xi32 -> 8xi32 shuffle of a vector loaded from %vp, source
; indices <8,4,1,13,15,4,6,12>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
1618 define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %mask) {
1619 ; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3:
1621 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
1622 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12]
1623 ; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1
1624 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
1625 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1627 %vec = load <16 x i32>, <16 x i32>* %vp
1628 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
1629 %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1630 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
; Unmasked 16xi32 -> 4xi32 shuffle of a vector loaded from %vp, source
; indices <13,0,0,6>.
; The expected assembly permutes at 256-bit width and returns the low 128
; bits of %ymm0 (no extra move, only a register-kill annotation).
1634 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp) {
1635 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0:
1637 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1638 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <13,0,0,6,u,u,u,u>
1639 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm0
1640 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1641 ; CHECK-NEXT: vzeroupper
1643 %vec = load <16 x i32>, <16 x i32>* %vp
1644 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
; Merge-masked 16xi32 -> 4xi32 shuffle of a vector loaded from %vp, source
; indices <13,0,0,6>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2.
1647 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1648 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0:
1650 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
1651 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <13,0,0,6,u,u,u,u>
1652 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3
1653 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1654 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1655 ; CHECK-NEXT: vzeroupper
1657 %vec = load <16 x i32>, <16 x i32>* %vp
1658 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
1659 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1660 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 16xi32 -> 4xi32 shuffle of a vector loaded from %vp, source
; indices <13,0,0,6>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
1664 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %mask) {
1665 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0:
1667 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1668 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <13,0,0,6,u,u,u,u>
1669 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2
1670 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1671 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} {z}
1672 ; CHECK-NEXT: vzeroupper
1674 %vec = load <16 x i32>, <16 x i32>* %vp
1675 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
1676 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1677 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 16xi32 -> 4xi32 shuffle of a vector loaded from %vp, source
; indices <7,13,11,10>.
; The expected assembly uses 32(%rdi) as the first permute source and (%rdi)
; as the second, so the first four table entries are 15,5,3,2.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2.
1681 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1682 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
1684 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
1685 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [15,5,3,2,15,5,7,6]
1686 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3
1687 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1688 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1689 ; CHECK-NEXT: vzeroupper
1691 %vec = load <16 x i32>, <16 x i32>* %vp
1692 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
1693 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1694 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 16xi32 -> 4xi32 shuffle of a vector loaded from %vp, source
; indices <7,13,11,10>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
1698 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %mask) {
1699 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
1701 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
1702 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [15,5,3,2,15,5,7,6]
1703 ; CHECK-NEXT: vpermi2d (%rdi), %ymm1, %ymm2
1704 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1705 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} {z}
1706 ; CHECK-NEXT: vzeroupper
1708 %vec = load <16 x i32>, <16 x i32>* %vp
1709 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
1710 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1711 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Merge-masked 16xi32 -> 4xi32 shuffle of a vector loaded from %vp, source
; indices <2,15,6,9>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2.
1715 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1716 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2:
1718 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
1719 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <2,15,6,9,u,u,u,u>
1720 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3
1721 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1722 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
1723 ; CHECK-NEXT: vzeroupper
1725 %vec = load <16 x i32>, <16 x i32>* %vp
1726 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
1727 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1728 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 16xi32 -> 4xi32 shuffle of a vector loaded from %vp, source
; indices <2,15,6,9>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
1732 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %mask) {
1733 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2:
1735 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
1736 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <2,15,6,9,u,u,u,u>
1737 ; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2
1738 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1739 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} {z}
1740 ; CHECK-NEXT: vzeroupper
1742 %vec = load <16 x i32>, <16 x i32>* %vp
1743 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
1744 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1745 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked 16xi32 -> 4xi32 shuffle of a vector loaded from %vp, source
; indices <6,0,7,2>.
; All indices lie in the first 32 bytes of the load, so the expected assembly
; reads only (%rdi) and 16(%rdi). It currently builds the result element by
; element through %eax (vmovd/vpextrd/vpinsrd) rather than with one permute;
; see the FIXME at the top of the file.
1749 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp) {
1750 ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3:
1752 ; CHECK-NEXT: vmovdqa (%rdi), %xmm0
1753 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
1754 ; CHECK-NEXT: vmovd %xmm0, %eax
1755 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
1756 ; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
1757 ; CHECK-NEXT: vpextrd $3, %xmm1, %eax
1758 ; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm1
1759 ; CHECK-NEXT: vpextrd $2, %xmm0, %eax
1760 ; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
1762 %vec = load <16 x i32>, <16 x i32>* %vp
1763 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
; Merge-masked 16xi32 -> 4xi32 shuffle of a vector loaded from %vp, source
; indices <6,0,7,2> (all within the first 32 bytes of the load).
; The expected assembly assembles the shuffle element by element through
; %eax, then merges it into %xmm0 under %k1.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2.
1766 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
1767 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3:
1769 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
1770 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
1771 ; CHECK-NEXT: vmovd %xmm2, %eax
1772 ; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
1773 ; CHECK-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
1774 ; CHECK-NEXT: vpextrd $3, %xmm3, %eax
1775 ; CHECK-NEXT: vpinsrd $2, %eax, %xmm4, %xmm3
1776 ; CHECK-NEXT: vpextrd $2, %xmm2, %eax
1777 ; CHECK-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
1778 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
1779 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
1781 %vec = load <16 x i32>, <16 x i32>* %vp
1782 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
1783 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1784 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
; Zero-masked 16xi32 -> 4xi32 shuffle of a vector loaded from %vp, source
; indices <6,0,7,2> (all within the first 32 bytes of the load).
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
1788 define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %mask) {
1789 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3:
1791 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
1792 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1793 ; CHECK-NEXT: vmovd %xmm1, %eax
1794 ; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
1795 ; CHECK-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
1796 ; CHECK-NEXT: vpextrd $3, %xmm2, %eax
1797 ; CHECK-NEXT: vpinsrd $2, %eax, %xmm3, %xmm2
1798 ; CHECK-NEXT: vpextrd $2, %xmm1, %eax
1799 ; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
1800 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
1801 ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
1803 %vec = load <16 x i32>, <16 x i32>* %vp
1804 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
1805 %cmp = icmp eq <4 x i32> %mask, zeroinitializer
1806 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
; Unmasked 4xi64 -> 2xi64 shuffle, source indices <2,0>.
; The expected assembly is a single immediate permute (the floating-point
; form vpermpd is selected for this unmasked case) whose low 128 bits are
; returned.
1810 define <2 x i64> @test_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec) {
1811 ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mask0:
1813 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
1814 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1815 ; CHECK-NEXT: vzeroupper
1817 %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
; Merge-masked 4xi64 -> 2xi64 shuffle, source indices <2,0>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2.
1820 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
1821 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0:
1823 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,2,3]
1824 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
1825 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
1826 ; CHECK-NEXT: vzeroupper
1828 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
1829 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1830 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked 4xi64 -> 2xi64 shuffle, source indices <2,0>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
1834 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %mask) {
1835 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask0:
1837 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,2,3]
1838 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1839 ; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
1840 ; CHECK-NEXT: vzeroupper
1842 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
1843 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1844 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Merge-masked 4xi64 -> 2xi64 shuffle, source indices <2,1>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2.
1847 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
1848 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask1:
1850 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
1851 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
1852 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
1853 ; CHECK-NEXT: vzeroupper
1855 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1856 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1857 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked 4xi64 -> 2xi64 shuffle, source indices <2,1>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
1861 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %mask) {
1862 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask1:
1864 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
1865 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1866 ; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
1867 ; CHECK-NEXT: vzeroupper
1869 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1870 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1871 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked 4xi64 -> 2xi64 shuffle of a vector loaded from %vp, source
; indices <1,3>.
; The expected assembly loads the first 16 bytes and interleaves its high
; element with the high element of a folded memory operand (vunpckhpd).
1874 define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp) {
1875 ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0:
1877 ; CHECK-NEXT: vmovaps (%rdi), %xmm0
1878 ; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
1880 %vec = load <4 x i64>, <4 x i64>* %vp
1881 %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
; Merge-masked 4xi64 -> 2xi64 shuffle of a vector loaded from %vp, source
; indices <1,3>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2. The expected assembly folds the
; merge into a masked vpunpckhqdq writing %xmm0.
1884 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
1885 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask0:
1887 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
1888 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1889 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} = xmm2[1],mem[1]
1891 %vec = load <4 x i64>, <4 x i64>* %vp
1892 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
1893 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1894 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked 4xi64 -> 2xi64 shuffle of a vector loaded from %vp, source
; indices <1,3>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
1898 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %mask) {
1899 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask0:
1901 ; CHECK-NEXT: vmovdqa (%rdi), %xmm1
1902 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
1903 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm1[1],mem[1]
1905 %vec = load <4 x i64>, <4 x i64>* %vp
1906 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
1907 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1908 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Merge-masked 4xi64 -> 2xi64 shuffle of a vector loaded from %vp, source
; indices <2,1>.
; The expected assembly loads the upper 16 bytes (16(%rdi)) and blends in the
; high 64 bits from a folded memory operand before the masked move.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2.
1912 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
1913 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1:
1915 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1916 ; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
1917 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
1918 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
1920 %vec = load <4 x i64>, <4 x i64>* %vp
1921 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1922 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1923 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked 4xi64 -> 2xi64 shuffle of a vector loaded from %vp, source
; indices <2,1>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
1927 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %mask) {
1928 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1:
1930 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
1931 ; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
1932 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
1933 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
1935 %vec = load <4 x i64>, <4 x i64>* %vp
1936 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
1937 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
1938 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked 8xi64 -> 4xi64 shuffle, source indices <6,7,6,5>.
; Every index is >= 4, so the expected assembly extracts the upper 256 bits
; and applies a single immediate permute with indices rebased by 4: [2,3,2,1].
1942 define <4 x i64> @test_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec) {
1943 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask0:
1945 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
1946 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,1]
1948 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
; Merge-masked 8xi64 -> 4xi64 shuffle, source indices <6,7,6,5> (all from the
; upper 256 bits).
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2. The expected assembly permutes
; under %k1 into %ymm1 (which holds %vec2) and then copies it to %ymm0.
1951 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1952 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask0:
1954 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1955 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
1956 ; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1]
1957 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
1959 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
1960 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1961 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked 8xi64 -> 4xi64 shuffle, source indices <6,7,6,5> (all from the
; upper 256 bits).
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
1965 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %mask) {
1966 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0:
1968 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1969 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
1970 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1]
1972 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
1973 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1974 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked 8xi64 -> 4xi64 shuffle, source indices <6,4,6,1>.
; In the expected assembly the extracted upper half (%ymm3) is the first
; permute source and the lower half (%ymm0) the second, so the index table is
; [2,0,2,5].
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2.
1977 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1978 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
1980 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1981 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,2,5]
1982 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
1983 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
1984 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
1986 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
1987 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1988 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked 8xi64 -> 4xi64 shuffle, source indices <6,4,6,1>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero. The expected assembly applies {%k1} {z} on the two-source
; permute itself.
1992 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) {
1993 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
1995 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
1996 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,2,5]
1997 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
1998 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
1999 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
2001 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
2002 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2003 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked 8xi64 -> 4xi64 shuffle, source indices <6,3,6,3>.
; The expected index table [2,7,2,7] again treats the extracted upper half as
; the first permute source and the lower half as the second.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2.
2006 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2007 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
2009 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2010 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,2,7]
2011 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
2012 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2013 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
2015 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
2016 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2017 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked 8xi64 -> 4xi64 shuffle, source indices <6,3,6,3>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
2021 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) {
2022 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
2024 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2025 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,7,2,7]
2026 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2027 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
2028 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
2030 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
2031 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2032 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked 8xi64 -> 4xi64 shuffle, source indices <6,0,0,7>.
; The expected index table [2,4,4,3] addresses the extracted upper half
; (%ymm2) as entries 0-3 and the lower half (%ymm0) as entries 4-7.
2035 define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
2036 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask3:
2038 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2039 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [2,4,4,3]
2040 ; CHECK-NEXT: vpermi2q %ymm0, %ymm2, %ymm1
2041 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2043 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
; Merge-masked 8xi64 -> 4xi64 shuffle, source indices <6,0,0,7>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2.
2046 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2047 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
2049 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2050 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,4,4,3]
2051 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
2052 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2053 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
2055 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
2056 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2057 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked 8xi64 -> 4xi64 shuffle, source indices <6,0,0,7>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
2061 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) {
2062 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
2064 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2065 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,4,4,3]
2066 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2067 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
2068 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
2070 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
2071 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2072 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked 8xi64 -> 4xi64 shuffle, source indices <3,7,7,5>.
; The expected index table [7,3,3,1] treats the extracted upper half (%ymm3)
; as the first permute source and the lower half (%ymm0) as the second.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane keeps the matching element of %vec2.
2075 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2076 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
2078 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2079 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,3,3,1]
2080 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
2081 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2082 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
2084 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
2085 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2086 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked 8xi64 -> 4xi64 shuffle, source indices <3,7,7,5>.
; Lanes whose %mask element equals zero take the shuffled value; every other
; lane is zero.
2090 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {
2091 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
2093 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2094 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,3,3,1]
2095 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2096 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
2097 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
2099 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
2100 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2101 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked variant: lanes where %mask is zero receive elements 4, 1, 0, 6 of %vec; all other lanes come from %vec2.
2104 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2105 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5:
2107 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2108 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,1,0,6]
2109 ; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
2110 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2111 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
2113 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
2114 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2115 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked variant: lanes where %mask is zero receive elements 4, 1, 0, 6 of %vec; all other lanes are zero.
2119 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) {
2120 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5:
2122 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2123 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,1,0,6]
2124 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2125 ; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
2126 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
2128 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
2129 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2130 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked shuffle: builds a <4 x i64> from elements 7, 6, 5, 3 of the <8 x i64> argument %vec.
2133 define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
2134 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask6:
2136 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
2137 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,7]
2138 ; CHECK-NEXT: vpermi2q %ymm0, %ymm2, %ymm1
2139 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2141 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
; Merge-masked variant: lanes where %mask is zero receive elements 7, 6, 5, 3 of %vec; all other lanes come from %vec2.
2144 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2145 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
2147 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2148 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,2,1,7]
2149 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
2150 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2151 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
2153 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
2154 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2155 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked variant: lanes where %mask is zero receive elements 7, 6, 5, 3 of %vec; all other lanes are zero.
2159 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) {
2160 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
2162 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2163 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,7]
2164 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2165 ; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
2166 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
2168 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
2169 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2170 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked variant: lanes where %mask is zero receive elements 2, 0, 3, 4 of %vec; all other lanes come from %vec2.
2173 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
2174 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:
2176 ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3
2177 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,4]
2178 ; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
2179 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
2180 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
2182 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
2183 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2184 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked variant: lanes where %mask is zero receive elements 2, 0, 3, 4 of %vec; all other lanes are zero.
2188 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
2189 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
2191 ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3
2192 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,4]
2193 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2194 ; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
2195 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
2197 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
2198 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2199 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked shuffle: builds a <2 x i64> from elements 3, 0 of the <8 x i64> argument %vec.
2202 define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
2203 ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mask0:
2205 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
2206 ; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
2207 ; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2208 ; CHECK-NEXT: vzeroupper
2210 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
; Merge-masked variant: lanes where %mask is zero receive elements 3, 0 of %vec; all other lanes come from %vec2.
2213 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
2214 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0:
2216 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
2217 ; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
2218 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
2219 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 {%k1} = xmm3[0],xmm0[0]
2220 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
2221 ; CHECK-NEXT: vzeroupper
2223 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
2224 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2225 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked variant: lanes where %mask is zero receive elements 3, 0 of %vec; all other lanes are zero.
2229 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) {
2230 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0:
2232 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
2233 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
2234 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2235 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm0[0]
2236 ; CHECK-NEXT: vzeroupper
2238 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
2239 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2240 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Merge-masked variant: lanes where %mask is zero receive elements 6, 5 of %vec; all other lanes come from %vec2.
2243 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
2244 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
2246 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2247 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
2248 ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
2249 ; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
2250 ; CHECK-NEXT: vzeroupper
2252 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
2253 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2254 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked variant: lanes where %mask is zero receive elements 6, 5 of %vec; all other lanes are zero.
2258 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %mask) {
2259 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
2261 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2262 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
2263 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2264 ; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
2265 ; CHECK-NEXT: vzeroupper
2267 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
2268 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2269 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked shuffle from memory: builds a <4 x i64> from elements 0, 2, 0, 2 of the <8 x i64> loaded from %vp.
2272 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp) {
2273 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask0:
2275 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,2,0,2]
2277 %vec = load <8 x i64>, <8 x i64>* %vp
2278 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
; Merge-masked variant from memory: lanes where %mask is zero receive elements 0, 2, 0, 2 of the vector loaded from %vp; all other lanes come from %vec2.
2281 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2282 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask0:
2284 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2285 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,2,0,2]
2287 %vec = load <8 x i64>, <8 x i64>* %vp
2288 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2289 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2290 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked variant from memory: lanes where %mask is zero receive elements 0, 2, 0, 2 of the vector loaded from %vp; all other lanes are zero.
2294 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %mask) {
2295 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask0:
2297 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2298 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,2,0,2]
2300 %vec = load <8 x i64>, <8 x i64>* %vp
2301 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2302 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2303 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked variant from memory: lanes where %mask is zero receive elements 0, 7, 6, 0 of the vector loaded from %vp; all other lanes come from %vec2.
2307 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2308 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
2310 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
2311 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,4]
2312 ; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
2313 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2314 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2316 %vec = load <8 x i64>, <8 x i64>* %vp
2317 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
2318 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2319 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked variant from memory: lanes where %mask is zero receive elements 0, 7, 6, 0 of the vector loaded from %vp; all other lanes are zero.
2323 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %mask) {
2324 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
2326 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
2327 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,4]
2328 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2329 ; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2330 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2332 %vec = load <8 x i64>, <8 x i64>* %vp
2333 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
2334 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2335 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked variant from memory: lanes where %mask is zero receive elements 7, 1, 1, 5 of the vector loaded from %vp; all other lanes come from %vec2.
2339 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2340 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
2342 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
2343 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,5,5,1]
2344 ; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
2345 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2346 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2348 %vec = load <8 x i64>, <8 x i64>* %vp
2349 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
2350 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2351 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked variant from memory: lanes where %mask is zero receive elements 7, 1, 1, 5 of the vector loaded from %vp; all other lanes are zero.
2355 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %mask) {
2356 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
2358 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
2359 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,5,5,1]
2360 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2361 ; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2362 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2364 %vec = load <8 x i64>, <8 x i64>* %vp
2365 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
2366 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2367 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked shuffle from memory: builds a <4 x i64> from elements 7, 0, 0, 2 of the <8 x i64> loaded from %vp.
2371 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp) {
2372 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
2374 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
2375 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,0,2]
2376 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0
2378 %vec = load <8 x i64>, <8 x i64>* %vp
2379 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
; Merge-masked variant from memory: lanes where %mask is zero receive elements 7, 0, 0, 2 of the vector loaded from %vp; all other lanes come from %vec2.
2382 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2383 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
2385 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2386 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,0,0,2]
2387 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2388 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2389 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2391 %vec = load <8 x i64>, <8 x i64>* %vp
2392 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
2393 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2394 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked variant from memory: lanes where %mask is zero receive elements 7, 0, 0, 2 of the vector loaded from %vp; all other lanes are zero.
2398 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %mask) {
2399 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
2401 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2402 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,0,2]
2403 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2404 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2405 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2407 %vec = load <8 x i64>, <8 x i64>* %vp
2408 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
2409 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2410 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked variant from memory: lanes where %mask is zero receive elements 0, 4, 6, 1 of the vector loaded from %vp; all other lanes come from %vec2.
2414 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2415 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4:
2417 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2418 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,6,1]
2419 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2420 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2421 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2423 %vec = load <8 x i64>, <8 x i64>* %vp
2424 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
2425 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2426 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked variant from memory: lanes where %mask is zero receive elements 0, 4, 6, 1 of the vector loaded from %vp; all other lanes are zero.
2430 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %mask) {
2431 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4:
2433 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2434 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,6,1]
2435 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2436 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2437 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2439 %vec = load <8 x i64>, <8 x i64>* %vp
2440 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
2441 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2442 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked variant from memory: lanes where %mask is zero receive elements 0, 2, 7, 1 of the vector loaded from %vp; all other lanes come from %vec2.
2446 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2447 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
2449 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2450 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,7,1]
2451 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2452 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2453 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2455 %vec = load <8 x i64>, <8 x i64>* %vp
2456 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
2457 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2458 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked variant from memory: lanes where %mask is zero receive elements 0, 2, 7, 1 of the vector loaded from %vp; all other lanes are zero.
2462 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %mask) {
2463 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
2465 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2466 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,7,1]
2467 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2468 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2469 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2471 %vec = load <8 x i64>, <8 x i64>* %vp
2472 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
2473 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2474 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked shuffle from memory: builds a <4 x i64> from elements 7, 2, 3, 2 of the <8 x i64> loaded from %vp.
2478 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp) {
2479 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6:
2481 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
2482 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,2,3,2]
2483 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm1, %ymm0
2485 %vec = load <8 x i64>, <8 x i64>* %vp
2486 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
; Merge-masked variant from memory: lanes where %mask is zero receive elements 7, 2, 3, 2 of the vector loaded from %vp; all other lanes come from %vec2.
2489 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2490 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6:
2492 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2493 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,2,3,2]
2494 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
2495 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2496 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2498 %vec = load <8 x i64>, <8 x i64>* %vp
2499 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
2500 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2501 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked variant from memory: lanes where %mask is zero receive elements 7, 2, 3, 2 of the vector loaded from %vp; all other lanes are zero.
2505 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %mask) {
2506 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:
2508 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
2509 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,2,3,2]
2510 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2511 ; CHECK-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
2512 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2514 %vec = load <8 x i64>, <8 x i64>* %vp
2515 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
2516 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2517 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Merge-masked variant from memory: lanes where %mask is zero receive elements 7, 7, 5, 1 of the vector loaded from %vp; all other lanes come from %vec2.
2521 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
2522 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
2524 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
2525 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,1,5]
2526 ; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
2527 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
2528 ; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
2530 %vec = load <8 x i64>, <8 x i64>* %vp
2531 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
2532 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2533 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
; Zero-masked variant from memory: lanes where %mask is zero receive elements 7, 7, 5, 1 of the vector loaded from %vp; all other lanes are zero.
2537 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %mask) {
2538 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
2540 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
2541 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,1,5]
2542 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
2543 ; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
2544 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
2546 %vec = load <8 x i64>, <8 x i64>* %vp
2547 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
2548 %cmp = icmp eq <4 x i64> %mask, zeroinitializer
2549 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
; Unmasked shuffle from memory: builds a <2 x i64> from elements 4, 1 of the <8 x i64> loaded from %vp.
2553 define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp) {
2554 ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
2556 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
2557 ; CHECK-NEXT: vmovaps 32(%rdi), %xmm1
2558 ; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2560 %vec = load <8 x i64>, <8 x i64>* %vp
2561 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
; Merge-masked variant from memory: lanes where %mask is zero receive elements 4, 1 of the vector loaded from %vp; all other lanes come from %vec2.
2564 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
2565 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
2567 ; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
2568 ; CHECK-NEXT: vmovdqa 32(%rdi), %xmm3
2569 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2570 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
2572 %vec = load <8 x i64>, <8 x i64>* %vp
2573 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
2574 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2575 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked variant from memory: lanes where %mask is zero receive elements 4, 1 of the vector loaded from %vp; all other lanes are zero.
2579 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %mask) {
2580 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
2582 ; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2583 ; CHECK-NEXT: vmovdqa 32(%rdi), %xmm2
2584 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
2585 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
2587 %vec = load <8 x i64>, <8 x i64>* %vp
2588 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
2589 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2590 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Merge-masked variant from memory: lanes where %mask is zero receive elements 6, 2 of the vector loaded from %vp; all other lanes come from %vec2.
2594 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
2595 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
2597 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
2598 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
2599 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
2600 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
2601 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
2602 ; CHECK-NEXT: vzeroupper
2604 %vec = load <8 x i64>, <8 x i64>* %vp
2605 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
2606 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2607 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
; Zero-masked variant from memory: lanes where %mask is zero receive elements 6, 2 of the vector loaded from %vp; all other lanes are zero.
2611 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %mask) {
2612 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
2614 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
2615 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
2616 ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
2617 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
2618 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
2619 ; CHECK-NEXT: vzeroupper
2621 %vec = load <8 x i64>, <8 x i64>* %vp
2622 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
2623 %cmp = icmp eq <2 x i64> %mask, zeroinitializer
2624 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
; Unmasked shuffle: builds a <4 x float> from elements 0, 3, 4, 5 of the <8 x float> argument %vec.
2628 define <4 x float> @test_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec) {
2629 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask0:
2631 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
2632 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,1]
2633 ; CHECK-NEXT: vzeroupper
2635 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2636 ret <4 x float> %res
; Merge-masked variant: lanes where %mask compares ordered-equal to zero receive elements 0, 3, 4, 5 of %vec; all other lanes come from %vec2.
2638 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2639 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0:
2641 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
2642 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
2643 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
2644 ; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,3],xmm3[0,1]
2645 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
2646 ; CHECK-NEXT: vzeroupper
2648 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2649 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2650 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2651 ret <4 x float> %res
; Zero-masked variant: lanes where %mask compares ordered-equal to zero receive elements 0, 3, 4, 5 of %vec; all other lanes are zero.
2654 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %mask) {
2655 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0:
2657 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
2658 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2659 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
2660 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3],xmm2[0,1]
2661 ; CHECK-NEXT: vzeroupper
2663 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
2664 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2665 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2666 ret <4 x float> %res
; Merge-masked variant: lanes where %mask compares ordered-equal to zero receive elements 1, 3, 5, 0 of %vec; all other lanes come from %vec2.
2668 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2669 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1:
2671 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <1,3,5,0,u,u,u,u>
2672 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0
2673 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2674 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
2675 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
2676 ; CHECK-NEXT: vzeroupper
2678 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
2679 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2680 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2681 ret <4 x float> %res
; Zero-masked variant: lanes where %mask compares ordered-equal to zero receive elements 1, 3, 5, 0 of %vec; all other lanes are zero.
2684 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) {
2685 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1:
2687 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = <1,3,5,0,u,u,u,u>
2688 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0
2689 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
2690 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
2691 ; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
2692 ; CHECK-NEXT: vzeroupper
2694 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
2695 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2696 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2697 ret <4 x float> %res
; Merge-masked variant: lanes where %mask compares ordered-equal to zero receive elements 3, 2, 7, 0 of %vec; all other lanes come from %vec2.
2699 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2700 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2:
2702 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <3,2,7,0,u,u,u,u>
2703 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0
2704 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2705 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
2706 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
2707 ; CHECK-NEXT: vzeroupper
2709 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
2710 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2711 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2712 ret <4 x float> %res
; Zero-masked variant: lanes where %mask compares ordered-equal to zero receive elements 3, 2, 7, 0 of %vec; all other lanes are zero.
2715 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) {
2716 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2:
2718 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = <3,2,7,0,u,u,u,u>
2719 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0
2720 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
2721 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
2722 ; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
2723 ; CHECK-NEXT: vzeroupper
2725 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
2726 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2727 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2728 ret <4 x float> %res
; Unmasked shuffle: builds a <4 x float> from elements 3, 3, 5, 2 of the <8 x float> argument %vec.
2730 define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) {
2731 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3:
2733 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <3,3,5,2,u,u,u,u>
2734 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
2735 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2736 ; CHECK-NEXT: vzeroupper
2738 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2739 ret <4 x float> %res
; Merge-masked variant: lanes where %mask compares ordered-equal to zero receive elements 3, 3, 5, 2 of %vec; all other lanes come from %vec2.
2741 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
2742 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3:
2744 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <3,3,5,2,u,u,u,u>
2745 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0
2746 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2747 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
2748 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
2749 ; CHECK-NEXT: vzeroupper
2751 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2752 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2753 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2754 ret <4 x float> %res
; Zero-masked variant: lanes where %mask compares ordered-equal to zero receive elements 3, 3, 5, 2 of %vec; all other lanes are zero.
2757 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) {
2758 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3:
2760 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = <3,3,5,2,u,u,u,u>
2761 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0
2762 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
2763 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
2764 ; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
2765 ; CHECK-NEXT: vzeroupper
2767 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
2768 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2769 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2770 ret <4 x float> %res
; Unmasked shuffle from memory: builds a <4 x float> from elements 6, 2, 4, 5 of the <8 x float> loaded from %vp.
2772 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp) {
2773 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0:
2775 ; CHECK-NEXT: vmovaps (%rdi), %xmm0
2776 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
2777 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,0]
2778 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,1]
2780 %vec = load <8 x float>, <8 x float>* %vp
2781 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2782 ret <4 x float> %res
; Merge-masked variant from memory: lanes where %mask compares ordered-equal to zero receive elements 6, 2, 4, 5 of the vector loaded from %vp; all other lanes come from %vec2.
2784 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
2785 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0:
2787 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
2788 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm3
2789 ; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[2,0]
2790 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
2791 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
2792 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[2,0],xmm3[0,1]
2794 %vec = load <8 x float>, <8 x float>* %vp
2795 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2796 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2797 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2798 ret <4 x float> %res
; Zero-masked variant from memory: lanes where %mask compares ordered-equal to zero receive elements 6, 2, 4, 5 of the vector loaded from %vp; all other lanes are zero.
2801 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %mask) {
2802 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0:
2804 ; CHECK-NEXT: vmovaps (%rdi), %xmm1
2805 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2806 ; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0]
2807 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2808 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
2809 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,0],xmm2[0,1]
2811 %vec = load <8 x float>, <8 x float>* %vp
2812 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
2813 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2814 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2815 ret <4 x float> %res
; Merge-masked variant from memory: lanes where %mask compares ordered-equal to zero receive elements 6, 3, 3, 6 of the vector loaded from %vp; all other lanes come from %vec2.
2818 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
2819 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
2821 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2822 ; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],mem[3]
2823 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2824 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
2825 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = xmm2[2,3,3,2]
2827 %vec = load <8 x float>, <8 x float>* %vp
2828 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
2829 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2830 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2831 ret <4 x float> %res
; Zero-masked shuffle from memory: loads an <8 x float> from %vp, picks
; elements <6,3,3,6>, and keeps a lane only where %mask compares
; ordered-equal to zero; every other lane of the result is zero.
2834 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %mask) {
2835 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
2837 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
2838 ; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3]
2839 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
2840 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
2841 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,3,3,2]
2843 %vec = load <8 x float>, <8 x float>* %vp
2844 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
2845 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2846 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2847 ret <4 x float> %res
; Merge-masked shuffle from memory: loads an <8 x float> from %vp, picks
; elements <3,1,3,7>, and keeps a lane where %mask compares ordered-equal
; to zero; the remaining lanes come from %vec2.
2850 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
2851 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2:
2853 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
2854 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm3
2855 ; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,0]
2856 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
2857 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
2858 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[3,1],xmm3[2,0]
2860 %vec = load <8 x float>, <8 x float>* %vp
2861 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
2862 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2863 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2864 ret <4 x float> %res
; Zero-masked shuffle from memory: loads an <8 x float> from %vp, picks
; elements <3,1,3,7>, and keeps a lane only where %mask compares
; ordered-equal to zero; every other lane of the result is zero.
2867 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %mask) {
2868 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2:
2870 ; CHECK-NEXT: vmovaps (%rdi), %xmm1
2871 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2872 ; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[3,0]
2873 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2874 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
2875 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[3,1],xmm2[2,0]
2877 %vec = load <8 x float>, <8 x float>* %vp
2878 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
2879 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2880 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2881 ret <4 x float> %res
; Unmasked shuffle from memory: loads an <8 x float> from %vp and returns
; elements <1,3,5,3> as a <4 x float>.
2884 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp) {
2885 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3:
2887 ; CHECK-NEXT: vmovaps (%rdi), %xmm0
2888 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
2889 ; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[3,0]
2890 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
2892 %vec = load <8 x float>, <8 x float>* %vp
2893 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
2894 ret <4 x float> %res
; Merge-masked shuffle from memory: loads an <8 x float> from %vp, picks
; elements <1,3,5,3>, and keeps a lane where %mask compares ordered-equal
; to zero; the remaining lanes come from %vec2.
2896 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
2897 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3:
2899 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
2900 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm3
2901 ; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,0]
2902 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
2903 ; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
2904 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[1,3],xmm3[0,2]
2906 %vec = load <8 x float>, <8 x float>* %vp
2907 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
2908 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2909 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
2910 ret <4 x float> %res
; Zero-masked shuffle from memory: loads an <8 x float> from %vp, picks
; elements <1,3,5,3>, and keeps a lane only where %mask compares
; ordered-equal to zero; every other lane of the result is zero.
2913 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %mask) {
2914 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3:
2916 ; CHECK-NEXT: vmovaps (%rdi), %xmm1
2917 ; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
2918 ; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[3,0]
2919 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
2920 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
2921 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3],xmm2[0,2]
2923 %vec = load <8 x float>, <8 x float>* %vp
2924 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
2925 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
2926 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
2927 ret <4 x float> %res
; Unmasked shuffle of a register argument: returns elements
; <0,4,12,10,8,2,11,7> of the <16 x float> %vec as an <8 x float>.
2930 define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) {
2931 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask0:
2933 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
2934 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,12,10,8,2,11,7]
2935 ; CHECK-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1
2936 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
2938 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
2939 ret <8 x float> %res
; Merge-masked shuffle of a register argument: picks elements
; <0,4,12,10,8,2,11,7> of %vec and keeps a lane where %mask compares
; ordered-equal to zero; the remaining lanes come from %vec2.
2941 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
2942 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0:
2944 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
2945 ; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,4,12,10,8,2,11,7]
2946 ; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm4
2947 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
2948 ; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1
2949 ; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1}
2951 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
2952 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
2953 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
2954 ret <8 x float> %res
; Zero-masked shuffle of a register argument: picks elements
; <0,4,12,10,8,2,11,7> of %vec and keeps a lane only where %mask compares
; ordered-equal to zero; every other lane of the result is zero.
2957 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %mask) {
2958 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask0:
2960 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
2961 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7]
2962 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
2963 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1
2964 ; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm2 {%k1} {z}
2965 ; CHECK-NEXT: vmovaps %ymm2, %ymm0
2967 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
2968 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
2969 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
2970 ret <8 x float> %res
; Merge-masked shuffle of a register argument: picks elements
; <10,12,3,12,4,15,1,14> of %vec and keeps a lane where %mask compares
; ordered-equal to zero; the remaining lanes come from %vec2.
2972 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
2973 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1:
2975 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
2976 ; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [2,4,11,4,12,7,9,6]
2977 ; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm4
2978 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
2979 ; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1
2980 ; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1}
2982 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
2983 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
2984 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
2985 ret <8 x float> %res
; Zero-masked shuffle of a register argument: picks elements
; <10,12,3,12,4,15,1,14> of %vec and keeps a lane only where %mask compares
; ordered-equal to zero; every other lane of the result is zero.
2988 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %mask) {
2989 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask1:
2991 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
2992 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [2,4,11,4,12,7,9,6]
2993 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
2994 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1
2995 ; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm2 {%k1} {z}
2996 ; CHECK-NEXT: vmovaps %ymm2, %ymm0
2998 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
2999 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3000 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3001 ret <8 x float> %res
; Merge-masked shuffle of a register argument: picks elements
; <0,4,8,9,6,1,4,4> of %vec and keeps a lane where %mask compares
; ordered-equal to zero; the remaining lanes come from %vec2.
3003 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3004 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2:
3006 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3007 ; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,4,8,9,6,1,4,4]
3008 ; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm4
3009 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
3010 ; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1
3011 ; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1}
3013 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
3014 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3015 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3016 ret <8 x float> %res
; Zero-masked shuffle of a register argument: picks elements
; <0,4,8,9,6,1,4,4> of %vec and keeps a lane only where %mask compares
; ordered-equal to zero; every other lane of the result is zero.
3019 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) {
3020 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2:
3022 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3023 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4]
3024 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
3025 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1
3026 ; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm2 {%k1} {z}
3027 ; CHECK-NEXT: vmovaps %ymm2, %ymm0
3029 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
3030 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3031 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3032 ret <8 x float> %res
; Unmasked shuffle of a register argument: returns elements
; <12,14,9,0,12,4,5,8> of the <16 x float> %vec as an <8 x float>.
3034 define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) {
3035 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask3:
3037 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
3038 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,6,1,8,4,12,13,0]
3039 ; CHECK-NEXT: vpermi2ps %ymm0, %ymm2, %ymm1
3040 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
3042 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3043 ret <8 x float> %res
; Merge-masked shuffle of a register argument: picks elements
; <12,14,9,0,12,4,5,8> of %vec and keeps a lane where %mask compares
; ordered-equal to zero; the remaining lanes come from %vec2.
3045 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
3046 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3:
3048 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3049 ; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [4,6,1,8,4,12,13,0]
3050 ; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm4
3051 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
3052 ; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1
3053 ; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1}
3055 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3056 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3057 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3058 ret <8 x float> %res
; Zero-masked shuffle of a register argument: picks elements
; <12,14,9,0,12,4,5,8> of %vec and keeps a lane only where %mask compares
; ordered-equal to zero; every other lane of the result is zero.
3061 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %mask) {
3062 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask3:
3064 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3065 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4,6,1,8,4,12,13,0]
3066 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
3067 ; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1
3068 ; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm2 {%k1} {z}
3069 ; CHECK-NEXT: vmovaps %ymm2, %ymm0
3071 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
3072 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3073 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3074 ret <8 x float> %res
; Unmasked shuffle of a register argument: returns elements <4,8,9,10>
; of the <16 x float> %vec as a <4 x float>.
3076 define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) {
3077 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0:
3079 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
3080 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <12,0,1,2,u,u,u,u>
3081 ; CHECK-NEXT: vpermi2ps %ymm0, %ymm2, %ymm1
3082 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3083 ; CHECK-NEXT: vzeroupper
3085 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3086 ret <4 x float> %res
; Merge-masked shuffle of a register argument: picks elements <4,8,9,10>
; of %vec and keeps a lane where %mask compares ordered-equal to zero;
; the remaining lanes come from %vec2.
3088 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3089 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0:
3091 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3092 ; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = <12,0,1,2,u,u,u,u>
3093 ; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm4
3094 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
3095 ; CHECK-NEXT: vcmpeqps %xmm0, %xmm2, %k1
3096 ; CHECK-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1}
3097 ; CHECK-NEXT: vzeroupper
3099 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3100 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3101 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3102 ret <4 x float> %res
; Zero-masked shuffle of a register argument: picks elements <4,8,9,10>
; of %vec and keeps a lane only where %mask compares ordered-equal to
; zero; every other lane of the result is zero.
3105 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) {
3106 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0:
3108 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
3109 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <12,0,1,2,u,u,u,u>
3110 ; CHECK-NEXT: vpermi2ps %ymm0, %ymm2, %ymm3
3111 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
3112 ; CHECK-NEXT: vcmpeqps %xmm0, %xmm1, %k1
3113 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z}
3114 ; CHECK-NEXT: vzeroupper
3116 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
3117 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3118 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3119 ret <4 x float> %res
; Merge-masked shuffle of a register argument: picks elements <8,6,10,6>
; of %vec and keeps a lane where %mask compares ordered-equal to zero;
; the remaining lanes come from %vec2.
3121 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3122 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
3124 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3
3125 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
3126 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,2]
3127 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3]
3128 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3129 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3130 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3131 ; CHECK-NEXT: vzeroupper
3133 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
3134 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3135 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3136 ret <4 x float> %res
; Zero-masked shuffle of a register argument: picks elements <8,6,10,6>
; of %vec and keeps a lane only where %mask compares ordered-equal to
; zero; every other lane of the result is zero.
3139 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
3140 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
3142 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm2
3143 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
3144 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,2]
3145 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
3146 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3147 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3148 ; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
3149 ; CHECK-NEXT: vzeroupper
3151 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
3152 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3153 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3154 ret <4 x float> %res
; Merge-masked shuffle of a register argument: picks elements <12,12,4,5>
; of %vec and keeps a lane where %mask compares ordered-equal to zero;
; the remaining lanes come from %vec2.
3156 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3157 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2:
3159 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3160 ; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[0,1],ymm3[4,4],ymm0[4,5]
3161 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3162 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3163 ; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1}
3164 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
3165 ; CHECK-NEXT: vzeroupper
3167 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
3168 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3169 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3170 ret <4 x float> %res
; Zero-masked shuffle of a register argument: picks elements <12,12,4,5>
; of %vec and keeps a lane only where %mask compares ordered-equal to
; zero; every other lane of the result is zero.
3173 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %mask) {
3174 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2:
3176 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
3177 ; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5]
3178 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3179 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3180 ; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
3181 ; CHECK-NEXT: vzeroupper
3183 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
3184 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3185 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3186 ret <4 x float> %res
; Unmasked shuffle of a register argument: returns elements <10,2,11,6>
; of the <16 x float> %vec as a <4 x float>.
3188 define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) {
3189 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3:
3191 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
3192 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm1
3193 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0
3194 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
3195 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
3196 ; CHECK-NEXT: vzeroupper
3198 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3199 ret <4 x float> %res
; Merge-masked shuffle of a register argument: picks elements <10,2,11,6>
; of %vec and keeps a lane where %mask compares ordered-equal to zero;
; the remaining lanes come from %vec2.
3201 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3202 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3:
3204 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7]
3205 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm3
3206 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0
3207 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
3208 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
3209 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3210 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3211 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3212 ; CHECK-NEXT: vzeroupper
3214 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3215 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3216 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3217 ret <4 x float> %res
; Zero-masked shuffle of a register argument: picks elements <10,2,11,6>
; of %vec and keeps a lane only where %mask compares ordered-equal to
; zero; every other lane of the result is zero.
3220 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) {
3221 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3:
3223 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
3224 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm2
3225 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0
3226 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
3227 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
3228 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3229 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3230 ; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
3231 ; CHECK-NEXT: vzeroupper
3233 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
3234 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3235 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3236 ret <4 x float> %res
; Unmasked shuffle from memory: loads a <16 x float> from %vp and returns
; elements <7,6,7,11,5,10,0,4> as an <8 x float>.
3238 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp) {
3239 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0:
3241 ; CHECK-NEXT: vmovaps (%rdi), %ymm1
3242 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4]
3243 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm0
3245 %vec = load <16 x float>, <16 x float>* %vp
3246 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3247 ret <8 x float> %res
; Merge-masked shuffle from memory: loads a <16 x float> from %vp, picks
; elements <7,6,7,11,5,10,0,4>, and keeps a lane where %mask compares
; ordered-equal to zero; the remaining lanes come from %vec2.
3249 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
3250 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0:
3252 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3253 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,6,7,11,5,10,0,4]
3254 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
3255 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3256 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3257 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3259 %vec = load <16 x float>, <16 x float>* %vp
3260 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3261 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3262 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3263 ret <8 x float> %res
; Zero-masked shuffle from memory: loads a <16 x float> from %vp, picks
; elements <7,6,7,11,5,10,0,4>, and keeps a lane only where %mask compares
; ordered-equal to zero; every other lane of the result is zero.
3266 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %mask) {
3267 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0:
3269 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3270 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4]
3271 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3272 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3273 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
3274 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
3276 %vec = load <16 x float>, <16 x float>* %vp
3277 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
3278 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3279 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3280 ret <8 x float> %res
; Merge-masked shuffle from memory: loads a <16 x float> from %vp, picks
; elements <11,0,9,0,7,14,0,8>, and keeps a lane where %mask compares
; ordered-equal to zero; the remaining lanes come from %vec2.
3283 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
3284 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1:
3286 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3287 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [11,0,9,0,7,14,0,8]
3288 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
3289 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3290 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3291 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3293 %vec = load <16 x float>, <16 x float>* %vp
3294 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
3295 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3296 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3297 ret <8 x float> %res
; Zero-masked shuffle from memory: loads a <16 x float> from %vp, picks
; elements <11,0,9,0,7,14,0,8>, and keeps a lane only where %mask compares
; ordered-equal to zero; every other lane of the result is zero.
3300 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %mask) {
3301 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1:
3303 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3304 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8]
3305 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3306 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3307 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
3308 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
3310 %vec = load <16 x float>, <16 x float>* %vp
3311 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
3312 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3313 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3314 ret <8 x float> %res
; Merge-masked shuffle from memory: loads a <16 x float> from %vp, picks
; elements <1,13,10,11,10,0,0,9>, and keeps a lane where %mask compares
; ordered-equal to zero; the remaining lanes come from %vec2.
3317 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
3318 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
3320 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3321 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1]
3322 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
3323 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3324 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3325 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3327 %vec = load <16 x float>, <16 x float>* %vp
3328 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
3329 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3330 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3331 ret <8 x float> %res
; Zero-masked shuffle from memory: loads a <16 x float> from %vp, picks
; elements <1,13,10,11,10,0,0,9>, and keeps a lane only where %mask compares
; ordered-equal to zero; every other lane of the result is zero.
3334 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %mask) {
3335 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
3337 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3338 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1]
3339 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3340 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3341 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3342 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
3344 %vec = load <16 x float>, <16 x float>* %vp
3345 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
3346 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3347 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3348 ret <8 x float> %res
; Unmasked shuffle from memory: loads a <16 x float> from %vp and returns
; elements <15,13,11,11,3,12,4,1> as an <8 x float>.
3351 define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp) {
3352 ; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3:
3354 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
3355 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9]
3356 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm0
3358 %vec = load <16 x float>, <16 x float>* %vp
3359 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3360 ret <8 x float> %res
; Merge-masked shuffle from memory: loads a <16 x float> from %vp, picks
; elements <15,13,11,11,3,12,4,1>, and keeps a lane where %mask compares
; ordered-equal to zero; the remaining lanes come from %vec2.
3362 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
3363 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3:
3365 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3366 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9]
3367 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
3368 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3369 ; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
3370 ; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1}
3372 %vec = load <16 x float>, <16 x float>* %vp
3373 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3374 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3375 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
3376 ret <8 x float> %res
; Zero-masked shuffle from memory: loads a <16 x float> from %vp, picks
; elements <15,13,11,11,3,12,4,1>, and keeps a lane only where %mask compares
; ordered-equal to zero; every other lane of the result is zero.
3379 define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %mask) {
3380 ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3:
3382 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3383 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9]
3384 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3385 ; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1
3386 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
3387 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
3389 %vec = load <16 x float>, <16 x float>* %vp
3390 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
3391 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
3392 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
3393 ret <8 x float> %res
; Unmasked shuffle from memory: loads a <16 x float> from %vp and returns
; elements <14,6,7,11> as a <4 x float>.
3396 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp) {
3397 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0:
3399 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,3,3]
3400 ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = mem[3,1,2,3]
3401 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
3402 ; CHECK-NEXT: vzeroupper
3404 %vec = load <16 x float>, <16 x float>* %vp
3405 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3406 ret <4 x float> %res
; Merge-masked shuffle from memory: loads a <16 x float> from %vp, picks
; elements <14,6,7,11>, and keeps a lane where %mask compares ordered-equal
; to zero; the remaining lanes come from %vec2.
3408 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3409 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0:
3411 ; CHECK-NEXT: vpermilps {{.*#+}} xmm2 = mem[0,2,3,3]
3412 ; CHECK-NEXT: vpermpd {{.*#+}} ymm3 = mem[3,1,2,3]
3413 ; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3]
3414 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
3415 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
3416 ; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1}
3417 ; CHECK-NEXT: vzeroupper
3419 %vec = load <16 x float>, <16 x float>* %vp
3420 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3421 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3422 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3423 ret <4 x float> %res
; Zero-masked shuffle from memory: loads a <16 x float> from %vp, picks
; elements <14,6,7,11>, and keeps a lane only where %mask compares
; ordered-equal to zero; every other lane of the result is zero.
3426 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %mask) {
3427 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0:
3429 ; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,2,3,3]
3430 ; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = mem[3,1,2,3]
3431 ; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
3432 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3433 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
3434 ; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1} {z}
3435 ; CHECK-NEXT: vzeroupper
3437 %vec = load <16 x float>, <16 x float>* %vp
3438 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
3439 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3440 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3441 ret <4 x float> %res
; Merge-masked shuffle from memory: loads a <16 x float> from %vp, picks
; elements <8,2,14,7>, and keeps a lane where %mask compares ordered-equal
; to zero; the remaining lanes come from %vec2.
3444 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3445 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1:
3447 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3448 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,10,6,15,4,14,6,15]
3449 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
3450 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3451 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3452 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3453 ; CHECK-NEXT: vzeroupper
3455 %vec = load <16 x float>, <16 x float>* %vp
3456 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
3457 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3458 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3459 ret <4 x float> %res
; Zero-masked shuffle from memory: loads a <16 x float> from %vp, picks
; elements <8,2,14,7>, and keeps a lane only where %mask compares
; ordered-equal to zero; every other lane of the result is zero.
3462 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %mask) {
3463 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1:
3465 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
3466 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,10,6,15,4,14,6,15]
3467 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm2
3468 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
3469 ; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
3470 ; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1} {z}
3471 ; CHECK-NEXT: vzeroupper
3473 %vec = load <16 x float>, <16 x float>* %vp
3474 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
3475 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3476 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3477 ret <4 x float> %res
; Merge-masked shuffle from memory: loads a <16 x float> from %vp, picks
; elements <12,6,12,6>, and keeps a lane where %mask compares ordered-equal
; to zero; the remaining lanes come from %vec2.
3480 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3481 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
3483 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
3484 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [4,14,4,14,4,14,6,7]
3485 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3
3486 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3487 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3488 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3489 ; CHECK-NEXT: vzeroupper
3491 %vec = load <16 x float>, <16 x float>* %vp
3492 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
3493 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3494 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3495 ret <4 x float> %res
; Zero-masked, memory-source variant: loads <16 x float> from %vp and picks elements <12,6,12,6>.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
3498 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %mask) {
3499 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
3501 ; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
3502 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4,14,4,14,4,14,6,7]
3503 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm2
3504 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
3505 ; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
3506 ; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1} {z}
3507 ; CHECK-NEXT: vzeroupper
3509 %vec = load <16 x float>, <16 x float>* %vp
3510 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
3511 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3512 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3513 ret <4 x float> %res
; Unmasked, memory-source variant: loads <16 x float> from %vp and returns elements <3,3,15,9>.
; Expected lowering: lower 256 bits loaded into a register, upper 256 bits folded into vpermi2ps;
; only the low four index lanes are defined.
3516 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp) {
3517 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3:
3519 ; CHECK-NEXT: vmovaps (%rdi), %ymm1
3520 ; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <3,3,15,9,u,u,u,u>
3521 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm0
3522 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3523 ; CHECK-NEXT: vzeroupper
3525 %vec = load <16 x float>, <16 x float>* %vp
3526 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3527 ret <4 x float> %res
; Merge-masked, memory-source variant: loads <16 x float> from %vp and picks elements <3,3,15,9>.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
3529 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3530 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
3532 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3533 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <3,3,15,9,u,u,u,u>
3534 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
3535 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
3536 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3537 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
3538 ; CHECK-NEXT: vzeroupper
3540 %vec = load <16 x float>, <16 x float>* %vp
3541 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3542 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3543 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
3544 ret <4 x float> %res
; Zero-masked, memory-source variant: loads <16 x float> from %vp and picks elements <3,3,15,9>.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
3547 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %mask) {
3548 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
3550 ; CHECK-NEXT: vmovaps (%rdi), %ymm1
3551 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = <3,3,15,9,u,u,u,u>
3552 ; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm2
3553 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
3554 ; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
3555 ; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1} {z}
3556 ; CHECK-NEXT: vzeroupper
3558 %vec = load <16 x float>, <16 x float>* %vp
3559 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
3560 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
3561 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
3562 ret <4 x float> %res
; Unmasked register variant: returns elements <2,0> of the <4 x double> argument as a <2 x double>.
; Expected lowering: a single cross-lane vpermpd whose low 128 bits are the result.
3565 define <2 x double> @test_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec) {
3566 ; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mask0:
3568 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
3569 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3570 ; CHECK-NEXT: vzeroupper
3572 %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3573 ret <2 x double> %res
; Merge-masked register variant: picks elements <2,0> of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
3575 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3576 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask0:
3578 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
3579 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3580 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
3581 ; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
3582 ; CHECK-NEXT: vzeroupper
3584 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3585 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3586 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3587 ret <2 x double> %res
; Zero-masked register variant: picks elements <2,0> of %vec.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
3590 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %mask) {
3591 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask0:
3593 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
3594 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3595 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
3596 ; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
3597 ; CHECK-NEXT: vzeroupper
3599 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3600 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3601 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3602 ret <2 x double> %res
; Merge-masked register variant: picks elements <1,3> of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
3604 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3605 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask1:
3607 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
3608 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3609 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
3610 ; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
3611 ; CHECK-NEXT: vzeroupper
3613 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
3614 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3615 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3616 ret <2 x double> %res
; Zero-masked register variant: picks elements <1,3> of %vec.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
3619 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %mask) {
3620 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask1:
3622 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
3623 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3624 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
3625 ; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
3626 ; CHECK-NEXT: vzeroupper
3628 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
3629 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3630 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3631 ret <2 x double> %res
; Unmasked, memory-source variant: loads <4 x double> from %vp and returns elements <2,1>.
; Expected lowering: a 128-bit load blended with a folded memory operand; no ymm register is used.
3633 define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp) {
3634 ; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0:
3636 ; CHECK-NEXT: vmovaps (%rdi), %xmm0
3637 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
3639 %vec = load <4 x double>, <4 x double>* %vp
3640 %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3641 ret <2 x double> %res
; Merge-masked, memory-source variant: loads <4 x double> from %vp and picks elements <2,1>.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
3643 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
3644 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask0:
3646 ; CHECK-NEXT: vmovapd (%rdi), %xmm2
3647 ; CHECK-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm2[1]
3648 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3649 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
3650 ; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1}
3652 %vec = load <4 x double>, <4 x double>* %vp
3653 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3654 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3655 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3656 ret <2 x double> %res
; Zero-masked, memory-source variant: loads <4 x double> from %vp and picks elements <2,1>.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
3659 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %mask) {
3660 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0:
3662 ; CHECK-NEXT: vmovapd (%rdi), %xmm1
3663 ; CHECK-NEXT: vblendpd {{.*#+}} xmm1 = mem[0],xmm1[1]
3664 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3665 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
3666 ; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z}
3668 %vec = load <4 x double>, <4 x double>* %vp
3669 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
3670 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3671 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3672 ret <2 x double> %res
; Merge-masked, memory-source variant: loads <4 x double> from %vp and picks elements <2,0>.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
; Expected lowering: the mask is folded directly into vunpcklpd, which also folds the low-half load.
3675 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
3676 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1:
3678 ; CHECK-NEXT: vmovapd 16(%rdi), %xmm2
3679 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3680 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
3681 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} = xmm2[0],mem[0]
3683 %vec = load <4 x double>, <4 x double>* %vp
3684 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3685 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3686 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
3687 ret <2 x double> %res
; Zero-masked, memory-source variant: loads <4 x double> from %vp and picks elements <2,0>.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
; Expected lowering: the zeroing mask is folded directly into vunpcklpd.
3690 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %mask) {
3691 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1:
3693 ; CHECK-NEXT: vmovapd 16(%rdi), %xmm1
3694 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3695 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
3696 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm1[0],mem[0]
3698 %vec = load <4 x double>, <4 x double>* %vp
3699 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
3700 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
3701 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
3702 ret <2 x double> %res
; Unmasked register variant: returns elements <7,3,7,3> of the <8 x double> argument.
; Expected lowering: upper 256 bits extracted, then a two-source vpermi2pd with the upper half as
; the first table (so index 3 selects element 7 and index 7 selects element 3).
3705 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
3706 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0:
3708 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
3709 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [3,7,3,7]
3710 ; CHECK-NEXT: vpermi2pd %ymm0, %ymm2, %ymm1
3711 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
3713 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3714 ret <4 x double> %res
; Merge-masked register variant: picks elements <7,3,7,3> of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
3716 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3717 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
3719 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3720 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [3,7,3,7]
3721 ; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm4
3722 ; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
3723 ; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
3724 ; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
3726 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3727 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3728 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3729 ret <4 x double> %res
; Zero-masked register variant: picks elements <7,3,7,3> of %vec.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
; Expected lowering: the zeroing mask is applied directly on vpermi2pd.
3732 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) {
3733 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
3735 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3736 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [3,7,3,7]
3737 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
3738 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
3739 ; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm2 {%k1} {z}
3740 ; CHECK-NEXT: vmovapd %ymm2, %ymm0
3742 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
3743 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3744 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3745 ret <4 x double> %res
; Merge-masked register variant: picks elements <2,0,7,6> of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
3747 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3748 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1:
3750 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3751 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [2,0,7,6]
3752 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
3753 ; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
3754 ; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
3755 ; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
3757 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
3758 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3759 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3760 ret <4 x double> %res
; Zero-masked register variant: picks elements <2,0,7,6> of %vec.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
3763 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %mask) {
3764 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask1:
3766 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3767 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,0,7,6]
3768 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
3769 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
3770 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
3771 ; CHECK-NEXT: vmovapd %ymm2, %ymm0
3773 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
3774 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3775 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3776 ret <4 x double> %res
; Merge-masked register variant: picks elements <2,3,2,0> of %vec, all from the low 256 bits.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
; Expected lowering: a single merge-masked vpermpd on ymm0, with no extract of the upper half.
3778 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3779 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask2:
3781 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3782 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3783 ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,0]
3784 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
3786 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
3787 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3788 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3789 ret <4 x double> %res
; Zero-masked register variant: picks elements <2,3,2,0> of %vec, all from the low 256 bits.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
; Expected lowering: a single zero-masked vpermpd on ymm0.
3792 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %mask) {
3793 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask2:
3795 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3796 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
3797 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,0]
3799 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
3800 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3801 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3802 ret <4 x double> %res
; Unmasked register variant: returns elements <0,2,1,4> of the <8 x double> argument.
; Expected lowering: only the 128-bit chunk holding element 4 is extracted, then vpermi2pd combines
; it with the low 256 bits.
3804 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) {
3805 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask3:
3807 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm2
3808 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,1,4]
3809 ; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
3810 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
3812 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
3813 ret <4 x double> %res
; Merge-masked register variant: picks elements <0,2,1,4> of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
3815 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3816 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3:
3818 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3
3819 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,1,4]
3820 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
3821 ; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
3822 ; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
3823 ; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
3825 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
3826 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3827 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3828 ret <4 x double> %res
; Zero-masked register variant: picks elements <0,2,1,4> of %vec.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
3831 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %mask) {
3832 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask3:
3834 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3
3835 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,1,4]
3836 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
3837 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
3838 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
3839 ; CHECK-NEXT: vmovapd %ymm2, %ymm0
3841 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
3842 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3843 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3844 ret <4 x double> %res
; Merge-masked register variant: picks elements <1,1,5,5> of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
3846 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3847 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
3849 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3
3850 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,1,5,5]
3851 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
3852 ; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
3853 ; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
3854 ; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
3856 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
3857 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3858 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3859 ret <4 x double> %res
; Zero-masked register variant: picks elements <1,1,5,5> of %vec.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
3862 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
3863 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
3865 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3
3866 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [1,1,5,5]
3867 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
3868 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
3869 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
3870 ; CHECK-NEXT: vmovapd %ymm2, %ymm0
3872 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
3873 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3874 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3875 ret <4 x double> %res
; Merge-masked register variant: picks elements <2,6,2,2> of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
3877 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3878 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5:
3880 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3881 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [2,6,2,2]
3882 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
3883 ; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
3884 ; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
3885 ; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
3887 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
3888 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3889 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3890 ret <4 x double> %res
; Zero-masked register variant: picks elements <2,6,2,2> of %vec.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
3893 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) {
3894 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5:
3896 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3897 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,6,2,2]
3898 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
3899 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
3900 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
3901 ; CHECK-NEXT: vmovapd %ymm2, %ymm0
3903 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
3904 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3905 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3906 ret <4 x double> %res
; Unmasked register variant: returns elements <5,0,7,0> of the <8 x double> argument.
; Expected lowering: vpermi2pd with the extracted upper half as the first table, so the index
; vector [1,4,3,4] maps to source elements 5,0,7,0.
3908 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
3909 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
3911 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
3912 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,4,3,4]
3913 ; CHECK-NEXT: vpermi2pd %ymm0, %ymm2, %ymm1
3914 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
3916 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
3917 ret <4 x double> %res
; Merge-masked register variant: picks elements <5,0,7,0> of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
3919 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3920 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
3922 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3923 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,4,3,4]
3924 ; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm4
3925 ; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
3926 ; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
3927 ; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
3929 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
3930 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3931 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3932 ret <4 x double> %res
; Zero-masked register variant: picks elements <5,0,7,0> of %vec.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
3935 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
3936 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
3938 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3939 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [1,4,3,4]
3940 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
3941 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
3942 ; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm2 {%k1} {z}
3943 ; CHECK-NEXT: vmovapd %ymm2, %ymm0
3945 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
3946 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3947 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3948 ret <4 x double> %res
; Merge-masked register variant: picks elements <3,5,0,6> of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
3950 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
3951 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
3953 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3954 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [3,5,0,6]
3955 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
3956 ; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
3957 ; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
3958 ; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
3960 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
3961 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3962 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3963 ret <4 x double> %res
; Zero-masked register variant: picks elements <3,5,0,6> of %vec.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
3966 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) {
3967 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
3969 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3970 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [3,5,0,6]
3971 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
3972 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
3973 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
3974 ; CHECK-NEXT: vmovapd %ymm2, %ymm0
3976 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
3977 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
3978 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3979 ret <4 x double> %res
; Unmasked register variant: returns elements <0,6> of the <8 x double> argument as a <2 x double>.
; Expected lowering: two extracts to reach the top 128 bits, then vmovlhps to pair the low elements.
3981 define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
3982 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
3984 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
3985 ; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
3986 ; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3987 ; CHECK-NEXT: vzeroupper
3989 %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
3990 ret <2 x double> %res
; Merge-masked register variant: picks elements <0,6> of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
; Expected lowering: the mask is folded into vunpcklpd, merging into the register that holds %vec2.
3992 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
3993 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
3995 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3996 ; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm3
3997 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
3998 ; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
3999 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
4000 ; CHECK-NEXT: vmovapd %xmm1, %xmm0
4001 ; CHECK-NEXT: vzeroupper
4003 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
4004 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4005 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4006 ret <2 x double> %res
; Zero-masked register variant: picks elements <0,6> of %vec.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
; Expected lowering: the zeroing mask is folded into vunpcklpd.
4009 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
4010 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
4012 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4013 ; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2
4014 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4015 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4016 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
4017 ; CHECK-NEXT: vzeroupper
4019 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
4020 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4021 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4022 ret <2 x double> %res
; Merge-masked register variant: picks elements <3,7> of %vec.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
; Expected lowering: vunpckhpd interleaves the odd elements of both halves, and the upper 128 bits
; of that result hold elements 3 and 7.
4024 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
4025 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask1:
4027 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
4028 ; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
4029 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
4030 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4031 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
4032 ; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
4033 ; CHECK-NEXT: vzeroupper
4035 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
4036 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4037 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4038 ret <2 x double> %res
; Zero-masked register variant: picks elements <3,7> of %vec.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
4041 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %mask) {
4042 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask1:
4044 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4045 ; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
4046 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
4047 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4048 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
4049 ; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
4050 ; CHECK-NEXT: vzeroupper
4052 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
4053 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4054 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4055 ret <2 x double> %res
; Unmasked, memory-source variant: loads <8 x double> from %vp and returns elements <1,6,7,2>.
; Expected lowering: lower 256 bits loaded into a register, upper 256 bits folded into vpermi2pd.
4057 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp) {
4058 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0:
4060 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
4061 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [1,6,7,2]
4062 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
4064 %vec = load <8 x double>, <8 x double>* %vp
4065 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4066 ret <4 x double> %res
; Merge-masked, memory-source variant: loads <8 x double> from %vp and picks elements <1,6,7,2>.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
4068 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4069 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0:
4071 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4072 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,6,7,2]
4073 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
4074 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4075 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4076 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4078 %vec = load <8 x double>, <8 x double>* %vp
4079 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4080 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4081 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4082 ret <4 x double> %res
; Zero-masked, memory-source variant: loads <8 x double> from %vp and picks elements <1,6,7,2>.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
4085 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %mask) {
4086 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0:
4088 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4089 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,6,7,2]
4090 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4091 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4092 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4093 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4095 %vec = load <8 x double>, <8 x double>* %vp
4096 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
4097 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4098 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4099 ret <4 x double> %res
; Merge-masked, memory-source variant: loads <8 x double> from %vp and picks elements <3,4,2,4>.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
4102 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4103 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
4105 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4106 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [3,4,2,4]
4107 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
4108 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4109 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4110 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4112 %vec = load <8 x double>, <8 x double>* %vp
4113 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
4114 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4115 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4116 ret <4 x double> %res
; Zero-masked, memory-source variant: loads <8 x double> from %vp and picks elements <3,4,2,4>.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
4119 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %mask) {
4120 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
4122 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4123 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [3,4,2,4]
4124 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4125 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4126 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4127 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4129 %vec = load <8 x double>, <8 x double>* %vp
4130 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
4131 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4132 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4133 ret <4 x double> %res
; Merge-masked, memory-source variant: loads <8 x double> from %vp and picks elements <1,2,3,4>.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
4136 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4137 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
4139 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4140 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,2,3,4]
4141 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
4142 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4143 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4144 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4146 %vec = load <8 x double>, <8 x double>* %vp
4147 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
4148 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4149 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4150 ret <4 x double> %res
; Zero-masked, memory-source variant: loads <8 x double> from %vp and picks elements <1,2,3,4>.
; Lanes where %mask compares ordered-equal to zero keep the shuffle result; all other lanes are zeroed.
4153 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %mask) {
4154 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
4156 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4157 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,2,3,4]
4158 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4159 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4160 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4161 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4163 %vec = load <8 x double>, <8 x double>* %vp
4164 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
4165 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4166 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4167 ret <4 x double> %res
; Unmasked, memory-source variant: loads <8 x double> from %vp and returns elements <4,2,1,0>.
; Expected lowering: lower 256 bits loaded into a register, upper 256 bits folded into vpermi2pd.
4170 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp) {
4171 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
4173 ; CHECK-NEXT: vmovapd (%rdi), %ymm1
4174 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [4,2,1,0]
4175 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
4177 %vec = load <8 x double>, <8 x double>* %vp
4178 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4179 ret <4 x double> %res
; Merge-masked, memory-source variant: loads <8 x double> from %vp and picks elements <4,2,1,0>.
; Lanes where %mask compares ordered-equal to zero take the shuffle result; the rest pass %vec2 through.
4181 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4182 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
4184 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4185 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [4,2,1,0]
4186 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
4187 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4188 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4189 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4191 %vec = load <8 x double>, <8 x double>* %vp
4192 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4193 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4194 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4195 ret <4 x double> %res
; Zero-masked variant of the mem_mask3 case.
; Loads an <8 x double> from %vp, extracts elements <4,2,1,0>, and zeroes
; every lane where %mask does not compare ordered-equal to zero.
; The expected output computes %k1 first and then issues vpermi2pd with
; {%k1} {z}, followed by a register copy of the result into ymm0.
4198 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %mask) {
4199 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
4201 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4202 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [4,2,1,0]
4203 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4204 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4205 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4206 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4208 %vec = load <8 x double>, <8 x double>* %vp
4209 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
4210 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4211 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4212 ret <4 x double> %res
; Merge-masked variant of the mem_mask4 case.
; Loads an <8 x double> from %vp, extracts elements <6,0,5,1>, and selects
; per lane between that shuffle (where %mask compares ordered-equal to zero)
; and the pass-through value %vec2.
; In the expected output the upper half (32(%rdi)) is the register source and
; the lower half ((%rdi)) is the memory operand of vpermi2pd, so the expected
; index vector [2,4,1,5] is the IR mask <6,0,5,1> re-expressed relative to
; that swapped operand order.
4215 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4216 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4:
4218 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4219 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,4,1,5]
4220 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
4221 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4222 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4223 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4225 %vec = load <8 x double>, <8 x double>* %vp
4226 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
4227 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4228 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4229 ret <4 x double> %res
; Zero-masked variant of the mem_mask4 case.
; Loads an <8 x double> from %vp, extracts elements <6,0,5,1>, and zeroes
; every lane where %mask does not compare ordered-equal to zero.
; As in the merge-masked variant, the upper half (32(%rdi)) is the register
; source and (%rdi) the memory operand, which is why the expected index
; vector is [2,4,1,5] rather than the literal IR mask. The select is folded
; into vpermi2pd via {%k1} {z}.
4232 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %mask) {
4233 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4:
4235 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4236 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [2,4,1,5]
4237 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4238 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4239 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
4240 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4242 %vec = load <8 x double>, <8 x double>* %vp
4243 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
4244 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4245 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4246 ret <4 x double> %res
; Merge-masked variant of the mem_mask5 case.
; Loads an <8 x double> from %vp, extracts elements <2,5,5,5> (element 5 is
; replicated into three lanes), and selects per lane between that shuffle
; (where %mask compares ordered-equal to zero) and the pass-through %vec2.
; The expected output uses 32(%rdi) as the register source and (%rdi) as the
; memory operand of vpermi2pd, hence the expected index vector [6,1,1,1].
4249 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4250 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
4252 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4253 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [6,1,1,1]
4254 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
4255 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4256 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4257 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4259 %vec = load <8 x double>, <8 x double>* %vp
4260 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
4261 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4262 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4263 ret <4 x double> %res
; Zero-masked variant of the mem_mask5 case.
; Loads an <8 x double> from %vp, extracts elements <2,5,5,5>, and zeroes
; every lane where %mask does not compare ordered-equal to zero.
; The expected output uses 32(%rdi) as the register source and (%rdi) as the
; memory operand (index vector [6,1,1,1]) and folds the select into
; vpermi2pd via {%k1} {z}.
4266 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %mask) {
4267 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
4269 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4270 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [6,1,1,1]
4271 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4272 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4273 ; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
4274 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4276 %vec = load <8 x double>, <8 x double>* %vp
4277 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
4278 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4279 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4280 ret <4 x double> %res
; Unmasked baseline for the mem_mask6 case.
; Loads an <8 x double> from %vp and returns elements <4,6,0,5> as a
; <4 x double>.
; Only element 0 is taken from the lower half, and the expected output
; reflects that: instead of a full 256-bit load of (%rdi), the 128 bits at
; (%rdi) are inserted into the upper lane of a scratch register
; (vinsertf128 $1), and vpermi2pd then runs register-to-register with the
; upper half (32(%rdi)) as its first source and index vector [0,2,6,1].
4283 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp) {
4284 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
4286 ; CHECK-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm1
4287 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
4288 ; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [0,2,6,1]
4289 ; CHECK-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0
4291 %vec = load <8 x double>, <8 x double>* %vp
4292 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4293 ret <4 x double> %res
; Merge-masked variant of the mem_mask6 case.
; Loads an <8 x double> from %vp, extracts elements <4,6,0,5>, and selects
; per lane between that shuffle (where %mask compares ordered-equal to zero)
; and the pass-through value %vec2.
; The expected output builds the second vpermi2pd source with
; vinsertf128 $1 from (%rdi), permutes register-to-register with index
; vector [0,2,6,1], and only then applies the mask with a {%k1} vmovapd
; into ymm0.
4295 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4296 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
4298 ; CHECK-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm2
4299 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm3
4300 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,6,1]
4301 ; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm4
4302 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4303 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4304 ; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
4306 %vec = load <8 x double>, <8 x double>* %vp
4307 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4308 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4309 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4310 ret <4 x double> %res
; Zero-masked variant of the mem_mask6 case.
; Loads an <8 x double> from %vp, extracts elements <4,6,0,5>, and zeroes
; every lane where %mask does not compare ordered-equal to zero.
; The expected output builds the second vpermi2pd source with
; vinsertf128 $1 from (%rdi), computes %k1, and folds the select into a
; register-to-register vpermi2pd with {%k1} {z} (index vector [0,2,6,1]).
4313 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %mask) {
4314 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
4316 ; CHECK-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm2
4317 ; CHECK-NEXT: vmovapd 32(%rdi), %ymm3
4318 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,6,1]
4319 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
4320 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
4321 ; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm1 {%k1} {z}
4322 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4324 %vec = load <8 x double>, <8 x double>* %vp
4325 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
4326 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4327 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4328 ret <4 x double> %res
; Merge-masked variant of the mem_mask7 case.
; Loads an <8 x double> from %vp, extracts elements <0,5,2,5>, and selects
; per lane between that shuffle (where %mask compares ordered-equal to zero)
; and the pass-through value %vec2.
; The expected output keeps the natural operand order ((%rdi) in a register,
; 32(%rdi) as the memory operand), so the expected index vector [0,5,2,5]
; equals the IR mask. The mask is applied afterwards with a {%k1} vmovapd.
4331 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
4332 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask7:
4334 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4335 ; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,5,2,5]
4336 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
4337 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4338 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
4339 ; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
4341 %vec = load <8 x double>, <8 x double>* %vp
4342 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
4343 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4344 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
4345 ret <4 x double> %res
; Zero-masked variant of the mem_mask7 case.
; Loads an <8 x double> from %vp, extracts elements <0,5,2,5>, and zeroes
; every lane where %mask does not compare ordered-equal to zero.
; The expected output keeps (%rdi) in a register and 32(%rdi) as the memory
; operand (index vector [0,5,2,5], same as the IR mask) and folds the select
; into vpermi2pd via {%k1} {z}.
4348 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %mask) {
4349 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7:
4351 ; CHECK-NEXT: vmovapd (%rdi), %ymm2
4352 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,5,2,5]
4353 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4354 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
4355 ; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
4356 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
4358 %vec = load <8 x double>, <8 x double>* %vp
4359 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
4360 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
4361 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
4362 ret <4 x double> %res
; Unmasked baseline for the 8 x double -> 2 x double mem_mask0 case.
; Loads an <8 x double> from %vp and returns elements <1,6> as a
; <2 x double>.
; Expected lowering: a 128-bit load of (%rdi) followed by one vshufpd that
; takes lane 1 of that register and lane 0 of a memory operand.
; NOTE(review): the {{.*#+}} pattern swallows the instruction operands, so
; the assertions do not pin which address the memory operand reads.
4365 define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp) {
4366 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
4368 ; CHECK-NEXT: vmovapd (%rdi), %xmm0
4369 ; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],mem[0]
4371 %vec = load <8 x double>, <8 x double>* %vp
4372 %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4373 ret <2 x double> %res
; Merge-masked variant of the 8 x double -> 2 x double mem_mask0 case.
; Loads an <8 x double> from %vp, extracts elements <1,6>, and selects per
; lane between that shuffle (where %mask compares ordered-equal to zero) and
; the pass-through value %vec2.
; The expected output folds the select into the shuffle: vshufpd writes xmm0
; under {%k1}.
; NOTE(review): the {{.*#+}} pattern swallows the instruction operands, so
; the memory address is unchecked and these assertions are textually
; identical to those of the mem_mask1 variant (which selects <1,4>).
4375 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
4376 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
4378 ; CHECK-NEXT: vmovapd (%rdi), %xmm2
4379 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4380 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4381 ; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} = xmm2[1],mem[0]
4383 %vec = load <8 x double>, <8 x double>* %vp
4384 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4385 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4386 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4387 ret <2 x double> %res
; Zero-masked variant of the 8 x double -> 2 x double mem_mask0 case.
; Loads an <8 x double> from %vp, extracts elements <1,6>, and zeroes every
; lane where %mask does not compare ordered-equal to zero.
; The expected output folds the select into the shuffle: vshufpd writes xmm0
; under {%k1} {z}.
; NOTE(review): the {{.*#+}} pattern swallows the instruction operands, so
; the memory address is unchecked and these assertions are textually
; identical to those of the mem_mask1 variant (which selects <1,4>).
4390 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %mask) {
4391 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
4393 ; CHECK-NEXT: vmovapd (%rdi), %xmm1
4394 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4395 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
4396 ; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm1[1],mem[0]
4398 %vec = load <8 x double>, <8 x double>* %vp
4399 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
4400 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4401 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4402 ret <2 x double> %res
; Merge-masked variant of the 8 x double -> 2 x double mem_mask1 case.
; Loads an <8 x double> from %vp, extracts elements <1,4>, and selects per
; lane between that shuffle (where %mask compares ordered-equal to zero) and
; the pass-through value %vec2.
; The expected output folds the select into the shuffle: vshufpd writes xmm0
; under {%k1}.
; NOTE(review): the {{.*#+}} pattern swallows the instruction operands, so
; the memory address is unchecked and these assertions are textually
; identical to those of the mem_mask0 variant (which selects <1,6>).
4405 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
4406 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
4408 ; CHECK-NEXT: vmovapd (%rdi), %xmm2
4409 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
4410 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
4411 ; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} = xmm2[1],mem[0]
4413 %vec = load <8 x double>, <8 x double>* %vp
4414 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
4415 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4416 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
4417 ret <2 x double> %res
; Zero-masked variant of the 8 x double -> 2 x double mem_mask1 case.
; Loads an <8 x double> from %vp, extracts elements <1,4>, and zeroes every
; lane where %mask does not compare ordered-equal to zero.
; The expected output folds the select into the shuffle: vshufpd writes xmm0
; under {%k1} {z}.
; NOTE(review): the {{.*#+}} pattern swallows the instruction operands, so
; the memory address is unchecked and these assertions are textually
; identical to those of the mem_mask0 variant (which selects <1,6>).
4420 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %mask) {
4421 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
4423 ; CHECK-NEXT: vmovapd (%rdi), %xmm1
4424 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
4425 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
4426 ; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm1[1],mem[0]
4428 %vec = load <8 x double>, <8 x double>* %vp
4429 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
4430 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
4431 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
4432 ret <2 x double> %res
4436 define void @test_zext_v8i8_to_v8i16(<8 x i8>* %arg, <8 x i16>* %arg1) {
4437 ; CHECK-LABEL: test_zext_v8i8_to_v8i16:
4439 ; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
4440 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
4441 ; CHECK-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
4442 ; CHECK-NEXT: vmovdqa %xmm0, (%rsi)
4444 %tmp = getelementptr <8 x i8>, <8 x i8>* %arg, i32 0
4445 %tmp2 = load <8 x i8>, <8 x i8>* %tmp
4446 %tmp3 = extractelement <8 x i8> %tmp2, i32 0
4447 %tmp4 = zext i8 %tmp3 to i16
4448 %tmp5 = insertelement <8 x i16> undef, i16 %tmp4, i32 0
4449 %tmp6 = extractelement <8 x i8> %tmp2, i32 1
4450 %tmp7 = zext i8 %tmp6 to i16
4451 %tmp8 = insertelement <8 x i16> %tmp5, i16 %tmp7, i32 1
4452 %tmp9 = extractelement <8 x i8> %tmp2, i32 2
4453 %tmp10 = zext i8 %tmp9 to i16
4454 %tmp11 = insertelement <8 x i16> %tmp8, i16 %tmp10, i32 2
4455 %tmp12 = extractelement <8 x i8> %tmp2, i32 3
4456 %tmp13 = zext i8 %tmp12 to i16
4457 %tmp14 = insertelement <8 x i16> %tmp11, i16 %tmp13, i32 3
4458 %tmp15 = extractelement <8 x i8> %tmp2, i32 4
4459 %tmp16 = zext i8 %tmp15 to i16
4460 %tmp17 = insertelement <8 x i16> %tmp14, i16 %tmp16, i32 4
4461 %tmp18 = extractelement <8 x i8> %tmp2, i32 5
4462 %tmp19 = zext i8 %tmp18 to i16
4463 %tmp20 = insertelement <8 x i16> %tmp17, i16 %tmp19, i32 5
4464 %tmp21 = extractelement <8 x i8> %tmp2, i32 6
4465 %tmp22 = zext i8 %tmp21 to i16
4466 %tmp23 = insertelement <8 x i16> %tmp20, i16 %tmp22, i32 6
4467 %tmp24 = extractelement <8 x i8> %tmp2, i32 7
4468 %tmp25 = zext i8 %tmp24 to i16
4469 %tmp26 = insertelement <8 x i16> %tmp23, i16 %tmp25, i32 7
4470 %tmp27 = shl <8 x i16> %tmp26, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
4471 %tmp28 = getelementptr <8 x i16>, <8 x i16>* %arg1, i32 0
4472 store <8 x i16> %tmp27, <8 x i16>* %tmp28