1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-SKX
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-ICX,CHECK-ICX-NO-BYPASS-DELAY
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=icelake-server -mattr=-no-bypass-delay-shuffle | FileCheck %s --check-prefixes=CHECK,CHECK-ICX,CHECK-ICX-BYPASS-DELAY
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-V4
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4
; Unmasked register-register case, 512-bit: the shuffle takes floats 0-1 of
; each group of four from %a, then the same floats from %b. This function is
; checked under the shared CHECK prefix, so every RUN line expects a zmm
; vunpcklpd.
10 define <16 x float> @transform_VUNPCKLPDZrr(<16 x float> %a, <16 x float> %b) nounwind {
11 ; CHECK-LABEL: transform_VUNPCKLPDZrr:
13 ; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
15 %shufp = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 20, i32 21, i32 8, i32 9, i32 24, i32 25, i32 12, i32 13, i32 28, i32 29>
16 ret <16 x float> %shufp
; Unmasked register-register case, 512-bit: the shuffle takes floats 2-3 of
; each group of four from %a, then the same floats from %b. This function is
; checked under the shared CHECK prefix, so every RUN line expects a zmm
; vunpckhpd.
19 define <16 x float> @transform_VUNPCKHPDZrr(<16 x float> %a, <16 x float> %b) nounwind {
20 ; CHECK-LABEL: transform_VUNPCKHPDZrr:
22 ; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
24 %shufp = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 2, i32 3, i32 18, i32 19, i32 6, i32 7, i32 22, i32 23, i32 10, i32 11, i32 26, i32 27, i32 14, i32 15, i32 30, i32 31>
25 ret <16 x float> %shufp
; Unmasked register-register case, 256-bit: floats 0-1 of each group of four
; from %a, then from %b. Expected instruction depends on the check prefix:
;   SKX / V4 / AVX512 / ZNVER4 -> vunpcklpd
;   ICX-NO-BYPASS-DELAY        -> vpunpcklqdq
;   ICX-BYPASS-DELAY           -> vshufpd
28 define <8 x float> @transform_VUNPCKLPDYrr(<8 x float> %a, <8 x float> %b) nounwind {
29 ; CHECK-SKX-LABEL: transform_VUNPCKLPDYrr:
31 ; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
32 ; CHECK-SKX-NEXT: retq
34 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrr:
35 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
36 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
37 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
39 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrr:
40 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
41 ; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
42 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
44 ; CHECK-V4-LABEL: transform_VUNPCKLPDYrr:
46 ; CHECK-V4-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
49 ; CHECK-AVX512-LABEL: transform_VUNPCKLPDYrr:
50 ; CHECK-AVX512: # %bb.0:
51 ; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
52 ; CHECK-AVX512-NEXT: retq
54 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrr:
55 ; CHECK-ZNVER4: # %bb.0:
56 ; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
57 ; CHECK-ZNVER4-NEXT: retq
58 %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
59 ret <8 x float> %shufp
; Unmasked register-register case, 256-bit: floats 2-3 of each group of four
; from %a, then from %b. Expected instruction depends on the check prefix:
;   SKX / V4 / AVX512 / ZNVER4 -> vunpckhpd
;   ICX-NO-BYPASS-DELAY        -> vpunpckhqdq
;   ICX-BYPASS-DELAY           -> vshufpd
62 define <8 x float> @transform_VUNPCKHPDYrr(<8 x float> %a, <8 x float> %b) nounwind {
63 ; CHECK-SKX-LABEL: transform_VUNPCKHPDYrr:
65 ; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
66 ; CHECK-SKX-NEXT: retq
68 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrr:
69 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
70 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
71 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
73 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrr:
74 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
75 ; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
76 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
78 ; CHECK-V4-LABEL: transform_VUNPCKHPDYrr:
80 ; CHECK-V4-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
83 ; CHECK-AVX512-LABEL: transform_VUNPCKHPDYrr:
84 ; CHECK-AVX512: # %bb.0:
85 ; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
86 ; CHECK-AVX512-NEXT: retq
88 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrr:
89 ; CHECK-ZNVER4: # %bb.0:
90 ; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
91 ; CHECK-ZNVER4-NEXT: retq
92 %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
93 ret <8 x float> %shufp
; Unmasked register-register case, 128-bit: floats 0-1 of %a followed by
; floats 0-1 of %b. Expected instruction depends on the check prefix:
;   SKX / V4 / AVX512 / ZNVER4 -> vmovlhps
;   ICX-NO-BYPASS-DELAY        -> vpunpcklqdq
;   ICX-BYPASS-DELAY           -> vshufpd
96 define <4 x float> @transform_VUNPCKLPDrr(<4 x float> %a, <4 x float> %b) nounwind {
97 ; CHECK-SKX-LABEL: transform_VUNPCKLPDrr:
99 ; CHECK-SKX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
100 ; CHECK-SKX-NEXT: retq
102 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr:
103 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
104 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
105 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
107 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrr:
108 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
109 ; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
110 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
112 ; CHECK-V4-LABEL: transform_VUNPCKLPDrr:
114 ; CHECK-V4-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
115 ; CHECK-V4-NEXT: retq
117 ; CHECK-AVX512-LABEL: transform_VUNPCKLPDrr:
118 ; CHECK-AVX512: # %bb.0:
119 ; CHECK-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
120 ; CHECK-AVX512-NEXT: retq
122 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrr:
123 ; CHECK-ZNVER4: # %bb.0:
124 ; CHECK-ZNVER4-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
125 ; CHECK-ZNVER4-NEXT: retq
126 %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
127 ret <4 x float> %shufp
; Unmasked register-register case, 128-bit: floats 2-3 of %a followed by
; floats 2-3 of %b. Expected instruction depends on the check prefix:
;   SKX / V4 / AVX512 / ZNVER4 -> vunpckhpd
;   ICX-NO-BYPASS-DELAY        -> vpunpckhqdq
;   ICX-BYPASS-DELAY           -> vshufpd
130 define <4 x float> @transform_VUNPCKHPDrr(<4 x float> %a, <4 x float> %b) nounwind {
131 ; CHECK-SKX-LABEL: transform_VUNPCKHPDrr:
132 ; CHECK-SKX: # %bb.0:
133 ; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
134 ; CHECK-SKX-NEXT: retq
136 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr:
137 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
138 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
139 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
141 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrr:
142 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
143 ; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
144 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
146 ; CHECK-V4-LABEL: transform_VUNPCKHPDrr:
148 ; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
149 ; CHECK-V4-NEXT: retq
151 ; CHECK-AVX512-LABEL: transform_VUNPCKHPDrr:
152 ; CHECK-AVX512: # %bb.0:
153 ; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
154 ; CHECK-AVX512-NEXT: retq
156 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrr:
157 ; CHECK-ZNVER4: # %bb.0:
158 ; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
159 ; CHECK-ZNVER4-NEXT: retq
160 %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
161 ret <4 x float> %shufp
; Zero-masked register-register case, 512-bit: interleave the even-indexed
; doubles of %a and %b, then select against zeroinitializer using the i8
; argument bitcast to <8 x i1>. Shared CHECK prefix: every RUN line expects
; kmovd %edi, %k1 followed by a {%k1} {z} vunpcklpd on zmm.
164 define <8 x double> @transform_VUNPCKLPDZrrkz(<8 x double> %a, <8 x double> %b, i8 %mask_int) nounwind {
165 ; CHECK-LABEL: transform_VUNPCKLPDZrrkz:
167 ; CHECK-NEXT: kmovd %edi, %k1
168 ; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
170 %mask = bitcast i8 %mask_int to <8 x i1>
171 %shufp = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
172 %res = select <8 x i1> %mask, <8 x double> %shufp, <8 x double> zeroinitializer
173 ret <8 x double> %res
; Zero-masked register-register case, 512-bit: interleave the odd-indexed
; doubles of %a and %b, then select against zeroinitializer using the i8
; argument bitcast to <8 x i1>. Shared CHECK prefix: every RUN line expects
; kmovd %edi, %k1 followed by a {%k1} {z} vunpckhpd on zmm.
176 define <8 x double> @transform_VUNPCKHPDZrrkz(<8 x double> %a, <8 x double> %b, i8 %mask_int) nounwind {
177 ; CHECK-LABEL: transform_VUNPCKHPDZrrkz:
179 ; CHECK-NEXT: kmovd %edi, %k1
180 ; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
182 %mask = bitcast i8 %mask_int to <8 x i1>
183 %shufp = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
184 %res = select <8 x i1> %mask, <8 x double> %shufp, <8 x double> zeroinitializer
185 ret <8 x double> %res
; Zero-masked register-register case, 256-bit: interleave the even-indexed
; doubles of %a and %b, then select against zeroinitializer using the i4
; argument bitcast to <4 x i1>. Each prefix expects kmovd %edi, %k1 followed
; by a {%k1} {z} shuffle:
;   SKX / V4 / AVX512 / ZNVER4 -> vunpcklpd
;   ICX-NO-BYPASS-DELAY        -> vpunpcklqdq
;   ICX-BYPASS-DELAY           -> vshufpd
188 define <4 x double> @transform_VUNPCKLPDYrrkz(<4 x double> %a, <4 x double> %b, i4 %mask_int) nounwind {
189 ; CHECK-SKX-LABEL: transform_VUNPCKLPDYrrkz:
190 ; CHECK-SKX: # %bb.0:
191 ; CHECK-SKX-NEXT: kmovd %edi, %k1
192 ; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
193 ; CHECK-SKX-NEXT: retq
195 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrrkz:
196 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
197 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
198 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
199 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
201 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrrkz:
202 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
203 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
204 ; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
205 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
207 ; CHECK-V4-LABEL: transform_VUNPCKLPDYrrkz:
209 ; CHECK-V4-NEXT: kmovd %edi, %k1
210 ; CHECK-V4-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
211 ; CHECK-V4-NEXT: retq
213 ; CHECK-AVX512-LABEL: transform_VUNPCKLPDYrrkz:
214 ; CHECK-AVX512: # %bb.0:
215 ; CHECK-AVX512-NEXT: kmovd %edi, %k1
216 ; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
217 ; CHECK-AVX512-NEXT: retq
219 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrrkz:
220 ; CHECK-ZNVER4: # %bb.0:
221 ; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
222 ; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
223 ; CHECK-ZNVER4-NEXT: retq
224 %mask = bitcast i4 %mask_int to <4 x i1>
225 %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
226 %res = select <4 x i1> %mask, <4 x double> %shufp, <4 x double> zeroinitializer
227 ret <4 x double> %res
; Zero-masked register-register case, 256-bit: interleave the odd-indexed
; doubles of %a and %b, then select against zeroinitializer using the i4
; argument bitcast to <4 x i1>. Each prefix expects kmovd %edi, %k1 followed
; by a {%k1} {z} shuffle:
;   SKX / V4 / AVX512 / ZNVER4 -> vunpckhpd
;   ICX-NO-BYPASS-DELAY        -> vpunpckhqdq
;   ICX-BYPASS-DELAY           -> vshufpd
230 define <4 x double> @transform_VUNPCKHPDYrrkz(<4 x double> %a, <4 x double> %b, i4 %mask_int) nounwind {
231 ; CHECK-SKX-LABEL: transform_VUNPCKHPDYrrkz:
232 ; CHECK-SKX: # %bb.0:
233 ; CHECK-SKX-NEXT: kmovd %edi, %k1
234 ; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
235 ; CHECK-SKX-NEXT: retq
237 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrrkz:
238 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
239 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
240 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
241 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
243 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrrkz:
244 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
245 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
246 ; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
247 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
249 ; CHECK-V4-LABEL: transform_VUNPCKHPDYrrkz:
251 ; CHECK-V4-NEXT: kmovd %edi, %k1
252 ; CHECK-V4-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
253 ; CHECK-V4-NEXT: retq
255 ; CHECK-AVX512-LABEL: transform_VUNPCKHPDYrrkz:
256 ; CHECK-AVX512: # %bb.0:
257 ; CHECK-AVX512-NEXT: kmovd %edi, %k1
258 ; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
259 ; CHECK-AVX512-NEXT: retq
261 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrrkz:
262 ; CHECK-ZNVER4: # %bb.0:
263 ; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
264 ; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
265 ; CHECK-ZNVER4-NEXT: retq
266 %mask = bitcast i4 %mask_int to <4 x i1>
267 %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
268 %res = select <4 x i1> %mask, <4 x double> %shufp, <4 x double> zeroinitializer
269 ret <4 x double> %res
; Zero-masked register-register case, 128-bit: element 0 of %a followed by
; element 0 of %b, then select against zeroinitializer using the i2 argument
; bitcast to <2 x i1>. Each prefix expects kmovd %edi, %k1 followed by a
; {%k1} {z} shuffle:
;   SKX / V4 / AVX512 / ZNVER4 -> vunpcklpd
;   ICX-NO-BYPASS-DELAY        -> vpunpcklqdq
;   ICX-BYPASS-DELAY           -> vshufpd
272 define <2 x double> @transform_VUNPCKLPDrrkz(<2 x double> %a, <2 x double> %b, i2 %mask_int) nounwind {
273 ; CHECK-SKX-LABEL: transform_VUNPCKLPDrrkz:
274 ; CHECK-SKX: # %bb.0:
275 ; CHECK-SKX-NEXT: kmovd %edi, %k1
276 ; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
277 ; CHECK-SKX-NEXT: retq
279 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrrkz:
280 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
281 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
282 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
283 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
285 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrrkz:
286 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
287 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
288 ; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
289 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
291 ; CHECK-V4-LABEL: transform_VUNPCKLPDrrkz:
293 ; CHECK-V4-NEXT: kmovd %edi, %k1
294 ; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
295 ; CHECK-V4-NEXT: retq
297 ; CHECK-AVX512-LABEL: transform_VUNPCKLPDrrkz:
298 ; CHECK-AVX512: # %bb.0:
299 ; CHECK-AVX512-NEXT: kmovd %edi, %k1
300 ; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
301 ; CHECK-AVX512-NEXT: retq
303 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrrkz:
304 ; CHECK-ZNVER4: # %bb.0:
305 ; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
306 ; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
307 ; CHECK-ZNVER4-NEXT: retq
308 %mask = bitcast i2 %mask_int to <2 x i1>
309 %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
310 %res = select <2 x i1> %mask, <2 x double> %shufp, <2 x double> zeroinitializer
311 ret <2 x double> %res
; Zero-masked register-register case, 128-bit: element 1 of %a followed by
; element 1 of %b, then select against zeroinitializer using the i2 argument
; bitcast to <2 x i1>. Each prefix expects kmovd %edi, %k1 followed by a
; {%k1} {z} shuffle:
;   SKX / V4 / AVX512 / ZNVER4 -> vunpckhpd
;   ICX-NO-BYPASS-DELAY        -> vpunpckhqdq
;   ICX-BYPASS-DELAY           -> vshufpd
314 define <2 x double> @transform_VUNPCKHPDrrkz(<2 x double> %a, <2 x double> %b, i2 %mask_int) nounwind {
315 ; CHECK-SKX-LABEL: transform_VUNPCKHPDrrkz:
316 ; CHECK-SKX: # %bb.0:
317 ; CHECK-SKX-NEXT: kmovd %edi, %k1
318 ; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
319 ; CHECK-SKX-NEXT: retq
321 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrrkz:
322 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
323 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
324 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
325 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
327 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrrkz:
328 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
329 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
330 ; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
331 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
333 ; CHECK-V4-LABEL: transform_VUNPCKHPDrrkz:
335 ; CHECK-V4-NEXT: kmovd %edi, %k1
336 ; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
337 ; CHECK-V4-NEXT: retq
339 ; CHECK-AVX512-LABEL: transform_VUNPCKHPDrrkz:
340 ; CHECK-AVX512: # %bb.0:
341 ; CHECK-AVX512-NEXT: kmovd %edi, %k1
342 ; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
343 ; CHECK-AVX512-NEXT: retq
345 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrrkz:
346 ; CHECK-ZNVER4: # %bb.0:
347 ; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
348 ; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
349 ; CHECK-ZNVER4-NEXT: retq
350 %mask = bitcast i2 %mask_int to <2 x i1>
351 %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
352 %res = select <2 x i1> %mask, <2 x double> %shufp, <2 x double> zeroinitializer
353 ret <2 x double> %res
; Merge-masked register-register case, 512-bit: interleave the even-indexed
; doubles of %a and %b, then select between that result and %c using the i8
; argument bitcast to <8 x i1>. Shared CHECK prefix: every RUN line expects
; kmovd %edi, %k1, a {%k1} vunpcklpd writing zmm2, then vmovapd zmm2 -> zmm0.
356 define <8 x double> @transform_VUNPCKLPDZrrk(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask_int) nounwind {
357 ; CHECK-LABEL: transform_VUNPCKLPDZrrk:
359 ; CHECK-NEXT: kmovd %edi, %k1
360 ; CHECK-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
361 ; CHECK-NEXT: vmovapd %zmm2, %zmm0
363 %mask = bitcast i8 %mask_int to <8 x i1>
364 %shufp = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
365 %res = select <8 x i1> %mask, <8 x double> %shufp, <8 x double> %c
366 ret <8 x double> %res
; Merge-masked register-register case, 512-bit: interleave the odd-indexed
; doubles of %a and %b, then select between that result and %c using the i8
; argument bitcast to <8 x i1>. Shared CHECK prefix: every RUN line expects
; kmovd %edi, %k1, a {%k1} vunpckhpd writing zmm2, then vmovapd zmm2 -> zmm0.
369 define <8 x double> @transform_VUNPCKHPDZrrk(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask_int) nounwind {
370 ; CHECK-LABEL: transform_VUNPCKHPDZrrk:
372 ; CHECK-NEXT: kmovd %edi, %k1
373 ; CHECK-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
374 ; CHECK-NEXT: vmovapd %zmm2, %zmm0
376 %mask = bitcast i8 %mask_int to <8 x i1>
377 %shufp = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
378 %res = select <8 x i1> %mask, <8 x double> %shufp, <8 x double> %c
379 ret <8 x double> %res
; Merge-masked register-register case, 256-bit: interleave the even-indexed
; doubles of %a and %b, then select between that result and %c using the i4
; argument bitcast to <4 x i1>. Each prefix expects kmovd %edi, %k1, a {%k1}
; shuffle writing ymm2, then vmovapd ymm2 -> ymm0. The shuffle is:
;   SKX / V4 / AVX512 / ZNVER4 -> vunpcklpd
;   ICX-NO-BYPASS-DELAY        -> vpunpcklqdq
;   ICX-BYPASS-DELAY           -> vshufpd
382 define <4 x double> @transform_VUNPCKLPDYrrk(<4 x double> %a, <4 x double> %b, <4 x double> %c, i4 %mask_int) nounwind {
383 ; CHECK-SKX-LABEL: transform_VUNPCKLPDYrrk:
384 ; CHECK-SKX: # %bb.0:
385 ; CHECK-SKX-NEXT: kmovd %edi, %k1
386 ; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
387 ; CHECK-SKX-NEXT: vmovapd %ymm2, %ymm0
388 ; CHECK-SKX-NEXT: retq
390 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrrk:
391 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
392 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
393 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
394 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %ymm2, %ymm0
395 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
397 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrrk:
398 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
399 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
400 ; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
401 ; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %ymm2, %ymm0
402 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
404 ; CHECK-V4-LABEL: transform_VUNPCKLPDYrrk:
406 ; CHECK-V4-NEXT: kmovd %edi, %k1
407 ; CHECK-V4-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
408 ; CHECK-V4-NEXT: vmovapd %ymm2, %ymm0
409 ; CHECK-V4-NEXT: retq
411 ; CHECK-AVX512-LABEL: transform_VUNPCKLPDYrrk:
412 ; CHECK-AVX512: # %bb.0:
413 ; CHECK-AVX512-NEXT: kmovd %edi, %k1
414 ; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
415 ; CHECK-AVX512-NEXT: vmovapd %ymm2, %ymm0
416 ; CHECK-AVX512-NEXT: retq
418 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrrk:
419 ; CHECK-ZNVER4: # %bb.0:
420 ; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
421 ; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
422 ; CHECK-ZNVER4-NEXT: vmovapd %ymm2, %ymm0
423 ; CHECK-ZNVER4-NEXT: retq
424 %mask = bitcast i4 %mask_int to <4 x i1>
425 %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
426 %res = select <4 x i1> %mask, <4 x double> %shufp, <4 x double> %c
427 ret <4 x double> %res
; Merge-masked register-register case, 256-bit: interleave the odd-indexed
; doubles of %a and %b, then select between that result and %c using the i4
; argument bitcast to <4 x i1>. Each prefix expects kmovd %edi, %k1, a {%k1}
; shuffle writing ymm2, then vmovapd ymm2 -> ymm0. The shuffle is:
;   SKX / V4 / AVX512 / ZNVER4 -> vunpckhpd
;   ICX-NO-BYPASS-DELAY        -> vpunpckhqdq
;   ICX-BYPASS-DELAY           -> vshufpd
430 define <4 x double> @transform_VUNPCKHPDYrrk(<4 x double> %a, <4 x double> %b, <4 x double> %c, i4 %mask_int) nounwind {
431 ; CHECK-SKX-LABEL: transform_VUNPCKHPDYrrk:
432 ; CHECK-SKX: # %bb.0:
433 ; CHECK-SKX-NEXT: kmovd %edi, %k1
434 ; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
435 ; CHECK-SKX-NEXT: vmovapd %ymm2, %ymm0
436 ; CHECK-SKX-NEXT: retq
438 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrrk:
439 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
440 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
441 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
442 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %ymm2, %ymm0
443 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
445 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrrk:
446 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
447 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
448 ; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
449 ; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %ymm2, %ymm0
450 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
452 ; CHECK-V4-LABEL: transform_VUNPCKHPDYrrk:
454 ; CHECK-V4-NEXT: kmovd %edi, %k1
455 ; CHECK-V4-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
456 ; CHECK-V4-NEXT: vmovapd %ymm2, %ymm0
457 ; CHECK-V4-NEXT: retq
459 ; CHECK-AVX512-LABEL: transform_VUNPCKHPDYrrk:
460 ; CHECK-AVX512: # %bb.0:
461 ; CHECK-AVX512-NEXT: kmovd %edi, %k1
462 ; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
463 ; CHECK-AVX512-NEXT: vmovapd %ymm2, %ymm0
464 ; CHECK-AVX512-NEXT: retq
466 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrrk:
467 ; CHECK-ZNVER4: # %bb.0:
468 ; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
469 ; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
470 ; CHECK-ZNVER4-NEXT: vmovapd %ymm2, %ymm0
471 ; CHECK-ZNVER4-NEXT: retq
472 %mask = bitcast i4 %mask_int to <4 x i1>
473 %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
474 %res = select <4 x i1> %mask, <4 x double> %shufp, <4 x double> %c
475 ret <4 x double> %res
; Merge-masked register-register case, 128-bit: element 0 of %a followed by
; element 0 of %b, then select between that result and %c using the i2
; argument bitcast to <2 x i1>. Each prefix expects kmovd %edi, %k1, a {%k1}
; shuffle writing xmm2, then vmovapd xmm2 -> xmm0. The shuffle is:
;   SKX / V4 / AVX512 / ZNVER4 -> vunpcklpd
;   ICX-NO-BYPASS-DELAY        -> vpunpcklqdq
;   ICX-BYPASS-DELAY           -> vshufpd
478 define <2 x double> @transform_VUNPCKLPDrrk(<2 x double> %a, <2 x double> %b, <2 x double> %c, i2 %mask_int) nounwind {
479 ; CHECK-SKX-LABEL: transform_VUNPCKLPDrrk:
480 ; CHECK-SKX: # %bb.0:
481 ; CHECK-SKX-NEXT: kmovd %edi, %k1
482 ; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
483 ; CHECK-SKX-NEXT: vmovapd %xmm2, %xmm0
484 ; CHECK-SKX-NEXT: retq
486 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrrk:
487 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
488 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
489 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
490 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %xmm2, %xmm0
491 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
493 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrrk:
494 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
495 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
496 ; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
497 ; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %xmm2, %xmm0
498 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
500 ; CHECK-V4-LABEL: transform_VUNPCKLPDrrk:
502 ; CHECK-V4-NEXT: kmovd %edi, %k1
503 ; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
504 ; CHECK-V4-NEXT: vmovapd %xmm2, %xmm0
505 ; CHECK-V4-NEXT: retq
507 ; CHECK-AVX512-LABEL: transform_VUNPCKLPDrrk:
508 ; CHECK-AVX512: # %bb.0:
509 ; CHECK-AVX512-NEXT: kmovd %edi, %k1
510 ; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
511 ; CHECK-AVX512-NEXT: vmovapd %xmm2, %xmm0
512 ; CHECK-AVX512-NEXT: retq
514 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrrk:
515 ; CHECK-ZNVER4: # %bb.0:
516 ; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
517 ; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
518 ; CHECK-ZNVER4-NEXT: vmovapd %xmm2, %xmm0
519 ; CHECK-ZNVER4-NEXT: retq
520 %mask = bitcast i2 %mask_int to <2 x i1>
521 %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
522 %res = select <2 x i1> %mask, <2 x double> %shufp, <2 x double> %c
523 ret <2 x double> %res
; Merge-masked register-register case, 128-bit: element 1 of %a followed by
; element 1 of %b, then select between that result and %c using the i2
; argument bitcast to <2 x i1>. Each prefix expects kmovd %edi, %k1, a {%k1}
; shuffle writing xmm2, then vmovapd xmm2 -> xmm0. The shuffle is:
;   SKX / V4 / AVX512 / ZNVER4 -> vunpckhpd
;   ICX-NO-BYPASS-DELAY        -> vpunpckhqdq
;   ICX-BYPASS-DELAY           -> vshufpd
526 define <2 x double> @transform_VUNPCKHPDrrk(<2 x double> %a, <2 x double> %b, <2 x double> %c, i2 %mask_int) nounwind {
527 ; CHECK-SKX-LABEL: transform_VUNPCKHPDrrk:
528 ; CHECK-SKX: # %bb.0:
529 ; CHECK-SKX-NEXT: kmovd %edi, %k1
530 ; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
531 ; CHECK-SKX-NEXT: vmovapd %xmm2, %xmm0
532 ; CHECK-SKX-NEXT: retq
534 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrrk:
535 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
536 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
537 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
538 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %xmm2, %xmm0
539 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
541 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrrk:
542 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
543 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
544 ; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
545 ; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %xmm2, %xmm0
546 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
548 ; CHECK-V4-LABEL: transform_VUNPCKHPDrrk:
550 ; CHECK-V4-NEXT: kmovd %edi, %k1
551 ; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
552 ; CHECK-V4-NEXT: vmovapd %xmm2, %xmm0
553 ; CHECK-V4-NEXT: retq
555 ; CHECK-AVX512-LABEL: transform_VUNPCKHPDrrk:
556 ; CHECK-AVX512: # %bb.0:
557 ; CHECK-AVX512-NEXT: kmovd %edi, %k1
558 ; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
559 ; CHECK-AVX512-NEXT: vmovapd %xmm2, %xmm0
560 ; CHECK-AVX512-NEXT: retq
562 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrrk:
563 ; CHECK-ZNVER4: # %bb.0:
564 ; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
565 ; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
566 ; CHECK-ZNVER4-NEXT: vmovapd %xmm2, %xmm0
567 ; CHECK-ZNVER4-NEXT: retq
568 %mask = bitcast i2 %mask_int to <2 x i1>
569 %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
570 %res = select <2 x i1> %mask, <2 x double> %shufp, <2 x double> %c
571 ret <2 x double> %res
; Unmasked register-memory case, 512-bit: the second shuffle operand is loaded
; from %pb; the mask takes floats 0-1 of each group of four from %a, then from
; the loaded vector. Shared CHECK prefix: every RUN line expects a zmm
; vunpcklpd with a memory operand (mem[...]).
574 define <16 x float> @transform_VUNPCKLPDZrm(<16 x float> %a, ptr %pb) nounwind {
575 ; CHECK-LABEL: transform_VUNPCKLPDZrm:
577 ; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
579 %b = load <16 x float>, ptr %pb
580 %shufp = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 20, i32 21, i32 8, i32 9, i32 24, i32 25, i32 12, i32 13, i32 28, i32 29>
581 ret <16 x float> %shufp
; Unmasked register-memory case, 512-bit: the second shuffle operand is loaded
; from %pb; the mask takes floats 2-3 of each group of four from %a, then from
; the loaded vector. Shared CHECK prefix: every RUN line expects a zmm
; vunpckhpd with a memory operand (mem[...]).
584 define <16 x float> @transform_VUNPCKHPDZrm(<16 x float> %a, ptr %pb) nounwind {
585 ; CHECK-LABEL: transform_VUNPCKHPDZrm:
587 ; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
589 %b = load <16 x float>, ptr %pb
590 %shufp = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 2, i32 3, i32 18, i32 19, i32 6, i32 7, i32 22, i32 23, i32 10, i32 11, i32 26, i32 27, i32 14, i32 15, i32 30, i32 31>
591 ret <16 x float> %shufp
; Unmasked register-memory case, 256-bit: the second shuffle operand is loaded
; from %pb; the mask takes floats 0-1 of each group of four from %a, then from
; the loaded vector. Expected memory-operand instruction per prefix:
;   SKX / V4 / AVX512 / ZNVER4 / ICX-BYPASS-DELAY -> vunpcklpd
;   ICX-NO-BYPASS-DELAY                           -> vpunpcklqdq
594 define <8 x float> @transform_VUNPCKLPDYrm(<8 x float> %a, ptr %pb) nounwind {
595 ; CHECK-SKX-LABEL: transform_VUNPCKLPDYrm:
596 ; CHECK-SKX: # %bb.0:
597 ; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
598 ; CHECK-SKX-NEXT: retq
600 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrm:
601 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
602 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
603 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
605 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrm:
606 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
607 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
608 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
610 ; CHECK-V4-LABEL: transform_VUNPCKLPDYrm:
612 ; CHECK-V4-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
613 ; CHECK-V4-NEXT: retq
615 ; CHECK-AVX512-LABEL: transform_VUNPCKLPDYrm:
616 ; CHECK-AVX512: # %bb.0:
617 ; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
618 ; CHECK-AVX512-NEXT: retq
620 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrm:
621 ; CHECK-ZNVER4: # %bb.0:
622 ; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
623 ; CHECK-ZNVER4-NEXT: retq
624 %b = load <8 x float>, ptr %pb
625 %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
626 ret <8 x float> %shufp
; Unmasked register-memory case, 256-bit: the second shuffle operand is loaded
; from %pb; the mask takes floats 2-3 of each group of four from %a, then from
; the loaded vector. Expected memory-operand instruction per prefix:
;   SKX / V4 / AVX512 / ZNVER4 / ICX-BYPASS-DELAY -> vunpckhpd
;   ICX-NO-BYPASS-DELAY                           -> vpunpckhqdq
629 define <8 x float> @transform_VUNPCKHPDYrm(<8 x float> %a, ptr %pb) nounwind {
630 ; CHECK-SKX-LABEL: transform_VUNPCKHPDYrm:
631 ; CHECK-SKX: # %bb.0:
632 ; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
633 ; CHECK-SKX-NEXT: retq
635 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrm:
636 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
637 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
638 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
640 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrm:
641 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
642 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
643 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
645 ; CHECK-V4-LABEL: transform_VUNPCKHPDYrm:
647 ; CHECK-V4-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
648 ; CHECK-V4-NEXT: retq
650 ; CHECK-AVX512-LABEL: transform_VUNPCKHPDYrm:
651 ; CHECK-AVX512: # %bb.0:
652 ; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
653 ; CHECK-AVX512-NEXT: retq
655 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrm:
656 ; CHECK-ZNVER4: # %bb.0:
657 ; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
658 ; CHECK-ZNVER4-NEXT: retq
659 %b = load <8 x float>, ptr %pb
660 %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
661 ret <8 x float> %shufp
; Unmasked register-memory case, 128-bit: the second shuffle operand is loaded
; from %pb; the mask takes floats 0-1 of %a followed by floats 0-1 of the
; loaded vector. Expected memory-operand instruction per prefix:
;   SKX / V4 / AVX512 / ZNVER4 / ICX-BYPASS-DELAY -> vunpcklpd
;   ICX-NO-BYPASS-DELAY                           -> vpunpcklqdq
664 define <4 x float> @transform_VUNPCKLPDrm(<4 x float> %a, ptr %pb) nounwind {
665 ; CHECK-SKX-LABEL: transform_VUNPCKLPDrm:
666 ; CHECK-SKX: # %bb.0:
667 ; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
668 ; CHECK-SKX-NEXT: retq
670 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
671 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
672 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
673 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
675 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm:
676 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
677 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
678 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
680 ; CHECK-V4-LABEL: transform_VUNPCKLPDrm:
682 ; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
683 ; CHECK-V4-NEXT: retq
685 ; CHECK-AVX512-LABEL: transform_VUNPCKLPDrm:
686 ; CHECK-AVX512: # %bb.0:
687 ; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
688 ; CHECK-AVX512-NEXT: retq
690 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrm:
691 ; CHECK-ZNVER4: # %bb.0:
692 ; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
693 ; CHECK-ZNVER4-NEXT: retq
694 %b = load <4 x float>, ptr %pb
695 %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
696 ret <4 x float> %shufp
; Unmasked register-memory case, 128-bit: the second shuffle operand is loaded
; from %pb; the mask takes floats 2-3 of %a followed by floats 2-3 of the
; loaded vector. Expected memory-operand instruction per prefix:
;   SKX / V4 / AVX512 / ZNVER4 / ICX-BYPASS-DELAY -> vunpckhpd
;   ICX-NO-BYPASS-DELAY                           -> vpunpckhqdq
699 define <4 x float> @transform_VUNPCKHPDrm(<4 x float> %a, ptr %pb) nounwind {
700 ; CHECK-SKX-LABEL: transform_VUNPCKHPDrm:
701 ; CHECK-SKX: # %bb.0:
702 ; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
703 ; CHECK-SKX-NEXT: retq
705 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
706 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
707 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1]
708 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
710 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm:
711 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
712 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
713 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
715 ; CHECK-V4-LABEL: transform_VUNPCKHPDrm:
717 ; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
718 ; CHECK-V4-NEXT: retq
720 ; CHECK-AVX512-LABEL: transform_VUNPCKHPDrm:
721 ; CHECK-AVX512: # %bb.0:
722 ; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
723 ; CHECK-AVX512-NEXT: retq
725 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrm:
726 ; CHECK-ZNVER4: # %bb.0:
727 ; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
728 ; CHECK-ZNVER4-NEXT: retq
729 %b = load <4 x float>, ptr %pb
730 %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
731 ret <4 x float> %shufp
; Zero-masked register-memory case, 512-bit: the second operand is loaded from
; %pb, the even-indexed doubles of %a and the loaded vector are interleaved,
; and the result is selected against zeroinitializer using the i8 argument
; bitcast to <8 x i1>. Shared CHECK prefix: every RUN line expects
; kmovd %esi, %k1 followed by a {%k1} {z} vunpcklpd with a memory operand.
734 define <8 x double> @transform_VUNPCKLPDZrmkz(<8 x double> %a, ptr %pb, i8 %mask_int) nounwind {
735 ; CHECK-LABEL: transform_VUNPCKLPDZrmkz:
737 ; CHECK-NEXT: kmovd %esi, %k1
738 ; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
740 %mask = bitcast i8 %mask_int to <8 x i1>
741 %b = load <8 x double>, ptr %pb
742 %shufp = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
743 %res = select <8 x i1> %mask, <8 x double> %shufp, <8 x double> zeroinitializer
744 ret <8 x double> %res
; 512-bit, zero-masked, memory-operand case for the high unpack. %mask_int is
; bitcast to <8 x i1> and selects per element between the shuffle result and
; zeroinitializer. The shuffle interleaves the odd-indexed doubles of %a with
; the odd-indexed doubles of %b, which is loaded from %pb. The expectations
; are shared by all run lines through the common prefix, namely a kmovd of
; %esi into %k1 and a {%k1} {z} vunpckhpd on zmm0 with a mem operand.
747 define <8 x double> @transform_VUNPCKHPDZrmkz(<8 x double> %a, ptr %pb, i8 %mask_int) nounwind {
748 ; CHECK-LABEL: transform_VUNPCKHPDZrmkz:
750 ; CHECK-NEXT: kmovd %esi, %k1
751 ; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
753 %mask = bitcast i8 %mask_int to <8 x i1>
754 %b = load <8 x double>, ptr %pb
755 %shufp = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
756 %res = select <8 x i1> %mask, <8 x double> %shufp, <8 x double> zeroinitializer
757 ret <8 x double> %res
; 256-bit, zero-masked, memory-operand case. The i4 %mask_int is bitcast to
; <4 x i1> and selects per element between the shuffle result and
; zeroinitializer. The shuffle mask <0, 4, 2, 6> interleaves the even-indexed
; doubles of %a with those of %b, which is loaded from %pb. Each configuration
; expects a kmovd of %esi into %k1 followed by a {%k1} {z} unpack on ymm0 with
; a mem operand. The unpack is vunpcklpd everywhere except in the
; ICX-NO-BYPASS-DELAY checks, which expect vpunpcklqdq.
760 define <4 x double> @transform_VUNPCKLPDYrmkz(<4 x double> %a, ptr %pb, i4 %mask_int) nounwind {
761 ; CHECK-SKX-LABEL: transform_VUNPCKLPDYrmkz:
762 ; CHECK-SKX: # %bb.0:
763 ; CHECK-SKX-NEXT: kmovd %esi, %k1
764 ; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
765 ; CHECK-SKX-NEXT: retq
767 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrmkz:
768 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
769 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
770 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
771 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
773 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrmkz:
774 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
775 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
776 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
777 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
779 ; CHECK-V4-LABEL: transform_VUNPCKLPDYrmkz:
781 ; CHECK-V4-NEXT: kmovd %esi, %k1
782 ; CHECK-V4-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
783 ; CHECK-V4-NEXT: retq
785 ; CHECK-AVX512-LABEL: transform_VUNPCKLPDYrmkz:
786 ; CHECK-AVX512: # %bb.0:
787 ; CHECK-AVX512-NEXT: kmovd %esi, %k1
788 ; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
789 ; CHECK-AVX512-NEXT: retq
791 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrmkz:
792 ; CHECK-ZNVER4: # %bb.0:
793 ; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
794 ; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
795 ; CHECK-ZNVER4-NEXT: retq
796 %mask = bitcast i4 %mask_int to <4 x i1>
797 %b = load <4 x double>, ptr %pb
798 %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
799 %res = select <4 x i1> %mask, <4 x double> %shufp, <4 x double> zeroinitializer
800 ret <4 x double> %res
; 256-bit, zero-masked, memory-operand case for the high unpack. The i4
; %mask_int is bitcast to <4 x i1> and selects per element between the shuffle
; result and zeroinitializer. The shuffle mask <1, 5, 3, 7> interleaves the
; odd-indexed doubles of %a with those of %b, which is loaded from %pb. Each
; configuration expects a kmovd of %esi into %k1 followed by a {%k1} {z}
; unpack on ymm0 with a mem operand. The unpack is vunpckhpd everywhere except
; in the ICX-NO-BYPASS-DELAY checks, which expect vpunpckhqdq.
803 define <4 x double> @transform_VUNPCKHPDYrmkz(<4 x double> %a, ptr %pb, i4 %mask_int) nounwind {
804 ; CHECK-SKX-LABEL: transform_VUNPCKHPDYrmkz:
805 ; CHECK-SKX: # %bb.0:
806 ; CHECK-SKX-NEXT: kmovd %esi, %k1
807 ; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
808 ; CHECK-SKX-NEXT: retq
810 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrmkz:
811 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
812 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
813 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
814 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
816 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrmkz:
817 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
818 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
819 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
820 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
822 ; CHECK-V4-LABEL: transform_VUNPCKHPDYrmkz:
824 ; CHECK-V4-NEXT: kmovd %esi, %k1
825 ; CHECK-V4-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
826 ; CHECK-V4-NEXT: retq
828 ; CHECK-AVX512-LABEL: transform_VUNPCKHPDYrmkz:
829 ; CHECK-AVX512: # %bb.0:
830 ; CHECK-AVX512-NEXT: kmovd %esi, %k1
831 ; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
832 ; CHECK-AVX512-NEXT: retq
834 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrmkz:
835 ; CHECK-ZNVER4: # %bb.0:
836 ; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
837 ; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
838 ; CHECK-ZNVER4-NEXT: retq
839 %mask = bitcast i4 %mask_int to <4 x i1>
840 %b = load <4 x double>, ptr %pb
841 %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
842 %res = select <4 x i1> %mask, <4 x double> %shufp, <4 x double> zeroinitializer
843 ret <4 x double> %res
; 128-bit, zero-masked, memory-operand case. The i2 %mask_int is bitcast to
; <2 x i1> and selects per element between the shuffle result and
; zeroinitializer. The shuffle mask <0, 2> pairs element 0 of %a with element
; 0 of %b, which is loaded from %pb. Each configuration expects a kmovd of
; %esi into %k1 followed by a {%k1} {z} unpack on xmm0 with a mem operand. The
; unpack is vunpcklpd everywhere except in the ICX-NO-BYPASS-DELAY checks,
; which expect vpunpcklqdq.
846 define <2 x double> @transform_VUNPCKLPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_int) nounwind {
847 ; CHECK-SKX-LABEL: transform_VUNPCKLPDrmkz:
848 ; CHECK-SKX: # %bb.0:
849 ; CHECK-SKX-NEXT: kmovd %esi, %k1
850 ; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
851 ; CHECK-SKX-NEXT: retq
853 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmkz:
854 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
855 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
856 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
857 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
859 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmkz:
860 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
861 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
862 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
863 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
865 ; CHECK-V4-LABEL: transform_VUNPCKLPDrmkz:
867 ; CHECK-V4-NEXT: kmovd %esi, %k1
868 ; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
869 ; CHECK-V4-NEXT: retq
871 ; CHECK-AVX512-LABEL: transform_VUNPCKLPDrmkz:
872 ; CHECK-AVX512: # %bb.0:
873 ; CHECK-AVX512-NEXT: kmovd %esi, %k1
874 ; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
875 ; CHECK-AVX512-NEXT: retq
877 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrmkz:
878 ; CHECK-ZNVER4: # %bb.0:
879 ; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
880 ; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
881 ; CHECK-ZNVER4-NEXT: retq
882 %mask = bitcast i2 %mask_int to <2 x i1>
883 %b = load <2 x double>, ptr %pb
884 %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
885 %res = select <2 x i1> %mask, <2 x double> %shufp, <2 x double> zeroinitializer
886 ret <2 x double> %res
; 128-bit, zero-masked, memory-operand case for the high unpack. The i2
; %mask_int is bitcast to <2 x i1> and selects per element between the shuffle
; result and zeroinitializer. The shuffle mask <1, 3> pairs element 1 of %a
; with element 1 of %b, which is loaded from %pb. Each configuration expects a
; kmovd of %esi into %k1 followed by a {%k1} {z} unpack on xmm0 with a mem
; operand. The unpack is vunpckhpd everywhere except in the
; ICX-NO-BYPASS-DELAY checks, which expect vpunpckhqdq.
889 define <2 x double> @transform_VUNPCKHPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_int) nounwind {
890 ; CHECK-SKX-LABEL: transform_VUNPCKHPDrmkz:
891 ; CHECK-SKX: # %bb.0:
892 ; CHECK-SKX-NEXT: kmovd %esi, %k1
893 ; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
894 ; CHECK-SKX-NEXT: retq
896 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmkz:
897 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
898 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
899 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
900 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
902 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmkz:
903 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
904 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
905 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
906 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
908 ; CHECK-V4-LABEL: transform_VUNPCKHPDrmkz:
910 ; CHECK-V4-NEXT: kmovd %esi, %k1
911 ; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
912 ; CHECK-V4-NEXT: retq
914 ; CHECK-AVX512-LABEL: transform_VUNPCKHPDrmkz:
915 ; CHECK-AVX512: # %bb.0:
916 ; CHECK-AVX512-NEXT: kmovd %esi, %k1
917 ; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
918 ; CHECK-AVX512-NEXT: retq
920 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrmkz:
921 ; CHECK-ZNVER4: # %bb.0:
922 ; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
923 ; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
924 ; CHECK-ZNVER4-NEXT: retq
925 %mask = bitcast i2 %mask_int to <2 x i1>
926 %b = load <2 x double>, ptr %pb
927 %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
928 %res = select <2 x i1> %mask, <2 x double> %shufp, <2 x double> %c
929 ret <2 x double> %res
; 512-bit, merge-masked, memory-operand case. %mask_int is bitcast to
; <8 x i1> and selects per element between the shuffle result and the
; passthrough vector %c. The shuffle interleaves the even-indexed doubles of
; %a with those of %b, which is loaded from %pb. The expectations are shared
; by all run lines through the common prefix, namely a kmovd of %esi into %k1,
; a {%k1} vunpcklpd that writes zmm1 from zmm0 and a mem operand, then a
; vmovapd of zmm1 into zmm0.
932 define <8 x double> @transform_VUNPCKLPDZrmk(<8 x double> %a, ptr %pb, <8 x double> %c, i8 %mask_int) nounwind {
933 ; CHECK-LABEL: transform_VUNPCKLPDZrmk:
935 ; CHECK-NEXT: kmovd %esi, %k1
936 ; CHECK-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
937 ; CHECK-NEXT: vmovapd %zmm1, %zmm0
939 %mask = bitcast i8 %mask_int to <8 x i1>
940 %b = load <8 x double>, ptr %pb
941 %shufp = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
942 %res = select <8 x i1> %mask, <8 x double> %shufp, <8 x double> %c
943 ret <8 x double> %res
; 512-bit, merge-masked, memory-operand case for the high unpack. %mask_int is
; bitcast to <8 x i1> and selects per element between the shuffle result and
; the passthrough vector %c. The shuffle interleaves the odd-indexed doubles
; of %a with those of %b, which is loaded from %pb. The expectations are
; shared by all run lines through the common prefix, namely a kmovd of %esi
; into %k1, a {%k1} vunpckhpd that writes zmm1 from zmm0 and a mem operand,
; then a vmovapd of zmm1 into zmm0.
946 define <8 x double> @transform_VUNPCKHPDZrmk(<8 x double> %a, ptr %pb, <8 x double> %c, i8 %mask_int) nounwind {
947 ; CHECK-LABEL: transform_VUNPCKHPDZrmk:
949 ; CHECK-NEXT: kmovd %esi, %k1
950 ; CHECK-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
951 ; CHECK-NEXT: vmovapd %zmm1, %zmm0
953 %mask = bitcast i8 %mask_int to <8 x i1>
954 %b = load <8 x double>, ptr %pb
955 %shufp = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
956 %res = select <8 x i1> %mask, <8 x double> %shufp, <8 x double> %c
957 ret <8 x double> %res
; 256-bit, merge-masked, memory-operand case. The i4 %mask_int is bitcast to
; <4 x i1> and selects per element between the shuffle result and the
; passthrough vector %c. The shuffle mask <0, 4, 2, 6> interleaves the
; even-indexed doubles of %a with those of %b, which is loaded from %pb. Each
; configuration expects a kmovd of %esi into %k1, a {%k1} unpack that writes
; ymm1 from ymm0 and a mem operand, then a vmovapd of ymm1 into ymm0. The
; unpack is vunpcklpd everywhere except in the ICX-NO-BYPASS-DELAY checks,
; which expect vpunpcklqdq.
960 define <4 x double> @transform_VUNPCKLPDYrmk(<4 x double> %a, ptr %pb, <4 x double> %c, i4 %mask_int) nounwind {
961 ; CHECK-SKX-LABEL: transform_VUNPCKLPDYrmk:
962 ; CHECK-SKX: # %bb.0:
963 ; CHECK-SKX-NEXT: kmovd %esi, %k1
964 ; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
965 ; CHECK-SKX-NEXT: vmovapd %ymm1, %ymm0
966 ; CHECK-SKX-NEXT: retq
968 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrmk:
969 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
970 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
971 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
972 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %ymm1, %ymm0
973 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
975 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDYrmk:
976 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
977 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
978 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
979 ; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %ymm1, %ymm0
980 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
982 ; CHECK-V4-LABEL: transform_VUNPCKLPDYrmk:
984 ; CHECK-V4-NEXT: kmovd %esi, %k1
985 ; CHECK-V4-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
986 ; CHECK-V4-NEXT: vmovapd %ymm1, %ymm0
987 ; CHECK-V4-NEXT: retq
989 ; CHECK-AVX512-LABEL: transform_VUNPCKLPDYrmk:
990 ; CHECK-AVX512: # %bb.0:
991 ; CHECK-AVX512-NEXT: kmovd %esi, %k1
992 ; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
993 ; CHECK-AVX512-NEXT: vmovapd %ymm1, %ymm0
994 ; CHECK-AVX512-NEXT: retq
996 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDYrmk:
997 ; CHECK-ZNVER4: # %bb.0:
998 ; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
999 ; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
1000 ; CHECK-ZNVER4-NEXT: vmovapd %ymm1, %ymm0
1001 ; CHECK-ZNVER4-NEXT: retq
1002 %mask = bitcast i4 %mask_int to <4 x i1>
1003 %b = load <4 x double>, ptr %pb
1004 %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
1005 %res = select <4 x i1> %mask, <4 x double> %shufp, <4 x double> %c
1006 ret <4 x double> %res
; 256-bit, merge-masked, memory-operand case for the high unpack. The i4
; %mask_int is bitcast to <4 x i1> and selects per element between the shuffle
; result and the passthrough vector %c. The shuffle mask <1, 5, 3, 7>
; interleaves the odd-indexed doubles of %a with those of %b, which is loaded
; from %pb. Each configuration expects a kmovd of %esi into %k1, a {%k1}
; unpack that writes ymm1 from ymm0 and a mem operand, then a vmovapd of ymm1
; into ymm0. The unpack is vunpckhpd everywhere except in the
; ICX-NO-BYPASS-DELAY checks, which expect vpunpckhqdq.
1009 define <4 x double> @transform_VUNPCKHPDYrmk(<4 x double> %a, ptr %pb, <4 x double> %c, i4 %mask_int) nounwind {
1010 ; CHECK-SKX-LABEL: transform_VUNPCKHPDYrmk:
1011 ; CHECK-SKX: # %bb.0:
1012 ; CHECK-SKX-NEXT: kmovd %esi, %k1
1013 ; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
1014 ; CHECK-SKX-NEXT: vmovapd %ymm1, %ymm0
1015 ; CHECK-SKX-NEXT: retq
1017 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrmk:
1018 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
1019 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
1020 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
1021 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %ymm1, %ymm0
1022 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
1024 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDYrmk:
1025 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
1026 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
1027 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
1028 ; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %ymm1, %ymm0
1029 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
1031 ; CHECK-V4-LABEL: transform_VUNPCKHPDYrmk:
1032 ; CHECK-V4: # %bb.0:
1033 ; CHECK-V4-NEXT: kmovd %esi, %k1
1034 ; CHECK-V4-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
1035 ; CHECK-V4-NEXT: vmovapd %ymm1, %ymm0
1036 ; CHECK-V4-NEXT: retq
1038 ; CHECK-AVX512-LABEL: transform_VUNPCKHPDYrmk:
1039 ; CHECK-AVX512: # %bb.0:
1040 ; CHECK-AVX512-NEXT: kmovd %esi, %k1
1041 ; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
1042 ; CHECK-AVX512-NEXT: vmovapd %ymm1, %ymm0
1043 ; CHECK-AVX512-NEXT: retq
1045 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDYrmk:
1046 ; CHECK-ZNVER4: # %bb.0:
1047 ; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
1048 ; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
1049 ; CHECK-ZNVER4-NEXT: vmovapd %ymm1, %ymm0
1050 ; CHECK-ZNVER4-NEXT: retq
1051 %mask = bitcast i4 %mask_int to <4 x i1>
1052 %b = load <4 x double>, ptr %pb
1053 %shufp = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
1054 %res = select <4 x i1> %mask, <4 x double> %shufp, <4 x double> %c
1055 ret <4 x double> %res
; 128-bit, merge-masked, memory-operand case. The i2 %mask_int is bitcast to
; <2 x i1> and selects per element between the shuffle result and the
; passthrough vector %c. The shuffle mask <0, 2> pairs element 0 of %a with
; element 0 of %b, which is loaded from %pb. Each configuration expects a
; kmovd of %esi into %k1, a {%k1} unpack that writes xmm1 from xmm0 and a mem
; operand, then a vmovapd of xmm1 into xmm0. The unpack is vunpcklpd
; everywhere except in the ICX-NO-BYPASS-DELAY checks, which expect
; vpunpcklqdq.
1058 define <2 x double> @transform_VUNPCKLPDrmk(<2 x double> %a, ptr %pb, <2 x double> %c, i2 %mask_int) nounwind {
1059 ; CHECK-SKX-LABEL: transform_VUNPCKLPDrmk:
1060 ; CHECK-SKX: # %bb.0:
1061 ; CHECK-SKX-NEXT: kmovd %esi, %k1
1062 ; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
1063 ; CHECK-SKX-NEXT: vmovapd %xmm1, %xmm0
1064 ; CHECK-SKX-NEXT: retq
1066 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmk:
1067 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
1068 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
1069 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
1070 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %xmm1, %xmm0
1071 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
1073 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmk:
1074 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
1075 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
1076 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
1077 ; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %xmm1, %xmm0
1078 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
1080 ; CHECK-V4-LABEL: transform_VUNPCKLPDrmk:
1081 ; CHECK-V4: # %bb.0:
1082 ; CHECK-V4-NEXT: kmovd %esi, %k1
1083 ; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
1084 ; CHECK-V4-NEXT: vmovapd %xmm1, %xmm0
1085 ; CHECK-V4-NEXT: retq
1087 ; CHECK-AVX512-LABEL: transform_VUNPCKLPDrmk:
1088 ; CHECK-AVX512: # %bb.0:
1089 ; CHECK-AVX512-NEXT: kmovd %esi, %k1
1090 ; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
1091 ; CHECK-AVX512-NEXT: vmovapd %xmm1, %xmm0
1092 ; CHECK-AVX512-NEXT: retq
1094 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrmk:
1095 ; CHECK-ZNVER4: # %bb.0:
1096 ; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
1097 ; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
1098 ; CHECK-ZNVER4-NEXT: vmovapd %xmm1, %xmm0
1099 ; CHECK-ZNVER4-NEXT: retq
1100 %mask = bitcast i2 %mask_int to <2 x i1>
1101 %b = load <2 x double>, ptr %pb
1102 %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
1103 %res = select <2 x i1> %mask, <2 x double> %shufp, <2 x double> %c
1104 ret <2 x double> %res
; 128-bit, merge-masked, memory-operand case for the high unpack. The i2
; %mask_int is bitcast to <2 x i1> and selects per element between the shuffle
; result and the passthrough vector %c. The shuffle mask <1, 3> pairs element
; 1 of %a with element 1 of %b, which is loaded from %pb. Each configuration
; expects a kmovd of %esi into %k1, a {%k1} unpack that writes xmm1 from xmm0
; and a mem operand, then a vmovapd of xmm1 into xmm0. The unpack is vunpckhpd
; everywhere except in the ICX-NO-BYPASS-DELAY checks, which expect
; vpunpckhqdq.
1107 define <2 x double> @transform_VUNPCKHPDrmk(<2 x double> %a, ptr %pb, <2 x double> %c, i2 %mask_int) nounwind {
1108 ; CHECK-SKX-LABEL: transform_VUNPCKHPDrmk:
1109 ; CHECK-SKX: # %bb.0:
1110 ; CHECK-SKX-NEXT: kmovd %esi, %k1
1111 ; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
1112 ; CHECK-SKX-NEXT: vmovapd %xmm1, %xmm0
1113 ; CHECK-SKX-NEXT: retq
1115 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmk:
1116 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
1117 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
1118 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
1119 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %xmm1, %xmm0
1120 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
1122 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmk:
1123 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
1124 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
1125 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
1126 ; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %xmm1, %xmm0
1127 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
1129 ; CHECK-V4-LABEL: transform_VUNPCKHPDrmk:
1130 ; CHECK-V4: # %bb.0:
1131 ; CHECK-V4-NEXT: kmovd %esi, %k1
1132 ; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
1133 ; CHECK-V4-NEXT: vmovapd %xmm1, %xmm0
1134 ; CHECK-V4-NEXT: retq
1136 ; CHECK-AVX512-LABEL: transform_VUNPCKHPDrmk:
1137 ; CHECK-AVX512: # %bb.0:
1138 ; CHECK-AVX512-NEXT: kmovd %esi, %k1
1139 ; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
1140 ; CHECK-AVX512-NEXT: vmovapd %xmm1, %xmm0
1141 ; CHECK-AVX512-NEXT: retq
1143 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrmk:
1144 ; CHECK-ZNVER4: # %bb.0:
1145 ; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
1146 ; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
1147 ; CHECK-ZNVER4-NEXT: vmovapd %xmm1, %xmm0
1148 ; CHECK-ZNVER4-NEXT: retq
1149 %mask = bitcast i2 %mask_int to <2 x i1>
1150 %b = load <2 x double>, ptr %pb
1151 %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
1152 %res = select <2 x i1> %mask, <2 x double> %shufp, <2 x double> %c
1153 ret <2 x double> %res
1155 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: