1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-SKX
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-ICX,CHECK-ICX-NO-BYPASS-DELAY
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=icelake-server -mattr=-no-bypass-delay-shuffle | FileCheck %s --check-prefixes=CHECK,CHECK-ICX,CHECK-ICX-BYPASS-DELAY
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-V4
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4
; Unmasked 512-bit unpack-low: interleaves elements 0,1 of every 4-element group of %a and %b.
; The assertions use the prefix shared by all run lines, so every configuration expects vunpcklps.
9 define <16 x float> @transform_VUNPCKLPSZrr(<16 x float> %a, <16 x float> %b) nounwind {
10 ; CHECK-LABEL: transform_VUNPCKLPSZrr:
12 ; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
14 %shufp = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
15 ret <16 x float> %shufp
; Unmasked 512-bit unpack-high: interleaves elements 2,3 of every 4-element group of %a and %b.
; The assertions use the prefix shared by all run lines, so every configuration expects vunpckhps.
18 define <16 x float> @transform_VUNPCKHPSZrr(<16 x float> %a, <16 x float> %b) nounwind {
19 ; CHECK-LABEL: transform_VUNPCKHPSZrr:
21 ; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
23 %shufp = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
24 ret <16 x float> %shufp
; Unmasked 256-bit unpack-low of %a and %b.
; Only the icelake-server run with default attributes expects vpunpckldq; the other runs expect vunpcklps.
27 define <8 x float> @transform_VUNPCKLPSYrr(<8 x float> %a, <8 x float> %b) nounwind {
28 ; CHECK-SKX-LABEL: transform_VUNPCKLPSYrr:
30 ; CHECK-SKX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
31 ; CHECK-SKX-NEXT: retq
33 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPSYrr:
34 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
35 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
36 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
38 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPSYrr:
39 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
40 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
41 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
43 ; CHECK-V4-LABEL: transform_VUNPCKLPSYrr:
45 ; CHECK-V4-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
48 ; CHECK-AVX512-LABEL: transform_VUNPCKLPSYrr:
49 ; CHECK-AVX512: # %bb.0:
50 ; CHECK-AVX512-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
51 ; CHECK-AVX512-NEXT: retq
53 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPSYrr:
54 ; CHECK-ZNVER4: # %bb.0:
55 ; CHECK-ZNVER4-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
56 ; CHECK-ZNVER4-NEXT: retq
57 %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
58 ret <8 x float> %shufp
; Unmasked 256-bit unpack-high of %a and %b.
; Only the icelake-server run with default attributes expects vpunpckhdq; the other runs expect vunpckhps.
61 define <8 x float> @transform_VUNPCKHPSYrr(<8 x float> %a, <8 x float> %b) nounwind {
62 ; CHECK-SKX-LABEL: transform_VUNPCKHPSYrr:
64 ; CHECK-SKX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
65 ; CHECK-SKX-NEXT: retq
67 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPSYrr:
68 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
69 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
70 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
72 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPSYrr:
73 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
74 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
75 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
77 ; CHECK-V4-LABEL: transform_VUNPCKHPSYrr:
79 ; CHECK-V4-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
82 ; CHECK-AVX512-LABEL: transform_VUNPCKHPSYrr:
83 ; CHECK-AVX512: # %bb.0:
84 ; CHECK-AVX512-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
85 ; CHECK-AVX512-NEXT: retq
87 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPSYrr:
88 ; CHECK-ZNVER4: # %bb.0:
89 ; CHECK-ZNVER4-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
90 ; CHECK-ZNVER4-NEXT: retq
91 %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
92 ret <8 x float> %shufp
; Unmasked 128-bit unpack-low of %a and %b.
; Only the icelake-server run with default attributes expects vpunpckldq; the other runs expect vunpcklps.
95 define <4 x float> @transform_VUNPCKLPSrr(<4 x float> %a, <4 x float> %b) nounwind {
96 ; CHECK-SKX-LABEL: transform_VUNPCKLPSrr:
98 ; CHECK-SKX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
99 ; CHECK-SKX-NEXT: retq
101 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPSrr:
102 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
103 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
104 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
106 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPSrr:
107 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
108 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
109 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
111 ; CHECK-V4-LABEL: transform_VUNPCKLPSrr:
113 ; CHECK-V4-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
114 ; CHECK-V4-NEXT: retq
116 ; CHECK-AVX512-LABEL: transform_VUNPCKLPSrr:
117 ; CHECK-AVX512: # %bb.0:
118 ; CHECK-AVX512-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
119 ; CHECK-AVX512-NEXT: retq
121 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPSrr:
122 ; CHECK-ZNVER4: # %bb.0:
123 ; CHECK-ZNVER4-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
124 ; CHECK-ZNVER4-NEXT: retq
125 %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
126 ret <4 x float> %shufp
; Unmasked 128-bit unpack-high of %a and %b.
; Only the icelake-server run with default attributes expects vpunpckhdq; the other runs expect vunpckhps.
129 define <4 x float> @transform_VUNPCKHPSrr(<4 x float> %a, <4 x float> %b) nounwind {
130 ; CHECK-SKX-LABEL: transform_VUNPCKHPSrr:
131 ; CHECK-SKX: # %bb.0:
132 ; CHECK-SKX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
133 ; CHECK-SKX-NEXT: retq
135 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPSrr:
136 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
137 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
138 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
140 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPSrr:
141 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
142 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
143 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
145 ; CHECK-V4-LABEL: transform_VUNPCKHPSrr:
147 ; CHECK-V4-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
148 ; CHECK-V4-NEXT: retq
150 ; CHECK-AVX512-LABEL: transform_VUNPCKHPSrr:
151 ; CHECK-AVX512: # %bb.0:
152 ; CHECK-AVX512-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
153 ; CHECK-AVX512-NEXT: retq
155 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPSrr:
156 ; CHECK-ZNVER4: # %bb.0:
157 ; CHECK-ZNVER4-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
158 ; CHECK-ZNVER4-NEXT: retq
159 %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
160 ret <4 x float> %shufp
; Zero-masked 512-bit unpack-low: %mask_int is bitcast to <16 x i1> and selects between the shuffle and zero.
; The assertions use the prefix shared by all run lines, so every configuration expects a {z}-masked vunpcklps.
163 define <16 x float> @transform_VUNPCKLPSZrrkz(<16 x float> %a, <16 x float> %b, i16 %mask_int) nounwind {
164 ; CHECK-LABEL: transform_VUNPCKLPSZrrkz:
166 ; CHECK-NEXT: kmovd %edi, %k1
167 ; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
169 %mask = bitcast i16 %mask_int to <16 x i1>
170 %shufp = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
171 %res = select <16 x i1> %mask, <16 x float> %shufp, <16 x float> zeroinitializer
172 ret <16 x float> %res
; Zero-masked 512-bit unpack-high: %mask_int is bitcast to <16 x i1> and selects between the shuffle and zero.
; The assertions use the prefix shared by all run lines, so every configuration expects a {z}-masked vunpckhps.
175 define <16 x float> @transform_VUNPCKHPSZrrkz(<16 x float> %a, <16 x float> %b, i16 %mask_int) nounwind {
176 ; CHECK-LABEL: transform_VUNPCKHPSZrrkz:
178 ; CHECK-NEXT: kmovd %edi, %k1
179 ; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
181 %mask = bitcast i16 %mask_int to <16 x i1>
182 %shufp = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
183 %res = select <16 x i1> %mask, <16 x float> %shufp, <16 x float> zeroinitializer
184 ret <16 x float> %res
; Zero-masked 256-bit unpack-low: %mask_int is bitcast to <8 x i1> and selects between the shuffle and zero.
; Only the icelake-server run with default attributes expects vpunpckldq; the other runs expect vunpcklps.
187 define <8 x float> @transform_VUNPCKLPSYrrkz(<8 x float> %a, <8 x float> %b, i8 %mask_int) nounwind {
188 ; CHECK-SKX-LABEL: transform_VUNPCKLPSYrrkz:
189 ; CHECK-SKX: # %bb.0:
190 ; CHECK-SKX-NEXT: kmovd %edi, %k1
191 ; CHECK-SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
192 ; CHECK-SKX-NEXT: retq
194 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPSYrrkz:
195 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
196 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
197 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckldq {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
198 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
200 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPSYrrkz:
201 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
202 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
203 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
204 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
206 ; CHECK-V4-LABEL: transform_VUNPCKLPSYrrkz:
208 ; CHECK-V4-NEXT: kmovd %edi, %k1
209 ; CHECK-V4-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
210 ; CHECK-V4-NEXT: retq
212 ; CHECK-AVX512-LABEL: transform_VUNPCKLPSYrrkz:
213 ; CHECK-AVX512: # %bb.0:
214 ; CHECK-AVX512-NEXT: kmovd %edi, %k1
215 ; CHECK-AVX512-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
216 ; CHECK-AVX512-NEXT: retq
218 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPSYrrkz:
219 ; CHECK-ZNVER4: # %bb.0:
220 ; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
221 ; CHECK-ZNVER4-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
222 ; CHECK-ZNVER4-NEXT: retq
223 %mask = bitcast i8 %mask_int to <8 x i1>
224 %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
225 %res = select <8 x i1> %mask, <8 x float> %shufp, <8 x float> zeroinitializer
; Zero-masked 256-bit unpack-high: %mask_int is bitcast to <8 x i1> and selects between the shuffle and zero.
; Only the icelake-server run with default attributes expects vpunpckhdq; the other runs expect vunpckhps.
229 define <8 x float> @transform_VUNPCKHPSYrrkz(<8 x float> %a, <8 x float> %b, i8 %mask_int) nounwind {
230 ; CHECK-SKX-LABEL: transform_VUNPCKHPSYrrkz:
231 ; CHECK-SKX: # %bb.0:
232 ; CHECK-SKX-NEXT: kmovd %edi, %k1
233 ; CHECK-SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
234 ; CHECK-SKX-NEXT: retq
236 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPSYrrkz:
237 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
238 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
239 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhdq {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
240 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
242 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPSYrrkz:
243 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
244 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
245 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
246 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
248 ; CHECK-V4-LABEL: transform_VUNPCKHPSYrrkz:
250 ; CHECK-V4-NEXT: kmovd %edi, %k1
251 ; CHECK-V4-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
252 ; CHECK-V4-NEXT: retq
254 ; CHECK-AVX512-LABEL: transform_VUNPCKHPSYrrkz:
255 ; CHECK-AVX512: # %bb.0:
256 ; CHECK-AVX512-NEXT: kmovd %edi, %k1
257 ; CHECK-AVX512-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
258 ; CHECK-AVX512-NEXT: retq
260 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPSYrrkz:
261 ; CHECK-ZNVER4: # %bb.0:
262 ; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
263 ; CHECK-ZNVER4-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
264 ; CHECK-ZNVER4-NEXT: retq
265 %mask = bitcast i8 %mask_int to <8 x i1>
266 %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
267 %res = select <8 x i1> %mask, <8 x float> %shufp, <8 x float> zeroinitializer
; Zero-masked 128-bit unpack-low: %mask_int (i4) is bitcast to <4 x i1> and selects between the shuffle and zero.
; Only the icelake-server run with default attributes expects vpunpckldq; the other runs expect vunpcklps.
271 define <4 x float> @transform_VUNPCKLPSrrkz(<4 x float> %a, <4 x float> %b, i4 %mask_int) nounwind {
272 ; CHECK-SKX-LABEL: transform_VUNPCKLPSrrkz:
273 ; CHECK-SKX: # %bb.0:
274 ; CHECK-SKX-NEXT: kmovd %edi, %k1
275 ; CHECK-SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
276 ; CHECK-SKX-NEXT: retq
278 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPSrrkz:
279 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
280 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
281 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckldq {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
282 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
284 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPSrrkz:
285 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
286 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
287 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
288 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
290 ; CHECK-V4-LABEL: transform_VUNPCKLPSrrkz:
292 ; CHECK-V4-NEXT: kmovd %edi, %k1
293 ; CHECK-V4-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
294 ; CHECK-V4-NEXT: retq
296 ; CHECK-AVX512-LABEL: transform_VUNPCKLPSrrkz:
297 ; CHECK-AVX512: # %bb.0:
298 ; CHECK-AVX512-NEXT: kmovd %edi, %k1
299 ; CHECK-AVX512-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
300 ; CHECK-AVX512-NEXT: retq
302 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPSrrkz:
303 ; CHECK-ZNVER4: # %bb.0:
304 ; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
305 ; CHECK-ZNVER4-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
306 ; CHECK-ZNVER4-NEXT: retq
307 %mask = bitcast i4 %mask_int to <4 x i1>
308 %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
309 %res = select <4 x i1> %mask, <4 x float> %shufp, <4 x float> zeroinitializer
; Zero-masked 128-bit unpack-high: %mask_int (i4) is bitcast to <4 x i1> and selects between the shuffle and zero.
; Only the icelake-server run with default attributes expects vpunpckhdq; the other runs expect vunpckhps.
313 define <4 x float> @transform_VUNPCKHPSrrkz(<4 x float> %a, <4 x float> %b, i4 %mask_int) nounwind {
314 ; CHECK-SKX-LABEL: transform_VUNPCKHPSrrkz:
315 ; CHECK-SKX: # %bb.0:
316 ; CHECK-SKX-NEXT: kmovd %edi, %k1
317 ; CHECK-SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
318 ; CHECK-SKX-NEXT: retq
320 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPSrrkz:
321 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
322 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
323 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhdq {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
324 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
326 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPSrrkz:
327 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
328 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
329 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
330 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
332 ; CHECK-V4-LABEL: transform_VUNPCKHPSrrkz:
334 ; CHECK-V4-NEXT: kmovd %edi, %k1
335 ; CHECK-V4-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
336 ; CHECK-V4-NEXT: retq
338 ; CHECK-AVX512-LABEL: transform_VUNPCKHPSrrkz:
339 ; CHECK-AVX512: # %bb.0:
340 ; CHECK-AVX512-NEXT: kmovd %edi, %k1
341 ; CHECK-AVX512-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
342 ; CHECK-AVX512-NEXT: retq
344 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPSrrkz:
345 ; CHECK-ZNVER4: # %bb.0:
346 ; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
347 ; CHECK-ZNVER4-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
348 ; CHECK-ZNVER4-NEXT: retq
349 %mask = bitcast i4 %mask_int to <4 x i1>
350 %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
351 %res = select <4 x i1> %mask, <4 x float> %shufp, <4 x float> zeroinitializer
; Merge-masked 512-bit unpack-low: lanes whose mask bit is clear take their value from %c.
; Every configuration expects a masked vunpcklps writing zmm2, followed by a copy of zmm2 into zmm0.
355 define <16 x float> @transform_VUNPCKLPSZrrk(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask_int) nounwind {
356 ; CHECK-LABEL: transform_VUNPCKLPSZrrk:
358 ; CHECK-NEXT: kmovd %edi, %k1
359 ; CHECK-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
360 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
362 %mask = bitcast i16 %mask_int to <16 x i1>
363 %shufp = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
364 %res = select <16 x i1> %mask, <16 x float> %shufp, <16 x float> %c
365 ret <16 x float> %res
; Merge-masked 512-bit unpack-high: lanes whose mask bit is clear take their value from %c.
; Every configuration expects a masked vunpckhps writing zmm2, followed by a copy of zmm2 into zmm0.
368 define <16 x float> @transform_VUNPCKHPSZrrk(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask_int) nounwind {
369 ; CHECK-LABEL: transform_VUNPCKHPSZrrk:
371 ; CHECK-NEXT: kmovd %edi, %k1
372 ; CHECK-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
373 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
375 %mask = bitcast i16 %mask_int to <16 x i1>
376 %shufp = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
377 %res = select <16 x i1> %mask, <16 x float> %shufp, <16 x float> %c
378 ret <16 x float> %res
; Merge-masked 256-bit unpack-low: lanes whose mask bit is clear take their value from %c.
; Only the icelake-server run with default attributes expects vpunpckldq; the other runs expect vunpcklps.
381 define <8 x float> @transform_VUNPCKLPSYrrk(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask_int) nounwind {
382 ; CHECK-SKX-LABEL: transform_VUNPCKLPSYrrk:
383 ; CHECK-SKX: # %bb.0:
384 ; CHECK-SKX-NEXT: kmovd %edi, %k1
385 ; CHECK-SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
386 ; CHECK-SKX-NEXT: vmovaps %ymm2, %ymm0
387 ; CHECK-SKX-NEXT: retq
389 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPSYrrk:
390 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
391 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
392 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckldq {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
393 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovaps %ymm2, %ymm0
394 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
396 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPSYrrk:
397 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
398 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
399 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
400 ; CHECK-ICX-BYPASS-DELAY-NEXT: vmovaps %ymm2, %ymm0
401 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
403 ; CHECK-V4-LABEL: transform_VUNPCKLPSYrrk:
405 ; CHECK-V4-NEXT: kmovd %edi, %k1
406 ; CHECK-V4-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
407 ; CHECK-V4-NEXT: vmovaps %ymm2, %ymm0
408 ; CHECK-V4-NEXT: retq
410 ; CHECK-AVX512-LABEL: transform_VUNPCKLPSYrrk:
411 ; CHECK-AVX512: # %bb.0:
412 ; CHECK-AVX512-NEXT: kmovd %edi, %k1
413 ; CHECK-AVX512-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
414 ; CHECK-AVX512-NEXT: vmovaps %ymm2, %ymm0
415 ; CHECK-AVX512-NEXT: retq
417 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPSYrrk:
418 ; CHECK-ZNVER4: # %bb.0:
419 ; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
420 ; CHECK-ZNVER4-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
421 ; CHECK-ZNVER4-NEXT: vmovaps %ymm2, %ymm0
422 ; CHECK-ZNVER4-NEXT: retq
423 %mask = bitcast i8 %mask_int to <8 x i1>
424 %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
425 %res = select <8 x i1> %mask, <8 x float> %shufp, <8 x float> %c
; Merge-masked 256-bit unpack-high: lanes whose mask bit is clear take their value from %c.
; Only the icelake-server run with default attributes expects vpunpckhdq; the other runs expect vunpckhps.
429 define <8 x float> @transform_VUNPCKHPSYrrk(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask_int) nounwind {
430 ; CHECK-SKX-LABEL: transform_VUNPCKHPSYrrk:
431 ; CHECK-SKX: # %bb.0:
432 ; CHECK-SKX-NEXT: kmovd %edi, %k1
433 ; CHECK-SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
434 ; CHECK-SKX-NEXT: vmovaps %ymm2, %ymm0
435 ; CHECK-SKX-NEXT: retq
437 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPSYrrk:
438 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
439 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
440 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhdq {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
441 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovaps %ymm2, %ymm0
442 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
444 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPSYrrk:
445 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
446 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
447 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
448 ; CHECK-ICX-BYPASS-DELAY-NEXT: vmovaps %ymm2, %ymm0
449 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
451 ; CHECK-V4-LABEL: transform_VUNPCKHPSYrrk:
453 ; CHECK-V4-NEXT: kmovd %edi, %k1
454 ; CHECK-V4-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
455 ; CHECK-V4-NEXT: vmovaps %ymm2, %ymm0
456 ; CHECK-V4-NEXT: retq
458 ; CHECK-AVX512-LABEL: transform_VUNPCKHPSYrrk:
459 ; CHECK-AVX512: # %bb.0:
460 ; CHECK-AVX512-NEXT: kmovd %edi, %k1
461 ; CHECK-AVX512-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
462 ; CHECK-AVX512-NEXT: vmovaps %ymm2, %ymm0
463 ; CHECK-AVX512-NEXT: retq
465 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPSYrrk:
466 ; CHECK-ZNVER4: # %bb.0:
467 ; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
468 ; CHECK-ZNVER4-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
469 ; CHECK-ZNVER4-NEXT: vmovaps %ymm2, %ymm0
470 ; CHECK-ZNVER4-NEXT: retq
471 %mask = bitcast i8 %mask_int to <8 x i1>
472 %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
473 %res = select <8 x i1> %mask, <8 x float> %shufp, <8 x float> %c
; Merge-masked 128-bit unpack-low: lanes whose mask bit is clear take their value from %c.
; Only the icelake-server run with default attributes expects vpunpckldq; the other runs expect vunpcklps.
477 define <4 x float> @transform_VUNPCKLPSrrk(<4 x float> %a, <4 x float> %b, <4 x float> %c, i4 %mask_int) nounwind {
478 ; CHECK-SKX-LABEL: transform_VUNPCKLPSrrk:
479 ; CHECK-SKX: # %bb.0:
480 ; CHECK-SKX-NEXT: kmovd %edi, %k1
481 ; CHECK-SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
482 ; CHECK-SKX-NEXT: vmovaps %xmm2, %xmm0
483 ; CHECK-SKX-NEXT: retq
485 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPSrrk:
486 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
487 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
488 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckldq {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
489 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovaps %xmm2, %xmm0
490 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
492 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPSrrk:
493 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
494 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
495 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
496 ; CHECK-ICX-BYPASS-DELAY-NEXT: vmovaps %xmm2, %xmm0
497 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
499 ; CHECK-V4-LABEL: transform_VUNPCKLPSrrk:
501 ; CHECK-V4-NEXT: kmovd %edi, %k1
502 ; CHECK-V4-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
503 ; CHECK-V4-NEXT: vmovaps %xmm2, %xmm0
504 ; CHECK-V4-NEXT: retq
506 ; CHECK-AVX512-LABEL: transform_VUNPCKLPSrrk:
507 ; CHECK-AVX512: # %bb.0:
508 ; CHECK-AVX512-NEXT: kmovd %edi, %k1
509 ; CHECK-AVX512-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
510 ; CHECK-AVX512-NEXT: vmovaps %xmm2, %xmm0
511 ; CHECK-AVX512-NEXT: retq
513 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPSrrk:
514 ; CHECK-ZNVER4: # %bb.0:
515 ; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
516 ; CHECK-ZNVER4-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
517 ; CHECK-ZNVER4-NEXT: vmovaps %xmm2, %xmm0
518 ; CHECK-ZNVER4-NEXT: retq
519 %mask = bitcast i4 %mask_int to <4 x i1>
520 %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
521 %res = select <4 x i1> %mask, <4 x float> %shufp, <4 x float> %c
; Merge-masked 128-bit unpack-high: lanes whose mask bit is clear take their value from %c.
; Only the icelake-server run with default attributes expects vpunpckhdq; the other runs expect vunpckhps.
525 define <4 x float> @transform_VUNPCKHPSrrk(<4 x float> %a, <4 x float> %b, <4 x float> %c, i4 %mask_int) nounwind {
526 ; CHECK-SKX-LABEL: transform_VUNPCKHPSrrk:
527 ; CHECK-SKX: # %bb.0:
528 ; CHECK-SKX-NEXT: kmovd %edi, %k1
529 ; CHECK-SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
530 ; CHECK-SKX-NEXT: vmovaps %xmm2, %xmm0
531 ; CHECK-SKX-NEXT: retq
533 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPSrrk:
534 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
535 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %edi, %k1
536 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhdq {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
537 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovaps %xmm2, %xmm0
538 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
540 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPSrrk:
541 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
542 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %edi, %k1
543 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
544 ; CHECK-ICX-BYPASS-DELAY-NEXT: vmovaps %xmm2, %xmm0
545 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
547 ; CHECK-V4-LABEL: transform_VUNPCKHPSrrk:
549 ; CHECK-V4-NEXT: kmovd %edi, %k1
550 ; CHECK-V4-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
551 ; CHECK-V4-NEXT: vmovaps %xmm2, %xmm0
552 ; CHECK-V4-NEXT: retq
554 ; CHECK-AVX512-LABEL: transform_VUNPCKHPSrrk:
555 ; CHECK-AVX512: # %bb.0:
556 ; CHECK-AVX512-NEXT: kmovd %edi, %k1
557 ; CHECK-AVX512-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
558 ; CHECK-AVX512-NEXT: vmovaps %xmm2, %xmm0
559 ; CHECK-AVX512-NEXT: retq
561 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPSrrk:
562 ; CHECK-ZNVER4: # %bb.0:
563 ; CHECK-ZNVER4-NEXT: kmovd %edi, %k1
564 ; CHECK-ZNVER4-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
565 ; CHECK-ZNVER4-NEXT: vmovaps %xmm2, %xmm0
566 ; CHECK-ZNVER4-NEXT: retq
567 %mask = bitcast i4 %mask_int to <4 x i1>
568 %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
569 %res = select <4 x i1> %mask, <4 x float> %shufp, <4 x float> %c
; 512-bit unpack-low where the second operand %b is loaded from %pb.
; Every configuration expects vunpcklps with the load folded in as a memory operand.
573 define <16 x float> @transform_VUNPCKLPSZrm(<16 x float> %a, ptr %pb) nounwind {
574 ; CHECK-LABEL: transform_VUNPCKLPSZrm:
576 ; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
578 %b = load <16 x float>, ptr %pb
579 %shufp = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
580 ret <16 x float> %shufp
; 512-bit unpack-high where the second operand %b is loaded from %pb.
; Every configuration expects vunpckhps with the load folded in as a memory operand.
583 define <16 x float> @transform_VUNPCKHPSZrm(<16 x float> %a, ptr %pb) nounwind {
584 ; CHECK-LABEL: transform_VUNPCKHPSZrm:
586 ; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
588 %b = load <16 x float>, ptr %pb
589 %shufp = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
590 ret <16 x float> %shufp
; 256-bit unpack-low where the second operand %b is loaded from %pb (expected as a folded memory operand).
; Only the icelake-server run with default attributes expects vpunpckldq; the other runs expect vunpcklps.
593 define <8 x float> @transform_VUNPCKLPSYrm(<8 x float> %a, ptr %pb) nounwind {
594 ; CHECK-SKX-LABEL: transform_VUNPCKLPSYrm:
595 ; CHECK-SKX: # %bb.0:
596 ; CHECK-SKX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
597 ; CHECK-SKX-NEXT: retq
599 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPSYrm:
600 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
601 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
602 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
604 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPSYrm:
605 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
606 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
607 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
609 ; CHECK-V4-LABEL: transform_VUNPCKLPSYrm:
611 ; CHECK-V4-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
612 ; CHECK-V4-NEXT: retq
614 ; CHECK-AVX512-LABEL: transform_VUNPCKLPSYrm:
615 ; CHECK-AVX512: # %bb.0:
616 ; CHECK-AVX512-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
617 ; CHECK-AVX512-NEXT: retq
619 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPSYrm:
620 ; CHECK-ZNVER4: # %bb.0:
621 ; CHECK-ZNVER4-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
622 ; CHECK-ZNVER4-NEXT: retq
623 %b = load <8 x float>, ptr %pb
624 %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
625 ret <8 x float> %shufp
; 256-bit unpack-high where the second operand %b is loaded from %pb (expected as a folded memory operand).
; Only the icelake-server run with default attributes expects vpunpckhdq; the other runs expect vunpckhps.
628 define <8 x float> @transform_VUNPCKHPSYrm(<8 x float> %a, ptr %pb) nounwind {
629 ; CHECK-SKX-LABEL: transform_VUNPCKHPSYrm:
630 ; CHECK-SKX: # %bb.0:
631 ; CHECK-SKX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
632 ; CHECK-SKX-NEXT: retq
634 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPSYrm:
635 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
636 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
637 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
639 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPSYrm:
640 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
641 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
642 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
644 ; CHECK-V4-LABEL: transform_VUNPCKHPSYrm:
646 ; CHECK-V4-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
647 ; CHECK-V4-NEXT: retq
649 ; CHECK-AVX512-LABEL: transform_VUNPCKHPSYrm:
650 ; CHECK-AVX512: # %bb.0:
651 ; CHECK-AVX512-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
652 ; CHECK-AVX512-NEXT: retq
654 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPSYrm:
655 ; CHECK-ZNVER4: # %bb.0:
656 ; CHECK-ZNVER4-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
657 ; CHECK-ZNVER4-NEXT: retq
658 %b = load <8 x float>, ptr %pb
659 %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
660 ret <8 x float> %shufp
; 128-bit unpack-low where the second operand %b is loaded from %pb (expected as a folded memory operand).
; Only the icelake-server run with default attributes expects vpunpckldq; the other runs expect vunpcklps.
663 define <4 x float> @transform_VUNPCKLPSrm(<4 x float> %a, ptr %pb) nounwind {
664 ; CHECK-SKX-LABEL: transform_VUNPCKLPSrm:
665 ; CHECK-SKX: # %bb.0:
666 ; CHECK-SKX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
667 ; CHECK-SKX-NEXT: retq
669 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPSrm:
670 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
671 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
672 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
674 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPSrm:
675 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
676 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
677 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
679 ; CHECK-V4-LABEL: transform_VUNPCKLPSrm:
681 ; CHECK-V4-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
682 ; CHECK-V4-NEXT: retq
684 ; CHECK-AVX512-LABEL: transform_VUNPCKLPSrm:
685 ; CHECK-AVX512: # %bb.0:
686 ; CHECK-AVX512-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
687 ; CHECK-AVX512-NEXT: retq
689 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPSrm:
690 ; CHECK-ZNVER4: # %bb.0:
691 ; CHECK-ZNVER4-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
692 ; CHECK-ZNVER4-NEXT: retq
693 %b = load <4 x float>, ptr %pb
694 %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
695 ret <4 x float> %shufp
698 define <4 x float> @transform_VUNPCKHPSrm(<4 x float> %a, ptr %pb) nounwind { ; unmasked 128-bit unpack-high of %a with a vector loaded from %pb
699 ; CHECK-SKX-LABEL: transform_VUNPCKHPSrm:
700 ; CHECK-SKX: # %bb.0:
701 ; CHECK-SKX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
702 ; CHECK-SKX-NEXT: retq
703 ;
704 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPSrm:
705 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
706 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
707 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
708 ;
709 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPSrm:
710 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
711 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
712 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
713 ;
714 ; CHECK-V4-LABEL: transform_VUNPCKHPSrm:
715 ; CHECK-V4: # %bb.0:
716 ; CHECK-V4-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
717 ; CHECK-V4-NEXT: retq
718 ;
719 ; CHECK-AVX512-LABEL: transform_VUNPCKHPSrm:
720 ; CHECK-AVX512: # %bb.0:
721 ; CHECK-AVX512-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
722 ; CHECK-AVX512-NEXT: retq
723 ;
724 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPSrm:
725 ; CHECK-ZNVER4: # %bb.0:
726 ; CHECK-ZNVER4-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
727 ; CHECK-ZNVER4-NEXT: retq
728 %b = load <4 x float>, ptr %pb ; second shuffle operand comes from memory
729 %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> ; indices 4-7 pick from %b: interleave elements 2,3 of %a and %b
730 ret <4 x float> %shufp
731 }
733 define <16 x float> @transform_VUNPCKLPSZrmkz(<16 x float> %a, ptr %pb, i16 %mask_int) nounwind { ; zero-masked 512-bit unpack-low of %a with a vector loaded from %pb
734 ; CHECK-LABEL: transform_VUNPCKLPSZrmkz:
735 ; CHECK: # %bb.0:
736 ; CHECK-NEXT: kmovd %esi, %k1
737 ; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
738 ; CHECK-NEXT: retq
739 %mask = bitcast i16 %mask_int to <16 x i1> ; one mask bit per result element
740 %b = load <16 x float>, ptr %pb ; second shuffle operand comes from memory
741 %shufp = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> ; indices 16-31 pick from %b: interleave the two low elements of every group of four
742 %res = select <16 x i1> %mask, <16 x float> %shufp, <16 x float> zeroinitializer ; elements whose mask bit is clear become zero
743 ret <16 x float> %res
744 }
746 define <16 x float> @transform_VUNPCKHPSZrmkz(<16 x float> %a, ptr %pb, i16 %mask_int) nounwind { ; zero-masked 512-bit unpack-high of %a with a vector loaded from %pb
747 ; CHECK-LABEL: transform_VUNPCKHPSZrmkz:
748 ; CHECK: # %bb.0:
749 ; CHECK-NEXT: kmovd %esi, %k1
750 ; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
751 ; CHECK-NEXT: retq
752 %mask = bitcast i16 %mask_int to <16 x i1> ; one mask bit per result element
753 %b = load <16 x float>, ptr %pb ; second shuffle operand comes from memory
754 %shufp = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31> ; indices 16-31 pick from %b: interleave the two high elements of every group of four
755 %res = select <16 x i1> %mask, <16 x float> %shufp, <16 x float> zeroinitializer ; elements whose mask bit is clear become zero
756 ret <16 x float> %res
757 }
759 define <8 x float> @transform_VUNPCKLPSYrmkz(<8 x float> %a, ptr %pb, i8 %mask_int) nounwind { ; zero-masked 256-bit unpack-low of %a with a vector loaded from %pb
760 ; CHECK-SKX-LABEL: transform_VUNPCKLPSYrmkz:
761 ; CHECK-SKX: # %bb.0:
762 ; CHECK-SKX-NEXT: kmovd %esi, %k1
763 ; CHECK-SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
764 ; CHECK-SKX-NEXT: retq
765 ;
766 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPSYrmkz:
767 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
768 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
769 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckldq {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
770 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
771 ;
772 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPSYrmkz:
773 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
774 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
775 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
776 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
777 ;
778 ; CHECK-V4-LABEL: transform_VUNPCKLPSYrmkz:
779 ; CHECK-V4: # %bb.0:
780 ; CHECK-V4-NEXT: kmovd %esi, %k1
781 ; CHECK-V4-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
782 ; CHECK-V4-NEXT: retq
783 ;
784 ; CHECK-AVX512-LABEL: transform_VUNPCKLPSYrmkz:
785 ; CHECK-AVX512: # %bb.0:
786 ; CHECK-AVX512-NEXT: kmovd %esi, %k1
787 ; CHECK-AVX512-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
788 ; CHECK-AVX512-NEXT: retq
789 ;
790 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPSYrmkz:
791 ; CHECK-ZNVER4: # %bb.0:
792 ; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
793 ; CHECK-ZNVER4-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
794 ; CHECK-ZNVER4-NEXT: retq
795 %mask = bitcast i8 %mask_int to <8 x i1> ; one mask bit per result element
796 %b = load <8 x float>, ptr %pb ; second shuffle operand comes from memory
797 %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> ; indices 8-15 pick from %b: interleave elements 0,1 and 4,5 of %a and %b
798 %res = select <8 x i1> %mask, <8 x float> %shufp, <8 x float> zeroinitializer ; elements whose mask bit is clear become zero
799 ret <8 x float> %res
800 }
802 define <8 x float> @transform_VUNPCKHPSYrmkz(<8 x float> %a, ptr %pb, i8 %mask_int) nounwind { ; zero-masked 256-bit unpack-high of %a with a vector loaded from %pb
803 ; CHECK-SKX-LABEL: transform_VUNPCKHPSYrmkz:
804 ; CHECK-SKX: # %bb.0:
805 ; CHECK-SKX-NEXT: kmovd %esi, %k1
806 ; CHECK-SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
807 ; CHECK-SKX-NEXT: retq
808 ;
809 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPSYrmkz:
810 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
811 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
812 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhdq {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
813 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
814 ;
815 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPSYrmkz:
816 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
817 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
818 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
819 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
820 ;
821 ; CHECK-V4-LABEL: transform_VUNPCKHPSYrmkz:
822 ; CHECK-V4: # %bb.0:
823 ; CHECK-V4-NEXT: kmovd %esi, %k1
824 ; CHECK-V4-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
825 ; CHECK-V4-NEXT: retq
826 ;
827 ; CHECK-AVX512-LABEL: transform_VUNPCKHPSYrmkz:
828 ; CHECK-AVX512: # %bb.0:
829 ; CHECK-AVX512-NEXT: kmovd %esi, %k1
830 ; CHECK-AVX512-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
831 ; CHECK-AVX512-NEXT: retq
832 ;
833 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPSYrmkz:
834 ; CHECK-ZNVER4: # %bb.0:
835 ; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
836 ; CHECK-ZNVER4-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
837 ; CHECK-ZNVER4-NEXT: retq
838 %mask = bitcast i8 %mask_int to <8 x i1> ; one mask bit per result element
839 %b = load <8 x float>, ptr %pb ; second shuffle operand comes from memory
840 %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> ; indices 8-15 pick from %b: interleave elements 2,3 and 6,7 of %a and %b
841 %res = select <8 x i1> %mask, <8 x float> %shufp, <8 x float> zeroinitializer ; elements whose mask bit is clear become zero
842 ret <8 x float> %res
843 }
845 define <4 x float> @transform_VUNPCKLPSrmkz(<4 x float> %a, ptr %pb, i4 %mask_int) nounwind { ; zero-masked 128-bit unpack-low of %a with a vector loaded from %pb
846 ; CHECK-SKX-LABEL: transform_VUNPCKLPSrmkz:
847 ; CHECK-SKX: # %bb.0:
848 ; CHECK-SKX-NEXT: kmovd %esi, %k1
849 ; CHECK-SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
850 ; CHECK-SKX-NEXT: retq
851 ;
852 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPSrmkz:
853 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
854 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
855 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckldq {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
856 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
857 ;
858 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPSrmkz:
859 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
860 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
861 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
862 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
863 ;
864 ; CHECK-V4-LABEL: transform_VUNPCKLPSrmkz:
865 ; CHECK-V4: # %bb.0:
866 ; CHECK-V4-NEXT: kmovd %esi, %k1
867 ; CHECK-V4-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
868 ; CHECK-V4-NEXT: retq
869 ;
870 ; CHECK-AVX512-LABEL: transform_VUNPCKLPSrmkz:
871 ; CHECK-AVX512: # %bb.0:
872 ; CHECK-AVX512-NEXT: kmovd %esi, %k1
873 ; CHECK-AVX512-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
874 ; CHECK-AVX512-NEXT: retq
875 ;
876 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPSrmkz:
877 ; CHECK-ZNVER4: # %bb.0:
878 ; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
879 ; CHECK-ZNVER4-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
880 ; CHECK-ZNVER4-NEXT: retq
881 %mask = bitcast i4 %mask_int to <4 x i1> ; one mask bit per result element
882 %b = load <4 x float>, ptr %pb ; second shuffle operand comes from memory
883 %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> ; indices 4-7 pick from %b: interleave elements 0,1 of %a and %b
884 %res = select <4 x i1> %mask, <4 x float> %shufp, <4 x float> zeroinitializer ; elements whose mask bit is clear become zero
885 ret <4 x float> %res
886 }
888 define <4 x float> @transform_VUNPCKHPSrmkz(<4 x float> %a, ptr %pb, i4 %mask_int) nounwind { ; zero-masked 128-bit unpack-high of %a with a vector loaded from %pb
889 ; CHECK-SKX-LABEL: transform_VUNPCKHPSrmkz:
890 ; CHECK-SKX: # %bb.0:
891 ; CHECK-SKX-NEXT: kmovd %esi, %k1
892 ; CHECK-SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
893 ; CHECK-SKX-NEXT: retq
894 ;
895 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPSrmkz:
896 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
897 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
898 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhdq {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
899 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
900 ;
901 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPSrmkz:
902 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
903 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
904 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
905 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
906 ;
907 ; CHECK-V4-LABEL: transform_VUNPCKHPSrmkz:
908 ; CHECK-V4: # %bb.0:
909 ; CHECK-V4-NEXT: kmovd %esi, %k1
910 ; CHECK-V4-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
911 ; CHECK-V4-NEXT: retq
912 ;
913 ; CHECK-AVX512-LABEL: transform_VUNPCKHPSrmkz:
914 ; CHECK-AVX512: # %bb.0:
915 ; CHECK-AVX512-NEXT: kmovd %esi, %k1
916 ; CHECK-AVX512-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
917 ; CHECK-AVX512-NEXT: retq
918 ;
919 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPSrmkz:
920 ; CHECK-ZNVER4: # %bb.0:
921 ; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
922 ; CHECK-ZNVER4-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
923 ; CHECK-ZNVER4-NEXT: retq
924 %mask = bitcast i4 %mask_int to <4 x i1> ; one mask bit per result element
925 %b = load <4 x float>, ptr %pb ; second shuffle operand comes from memory
926 %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> ; indices 4-7 pick from %b: interleave elements 2,3 of %a and %b
927 %res = select <4 x i1> %mask, <4 x float> %shufp, <4 x float> zeroinitializer ; elements whose mask bit is clear become zero
928 ret <4 x float> %res
929 }
931 define <16 x float> @transform_VUNPCKLPSZrmk(<16 x float> %a, ptr %pb, <16 x float> %c, i16 %mask_int) nounwind { ; merge-masked 512-bit unpack-low of %a with a vector loaded from %pb, passthrough %c
932 ; CHECK-LABEL: transform_VUNPCKLPSZrmk:
933 ; CHECK: # %bb.0:
934 ; CHECK-NEXT: kmovd %esi, %k1
935 ; CHECK-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
936 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
937 ; CHECK-NEXT: retq
938 %mask = bitcast i16 %mask_int to <16 x i1> ; one mask bit per result element
939 %b = load <16 x float>, ptr %pb ; second shuffle operand comes from memory
940 %shufp = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> ; indices 16-31 pick from %b: interleave the two low elements of every group of four
941 %res = select <16 x i1> %mask, <16 x float> %shufp, <16 x float> %c ; elements whose mask bit is clear take the value from %c
942 ret <16 x float> %res
943 }
945 define <16 x float> @transform_VUNPCKHPSZrmk(<16 x float> %a, ptr %pb, <16 x float> %c, i16 %mask_int) nounwind { ; merge-masked 512-bit unpack-high of %a with a vector loaded from %pb, passthrough %c
946 ; CHECK-LABEL: transform_VUNPCKHPSZrmk:
947 ; CHECK: # %bb.0:
948 ; CHECK-NEXT: kmovd %esi, %k1
949 ; CHECK-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
950 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
951 ; CHECK-NEXT: retq
952 %mask = bitcast i16 %mask_int to <16 x i1> ; one mask bit per result element
953 %b = load <16 x float>, ptr %pb ; second shuffle operand comes from memory
954 %shufp = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31> ; indices 16-31 pick from %b: interleave the two high elements of every group of four
955 %res = select <16 x i1> %mask, <16 x float> %shufp, <16 x float> %c ; elements whose mask bit is clear take the value from %c
956 ret <16 x float> %res
957 }
959 define <8 x float> @transform_VUNPCKLPSYrmk(<8 x float> %a, ptr %pb, <8 x float> %c, i8 %mask_int) nounwind { ; merge-masked 256-bit unpack-low of %a with a vector loaded from %pb, passthrough %c
960 ; CHECK-SKX-LABEL: transform_VUNPCKLPSYrmk:
961 ; CHECK-SKX: # %bb.0:
962 ; CHECK-SKX-NEXT: kmovd %esi, %k1
963 ; CHECK-SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
964 ; CHECK-SKX-NEXT: vmovaps %ymm1, %ymm0
965 ; CHECK-SKX-NEXT: retq
966 ;
967 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPSYrmk:
968 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
969 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
970 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckldq {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
971 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovaps %ymm1, %ymm0
972 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
973 ;
974 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPSYrmk:
975 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
976 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
977 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
978 ; CHECK-ICX-BYPASS-DELAY-NEXT: vmovaps %ymm1, %ymm0
979 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
980 ;
981 ; CHECK-V4-LABEL: transform_VUNPCKLPSYrmk:
982 ; CHECK-V4: # %bb.0:
983 ; CHECK-V4-NEXT: kmovd %esi, %k1
984 ; CHECK-V4-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
985 ; CHECK-V4-NEXT: vmovaps %ymm1, %ymm0
986 ; CHECK-V4-NEXT: retq
987 ;
988 ; CHECK-AVX512-LABEL: transform_VUNPCKLPSYrmk:
989 ; CHECK-AVX512: # %bb.0:
990 ; CHECK-AVX512-NEXT: kmovd %esi, %k1
991 ; CHECK-AVX512-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
992 ; CHECK-AVX512-NEXT: vmovaps %ymm1, %ymm0
993 ; CHECK-AVX512-NEXT: retq
994 ;
995 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPSYrmk:
996 ; CHECK-ZNVER4: # %bb.0:
997 ; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
998 ; CHECK-ZNVER4-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
999 ; CHECK-ZNVER4-NEXT: vmovaps %ymm1, %ymm0
1000 ; CHECK-ZNVER4-NEXT: retq
1001 %mask = bitcast i8 %mask_int to <8 x i1> ; one mask bit per result element
1002 %b = load <8 x float>, ptr %pb ; second shuffle operand comes from memory
1003 %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> ; indices 8-15 pick from %b: interleave elements 0,1 and 4,5 of %a and %b
1004 %res = select <8 x i1> %mask, <8 x float> %shufp, <8 x float> %c ; elements whose mask bit is clear take the value from %c
1005 ret <8 x float> %res
1006 }
1008 define <8 x float> @transform_VUNPCKHPSYrmk(<8 x float> %a, ptr %pb, <8 x float> %c, i8 %mask_int) nounwind { ; merge-masked 256-bit unpack-high of %a with a vector loaded from %pb, passthrough %c
1009 ; CHECK-SKX-LABEL: transform_VUNPCKHPSYrmk:
1010 ; CHECK-SKX: # %bb.0:
1011 ; CHECK-SKX-NEXT: kmovd %esi, %k1
1012 ; CHECK-SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
1013 ; CHECK-SKX-NEXT: vmovaps %ymm1, %ymm0
1014 ; CHECK-SKX-NEXT: retq
1015 ;
1016 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPSYrmk:
1017 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
1018 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
1019 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhdq {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
1020 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovaps %ymm1, %ymm0
1021 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
1022 ;
1023 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPSYrmk:
1024 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
1025 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
1026 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
1027 ; CHECK-ICX-BYPASS-DELAY-NEXT: vmovaps %ymm1, %ymm0
1028 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
1029 ;
1030 ; CHECK-V4-LABEL: transform_VUNPCKHPSYrmk:
1031 ; CHECK-V4: # %bb.0:
1032 ; CHECK-V4-NEXT: kmovd %esi, %k1
1033 ; CHECK-V4-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
1034 ; CHECK-V4-NEXT: vmovaps %ymm1, %ymm0
1035 ; CHECK-V4-NEXT: retq
1036 ;
1037 ; CHECK-AVX512-LABEL: transform_VUNPCKHPSYrmk:
1038 ; CHECK-AVX512: # %bb.0:
1039 ; CHECK-AVX512-NEXT: kmovd %esi, %k1
1040 ; CHECK-AVX512-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
1041 ; CHECK-AVX512-NEXT: vmovaps %ymm1, %ymm0
1042 ; CHECK-AVX512-NEXT: retq
1043 ;
1044 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPSYrmk:
1045 ; CHECK-ZNVER4: # %bb.0:
1046 ; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
1047 ; CHECK-ZNVER4-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
1048 ; CHECK-ZNVER4-NEXT: vmovaps %ymm1, %ymm0
1049 ; CHECK-ZNVER4-NEXT: retq
1050 %mask = bitcast i8 %mask_int to <8 x i1> ; one mask bit per result element
1051 %b = load <8 x float>, ptr %pb ; second shuffle operand comes from memory
1052 %shufp = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> ; indices 8-15 pick from %b: interleave elements 2,3 and 6,7 of %a and %b
1053 %res = select <8 x i1> %mask, <8 x float> %shufp, <8 x float> %c ; elements whose mask bit is clear take the value from %c
1054 ret <8 x float> %res
1055 }
1057 define <4 x float> @transform_VUNPCKLPSrmk(<4 x float> %a, ptr %pb, <4 x float> %c, i4 %mask_int) nounwind { ; merge-masked 128-bit unpack-low of %a with a vector loaded from %pb, passthrough %c
1058 ; CHECK-SKX-LABEL: transform_VUNPCKLPSrmk:
1059 ; CHECK-SKX: # %bb.0:
1060 ; CHECK-SKX-NEXT: kmovd %esi, %k1
1061 ; CHECK-SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
1062 ; CHECK-SKX-NEXT: vmovaps %xmm1, %xmm0
1063 ; CHECK-SKX-NEXT: retq
1064 ;
1065 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPSrmk:
1066 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
1067 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
1068 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckldq {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
1069 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovaps %xmm1, %xmm0
1070 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
1071 ;
1072 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPSrmk:
1073 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
1074 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
1075 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
1076 ; CHECK-ICX-BYPASS-DELAY-NEXT: vmovaps %xmm1, %xmm0
1077 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
1078 ;
1079 ; CHECK-V4-LABEL: transform_VUNPCKLPSrmk:
1080 ; CHECK-V4: # %bb.0:
1081 ; CHECK-V4-NEXT: kmovd %esi, %k1
1082 ; CHECK-V4-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
1083 ; CHECK-V4-NEXT: vmovaps %xmm1, %xmm0
1084 ; CHECK-V4-NEXT: retq
1085 ;
1086 ; CHECK-AVX512-LABEL: transform_VUNPCKLPSrmk:
1087 ; CHECK-AVX512: # %bb.0:
1088 ; CHECK-AVX512-NEXT: kmovd %esi, %k1
1089 ; CHECK-AVX512-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
1090 ; CHECK-AVX512-NEXT: vmovaps %xmm1, %xmm0
1091 ; CHECK-AVX512-NEXT: retq
1092 ;
1093 ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPSrmk:
1094 ; CHECK-ZNVER4: # %bb.0:
1095 ; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
1096 ; CHECK-ZNVER4-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
1097 ; CHECK-ZNVER4-NEXT: vmovaps %xmm1, %xmm0
1098 ; CHECK-ZNVER4-NEXT: retq
1099 %mask = bitcast i4 %mask_int to <4 x i1> ; one mask bit per result element
1100 %b = load <4 x float>, ptr %pb ; second shuffle operand comes from memory
1101 %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> ; indices 4-7 pick from %b: interleave elements 0,1 of %a and %b
1102 %res = select <4 x i1> %mask, <4 x float> %shufp, <4 x float> %c ; elements whose mask bit is clear take the value from %c
1103 ret <4 x float> %res
1104 }
1106 define <4 x float> @transform_VUNPCKHPSrmk(<4 x float> %a, ptr %pb, <4 x float> %c, i4 %mask_int) nounwind { ; merge-masked 128-bit unpack-high of %a with a vector loaded from %pb, passthrough %c
1107 ; CHECK-SKX-LABEL: transform_VUNPCKHPSrmk:
1108 ; CHECK-SKX: # %bb.0:
1109 ; CHECK-SKX-NEXT: kmovd %esi, %k1
1110 ; CHECK-SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
1111 ; CHECK-SKX-NEXT: vmovaps %xmm1, %xmm0
1112 ; CHECK-SKX-NEXT: retq
1113 ;
1114 ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPSrmk:
1115 ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0:
1116 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1
1117 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhdq {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
1118 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovaps %xmm1, %xmm0
1119 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq
1120 ;
1121 ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPSrmk:
1122 ; CHECK-ICX-BYPASS-DELAY: # %bb.0:
1123 ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1
1124 ; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
1125 ; CHECK-ICX-BYPASS-DELAY-NEXT: vmovaps %xmm1, %xmm0
1126 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq
1127 ;
1128 ; CHECK-V4-LABEL: transform_VUNPCKHPSrmk:
1129 ; CHECK-V4: # %bb.0:
1130 ; CHECK-V4-NEXT: kmovd %esi, %k1
1131 ; CHECK-V4-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
1132 ; CHECK-V4-NEXT: vmovaps %xmm1, %xmm0
1133 ; CHECK-V4-NEXT: retq
1134 ;
1135 ; CHECK-AVX512-LABEL: transform_VUNPCKHPSrmk:
1136 ; CHECK-AVX512: # %bb.0:
1137 ; CHECK-AVX512-NEXT: kmovd %esi, %k1
1138 ; CHECK-AVX512-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
1139 ; CHECK-AVX512-NEXT: vmovaps %xmm1, %xmm0
1140 ; CHECK-AVX512-NEXT: retq
1141 ;
1142 ; CHECK-ZNVER4-LABEL: transform_VUNPCKHPSrmk:
1143 ; CHECK-ZNVER4: # %bb.0:
1144 ; CHECK-ZNVER4-NEXT: kmovd %esi, %k1
1145 ; CHECK-ZNVER4-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
1146 ; CHECK-ZNVER4-NEXT: vmovaps %xmm1, %xmm0
1147 ; CHECK-ZNVER4-NEXT: retq
1148 %mask = bitcast i4 %mask_int to <4 x i1> ; one mask bit per result element
1149 %b = load <4 x float>, ptr %pb ; second shuffle operand comes from memory
1150 %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7> ; indices 4-7 pick from %b: interleave elements 2,3 of %a and %b
1151 %res = select <4 x i1> %mask, <4 x float> %shufp, <4 x float> %c ; elements whose mask bit is clear take the value from %c
1152 ret <4 x float> %res
1153 }
1154 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: