; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512

target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"

; For this test we used to optimize the <i1 true, i1 false, i1 false, i1 true>
; mask into <i32 2147483648, i32 0, i32 0, i32 2147483648> because we thought
; we would lower that into a blend where only the high bit is relevant.
; However, since the whole mask is constant, the generic code simplified it
; incorrectly, because it expected -1 in place of 2147483648.
;
; The problem does not occur without AVX, because vselect of v4i32 is not legal
; nor custom.
;
; <rdar://problem/18675020>

define void @test(ptr %a, ptr %b) {
; AVX-LABEL: test:
; AVX:       ## %bb.0: ## %body
; AVX-NEXT:    movabsq $4167800517033787389, %rax ## imm = 0x39D7007D007CFFFD
; AVX-NEXT:    movq %rax, (%rdi)
; AVX-NEXT:    movabsq $-281474976645121, %rax ## imm = 0xFFFF00000000FFFF
; AVX-NEXT:    movq %rax, (%rsi)
; AVX-NEXT:    retq
body:
  %predphi = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -3, i16 545, i16 4385, i16 14807>, <4 x i16> <i16 123, i16 124, i16 125, i16 127>
  %predphi42 = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer
  store <4 x i16> %predphi, ptr %a, align 8
  store <4 x i16> %predphi42, ptr %b, align 8
  ret void
}

; Improve code coverage.
;
; When shrinking the condition used in the select to match a blend, this
; test case exercises the path where the modified node is not the root
; of the condition.

define void @test2(ptr %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
; AVX1-LABEL: test2:
; AVX1:       ## %bb.0: ## %bb
; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    movq (%rdi,%rsi,8), %rax
; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
; AVX1-NEXT:    vblendvpd %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX1-NEXT:    vmovupd %ymm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test2:
; AVX2:       ## %bb.0: ## %bb
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT:    movq (%rdi,%rsi,8), %rax
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
; AVX2-NEXT:    vmovupd %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test2:
; AVX512:       ## %bb.0: ## %bb
; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1
; AVX512-NEXT:    movq (%rdi,%rsi,8), %rax
; AVX512-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
; AVX512-NEXT:    vbroadcastsd {{.*#+}} ymm0 {%k1} = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmovupd %ymm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
bb:
  %arrayidx1928 = getelementptr inbounds ptr, ptr %call1559, i64 %indvars.iv4198
  %tmp1888 = load ptr, ptr %arrayidx1928, align 8
  %predphi.v.v = select <4 x i1> %tmp1895, <4 x double> <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
  store <4 x double> %predphi.v.v, ptr %tmp1888, align 8
  ret void
}

; For this test, we used to optimize the conditional mask for the blend, i.e.,
; we shrank some of its bits.
; However, this same mask was used in another select (%predphi31) that turned out
; to be optimized into an and. In that case, the shrunk conditional mask was wrong.
;
; Make sure that the and is fed by the original mask.
;
; <rdar://problem/18819506>

define void @test3(<4 x i32> %induction30, ptr %tmp16, ptr %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) {
; AVX1-LABEL: test3:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmovq %xmm0, (%rdi)
; AVX1-NEXT:    vmovq %xmm1, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test3:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2863311531,2863311531,2863311531,2863311531]
; AVX2-NEXT:    vpmulld %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [715827882,715827882,715827882,715827882]
; AVX2-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1431655764,1431655764,1431655764,1431655764]
; AVX2-NEXT:    vpminud %xmm3, %xmm0, %xmm3
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vmovq %xmm0, (%rdi)
; AVX2-NEXT:    vmovq %xmm1, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test3:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    vpcmpleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512-NEXT:    vpternlogq $226, %xmm2, %xmm0, %xmm1
; AVX512-NEXT:    vmovq %xmm0, (%rdi)
; AVX512-NEXT:    vmovq %xmm1, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %tmp6 = srem <4 x i32> %induction30, <i32 3, i32 3, i32 3, i32 3>
  %tmp7 = icmp eq <4 x i32> %tmp6, zeroinitializer
  %predphi = select <4 x i1> %tmp7, <4 x i16> %tmp3, <4 x i16> %tmp12
  %predphi31 = select <4 x i1> %tmp7, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer

  store <4 x i16> %predphi31, ptr %tmp16, align 8
  store <4 x i16> %predphi, ptr %tmp17, align 8
  ret void
}

; We shouldn't try to lower this directly using VSELECT because we don't have
; vpblendvb in AVX1, only in AVX2. Instead, it should be expanded.

define <32 x i8> @PR22706(<32 x i1> %x) {
; AVX1-LABEL: PR22706:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsllw $7, %xmm1, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpaddb %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR22706:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: PR22706:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX512-NEXT:    vpblendvb %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512-NEXT:    retq
  %tmp = select <32 x i1> %x, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
  ret <32 x i8> %tmp
}

; Don't concat select/blendv ops if the concatenated mask isn't legal.
define void @PR59003(<2 x float> %0, <2 x float> %1, <8 x i1> %shuffle108) {
; AVX-LABEL: PR59003:
; AVX:       ## %bb.0: ## %entry
; AVX-NEXT:    .p2align 4, 0x90
; AVX-NEXT:  LBB4_1: ## %for.body.i
; AVX-NEXT:    ## =>This Inner Loop Header: Depth=1
; AVX-NEXT:    jmp LBB4_1
entry:
  br label %for.body.i

for.body.i:                                       ; preds = %for.body.i, %entry
  %2 = phi <8 x float> [ zeroinitializer, %entry ], [ %3, %for.body.i ]
  %shuffle111 = shufflevector <2 x float> %0, <2 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  %shuffle112 = shufflevector <2 x float> %1, <2 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  %3 = select <8 x i1> %shuffle108, <8 x float> %shuffle111, <8 x float> %shuffle112
  %4 = shufflevector <8 x float> zeroinitializer, <8 x float> %2, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %5 = select <8 x i1> zeroinitializer, <8 x float> zeroinitializer, <8 x float> %2
  br label %for.body.i
}

; Split a 256-bit select into two 128-bit selects when the operands are concatenated.

define void @blendv_split(ptr %p, <8 x i32> %cond, <8 x i32> %a, <8 x i32> %x, <8 x i32> %y, <8 x i32> %z, <8 x i32> %w) {
; AVX1-LABEL: blendv_split:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
; AVX1-NEXT:    vpslld %xmm2, %xmm4, %xmm5
; AVX1-NEXT:    vpslld %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; AVX1-NEXT:    vpslld %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vpslld %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vblendvps %xmm0, %xmm5, %xmm4, %xmm0
; AVX1-NEXT:    vmovups %xmm0, 16(%rdi)
; AVX1-NEXT:    vmovups %xmm1, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: blendv_split:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
; AVX2-NEXT:    vpslld %xmm2, %ymm1, %ymm2
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; AVX2-NEXT:    vpslld %xmm3, %ymm1, %ymm1
; AVX2-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT:    vmovups %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: blendv_split:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vpsrld $31, %ymm0, %ymm0
; AVX512-NEXT:    vpslld $31, %ymm0, %ymm0
; AVX512-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero
; AVX512-NEXT:    vpslld %xmm2, %ymm1, %ymm2
; AVX512-NEXT:    vpslld %xmm0, %ymm1, %ymm2 {%k1}
; AVX512-NEXT:    vmovdqu %ymm2, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %signbits = ashr <8 x i32> %cond, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %bool = trunc <8 x i32> %signbits to <8 x i1>
  %shamt1 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> zeroinitializer
  %shamt2 = shufflevector <8 x i32> %y, <8 x i32> undef, <8 x i32> zeroinitializer
  %sh1 = shl <8 x i32> %a, %shamt1
  %sh2 = shl <8 x i32> %a, %shamt2
  %sel = select <8 x i1> %bool, <8 x i32> %sh1, <8 x i32> %sh2
  store <8 x i32> %sel, ptr %p, align 4
  ret void
}

; TODO: Concatenate 128-bit pblendvb back together on AVX2+ targets (hidden by SSE __m128i bitcasts)
define <4 x i64> @vselect_concat_split_v16i8(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
; AVX1-LABEL: vselect_concat_split_v16i8:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vselect_concat_split_v16i8:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vselect_concat_split_v16i8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm4
; AVX512-NEXT:    ## kill: def $xmm1 killed $xmm1 killed $ymm1 def $ymm1
; AVX512-NEXT:    vpternlogq $226, %xmm0, %xmm2, %xmm1
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512-NEXT:    vpternlogq $226, %xmm0, %xmm3, %xmm4
; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm0
; AVX512-NEXT:    retq
  %a.bc = bitcast <4 x i64> %a to <32 x i8>
  %b.bc = bitcast <4 x i64> %b to <32 x i8>
  %c.bc = bitcast <4 x i64> %c to <32 x i8>
  %d.bc = bitcast <4 x i64> %d to <32 x i8>
  %cmp = icmp slt <32 x i8> %c.bc, %d.bc
  %a.lo = shufflevector <32 x i8> %a.bc, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.lo = shufflevector <32 x i8> %b.bc, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %cmp.lo = shufflevector <32 x i1> %cmp, <32 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %lo = select <16 x i1> %cmp.lo, <16 x i8> %b.lo, <16 x i8> %a.lo
  %a.hi = shufflevector <32 x i8> %a.bc, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %b.hi = shufflevector <32 x i8> %b.bc, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %cmp.hi = shufflevector <32 x i1> %cmp, <32 x i1> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %hi = select <16 x i1> %cmp.hi, <16 x i8> %b.hi, <16 x i8> %a.hi
  %concat = shufflevector <16 x i8> %lo, <16 x i8> %hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %result = bitcast <32 x i8> %concat to <4 x i64>
  ret <4 x i64> %result
}

; Regression test for rGea8fb3b60196
define void @vselect_concat() {
; AVX-LABEL: vselect_concat:
; AVX:       ## %bb.0: ## %entry
; AVX-NEXT:    retq
entry:
  %0 = load <8 x i32>, ptr undef
  %1 = shufflevector <8 x i32> zeroinitializer, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <8 x i32> %0, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> zeroinitializer, <4 x i32> %1, <4 x i32> %2
  %4 = shufflevector <8 x i32> zeroinitializer, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %5 = shufflevector <8 x i32> %0, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %6 = select <4 x i1> zeroinitializer, <4 x i32> %4, <4 x i32> %5
  %7 = shufflevector <4 x i32> %3, <4 x i32> %6, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i32> %7, ptr undef
  ret void
}

; Regression test for rGb5d7beeb9792
define void @vselect_concat_splat() {
; AVX1-LABEL: vselect_concat_splat:
; AVX1:       ## %bb.0: ## %entry
; AVX1-NEXT:    vmovups (%rax), %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[0,3,2,1]
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,3,2]
; AVX1-NEXT:    vmovups 16, %xmm2
; AVX1-NEXT:    vmovups 32, %xmm3
; AVX1-NEXT:    vblendps {{.*#+}} xmm4 = mem[0],xmm3[1],mem[2,3]
; AVX1-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2],xmm4[3]
; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,3,2,1]
; AVX1-NEXT:    vblendps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; AVX1-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0,3,2]
; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vcmpneqps %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vblendvps %xmm3, %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vblendvps %xmm3, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovups %xmm0, (%rax)
; AVX1-NEXT:    vmovups %xmm1, (%rax)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vselect_concat_splat:
; AVX2:       ## %bb.0: ## %entry
; AVX2-NEXT:    vmovups (%rax), %ymm0
; AVX2-NEXT:    vmovups (%rax), %xmm1
; AVX2-NEXT:    vmovaps {{.*#+}} xmm2 = [0,3,6,1]
; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT:    vpermps %ymm3, %ymm2, %ymm3
; AVX2-NEXT:    vmovaps {{.*#+}} xmm4 = [1,4,7,2]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpermps %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vmovups 0, %ymm1
; AVX2-NEXT:    vmovups 32, %xmm5
; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm1[0],ymm5[1],ymm1[2,3,4,5,6,7]
; AVX2-NEXT:    vpermps %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    vpermps %ymm1, %ymm4, %ymm1
; AVX2-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vcmpneqps %xmm4, %xmm3, %xmm4
; AVX2-NEXT:    vblendvps %xmm4, %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vblendvps %xmm4, %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovups %xmm0, (%rax)
; AVX2-NEXT:    vmovups %xmm2, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vselect_concat_splat:
; AVX512:       ## %bb.0: ## %entry
; AVX512-NEXT:    vmovups (%rax), %ymm0
; AVX512-NEXT:    vmovups (%rax), %xmm1
; AVX512-NEXT:    vmovaps {{.*#+}} ymm2 = [0,3,6,9,1,4,7,10]
; AVX512-NEXT:    vmovaps %ymm2, %ymm3
; AVX512-NEXT:    vpermi2ps %ymm1, %ymm0, %ymm3
; AVX512-NEXT:    vmovups 32, %xmm4
; AVX512-NEXT:    vmovups 0, %ymm5
; AVX512-NEXT:    vxorps %xmm6, %xmm6, %xmm6
; AVX512-NEXT:    vcmpneqps %xmm6, %xmm3, %k0
; AVX512-NEXT:    kshiftlw $4, %k0, %k1
; AVX512-NEXT:    korw %k1, %k0, %k1
; AVX512-NEXT:    vpermt2ps %ymm4, %ymm2, %ymm5
; AVX512-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
; AVX512-NEXT:    vmovaps %ymm5, %ymm0 {%k1}
; AVX512-NEXT:    vmovups %ymm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %wide.vec = load <12 x float>, ptr undef, align 1
  %strided.vec = shufflevector <12 x float> %wide.vec, <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %strided.vec29 = shufflevector <12 x float> %wide.vec, <12 x float> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %wide.vec31 = load <12 x float>, ptr null, align 1
  %strided.vec32 = shufflevector <12 x float> %wide.vec31, <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %strided.vec33 = shufflevector <12 x float> %wide.vec31, <12 x float> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %i = select i1 false, <4 x float> zeroinitializer, <4 x float> %strided.vec
  %i1 = fcmp une <4 x float> %i, zeroinitializer
  %i2 = select <4 x i1> %i1, <4 x float> %strided.vec32, <4 x float> %strided.vec
  %.v = select <4 x i1> %i1, <4 x float> %strided.vec33, <4 x float> %strided.vec29
  %.uncasted = shufflevector <4 x float> %i2, <4 x float> %.v, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %.uncasted, ptr undef, align 1
  ret void
}