; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2

; PR37428 - https://bugs.llvm.org/show_bug.cgi?id=37428
; This is a larger-than-usual regression test to verify that several backend
; transforms are working together. We want to hoist the expansion of non-uniform
; vector shifts out of a loop if we do not have real vector shift instructions.
; See test/Transforms/CodeGenPrepare/X86/vec-shift.ll for the 1st step in that
; sequence.
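
; For reference, a rough C equivalent of the first loop below (a sketch
; inferred from the IR, not taken from the bug report; FileCheck ignores
; comment lines without a check prefix, so this does not affect the test):
;
;   void vector_variable_shift_left_loop(int *arr, const char *control,
;                                        int count, int amt0, int amt1) {
;     for (int i = 0; i < count; ++i)
;       arr[i] <<= (control[i] == 0) ? amt0 : amt1;
;   }
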
define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture readonly %control, i32 %count, i32 %amt0, i32 %amt1) nounwind {
; SSE-LABEL: vector_variable_shift_left_loop:
; SSE: # %bb.0: # %entry
; SSE-NEXT: testl %edx, %edx
; SSE-NEXT: jle .LBB0_9
; SSE-NEXT: # %bb.1: # %for.body.preheader
; SSE-NEXT: movl %ecx, %r9d
; SSE-NEXT: movl %edx, %eax
; SSE-NEXT: cmpl $31, %edx
; SSE-NEXT: ja .LBB0_3
; SSE-NEXT: # %bb.2:
; SSE-NEXT: xorl %edx, %edx
; SSE-NEXT: jmp .LBB0_6
; SSE-NEXT: .LBB0_3: # %vector.ph
; SSE-NEXT: movl %eax, %edx
; SSE-NEXT: andl $-32, %edx
; SSE-NEXT: movd %r9d, %xmm0
; SSE-NEXT: movd %r8d, %xmm1
; SSE-NEXT: xorl %ecx, %ecx
; SSE-NEXT: pmovzxdq {{.*#+}} xmm14 = xmm1[0],zero,xmm1[1],zero
; SSE-NEXT: pmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT: .p2align 4, 0x90
; SSE-NEXT: .LBB0_4: # %vector.body
; SSE-NEXT: # =>This Inner Loop Header: Depth=1
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: pcmpeqb %xmm1, %xmm0
; SSE-NEXT: pmovsxbd %xmm0, %xmm7
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: pmovsxbd %xmm0, %xmm0
; SSE-NEXT: pcmpeqb %xmm1, %xmm3
; SSE-NEXT: pmovsxbd %xmm3, %xmm13
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
; SSE-NEXT: pmovsxbd %xmm3, %xmm6
; SSE-NEXT: pcmpeqb %xmm1, %xmm4
; SSE-NEXT: pmovsxbd %xmm4, %xmm11
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,2,3]
; SSE-NEXT: pmovsxbd %xmm3, %xmm2
; SSE-NEXT: pcmpeqb %xmm1, %xmm5
; SSE-NEXT: pmovsxbd %xmm5, %xmm8
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,2,3]
; SSE-NEXT: pmovsxbd %xmm3, %xmm9
; SSE-NEXT: movdqu 16(%rdi,%rcx,4), %xmm3
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: pslld %xmm15, %xmm4
; SSE-NEXT: pslld %xmm14, %xmm3
; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm3
; SSE-NEXT: movdqu (%rdi,%rcx,4), %xmm10
; SSE-NEXT: movdqa %xmm10, %xmm5
; SSE-NEXT: pslld %xmm15, %xmm5
; SSE-NEXT: pslld %xmm14, %xmm10
; SSE-NEXT: movdqa %xmm7, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm10
; SSE-NEXT: movdqu 48(%rdi,%rcx,4), %xmm12
; SSE-NEXT: movdqa %xmm12, %xmm5
; SSE-NEXT: pslld %xmm15, %xmm5
; SSE-NEXT: pslld %xmm14, %xmm12
; SSE-NEXT: movdqa %xmm6, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm12
; SSE-NEXT: movdqu 32(%rdi,%rcx,4), %xmm6
; SSE-NEXT: movdqa %xmm6, %xmm5
; SSE-NEXT: pslld %xmm15, %xmm5
; SSE-NEXT: pslld %xmm14, %xmm6
; SSE-NEXT: movdqa %xmm13, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm6
; SSE-NEXT: movdqu 80(%rdi,%rcx,4), %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm5
; SSE-NEXT: pslld %xmm15, %xmm5
; SSE-NEXT: pslld %xmm14, %xmm1
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm5, %xmm1
; SSE-NEXT: movdqu 64(%rdi,%rcx,4), %xmm5
; SSE-NEXT: movdqa %xmm5, %xmm2
; SSE-NEXT: pslld %xmm15, %xmm2
; SSE-NEXT: pslld %xmm14, %xmm5
; SSE-NEXT: movdqa %xmm11, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm2, %xmm5
; SSE-NEXT: movdqu 112(%rdi,%rcx,4), %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: pslld %xmm15, %xmm4
; SSE-NEXT: pslld %xmm14, %xmm2
; SSE-NEXT: movdqa %xmm9, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm4, %xmm2
; SSE-NEXT: movdqu 96(%rdi,%rcx,4), %xmm4
; SSE-NEXT: movdqa %xmm4, %xmm7
; SSE-NEXT: pslld %xmm15, %xmm7
; SSE-NEXT: pslld %xmm14, %xmm4
; SSE-NEXT: movdqa %xmm8, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm7, %xmm4
; SSE-NEXT: movups %xmm10, (%rdi,%rcx,4)
; SSE-NEXT: movups %xmm3, 16(%rdi,%rcx,4)
; SSE-NEXT: movups %xmm6, 32(%rdi,%rcx,4)
; SSE-NEXT: movups %xmm12, 48(%rdi,%rcx,4)
; SSE-NEXT: movups %xmm5, 64(%rdi,%rcx,4)
; SSE-NEXT: movups %xmm1, 80(%rdi,%rcx,4)
; SSE-NEXT: movups %xmm4, 96(%rdi,%rcx,4)
; SSE-NEXT: movups %xmm2, 112(%rdi,%rcx,4)
; SSE-NEXT: addq $32, %rcx
; SSE-NEXT: cmpq %rcx, %rdx
; SSE-NEXT: jne .LBB0_4
; SSE-NEXT: # %bb.5: # %middle.block
; SSE-NEXT: cmpq %rax, %rdx
; SSE-NEXT: jne .LBB0_6
; SSE-NEXT: .LBB0_9: # %for.cond.cleanup
; SSE-NEXT: retq
; SSE-NEXT: .p2align 4, 0x90
; SSE-NEXT: .LBB0_8: # %for.body
; SSE-NEXT: # in Loop: Header=BB0_6 Depth=1
; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
; SSE-NEXT: shll %cl, (%rdi,%rdx,4)
; SSE-NEXT: incq %rdx
; SSE-NEXT: cmpq %rdx, %rax
; SSE-NEXT: je .LBB0_9
; SSE-NEXT: .LBB0_6: # %for.body
; SSE-NEXT: # =>This Inner Loop Header: Depth=1
; SSE-NEXT: cmpb $0, (%rsi,%rdx)
; SSE-NEXT: movl %r9d, %ecx
; SSE-NEXT: je .LBB0_8
; SSE-NEXT: # %bb.7: # %for.body
; SSE-NEXT: # in Loop: Header=BB0_6 Depth=1
; SSE-NEXT: movl %r8d, %ecx
; SSE-NEXT: jmp .LBB0_8
;
; AVX1-LABEL: vector_variable_shift_left_loop:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: subq $24, %rsp
; AVX1-NEXT: testl %edx, %edx
; AVX1-NEXT: jle .LBB0_9
; AVX1-NEXT: # %bb.1: # %for.body.preheader
; AVX1-NEXT: movl %ecx, %r9d
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: cmpl $31, %edx
; AVX1-NEXT: ja .LBB0_3
; AVX1-NEXT: # %bb.2:
; AVX1-NEXT: xorl %edx, %edx
; AVX1-NEXT: jmp .LBB0_6
; AVX1-NEXT: .LBB0_3: # %vector.ph
; AVX1-NEXT: movl %eax, %edx
; AVX1-NEXT: andl $-32, %edx
; AVX1-NEXT: vmovd %r9d, %xmm0
; AVX1-NEXT: vmovd %r8d, %xmm1
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpxor %xmm11, %xmm11, %xmm11
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB0_4: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: # xmm1 = mem[0],zero,mem[1],zero
; AVX1-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: # xmm2 = mem[0],zero,mem[1],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero
; AVX1-NEXT: vpcmpeqb %xmm11, %xmm3, %xmm3
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm7
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqb %xmm11, %xmm4, %xmm4
; AVX1-NEXT: vpmovsxbd %xmm4, %xmm8
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4
; AVX1-NEXT: vpcmpeqb %xmm11, %xmm5, %xmm5
; AVX1-NEXT: vmovdqu (%rdi,%rcx,4), %xmm9
; AVX1-NEXT: vpslld %xmm2, %xmm9, %xmm10
; AVX1-NEXT: vpslld %xmm1, %xmm9, %xmm0
; AVX1-NEXT: vblendvps %xmm7, %xmm10, %xmm0, %xmm9
; AVX1-NEXT: vpmovsxbd %xmm5, %xmm7
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5
; AVX1-NEXT: vpcmpeqb %xmm11, %xmm6, %xmm6
; AVX1-NEXT: vmovdqu 16(%rdi,%rcx,4), %xmm0
; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxbd %xmm6, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm6, %xmm6
; AVX1-NEXT: vblendvps %xmm3, %xmm2, %xmm0, %xmm10
; AVX1-NEXT: vmovdqu 32(%rdi,%rcx,4), %xmm2
; AVX1-NEXT: vpslld %xmm15, %xmm2, %xmm3
; AVX1-NEXT: vpslld %xmm14, %xmm2, %xmm2
; AVX1-NEXT: vblendvps %xmm8, %xmm3, %xmm2, %xmm8
; AVX1-NEXT: vmovdqu 48(%rdi,%rcx,4), %xmm3
; AVX1-NEXT: vpslld %xmm15, %xmm3, %xmm0
; AVX1-NEXT: vpslld %xmm14, %xmm3, %xmm3
; AVX1-NEXT: vblendvps %xmm4, %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vmovdqu 64(%rdi,%rcx,4), %xmm3
; AVX1-NEXT: vpslld %xmm13, %xmm3, %xmm4
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-NEXT: vpslld %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vblendvps %xmm7, %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vmovdqu 80(%rdi,%rcx,4), %xmm4
; AVX1-NEXT: vpslld %xmm13, %xmm4, %xmm7
; AVX1-NEXT: vpslld %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vblendvps %xmm5, %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vmovdqu 96(%rdi,%rcx,4), %xmm5
; AVX1-NEXT: vpslld %xmm12, %xmm5, %xmm7
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-NEXT: vpslld %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vblendvps %xmm1, %xmm7, %xmm5, %xmm1
; AVX1-NEXT: vmovdqu 112(%rdi,%rcx,4), %xmm5
; AVX1-NEXT: vpslld %xmm12, %xmm5, %xmm7
; AVX1-NEXT: vpslld %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vblendvps %xmm6, %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vmovups %xmm9, (%rdi,%rcx,4)
; AVX1-NEXT: vmovups %xmm10, 16(%rdi,%rcx,4)
; AVX1-NEXT: vmovups %xmm8, 32(%rdi,%rcx,4)
; AVX1-NEXT: vmovups %xmm0, 48(%rdi,%rcx,4)
; AVX1-NEXT: vmovups %xmm3, 64(%rdi,%rcx,4)
; AVX1-NEXT: vmovups %xmm4, 80(%rdi,%rcx,4)
; AVX1-NEXT: vmovups %xmm1, 96(%rdi,%rcx,4)
; AVX1-NEXT: vmovups %xmm5, 112(%rdi,%rcx,4)
; AVX1-NEXT: addq $32, %rcx
; AVX1-NEXT: cmpq %rcx, %rdx
; AVX1-NEXT: jne .LBB0_4
; AVX1-NEXT: # %bb.5: # %middle.block
; AVX1-NEXT: cmpq %rax, %rdx
; AVX1-NEXT: jne .LBB0_6
; AVX1-NEXT: .LBB0_9: # %for.cond.cleanup
; AVX1-NEXT: addq $24, %rsp
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB0_8: # %for.body
; AVX1-NEXT: # in Loop: Header=BB0_6 Depth=1
; AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
; AVX1-NEXT: shll %cl, (%rdi,%rdx,4)
; AVX1-NEXT: incq %rdx
; AVX1-NEXT: cmpq %rdx, %rax
; AVX1-NEXT: je .LBB0_9
; AVX1-NEXT: .LBB0_6: # %for.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: cmpb $0, (%rsi,%rdx)
; AVX1-NEXT: movl %r9d, %ecx
; AVX1-NEXT: je .LBB0_8
; AVX1-NEXT: # %bb.7: # %for.body
; AVX1-NEXT: # in Loop: Header=BB0_6 Depth=1
; AVX1-NEXT: movl %r8d, %ecx
; AVX1-NEXT: jmp .LBB0_8
;
; AVX2-LABEL: vector_variable_shift_left_loop:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: testl %edx, %edx
; AVX2-NEXT: jle .LBB0_9
; AVX2-NEXT: # %bb.1: # %for.body.preheader
; AVX2-NEXT: movl %ecx, %r9d
; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: cmpl $31, %edx
; AVX2-NEXT: ja .LBB0_3
; AVX2-NEXT: # %bb.2:
; AVX2-NEXT: xorl %edx, %edx
; AVX2-NEXT: jmp .LBB0_6
; AVX2-NEXT: .LBB0_3: # %vector.ph
; AVX2-NEXT: movl %eax, %edx
; AVX2-NEXT: andl $-32, %edx
; AVX2-NEXT: vmovd %r9d, %xmm0
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX2-NEXT: vmovd %r8d, %xmm1
; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB0_4: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3
; AVX2-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm4, %ymm4
; AVX2-NEXT: vblendvps %ymm4, %ymm0, %ymm1, %ymm4
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm5, %ymm5
; AVX2-NEXT: vblendvps %ymm5, %ymm0, %ymm1, %ymm5
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm6, %ymm6
; AVX2-NEXT: vblendvps %ymm6, %ymm0, %ymm1, %ymm6
; AVX2-NEXT: vmovdqu (%rdi,%rcx,4), %ymm7
; AVX2-NEXT: vpsllvd %ymm3, %ymm7, %ymm3
; AVX2-NEXT: vmovdqu 32(%rdi,%rcx,4), %ymm7
; AVX2-NEXT: vpsllvd %ymm4, %ymm7, %ymm4
; AVX2-NEXT: vmovdqu 64(%rdi,%rcx,4), %ymm7
; AVX2-NEXT: vpsllvd %ymm5, %ymm7, %ymm5
; AVX2-NEXT: vmovdqu 96(%rdi,%rcx,4), %ymm7
; AVX2-NEXT: vpsllvd %ymm6, %ymm7, %ymm6
; AVX2-NEXT: vmovdqu %ymm3, (%rdi,%rcx,4)
; AVX2-NEXT: vmovdqu %ymm4, 32(%rdi,%rcx,4)
; AVX2-NEXT: vmovdqu %ymm5, 64(%rdi,%rcx,4)
; AVX2-NEXT: vmovdqu %ymm6, 96(%rdi,%rcx,4)
; AVX2-NEXT: addq $32, %rcx
; AVX2-NEXT: cmpq %rcx, %rdx
; AVX2-NEXT: jne .LBB0_4
; AVX2-NEXT: # %bb.5: # %middle.block
; AVX2-NEXT: cmpq %rax, %rdx
; AVX2-NEXT: jne .LBB0_6
; AVX2-NEXT: .LBB0_9: # %for.cond.cleanup
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB0_8: # %for.body
; AVX2-NEXT: # in Loop: Header=BB0_6 Depth=1
; AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
; AVX2-NEXT: shll %cl, (%rdi,%rdx,4)
; AVX2-NEXT: incq %rdx
; AVX2-NEXT: cmpq %rdx, %rax
; AVX2-NEXT: je .LBB0_9
; AVX2-NEXT: .LBB0_6: # %for.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: cmpb $0, (%rsi,%rdx)
; AVX2-NEXT: movl %r9d, %ecx
; AVX2-NEXT: je .LBB0_8
; AVX2-NEXT: # %bb.7: # %for.body
; AVX2-NEXT: # in Loop: Header=BB0_6 Depth=1
; AVX2-NEXT: movl %r8d, %ecx
; AVX2-NEXT: jmp .LBB0_8
entry:
  %cmp12 = icmp sgt i32 %count, 0
  br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  %wide.trip.count = zext i32 %count to i64
  %min.iters.check = icmp ult i32 %count, 32
  br i1 %min.iters.check, label %for.body.preheader40, label %vector.ph

for.body.preheader40:
  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  br label %for.body

vector.ph:
  %n.vec = and i64 %wide.trip.count, 4294967264
  %broadcast.splatinsert20 = insertelement <8 x i32> undef, i32 %amt0, i32 0
  %broadcast.splat21 = shufflevector <8 x i32> %broadcast.splatinsert20, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert22 = insertelement <8 x i32> undef, i32 %amt1, i32 0
  %broadcast.splat23 = shufflevector <8 x i32> %broadcast.splatinsert22, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert24 = insertelement <8 x i32> undef, i32 %amt0, i32 0
  %broadcast.splat25 = shufflevector <8 x i32> %broadcast.splatinsert24, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert26 = insertelement <8 x i32> undef, i32 %amt1, i32 0
  %broadcast.splat27 = shufflevector <8 x i32> %broadcast.splatinsert26, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert28 = insertelement <8 x i32> undef, i32 %amt0, i32 0
  %broadcast.splat29 = shufflevector <8 x i32> %broadcast.splatinsert28, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert30 = insertelement <8 x i32> undef, i32 %amt1, i32 0
  %broadcast.splat31 = shufflevector <8 x i32> %broadcast.splatinsert30, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert32 = insertelement <8 x i32> undef, i32 %amt0, i32 0
  %broadcast.splat33 = shufflevector <8 x i32> %broadcast.splatinsert32, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert34 = insertelement <8 x i32> undef, i32 %amt1, i32 0
  %broadcast.splat35 = shufflevector <8 x i32> %broadcast.splatinsert34, <8 x i32> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %control, i64 %index
  %1 = bitcast i8* %0 to <8 x i8>*
  %wide.load = load <8 x i8>, <8 x i8>* %1, align 1
  %2 = getelementptr inbounds i8, i8* %0, i64 8
  %3 = bitcast i8* %2 to <8 x i8>*
  %wide.load17 = load <8 x i8>, <8 x i8>* %3, align 1
  %4 = getelementptr inbounds i8, i8* %0, i64 16
  %5 = bitcast i8* %4 to <8 x i8>*
  %wide.load18 = load <8 x i8>, <8 x i8>* %5, align 1
  %6 = getelementptr inbounds i8, i8* %0, i64 24
  %7 = bitcast i8* %6 to <8 x i8>*
  %wide.load19 = load <8 x i8>, <8 x i8>* %7, align 1
  %8 = icmp eq <8 x i8> %wide.load, zeroinitializer
  %9 = icmp eq <8 x i8> %wide.load17, zeroinitializer
  %10 = icmp eq <8 x i8> %wide.load18, zeroinitializer
  %11 = icmp eq <8 x i8> %wide.load19, zeroinitializer
  %12 = select <8 x i1> %8, <8 x i32> %broadcast.splat21, <8 x i32> %broadcast.splat23
  %13 = select <8 x i1> %9, <8 x i32> %broadcast.splat25, <8 x i32> %broadcast.splat27
  %14 = select <8 x i1> %10, <8 x i32> %broadcast.splat29, <8 x i32> %broadcast.splat31
  %15 = select <8 x i1> %11, <8 x i32> %broadcast.splat33, <8 x i32> %broadcast.splat35
  %16 = getelementptr inbounds i32, i32* %arr, i64 %index
  %17 = bitcast i32* %16 to <8 x i32>*
  %wide.load36 = load <8 x i32>, <8 x i32>* %17, align 4
  %18 = getelementptr inbounds i32, i32* %16, i64 8
  %19 = bitcast i32* %18 to <8 x i32>*
  %wide.load37 = load <8 x i32>, <8 x i32>* %19, align 4
  %20 = getelementptr inbounds i32, i32* %16, i64 16
  %21 = bitcast i32* %20 to <8 x i32>*
  %wide.load38 = load <8 x i32>, <8 x i32>* %21, align 4
  %22 = getelementptr inbounds i32, i32* %16, i64 24
  %23 = bitcast i32* %22 to <8 x i32>*
  %wide.load39 = load <8 x i32>, <8 x i32>* %23, align 4
  %24 = shl <8 x i32> %wide.load36, %12
  %25 = shl <8 x i32> %wide.load37, %13
  %26 = shl <8 x i32> %wide.load38, %14
  %27 = shl <8 x i32> %wide.load39, %15
  %28 = bitcast i32* %16 to <8 x i32>*
  store <8 x i32> %24, <8 x i32>* %28, align 4
  %29 = bitcast i32* %18 to <8 x i32>*
  store <8 x i32> %25, <8 x i32>* %29, align 4
  %30 = bitcast i32* %20 to <8 x i32>*
  store <8 x i32> %26, <8 x i32>* %30, align 4
  %31 = bitcast i32* %22 to <8 x i32>*
  store <8 x i32> %27, <8 x i32>* %31, align 4
  %index.next = add i64 %index, 32
  %32 = icmp eq i64 %index.next, %n.vec
  br i1 %32, label %middle.block, label %vector.body

middle.block:
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader40

for.cond.cleanup:
  ret void

for.body:
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader40 ]
  %arrayidx = getelementptr inbounds i8, i8* %control, i64 %indvars.iv
  %33 = load i8, i8* %arrayidx, align 1
  %tobool = icmp eq i8 %33, 0
  %cond = select i1 %tobool, i32 %amt0, i32 %amt1
  %arrayidx2 = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
  %34 = load i32, i32* %arrayidx2, align 4
  %shl = shl i32 %34, %cond
  store i32 %shl, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
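
; A simpler variant of the same pattern: the value being shifted is a
; loop-invariant splat of %x, so without real variable-shift instructions
; (SSE/AVX1) both candidate shift results are computed once outside the loop
; and only a blend remains inside; with AVX2 the selected amount feeds a
; single vpsllvd. A rough C equivalent (a sketch inferred from the IR;
; FileCheck ignores these comment lines):
;
;   void vector_variable_shift_left_loop_simpler(int *arr, const char *control,
;                                                int count, int amt0, int amt1,
;                                                int x) {
;     for (int i = 0; i < count; ++i)
;       arr[i] = x << ((control[i] == 0) ? amt0 : amt1);
;   }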
define void @vector_variable_shift_left_loop_simpler(i32* nocapture %arr, i8* nocapture readonly %control, i32 %count, i32 %amt0, i32 %amt1, i32 %x) nounwind {
; SSE-LABEL: vector_variable_shift_left_loop_simpler:
; SSE: # %bb.0: # %entry
; SSE-NEXT: testl %edx, %edx
; SSE-NEXT: jle .LBB1_3
; SSE-NEXT: # %bb.1: # %vector.ph
; SSE-NEXT: movl %edx, %eax
; SSE-NEXT: andl $-4, %eax
; SSE-NEXT: movd %ecx, %xmm0
; SSE-NEXT: movd %r8d, %xmm2
; SSE-NEXT: movd %r9d, %xmm3
; SSE-NEXT: xorl %ecx, %ecx
; SSE-NEXT: pslld $23, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: pmulld %xmm3, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; SSE-NEXT: pslld $23, %xmm2
; SSE-NEXT: paddd %xmm4, %xmm2
; SSE-NEXT: cvttps2dq %xmm2, %xmm0
; SSE-NEXT: pmulld %xmm3, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
; SSE-NEXT: pxor %xmm3, %xmm3
; SSE-NEXT: .p2align 4, 0x90
; SSE-NEXT: .LBB1_2: # %vector.body
; SSE-NEXT: # =>This Inner Loop Header: Depth=1
; SSE-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE-NEXT: pcmpeqd %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm2, %xmm4
; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm4
; SSE-NEXT: movups %xmm4, (%rdi,%rcx,4)
; SSE-NEXT: addq $4, %rcx
; SSE-NEXT: cmpq %rcx, %rax
; SSE-NEXT: jne .LBB1_2
; SSE-NEXT: .LBB1_3: # %exit
; SSE-NEXT: retq
;
; AVX1-LABEL: vector_variable_shift_left_loop_simpler:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: testl %edx, %edx
; AVX1-NEXT: jle .LBB1_3
; AVX1-NEXT: # %bb.1: # %vector.ph
; AVX1-NEXT: movl %edx, %eax
; AVX1-NEXT: andl $-4, %eax
; AVX1-NEXT: vmovd %ecx, %xmm0
; AVX1-NEXT: vmovd %r8d, %xmm1
; AVX1-NEXT: vmovd %r9d, %xmm2
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: vpslld $23, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB1_2: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vblendvps %xmm3, %xmm0, %xmm1, %xmm3
; AVX1-NEXT: vmovups %xmm3, (%rdi,%rcx,4)
; AVX1-NEXT: addq $4, %rcx
; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB1_2
; AVX1-NEXT: .LBB1_3: # %exit
; AVX1-NEXT: retq
;
; AVX2-LABEL: vector_variable_shift_left_loop_simpler:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: testl %edx, %edx
; AVX2-NEXT: jle .LBB1_3
; AVX2-NEXT: # %bb.1: # %vector.ph
; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: andl $-4, %eax
; AVX2-NEXT: vmovd %ecx, %xmm0
; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT: vmovd %r8d, %xmm1
; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-NEXT: vmovd %r9d, %xmm2
; AVX2-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB1_2: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm4, %xmm4
; AVX2-NEXT: vblendvps %xmm4, %xmm0, %xmm1, %xmm4
; AVX2-NEXT: vpsllvd %xmm4, %xmm2, %xmm4
; AVX2-NEXT: vmovdqu %xmm4, (%rdi,%rcx,4)
; AVX2-NEXT: addq $4, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB1_2
; AVX2-NEXT: .LBB1_3: # %exit
; AVX2-NEXT: retq
entry:
  %cmp16 = icmp sgt i32 %count, 0
  %wide.trip.count = zext i32 %count to i64
  br i1 %cmp16, label %vector.ph, label %exit

vector.ph:
  %n.vec = and i64 %wide.trip.count, 4294967292
  %splatinsert18 = insertelement <4 x i32> undef, i32 %amt0, i32 0
  %splat1 = shufflevector <4 x i32> %splatinsert18, <4 x i32> undef, <4 x i32> zeroinitializer
  %splatinsert20 = insertelement <4 x i32> undef, i32 %amt1, i32 0
  %splat2 = shufflevector <4 x i32> %splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer
  %splatinsert22 = insertelement <4 x i32> undef, i32 %x, i32 0
  %splat3 = shufflevector <4 x i32> %splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %control, i64 %index
  %1 = bitcast i8* %0 to <4 x i8>*
  %wide.load = load <4 x i8>, <4 x i8>* %1, align 1
  %2 = icmp eq <4 x i8> %wide.load, zeroinitializer
  %3 = select <4 x i1> %2, <4 x i32> %splat1, <4 x i32> %splat2
  %4 = shl <4 x i32> %splat3, %3
  %5 = getelementptr inbounds i32, i32* %arr, i64 %index
  %6 = bitcast i32* %5 to <4 x i32>*
  store <4 x i32> %4, <4 x i32>* %6, align 4
  %index.next = add i64 %index, 4
  %7 = icmp eq i64 %index.next, %n.vec
  br i1 %7, label %exit, label %vector.body

exit:
  ret void
}