1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86
3 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64
5 ; Check that under certain conditions we can factor out a rotate
6 ; from the following idioms:
7 ; (a*c0) >> s1 | (a*c1)
8 ; (a/c0) << s1 | (a/c1)
9 ; This targets cases where instcombine has folded a shl/srl/mul/udiv
10 ; with one of the shifts from the rotate idiom.
; Positive case built from shifts: %rhs_mul (i << 10) is %lhs_mul (i << 3)
; shifted left by a further 7, and the lshr amount 25 is 32 - 7, so %out is
; (i << 3) rotated left by 7. The expected output is one vpslld $3 followed
; by one vprold $7.
12 define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) {
13 ; CHECK-LABEL: vroll_v4i32_extract_shl:
15 ; CHECK-NEXT: vpslld $3, %xmm0, %xmm0
16 ; CHECK-NEXT: vprold $7, %zmm0, %zmm0
17 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
18 ; CHECK-NEXT: vzeroupper
19 ; CHECK-NEXT: ret{{[l|q]}}
20 %lhs_mul = shl <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
21 %rhs_mul = shl <4 x i32> %i, <i32 10, i32 10, i32 10, i32 10>
22 %lhs_shift = lshr <4 x i32> %lhs_mul, <i32 25, i32 25, i32 25, i32 25>
23 %out = or <4 x i32> %lhs_shift, %rhs_mul
; Positive case built from logical right shifts: %lhs_div (i >> 40) is
; %rhs_div (i >> 5) shifted right by a further 35, and the shl amount 29 is
; 64 - 35, so %out is (i >> 5) rotated left by 29. The expected output is one
; vpsrlq $5 followed by one vprolq $29.
27 define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind {
28 ; CHECK-LABEL: vrolq_v4i64_extract_shrl:
30 ; CHECK-NEXT: vpsrlq $5, %ymm0, %ymm0
31 ; CHECK-NEXT: vprolq $29, %zmm0, %zmm0
32 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
33 ; CHECK-NEXT: ret{{[l|q]}}
34 %lhs_div = lshr <4 x i64> %i, <i64 40, i64 40, i64 40, i64 40>
35 %rhs_div = lshr <4 x i64> %i, <i64 5, i64 5, i64 5, i64 5>
36 %rhs_shift = shl <4 x i64> %rhs_div, <i64 29, i64 29, i64 29, i64 29>
37 %out = or <4 x i64> %lhs_div, %rhs_shift
; Positive case built from multiplies: 640 is 10 * 64 (10 << 6) and the lshr
; amount 26 is 32 - 6, so %out is (i * 10) rotated left by 6. The expected
; output multiplies by a broadcast 10 once and then emits one vprold $6.
41 define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind {
42 ; CHECK-LABEL: vroll_extract_mul:
44 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10]
45 ; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
46 ; CHECK-NEXT: vprold $6, %zmm0, %zmm0
47 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
48 ; CHECK-NEXT: ret{{[l|q]}}
49 %lhs_mul = mul <8 x i32> %i, <i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640>
50 %rhs_mul = mul <8 x i32> %i, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
51 %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26>
52 %out = or <8 x i32> %lhs_mul, %rhs_shift
; Positive case built from unsigned divides: 384 is 3 * 128 (3 << 7) and the
; shl amount 57 is 64 - 7, so %out is (i / 3) rotated left by 57. Only the
; divide by 3 appears in the expected output: the i686 run passes the
; constant 3 to two __udivdi3 calls (one per 64-bit element), the x86_64 run
; loads the constant 0xAAAAAAAAAAAAAAAB and shifts right by 1. Both runs then
; finish with a single vprolq $57; the divisor 384 never shows up.
56 define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
57 ; X86-LABEL: vrolq_extract_udiv:
59 ; X86-NEXT: subl $32, %esp
60 ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
61 ; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
62 ; X86-NEXT: vmovss %xmm0, (%esp)
63 ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
64 ; X86-NEXT: movl $3, {{[0-9]+}}(%esp)
65 ; X86-NEXT: calll __udivdi3
66 ; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
67 ; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp)
68 ; X86-NEXT: vextractps $2, %xmm0, (%esp)
69 ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
70 ; X86-NEXT: movl $3, {{[0-9]+}}(%esp)
71 ; X86-NEXT: vmovd %eax, %xmm0
72 ; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
73 ; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
74 ; X86-NEXT: calll __udivdi3
75 ; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
76 ; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
77 ; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
78 ; X86-NEXT: vprolq $57, %zmm0, %zmm0
79 ; X86-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
80 ; X86-NEXT: addl $32, %esp
81 ; X86-NEXT: vzeroupper
84 ; X64-LABEL: vrolq_extract_udiv:
86 ; X64-NEXT: vpextrq $1, %xmm0, %rax
87 ; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
89 ; X64-NEXT: vmovq %rdx, %xmm1
90 ; X64-NEXT: vmovq %xmm0, %rax
92 ; X64-NEXT: vmovq %rdx, %xmm0
93 ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
94 ; X64-NEXT: vpsrlq $1, %xmm0, %xmm0
95 ; X64-NEXT: vprolq $57, %zmm0, %zmm0
96 ; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
97 ; X64-NEXT: vzeroupper
99 %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
100 %rhs_div = udiv <2 x i64> %i, <i64 384, i64 384>
101 %lhs_shift = shl <2 x i64> %lhs_div, <i64 57, i64 57>
102 %out = or <2 x i64> %lhs_shift, %rhs_div
; Positive case with an intervening mask: 1152 is 9 * 128 (9 << 7) and the
; lshr amount 25 is 32 - 7, but the multiplied-by-1152 side is ANDed with 160
; before the final or. The expected output still multiplies by a broadcast 9
; once and emits one vprold $7, then applies a vpand whose mask operand comes
; from the constant pool (its value is not spelled out in the checks). The two
; runs differ only in how that constant is addressed (absolute vs. %rip).
106 define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind {
107 ; X86-LABEL: vrolw_extract_mul_with_mask:
109 ; X86-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
110 ; X86-NEXT: vpmulld %xmm1, %xmm0, %xmm0
111 ; X86-NEXT: vprold $7, %zmm0, %zmm0
112 ; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
113 ; X86-NEXT: vzeroupper
116 ; X64-LABEL: vrolw_extract_mul_with_mask:
118 ; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
119 ; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
120 ; X64-NEXT: vprold $7, %zmm0, %zmm0
121 ; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
122 ; X64-NEXT: vzeroupper
124 %lhs_mul = mul <4 x i32> %i, <i32 1152, i32 1152, i32 1152, i32 1152>
125 %rhs_mul = mul <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
126 %lhs_and = and <4 x i32> %lhs_mul, <i32 160, i32 160, i32 160, i32 160>
127 %rhs_shift = lshr <4 x i32> %rhs_mul, <i32 25, i32 25, i32 25, i32 25>
128 %out = or <4 x i32> %lhs_and, %rhs_shift
; Negative case on 16-bit elements: arithmetically the pattern has the rotate
; shape (640 is 10 << 6 and the lshr amount 10 is 16 - 6), yet the expected
; output keeps both vpmullw multiplies, the vpsrlw $10 and the vporq, with no
; rotate instruction.
; NOTE(review): the "illegal" in the name suggests this is because a rotate of
; <32 x i16> is not a legal operation for this configuration - confirm.
132 define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind {
133 ; X86-LABEL: illegal_no_extract_mul:
135 ; X86-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm1
136 ; X86-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
137 ; X86-NEXT: vpsrlw $10, %zmm0, %zmm0
138 ; X86-NEXT: vporq %zmm0, %zmm1, %zmm0
141 ; X64-LABEL: illegal_no_extract_mul:
143 ; X64-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
144 ; X64-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
145 ; X64-NEXT: vpsrlw $10, %zmm0, %zmm0
146 ; X64-NEXT: vporq %zmm0, %zmm1, %zmm0
148 %lhs_mul = mul <32 x i16> %i, <i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640>
149 %rhs_mul = mul <32 x i16> %i, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
150 %rhs_shift = lshr <32 x i16> %rhs_mul, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
151 %out = or <32 x i16> %lhs_mul, %rhs_shift
155 ; Result would undershift
; Negative case: the two shl amounts differ by 13 (24 - 11), but the lshr
; amount 50 would pair with a left rotate of 64 - 50 = 14, so the halves do
; not line up into a rotate. The expected output keeps both vpsllq shifts,
; the vpsrlq $50 and the vpor.
156 define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
157 ; CHECK-LABEL: no_extract_shl:
159 ; CHECK-NEXT: vpsllq $11, %ymm0, %ymm1
160 ; CHECK-NEXT: vpsllq $24, %ymm0, %ymm0
161 ; CHECK-NEXT: vpsrlq $50, %ymm1, %ymm1
162 ; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0
163 ; CHECK-NEXT: ret{{[l|q]}}
164 %lhs_mul = shl <4 x i64> %i, <i64 11, i64 11, i64 11, i64 11>
165 %rhs_mul = shl <4 x i64> %i, <i64 24, i64 24, i64 24, i64 24>
166 %lhs_shift = lshr <4 x i64> %lhs_mul, <i64 50, i64 50, i64 50, i64 50>
167 %out = or <4 x i64> %lhs_shift, %rhs_mul
171 ; Result would overshift
; Negative case: the two lshr amounts differ by 6 (9 - 3), but the shl amount
; 28 would pair with a right shift of 32 - 28 = 4, so the halves do not line
; up into a rotate. In the expected output the (i >> 3) << 28 half appears as
; vpslld $25 masked with 4026531840 (0xF0000000), ORed with vpsrld $9.
172 define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind {
173 ; CHECK-LABEL: no_extract_shrl:
175 ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4026531840,4026531840,4026531840,4026531840]
176 ; CHECK-NEXT: vpslld $25, %xmm0, %xmm2
177 ; CHECK-NEXT: vpand %xmm1, %xmm2, %xmm1
178 ; CHECK-NEXT: vpsrld $9, %xmm0, %xmm0
179 ; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
180 ; CHECK-NEXT: ret{{[l|q]}}
181 %lhs_div = lshr <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
182 %rhs_div = lshr <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
183 %lhs_shift = shl <4 x i32> %lhs_div, <i32 28, i32 28, i32 28, i32 28>
184 %out = or <4 x i32> %lhs_shift, %rhs_div
188 ; Can factor 512 from 1536, but result is 3 instead of 9
; Negative case: the lshr amount 23 pairs with a left shift of 32 - 23 = 9,
; i.e. a factor of 512. 1536 / 512 is 3, which does not match the other
; multiplier (9), so no common operand can be rotated. The expected output
; keeps both vpmulld multiplies, the vpsrld $23 and the vpor.
189 define <8 x i32> @no_extract_mul(<8 x i32> %i) nounwind {
190 ; CHECK-LABEL: no_extract_mul:
192 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1536,1536,1536,1536,1536,1536,1536,1536]
193 ; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm1
194 ; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9]
195 ; CHECK-NEXT: vpmulld %ymm2, %ymm0, %ymm0
196 ; CHECK-NEXT: vpsrld $23, %ymm0, %ymm0
197 ; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0
198 ; CHECK-NEXT: ret{{[l|q]}}
199 %lhs_mul = mul <8 x i32> %i, <i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536>
200 %rhs_mul = mul <8 x i32> %i, <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
201 %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
202 %out = or <8 x i32> %lhs_mul, %rhs_shift
206 ; Can't evenly factor 256 from 770
; Negative case: the shl amount 56 pairs with a right shift of 64 - 56 = 8,
; i.e. a factor of 256. 3 * 256 is 768, not 770, so the second divisor is not
; the first one scaled by a power of two. The expected output performs both
; divisions (the i686 run makes four __udivdi3 calls: two with divisor 3 and
; two with divisor 770; the x86_64 run uses two different movabsq constants)
; and combines them with vpsllq $56 and vpor, with no rotate instruction.
207 define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
208 ; X86-LABEL: no_extract_udiv:
210 ; X86-NEXT: subl $48, %esp
211 ; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
212 ; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
213 ; X86-NEXT: vmovss %xmm0, (%esp)
214 ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
215 ; X86-NEXT: movl $3, {{[0-9]+}}(%esp)
216 ; X86-NEXT: calll __udivdi3
217 ; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
218 ; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp)
219 ; X86-NEXT: vextractps $2, %xmm0, (%esp)
220 ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
221 ; X86-NEXT: movl $3, {{[0-9]+}}(%esp)
222 ; X86-NEXT: vmovd %eax, %xmm0
223 ; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
224 ; X86-NEXT: calll __udivdi3
225 ; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
226 ; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
227 ; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
228 ; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
229 ; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
230 ; X86-NEXT: vmovss %xmm0, (%esp)
231 ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
232 ; X86-NEXT: movl $770, {{[0-9]+}}(%esp) # imm = 0x302
233 ; X86-NEXT: calll __udivdi3
234 ; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
235 ; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp)
236 ; X86-NEXT: vextractps $2, %xmm0, (%esp)
237 ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
238 ; X86-NEXT: movl $770, {{[0-9]+}}(%esp) # imm = 0x302
239 ; X86-NEXT: vmovd %eax, %xmm0
240 ; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
241 ; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
242 ; X86-NEXT: calll __udivdi3
243 ; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
244 ; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
245 ; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
246 ; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
247 ; X86-NEXT: vpsllq $56, %xmm1, %xmm1
248 ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
249 ; X86-NEXT: addl $48, %esp
252 ; X64-LABEL: no_extract_udiv:
254 ; X64-NEXT: vpextrq $1, %xmm0, %rcx
255 ; X64-NEXT: movabsq $-6148914691236517205, %rdi # imm = 0xAAAAAAAAAAAAAAAB
256 ; X64-NEXT: movq %rcx, %rax
257 ; X64-NEXT: mulq %rdi
258 ; X64-NEXT: vmovq %rdx, %xmm1
259 ; X64-NEXT: vmovq %xmm0, %rsi
260 ; X64-NEXT: movq %rsi, %rax
261 ; X64-NEXT: mulq %rdi
262 ; X64-NEXT: vmovq %rdx, %xmm0
263 ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
264 ; X64-NEXT: vpsrlq $1, %xmm0, %xmm0
265 ; X64-NEXT: movabsq $-6180857105216966645, %rdi # imm = 0xAA392F35DC17F00B
266 ; X64-NEXT: movq %rcx, %rax
267 ; X64-NEXT: mulq %rdi
268 ; X64-NEXT: vmovq %rdx, %xmm1
269 ; X64-NEXT: movq %rsi, %rax
270 ; X64-NEXT: mulq %rdi
271 ; X64-NEXT: vmovq %rdx, %xmm2
272 ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
273 ; X64-NEXT: vpsrlq $9, %xmm1, %xmm1
274 ; X64-NEXT: vpsllq $56, %xmm0, %xmm0
275 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
277 %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
278 %rhs_div = udiv <2 x i64> %i, <i64 770, i64 770>
279 %lhs_shift = shl <2 x i64> %lhs_div, <i64 56, i64 56>
280 %out = or <2 x i64> %lhs_shift, %rhs_div
284 ; DAGCombiner transforms shl X, 1 into add X, X.
; Positive case where the left-shift half is written as an add: %ii is i + i
; and the lshr amount 31 is 32 - 1, so %out is i rotated left by 1. The
; expected output is a single vprold $1 with no separate add or shift.
285 define <4 x i32> @extract_add_1(<4 x i32> %i) nounwind {
286 ; CHECK-LABEL: extract_add_1:
288 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
289 ; CHECK-NEXT: vprold $1, %zmm0, %zmm0
290 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
291 ; CHECK-NEXT: vzeroupper
292 ; CHECK-NEXT: ret{{[l|q]}}
293 %ii = add <4 x i32> %i, %i
294 %rhs = lshr <4 x i32> %i, <i32 31, i32 31, i32 31, i32 31>
295 %out = or <4 x i32> %ii, %rhs
; Commuted variant of the add-as-shift rotate: the same i + i and i >> 31
; halves, but with the lshr result as the first operand of the or. The
; expected output is still a single vprold $1.
299 define <4 x i32> @extract_add_1_comut(<4 x i32> %i) nounwind {
300 ; CHECK-LABEL: extract_add_1_comut:
302 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
303 ; CHECK-NEXT: vprold $1, %zmm0, %zmm0
304 ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
305 ; CHECK-NEXT: vzeroupper
306 ; CHECK-NEXT: ret{{[l|q]}}
307 %ii = add <4 x i32> %i, %i
308 %lhs = lshr <4 x i32> %i, <i32 31, i32 31, i32 31, i32 31>
309 %out = or <4 x i32> %lhs, %ii
; Negative case for the add-as-shift form: i + i is a left shift by 1, which
; would need a right shift of 32 - 1 = 31 to form a rotate, but the lshr
; amount here is 27. The expected output keeps the vpaddd, the vpsrld $27 and
; the vpor.
313 define <4 x i32> @no_extract_add_1(<4 x i32> %i) nounwind {
314 ; CHECK-LABEL: no_extract_add_1:
316 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm1
317 ; CHECK-NEXT: vpsrld $27, %xmm0, %xmm0
318 ; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
319 ; CHECK-NEXT: ret{{[l|q]}}
320 %ii = add <4 x i32> %i, %i
321 %rhs = lshr <4 x i32> %i, <i32 27, i32 27, i32 27, i32 27>
322 %out = or <4 x i32> %ii, %rhs