; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64

; Check that under certain conditions we can factor out a rotate
; from the following idioms:
;   (a*c0) >> s1 | (a*c1)
;   (a/c0) << s1 | (a/c1)
; This targets cases where instcombine has folded a shl/srl/mul/udiv
; with one of the shifts from the rotate idiom
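
; For example, in @vroll_v4i32_extract_shl below, with a = (i << 3):
;   ((i << 3) >> 25) | (i << 10)  ==  (a >> 25) | (a << 7)  ==  rotl(a, 7)
; because 25 + 7 == 32, so a single shift plus a rotate (vpslld + vprold)
; replaces the shl/shl/lshr/or sequence.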
define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) {
; CHECK-LABEL: vroll_v4i32_extract_shl:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $3, %xmm0, %xmm0
; CHECK-NEXT: vprold $7, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %lhs_mul = shl <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
  %rhs_mul = shl <4 x i32> %i, <i32 10, i32 10, i32 10, i32 10>
  %lhs_shift = lshr <4 x i32> %lhs_mul, <i32 25, i32 25, i32 25, i32 25>
  %out = or <4 x i32> %lhs_shift, %rhs_mul
  ret <4 x i32> %out
}
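
; With a = (i >> 5): (i >> 40) | ((i >> 5) << 29) == (a >> 35) | (a << 29),
; and 35 + 29 == 64, so the rotate is extracted as vpsrlq $5 + vprolq $29.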
define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind {
; CHECK-LABEL: vrolq_v4i64_extract_shrl:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlq $5, %ymm0, %ymm0
; CHECK-NEXT: vprolq $29, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %lhs_div = lshr <4 x i64> %i, <i64 40, i64 40, i64 40, i64 40>
  %rhs_div = lshr <4 x i64> %i, <i64 5, i64 5, i64 5, i64 5>
  %rhs_shift = shl <4 x i64> %rhs_div, <i64 29, i64 29, i64 29, i64 29>
  %out = or <4 x i64> %lhs_div, %rhs_shift
  ret <4 x i64> %out
}
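
; 640 == 10 << 6, so with a = (i * 10): (i * 640) | ((i * 10) >> 26)
; == (a << 6) | (a >> 26) == rotl(a, 6): one vpmulld plus one vprold $6.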
define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind {
; X86-LABEL: vroll_extract_mul:
; X86: # %bb.0:
; X86-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
; X86-NEXT: vprold $6, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: vroll_extract_mul:
; X64: # %bb.0:
; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; X64-NEXT: vprold $6, %ymm0, %ymm0
; X64-NEXT: retq
  %lhs_mul = mul <8 x i32> %i, <i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640>
  %rhs_mul = mul <8 x i32> %i, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26>
  %out = or <8 x i32> %lhs_mul, %rhs_shift
  ret <8 x i32> %out
}
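
; 384 == 3 << 7, so with a = (i udiv 3): ((i udiv 3) << 57) | (i udiv 384)
; == (a << 57) | (a >> 7) == rotl(a, 57). The udiv by 3 itself lowers to a
; multiply-high by the magic constant 0xAAAAAAAAAAAAAAAB plus a shift on
; x86-64, and to per-element __udivdi3 libcalls on i686.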
define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: vrolq_extract_udiv:
; X86: # %bb.0:
; X86-NEXT: subl $32, %esp
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $3, {{[0-9]+}}(%esp)
; X86-NEXT: calll __udivdi3
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vextractps $2, %xmm0, (%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $3, {{[0-9]+}}(%esp)
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __udivdi3
; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT: vprolq $57, %xmm0, %xmm0
; X86-NEXT: addl $32, %esp
; X86-NEXT: retl
;
; X64-LABEL: vrolq_extract_udiv:
; X64: # %bb.0:
; X64-NEXT: vpextrq $1, %xmm0, %rax
; X64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT: mulq %rcx
; X64-NEXT: vmovq %rdx, %xmm1
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: mulq %rcx
; X64-NEXT: vmovq %rdx, %xmm0
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: vpsrlq $1, %xmm0, %xmm0
; X64-NEXT: vprolq $57, %xmm0, %xmm0
; X64-NEXT: retq
  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
  %rhs_div = udiv <2 x i64> %i, <i64 384, i64 384>
  %lhs_shift = shl <2 x i64> %lhs_div, <i64 57, i64 57>
  %out = or <2 x i64> %lhs_shift, %rhs_div
  ret <2 x i64> %out
}
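
; 1152 == 9 << 7, so both sides share a = (i * 9) and rotl(a, 7) is still
; extracted despite the mask: the and is applied after the rotate as a
; single vpandd, and i * 9 is lowered as i + (i << 3).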
define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind {
; X86-LABEL: vrolw_extract_mul_with_mask:
; X86: # %bb.0:
; X86-NEXT: vpslld $3, %xmm0, %xmm1
; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X86-NEXT: vprold $7, %xmm0, %xmm0
; X86-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: vrolw_extract_mul_with_mask:
; X64: # %bb.0:
; X64-NEXT: vpslld $3, %xmm0, %xmm1
; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X64-NEXT: vprold $7, %xmm0, %xmm0
; X64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-NEXT: retq
  %lhs_mul = mul <4 x i32> %i, <i32 1152, i32 1152, i32 1152, i32 1152>
  %rhs_mul = mul <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
  %lhs_and = and <4 x i32> %lhs_mul, <i32 160, i32 160, i32 160, i32 160>
  %rhs_shift = lshr <4 x i32> %rhs_mul, <i32 25, i32 25, i32 25, i32 25>
  %out = or <4 x i32> %lhs_and, %rhs_shift
  ret <4 x i32> %out
}
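
; 640 == 10 << 6 factors cleanly, but there is no legal rotate for v32i16
; (AVX512 only provides vprold/vprolq, i.e. 32/64-bit elements), so the
; idiom stays as a multiply plus two shifts and an or.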
define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind {
; X86-LABEL: illegal_no_extract_mul:
; X86: # %bb.0:
; X86-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 # [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; X86-NEXT: vpsrlw $10, %zmm0, %zmm1
; X86-NEXT: vpsllw $6, %zmm0, %zmm0
; X86-NEXT: vporq %zmm1, %zmm0, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: illegal_no_extract_mul:
; X64: # %bb.0:
; X64-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; X64-NEXT: vpsrlw $10, %zmm0, %zmm1
; X64-NEXT: vpsllw $6, %zmm0, %zmm0
; X64-NEXT: vporq %zmm1, %zmm0, %zmm0
; X64-NEXT: retq
  %lhs_mul = mul <32 x i16> %i, <i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640>
  %rhs_mul = mul <32 x i16> %i, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %rhs_shift = lshr <32 x i16> %rhs_mul, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %out = or <32 x i16> %lhs_mul, %rhs_shift
  ret <32 x i16> %out
}

; Result would undershift
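; With a = (i << 11): ((i << 11) >> 50) | (i << 24) == (a >> 50) | (a << 13),
; and 50 + 13 == 63 != 64, so this is not a rotate.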
define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
; X86-LABEL: no_extract_shl:
; X86: # %bb.0:
; X86-NEXT: vpsllq $24, %ymm0, %ymm1
; X86-NEXT: vpsrlq $39, %ymm0, %ymm0
; X86-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: no_extract_shl:
; X64: # %bb.0:
; X64-NEXT: vpsllq $24, %ymm0, %ymm1
; X64-NEXT: vpsrlq $39, %ymm0, %ymm0
; X64-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
; X64-NEXT: retq
  %lhs_mul = shl <4 x i64> %i, <i64 11, i64 11, i64 11, i64 11>
  %rhs_mul = shl <4 x i64> %i, <i64 24, i64 24, i64 24, i64 24>
  %lhs_shift = lshr <4 x i64> %lhs_mul, <i64 50, i64 50, i64 50, i64 50>
  %out = or <4 x i64> %lhs_shift, %rhs_mul
  ret <4 x i64> %out
}

; Result would overshift
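; With a = (i >> 3): ((i >> 3) << 28) | (i >> 9) == (a << 28) | (a >> 6),
; and 28 + 6 == 34 > 32, so this is not a rotate.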
define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind {
; X86-LABEL: no_extract_shrl:
; X86: # %bb.0:
; X86-NEXT: vpsrld $9, %xmm0, %xmm1
; X86-NEXT: vpslld $25, %xmm0, %xmm0
; X86-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: no_extract_shrl:
; X64: # %bb.0:
; X64-NEXT: vpsrld $9, %xmm0, %xmm1
; X64-NEXT: vpslld $25, %xmm0, %xmm0
; X64-NEXT: vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
; X64-NEXT: retq
  %lhs_div = lshr <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
  %rhs_div = lshr <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
  %lhs_shift = shl <4 x i32> %lhs_div, <i32 28, i32 28, i32 28, i32 28>
  %out = or <4 x i32> %lhs_shift, %rhs_div
  ret <4 x i32> %out
}

; Can factor 512 from 1536, but result is 3 instead of 9
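; 1536 == 3 << 9: factoring out the power of two leaves a multiplier of 3,
; but the rhs is built from i * 9, so no common value can be extracted.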
define <8 x i32> @no_extract_mul(<8 x i32> %i) nounwind {
; X86-LABEL: no_extract_mul:
; X86: # %bb.0:
; X86-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm1
; X86-NEXT: vpslld $3, %ymm0, %ymm2
; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; X86-NEXT: vpsrld $23, %ymm0, %ymm0
; X86-NEXT: vpor %ymm0, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: no_extract_mul:
; X64: # %bb.0:
; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
; X64-NEXT: vpslld $3, %ymm0, %ymm2
; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; X64-NEXT: vpsrld $23, %ymm0, %ymm0
; X64-NEXT: vpor %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
  %lhs_mul = mul <8 x i32> %i, <i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536>
  %rhs_mul = mul <8 x i32> %i, <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %out = or <8 x i32> %lhs_mul, %rhs_shift
  ret <8 x i32> %out
}

; Can't evenly factor 256 from 770
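; A rotate would require 770 == 3 << 8 == 768, which does not hold, so both
; udivs are lowered independently (magic-number multiplies on x86-64).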
define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: no_extract_udiv:
; X86: # %bb.0:
; X86-NEXT: subl $48, %esp
; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $3, {{[0-9]+}}(%esp)
; X86-NEXT: calll __udivdi3
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vextractps $2, %xmm0, (%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $3, {{[0-9]+}}(%esp)
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __udivdi3
; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $770, {{[0-9]+}}(%esp) # imm = 0x302
; X86-NEXT: calll __udivdi3
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: vextractps $2, %xmm0, (%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $770, {{[0-9]+}}(%esp) # imm = 0x302
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __udivdi3
; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT: vpsllq $56, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: addl $48, %esp
; X86-NEXT: retl
;
; X64-LABEL: no_extract_udiv:
; X64: # %bb.0:
; X64-NEXT: vpextrq $1, %xmm0, %rcx
; X64-NEXT: movabsq $-6148914691236517205, %rdi # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdi
; X64-NEXT: vmovq %rdx, %xmm1
; X64-NEXT: vmovq %xmm0, %rsi
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %rdi
; X64-NEXT: vmovq %rdx, %xmm0
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: vpsrlq $1, %xmm0, %xmm0
; X64-NEXT: movabsq $-6180857105216966645, %rdi # imm = 0xAA392F35DC17F00B
; X64-NEXT: movq %rcx, %rax
; X64-NEXT: mulq %rdi
; X64-NEXT: vmovq %rdx, %xmm1
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: mulq %rdi
; X64-NEXT: vmovq %rdx, %xmm2
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT: vpsrlq $9, %xmm1, %xmm1
; X64-NEXT: vpsllq $56, %xmm0, %xmm0
; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
  %rhs_div = udiv <2 x i64> %i, <i64 770, i64 770>
  %lhs_shift = shl <2 x i64> %lhs_div, <i64 56, i64 56>
  %out = or <2 x i64> %lhs_shift, %rhs_div
  ret <2 x i64> %out
}

; DAGCombiner transforms shl X, 1 into add X, X.
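; The rotate matcher therefore has to treat (add x, x) as (shl x, 1):
; (x << 1) | (x >> 31) == rotl(x, 1), which lowers to a single vprold $1.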
define <4 x i32> @extract_add_1(<4 x i32> %i) nounwind {
; CHECK-LABEL: extract_add_1:
; CHECK: # %bb.0:
; CHECK-NEXT: vprold $1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %ii = add <4 x i32> %i, %i
  %rhs = lshr <4 x i32> %i, <i32 31, i32 31, i32 31, i32 31>
  %out = or <4 x i32> %ii, %rhs
  ret <4 x i32> %out
}
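
; Same as above with the operands of the or commuted; still folds to vprold $1.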
define <4 x i32> @extract_add_1_comut(<4 x i32> %i) nounwind {
; CHECK-LABEL: extract_add_1_comut:
; CHECK: # %bb.0:
; CHECK-NEXT: vprold $1, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %ii = add <4 x i32> %i, %i
  %lhs = lshr <4 x i32> %i, <i32 31, i32 31, i32 31, i32 31>
  %out = or <4 x i32> %lhs, %ii
  ret <4 x i32> %out
}
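
; Negative version: 1 + 27 == 28 != 32, so (i << 1) | (i >> 27) is not a
; rotate and stays as add + shift + or.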
define <4 x i32> @no_extract_add_1(<4 x i32> %i) nounwind {
; CHECK-LABEL: no_extract_add_1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm1
; CHECK-NEXT: vpsrld $27, %xmm0, %xmm0
; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %ii = add <4 x i32> %i, %i
  %rhs = lshr <4 x i32> %i, <i32 27, i32 27, i32 27, i32 27>
  %out = or <4 x i32> %ii, %rhs
  ret <4 x i32> %out
}