; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64
; Check that under certain conditions we can factor out a rotate
; from the following idioms:
;   (a*c0) >> s1 | (a*c1)
;   (a/c0) << s1 | (a/c1)
; This targets cases where instcombine has folded a shl/srl/mul/udiv
; with one of the shifts from the rotate idiom
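;
; A worked instance (editorial note, not part of the autogenerated assertions),
; using the constants from the first test below with 32-bit lanes:
;   ((a << 3) >> 25) | (a << 10)
;     = ((a << 3) >> 25) | ((a << 3) << 7)   ; since 10 = 3 + 7
;     = rotl(a << 3, 7)                      ; since 25 + 7 = 32
; so a single shift plus a single rotate should remain after isel.
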
define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) {
; CHECK-LABEL: vroll_v4i32_extract_shl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $3, %xmm0, %xmm0
; CHECK-NEXT:    vprold $7, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = shl <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
  %rhs_mul = shl <4 x i32> %i, <i32 10, i32 10, i32 10, i32 10>
  %lhs_shift = lshr <4 x i32> %lhs_mul, <i32 25, i32 25, i32 25, i32 25>
  %out = or <4 x i32> %lhs_shift, %rhs_mul
  ret <4 x i32> %out
}

define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind {
; CHECK-LABEL: vrolq_v4i64_extract_shrl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $5, %ymm0, %ymm0
; CHECK-NEXT:    vprolq $29, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_div = lshr <4 x i64> %i, <i64 40, i64 40, i64 40, i64 40>
  %rhs_div = lshr <4 x i64> %i, <i64 5, i64 5, i64 5, i64 5>
  %rhs_shift = shl <4 x i64> %rhs_div, <i64 29, i64 29, i64 29, i64 29>
  %out = or <4 x i64> %lhs_div, %rhs_shift
  ret <4 x i64> %out
}

define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind {
; X86-LABEL: vroll_extract_mul:
; X86:       # %bb.0:
; X86-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
; X86-NEXT:    vprold $6, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: vroll_extract_mul:
; X64:       # %bb.0:
; X64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; X64-NEXT:    vprold $6, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs_mul = mul <8 x i32> %i, <i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640>
  %rhs_mul = mul <8 x i32> %i, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26>
  %out = or <8 x i32> %lhs_mul, %rhs_shift
  ret <8 x i32> %out
}

define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: vrolq_extract_udiv:
; X86:       # %bb.0:
; X86-NEXT:    subl $32, %esp
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT:    vprolq $57, %xmm0, %xmm0
; X86-NEXT:    addl $32, %esp
; X86-NEXT:    retl
;
; X64-LABEL: vrolq_extract_udiv:
; X64:       # %bb.0:
; X64-NEXT:    vpextrq $1, %xmm0, %rax
; X64-NEXT:    movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT:    mulq %rcx
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    mulq %rcx
; X64-NEXT:    vmovq %rdx, %xmm0
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    vpsrlq $1, %xmm0, %xmm0
; X64-NEXT:    vprolq $57, %xmm0, %xmm0
; X64-NEXT:    retq
  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
  %rhs_div = udiv <2 x i64> %i, <i64 384, i64 384>
  %lhs_shift = shl <2 x i64> %lhs_div, <i64 57, i64 57>
  %out = or <2 x i64> %lhs_shift, %rhs_div
  ret <2 x i64> %out
}
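
; Note on the X64 checks above (editorial, not part of the autogenerated
; assertions): 384 = 3 * 2^7, so i/384 == (i/3) >> 7, and
; ((i/3) << 57) | ((i/3) >> 7) is rotl(i/3, 57) since 57 + 7 == 64. The
; remaining udiv by 3 is lowered to a multiply by the magic constant
; 0xAAAAAAAAAAAAAAAB (roughly 2^65/3) plus a right shift of the high half,
; which is why no division instruction is expected in the output.
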
define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind {
; X86-LABEL: vrolw_extract_mul_with_mask:
; X86:       # %bb.0:
; X86-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
; X86-NEXT:    vprold $7, %xmm0, %xmm0
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: vrolw_extract_mul_with_mask:
; X64:       # %bb.0:
; X64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-NEXT:    vprold $7, %xmm0, %xmm0
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT:    retq
  %lhs_mul = mul <4 x i32> %i, <i32 1152, i32 1152, i32 1152, i32 1152>
  %rhs_mul = mul <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
  %lhs_and = and <4 x i32> %lhs_mul, <i32 160, i32 160, i32 160, i32 160>
  %rhs_shift = lshr <4 x i32> %rhs_mul, <i32 25, i32 25, i32 25, i32 25>
  %out = or <4 x i32> %lhs_and, %rhs_shift
  ret <4 x i32> %out
}

define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind {
; X86-LABEL: illegal_no_extract_mul:
; X86:       # %bb.0:
; X86-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; X86-NEXT:    vpsrlw $10, %zmm0, %zmm1
; X86-NEXT:    vpsllw $6, %zmm0, %zmm0
; X86-NEXT:    vporq %zmm1, %zmm0, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: illegal_no_extract_mul:
; X64:       # %bb.0:
; X64-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; X64-NEXT:    vpsrlw $10, %zmm0, %zmm1
; X64-NEXT:    vpsllw $6, %zmm0, %zmm0
; X64-NEXT:    vporq %zmm1, %zmm0, %zmm0
; X64-NEXT:    retq
  %lhs_mul = mul <32 x i16> %i, <i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640>
  %rhs_mul = mul <32 x i16> %i, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %rhs_shift = lshr <32 x i16> %rhs_mul, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %out = or <32 x i16> %lhs_mul, %rhs_shift
  ret <32 x i16> %out
}

; Result would undershift
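; (With the constants below: lhs = (i << 11) >> 50 and rhs = (i << 11) << 13.
; A rotate of (i << 11) by 13 would need a right shift of 64 - 13 = 51, one
; more than the 50 actually present, so no plain rotate can be formed.)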
define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
; X86-LABEL: no_extract_shl:
; X86:       # %bb.0:
; X86-NEXT:    vpsllq $24, %ymm0, %ymm1
; X86-NEXT:    vpsrlq $39, %ymm0, %ymm0
; X86-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: no_extract_shl:
; X64:       # %bb.0:
; X64-NEXT:    vpsllq $24, %ymm0, %ymm1
; X64-NEXT:    vpsrlq $39, %ymm0, %ymm0
; X64-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
; X64-NEXT:    retq
  %lhs_mul = shl <4 x i64> %i, <i64 11, i64 11, i64 11, i64 11>
  %rhs_mul = shl <4 x i64> %i, <i64 24, i64 24, i64 24, i64 24>
  %lhs_shift = lshr <4 x i64> %lhs_mul, <i64 50, i64 50, i64 50, i64 50>
  %out = or <4 x i64> %lhs_shift, %rhs_mul
  ret <4 x i64> %out
}

; Result would overshift
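; (With the constants below: lhs = (i >> 3) << 28 and rhs = (i >> 3) >> 6,
; so 28 + 6 = 34 overshoots the 32-bit element width; a rotate by 28 would
; need a right shift of 4 instead.)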
define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind {
; X86-LABEL: no_extract_shrl:
; X86:       # %bb.0:
; X86-NEXT:    vpsrld $9, %xmm0, %xmm1
; X86-NEXT:    vpslld $25, %xmm0, %xmm0
; X86-NEXT:    vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: no_extract_shrl:
; X64:       # %bb.0:
; X64-NEXT:    vpsrld $9, %xmm0, %xmm1
; X64-NEXT:    vpslld $25, %xmm0, %xmm0
; X64-NEXT:    vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
; X64-NEXT:    retq
  %lhs_div = lshr <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
  %rhs_div = lshr <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
  %lhs_shift = shl <4 x i32> %lhs_div, <i32 28, i32 28, i32 28, i32 28>
  %out = or <4 x i32> %lhs_shift, %rhs_div
  ret <4 x i32> %out
}

; Can factor 512 from 1536, but result is 3 instead of 9
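; (1536 = 3 * 512, so lhs can be written as (i * 3) << 9; a rotate would need
; the rhs to come from the same product, i.e. (i * 3) >> 23, but it is
; (i * 9) >> 23, so the multiplicands do not match.)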
define <8 x i32> @no_extract_mul(<8 x i32> %i) nounwind {
; X86-LABEL: no_extract_mul:
; X86:       # %bb.0:
; X86-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm1
; X86-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
; X86-NEXT:    vpsrld $23, %ymm0, %ymm0
; X86-NEXT:    vpor %ymm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: no_extract_mul:
; X64:       # %bb.0:
; X64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
; X64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; X64-NEXT:    vpsrld $23, %ymm0, %ymm0
; X64-NEXT:    vpor %ymm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %lhs_mul = mul <8 x i32> %i, <i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536>
  %rhs_mul = mul <8 x i32> %i, <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %out = or <8 x i32> %lhs_mul, %rhs_shift
  ret <8 x i32> %out
}

; Can't evenly factor 256 from 770
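; (A rotate would need i/770 == (i/3) >> 8, i.e. 770 == 3 * 256 == 768, which
; does not hold, so both udivs survive: X86 calls __udivdi3 and X64 uses two
; different magic-number multiplies.)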
define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: no_extract_udiv:
; X86:       # %bb.0:
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $770, {{[0-9]+}}(%esp) # imm = 0x302
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $770, {{[0-9]+}}(%esp) # imm = 0x302
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT:    vpsllq $56, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    addl $48, %esp
; X86-NEXT:    retl
;
; X64-LABEL: no_extract_udiv:
; X64:       # %bb.0:
; X64-NEXT:    vpextrq $1, %xmm0, %rcx
; X64-NEXT:    movabsq $-6148914691236517205, %rdi # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT:    movq %rcx, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %xmm0, %rsi
; X64-NEXT:    movq %rsi, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    vmovq %rdx, %xmm0
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    vpsrlq $1, %xmm0, %xmm0
; X64-NEXT:    movabsq $-6180857105216966645, %rdi # imm = 0xAA392F35DC17F00B
; X64-NEXT:    movq %rcx, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    movq %rsi, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    vmovq %rdx, %xmm2
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT:    vpsrlq $9, %xmm1, %xmm1
; X64-NEXT:    vpsllq $56, %xmm0, %xmm0
; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
  %rhs_div = udiv <2 x i64> %i, <i64 770, i64 770>
  %lhs_shift = shl <2 x i64> %lhs_div, <i64 56, i64 56>
  %out = or <2 x i64> %lhs_shift, %rhs_div
  ret <2 x i64> %out
}

; DAGCombiner transforms shl X, 1 into add X, X.
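; (So a rotate-by-1 reaches isel as (X + X) | (X >> 31); the first two tests
; below check that this is still matched as vprold $1, while no_extract_add_1
; shifts right by 27, and 1 + 27 != 32, so no rotate can be formed there.)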
define <4 x i32> @extract_add_1(<4 x i32> %i) nounwind {
; CHECK-LABEL: extract_add_1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vprold $1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ii = add <4 x i32> %i, %i
  %rhs = lshr <4 x i32> %i, <i32 31, i32 31, i32 31, i32 31>
  %out = or <4 x i32> %ii, %rhs
  ret <4 x i32> %out
}

define <4 x i32> @extract_add_1_comut(<4 x i32> %i) nounwind {
; CHECK-LABEL: extract_add_1_comut:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vprold $1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ii = add <4 x i32> %i, %i
  %lhs = lshr <4 x i32> %i, <i32 31, i32 31, i32 31, i32 31>
  %out = or <4 x i32> %lhs, %ii
  ret <4 x i32> %out
}

define <4 x i32> @no_extract_add_1(<4 x i32> %i) nounwind {
; CHECK-LABEL: no_extract_add_1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm1
; CHECK-NEXT:    vpsrld $27, %xmm0, %xmm0
; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ii = add <4 x i32> %i, %i
  %rhs = lshr <4 x i32> %i, <i32 27, i32 27, i32 27, i32 27>
  %out = or <4 x i32> %ii, %rhs
  ret <4 x i32> %out
}