; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64

; Check that under certain conditions we can factor out a rotate
; from the following idioms:
;   (a*c0) >> s1 | (a*c1)
;   (a/c0) << s1 | (a/c1)
; This targets cases where instcombine has folded a shl/srl/mul/udiv
; with one of the shifts from the rotate idiom.
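; For example, in vroll_v4i32_extract_shl below (a sketch of the expected
; combine): (i << 10) == ((i << 3) << 7) and 25 + 7 == 32, so
;   ((i << 3) >> 25) | (i << 10)  ==  rotl(i << 3, 7)
; and a single shift plus a rotate should be emitted instead of two shifts
; and an or.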
define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) {
; CHECK-LABEL: vroll_v4i32_extract_shl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $3, %xmm0, %xmm0
; CHECK-NEXT:    vprold $7, %zmm0, %zmm0
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = shl <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
  %rhs_mul = shl <4 x i32> %i, <i32 10, i32 10, i32 10, i32 10>
  %lhs_shift = lshr <4 x i32> %lhs_mul, <i32 25, i32 25, i32 25, i32 25>
  %out = or <4 x i32> %lhs_shift, %rhs_mul
  ret <4 x i32> %out
}
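; (i >> 40) == ((i >> 5) >> 35) and 35 + 29 == 64, so this should fold to a
; shift right by 5 followed by a rotate left by 29.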
define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind {
; CHECK-LABEL: vrolq_v4i64_extract_shrl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $5, %ymm0, %ymm0
; CHECK-NEXT:    vprolq $29, %zmm0, %zmm0
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_div = lshr <4 x i64> %i, <i64 40, i64 40, i64 40, i64 40>
  %rhs_div = lshr <4 x i64> %i, <i64 5, i64 5, i64 5, i64 5>
  %rhs_shift = shl <4 x i64> %rhs_div, <i64 29, i64 29, i64 29, i64 29>
  %out = or <4 x i64> %lhs_div, %rhs_shift
  ret <4 x i64> %out
}
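; 640 == 10 << 6 and 26 + 6 == 32, so this should fold to a multiply by 10
; followed by a rotate left by 6.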
define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind {
; CHECK-LABEL: vroll_extract_mul:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10]
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vprold $6, %zmm0, %zmm0
; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = mul <8 x i32> %i, <i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640>
  %rhs_mul = mul <8 x i32> %i, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26>
  %out = or <8 x i32> %lhs_mul, %rhs_shift
  ret <8 x i32> %out
}
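; 384 == 3 << 7 and 57 + 7 == 64, so this should fold to a divide by 3
; followed by a rotate left by 57 (the division itself is expanded by the
; backend: a __udivdi3 libcall on i686, a magic-constant multiply on x86-64).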
define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: vrolq_extract_udiv:
; X86:       # %bb.0:
; X86-NEXT:    subl $44, %esp
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT:    vprolq $57, %zmm0, %zmm0
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; X86-NEXT:    addl $44, %esp
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: vrolq_extract_udiv:
; X64:       # %bb.0:
; X64-NEXT:    vpextrq $1, %xmm0, %rax
; X64-NEXT:    movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT:    mulq %rcx
; X64-NEXT:    shrq %rdx
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    mulq %rcx
; X64-NEXT:    shrq %rdx
; X64-NEXT:    vmovq %rdx, %xmm0
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    vprolq $57, %zmm0, %zmm0
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
  %rhs_div = udiv <2 x i64> %i, <i64 384, i64 384>
  %lhs_shift = shl <2 x i64> %lhs_div, <i64 57, i64 57>
  %out = or <2 x i64> %lhs_shift, %rhs_div
  ret <2 x i64> %out
}
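; 1152 == 9 << 7 and 25 + 7 == 32, so the rotate should still be formed, with
; the AND mask applied to the rotated result afterwards.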
define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind {
; X86-LABEL: vrolw_extract_mul_with_mask:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
; X86-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X86-NEXT:    vprold $7, %zmm0, %zmm0
; X86-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: vrolw_extract_mul_with_mask:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
; X64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-NEXT:    vprold $7, %zmm0, %zmm0
; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lhs_mul = mul <4 x i32> %i, <i32 1152, i32 1152, i32 1152, i32 1152>
  %rhs_mul = mul <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
  %lhs_and = and <4 x i32> %lhs_mul, <i32 160, i32 160, i32 160, i32 160>
  %rhs_shift = lshr <4 x i32> %rhs_mul, <i32 25, i32 25, i32 25, i32 25>
  %out = or <4 x i32> %lhs_and, %rhs_shift
  ret <4 x i32> %out
}
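; 640 == 10 << 6 and 10 + 6 == 16, but a v32i16 rotate is not a legal
; operation here (there is no vprolw), so the idiom is expected to stay as
; two multiplies, a shift and an or.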
define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind {
; X86-LABEL: illegal_no_extract_mul:
; X86:       # %bb.0:
; X86-NEXT:    vpmullw {{\.LCPI.*}}, %zmm0, %zmm1
; X86-NEXT:    vpmullw {{\.LCPI.*}}, %zmm0, %zmm0
; X86-NEXT:    vpsrlw $10, %zmm0, %zmm0
; X86-NEXT:    vporq %zmm0, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: illegal_no_extract_mul:
; X64:       # %bb.0:
; X64-NEXT:    vpmullw {{.*}}(%rip), %zmm0, %zmm1
; X64-NEXT:    vpmullw {{.*}}(%rip), %zmm0, %zmm0
; X64-NEXT:    vpsrlw $10, %zmm0, %zmm0
; X64-NEXT:    vporq %zmm0, %zmm1, %zmm0
; X64-NEXT:    retq
  %lhs_mul = mul <32 x i16> %i, <i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640>
  %rhs_mul = mul <32 x i16> %i, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %rhs_shift = lshr <32 x i16> %rhs_mul, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %out = or <32 x i16> %lhs_mul, %rhs_shift
  ret <32 x i16> %out
}
; Result would undershift
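; (i << 24) == ((i << 11) << 13) but 50 + 13 == 63 < 64, so this is not a rotate.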
define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
; CHECK-LABEL: no_extract_shl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq $11, %ymm0, %ymm1
; CHECK-NEXT:    vpsllq $24, %ymm0, %ymm0
; CHECK-NEXT:    vpsrlq $50, %ymm1, %ymm1
; CHECK-NEXT:    vpor %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = shl <4 x i64> %i, <i64 11, i64 11, i64 11, i64 11>
  %rhs_mul = shl <4 x i64> %i, <i64 24, i64 24, i64 24, i64 24>
  %lhs_shift = lshr <4 x i64> %lhs_mul, <i64 50, i64 50, i64 50, i64 50>
  %out = or <4 x i64> %lhs_shift, %rhs_mul
  ret <4 x i64> %out
}
; Result would overshift
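; (i >> 9) == ((i >> 3) >> 6) but 28 + 6 == 34 > 32, so this is not a rotate.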
define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind {
; CHECK-LABEL: no_extract_shrl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4026531840,4026531840,4026531840,4026531840]
; CHECK-NEXT:    vpslld $25, %xmm0, %xmm2
; CHECK-NEXT:    vpand %xmm1, %xmm2, %xmm1
; CHECK-NEXT:    vpsrld $9, %xmm0, %xmm0
; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_div = lshr <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
  %rhs_div = lshr <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
  %lhs_shift = shl <4 x i32> %lhs_div, <i32 28, i32 28, i32 28, i32 28>
  %out = or <4 x i32> %lhs_shift, %rhs_div
  ret <4 x i32> %out
}
; Can factor 512 from 1536, but result is 3 instead of 9
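; 1536 == 3 << 9, not 9 << k, so the two multiplies do not share the factor
; needed to form a rotate of (i * 9).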
define <8 x i32> @no_extract_mul(<8 x i32> %i) nounwind {
; CHECK-LABEL: no_extract_mul:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1536,1536,1536,1536,1536,1536,1536,1536]
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9]
; CHECK-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpsrld $23, %ymm0, %ymm0
; CHECK-NEXT:    vpor %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = mul <8 x i32> %i, <i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536>
  %rhs_mul = mul <8 x i32> %i, <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %out = or <8 x i32> %lhs_mul, %rhs_shift
  ret <8 x i32> %out
}
; Can't evenly factor 256 from 770
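; 3 << 8 == 768 != 770, so (i / 770) cannot be rewritten as ((i / 3) >> 8).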
define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: no_extract_udiv:
; X86:       # %bb.0:
; X86-NEXT:    subl $60, %esp
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $770, {{[0-9]+}}(%esp) # imm = 0x302
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $770, {{[0-9]+}}(%esp) # imm = 0x302
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpsllq $56, %xmm1, %xmm1
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    addl $60, %esp
; X86-NEXT:    retl
;
; X64-LABEL: no_extract_udiv:
; X64:       # %bb.0:
; X64-NEXT:    vpextrq $1, %xmm0, %rcx
; X64-NEXT:    movabsq $-6148914691236517205, %rdi # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT:    movq %rcx, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    shrq %rdx
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %xmm0, %rsi
; X64-NEXT:    movq %rsi, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    shrq %rdx
; X64-NEXT:    vmovq %rdx, %xmm0
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    movabsq $-6180857105216966645, %rdi # imm = 0xAA392F35DC17F00B
; X64-NEXT:    movq %rcx, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    shrq $9, %rdx
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    movq %rsi, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    shrq $9, %rdx
; X64-NEXT:    vmovq %rdx, %xmm2
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT:    vpsllq $56, %xmm0, %xmm0
; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
  %rhs_div = udiv <2 x i64> %i, <i64 770, i64 770>
  %lhs_shift = shl <2 x i64> %lhs_div, <i64 56, i64 56>
  %out = or <2 x i64> %lhs_shift, %rhs_div
  ret <2 x i64> %out
}