1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 < %s | FileCheck %s
4 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
5 target triple = "x86_64-unknown-unknown"
7 ; Stack reload folding tests.
9 ; By including a nop call with sideeffects we can force a partial register spill of the
10 ; relevant registers and check that the reload is correctly folded into the instruction.
; Stack-folding test for the 32-bit BZHI intrinsic (@llvm.x86.bmi.bzhi.32).
; The assertions below expect %esi and %edi to be spilled to the stack, one
; value to be reloaded into %eax, and the other to be read directly from its
; stack slot by bzhil (the line tagged "4-byte Folded Reload").
12 define i32 @stack_fold_bzhi_u32(i32 %a0, i32 %a1) {
13 ; CHECK-LABEL: stack_fold_bzhi_u32:
15 ; CHECK-NEXT: pushq %rbp
16 ; CHECK-NEXT: .cfi_def_cfa_offset 16
17 ; CHECK-NEXT: pushq %r15
18 ; CHECK-NEXT: .cfi_def_cfa_offset 24
19 ; CHECK-NEXT: pushq %r14
20 ; CHECK-NEXT: .cfi_def_cfa_offset 32
21 ; CHECK-NEXT: pushq %r13
22 ; CHECK-NEXT: .cfi_def_cfa_offset 40
23 ; CHECK-NEXT: pushq %r12
24 ; CHECK-NEXT: .cfi_def_cfa_offset 48
25 ; CHECK-NEXT: pushq %rbx
26 ; CHECK-NEXT: .cfi_def_cfa_offset 56
27 ; CHECK-NEXT: .cfi_offset %rbx, -56
28 ; CHECK-NEXT: .cfi_offset %r12, -48
29 ; CHECK-NEXT: .cfi_offset %r13, -40
30 ; CHECK-NEXT: .cfi_offset %r14, -32
31 ; CHECK-NEXT: .cfi_offset %r15, -24
32 ; CHECK-NEXT: .cfi_offset %rbp, -16
33 ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
34 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
38 ; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
39 ; CHECK-NEXT: bzhil %eax, {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
40 ; CHECK-NEXT: popq %rbx
41 ; CHECK-NEXT: .cfi_def_cfa_offset 48
42 ; CHECK-NEXT: popq %r12
43 ; CHECK-NEXT: .cfi_def_cfa_offset 40
44 ; CHECK-NEXT: popq %r13
45 ; CHECK-NEXT: .cfi_def_cfa_offset 32
46 ; CHECK-NEXT: popq %r14
47 ; CHECK-NEXT: .cfi_def_cfa_offset 24
48 ; CHECK-NEXT: popq %r15
49 ; CHECK-NEXT: .cfi_def_cfa_offset 16
50 ; CHECK-NEXT: popq %rbp
51 ; CHECK-NEXT: .cfi_def_cfa_offset 8
; The side-effecting "nop" asm lists fifteen general-purpose registers as
; clobbered; per the file header this is what forces the spill of %a0 and %a1
; ahead of the intrinsic call.
53 %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
; Operation under test: its operands are the two values spilled above.
54 %2 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a0, i32 %a1)
57 declare i32 @llvm.x86.bmi.bzhi.32(i32, i32)
; Stack-folding test for the 64-bit BZHI intrinsic (@llvm.x86.bmi.bzhi.64).
; The assertions below expect %rsi and %rdi to be spilled to the stack, one
; value to be reloaded into %rax, and the other to be read directly from its
; stack slot by bzhiq (the line tagged "8-byte Folded Reload").
59 define i64 @stack_fold_bzhi_u64(i64 %a0, i64 %a1) {
60 ; CHECK-LABEL: stack_fold_bzhi_u64:
62 ; CHECK-NEXT: pushq %rbp
63 ; CHECK-NEXT: .cfi_def_cfa_offset 16
64 ; CHECK-NEXT: pushq %r15
65 ; CHECK-NEXT: .cfi_def_cfa_offset 24
66 ; CHECK-NEXT: pushq %r14
67 ; CHECK-NEXT: .cfi_def_cfa_offset 32
68 ; CHECK-NEXT: pushq %r13
69 ; CHECK-NEXT: .cfi_def_cfa_offset 40
70 ; CHECK-NEXT: pushq %r12
71 ; CHECK-NEXT: .cfi_def_cfa_offset 48
72 ; CHECK-NEXT: pushq %rbx
73 ; CHECK-NEXT: .cfi_def_cfa_offset 56
74 ; CHECK-NEXT: .cfi_offset %rbx, -56
75 ; CHECK-NEXT: .cfi_offset %r12, -48
76 ; CHECK-NEXT: .cfi_offset %r13, -40
77 ; CHECK-NEXT: .cfi_offset %r14, -32
78 ; CHECK-NEXT: .cfi_offset %r15, -24
79 ; CHECK-NEXT: .cfi_offset %rbp, -16
80 ; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
81 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
85 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
86 ; CHECK-NEXT: bzhiq %rax, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
87 ; CHECK-NEXT: popq %rbx
88 ; CHECK-NEXT: .cfi_def_cfa_offset 48
89 ; CHECK-NEXT: popq %r12
90 ; CHECK-NEXT: .cfi_def_cfa_offset 40
91 ; CHECK-NEXT: popq %r13
92 ; CHECK-NEXT: .cfi_def_cfa_offset 32
93 ; CHECK-NEXT: popq %r14
94 ; CHECK-NEXT: .cfi_def_cfa_offset 24
95 ; CHECK-NEXT: popq %r15
96 ; CHECK-NEXT: .cfi_def_cfa_offset 16
97 ; CHECK-NEXT: popq %rbp
98 ; CHECK-NEXT: .cfi_def_cfa_offset 8
; The side-effecting "nop" asm lists fifteen general-purpose registers as
; clobbered; per the file header this is what forces the spill of %a0 and %a1
; ahead of the intrinsic call.
100 %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
; Operation under test: its operands are the two values spilled above.
101 %2 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a0, i64 %a1)
104 declare i64 @llvm.x86.bmi.bzhi.64(i64, i64)
; Stack-folding test for the 32-bit PDEP intrinsic (@llvm.x86.bmi.pdep.32).
; The assertions below expect %esi and %edi to be spilled to the stack, one
; value to be reloaded into %eax, and the other to be read directly from its
; stack slot by pdepl (the line tagged "4-byte Folded Reload").
106 define i32 @stack_fold_pdep_u32(i32 %a0, i32 %a1) {
107 ; CHECK-LABEL: stack_fold_pdep_u32:
109 ; CHECK-NEXT: pushq %rbp
110 ; CHECK-NEXT: .cfi_def_cfa_offset 16
111 ; CHECK-NEXT: pushq %r15
112 ; CHECK-NEXT: .cfi_def_cfa_offset 24
113 ; CHECK-NEXT: pushq %r14
114 ; CHECK-NEXT: .cfi_def_cfa_offset 32
115 ; CHECK-NEXT: pushq %r13
116 ; CHECK-NEXT: .cfi_def_cfa_offset 40
117 ; CHECK-NEXT: pushq %r12
118 ; CHECK-NEXT: .cfi_def_cfa_offset 48
119 ; CHECK-NEXT: pushq %rbx
120 ; CHECK-NEXT: .cfi_def_cfa_offset 56
121 ; CHECK-NEXT: .cfi_offset %rbx, -56
122 ; CHECK-NEXT: .cfi_offset %r12, -48
123 ; CHECK-NEXT: .cfi_offset %r13, -40
124 ; CHECK-NEXT: .cfi_offset %r14, -32
125 ; CHECK-NEXT: .cfi_offset %r15, -24
126 ; CHECK-NEXT: .cfi_offset %rbp, -16
127 ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
128 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
131 ; CHECK-NEXT: #NO_APP
132 ; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
133 ; CHECK-NEXT: pdepl {{[-0-9]+}}(%r{{[sb]}}p), %eax, %eax # 4-byte Folded Reload
134 ; CHECK-NEXT: popq %rbx
135 ; CHECK-NEXT: .cfi_def_cfa_offset 48
136 ; CHECK-NEXT: popq %r12
137 ; CHECK-NEXT: .cfi_def_cfa_offset 40
138 ; CHECK-NEXT: popq %r13
139 ; CHECK-NEXT: .cfi_def_cfa_offset 32
140 ; CHECK-NEXT: popq %r14
141 ; CHECK-NEXT: .cfi_def_cfa_offset 24
142 ; CHECK-NEXT: popq %r15
143 ; CHECK-NEXT: .cfi_def_cfa_offset 16
144 ; CHECK-NEXT: popq %rbp
145 ; CHECK-NEXT: .cfi_def_cfa_offset 8
; The side-effecting "nop" asm lists fifteen general-purpose registers as
; clobbered; per the file header this is what forces the spill of %a0 and %a1
; ahead of the intrinsic call.
147 %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
; Operation under test: its operands are the two values spilled above.
148 %2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %a1)
151 declare i32 @llvm.x86.bmi.pdep.32(i32, i32)
; Stack-folding test for the 64-bit PDEP intrinsic (@llvm.x86.bmi.pdep.64).
; The assertions below expect %rsi and %rdi to be spilled to the stack, one
; value to be reloaded into %rax, and the other to be read directly from its
; stack slot by pdepq (the line tagged "8-byte Folded Reload").
153 define i64 @stack_fold_pdep_u64(i64 %a0, i64 %a1) {
154 ; CHECK-LABEL: stack_fold_pdep_u64:
156 ; CHECK-NEXT: pushq %rbp
157 ; CHECK-NEXT: .cfi_def_cfa_offset 16
158 ; CHECK-NEXT: pushq %r15
159 ; CHECK-NEXT: .cfi_def_cfa_offset 24
160 ; CHECK-NEXT: pushq %r14
161 ; CHECK-NEXT: .cfi_def_cfa_offset 32
162 ; CHECK-NEXT: pushq %r13
163 ; CHECK-NEXT: .cfi_def_cfa_offset 40
164 ; CHECK-NEXT: pushq %r12
165 ; CHECK-NEXT: .cfi_def_cfa_offset 48
166 ; CHECK-NEXT: pushq %rbx
167 ; CHECK-NEXT: .cfi_def_cfa_offset 56
168 ; CHECK-NEXT: .cfi_offset %rbx, -56
169 ; CHECK-NEXT: .cfi_offset %r12, -48
170 ; CHECK-NEXT: .cfi_offset %r13, -40
171 ; CHECK-NEXT: .cfi_offset %r14, -32
172 ; CHECK-NEXT: .cfi_offset %r15, -24
173 ; CHECK-NEXT: .cfi_offset %rbp, -16
174 ; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
175 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
178 ; CHECK-NEXT: #NO_APP
179 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
180 ; CHECK-NEXT: pdepq {{[-0-9]+}}(%r{{[sb]}}p), %rax, %rax # 8-byte Folded Reload
181 ; CHECK-NEXT: popq %rbx
182 ; CHECK-NEXT: .cfi_def_cfa_offset 48
183 ; CHECK-NEXT: popq %r12
184 ; CHECK-NEXT: .cfi_def_cfa_offset 40
185 ; CHECK-NEXT: popq %r13
186 ; CHECK-NEXT: .cfi_def_cfa_offset 32
187 ; CHECK-NEXT: popq %r14
188 ; CHECK-NEXT: .cfi_def_cfa_offset 24
189 ; CHECK-NEXT: popq %r15
190 ; CHECK-NEXT: .cfi_def_cfa_offset 16
191 ; CHECK-NEXT: popq %rbp
192 ; CHECK-NEXT: .cfi_def_cfa_offset 8
; The side-effecting "nop" asm lists fifteen general-purpose registers as
; clobbered; per the file header this is what forces the spill of %a0 and %a1
; ahead of the intrinsic call.
194 %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
; Operation under test: its operands are the two values spilled above.
195 %2 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %a1)
198 declare i64 @llvm.x86.bmi.pdep.64(i64, i64)
; Stack-folding test for the 32-bit PEXT intrinsic (@llvm.x86.bmi.pext.32).
; The assertions below expect %esi and %edi to be spilled to the stack, one
; value to be reloaded into %eax, and the other to be read directly from its
; stack slot by pextl (the line tagged "4-byte Folded Reload").
200 define i32 @stack_fold_pext_u32(i32 %a0, i32 %a1) {
201 ; CHECK-LABEL: stack_fold_pext_u32:
203 ; CHECK-NEXT: pushq %rbp
204 ; CHECK-NEXT: .cfi_def_cfa_offset 16
205 ; CHECK-NEXT: pushq %r15
206 ; CHECK-NEXT: .cfi_def_cfa_offset 24
207 ; CHECK-NEXT: pushq %r14
208 ; CHECK-NEXT: .cfi_def_cfa_offset 32
209 ; CHECK-NEXT: pushq %r13
210 ; CHECK-NEXT: .cfi_def_cfa_offset 40
211 ; CHECK-NEXT: pushq %r12
212 ; CHECK-NEXT: .cfi_def_cfa_offset 48
213 ; CHECK-NEXT: pushq %rbx
214 ; CHECK-NEXT: .cfi_def_cfa_offset 56
215 ; CHECK-NEXT: .cfi_offset %rbx, -56
216 ; CHECK-NEXT: .cfi_offset %r12, -48
217 ; CHECK-NEXT: .cfi_offset %r13, -40
218 ; CHECK-NEXT: .cfi_offset %r14, -32
219 ; CHECK-NEXT: .cfi_offset %r15, -24
220 ; CHECK-NEXT: .cfi_offset %rbp, -16
221 ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
222 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
225 ; CHECK-NEXT: #NO_APP
226 ; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
227 ; CHECK-NEXT: pextl {{[-0-9]+}}(%r{{[sb]}}p), %eax, %eax # 4-byte Folded Reload
228 ; CHECK-NEXT: popq %rbx
229 ; CHECK-NEXT: .cfi_def_cfa_offset 48
230 ; CHECK-NEXT: popq %r12
231 ; CHECK-NEXT: .cfi_def_cfa_offset 40
232 ; CHECK-NEXT: popq %r13
233 ; CHECK-NEXT: .cfi_def_cfa_offset 32
234 ; CHECK-NEXT: popq %r14
235 ; CHECK-NEXT: .cfi_def_cfa_offset 24
236 ; CHECK-NEXT: popq %r15
237 ; CHECK-NEXT: .cfi_def_cfa_offset 16
238 ; CHECK-NEXT: popq %rbp
239 ; CHECK-NEXT: .cfi_def_cfa_offset 8
; The side-effecting "nop" asm lists fifteen general-purpose registers as
; clobbered; per the file header this is what forces the spill of %a0 and %a1
; ahead of the intrinsic call.
241 %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
; Operation under test: its operands are the two values spilled above.
242 %2 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %a1)
245 declare i32 @llvm.x86.bmi.pext.32(i32, i32)
; Stack-folding test for the 64-bit PEXT intrinsic (@llvm.x86.bmi.pext.64).
; The assertions below expect %rsi and %rdi to be spilled to the stack, one
; value to be reloaded into %rax, and the other to be read directly from its
; stack slot by pextq (the line tagged "8-byte Folded Reload").
247 define i64 @stack_fold_pext_u64(i64 %a0, i64 %a1) {
248 ; CHECK-LABEL: stack_fold_pext_u64:
250 ; CHECK-NEXT: pushq %rbp
251 ; CHECK-NEXT: .cfi_def_cfa_offset 16
252 ; CHECK-NEXT: pushq %r15
253 ; CHECK-NEXT: .cfi_def_cfa_offset 24
254 ; CHECK-NEXT: pushq %r14
255 ; CHECK-NEXT: .cfi_def_cfa_offset 32
256 ; CHECK-NEXT: pushq %r13
257 ; CHECK-NEXT: .cfi_def_cfa_offset 40
258 ; CHECK-NEXT: pushq %r12
259 ; CHECK-NEXT: .cfi_def_cfa_offset 48
260 ; CHECK-NEXT: pushq %rbx
261 ; CHECK-NEXT: .cfi_def_cfa_offset 56
262 ; CHECK-NEXT: .cfi_offset %rbx, -56
263 ; CHECK-NEXT: .cfi_offset %r12, -48
264 ; CHECK-NEXT: .cfi_offset %r13, -40
265 ; CHECK-NEXT: .cfi_offset %r14, -32
266 ; CHECK-NEXT: .cfi_offset %r15, -24
267 ; CHECK-NEXT: .cfi_offset %rbp, -16
268 ; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
269 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
272 ; CHECK-NEXT: #NO_APP
273 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
274 ; CHECK-NEXT: pextq {{[-0-9]+}}(%r{{[sb]}}p), %rax, %rax # 8-byte Folded Reload
275 ; CHECK-NEXT: popq %rbx
276 ; CHECK-NEXT: .cfi_def_cfa_offset 48
277 ; CHECK-NEXT: popq %r12
278 ; CHECK-NEXT: .cfi_def_cfa_offset 40
279 ; CHECK-NEXT: popq %r13
280 ; CHECK-NEXT: .cfi_def_cfa_offset 32
281 ; CHECK-NEXT: popq %r14
282 ; CHECK-NEXT: .cfi_def_cfa_offset 24
283 ; CHECK-NEXT: popq %r15
284 ; CHECK-NEXT: .cfi_def_cfa_offset 16
285 ; CHECK-NEXT: popq %rbp
286 ; CHECK-NEXT: .cfi_def_cfa_offset 8
; The side-effecting "nop" asm lists fifteen general-purpose registers as
; clobbered; per the file header this is what forces the spill of %a0 and %a1
; ahead of the intrinsic call.
288 %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
; Operation under test: its operands are the two values spilled above.
289 %2 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %a1)
292 declare i64 @llvm.x86.bmi.pext.64(i64, i64)