; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s

; FIXME: We should not generate ld/st for such register spills/fills; this test
; case is very simple and the register pressure is not high. If the spill/fill
; algorithm is improved, this test may no longer be triggered and can then be
; deleted.
define i32 @spill.DPairReg(ptr %arg1, i32 %arg2) {
; CHECK-LABEL: spill.DPairReg:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld2 { v0.2s, v1.2s }, [x0]
; CHECK-NEXT:    cbz w1, .LBB0_2
; CHECK-NEXT:  // %bb.1: // %if.end
; CHECK-NEXT:    mov w0, v0.s[1]
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB0_2: // %if.then
; CHECK-NEXT:    sub sp, sp, #48
; CHECK-NEXT:    stp x29, x30, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    .cfi_offset w30, -8
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    st1 { v0.2d, v1.2d }, [x8] // 32-byte Folded Spill
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ld1 { v0.2d, v1.2d }, [x8] // 32-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #48
; CHECK-NEXT:    mov w0, v0.s[1]
; CHECK-NEXT:    ret
entry:
  %vld = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr %arg1)
  %cmp = icmp eq i32 %arg2, 0
  br i1 %cmp, label %if.then, label %if.end

if.then:
  tail call void @foo()
  br label %if.end

if.end:
  %vld.extract = extractvalue { <2 x i32>, <2 x i32> } %vld, 0
  %res = extractelement <2 x i32> %vld.extract, i32 1
  ret i32 %res
}

define i16 @spill.DTripleReg(ptr %arg1, i32 %arg2) {
; CHECK-LABEL: spill.DTripleReg:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld3 { v0.4h, v1.4h, v2.4h }, [x0]
; CHECK-NEXT:    cbz w1, .LBB1_2
; CHECK-NEXT:  // %bb.1: // %if.end
; CHECK-NEXT:    umov w0, v0.h[1]
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB1_2: // %if.then
; CHECK-NEXT:    sub sp, sp, #64
; CHECK-NEXT:    stp x29, x30, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 64
; CHECK-NEXT:    .cfi_offset w30, -8
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    st1 { v0.2d, v1.2d, v2.2d }, [x8] // 48-byte Folded Spill
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    ldp x29, x30, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ld1 { v0.2d, v1.2d, v2.2d }, [x8] // 48-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #64
; CHECK-NEXT:    umov w0, v0.h[1]
; CHECK-NEXT:    ret
entry:
  %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr %arg1)
  %cmp = icmp eq i32 %arg2, 0
  br i1 %cmp, label %if.then, label %if.end

if.then:
  tail call void @foo()
  br label %if.end

if.end:
  %vld.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld, 0
  %res = extractelement <4 x i16> %vld.extract, i32 1
  ret i16 %res
}

define i16 @spill.DQuadReg(ptr %arg1, i32 %arg2) {
; CHECK-LABEL: spill.DQuadReg:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
; CHECK-NEXT:    cbz w1, .LBB2_2
; CHECK-NEXT:  // %bb.1: // %if.end
; CHECK-NEXT:    umov w0, v0.h[0]
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB2_2: // %if.then
; CHECK-NEXT:    sub sp, sp, #80
; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 80
; CHECK-NEXT:    .cfi_offset w30, -8
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x8] // 64-byte Folded Spill
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT:    ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x8] // 64-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #80
; CHECK-NEXT:    umov w0, v0.h[0]
; CHECK-NEXT:    ret
entry:
  %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr %arg1)
  %cmp = icmp eq i32 %arg2, 0
  br i1 %cmp, label %if.then, label %if.end

if.then:
  tail call void @foo()
  br label %if.end

if.end:
  %vld.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld, 0
  %res = extractelement <4 x i16> %vld.extract, i32 0
  ret i16 %res
}

define i32 @spill.QPairReg(ptr %arg1, i32 %arg2) {
; CHECK-LABEL: spill.QPairReg:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x0]
; CHECK-NEXT:    cbz w1, .LBB3_2
; CHECK-NEXT:  // %bb.1: // %if.end
; CHECK-NEXT:    mov w0, v0.s[1]
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB3_2: // %if.then
; CHECK-NEXT:    sub sp, sp, #48
; CHECK-NEXT:    stp x29, x30, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    .cfi_offset w30, -8
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    st1 { v0.2d, v1.2d }, [x8] // 32-byte Folded Spill
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ld1 { v0.2d, v1.2d }, [x8] // 32-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #48
; CHECK-NEXT:    mov w0, v0.s[1]
; CHECK-NEXT:    ret
entry:
  %vld = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %arg1)
  %cmp = icmp eq i32 %arg2, 0
  br i1 %cmp, label %if.then, label %if.end

if.then:
  tail call void @foo()
  br label %if.end

if.end:
  %vld.extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0
  %res = extractelement <4 x i32> %vld.extract, i32 1
  ret i32 %res
}

define float @spill.QTripleReg(ptr %arg1, i32 %arg2) {
; CHECK-LABEL: spill.QTripleReg:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0]
; CHECK-NEXT:    cbz w1, .LBB4_2
; CHECK-NEXT:  // %bb.1: // %if.end
; CHECK-NEXT:    mov s0, v0.s[1]
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB4_2: // %if.then
; CHECK-NEXT:    sub sp, sp, #64
; CHECK-NEXT:    stp x29, x30, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 64
; CHECK-NEXT:    .cfi_offset w30, -8
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    st1 { v0.2d, v1.2d, v2.2d }, [x8] // 48-byte Folded Spill
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    ldp x29, x30, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ld1 { v0.2d, v1.2d, v2.2d }, [x8] // 48-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #64
; CHECK-NEXT:    mov s0, v0.s[1]
; CHECK-NEXT:    ret
entry:
  %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0(ptr %arg1)
  %cmp = icmp eq i32 %arg2, 0
  br i1 %cmp, label %if.then, label %if.end

if.then:
  tail call void @foo()
  br label %if.end

if.end:
  %vld3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0
  %res = extractelement <4 x float> %vld3.extract, i32 1
  ret float %res
}

define i8 @spill.QQuadReg(ptr %arg1, i32 %arg2) {
; CHECK-LABEL: spill.QQuadReg:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0]
; CHECK-NEXT:    cbz w1, .LBB5_2
; CHECK-NEXT:  // %bb.1: // %if.end
; CHECK-NEXT:    umov w0, v0.b[1]
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB5_2: // %if.then
; CHECK-NEXT:    sub sp, sp, #80
; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 80
; CHECK-NEXT:    .cfi_offset w30, -8
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x8] // 64-byte Folded Spill
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT:    ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x8] // 64-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #80
; CHECK-NEXT:    umov w0, v0.b[1]
; CHECK-NEXT:    ret
entry:
  %vld = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr %arg1)
  %cmp = icmp eq i32 %arg2, 0
  br i1 %cmp, label %if.then, label %if.end

if.then:
  tail call void @foo()
  br label %if.end

if.end:
  %vld.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld, 0
  %res = extractelement <16 x i8> %vld.extract, i32 1
  ret i8 %res
}

declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr)
declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr)
declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr)
declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr)
declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0(ptr)
declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr)

declare void @foo()

; FIXME: We should not generate ld/st for such register spills/fills; this test
; case is very simple and the register pressure is not high. If the spill/fill
; algorithm is improved, this test may no longer be triggered and can then be
; deleted.
; check the spill for Register Class QPair_with_qsub_0_in_FPR128Lo
define <8 x i16> @test_2xFPR128Lo(i64 %got, ptr %ptr, <1 x i64> %a) {
; CHECK-LABEL: test_2xFPR128Lo:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w30, -16
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    mov v1.16b, v0.16b
; CHECK-NEXT:    st2 { v0.d, v1.d }[0], [x1]
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  tail call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, ptr %ptr)
  tail call void @foo()
  %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
  %1 = bitcast <2 x i64> %sv to <8 x i16>
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %3 = mul <8 x i16> %2, %2
  ret <8 x i16> %3
}

; check the spill for Register Class QTriple_with_qsub_0_in_FPR128Lo
define <8 x i16> @test_3xFPR128Lo(i64 %got, ptr %ptr, <1 x i64> %a) {
; CHECK-LABEL: test_3xFPR128Lo:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w30, -16
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    mov v1.16b, v0.16b
; CHECK-NEXT:    mov v2.16b, v0.16b
; CHECK-NEXT:    st3 { v0.d, v1.d, v2.d }[0], [x1]
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  tail call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, ptr %ptr)
  tail call void @foo()
  %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
  %1 = bitcast <2 x i64> %sv to <8 x i16>
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %3 = mul <8 x i16> %2, %2
  ret <8 x i16> %3
}

; check the spill for Register Class QQuad_with_qsub_0_in_FPR128Lo
define <8 x i16> @test_4xFPR128Lo(i64 %got, ptr %ptr, <1 x i64> %a) {
; CHECK-LABEL: test_4xFPR128Lo:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w30, -16
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    mov v1.16b, v0.16b
; CHECK-NEXT:    mov v2.16b, v0.16b
; CHECK-NEXT:    mov v3.16b, v0.16b
; CHECK-NEXT:    st4 { v0.d, v1.d, v2.d, v3.d }[0], [x1]
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  tail call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, ptr %ptr)
  tail call void @foo()
  %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
  %1 = bitcast <2 x i64> %sv to <8 x i16>
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %3 = mul <8 x i16> %2, %2
  ret <8 x i16> %3
}

declare void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64>, <1 x i64>, i64, ptr)
declare void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64>, <1 x i64>, <1 x i64>, i64, ptr)
declare void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, ptr)