1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
3 ; RUN: llc -mtriple=thumbebv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
; Loads a <4 x i32> from each of %src1 and %src2 (align 4), adds them and
; stores the sum back to %src1 (align 4). The expected code uses the prefix
; shared by both RUN lines, i.e. the same vldrw.u32 / vadd.i32 / vstrw.32
; sequence is required for little- and big-endian targets.
5 define void @load_load_add_store(ptr %src1, ptr %src2) {
6 ; CHECK-LABEL: load_load_add_store:
7 ; CHECK: @ %bb.0: @ %entry
8 ; CHECK-NEXT: vldrw.u32 q0, [r1]
9 ; CHECK-NEXT: vldrw.u32 q1, [r0]
10 ; CHECK-NEXT: vadd.i32 q0, q1, q0
11 ; CHECK-NEXT: vstrw.32 q0, [r0]
14 %l1 = load <4 x i32>, ptr %src1, align 4
15 %l2 = load <4 x i32>, ptr %src2, align 4
16 %a = add <4 x i32> %l1, %l2
17 store <4 x i32> %a, ptr %src1, align 4
; Same load/add/store pattern as load_load_add_store, but every memory access
; is only 1-byte aligned. Both endiannesses are expected to use byte accesses
; (vldrb.u8 / vstrb.8); the big-endian expectations additionally wrap the
; vadd.i32 in vrev32.8 on each loaded operand and on the result before the
; store.
21 define void @load_load_add_store_align1(ptr %src1, ptr %src2) {
22 ; CHECK-LE-LABEL: load_load_add_store_align1:
23 ; CHECK-LE: @ %bb.0: @ %entry
24 ; CHECK-LE-NEXT: vldrb.u8 q0, [r1]
25 ; CHECK-LE-NEXT: vldrb.u8 q1, [r0]
26 ; CHECK-LE-NEXT: vadd.i32 q0, q1, q0
27 ; CHECK-LE-NEXT: vstrb.8 q0, [r0]
28 ; CHECK-LE-NEXT: bx lr
30 ; CHECK-BE-LABEL: load_load_add_store_align1:
31 ; CHECK-BE: @ %bb.0: @ %entry
32 ; CHECK-BE-NEXT: vldrb.u8 q0, [r1]
33 ; CHECK-BE-NEXT: vldrb.u8 q1, [r0]
34 ; CHECK-BE-NEXT: vrev32.8 q0, q0
35 ; CHECK-BE-NEXT: vrev32.8 q1, q1
36 ; CHECK-BE-NEXT: vadd.i32 q0, q1, q0
37 ; CHECK-BE-NEXT: vrev32.8 q0, q0
38 ; CHECK-BE-NEXT: vstrb.8 q0, [r0]
39 ; CHECK-BE-NEXT: bx lr
41 %l1 = load <4 x i32>, ptr %src1, align 1
42 %l2 = load <4 x i32>, ptr %src2, align 1
43 %a = add <4 x i32> %l1, %l2
44 store <4 x i32> %a, ptr %src1, align 1
; Adds a <4 x i32> loaded from %src1 to the vector argument %src2 (passed
; under arm_aapcs_vfpcc) and stores the sum back to %src1. The little-endian
; expectations use the incoming q0 directly; the big-endian expectations
; first apply vrev64.32 to the incoming q0 before the add.
48 define arm_aapcs_vfpcc void @load_arg_add_store(ptr %src1, <4 x i32> %src2) {
49 ; CHECK-LE-LABEL: load_arg_add_store:
50 ; CHECK-LE: @ %bb.0: @ %entry
51 ; CHECK-LE-NEXT: vldrw.u32 q1, [r0]
52 ; CHECK-LE-NEXT: vadd.i32 q0, q1, q0
53 ; CHECK-LE-NEXT: vstrw.32 q0, [r0]
54 ; CHECK-LE-NEXT: bx lr
56 ; CHECK-BE-LABEL: load_arg_add_store:
57 ; CHECK-BE: @ %bb.0: @ %entry
58 ; CHECK-BE-NEXT: vrev64.32 q1, q0
59 ; CHECK-BE-NEXT: vldrw.u32 q0, [r0]
60 ; CHECK-BE-NEXT: vadd.i32 q0, q0, q1
61 ; CHECK-BE-NEXT: vstrw.32 q0, [r0]
62 ; CHECK-BE-NEXT: bx lr
64 %l1 = load <4 x i32>, ptr %src1, align 4
65 %a = add <4 x i32> %l1, %src2
66 store <4 x i32> %a, ptr %src1, align 4
; Adds two <4 x i32> arguments using the default calling convention (no
; arm_aapcs_vfpcc on the definition). In the expected code one operand is
; assembled from r0-r3 with vmov, the other is loaded from the address held in
; sp, and the sum is moved back into r0-r3. The big-endian expectations swap
; the register order inside each vmov pair and add vrev64.32 around the
; vadd.i32.
70 define <4 x i32> @add_soft(<4 x i32> %src1, <4 x i32> %src2) {
71 ; CHECK-LE-LABEL: add_soft:
72 ; CHECK-LE: @ %bb.0: @ %entry
73 ; CHECK-LE-NEXT: vmov d0, r0, r1
74 ; CHECK-LE-NEXT: mov r0, sp
75 ; CHECK-LE-NEXT: vldrw.u32 q1, [r0]
76 ; CHECK-LE-NEXT: vmov d1, r2, r3
77 ; CHECK-LE-NEXT: vadd.i32 q0, q0, q1
78 ; CHECK-LE-NEXT: vmov r0, r1, d0
79 ; CHECK-LE-NEXT: vmov r2, r3, d1
80 ; CHECK-LE-NEXT: bx lr
82 ; CHECK-BE-LABEL: add_soft:
83 ; CHECK-BE: @ %bb.0: @ %entry
84 ; CHECK-BE-NEXT: vmov d0, r1, r0
85 ; CHECK-BE-NEXT: mov r0, sp
86 ; CHECK-BE-NEXT: vmov d1, r3, r2
87 ; CHECK-BE-NEXT: vrev64.32 q1, q0
88 ; CHECK-BE-NEXT: vldrw.u32 q0, [r0]
89 ; CHECK-BE-NEXT: vadd.i32 q0, q1, q0
90 ; CHECK-BE-NEXT: vrev64.32 q1, q0
91 ; CHECK-BE-NEXT: vmov r1, r0, d2
92 ; CHECK-BE-NEXT: vmov r3, r2, d3
93 ; CHECK-BE-NEXT: bx lr
95 %0 = add <4 x i32> %src1, %src2
; Adds two <4 x i32> arguments under arm_aapcs_vfpcc, so the operands arrive
; in q0 and q1. The little-endian expectations are a single vadd.i32; the
; big-endian expectations apply vrev64.32 to both inputs before the add and
; to the result after it.
99 define arm_aapcs_vfpcc <4 x i32> @add_hard(<4 x i32> %src1, <4 x i32> %src2) {
100 ; CHECK-LE-LABEL: add_hard:
101 ; CHECK-LE: @ %bb.0: @ %entry
102 ; CHECK-LE-NEXT: vadd.i32 q0, q0, q1
103 ; CHECK-LE-NEXT: bx lr
105 ; CHECK-BE-LABEL: add_hard:
106 ; CHECK-BE: @ %bb.0: @ %entry
107 ; CHECK-BE-NEXT: vrev64.32 q2, q1
108 ; CHECK-BE-NEXT: vrev64.32 q1, q0
109 ; CHECK-BE-NEXT: vadd.i32 q1, q1, q2
110 ; CHECK-BE-NEXT: vrev64.32 q0, q1
111 ; CHECK-BE-NEXT: bx lr
113 %0 = add <4 x i32> %src1, %src2
; Shifts %src1 right by one (lshr), passes the result together with %src2 to
; @add_soft using the default calling convention, then shifts the returned
; vector right by one again. The expected code copies a vector from sp+24 to
; sp before the call, and moves the shifted operand and the call result
; between q0 and r0-r3 with vmov. The big-endian expectations swap the
; register order inside each vmov pair and surround each vshr.u32 with
; vrev64.32.
117 define <4 x i32> @call_soft(<4 x i32> %src1, <4 x i32> %src2) {
118 ; CHECK-LE-LABEL: call_soft:
119 ; CHECK-LE: @ %bb.0: @ %entry
120 ; CHECK-LE-NEXT: .save {r7, lr}
121 ; CHECK-LE-NEXT: push {r7, lr}
122 ; CHECK-LE-NEXT: .pad #16
123 ; CHECK-LE-NEXT: sub sp, #16
124 ; CHECK-LE-NEXT: add.w r12, sp, #24
125 ; CHECK-LE-NEXT: vldrw.u32 q0, [r12]
126 ; CHECK-LE-NEXT: vstrw.32 q0, [sp]
127 ; CHECK-LE-NEXT: vmov d1, r2, r3
128 ; CHECK-LE-NEXT: vmov d0, r0, r1
129 ; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
130 ; CHECK-LE-NEXT: vmov r0, r1, d0
131 ; CHECK-LE-NEXT: vmov r2, r3, d1
132 ; CHECK-LE-NEXT: bl add_soft
133 ; CHECK-LE-NEXT: vmov d1, r2, r3
134 ; CHECK-LE-NEXT: vmov d0, r0, r1
135 ; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
136 ; CHECK-LE-NEXT: vmov r0, r1, d0
137 ; CHECK-LE-NEXT: vmov r2, r3, d1
138 ; CHECK-LE-NEXT: add sp, #16
139 ; CHECK-LE-NEXT: pop {r7, pc}
141 ; CHECK-BE-LABEL: call_soft:
142 ; CHECK-BE: @ %bb.0: @ %entry
143 ; CHECK-BE-NEXT: .save {r7, lr}
144 ; CHECK-BE-NEXT: push {r7, lr}
145 ; CHECK-BE-NEXT: .pad #16
146 ; CHECK-BE-NEXT: sub sp, #16
147 ; CHECK-BE-NEXT: add.w r12, sp, #24
148 ; CHECK-BE-NEXT: vldrw.u32 q0, [r12]
149 ; CHECK-BE-NEXT: vstrw.32 q0, [sp]
150 ; CHECK-BE-NEXT: vmov d1, r3, r2
151 ; CHECK-BE-NEXT: vmov d0, r1, r0
152 ; CHECK-BE-NEXT: vrev64.32 q1, q0
153 ; CHECK-BE-NEXT: vshr.u32 q0, q1, #1
154 ; CHECK-BE-NEXT: vrev64.32 q1, q0
155 ; CHECK-BE-NEXT: vmov r1, r0, d2
156 ; CHECK-BE-NEXT: vmov r3, r2, d3
157 ; CHECK-BE-NEXT: bl add_soft
158 ; CHECK-BE-NEXT: vmov d1, r3, r2
159 ; CHECK-BE-NEXT: vmov d0, r1, r0
160 ; CHECK-BE-NEXT: vrev64.32 q1, q0
161 ; CHECK-BE-NEXT: vshr.u32 q0, q1, #1
162 ; CHECK-BE-NEXT: vrev64.32 q1, q0
163 ; CHECK-BE-NEXT: vmov r1, r0, d2
164 ; CHECK-BE-NEXT: vmov r3, r2, d3
165 ; CHECK-BE-NEXT: add sp, #16
166 ; CHECK-BE-NEXT: pop {r7, pc}
168 %0 = lshr <4 x i32> %src1, <i32 1, i32 1, i32 1, i32 1>
169 %1 = call <4 x i32> @add_soft(<4 x i32> %0, <4 x i32> %src2)
170 %2 = lshr <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
; arm_aapcs_vfpcc counterpart of call_soft: shifts %src1 right by one, calls
; @add_hard with the shifted value and %src2, then shifts the result right by
; one. The little-endian expectations operate on q0 in place around the call;
; the big-endian expectations surround each vshr.u32 with a pair of
; vrev64.32 instructions.
174 define arm_aapcs_vfpcc <4 x i32> @call_hard(<4 x i32> %src1, <4 x i32> %src2) {
175 ; CHECK-LE-LABEL: call_hard:
176 ; CHECK-LE: @ %bb.0: @ %entry
177 ; CHECK-LE-NEXT: .save {r7, lr}
178 ; CHECK-LE-NEXT: push {r7, lr}
179 ; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
180 ; CHECK-LE-NEXT: bl add_hard
181 ; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
182 ; CHECK-LE-NEXT: pop {r7, pc}
184 ; CHECK-BE-LABEL: call_hard:
185 ; CHECK-BE: @ %bb.0: @ %entry
186 ; CHECK-BE-NEXT: .save {r7, lr}
187 ; CHECK-BE-NEXT: push {r7, lr}
188 ; CHECK-BE-NEXT: vrev64.32 q2, q0
189 ; CHECK-BE-NEXT: vshr.u32 q2, q2, #1
190 ; CHECK-BE-NEXT: vrev64.32 q0, q2
191 ; CHECK-BE-NEXT: bl add_hard
192 ; CHECK-BE-NEXT: vrev64.32 q1, q0
193 ; CHECK-BE-NEXT: vshr.u32 q1, q1, #1
194 ; CHECK-BE-NEXT: vrev64.32 q0, q1
195 ; CHECK-BE-NEXT: pop {r7, pc}
197 %0 = lshr <4 x i32> %src1, <i32 1, i32 1, i32 1, i32 1>
198 %1 = call arm_aapcs_vfpcc <4 x i32> @add_hard(<4 x i32> %0, <4 x i32> %src2)
199 %2 = lshr <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
; Masks every i32 lane of %src with 1 and then bitcasts the <4 x i32> result
; to <16 x i8>. Both endiannesses expect the mask to be materialised as
; vmov.i32 #0x1; the big-endian expectations apply vrev64.32 to the incoming
; argument and to the value produced by the vand.
203 define arm_aapcs_vfpcc <16 x i8> @and_v4i32(<4 x i32> %src) {
204 ; CHECK-LE-LABEL: and_v4i32:
205 ; CHECK-LE: @ %bb.0: @ %entry
206 ; CHECK-LE-NEXT: vmov.i32 q1, #0x1
207 ; CHECK-LE-NEXT: vand q0, q0, q1
208 ; CHECK-LE-NEXT: bx lr
210 ; CHECK-BE-LABEL: and_v4i32:
211 ; CHECK-BE: @ %bb.0: @ %entry
212 ; CHECK-BE-NEXT: vrev64.32 q1, q0
213 ; CHECK-BE-NEXT: vmov.i32 q0, #0x1
214 ; CHECK-BE-NEXT: vand q1, q1, q0
215 ; CHECK-BE-NEXT: vrev64.32 q0, q1
216 ; CHECK-BE-NEXT: bx lr
218 %s1 = and <4 x i32> %src, <i32 1, i32 1, i32 1, i32 1>
219 %r = bitcast <4 x i32> %s1 to <16 x i8>
223 ; Should be the same as and_v4i32 for LE
; Here the bitcast to <16 x i8> happens first and the mask is a byte vector
; with a 1 in byte 0 of every group of four bytes. The little-endian
; expectations match and_v4i32 (vmov.i32 #0x1 followed by vand); the
; big-endian expectations use the same immediate but wrap the vand in
; vrev64.8.
224 define arm_aapcs_vfpcc <16 x i8> @and_v16i8_le(<4 x i32> %src) {
225 ; CHECK-LE-LABEL: and_v16i8_le:
226 ; CHECK-LE: @ %bb.0: @ %entry
227 ; CHECK-LE-NEXT: vmov.i32 q1, #0x1
228 ; CHECK-LE-NEXT: vand q0, q0, q1
229 ; CHECK-LE-NEXT: bx lr
231 ; CHECK-BE-LABEL: and_v16i8_le:
232 ; CHECK-BE: @ %bb.0: @ %entry
233 ; CHECK-BE-NEXT: vrev64.8 q1, q0
234 ; CHECK-BE-NEXT: vmov.i32 q0, #0x1
235 ; CHECK-BE-NEXT: vand q1, q1, q0
236 ; CHECK-BE-NEXT: vrev64.8 q0, q1
237 ; CHECK-BE-NEXT: bx lr
239 %0 = bitcast <4 x i32> %src to <16 x i8>
240 %r = and <16 x i8> %0, <i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0>
244 ; Should be the same (or at least equivalent) as and_v4i32 for BE
; Mirror image of and_v16i8_le: the byte mask has its 1 in byte 3 of every
; group of four bytes. Both endiannesses expect the mask as
; vmov.i32 #0x1000000; the big-endian expectations wrap the vand in
; vrev64.8.
245 define arm_aapcs_vfpcc <16 x i8> @and_v16i8_be(<4 x i32> %src) {
246 ; CHECK-LE-LABEL: and_v16i8_be:
247 ; CHECK-LE: @ %bb.0: @ %entry
248 ; CHECK-LE-NEXT: vmov.i32 q1, #0x1000000
249 ; CHECK-LE-NEXT: vand q0, q0, q1
250 ; CHECK-LE-NEXT: bx lr
252 ; CHECK-BE-LABEL: and_v16i8_be:
253 ; CHECK-BE: @ %bb.0: @ %entry
254 ; CHECK-BE-NEXT: vrev64.8 q1, q0
255 ; CHECK-BE-NEXT: vmov.i32 q0, #0x1000000
256 ; CHECK-BE-NEXT: vand q1, q1, q0
257 ; CHECK-BE-NEXT: vrev64.8 q0, q1
258 ; CHECK-BE-NEXT: bx lr
260 %0 = bitcast <4 x i32> %src to <16 x i8>
261 %r = and <16 x i8> %0, <i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1>
265 ; FIXME: This looks wrong
; Loads a <4 x i32> from %data plus eight i32 elements (the #32 byte offset in
; the expected vldrw.u32), adds 1 to every lane and feeds the result to an
; inline-asm VMULLB.s32 whose output is an early-clobber vector register
; ("=&w") and whose single input ("w") is used for both source operands.
; NOTE(review): the FIXME above does not say what looks wrong. In the
; big-endian expectations the asm input is preceded by vrev32.8 while the asm
; output is followed by vrev64.8; that asymmetry may be what is meant, but
; this should be confirmed.
266 define arm_aapcs_vfpcc <4 x i32> @test(ptr %data) {
267 ; CHECK-LE-LABEL: test:
268 ; CHECK-LE: @ %bb.0: @ %entry
269 ; CHECK-LE-NEXT: vldrw.u32 q0, [r0, #32]
270 ; CHECK-LE-NEXT: movs r0, #1
271 ; CHECK-LE-NEXT: vadd.i32 q1, q0, r0
272 ; CHECK-LE-NEXT: @APP
273 ; CHECK-LE-NEXT: vmullb.s32 q0, q1, q1
274 ; CHECK-LE-NEXT: @NO_APP
275 ; CHECK-LE-NEXT: bx lr
277 ; CHECK-BE-LABEL: test:
278 ; CHECK-BE: @ %bb.0: @ %entry
279 ; CHECK-BE-NEXT: vldrw.u32 q0, [r0, #32]
280 ; CHECK-BE-NEXT: movs r0, #1
281 ; CHECK-BE-NEXT: vadd.i32 q0, q0, r0
282 ; CHECK-BE-NEXT: vrev32.8 q0, q0
283 ; CHECK-BE-NEXT: @APP
284 ; CHECK-BE-NEXT: vmullb.s32 q1, q0, q0
285 ; CHECK-BE-NEXT: @NO_APP
286 ; CHECK-BE-NEXT: vrev64.8 q0, q1
287 ; CHECK-BE-NEXT: bx lr
289 %add.ptr = getelementptr inbounds i32, ptr %data, i32 8
290 %0 = load <4 x i32>, ptr %add.ptr, align 4
291 %1 = add <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
292 %2 = tail call <4 x i32> asm sideeffect " VMULLB.s32 $0, $1, $1", "=&w,w"(<4 x i32> %1) #2
296 ; Test case demonstrating that 'bitcast' reinterprets the memory format of a
297 ; vector, as if stored and then loaded. So if it has to go between two
298 ; operations treating a register as having different lane sizes, then in
299 ; big-endian mode, it has to emit a vrev32.16, which is equivalent to the
300 ; effect that vstrw.32 followed by vldrh.16 would have.
; The IR squares the loaded <4 x i32>, bitcasts the product to <8 x i16>,
; squares that, and stores it to %out. The little-endian expectations have no
; instruction between the two multiplies and store with vstrw.32; the
; big-endian expectations place vrev32.16 between them and store with
; vstrh.16.
301 define arm_aapcs_vfpcc void @test_bitcast(ptr readonly %in, ptr %out) {
302 ; CHECK-LE-LABEL: test_bitcast:
303 ; CHECK-LE: @ %bb.0: @ %entry
304 ; CHECK-LE-NEXT: vldrw.u32 q0, [r0]
305 ; CHECK-LE-NEXT: vmul.i32 q0, q0, q0
306 ; CHECK-LE-NEXT: vmul.i16 q0, q0, q0
307 ; CHECK-LE-NEXT: vstrw.32 q0, [r1]
308 ; CHECK-LE-NEXT: bx lr
310 ; CHECK-BE-LABEL: test_bitcast:
311 ; CHECK-BE: @ %bb.0: @ %entry
312 ; CHECK-BE-NEXT: vldrw.u32 q0, [r0]
313 ; CHECK-BE-NEXT: vmul.i32 q0, q0, q0
314 ; CHECK-BE-NEXT: vrev32.16 q0, q0
315 ; CHECK-BE-NEXT: vmul.i16 q0, q0, q0
316 ; CHECK-BE-NEXT: vstrh.16 q0, [r1]
317 ; CHECK-BE-NEXT: bx lr
319 %vin = load <4 x i32>, ptr %in, align 8
320 %vdbl = mul <4 x i32> %vin, %vin
321 %cast = bitcast <4 x i32> %vdbl to <8 x i16>
322 %cdbl = mul <8 x i16> %cast, %cast
323 store <8 x i16> %cdbl, ptr %out, align 8
327 ; Similar test case but using the arm.mve.vreinterpretq intrinsic instead,
328 ; which is defined to reinterpret the in-register format, so it generates no
329 ; instruction in either endianness.
; The IR is the same as test_bitcast except that the <4 x i32> to <8 x i16>
; conversion is a call to @llvm.arm.mve.vreinterpretq.v8i16.v4i32. In both
; sets of expectations vmul.i16 directly follows vmul.i32; only the store
; differs (vstrw.32 for little-endian, vstrh.16 for big-endian).
330 define arm_aapcs_vfpcc void @test_vreinterpretq(ptr readonly %in, ptr %out) {
331 ; CHECK-LE-LABEL: test_vreinterpretq:
332 ; CHECK-LE: @ %bb.0: @ %entry
333 ; CHECK-LE-NEXT: vldrw.u32 q0, [r0]
334 ; CHECK-LE-NEXT: vmul.i32 q0, q0, q0
335 ; CHECK-LE-NEXT: vmul.i16 q0, q0, q0
336 ; CHECK-LE-NEXT: vstrw.32 q0, [r1]
337 ; CHECK-LE-NEXT: bx lr
339 ; CHECK-BE-LABEL: test_vreinterpretq:
340 ; CHECK-BE: @ %bb.0: @ %entry
341 ; CHECK-BE-NEXT: vldrw.u32 q0, [r0]
342 ; CHECK-BE-NEXT: vmul.i32 q0, q0, q0
343 ; CHECK-BE-NEXT: vmul.i16 q0, q0, q0
344 ; CHECK-BE-NEXT: vstrh.16 q0, [r1]
345 ; CHECK-BE-NEXT: bx lr
347 %vin = load <4 x i32>, ptr %in, align 8
348 %vdbl = mul <4 x i32> %vin, %vin
349 %cast = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v4i32(<4 x i32> %vdbl)
350 %cdbl = mul <8 x i16> %cast, %cast
351 store <8 x i16> %cdbl, ptr %out, align 8
; Calls the vreinterpretq intrinsic (<4 x float> to <8 x half>) on an undef
; operand. The only expectation visible for this function is its label, shared
; by both RUN lines.
; NOTE(review): presumably this guards against a crash when the operand is
; undef - confirm against the commit that added it.
355 define arm_aapcs_vfpcc <8 x half> @undef_one() {
356 ; CHECK-LABEL: undef_one:
359 %c = call <8 x half> @llvm.arm.mve.vreinterpretq.v8f16.v4f32(<4 x float> undef)
; Declarations of the MVE vreinterpretq intrinsic overloads called above:
; the v8f16.v4f32 form by undef_one and the v8i16.v4i32 form by
; test_vreinterpretq.
363 declare <8 x half> @llvm.arm.mve.vreinterpretq.v8f16.v4f32(<4 x float>)
364 declare <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v4i32(<4 x i32>)