1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
3 ; RUN: llc -mtriple=thumbebv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
; NOTE(review): this excerpt is a truncated paste — the embedded original line
; numbers skip (e.g. 11 -> 14), so structural lines such as 'entry:', 'ret void'
; and closing '}' braces are missing throughout; restore from the original file
; before running this test.
;
; Aligned (align 4) <4 x i32> add through memory: both operands are loaded
; with vldrw.u32, summed, and the result stored back over *%src1. The shared
; CHECK prefix (used by both the LE and BE RUN lines above) shows that codegen
; is identical for both endiannesses in this naturally-aligned case.
5 define void @load_load_add_store(ptr %src1, ptr %src2) {
6 ; CHECK-LABEL: load_load_add_store:
7 ; CHECK: @ %bb.0: @ %entry
8 ; CHECK-NEXT: vldrw.u32 q0, [r1]
9 ; CHECK-NEXT: vldrw.u32 q1, [r0]
10 ; CHECK-NEXT: vadd.i32 q0, q1, q0
11 ; CHECK-NEXT: vstrw.32 q0, [r0]
14 %l1 = load <4 x i32>, ptr %src1, align 4
15 %l2 = load <4 x i32>, ptr %src2, align 4
16 %a = add <4 x i32> %l1, %l2
17 store <4 x i32> %a, ptr %src1, align 4
; Same vector add but with align-1 accesses: the backend must use the
; byte-wise vldrb.u8/vstrb.8 forms. LE needs no further fixups, while BE
; additionally wraps the vadd.i32 in vrev32.8 instructions to convert
; between memory byte order and 32-bit register-lane order.
21 define void @load_load_add_store_align1(ptr %src1, ptr %src2) {
22 ; CHECK-LE-LABEL: load_load_add_store_align1:
23 ; CHECK-LE: @ %bb.0: @ %entry
24 ; CHECK-LE-NEXT: vldrb.u8 q0, [r1]
25 ; CHECK-LE-NEXT: vldrb.u8 q1, [r0]
26 ; CHECK-LE-NEXT: vadd.i32 q0, q1, q0
27 ; CHECK-LE-NEXT: vstrb.8 q0, [r0]
28 ; CHECK-LE-NEXT: bx lr
30 ; CHECK-BE-LABEL: load_load_add_store_align1:
31 ; CHECK-BE: @ %bb.0: @ %entry
32 ; CHECK-BE-NEXT: vldrb.u8 q0, [r1]
33 ; CHECK-BE-NEXT: vldrb.u8 q1, [r0]
34 ; CHECK-BE-NEXT: vrev32.8 q0, q0
35 ; CHECK-BE-NEXT: vrev32.8 q1, q1
36 ; CHECK-BE-NEXT: vadd.i32 q0, q1, q0
37 ; CHECK-BE-NEXT: vrev32.8 q0, q0
38 ; CHECK-BE-NEXT: vstrb.8 q0, [r0]
39 ; CHECK-BE-NEXT: bx lr
41 %l1 = load <4 x i32>, ptr %src1, align 1
42 %l2 = load <4 x i32>, ptr %src2, align 1
43 %a = add <4 x i32> %l1, %l2
44 store <4 x i32> %a, ptr %src1, align 1
; One operand loaded from memory, the other passed directly in q0 via the
; hard-float (arm_aapcs_vfpcc) ABI. In BE mode the incoming q-register
; argument is first reinterpreted with vrev64.32 before the lane-wise add.
48 define arm_aapcs_vfpcc void @load_arg_add_store(ptr %src1, <4 x i32> %src2) {
49 ; CHECK-LE-LABEL: load_arg_add_store:
50 ; CHECK-LE: @ %bb.0: @ %entry
51 ; CHECK-LE-NEXT: vldrw.u32 q1, [r0]
52 ; CHECK-LE-NEXT: vadd.i32 q0, q1, q0
53 ; CHECK-LE-NEXT: vstrw.32 q0, [r0]
54 ; CHECK-LE-NEXT: bx lr
56 ; CHECK-BE-LABEL: load_arg_add_store:
57 ; CHECK-BE: @ %bb.0: @ %entry
58 ; CHECK-BE-NEXT: vrev64.32 q1, q0
59 ; CHECK-BE-NEXT: vldrw.u32 q0, [r0]
60 ; CHECK-BE-NEXT: vadd.i32 q0, q0, q1
61 ; CHECK-BE-NEXT: vstrw.32 q0, [r0]
62 ; CHECK-BE-NEXT: bx lr
64 %l1 = load <4 x i32>, ptr %src1, align 4
65 %a = add <4 x i32> %l1, %src2
66 store <4 x i32> %a, ptr %src1, align 4
; Soft-float ABI: the first vector argument arrives split across r0-r3, the
; second on the stack, and the result is returned in r0-r3. The BE checks
; show the GPR pairs moved into d-registers in swapped order
; (vmov d0, r1, r0) plus vrev64.32 fixups to match register lane numbering.
70 define <4 x i32> @add_soft(<4 x i32> %src1, <4 x i32> %src2) {
71 ; CHECK-LE-LABEL: add_soft:
72 ; CHECK-LE: @ %bb.0: @ %entry
73 ; CHECK-LE-NEXT: vmov d0, r0, r1
74 ; CHECK-LE-NEXT: mov r0, sp
75 ; CHECK-LE-NEXT: vldrw.u32 q1, [r0]
76 ; CHECK-LE-NEXT: vmov d1, r2, r3
77 ; CHECK-LE-NEXT: vadd.i32 q0, q0, q1
78 ; CHECK-LE-NEXT: vmov r0, r1, d0
79 ; CHECK-LE-NEXT: vmov r2, r3, d1
80 ; CHECK-LE-NEXT: bx lr
82 ; CHECK-BE-LABEL: add_soft:
83 ; CHECK-BE: @ %bb.0: @ %entry
84 ; CHECK-BE-NEXT: vmov d0, r1, r0
85 ; CHECK-BE-NEXT: mov r0, sp
86 ; CHECK-BE-NEXT: vmov d1, r3, r2
87 ; CHECK-BE-NEXT: vrev64.32 q1, q0
88 ; CHECK-BE-NEXT: vldrw.u32 q0, [r0]
89 ; CHECK-BE-NEXT: vadd.i32 q0, q1, q0
90 ; CHECK-BE-NEXT: vrev64.32 q1, q0
91 ; CHECK-BE-NEXT: vmov r1, r0, d2
92 ; CHECK-BE-NEXT: vmov r3, r2, d3
93 ; CHECK-BE-NEXT: bx lr
95 %0 = add <4 x i32> %src1, %src2
; Hard-float ABI: arguments and result live directly in q0/q1, so LE is a
; single vadd.i32. BE brackets the add with vrev64.32 to translate between
; the ABI's in-register format and the lane order vadd.i32 operates on.
99 define arm_aapcs_vfpcc <4 x i32> @add_hard(<4 x i32> %src1, <4 x i32> %src2) {
100 ; CHECK-LE-LABEL: add_hard:
101 ; CHECK-LE: @ %bb.0: @ %entry
102 ; CHECK-LE-NEXT: vadd.i32 q0, q0, q1
103 ; CHECK-LE-NEXT: bx lr
105 ; CHECK-BE-LABEL: add_hard:
106 ; CHECK-BE: @ %bb.0: @ %entry
107 ; CHECK-BE-NEXT: vrev64.32 q2, q1
108 ; CHECK-BE-NEXT: vrev64.32 q1, q0
109 ; CHECK-BE-NEXT: vadd.i32 q1, q1, q2
110 ; CHECK-BE-NEXT: vrev64.32 q0, q1
111 ; CHECK-BE-NEXT: bx lr
113 %0 = add <4 x i32> %src1, %src2
; Caller side of the soft-float ABI: the lshr operations before and after the
; call to @add_soft make the values live in q-registers on each side of the
; call boundary, exercising the GPR<->q-register argument/return marshalling
; (and, for BE, the vrev64.32 conversions around each vshr.u32).
117 define <4 x i32> @call_soft(<4 x i32> %src1, <4 x i32> %src2) {
118 ; CHECK-LE-LABEL: call_soft:
119 ; CHECK-LE: @ %bb.0: @ %entry
120 ; CHECK-LE-NEXT: .save {r7, lr}
121 ; CHECK-LE-NEXT: push {r7, lr}
122 ; CHECK-LE-NEXT: .pad #16
123 ; CHECK-LE-NEXT: sub sp, #16
124 ; CHECK-LE-NEXT: add.w r12, sp, #24
125 ; CHECK-LE-NEXT: vldrw.u32 q0, [r12]
126 ; CHECK-LE-NEXT: vstrw.32 q0, [sp]
127 ; CHECK-LE-NEXT: vmov d1, r2, r3
128 ; CHECK-LE-NEXT: vmov d0, r0, r1
129 ; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
130 ; CHECK-LE-NEXT: vmov r0, r1, d0
131 ; CHECK-LE-NEXT: vmov r2, r3, d1
132 ; CHECK-LE-NEXT: bl add_soft
133 ; CHECK-LE-NEXT: vmov d1, r2, r3
134 ; CHECK-LE-NEXT: vmov d0, r0, r1
135 ; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
136 ; CHECK-LE-NEXT: vmov r0, r1, d0
137 ; CHECK-LE-NEXT: vmov r2, r3, d1
138 ; CHECK-LE-NEXT: add sp, #16
139 ; CHECK-LE-NEXT: pop {r7, pc}
141 ; CHECK-BE-LABEL: call_soft:
142 ; CHECK-BE: @ %bb.0: @ %entry
143 ; CHECK-BE-NEXT: .save {r7, lr}
144 ; CHECK-BE-NEXT: push {r7, lr}
145 ; CHECK-BE-NEXT: .pad #16
146 ; CHECK-BE-NEXT: sub sp, #16
147 ; CHECK-BE-NEXT: add.w r12, sp, #24
148 ; CHECK-BE-NEXT: vldrw.u32 q0, [r12]
149 ; CHECK-BE-NEXT: vstrw.32 q0, [sp]
150 ; CHECK-BE-NEXT: vmov d1, r3, r2
151 ; CHECK-BE-NEXT: vmov d0, r1, r0
152 ; CHECK-BE-NEXT: vrev64.32 q1, q0
153 ; CHECK-BE-NEXT: vshr.u32 q0, q1, #1
154 ; CHECK-BE-NEXT: vrev64.32 q1, q0
155 ; CHECK-BE-NEXT: vmov r1, r0, d2
156 ; CHECK-BE-NEXT: vmov r3, r2, d3
157 ; CHECK-BE-NEXT: bl add_soft
158 ; CHECK-BE-NEXT: vmov d1, r3, r2
159 ; CHECK-BE-NEXT: vmov d0, r1, r0
160 ; CHECK-BE-NEXT: vrev64.32 q1, q0
161 ; CHECK-BE-NEXT: vshr.u32 q0, q1, #1
162 ; CHECK-BE-NEXT: vrev64.32 q1, q0
163 ; CHECK-BE-NEXT: vmov r1, r0, d2
164 ; CHECK-BE-NEXT: vmov r3, r2, d3
165 ; CHECK-BE-NEXT: add sp, #16
166 ; CHECK-BE-NEXT: pop {r7, pc}
168 %0 = lshr <4 x i32> %src1, <i32 1, i32 1, i32 1, i32 1>
169 %1 = call <4 x i32> @add_soft(<4 x i32> %0, <4 x i32> %src2)
170 %2 = lshr <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
; Caller side of the hard-float ABI: no GPR marshalling is needed, so LE
; only shifts before and after the call. BE wraps each vshr.u32 in
; vrev64.32 pairs to convert to/from the in-register argument format.
174 define arm_aapcs_vfpcc <4 x i32> @call_hard(<4 x i32> %src1, <4 x i32> %src2) {
175 ; CHECK-LE-LABEL: call_hard:
176 ; CHECK-LE: @ %bb.0: @ %entry
177 ; CHECK-LE-NEXT: .save {r7, lr}
178 ; CHECK-LE-NEXT: push {r7, lr}
179 ; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
180 ; CHECK-LE-NEXT: bl add_hard
181 ; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
182 ; CHECK-LE-NEXT: pop {r7, pc}
184 ; CHECK-BE-LABEL: call_hard:
185 ; CHECK-BE: @ %bb.0: @ %entry
186 ; CHECK-BE-NEXT: .save {r7, lr}
187 ; CHECK-BE-NEXT: push {r7, lr}
188 ; CHECK-BE-NEXT: vrev64.32 q2, q0
189 ; CHECK-BE-NEXT: vshr.u32 q2, q2, #1
190 ; CHECK-BE-NEXT: vrev64.32 q0, q2
191 ; CHECK-BE-NEXT: bl add_hard
192 ; CHECK-BE-NEXT: vrev64.32 q1, q0
193 ; CHECK-BE-NEXT: vshr.u32 q1, q1, #1
194 ; CHECK-BE-NEXT: vrev64.32 q0, q1
195 ; CHECK-BE-NEXT: pop {r7, pc}
197 %0 = lshr <4 x i32> %src1, <i32 1, i32 1, i32 1, i32 1>
198 %1 = call arm_aapcs_vfpcc <4 x i32> @add_hard(<4 x i32> %0, <4 x i32> %src2)
199 %2 = lshr <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
; AND of a <4 x i32> with a splat-of-1 immediate, with the result bitcast to
; <16 x i8> for the return. The bitcast produces no instruction; BE only
; needs its usual vrev64.32 around the 32-bit vand.
203 define arm_aapcs_vfpcc <16 x i8> @and_v4i32(<4 x i32> %src) {
204 ; CHECK-LE-LABEL: and_v4i32:
205 ; CHECK-LE: @ %bb.0: @ %entry
206 ; CHECK-LE-NEXT: vmov.i32 q1, #0x1
207 ; CHECK-LE-NEXT: vand q0, q0, q1
208 ; CHECK-LE-NEXT: bx lr
210 ; CHECK-BE-LABEL: and_v4i32:
211 ; CHECK-BE: @ %bb.0: @ %entry
212 ; CHECK-BE-NEXT: vrev64.32 q1, q0
213 ; CHECK-BE-NEXT: vmov.i32 q0, #0x1
214 ; CHECK-BE-NEXT: vand q1, q1, q0
215 ; CHECK-BE-NEXT: vrev64.32 q0, q1
216 ; CHECK-BE-NEXT: bx lr
218 %s1 = and <4 x i32> %src, <i32 1, i32 1, i32 1, i32 1>
219 %r = bitcast <4 x i32> %s1 to <16 x i8>
223 ; Should be the same as and_v4i32 for LE
; The byte mask <1,0,0,0,...> matches the LE memory layout of a splat-1
; <4 x i32>, so LE folds it back to vmov.i32 #0x1. BE materialises the same
; constant and then applies vrev32.8 to get the byte-lane order right.
224 define arm_aapcs_vfpcc <16 x i8> @and_v16i8_le(<4 x i32> %src) {
225 ; CHECK-LE-LABEL: and_v16i8_le:
226 ; CHECK-LE: @ %bb.0: @ %entry
227 ; CHECK-LE-NEXT: vmov.i32 q1, #0x1
228 ; CHECK-LE-NEXT: vand q0, q0, q1
229 ; CHECK-LE-NEXT: bx lr
231 ; CHECK-BE-LABEL: and_v16i8_le:
232 ; CHECK-BE: @ %bb.0: @ %entry
233 ; CHECK-BE-NEXT: vrev64.8 q1, q0
234 ; CHECK-BE-NEXT: vmov.i32 q0, #0x1
235 ; CHECK-BE-NEXT: vrev32.8 q0, q0
236 ; CHECK-BE-NEXT: vand q1, q1, q0
237 ; CHECK-BE-NEXT: vrev64.8 q0, q1
238 ; CHECK-BE-NEXT: bx lr
240 %0 = bitcast <4 x i32> %src to <16 x i8>
241 %r = and <16 x i8> %0, <i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0>
245 ; Should be the same (or at least equivalent) as and_v4i32 for BE
; Mirror of and_v16i8_le: the byte mask <0,0,0,1,...> corresponds to the BE
; memory layout of a splat-1 <4 x i32>, hence the #0x1000000 immediate in
; both endiannesses.
246 define arm_aapcs_vfpcc <16 x i8> @and_v16i8_be(<4 x i32> %src) {
247 ; CHECK-LE-LABEL: and_v16i8_be:
248 ; CHECK-LE: @ %bb.0: @ %entry
249 ; CHECK-LE-NEXT: vmov.i32 q1, #0x1000000
250 ; CHECK-LE-NEXT: vand q0, q0, q1
251 ; CHECK-LE-NEXT: bx lr
253 ; CHECK-BE-LABEL: and_v16i8_be:
254 ; CHECK-BE: @ %bb.0: @ %entry
255 ; CHECK-BE-NEXT: vrev64.8 q1, q0
256 ; CHECK-BE-NEXT: vmov.i32 q0, #0x1000000
257 ; CHECK-BE-NEXT: vrev32.8 q0, q0
258 ; CHECK-BE-NEXT: vand q1, q1, q0
259 ; CHECK-BE-NEXT: vrev64.8 q0, q1
260 ; CHECK-BE-NEXT: bx lr
262 %0 = bitcast <4 x i32> %src to <16 x i8>
263 %r = and <16 x i8> %0, <i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1>
267 ; FIXME: This looks wrong
; Inline-asm test: the add result is fed into a VMULLB.s32 via an
; "=&w,w" asm constraint. The pre-existing FIXME above presumably refers to
; the BE sequence, where a vrev32.8 is emitted on the value before entering
; the asm block — verify against expected BE inline-asm operand semantics.
268 define arm_aapcs_vfpcc <4 x i32> @test(ptr %data) {
269 ; CHECK-LE-LABEL: test:
270 ; CHECK-LE: @ %bb.0: @ %entry
271 ; CHECK-LE-NEXT: vldrw.u32 q0, [r0, #32]
272 ; CHECK-LE-NEXT: movs r0, #1
273 ; CHECK-LE-NEXT: vadd.i32 q1, q0, r0
274 ; CHECK-LE-NEXT: @APP
275 ; CHECK-LE-NEXT: vmullb.s32 q0, q1, q1
276 ; CHECK-LE-NEXT: @NO_APP
277 ; CHECK-LE-NEXT: bx lr
279 ; CHECK-BE-LABEL: test:
280 ; CHECK-BE: @ %bb.0: @ %entry
281 ; CHECK-BE-NEXT: movs r1, #1
282 ; CHECK-BE-NEXT: vldrw.u32 q1, [r0, #32]
283 ; CHECK-BE-NEXT: vdup.32 q0, r1
284 ; CHECK-BE-NEXT: vadd.i32 q0, q1, q0
285 ; CHECK-BE-NEXT: vrev32.8 q0, q0
286 ; CHECK-BE-NEXT: @APP
287 ; CHECK-BE-NEXT: vmullb.s32 q1, q0, q0
288 ; CHECK-BE-NEXT: @NO_APP
289 ; CHECK-BE-NEXT: vrev64.8 q0, q1
290 ; CHECK-BE-NEXT: bx lr
292 %add.ptr = getelementptr inbounds i32, ptr %data, i32 8
293 %0 = load <4 x i32>, ptr %add.ptr, align 4
294 %1 = add <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
295 %2 = tail call <4 x i32> asm sideeffect " VMULLB.s32 $0, $1, $1", "=&w,w"(<4 x i32> %1) #2
299 ; Test case demonstrating that 'bitcast' reinterprets the memory format of a
300 ; vector, as if stored and then loaded. So if it has to go between two
301 ; operations treating a register as having different lane sizes, then in
302 ; big-endian mode, it has to emit a vrev32.16, which is equivalent to the
303 ; effect that vstrw.32 followed by vldrh.16 would have.
; (The checks confirm this: LE needs no instruction for the bitcast, while
; BE inserts vrev32.16 between the i32 and i16 multiplies and stores the
; final value with the 16-bit vstrh.16 form.)
304 define arm_aapcs_vfpcc void @test_bitcast(ptr readonly %in, ptr %out) {
305 ; CHECK-LE-LABEL: test_bitcast:
306 ; CHECK-LE: @ %bb.0: @ %entry
307 ; CHECK-LE-NEXT: vldrw.u32 q0, [r0]
308 ; CHECK-LE-NEXT: vmul.i32 q0, q0, q0
309 ; CHECK-LE-NEXT: vmul.i16 q0, q0, q0
310 ; CHECK-LE-NEXT: vstrw.32 q0, [r1]
311 ; CHECK-LE-NEXT: bx lr
313 ; CHECK-BE-LABEL: test_bitcast:
314 ; CHECK-BE: @ %bb.0: @ %entry
315 ; CHECK-BE-NEXT: vldrw.u32 q0, [r0]
316 ; CHECK-BE-NEXT: vmul.i32 q0, q0, q0
317 ; CHECK-BE-NEXT: vrev32.16 q0, q0
318 ; CHECK-BE-NEXT: vmul.i16 q0, q0, q0
319 ; CHECK-BE-NEXT: vstrh.16 q0, [r1]
320 ; CHECK-BE-NEXT: bx lr
322 %vin = load <4 x i32>, ptr %in, align 8
323 %vdbl = mul <4 x i32> %vin, %vin
324 %cast = bitcast <4 x i32> %vdbl to <8 x i16>
325 %cdbl = mul <8 x i16> %cast, %cast
326 store <8 x i16> %cdbl, ptr %out, align 8
330 ; Similar test case but using the arm.mve.vreinterpretq intrinsic instead,
331 ; which is defined to reinterpret the in-register format, so it generates no
332 ; instruction in either endianness.
; (Contrast with test_bitcast above: here the BE checks contain no vrev,
; only the lane-size change of the final vstrh.16 store.)
333 define arm_aapcs_vfpcc void @test_vreinterpretq(ptr readonly %in, ptr %out) {
334 ; CHECK-LE-LABEL: test_vreinterpretq:
335 ; CHECK-LE: @ %bb.0: @ %entry
336 ; CHECK-LE-NEXT: vldrw.u32 q0, [r0]
337 ; CHECK-LE-NEXT: vmul.i32 q0, q0, q0
338 ; CHECK-LE-NEXT: vmul.i16 q0, q0, q0
339 ; CHECK-LE-NEXT: vstrw.32 q0, [r1]
340 ; CHECK-LE-NEXT: bx lr
342 ; CHECK-BE-LABEL: test_vreinterpretq:
343 ; CHECK-BE: @ %bb.0: @ %entry
344 ; CHECK-BE-NEXT: vldrw.u32 q0, [r0]
345 ; CHECK-BE-NEXT: vmul.i32 q0, q0, q0
346 ; CHECK-BE-NEXT: vmul.i16 q0, q0, q0
347 ; CHECK-BE-NEXT: vstrh.16 q0, [r1]
348 ; CHECK-BE-NEXT: bx lr
350 %vin = load <4 x i32>, ptr %in, align 8
351 %vdbl = mul <4 x i32> %vin, %vin
352 %cast = call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v4i32(<4 x i32> %vdbl)
353 %cdbl = mul <8 x i16> %cast, %cast
354 store <8 x i16> %cdbl, ptr %out, align 8
; vreinterpretq of an undef <4 x float> to <8 x half>.
; NOTE(review): this function is heavily truncated in this excerpt — the
; embedded line numbers jump 359 -> 362 -> 366, so its CHECK body, ret and
; closing brace are missing; restore from the original file.
358 define arm_aapcs_vfpcc <8 x half> @undef_one() {
359 ; CHECK-LABEL: undef_one:
362 %c = call <8 x half> @llvm.arm.mve.vreinterpretq.v8f16.v4f32(<4 x float> undef)
; Declarations of the MVE vreinterpretq intrinsics used by the tests above.
366 declare <8 x half> @llvm.arm.mve.vreinterpretq.v8f16.v4f32(<4 x float>)
367 declare <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v4i32(<4 x i32>)