1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
3 ; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
; Loads two <4 x i32> vectors through %src1 and %src2 (align 4), adds them and
; stores the sum back through %src1. The assertions below use only the prefix
; shared by both RUN lines, so the little-endian and big-endian runs are checked
; against the same instruction sequence.
5 define void @load_load_add_store(<4 x i32> *%src1, <4 x i32> *%src2) {
6 ; CHECK-LABEL: load_load_add_store:
7 ; CHECK: @ %bb.0: @ %entry
8 ; CHECK-NEXT: vldrw.u32 q0, [r1]
9 ; CHECK-NEXT: vldrw.u32 q1, [r0]
10 ; CHECK-NEXT: vadd.i32 q0, q1, q0
11 ; CHECK-NEXT: vstrw.32 q0, [r0]
14 %l1 = load <4 x i32>, <4 x i32>* %src1, align 4
15 %l2 = load <4 x i32>, <4 x i32>* %src2, align 4
16 %a = add <4 x i32> %l1, %l2
17 store <4 x i32> %a, <4 x i32>* %src1, align 4
; Same load/load/add/store pattern as load_load_add_store, but every memory
; access is marked align 1. Both runs expect byte-sized vector accesses
; (vldrb.u8 / vstrb.8); the big-endian run additionally expects a vrev32.8 on
; each loaded operand before the add and on the sum before the store.
21 define void @load_load_add_store_align1(<4 x i32> *%src1, <4 x i32> *%src2) {
22 ; CHECK-LE-LABEL: load_load_add_store_align1:
23 ; CHECK-LE: @ %bb.0: @ %entry
24 ; CHECK-LE-NEXT: vldrb.u8 q0, [r1]
25 ; CHECK-LE-NEXT: vldrb.u8 q1, [r0]
26 ; CHECK-LE-NEXT: vadd.i32 q0, q1, q0
27 ; CHECK-LE-NEXT: vstrb.8 q0, [r0]
28 ; CHECK-LE-NEXT: bx lr
30 ; CHECK-BE-LABEL: load_load_add_store_align1:
31 ; CHECK-BE: @ %bb.0: @ %entry
32 ; CHECK-BE-NEXT: vldrb.u8 q0, [r1]
33 ; CHECK-BE-NEXT: vldrb.u8 q1, [r0]
34 ; CHECK-BE-NEXT: vrev32.8 q0, q0
35 ; CHECK-BE-NEXT: vrev32.8 q1, q1
36 ; CHECK-BE-NEXT: vadd.i32 q0, q1, q0
37 ; CHECK-BE-NEXT: vrev32.8 q0, q0
38 ; CHECK-BE-NEXT: vstrb.8 q0, [r0]
39 ; CHECK-BE-NEXT: bx lr
41 %l1 = load <4 x i32>, <4 x i32>* %src1, align 1
42 %l2 = load <4 x i32>, <4 x i32>* %src2, align 1
43 %a = add <4 x i32> %l1, %l2
44 store <4 x i32> %a, <4 x i32>* %src1, align 1
; Adds a vector loaded through %src1 to the vector argument %src2 (passed under
; arm_aapcs_vfpcc) and stores the sum back through %src1. The little-endian run
; expects the argument to be used directly from q0; the big-endian run expects
; a vrev64.32 of the incoming argument before the add.
48 define arm_aapcs_vfpcc void @load_arg_add_store(<4 x i32> *%src1, <4 x i32> %src2) {
49 ; CHECK-LE-LABEL: load_arg_add_store:
50 ; CHECK-LE: @ %bb.0: @ %entry
51 ; CHECK-LE-NEXT: vldrw.u32 q1, [r0]
52 ; CHECK-LE-NEXT: vadd.i32 q0, q1, q0
53 ; CHECK-LE-NEXT: vstrw.32 q0, [r0]
54 ; CHECK-LE-NEXT: bx lr
56 ; CHECK-BE-LABEL: load_arg_add_store:
57 ; CHECK-BE: @ %bb.0: @ %entry
58 ; CHECK-BE-NEXT: vrev64.32 q1, q0
59 ; CHECK-BE-NEXT: vldrw.u32 q0, [r0]
60 ; CHECK-BE-NEXT: vadd.i32 q0, q0, q1
61 ; CHECK-BE-NEXT: vstrw.32 q0, [r0]
62 ; CHECK-BE-NEXT: bx lr
64 %l1 = load <4 x i32>, <4 x i32>* %src1, align 4
65 %a = add <4 x i32> %l1, %src2
66 store <4 x i32> %a, <4 x i32>* %src1, align 4
; Vector add of two <4 x i32> arguments with no explicit calling convention on
; the define. The expected output assembles %src1 from r0-r3 with vmov, loads
; the second operand from the stack (mov r0, sp + vldrw.u32), and moves the sum
; back into r0-r3. The big-endian run expects the register pairs in the vmovs
; to be swapped and a vrev64.32 on the register-built operand and on the result.
70 define <4 x i32> @add_soft(<4 x i32> %src1, <4 x i32> %src2) {
71 ; CHECK-LE-LABEL: add_soft:
72 ; CHECK-LE: @ %bb.0: @ %entry
73 ; CHECK-LE-NEXT: vmov d1, r2, r3
74 ; CHECK-LE-NEXT: vmov d0, r0, r1
75 ; CHECK-LE-NEXT: mov r0, sp
76 ; CHECK-LE-NEXT: vldrw.u32 q1, [r0]
77 ; CHECK-LE-NEXT: vadd.i32 q0, q0, q1
78 ; CHECK-LE-NEXT: vmov r0, r1, d0
79 ; CHECK-LE-NEXT: vmov r2, r3, d1
80 ; CHECK-LE-NEXT: bx lr
82 ; CHECK-BE-LABEL: add_soft:
83 ; CHECK-BE: @ %bb.0: @ %entry
84 ; CHECK-BE-NEXT: vmov d1, r3, r2
85 ; CHECK-BE-NEXT: vmov d0, r1, r0
86 ; CHECK-BE-NEXT: mov r0, sp
87 ; CHECK-BE-NEXT: vrev64.32 q1, q0
88 ; CHECK-BE-NEXT: vldrw.u32 q0, [r0]
89 ; CHECK-BE-NEXT: vadd.i32 q0, q1, q0
90 ; CHECK-BE-NEXT: vrev64.32 q1, q0
91 ; CHECK-BE-NEXT: vmov r1, r0, d2
92 ; CHECK-BE-NEXT: vmov r3, r2, d3
93 ; CHECK-BE-NEXT: bx lr
95 %0 = add <4 x i32> %src1, %src2
; arm_aapcs_vfpcc counterpart of add_soft: both <4 x i32> operands arrive in
; q0/q1. The little-endian run expects a single vadd.i32; the big-endian run
; expects each argument and the result to go through vrev64.32.
99 define arm_aapcs_vfpcc <4 x i32> @add_hard(<4 x i32> %src1, <4 x i32> %src2) {
100 ; CHECK-LE-LABEL: add_hard:
101 ; CHECK-LE: @ %bb.0: @ %entry
102 ; CHECK-LE-NEXT: vadd.i32 q0, q0, q1
103 ; CHECK-LE-NEXT: bx lr
105 ; CHECK-BE-LABEL: add_hard:
106 ; CHECK-BE: @ %bb.0: @ %entry
107 ; CHECK-BE-NEXT: vrev64.32 q2, q1
108 ; CHECK-BE-NEXT: vrev64.32 q1, q0
109 ; CHECK-BE-NEXT: vadd.i32 q1, q1, q2
110 ; CHECK-BE-NEXT: vrev64.32 q0, q1
111 ; CHECK-BE-NEXT: bx lr
113 %0 = add <4 x i32> %src1, %src2
; Shifts %src1 right by one (lshr), passes the result and %src2 to add_soft,
; then shifts the returned vector right by one again. No explicit calling
; convention is given on the define or the call. The expected output copies the
; incoming stack vector (read from sp+24 after the push and the 16-byte pad) to
; the outgoing slot at sp, and moves vectors between r0-r3 and q0 around the
; shift on both sides of the call. The big-endian run expects swapped register
; pairs in the vmovs and a vrev64.32 before and after each shift.
117 define <4 x i32> @call_soft(<4 x i32> %src1, <4 x i32> %src2) {
118 ; CHECK-LE-LABEL: call_soft:
119 ; CHECK-LE: @ %bb.0: @ %entry
120 ; CHECK-LE-NEXT: .save {r7, lr}
121 ; CHECK-LE-NEXT: push {r7, lr}
122 ; CHECK-LE-NEXT: .pad #16
123 ; CHECK-LE-NEXT: sub sp, #16
124 ; CHECK-LE-NEXT: add.w r12, sp, #24
125 ; CHECK-LE-NEXT: vldrw.u32 q0, [r12]
126 ; CHECK-LE-NEXT: vstrw.32 q0, [sp]
127 ; CHECK-LE-NEXT: vmov d1, r2, r3
128 ; CHECK-LE-NEXT: vmov d0, r0, r1
129 ; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
130 ; CHECK-LE-NEXT: vmov r0, r1, d0
131 ; CHECK-LE-NEXT: vmov r2, r3, d1
132 ; CHECK-LE-NEXT: bl add_soft
133 ; CHECK-LE-NEXT: vmov d1, r2, r3
134 ; CHECK-LE-NEXT: vmov d0, r0, r1
135 ; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
136 ; CHECK-LE-NEXT: vmov r0, r1, d0
137 ; CHECK-LE-NEXT: vmov r2, r3, d1
138 ; CHECK-LE-NEXT: add sp, #16
139 ; CHECK-LE-NEXT: pop {r7, pc}
141 ; CHECK-BE-LABEL: call_soft:
142 ; CHECK-BE: @ %bb.0: @ %entry
143 ; CHECK-BE-NEXT: .save {r7, lr}
144 ; CHECK-BE-NEXT: push {r7, lr}
145 ; CHECK-BE-NEXT: .pad #16
146 ; CHECK-BE-NEXT: sub sp, #16
147 ; CHECK-BE-NEXT: add.w r12, sp, #24
148 ; CHECK-BE-NEXT: vldrw.u32 q0, [r12]
149 ; CHECK-BE-NEXT: vstrw.32 q0, [sp]
150 ; CHECK-BE-NEXT: vmov d1, r3, r2
151 ; CHECK-BE-NEXT: vmov d0, r1, r0
152 ; CHECK-BE-NEXT: vrev64.32 q1, q0
153 ; CHECK-BE-NEXT: vshr.u32 q0, q1, #1
154 ; CHECK-BE-NEXT: vrev64.32 q1, q0
155 ; CHECK-BE-NEXT: vmov r1, r0, d2
156 ; CHECK-BE-NEXT: vmov r3, r2, d3
157 ; CHECK-BE-NEXT: bl add_soft
158 ; CHECK-BE-NEXT: vmov d1, r3, r2
159 ; CHECK-BE-NEXT: vmov d0, r1, r0
160 ; CHECK-BE-NEXT: vrev64.32 q1, q0
161 ; CHECK-BE-NEXT: vshr.u32 q0, q1, #1
162 ; CHECK-BE-NEXT: vrev64.32 q1, q0
163 ; CHECK-BE-NEXT: vmov r1, r0, d2
164 ; CHECK-BE-NEXT: vmov r3, r2, d3
165 ; CHECK-BE-NEXT: add sp, #16
166 ; CHECK-BE-NEXT: pop {r7, pc}
168 %0 = lshr <4 x i32> %src1, <i32 1, i32 1, i32 1, i32 1>
169 %1 = call <4 x i32> @add_soft(<4 x i32> %0, <4 x i32> %src2)
170 %2 = lshr <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
; arm_aapcs_vfpcc counterpart of call_soft: shifts %src1 right by one, calls
; add_hard with the shifted vector and %src2, and shifts the returned vector
; right by one. The little-endian run expects the shifts to operate on q0 in
; place around the call; the big-endian run expects each shift to be wrapped in
; a pair of vrev64.32 instructions.
174 define arm_aapcs_vfpcc <4 x i32> @call_hard(<4 x i32> %src1, <4 x i32> %src2) {
175 ; CHECK-LE-LABEL: call_hard:
176 ; CHECK-LE: @ %bb.0: @ %entry
177 ; CHECK-LE-NEXT: .save {r7, lr}
178 ; CHECK-LE-NEXT: push {r7, lr}
179 ; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
180 ; CHECK-LE-NEXT: bl add_hard
181 ; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
182 ; CHECK-LE-NEXT: pop {r7, pc}
184 ; CHECK-BE-LABEL: call_hard:
185 ; CHECK-BE: @ %bb.0: @ %entry
186 ; CHECK-BE-NEXT: .save {r7, lr}
187 ; CHECK-BE-NEXT: push {r7, lr}
188 ; CHECK-BE-NEXT: vrev64.32 q2, q0
189 ; CHECK-BE-NEXT: vshr.u32 q2, q2, #1
190 ; CHECK-BE-NEXT: vrev64.32 q0, q2
191 ; CHECK-BE-NEXT: bl add_hard
192 ; CHECK-BE-NEXT: vrev64.32 q1, q0
193 ; CHECK-BE-NEXT: vshr.u32 q1, q1, #1
194 ; CHECK-BE-NEXT: vrev64.32 q0, q1
195 ; CHECK-BE-NEXT: pop {r7, pc}
197 %0 = lshr <4 x i32> %src1, <i32 1, i32 1, i32 1, i32 1>
198 %1 = call arm_aapcs_vfpcc <4 x i32> @add_hard(<4 x i32> %0, <4 x i32> %src2)
199 %2 = lshr <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
; ANDs the <4 x i32> argument with a splat of 1 and bitcasts the result to
; <16 x i8>. Both runs expect the mask to be materialised with
; vmov.i32 #0x1; the big-endian run wraps the vand in vrev64.32 instructions.
203 define arm_aapcs_vfpcc <16 x i8> @and_v4i32(<4 x i32> %src) {
204 ; CHECK-LE-LABEL: and_v4i32:
205 ; CHECK-LE: @ %bb.0: @ %entry
206 ; CHECK-LE-NEXT: vmov.i32 q1, #0x1
207 ; CHECK-LE-NEXT: vand q0, q0, q1
208 ; CHECK-LE-NEXT: bx lr
210 ; CHECK-BE-LABEL: and_v4i32:
211 ; CHECK-BE: @ %bb.0: @ %entry
212 ; CHECK-BE-NEXT: vrev64.32 q1, q0
213 ; CHECK-BE-NEXT: vmov.i32 q0, #0x1
214 ; CHECK-BE-NEXT: vand q1, q1, q0
215 ; CHECK-BE-NEXT: vrev64.32 q0, q1
216 ; CHECK-BE-NEXT: bx lr
218 %s1 = and <4 x i32> %src, <i32 1, i32 1, i32 1, i32 1>
219 %r = bitcast <4 x i32> %s1 to <16 x i8>
223 ; Should be the same as and_v4i32 for LE
; Bitcasts the <4 x i32> argument to <16 x i8> first, then ANDs with a byte
; mask that has 1 in element 0 of every group of four bytes and 0 elsewhere.
; The little-endian run expects the mask as vmov.i32 #0x1; the big-endian run
; expects vrev64.8 on the data and a vrev32.8 applied to the materialised mask.
224 define arm_aapcs_vfpcc <16 x i8> @and_v16i8_le(<4 x i32> %src) {
225 ; CHECK-LE-LABEL: and_v16i8_le:
226 ; CHECK-LE: @ %bb.0: @ %entry
227 ; CHECK-LE-NEXT: vmov.i32 q1, #0x1
228 ; CHECK-LE-NEXT: vand q0, q0, q1
229 ; CHECK-LE-NEXT: bx lr
231 ; CHECK-BE-LABEL: and_v16i8_le:
232 ; CHECK-BE: @ %bb.0: @ %entry
233 ; CHECK-BE-NEXT: vrev64.8 q1, q0
234 ; CHECK-BE-NEXT: vmov.i32 q0, #0x1
235 ; CHECK-BE-NEXT: vrev32.8 q0, q0
236 ; CHECK-BE-NEXT: vand q1, q1, q0
237 ; CHECK-BE-NEXT: vrev64.8 q0, q1
238 ; CHECK-BE-NEXT: bx lr
240 %0 = bitcast <4 x i32> %src to <16 x i8>
241 %r = and <16 x i8> %0, <i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0>
245 ; Should be the same (or at least equivalent) as and_v4i32 for BE
; Bitcasts the <4 x i32> argument to <16 x i8> first, then ANDs with a byte
; mask that has 1 in element 3 of every group of four bytes and 0 elsewhere.
; The little-endian run expects the mask as vmov.i32 #0x1000000; the big-endian
; run expects vrev64.8 on the data and a vrev32.8 applied to the same constant.
246 define arm_aapcs_vfpcc <16 x i8> @and_v16i8_be(<4 x i32> %src) {
247 ; CHECK-LE-LABEL: and_v16i8_be:
248 ; CHECK-LE: @ %bb.0: @ %entry
249 ; CHECK-LE-NEXT: vmov.i32 q1, #0x1000000
250 ; CHECK-LE-NEXT: vand q0, q0, q1
251 ; CHECK-LE-NEXT: bx lr
253 ; CHECK-BE-LABEL: and_v16i8_be:
254 ; CHECK-BE: @ %bb.0: @ %entry
255 ; CHECK-BE-NEXT: vrev64.8 q1, q0
256 ; CHECK-BE-NEXT: vmov.i32 q0, #0x1000000
257 ; CHECK-BE-NEXT: vrev32.8 q0, q0
258 ; CHECK-BE-NEXT: vand q1, q1, q0
259 ; CHECK-BE-NEXT: vrev64.8 q0, q1
260 ; CHECK-BE-NEXT: bx lr
262 %0 = bitcast <4 x i32> %src to <16 x i8>
263 %r = and <16 x i8> %0, <i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1>
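267 ; FIXME: This looks wrong. Presumably this refers to the big-endian output, where the inline asm operand is reversed with vrev32.8 but its result is reversed with vrev64.8 (to be confirmed).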
268 define arm_aapcs_vfpcc <4 x i32> @test(i32* %data) {
269 ; CHECK-LE-LABEL: test:
270 ; CHECK-LE: @ %bb.0: @ %entry
271 ; CHECK-LE-NEXT: vldrw.u32 q1, [r0, #32]
272 ; CHECK-LE-NEXT: vmov.i32 q0, #0x1
273 ; CHECK-LE-NEXT: vadd.i32 q1, q1, q0
274 ; CHECK-LE-NEXT: @APP
275 ; CHECK-LE-NEXT: vmullb.s32 q0, q1, q1
276 ; CHECK-LE-NEXT: @NO_APP
277 ; CHECK-LE-NEXT: bx lr
279 ; CHECK-BE-LABEL: test:
280 ; CHECK-BE: @ %bb.0: @ %entry
281 ; CHECK-BE-NEXT: vldrw.u32 q1, [r0, #32]
282 ; CHECK-BE-NEXT: vmov.i32 q0, #0x1
283 ; CHECK-BE-NEXT: vadd.i32 q0, q1, q0
284 ; CHECK-BE-NEXT: vrev32.8 q0, q0
285 ; CHECK-BE-NEXT: @APP
286 ; CHECK-BE-NEXT: vmullb.s32 q1, q0, q0
287 ; CHECK-BE-NEXT: @NO_APP
288 ; CHECK-BE-NEXT: vrev64.8 q0, q1
289 ; CHECK-BE-NEXT: bx lr
291 %add.ptr = getelementptr inbounds i32, i32* %data, i32 8
292 %0 = bitcast i32* %add.ptr to <4 x i32>*
293 %1 = load <4 x i32>, <4 x i32>* %0, align 4
294 %2 = add <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
295 %3 = tail call <4 x i32> asm sideeffect " VMULLB.s32 $0, $1, $1", "=&w,w"(<4 x i32> %2) #2