1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
3 ; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
; Loads a <4 x i32> vector through %vp with 4-byte alignment and logically
; shifts every lane right by one bit.
; Expected little-endian code: a word-sized vector load (vldrw.u32) followed
; by vshr.u32 in place on q0.
; Expected big-endian code: the same load, the shift into q1, then a
; vrev64.32 back into q0 before the return.
; The terminator is outside the lines visible to this review; presumably the
; shifted value %1 is what gets returned (TODO confirm).
5 define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4(<4 x i32>* %vp) {
6 ; CHECK-LE-LABEL: load_4xi32_a4:
7 ; CHECK-LE: @ %bb.0: @ %entry
8 ; CHECK-LE-NEXT: vldrw.u32 q0, [r0]
9 ; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
10 ; CHECK-LE-NEXT: bx lr
12 ; CHECK-BE-LABEL: load_4xi32_a4:
13 ; CHECK-BE: @ %bb.0: @ %entry
14 ; CHECK-BE-NEXT: vldrw.u32 q0, [r0]
15 ; CHECK-BE-NEXT: vshr.u32 q1, q0, #1
16 ; CHECK-BE-NEXT: vrev64.32 q0, q1
17 ; CHECK-BE-NEXT: bx lr
19 %0 = load <4 x i32>, <4 x i32>* %vp, align 4
20 %1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
; Same load-and-shift as the 4-byte-aligned variant, but the <4 x i32> load
; through %vp is only 2-byte aligned.
; Expected little-endian code: a halfword-sized vector load (vldrh.u16)
; followed by vshr.u32.
; Expected big-endian code: a byte-sized vector load (vldrb.u8) fixed up with
; vrev32.8, then the shift into q1 and a vrev64.32 back into q0.
; The terminator is outside the lines visible to this review; presumably the
; shifted value %1 is what gets returned (TODO confirm).
24 define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a2(<4 x i32>* %vp) {
25 ; CHECK-LE-LABEL: load_4xi32_a2:
26 ; CHECK-LE: @ %bb.0: @ %entry
27 ; CHECK-LE-NEXT: vldrh.u16 q0, [r0]
28 ; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
29 ; CHECK-LE-NEXT: bx lr
31 ; CHECK-BE-LABEL: load_4xi32_a2:
32 ; CHECK-BE: @ %bb.0: @ %entry
33 ; CHECK-BE-NEXT: vldrb.u8 q0, [r0]
34 ; CHECK-BE-NEXT: vrev32.8 q0, q0
35 ; CHECK-BE-NEXT: vshr.u32 q1, q0, #1
36 ; CHECK-BE-NEXT: vrev64.32 q0, q1
37 ; CHECK-BE-NEXT: bx lr
39 %0 = load <4 x i32>, <4 x i32>* %vp, align 2
40 %1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
; Same load-and-shift as the aligned variants, but the <4 x i32> load through
; %vp is only 1-byte aligned.
; Expected little-endian code: a byte-sized vector load (vldrb.u8) followed
; by vshr.u32.
; Expected big-endian code: vldrb.u8 fixed up with vrev32.8, then the shift
; into q1 and a vrev64.32 back into q0 (identical to the 2-byte-aligned
; big-endian expectation).
; The terminator is outside the lines visible to this review; presumably the
; shifted value %1 is what gets returned (TODO confirm).
44 define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a1(<4 x i32>* %vp) {
45 ; CHECK-LE-LABEL: load_4xi32_a1:
46 ; CHECK-LE: @ %bb.0: @ %entry
47 ; CHECK-LE-NEXT: vldrb.u8 q0, [r0]
48 ; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
49 ; CHECK-LE-NEXT: bx lr
51 ; CHECK-BE-LABEL: load_4xi32_a1:
52 ; CHECK-BE: @ %bb.0: @ %entry
53 ; CHECK-BE-NEXT: vldrb.u8 q0, [r0]
54 ; CHECK-BE-NEXT: vrev32.8 q0, q0
55 ; CHECK-BE-NEXT: vshr.u32 q1, q0, #1
56 ; CHECK-BE-NEXT: vrev64.32 q0, q1
57 ; CHECK-BE-NEXT: bx lr
59 %0 = load <4 x i32>, <4 x i32>* %vp, align 1
60 %1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
; Logically shifts every lane of %val right by one bit and stores the result
; through %vp with 4-byte alignment.
; Expected little-endian code: vshr.u32 on q0 followed by a word-sized vector
; store (vstrw.32).
; Expected big-endian code: a vrev64.32 of the incoming q0 into q1 first,
; then the shift and the same vstrw.32 store.
64 define arm_aapcs_vfpcc void @store_4xi32_a4(<4 x i32>* %vp, <4 x i32> %val) {
65 ; CHECK-LE-LABEL: store_4xi32_a4:
66 ; CHECK-LE: @ %bb.0: @ %entry
67 ; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
68 ; CHECK-LE-NEXT: vstrw.32 q0, [r0]
69 ; CHECK-LE-NEXT: bx lr
71 ; CHECK-BE-LABEL: store_4xi32_a4:
72 ; CHECK-BE: @ %bb.0: @ %entry
73 ; CHECK-BE-NEXT: vrev64.32 q1, q0
74 ; CHECK-BE-NEXT: vshr.u32 q0, q1, #1
75 ; CHECK-BE-NEXT: vstrw.32 q0, [r0]
76 ; CHECK-BE-NEXT: bx lr
78 %0 = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
79 store <4 x i32> %0, <4 x i32>* %vp, align 4
; Same shift-and-store as the 4-byte-aligned variant, but the <4 x i32> store
; through %vp is only 2-byte aligned.
; Expected little-endian code: vshr.u32 followed by a halfword-sized vector
; store (vstrh.16).
; Expected big-endian code: vrev64.32 of the incoming q0, the shift, then a
; vrev32.8 fix-up and a byte-sized vector store (vstrb.8).
83 define arm_aapcs_vfpcc void @store_4xi32_a2(<4 x i32>* %vp, <4 x i32> %val) {
84 ; CHECK-LE-LABEL: store_4xi32_a2:
85 ; CHECK-LE: @ %bb.0: @ %entry
86 ; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
87 ; CHECK-LE-NEXT: vstrh.16 q0, [r0]
88 ; CHECK-LE-NEXT: bx lr
90 ; CHECK-BE-LABEL: store_4xi32_a2:
91 ; CHECK-BE: @ %bb.0: @ %entry
92 ; CHECK-BE-NEXT: vrev64.32 q1, q0
93 ; CHECK-BE-NEXT: vshr.u32 q0, q1, #1
94 ; CHECK-BE-NEXT: vrev32.8 q0, q0
95 ; CHECK-BE-NEXT: vstrb.8 q0, [r0]
96 ; CHECK-BE-NEXT: bx lr
98 %0 = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
99 store <4 x i32> %0, <4 x i32>* %vp, align 2
; Same shift-and-store as the aligned variants, but the <4 x i32> store
; through %vp is only 1-byte aligned.
; Expected little-endian code: vshr.u32 followed by a byte-sized vector store
; (vstrb.8).
; Expected big-endian code: vrev64.32 of the incoming q0, the shift, then a
; vrev32.8 fix-up and vstrb.8 (identical to the 2-byte-aligned big-endian
; expectation).
103 define arm_aapcs_vfpcc void @store_4xi32_a1(<4 x i32>* %vp, <4 x i32> %val) {
104 ; CHECK-LE-LABEL: store_4xi32_a1:
105 ; CHECK-LE: @ %bb.0: @ %entry
106 ; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
107 ; CHECK-LE-NEXT: vstrb.8 q0, [r0]
108 ; CHECK-LE-NEXT: bx lr
110 ; CHECK-BE-LABEL: store_4xi32_a1:
111 ; CHECK-BE: @ %bb.0: @ %entry
112 ; CHECK-BE-NEXT: vrev64.32 q1, q0
113 ; CHECK-BE-NEXT: vshr.u32 q0, q1, #1
114 ; CHECK-BE-NEXT: vrev32.8 q0, q0
115 ; CHECK-BE-NEXT: vstrb.8 q0, [r0]
116 ; CHECK-BE-NEXT: bx lr
118 %0 = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
119 store <4 x i32> %0, <4 x i32>* %vp, align 1
; Loads a <4 x i32> vector from a positive offset: the address is element 127
; of the i32 array at %ip (127 * 4 = 508 bytes past %ip), reinterpreted as a
; vector pointer and loaded with 4-byte alignment.
; In both expectations the offset is applied with a separate
; "add.w r0, r0, #508" and the load then uses [r0] with no immediate offset.
; Expected little-endian code loads with vldrw.u32 straight into q0; expected
; big-endian code loads with vldrb.u8 into q1 and then applies vrev64.8 into
; q0.
; The terminator is outside the lines visible to this review; presumably the
; loaded value %0 is what gets returned (TODO confirm).
123 define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4_offset_pos(i32* %ip) {
124 ; CHECK-LE-LABEL: load_4xi32_a4_offset_pos:
125 ; CHECK-LE: @ %bb.0: @ %entry
126 ; CHECK-LE-NEXT: add.w r0, r0, #508
127 ; CHECK-LE-NEXT: vldrw.u32 q0, [r0]
128 ; CHECK-LE-NEXT: bx lr
130 ; CHECK-BE-LABEL: load_4xi32_a4_offset_pos:
131 ; CHECK-BE: @ %bb.0: @ %entry
132 ; CHECK-BE-NEXT: add.w r0, r0, #508
133 ; CHECK-BE-NEXT: vldrb.u8 q1, [r0]
134 ; CHECK-BE-NEXT: vrev64.8 q0, q1
135 ; CHECK-BE-NEXT: bx lr
137 %ipoffset = getelementptr inbounds i32, i32* %ip, i32 127
138 %vp = bitcast i32* %ipoffset to <4 x i32>*
139 %0 = load <4 x i32>, <4 x i32>* %vp, align 4
; Loads a <4 x i32> vector from a negative offset: the address is element
; -127 of the i32 array at %ip (127 * 4 = 508 bytes before %ip),
; reinterpreted as a vector pointer and loaded with 4-byte alignment.
; In both expectations the offset is applied with a separate
; "sub.w r0, r0, #508" and the load then uses [r0] with no immediate offset.
; Expected little-endian code loads with vldrw.u32 straight into q0; expected
; big-endian code loads with vldrb.u8 into q1 and then applies vrev64.8 into
; q0.
; The terminator is outside the lines visible to this review; presumably the
; loaded value %0 is what gets returned (TODO confirm).
143 define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4_offset_neg(i32* %ip) {
144 ; CHECK-LE-LABEL: load_4xi32_a4_offset_neg:
145 ; CHECK-LE: @ %bb.0: @ %entry
146 ; CHECK-LE-NEXT: sub.w r0, r0, #508
147 ; CHECK-LE-NEXT: vldrw.u32 q0, [r0]
148 ; CHECK-LE-NEXT: bx lr
150 ; CHECK-BE-LABEL: load_4xi32_a4_offset_neg:
151 ; CHECK-BE: @ %bb.0: @ %entry
152 ; CHECK-BE-NEXT: sub.w r0, r0, #508
153 ; CHECK-BE-NEXT: vldrb.u8 q1, [r0]
154 ; CHECK-BE-NEXT: vrev64.8 q0, q1
155 ; CHECK-BE-NEXT: bx lr
157 %ipoffset = getelementptr inbounds i32, i32* %ip, i32 -127
158 %vp = bitcast i32* %ipoffset to <4 x i32>*
159 %0 = load <4 x i32>, <4 x i32>* %vp, align 4
; Exercises <4 x i32> vector stores and a load against a 40-byte stack object
; ([1 x [5 x [2 x i32]]], matching the expected ".pad #40"):
;   1. a splat of 1 is stored at the start of the object;
;   2. a splat of 1 is stored at inner index 2 (2 * 8 = 16 bytes in);
;   3. the scalar 3 overwrites the first i32 at that same address;
;   4. the vector is loaded back from that address.
; Both expectations address the second vector store, the scalar store and the
; final load as [sp, #16], while the first vector store goes through a copy
; of sp in r0. The little-endian load is vldrw.u32 into q0; the big-endian
; load is vldrb.u8 into q1 followed by vrev64.8 into q0.
; %0 and %arrayidx5 are not referenced by any instruction visible here.
; The terminator is outside the lines visible to this review; presumably the
; loaded value %3 is what gets returned (TODO confirm).
163 define arm_aapcs_vfpcc <4 x i32> @loadstore_4xi32_stack_off16() {
164 ; CHECK-LE-LABEL: loadstore_4xi32_stack_off16:
165 ; CHECK-LE: @ %bb.0: @ %entry
166 ; CHECK-LE-NEXT: .pad #40
167 ; CHECK-LE-NEXT: sub sp, #40
168 ; CHECK-LE-NEXT: vmov.i32 q0, #0x1
169 ; CHECK-LE-NEXT: mov r0, sp
170 ; CHECK-LE-NEXT: vstrw.32 q0, [r0]
171 ; CHECK-LE-NEXT: movs r0, #3
172 ; CHECK-LE-NEXT: vstrw.32 q0, [sp, #16]
173 ; CHECK-LE-NEXT: str r0, [sp, #16]
174 ; CHECK-LE-NEXT: vldrw.u32 q0, [sp, #16]
175 ; CHECK-LE-NEXT: add sp, #40
176 ; CHECK-LE-NEXT: bx lr
178 ; CHECK-BE-LABEL: loadstore_4xi32_stack_off16:
179 ; CHECK-BE: @ %bb.0: @ %entry
180 ; CHECK-BE-NEXT: .pad #40
181 ; CHECK-BE-NEXT: sub sp, #40
182 ; CHECK-BE-NEXT: vmov.i32 q0, #0x1
183 ; CHECK-BE-NEXT: mov r0, sp
184 ; CHECK-BE-NEXT: vstrw.32 q0, [r0]
185 ; CHECK-BE-NEXT: movs r0, #3
186 ; CHECK-BE-NEXT: vstrw.32 q0, [sp, #16]
187 ; CHECK-BE-NEXT: str r0, [sp, #16]
188 ; CHECK-BE-NEXT: vldrb.u8 q1, [sp, #16]
189 ; CHECK-BE-NEXT: vrev64.8 q0, q1
190 ; CHECK-BE-NEXT: add sp, #40
191 ; CHECK-BE-NEXT: bx lr
193 %c = alloca [1 x [5 x [2 x i32]]], align 4
194 %0 = bitcast [1 x [5 x [2 x i32]]]* %c to i8*
195 %arrayidx5 = getelementptr inbounds [1 x [5 x [2 x i32]]], [1 x [5 x [2 x i32]]]* %c, i32 0, i32 0, i32 0, i32 0
196 %1 = bitcast [1 x [5 x [2 x i32]]]* %c to <4 x i32>*
197 store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32>* %1, align 4
198 %arrayidx5.2 = getelementptr inbounds [1 x [5 x [2 x i32]]], [1 x [5 x [2 x i32]]]* %c, i32 0, i32 0, i32 2, i32 0
199 %2 = bitcast i32* %arrayidx5.2 to <4 x i32>*
200 store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32>* %2, align 4
201 store i32 3, i32* %arrayidx5.2, align 4
202 %3 = load <4 x i32>, <4 x i32>* %2, align 4
; Exercises <8 x i16> vector stores and a load against a 40-byte stack object
; ([1 x [10 x [2 x i16]]], matching the expected ".pad #40"):
;   1. a splat of 1 is stored at the start of the object;
;   2. a splat of 1 is stored at inner index 4 (4 * 4 = 16 bytes in);
;   3. the scalar 3 overwrites the first i16 at that same address;
;   4. the vector is loaded back from that address.
; Both expectations address the second vector store, the scalar store and the
; final load as [sp, #16], while the first vector store goes through a copy
; of sp in r0. The little-endian load is vldrh.u16 into q0; the big-endian
; load is vldrb.u8 into q1 followed by vrev64.8 into q0.
; %0 and %arrayidx5 are not referenced by any instruction visible here.
; The terminator is outside the lines visible to this review; presumably the
; loaded value %3 is what gets returned (TODO confirm).
206 define arm_aapcs_vfpcc <8 x i16> @loadstore_8xi16_stack_off16() {
207 ; CHECK-LE-LABEL: loadstore_8xi16_stack_off16:
208 ; CHECK-LE: @ %bb.0: @ %entry
209 ; CHECK-LE-NEXT: .pad #40
210 ; CHECK-LE-NEXT: sub sp, #40
211 ; CHECK-LE-NEXT: vmov.i16 q0, #0x1
212 ; CHECK-LE-NEXT: mov r0, sp
213 ; CHECK-LE-NEXT: vstrh.16 q0, [r0]
214 ; CHECK-LE-NEXT: movs r0, #3
215 ; CHECK-LE-NEXT: vstrh.16 q0, [sp, #16]
216 ; CHECK-LE-NEXT: strh.w r0, [sp, #16]
217 ; CHECK-LE-NEXT: vldrh.u16 q0, [sp, #16]
218 ; CHECK-LE-NEXT: add sp, #40
219 ; CHECK-LE-NEXT: bx lr
221 ; CHECK-BE-LABEL: loadstore_8xi16_stack_off16:
222 ; CHECK-BE: @ %bb.0: @ %entry
223 ; CHECK-BE-NEXT: .pad #40
224 ; CHECK-BE-NEXT: sub sp, #40
225 ; CHECK-BE-NEXT: vmov.i16 q0, #0x1
226 ; CHECK-BE-NEXT: mov r0, sp
227 ; CHECK-BE-NEXT: vstrh.16 q0, [r0]
228 ; CHECK-BE-NEXT: movs r0, #3
229 ; CHECK-BE-NEXT: vstrh.16 q0, [sp, #16]
230 ; CHECK-BE-NEXT: strh.w r0, [sp, #16]
231 ; CHECK-BE-NEXT: vldrb.u8 q1, [sp, #16]
232 ; CHECK-BE-NEXT: vrev64.8 q0, q1
233 ; CHECK-BE-NEXT: add sp, #40
234 ; CHECK-BE-NEXT: bx lr
236 %c = alloca [1 x [10 x [2 x i16]]], align 2
237 %0 = bitcast [1 x [10 x [2 x i16]]]* %c to i8*
238 %arrayidx5 = getelementptr inbounds [1 x [10 x [2 x i16]]], [1 x [10 x [2 x i16]]]* %c, i32 0, i32 0, i32 0, i32 0
239 %1 = bitcast [1 x [10 x [2 x i16]]]* %c to <8 x i16>*
240 store <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16>* %1, align 2
241 %arrayidx5.2 = getelementptr inbounds [1 x [10 x [2 x i16]]], [1 x [10 x [2 x i16]]]* %c, i32 0, i32 0, i32 4, i32 0
242 %2 = bitcast i16* %arrayidx5.2 to <8 x i16>*
243 store <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16>* %2, align 2
244 store i16 3, i16* %arrayidx5.2, align 2
245 %3 = load <8 x i16>, <8 x i16>* %2, align 2
; Exercises <16 x i8> vector stores and a load against a 40-byte stack object
; ([1 x [20 x [2 x i8]]], matching the expected ".pad #40"):
;   1. a splat of 1 is stored at the start of the object;
;   2. a splat of 1 is stored at inner index 8 (8 * 2 = 16 bytes in);
;   3. the scalar 3 overwrites the first i8 at that same address;
;   4. the vector is loaded back from that address.
; Both expectations address the second vector store, the scalar store and the
; final load as [sp, #16], while the first vector store goes through a copy
; of sp in r0. The little-endian load is vldrb.u8 into q0; the big-endian
; load is vldrb.u8 into q1 followed by vrev64.8 into q0.
; %0 and %arrayidx5 are not referenced by any instruction visible here.
; The terminator is outside the lines visible to this review; presumably the
; loaded value %3 is what gets returned (TODO confirm).
249 define arm_aapcs_vfpcc <16 x i8> @loadstore_16xi8_stack_off16() {
250 ; CHECK-LE-LABEL: loadstore_16xi8_stack_off16:
251 ; CHECK-LE: @ %bb.0: @ %entry
252 ; CHECK-LE-NEXT: .pad #40
253 ; CHECK-LE-NEXT: sub sp, #40
254 ; CHECK-LE-NEXT: vmov.i8 q0, #0x1
255 ; CHECK-LE-NEXT: mov r0, sp
256 ; CHECK-LE-NEXT: vstrb.8 q0, [r0]
257 ; CHECK-LE-NEXT: movs r0, #3
258 ; CHECK-LE-NEXT: vstrb.8 q0, [sp, #16]
259 ; CHECK-LE-NEXT: strb.w r0, [sp, #16]
260 ; CHECK-LE-NEXT: vldrb.u8 q0, [sp, #16]
261 ; CHECK-LE-NEXT: add sp, #40
262 ; CHECK-LE-NEXT: bx lr
264 ; CHECK-BE-LABEL: loadstore_16xi8_stack_off16:
265 ; CHECK-BE: @ %bb.0: @ %entry
266 ; CHECK-BE-NEXT: .pad #40
267 ; CHECK-BE-NEXT: sub sp, #40
268 ; CHECK-BE-NEXT: vmov.i8 q0, #0x1
269 ; CHECK-BE-NEXT: mov r0, sp
270 ; CHECK-BE-NEXT: vstrb.8 q0, [r0]
271 ; CHECK-BE-NEXT: movs r0, #3
272 ; CHECK-BE-NEXT: vstrb.8 q0, [sp, #16]
273 ; CHECK-BE-NEXT: strb.w r0, [sp, #16]
274 ; CHECK-BE-NEXT: vldrb.u8 q1, [sp, #16]
275 ; CHECK-BE-NEXT: vrev64.8 q0, q1
276 ; CHECK-BE-NEXT: add sp, #40
277 ; CHECK-BE-NEXT: bx lr
279 %c = alloca [1 x [20 x [2 x i8]]], align 1
280 %0 = bitcast [1 x [20 x [2 x i8]]]* %c to i8*
281 %arrayidx5 = getelementptr inbounds [1 x [20 x [2 x i8]]], [1 x [20 x [2 x i8]]]* %c, i32 0, i32 0, i32 0, i32 0
282 %1 = bitcast [1 x [20 x [2 x i8]]]* %c to <16 x i8>*
283 store <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8>* %1, align 1
284 %arrayidx5.2 = getelementptr inbounds [1 x [20 x [2 x i8]]], [1 x [20 x [2 x i8]]]* %c, i32 0, i32 0, i32 8, i32 0
285 %2 = bitcast i8* %arrayidx5.2 to <16 x i8>*
286 store <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8>* %2, align 1
287 store i8 3, i8* %arrayidx5.2, align 1
288 %3 = load <16 x i8>, <16 x i8>* %2, align 1