1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s
; Masked scatter of four i32 lanes where every offset is incremented by a
; constant. The IR adds 4 to each element of %offs, forms the per-lane
; addresses with a vector getelementptr on %dst, and stores %data under the
; constant mask <true, false, true, false>.
; The expected output keeps this as a single predicated vector scatter store
; (vstrwt.32 with the offset vector scaled by "uxtw #2"): the increment is a
; vector add of #0x4 and the mask is moved into p0 from the immediate 3855
; (0x0F0F).
5 define arm_aapcs_vfpcc void @scatter_inc_minipred_4i32(<4 x i32> %data, i32* %dst, <4 x i32> %offs) {
6 ; CHECK-LABEL: scatter_inc_minipred_4i32:
8 ; CHECK-NEXT: movw r1, #3855
9 ; CHECK-NEXT: vmov.i32 q2, #0x4
10 ; CHECK-NEXT: vadd.i32 q1, q1, q2
11 ; CHECK-NEXT: vmsr p0, r1
13 ; CHECK-NEXT: vstrwt.32 q0, [r0, q1, uxtw #2]
15 %1 = add <4 x i32> %offs, <i32 4, i32 4, i32 4, i32 4>
16 %2 = getelementptr inbounds i32, i32* %dst, <4 x i32> %1
17 call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data, <4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>)
; All-true-mask scatter of eight i16 values whose offsets are supplied as
; <8 x i32> and incremented by 8 elements each.
; The expected output contains no vector scatter instruction. Both 4-lane
; halves of the offsets are converted to byte addresses (vshl #1, add of the
; base in r0, add of #0x10 = 8 elements * 2 bytes) and moved to core
; registers; each lane of q0 is then extracted with vmov.u16 and written with
; strh.
; NOTE(review): the call passes alignment 4 for i16 elements - confirm this is
; intentional.
21 define arm_aapcs_vfpcc void @scatter_inc_mini_8i16(<8 x i16> %data, i16* %dst, <8 x i32> %offs) {
22 ; CHECK-LABEL: scatter_inc_mini_8i16:
24 ; CHECK-NEXT: .save {r4, r5, r6, lr}
25 ; CHECK-NEXT: push {r4, r5, r6, lr}
26 ; CHECK-NEXT: vshl.i32 q1, q1, #1
27 ; CHECK-NEXT: vmov.i32 q3, #0x10
28 ; CHECK-NEXT: vadd.i32 q1, q1, r0
29 ; CHECK-NEXT: vmov.u16 r6, q0[0]
30 ; CHECK-NEXT: vadd.i32 q1, q1, q3
31 ; CHECK-NEXT: vmov r1, r2, d2
32 ; CHECK-NEXT: vmov r3, r12, d3
33 ; CHECK-NEXT: vshl.i32 q1, q2, #1
34 ; CHECK-NEXT: vadd.i32 q1, q1, r0
35 ; CHECK-NEXT: vadd.i32 q1, q1, q3
36 ; CHECK-NEXT: vmov r0, lr, d2
37 ; CHECK-NEXT: vmov r4, r5, d3
38 ; CHECK-NEXT: strh r6, [r1]
39 ; CHECK-NEXT: vmov.u16 r1, q0[1]
40 ; CHECK-NEXT: strh r1, [r2]
41 ; CHECK-NEXT: vmov.u16 r1, q0[2]
42 ; CHECK-NEXT: strh r1, [r3]
43 ; CHECK-NEXT: vmov.u16 r1, q0[3]
44 ; CHECK-NEXT: strh.w r1, [r12]
45 ; CHECK-NEXT: vmov.u16 r1, q0[4]
46 ; CHECK-NEXT: strh r1, [r0]
47 ; CHECK-NEXT: vmov.u16 r0, q0[5]
48 ; CHECK-NEXT: strh.w r0, [lr]
49 ; CHECK-NEXT: vmov.u16 r0, q0[6]
50 ; CHECK-NEXT: strh r0, [r4]
51 ; CHECK-NEXT: vmov.u16 r0, q0[7]
52 ; CHECK-NEXT: strh r0, [r5]
53 ; CHECK-NEXT: pop {r4, r5, r6, pc}
54 %1 = add <8 x i32> %offs, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
55 %2 = getelementptr inbounds i16, i16* %dst, <8 x i32> %1
56 call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %data, <8 x i16*> %2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
; All-true-mask scatter of sixteen i8 values with <16 x i32> offsets, each
; incremented by 16.
; In the expected output the store is expanded lane by lane: four 4-lane
; address vectors are built by adding the base (r0) and #0x10 to the offsets
; (no shift, the elements are single bytes), and every byte of q0 is
; extracted with vmov.u8 and written with strb. Three offset vectors arrive in
; q1-q3; the fourth is loaded from the stack at sp+48 (presumably the part of
; %offs that is passed in memory - confirm against the calling convention).
; NOTE(review): the call passes alignment 2 for i8 elements - confirm this is
; intentional.
60 define arm_aapcs_vfpcc void @scatter_inc_mini_16i8(<16 x i8> %data, i8* %dst, <16 x i32> %offs) {
61 ; CHECK-LABEL: scatter_inc_mini_16i8:
63 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
64 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
66 ; CHECK-NEXT: sub sp, #4
67 ; CHECK-NEXT: .vsave {d8, d9}
68 ; CHECK-NEXT: vpush {d8, d9}
69 ; CHECK-NEXT: vmov.i32 q4, #0x10
70 ; CHECK-NEXT: vadd.i32 q1, q1, r0
71 ; CHECK-NEXT: vadd.i32 q1, q1, q4
72 ; CHECK-NEXT: add r5, sp, #48
73 ; CHECK-NEXT: vmov r1, r2, d2
74 ; CHECK-NEXT: vadd.i32 q3, q3, r0
75 ; CHECK-NEXT: vmov r3, r12, d3
76 ; CHECK-NEXT: vadd.i32 q1, q2, r0
77 ; CHECK-NEXT: vadd.i32 q2, q1, q4
78 ; CHECK-NEXT: vldrw.u32 q1, [r5]
79 ; CHECK-NEXT: vmov lr, r7, d4
80 ; CHECK-NEXT: vmov.u8 r6, q0[0]
81 ; CHECK-NEXT: vadd.i32 q1, q1, r0
82 ; CHECK-NEXT: vmov r0, r8, d5
83 ; CHECK-NEXT: vadd.i32 q2, q3, q4
84 ; CHECK-NEXT: vmov.u8 r4, q0[4]
85 ; CHECK-NEXT: vadd.i32 q1, q1, q4
86 ; CHECK-NEXT: vmov.u8 r5, q0[6]
87 ; CHECK-NEXT: strb r6, [r1]
88 ; CHECK-NEXT: vmov.u8 r1, q0[1]
89 ; CHECK-NEXT: strb r1, [r2]
90 ; CHECK-NEXT: vmov.u8 r6, q0[2]
91 ; CHECK-NEXT: vmov r1, r9, d4
92 ; CHECK-NEXT: strb r6, [r3]
93 ; CHECK-NEXT: vmov.u8 r3, q0[3]
94 ; CHECK-NEXT: vmov.u8 r2, q0[8]
95 ; CHECK-NEXT: strb.w r3, [r12]
96 ; CHECK-NEXT: vmov r3, r6, d5
97 ; CHECK-NEXT: strb.w r4, [lr]
98 ; CHECK-NEXT: vmov.u8 r4, q0[5]
99 ; CHECK-NEXT: strb r4, [r7]
100 ; CHECK-NEXT: vmov r7, r4, d2
101 ; CHECK-NEXT: strb r5, [r0]
102 ; CHECK-NEXT: vmov.u8 r0, q0[7]
103 ; CHECK-NEXT: strb.w r0, [r8]
104 ; CHECK-NEXT: vmov r0, r5, d3
105 ; CHECK-NEXT: strb r2, [r1]
106 ; CHECK-NEXT: vmov.u8 r1, q0[9]
107 ; CHECK-NEXT: strb.w r1, [r9]
108 ; CHECK-NEXT: vmov.u8 r1, q0[10]
109 ; CHECK-NEXT: strb r1, [r3]
110 ; CHECK-NEXT: vmov.u8 r1, q0[11]
111 ; CHECK-NEXT: strb r1, [r6]
112 ; CHECK-NEXT: vmov.u8 r1, q0[12]
113 ; CHECK-NEXT: strb r1, [r7]
114 ; CHECK-NEXT: vmov.u8 r1, q0[13]
115 ; CHECK-NEXT: strb r1, [r4]
116 ; CHECK-NEXT: vmov.u8 r1, q0[14]
117 ; CHECK-NEXT: strb r1, [r0]
118 ; CHECK-NEXT: vmov.u8 r0, q0[15]
119 ; CHECK-NEXT: strb r0, [r5]
120 ; CHECK-NEXT: vpop {d8, d9}
121 ; CHECK-NEXT: add sp, #4
122 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
123 %1 = add <16 x i32> %offs, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
124 %2 = getelementptr inbounds i8, i8* %dst, <16 x i32> %1
125 call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %data, <16 x i8*> %2, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
; Loop containing three interleaved scatters driven by one vector induction
; variable. Each iteration multiplies %vec.ind (initially <0, 1, 2, 3>,
; advanced by 4 per iteration) by 3 and scatters %data1, %data2 and %data3 to
; the i32 elements 3*i, 3*i+1 and 3*i+2 of %dst, always with an all-true mask.
; %middle.block branches back to %vector.ph whenever %n.vec (%n & -4) differs
; from %n, which is why the generated code shows a loop nest of depth 2.
; The expected output turns each scatter into a pre-incrementing vector-base
; store (vstrw.32 qD, [qA, #48]!) inside a dls/le low-overhead loop. The
; three constant pools hold the starting byte offsets 12*lane + {0, 4, 8}
; reduced by the 48-byte step (e.g. -48, -36, -24, -12), to which the base
; pointer in r0 is added; one of the three address vectors is spilled to the
; stack and reloaded at the top of the outer loop.
129 define arm_aapcs_vfpcc void @scatter_inc_v4i32_complex(<4 x i32> %data1, <4 x i32> %data2, <4 x i32> %data3, i32* %dst, i32 %n) {
130 ; CHECK-LABEL: scatter_inc_v4i32_complex:
131 ; CHECK: @ %bb.0: @ %entry
132 ; CHECK-NEXT: .save {r4, lr}
133 ; CHECK-NEXT: push {r4, lr}
134 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
135 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
136 ; CHECK-NEXT: .pad #16
137 ; CHECK-NEXT: sub sp, #16
138 ; CHECK-NEXT: cmp r1, #1
139 ; CHECK-NEXT: blt .LBB3_5
140 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader
141 ; CHECK-NEXT: adr r4, .LCPI3_2
142 ; CHECK-NEXT: bic r2, r1, #3
143 ; CHECK-NEXT: vldrw.u32 q3, [r4]
144 ; CHECK-NEXT: sub.w r12, r2, #4
145 ; CHECK-NEXT: adr.w lr, .LCPI3_1
146 ; CHECK-NEXT: movs r3, #1
147 ; CHECK-NEXT: vadd.i32 q3, q3, r0
148 ; CHECK-NEXT: add.w r3, r3, r12, lsr #2
149 ; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill
150 ; CHECK-NEXT: vldrw.u32 q3, [lr]
151 ; CHECK-NEXT: adr.w r12, .LCPI3_0
152 ; CHECK-NEXT: vadd.i32 q4, q3, r0
153 ; CHECK-NEXT: vldrw.u32 q3, [r12]
154 ; CHECK-NEXT: vadd.i32 q3, q3, r0
155 ; CHECK-NEXT: .LBB3_2: @ %vector.ph
156 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
157 ; CHECK-NEXT: @ Child Loop BB3_3 Depth 2
158 ; CHECK-NEXT: dls lr, r3
159 ; CHECK-NEXT: vmov q6, q4
160 ; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
161 ; CHECK-NEXT: vmov q5, q3
162 ; CHECK-NEXT: .LBB3_3: @ %vector.body
163 ; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1
164 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
165 ; CHECK-NEXT: vstrw.32 q0, [q5, #48]!
166 ; CHECK-NEXT: vstrw.32 q1, [q6, #48]!
167 ; CHECK-NEXT: vstrw.32 q2, [q7, #48]!
168 ; CHECK-NEXT: le lr, .LBB3_3
169 ; CHECK-NEXT: @ %bb.4: @ %middle.block
170 ; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1
171 ; CHECK-NEXT: cmp r2, r1
172 ; CHECK-NEXT: bne .LBB3_2
173 ; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup
174 ; CHECK-NEXT: add sp, #16
175 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
176 ; CHECK-NEXT: pop {r4, pc}
177 ; CHECK-NEXT: .p2align 4
178 ; CHECK-NEXT: @ %bb.6:
179 ; CHECK-NEXT: .LCPI3_0:
180 ; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
181 ; CHECK-NEXT: .long 4294967260 @ 0xffffffdc
182 ; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
183 ; CHECK-NEXT: .long 4294967284 @ 0xfffffff4
184 ; CHECK-NEXT: .LCPI3_1:
185 ; CHECK-NEXT: .long 4294967252 @ 0xffffffd4
186 ; CHECK-NEXT: .long 4294967264 @ 0xffffffe0
187 ; CHECK-NEXT: .long 4294967276 @ 0xffffffec
188 ; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
189 ; CHECK-NEXT: .LCPI3_2:
190 ; CHECK-NEXT: .long 4294967256 @ 0xffffffd8
191 ; CHECK-NEXT: .long 4294967268 @ 0xffffffe4
192 ; CHECK-NEXT: .long 4294967280 @ 0xfffffff0
193 ; CHECK-NEXT: .long 4294967292 @ 0xfffffffc
195 %cmp22 = icmp sgt i32 %n, 0
196 br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
198 vector.ph: ; preds = %middle.block, %entry
199 %n.vec = and i32 %n, -4
200 br label %vector.body
202 vector.body: ; preds = %vector.body, %vector.ph
203 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
204 %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
205 %0 = mul nuw nsw <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
206 %1 = getelementptr inbounds i32, i32* %dst, <4 x i32> %0
207 call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data1, <4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
208 %2 = add nuw nsw <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
209 %3 = getelementptr inbounds i32, i32* %dst, <4 x i32> %2
210 call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data2, <4 x i32*> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
211 %4 = add nuw nsw <4 x i32> %0, <i32 2, i32 2, i32 2, i32 2>
212 %5 = getelementptr inbounds i32, i32* %dst, <4 x i32> %4
213 call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data3, <4 x i32*> %5, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
214 %index.next = add i32 %index, 4
215 %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
216 %6 = icmp eq i32 %index.next, %n.vec
217 br i1 %6, label %middle.block, label %vector.body
219 middle.block: ; preds = %vector.body
220 %cmp.n = icmp eq i32 %n.vec, %n
221 br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph
223 for.cond.cleanup: ; preds = %middle.block, %entry
; Opaque-pointer variant of the three-scatter loop: %dst is a plain `ptr` and
; the stores go through @llvm.masked.scatter.v4i32.v4p0.
; Each iteration multiplies %vec.ind (initially <0, 1, 2, 3>, advanced by 4
; per iteration) by 3 and scatters %data1, %data2 and %data3 to the i32
; elements 3*i, 3*i+1 and 3*i+2 of %dst with an all-true mask; %middle.block
; branches back to %vector.ph whenever %n.vec (%n & -4) differs from %n.
; The expected output uses pre-incrementing vector-base stores
; (vstrw.32 qD, [qA, #48]!) inside a dls/le low-overhead loop, with constant
; pools holding the starting byte offsets 12*lane + {0, 4, 8} reduced by the
; 48-byte step and the base pointer in r0 added to them.
227 define arm_aapcs_vfpcc void @scatter_inc_v4i32_complex_opaque(<4 x i32> %data1, <4 x i32> %data2, <4 x i32> %data3, ptr %dst, i32 %n) {
228 ; CHECK-LABEL: scatter_inc_v4i32_complex_opaque:
229 ; CHECK: @ %bb.0: @ %entry
230 ; CHECK-NEXT: .save {r4, lr}
231 ; CHECK-NEXT: push {r4, lr}
232 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
233 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
234 ; CHECK-NEXT: .pad #16
235 ; CHECK-NEXT: sub sp, #16
236 ; CHECK-NEXT: cmp r1, #1
237 ; CHECK-NEXT: blt .LBB4_5
238 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader
239 ; CHECK-NEXT: adr r4, .LCPI4_2
240 ; CHECK-NEXT: bic r2, r1, #3
241 ; CHECK-NEXT: vldrw.u32 q3, [r4]
242 ; CHECK-NEXT: sub.w r12, r2, #4
243 ; CHECK-NEXT: adr.w lr, .LCPI4_1
244 ; CHECK-NEXT: movs r3, #1
245 ; CHECK-NEXT: vadd.i32 q3, q3, r0
246 ; CHECK-NEXT: add.w r3, r3, r12, lsr #2
247 ; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill
248 ; CHECK-NEXT: vldrw.u32 q3, [lr]
249 ; CHECK-NEXT: adr.w r12, .LCPI4_0
250 ; CHECK-NEXT: vadd.i32 q4, q3, r0
251 ; CHECK-NEXT: vldrw.u32 q3, [r12]
252 ; CHECK-NEXT: vadd.i32 q3, q3, r0
253 ; CHECK-NEXT: .LBB4_2: @ %vector.ph
254 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
255 ; CHECK-NEXT: @ Child Loop BB4_3 Depth 2
256 ; CHECK-NEXT: dls lr, r3
257 ; CHECK-NEXT: vmov q6, q4
258 ; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
259 ; CHECK-NEXT: vmov q5, q3
260 ; CHECK-NEXT: .LBB4_3: @ %vector.body
261 ; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1
262 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
263 ; CHECK-NEXT: vstrw.32 q0, [q5, #48]!
264 ; CHECK-NEXT: vstrw.32 q1, [q6, #48]!
265 ; CHECK-NEXT: vstrw.32 q2, [q7, #48]!
266 ; CHECK-NEXT: le lr, .LBB4_3
267 ; CHECK-NEXT: @ %bb.4: @ %middle.block
268 ; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1
269 ; CHECK-NEXT: cmp r2, r1
270 ; CHECK-NEXT: bne .LBB4_2
271 ; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup
272 ; CHECK-NEXT: add sp, #16
273 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
274 ; CHECK-NEXT: pop {r4, pc}
275 ; CHECK-NEXT: .p2align 4
276 ; CHECK-NEXT: @ %bb.6:
277 ; CHECK-NEXT: .LCPI4_0:
278 ; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
279 ; CHECK-NEXT: .long 4294967260 @ 0xffffffdc
280 ; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
281 ; CHECK-NEXT: .long 4294967284 @ 0xfffffff4
282 ; CHECK-NEXT: .LCPI4_1:
283 ; CHECK-NEXT: .long 4294967252 @ 0xffffffd4
284 ; CHECK-NEXT: .long 4294967264 @ 0xffffffe0
285 ; CHECK-NEXT: .long 4294967276 @ 0xffffffec
286 ; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
287 ; CHECK-NEXT: .LCPI4_2:
288 ; CHECK-NEXT: .long 4294967256 @ 0xffffffd8
289 ; CHECK-NEXT: .long 4294967268 @ 0xffffffe4
290 ; CHECK-NEXT: .long 4294967280 @ 0xfffffff0
291 ; CHECK-NEXT: .long 4294967292 @ 0xfffffffc
293 %cmp22 = icmp sgt i32 %n, 0
294 br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
296 vector.ph: ; preds = %middle.block, %entry
297 %n.vec = and i32 %n, -4
298 br label %vector.body
300 vector.body: ; preds = %vector.body, %vector.ph
301 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
302 %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
303 %0 = mul nuw nsw <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
304 %1 = getelementptr inbounds i32, ptr %dst, <4 x i32> %0
305 call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data1, <4 x ptr> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
306 %2 = add nuw nsw <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
307 %3 = getelementptr inbounds i32, ptr %dst, <4 x i32> %2
308 call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data2, <4 x ptr> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
309 %4 = add nuw nsw <4 x i32> %0, <i32 2, i32 2, i32 2, i32 2>
310 %5 = getelementptr inbounds i32, ptr %dst, <4 x i32> %4
311 call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data3, <4 x ptr> %5, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
312 %index.next = add i32 %index, 4
313 %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
314 %6 = icmp eq i32 %index.next, %n.vec
315 br i1 %6, label %middle.block, label %vector.body
317 middle.block: ; preds = %vector.body
318 %cmp.n = icmp eq i32 %n.vec, %n
319 br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph
321 for.cond.cleanup: ; preds = %middle.block, %entry
; Declarations of the llvm.masked.scatter overloads. Each takes the value
; vector, the vector of destination pointers, an i32 alignment and the
; per-lane i1 mask.
; In the part of the file visible here only the v8i16, v16i8, v4i32 (typed
; pointer) and v4i32 (opaque `ptr`) overloads are called; the remaining
; declarations are presumably used by tests elsewhere in the file - verify
; before removing any of them.
325 declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
326 declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>)
327 declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>)
328 declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>)
329 declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>)
330 declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>)
331 declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>)
332 declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
333 declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
334 declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>)