; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
; VSTRB.16 Qd, [base, offs] -- truncating scatter of i16 data to i8, zext'd i16 offsets
define arm_aapcs_vfpcc void @ext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: ext_unscaled_i8_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vstrb.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %t = trunc <8 x i16> %input to <8 x i8>
  call void @llvm.masked.scatter.v8i8(<8 x i8> %t, <8 x i8*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; VSTRB.16 Qd, [base, offs] -- truncating scatter of i16 data to i8, zext'd i8 offsets
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vstrb.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %input.trunc = trunc <8 x i16> %input to <8 x i8>
  call void @llvm.masked.scatter.v8i8(<8 x i8> %input.trunc, <8 x i8*> %byte_ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; VSTRH.16 Qd, [base, offs] -- i16 scatter with unscaled zext'd i16 offsets
define arm_aapcs_vfpcc void @unscaled_i16_i16(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: unscaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; VSTRH.16 Qd, [base, offs] -- f16 scatter with unscaled zext'd i16 offsets
define arm_aapcs_vfpcc void @unscaled_v8f16_i16(i8* %base, <8 x i16>* %offptr, <8 x half> %input) {
; CHECK-LABEL: unscaled_v8f16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*>
  call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; Expand - sext offsets (no MVE scatter form takes sign-extended offsets)
define arm_aapcs_vfpcc void @unscaled_v8i16_sext(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: unscaled_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vmov.u16 r6, q0[0]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vmov r4, r5, d3
; CHECK-NEXT:    strh r6, [r2]
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    strh r2, [r3]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    strh.w r2, [r12]
; CHECK-NEXT:    vmov.u16 r2, q0[3]
; CHECK-NEXT:    strh.w r2, [lr]
; CHECK-NEXT:    vmov.u16 r2, q0[4]
; CHECK-NEXT:    strh r2, [r0]
; CHECK-NEXT:    vmov.u16 r0, q0[5]
; CHECK-NEXT:    strh r0, [r1]
; CHECK-NEXT:    vmov.u16 r0, q0[6]
; CHECK-NEXT:    strh r0, [r4]
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    strh r0, [r5]
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.sext = sext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; Expand - sext offsets (no MVE scatter form takes sign-extended offsets)
define arm_aapcs_vfpcc void @unscaled_v8f16_sext(i8* %base, <8 x i16>* %offptr, <8 x half> %input) {
; CHECK-LABEL: unscaled_v8f16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q2, [r1]
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r1, r2, d4
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    vmovx.f16 s0, s0
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    vmov r1, r2, d5
; CHECK-NEXT:    vmovx.f16 s0, s1
; CHECK-NEXT:    vstr.16 s1, [r1]
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vmovx.f16 s0, s2
; CHECK-NEXT:    vstr.16 s2, [r0]
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    vmov r0, r1, d3
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vstr.16 s3, [r0]
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.sext = sext <8 x i16> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*>
  call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; Expand - i32 offsets (too wide for the 16-bit scatter offset form)
define arm_aapcs_vfpcc void @unscaled_v8i16_noext(i8* %base, <8 x i32>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: unscaled_v8i16_noext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vmov.u16 r6, q0[0]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vmov r4, r5, d3
; CHECK-NEXT:    strh r6, [r2]
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    strh r2, [r3]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    strh.w r2, [r12]
; CHECK-NEXT:    vmov.u16 r2, q0[3]
; CHECK-NEXT:    strh.w r2, [lr]
; CHECK-NEXT:    vmov.u16 r2, q0[4]
; CHECK-NEXT:    strh r2, [r0]
; CHECK-NEXT:    vmov.u16 r0, q0[5]
; CHECK-NEXT:    strh r0, [r1]
; CHECK-NEXT:    vmov.u16 r0, q0[6]
; CHECK-NEXT:    strh r0, [r4]
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    strh r0, [r5]
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <8 x i32>, <8 x i32>* %offptr, align 4
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; Expand - i32 offsets (too wide for the 16-bit scatter offset form)
define arm_aapcs_vfpcc void @unscaled_v8f16_noext(i8* %base, <8 x i32>* %offptr, <8 x half> %input) {
; CHECK-LABEL: unscaled_v8f16_noext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q2, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r1, r2, d4
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    vmovx.f16 s0, s0
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    vmov r1, r2, d5
; CHECK-NEXT:    vmovx.f16 s0, s1
; CHECK-NEXT:    vstr.16 s1, [r1]
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vmovx.f16 s0, s2
; CHECK-NEXT:    vstr.16 s2, [r0]
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    vmov r0, r1, d3
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vstr.16 s3, [r0]
; CHECK-NEXT:    vstr.16 s0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i32>, <8 x i32>* %offptr, align 4
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*>
  call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; VSTRH.16 Qd, [base, zext(offs)] -- i16 scatter with zext'd i8 offsets
define arm_aapcs_vfpcc void @unsigned_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: unsigned_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; VSTRH.16 Qd, [base, zext(offs)] -- f16 scatter with zext'd i8 offsets
define arm_aapcs_vfpcc void @unsigned_unscaled_f16_i8(i8* %base, <8 x i8>* %offptr, <8 x half> %input) {
; CHECK-LABEL: unsigned_unscaled_f16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vstrh.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*>
  call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; Expand - sext offsets (no MVE scatter form takes sign-extended offsets)
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i64_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrb.s32 q4, [r1]
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    vadd.i32 q4, q4, r0
; CHECK-NEXT:    vmov r2, r3, d8
; CHECK-NEXT:    vmov r12, lr, d9
; CHECK-NEXT:    vldrb.s32 q4, [r1, #4]
; CHECK-NEXT:    vadd.i32 q4, q4, r0
; CHECK-NEXT:    vmov r0, r1, d8
; CHECK-NEXT:    strh r4, [r2]
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r4, r5, d9
; CHECK-NEXT:    strh r2, [r3]
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    strh.w r2, [r12]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    strh.w r2, [lr]
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    strh r2, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    strh r0, [r1]
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    strh r0, [r4]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    strh r0, [r5]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.sext = sext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  %input.trunc = trunc <8 x i64> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov.16 q4[0], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov.16 q4[1], r3
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmov.16 q4[2], r3
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    vmov.16 q4[3], r3
; CHECK-NEXT:    vmov r3, s8
; CHECK-NEXT:    vmov.16 q4[4], r3
; CHECK-NEXT:    vmov r3, s10
; CHECK-NEXT:    vmov.16 q4[5], r3
; CHECK-NEXT:    vmov r3, s12
; CHECK-NEXT:    vmov r2, s14
; CHECK-NEXT:    vmov.16 q4[6], r3
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vmov.16 q4[7], r2
; CHECK-NEXT:    vstrh.16 q4, [r0, q0]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  %input.trunc = trunc <8 x i64> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; Expand - sext offsets (no MVE scatter form takes sign-extended offsets)
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrb.s32 q2, [r1]
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    vmov r12, lr, d5
; CHECK-NEXT:    vldrb.s32 q2, [r1, #4]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vmov r0, r6, d1
; CHECK-NEXT:    strh r4, [r2]
; CHECK-NEXT:    vmov r2, r7, d4
; CHECK-NEXT:    strh r5, [r3]
; CHECK-NEXT:    vmov r3, r5, d5
; CHECK-NEXT:    strh.w r0, [r12]
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    strh.w r6, [lr]
; CHECK-NEXT:    vmov r6, r4, d3
; CHECK-NEXT:    strh r0, [r2]
; CHECK-NEXT:    strh r1, [r7]
; CHECK-NEXT:    strh r6, [r3]
; CHECK-NEXT:    strh r4, [r5]
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.sext = sext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  %input.trunc = trunc <8 x i32> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    mov r2, sp
; CHECK-NEXT:    vstrh.32 q1, [r2, #8]
; CHECK-NEXT:    vstrh.32 q0, [r2]
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vstrh.16 q1, [r0, q0]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
  %input.trunc = trunc <8 x i32> %input to <8 x i16>
  call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; Expand - sext offsets (no MVE scatter form takes sign-extended offsets)
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: trunc_signed_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vmov.u16 r6, q0[0]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vldrb.s32 q1, [r1, #4]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vmov r4, r5, d3
; CHECK-NEXT:    strb r6, [r2]
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    strb r2, [r3]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    strb.w r2, [r12]
; CHECK-NEXT:    vmov.u16 r2, q0[3]
; CHECK-NEXT:    strb.w r2, [lr]
; CHECK-NEXT:    vmov.u16 r2, q0[4]
; CHECK-NEXT:    strb r2, [r0]
; CHECK-NEXT:    vmov.u16 r0, q0[5]
; CHECK-NEXT:    strb r0, [r1]
; CHECK-NEXT:    vmov.u16 r0, q0[6]
; CHECK-NEXT:    strb r0, [r4]
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    strb r0, [r5]
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.sext = sext <8 x i8> %offs to <8 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext
  %input.trunc = trunc <8 x i16> %input to <8 x i8>
  call void @llvm.masked.scatter.v8i8(<8 x i8> %input.trunc, <8 x i8*> %byte_ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
; Intrinsic declarations for the masked scatter operations exercised above.
declare void @llvm.masked.scatter.v8i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8f16(<8 x half>, <8 x half*>, i32, <8 x i1>)