; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
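
; Check that llvm.masked.store intrinsics are lowered to MVE predicated
; stores (vstrwt.32/vstrht.16/vstrbt.8) under a VPT block, and that
; big-endian targets insert the vrev instructions needed to fix lane order.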

define arm_aapcs_vfpcc void @masked_v4i32(<4 x i32> *%dest, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4i32:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vpt.s32 gt, q0, zr
; CHECK-LE-NEXT:    vstrwt.32 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v4i32:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vpt.s32 gt, q1, zr
; CHECK-BE-NEXT:    vstrwt.32 q1, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  %c = icmp sgt <4 x i32> %a, zeroinitializer
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %a, <4 x i32>* %dest, i32 4, <4 x i1> %c)
  ret void
}
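
; An under-aligned (align 1) v4i32 masked store cannot use vstrw, so it is
; emitted as a predicated byte store; big-endian needs an extra vrev32.8 to
; put the bytes in memory order.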

define arm_aapcs_vfpcc void @masked_v4i32_align1(<4 x i32> *%dest, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4i32_align1:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vpt.s32 gt, q0, zr
; CHECK-LE-NEXT:    vstrbt.8 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v4i32_align1:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vrev32.8 q0, q1
; CHECK-BE-NEXT:    vpt.s32 gt, q1, zr
; CHECK-BE-NEXT:    vstrbt.8 q0, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  %c = icmp sgt <4 x i32> %a, zeroinitializer
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %a, <4 x i32>* %dest, i32 1, <4 x i1> %c)
  ret void
}
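
; The pre/post-increment variants store through an incremented base pointer
; and return it; the increment is currently a separate add rather than a
; writeback form of the store.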

define i8* @masked_v4i32_pre(i8* %y, i8* %x, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4i32_pre:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldr d1, [sp]
; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
; CHECK-LE-NEXT:    vmov d0, r2, r3
; CHECK-LE-NEXT:    vpt.s32 gt, q0, zr
; CHECK-LE-NEXT:    vstrwt.32 q1, [r0, #4]
; CHECK-LE-NEXT:    adds r0, #4
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v4i32_pre:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldr d1, [sp]
; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
; CHECK-BE-NEXT:    vmov d0, r3, r2
; CHECK-BE-NEXT:    vrev64.32 q2, q0
; CHECK-BE-NEXT:    vpt.s32 gt, q2, zr
; CHECK-BE-NEXT:    vstrwt.32 q1, [r0, #4]
; CHECK-BE-NEXT:    adds r0, #4
; CHECK-BE-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 4
  %0 = bitcast i8* %x to <4 x i32>*
  %1 = load <4 x i32>, <4 x i32>* %0, align 4
  %2 = bitcast i8* %z to <4 x i32>*
  %c = icmp sgt <4 x i32> %a, zeroinitializer
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
  ret i8* %z
}

define i8* @masked_v4i32_post(i8* %y, i8* %x, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4i32_post:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldr d1, [sp]
; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
; CHECK-LE-NEXT:    vmov d0, r2, r3
; CHECK-LE-NEXT:    vpt.s32 gt, q0, zr
; CHECK-LE-NEXT:    vstrwt.32 q1, [r0]
; CHECK-LE-NEXT:    adds r0, #4
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v4i32_post:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldr d1, [sp]
; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
; CHECK-BE-NEXT:    vmov d0, r3, r2
; CHECK-BE-NEXT:    vrev64.32 q2, q0
; CHECK-BE-NEXT:    vpt.s32 gt, q2, zr
; CHECK-BE-NEXT:    vstrwt.32 q1, [r0]
; CHECK-BE-NEXT:    adds r0, #4
; CHECK-BE-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 4
  %0 = bitcast i8* %x to <4 x i32>*
  %1 = load <4 x i32>, <4 x i32>* %0, align 4
  %2 = bitcast i8* %y to <4 x i32>*
  %c = icmp sgt <4 x i32> %a, zeroinitializer
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
  ret i8* %z
}
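
; The same patterns for v8i16: naturally aligned masked stores select
; vstrht.16, with vrev64.16 fixups on big-endian.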

define arm_aapcs_vfpcc void @masked_v8i16(<8 x i16> *%dest, <8 x i16> %a) {
; CHECK-LE-LABEL: masked_v8i16:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vpt.s16 gt, q0, zr
; CHECK-LE-NEXT:    vstrht.16 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v8i16:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.16 q1, q0
; CHECK-BE-NEXT:    vpt.s16 gt, q1, zr
; CHECK-BE-NEXT:    vstrht.16 q1, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  %c = icmp sgt <8 x i16> %a, zeroinitializer
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %a, <8 x i16>* %dest, i32 2, <8 x i1> %c)
  ret void
}

define arm_aapcs_vfpcc void @masked_v8i16_align1(<8 x i16> *%dest, <8 x i16> %a) {
; CHECK-LE-LABEL: masked_v8i16_align1:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vpt.s16 gt, q0, zr
; CHECK-LE-NEXT:    vstrbt.8 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v8i16_align1:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.16 q1, q0
; CHECK-BE-NEXT:    vrev16.8 q0, q1
; CHECK-BE-NEXT:    vpt.s16 gt, q1, zr
; CHECK-BE-NEXT:    vstrbt.8 q0, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  %c = icmp sgt <8 x i16> %a, zeroinitializer
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %a, <8 x i16>* %dest, i32 1, <8 x i1> %c)
  ret void
}

define i8* @masked_v8i16_pre(i8* %y, i8* %x, <8 x i16> %a) {
; CHECK-LE-LABEL: masked_v8i16_pre:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldr d1, [sp]
; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
; CHECK-LE-NEXT:    vmov d0, r2, r3
; CHECK-LE-NEXT:    vpt.s16 gt, q0, zr
; CHECK-LE-NEXT:    vstrht.16 q1, [r0, #4]
; CHECK-LE-NEXT:    adds r0, #4
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v8i16_pre:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldr d1, [sp]
; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
; CHECK-BE-NEXT:    vmov d0, r3, r2
; CHECK-BE-NEXT:    vrev64.16 q2, q0
; CHECK-BE-NEXT:    vpt.s16 gt, q2, zr
; CHECK-BE-NEXT:    vstrht.16 q1, [r0, #4]
; CHECK-BE-NEXT:    adds r0, #4
; CHECK-BE-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 4
  %0 = bitcast i8* %x to <8 x i16>*
  %1 = load <8 x i16>, <8 x i16>* %0, align 4
  %2 = bitcast i8* %z to <8 x i16>*
  %c = icmp sgt <8 x i16> %a, zeroinitializer
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
  ret i8* %z
}

define i8* @masked_v8i16_post(i8* %y, i8* %x, <8 x i16> %a) {
; CHECK-LE-LABEL: masked_v8i16_post:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldr d1, [sp]
; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
; CHECK-LE-NEXT:    vmov d0, r2, r3
; CHECK-LE-NEXT:    vpt.s16 gt, q0, zr
; CHECK-LE-NEXT:    vstrht.16 q1, [r0]
; CHECK-LE-NEXT:    adds r0, #4
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v8i16_post:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldr d1, [sp]
; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
; CHECK-BE-NEXT:    vmov d0, r3, r2
; CHECK-BE-NEXT:    vrev64.16 q2, q0
; CHECK-BE-NEXT:    vpt.s16 gt, q2, zr
; CHECK-BE-NEXT:    vstrht.16 q1, [r0]
; CHECK-BE-NEXT:    adds r0, #4
; CHECK-BE-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 4
  %0 = bitcast i8* %x to <8 x i16>*
  %1 = load <8 x i16>, <8 x i16>* %0, align 4
  %2 = bitcast i8* %y to <8 x i16>*
  %c = icmp sgt <8 x i16> %a, zeroinitializer
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
  ret i8* %z
}
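
; v16i8 masked stores are always byte-sized, so vstrbt.8 is used directly and
; no separate align1 variant is needed.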

define arm_aapcs_vfpcc void @masked_v16i8(<16 x i8> *%dest, <16 x i8> %a) {
; CHECK-LE-LABEL: masked_v16i8:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vpt.s8 gt, q0, zr
; CHECK-LE-NEXT:    vstrbt.8 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v16i8:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.8 q1, q0
; CHECK-BE-NEXT:    vpt.s8 gt, q1, zr
; CHECK-BE-NEXT:    vstrbt.8 q1, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  %c = icmp sgt <16 x i8> %a, zeroinitializer
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %a, <16 x i8>* %dest, i32 1, <16 x i1> %c)
  ret void
}

define i8* @masked_v16i8_pre(i8* %y, i8* %x, <16 x i8> %a) {
; CHECK-LE-LABEL: masked_v16i8_pre:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldr d1, [sp]
; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
; CHECK-LE-NEXT:    vmov d0, r2, r3
; CHECK-LE-NEXT:    vpt.s8 gt, q0, zr
; CHECK-LE-NEXT:    vstrbt.8 q1, [r0, #4]
; CHECK-LE-NEXT:    adds r0, #4
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v16i8_pre:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldr d1, [sp]
; CHECK-BE-NEXT:    vldrb.u8 q1, [r1]
; CHECK-BE-NEXT:    vmov d0, r3, r2
; CHECK-BE-NEXT:    vrev64.8 q2, q0
; CHECK-BE-NEXT:    vpt.s8 gt, q2, zr
; CHECK-BE-NEXT:    vstrbt.8 q1, [r0, #4]
; CHECK-BE-NEXT:    adds r0, #4
; CHECK-BE-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 4
  %0 = bitcast i8* %x to <16 x i8>*
  %1 = load <16 x i8>, <16 x i8>* %0, align 4
  %2 = bitcast i8* %z to <16 x i8>*
  %c = icmp sgt <16 x i8> %a, zeroinitializer
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
  ret i8* %z
}

define i8* @masked_v16i8_post(i8* %y, i8* %x, <16 x i8> %a) {
; CHECK-LE-LABEL: masked_v16i8_post:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldr d1, [sp]
; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
; CHECK-LE-NEXT:    vmov d0, r2, r3
; CHECK-LE-NEXT:    vpt.s8 gt, q0, zr
; CHECK-LE-NEXT:    vstrbt.8 q1, [r0]
; CHECK-LE-NEXT:    adds r0, #4
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v16i8_post:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldr d1, [sp]
; CHECK-BE-NEXT:    vldrb.u8 q1, [r1]
; CHECK-BE-NEXT:    vmov d0, r3, r2
; CHECK-BE-NEXT:    vrev64.8 q2, q0
; CHECK-BE-NEXT:    vpt.s8 gt, q2, zr
; CHECK-BE-NEXT:    vstrbt.8 q1, [r0]
; CHECK-BE-NEXT:    adds r0, #4
; CHECK-BE-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 4
  %0 = bitcast i8* %x to <16 x i8>*
  %1 = load <16 x i8>, <16 x i8>* %0, align 4
  %2 = bitcast i8* %y to <16 x i8>*
  %c = icmp sgt <16 x i8> %a, zeroinitializer
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
  ret i8* %z
}
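
; Floating-point vectors reuse the same predicated stores; the mask is
; computed from a separate integer vector, so no FP compare is required.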

define arm_aapcs_vfpcc void @masked_v4f32(<4 x float> *%dest, <4 x float> %a, <4 x i32> %b) {
; CHECK-LE-LABEL: masked_v4f32:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vpt.i32 ne, q1, zr
; CHECK-LE-NEXT:    vstrwt.32 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v4f32:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.32 q2, q1
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vpt.i32 ne, q2, zr
; CHECK-BE-NEXT:    vstrwt.32 q1, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  %c = icmp ugt <4 x i32> %b, zeroinitializer
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %a, <4 x float>* %dest, i32 4, <4 x i1> %c)
  ret void
}

define arm_aapcs_vfpcc void @masked_v4f32_align1(<4 x float> *%dest, <4 x float> %a, <4 x i32> %b) {
; CHECK-LE-LABEL: masked_v4f32_align1:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vpt.i32 ne, q1, zr
; CHECK-LE-NEXT:    vstrbt.8 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v4f32_align1:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.32 q2, q1
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    vrev32.8 q0, q1
; CHECK-BE-NEXT:    vpt.i32 ne, q2, zr
; CHECK-BE-NEXT:    vstrbt.8 q0, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  %c = icmp ugt <4 x i32> %b, zeroinitializer
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %a, <4 x float>* %dest, i32 1, <4 x i1> %c)
  ret void
}

define i8* @masked_v4f32_pre(i8* %y, i8* %x, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4f32_pre:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldr d1, [sp]
; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
; CHECK-LE-NEXT:    vmov d0, r2, r3
; CHECK-LE-NEXT:    vpt.s32 gt, q0, zr
; CHECK-LE-NEXT:    vstrwt.32 q1, [r0, #4]
; CHECK-LE-NEXT:    adds r0, #4
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v4f32_pre:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldr d1, [sp]
; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
; CHECK-BE-NEXT:    vmov d0, r3, r2
; CHECK-BE-NEXT:    vrev64.32 q2, q0
; CHECK-BE-NEXT:    vpt.s32 gt, q2, zr
; CHECK-BE-NEXT:    vstrwt.32 q1, [r0, #4]
; CHECK-BE-NEXT:    adds r0, #4
; CHECK-BE-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 4
  %0 = bitcast i8* %x to <4 x float>*
  %1 = load <4 x float>, <4 x float>* %0, align 4
  %2 = bitcast i8* %z to <4 x float>*
  %c = icmp sgt <4 x i32> %a, zeroinitializer
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
  ret i8* %z
}

define i8* @masked_v4f32_post(i8* %y, i8* %x, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4f32_post:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldr d1, [sp]
; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
; CHECK-LE-NEXT:    vmov d0, r2, r3
; CHECK-LE-NEXT:    vpt.s32 gt, q0, zr
; CHECK-LE-NEXT:    vstrwt.32 q1, [r0]
; CHECK-LE-NEXT:    adds r0, #4
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v4f32_post:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldr d1, [sp]
; CHECK-BE-NEXT:    vldrw.u32 q1, [r1]
; CHECK-BE-NEXT:    vmov d0, r3, r2
; CHECK-BE-NEXT:    vrev64.32 q2, q0
; CHECK-BE-NEXT:    vpt.s32 gt, q2, zr
; CHECK-BE-NEXT:    vstrwt.32 q1, [r0]
; CHECK-BE-NEXT:    adds r0, #4
; CHECK-BE-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 4
  %0 = bitcast i8* %x to <4 x float>*
  %1 = load <4 x float>, <4 x float>* %0, align 4
  %2 = bitcast i8* %y to <4 x float>*
  %c = icmp sgt <4 x i32> %a, zeroinitializer
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
  ret i8* %z
}

define arm_aapcs_vfpcc void @masked_v8f16(<8 x half> *%dest, <8 x half> %a, <8 x i16> %b) {
; CHECK-LE-LABEL: masked_v8f16:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vpt.i16 ne, q1, zr
; CHECK-LE-NEXT:    vstrht.16 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v8f16:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.16 q2, q1
; CHECK-BE-NEXT:    vrev64.16 q1, q0
; CHECK-BE-NEXT:    vpt.i16 ne, q2, zr
; CHECK-BE-NEXT:    vstrht.16 q1, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  %c = icmp ugt <8 x i16> %b, zeroinitializer
  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %a, <8 x half>* %dest, i32 2, <8 x i1> %c)
  ret void
}

define arm_aapcs_vfpcc void @masked_v8f16_align1(<8 x half> *%dest, <8 x half> %a, <8 x i16> %b) {
; CHECK-LE-LABEL: masked_v8f16_align1:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vpt.i16 ne, q1, zr
; CHECK-LE-NEXT:    vstrbt.8 q0, [r0]
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v8f16_align1:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vrev64.16 q2, q0
; CHECK-BE-NEXT:    vrev16.8 q0, q2
; CHECK-BE-NEXT:    vrev64.16 q2, q1
; CHECK-BE-NEXT:    vpt.i16 ne, q2, zr
; CHECK-BE-NEXT:    vstrbt.8 q0, [r0]
; CHECK-BE-NEXT:    bx lr
entry:
  %c = icmp ugt <8 x i16> %b, zeroinitializer
  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %a, <8 x half>* %dest, i32 1, <8 x i1> %c)
  ret void
}

define i8* @masked_v8f16_pre(i8* %y, i8* %x, <8 x i16> %a) {
; CHECK-LE-LABEL: masked_v8f16_pre:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldr d1, [sp]
; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
; CHECK-LE-NEXT:    vmov d0, r2, r3
; CHECK-LE-NEXT:    vpt.s16 gt, q0, zr
; CHECK-LE-NEXT:    vstrht.16 q1, [r0, #4]
; CHECK-LE-NEXT:    adds r0, #4
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v8f16_pre:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldr d1, [sp]
; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
; CHECK-BE-NEXT:    vmov d0, r3, r2
; CHECK-BE-NEXT:    vrev64.16 q2, q0
; CHECK-BE-NEXT:    vpt.s16 gt, q2, zr
; CHECK-BE-NEXT:    vstrht.16 q1, [r0, #4]
; CHECK-BE-NEXT:    adds r0, #4
; CHECK-BE-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 4
  %0 = bitcast i8* %x to <8 x half>*
  %1 = load <8 x half>, <8 x half>* %0, align 4
  %2 = bitcast i8* %z to <8 x half>*
  %c = icmp sgt <8 x i16> %a, zeroinitializer
  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
  ret i8* %z
}

define i8* @masked_v8f16_post(i8* %y, i8* %x, <8 x i16> %a) {
; CHECK-LE-LABEL: masked_v8f16_post:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    vldr d1, [sp]
; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
; CHECK-LE-NEXT:    vmov d0, r2, r3
; CHECK-LE-NEXT:    vpt.s16 gt, q0, zr
; CHECK-LE-NEXT:    vstrht.16 q1, [r0]
; CHECK-LE-NEXT:    adds r0, #4
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v8f16_post:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    vldr d1, [sp]
; CHECK-BE-NEXT:    vldrh.u16 q1, [r1]
; CHECK-BE-NEXT:    vmov d0, r3, r2
; CHECK-BE-NEXT:    vrev64.16 q2, q0
; CHECK-BE-NEXT:    vpt.s16 gt, q2, zr
; CHECK-BE-NEXT:    vstrht.16 q1, [r0]
; CHECK-BE-NEXT:    adds r0, #4
; CHECK-BE-NEXT:    bx lr
entry:
  %z = getelementptr inbounds i8, i8* %y, i32 4
  %0 = bitcast i8* %x to <8 x half>*
  %1 = load <8 x half>, <8 x half>* %0, align 4
  %2 = bitcast i8* %y to <8 x half>*
  %c = icmp sgt <8 x i16> %a, zeroinitializer
  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
  ret i8* %z
}
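
; MVE has no vector compare or predicated store for 64-bit lanes, so v2i64
; and v2f64 masked stores are expanded: each lane's mask bit is computed in
; scalar registers and the halves are stored with conditional vstr.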

define arm_aapcs_vfpcc void @masked_v2i64(<2 x i64> *%dest, <2 x i64> %a) {
; CHECK-LE-LABEL: masked_v2i64:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    .pad #4
; CHECK-LE-NEXT:    sub sp, #4
; CHECK-LE-NEXT:    vmov r2, s0
; CHECK-LE-NEXT:    movs r3, #0
; CHECK-LE-NEXT:    vmov r1, s1
; CHECK-LE-NEXT:    vmov r12, s3
; CHECK-LE-NEXT:    rsbs r2, r2, #0
; CHECK-LE-NEXT:    vmov r2, s2
; CHECK-LE-NEXT:    sbcs.w r1, r3, r1
; CHECK-LE-NEXT:    mov.w r1, #0
; CHECK-LE-NEXT:    it lt
; CHECK-LE-NEXT:    movlt r1, #1
; CHECK-LE-NEXT:    rsbs r2, r2, #0
; CHECK-LE-NEXT:    sbcs.w r2, r3, r12
; CHECK-LE-NEXT:    it lt
; CHECK-LE-NEXT:    movlt r3, #1
; CHECK-LE-NEXT:    cmp r3, #0
; CHECK-LE-NEXT:    it ne
; CHECK-LE-NEXT:    mvnne r3, #1
; CHECK-LE-NEXT:    bfi r3, r1, #0, #1
; CHECK-LE-NEXT:    and r1, r3, #3
; CHECK-LE-NEXT:    lsls r2, r3, #31
; CHECK-LE-NEXT:    it ne
; CHECK-LE-NEXT:    vstrne d0, [r0]
; CHECK-LE-NEXT:    lsls r1, r1, #30
; CHECK-LE-NEXT:    it mi
; CHECK-LE-NEXT:    vstrmi d1, [r0, #8]
; CHECK-LE-NEXT:    add sp, #4
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v2i64:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    .pad #4
; CHECK-BE-NEXT:    sub sp, #4
; CHECK-BE-NEXT:    vrev64.32 q1, q0
; CHECK-BE-NEXT:    movs r3, #0
; CHECK-BE-NEXT:    vmov r2, s7
; CHECK-BE-NEXT:    vmov r1, s6
; CHECK-BE-NEXT:    vmov r12, s4
; CHECK-BE-NEXT:    rsbs r2, r2, #0
; CHECK-BE-NEXT:    vmov r2, s5
; CHECK-BE-NEXT:    sbcs.w r1, r3, r1
; CHECK-BE-NEXT:    mov.w r1, #0
; CHECK-BE-NEXT:    it lt
; CHECK-BE-NEXT:    movlt r1, #1
; CHECK-BE-NEXT:    rsbs r2, r2, #0
; CHECK-BE-NEXT:    sbcs.w r2, r3, r12
; CHECK-BE-NEXT:    it lt
; CHECK-BE-NEXT:    movlt r3, #1
; CHECK-BE-NEXT:    cmp r3, #0
; CHECK-BE-NEXT:    it ne
; CHECK-BE-NEXT:    mvnne r3, #1
; CHECK-BE-NEXT:    bfi r3, r1, #0, #1
; CHECK-BE-NEXT:    and r1, r3, #3
; CHECK-BE-NEXT:    lsls r2, r3, #31
; CHECK-BE-NEXT:    it ne
; CHECK-BE-NEXT:    vstrne d0, [r0]
; CHECK-BE-NEXT:    lsls r1, r1, #30
; CHECK-BE-NEXT:    it mi
; CHECK-BE-NEXT:    vstrmi d1, [r0, #8]
; CHECK-BE-NEXT:    add sp, #4
; CHECK-BE-NEXT:    bx lr
entry:
  %c = icmp sgt <2 x i64> %a, zeroinitializer
  call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %a, <2 x i64>* %dest, i32 8, <2 x i1> %c)
  ret void
}

define arm_aapcs_vfpcc void @masked_v2f64(<2 x double> *%dest, <2 x double> %a, <2 x i64> %b) {
; CHECK-LE-LABEL: masked_v2f64:
; CHECK-LE:       @ %bb.0: @ %entry
; CHECK-LE-NEXT:    .pad #4
; CHECK-LE-NEXT:    sub sp, #4
; CHECK-LE-NEXT:    vmov r2, s4
; CHECK-LE-NEXT:    movs r3, #0
; CHECK-LE-NEXT:    vmov r1, s5
; CHECK-LE-NEXT:    vmov r12, s7
; CHECK-LE-NEXT:    rsbs r2, r2, #0
; CHECK-LE-NEXT:    vmov r2, s6
; CHECK-LE-NEXT:    sbcs.w r1, r3, r1
; CHECK-LE-NEXT:    mov.w r1, #0
; CHECK-LE-NEXT:    it lt
; CHECK-LE-NEXT:    movlt r1, #1
; CHECK-LE-NEXT:    rsbs r2, r2, #0
; CHECK-LE-NEXT:    sbcs.w r2, r3, r12
; CHECK-LE-NEXT:    it lt
; CHECK-LE-NEXT:    movlt r3, #1
; CHECK-LE-NEXT:    cmp r3, #0
; CHECK-LE-NEXT:    it ne
; CHECK-LE-NEXT:    mvnne r3, #1
; CHECK-LE-NEXT:    bfi r3, r1, #0, #1
; CHECK-LE-NEXT:    and r1, r3, #3
; CHECK-LE-NEXT:    lsls r2, r3, #31
; CHECK-LE-NEXT:    it ne
; CHECK-LE-NEXT:    vstrne d0, [r0]
; CHECK-LE-NEXT:    lsls r1, r1, #30
; CHECK-LE-NEXT:    it mi
; CHECK-LE-NEXT:    vstrmi d1, [r0, #8]
; CHECK-LE-NEXT:    add sp, #4
; CHECK-LE-NEXT:    bx lr
;
; CHECK-BE-LABEL: masked_v2f64:
; CHECK-BE:       @ %bb.0: @ %entry
; CHECK-BE-NEXT:    .pad #4
; CHECK-BE-NEXT:    sub sp, #4
; CHECK-BE-NEXT:    vrev64.32 q2, q1
; CHECK-BE-NEXT:    movs r3, #0
; CHECK-BE-NEXT:    vmov r2, s11
; CHECK-BE-NEXT:    vmov r1, s10
; CHECK-BE-NEXT:    vmov r12, s8
; CHECK-BE-NEXT:    rsbs r2, r2, #0
; CHECK-BE-NEXT:    vmov r2, s9
; CHECK-BE-NEXT:    sbcs.w r1, r3, r1
; CHECK-BE-NEXT:    mov.w r1, #0
; CHECK-BE-NEXT:    it lt
; CHECK-BE-NEXT:    movlt r1, #1
; CHECK-BE-NEXT:    rsbs r2, r2, #0
; CHECK-BE-NEXT:    sbcs.w r2, r3, r12
; CHECK-BE-NEXT:    it lt
; CHECK-BE-NEXT:    movlt r3, #1
; CHECK-BE-NEXT:    cmp r3, #0
; CHECK-BE-NEXT:    it ne
; CHECK-BE-NEXT:    mvnne r3, #1
; CHECK-BE-NEXT:    bfi r3, r1, #0, #1
; CHECK-BE-NEXT:    and r1, r3, #3
; CHECK-BE-NEXT:    lsls r2, r3, #31
; CHECK-BE-NEXT:    it ne
; CHECK-BE-NEXT:    vstrne d0, [r0]
; CHECK-BE-NEXT:    lsls r1, r1, #30
; CHECK-BE-NEXT:    it mi
; CHECK-BE-NEXT:    vstrmi d1, [r0, #8]
; CHECK-BE-NEXT:    add sp, #4
; CHECK-BE-NEXT:    bx lr
entry:
  %c = icmp sgt <2 x i64> %b, zeroinitializer
  call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %a, <2 x double>* %dest, i32 8, <2 x i1> %c)
  ret void
}

declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>)
declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)