; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
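
; An aligned masked load feeding an aligned masked store of v4i32 folds
; into a single VPT block: vptt.s32 predicates both the load and the store.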
define void @foo_v4i32_v4i32(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i32> *%src) {
; CHECK-LABEL: foo_v4i32_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vptt.s32 gt, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r2]
; CHECK-NEXT:    vstrwt.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, <4 x i32>* %mask, align 4
  %1 = icmp sgt <4 x i32> %0, zeroinitializer
  %2 = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %src, i32 4, <4 x i1> %1, <4 x i32> undef)
  call void @llvm.masked.store.v4i32(<4 x i32> %2, <4 x i32>* %dest, i32 4, <4 x i1> %1)
  ret void
}
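
; The sign-extending masked load of v4i8 is not matched to a single
; predicated access here; it is scalarized instead. The predicate is copied
; to a GPR with vmrs, each lane is filled by a conditional ldrb/vmov pair,
; and the result is sign-extended in the register with vmovlb.s8 and
; vmovlb.s16 before the predicated store.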
define void @foo_sext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%src) {
; CHECK-LABEL: foo_sext_v4i32_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vcmp.s32 gt, q0, zr
; CHECK-NEXT:    @ implicit-def: $q0
; CHECK-NEXT:    vmrs lr, p0
; CHECK-NEXT:    and r1, lr, #1
; CHECK-NEXT:    ubfx r3, lr, #4, #1
; CHECK-NEXT:    rsb.w r12, r1, #0
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r1, r12, #0, #1
; CHECK-NEXT:    bfi r1, r3, #1, #1
; CHECK-NEXT:    ubfx r3, lr, #8, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r1, r3, #2, #1
; CHECK-NEXT:    ubfx r3, lr, #12, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r1, r3, #3, #1
; CHECK-NEXT:    lsls r3, r1, #31
; CHECK-NEXT:    itt ne
; CHECK-NEXT:    ldrbne r3, [r2]
; CHECK-NEXT:    vmovne.32 q0[0], r3
; CHECK-NEXT:    lsls r3, r1, #30
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r3, [r2, #1]
; CHECK-NEXT:    vmovmi.32 q0[1], r3
; CHECK-NEXT:    lsls r3, r1, #29
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r3, [r2, #2]
; CHECK-NEXT:    vmovmi.32 q0[2], r3
; CHECK-NEXT:    lsls r1, r1, #28
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r1, [r2, #3]
; CHECK-NEXT:    vmovmi.32 q0[3], r1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q0, [r0]
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop {r7, pc}
entry:
  %0 = load <4 x i32>, <4 x i32>* %mask, align 4
  %1 = icmp sgt <4 x i32> %0, zeroinitializer
  %2 = call <4 x i8> @llvm.masked.load.v4i8(<4 x i8>* %src, i32 1, <4 x i1> %1, <4 x i8> undef)
  %3 = sext <4 x i8> %2 to <4 x i32>
  call void @llvm.masked.store.v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1)
  ret void
}
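
; The same scalarized pattern for a sign-extending masked load of v4i16,
; using conditional ldrh loads and a single vmovlb.s16 to widen.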
define void @foo_sext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> *%src) {
; CHECK-LABEL: foo_sext_v4i32_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vcmp.s32 gt, q0, zr
; CHECK-NEXT:    @ implicit-def: $q0
; CHECK-NEXT:    vmrs lr, p0
; CHECK-NEXT:    and r1, lr, #1
; CHECK-NEXT:    ubfx r3, lr, #4, #1
; CHECK-NEXT:    rsb.w r12, r1, #0
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r1, r12, #0, #1
; CHECK-NEXT:    bfi r1, r3, #1, #1
; CHECK-NEXT:    ubfx r3, lr, #8, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r1, r3, #2, #1
; CHECK-NEXT:    ubfx r3, lr, #12, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r1, r3, #3, #1
; CHECK-NEXT:    lsls r3, r1, #31
; CHECK-NEXT:    itt ne
; CHECK-NEXT:    ldrhne r3, [r2]
; CHECK-NEXT:    vmovne.32 q0[0], r3
; CHECK-NEXT:    lsls r3, r1, #30
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrhmi r3, [r2, #2]
; CHECK-NEXT:    vmovmi.32 q0[1], r3
; CHECK-NEXT:    lsls r3, r1, #29
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrhmi r3, [r2, #4]
; CHECK-NEXT:    vmovmi.32 q0[2], r3
; CHECK-NEXT:    lsls r1, r1, #28
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrhmi r1, [r2, #6]
; CHECK-NEXT:    vmovmi.32 q0[3], r1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q0, [r0]
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop {r7, pc}
entry:
  %0 = load <4 x i32>, <4 x i32>* %mask, align 4
  %1 = icmp sgt <4 x i32> %0, zeroinitializer
  %2 = call <4 x i16> @llvm.masked.load.v4i16(<4 x i16>* %src, i32 2, <4 x i1> %1, <4 x i16> undef)
  %3 = sext <4 x i16> %2 to <4 x i32>
  call void @llvm.masked.store.v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1)
  ret void
}
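
; The zero-extending v4i8 variant uses the same conditional ldrb sequence
; but zero-extends by masking with a 0xff splat (vmov.i32 q1, #0xff; vand).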
define void @foo_zext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%src) {
; CHECK-LABEL: foo_zext_v4i32_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    vcmp.s32 gt, q0, zr
; CHECK-NEXT:    @ implicit-def: $q0
; CHECK-NEXT:    vmrs lr, p0
; CHECK-NEXT:    and r1, lr, #1
; CHECK-NEXT:    ubfx r3, lr, #4, #1
; CHECK-NEXT:    rsb.w r12, r1, #0
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r1, r12, #0, #1
; CHECK-NEXT:    bfi r1, r3, #1, #1
; CHECK-NEXT:    ubfx r3, lr, #8, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r1, r3, #2, #1
; CHECK-NEXT:    ubfx r3, lr, #12, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r1, r3, #3, #1
; CHECK-NEXT:    lsls r3, r1, #31
; CHECK-NEXT:    itt ne
; CHECK-NEXT:    ldrbne r3, [r2]
; CHECK-NEXT:    vmovne.32 q0[0], r3
; CHECK-NEXT:    lsls r3, r1, #30
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r3, [r2, #1]
; CHECK-NEXT:    vmovmi.32 q0[1], r3
; CHECK-NEXT:    lsls r3, r1, #29
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r3, [r2, #2]
; CHECK-NEXT:    vmovmi.32 q0[2], r3
; CHECK-NEXT:    lsls r1, r1, #28
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r1, [r2, #3]
; CHECK-NEXT:    vmovmi.32 q0[3], r1
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q0, [r0]
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop {r7, pc}
entry:
  %0 = load <4 x i32>, <4 x i32>* %mask, align 4
  %1 = icmp sgt <4 x i32> %0, zeroinitializer
  %2 = call <4 x i8> @llvm.masked.load.v4i8(<4 x i8>* %src, i32 1, <4 x i1> %1, <4 x i8> undef)
  %3 = zext <4 x i8> %2 to <4 x i32>
  call void @llvm.masked.store.v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1)
  ret void
}
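
; The zero-extending v4i16 variant widens with vmovlb.u16 instead of a vand.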
define void @foo_zext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> *%src) {
; CHECK-LABEL: foo_zext_v4i32_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vcmp.s32 gt, q0, zr
; CHECK-NEXT:    @ implicit-def: $q0
; CHECK-NEXT:    vmrs lr, p0
; CHECK-NEXT:    and r1, lr, #1
; CHECK-NEXT:    ubfx r3, lr, #4, #1
; CHECK-NEXT:    rsb.w r12, r1, #0
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r1, r12, #0, #1
; CHECK-NEXT:    bfi r1, r3, #1, #1
; CHECK-NEXT:    ubfx r3, lr, #8, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r1, r3, #2, #1
; CHECK-NEXT:    ubfx r3, lr, #12, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r1, r3, #3, #1
; CHECK-NEXT:    lsls r3, r1, #31
; CHECK-NEXT:    itt ne
; CHECK-NEXT:    ldrhne r3, [r2]
; CHECK-NEXT:    vmovne.32 q0[0], r3
; CHECK-NEXT:    lsls r3, r1, #30
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrhmi r3, [r2, #2]
; CHECK-NEXT:    vmovmi.32 q0[1], r3
; CHECK-NEXT:    lsls r3, r1, #29
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrhmi r3, [r2, #4]
; CHECK-NEXT:    vmovmi.32 q0[2], r3
; CHECK-NEXT:    lsls r1, r1, #28
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrhmi r1, [r2, #6]
; CHECK-NEXT:    vmovmi.32 q0[3], r1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q0, [r0]
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop {r7, pc}
entry:
  %0 = load <4 x i32>, <4 x i32>* %mask, align 4
  %1 = icmp sgt <4 x i32> %0, zeroinitializer
  %2 = call <4 x i16> @llvm.masked.load.v4i16(<4 x i16>* %src, i32 2, <4 x i1> %1, <4 x i16> undef)
  %3 = zext <4 x i16> %2 to <4 x i32>
  call void @llvm.masked.store.v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1)
  ret void
}
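
; v8i16 with a v8i16 mask also folds into a single VPT block: vptt.s16 with
; predicated vldrh/vstrh.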
define void @foo_v8i16_v8i16(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i16> *%src) {
; CHECK-LABEL: foo_v8i16_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r1]
; CHECK-NEXT:    vptt.s16 gt, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r2]
; CHECK-NEXT:    vstrht.16 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i16>, <8 x i16>* %mask, align 2
  %1 = icmp sgt <8 x i16> %0, zeroinitializer
  %2 = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %src, i32 2, <8 x i1> %1, <8 x i16> undef)
  call void @llvm.masked.store.v8i16(<8 x i16> %2, <8 x i16>* %dest, i32 2, <8 x i1> %1)
  ret void
}
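
; Eight-lane extending loads scalarize in the same way, with one conditional
; ldrb/vmov pair per lane and a final vmovlb.s8 to sign-extend.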
define void @foo_sext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%src) {
; CHECK-LABEL: foo_sext_v8i16_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    .pad #8
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    vldrh.u16 q0, [r1]
; CHECK-NEXT:    vcmp.s16 gt, q0, zr
; CHECK-NEXT:    @ implicit-def: $q0
; CHECK-NEXT:    vmrs lr, p0
; CHECK-NEXT:    and r3, lr, #1
; CHECK-NEXT:    ubfx r1, lr, #2, #1
; CHECK-NEXT:    rsb.w r12, r3, #0
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    rsbs r1, r1, #0
; CHECK-NEXT:    bfi r3, r12, #0, #1
; CHECK-NEXT:    bfi r3, r1, #1, #1
; CHECK-NEXT:    ubfx r1, lr, #4, #1
; CHECK-NEXT:    rsbs r1, r1, #0
; CHECK-NEXT:    bfi r3, r1, #2, #1
; CHECK-NEXT:    ubfx r1, lr, #6, #1
; CHECK-NEXT:    rsbs r1, r1, #0
; CHECK-NEXT:    bfi r3, r1, #3, #1
; CHECK-NEXT:    ubfx r1, lr, #8, #1
; CHECK-NEXT:    rsbs r1, r1, #0
; CHECK-NEXT:    bfi r3, r1, #4, #1
; CHECK-NEXT:    ubfx r1, lr, #10, #1
; CHECK-NEXT:    rsbs r1, r1, #0
; CHECK-NEXT:    bfi r3, r1, #5, #1
; CHECK-NEXT:    ubfx r1, lr, #12, #1
; CHECK-NEXT:    rsbs r1, r1, #0
; CHECK-NEXT:    bfi r3, r1, #6, #1
; CHECK-NEXT:    ubfx r1, lr, #14, #1
; CHECK-NEXT:    rsbs r1, r1, #0
; CHECK-NEXT:    bfi r3, r1, #7, #1
; CHECK-NEXT:    uxtb r1, r3
; CHECK-NEXT:    lsls r3, r3, #31
; CHECK-NEXT:    itt ne
; CHECK-NEXT:    ldrbne r3, [r2]
; CHECK-NEXT:    vmovne.16 q0[0], r3
; CHECK-NEXT:    lsls r3, r1, #30
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r3, [r2, #1]
; CHECK-NEXT:    vmovmi.16 q0[1], r3
; CHECK-NEXT:    lsls r3, r1, #29
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r3, [r2, #2]
; CHECK-NEXT:    vmovmi.16 q0[2], r3
; CHECK-NEXT:    lsls r3, r1, #28
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r3, [r2, #3]
; CHECK-NEXT:    vmovmi.16 q0[3], r3
; CHECK-NEXT:    lsls r3, r1, #27
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r3, [r2, #4]
; CHECK-NEXT:    vmovmi.16 q0[4], r3
; CHECK-NEXT:    lsls r3, r1, #26
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r3, [r2, #5]
; CHECK-NEXT:    vmovmi.16 q0[5], r3
; CHECK-NEXT:    lsls r3, r1, #25
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r3, [r2, #6]
; CHECK-NEXT:    vmovmi.16 q0[6], r3
; CHECK-NEXT:    lsls r1, r1, #24
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r1, [r2, #7]
; CHECK-NEXT:    vmovmi.16 q0[7], r1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrht.16 q0, [r0]
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    pop {r7, pc}
entry:
  %0 = load <8 x i16>, <8 x i16>* %mask, align 2
  %1 = icmp sgt <8 x i16> %0, zeroinitializer
  %2 = call <8 x i8> @llvm.masked.load.v8i8(<8 x i8>* %src, i32 1, <8 x i1> %1, <8 x i8> undef)
  %3 = sext <8 x i8> %2 to <8 x i16>
  call void @llvm.masked.store.v8i16(<8 x i16> %3, <8 x i16>* %dest, i32 2, <8 x i1> %1)
  ret void
}
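
; The zero-extending v8i8 variant of the above, widening with vmovlb.u8.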
define void @foo_zext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%src) {
; CHECK-LABEL: foo_zext_v8i16_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    .pad #8
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    vldrh.u16 q0, [r1]
; CHECK-NEXT:    vcmp.s16 gt, q0, zr
; CHECK-NEXT:    @ implicit-def: $q0
; CHECK-NEXT:    vmrs lr, p0
; CHECK-NEXT:    and r3, lr, #1
; CHECK-NEXT:    ubfx r1, lr, #2, #1
; CHECK-NEXT:    rsb.w r12, r3, #0
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    rsbs r1, r1, #0
; CHECK-NEXT:    bfi r3, r12, #0, #1
; CHECK-NEXT:    bfi r3, r1, #1, #1
; CHECK-NEXT:    ubfx r1, lr, #4, #1
; CHECK-NEXT:    rsbs r1, r1, #0
; CHECK-NEXT:    bfi r3, r1, #2, #1
; CHECK-NEXT:    ubfx r1, lr, #6, #1
; CHECK-NEXT:    rsbs r1, r1, #0
; CHECK-NEXT:    bfi r3, r1, #3, #1
; CHECK-NEXT:    ubfx r1, lr, #8, #1
; CHECK-NEXT:    rsbs r1, r1, #0
; CHECK-NEXT:    bfi r3, r1, #4, #1
; CHECK-NEXT:    ubfx r1, lr, #10, #1
; CHECK-NEXT:    rsbs r1, r1, #0
; CHECK-NEXT:    bfi r3, r1, #5, #1
; CHECK-NEXT:    ubfx r1, lr, #12, #1
; CHECK-NEXT:    rsbs r1, r1, #0
; CHECK-NEXT:    bfi r3, r1, #6, #1
; CHECK-NEXT:    ubfx r1, lr, #14, #1
; CHECK-NEXT:    rsbs r1, r1, #0
; CHECK-NEXT:    bfi r3, r1, #7, #1
; CHECK-NEXT:    uxtb r1, r3
; CHECK-NEXT:    lsls r3, r3, #31
; CHECK-NEXT:    itt ne
; CHECK-NEXT:    ldrbne r3, [r2]
; CHECK-NEXT:    vmovne.16 q0[0], r3
; CHECK-NEXT:    lsls r3, r1, #30
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r3, [r2, #1]
; CHECK-NEXT:    vmovmi.16 q0[1], r3
; CHECK-NEXT:    lsls r3, r1, #29
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r3, [r2, #2]
; CHECK-NEXT:    vmovmi.16 q0[2], r3
; CHECK-NEXT:    lsls r3, r1, #28
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r3, [r2, #3]
; CHECK-NEXT:    vmovmi.16 q0[3], r3
; CHECK-NEXT:    lsls r3, r1, #27
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r3, [r2, #4]
; CHECK-NEXT:    vmovmi.16 q0[4], r3
; CHECK-NEXT:    lsls r3, r1, #26
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r3, [r2, #5]
; CHECK-NEXT:    vmovmi.16 q0[5], r3
; CHECK-NEXT:    lsls r3, r1, #25
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r3, [r2, #6]
; CHECK-NEXT:    vmovmi.16 q0[6], r3
; CHECK-NEXT:    lsls r1, r1, #24
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    ldrbmi r1, [r2, #7]
; CHECK-NEXT:    vmovmi.16 q0[7], r1
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrht.16 q0, [r0]
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    pop {r7, pc}
entry:
  %0 = load <8 x i16>, <8 x i16>* %mask, align 2
  %1 = icmp sgt <8 x i16> %0, zeroinitializer
  %2 = call <8 x i8> @llvm.masked.load.v8i8(<8 x i8>* %src, i32 1, <8 x i1> %1, <8 x i8> undef)
  %3 = zext <8 x i8> %2 to <8 x i16>
  call void @llvm.masked.store.v8i16(<8 x i16> %3, <8 x i16>* %dest, i32 2, <8 x i1> %1)
  ret void
}
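
; v16i8 with a v16i8 mask folds into a single VPT block: vptt.s8 with
; predicated vldrb/vstrb.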
define void @foo_v16i8_v16i8(<16 x i8> *%dest, <16 x i8> *%mask, <16 x i8> *%src) {
; CHECK-LABEL: foo_v16i8_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q0, [r1]
; CHECK-NEXT:    vptt.s8 gt, q0, zr
; CHECK-NEXT:    vldrbt.u8 q0, [r2]
; CHECK-NEXT:    vstrbt.8 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <16 x i8>, <16 x i8>* %mask, align 1
  %1 = icmp sgt <16 x i8> %0, zeroinitializer
  %2 = call <16 x i8> @llvm.masked.load.v16i8(<16 x i8>* %src, i32 1, <16 x i1> %1, <16 x i8> undef)
  call void @llvm.masked.store.v16i8(<16 x i8> %2, <16 x i8>* %dest, i32 1, <16 x i1> %1)
  ret void
}
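
; For a truncating masked store the load side stays predicated (vpt followed
; by vldrht), but the narrowing store is scalarized: each lane is extracted
; with a conditional vmov and stored with a conditional strb.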
define void @foo_trunc_v8i8_v8i16(<8 x i8> *%dest, <8 x i16> *%mask, <8 x i16> *%src) {
; CHECK-LABEL: foo_trunc_v8i8_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #8
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    vldrh.u16 q0, [r1]
; CHECK-NEXT:    vpt.s16 gt, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r2]
; CHECK-NEXT:    vmrs r1, p0
; CHECK-NEXT:    and r2, r1, #1
; CHECK-NEXT:    rsbs r3, r2, #0
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    bfi r2, r3, #0, #1
; CHECK-NEXT:    ubfx r3, r1, #2, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r2, r3, #1, #1
; CHECK-NEXT:    ubfx r3, r1, #4, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r2, r3, #2, #1
; CHECK-NEXT:    ubfx r3, r1, #6, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r2, r3, #3, #1
; CHECK-NEXT:    ubfx r3, r1, #8, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r2, r3, #4, #1
; CHECK-NEXT:    ubfx r3, r1, #10, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r2, r3, #5, #1
; CHECK-NEXT:    ubfx r3, r1, #12, #1
; CHECK-NEXT:    ubfx r1, r1, #14, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r2, r3, #6, #1
; CHECK-NEXT:    rsbs r1, r1, #0
; CHECK-NEXT:    bfi r2, r1, #7, #1
; CHECK-NEXT:    uxtb r1, r2
; CHECK-NEXT:    lsls r2, r2, #31
; CHECK-NEXT:    itt ne
; CHECK-NEXT:    vmovne.u16 r2, q0[0]
; CHECK-NEXT:    strbne r2, [r0]
; CHECK-NEXT:    lsls r2, r1, #30
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    vmovmi.u16 r2, q0[1]
; CHECK-NEXT:    strbmi r2, [r0, #1]
; CHECK-NEXT:    lsls r2, r1, #29
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    vmovmi.u16 r2, q0[2]
; CHECK-NEXT:    strbmi r2, [r0, #2]
; CHECK-NEXT:    lsls r2, r1, #28
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    vmovmi.u16 r2, q0[3]
; CHECK-NEXT:    strbmi r2, [r0, #3]
; CHECK-NEXT:    lsls r2, r1, #27
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    vmovmi.u16 r2, q0[4]
; CHECK-NEXT:    strbmi r2, [r0, #4]
; CHECK-NEXT:    lsls r2, r1, #26
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    vmovmi.u16 r2, q0[5]
; CHECK-NEXT:    strbmi r2, [r0, #5]
; CHECK-NEXT:    lsls r2, r1, #25
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    vmovmi.u16 r2, q0[6]
; CHECK-NEXT:    strbmi r2, [r0, #6]
; CHECK-NEXT:    lsls r1, r1, #24
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    vmovmi.u16 r1, q0[7]
; CHECK-NEXT:    strbmi r1, [r0, #7]
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i16>, <8 x i16>* %mask, align 2
  %1 = icmp sgt <8 x i16> %0, zeroinitializer
  %2 = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %src, i32 2, <8 x i1> %1, <8 x i16> undef)
  %3 = trunc <8 x i16> %2 to <8 x i8>
  call void @llvm.masked.store.v8i8(<8 x i8> %3, <8 x i8>* %dest, i32 1, <8 x i1> %1)
  ret void
}
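
; Truncating v4i32 to v4i8: four lanes, extracted with vmov from s0-s3 and
; stored with conditional strb.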
define void @foo_trunc_v4i8_v4i32(<4 x i8> *%dest, <4 x i32> *%mask, <4 x i32> *%src) {
; CHECK-LABEL: foo_trunc_v4i8_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vpt.s32 gt, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r2]
; CHECK-NEXT:    vmrs r2, p0
; CHECK-NEXT:    and r1, r2, #1
; CHECK-NEXT:    rsbs r3, r1, #0
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    bfi r1, r3, #0, #1
; CHECK-NEXT:    ubfx r3, r2, #4, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r1, r3, #1, #1
; CHECK-NEXT:    ubfx r3, r2, #8, #1
; CHECK-NEXT:    ubfx r2, r2, #12, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r1, r3, #2, #1
; CHECK-NEXT:    rsbs r2, r2, #0
; CHECK-NEXT:    bfi r1, r2, #3, #1
; CHECK-NEXT:    lsls r2, r1, #31
; CHECK-NEXT:    itt ne
; CHECK-NEXT:    vmovne r2, s0
; CHECK-NEXT:    strbne r2, [r0]
; CHECK-NEXT:    lsls r2, r1, #30
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    vmovmi r2, s1
; CHECK-NEXT:    strbmi r2, [r0, #1]
; CHECK-NEXT:    lsls r2, r1, #29
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    vmovmi r2, s2
; CHECK-NEXT:    strbmi r2, [r0, #2]
; CHECK-NEXT:    lsls r1, r1, #28
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    vmovmi r1, s3
; CHECK-NEXT:    strbmi r1, [r0, #3]
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, <4 x i32>* %mask, align 4
  %1 = icmp sgt <4 x i32> %0, zeroinitializer
  %2 = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %src, i32 4, <4 x i1> %1, <4 x i32> undef)
  %3 = trunc <4 x i32> %2 to <4 x i8>
  call void @llvm.masked.store.v4i8(<4 x i8> %3, <4 x i8>* %dest, i32 1, <4 x i1> %1)
  ret void
}
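
; The same shape when truncating v4i32 to v4i16, with conditional strh
; stores at halfword offsets.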
define void @foo_trunc_v4i16_v4i32(<4 x i16> *%dest, <4 x i32> *%mask, <4 x i32> *%src) {
; CHECK-LABEL: foo_trunc_v4i16_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vpt.s32 gt, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r2]
; CHECK-NEXT:    vmrs r2, p0
; CHECK-NEXT:    and r1, r2, #1
; CHECK-NEXT:    rsbs r3, r1, #0
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    bfi r1, r3, #0, #1
; CHECK-NEXT:    ubfx r3, r2, #4, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r1, r3, #1, #1
; CHECK-NEXT:    ubfx r3, r2, #8, #1
; CHECK-NEXT:    ubfx r2, r2, #12, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r1, r3, #2, #1
; CHECK-NEXT:    rsbs r2, r2, #0
; CHECK-NEXT:    bfi r1, r2, #3, #1
; CHECK-NEXT:    lsls r2, r1, #31
; CHECK-NEXT:    itt ne
; CHECK-NEXT:    vmovne r2, s0
; CHECK-NEXT:    strhne r2, [r0]
; CHECK-NEXT:    lsls r2, r1, #30
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    vmovmi r2, s1
; CHECK-NEXT:    strhmi r2, [r0, #2]
; CHECK-NEXT:    lsls r2, r1, #29
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    vmovmi r2, s2
; CHECK-NEXT:    strhmi r2, [r0, #4]
; CHECK-NEXT:    lsls r1, r1, #28
; CHECK-NEXT:    itt mi
; CHECK-NEXT:    vmovmi r1, s3
; CHECK-NEXT:    strhmi r1, [r0, #6]
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, <4 x i32>* %mask, align 4
  %1 = icmp sgt <4 x i32> %0, zeroinitializer
  %2 = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %src, i32 4, <4 x i1> %1, <4 x i32> undef)
  %3 = trunc <4 x i32> %2 to <4 x i16>
  call void @llvm.masked.store.v4i16(<4 x i16> %3, <4 x i16>* %dest, i32 2, <4 x i1> %1)
  ret void
}
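
; Floating-point payloads reuse the integer predicated loads and stores:
; v4f32 folds into the same vptt.s32 + vldrwt/vstrwt sequence.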
define void @foo_v4f32_v4f32(<4 x float> *%dest, <4 x i32> *%mask, <4 x float> *%src) {
; CHECK-LABEL: foo_v4f32_v4f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vptt.s32 gt, q0, zr
; CHECK-NEXT:    vldrwt.u32 q0, [r2]
; CHECK-NEXT:    vstrwt.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, <4 x i32>* %mask, align 4
  %1 = icmp sgt <4 x i32> %0, zeroinitializer
  %2 = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %src, i32 4, <4 x i1> %1, <4 x float> undef)
  call void @llvm.masked.store.v4f32(<4 x float> %2, <4 x float>* %dest, i32 4, <4 x i1> %1)
  ret void
}
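
; Likewise v8f16 folds into vptt.s16 + vldrht/vstrht.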
define void @foo_v8f16_v8f16(<8 x half> *%dest, <8 x i16> *%mask, <8 x half> *%src) {
; CHECK-LABEL: foo_v8f16_v8f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q0, [r1]
; CHECK-NEXT:    vptt.s16 gt, q0, zr
; CHECK-NEXT:    vldrht.u16 q0, [r2]
; CHECK-NEXT:    vstrht.16 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x i16>, <8 x i16>* %mask, align 2
  %1 = icmp sgt <8 x i16> %0, zeroinitializer
  %2 = call <8 x half> @llvm.masked.load.v8f16(<8 x half>* %src, i32 2, <8 x i1> %1, <8 x half> undef)
  call void @llvm.masked.store.v8f16(<8 x half> %2, <8 x half>* %dest, i32 2, <8 x i1> %1)
  ret void
}

declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare void @llvm.masked.store.v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
declare void @llvm.masked.store.v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
declare void @llvm.masked.store.v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
declare <8 x i16> @llvm.masked.load.v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
declare <8 x half> @llvm.masked.load.v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>)

declare void @llvm.masked.store.v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>)
declare void @llvm.masked.store.v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>)
declare <4 x i16> @llvm.masked.load.v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
declare <4 x i8> @llvm.masked.load.v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)
declare <8 x i8> @llvm.masked.load.v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)