1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
3 ; RUN: llc -mtriple=thumbebv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
5 define i8* @ldrwu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
6 ; CHECK-LABEL: ldrwu32_4:
7 ; CHECK: @ %bb.0: @ %entry
8 ; CHECK-NEXT: vldrw.u32 q0, [r2]
9 ; CHECK-NEXT: vpt.i32 ne, q0, zr
10 ; CHECK-NEXT: vldrwt.u32 q0, [r0, #4]
11 ; CHECK-NEXT: vstrw.32 q0, [r1]
14 %z = getelementptr inbounds i8, i8* %x, i32 4
15 %0 = bitcast i8* %z to <4 x i32>*
16 %mask = load <4 x i32>, <4 x i32>* %m, align 4
17 %c = icmp ne <4 x i32> %mask, zeroinitializer
18 %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
19 %2 = bitcast i8* %y to <4 x i32>*
20 store <4 x i32> %1, <4 x i32>* %2, align 4
24 define i8* @ldrwu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
25 ; CHECK-LABEL: ldrwu32_3:
26 ; CHECK: @ %bb.0: @ %entry
27 ; CHECK-NEXT: vldrw.u32 q0, [r2]
28 ; CHECK-NEXT: adds r3, r0, #3
29 ; CHECK-NEXT: vpt.i32 ne, q0, zr
30 ; CHECK-NEXT: vldrwt.u32 q0, [r3]
31 ; CHECK-NEXT: vstrw.32 q0, [r1]
34 %z = getelementptr inbounds i8, i8* %x, i32 3
35 %0 = bitcast i8* %z to <4 x i32>*
36 %mask = load <4 x i32>, <4 x i32>* %m, align 4
37 %c = icmp ne <4 x i32> %mask, zeroinitializer
38 %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
39 %2 = bitcast i8* %y to <4 x i32>*
40 store <4 x i32> %1, <4 x i32>* %2, align 4
44 define i8* @ldrwu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
45 ; CHECK-LABEL: ldrwu32_2:
46 ; CHECK: @ %bb.0: @ %entry
47 ; CHECK-NEXT: vldrw.u32 q0, [r2]
48 ; CHECK-NEXT: adds r3, r0, #2
49 ; CHECK-NEXT: vpt.i32 ne, q0, zr
50 ; CHECK-NEXT: vldrwt.u32 q0, [r3]
51 ; CHECK-NEXT: vstrw.32 q0, [r1]
54 %z = getelementptr inbounds i8, i8* %x, i32 2
55 %0 = bitcast i8* %z to <4 x i32>*
56 %mask = load <4 x i32>, <4 x i32>* %m, align 4
57 %c = icmp ne <4 x i32> %mask, zeroinitializer
58 %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
59 %2 = bitcast i8* %y to <4 x i32>*
60 store <4 x i32> %1, <4 x i32>* %2, align 4
64 define i8* @ldrwu32_508(i8* %x, i8* %y, <4 x i32> *%m) {
65 ; CHECK-LABEL: ldrwu32_508:
66 ; CHECK: @ %bb.0: @ %entry
67 ; CHECK-NEXT: vldrw.u32 q0, [r2]
68 ; CHECK-NEXT: vpt.i32 ne, q0, zr
69 ; CHECK-NEXT: vldrwt.u32 q0, [r0, #508]
70 ; CHECK-NEXT: vstrw.32 q0, [r1]
73 %z = getelementptr inbounds i8, i8* %x, i32 508
74 %0 = bitcast i8* %z to <4 x i32>*
75 %mask = load <4 x i32>, <4 x i32>* %m, align 4
76 %c = icmp ne <4 x i32> %mask, zeroinitializer
77 %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
78 %2 = bitcast i8* %y to <4 x i32>*
79 store <4 x i32> %1, <4 x i32>* %2, align 4
83 define i8* @ldrwu32_512(i8* %x, i8* %y, <4 x i32> *%m) {
84 ; CHECK-LABEL: ldrwu32_512:
85 ; CHECK: @ %bb.0: @ %entry
86 ; CHECK-NEXT: vldrw.u32 q0, [r2]
87 ; CHECK-NEXT: add.w r3, r0, #512
88 ; CHECK-NEXT: vpt.i32 ne, q0, zr
89 ; CHECK-NEXT: vldrwt.u32 q0, [r3]
90 ; CHECK-NEXT: vstrw.32 q0, [r1]
93 %z = getelementptr inbounds i8, i8* %x, i32 512
94 %0 = bitcast i8* %z to <4 x i32>*
95 %mask = load <4 x i32>, <4 x i32>* %m, align 4
96 %c = icmp ne <4 x i32> %mask, zeroinitializer
97 %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
98 %2 = bitcast i8* %y to <4 x i32>*
99 store <4 x i32> %1, <4 x i32>* %2, align 4
103 define i8* @ldrwu32_m508(i8* %x, i8* %y, <4 x i32> *%m) {
104 ; CHECK-LABEL: ldrwu32_m508:
105 ; CHECK: @ %bb.0: @ %entry
106 ; CHECK-NEXT: vldrw.u32 q0, [r2]
107 ; CHECK-NEXT: vpt.i32 ne, q0, zr
108 ; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508]
109 ; CHECK-NEXT: vstrw.32 q0, [r1]
112 %z = getelementptr inbounds i8, i8* %x, i32 -508
113 %0 = bitcast i8* %z to <4 x i32>*
114 %mask = load <4 x i32>, <4 x i32>* %m, align 4
115 %c = icmp ne <4 x i32> %mask, zeroinitializer
116 %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
117 %2 = bitcast i8* %y to <4 x i32>*
118 store <4 x i32> %1, <4 x i32>* %2, align 4
122 define i8* @ldrwu32_m512(i8* %x, i8* %y, <4 x i32> *%m) {
123 ; CHECK-LABEL: ldrwu32_m512:
124 ; CHECK: @ %bb.0: @ %entry
125 ; CHECK-NEXT: vldrw.u32 q0, [r2]
126 ; CHECK-NEXT: sub.w r3, r0, #512
127 ; CHECK-NEXT: vpt.i32 ne, q0, zr
128 ; CHECK-NEXT: vldrwt.u32 q0, [r3]
129 ; CHECK-NEXT: vstrw.32 q0, [r1]
132 %z = getelementptr inbounds i8, i8* %x, i32 -512
133 %0 = bitcast i8* %z to <4 x i32>*
134 %mask = load <4 x i32>, <4 x i32>* %m, align 4
135 %c = icmp ne <4 x i32> %mask, zeroinitializer
136 %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
137 %2 = bitcast i8* %y to <4 x i32>*
138 store <4 x i32> %1, <4 x i32>* %2, align 4
142 define i8* @ldrhu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
143 ; CHECK-LABEL: ldrhu32_4:
144 ; CHECK: @ %bb.0: @ %entry
145 ; CHECK-NEXT: vldrw.u32 q0, [r2]
146 ; CHECK-NEXT: vpt.i32 ne, q0, zr
147 ; CHECK-NEXT: vldrht.u32 q0, [r0, #4]
148 ; CHECK-NEXT: vstrw.32 q0, [r1]
151 %z = getelementptr inbounds i8, i8* %x, i32 4
152 %0 = bitcast i8* %z to <4 x i16>*
153 %mask = load <4 x i32>, <4 x i32>* %m, align 4
154 %c = icmp ne <4 x i32> %mask, zeroinitializer
155 %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
156 %2 = zext <4 x i16> %1 to <4 x i32>
157 %3 = bitcast i8* %y to <4 x i32>*
158 store <4 x i32> %2, <4 x i32>* %3, align 4
162 define i8* @ldrhu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
163 ; CHECK-LABEL: ldrhu32_3:
164 ; CHECK: @ %bb.0: @ %entry
165 ; CHECK-NEXT: vldrw.u32 q0, [r2]
166 ; CHECK-NEXT: adds r3, r0, #3
167 ; CHECK-NEXT: vpt.i32 ne, q0, zr
168 ; CHECK-NEXT: vldrht.u32 q0, [r3]
169 ; CHECK-NEXT: vstrw.32 q0, [r1]
172 %z = getelementptr inbounds i8, i8* %x, i32 3
173 %0 = bitcast i8* %z to <4 x i16>*
174 %mask = load <4 x i32>, <4 x i32>* %m, align 4
175 %c = icmp ne <4 x i32> %mask, zeroinitializer
176 %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
177 %2 = zext <4 x i16> %1 to <4 x i32>
178 %3 = bitcast i8* %y to <4 x i32>*
179 store <4 x i32> %2, <4 x i32>* %3, align 4
183 define i8* @ldrhu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
184 ; CHECK-LABEL: ldrhu32_2:
185 ; CHECK: @ %bb.0: @ %entry
186 ; CHECK-NEXT: vldrw.u32 q0, [r2]
187 ; CHECK-NEXT: vpt.i32 ne, q0, zr
188 ; CHECK-NEXT: vldrht.u32 q0, [r0, #2]
189 ; CHECK-NEXT: vstrw.32 q0, [r1]
192 %z = getelementptr inbounds i8, i8* %x, i32 2
193 %0 = bitcast i8* %z to <4 x i16>*
194 %mask = load <4 x i32>, <4 x i32>* %m, align 4
195 %c = icmp ne <4 x i32> %mask, zeroinitializer
196 %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
197 %2 = zext <4 x i16> %1 to <4 x i32>
198 %3 = bitcast i8* %y to <4 x i32>*
199 store <4 x i32> %2, <4 x i32>* %3, align 4
203 define i8* @ldrhu32_254(i8* %x, i8* %y, <4 x i32> *%m) {
204 ; CHECK-LABEL: ldrhu32_254:
205 ; CHECK: @ %bb.0: @ %entry
206 ; CHECK-NEXT: vldrw.u32 q0, [r2]
207 ; CHECK-NEXT: vpt.i32 ne, q0, zr
208 ; CHECK-NEXT: vldrht.u32 q0, [r0, #254]
209 ; CHECK-NEXT: vstrw.32 q0, [r1]
212 %z = getelementptr inbounds i8, i8* %x, i32 254
213 %0 = bitcast i8* %z to <4 x i16>*
214 %mask = load <4 x i32>, <4 x i32>* %m, align 4
215 %c = icmp ne <4 x i32> %mask, zeroinitializer
216 %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
217 %2 = zext <4 x i16> %1 to <4 x i32>
218 %3 = bitcast i8* %y to <4 x i32>*
219 store <4 x i32> %2, <4 x i32>* %3, align 4
223 define i8* @ldrhu32_256(i8* %x, i8* %y, <4 x i32> *%m) {
224 ; CHECK-LABEL: ldrhu32_256:
225 ; CHECK: @ %bb.0: @ %entry
226 ; CHECK-NEXT: vldrw.u32 q0, [r2]
227 ; CHECK-NEXT: add.w r3, r0, #256
228 ; CHECK-NEXT: vpt.i32 ne, q0, zr
229 ; CHECK-NEXT: vldrht.u32 q0, [r3]
230 ; CHECK-NEXT: vstrw.32 q0, [r1]
233 %z = getelementptr inbounds i8, i8* %x, i32 256
234 %0 = bitcast i8* %z to <4 x i16>*
235 %mask = load <4 x i32>, <4 x i32>* %m, align 4
236 %c = icmp ne <4 x i32> %mask, zeroinitializer
237 %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
238 %2 = zext <4 x i16> %1 to <4 x i32>
239 %3 = bitcast i8* %y to <4 x i32>*
240 store <4 x i32> %2, <4 x i32>* %3, align 4
244 define i8* @ldrhu32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
245 ; CHECK-LABEL: ldrhu32_m254:
246 ; CHECK: @ %bb.0: @ %entry
247 ; CHECK-NEXT: vldrw.u32 q0, [r2]
248 ; CHECK-NEXT: vpt.i32 ne, q0, zr
249 ; CHECK-NEXT: vldrht.u32 q0, [r0, #-254]
250 ; CHECK-NEXT: vstrw.32 q0, [r1]
253 %z = getelementptr inbounds i8, i8* %x, i32 -254
254 %0 = bitcast i8* %z to <4 x i16>*
255 %mask = load <4 x i32>, <4 x i32>* %m, align 4
256 %c = icmp ne <4 x i32> %mask, zeroinitializer
257 %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
258 %2 = zext <4 x i16> %1 to <4 x i32>
259 %3 = bitcast i8* %y to <4 x i32>*
260 store <4 x i32> %2, <4 x i32>* %3, align 4
264 define i8* @ldrhu32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
265 ; CHECK-LABEL: ldrhu32_m256:
266 ; CHECK: @ %bb.0: @ %entry
267 ; CHECK-NEXT: vldrw.u32 q0, [r2]
268 ; CHECK-NEXT: sub.w r3, r0, #256
269 ; CHECK-NEXT: vpt.i32 ne, q0, zr
270 ; CHECK-NEXT: vldrht.u32 q0, [r3]
271 ; CHECK-NEXT: vstrw.32 q0, [r1]
274 %z = getelementptr inbounds i8, i8* %x, i32 -256
275 %0 = bitcast i8* %z to <4 x i16>*
276 %mask = load <4 x i32>, <4 x i32>* %m, align 4
277 %c = icmp ne <4 x i32> %mask, zeroinitializer
278 %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
279 %2 = zext <4 x i16> %1 to <4 x i32>
280 %3 = bitcast i8* %y to <4 x i32>*
281 store <4 x i32> %2, <4 x i32>* %3, align 4
285 define i8* @ldrhs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
286 ; CHECK-LABEL: ldrhs32_4:
287 ; CHECK: @ %bb.0: @ %entry
288 ; CHECK-NEXT: vldrw.u32 q0, [r2]
289 ; CHECK-NEXT: vpt.i32 ne, q0, zr
290 ; CHECK-NEXT: vldrht.s32 q0, [r0, #4]
291 ; CHECK-NEXT: vstrw.32 q0, [r1]
294 %z = getelementptr inbounds i8, i8* %x, i32 4
295 %0 = bitcast i8* %z to <4 x i16>*
296 %mask = load <4 x i32>, <4 x i32>* %m, align 4
297 %c = icmp ne <4 x i32> %mask, zeroinitializer
298 %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
299 %2 = sext <4 x i16> %1 to <4 x i32>
300 %3 = bitcast i8* %y to <4 x i32>*
301 store <4 x i32> %2, <4 x i32>* %3, align 4
305 define i8* @ldrhs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
306 ; CHECK-LABEL: ldrhs32_3:
307 ; CHECK: @ %bb.0: @ %entry
308 ; CHECK-NEXT: vldrw.u32 q0, [r2]
309 ; CHECK-NEXT: adds r3, r0, #3
310 ; CHECK-NEXT: vpt.i32 ne, q0, zr
311 ; CHECK-NEXT: vldrht.s32 q0, [r3]
312 ; CHECK-NEXT: vstrw.32 q0, [r1]
315 %z = getelementptr inbounds i8, i8* %x, i32 3
316 %0 = bitcast i8* %z to <4 x i16>*
317 %mask = load <4 x i32>, <4 x i32>* %m, align 4
318 %c = icmp ne <4 x i32> %mask, zeroinitializer
319 %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
320 %2 = sext <4 x i16> %1 to <4 x i32>
321 %3 = bitcast i8* %y to <4 x i32>*
322 store <4 x i32> %2, <4 x i32>* %3, align 4
326 define i8* @ldrhs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
327 ; CHECK-LABEL: ldrhs32_2:
328 ; CHECK: @ %bb.0: @ %entry
329 ; CHECK-NEXT: vldrw.u32 q0, [r2]
330 ; CHECK-NEXT: vpt.i32 ne, q0, zr
331 ; CHECK-NEXT: vldrht.s32 q0, [r0, #2]
332 ; CHECK-NEXT: vstrw.32 q0, [r1]
335 %z = getelementptr inbounds i8, i8* %x, i32 2
336 %0 = bitcast i8* %z to <4 x i16>*
337 %mask = load <4 x i32>, <4 x i32>* %m, align 4
338 %c = icmp ne <4 x i32> %mask, zeroinitializer
339 %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
340 %2 = sext <4 x i16> %1 to <4 x i32>
341 %3 = bitcast i8* %y to <4 x i32>*
342 store <4 x i32> %2, <4 x i32>* %3, align 4
346 define i8* @ldrhs32_254(i8* %x, i8* %y, <4 x i32> *%m) {
347 ; CHECK-LABEL: ldrhs32_254:
348 ; CHECK: @ %bb.0: @ %entry
349 ; CHECK-NEXT: vldrw.u32 q0, [r2]
350 ; CHECK-NEXT: vpt.i32 ne, q0, zr
351 ; CHECK-NEXT: vldrht.s32 q0, [r0, #254]
352 ; CHECK-NEXT: vstrw.32 q0, [r1]
355 %z = getelementptr inbounds i8, i8* %x, i32 254
356 %0 = bitcast i8* %z to <4 x i16>*
357 %mask = load <4 x i32>, <4 x i32>* %m, align 4
358 %c = icmp ne <4 x i32> %mask, zeroinitializer
359 %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
360 %2 = sext <4 x i16> %1 to <4 x i32>
361 %3 = bitcast i8* %y to <4 x i32>*
362 store <4 x i32> %2, <4 x i32>* %3, align 4
366 define i8* @ldrhs32_256(i8* %x, i8* %y, <4 x i32> *%m) {
367 ; CHECK-LABEL: ldrhs32_256:
368 ; CHECK: @ %bb.0: @ %entry
369 ; CHECK-NEXT: vldrw.u32 q0, [r2]
370 ; CHECK-NEXT: add.w r3, r0, #256
371 ; CHECK-NEXT: vpt.i32 ne, q0, zr
372 ; CHECK-NEXT: vldrht.s32 q0, [r3]
373 ; CHECK-NEXT: vstrw.32 q0, [r1]
376 %z = getelementptr inbounds i8, i8* %x, i32 256
377 %0 = bitcast i8* %z to <4 x i16>*
378 %mask = load <4 x i32>, <4 x i32>* %m, align 4
379 %c = icmp ne <4 x i32> %mask, zeroinitializer
380 %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
381 %2 = sext <4 x i16> %1 to <4 x i32>
382 %3 = bitcast i8* %y to <4 x i32>*
383 store <4 x i32> %2, <4 x i32>* %3, align 4
387 define i8* @ldrhs32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
388 ; CHECK-LABEL: ldrhs32_m254:
389 ; CHECK: @ %bb.0: @ %entry
390 ; CHECK-NEXT: vldrw.u32 q0, [r2]
391 ; CHECK-NEXT: vpt.i32 ne, q0, zr
392 ; CHECK-NEXT: vldrht.s32 q0, [r0, #-254]
393 ; CHECK-NEXT: vstrw.32 q0, [r1]
396 %z = getelementptr inbounds i8, i8* %x, i32 -254
397 %0 = bitcast i8* %z to <4 x i16>*
398 %mask = load <4 x i32>, <4 x i32>* %m, align 4
399 %c = icmp ne <4 x i32> %mask, zeroinitializer
400 %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
401 %2 = sext <4 x i16> %1 to <4 x i32>
402 %3 = bitcast i8* %y to <4 x i32>*
403 store <4 x i32> %2, <4 x i32>* %3, align 4
407 define i8* @ldrhs32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
408 ; CHECK-LABEL: ldrhs32_m256:
409 ; CHECK: @ %bb.0: @ %entry
410 ; CHECK-NEXT: vldrw.u32 q0, [r2]
411 ; CHECK-NEXT: sub.w r3, r0, #256
412 ; CHECK-NEXT: vpt.i32 ne, q0, zr
413 ; CHECK-NEXT: vldrht.s32 q0, [r3]
414 ; CHECK-NEXT: vstrw.32 q0, [r1]
417 %z = getelementptr inbounds i8, i8* %x, i32 -256
418 %0 = bitcast i8* %z to <4 x i16>*
419 %mask = load <4 x i32>, <4 x i32>* %m, align 4
420 %c = icmp ne <4 x i32> %mask, zeroinitializer
421 %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
422 %2 = sext <4 x i16> %1 to <4 x i32>
423 %3 = bitcast i8* %y to <4 x i32>*
424 store <4 x i32> %2, <4 x i32>* %3, align 4
428 define i8* @ldrhu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
429 ; CHECK-LABEL: ldrhu16_4:
430 ; CHECK: @ %bb.0: @ %entry
431 ; CHECK-NEXT: vldrh.u16 q0, [r2]
432 ; CHECK-NEXT: vpt.i16 ne, q0, zr
433 ; CHECK-NEXT: vldrht.u16 q0, [r0, #4]
434 ; CHECK-NEXT: vstrh.16 q0, [r1]
437 %z = getelementptr inbounds i8, i8* %x, i32 4
438 %0 = bitcast i8* %z to <8 x i16>*
439 %mask = load <8 x i16>, <8 x i16>* %m, align 2
440 %c = icmp ne <8 x i16> %mask, zeroinitializer
441 %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
442 %2 = bitcast i8* %y to <8 x i16>*
443 store <8 x i16> %1, <8 x i16>* %2, align 2
447 define i8* @ldrhu16_3(i8* %x, i8* %y, <8 x i16> *%m) {
448 ; CHECK-LABEL: ldrhu16_3:
449 ; CHECK: @ %bb.0: @ %entry
450 ; CHECK-NEXT: vldrh.u16 q0, [r2]
451 ; CHECK-NEXT: adds r3, r0, #3
452 ; CHECK-NEXT: vpt.i16 ne, q0, zr
453 ; CHECK-NEXT: vldrht.u16 q0, [r3]
454 ; CHECK-NEXT: vstrh.16 q0, [r1]
457 %z = getelementptr inbounds i8, i8* %x, i32 3
458 %0 = bitcast i8* %z to <8 x i16>*
459 %mask = load <8 x i16>, <8 x i16>* %m, align 2
460 %c = icmp ne <8 x i16> %mask, zeroinitializer
461 %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
462 %2 = bitcast i8* %y to <8 x i16>*
463 store <8 x i16> %1, <8 x i16>* %2, align 2
467 define i8* @ldrhu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
468 ; CHECK-LABEL: ldrhu16_2:
469 ; CHECK: @ %bb.0: @ %entry
470 ; CHECK-NEXT: vldrh.u16 q0, [r2]
471 ; CHECK-NEXT: vpt.i16 ne, q0, zr
472 ; CHECK-NEXT: vldrht.u16 q0, [r0, #2]
473 ; CHECK-NEXT: vstrh.16 q0, [r1]
476 %z = getelementptr inbounds i8, i8* %x, i32 2
477 %0 = bitcast i8* %z to <8 x i16>*
478 %mask = load <8 x i16>, <8 x i16>* %m, align 2
479 %c = icmp ne <8 x i16> %mask, zeroinitializer
480 %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
481 %2 = bitcast i8* %y to <8 x i16>*
482 store <8 x i16> %1, <8 x i16>* %2, align 2
486 define i8* @ldrhu16_254(i8* %x, i8* %y, <8 x i16> *%m) {
487 ; CHECK-LABEL: ldrhu16_254:
488 ; CHECK: @ %bb.0: @ %entry
489 ; CHECK-NEXT: vldrh.u16 q0, [r2]
490 ; CHECK-NEXT: vpt.i16 ne, q0, zr
491 ; CHECK-NEXT: vldrht.u16 q0, [r0, #254]
492 ; CHECK-NEXT: vstrh.16 q0, [r1]
495 %z = getelementptr inbounds i8, i8* %x, i32 254
496 %0 = bitcast i8* %z to <8 x i16>*
497 %mask = load <8 x i16>, <8 x i16>* %m, align 2
498 %c = icmp ne <8 x i16> %mask, zeroinitializer
499 %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
500 %2 = bitcast i8* %y to <8 x i16>*
501 store <8 x i16> %1, <8 x i16>* %2, align 2
505 define i8* @ldrhu16_256(i8* %x, i8* %y, <8 x i16> *%m) {
506 ; CHECK-LABEL: ldrhu16_256:
507 ; CHECK: @ %bb.0: @ %entry
508 ; CHECK-NEXT: vldrh.u16 q0, [r2]
509 ; CHECK-NEXT: add.w r3, r0, #256
510 ; CHECK-NEXT: vpt.i16 ne, q0, zr
511 ; CHECK-NEXT: vldrht.u16 q0, [r3]
512 ; CHECK-NEXT: vstrh.16 q0, [r1]
515 %z = getelementptr inbounds i8, i8* %x, i32 256
516 %0 = bitcast i8* %z to <8 x i16>*
517 %mask = load <8 x i16>, <8 x i16>* %m, align 2
518 %c = icmp ne <8 x i16> %mask, zeroinitializer
519 %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
520 %2 = bitcast i8* %y to <8 x i16>*
521 store <8 x i16> %1, <8 x i16>* %2, align 2
525 define i8* @ldrhu16_m254(i8* %x, i8* %y, <8 x i16> *%m) {
526 ; CHECK-LABEL: ldrhu16_m254:
527 ; CHECK: @ %bb.0: @ %entry
528 ; CHECK-NEXT: vldrh.u16 q0, [r2]
529 ; CHECK-NEXT: vpt.i16 ne, q0, zr
530 ; CHECK-NEXT: vldrht.u16 q0, [r0, #-254]
531 ; CHECK-NEXT: vstrh.16 q0, [r1]
534 %z = getelementptr inbounds i8, i8* %x, i32 -254
535 %0 = bitcast i8* %z to <8 x i16>*
536 %mask = load <8 x i16>, <8 x i16>* %m, align 2
537 %c = icmp ne <8 x i16> %mask, zeroinitializer
538 %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
539 %2 = bitcast i8* %y to <8 x i16>*
540 store <8 x i16> %1, <8 x i16>* %2, align 2
544 define i8* @ldrhu16_m256(i8* %x, i8* %y, <8 x i16> *%m) {
545 ; CHECK-LABEL: ldrhu16_m256:
546 ; CHECK: @ %bb.0: @ %entry
547 ; CHECK-NEXT: vldrh.u16 q0, [r2]
548 ; CHECK-NEXT: sub.w r3, r0, #256
549 ; CHECK-NEXT: vpt.i16 ne, q0, zr
550 ; CHECK-NEXT: vldrht.u16 q0, [r3]
551 ; CHECK-NEXT: vstrh.16 q0, [r1]
554 %z = getelementptr inbounds i8, i8* %x, i32 -256
555 %0 = bitcast i8* %z to <8 x i16>*
556 %mask = load <8 x i16>, <8 x i16>* %m, align 2
557 %c = icmp ne <8 x i16> %mask, zeroinitializer
558 %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
559 %2 = bitcast i8* %y to <8 x i16>*
560 store <8 x i16> %1, <8 x i16>* %2, align 2
564 define i8* @ldrbu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
565 ; CHECK-LABEL: ldrbu32_4:
566 ; CHECK: @ %bb.0: @ %entry
567 ; CHECK-NEXT: vldrw.u32 q0, [r2]
568 ; CHECK-NEXT: vpt.i32 ne, q0, zr
569 ; CHECK-NEXT: vldrbt.u32 q0, [r0, #4]
570 ; CHECK-NEXT: vstrw.32 q0, [r1]
573 %z = getelementptr inbounds i8, i8* %x, i32 4
574 %0 = bitcast i8* %z to <4 x i8>*
575 %mask = load <4 x i32>, <4 x i32>* %m, align 4
576 %c = icmp ne <4 x i32> %mask, zeroinitializer
577 %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
578 %2 = zext <4 x i8> %1 to <4 x i32>
579 %3 = bitcast i8* %y to <4 x i32>*
580 store <4 x i32> %2, <4 x i32>* %3, align 4
584 define i8* @ldrbu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
585 ; CHECK-LABEL: ldrbu32_3:
586 ; CHECK: @ %bb.0: @ %entry
587 ; CHECK-NEXT: vldrw.u32 q0, [r2]
588 ; CHECK-NEXT: vpt.i32 ne, q0, zr
589 ; CHECK-NEXT: vldrbt.u32 q0, [r0, #3]
590 ; CHECK-NEXT: vstrw.32 q0, [r1]
593 %z = getelementptr inbounds i8, i8* %x, i32 3
594 %0 = bitcast i8* %z to <4 x i8>*
595 %mask = load <4 x i32>, <4 x i32>* %m, align 4
596 %c = icmp ne <4 x i32> %mask, zeroinitializer
597 %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
598 %2 = zext <4 x i8> %1 to <4 x i32>
599 %3 = bitcast i8* %y to <4 x i32>*
600 store <4 x i32> %2, <4 x i32>* %3, align 4
604 define i8* @ldrbu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
605 ; CHECK-LABEL: ldrbu32_2:
606 ; CHECK: @ %bb.0: @ %entry
607 ; CHECK-NEXT: vldrw.u32 q0, [r2]
608 ; CHECK-NEXT: vpt.i32 ne, q0, zr
609 ; CHECK-NEXT: vldrbt.u32 q0, [r0, #2]
610 ; CHECK-NEXT: vstrw.32 q0, [r1]
613 %z = getelementptr inbounds i8, i8* %x, i32 2
614 %0 = bitcast i8* %z to <4 x i8>*
615 %mask = load <4 x i32>, <4 x i32>* %m, align 4
616 %c = icmp ne <4 x i32> %mask, zeroinitializer
617 %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
618 %2 = zext <4 x i8> %1 to <4 x i32>
619 %3 = bitcast i8* %y to <4 x i32>*
620 store <4 x i32> %2, <4 x i32>* %3, align 4
624 define i8* @ldrbu32_127(i8* %x, i8* %y, <4 x i32> *%m) {
625 ; CHECK-LABEL: ldrbu32_127:
626 ; CHECK: @ %bb.0: @ %entry
627 ; CHECK-NEXT: vldrw.u32 q0, [r2]
628 ; CHECK-NEXT: vpt.i32 ne, q0, zr
629 ; CHECK-NEXT: vldrbt.u32 q0, [r0, #127]
630 ; CHECK-NEXT: vstrw.32 q0, [r1]
633 %z = getelementptr inbounds i8, i8* %x, i32 127
634 %0 = bitcast i8* %z to <4 x i8>*
635 %mask = load <4 x i32>, <4 x i32>* %m, align 4
636 %c = icmp ne <4 x i32> %mask, zeroinitializer
637 %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
638 %2 = zext <4 x i8> %1 to <4 x i32>
639 %3 = bitcast i8* %y to <4 x i32>*
640 store <4 x i32> %2, <4 x i32>* %3, align 4
644 define i8* @ldrbu32_128(i8* %x, i8* %y, <4 x i32> *%m) {
645 ; CHECK-LABEL: ldrbu32_128:
646 ; CHECK: @ %bb.0: @ %entry
647 ; CHECK-NEXT: vldrw.u32 q0, [r2]
648 ; CHECK-NEXT: add.w r3, r0, #128
649 ; CHECK-NEXT: vpt.i32 ne, q0, zr
650 ; CHECK-NEXT: vldrbt.u32 q0, [r3]
651 ; CHECK-NEXT: vstrw.32 q0, [r1]
654 %z = getelementptr inbounds i8, i8* %x, i32 128
655 %0 = bitcast i8* %z to <4 x i8>*
656 %mask = load <4 x i32>, <4 x i32>* %m, align 4
657 %c = icmp ne <4 x i32> %mask, zeroinitializer
658 %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
659 %2 = zext <4 x i8> %1 to <4 x i32>
660 %3 = bitcast i8* %y to <4 x i32>*
661 store <4 x i32> %2, <4 x i32>* %3, align 4
665 define i8* @ldrbu32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
666 ; CHECK-LABEL: ldrbu32_m127:
667 ; CHECK: @ %bb.0: @ %entry
668 ; CHECK-NEXT: vldrw.u32 q0, [r2]
669 ; CHECK-NEXT: vpt.i32 ne, q0, zr
670 ; CHECK-NEXT: vldrbt.u32 q0, [r0, #-127]
671 ; CHECK-NEXT: vstrw.32 q0, [r1]
674 %z = getelementptr inbounds i8, i8* %x, i32 -127
675 %0 = bitcast i8* %z to <4 x i8>*
676 %mask = load <4 x i32>, <4 x i32>* %m, align 4
677 %c = icmp ne <4 x i32> %mask, zeroinitializer
678 %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
679 %2 = zext <4 x i8> %1 to <4 x i32>
680 %3 = bitcast i8* %y to <4 x i32>*
681 store <4 x i32> %2, <4 x i32>* %3, align 4
685 define i8* @ldrbu32_m128(i8* %x, i8* %y, <4 x i32> *%m) {
686 ; CHECK-LABEL: ldrbu32_m128:
687 ; CHECK: @ %bb.0: @ %entry
688 ; CHECK-NEXT: vldrw.u32 q0, [r2]
689 ; CHECK-NEXT: sub.w r3, r0, #128
690 ; CHECK-NEXT: vpt.i32 ne, q0, zr
691 ; CHECK-NEXT: vldrbt.u32 q0, [r3]
692 ; CHECK-NEXT: vstrw.32 q0, [r1]
695 %z = getelementptr inbounds i8, i8* %x, i32 -128
696 %0 = bitcast i8* %z to <4 x i8>*
697 %mask = load <4 x i32>, <4 x i32>* %m, align 4
698 %c = icmp ne <4 x i32> %mask, zeroinitializer
699 %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
700 %2 = zext <4 x i8> %1 to <4 x i32>
701 %3 = bitcast i8* %y to <4 x i32>*
702 store <4 x i32> %2, <4 x i32>* %3, align 4
706 define i8* @ldrbs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
707 ; CHECK-LABEL: ldrbs32_4:
708 ; CHECK: @ %bb.0: @ %entry
709 ; CHECK-NEXT: vldrw.u32 q0, [r2]
710 ; CHECK-NEXT: vpt.i32 ne, q0, zr
711 ; CHECK-NEXT: vldrbt.s32 q0, [r0, #4]
712 ; CHECK-NEXT: vstrw.32 q0, [r1]
715 %z = getelementptr inbounds i8, i8* %x, i32 4
716 %0 = bitcast i8* %z to <4 x i8>*
717 %mask = load <4 x i32>, <4 x i32>* %m, align 4
718 %c = icmp ne <4 x i32> %mask, zeroinitializer
719 %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
720 %2 = sext <4 x i8> %1 to <4 x i32>
721 %3 = bitcast i8* %y to <4 x i32>*
722 store <4 x i32> %2, <4 x i32>* %3, align 4
726 define i8* @ldrbs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
727 ; CHECK-LABEL: ldrbs32_3:
728 ; CHECK: @ %bb.0: @ %entry
729 ; CHECK-NEXT: vldrw.u32 q0, [r2]
730 ; CHECK-NEXT: vpt.i32 ne, q0, zr
731 ; CHECK-NEXT: vldrbt.s32 q0, [r0, #3]
732 ; CHECK-NEXT: vstrw.32 q0, [r1]
735 %z = getelementptr inbounds i8, i8* %x, i32 3
736 %0 = bitcast i8* %z to <4 x i8>*
737 %mask = load <4 x i32>, <4 x i32>* %m, align 4
738 %c = icmp ne <4 x i32> %mask, zeroinitializer
739 %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
740 %2 = sext <4 x i8> %1 to <4 x i32>
741 %3 = bitcast i8* %y to <4 x i32>*
742 store <4 x i32> %2, <4 x i32>* %3, align 4
746 define i8* @ldrbs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
747 ; CHECK-LABEL: ldrbs32_2:
748 ; CHECK: @ %bb.0: @ %entry
749 ; CHECK-NEXT: vldrw.u32 q0, [r2]
750 ; CHECK-NEXT: vpt.i32 ne, q0, zr
751 ; CHECK-NEXT: vldrbt.s32 q0, [r0, #2]
752 ; CHECK-NEXT: vstrw.32 q0, [r1]
755 %z = getelementptr inbounds i8, i8* %x, i32 2
756 %0 = bitcast i8* %z to <4 x i8>*
757 %mask = load <4 x i32>, <4 x i32>* %m, align 4
758 %c = icmp ne <4 x i32> %mask, zeroinitializer
759 %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
760 %2 = sext <4 x i8> %1 to <4 x i32>
761 %3 = bitcast i8* %y to <4 x i32>*
762 store <4 x i32> %2, <4 x i32>* %3, align 4
766 define i8* @ldrbs32_127(i8* %x, i8* %y, <4 x i32> *%m) {
767 ; CHECK-LABEL: ldrbs32_127:
768 ; CHECK: @ %bb.0: @ %entry
769 ; CHECK-NEXT: vldrw.u32 q0, [r2]
770 ; CHECK-NEXT: vpt.i32 ne, q0, zr
771 ; CHECK-NEXT: vldrbt.s32 q0, [r0, #127]
772 ; CHECK-NEXT: vstrw.32 q0, [r1]
775 %z = getelementptr inbounds i8, i8* %x, i32 127
776 %0 = bitcast i8* %z to <4 x i8>*
777 %mask = load <4 x i32>, <4 x i32>* %m, align 4
778 %c = icmp ne <4 x i32> %mask, zeroinitializer
779 %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
780 %2 = sext <4 x i8> %1 to <4 x i32>
781 %3 = bitcast i8* %y to <4 x i32>*
782 store <4 x i32> %2, <4 x i32>* %3, align 4
786 define i8* @ldrbs32_128(i8* %x, i8* %y, <4 x i32> *%m) {
787 ; CHECK-LABEL: ldrbs32_128:
788 ; CHECK: @ %bb.0: @ %entry
789 ; CHECK-NEXT: vldrw.u32 q0, [r2]
790 ; CHECK-NEXT: add.w r3, r0, #128
791 ; CHECK-NEXT: vpt.i32 ne, q0, zr
792 ; CHECK-NEXT: vldrbt.s32 q0, [r3]
793 ; CHECK-NEXT: vstrw.32 q0, [r1]
796 %z = getelementptr inbounds i8, i8* %x, i32 128
797 %0 = bitcast i8* %z to <4 x i8>*
798 %mask = load <4 x i32>, <4 x i32>* %m, align 4
799 %c = icmp ne <4 x i32> %mask, zeroinitializer
800 %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
801 %2 = sext <4 x i8> %1 to <4 x i32>
802 %3 = bitcast i8* %y to <4 x i32>*
803 store <4 x i32> %2, <4 x i32>* %3, align 4
807 define i8* @ldrbs32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
808 ; CHECK-LABEL: ldrbs32_m127:
809 ; CHECK: @ %bb.0: @ %entry
810 ; CHECK-NEXT: vldrw.u32 q0, [r2]
811 ; CHECK-NEXT: vpt.i32 ne, q0, zr
812 ; CHECK-NEXT: vldrbt.s32 q0, [r0, #-127]
813 ; CHECK-NEXT: vstrw.32 q0, [r1]
816 %z = getelementptr inbounds i8, i8* %x, i32 -127
817 %0 = bitcast i8* %z to <4 x i8>*
818 %mask = load <4 x i32>, <4 x i32>* %m, align 4
819 %c = icmp ne <4 x i32> %mask, zeroinitializer
820 %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
821 %2 = sext <4 x i8> %1 to <4 x i32>
822 %3 = bitcast i8* %y to <4 x i32>*
823 store <4 x i32> %2, <4 x i32>* %3, align 4
827 define i8* @ldrbs32_m128(i8* %x, i8* %y, <4 x i32> *%m) {
828 ; CHECK-LABEL: ldrbs32_m128:
829 ; CHECK: @ %bb.0: @ %entry
830 ; CHECK-NEXT: vldrw.u32 q0, [r2]
831 ; CHECK-NEXT: sub.w r3, r0, #128
832 ; CHECK-NEXT: vpt.i32 ne, q0, zr
833 ; CHECK-NEXT: vldrbt.s32 q0, [r3]
834 ; CHECK-NEXT: vstrw.32 q0, [r1]
837 %z = getelementptr inbounds i8, i8* %x, i32 -128
838 %0 = bitcast i8* %z to <4 x i8>*
839 %mask = load <4 x i32>, <4 x i32>* %m, align 4
840 %c = icmp ne <4 x i32> %mask, zeroinitializer
841 %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
842 %2 = sext <4 x i8> %1 to <4 x i32>
843 %3 = bitcast i8* %y to <4 x i32>*
844 store <4 x i32> %2, <4 x i32>* %3, align 4
848 define i8* @ldrbu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
849 ; CHECK-LABEL: ldrbu16_4:
850 ; CHECK: @ %bb.0: @ %entry
851 ; CHECK-NEXT: vldrh.u16 q0, [r2]
852 ; CHECK-NEXT: vpt.i16 ne, q0, zr
853 ; CHECK-NEXT: vldrbt.u16 q0, [r0, #4]
854 ; CHECK-NEXT: vstrh.16 q0, [r1]
857 %z = getelementptr inbounds i8, i8* %x, i32 4
858 %0 = bitcast i8* %z to <8 x i8>*
859 %mask = load <8 x i16>, <8 x i16>* %m, align 2
860 %c = icmp ne <8 x i16> %mask, zeroinitializer
861 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
862 %2 = zext <8 x i8> %1 to <8 x i16>
863 %3 = bitcast i8* %y to <8 x i16>*
864 store <8 x i16> %2, <8 x i16>* %3, align 2
868 define i8* @ldrbu16_3(i8* %x, i8* %y, <8 x i16> *%m) {
869 ; CHECK-LABEL: ldrbu16_3:
870 ; CHECK: @ %bb.0: @ %entry
871 ; CHECK-NEXT: vldrh.u16 q0, [r2]
872 ; CHECK-NEXT: vpt.i16 ne, q0, zr
873 ; CHECK-NEXT: vldrbt.u16 q0, [r0, #3]
874 ; CHECK-NEXT: vstrh.16 q0, [r1]
877 %z = getelementptr inbounds i8, i8* %x, i32 3
878 %0 = bitcast i8* %z to <8 x i8>*
879 %mask = load <8 x i16>, <8 x i16>* %m, align 2
880 %c = icmp ne <8 x i16> %mask, zeroinitializer
881 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
882 %2 = zext <8 x i8> %1 to <8 x i16>
883 %3 = bitcast i8* %y to <8 x i16>*
884 store <8 x i16> %2, <8 x i16>* %3, align 2
888 define i8* @ldrbu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
889 ; CHECK-LABEL: ldrbu16_2:
890 ; CHECK: @ %bb.0: @ %entry
891 ; CHECK-NEXT: vldrh.u16 q0, [r2]
892 ; CHECK-NEXT: vpt.i16 ne, q0, zr
893 ; CHECK-NEXT: vldrbt.u16 q0, [r0, #2]
894 ; CHECK-NEXT: vstrh.16 q0, [r1]
897 %z = getelementptr inbounds i8, i8* %x, i32 2
898 %0 = bitcast i8* %z to <8 x i8>*
899 %mask = load <8 x i16>, <8 x i16>* %m, align 2
900 %c = icmp ne <8 x i16> %mask, zeroinitializer
901 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
902 %2 = zext <8 x i8> %1 to <8 x i16>
903 %3 = bitcast i8* %y to <8 x i16>*
904 store <8 x i16> %2, <8 x i16>* %3, align 2
908 define i8* @ldrbu16_127(i8* %x, i8* %y, <8 x i16> *%m) {
909 ; CHECK-LABEL: ldrbu16_127:
910 ; CHECK: @ %bb.0: @ %entry
911 ; CHECK-NEXT: vldrh.u16 q0, [r2]
912 ; CHECK-NEXT: vpt.i16 ne, q0, zr
913 ; CHECK-NEXT: vldrbt.u16 q0, [r0, #127]
914 ; CHECK-NEXT: vstrh.16 q0, [r1]
917 %z = getelementptr inbounds i8, i8* %x, i32 127
918 %0 = bitcast i8* %z to <8 x i8>*
919 %mask = load <8 x i16>, <8 x i16>* %m, align 2
920 %c = icmp ne <8 x i16> %mask, zeroinitializer
921 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
922 %2 = zext <8 x i8> %1 to <8 x i16>
923 %3 = bitcast i8* %y to <8 x i16>*
924 store <8 x i16> %2, <8 x i16>* %3, align 2
928 define i8* @ldrbu16_128(i8* %x, i8* %y, <8 x i16> *%m) {
929 ; CHECK-LABEL: ldrbu16_128:
930 ; CHECK: @ %bb.0: @ %entry
931 ; CHECK-NEXT: vldrh.u16 q0, [r2]
932 ; CHECK-NEXT: add.w r3, r0, #128
933 ; CHECK-NEXT: vpt.i16 ne, q0, zr
934 ; CHECK-NEXT: vldrbt.u16 q0, [r3]
935 ; CHECK-NEXT: vstrh.16 q0, [r1]
938 %z = getelementptr inbounds i8, i8* %x, i32 128
939 %0 = bitcast i8* %z to <8 x i8>*
940 %mask = load <8 x i16>, <8 x i16>* %m, align 2
941 %c = icmp ne <8 x i16> %mask, zeroinitializer
942 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
943 %2 = zext <8 x i8> %1 to <8 x i16>
944 %3 = bitcast i8* %y to <8 x i16>*
945 store <8 x i16> %2, <8 x i16>* %3, align 2
949 define i8* @ldrbu16_m127(i8* %x, i8* %y, <8 x i16> *%m) {
950 ; CHECK-LABEL: ldrbu16_m127:
951 ; CHECK: @ %bb.0: @ %entry
952 ; CHECK-NEXT: vldrh.u16 q0, [r2]
953 ; CHECK-NEXT: vpt.i16 ne, q0, zr
954 ; CHECK-NEXT: vldrbt.u16 q0, [r0, #-127]
955 ; CHECK-NEXT: vstrh.16 q0, [r1]
958 %z = getelementptr inbounds i8, i8* %x, i32 -127
959 %0 = bitcast i8* %z to <8 x i8>*
960 %mask = load <8 x i16>, <8 x i16>* %m, align 2
961 %c = icmp ne <8 x i16> %mask, zeroinitializer
962 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
963 %2 = zext <8 x i8> %1 to <8 x i16>
964 %3 = bitcast i8* %y to <8 x i16>*
965 store <8 x i16> %2, <8 x i16>* %3, align 2
969 define i8* @ldrbu16_m128(i8* %x, i8* %y, <8 x i16> *%m) {
970 ; CHECK-LABEL: ldrbu16_m128:
971 ; CHECK: @ %bb.0: @ %entry
972 ; CHECK-NEXT: vldrh.u16 q0, [r2]
973 ; CHECK-NEXT: sub.w r3, r0, #128
974 ; CHECK-NEXT: vpt.i16 ne, q0, zr
975 ; CHECK-NEXT: vldrbt.u16 q0, [r3]
976 ; CHECK-NEXT: vstrh.16 q0, [r1]
979 %z = getelementptr inbounds i8, i8* %x, i32 -128
980 %0 = bitcast i8* %z to <8 x i8>*
981 %mask = load <8 x i16>, <8 x i16>* %m, align 2
982 %c = icmp ne <8 x i16> %mask, zeroinitializer
983 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
984 %2 = zext <8 x i8> %1 to <8 x i16>
985 %3 = bitcast i8* %y to <8 x i16>*
986 store <8 x i16> %2, <8 x i16>* %3, align 2
990 define i8* @ldrbs16_4(i8* %x, i8* %y, <8 x i16> *%m) {
991 ; CHECK-LABEL: ldrbs16_4:
992 ; CHECK: @ %bb.0: @ %entry
993 ; CHECK-NEXT: vldrh.u16 q0, [r2]
994 ; CHECK-NEXT: vpt.i16 ne, q0, zr
995 ; CHECK-NEXT: vldrbt.s16 q0, [r0, #4]
996 ; CHECK-NEXT: vstrh.16 q0, [r1]
999 %z = getelementptr inbounds i8, i8* %x, i32 4
1000 %0 = bitcast i8* %z to <8 x i8>*
1001 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1002 %c = icmp ne <8 x i16> %mask, zeroinitializer
1003 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
1004 %2 = sext <8 x i8> %1 to <8 x i16>
1005 %3 = bitcast i8* %y to <8 x i16>*
1006 store <8 x i16> %2, <8 x i16>* %3, align 2
1010 define i8* @ldrbs16_3(i8* %x, i8* %y, <8 x i16> *%m) {
1011 ; CHECK-LABEL: ldrbs16_3:
1012 ; CHECK: @ %bb.0: @ %entry
1013 ; CHECK-NEXT: vldrh.u16 q0, [r2]
1014 ; CHECK-NEXT: vpt.i16 ne, q0, zr
1015 ; CHECK-NEXT: vldrbt.s16 q0, [r0, #3]
1016 ; CHECK-NEXT: vstrh.16 q0, [r1]
1019 %z = getelementptr inbounds i8, i8* %x, i32 3
1020 %0 = bitcast i8* %z to <8 x i8>*
1021 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1022 %c = icmp ne <8 x i16> %mask, zeroinitializer
1023 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
1024 %2 = sext <8 x i8> %1 to <8 x i16>
1025 %3 = bitcast i8* %y to <8 x i16>*
1026 store <8 x i16> %2, <8 x i16>* %3, align 2
1030 define i8* @ldrbs16_2(i8* %x, i8* %y, <8 x i16> *%m) {
1031 ; CHECK-LABEL: ldrbs16_2:
1032 ; CHECK: @ %bb.0: @ %entry
1033 ; CHECK-NEXT: vldrh.u16 q0, [r2]
1034 ; CHECK-NEXT: vpt.i16 ne, q0, zr
1035 ; CHECK-NEXT: vldrbt.s16 q0, [r0, #2]
1036 ; CHECK-NEXT: vstrh.16 q0, [r1]
1039 %z = getelementptr inbounds i8, i8* %x, i32 2
1040 %0 = bitcast i8* %z to <8 x i8>*
1041 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1042 %c = icmp ne <8 x i16> %mask, zeroinitializer
1043 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
1044 %2 = sext <8 x i8> %1 to <8 x i16>
1045 %3 = bitcast i8* %y to <8 x i16>*
1046 store <8 x i16> %2, <8 x i16>* %3, align 2
1050 define i8* @ldrbs16_127(i8* %x, i8* %y, <8 x i16> *%m) {
1051 ; CHECK-LABEL: ldrbs16_127:
1052 ; CHECK: @ %bb.0: @ %entry
1053 ; CHECK-NEXT: vldrh.u16 q0, [r2]
1054 ; CHECK-NEXT: vpt.i16 ne, q0, zr
1055 ; CHECK-NEXT: vldrbt.s16 q0, [r0, #127]
1056 ; CHECK-NEXT: vstrh.16 q0, [r1]
1059 %z = getelementptr inbounds i8, i8* %x, i32 127
1060 %0 = bitcast i8* %z to <8 x i8>*
1061 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1062 %c = icmp ne <8 x i16> %mask, zeroinitializer
1063 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
1064 %2 = sext <8 x i8> %1 to <8 x i16>
1065 %3 = bitcast i8* %y to <8 x i16>*
1066 store <8 x i16> %2, <8 x i16>* %3, align 2
1070 define i8* @ldrbs16_128(i8* %x, i8* %y, <8 x i16> *%m) {
1071 ; CHECK-LABEL: ldrbs16_128:
1072 ; CHECK: @ %bb.0: @ %entry
1073 ; CHECK-NEXT: vldrh.u16 q0, [r2]
1074 ; CHECK-NEXT: add.w r3, r0, #128
1075 ; CHECK-NEXT: vpt.i16 ne, q0, zr
1076 ; CHECK-NEXT: vldrbt.s16 q0, [r3]
1077 ; CHECK-NEXT: vstrh.16 q0, [r1]
1080 %z = getelementptr inbounds i8, i8* %x, i32 128
1081 %0 = bitcast i8* %z to <8 x i8>*
1082 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1083 %c = icmp ne <8 x i16> %mask, zeroinitializer
1084 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
1085 %2 = sext <8 x i8> %1 to <8 x i16>
1086 %3 = bitcast i8* %y to <8 x i16>*
1087 store <8 x i16> %2, <8 x i16>* %3, align 2
1091 define i8* @ldrbs16_m127(i8* %x, i8* %y, <8 x i16> *%m) {
1092 ; CHECK-LABEL: ldrbs16_m127:
1093 ; CHECK: @ %bb.0: @ %entry
1094 ; CHECK-NEXT: vldrh.u16 q0, [r2]
1095 ; CHECK-NEXT: vpt.i16 ne, q0, zr
1096 ; CHECK-NEXT: vldrbt.s16 q0, [r0, #-127]
1097 ; CHECK-NEXT: vstrh.16 q0, [r1]
1100 %z = getelementptr inbounds i8, i8* %x, i32 -127
1101 %0 = bitcast i8* %z to <8 x i8>*
1102 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1103 %c = icmp ne <8 x i16> %mask, zeroinitializer
1104 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
1105 %2 = sext <8 x i8> %1 to <8 x i16>
1106 %3 = bitcast i8* %y to <8 x i16>*
1107 store <8 x i16> %2, <8 x i16>* %3, align 2
1111 define i8* @ldrbs16_m128(i8* %x, i8* %y, <8 x i16> *%m) {
1112 ; CHECK-LABEL: ldrbs16_m128:
1113 ; CHECK: @ %bb.0: @ %entry
1114 ; CHECK-NEXT: vldrh.u16 q0, [r2]
1115 ; CHECK-NEXT: sub.w r3, r0, #128
1116 ; CHECK-NEXT: vpt.i16 ne, q0, zr
1117 ; CHECK-NEXT: vldrbt.s16 q0, [r3]
1118 ; CHECK-NEXT: vstrh.16 q0, [r1]
1121 %z = getelementptr inbounds i8, i8* %x, i32 -128
1122 %0 = bitcast i8* %z to <8 x i8>*
1123 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1124 %c = icmp ne <8 x i16> %mask, zeroinitializer
1125 %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
1126 %2 = sext <8 x i8> %1 to <8 x i16>
1127 %3 = bitcast i8* %y to <8 x i16>*
1128 store <8 x i16> %2, <8 x i16>* %3, align 2
1132 define i8* @ldrbu8_4(i8* %x, i8* %y, <16 x i8> *%m) {
1133 ; CHECK-LABEL: ldrbu8_4:
1134 ; CHECK: @ %bb.0: @ %entry
1135 ; CHECK-NEXT: vldrb.u8 q0, [r2]
1136 ; CHECK-NEXT: vpt.i8 ne, q0, zr
1137 ; CHECK-NEXT: vldrbt.u8 q0, [r0, #4]
1138 ; CHECK-NEXT: vstrb.8 q0, [r1]
1141 %z = getelementptr inbounds i8, i8* %x, i32 4
1142 %0 = bitcast i8* %z to <16 x i8>*
1143 %mask = load <16 x i8>, <16 x i8>* %m, align 1
1144 %c = icmp ne <16 x i8> %mask, zeroinitializer
1145 %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
1146 %2 = bitcast i8* %y to <16 x i8>*
1147 store <16 x i8> %1, <16 x i8>* %2, align 1
1151 define i8* @ldrbu8_3(i8* %x, i8* %y, <16 x i8> *%m) {
1152 ; CHECK-LABEL: ldrbu8_3:
1153 ; CHECK: @ %bb.0: @ %entry
1154 ; CHECK-NEXT: vldrb.u8 q0, [r2]
1155 ; CHECK-NEXT: vpt.i8 ne, q0, zr
1156 ; CHECK-NEXT: vldrbt.u8 q0, [r0, #3]
1157 ; CHECK-NEXT: vstrb.8 q0, [r1]
1160 %z = getelementptr inbounds i8, i8* %x, i32 3
1161 %0 = bitcast i8* %z to <16 x i8>*
1162 %mask = load <16 x i8>, <16 x i8>* %m, align 1
1163 %c = icmp ne <16 x i8> %mask, zeroinitializer
1164 %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
1165 %2 = bitcast i8* %y to <16 x i8>*
1166 store <16 x i8> %1, <16 x i8>* %2, align 1
1170 define i8* @ldrbu8_2(i8* %x, i8* %y, <16 x i8> *%m) {
1171 ; CHECK-LABEL: ldrbu8_2:
1172 ; CHECK: @ %bb.0: @ %entry
1173 ; CHECK-NEXT: vldrb.u8 q0, [r2]
1174 ; CHECK-NEXT: vpt.i8 ne, q0, zr
1175 ; CHECK-NEXT: vldrbt.u8 q0, [r0, #2]
1176 ; CHECK-NEXT: vstrb.8 q0, [r1]
1179 %z = getelementptr inbounds i8, i8* %x, i32 2
1180 %0 = bitcast i8* %z to <16 x i8>*
1181 %mask = load <16 x i8>, <16 x i8>* %m, align 1
1182 %c = icmp ne <16 x i8> %mask, zeroinitializer
1183 %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
1184 %2 = bitcast i8* %y to <16 x i8>*
1185 store <16 x i8> %1, <16 x i8>* %2, align 1
1189 define i8* @ldrbu8_127(i8* %x, i8* %y, <16 x i8> *%m) {
1190 ; CHECK-LABEL: ldrbu8_127:
1191 ; CHECK: @ %bb.0: @ %entry
1192 ; CHECK-NEXT: vldrb.u8 q0, [r2]
1193 ; CHECK-NEXT: vpt.i8 ne, q0, zr
1194 ; CHECK-NEXT: vldrbt.u8 q0, [r0, #127]
1195 ; CHECK-NEXT: vstrb.8 q0, [r1]
1198 %z = getelementptr inbounds i8, i8* %x, i32 127
1199 %0 = bitcast i8* %z to <16 x i8>*
1200 %mask = load <16 x i8>, <16 x i8>* %m, align 1
1201 %c = icmp ne <16 x i8> %mask, zeroinitializer
1202 %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
1203 %2 = bitcast i8* %y to <16 x i8>*
1204 store <16 x i8> %1, <16 x i8>* %2, align 1
1208 define i8* @ldrbu8_128(i8* %x, i8* %y, <16 x i8> *%m) {
1209 ; CHECK-LABEL: ldrbu8_128:
1210 ; CHECK: @ %bb.0: @ %entry
1211 ; CHECK-NEXT: vldrb.u8 q0, [r2]
1212 ; CHECK-NEXT: add.w r3, r0, #128
1213 ; CHECK-NEXT: vpt.i8 ne, q0, zr
1214 ; CHECK-NEXT: vldrbt.u8 q0, [r3]
1215 ; CHECK-NEXT: vstrb.8 q0, [r1]
1218 %z = getelementptr inbounds i8, i8* %x, i32 128
1219 %0 = bitcast i8* %z to <16 x i8>*
1220 %mask = load <16 x i8>, <16 x i8>* %m, align 1
1221 %c = icmp ne <16 x i8> %mask, zeroinitializer
1222 %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
1223 %2 = bitcast i8* %y to <16 x i8>*
1224 store <16 x i8> %1, <16 x i8>* %2, align 1
1228 define i8* @ldrbu8_m127(i8* %x, i8* %y, <16 x i8> *%m) {
1229 ; CHECK-LABEL: ldrbu8_m127:
1230 ; CHECK: @ %bb.0: @ %entry
1231 ; CHECK-NEXT: vldrb.u8 q0, [r2]
1232 ; CHECK-NEXT: vpt.i8 ne, q0, zr
1233 ; CHECK-NEXT: vldrbt.u8 q0, [r0, #-127]
1234 ; CHECK-NEXT: vstrb.8 q0, [r1]
1237 %z = getelementptr inbounds i8, i8* %x, i32 -127
1238 %0 = bitcast i8* %z to <16 x i8>*
1239 %mask = load <16 x i8>, <16 x i8>* %m, align 1
1240 %c = icmp ne <16 x i8> %mask, zeroinitializer
1241 %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
1242 %2 = bitcast i8* %y to <16 x i8>*
1243 store <16 x i8> %1, <16 x i8>* %2, align 1
1247 define i8* @ldrbu8_m128(i8* %x, i8* %y, <16 x i8> *%m) {
1248 ; CHECK-LABEL: ldrbu8_m128:
1249 ; CHECK: @ %bb.0: @ %entry
1250 ; CHECK-NEXT: vldrb.u8 q0, [r2]
1251 ; CHECK-NEXT: sub.w r3, r0, #128
1252 ; CHECK-NEXT: vpt.i8 ne, q0, zr
1253 ; CHECK-NEXT: vldrbt.u8 q0, [r3]
1254 ; CHECK-NEXT: vstrb.8 q0, [r1]
1257 %z = getelementptr inbounds i8, i8* %x, i32 -128
1258 %0 = bitcast i8* %z to <16 x i8>*
1259 %mask = load <16 x i8>, <16 x i8>* %m, align 1
1260 %c = icmp ne <16 x i8> %mask, zeroinitializer
1261 %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
1262 %2 = bitcast i8* %y to <16 x i8>*
1263 store <16 x i8> %1, <16 x i8>* %2, align 1
1267 define i8* @ldrwf32_4(i8* %x, i8* %y, <4 x i32> *%m) {
1268 ; CHECK-LABEL: ldrwf32_4:
1269 ; CHECK: @ %bb.0: @ %entry
1270 ; CHECK-NEXT: vldrw.u32 q0, [r2]
1271 ; CHECK-NEXT: vpt.i32 ne, q0, zr
1272 ; CHECK-NEXT: vldrwt.u32 q0, [r0, #4]
1273 ; CHECK-NEXT: vstrw.32 q0, [r1]
1276 %z = getelementptr inbounds i8, i8* %x, i32 4
1277 %0 = bitcast i8* %z to <4 x float>*
1278 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1279 %c = icmp ne <4 x i32> %mask, zeroinitializer
1280 %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
1281 %2 = bitcast i8* %y to <4 x float>*
1282 store <4 x float> %1, <4 x float>* %2, align 4
1286 define i8* @ldrwf32_3(i8* %x, i8* %y, <4 x i32> *%m) {
1287 ; CHECK-LABEL: ldrwf32_3:
1288 ; CHECK: @ %bb.0: @ %entry
1289 ; CHECK-NEXT: vldrw.u32 q0, [r2]
1290 ; CHECK-NEXT: adds r3, r0, #3
1291 ; CHECK-NEXT: vpt.i32 ne, q0, zr
1292 ; CHECK-NEXT: vldrwt.u32 q0, [r3]
1293 ; CHECK-NEXT: vstrw.32 q0, [r1]
1296 %z = getelementptr inbounds i8, i8* %x, i32 3
1297 %0 = bitcast i8* %z to <4 x float>*
1298 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1299 %c = icmp ne <4 x i32> %mask, zeroinitializer
1300 %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
1301 %2 = bitcast i8* %y to <4 x float>*
1302 store <4 x float> %1, <4 x float>* %2, align 4
1306 define i8* @ldrwf32_2(i8* %x, i8* %y, <4 x i32> *%m) {
1307 ; CHECK-LABEL: ldrwf32_2:
1308 ; CHECK: @ %bb.0: @ %entry
1309 ; CHECK-NEXT: vldrw.u32 q0, [r2]
1310 ; CHECK-NEXT: adds r3, r0, #2
1311 ; CHECK-NEXT: vpt.i32 ne, q0, zr
1312 ; CHECK-NEXT: vldrwt.u32 q0, [r3]
1313 ; CHECK-NEXT: vstrw.32 q0, [r1]
1316 %z = getelementptr inbounds i8, i8* %x, i32 2
1317 %0 = bitcast i8* %z to <4 x float>*
1318 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1319 %c = icmp ne <4 x i32> %mask, zeroinitializer
1320 %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
1321 %2 = bitcast i8* %y to <4 x float>*
1322 store <4 x float> %1, <4 x float>* %2, align 4
1326 define i8* @ldrwf32_508(i8* %x, i8* %y, <4 x i32> *%m) {
1327 ; CHECK-LABEL: ldrwf32_508:
1328 ; CHECK: @ %bb.0: @ %entry
1329 ; CHECK-NEXT: vldrw.u32 q0, [r2]
1330 ; CHECK-NEXT: vpt.i32 ne, q0, zr
1331 ; CHECK-NEXT: vldrwt.u32 q0, [r0, #508]
1332 ; CHECK-NEXT: vstrw.32 q0, [r1]
1335 %z = getelementptr inbounds i8, i8* %x, i32 508
1336 %0 = bitcast i8* %z to <4 x float>*
1337 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1338 %c = icmp ne <4 x i32> %mask, zeroinitializer
1339 %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
1340 %2 = bitcast i8* %y to <4 x float>*
1341 store <4 x float> %1, <4 x float>* %2, align 4
1345 define i8* @ldrwf32_512(i8* %x, i8* %y, <4 x i32> *%m) {
1346 ; CHECK-LABEL: ldrwf32_512:
1347 ; CHECK: @ %bb.0: @ %entry
1348 ; CHECK-NEXT: vldrw.u32 q0, [r2]
1349 ; CHECK-NEXT: add.w r3, r0, #512
1350 ; CHECK-NEXT: vpt.i32 ne, q0, zr
1351 ; CHECK-NEXT: vldrwt.u32 q0, [r3]
1352 ; CHECK-NEXT: vstrw.32 q0, [r1]
1355 %z = getelementptr inbounds i8, i8* %x, i32 512
1356 %0 = bitcast i8* %z to <4 x float>*
1357 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1358 %c = icmp ne <4 x i32> %mask, zeroinitializer
1359 %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
1360 %2 = bitcast i8* %y to <4 x float>*
1361 store <4 x float> %1, <4 x float>* %2, align 4
1365 define i8* @ldrwf32_m508(i8* %x, i8* %y, <4 x i32> *%m) {
1366 ; CHECK-LABEL: ldrwf32_m508:
1367 ; CHECK: @ %bb.0: @ %entry
1368 ; CHECK-NEXT: vldrw.u32 q0, [r2]
1369 ; CHECK-NEXT: vpt.i32 ne, q0, zr
1370 ; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508]
1371 ; CHECK-NEXT: vstrw.32 q0, [r1]
1374 %z = getelementptr inbounds i8, i8* %x, i32 -508
1375 %0 = bitcast i8* %z to <4 x float>*
1376 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1377 %c = icmp ne <4 x i32> %mask, zeroinitializer
1378 %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
1379 %2 = bitcast i8* %y to <4 x float>*
1380 store <4 x float> %1, <4 x float>* %2, align 4
1384 define i8* @ldrwf32_m512(i8* %x, i8* %y, <4 x i32> *%m) {
1385 ; CHECK-LABEL: ldrwf32_m512:
1386 ; CHECK: @ %bb.0: @ %entry
1387 ; CHECK-NEXT: vldrw.u32 q0, [r2]
1388 ; CHECK-NEXT: sub.w r3, r0, #512
1389 ; CHECK-NEXT: vpt.i32 ne, q0, zr
1390 ; CHECK-NEXT: vldrwt.u32 q0, [r3]
1391 ; CHECK-NEXT: vstrw.32 q0, [r1]
1394 %z = getelementptr inbounds i8, i8* %x, i32 -512
1395 %0 = bitcast i8* %z to <4 x float>*
1396 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1397 %c = icmp ne <4 x i32> %mask, zeroinitializer
1398 %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
1399 %2 = bitcast i8* %y to <4 x float>*
1400 store <4 x float> %1, <4 x float>* %2, align 4
1404 define i8* @ldrhf16_4(i8* %x, i8* %y, <8 x i16> *%m) {
1405 ; CHECK-LABEL: ldrhf16_4:
1406 ; CHECK: @ %bb.0: @ %entry
1407 ; CHECK-NEXT: vldrh.u16 q0, [r2]
1408 ; CHECK-NEXT: vpt.i16 ne, q0, zr
1409 ; CHECK-NEXT: vldrht.u16 q0, [r0, #4]
1410 ; CHECK-NEXT: vstrh.16 q0, [r1]
1413 %z = getelementptr inbounds i8, i8* %x, i32 4
1414 %0 = bitcast i8* %z to <8 x half>*
1415 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1416 %c = icmp ne <8 x i16> %mask, zeroinitializer
1417 %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
1418 %2 = bitcast i8* %y to <8 x half>*
1419 store <8 x half> %1, <8 x half>* %2, align 2
1423 define i8* @ldrhf16_3(i8* %x, i8* %y, <8 x i16> *%m) {
1424 ; CHECK-LABEL: ldrhf16_3:
1425 ; CHECK: @ %bb.0: @ %entry
1426 ; CHECK-NEXT: vldrh.u16 q0, [r2]
1427 ; CHECK-NEXT: adds r3, r0, #3
1428 ; CHECK-NEXT: vpt.i16 ne, q0, zr
1429 ; CHECK-NEXT: vldrht.u16 q0, [r3]
1430 ; CHECK-NEXT: vstrh.16 q0, [r1]
1433 %z = getelementptr inbounds i8, i8* %x, i32 3
1434 %0 = bitcast i8* %z to <8 x half>*
1435 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1436 %c = icmp ne <8 x i16> %mask, zeroinitializer
1437 %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
1438 %2 = bitcast i8* %y to <8 x half>*
1439 store <8 x half> %1, <8 x half>* %2, align 2
1443 define i8* @ldrhf16_2(i8* %x, i8* %y, <8 x i16> *%m) {
1444 ; CHECK-LABEL: ldrhf16_2:
1445 ; CHECK: @ %bb.0: @ %entry
1446 ; CHECK-NEXT: vldrh.u16 q0, [r2]
1447 ; CHECK-NEXT: vpt.i16 ne, q0, zr
1448 ; CHECK-NEXT: vldrht.u16 q0, [r0, #2]
1449 ; CHECK-NEXT: vstrh.16 q0, [r1]
1452 %z = getelementptr inbounds i8, i8* %x, i32 2
1453 %0 = bitcast i8* %z to <8 x half>*
1454 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1455 %c = icmp ne <8 x i16> %mask, zeroinitializer
1456 %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
1457 %2 = bitcast i8* %y to <8 x half>*
1458 store <8 x half> %1, <8 x half>* %2, align 2
1462 define i8* @ldrhf16_254(i8* %x, i8* %y, <8 x i16> *%m) {
1463 ; CHECK-LABEL: ldrhf16_254:
1464 ; CHECK: @ %bb.0: @ %entry
1465 ; CHECK-NEXT: vldrh.u16 q0, [r2]
1466 ; CHECK-NEXT: vpt.i16 ne, q0, zr
1467 ; CHECK-NEXT: vldrht.u16 q0, [r0, #254]
1468 ; CHECK-NEXT: vstrh.16 q0, [r1]
1471 %z = getelementptr inbounds i8, i8* %x, i32 254
1472 %0 = bitcast i8* %z to <8 x half>*
1473 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1474 %c = icmp ne <8 x i16> %mask, zeroinitializer
1475 %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
1476 %2 = bitcast i8* %y to <8 x half>*
1477 store <8 x half> %1, <8 x half>* %2, align 2
1481 define i8* @ldrhf16_256(i8* %x, i8* %y, <8 x i16> *%m) {
1482 ; CHECK-LABEL: ldrhf16_256:
1483 ; CHECK: @ %bb.0: @ %entry
1484 ; CHECK-NEXT: vldrh.u16 q0, [r2]
1485 ; CHECK-NEXT: add.w r3, r0, #256
1486 ; CHECK-NEXT: vpt.i16 ne, q0, zr
1487 ; CHECK-NEXT: vldrht.u16 q0, [r3]
1488 ; CHECK-NEXT: vstrh.16 q0, [r1]
1491 %z = getelementptr inbounds i8, i8* %x, i32 256
1492 %0 = bitcast i8* %z to <8 x half>*
1493 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1494 %c = icmp ne <8 x i16> %mask, zeroinitializer
1495 %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
1496 %2 = bitcast i8* %y to <8 x half>*
1497 store <8 x half> %1, <8 x half>* %2, align 2
1501 define i8* @ldrhf16_m254(i8* %x, i8* %y, <8 x i16> *%m) {
1502 ; CHECK-LABEL: ldrhf16_m254:
1503 ; CHECK: @ %bb.0: @ %entry
1504 ; CHECK-NEXT: vldrh.u16 q0, [r2]
1505 ; CHECK-NEXT: vpt.i16 ne, q0, zr
1506 ; CHECK-NEXT: vldrht.u16 q0, [r0, #-254]
1507 ; CHECK-NEXT: vstrh.16 q0, [r1]
1510 %z = getelementptr inbounds i8, i8* %x, i32 -254
1511 %0 = bitcast i8* %z to <8 x half>*
1512 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1513 %c = icmp ne <8 x i16> %mask, zeroinitializer
1514 %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
1515 %2 = bitcast i8* %y to <8 x half>*
1516 store <8 x half> %1, <8 x half>* %2, align 2
1520 define i8* @ldrhf16_m256(i8* %x, i8* %y, <8 x i16> *%m) {
1521 ; CHECK-LABEL: ldrhf16_m256:
1522 ; CHECK: @ %bb.0: @ %entry
1523 ; CHECK-NEXT: vldrh.u16 q0, [r2]
1524 ; CHECK-NEXT: sub.w r3, r0, #256
1525 ; CHECK-NEXT: vpt.i16 ne, q0, zr
1526 ; CHECK-NEXT: vldrht.u16 q0, [r3]
1527 ; CHECK-NEXT: vstrh.16 q0, [r1]
1530 %z = getelementptr inbounds i8, i8* %x, i32 -256
1531 %0 = bitcast i8* %z to <8 x half>*
1532 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1533 %c = icmp ne <8 x i16> %mask, zeroinitializer
1534 %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
1535 %2 = bitcast i8* %y to <8 x half>*
1536 store <8 x half> %1, <8 x half>* %2, align 2
1543 define i8* @strw32_4(i8* %y, i8* %x, <4 x i32> *%m) {
1544 ; CHECK-LABEL: strw32_4:
1545 ; CHECK: @ %bb.0: @ %entry
1546 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1547 ; CHECK-NEXT: vldrw.u32 q1, [r2]
1548 ; CHECK-NEXT: vpt.i32 ne, q1, zr
1549 ; CHECK-NEXT: vstrwt.32 q0, [r0, #4]
1552 %z = getelementptr inbounds i8, i8* %y, i32 4
1553 %0 = bitcast i8* %x to <4 x i32>*
1554 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1555 %c = icmp ne <4 x i32> %mask, zeroinitializer
1556 %1 = load <4 x i32>, <4 x i32>* %0, align 4
1557 %2 = bitcast i8* %z to <4 x i32>*
1558 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
1562 define i8* @strw32_3(i8* %y, i8* %x, <4 x i32> *%m) {
1563 ; CHECK-LABEL: strw32_3:
1564 ; CHECK: @ %bb.0: @ %entry
1565 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1566 ; CHECK-NEXT: adds r1, r0, #3
1567 ; CHECK-NEXT: vldrw.u32 q1, [r2]
1568 ; CHECK-NEXT: vpt.i32 ne, q1, zr
1569 ; CHECK-NEXT: vstrwt.32 q0, [r1]
1572 %z = getelementptr inbounds i8, i8* %y, i32 3
1573 %0 = bitcast i8* %x to <4 x i32>*
1574 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1575 %c = icmp ne <4 x i32> %mask, zeroinitializer
1576 %1 = load <4 x i32>, <4 x i32>* %0, align 4
1577 %2 = bitcast i8* %z to <4 x i32>*
1578 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
1582 define i8* @strw32_2(i8* %y, i8* %x, <4 x i32> *%m) {
1583 ; CHECK-LABEL: strw32_2:
1584 ; CHECK: @ %bb.0: @ %entry
1585 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1586 ; CHECK-NEXT: adds r1, r0, #2
1587 ; CHECK-NEXT: vldrw.u32 q1, [r2]
1588 ; CHECK-NEXT: vpt.i32 ne, q1, zr
1589 ; CHECK-NEXT: vstrwt.32 q0, [r1]
1592 %z = getelementptr inbounds i8, i8* %y, i32 2
1593 %0 = bitcast i8* %x to <4 x i32>*
1594 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1595 %c = icmp ne <4 x i32> %mask, zeroinitializer
1596 %1 = load <4 x i32>, <4 x i32>* %0, align 4
1597 %2 = bitcast i8* %z to <4 x i32>*
1598 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
1602 define i8* @strw32_508(i8* %y, i8* %x, <4 x i32> *%m) {
1603 ; CHECK-LABEL: strw32_508:
1604 ; CHECK: @ %bb.0: @ %entry
1605 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1606 ; CHECK-NEXT: vldrw.u32 q1, [r2]
1607 ; CHECK-NEXT: vpt.i32 ne, q1, zr
1608 ; CHECK-NEXT: vstrwt.32 q0, [r0, #508]
1611 %z = getelementptr inbounds i8, i8* %y, i32 508
1612 %0 = bitcast i8* %x to <4 x i32>*
1613 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1614 %c = icmp ne <4 x i32> %mask, zeroinitializer
1615 %1 = load <4 x i32>, <4 x i32>* %0, align 4
1616 %2 = bitcast i8* %z to <4 x i32>*
1617 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
1621 define i8* @strw32_512(i8* %y, i8* %x, <4 x i32> *%m) {
1622 ; CHECK-LABEL: strw32_512:
1623 ; CHECK: @ %bb.0: @ %entry
1624 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1625 ; CHECK-NEXT: add.w r1, r0, #512
1626 ; CHECK-NEXT: vldrw.u32 q1, [r2]
1627 ; CHECK-NEXT: vpt.i32 ne, q1, zr
1628 ; CHECK-NEXT: vstrwt.32 q0, [r1]
1631 %z = getelementptr inbounds i8, i8* %y, i32 512
1632 %0 = bitcast i8* %x to <4 x i32>*
1633 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1634 %c = icmp ne <4 x i32> %mask, zeroinitializer
1635 %1 = load <4 x i32>, <4 x i32>* %0, align 4
1636 %2 = bitcast i8* %z to <4 x i32>*
1637 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
1641 define i8* @strw32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
1642 ; CHECK-LABEL: strw32_m508:
1643 ; CHECK: @ %bb.0: @ %entry
1644 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1645 ; CHECK-NEXT: vldrw.u32 q1, [r2]
1646 ; CHECK-NEXT: vpt.i32 ne, q1, zr
1647 ; CHECK-NEXT: vstrwt.32 q0, [r0, #-508]
1650 %z = getelementptr inbounds i8, i8* %y, i32 -508
1651 %0 = bitcast i8* %x to <4 x i32>*
1652 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1653 %c = icmp ne <4 x i32> %mask, zeroinitializer
1654 %1 = load <4 x i32>, <4 x i32>* %0, align 4
1655 %2 = bitcast i8* %z to <4 x i32>*
1656 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
1660 define i8* @strw32_m512(i8* %y, i8* %x, <4 x i32> *%m) {
1661 ; CHECK-LABEL: strw32_m512:
1662 ; CHECK: @ %bb.0: @ %entry
1663 ; CHECK-NEXT: vldrw.u32 q0, [r1]
1664 ; CHECK-NEXT: sub.w r1, r0, #512
1665 ; CHECK-NEXT: vldrw.u32 q1, [r2]
1666 ; CHECK-NEXT: vpt.i32 ne, q1, zr
1667 ; CHECK-NEXT: vstrwt.32 q0, [r1]
1670 %z = getelementptr inbounds i8, i8* %y, i32 -512
1671 %0 = bitcast i8* %x to <4 x i32>*
1672 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1673 %c = icmp ne <4 x i32> %mask, zeroinitializer
1674 %1 = load <4 x i32>, <4 x i32>* %0, align 4
1675 %2 = bitcast i8* %z to <4 x i32>*
1676 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
1680 define i8* @strh32_4(i8* %y, i8* %x, <4 x i32> *%m) {
1681 ; CHECK-LABEL: strh32_4:
1682 ; CHECK: @ %bb.0: @ %entry
1683 ; CHECK-NEXT: vldrh.u32 q0, [r1]
1684 ; CHECK-NEXT: vldrw.u32 q1, [r2]
1685 ; CHECK-NEXT: vpt.i32 ne, q1, zr
1686 ; CHECK-NEXT: vstrht.32 q0, [r0, #4]
1689 %z = getelementptr inbounds i8, i8* %y, i32 4
1690 %0 = bitcast i8* %x to <4 x i16>*
1691 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1692 %c = icmp ne <4 x i32> %mask, zeroinitializer
1693 %1 = load <4 x i16>, <4 x i16>* %0, align 2
1694 %2 = bitcast i8* %z to <4 x i16>*
1695 call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
1699 define i8* @strh32_3(i8* %y, i8* %x, <4 x i32> *%m) {
1700 ; CHECK-LABEL: strh32_3:
1701 ; CHECK: @ %bb.0: @ %entry
1702 ; CHECK-NEXT: vldrh.u32 q0, [r1]
1703 ; CHECK-NEXT: adds r1, r0, #3
1704 ; CHECK-NEXT: vldrw.u32 q1, [r2]
1705 ; CHECK-NEXT: vpt.i32 ne, q1, zr
1706 ; CHECK-NEXT: vstrht.32 q0, [r1]
1709 %z = getelementptr inbounds i8, i8* %y, i32 3
1710 %0 = bitcast i8* %x to <4 x i16>*
1711 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1712 %c = icmp ne <4 x i32> %mask, zeroinitializer
1713 %1 = load <4 x i16>, <4 x i16>* %0, align 2
1714 %2 = bitcast i8* %z to <4 x i16>*
1715 call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
1719 define i8* @strh32_2(i8* %y, i8* %x, <4 x i32> *%m) {
1720 ; CHECK-LABEL: strh32_2:
1721 ; CHECK: @ %bb.0: @ %entry
1722 ; CHECK-NEXT: vldrh.u32 q0, [r1]
1723 ; CHECK-NEXT: vldrw.u32 q1, [r2]
1724 ; CHECK-NEXT: vpt.i32 ne, q1, zr
1725 ; CHECK-NEXT: vstrht.32 q0, [r0, #2]
1728 %z = getelementptr inbounds i8, i8* %y, i32 2
1729 %0 = bitcast i8* %x to <4 x i16>*
1730 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1731 %c = icmp ne <4 x i32> %mask, zeroinitializer
1732 %1 = load <4 x i16>, <4 x i16>* %0, align 2
1733 %2 = bitcast i8* %z to <4 x i16>*
1734 call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
1738 define i8* @strh32_254(i8* %y, i8* %x, <4 x i32> *%m) {
1739 ; CHECK-LABEL: strh32_254:
1740 ; CHECK: @ %bb.0: @ %entry
1741 ; CHECK-NEXT: vldrh.u32 q0, [r1]
1742 ; CHECK-NEXT: vldrw.u32 q1, [r2]
1743 ; CHECK-NEXT: vpt.i32 ne, q1, zr
1744 ; CHECK-NEXT: vstrht.32 q0, [r0, #254]
1747 %z = getelementptr inbounds i8, i8* %y, i32 254
1748 %0 = bitcast i8* %x to <4 x i16>*
1749 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1750 %c = icmp ne <4 x i32> %mask, zeroinitializer
1751 %1 = load <4 x i16>, <4 x i16>* %0, align 2
1752 %2 = bitcast i8* %z to <4 x i16>*
1753 call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
1757 define i8* @strh32_256(i8* %y, i8* %x, <4 x i32> *%m) {
1758 ; CHECK-LABEL: strh32_256:
1759 ; CHECK: @ %bb.0: @ %entry
1760 ; CHECK-NEXT: vldrh.u32 q0, [r1]
1761 ; CHECK-NEXT: add.w r1, r0, #256
1762 ; CHECK-NEXT: vldrw.u32 q1, [r2]
1763 ; CHECK-NEXT: vpt.i32 ne, q1, zr
1764 ; CHECK-NEXT: vstrht.32 q0, [r1]
1767 %z = getelementptr inbounds i8, i8* %y, i32 256
1768 %0 = bitcast i8* %x to <4 x i16>*
1769 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1770 %c = icmp ne <4 x i32> %mask, zeroinitializer
1771 %1 = load <4 x i16>, <4 x i16>* %0, align 2
1772 %2 = bitcast i8* %z to <4 x i16>*
1773 call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
1777 define i8* @strh32_m254(i8* %y, i8* %x, <4 x i32> *%m) {
1778 ; CHECK-LABEL: strh32_m254:
1779 ; CHECK: @ %bb.0: @ %entry
1780 ; CHECK-NEXT: vldrh.u32 q0, [r1]
1781 ; CHECK-NEXT: vldrw.u32 q1, [r2]
1782 ; CHECK-NEXT: vpt.i32 ne, q1, zr
1783 ; CHECK-NEXT: vstrht.32 q0, [r0, #-254]
1786 %z = getelementptr inbounds i8, i8* %y, i32 -254
1787 %0 = bitcast i8* %x to <4 x i16>*
1788 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1789 %c = icmp ne <4 x i32> %mask, zeroinitializer
1790 %1 = load <4 x i16>, <4 x i16>* %0, align 2
1791 %2 = bitcast i8* %z to <4 x i16>*
1792 call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
1796 define i8* @strh32_m256(i8* %y, i8* %x, <4 x i32> *%m) {
1797 ; CHECK-LABEL: strh32_m256:
1798 ; CHECK: @ %bb.0: @ %entry
1799 ; CHECK-NEXT: vldrh.u32 q0, [r1]
1800 ; CHECK-NEXT: sub.w r1, r0, #256
1801 ; CHECK-NEXT: vldrw.u32 q1, [r2]
1802 ; CHECK-NEXT: vpt.i32 ne, q1, zr
1803 ; CHECK-NEXT: vstrht.32 q0, [r1]
1806 %z = getelementptr inbounds i8, i8* %y, i32 -256
1807 %0 = bitcast i8* %x to <4 x i16>*
1808 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1809 %c = icmp ne <4 x i32> %mask, zeroinitializer
1810 %1 = load <4 x i16>, <4 x i16>* %0, align 2
1811 %2 = bitcast i8* %z to <4 x i16>*
1812 call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
1816 define i8* @strh16_4(i8* %y, i8* %x, <8 x i16> *%m) {
1817 ; CHECK-LABEL: strh16_4:
1818 ; CHECK: @ %bb.0: @ %entry
1819 ; CHECK-NEXT: vldrh.u16 q0, [r1]
1820 ; CHECK-NEXT: vldrh.u16 q1, [r2]
1821 ; CHECK-NEXT: vpt.i16 ne, q1, zr
1822 ; CHECK-NEXT: vstrht.16 q0, [r0, #4]
1825 %z = getelementptr inbounds i8, i8* %y, i32 4
1826 %0 = bitcast i8* %x to <8 x i16>*
1827 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1828 %c = icmp ne <8 x i16> %mask, zeroinitializer
1829 %1 = load <8 x i16>, <8 x i16>* %0, align 2
1830 %2 = bitcast i8* %z to <8 x i16>*
1831 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
1835 define i8* @strh16_3(i8* %y, i8* %x, <8 x i16> *%m) {
1836 ; CHECK-LABEL: strh16_3:
1837 ; CHECK: @ %bb.0: @ %entry
1838 ; CHECK-NEXT: vldrh.u16 q0, [r1]
1839 ; CHECK-NEXT: adds r1, r0, #3
1840 ; CHECK-NEXT: vldrh.u16 q1, [r2]
1841 ; CHECK-NEXT: vpt.i16 ne, q1, zr
1842 ; CHECK-NEXT: vstrht.16 q0, [r1]
1845 %z = getelementptr inbounds i8, i8* %y, i32 3
1846 %0 = bitcast i8* %x to <8 x i16>*
1847 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1848 %c = icmp ne <8 x i16> %mask, zeroinitializer
1849 %1 = load <8 x i16>, <8 x i16>* %0, align 2
1850 %2 = bitcast i8* %z to <8 x i16>*
1851 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
1855 define i8* @strh16_2(i8* %y, i8* %x, <8 x i16> *%m) {
1856 ; CHECK-LABEL: strh16_2:
1857 ; CHECK: @ %bb.0: @ %entry
1858 ; CHECK-NEXT: vldrh.u16 q0, [r1]
1859 ; CHECK-NEXT: vldrh.u16 q1, [r2]
1860 ; CHECK-NEXT: vpt.i16 ne, q1, zr
1861 ; CHECK-NEXT: vstrht.16 q0, [r0, #2]
1864 %z = getelementptr inbounds i8, i8* %y, i32 2
1865 %0 = bitcast i8* %x to <8 x i16>*
1866 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1867 %c = icmp ne <8 x i16> %mask, zeroinitializer
1868 %1 = load <8 x i16>, <8 x i16>* %0, align 2
1869 %2 = bitcast i8* %z to <8 x i16>*
1870 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
1874 define i8* @strh16_254(i8* %y, i8* %x, <8 x i16> *%m) {
1875 ; CHECK-LABEL: strh16_254:
1876 ; CHECK: @ %bb.0: @ %entry
1877 ; CHECK-NEXT: vldrh.u16 q0, [r1]
1878 ; CHECK-NEXT: vldrh.u16 q1, [r2]
1879 ; CHECK-NEXT: vpt.i16 ne, q1, zr
1880 ; CHECK-NEXT: vstrht.16 q0, [r0, #254]
1883 %z = getelementptr inbounds i8, i8* %y, i32 254
1884 %0 = bitcast i8* %x to <8 x i16>*
1885 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1886 %c = icmp ne <8 x i16> %mask, zeroinitializer
1887 %1 = load <8 x i16>, <8 x i16>* %0, align 2
1888 %2 = bitcast i8* %z to <8 x i16>*
1889 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
1893 define i8* @strh16_256(i8* %y, i8* %x, <8 x i16> *%m) {
1894 ; CHECK-LABEL: strh16_256:
1895 ; CHECK: @ %bb.0: @ %entry
1896 ; CHECK-NEXT: vldrh.u16 q0, [r1]
1897 ; CHECK-NEXT: add.w r1, r0, #256
1898 ; CHECK-NEXT: vldrh.u16 q1, [r2]
1899 ; CHECK-NEXT: vpt.i16 ne, q1, zr
1900 ; CHECK-NEXT: vstrht.16 q0, [r1]
1903 %z = getelementptr inbounds i8, i8* %y, i32 256
1904 %0 = bitcast i8* %x to <8 x i16>*
1905 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1906 %c = icmp ne <8 x i16> %mask, zeroinitializer
1907 %1 = load <8 x i16>, <8 x i16>* %0, align 2
1908 %2 = bitcast i8* %z to <8 x i16>*
1909 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
1913 define i8* @strh16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
1914 ; CHECK-LABEL: strh16_m254:
1915 ; CHECK: @ %bb.0: @ %entry
1916 ; CHECK-NEXT: vldrh.u16 q0, [r1]
1917 ; CHECK-NEXT: vldrh.u16 q1, [r2]
1918 ; CHECK-NEXT: vpt.i16 ne, q1, zr
1919 ; CHECK-NEXT: vstrht.16 q0, [r0, #-254]
1922 %z = getelementptr inbounds i8, i8* %y, i32 -254
1923 %0 = bitcast i8* %x to <8 x i16>*
1924 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1925 %c = icmp ne <8 x i16> %mask, zeroinitializer
1926 %1 = load <8 x i16>, <8 x i16>* %0, align 2
1927 %2 = bitcast i8* %z to <8 x i16>*
1928 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
1932 define i8* @strh16_m256(i8* %y, i8* %x, <8 x i16> *%m) {
1933 ; CHECK-LABEL: strh16_m256:
1934 ; CHECK: @ %bb.0: @ %entry
1935 ; CHECK-NEXT: vldrh.u16 q0, [r1]
1936 ; CHECK-NEXT: sub.w r1, r0, #256
1937 ; CHECK-NEXT: vldrh.u16 q1, [r2]
1938 ; CHECK-NEXT: vpt.i16 ne, q1, zr
1939 ; CHECK-NEXT: vstrht.16 q0, [r1]
1942 %z = getelementptr inbounds i8, i8* %y, i32 -256
1943 %0 = bitcast i8* %x to <8 x i16>*
1944 %mask = load <8 x i16>, <8 x i16>* %m, align 2
1945 %c = icmp ne <8 x i16> %mask, zeroinitializer
1946 %1 = load <8 x i16>, <8 x i16>* %0, align 2
1947 %2 = bitcast i8* %z to <8 x i16>*
1948 call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
1952 define i8* @strb32_4(i8* %y, i8* %x, <4 x i32> *%m) {
1953 ; CHECK-LABEL: strb32_4:
1954 ; CHECK: @ %bb.0: @ %entry
1955 ; CHECK-NEXT: vldrb.u32 q0, [r1]
1956 ; CHECK-NEXT: vldrw.u32 q1, [r2]
1957 ; CHECK-NEXT: vpt.i32 ne, q1, zr
1958 ; CHECK-NEXT: vstrbt.32 q0, [r0, #4]
1961 %z = getelementptr inbounds i8, i8* %y, i32 4
1962 %0 = bitcast i8* %x to <4 x i8>*
1963 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1964 %c = icmp ne <4 x i32> %mask, zeroinitializer
1965 %1 = load <4 x i8>, <4 x i8>* %0, align 1
1966 %2 = bitcast i8* %z to <4 x i8>*
1967 call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
1971 define i8* @strb32_3(i8* %y, i8* %x, <4 x i32> *%m) {
1972 ; CHECK-LABEL: strb32_3:
1973 ; CHECK: @ %bb.0: @ %entry
1974 ; CHECK-NEXT: vldrb.u32 q0, [r1]
1975 ; CHECK-NEXT: vldrw.u32 q1, [r2]
1976 ; CHECK-NEXT: vpt.i32 ne, q1, zr
1977 ; CHECK-NEXT: vstrbt.32 q0, [r0, #3]
1980 %z = getelementptr inbounds i8, i8* %y, i32 3
1981 %0 = bitcast i8* %x to <4 x i8>*
1982 %mask = load <4 x i32>, <4 x i32>* %m, align 4
1983 %c = icmp ne <4 x i32> %mask, zeroinitializer
1984 %1 = load <4 x i8>, <4 x i8>* %0, align 1
1985 %2 = bitcast i8* %z to <4 x i8>*
1986 call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
1990 define i8* @strb32_2(i8* %y, i8* %x, <4 x i32> *%m) {
1991 ; CHECK-LABEL: strb32_2:
1992 ; CHECK: @ %bb.0: @ %entry
1993 ; CHECK-NEXT: vldrb.u32 q0, [r1]
1994 ; CHECK-NEXT: vldrw.u32 q1, [r2]
1995 ; CHECK-NEXT: vpt.i32 ne, q1, zr
1996 ; CHECK-NEXT: vstrbt.32 q0, [r0, #2]
1999 %z = getelementptr inbounds i8, i8* %y, i32 2
2000 %0 = bitcast i8* %x to <4 x i8>*
2001 %mask = load <4 x i32>, <4 x i32>* %m, align 4
2002 %c = icmp ne <4 x i32> %mask, zeroinitializer
2003 %1 = load <4 x i8>, <4 x i8>* %0, align 1
2004 %2 = bitcast i8* %z to <4 x i8>*
2005 call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
2009 define i8* @strb32_127(i8* %y, i8* %x, <4 x i32> *%m) {
2010 ; CHECK-LABEL: strb32_127:
2011 ; CHECK: @ %bb.0: @ %entry
2012 ; CHECK-NEXT: vldrb.u32 q0, [r1]
2013 ; CHECK-NEXT: vldrw.u32 q1, [r2]
2014 ; CHECK-NEXT: vpt.i32 ne, q1, zr
2015 ; CHECK-NEXT: vstrbt.32 q0, [r0, #127]
2018 %z = getelementptr inbounds i8, i8* %y, i32 127
2019 %0 = bitcast i8* %x to <4 x i8>*
2020 %mask = load <4 x i32>, <4 x i32>* %m, align 4
2021 %c = icmp ne <4 x i32> %mask, zeroinitializer
2022 %1 = load <4 x i8>, <4 x i8>* %0, align 1
2023 %2 = bitcast i8* %z to <4 x i8>*
2024 call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
2028 define i8* @strb32_128(i8* %y, i8* %x, <4 x i32> *%m) {
2029 ; CHECK-LABEL: strb32_128:
2030 ; CHECK: @ %bb.0: @ %entry
2031 ; CHECK-NEXT: vldrb.u32 q0, [r1]
2032 ; CHECK-NEXT: add.w r1, r0, #128
2033 ; CHECK-NEXT: vldrw.u32 q1, [r2]
2034 ; CHECK-NEXT: vpt.i32 ne, q1, zr
2035 ; CHECK-NEXT: vstrbt.32 q0, [r1]
2038 %z = getelementptr inbounds i8, i8* %y, i32 128
2039 %0 = bitcast i8* %x to <4 x i8>*
2040 %mask = load <4 x i32>, <4 x i32>* %m, align 4
2041 %c = icmp ne <4 x i32> %mask, zeroinitializer
2042 %1 = load <4 x i8>, <4 x i8>* %0, align 1
2043 %2 = bitcast i8* %z to <4 x i8>*
2044 call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
2048 define i8* @strb32_m127(i8* %y, i8* %x, <4 x i32> *%m) {
2049 ; CHECK-LABEL: strb32_m127:
2050 ; CHECK: @ %bb.0: @ %entry
2051 ; CHECK-NEXT: vldrb.u32 q0, [r1]
2052 ; CHECK-NEXT: vldrw.u32 q1, [r2]
2053 ; CHECK-NEXT: vpt.i32 ne, q1, zr
2054 ; CHECK-NEXT: vstrbt.32 q0, [r0, #-127]
2057 %z = getelementptr inbounds i8, i8* %y, i32 -127
2058 %0 = bitcast i8* %x to <4 x i8>*
2059 %mask = load <4 x i32>, <4 x i32>* %m, align 4
2060 %c = icmp ne <4 x i32> %mask, zeroinitializer
2061 %1 = load <4 x i8>, <4 x i8>* %0, align 1
2062 %2 = bitcast i8* %z to <4 x i8>*
2063 call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
2067 define i8* @strb32_m128(i8* %y, i8* %x, <4 x i32> *%m) {
2068 ; CHECK-LABEL: strb32_m128:
2069 ; CHECK: @ %bb.0: @ %entry
2070 ; CHECK-NEXT: vldrb.u32 q0, [r1]
2071 ; CHECK-NEXT: sub.w r1, r0, #128
2072 ; CHECK-NEXT: vldrw.u32 q1, [r2]
2073 ; CHECK-NEXT: vpt.i32 ne, q1, zr
2074 ; CHECK-NEXT: vstrbt.32 q0, [r1]
2077 %z = getelementptr inbounds i8, i8* %y, i32 -128
2078 %0 = bitcast i8* %x to <4 x i8>*
2079 %mask = load <4 x i32>, <4 x i32>* %m, align 4
2080 %c = icmp ne <4 x i32> %mask, zeroinitializer
2081 %1 = load <4 x i8>, <4 x i8>* %0, align 1
2082 %2 = bitcast i8* %z to <4 x i8>*
2083 call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
2087 define i8* @strb16_4(i8* %y, i8* %x, <8 x i16> *%m) {
2088 ; CHECK-LABEL: strb16_4:
2089 ; CHECK: @ %bb.0: @ %entry
2090 ; CHECK-NEXT: vldrb.u16 q0, [r1]
2091 ; CHECK-NEXT: vldrh.u16 q1, [r2]
2092 ; CHECK-NEXT: vpt.i16 ne, q1, zr
2093 ; CHECK-NEXT: vstrbt.16 q0, [r0, #4]
2096 %z = getelementptr inbounds i8, i8* %y, i32 4
2097 %0 = bitcast i8* %x to <8 x i8>*
2098 %mask = load <8 x i16>, <8 x i16>* %m, align 2
2099 %c = icmp ne <8 x i16> %mask, zeroinitializer
2100 %1 = load <8 x i8>, <8 x i8>* %0, align 1
2101 %2 = bitcast i8* %z to <8 x i8>*
2102 call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
2106 define i8* @strb16_3(i8* %y, i8* %x, <8 x i16> *%m) {
2107 ; CHECK-LABEL: strb16_3:
2108 ; CHECK: @ %bb.0: @ %entry
2109 ; CHECK-NEXT: vldrb.u16 q0, [r1]
2110 ; CHECK-NEXT: vldrh.u16 q1, [r2]
2111 ; CHECK-NEXT: vpt.i16 ne, q1, zr
2112 ; CHECK-NEXT: vstrbt.16 q0, [r0, #3]
2115 %z = getelementptr inbounds i8, i8* %y, i32 3
2116 %0 = bitcast i8* %x to <8 x i8>*
2117 %mask = load <8 x i16>, <8 x i16>* %m, align 2
2118 %c = icmp ne <8 x i16> %mask, zeroinitializer
2119 %1 = load <8 x i8>, <8 x i8>* %0, align 1
2120 %2 = bitcast i8* %z to <8 x i8>*
2121 call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
2125 define i8* @strb16_2(i8* %y, i8* %x, <8 x i16> *%m) {
2126 ; CHECK-LABEL: strb16_2:
2127 ; CHECK: @ %bb.0: @ %entry
2128 ; CHECK-NEXT: vldrb.u16 q0, [r1]
2129 ; CHECK-NEXT: vldrh.u16 q1, [r2]
2130 ; CHECK-NEXT: vpt.i16 ne, q1, zr
2131 ; CHECK-NEXT: vstrbt.16 q0, [r0, #2]
2134 %z = getelementptr inbounds i8, i8* %y, i32 2
2135 %0 = bitcast i8* %x to <8 x i8>*
2136 %mask = load <8 x i16>, <8 x i16>* %m, align 2
2137 %c = icmp ne <8 x i16> %mask, zeroinitializer
2138 %1 = load <8 x i8>, <8 x i8>* %0, align 1
2139 %2 = bitcast i8* %z to <8 x i8>*
2140 call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
2144 define i8* @strb16_127(i8* %y, i8* %x, <8 x i16> *%m) {
2145 ; CHECK-LABEL: strb16_127:
2146 ; CHECK: @ %bb.0: @ %entry
2147 ; CHECK-NEXT: vldrb.u16 q0, [r1]
2148 ; CHECK-NEXT: vldrh.u16 q1, [r2]
2149 ; CHECK-NEXT: vpt.i16 ne, q1, zr
2150 ; CHECK-NEXT: vstrbt.16 q0, [r0, #127]
2153 %z = getelementptr inbounds i8, i8* %y, i32 127
2154 %0 = bitcast i8* %x to <8 x i8>*
2155 %mask = load <8 x i16>, <8 x i16>* %m, align 2
2156 %c = icmp ne <8 x i16> %mask, zeroinitializer
2157 %1 = load <8 x i8>, <8 x i8>* %0, align 1
2158 %2 = bitcast i8* %z to <8 x i8>*
2159 call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
2163 define i8* @strb16_128(i8* %y, i8* %x, <8 x i16> *%m) {
2164 ; CHECK-LABEL: strb16_128:
2165 ; CHECK: @ %bb.0: @ %entry
2166 ; CHECK-NEXT: vldrb.u16 q0, [r1]
2167 ; CHECK-NEXT: add.w r1, r0, #128
2168 ; CHECK-NEXT: vldrh.u16 q1, [r2]
2169 ; CHECK-NEXT: vpt.i16 ne, q1, zr
2170 ; CHECK-NEXT: vstrbt.16 q0, [r1]
2173 %z = getelementptr inbounds i8, i8* %y, i32 128
2174 %0 = bitcast i8* %x to <8 x i8>*
2175 %mask = load <8 x i16>, <8 x i16>* %m, align 2
2176 %c = icmp ne <8 x i16> %mask, zeroinitializer
2177 %1 = load <8 x i8>, <8 x i8>* %0, align 1
2178 %2 = bitcast i8* %z to <8 x i8>*
2179 call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
2183 define i8* @strb16_m127(i8* %y, i8* %x, <8 x i16> *%m) {
2184 ; CHECK-LABEL: strb16_m127:
2185 ; CHECK: @ %bb.0: @ %entry
2186 ; CHECK-NEXT: vldrb.u16 q0, [r1]
2187 ; CHECK-NEXT: vldrh.u16 q1, [r2]
2188 ; CHECK-NEXT: vpt.i16 ne, q1, zr
2189 ; CHECK-NEXT: vstrbt.16 q0, [r0, #-127]
2192 %z = getelementptr inbounds i8, i8* %y, i32 -127
2193 %0 = bitcast i8* %x to <8 x i8>*
2194 %mask = load <8 x i16>, <8 x i16>* %m, align 2
2195 %c = icmp ne <8 x i16> %mask, zeroinitializer
2196 %1 = load <8 x i8>, <8 x i8>* %0, align 1
2197 %2 = bitcast i8* %z to <8 x i8>*
2198 call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
2202 define i8* @strb16_m128(i8* %y, i8* %x, <8 x i16> *%m) {
2203 ; CHECK-LABEL: strb16_m128:
2204 ; CHECK: @ %bb.0: @ %entry
2205 ; CHECK-NEXT: vldrb.u16 q0, [r1]
2206 ; CHECK-NEXT: sub.w r1, r0, #128
2207 ; CHECK-NEXT: vldrh.u16 q1, [r2]
2208 ; CHECK-NEXT: vpt.i16 ne, q1, zr
2209 ; CHECK-NEXT: vstrbt.16 q0, [r1]
2212 %z = getelementptr inbounds i8, i8* %y, i32 -128
2213 %0 = bitcast i8* %x to <8 x i8>*
2214 %mask = load <8 x i16>, <8 x i16>* %m, align 2
2215 %c = icmp ne <8 x i16> %mask, zeroinitializer
2216 %1 = load <8 x i8>, <8 x i8>* %0, align 1
2217 %2 = bitcast i8* %z to <8 x i8>*
2218 call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
2222 define i8* @strb8_4(i8* %y, i8* %x, <16 x i8> *%m) {
2223 ; CHECK-LABEL: strb8_4:
2224 ; CHECK: @ %bb.0: @ %entry
2225 ; CHECK-NEXT: vldrb.u8 q0, [r1]
2226 ; CHECK-NEXT: vldrb.u8 q1, [r2]
2227 ; CHECK-NEXT: vpt.i8 ne, q1, zr
2228 ; CHECK-NEXT: vstrbt.8 q0, [r0, #4]
2231 %z = getelementptr inbounds i8, i8* %y, i32 4
2232 %0 = bitcast i8* %x to <16 x i8>*
2233 %mask = load <16 x i8>, <16 x i8>* %m, align 1
2234 %c = icmp ne <16 x i8> %mask, zeroinitializer
2235 %1 = load <16 x i8>, <16 x i8>* %0, align 1
2236 %2 = bitcast i8* %z to <16 x i8>*
2237 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
2241 define i8* @strb8_3(i8* %y, i8* %x, <16 x i8> *%m) {
2242 ; CHECK-LABEL: strb8_3:
2243 ; CHECK: @ %bb.0: @ %entry
2244 ; CHECK-NEXT: vldrb.u8 q0, [r1]
2245 ; CHECK-NEXT: vldrb.u8 q1, [r2]
2246 ; CHECK-NEXT: vpt.i8 ne, q1, zr
2247 ; CHECK-NEXT: vstrbt.8 q0, [r0, #3]
2250 %z = getelementptr inbounds i8, i8* %y, i32 3
2251 %0 = bitcast i8* %x to <16 x i8>*
2252 %mask = load <16 x i8>, <16 x i8>* %m, align 1
2253 %c = icmp ne <16 x i8> %mask, zeroinitializer
2254 %1 = load <16 x i8>, <16 x i8>* %0, align 1
2255 %2 = bitcast i8* %z to <16 x i8>*
2256 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
2260 define i8* @strb8_2(i8* %y, i8* %x, <16 x i8> *%m) {
2261 ; CHECK-LABEL: strb8_2:
2262 ; CHECK: @ %bb.0: @ %entry
2263 ; CHECK-NEXT: vldrb.u8 q0, [r1]
2264 ; CHECK-NEXT: vldrb.u8 q1, [r2]
2265 ; CHECK-NEXT: vpt.i8 ne, q1, zr
2266 ; CHECK-NEXT: vstrbt.8 q0, [r0, #2]
2269 %z = getelementptr inbounds i8, i8* %y, i32 2
2270 %0 = bitcast i8* %x to <16 x i8>*
2271 %mask = load <16 x i8>, <16 x i8>* %m, align 1
2272 %c = icmp ne <16 x i8> %mask, zeroinitializer
2273 %1 = load <16 x i8>, <16 x i8>* %0, align 1
2274 %2 = bitcast i8* %z to <16 x i8>*
2275 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
2279 define i8* @strb8_127(i8* %y, i8* %x, <16 x i8> *%m) {
2280 ; CHECK-LABEL: strb8_127:
2281 ; CHECK: @ %bb.0: @ %entry
2282 ; CHECK-NEXT: vldrb.u8 q0, [r1]
2283 ; CHECK-NEXT: vldrb.u8 q1, [r2]
2284 ; CHECK-NEXT: vpt.i8 ne, q1, zr
2285 ; CHECK-NEXT: vstrbt.8 q0, [r0, #127]
2288 %z = getelementptr inbounds i8, i8* %y, i32 127
2289 %0 = bitcast i8* %x to <16 x i8>*
2290 %mask = load <16 x i8>, <16 x i8>* %m, align 1
2291 %c = icmp ne <16 x i8> %mask, zeroinitializer
2292 %1 = load <16 x i8>, <16 x i8>* %0, align 1
2293 %2 = bitcast i8* %z to <16 x i8>*
2294 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
2298 define i8* @strb8_128(i8* %y, i8* %x, <16 x i8> *%m) {
2299 ; CHECK-LABEL: strb8_128:
2300 ; CHECK: @ %bb.0: @ %entry
2301 ; CHECK-NEXT: vldrb.u8 q0, [r1]
2302 ; CHECK-NEXT: add.w r1, r0, #128
2303 ; CHECK-NEXT: vldrb.u8 q1, [r2]
2304 ; CHECK-NEXT: vpt.i8 ne, q1, zr
2305 ; CHECK-NEXT: vstrbt.8 q0, [r1]
2308 %z = getelementptr inbounds i8, i8* %y, i32 128
2309 %0 = bitcast i8* %x to <16 x i8>*
2310 %mask = load <16 x i8>, <16 x i8>* %m, align 1
2311 %c = icmp ne <16 x i8> %mask, zeroinitializer
2312 %1 = load <16 x i8>, <16 x i8>* %0, align 1
2313 %2 = bitcast i8* %z to <16 x i8>*
2314 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
2318 define i8* @strb8_m127(i8* %y, i8* %x, <16 x i8> *%m) {
2319 ; CHECK-LABEL: strb8_m127:
2320 ; CHECK: @ %bb.0: @ %entry
2321 ; CHECK-NEXT: vldrb.u8 q0, [r1]
2322 ; CHECK-NEXT: vldrb.u8 q1, [r2]
2323 ; CHECK-NEXT: vpt.i8 ne, q1, zr
2324 ; CHECK-NEXT: vstrbt.8 q0, [r0, #-127]
2327 %z = getelementptr inbounds i8, i8* %y, i32 -127
2328 %0 = bitcast i8* %x to <16 x i8>*
2329 %mask = load <16 x i8>, <16 x i8>* %m, align 1
2330 %c = icmp ne <16 x i8> %mask, zeroinitializer
2331 %1 = load <16 x i8>, <16 x i8>* %0, align 1
2332 %2 = bitcast i8* %z to <16 x i8>*
2333 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
2337 define i8* @strb8_m128(i8* %y, i8* %x, <16 x i8> *%m) {
2338 ; CHECK-LABEL: strb8_m128:
2339 ; CHECK: @ %bb.0: @ %entry
2340 ; CHECK-NEXT: vldrb.u8 q0, [r1]
2341 ; CHECK-NEXT: sub.w r1, r0, #128
2342 ; CHECK-NEXT: vldrb.u8 q1, [r2]
2343 ; CHECK-NEXT: vpt.i8 ne, q1, zr
2344 ; CHECK-NEXT: vstrbt.8 q0, [r1]
2347 %z = getelementptr inbounds i8, i8* %y, i32 -128
2348 %0 = bitcast i8* %x to <16 x i8>*
2349 %mask = load <16 x i8>, <16 x i8>* %m, align 1
2350 %c = icmp ne <16 x i8> %mask, zeroinitializer
2351 %1 = load <16 x i8>, <16 x i8>* %0, align 1
2352 %2 = bitcast i8* %z to <16 x i8>*
2353 call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
2357 define i8* @strwf32_4(i8* %y, i8* %x, <4 x i32> *%m) {
2358 ; CHECK-LABEL: strwf32_4:
2359 ; CHECK: @ %bb.0: @ %entry
2360 ; CHECK-NEXT: vldrw.u32 q0, [r1]
2361 ; CHECK-NEXT: vldrw.u32 q1, [r2]
2362 ; CHECK-NEXT: vpt.i32 ne, q1, zr
2363 ; CHECK-NEXT: vstrwt.32 q0, [r0, #4]
2366 %z = getelementptr inbounds i8, i8* %y, i32 4
2367 %0 = bitcast i8* %x to <4 x float>*
2368 %mask = load <4 x i32>, <4 x i32>* %m, align 4
2369 %c = icmp ne <4 x i32> %mask, zeroinitializer
2370 %1 = load <4 x float>, <4 x float>* %0, align 4
2371 %2 = bitcast i8* %z to <4 x float>*
2372 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
2376 define i8* @strwf32_3(i8* %y, i8* %x, <4 x i32> *%m) {
2377 ; CHECK-LABEL: strwf32_3:
2378 ; CHECK: @ %bb.0: @ %entry
2379 ; CHECK-NEXT: vldrw.u32 q0, [r1]
2380 ; CHECK-NEXT: adds r1, r0, #3
2381 ; CHECK-NEXT: vldrw.u32 q1, [r2]
2382 ; CHECK-NEXT: vpt.i32 ne, q1, zr
2383 ; CHECK-NEXT: vstrwt.32 q0, [r1]
2386 %z = getelementptr inbounds i8, i8* %y, i32 3
2387 %0 = bitcast i8* %x to <4 x float>*
2388 %mask = load <4 x i32>, <4 x i32>* %m, align 4
2389 %c = icmp ne <4 x i32> %mask, zeroinitializer
2390 %1 = load <4 x float>, <4 x float>* %0, align 4
2391 %2 = bitcast i8* %z to <4 x float>*
2392 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
2396 define i8* @strwf32_2(i8* %y, i8* %x, <4 x i32> *%m) {
2397 ; CHECK-LABEL: strwf32_2:
2398 ; CHECK: @ %bb.0: @ %entry
2399 ; CHECK-NEXT: vldrw.u32 q0, [r1]
2400 ; CHECK-NEXT: adds r1, r0, #2
2401 ; CHECK-NEXT: vldrw.u32 q1, [r2]
2402 ; CHECK-NEXT: vpt.i32 ne, q1, zr
2403 ; CHECK-NEXT: vstrwt.32 q0, [r1]
2406 %z = getelementptr inbounds i8, i8* %y, i32 2
2407 %0 = bitcast i8* %x to <4 x float>*
2408 %mask = load <4 x i32>, <4 x i32>* %m, align 4
2409 %c = icmp ne <4 x i32> %mask, zeroinitializer
2410 %1 = load <4 x float>, <4 x float>* %0, align 4
2411 %2 = bitcast i8* %z to <4 x float>*
2412 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
2416 define i8* @strwf32_508(i8* %y, i8* %x, <4 x i32> *%m) {
2417 ; CHECK-LABEL: strwf32_508:
2418 ; CHECK: @ %bb.0: @ %entry
2419 ; CHECK-NEXT: vldrw.u32 q0, [r1]
2420 ; CHECK-NEXT: vldrw.u32 q1, [r2]
2421 ; CHECK-NEXT: vpt.i32 ne, q1, zr
2422 ; CHECK-NEXT: vstrwt.32 q0, [r0, #508]
2425 %z = getelementptr inbounds i8, i8* %y, i32 508
2426 %0 = bitcast i8* %x to <4 x float>*
2427 %mask = load <4 x i32>, <4 x i32>* %m, align 4
2428 %c = icmp ne <4 x i32> %mask, zeroinitializer
2429 %1 = load <4 x float>, <4 x float>* %0, align 4
2430 %2 = bitcast i8* %z to <4 x float>*
2431 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
2435 define i8* @strwf32_512(i8* %y, i8* %x, <4 x i32> *%m) {
2436 ; CHECK-LABEL: strwf32_512:
2437 ; CHECK: @ %bb.0: @ %entry
2438 ; CHECK-NEXT: vldrw.u32 q0, [r1]
2439 ; CHECK-NEXT: add.w r1, r0, #512
2440 ; CHECK-NEXT: vldrw.u32 q1, [r2]
2441 ; CHECK-NEXT: vpt.i32 ne, q1, zr
2442 ; CHECK-NEXT: vstrwt.32 q0, [r1]
2445 %z = getelementptr inbounds i8, i8* %y, i32 512
2446 %0 = bitcast i8* %x to <4 x float>*
2447 %mask = load <4 x i32>, <4 x i32>* %m, align 4
2448 %c = icmp ne <4 x i32> %mask, zeroinitializer
2449 %1 = load <4 x float>, <4 x float>* %0, align 4
2450 %2 = bitcast i8* %z to <4 x float>*
2451 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
2455 define i8* @strwf32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
2456 ; CHECK-LABEL: strwf32_m508:
2457 ; CHECK: @ %bb.0: @ %entry
2458 ; CHECK-NEXT: vldrw.u32 q0, [r1]
2459 ; CHECK-NEXT: vldrw.u32 q1, [r2]
2460 ; CHECK-NEXT: vpt.i32 ne, q1, zr
2461 ; CHECK-NEXT: vstrwt.32 q0, [r0, #-508]
2464 %z = getelementptr inbounds i8, i8* %y, i32 -508
2465 %0 = bitcast i8* %x to <4 x float>*
2466 %mask = load <4 x i32>, <4 x i32>* %m, align 4
2467 %c = icmp ne <4 x i32> %mask, zeroinitializer
2468 %1 = load <4 x float>, <4 x float>* %0, align 4
2469 %2 = bitcast i8* %z to <4 x float>*
2470 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
2474 define i8* @strwf32_m512(i8* %y, i8* %x, <4 x i32> *%m) {
2475 ; CHECK-LABEL: strwf32_m512:
2476 ; CHECK: @ %bb.0: @ %entry
2477 ; CHECK-NEXT: vldrw.u32 q0, [r1]
2478 ; CHECK-NEXT: sub.w r1, r0, #512
2479 ; CHECK-NEXT: vldrw.u32 q1, [r2]
2480 ; CHECK-NEXT: vpt.i32 ne, q1, zr
2481 ; CHECK-NEXT: vstrwt.32 q0, [r1]
2484 %z = getelementptr inbounds i8, i8* %y, i32 -512
2485 %0 = bitcast i8* %x to <4 x float>*
2486 %mask = load <4 x i32>, <4 x i32>* %m, align 4
2487 %c = icmp ne <4 x i32> %mask, zeroinitializer
2488 %1 = load <4 x float>, <4 x float>* %0, align 4
2489 %2 = bitcast i8* %z to <4 x float>*
2490 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
2494 define i8* @strhf16_4(i8* %y, i8* %x, <8 x i16> *%m) {
2495 ; CHECK-LABEL: strhf16_4:
2496 ; CHECK: @ %bb.0: @ %entry
2497 ; CHECK-NEXT: vldrh.u16 q0, [r1]
2498 ; CHECK-NEXT: vldrh.u16 q1, [r2]
2499 ; CHECK-NEXT: vpt.i16 ne, q1, zr
2500 ; CHECK-NEXT: vstrht.16 q0, [r0, #4]
2503 %z = getelementptr inbounds i8, i8* %y, i32 4
2504 %0 = bitcast i8* %x to <8 x half>*
2505 %mask = load <8 x i16>, <8 x i16>* %m, align 2
2506 %c = icmp ne <8 x i16> %mask, zeroinitializer
2507 %1 = load <8 x half>, <8 x half>* %0, align 2
2508 %2 = bitcast i8* %z to <8 x half>*
2509 call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
2513 define i8* @strhf16_3(i8* %y, i8* %x, <8 x i16> *%m) {
2514 ; CHECK-LABEL: strhf16_3:
2515 ; CHECK: @ %bb.0: @ %entry
2516 ; CHECK-NEXT: vldrh.u16 q0, [r1]
2517 ; CHECK-NEXT: adds r1, r0, #3
2518 ; CHECK-NEXT: vldrh.u16 q1, [r2]
2519 ; CHECK-NEXT: vpt.i16 ne, q1, zr
2520 ; CHECK-NEXT: vstrht.16 q0, [r1]
2523 %z = getelementptr inbounds i8, i8* %y, i32 3
2524 %0 = bitcast i8* %x to <8 x half>*
2525 %mask = load <8 x i16>, <8 x i16>* %m, align 2
2526 %c = icmp ne <8 x i16> %mask, zeroinitializer
2527 %1 = load <8 x half>, <8 x half>* %0, align 2
2528 %2 = bitcast i8* %z to <8 x half>*
2529 call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
2533 define i8* @strhf16_2(i8* %y, i8* %x, <8 x i16> *%m) {
2534 ; CHECK-LABEL: strhf16_2:
2535 ; CHECK: @ %bb.0: @ %entry
2536 ; CHECK-NEXT: vldrh.u16 q0, [r1]
2537 ; CHECK-NEXT: vldrh.u16 q1, [r2]
2538 ; CHECK-NEXT: vpt.i16 ne, q1, zr
2539 ; CHECK-NEXT: vstrht.16 q0, [r0, #2]
2542 %z = getelementptr inbounds i8, i8* %y, i32 2
2543 %0 = bitcast i8* %x to <8 x half>*
2544 %mask = load <8 x i16>, <8 x i16>* %m, align 2
2545 %c = icmp ne <8 x i16> %mask, zeroinitializer
2546 %1 = load <8 x half>, <8 x half>* %0, align 2
2547 %2 = bitcast i8* %z to <8 x half>*
2548 call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
2552 define i8* @strhf16_254(i8* %y, i8* %x, <8 x i16> *%m) {
2553 ; CHECK-LABEL: strhf16_254:
2554 ; CHECK: @ %bb.0: @ %entry
2555 ; CHECK-NEXT: vldrh.u16 q0, [r1]
2556 ; CHECK-NEXT: vldrh.u16 q1, [r2]
2557 ; CHECK-NEXT: vpt.i16 ne, q1, zr
2558 ; CHECK-NEXT: vstrht.16 q0, [r0, #254]
2561 %z = getelementptr inbounds i8, i8* %y, i32 254
2562 %0 = bitcast i8* %x to <8 x half>*
2563 %mask = load <8 x i16>, <8 x i16>* %m, align 2
2564 %c = icmp ne <8 x i16> %mask, zeroinitializer
2565 %1 = load <8 x half>, <8 x half>* %0, align 2
2566 %2 = bitcast i8* %z to <8 x half>*
2567 call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
2571 define i8* @strhf16_256(i8* %y, i8* %x, <8 x i16> *%m) {
2572 ; CHECK-LABEL: strhf16_256:
2573 ; CHECK: @ %bb.0: @ %entry
2574 ; CHECK-NEXT: vldrh.u16 q0, [r1]
2575 ; CHECK-NEXT: add.w r1, r0, #256
2576 ; CHECK-NEXT: vldrh.u16 q1, [r2]
2577 ; CHECK-NEXT: vpt.i16 ne, q1, zr
2578 ; CHECK-NEXT: vstrht.16 q0, [r1]
2581 %z = getelementptr inbounds i8, i8* %y, i32 256
2582 %0 = bitcast i8* %x to <8 x half>*
2583 %mask = load <8 x i16>, <8 x i16>* %m, align 2
2584 %c = icmp ne <8 x i16> %mask, zeroinitializer
2585 %1 = load <8 x half>, <8 x half>* %0, align 2
2586 %2 = bitcast i8* %z to <8 x half>*
2587 call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
2591 define i8* @strhf16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
2592 ; CHECK-LABEL: strhf16_m254:
2593 ; CHECK: @ %bb.0: @ %entry
2594 ; CHECK-NEXT: vldrh.u16 q0, [r1]
2595 ; CHECK-NEXT: vldrh.u16 q1, [r2]
2596 ; CHECK-NEXT: vpt.i16 ne, q1, zr
2597 ; CHECK-NEXT: vstrht.16 q0, [r0, #-254]
2600 %z = getelementptr inbounds i8, i8* %y, i32 -254
2601 %0 = bitcast i8* %x to <8 x half>*
2602 %mask = load <8 x i16>, <8 x i16>* %m, align 2
2603 %c = icmp ne <8 x i16> %mask, zeroinitializer
2604 %1 = load <8 x half>, <8 x half>* %0, align 2
2605 %2 = bitcast i8* %z to <8 x half>*
2606 call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
2610 define i8* @strhf16_m256(i8* %y, i8* %x, <8 x i16> *%m) {
2611 ; CHECK-LABEL: strhf16_m256:
2612 ; CHECK: @ %bb.0: @ %entry
2613 ; CHECK-NEXT: vldrh.u16 q0, [r1]
2614 ; CHECK-NEXT: sub.w r1, r0, #256
2615 ; CHECK-NEXT: vldrh.u16 q1, [r2]
2616 ; CHECK-NEXT: vpt.i16 ne, q1, zr
2617 ; CHECK-NEXT: vstrht.16 q0, [r1]
2620 %z = getelementptr inbounds i8, i8* %y, i32 -256
2621 %0 = bitcast i8* %x to <8 x half>*
2622 %mask = load <8 x i16>, <8 x i16>* %m, align 2
2623 %c = icmp ne <8 x i16> %mask, zeroinitializer
2624 %1 = load <8 x half>, <8 x half>* %0, align 2
2625 %2 = bitcast i8* %z to <8 x half>*
2626 call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
2630 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
2631 declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
2632 declare <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
2633 declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)
2634 declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
2635 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
2636 declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
2637 declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>)
2639 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
2640 declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
2641 declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>)
2642 declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
2643 declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)
2644 declare void @llvm.masked.store.v4i8.p0v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>)
2645 declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
2646 declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>)