1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-LD1R
3 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+no-sve-fp-ld1r < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NO-LD1R
5 ; Check that an ld1r* instruction is generated to splat a scalar during the load,
6 ; rather than a mov from a scalar to a vector register (which would require the vector unit).
8 ; one-off: ld1r_stack checks that ld1rb works with stack objects.
11 ; types = [i8, i16, i32, i64, half, float, double]
12 ; methods = [direct load, gep upper bound - 1, gep out of range x {neg,pos}, sext..., zext..., unpacked_floats...]
; External i8 global; ld1r_stack reads it with a volatile load.
15 @g8 = external global i8
17 ; One-off test for splatted value coming from stack load.
; NOTE(review): the definition of %valp is not among the lines visible here.
; Per the CHECK lines the byte from @g8 is stored at [sp, #12], and the splat
; load reads %valp + 2, which is expected to become ld1rb from [sp, #14].
18 define <vscale x 16 x i8> @ld1r_stack() {
19 ; CHECK-LABEL: ld1r_stack:
21 ; CHECK-NEXT: sub sp, sp, #16
22 ; CHECK-NEXT: .cfi_def_cfa_offset 16
23 ; CHECK-NEXT: ptrue p0.b
24 ; CHECK-NEXT: adrp x8, :got:g8
25 ; CHECK-NEXT: ldr x8, [x8, :got_lo12:g8]
26 ; CHECK-NEXT: ldrb w8, [x8]
27 ; CHECK-NEXT: strb w8, [sp, #12]
28 ; CHECK-NEXT: ld1rb { z0.b }, p0/z, [sp, #14]
29 ; CHECK-NEXT: add sp, sp, #16
32 %valp2 = load volatile i8, ptr @g8
33 store volatile i8 %valp2, ptr %valp
34 %valp3 = getelementptr i8, ptr %valp, i32 2
35 %val = load i8, ptr %valp3
36 %1 = insertelement <vscale x 16 x i8> undef, i8 %val, i32 0
37 %2 = shufflevector <vscale x 16 x i8> %1, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
38 ret <vscale x 16 x i8> %2
; Baseline i8 case: a byte loaded straight from %valp is splatted to every lane
; (insertelement at index 0 + zero-mask shufflevector); the checks expect ld1rb with no offset.
41 define <vscale x 16 x i8> @ld1rb(ptr %valp) {
44 ; CHECK-NEXT: ptrue p0.b
45 ; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0]
47 %val = load i8, ptr %valp
48 %ins = insertelement <vscale x 16 x i8> undef, i8 %val, i32 0
49 %shf = shufflevector <vscale x 16 x i8> %ins, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
50 ret <vscale x 16 x i8> %shf
; i8 splat from %valp + 63 bytes: the checks expect the offset folded into the ld1rb immediate.
53 define <vscale x 16 x i8> @ld1rb_gep(ptr %valp) {
54 ; CHECK-LABEL: ld1rb_gep:
56 ; CHECK-NEXT: ptrue p0.b
57 ; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0, #63]
59 %valp2 = getelementptr i8, ptr %valp, i32 63
60 %val = load i8, ptr %valp2
61 %ins = insertelement <vscale x 16 x i8> undef, i8 %val, i32 0
62 %shf = shufflevector <vscale x 16 x i8> %ins, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
63 ret <vscale x 16 x i8> %shf
; i8 splat from %valp + 64 bytes: the checks expect the offset NOT to be folded;
; the address is formed with a separate add and ld1rb uses no immediate.
66 define <vscale x 16 x i8> @ld1rb_gep_out_of_range_up(ptr %valp) {
67 ; CHECK-LABEL: ld1rb_gep_out_of_range_up:
69 ; CHECK-NEXT: ptrue p0.b
70 ; CHECK-NEXT: add x8, x0, #64
71 ; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x8]
73 %valp2 = getelementptr i8, ptr %valp, i32 64
74 %val = load i8, ptr %valp2
75 %ins = insertelement <vscale x 16 x i8> undef, i8 %val, i32 0
76 %shf = shufflevector <vscale x 16 x i8> %ins, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
77 ret <vscale x 16 x i8> %shf
; i8 splat from %valp - 1 byte: the checks expect this negative offset NOT to be folded;
; the address is formed with a separate sub and ld1rb uses no immediate.
80 define <vscale x 16 x i8> @ld1rb_gep_out_of_range_down(ptr %valp) {
81 ; CHECK-LABEL: ld1rb_gep_out_of_range_down:
83 ; CHECK-NEXT: ptrue p0.b
84 ; CHECK-NEXT: sub x8, x0, #1
85 ; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x8]
87 %valp2 = getelementptr i8, ptr %valp, i32 -1
88 %val = load i8, ptr %valp2
89 %ins = insertelement <vscale x 16 x i8> undef, i8 %val, i32 0
90 %shf = shufflevector <vscale x 16 x i8> %ins, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
91 ret <vscale x 16 x i8> %shf
; Splat of an i8 load zero-extended to i16: the checks expect the extension to be
; absorbed into ld1rb writing .h lanes.
94 define <vscale x 8 x i16> @ld1rb_i8_i16_zext(ptr %valp) {
95 ; CHECK-LABEL: ld1rb_i8_i16_zext:
97 ; CHECK-NEXT: ptrue p0.h
98 ; CHECK-NEXT: ld1rb { z0.h }, p0/z, [x0]
100 %val = load i8, ptr %valp
101 %ext = zext i8 %val to i16
102 %ins = insertelement <vscale x 8 x i16> undef, i16 %ext, i32 0
103 %shf = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
104 ret <vscale x 8 x i16> %shf
; Splat of an i8 load sign-extended to i16: the checks expect the signed variant
; ld1rsb writing .h lanes.
107 define <vscale x 8 x i16> @ld1rb_i8_i16_sext(ptr %valp) {
108 ; CHECK-LABEL: ld1rb_i8_i16_sext:
110 ; CHECK-NEXT: ptrue p0.h
111 ; CHECK-NEXT: ld1rsb { z0.h }, p0/z, [x0]
113 %val = load i8, ptr %valp
114 %ext = sext i8 %val to i16
115 %ins = insertelement <vscale x 8 x i16> undef, i16 %ext, i32 0
116 %shf = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
117 ret <vscale x 8 x i16> %shf
; Splat of an i8 load zero-extended to i32: the checks expect ld1rb writing .s lanes.
120 define <vscale x 4 x i32> @ld1rb_i8_i32_zext(ptr %valp) {
121 ; CHECK-LABEL: ld1rb_i8_i32_zext:
123 ; CHECK-NEXT: ptrue p0.s
124 ; CHECK-NEXT: ld1rb { z0.s }, p0/z, [x0]
126 %val = load i8, ptr %valp
127 %ext = zext i8 %val to i32
128 %ins = insertelement <vscale x 4 x i32> undef, i32 %ext, i32 0
129 %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
130 ret <vscale x 4 x i32> %shf
; Splat of an i8 load sign-extended to i32: the checks expect ld1rsb writing .s lanes.
133 define <vscale x 4 x i32> @ld1rb_i8_i32_sext(ptr %valp) {
134 ; CHECK-LABEL: ld1rb_i8_i32_sext:
136 ; CHECK-NEXT: ptrue p0.s
137 ; CHECK-NEXT: ld1rsb { z0.s }, p0/z, [x0]
139 %val = load i8, ptr %valp
140 %ext = sext i8 %val to i32
141 %ins = insertelement <vscale x 4 x i32> undef, i32 %ext, i32 0
142 %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
143 ret <vscale x 4 x i32> %shf
; Splat of an i8 load zero-extended to i64: the checks expect ld1rb writing .d lanes.
146 define <vscale x 2 x i64> @ld1rb_i8_i64_zext(ptr %valp) {
147 ; CHECK-LABEL: ld1rb_i8_i64_zext:
149 ; CHECK-NEXT: ptrue p0.d
150 ; CHECK-NEXT: ld1rb { z0.d }, p0/z, [x0]
152 %val = load i8, ptr %valp
153 %ext = zext i8 %val to i64
154 %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
155 %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
156 ret <vscale x 2 x i64> %shf
; Splat of an i8 load sign-extended to i64: the checks expect ld1rsb writing .d lanes.
159 define <vscale x 2 x i64> @ld1rb_i8_i64_sext(ptr %valp) {
160 ; CHECK-LABEL: ld1rb_i8_i64_sext:
162 ; CHECK-NEXT: ptrue p0.d
163 ; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0]
165 %val = load i8, ptr %valp
166 %ext = sext i8 %val to i64
167 %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
168 %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
169 ret <vscale x 2 x i64> %shf
; Baseline i16 case: splat of a halfword loaded straight from %valp; the checks
; expect ld1rh with no offset.
172 define <vscale x 8 x i16> @ld1rh(ptr %valp) {
173 ; CHECK-LABEL: ld1rh:
175 ; CHECK-NEXT: ptrue p0.h
176 ; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0]
178 %val = load i16, ptr %valp
179 %ins = insertelement <vscale x 8 x i16> undef, i16 %val, i32 0
180 %shf = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
181 ret <vscale x 8 x i16> %shf
; i16 splat from element index 63 (126 bytes): the checks expect the byte offset
; folded into the ld1rh immediate.
184 define <vscale x 8 x i16> @ld1rh_gep(ptr %valp) {
185 ; CHECK-LABEL: ld1rh_gep:
187 ; CHECK-NEXT: ptrue p0.h
188 ; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0, #126]
190 %valp2 = getelementptr i16, ptr %valp, i32 63
191 %val = load i16, ptr %valp2
192 %ins = insertelement <vscale x 8 x i16> undef, i16 %val, i32 0
193 %shf = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
194 ret <vscale x 8 x i16> %shf
; i16 splat from element index 64 (128 bytes): the checks expect a separate add
; and an ld1rh without an immediate, i.e. the offset is not folded.
197 define <vscale x 8 x i16> @ld1rh_gep_out_of_range_up(ptr %valp) {
198 ; CHECK-LABEL: ld1rh_gep_out_of_range_up:
200 ; CHECK-NEXT: ptrue p0.h
201 ; CHECK-NEXT: add x8, x0, #128
202 ; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x8]
204 %valp2 = getelementptr i16, ptr %valp, i32 64
205 %val = load i16, ptr %valp2
206 %ins = insertelement <vscale x 8 x i16> undef, i16 %val, i32 0
207 %shf = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
208 ret <vscale x 8 x i16> %shf
; i16 splat from element index -1 (-2 bytes): the checks expect a separate sub
; and an ld1rh without an immediate, i.e. this negative offset is not folded.
211 define <vscale x 8 x i16> @ld1rh_gep_out_of_range_down(ptr %valp) {
212 ; CHECK-LABEL: ld1rh_gep_out_of_range_down:
214 ; CHECK-NEXT: ptrue p0.h
215 ; CHECK-NEXT: sub x8, x0, #2
216 ; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x8]
218 %valp2 = getelementptr i16, ptr %valp, i32 -1
219 %val = load i16, ptr %valp2
220 %ins = insertelement <vscale x 8 x i16> undef, i16 %val, i32 0
221 %shf = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
222 ret <vscale x 8 x i16> %shf
; Splat of an i16 load zero-extended to i32: the checks expect ld1rh writing .s lanes.
225 define <vscale x 4 x i32> @ld1rh_i16_i32_zext(ptr %valp) {
226 ; CHECK-LABEL: ld1rh_i16_i32_zext:
228 ; CHECK-NEXT: ptrue p0.s
229 ; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0]
231 %val = load i16, ptr %valp
232 %ext = zext i16 %val to i32
233 %ins = insertelement <vscale x 4 x i32> undef, i32 %ext, i32 0
234 %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
235 ret <vscale x 4 x i32> %shf
; Splat of an i16 load sign-extended to i32: the checks expect ld1rsh writing .s lanes.
238 define <vscale x 4 x i32> @ld1rh_i16_i32_sext(ptr %valp) {
239 ; CHECK-LABEL: ld1rh_i16_i32_sext:
241 ; CHECK-NEXT: ptrue p0.s
242 ; CHECK-NEXT: ld1rsh { z0.s }, p0/z, [x0]
244 %val = load i16, ptr %valp
245 %ext = sext i16 %val to i32
246 %ins = insertelement <vscale x 4 x i32> undef, i32 %ext, i32 0
247 %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
248 ret <vscale x 4 x i32> %shf
; Splat of an i16 load zero-extended to i64: the checks expect ld1rh writing .d lanes.
251 define <vscale x 2 x i64> @ld1rh_i16_i64_zext(ptr %valp) {
252 ; CHECK-LABEL: ld1rh_i16_i64_zext:
254 ; CHECK-NEXT: ptrue p0.d
255 ; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x0]
257 %val = load i16, ptr %valp
258 %ext = zext i16 %val to i64
259 %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
260 %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
261 ret <vscale x 2 x i64> %shf
; Splat of an i16 load sign-extended to i64: the checks expect ld1rsh writing .d lanes.
264 define <vscale x 2 x i64> @ld1rh_i16_i64_sext(ptr %valp) {
265 ; CHECK-LABEL: ld1rh_i16_i64_sext:
267 ; CHECK-NEXT: ptrue p0.d
268 ; CHECK-NEXT: ld1rsh { z0.d }, p0/z, [x0]
270 %val = load i16, ptr %valp
271 %ext = sext i16 %val to i64
272 %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
273 %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
274 ret <vscale x 2 x i64> %shf
; Baseline i32 case: splat of a word loaded straight from %valp; the checks
; expect ld1rw with no offset.
277 define <vscale x 4 x i32> @ld1rw(ptr %valp) {
278 ; CHECK-LABEL: ld1rw:
280 ; CHECK-NEXT: ptrue p0.s
281 ; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0]
283 %val = load i32, ptr %valp
284 %ins = insertelement <vscale x 4 x i32> undef, i32 %val, i32 0
285 %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
286 ret <vscale x 4 x i32> %shf
; i32 splat from element index 63 (252 bytes): the checks expect the byte offset
; folded into the ld1rw immediate.
289 define <vscale x 4 x i32> @ld1rw_gep(ptr %valp) {
290 ; CHECK-LABEL: ld1rw_gep:
292 ; CHECK-NEXT: ptrue p0.s
293 ; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0, #252]
295 %valp2 = getelementptr i32, ptr %valp, i32 63
296 %val = load i32, ptr %valp2
297 %ins = insertelement <vscale x 4 x i32> undef, i32 %val, i32 0
298 %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
299 ret <vscale x 4 x i32> %shf
; i32 splat from element index 64 (256 bytes): the checks expect a separate add
; and an ld1rw without an immediate, i.e. the offset is not folded.
302 define <vscale x 4 x i32> @ld1rw_gep_out_of_range_up(ptr %valp) {
303 ; CHECK-LABEL: ld1rw_gep_out_of_range_up:
305 ; CHECK-NEXT: ptrue p0.s
306 ; CHECK-NEXT: add x8, x0, #256
307 ; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x8]
309 %valp2 = getelementptr i32, ptr %valp, i32 64
310 %val = load i32, ptr %valp2
311 %ins = insertelement <vscale x 4 x i32> undef, i32 %val, i32 0
312 %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
313 ret <vscale x 4 x i32> %shf
; i32 splat from element index -1 (-4 bytes): the checks expect a separate sub
; and an ld1rw without an immediate, i.e. this negative offset is not folded.
316 define <vscale x 4 x i32> @ld1rw_gep_out_of_range_down(ptr %valp) {
317 ; CHECK-LABEL: ld1rw_gep_out_of_range_down:
319 ; CHECK-NEXT: ptrue p0.s
320 ; CHECK-NEXT: sub x8, x0, #4
321 ; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x8]
323 %valp2 = getelementptr i32, ptr %valp, i32 -1
324 %val = load i32, ptr %valp2
325 %ins = insertelement <vscale x 4 x i32> undef, i32 %val, i32 0
326 %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
327 ret <vscale x 4 x i32> %shf
; Splat of an i32 load zero-extended to i64: the checks expect ld1rw writing .d lanes.
330 define <vscale x 2 x i64> @ld1rw_i32_i64_zext(ptr %valp) {
331 ; CHECK-LABEL: ld1rw_i32_i64_zext:
333 ; CHECK-NEXT: ptrue p0.d
334 ; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x0]
336 %val = load i32, ptr %valp
337 %ext = zext i32 %val to i64
338 %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
339 %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
340 ret <vscale x 2 x i64> %shf
; Splat of an i32 load sign-extended to i64: the checks expect ld1rsw writing .d lanes.
343 define <vscale x 2 x i64> @ld1rw_i32_i64_sext(ptr %valp) {
344 ; CHECK-LABEL: ld1rw_i32_i64_sext:
346 ; CHECK-NEXT: ptrue p0.d
347 ; CHECK-NEXT: ld1rsw { z0.d }, p0/z, [x0]
349 %val = load i32, ptr %valp
350 %ext = sext i32 %val to i64
351 %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
352 %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
353 ret <vscale x 2 x i64> %shf
; Baseline i64 case: splat of a doubleword loaded straight from %valp; the checks
; expect ld1rd with no offset.
356 define <vscale x 2 x i64> @ld1rd(ptr %valp) {
357 ; CHECK-LABEL: ld1rd:
359 ; CHECK-NEXT: ptrue p0.d
360 ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0]
362 %val = load i64, ptr %valp
363 %ins = insertelement <vscale x 2 x i64> undef, i64 %val, i32 0
364 %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
365 ret <vscale x 2 x i64> %shf
; i64 splat from element index 63 (504 bytes): the checks expect the byte offset
; folded into the ld1rd immediate.
368 define <vscale x 2 x i64> @ld1rd_gep(ptr %valp) {
369 ; CHECK-LABEL: ld1rd_gep:
371 ; CHECK-NEXT: ptrue p0.d
372 ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0, #504]
374 %valp2 = getelementptr i64, ptr %valp, i32 63
375 %val = load i64, ptr %valp2
376 %ins = insertelement <vscale x 2 x i64> undef, i64 %val, i32 0
377 %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
378 ret <vscale x 2 x i64> %shf
; i64 splat from element index 64 (512 bytes): the checks expect a separate add
; and an ld1rd without an immediate, i.e. the offset is not folded.
381 define <vscale x 2 x i64> @ld1rd_gep_out_of_range_up(ptr %valp) {
382 ; CHECK-LABEL: ld1rd_gep_out_of_range_up:
384 ; CHECK-NEXT: ptrue p0.d
385 ; CHECK-NEXT: add x8, x0, #512
386 ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x8]
388 %valp2 = getelementptr i64, ptr %valp, i32 64
389 %val = load i64, ptr %valp2
390 %ins = insertelement <vscale x 2 x i64> undef, i64 %val, i32 0
391 %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
392 ret <vscale x 2 x i64> %shf
; i64 splat from element index -1 (-8 bytes): the checks expect a separate sub
; and an ld1rd without an immediate, i.e. this negative offset is not folded.
395 define <vscale x 2 x i64> @ld1rd_gep_out_of_range_down(ptr %valp) {
396 ; CHECK-LABEL: ld1rd_gep_out_of_range_down:
398 ; CHECK-NEXT: ptrue p0.d
399 ; CHECK-NEXT: sub x8, x0, #8
400 ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x8]
402 %valp2 = getelementptr i64, ptr %valp, i32 -1
403 %val = load i64, ptr %valp2
404 %ins = insertelement <vscale x 2 x i64> undef, i64 %val, i32 0
405 %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
406 ret <vscale x 2 x i64> %shf
; Baseline half case. The two RUN lines diverge here: CHECK-LD1R expects ld1rh,
; while CHECK-NO-LD1R (+no-sve-fp-ld1r) expects a scalar ldr followed by a mov splat.
409 define <vscale x 8 x half> @ld1rh_half(ptr %valp) {
410 ; CHECK-LD1R-LABEL: ld1rh_half:
411 ; CHECK-LD1R: // %bb.0:
412 ; CHECK-LD1R-NEXT: ptrue p0.h
413 ; CHECK-LD1R-NEXT: ld1rh { z0.h }, p0/z, [x0]
414 ; CHECK-LD1R-NEXT: ret
416 ; CHECK-NO-LD1R-LABEL: ld1rh_half:
417 ; CHECK-NO-LD1R: // %bb.0:
418 ; CHECK-NO-LD1R-NEXT: ldr h0, [x0]
419 ; CHECK-NO-LD1R-NEXT: mov z0.h, h0
420 ; CHECK-NO-LD1R-NEXT: ret
421 %val = load half, ptr %valp
422 %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
423 %shf = shufflevector <vscale x 8 x half> %ins, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
424 ret <vscale x 8 x half> %shf
; Same IR as ld1rh_half but carrying attribute group #1; under the shared CHECK
; prefix (both RUN lines) the expected code is the scalar ldr + mov splat form.
; NOTE(review): #1 is defined outside the lines visible here; presumably it
; selects a Neoverse target that avoids FP ld1r - confirm against the attributes.
427 define <vscale x 8 x half> @ld1rh_half_neoverse(ptr %valp) #1 {
428 ; CHECK-LABEL: ld1rh_half_neoverse:
430 ; CHECK-NEXT: ldr h0, [x0]
431 ; CHECK-NEXT: mov z0.h, h0
433 %val = load half, ptr %valp
434 %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
435 %shf = shufflevector <vscale x 8 x half> %ins, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
436 ret <vscale x 8 x half> %shf
; half splat from element index 63 (126 bytes): both configurations are expected
; to fold the offset, into ld1rh (CHECK-LD1R) or into the scalar ldr (CHECK-NO-LD1R).
439 define <vscale x 8 x half> @ld1rh_half_gep(ptr %valp) {
440 ; CHECK-LD1R-LABEL: ld1rh_half_gep:
441 ; CHECK-LD1R: // %bb.0:
442 ; CHECK-LD1R-NEXT: ptrue p0.h
443 ; CHECK-LD1R-NEXT: ld1rh { z0.h }, p0/z, [x0, #126]
444 ; CHECK-LD1R-NEXT: ret
446 ; CHECK-NO-LD1R-LABEL: ld1rh_half_gep:
447 ; CHECK-NO-LD1R: // %bb.0:
448 ; CHECK-NO-LD1R-NEXT: ldr h0, [x0, #126]
449 ; CHECK-NO-LD1R-NEXT: mov z0.h, h0
450 ; CHECK-NO-LD1R-NEXT: ret
451 %valp2 = getelementptr half, ptr %valp, i32 63
452 %val = load half, ptr %valp2
453 %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
454 %shf = shufflevector <vscale x 8 x half> %ins, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
455 ret <vscale x 8 x half> %shf
; half splat from element index 64 (128 bytes): CHECK-LD1R expects a separate add
; before ld1rh, whereas CHECK-NO-LD1R expects the scalar ldr to take #128 directly.
458 define <vscale x 8 x half> @ld1rh_half_gep_out_of_range_up(ptr %valp) {
459 ; CHECK-LD1R-LABEL: ld1rh_half_gep_out_of_range_up:
460 ; CHECK-LD1R: // %bb.0:
461 ; CHECK-LD1R-NEXT: ptrue p0.h
462 ; CHECK-LD1R-NEXT: add x8, x0, #128
463 ; CHECK-LD1R-NEXT: ld1rh { z0.h }, p0/z, [x8]
464 ; CHECK-LD1R-NEXT: ret
466 ; CHECK-NO-LD1R-LABEL: ld1rh_half_gep_out_of_range_up:
467 ; CHECK-NO-LD1R: // %bb.0:
468 ; CHECK-NO-LD1R-NEXT: ldr h0, [x0, #128]
469 ; CHECK-NO-LD1R-NEXT: mov z0.h, h0
470 ; CHECK-NO-LD1R-NEXT: ret
471 %valp2 = getelementptr half, ptr %valp, i32 64
472 %val = load half, ptr %valp2
473 %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
474 %shf = shufflevector <vscale x 8 x half> %ins, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
475 ret <vscale x 8 x half> %shf
; half splat from element index -1 (-2 bytes): CHECK-LD1R expects a separate sub
; before ld1rh, whereas CHECK-NO-LD1R expects ldur with #-2.
478 define <vscale x 8 x half> @ld1rh_half_gep_out_of_range_down(ptr %valp) {
479 ; CHECK-LD1R-LABEL: ld1rh_half_gep_out_of_range_down:
480 ; CHECK-LD1R: // %bb.0:
481 ; CHECK-LD1R-NEXT: ptrue p0.h
482 ; CHECK-LD1R-NEXT: sub x8, x0, #2
483 ; CHECK-LD1R-NEXT: ld1rh { z0.h }, p0/z, [x8]
484 ; CHECK-LD1R-NEXT: ret
486 ; CHECK-NO-LD1R-LABEL: ld1rh_half_gep_out_of_range_down:
487 ; CHECK-NO-LD1R: // %bb.0:
488 ; CHECK-NO-LD1R-NEXT: ldur h0, [x0, #-2]
489 ; CHECK-NO-LD1R-NEXT: mov z0.h, h0
490 ; CHECK-NO-LD1R-NEXT: ret
491 %valp2 = getelementptr half, ptr %valp, i32 -1
492 %val = load half, ptr %valp2
493 %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
494 %shf = shufflevector <vscale x 8 x half> %ins, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
495 ret <vscale x 8 x half> %shf
; Unpacked <vscale x 4 x half> splat: CHECK-LD1R expects ld1rh writing .s lanes
; under a .s predicate; CHECK-NO-LD1R expects scalar ldr + mov z0.h splat.
498 define <vscale x 4 x half> @ld1rh_half_unpacked4(ptr %valp) {
499 ; CHECK-LD1R-LABEL: ld1rh_half_unpacked4:
500 ; CHECK-LD1R: // %bb.0:
501 ; CHECK-LD1R-NEXT: ptrue p0.s
502 ; CHECK-LD1R-NEXT: ld1rh { z0.s }, p0/z, [x0]
503 ; CHECK-LD1R-NEXT: ret
505 ; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked4:
506 ; CHECK-NO-LD1R: // %bb.0:
507 ; CHECK-NO-LD1R-NEXT: ldr h0, [x0]
508 ; CHECK-NO-LD1R-NEXT: mov z0.h, h0
509 ; CHECK-NO-LD1R-NEXT: ret
510 %val = load half, ptr %valp
511 %ins = insertelement <vscale x 4 x half> undef, half %val, i32 0
512 %shf = shufflevector <vscale x 4 x half> %ins, <vscale x 4 x half> undef, <vscale x 4 x i32> zeroinitializer
513 ret <vscale x 4 x half> %shf
; Unpacked <vscale x 4 x half> splat from element index 63 (126 bytes): the offset
; is expected to fold into ld1rh (.s lanes) or into the scalar ldr.
516 define <vscale x 4 x half> @ld1rh_half_unpacked4_gep(ptr %valp) {
517 ; CHECK-LD1R-LABEL: ld1rh_half_unpacked4_gep:
518 ; CHECK-LD1R: // %bb.0:
519 ; CHECK-LD1R-NEXT: ptrue p0.s
520 ; CHECK-LD1R-NEXT: ld1rh { z0.s }, p0/z, [x0, #126]
521 ; CHECK-LD1R-NEXT: ret
523 ; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked4_gep:
524 ; CHECK-NO-LD1R: // %bb.0:
525 ; CHECK-NO-LD1R-NEXT: ldr h0, [x0, #126]
526 ; CHECK-NO-LD1R-NEXT: mov z0.h, h0
527 ; CHECK-NO-LD1R-NEXT: ret
528 %valp2 = getelementptr half, ptr %valp, i32 63
529 %val = load half, ptr %valp2
530 %ins = insertelement <vscale x 4 x half> undef, half %val, i32 0
531 %shf = shufflevector <vscale x 4 x half> %ins, <vscale x 4 x half> undef, <vscale x 4 x i32> zeroinitializer
532 ret <vscale x 4 x half> %shf
; Unpacked <vscale x 4 x half> splat from element index 64 (128 bytes): CHECK-LD1R
; expects a separate add before ld1rh; CHECK-NO-LD1R expects ldr with #128.
535 define <vscale x 4 x half> @ld1rh_half_unpacked4_gep_out_of_range_up(ptr %valp) {
536 ; CHECK-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_up:
537 ; CHECK-LD1R: // %bb.0:
538 ; CHECK-LD1R-NEXT: ptrue p0.s
539 ; CHECK-LD1R-NEXT: add x8, x0, #128
540 ; CHECK-LD1R-NEXT: ld1rh { z0.s }, p0/z, [x8]
541 ; CHECK-LD1R-NEXT: ret
543 ; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_up:
544 ; CHECK-NO-LD1R: // %bb.0:
545 ; CHECK-NO-LD1R-NEXT: ldr h0, [x0, #128]
546 ; CHECK-NO-LD1R-NEXT: mov z0.h, h0
547 ; CHECK-NO-LD1R-NEXT: ret
548 %valp2 = getelementptr half, ptr %valp, i32 64
549 %val = load half, ptr %valp2
550 %ins = insertelement <vscale x 4 x half> undef, half %val, i32 0
551 %shf = shufflevector <vscale x 4 x half> %ins, <vscale x 4 x half> undef, <vscale x 4 x i32> zeroinitializer
552 ret <vscale x 4 x half> %shf
; Unpacked <vscale x 4 x half> splat from element index -1 (-2 bytes): CHECK-LD1R
; expects a separate sub before ld1rh; CHECK-NO-LD1R expects ldur with #-2.
555 define <vscale x 4 x half> @ld1rh_half_unpacked4_gep_out_of_range_down(ptr %valp) {
556 ; CHECK-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_down:
557 ; CHECK-LD1R: // %bb.0:
558 ; CHECK-LD1R-NEXT: ptrue p0.s
559 ; CHECK-LD1R-NEXT: sub x8, x0, #2
560 ; CHECK-LD1R-NEXT: ld1rh { z0.s }, p0/z, [x8]
561 ; CHECK-LD1R-NEXT: ret
563 ; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_down:
564 ; CHECK-NO-LD1R: // %bb.0:
565 ; CHECK-NO-LD1R-NEXT: ldur h0, [x0, #-2]
566 ; CHECK-NO-LD1R-NEXT: mov z0.h, h0
567 ; CHECK-NO-LD1R-NEXT: ret
568 %valp2 = getelementptr half, ptr %valp, i32 -1
569 %val = load half, ptr %valp2
570 %ins = insertelement <vscale x 4 x half> undef, half %val, i32 0
571 %shf = shufflevector <vscale x 4 x half> %ins, <vscale x 4 x half> undef, <vscale x 4 x i32> zeroinitializer
572 ret <vscale x 4 x half> %shf
; Unpacked <vscale x 2 x half> splat: CHECK-LD1R expects ld1rh writing .d lanes
; under a .d predicate; CHECK-NO-LD1R expects scalar ldr + mov z0.h splat.
575 define <vscale x 2 x half> @ld1rh_half_unpacked2(ptr %valp) {
576 ; CHECK-LD1R-LABEL: ld1rh_half_unpacked2:
577 ; CHECK-LD1R: // %bb.0:
578 ; CHECK-LD1R-NEXT: ptrue p0.d
579 ; CHECK-LD1R-NEXT: ld1rh { z0.d }, p0/z, [x0]
580 ; CHECK-LD1R-NEXT: ret
582 ; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked2:
583 ; CHECK-NO-LD1R: // %bb.0:
584 ; CHECK-NO-LD1R-NEXT: ldr h0, [x0]
585 ; CHECK-NO-LD1R-NEXT: mov z0.h, h0
586 ; CHECK-NO-LD1R-NEXT: ret
587 %val = load half, ptr %valp
588 %ins = insertelement <vscale x 2 x half> undef, half %val, i32 0
589 %shf = shufflevector <vscale x 2 x half> %ins, <vscale x 2 x half> undef, <vscale x 2 x i32> zeroinitializer
590 ret <vscale x 2 x half> %shf
; Unpacked <vscale x 2 x half> splat from element index 63 (126 bytes): the offset
; is expected to fold into ld1rh (.d lanes) or into the scalar ldr.
593 define <vscale x 2 x half> @ld1rh_half_unpacked2_gep(ptr %valp) {
594 ; CHECK-LD1R-LABEL: ld1rh_half_unpacked2_gep:
595 ; CHECK-LD1R: // %bb.0:
596 ; CHECK-LD1R-NEXT: ptrue p0.d
597 ; CHECK-LD1R-NEXT: ld1rh { z0.d }, p0/z, [x0, #126]
598 ; CHECK-LD1R-NEXT: ret
600 ; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked2_gep:
601 ; CHECK-NO-LD1R: // %bb.0:
602 ; CHECK-NO-LD1R-NEXT: ldr h0, [x0, #126]
603 ; CHECK-NO-LD1R-NEXT: mov z0.h, h0
604 ; CHECK-NO-LD1R-NEXT: ret
605 %valp2 = getelementptr half, ptr %valp, i32 63
606 %val = load half, ptr %valp2
607 %ins = insertelement <vscale x 2 x half> undef, half %val, i32 0
608 %shf = shufflevector <vscale x 2 x half> %ins, <vscale x 2 x half> undef, <vscale x 2 x i32> zeroinitializer
609 ret <vscale x 2 x half> %shf
; Unpacked <vscale x 2 x half> splat from element index 64 (128 bytes): CHECK-LD1R
; expects a separate add before ld1rh; CHECK-NO-LD1R expects ldr with #128.
612 define <vscale x 2 x half> @ld1rh_half_unpacked2_gep_out_of_range_up(ptr %valp) {
613 ; CHECK-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_up:
614 ; CHECK-LD1R: // %bb.0:
615 ; CHECK-LD1R-NEXT: ptrue p0.d
616 ; CHECK-LD1R-NEXT: add x8, x0, #128
617 ; CHECK-LD1R-NEXT: ld1rh { z0.d }, p0/z, [x8]
618 ; CHECK-LD1R-NEXT: ret
620 ; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_up:
621 ; CHECK-NO-LD1R: // %bb.0:
622 ; CHECK-NO-LD1R-NEXT: ldr h0, [x0, #128]
623 ; CHECK-NO-LD1R-NEXT: mov z0.h, h0
624 ; CHECK-NO-LD1R-NEXT: ret
625 %valp2 = getelementptr half, ptr %valp, i32 64
626 %val = load half, ptr %valp2
627 %ins = insertelement <vscale x 2 x half> undef, half %val, i32 0
628 %shf = shufflevector <vscale x 2 x half> %ins, <vscale x 2 x half> undef, <vscale x 2 x i32> zeroinitializer
629 ret <vscale x 2 x half> %shf
; Unpacked <vscale x 2 x half> splat from element index -1 (-2 bytes): CHECK-LD1R
; expects a separate sub before ld1rh; CHECK-NO-LD1R expects ldur with #-2.
632 define <vscale x 2 x half> @ld1rh_half_unpacked2_gep_out_of_range_down(ptr %valp) {
633 ; CHECK-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_down:
634 ; CHECK-LD1R: // %bb.0:
635 ; CHECK-LD1R-NEXT: ptrue p0.d
636 ; CHECK-LD1R-NEXT: sub x8, x0, #2
637 ; CHECK-LD1R-NEXT: ld1rh { z0.d }, p0/z, [x8]
638 ; CHECK-LD1R-NEXT: ret
640 ; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_down:
641 ; CHECK-NO-LD1R: // %bb.0:
642 ; CHECK-NO-LD1R-NEXT: ldur h0, [x0, #-2]
643 ; CHECK-NO-LD1R-NEXT: mov z0.h, h0
644 ; CHECK-NO-LD1R-NEXT: ret
645 %valp2 = getelementptr half, ptr %valp, i32 -1
646 %val = load half, ptr %valp2
647 %ins = insertelement <vscale x 2 x half> undef, half %val, i32 0
648 %shf = shufflevector <vscale x 2 x half> %ins, <vscale x 2 x half> undef, <vscale x 2 x i32> zeroinitializer
649 ret <vscale x 2 x half> %shf
; Baseline float case: CHECK-LD1R expects ld1rw; CHECK-NO-LD1R (+no-sve-fp-ld1r)
; expects a scalar ldr s0 followed by a mov splat.
652 define <vscale x 4 x float> @ld1rw_float(ptr %valp) {
653 ; CHECK-LD1R-LABEL: ld1rw_float:
654 ; CHECK-LD1R: // %bb.0:
655 ; CHECK-LD1R-NEXT: ptrue p0.s
656 ; CHECK-LD1R-NEXT: ld1rw { z0.s }, p0/z, [x0]
657 ; CHECK-LD1R-NEXT: ret
659 ; CHECK-NO-LD1R-LABEL: ld1rw_float:
660 ; CHECK-NO-LD1R: // %bb.0:
661 ; CHECK-NO-LD1R-NEXT: ldr s0, [x0]
662 ; CHECK-NO-LD1R-NEXT: mov z0.s, s0
663 ; CHECK-NO-LD1R-NEXT: ret
664 %val = load float, ptr %valp
665 %ins = insertelement <vscale x 4 x float> undef, float %val, i32 0
666 %shf = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
667 ret <vscale x 4 x float> %shf
; float splat from element index 63 (252 bytes): both configurations are expected
; to fold the offset, into ld1rw (CHECK-LD1R) or into the scalar ldr (CHECK-NO-LD1R).
670 define <vscale x 4 x float> @ld1rw_float_gep(ptr %valp) {
671 ; CHECK-LD1R-LABEL: ld1rw_float_gep:
672 ; CHECK-LD1R: // %bb.0:
673 ; CHECK-LD1R-NEXT: ptrue p0.s
674 ; CHECK-LD1R-NEXT: ld1rw { z0.s }, p0/z, [x0, #252]
675 ; CHECK-LD1R-NEXT: ret
677 ; CHECK-NO-LD1R-LABEL: ld1rw_float_gep:
678 ; CHECK-NO-LD1R: // %bb.0:
679 ; CHECK-NO-LD1R-NEXT: ldr s0, [x0, #252]
680 ; CHECK-NO-LD1R-NEXT: mov z0.s, s0
681 ; CHECK-NO-LD1R-NEXT: ret
682 %valp2 = getelementptr float, ptr %valp, i32 63
683 %val = load float, ptr %valp2
684 %ins = insertelement <vscale x 4 x float> undef, float %val, i32 0
685 %shf = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
686 ret <vscale x 4 x float> %shf
; float splat from element index 64 (256 bytes): CHECK-LD1R expects a separate add
; before ld1rw; CHECK-NO-LD1R expects the scalar ldr to take #256 directly.
689 define <vscale x 4 x float> @ld1rw_float_gep_out_of_range_up(ptr %valp) {
690 ; CHECK-LD1R-LABEL: ld1rw_float_gep_out_of_range_up:
691 ; CHECK-LD1R: // %bb.0:
692 ; CHECK-LD1R-NEXT: ptrue p0.s
693 ; CHECK-LD1R-NEXT: add x8, x0, #256
694 ; CHECK-LD1R-NEXT: ld1rw { z0.s }, p0/z, [x8]
695 ; CHECK-LD1R-NEXT: ret
697 ; CHECK-NO-LD1R-LABEL: ld1rw_float_gep_out_of_range_up:
698 ; CHECK-NO-LD1R: // %bb.0:
699 ; CHECK-NO-LD1R-NEXT: ldr s0, [x0, #256]
700 ; CHECK-NO-LD1R-NEXT: mov z0.s, s0
701 ; CHECK-NO-LD1R-NEXT: ret
702 %valp2 = getelementptr float, ptr %valp, i32 64
703 %val = load float, ptr %valp2
704 %ins = insertelement <vscale x 4 x float> undef, float %val, i32 0
705 %shf = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
706 ret <vscale x 4 x float> %shf
; float splat from element index -1 (-4 bytes): CHECK-LD1R expects a separate sub
; before ld1rw; CHECK-NO-LD1R expects ldur with #-4.
709 define <vscale x 4 x float> @ld1rw_float_gep_out_of_range_down(ptr %valp) {
710 ; CHECK-LD1R-LABEL: ld1rw_float_gep_out_of_range_down:
711 ; CHECK-LD1R: // %bb.0:
712 ; CHECK-LD1R-NEXT: ptrue p0.s
713 ; CHECK-LD1R-NEXT: sub x8, x0, #4
714 ; CHECK-LD1R-NEXT: ld1rw { z0.s }, p0/z, [x8]
715 ; CHECK-LD1R-NEXT: ret
717 ; CHECK-NO-LD1R-LABEL: ld1rw_float_gep_out_of_range_down:
718 ; CHECK-NO-LD1R: // %bb.0:
719 ; CHECK-NO-LD1R-NEXT: ldur s0, [x0, #-4]
720 ; CHECK-NO-LD1R-NEXT: mov z0.s, s0
721 ; CHECK-NO-LD1R-NEXT: ret
722 %valp2 = getelementptr float, ptr %valp, i32 -1
723 %val = load float, ptr %valp2
724 %ins = insertelement <vscale x 4 x float> undef, float %val, i32 0
725 %shf = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
726 ret <vscale x 4 x float> %shf
; Unpacked <vscale x 2 x float> splat: CHECK-LD1R expects ld1rw writing .d lanes
; under a .d predicate; CHECK-NO-LD1R expects scalar ldr + mov z0.s splat.
729 define <vscale x 2 x float> @ld1rw_float_unpacked2(ptr %valp) {
730 ; CHECK-LD1R-LABEL: ld1rw_float_unpacked2:
731 ; CHECK-LD1R: // %bb.0:
732 ; CHECK-LD1R-NEXT: ptrue p0.d
733 ; CHECK-LD1R-NEXT: ld1rw { z0.d }, p0/z, [x0]
734 ; CHECK-LD1R-NEXT: ret
736 ; CHECK-NO-LD1R-LABEL: ld1rw_float_unpacked2:
737 ; CHECK-NO-LD1R: // %bb.0:
738 ; CHECK-NO-LD1R-NEXT: ldr s0, [x0]
739 ; CHECK-NO-LD1R-NEXT: mov z0.s, s0
740 ; CHECK-NO-LD1R-NEXT: ret
741 %val = load float, ptr %valp
742 %ins = insertelement <vscale x 2 x float> undef, float %val, i32 0
743 %shf = shufflevector <vscale x 2 x float> %ins, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
744 ret <vscale x 2 x float> %shf
; Unpacked <vscale x 2 x float> splat from element index 63 (252 bytes): the offset
; is expected to fold into ld1rw (.d lanes) or into the scalar ldr.
747 define <vscale x 2 x float> @ld1rw_float_unpacked2_gep(ptr %valp) {
748 ; CHECK-LD1R-LABEL: ld1rw_float_unpacked2_gep:
749 ; CHECK-LD1R: // %bb.0:
750 ; CHECK-LD1R-NEXT: ptrue p0.d
751 ; CHECK-LD1R-NEXT: ld1rw { z0.d }, p0/z, [x0, #252]
752 ; CHECK-LD1R-NEXT: ret
754 ; CHECK-NO-LD1R-LABEL: ld1rw_float_unpacked2_gep:
755 ; CHECK-NO-LD1R: // %bb.0:
756 ; CHECK-NO-LD1R-NEXT: ldr s0, [x0, #252]
757 ; CHECK-NO-LD1R-NEXT: mov z0.s, s0
758 ; CHECK-NO-LD1R-NEXT: ret
759 %valp2 = getelementptr float, ptr %valp, i32 63
760 %val = load float, ptr %valp2
761 %ins = insertelement <vscale x 2 x float> undef, float %val, i32 0
762 %shf = shufflevector <vscale x 2 x float> %ins, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
763 ret <vscale x 2 x float> %shf
; Unpacked <vscale x 2 x float> splat from element index 64 (256 bytes): CHECK-LD1R
; expects a separate add before ld1rw; CHECK-NO-LD1R expects ldr with #256.
766 define <vscale x 2 x float> @ld1rw_float_unpacked2_gep_out_of_range_up(ptr %valp) {
767 ; CHECK-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_up:
768 ; CHECK-LD1R: // %bb.0:
769 ; CHECK-LD1R-NEXT: ptrue p0.d
770 ; CHECK-LD1R-NEXT: add x8, x0, #256
771 ; CHECK-LD1R-NEXT: ld1rw { z0.d }, p0/z, [x8]
772 ; CHECK-LD1R-NEXT: ret
774 ; CHECK-NO-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_up:
775 ; CHECK-NO-LD1R: // %bb.0:
776 ; CHECK-NO-LD1R-NEXT: ldr s0, [x0, #256]
777 ; CHECK-NO-LD1R-NEXT: mov z0.s, s0
778 ; CHECK-NO-LD1R-NEXT: ret
779 %valp2 = getelementptr float, ptr %valp, i32 64
780 %val = load float, ptr %valp2
781 %ins = insertelement <vscale x 2 x float> undef, float %val, i32 0
782 %shf = shufflevector <vscale x 2 x float> %ins, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
783 ret <vscale x 2 x float> %shf
; Unpacked <vscale x 2 x float> splat from element index -1 (-4 bytes): CHECK-LD1R
; expects a separate sub before ld1rw; CHECK-NO-LD1R expects ldur with #-4.
786 define <vscale x 2 x float> @ld1rw_float_unpacked2_gep_out_of_range_down(ptr %valp) {
787 ; CHECK-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_down:
788 ; CHECK-LD1R: // %bb.0:
789 ; CHECK-LD1R-NEXT: ptrue p0.d
790 ; CHECK-LD1R-NEXT: sub x8, x0, #4
791 ; CHECK-LD1R-NEXT: ld1rw { z0.d }, p0/z, [x8]
792 ; CHECK-LD1R-NEXT: ret
794 ; CHECK-NO-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_down:
795 ; CHECK-NO-LD1R: // %bb.0:
796 ; CHECK-NO-LD1R-NEXT: ldur s0, [x0, #-4]
797 ; CHECK-NO-LD1R-NEXT: mov z0.s, s0
798 ; CHECK-NO-LD1R-NEXT: ret
799 %valp2 = getelementptr float, ptr %valp, i32 -1
800 %val = load float, ptr %valp2
801 %ins = insertelement <vscale x 2 x float> undef, float %val, i32 0
802 %shf = shufflevector <vscale x 2 x float> %ins, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
803 ret <vscale x 2 x float> %shf
; Baseline double case: CHECK-LD1R expects ld1rd; CHECK-NO-LD1R (+no-sve-fp-ld1r)
; expects a scalar ldr d0 followed by a mov splat.
806 define <vscale x 2 x double> @ld1rd_double(ptr %valp) {
807 ; CHECK-LD1R-LABEL: ld1rd_double:
808 ; CHECK-LD1R: // %bb.0:
809 ; CHECK-LD1R-NEXT: ptrue p0.d
810 ; CHECK-LD1R-NEXT: ld1rd { z0.d }, p0/z, [x0]
811 ; CHECK-LD1R-NEXT: ret
813 ; CHECK-NO-LD1R-LABEL: ld1rd_double:
814 ; CHECK-NO-LD1R: // %bb.0:
815 ; CHECK-NO-LD1R-NEXT: ldr d0, [x0]
816 ; CHECK-NO-LD1R-NEXT: mov z0.d, d0
817 ; CHECK-NO-LD1R-NEXT: ret
818 %val = load double, ptr %valp
819 %ins = insertelement <vscale x 2 x double> undef, double %val, i32 0
820 %shf = shufflevector <vscale x 2 x double> %ins, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
821 ret <vscale x 2 x double> %shf
; double splat from element index 63 (504 bytes): both configurations are expected
; to fold the offset, into ld1rd (CHECK-LD1R) or into the scalar ldr (CHECK-NO-LD1R).
824 define <vscale x 2 x double> @ld1rd_double_gep(ptr %valp) {
825 ; CHECK-LD1R-LABEL: ld1rd_double_gep:
826 ; CHECK-LD1R: // %bb.0:
827 ; CHECK-LD1R-NEXT: ptrue p0.d
828 ; CHECK-LD1R-NEXT: ld1rd { z0.d }, p0/z, [x0, #504]
829 ; CHECK-LD1R-NEXT: ret
831 ; CHECK-NO-LD1R-LABEL: ld1rd_double_gep:
832 ; CHECK-NO-LD1R: // %bb.0:
833 ; CHECK-NO-LD1R-NEXT: ldr d0, [x0, #504]
834 ; CHECK-NO-LD1R-NEXT: mov z0.d, d0
835 ; CHECK-NO-LD1R-NEXT: ret
836 %valp2 = getelementptr double, ptr %valp, i32 63
837 %val = load double, ptr %valp2
838 %ins = insertelement <vscale x 2 x double> undef, double %val, i32 0
839 %shf = shufflevector <vscale x 2 x double> %ins, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
840 ret <vscale x 2 x double> %shf
; double splat from element index 64 (512 bytes): CHECK-LD1R expects a separate add
; before ld1rd; CHECK-NO-LD1R expects the scalar ldr to take #512 directly.
843 define <vscale x 2 x double> @ld1rd_double_gep_out_of_range_up(ptr %valp) {
844 ; CHECK-LD1R-LABEL: ld1rd_double_gep_out_of_range_up:
845 ; CHECK-LD1R: // %bb.0:
846 ; CHECK-LD1R-NEXT: ptrue p0.d
847 ; CHECK-LD1R-NEXT: add x8, x0, #512
848 ; CHECK-LD1R-NEXT: ld1rd { z0.d }, p0/z, [x8]
849 ; CHECK-LD1R-NEXT: ret
851 ; CHECK-NO-LD1R-LABEL: ld1rd_double_gep_out_of_range_up:
852 ; CHECK-NO-LD1R: // %bb.0:
853 ; CHECK-NO-LD1R-NEXT: ldr d0, [x0, #512]
854 ; CHECK-NO-LD1R-NEXT: mov z0.d, d0
855 ; CHECK-NO-LD1R-NEXT: ret
856 %valp2 = getelementptr double, ptr %valp, i32 64
857 %val = load double, ptr %valp2
858 %ins = insertelement <vscale x 2 x double> undef, double %val, i32 0
859 %shf = shufflevector <vscale x 2 x double> %ins, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
860 ret <vscale x 2 x double> %shf
; Splat of a double loaded from element index -1 (byte offset -8).
; The expected ld1rd needs a separate "sub x8, x0, #8" to form the address;
; the +no-sve-fp-ld1r configuration uses ldur with a #-8 offset instead.
863 define <vscale x 2 x double> @ld1rd_double_gep_out_of_range_down(ptr %valp) {
864 ; CHECK-LD1R-LABEL: ld1rd_double_gep_out_of_range_down:
865 ; CHECK-LD1R: // %bb.0:
866 ; CHECK-LD1R-NEXT: ptrue p0.d
867 ; CHECK-LD1R-NEXT: sub x8, x0, #8
868 ; CHECK-LD1R-NEXT: ld1rd { z0.d }, p0/z, [x8]
869 ; CHECK-LD1R-NEXT: ret
871 ; CHECK-NO-LD1R-LABEL: ld1rd_double_gep_out_of_range_down:
872 ; CHECK-NO-LD1R: // %bb.0:
873 ; CHECK-NO-LD1R-NEXT: ldur d0, [x0, #-8]
874 ; CHECK-NO-LD1R-NEXT: mov z0.d, d0
875 ; CHECK-NO-LD1R-NEXT: ret
876 %valp2 = getelementptr double, ptr %valp, i32 -1
877 %val = load double, ptr %valp2
878 %ins = insertelement <vscale x 2 x double> undef, double %val, i32 0
879 %shf = shufflevector <vscale x 2 x double> %ins, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
880 ret <vscale x 2 x double> %shf
; Loads a <2 x double>, inserts it at index 0 of an undef scalable vector and
; replicates it with dupq.lane (lane 0). Both configurations expect the
; sequence to become ptrue + ld1rqd.
883 define <vscale x 2 x double> @dupq_ld1rqd_f64(ptr %a) {
884 ; CHECK-LABEL: dupq_ld1rqd_f64:
886 ; CHECK-NEXT: ptrue p0.d
887 ; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0]
889 %1 = load <2 x double>, ptr %a
890 %2 = tail call fast <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %1, i64 0)
891 %3 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %2, i64 0)
892 ret <vscale x 2 x double> %3
; Same load + vector.insert + dupq.lane(0) pattern for <4 x float>;
; both configurations expect ptrue + ld1rqw.
895 define <vscale x 4 x float> @dupq_ld1rqw_f32(ptr %a) {
896 ; CHECK-LABEL: dupq_ld1rqw_f32:
898 ; CHECK-NEXT: ptrue p0.s
899 ; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0]
901 %1 = load <4 x float>, ptr %a
902 %2 = tail call fast <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %1, i64 0)
903 %3 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %2, i64 0)
904 ret <vscale x 4 x float> %3
; Same load + vector.insert + dupq.lane(0) pattern for <8 x half>;
; both configurations expect ptrue + ld1rqh.
907 define <vscale x 8 x half> @dupq_ld1rqh_f16(ptr %a) {
908 ; CHECK-LABEL: dupq_ld1rqh_f16:
910 ; CHECK-NEXT: ptrue p0.h
911 ; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0]
913 %1 = load <8 x half>, ptr %a
914 %2 = tail call fast <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %1, i64 0)
915 %3 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %2, i64 0)
916 ret <vscale x 8 x half> %3
; Same load + vector.insert + dupq.lane(0) pattern for <8 x bfloat>;
; both configurations expect ptrue + ld1rqh. The function uses attribute
; group #0, which adds +bf16 to the target features.
919 define <vscale x 8 x bfloat> @dupq_ld1rqh_bf16(ptr %a) #0 {
920 ; CHECK-LABEL: dupq_ld1rqh_bf16:
922 ; CHECK-NEXT: ptrue p0.h
923 ; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0]
925 %1 = load <8 x bfloat>, ptr %a
926 %2 = tail call fast <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %1, i64 0)
927 %3 = tail call fast <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %2, i64 0)
928 ret <vscale x 8 x bfloat> %3
; Integer variant of the dupq pattern: load <2 x i64>, insert at index 0,
; dupq.lane(0). Both configurations expect ptrue + ld1rqd.
931 define <vscale x 2 x i64> @dupq_ld1rqd_i64(ptr %a) #0 {
932 ; CHECK-LABEL: dupq_ld1rqd_i64:
934 ; CHECK-NEXT: ptrue p0.d
935 ; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0]
937 %1 = load <2 x i64>, ptr %a
938 %2 = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> %1, i64 0)
939 %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %2, i64 0)
940 ret <vscale x 2 x i64> %3
; Integer variant of the dupq pattern for <4 x i32>;
; both configurations expect ptrue + ld1rqw.
943 define <vscale x 4 x i32> @dupq_ld1rqw_i32(ptr %a) #0 {
944 ; CHECK-LABEL: dupq_ld1rqw_i32:
946 ; CHECK-NEXT: ptrue p0.s
947 ; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0]
949 %1 = load <4 x i32>, ptr %a
950 %2 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> %1, i64 0)
951 %3 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %2, i64 0)
952 ret <vscale x 4 x i32> %3
; Integer variant of the dupq pattern for <8 x i16>;
; both configurations expect ptrue + ld1rqh.
; NOTE(review): the function name says "ld1rqw" but the expected instruction
; is ld1rqh; the name looks like a copy/paste leftover.
955 define <vscale x 8 x i16> @dupq_ld1rqw_i16(ptr %a) #0 {
956 ; CHECK-LABEL: dupq_ld1rqw_i16:
958 ; CHECK-NEXT: ptrue p0.h
959 ; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0]
961 %1 = load <8 x i16>, ptr %a
962 %2 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> %1, i64 0)
963 %3 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %2, i64 0)
964 ret <vscale x 8 x i16> %3
; Integer variant of the dupq pattern for <16 x i8>;
; both configurations expect ptrue + ld1rqb.
; NOTE(review): the function name says "ld1rqw" but the expected instruction
; is ld1rqb; the name looks like a copy/paste leftover.
967 define <vscale x 16 x i8> @dupq_ld1rqw_i8(ptr %a) #0 {
968 ; CHECK-LABEL: dupq_ld1rqw_i8:
970 ; CHECK-NEXT: ptrue p0.b
971 ; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0]
973 %1 = load <16 x i8>, ptr %a
974 %2 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> %1, i64 0)
975 %3 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %2, i64 0)
976 ret <vscale x 16 x i8> %3
984 ; * dup with passthru=undef or passthru=zero.
985 ; * sign/zero extending.
989 ; * dup with passthru as a parameter.
; sve.dup of a loaded i8 with undef passthru, governed by %pg.
; Expected: a single ld1rb using the caller's predicate with zeroing (p0/z).
993 define <vscale x 16 x i8> @dup_ld1rb_i8_passthruundef_nxv16i8(<vscale x 16 x i1> %pg, ptr %addr) {
994 ; CHECK-LABEL: dup_ld1rb_i8_passthruundef_nxv16i8:
996 ; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0]
998 %ld = load i8, ptr %addr
999 %res = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> %pg, i8 %ld)
1000 ret <vscale x 16 x i8> %res
; sve.dup of a loaded i16 with undef passthru; expected: a single ld1rh (p0/z).
1002 define <vscale x 8 x i16> @dup_ld1rh_i16_passthruundef_nxv8i16(<vscale x 8 x i1> %pg, ptr %addr) {
1003 ; CHECK-LABEL: dup_ld1rh_i16_passthruundef_nxv8i16:
1005 ; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0]
1007 %ld = load i16, ptr %addr
1008 %res = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, i16 %ld)
1009 ret <vscale x 8 x i16> %res
; sve.dup of an i8 load sign-extended to i16, undef passthru.
; Expected: the sext is absorbed into ld1rsb writing .h elements.
1011 define <vscale x 8 x i16> @dup_ld1rh_i8_passthruundef_nxv8i16_sext(<vscale x 8 x i1> %pg, ptr %addr) {
1012 ; CHECK-LABEL: dup_ld1rh_i8_passthruundef_nxv8i16_sext:
1014 ; CHECK-NEXT: ld1rsb { z0.h }, p0/z, [x0]
1016 %ld = load i8, ptr %addr
1017 %ext = sext i8 %ld to i16
1018 %res = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, i16 %ext)
1019 ret <vscale x 8 x i16> %res
; sve.dup of an i8 load zero-extended to i16, undef passthru.
; Expected: the zext is absorbed into ld1rb writing .h elements.
1021 define <vscale x 8 x i16> @dup_ld1rh_i8_passthruundef_nxv8i16_zext(<vscale x 8 x i1> %pg, ptr %addr) {
1022 ; CHECK-LABEL: dup_ld1rh_i8_passthruundef_nxv8i16_zext:
1024 ; CHECK-NEXT: ld1rb { z0.h }, p0/z, [x0]
1026 %ld = load i8, ptr %addr
1027 %ext = zext i8 %ld to i16
1028 %res = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, i16 %ext)
1029 ret <vscale x 8 x i16> %res
; sve.dup of a loaded i32 with undef passthru; expected: a single ld1rw (p0/z).
1031 define <vscale x 4 x i32> @dup_ld1rs_i32_passthruundef_nxv4i32(<vscale x 4 x i1> %pg, ptr %addr) {
1032 ; CHECK-LABEL: dup_ld1rs_i32_passthruundef_nxv4i32:
1034 ; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0]
1036 %ld = load i32, ptr %addr
1037 %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, i32 %ld)
1038 ret <vscale x 4 x i32> %res
; sve.dup of an i8 load sign-extended to i32, undef passthru.
; Expected: ld1rsb writing .s elements.
1040 define <vscale x 4 x i32> @dup_ld1rs_i8_passthruundef_nxv4i32_sext(<vscale x 4 x i1> %pg, ptr %addr) {
1041 ; CHECK-LABEL: dup_ld1rs_i8_passthruundef_nxv4i32_sext:
1043 ; CHECK-NEXT: ld1rsb { z0.s }, p0/z, [x0]
1045 %ld = load i8, ptr %addr
1046 %ext = sext i8 %ld to i32
1047 %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, i32 %ext)
1048 ret <vscale x 4 x i32> %res
; sve.dup of an i8 load zero-extended to i32, undef passthru.
; Expected: ld1rb writing .s elements.
1050 define <vscale x 4 x i32> @dup_ld1rs_i8_passthruundef_nxv4i32_zext(<vscale x 4 x i1> %pg, ptr %addr) {
1051 ; CHECK-LABEL: dup_ld1rs_i8_passthruundef_nxv4i32_zext:
1053 ; CHECK-NEXT: ld1rb { z0.s }, p0/z, [x0]
1055 %ld = load i8, ptr %addr
1056 %ext = zext i8 %ld to i32
1057 %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, i32 %ext)
1058 ret <vscale x 4 x i32> %res
; sve.dup of an i16 load sign-extended to i32, undef passthru.
; Expected: ld1rsh writing .s elements.
1060 define <vscale x 4 x i32> @dup_ld1rs_i16_passthruundef_nxv4i32_sext(<vscale x 4 x i1> %pg, ptr %addr) {
1061 ; CHECK-LABEL: dup_ld1rs_i16_passthruundef_nxv4i32_sext:
1063 ; CHECK-NEXT: ld1rsh { z0.s }, p0/z, [x0]
1065 %ld = load i16, ptr %addr
1066 %ext = sext i16 %ld to i32
1067 %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, i32 %ext)
1068 ret <vscale x 4 x i32> %res
; sve.dup of an i16 load zero-extended to i32, undef passthru.
; Expected: ld1rh writing .s elements.
1070 define <vscale x 4 x i32> @dup_ld1rs_i16_passthruundef_nxv4i32_zext(<vscale x 4 x i1> %pg, ptr %addr) {
1071 ; CHECK-LABEL: dup_ld1rs_i16_passthruundef_nxv4i32_zext:
1073 ; CHECK-NEXT: ld1rh { z0.s }, p0/z, [x0]
1075 %ld = load i16, ptr %addr
1076 %ext = zext i16 %ld to i32
1077 %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, i32 %ext)
1078 ret <vscale x 4 x i32> %res
; sve.dup of a loaded i64 with undef passthru; expected: a single ld1rd (p0/z).
1080 define <vscale x 2 x i64> @dup_ld1rd_i64_passthruundef_nxv2i64(<vscale x 2 x i1> %pg, ptr %addr) {
1081 ; CHECK-LABEL: dup_ld1rd_i64_passthruundef_nxv2i64:
1083 ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0]
1085 %ld = load i64, ptr %addr
1086 %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ld)
1087 ret <vscale x 2 x i64> %res
; sve.dup of an i8 load sign-extended to i64, undef passthru.
; Expected: ld1rsb writing .d elements.
1089 define <vscale x 2 x i64> @dup_ld1rs_i8_passthruundef_nxv2i64_sext(<vscale x 2 x i1> %pg, ptr %addr) {
1090 ; CHECK-LABEL: dup_ld1rs_i8_passthruundef_nxv2i64_sext:
1092 ; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0]
1094 %ld = load i8, ptr %addr
1095 %ext = sext i8 %ld to i64
1096 %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
1097 ret <vscale x 2 x i64> %res
; sve.dup of an i8 load zero-extended to i64, undef passthru.
; Expected: ld1rb writing .d elements.
1099 define <vscale x 2 x i64> @dup_ld1rs_i8_passthruundef_nxv2i64_zext(<vscale x 2 x i1> %pg, ptr %addr) {
1100 ; CHECK-LABEL: dup_ld1rs_i8_passthruundef_nxv2i64_zext:
1102 ; CHECK-NEXT: ld1rb { z0.d }, p0/z, [x0]
1104 %ld = load i8, ptr %addr
1105 %ext = zext i8 %ld to i64
1106 %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
1107 ret <vscale x 2 x i64> %res
; sve.dup of an i16 load sign-extended to i64, undef passthru.
; Expected: ld1rsh writing .d elements.
1109 define <vscale x 2 x i64> @dup_ld1rs_i16_passthruundef_nxv2i64_sext(<vscale x 2 x i1> %pg, ptr %addr) {
1110 ; CHECK-LABEL: dup_ld1rs_i16_passthruundef_nxv2i64_sext:
1112 ; CHECK-NEXT: ld1rsh { z0.d }, p0/z, [x0]
1114 %ld = load i16, ptr %addr
1115 %ext = sext i16 %ld to i64
1116 %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
1117 ret <vscale x 2 x i64> %res
; sve.dup of an i16 load zero-extended to i64, undef passthru.
; Expected: ld1rh writing .d elements.
1119 define <vscale x 2 x i64> @dup_ld1rs_i16_passthruundef_nxv2i64_zext(<vscale x 2 x i1> %pg, ptr %addr) {
1120 ; CHECK-LABEL: dup_ld1rs_i16_passthruundef_nxv2i64_zext:
1122 ; CHECK-NEXT: ld1rh { z0.d }, p0/z, [x0]
1124 %ld = load i16, ptr %addr
1125 %ext = zext i16 %ld to i64
1126 %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
1127 ret <vscale x 2 x i64> %res
; sve.dup of an i32 load sign-extended to i64, undef passthru.
; Expected: ld1rsw writing .d elements.
1129 define <vscale x 2 x i64> @dup_ld1rs_i32_passthruundef_nxv2i64_sext(<vscale x 2 x i1> %pg, ptr %addr) {
1130 ; CHECK-LABEL: dup_ld1rs_i32_passthruundef_nxv2i64_sext:
1132 ; CHECK-NEXT: ld1rsw { z0.d }, p0/z, [x0]
1134 %ld = load i32, ptr %addr
1135 %ext = sext i32 %ld to i64
1136 %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
1137 ret <vscale x 2 x i64> %res
; sve.dup of an i32 load zero-extended to i64, undef passthru.
; Expected: ld1rw writing .d elements.
1139 define <vscale x 2 x i64> @dup_ld1rs_i32_passthruundef_nxv2i64_zext(<vscale x 2 x i1> %pg, ptr %addr) {
1140 ; CHECK-LABEL: dup_ld1rs_i32_passthruundef_nxv2i64_zext:
1142 ; CHECK-NEXT: ld1rw { z0.d }, p0/z, [x0]
1144 %ld = load i32, ptr %addr
1145 %ext = zext i32 %ld to i64
1146 %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
1147 ret <vscale x 2 x i64> %res
; sve.dup of a loaded half with undef passthru.
; With +sve alone: a single ld1rh (p0/z). With +no-sve-fp-ld1r: scalar ldr
; into h0 followed by a predicated merging mov.
1149 define <vscale x 8 x half> @dup_ld1rh_half_passthruundef_nxv8f16(<vscale x 8 x i1> %pg, ptr %addr) {
1150 ; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruundef_nxv8f16:
1151 ; CHECK-LD1R: // %bb.0:
1152 ; CHECK-LD1R-NEXT: ld1rh { z0.h }, p0/z, [x0]
1153 ; CHECK-LD1R-NEXT: ret
1155 ; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruundef_nxv8f16:
1156 ; CHECK-NO-LD1R: // %bb.0:
1157 ; CHECK-NO-LD1R-NEXT: ldr h0, [x0]
1158 ; CHECK-NO-LD1R-NEXT: mov z0.h, p0/m, h0
1159 ; CHECK-NO-LD1R-NEXT: ret
1160 %ld = load half, ptr %addr
1161 %res = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, half %ld)
1162 ret <vscale x 8 x half> %res
; sve.dup of a loaded float with undef passthru.
; With +sve alone: a single ld1rw (p0/z). With +no-sve-fp-ld1r: scalar ldr
; into s0 followed by a predicated merging mov.
1164 define <vscale x 4 x float> @dup_ld1rs_float_passthruundef_nxv4f32(<vscale x 4 x i1> %pg, ptr %addr) {
1165 ; CHECK-LD1R-LABEL: dup_ld1rs_float_passthruundef_nxv4f32:
1166 ; CHECK-LD1R: // %bb.0:
1167 ; CHECK-LD1R-NEXT: ld1rw { z0.s }, p0/z, [x0]
1168 ; CHECK-LD1R-NEXT: ret
1170 ; CHECK-NO-LD1R-LABEL: dup_ld1rs_float_passthruundef_nxv4f32:
1171 ; CHECK-NO-LD1R: // %bb.0:
1172 ; CHECK-NO-LD1R-NEXT: ldr s0, [x0]
1173 ; CHECK-NO-LD1R-NEXT: mov z0.s, p0/m, s0
1174 ; CHECK-NO-LD1R-NEXT: ret
1175 %ld = load float, ptr %addr
1176 %res = call <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, float %ld)
1177 ret <vscale x 4 x float> %res
; sve.dup of a loaded double with undef passthru.
; With +sve alone: a single ld1rd (p0/z). With +no-sve-fp-ld1r: scalar ldr
; into d0 followed by a predicated merging mov.
1179 define <vscale x 2 x double> @dup_ld1rd_double_passthruundef_nxv2f64(<vscale x 2 x i1> %pg, ptr %addr) {
1180 ; CHECK-LD1R-LABEL: dup_ld1rd_double_passthruundef_nxv2f64:
1181 ; CHECK-LD1R: // %bb.0:
1182 ; CHECK-LD1R-NEXT: ld1rd { z0.d }, p0/z, [x0]
1183 ; CHECK-LD1R-NEXT: ret
1185 ; CHECK-NO-LD1R-LABEL: dup_ld1rd_double_passthruundef_nxv2f64:
1186 ; CHECK-NO-LD1R: // %bb.0:
1187 ; CHECK-NO-LD1R-NEXT: ldr d0, [x0]
1188 ; CHECK-NO-LD1R-NEXT: mov z0.d, p0/m, d0
1189 ; CHECK-NO-LD1R-NEXT: ret
1190 %ld = load double, ptr %addr
1191 %res = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, double %ld)
1192 ret <vscale x 2 x double> %res
; sve.dup of a loaded half into <vscale x 4 x half>, undef passthru.
; With +sve alone the expected ld1rh writes .s elements; with
; +no-sve-fp-ld1r the expectation is ldr h0 + predicated merging mov on .h.
1194 define <vscale x 4 x half> @dup_ld1rh_half_passthruundef_nxv4f16(<vscale x 4 x i1> %pg, ptr %addr) {
1195 ; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruundef_nxv4f16:
1196 ; CHECK-LD1R: // %bb.0:
1197 ; CHECK-LD1R-NEXT: ld1rh { z0.s }, p0/z, [x0]
1198 ; CHECK-LD1R-NEXT: ret
1200 ; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruundef_nxv4f16:
1201 ; CHECK-NO-LD1R: // %bb.0:
1202 ; CHECK-NO-LD1R-NEXT: ldr h0, [x0]
1203 ; CHECK-NO-LD1R-NEXT: mov z0.h, p0/m, h0
1204 ; CHECK-NO-LD1R-NEXT: ret
1205 %ld = load half, ptr %addr
1206 %res = call <vscale x 4 x half> @llvm.aarch64.sve.dup.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x i1> %pg, half %ld)
1207 ret <vscale x 4 x half> %res
; sve.dup of a loaded i8 with zeroinitializer passthru.
; Expected: the same single zeroing ld1rb (p0/z) as for undef passthru.
1209 define <vscale x 16 x i8> @dup_ld1rb_i8_passthruzero_nxv16i8(<vscale x 16 x i1> %pg, ptr %addr) {
1210 ; CHECK-LABEL: dup_ld1rb_i8_passthruzero_nxv16i8:
1212 ; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0]
1214 %ld = load i8, ptr %addr
1215 %res = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i1> %pg, i8 %ld)
1216 ret <vscale x 16 x i8> %res
; sve.dup of a loaded i16 with zeroinitializer passthru;
; expected: a single zeroing ld1rh (p0/z).
1218 define <vscale x 8 x i16> @dup_ld1rh_i16_passthruzero_nxv8i16(<vscale x 8 x i1> %pg, ptr %addr) {
1219 ; CHECK-LABEL: dup_ld1rh_i16_passthruzero_nxv8i16:
1221 ; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0]
1223 %ld = load i16, ptr %addr
1224 %res = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i1> %pg, i16 %ld)
1225 ret <vscale x 8 x i16> %res
; sve.dup of a loaded i32 with zeroinitializer passthru;
; expected: a single zeroing ld1rw (p0/z).
1227 define <vscale x 4 x i32> @dup_ld1rs_i32_passthruzero_nxv4i32(<vscale x 4 x i1> %pg, ptr %addr) {
1228 ; CHECK-LABEL: dup_ld1rs_i32_passthruzero_nxv4i32:
1230 ; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0]
1232 %ld = load i32, ptr %addr
1233 %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> %pg, i32 %ld)
1234 ret <vscale x 4 x i32> %res
; sve.dup of a loaded i64 with zeroinitializer passthru;
; expected: a single zeroing ld1rd (p0/z).
1236 define <vscale x 2 x i64> @dup_ld1rd_i64_passthruzero_nxv2i64(<vscale x 2 x i1> %pg, ptr %addr) {
1237 ; CHECK-LABEL: dup_ld1rd_i64_passthruzero_nxv2i64:
1239 ; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0]
1241 %ld = load i64, ptr %addr
1242 %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %pg, i64 %ld)
1243 ret <vscale x 2 x i64> %res
; sve.dup of a loaded half with zeroinitializer passthru.
; With +sve alone: a single zeroing ld1rh. With +no-sve-fp-ld1r: z0 is
; zeroed first, then the half is loaded into h1 and merged in under p0.
1245 define <vscale x 8 x half> @dup_ld1rh_half_passthruzero_nxv8f16(<vscale x 8 x i1> %pg, ptr %addr) {
1246 ; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv8f16:
1247 ; CHECK-LD1R: // %bb.0:
1248 ; CHECK-LD1R-NEXT: ld1rh { z0.h }, p0/z, [x0]
1249 ; CHECK-LD1R-NEXT: ret
1251 ; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv8f16:
1252 ; CHECK-NO-LD1R: // %bb.0:
1253 ; CHECK-NO-LD1R-NEXT: mov z0.h, #0 // =0x0
1254 ; CHECK-NO-LD1R-NEXT: ldr h1, [x0]
1255 ; CHECK-NO-LD1R-NEXT: mov z0.h, p0/m, h1
1256 ; CHECK-NO-LD1R-NEXT: ret
1257 %ld = load half, ptr %addr
1258 %res = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x i1> %pg, half %ld)
1259 ret <vscale x 8 x half> %res
; sve.dup of a loaded float with zeroinitializer passthru.
; With +sve alone: a single zeroing ld1rw. With +no-sve-fp-ld1r: z0 is
; zeroed first, then the float is loaded into s1 and merged in under p0.
1261 define <vscale x 4 x float> @dup_ld1rs_float_passthruzero_nxv4f32(<vscale x 4 x i1> %pg, ptr %addr) {
1262 ; CHECK-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv4f32:
1263 ; CHECK-LD1R: // %bb.0:
1264 ; CHECK-LD1R-NEXT: ld1rw { z0.s }, p0/z, [x0]
1265 ; CHECK-LD1R-NEXT: ret
1267 ; CHECK-NO-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv4f32:
1268 ; CHECK-NO-LD1R: // %bb.0:
1269 ; CHECK-NO-LD1R-NEXT: mov z0.s, #0 // =0x0
1270 ; CHECK-NO-LD1R-NEXT: ldr s1, [x0]
1271 ; CHECK-NO-LD1R-NEXT: mov z0.s, p0/m, s1
1272 ; CHECK-NO-LD1R-NEXT: ret
1273 %ld = load float, ptr %addr
1274 %res = call <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x i1> %pg, float %ld)
1275 ret <vscale x 4 x float> %res
; sve.dup of a loaded double with zeroinitializer passthru.
; With +sve alone: a single zeroing ld1rd. With +no-sve-fp-ld1r: z0 is
; zeroed first, then the double is loaded into d1 and merged in under p0.
1277 define <vscale x 2 x double> @dup_ld1rd_double_passthruzero_nxv2f64(<vscale x 2 x i1> %pg, ptr %addr) {
1278 ; CHECK-LD1R-LABEL: dup_ld1rd_double_passthruzero_nxv2f64:
1279 ; CHECK-LD1R: // %bb.0:
1280 ; CHECK-LD1R-NEXT: ld1rd { z0.d }, p0/z, [x0]
1281 ; CHECK-LD1R-NEXT: ret
1283 ; CHECK-NO-LD1R-LABEL: dup_ld1rd_double_passthruzero_nxv2f64:
1284 ; CHECK-NO-LD1R: // %bb.0:
1285 ; CHECK-NO-LD1R-NEXT: mov z0.d, #0 // =0x0
1286 ; CHECK-NO-LD1R-NEXT: ldr d1, [x0]
1287 ; CHECK-NO-LD1R-NEXT: mov z0.d, p0/m, d1
1288 ; CHECK-NO-LD1R-NEXT: ret
1289 %ld = load double, ptr %addr
1290 %res = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x i1> %pg, double %ld)
1291 ret <vscale x 2 x double> %res
; sve.dup of a loaded half into <vscale x 4 x half>, zeroinitializer passthru.
; With +sve alone the expected ld1rh writes .s elements; with
; +no-sve-fp-ld1r the expectation is zeroing mov + ldr h1 + merging mov on .h.
1293 define <vscale x 4 x half> @dup_ld1rh_half_passthruzero_nxv4f16(<vscale x 4 x i1> %pg, ptr %addr) {
1294 ; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv4f16:
1295 ; CHECK-LD1R: // %bb.0:
1296 ; CHECK-LD1R-NEXT: ld1rh { z0.s }, p0/z, [x0]
1297 ; CHECK-LD1R-NEXT: ret
1299 ; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv4f16:
1300 ; CHECK-NO-LD1R: // %bb.0:
1301 ; CHECK-NO-LD1R-NEXT: mov z0.h, #0 // =0x0
1302 ; CHECK-NO-LD1R-NEXT: ldr h1, [x0]
1303 ; CHECK-NO-LD1R-NEXT: mov z0.h, p0/m, h1
1304 ; CHECK-NO-LD1R-NEXT: ret
1305 %ld = load half, ptr %addr
1306 %res = call <vscale x 4 x half> @llvm.aarch64.sve.dup.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x i1> %pg, half %ld)
1307 ret <vscale x 4 x half> %res
; sve.dup of a loaded half into <vscale x 2 x half>, zeroinitializer passthru.
; With +sve alone the expected ld1rh writes .d elements; with
; +no-sve-fp-ld1r the expectation is zeroing mov + ldr h1 + merging mov on .h.
1309 define <vscale x 2 x half> @dup_ld1rh_half_passthruzero_nxv2f16(<vscale x 2 x i1> %pg, ptr %addr) {
1310 ; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv2f16:
1311 ; CHECK-LD1R: // %bb.0:
1312 ; CHECK-LD1R-NEXT: ld1rh { z0.d }, p0/z, [x0]
1313 ; CHECK-LD1R-NEXT: ret
1315 ; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv2f16:
1316 ; CHECK-NO-LD1R: // %bb.0:
1317 ; CHECK-NO-LD1R-NEXT: mov z0.h, #0 // =0x0
1318 ; CHECK-NO-LD1R-NEXT: ldr h1, [x0]
1319 ; CHECK-NO-LD1R-NEXT: mov z0.h, p0/m, h1
1320 ; CHECK-NO-LD1R-NEXT: ret
1321 %ld = load half, ptr %addr
1322 %res = call <vscale x 2 x half> @llvm.aarch64.sve.dup.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x i1> %pg, half %ld)
1323 ret <vscale x 2 x half> %res
; sve.dup of a loaded float into <vscale x 2 x float>, zeroinitializer passthru.
; With +sve alone the expected ld1rw writes .d elements; with
; +no-sve-fp-ld1r the expectation is zeroing mov + ldr s1 + merging mov on .s.
1325 define <vscale x 2 x float> @dup_ld1rs_float_passthruzero_nxv2f32(<vscale x 2 x i1> %pg, ptr %addr) {
1326 ; CHECK-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv2f32:
1327 ; CHECK-LD1R: // %bb.0:
1328 ; CHECK-LD1R-NEXT: ld1rw { z0.d }, p0/z, [x0]
1329 ; CHECK-LD1R-NEXT: ret
1331 ; CHECK-NO-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv2f32:
1332 ; CHECK-NO-LD1R: // %bb.0:
1333 ; CHECK-NO-LD1R-NEXT: mov z0.s, #0 // =0x0
1334 ; CHECK-NO-LD1R-NEXT: ldr s1, [x0]
1335 ; CHECK-NO-LD1R-NEXT: mov z0.s, p0/m, s1
1336 ; CHECK-NO-LD1R-NEXT: ret
1337 %ld = load float, ptr %addr
1338 %res = call <vscale x 2 x float> @llvm.aarch64.sve.dup.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x i1> %pg, float %ld)
1339 ret <vscale x 2 x float> %res
; Negative test: the passthru is a live function argument (%pt), so no ld1rb
; is expected. The expected code is a scalar ldrb followed by a predicated
; merging mov into z0.
1341 define <vscale x 16 x i8> @negtest_dup_ld1rb_i8_passthru_nxv16i8(<vscale x 16 x i8> %pt, <vscale x 16 x i1> %pg, ptr %addr) {
1342 ; CHECK-LABEL: negtest_dup_ld1rb_i8_passthru_nxv16i8:
1344 ; CHECK-NEXT: ldrb w8, [x0]
1345 ; CHECK-NEXT: mov z0.b, p0/m, w8
1347 %ld = load i8, ptr %addr
1348 %res = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %pt, <vscale x 16 x i1> %pg, i8 %ld)
1349 ret <vscale x 16 x i8> %res
; Negative test: passthru is the argument %pt, so the expected code is a
; scalar ldrh followed by a predicated merging mov, not ld1rh.
1351 define <vscale x 8 x i16> @negtest_dup_ld1rh_i16_passthru_nxv8i16(<vscale x 8 x i16> %pt, <vscale x 8 x i1> %pg, ptr %addr) {
1352 ; CHECK-LABEL: negtest_dup_ld1rh_i16_passthru_nxv8i16:
1354 ; CHECK-NEXT: ldrh w8, [x0]
1355 ; CHECK-NEXT: mov z0.h, p0/m, w8
1357 %ld = load i16, ptr %addr
1358 %res = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> %pt, <vscale x 8 x i1> %pg, i16 %ld)
1359 ret <vscale x 8 x i16> %res
; Negative test: passthru is the argument %pt, so the expected code is a
; scalar ldr (w8) followed by a predicated merging mov, not ld1rw.
1361 define <vscale x 4 x i32> @negtest_dup_ld1rs_i32_passthru_nxv4i32(<vscale x 4 x i32> %pt, <vscale x 4 x i1> %pg, ptr %addr) {
1362 ; CHECK-LABEL: negtest_dup_ld1rs_i32_passthru_nxv4i32:
1364 ; CHECK-NEXT: ldr w8, [x0]
1365 ; CHECK-NEXT: mov z0.s, p0/m, w8
1367 %ld = load i32, ptr %addr
1368 %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> %pt, <vscale x 4 x i1> %pg, i32 %ld)
1369 ret <vscale x 4 x i32> %res
; Negative test: passthru is the argument %pt, so the expected code is a
; scalar ldr (x8) followed by a predicated merging mov, not ld1rd.
1371 define <vscale x 2 x i64> @negtest_dup_ld1rd_i64_passthru_nxv2i64(<vscale x 2 x i64> %pt, <vscale x 2 x i1> %pg, ptr %addr) {
1372 ; CHECK-LABEL: negtest_dup_ld1rd_i64_passthru_nxv2i64:
1374 ; CHECK-NEXT: ldr x8, [x0]
1375 ; CHECK-NEXT: mov z0.d, p0/m, x8
1377 %ld = load i64, ptr %addr
1378 %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> %pt, <vscale x 2 x i1> %pg, i64 %ld)
1379 ret <vscale x 2 x i64> %res
; Negative test (floating point): passthru is the argument %pt. Both
; configurations expect ldr into h1 followed by a predicated merging mov.
1381 define <vscale x 8 x half> @negtest_dup_ld1rh_half_passthru_nxv8f16(<vscale x 8 x half> %pt, <vscale x 8 x i1> %pg, ptr %addr) {
1382 ; CHECK-LABEL: negtest_dup_ld1rh_half_passthru_nxv8f16:
1384 ; CHECK-NEXT: ldr h1, [x0]
1385 ; CHECK-NEXT: mov z0.h, p0/m, h1
1387 %ld = load half, ptr %addr
1388 %res = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> %pt, <vscale x 8 x i1> %pg, half %ld)
1389 ret <vscale x 8 x half> %res
; Negative test (floating point): passthru is the argument %pt. Both
; configurations expect ldr into s1 followed by a predicated merging mov.
1391 define <vscale x 4 x float> @negtest_dup_ld1rs_float_passthru_nxv4f32(<vscale x 4 x float> %pt, <vscale x 4 x i1> %pg, ptr %addr) {
1392 ; CHECK-LABEL: negtest_dup_ld1rs_float_passthru_nxv4f32:
1394 ; CHECK-NEXT: ldr s1, [x0]
1395 ; CHECK-NEXT: mov z0.s, p0/m, s1
1397 %ld = load float, ptr %addr
1398 %res = call <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float> %pt, <vscale x 4 x i1> %pg, float %ld)
1399 ret <vscale x 4 x float> %res
; Negative test (floating point): passthru is the argument %pt. Both
; configurations expect ldr into d1 followed by a predicated merging mov.
1401 define <vscale x 2 x double> @negtest_dup_ld1rd_double_passthru_nxv2f64(<vscale x 2 x double> %pt, <vscale x 2 x i1> %pg, ptr %addr) {
1402 ; CHECK-LABEL: negtest_dup_ld1rd_double_passthru_nxv2f64:
1404 ; CHECK-NEXT: ldr d1, [x0]
1405 ; CHECK-NEXT: mov z0.d, p0/m, d1
1407 %ld = load double, ptr %addr
1408 %res = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> %pt, <vscale x 2 x i1> %pg, double %ld)
1409 ret <vscale x 2 x double> %res
1413 ; Check that a load consumed by a scalable splat prefers a replicating load.
; The i8 at %src + 1 is sign-extended to i64, splatted and stored to %out.
; Expected: ld1rsb with a #1 immediate offset plus a separate "add x0, x0, #1",
; rather than a pre-indexed scalar load.
; NOTE(review): the ret line is not visible in this excerpt; the add into x0
; suggests the incremented pointer is what is returned. Confirm.
1414 define i8* @avoid_preindex_load(i8* %src, <vscale x 2 x i64>* %out) {
1415 ; CHECK-LABEL: avoid_preindex_load:
1417 ; CHECK-NEXT: ptrue p0.d
1418 ; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1]
1419 ; CHECK-NEXT: add x0, x0, #1
1420 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1422 %ptr = getelementptr inbounds i8, i8* %src, i64 1
1423 %tmp = load i8, i8* %ptr, align 4
1424 %ext = sext i8 %tmp to i64
1425 %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
1426 %dup = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
1427 store <vscale x 2 x i64> %dup, <vscale x 2 x i64>* %out
1431 ; Check that a load consumed by a scalable splat prefers a replicating
1432 ; load over a pre-indexed load.
; Here the splat is an sve.dup with undef passthru governed by %pg: the
; expected ld1rsb uses p0 (the caller's predicate), while the store uses a
; freshly materialised all-true predicate p1.
1433 define i8* @avoid_preindex_load_dup(i8* %src, <vscale x 2 x i1> %pg, <vscale x 2 x i64>* %out) {
1434 ; CHECK-LABEL: avoid_preindex_load_dup:
1436 ; CHECK-NEXT: ptrue p1.d
1437 ; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1]
1438 ; CHECK-NEXT: add x0, x0, #1
1439 ; CHECK-NEXT: st1d { z0.d }, p1, [x1]
1441 %ptr = getelementptr inbounds i8, i8* %src, i64 1
1442 %tmp = load i8, i8* %ptr, align 4
1443 %ext = sext i8 %tmp to i64
1444 %dup = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
1445 store <vscale x 2 x i64> %dup, <vscale x 2 x i64>* %out
1449 ; Same as avoid_preindex_load_dup, but with zero passthru.
; The expected code is identical: zeroing ld1rsb at [x0, #1], a separate
; add of #1 to x0, and a store under the all-true predicate p1.
1450 define i8* @avoid_preindex_load_dup_passthru_zero(i8* %src, <vscale x 2 x i1> %pg, <vscale x 2 x i64>* %out) {
1451 ; CHECK-LABEL: avoid_preindex_load_dup_passthru_zero:
1453 ; CHECK-NEXT: ptrue p1.d
1454 ; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1]
1455 ; CHECK-NEXT: add x0, x0, #1
1456 ; CHECK-NEXT: st1d { z0.d }, p1, [x1]
1458 %ptr = getelementptr inbounds i8, i8* %src, i64 1
1459 %tmp = load i8, i8* %ptr, align 4
1460 %ext = sext i8 %tmp to i64
1461 %dup = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %pg, i64 %ext)
1462 store <vscale x 2 x i64> %dup, <vscale x 2 x i64>* %out
1466 ; If a dup has a non-undef passthru, stick with the pre-indexed load.
; The passthru is the argument %passthru, so the expected code is a
; pre-indexed ldrsb ("[x0, #1]!") followed by a predicated merging mov into
; z0, with no separate add.
1467 define i8* @preindex_load_dup_passthru(<vscale x 2 x i64> %passthru, i8* %src, <vscale x 2 x i1> %pg, <vscale x 2 x i64>* %out) {
1468 ; CHECK-LABEL: preindex_load_dup_passthru:
1470 ; CHECK-NEXT: ptrue p1.d
1471 ; CHECK-NEXT: ldrsb x8, [x0, #1]!
1472 ; CHECK-NEXT: mov z0.d, p0/m, x8
1473 ; CHECK-NEXT: st1d { z0.d }, p1, [x1]
1475 %ptr = getelementptr inbounds i8, i8* %src, i64 1
1476 %tmp = load i8, i8* %ptr, align 4
1477 %ext = sext i8 %tmp to i64
1478 %dup = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> %passthru, <vscale x 2 x i1> %pg, i64 %ext)
1479 store <vscale x 2 x i64> %dup, <vscale x 2 x i64>* %out
1483 ; Show that a second user of the load prevents the replicating load
1484 ; check which would ordinarily inhibit indexed loads from firing.
; The sign-extended value is both splatted (stored to %out) and stored as a
; scalar to %dst. Expected: a pre-indexed ldrsb into x8, an unpredicated mov
; broadcast of x8, then the vector store and the scalar str of x8.
1485 define i8* @preidx8sext64_instead_of_ld1r(i8* %src, <vscale x 2 x i64>* %out, i64* %dst) {
1486 ; CHECK-LABEL: preidx8sext64_instead_of_ld1r:
1488 ; CHECK-NEXT: ptrue p0.d
1489 ; CHECK-NEXT: ldrsb x8, [x0, #1]!
1490 ; CHECK-NEXT: mov z0.d, x8
1491 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1492 ; CHECK-NEXT: str x8, [x2]
1494 %ptr = getelementptr inbounds i8, i8* %src, i64 1
1495 %tmp = load i8, i8* %ptr, align 4
1496 %ext = sext i8 %tmp to i64
1497 %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
1498 %dup = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
1499 store <vscale x 2 x i64> %dup, <vscale x 2 x i64>* %out
1500 store i64 %ext, i64* %dst
; Intrinsic declarations used by the tests in this file.
; sve.dupq.lane: (scalable vector, i64 lane index) -> scalable vector.
1505 declare <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8>, i64)
1506 declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
1507 declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
1508 declare <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64>, i64)
1509 declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
1510 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat>, i64)
1511 declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
1512 declare <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double>, i64)
; vector.insert: (scalable destination, fixed-width subvector, i64 index).
1514 declare <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double>, <2 x double>, i64)
1515 declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
1516 declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half>, <8 x half>, i64)
1517 declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
1518 declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
1519 declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
1520 declare <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)
1521 declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)
; sve.dup: (passthru vector, governing predicate, scalar to broadcast).
1523 declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
1524 declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
1525 declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
1526 declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
1527 declare <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half)
1528 declare <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float)
1529 declare <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
1530 declare <vscale x 4 x half> @llvm.aarch64.sve.dup.nxv4f16(<vscale x 4 x half>, <vscale x 4 x i1>, half)
1531 declare <vscale x 2 x half> @llvm.aarch64.sve.dup.nxv2f16(<vscale x 2 x half>, <vscale x 2 x i1>, half)
1532 declare <vscale x 2 x float> @llvm.aarch64.sve.dup.nxv2f32(<vscale x 2 x float>, <vscale x 2 x i1>, float)
; Attribute groups. #0 enables +sve,+bf16 and is attached to the bf16 and
; integer dupq tests.
; NOTE(review): #1 (neoverse-v1) is not referenced by any function visible
; in this part of the file; confirm it is used elsewhere or is a leftover.
1535 attributes #0 = { "target-features"="+sve,+bf16" }
1536 attributes #1 = { "target-cpu"="neoverse-v1" }