1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+bf16,+sve | FileCheck %s
; Inserts the scalar i8 %l into lane 0 of an all-zero <8 x i8>.
; The check expects the scalar SIMD&FP load `ldr b0, [x0]`.
4 define <8 x i8> @loadv8i8(ptr %p) {
5 ; CHECK-LABEL: loadv8i8:
7 ; CHECK-NEXT: ldr b0, [x0]
10 %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
; Inserts the scalar i8 %l into lane 0 of an all-zero <16 x i8>.
; The check expects the scalar SIMD&FP load `ldr b0, [x0]`.
14 define <16 x i8> @loadv16i8(ptr %p) {
15 ; CHECK-LABEL: loadv16i8:
17 ; CHECK-NEXT: ldr b0, [x0]
20 %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
; Inserts the scalar i16 %l into lane 0 of an all-zero <4 x i16>.
; The check expects the scalar SIMD&FP load `ldr h0, [x0]`.
24 define <4 x i16> @loadv4i16(ptr %p) {
25 ; CHECK-LABEL: loadv4i16:
27 ; CHECK-NEXT: ldr h0, [x0]
30 %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
; Inserts the scalar i16 %l into lane 0 of an all-zero <8 x i16>.
; The check expects the scalar SIMD&FP load `ldr h0, [x0]`.
34 define <8 x i16> @loadv8i16(ptr %p) {
35 ; CHECK-LABEL: loadv8i16:
37 ; CHECK-NEXT: ldr h0, [x0]
40 %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
; Inserts the scalar i32 %l into lane 0 of an all-zero <2 x i32>.
; The check expects the scalar SIMD&FP load `ldr s0, [x0]`.
44 define <2 x i32> @loadv2i32(ptr %p) {
45 ; CHECK-LABEL: loadv2i32:
47 ; CHECK-NEXT: ldr s0, [x0]
50 %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
; Inserts the scalar i32 %l into lane 0 of an all-zero <4 x i32>.
; The check expects the scalar SIMD&FP load `ldr s0, [x0]`.
54 define <4 x i32> @loadv4i32(ptr %p) {
55 ; CHECK-LABEL: loadv4i32:
57 ; CHECK-NEXT: ldr s0, [x0]
60 %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
; Inserts the scalar i64 %l into lane 0 of an all-zero <2 x i64>.
; The check expects the scalar SIMD&FP load `ldr d0, [x0]`.
64 define <2 x i64> @loadv2i64(ptr %p) {
65 ; CHECK-LABEL: loadv2i64:
67 ; CHECK-NEXT: ldr d0, [x0]
70 %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
; Loads a half from %p and inserts it into lane 0 of an all-zero <4 x half>.
; The check expects the scalar SIMD&FP load `ldr h0, [x0]`.
75 define <4 x half> @loadv4f16(ptr %p) {
76 ; CHECK-LABEL: loadv4f16:
78 ; CHECK-NEXT: ldr h0, [x0]
80 %l = load half, ptr %p
81 %v = insertelement <4 x half> zeroinitializer, half %l, i32 0
; Loads a half from %p and inserts it into lane 0 of an all-zero <8 x half>.
; The check expects the scalar SIMD&FP load `ldr h0, [x0]`.
85 define <8 x half> @loadv8f16(ptr %p) {
86 ; CHECK-LABEL: loadv8f16:
88 ; CHECK-NEXT: ldr h0, [x0]
90 %l = load half, ptr %p
91 %v = insertelement <8 x half> zeroinitializer, half %l, i32 0
; Loads a bfloat from %p and inserts it into lane 0 of an all-zero <4 x bfloat>.
; The check expects the scalar SIMD&FP load `ldr h0, [x0]`.
95 define <4 x bfloat> @loadv4bf16(ptr %p) {
96 ; CHECK-LABEL: loadv4bf16:
98 ; CHECK-NEXT: ldr h0, [x0]
100 %l = load bfloat, ptr %p
101 %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
; Loads a bfloat from %p and inserts it into lane 0 of an all-zero <8 x bfloat>.
; The check expects the scalar SIMD&FP load `ldr h0, [x0]`.
105 define <8 x bfloat> @loadv8bf16(ptr %p) {
106 ; CHECK-LABEL: loadv8bf16:
108 ; CHECK-NEXT: ldr h0, [x0]
110 %l = load bfloat, ptr %p
111 %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
; Loads a float from %p and inserts it into lane 0 of an all-zero <2 x float>.
; The check expects the scalar SIMD&FP load `ldr s0, [x0]`.
115 define <2 x float> @loadv2f32(ptr %p) {
116 ; CHECK-LABEL: loadv2f32:
118 ; CHECK-NEXT: ldr s0, [x0]
120 %l = load float, ptr %p
121 %v = insertelement <2 x float> zeroinitializer, float %l, i32 0
; Loads a float from %p and inserts it into lane 0 of an all-zero <4 x float>.
; The check expects the scalar SIMD&FP load `ldr s0, [x0]`.
125 define <4 x float> @loadv4f32(ptr %p) {
126 ; CHECK-LABEL: loadv4f32:
128 ; CHECK-NEXT: ldr s0, [x0]
130 %l = load float, ptr %p
131 %v = insertelement <4 x float> zeroinitializer, float %l, i32 0
; Loads a double from %p and inserts it into lane 0 of an all-zero <2 x double>.
; The check expects the scalar SIMD&FP load `ldr d0, [x0]`.
135 define <2 x double> @loadv2f64(ptr %p) {
136 ; CHECK-LABEL: loadv2f64:
138 ; CHECK-NEXT: ldr d0, [x0]
140 %l = load double, ptr %p
141 %v = insertelement <2 x double> zeroinitializer, double %l, i32 0
; Inserts the scalar i8 %l into lane 0 of an all-zero <8 x i8>; %g addresses %p + 1 byte.
; The check expects the immediate-offset form `ldr b0, [x0, #1]`.
148 define <8 x i8> @loadv8i8_offset(ptr %p) {
149 ; CHECK-LABEL: loadv8i8_offset:
151 ; CHECK-NEXT: ldr b0, [x0, #1]
153 %g = getelementptr inbounds i8, ptr %p, i64 1
155 %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
; Inserts the scalar i8 %l into lane 0 of an all-zero <16 x i8>; %g addresses %p + 1 byte.
; The check expects the immediate-offset form `ldr b0, [x0, #1]`.
159 define <16 x i8> @loadv16i8_offset(ptr %p) {
160 ; CHECK-LABEL: loadv16i8_offset:
162 ; CHECK-NEXT: ldr b0, [x0, #1]
164 %g = getelementptr inbounds i8, ptr %p, i64 1
166 %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
; Loads an i16 from %p + 1 byte and inserts it into lane 0 of an all-zero <4 x i16>.
; The check expects the unscaled-offset load `ldur h0, [x0, #1]`.
170 define <4 x i16> @loadv4i16_offset(ptr %p) {
171 ; CHECK-LABEL: loadv4i16_offset:
173 ; CHECK-NEXT: ldur h0, [x0, #1]
175 %g = getelementptr inbounds i8, ptr %p, i64 1
176 %l = load i16, ptr %g
177 %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
; Loads an i16 from %p + 1 byte and inserts it into lane 0 of an all-zero <8 x i16>.
; The check expects the unscaled-offset load `ldur h0, [x0, #1]`.
181 define <8 x i16> @loadv8i16_offset(ptr %p) {
182 ; CHECK-LABEL: loadv8i16_offset:
184 ; CHECK-NEXT: ldur h0, [x0, #1]
186 %g = getelementptr inbounds i8, ptr %p, i64 1
187 %l = load i16, ptr %g
188 %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
; Loads an i32 from %p + 1 byte and inserts it into lane 0 of an all-zero <2 x i32>.
; The check expects the unscaled-offset load `ldur s0, [x0, #1]`.
192 define <2 x i32> @loadv2i32_offset(ptr %p) {
193 ; CHECK-LABEL: loadv2i32_offset:
195 ; CHECK-NEXT: ldur s0, [x0, #1]
197 %g = getelementptr inbounds i8, ptr %p, i64 1
198 %l = load i32, ptr %g
199 %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
; Loads an i32 from %p + 1 byte and inserts it into lane 0 of an all-zero <4 x i32>.
; The check expects the unscaled-offset load `ldur s0, [x0, #1]`.
203 define <4 x i32> @loadv4i32_offset(ptr %p) {
204 ; CHECK-LABEL: loadv4i32_offset:
206 ; CHECK-NEXT: ldur s0, [x0, #1]
208 %g = getelementptr inbounds i8, ptr %p, i64 1
209 %l = load i32, ptr %g
210 %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
; Loads an i64 from %p + 1 byte and inserts it into lane 0 of an all-zero <2 x i64>.
; The check expects the unscaled-offset load `ldur d0, [x0, #1]`.
214 define <2 x i64> @loadv2i64_offset(ptr %p) {
215 ; CHECK-LABEL: loadv2i64_offset:
217 ; CHECK-NEXT: ldur d0, [x0, #1]
219 %g = getelementptr inbounds i8, ptr %p, i64 1
220 %l = load i64, ptr %g
221 %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
; Loads a half from %p + 1 byte and inserts it into lane 0 of an all-zero <4 x half>.
; The check expects the unscaled-offset load `ldur h0, [x0, #1]`.
226 define <4 x half> @loadv4f16_offset(ptr %p) {
227 ; CHECK-LABEL: loadv4f16_offset:
229 ; CHECK-NEXT: ldur h0, [x0, #1]
231 %g = getelementptr inbounds i8, ptr %p, i64 1
232 %l = load half, ptr %g
233 %v = insertelement <4 x half> zeroinitializer, half %l, i32 0
; Loads a half from %p + 1 byte and inserts it into lane 0 of an all-zero <8 x half>.
; The check expects the unscaled-offset load `ldur h0, [x0, #1]`.
237 define <8 x half> @loadv8f16_offset(ptr %p) {
238 ; CHECK-LABEL: loadv8f16_offset:
240 ; CHECK-NEXT: ldur h0, [x0, #1]
242 %g = getelementptr inbounds i8, ptr %p, i64 1
243 %l = load half, ptr %g
244 %v = insertelement <8 x half> zeroinitializer, half %l, i32 0
; Loads a bfloat from %p + 1 byte and inserts it into lane 0 of an all-zero <4 x bfloat>.
; The check expects the unscaled-offset load `ldur h0, [x0, #1]`.
248 define <4 x bfloat> @loadv4bf16_offset(ptr %p) {
249 ; CHECK-LABEL: loadv4bf16_offset:
251 ; CHECK-NEXT: ldur h0, [x0, #1]
253 %g = getelementptr inbounds i8, ptr %p, i64 1
254 %l = load bfloat, ptr %g
255 %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
; Loads a bfloat from %p + 1 byte and inserts it into lane 0 of an all-zero <8 x bfloat>.
; The check expects the unscaled-offset load `ldur h0, [x0, #1]`.
259 define <8 x bfloat> @loadv8bf16_offset(ptr %p) {
260 ; CHECK-LABEL: loadv8bf16_offset:
262 ; CHECK-NEXT: ldur h0, [x0, #1]
264 %g = getelementptr inbounds i8, ptr %p, i64 1
265 %l = load bfloat, ptr %g
266 %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
; Loads a float from %p + 1 byte and inserts it into lane 0 of an all-zero <2 x float>.
; The check expects the unscaled-offset load `ldur s0, [x0, #1]`.
270 define <2 x float> @loadv2f32_offset(ptr %p) {
271 ; CHECK-LABEL: loadv2f32_offset:
273 ; CHECK-NEXT: ldur s0, [x0, #1]
275 %g = getelementptr inbounds i8, ptr %p, i64 1
276 %l = load float, ptr %g
277 %v = insertelement <2 x float> zeroinitializer, float %l, i32 0
; Loads a float from %p + 1 byte and inserts it into lane 0 of an all-zero <4 x float>.
; The check expects the unscaled-offset load `ldur s0, [x0, #1]`.
281 define <4 x float> @loadv4f32_offset(ptr %p) {
282 ; CHECK-LABEL: loadv4f32_offset:
284 ; CHECK-NEXT: ldur s0, [x0, #1]
286 %g = getelementptr inbounds i8, ptr %p, i64 1
287 %l = load float, ptr %g
288 %v = insertelement <4 x float> zeroinitializer, float %l, i32 0
; Loads a double from %p + 1 byte and inserts it into lane 0 of an all-zero <2 x double>.
; The check expects the unscaled-offset load `ldur d0, [x0, #1]`.
292 define <2 x double> @loadv2f64_offset(ptr %p) {
293 ; CHECK-LABEL: loadv2f64_offset:
295 ; CHECK-NEXT: ldur d0, [x0, #1]
297 %g = getelementptr inbounds i8, ptr %p, i64 1
298 %l = load double, ptr %g
299 %v = insertelement <2 x double> zeroinitializer, double %l, i32 0
; Inserts the scalar i8 %l into lane 0 of an all-zero <8 x i8>; %g addresses %p - 1 byte.
; The check expects the unscaled-offset load `ldur b0, [x0, #-1]`.
304 define <8 x i8> @loadv8i8_noffset(ptr %p) {
305 ; CHECK-LABEL: loadv8i8_noffset:
307 ; CHECK-NEXT: ldur b0, [x0, #-1]
309 %g = getelementptr inbounds i8, ptr %p, i64 -1
311 %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
; Inserts the scalar i8 %l into lane 0 of an all-zero <16 x i8>; %g addresses %p - 1 byte.
; The check expects the unscaled-offset load `ldur b0, [x0, #-1]`.
315 define <16 x i8> @loadv16i8_noffset(ptr %p) {
316 ; CHECK-LABEL: loadv16i8_noffset:
318 ; CHECK-NEXT: ldur b0, [x0, #-1]
320 %g = getelementptr inbounds i8, ptr %p, i64 -1
322 %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
; Loads an i16 from %p - 1 byte and inserts it into lane 0 of an all-zero <4 x i16>.
; The check expects the unscaled-offset load `ldur h0, [x0, #-1]`.
326 define <4 x i16> @loadv4i16_noffset(ptr %p) {
327 ; CHECK-LABEL: loadv4i16_noffset:
329 ; CHECK-NEXT: ldur h0, [x0, #-1]
331 %g = getelementptr inbounds i8, ptr %p, i64 -1
332 %l = load i16, ptr %g
333 %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
; Loads an i16 from %p - 1 byte and inserts it into lane 0 of an all-zero <8 x i16>.
; The check expects the unscaled-offset load `ldur h0, [x0, #-1]`.
337 define <8 x i16> @loadv8i16_noffset(ptr %p) {
338 ; CHECK-LABEL: loadv8i16_noffset:
340 ; CHECK-NEXT: ldur h0, [x0, #-1]
342 %g = getelementptr inbounds i8, ptr %p, i64 -1
343 %l = load i16, ptr %g
344 %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
; Loads an i32 from %p - 1 byte and inserts it into lane 0 of an all-zero <2 x i32>.
; The check expects the unscaled-offset load `ldur s0, [x0, #-1]`.
348 define <2 x i32> @loadv2i32_noffset(ptr %p) {
349 ; CHECK-LABEL: loadv2i32_noffset:
351 ; CHECK-NEXT: ldur s0, [x0, #-1]
353 %g = getelementptr inbounds i8, ptr %p, i64 -1
354 %l = load i32, ptr %g
355 %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
; Loads an i32 from %p - 1 byte and inserts it into lane 0 of an all-zero <4 x i32>.
; The check expects the unscaled-offset load `ldur s0, [x0, #-1]`.
359 define <4 x i32> @loadv4i32_noffset(ptr %p) {
360 ; CHECK-LABEL: loadv4i32_noffset:
362 ; CHECK-NEXT: ldur s0, [x0, #-1]
364 %g = getelementptr inbounds i8, ptr %p, i64 -1
365 %l = load i32, ptr %g
366 %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
; Loads an i64 from %p - 1 byte and inserts it into lane 0 of an all-zero <2 x i64>.
; The check expects the unscaled-offset load `ldur d0, [x0, #-1]`.
370 define <2 x i64> @loadv2i64_noffset(ptr %p) {
371 ; CHECK-LABEL: loadv2i64_noffset:
373 ; CHECK-NEXT: ldur d0, [x0, #-1]
375 %g = getelementptr inbounds i8, ptr %p, i64 -1
376 %l = load i64, ptr %g
377 %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
; Loads a half from %p - 1 byte and inserts it into lane 0 of an all-zero <4 x half>.
; The check expects the unscaled-offset load `ldur h0, [x0, #-1]`.
381 define <4 x half> @loadv4f16_noffset(ptr %p) {
382 ; CHECK-LABEL: loadv4f16_noffset:
384 ; CHECK-NEXT: ldur h0, [x0, #-1]
386 %g = getelementptr inbounds i8, ptr %p, i64 -1
387 %l = load half, ptr %g
388 %v = insertelement <4 x half> zeroinitializer, half %l, i32 0
; Loads a half from %p - 1 byte and inserts it into lane 0 of an all-zero <8 x half>.
; The check expects the unscaled-offset load `ldur h0, [x0, #-1]`.
392 define <8 x half> @loadv8f16_noffset(ptr %p) {
393 ; CHECK-LABEL: loadv8f16_noffset:
395 ; CHECK-NEXT: ldur h0, [x0, #-1]
397 %g = getelementptr inbounds i8, ptr %p, i64 -1
398 %l = load half, ptr %g
399 %v = insertelement <8 x half> zeroinitializer, half %l, i32 0
; Loads a bfloat from %p - 1 byte and inserts it into lane 0 of an all-zero <4 x bfloat>.
; The check expects the unscaled-offset load `ldur h0, [x0, #-1]`.
403 define <4 x bfloat> @loadv4bf16_noffset(ptr %p) {
404 ; CHECK-LABEL: loadv4bf16_noffset:
406 ; CHECK-NEXT: ldur h0, [x0, #-1]
408 %g = getelementptr inbounds i8, ptr %p, i64 -1
409 %l = load bfloat, ptr %g
410 %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
; Loads a bfloat from %p - 1 byte and inserts it into lane 0 of an all-zero <8 x bfloat>.
; The check expects the unscaled-offset load `ldur h0, [x0, #-1]`.
414 define <8 x bfloat> @loadv8bf16_noffset(ptr %p) {
415 ; CHECK-LABEL: loadv8bf16_noffset:
417 ; CHECK-NEXT: ldur h0, [x0, #-1]
419 %g = getelementptr inbounds i8, ptr %p, i64 -1
420 %l = load bfloat, ptr %g
421 %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
; Loads a float from %p - 1 byte and inserts it into lane 0 of an all-zero <2 x float>.
; The check expects the unscaled-offset load `ldur s0, [x0, #-1]`.
425 define <2 x float> @loadv2f32_noffset(ptr %p) {
426 ; CHECK-LABEL: loadv2f32_noffset:
428 ; CHECK-NEXT: ldur s0, [x0, #-1]
430 %g = getelementptr inbounds i8, ptr %p, i64 -1
431 %l = load float, ptr %g
432 %v = insertelement <2 x float> zeroinitializer, float %l, i32 0
; Loads a float from %p - 1 byte and inserts it into lane 0 of an all-zero <4 x float>.
; The check expects the unscaled-offset load `ldur s0, [x0, #-1]`.
436 define <4 x float> @loadv4f32_noffset(ptr %p) {
437 ; CHECK-LABEL: loadv4f32_noffset:
439 ; CHECK-NEXT: ldur s0, [x0, #-1]
441 %g = getelementptr inbounds i8, ptr %p, i64 -1
442 %l = load float, ptr %g
443 %v = insertelement <4 x float> zeroinitializer, float %l, i32 0
; Loads a double from %p - 1 byte and inserts it into lane 0 of an all-zero <2 x double>.
; The check expects the unscaled-offset load `ldur d0, [x0, #-1]`.
447 define <2 x double> @loadv2f64_noffset(ptr %p) {
448 ; CHECK-LABEL: loadv2f64_noffset:
450 ; CHECK-NEXT: ldur d0, [x0, #-1]
452 %g = getelementptr inbounds i8, ptr %p, i64 -1
453 %l = load double, ptr %g
454 %v = insertelement <2 x double> zeroinitializer, double %l, i32 0
459 ; roW addressing modes: the offset is a 32-bit index held in a W register (sign-extended, scaled by the element size)
; Inserts the scalar i8 %l into lane 0 of an all-zero <8 x i8>; %g indexes %p by the i32 %o in bytes.
; The check expects the extended-register form `ldr b0, [x0, w1, sxtw]`.
461 define <8 x i8> @loadv8i8_roW(ptr %p, i32 %o) {
462 ; CHECK-LABEL: loadv8i8_roW:
464 ; CHECK-NEXT: ldr b0, [x0, w1, sxtw]
466 %g = getelementptr inbounds i8, ptr %p, i32 %o
468 %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
; Inserts the scalar i8 %l into lane 0 of an all-zero <16 x i8>; %g indexes %p by the i32 %o in bytes.
; The check expects the extended-register form `ldr b0, [x0, w1, sxtw]`.
472 define <16 x i8> @loadv16i8_roW(ptr %p, i32 %o) {
473 ; CHECK-LABEL: loadv16i8_roW:
475 ; CHECK-NEXT: ldr b0, [x0, w1, sxtw]
477 %g = getelementptr inbounds i8, ptr %p, i32 %o
479 %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
; Loads an i16 at i16 index %o (an i32) from %p and inserts it into lane 0 of an all-zero <4 x i16>.
; The check expects the extended-register form `ldr h0, [x0, w1, sxtw #1]`.
483 define <4 x i16> @loadv4i16_roW(ptr %p, i32 %o) {
484 ; CHECK-LABEL: loadv4i16_roW:
486 ; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1]
488 %g = getelementptr inbounds i16, ptr %p, i32 %o
489 %l = load i16, ptr %g
490 %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
; Loads an i16 at i16 index %o (an i32) from %p and inserts it into lane 0 of an all-zero <8 x i16>.
; The check expects the extended-register form `ldr h0, [x0, w1, sxtw #1]`.
494 define <8 x i16> @loadv8i16_roW(ptr %p, i32 %o) {
495 ; CHECK-LABEL: loadv8i16_roW:
497 ; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1]
499 %g = getelementptr inbounds i16, ptr %p, i32 %o
500 %l = load i16, ptr %g
501 %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
; Loads an i32 at i32 index %o (an i32) from %p and inserts it into lane 0 of an all-zero <2 x i32>.
; The check expects the extended-register form `ldr s0, [x0, w1, sxtw #2]`.
505 define <2 x i32> @loadv2i32_roW(ptr %p, i32 %o) {
506 ; CHECK-LABEL: loadv2i32_roW:
508 ; CHECK-NEXT: ldr s0, [x0, w1, sxtw #2]
510 %g = getelementptr inbounds i32, ptr %p, i32 %o
511 %l = load i32, ptr %g
512 %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
; Loads an i32 at i32 index %o (an i32) from %p and inserts it into lane 0 of an all-zero <4 x i32>.
; The check expects the extended-register form `ldr s0, [x0, w1, sxtw #2]`.
516 define <4 x i32> @loadv4i32_roW(ptr %p, i32 %o) {
517 ; CHECK-LABEL: loadv4i32_roW:
519 ; CHECK-NEXT: ldr s0, [x0, w1, sxtw #2]
521 %g = getelementptr inbounds i32, ptr %p, i32 %o
522 %l = load i32, ptr %g
523 %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
; Loads an i64 at i64 index %o (an i32) from %p and inserts it into lane 0 of an all-zero <2 x i64>.
; The check expects the extended-register form `ldr d0, [x0, w1, sxtw #3]`.
527 define <2 x i64> @loadv2i64_roW(ptr %p, i32 %o) {
528 ; CHECK-LABEL: loadv2i64_roW:
530 ; CHECK-NEXT: ldr d0, [x0, w1, sxtw #3]
532 %g = getelementptr inbounds i64, ptr %p, i32 %o
533 %l = load i64, ptr %g
534 %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
; Loads a half at half index %o (an i32) from %p and inserts it into lane 0 of an all-zero <4 x half>.
; The check expects the extended-register form `ldr h0, [x0, w1, sxtw #1]`.
538 define <4 x half> @loadv4f16_roW(ptr %p, i32 %o) {
539 ; CHECK-LABEL: loadv4f16_roW:
541 ; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1]
543 %g = getelementptr inbounds half, ptr %p, i32 %o
544 %l = load half, ptr %g
545 %v = insertelement <4 x half> zeroinitializer, half %l, i32 0
; Loads a half at half index %o (an i32) from %p and inserts it into lane 0 of an all-zero <8 x half>.
; The check expects the extended-register form `ldr h0, [x0, w1, sxtw #1]`.
549 define <8 x half> @loadv8f16_roW(ptr %p, i32 %o) {
550 ; CHECK-LABEL: loadv8f16_roW:
552 ; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1]
554 %g = getelementptr inbounds half, ptr %p, i32 %o
555 %l = load half, ptr %g
556 %v = insertelement <8 x half> zeroinitializer, half %l, i32 0
; Loads a bfloat at bfloat index %o (an i32) from %p and inserts it into lane 0 of an all-zero <4 x bfloat>.
; The check expects the extended-register form `ldr h0, [x0, w1, sxtw #1]`.
560 define <4 x bfloat> @loadv4bf16_roW(ptr %p, i32 %o) {
561 ; CHECK-LABEL: loadv4bf16_roW:
563 ; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1]
565 %g = getelementptr inbounds bfloat, ptr %p, i32 %o
566 %l = load bfloat, ptr %g
567 %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
; Loads a bfloat at bfloat index %o (an i32) from %p and inserts it into lane 0 of an all-zero <8 x bfloat>.
; The check expects the extended-register form `ldr h0, [x0, w1, sxtw #1]`.
571 define <8 x bfloat> @loadv8bf16_roW(ptr %p, i32 %o) {
572 ; CHECK-LABEL: loadv8bf16_roW:
574 ; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1]
576 %g = getelementptr inbounds bfloat, ptr %p, i32 %o
577 %l = load bfloat, ptr %g
578 %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
; Loads a float at float index %o (an i32) from %p and inserts it into lane 0 of an all-zero <2 x float>.
; The check expects the extended-register form `ldr s0, [x0, w1, sxtw #2]`.
582 define <2 x float> @loadv2f32_roW(ptr %p, i32 %o) {
583 ; CHECK-LABEL: loadv2f32_roW:
585 ; CHECK-NEXT: ldr s0, [x0, w1, sxtw #2]
587 %g = getelementptr inbounds float, ptr %p, i32 %o
588 %l = load float, ptr %g
589 %v = insertelement <2 x float> zeroinitializer, float %l, i32 0
; Loads a float at float index %o (an i32) from %p and inserts it into lane 0 of an all-zero <4 x float>.
; The check expects the extended-register form `ldr s0, [x0, w1, sxtw #2]`.
593 define <4 x float> @loadv4f32_roW(ptr %p, i32 %o) {
594 ; CHECK-LABEL: loadv4f32_roW:
596 ; CHECK-NEXT: ldr s0, [x0, w1, sxtw #2]
598 %g = getelementptr inbounds float, ptr %p, i32 %o
599 %l = load float, ptr %g
600 %v = insertelement <4 x float> zeroinitializer, float %l, i32 0
; Loads a double at double index %o (an i32) from %p and inserts it into lane 0 of an all-zero <2 x double>.
; The check expects the extended-register form `ldr d0, [x0, w1, sxtw #3]`.
604 define <2 x double> @loadv2f64_roW(ptr %p, i32 %o) {
605 ; CHECK-LABEL: loadv2f64_roW:
607 ; CHECK-NEXT: ldr d0, [x0, w1, sxtw #3]
609 %g = getelementptr inbounds double, ptr %p, i32 %o
610 %l = load double, ptr %g
611 %v = insertelement <2 x double> zeroinitializer, double %l, i32 0
; Inserts the scalar i8 %l into lane 0 of an all-zero <8 x i8>; %g indexes %p by the i64 %o in bytes.
; The check expects the register-offset form `ldr b0, [x0, x1]`.
618 define <8 x i8> @loadv8i8_roX(ptr %p, i64 %o) {
619 ; CHECK-LABEL: loadv8i8_roX:
621 ; CHECK-NEXT: ldr b0, [x0, x1]
623 %g = getelementptr inbounds i8, ptr %p, i64 %o
625 %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
; Inserts the scalar i8 %l into lane 0 of an all-zero <16 x i8>; %g indexes %p by the i64 %o in bytes.
; The check expects the register-offset form `ldr b0, [x0, x1]`.
629 define <16 x i8> @loadv16i8_roX(ptr %p, i64 %o) {
630 ; CHECK-LABEL: loadv16i8_roX:
632 ; CHECK-NEXT: ldr b0, [x0, x1]
634 %g = getelementptr inbounds i8, ptr %p, i64 %o
636 %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
; Loads an i16 at i16 index %o (an i64) from %p and inserts it into lane 0 of an all-zero <4 x i16>.
; The check expects the shifted register-offset form `ldr h0, [x0, x1, lsl #1]`.
640 define <4 x i16> @loadv4i16_roX(ptr %p, i64 %o) {
641 ; CHECK-LABEL: loadv4i16_roX:
643 ; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
645 %g = getelementptr inbounds i16, ptr %p, i64 %o
646 %l = load i16, ptr %g
647 %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
; Loads an i16 at i16 index %o (an i64) from %p and inserts it into lane 0 of an all-zero <8 x i16>.
; The check expects the shifted register-offset form `ldr h0, [x0, x1, lsl #1]`.
651 define <8 x i16> @loadv8i16_roX(ptr %p, i64 %o) {
652 ; CHECK-LABEL: loadv8i16_roX:
654 ; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
656 %g = getelementptr inbounds i16, ptr %p, i64 %o
657 %l = load i16, ptr %g
658 %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
; Loads an i32 at i32 index %o (an i64) from %p and inserts it into lane 0 of an all-zero <2 x i32>.
; The check expects the shifted register-offset form `ldr s0, [x0, x1, lsl #2]`.
662 define <2 x i32> @loadv2i32_roX(ptr %p, i64 %o) {
663 ; CHECK-LABEL: loadv2i32_roX:
665 ; CHECK-NEXT: ldr s0, [x0, x1, lsl #2]
667 %g = getelementptr inbounds i32, ptr %p, i64 %o
668 %l = load i32, ptr %g
669 %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
; Loads an i32 at i32 index %o (an i64) from %p and inserts it into lane 0 of an all-zero <4 x i32>.
; The check expects the shifted register-offset form `ldr s0, [x0, x1, lsl #2]`.
673 define <4 x i32> @loadv4i32_roX(ptr %p, i64 %o) {
674 ; CHECK-LABEL: loadv4i32_roX:
676 ; CHECK-NEXT: ldr s0, [x0, x1, lsl #2]
678 %g = getelementptr inbounds i32, ptr %p, i64 %o
679 %l = load i32, ptr %g
680 %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
; Loads an i64 at i64 index %o (an i64) from %p and inserts it into lane 0 of an all-zero <2 x i64>.
; The check expects the shifted register-offset form `ldr d0, [x0, x1, lsl #3]`.
684 define <2 x i64> @loadv2i64_roX(ptr %p, i64 %o) {
685 ; CHECK-LABEL: loadv2i64_roX:
687 ; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
689 %g = getelementptr inbounds i64, ptr %p, i64 %o
690 %l = load i64, ptr %g
691 %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
; Loads a half at half index %o (an i64) from %p and inserts it into lane 0 of an all-zero <4 x half>.
; The check expects the shifted register-offset form `ldr h0, [x0, x1, lsl #1]`.
695 define <4 x half> @loadv4f16_roX(ptr %p, i64 %o) {
696 ; CHECK-LABEL: loadv4f16_roX:
698 ; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
700 %g = getelementptr inbounds half, ptr %p, i64 %o
701 %l = load half, ptr %g
702 %v = insertelement <4 x half> zeroinitializer, half %l, i32 0
; Loads a half at half index %o (an i64) from %p and inserts it into lane 0 of an all-zero <8 x half>.
; The check expects the shifted register-offset form `ldr h0, [x0, x1, lsl #1]`.
706 define <8 x half> @loadv8f16_roX(ptr %p, i64 %o) {
707 ; CHECK-LABEL: loadv8f16_roX:
709 ; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
711 %g = getelementptr inbounds half, ptr %p, i64 %o
712 %l = load half, ptr %g
713 %v = insertelement <8 x half> zeroinitializer, half %l, i32 0
; Loads a bfloat at bfloat index %o (an i64) from %p and inserts it into lane 0 of an all-zero <4 x bfloat>.
; The check expects the shifted register-offset form `ldr h0, [x0, x1, lsl #1]`.
717 define <4 x bfloat> @loadv4bf16_roX(ptr %p, i64 %o) {
718 ; CHECK-LABEL: loadv4bf16_roX:
720 ; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
722 %g = getelementptr inbounds bfloat, ptr %p, i64 %o
723 %l = load bfloat, ptr %g
724 %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
; Loads a bfloat at bfloat index %o (an i64) from %p and inserts it into lane 0 of an all-zero <8 x bfloat>.
; The check expects the shifted register-offset form `ldr h0, [x0, x1, lsl #1]`.
728 define <8 x bfloat> @loadv8bf16_roX(ptr %p, i64 %o) {
729 ; CHECK-LABEL: loadv8bf16_roX:
731 ; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
733 %g = getelementptr inbounds bfloat, ptr %p, i64 %o
734 %l = load bfloat, ptr %g
735 %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
; Loads a float at float index %o (an i64) from %p and inserts it into lane 0 of an all-zero <2 x float>.
; The check expects the shifted register-offset form `ldr s0, [x0, x1, lsl #2]`.
739 define <2 x float> @loadv2f32_roX(ptr %p, i64 %o) {
740 ; CHECK-LABEL: loadv2f32_roX:
742 ; CHECK-NEXT: ldr s0, [x0, x1, lsl #2]
744 %g = getelementptr inbounds float, ptr %p, i64 %o
745 %l = load float, ptr %g
746 %v = insertelement <2 x float> zeroinitializer, float %l, i32 0
; Loads a float at float index %o (an i64) from %p and inserts it into lane 0 of an all-zero <4 x float>.
; The check expects the shifted register-offset form `ldr s0, [x0, x1, lsl #2]`.
750 define <4 x float> @loadv4f32_roX(ptr %p, i64 %o) {
751 ; CHECK-LABEL: loadv4f32_roX:
753 ; CHECK-NEXT: ldr s0, [x0, x1, lsl #2]
755 %g = getelementptr inbounds float, ptr %p, i64 %o
756 %l = load float, ptr %g
757 %v = insertelement <4 x float> zeroinitializer, float %l, i32 0
; Loads a double at double index %o (an i64) from %p and inserts it into lane 0 of an all-zero <2 x double>.
; The check expects the shifted register-offset form `ldr d0, [x0, x1, lsl #3]`.
761 define <2 x double> @loadv2f64_roX(ptr %p, i64 %o) {
762 ; CHECK-LABEL: loadv2f64_roX:
764 ; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
766 %g = getelementptr inbounds double, ptr %p, i64 %o
767 %l = load double, ptr %g
768 %v = insertelement <2 x double> zeroinitializer, double %l, i32 0
; 4x4 predictor kernel. Loads three overlapping 32-bit words from %2 (+0, +1, +2 bytes), places
; each in lane 0 of a <2 x i32> whose lane 1 is zero, and stores four 32-bit rows at
; %0 + k * %1 for k = 0..3. %3 is not referenced.
; The word at %2 + 2 is also consumed as a scalar (lshr 24 / trunc), and the checks expect it to
; be loaded into a GPR (`ldur w8, [x2, #2]`) and inserted via `movi` + `mov v0.s[0], w8`, while
; the other two words are loaded directly with `ldr s1` / `ldur s2`.
773 define void @predictor_4x4_neon(ptr nocapture noundef writeonly %0, i64 noundef %1, ptr nocapture noundef readonly %2, ptr nocapture noundef readnone %3) {
774 ; CHECK-LABEL: predictor_4x4_neon:
776 ; CHECK-NEXT: movi v0.2d, #0000000000000000
777 ; CHECK-NEXT: ldur w8, [x2, #2]
778 ; CHECK-NEXT: ldr s1, [x2]
779 ; CHECK-NEXT: ldur s2, [x2, #1]
780 ; CHECK-NEXT: ushll v3.8h, v2.8b, #1
781 ; CHECK-NEXT: mov v0.s[0], w8
782 ; CHECK-NEXT: lsr w8, w8, #24
783 ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
784 ; CHECK-NEXT: urhadd v1.8b, v1.8b, v2.8b
785 ; CHECK-NEXT: add v0.8h, v0.8h, v3.8h
786 ; CHECK-NEXT: dup v3.8b, w8
787 ; CHECK-NEXT: str s1, [x0]
788 ; CHECK-NEXT: lsl x8, x1, #1
789 ; CHECK-NEXT: rshrn v0.8b, v0.8h, #2
790 ; CHECK-NEXT: zip1 v2.2s, v1.2s, v3.2s
791 ; CHECK-NEXT: str s0, [x0, x1]
792 ; CHECK-NEXT: zip1 v3.2s, v0.2s, v3.2s
793 ; CHECK-NEXT: ext v2.8b, v2.8b, v0.8b, #1
794 ; CHECK-NEXT: ext v1.8b, v3.8b, v0.8b, #1
795 ; CHECK-NEXT: str s2, [x0, x8]
796 ; CHECK-NEXT: add x8, x8, x1
797 ; CHECK-NEXT: str s1, [x0, x8]
; Three overlapping word loads, each inserted into lane 0 of <i32 poison, i32 0>.
799 %5 = load i32, ptr %2, align 4
800 %6 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %5, i64 0
801 %7 = bitcast <2 x i32> %6 to <8 x i8>
802 %8 = getelementptr inbounds i8, ptr %2, i64 1
803 %9 = load i32, ptr %8, align 4
804 %10 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %9, i64 0
805 %11 = bitcast <2 x i32> %10 to <8 x i8>
806 %12 = getelementptr inbounds i8, ptr %2, i64 2
807 %13 = load i32, ptr %12, align 4
808 %14 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %13, i64 0
809 %15 = bitcast <2 x i32> %14 to <8 x i8>
; Scalar use of the third word: its top byte is splatted across an <8 x i8>.
810 %16 = lshr i32 %13, 24
811 %17 = trunc i32 %16 to i8
; NOTE(review): the `undef` operand here (and the `undef` shuffle-mask elements further down)
; could probably be `poison`; confirm the CHECK lines are unchanged after regenerating.
812 %18 = insertelement <8 x i8> undef, i8 %17, i64 0
813 %19 = shufflevector <8 x i8> %18, <8 x i8> poison, <8 x i32> zeroinitializer
; Row 0 source: urhadd of words +0 and +1.
814 %20 = tail call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %7, <8 x i8> %11)
; Row 1 source: widen to i16, form %23 + %21 + 2 * %22, then rshrn by 2.
815 %21 = zext <8 x i8> %7 to <8 x i16>
816 %22 = zext <8 x i8> %11 to <8 x i16>
817 %23 = zext <8 x i8> %15 to <8 x i16>
818 %24 = shl nuw nsw <8 x i16> %22, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
819 %25 = add nuw nsw <8 x i16> %23, %21
820 %26 = add nuw nsw <8 x i16> %25, %24
821 %27 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %26, i32 2)
; Store row 0 at %0 and row 1 at %0 + %1.
822 %28 = bitcast <8 x i8> %20 to <2 x i32>
823 %29 = extractelement <2 x i32> %28, i64 0
824 store i32 %29, ptr %0, align 4
825 %30 = bitcast <8 x i8> %27 to <2 x i32>
826 %31 = getelementptr inbounds i8, ptr %0, i64 %1
827 %32 = extractelement <2 x i32> %30, i64 0
828 store i32 %32, ptr %31, align 4
; Rows 2 and 3: pair lane 0 of each result with the splat, then take bytes 1..4.
829 %33 = bitcast <8 x i8> %19 to <2 x i32>
830 %34 = shufflevector <2 x i32> %28, <2 x i32> %33, <2 x i32> <i32 0, i32 2>
831 %35 = bitcast <2 x i32> %34 to <8 x i8>
832 %36 = shufflevector <2 x i32> %30, <2 x i32> %33, <2 x i32> <i32 0, i32 2>
833 %37 = bitcast <2 x i32> %36 to <8 x i8>
834 %38 = shufflevector <8 x i8> %35, <8 x i8> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 undef, i32 undef>
835 %39 = bitcast <8 x i8> %38 to <2 x i32>
836 %40 = shl nsw i64 %1, 1
837 %41 = getelementptr inbounds i8, ptr %0, i64 %40
838 %42 = extractelement <2 x i32> %39, i64 0
839 store i32 %42, ptr %41, align 4
840 %43 = shufflevector <8 x i8> %37, <8 x i8> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 undef, i32 undef>
841 %44 = bitcast <8 x i8> %43 to <2 x i32>
842 %45 = mul nsw i64 %1, 3
843 %46 = getelementptr inbounds i8, ptr %0, i64 %45
844 %47 = extractelement <2 x i32> %44, i64 0
845 store i32 %47, ptr %46, align 4
; Variant of the 4x4 predictor that loads four overlapping 32-bit words from %2 (+0, +1, +2, +3
; bytes) and uses each only as a vector (lane 0 of a <2 x i32> whose lane 1 is zero). Four 32-bit
; rows are stored at %0 + k * %1 for k = 0..3. %3 is not referenced.
; The checks expect every word to be loaded straight into an s register (`ldr s0`,
; `ldur s1` / `s2` / `s3`), with no `mov v.s[0], w` lane insert in the sequence.
849 define void @predictor_4x4_neon_new(ptr nocapture noundef writeonly %0, i64 noundef %1, ptr nocapture noundef readonly %2, ptr nocapture noundef readnone %3) {
850 ; CHECK-LABEL: predictor_4x4_neon_new:
852 ; CHECK-NEXT: ldr s0, [x2]
853 ; CHECK-NEXT: ldur s1, [x2, #1]
854 ; CHECK-NEXT: lsl x8, x1, #1
855 ; CHECK-NEXT: ldur s2, [x2, #2]
856 ; CHECK-NEXT: ldur s3, [x2, #3]
857 ; CHECK-NEXT: uaddl v4.8h, v1.8b, v0.8b
858 ; CHECK-NEXT: urhadd v0.8b, v0.8b, v1.8b
859 ; CHECK-NEXT: add x9, x8, x1
860 ; CHECK-NEXT: uaddl v5.8h, v2.8b, v1.8b
861 ; CHECK-NEXT: uaddl v3.8h, v3.8b, v2.8b
862 ; CHECK-NEXT: urhadd v1.8b, v1.8b, v2.8b
863 ; CHECK-NEXT: str s0, [x0]
864 ; CHECK-NEXT: add v4.8h, v4.8h, v5.8h
865 ; CHECK-NEXT: add v3.8h, v3.8h, v5.8h
866 ; CHECK-NEXT: rshrn v4.8b, v4.8h, #2
867 ; CHECK-NEXT: rshrn v0.8b, v3.8h, #2
868 ; CHECK-NEXT: str s4, [x0, x1]
869 ; CHECK-NEXT: str s1, [x0, x8]
870 ; CHECK-NEXT: str s0, [x0, x9]
; Four overlapping word loads, each inserted into lane 0 of <i32 poison, i32 0>.
872 %5 = load i32, ptr %2, align 4
873 %6 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %5, i64 0
874 %7 = bitcast <2 x i32> %6 to <8 x i8>
875 %8 = getelementptr inbounds i8, ptr %2, i64 1
876 %9 = load i32, ptr %8, align 4
877 %10 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %9, i64 0
878 %11 = bitcast <2 x i32> %10 to <8 x i8>
879 %12 = getelementptr inbounds i8, ptr %2, i64 2
880 %13 = load i32, ptr %12, align 4
881 %14 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %13, i64 0
882 %15 = bitcast <2 x i32> %14 to <8 x i8>
883 %16 = getelementptr inbounds i8, ptr %2, i64 3
884 %17 = load i32, ptr %16, align 4
885 %18 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %17, i64 0
886 %19 = bitcast <2 x i32> %18 to <8 x i8>
; Rows 0 and 2: urhadd of adjacent word pairs (+0,+1) and (+1,+2).
887 %20 = tail call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %7, <8 x i8> %11)
888 %21 = tail call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %11, <8 x i8> %15)
; Rows 1 and 3: widened sums of three adjacent words with the middle one counted twice,
; narrowed with rshrn by 2.
889 %22 = zext <8 x i8> %7 to <8 x i16>
890 %23 = zext <8 x i8> %11 to <8 x i16>
891 %24 = add nuw nsw <8 x i16> %23, %22
892 %25 = zext <8 x i8> %15 to <8 x i16>
893 %26 = add nuw nsw <8 x i16> %25, %23
894 %27 = add nuw nsw <8 x i16> %24, %26
895 %28 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %27, i32 2)
896 %29 = zext <8 x i8> %19 to <8 x i16>
897 %30 = add nuw nsw <8 x i16> %29, %25
898 %31 = add nuw nsw <8 x i16> %30, %26
899 %32 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %31, i32 2)
; Store lane 0 (as i32) of each result at %0, %0 + %1, %0 + 2 * %1 and %0 + 3 * %1.
900 %33 = bitcast <8 x i8> %20 to <2 x i32>
901 %34 = extractelement <2 x i32> %33, i64 0
902 store i32 %34, ptr %0, align 4
903 %35 = bitcast <8 x i8> %28 to <2 x i32>
904 %36 = getelementptr inbounds i8, ptr %0, i64 %1
905 %37 = extractelement <2 x i32> %35, i64 0
906 store i32 %37, ptr %36, align 4
907 %38 = bitcast <8 x i8> %21 to <2 x i32>
908 %39 = shl nsw i64 %1, 1
909 %40 = getelementptr inbounds i8, ptr %0, i64 %39
910 %41 = extractelement <2 x i32> %38, i64 0
911 store i32 %41, ptr %40, align 4
912 %42 = bitcast <8 x i8> %32 to <2 x i32>
913 %43 = mul nsw i64 %1, 3
914 %44 = getelementptr inbounds i8, ptr %0, i64 %43
915 %45 = extractelement <2 x i32> %42, i64 0
916 store i32 %45, ptr %44, align 4
; Inserts the scalar i8 %l into lane 0 of an all-zero <vscale x 8 x i8> and returns it.
; The checks expect a GPR load (`ldrb w8`) merged into a zeroed z0.h under a `vl1` predicate.
921 define <vscale x 8 x i8> @loadnxv8i8(ptr %p) {
922 ; CHECK-LABEL: loadnxv8i8:
924 ; CHECK-NEXT: mov z0.h, #0 // =0x0
925 ; CHECK-NEXT: ldrb w8, [x0]
926 ; CHECK-NEXT: ptrue p0.h, vl1
927 ; CHECK-NEXT: mov z0.h, p0/m, w8
930 %v = insertelement <vscale x 8 x i8> zeroinitializer, i8 %l, i32 0
931 ret <vscale x 8 x i8> %v
; Inserts the scalar i8 %l into lane 0 of an all-zero <vscale x 16 x i8> and returns it.
; The check expects the scalar SIMD&FP load `ldr b0, [x0]`.
934 define <vscale x 16 x i8> @loadnxv16i8(ptr %p) {
935 ; CHECK-LABEL: loadnxv16i8:
937 ; CHECK-NEXT: ldr b0, [x0]
940 %v = insertelement <vscale x 16 x i8> zeroinitializer, i8 %l, i32 0
941 ret <vscale x 16 x i8> %v
; Loads an i16 from %p and inserts it into lane 0 of an all-zero <vscale x 4 x i16>.
; The checks expect a GPR load (`ldrh w8`) merged into a zeroed z0.s under a `vl1` predicate.
944 define <vscale x 4 x i16> @loadnxv4i16(ptr %p) {
945 ; CHECK-LABEL: loadnxv4i16:
947 ; CHECK-NEXT: mov z0.s, #0 // =0x0
948 ; CHECK-NEXT: ldrh w8, [x0]
949 ; CHECK-NEXT: ptrue p0.s, vl1
950 ; CHECK-NEXT: mov z0.s, p0/m, w8
952 %l = load i16, ptr %p
953 %v = insertelement <vscale x 4 x i16> zeroinitializer, i16 %l, i32 0
954 ret <vscale x 4 x i16> %v
; Loads an i16 from %p and inserts it into lane 0 of an all-zero <vscale x 8 x i16>.
; The check expects the scalar SIMD&FP load `ldr h0, [x0]`.
957 define <vscale x 8 x i16> @loadnxv8i16(ptr %p) {
958 ; CHECK-LABEL: loadnxv8i16:
960 ; CHECK-NEXT: ldr h0, [x0]
962 %l = load i16, ptr %p
963 %v = insertelement <vscale x 8 x i16> zeroinitializer, i16 %l, i32 0
964 ret <vscale x 8 x i16> %v
; Loads an i32 from %p and inserts it into lane 0 of an all-zero <vscale x 2 x i32>.
; The checks expect a GPR load (`ldr w8`) merged into a zeroed z0.d under a `vl1` predicate.
967 define <vscale x 2 x i32> @loadnxv2i32(ptr %p) {
968 ; CHECK-LABEL: loadnxv2i32:
970 ; CHECK-NEXT: mov z0.d, #0 // =0x0
971 ; CHECK-NEXT: ldr w8, [x0]
972 ; CHECK-NEXT: ptrue p0.d, vl1
973 ; CHECK-NEXT: mov z0.d, p0/m, x8
975 %l = load i32, ptr %p
976 %v = insertelement <vscale x 2 x i32> zeroinitializer, i32 %l, i32 0
977 ret <vscale x 2 x i32> %v
; Loads an i32 from %p and inserts it into lane 0 of an all-zero <vscale x 4 x i32>.
; The check expects the scalar SIMD&FP load `ldr s0, [x0]`.
980 define <vscale x 4 x i32> @loadnxv4i32(ptr %p) {
981 ; CHECK-LABEL: loadnxv4i32:
983 ; CHECK-NEXT: ldr s0, [x0]
985 %l = load i32, ptr %p
986 %v = insertelement <vscale x 4 x i32> zeroinitializer, i32 %l, i32 0
987 ret <vscale x 4 x i32> %v
; Loads an i64 from %p and inserts it into lane 0 of an all-zero <vscale x 2 x i64>.
; The check expects the scalar SIMD&FP load `ldr d0, [x0]`.
990 define <vscale x 2 x i64> @loadnxv2i64(ptr %p) {
991 ; CHECK-LABEL: loadnxv2i64:
993 ; CHECK-NEXT: ldr d0, [x0]
995 %l = load i64, ptr %p
996 %v = insertelement <vscale x 2 x i64> zeroinitializer, i64 %l, i32 0
997 ret <vscale x 2 x i64> %v
; Loads a half from %p and inserts it into lane 0 of an all-zero <vscale x 4 x half>.
; The checks expect a predicate built with `index` + `cmpeq`, then `ldr h1` merged into a
; zeroed z0.h with a predicated `mov`.
1001 define <vscale x 4 x half> @loadnxv4f16(ptr %p) {
1002 ; CHECK-LABEL: loadnxv4f16:
1004 ; CHECK-NEXT: mov w8, wzr
1005 ; CHECK-NEXT: index z0.s, #0, #1
1006 ; CHECK-NEXT: ptrue p0.s
1007 ; CHECK-NEXT: mov z1.s, w8
1008 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s
1009 ; CHECK-NEXT: mov z0.h, #0 // =0x0
1010 ; CHECK-NEXT: ldr h1, [x0]
1011 ; CHECK-NEXT: mov z0.h, p0/m, h1
1013 %l = load half, ptr %p
1014 %v = insertelement <vscale x 4 x half> zeroinitializer, half %l, i32 0
1015 ret <vscale x 4 x half> %v
; Loads a half from %p and inserts it into lane 0 of an all-zero <vscale x 8 x half>.
; The check expects the scalar SIMD&FP load `ldr h0, [x0]`.
1018 define <vscale x 8 x half> @loadnxv8f16(ptr %p) {
1019 ; CHECK-LABEL: loadnxv8f16:
1021 ; CHECK-NEXT: ldr h0, [x0]
1023 %l = load half, ptr %p
1024 %v = insertelement <vscale x 8 x half> zeroinitializer, half %l, i32 0
1025 ret <vscale x 8 x half> %v
; Loads a bfloat from %p and inserts it into lane 0 of an all-zero <vscale x 4 x bfloat>.
; The checks expect a predicate built with `index` + `cmpeq`, then `ldr h1` merged into a
; zeroed z0.h with a predicated `mov`.
1028 define <vscale x 4 x bfloat> @loadnxv4bf16(ptr %p) {
1029 ; CHECK-LABEL: loadnxv4bf16:
1031 ; CHECK-NEXT: mov w8, wzr
1032 ; CHECK-NEXT: index z0.s, #0, #1
1033 ; CHECK-NEXT: ptrue p0.s
1034 ; CHECK-NEXT: mov z1.s, w8
1035 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s
1036 ; CHECK-NEXT: mov z0.h, #0 // =0x0
1037 ; CHECK-NEXT: ldr h1, [x0]
1038 ; CHECK-NEXT: mov z0.h, p0/m, h1
1040 %l = load bfloat, ptr %p
1041 %v = insertelement <vscale x 4 x bfloat> zeroinitializer, bfloat %l, i32 0
1042 ret <vscale x 4 x bfloat> %v
; Loads a bfloat from %p and inserts it into lane 0 of an all-zero <vscale x 8 x bfloat>.
; The check expects the scalar SIMD&FP load `ldr h0, [x0]`.
1045 define <vscale x 8 x bfloat> @loadnxv8bf16(ptr %p) {
1046 ; CHECK-LABEL: loadnxv8bf16:
1048 ; CHECK-NEXT: ldr h0, [x0]
1050 %l = load bfloat, ptr %p
1051 %v = insertelement <vscale x 8 x bfloat> zeroinitializer, bfloat %l, i32 0
1052 ret <vscale x 8 x bfloat> %v
; Loads a float from %p and inserts it into lane 0 of an all-zero <vscale x 2 x float>.
; The checks expect a predicate built with `index` + `cmpeq`, then `ldr s1` merged into a
; zeroed z0.s with a predicated `mov`.
1055 define <vscale x 2 x float> @loadnxv2f32(ptr %p) {
1056 ; CHECK-LABEL: loadnxv2f32:
1058 ; CHECK-NEXT: mov x8, xzr
1059 ; CHECK-NEXT: index z0.d, #0, #1
1060 ; CHECK-NEXT: ptrue p0.d
1061 ; CHECK-NEXT: mov z1.d, x8
1062 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d
1063 ; CHECK-NEXT: mov z0.s, #0 // =0x0
1064 ; CHECK-NEXT: ldr s1, [x0]
1065 ; CHECK-NEXT: mov z0.s, p0/m, s1
1067 %l = load float, ptr %p
1068 %v = insertelement <vscale x 2 x float> zeroinitializer, float %l, i32 0
1069 ret <vscale x 2 x float> %v
; Loads a float from %p and inserts it into lane 0 of an all-zero <vscale x 4 x float>.
; The check expects the scalar SIMD&FP load `ldr s0, [x0]`.
1072 define <vscale x 4 x float> @loadnxv4f32(ptr %p) {
1073 ; CHECK-LABEL: loadnxv4f32:
1075 ; CHECK-NEXT: ldr s0, [x0]
1077 %l = load float, ptr %p
1078 %v = insertelement <vscale x 4 x float> zeroinitializer, float %l, i32 0
1079 ret <vscale x 4 x float> %v
; Loads a double from %p and inserts it into lane 0 of an all-zero <vscale x 2 x double>.
; The check expects the scalar SIMD&FP load `ldr d0, [x0]`.
1082 define <vscale x 2 x double> @loadnxv2f64(ptr %p) {
1083 ; CHECK-LABEL: loadnxv2f64:
1085 ; CHECK-NEXT: ldr d0, [x0]
1087 %l = load double, ptr %p
1088 %v = insertelement <vscale x 2 x double> zeroinitializer, double %l, i32 0
1089 ret <vscale x 2 x double> %v
; Loads an i8 from %p + 1 byte and inserts it into lane 0 of an all-zero <vscale x 8 x i8>.
; The checks expect a GPR load (`ldrb w8, [x0, #1]`) merged into a zeroed z0.h under `vl1`.
1095 define <vscale x 8 x i8> @loadnxv8i8_offset(ptr %p) {
1096 ; CHECK-LABEL: loadnxv8i8_offset:
1098 ; CHECK-NEXT: mov z0.h, #0 // =0x0
1099 ; CHECK-NEXT: ldrb w8, [x0, #1]
1100 ; CHECK-NEXT: ptrue p0.h, vl1
1101 ; CHECK-NEXT: mov z0.h, p0/m, w8
1103 %g = getelementptr inbounds i8, ptr %p, i64 1
1104 %l = load i8, ptr %g
1105 %v = insertelement <vscale x 8 x i8> zeroinitializer, i8 %l, i32 0
1106 ret <vscale x 8 x i8> %v
; Loads an i8 from %p + 1 byte and inserts it into lane 0 of an all-zero <vscale x 16 x i8>.
; The check expects the immediate-offset form `ldr b0, [x0, #1]`.
1109 define <vscale x 16 x i8> @loadnxv16i8_offset(ptr %p) {
1110 ; CHECK-LABEL: loadnxv16i8_offset:
1112 ; CHECK-NEXT: ldr b0, [x0, #1]
1114 %g = getelementptr inbounds i8, ptr %p, i64 1
1115 %l = load i8, ptr %g
1116 %v = insertelement <vscale x 16 x i8> zeroinitializer, i8 %l, i32 0
1117 ret <vscale x 16 x i8> %v
; Loads an i16 from %p + 1 byte and inserts it into lane 0 of an all-zero <vscale x 4 x i16>.
; The checks expect a GPR load (`ldurh w8, [x0, #1]`) merged into a zeroed z0.s under `vl1`.
1120 define <vscale x 4 x i16> @loadnxv4i16_offset(ptr %p) {
1121 ; CHECK-LABEL: loadnxv4i16_offset:
1123 ; CHECK-NEXT: mov z0.s, #0 // =0x0
1124 ; CHECK-NEXT: ldurh w8, [x0, #1]
1125 ; CHECK-NEXT: ptrue p0.s, vl1
1126 ; CHECK-NEXT: mov z0.s, p0/m, w8
1128 %g = getelementptr inbounds i8, ptr %p, i64 1
1129 %l = load i16, ptr %g
1130 %v = insertelement <vscale x 4 x i16> zeroinitializer, i16 %l, i32 0
1131 ret <vscale x 4 x i16> %v
; Loads an i16 from %p + 1 byte and inserts it into lane 0 of an all-zero <vscale x 8 x i16>.
; The check expects the unscaled-offset load `ldur h0, [x0, #1]`.
1134 define <vscale x 8 x i16> @loadnxv8i16_offset(ptr %p) {
1135 ; CHECK-LABEL: loadnxv8i16_offset:
1137 ; CHECK-NEXT: ldur h0, [x0, #1]
1139 %g = getelementptr inbounds i8, ptr %p, i64 1
1140 %l = load i16, ptr %g
1141 %v = insertelement <vscale x 8 x i16> zeroinitializer, i16 %l, i32 0
1142 ret <vscale x 8 x i16> %v
; Loads an i32 from %p + 1 byte and inserts it into lane 0 of an all-zero <vscale x 2 x i32>.
; The checks expect a GPR load (`ldur w8, [x0, #1]`) merged into a zeroed z0.d under `vl1`.
1145 define <vscale x 2 x i32> @loadnxv2i32_offset(ptr %p) {
1146 ; CHECK-LABEL: loadnxv2i32_offset:
1148 ; CHECK-NEXT: mov z0.d, #0 // =0x0
1149 ; CHECK-NEXT: ldur w8, [x0, #1]
1150 ; CHECK-NEXT: ptrue p0.d, vl1
1151 ; CHECK-NEXT: mov z0.d, p0/m, x8
1153 %g = getelementptr inbounds i8, ptr %p, i64 1
1154 %l = load i32, ptr %g
1155 %v = insertelement <vscale x 2 x i32> zeroinitializer, i32 %l, i32 0
1156 ret <vscale x 2 x i32> %v
; Loads an i32 from %p + 1 byte and inserts it into lane 0 of an all-zero <vscale x 4 x i32>.
; The check expects the unscaled-offset load `ldur s0, [x0, #1]`.
1159 define <vscale x 4 x i32> @loadnxv4i32_offset(ptr %p) {
1160 ; CHECK-LABEL: loadnxv4i32_offset:
1162 ; CHECK-NEXT: ldur s0, [x0, #1]
1164 %g = getelementptr inbounds i8, ptr %p, i64 1
1165 %l = load i32, ptr %g
1166 %v = insertelement <vscale x 4 x i32> zeroinitializer, i32 %l, i32 0
1167 ret <vscale x 4 x i32> %v
; Loads an i64 from %p + 1 byte and inserts it into lane 0 of an all-zero <vscale x 2 x i64>.
; The check expects the unscaled-offset load `ldur d0, [x0, #1]`.
1170 define <vscale x 2 x i64> @loadnxv2i64_offset(ptr %p) {
1171 ; CHECK-LABEL: loadnxv2i64_offset:
1173 ; CHECK-NEXT: ldur d0, [x0, #1]
1175 %g = getelementptr inbounds i8, ptr %p, i64 1
1176 %l = load i64, ptr %g
1177 %v = insertelement <vscale x 2 x i64> zeroinitializer, i64 %l, i32 0
1178 ret <vscale x 2 x i64> %v
; Loads a half from %p + 1 byte and inserts it into lane 0 of an all-zero <vscale x 4 x half>.
; The checks expect a predicate built with `index` + `cmpeq`, then `ldur h1, [x0, #1]` merged
; into a zeroed z0.h with a predicated `mov`.
1182 define <vscale x 4 x half> @loadnxv4f16_offset(ptr %p) {
1183 ; CHECK-LABEL: loadnxv4f16_offset:
1185 ; CHECK-NEXT: mov w8, wzr
1186 ; CHECK-NEXT: index z0.s, #0, #1
1187 ; CHECK-NEXT: ptrue p0.s
1188 ; CHECK-NEXT: mov z1.s, w8
1189 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s
1190 ; CHECK-NEXT: mov z0.h, #0 // =0x0
1191 ; CHECK-NEXT: ldur h1, [x0, #1]
1192 ; CHECK-NEXT: mov z0.h, p0/m, h1
1194 %g = getelementptr inbounds i8, ptr %p, i64 1
1195 %l = load half, ptr %g
1196 %v = insertelement <vscale x 4 x half> zeroinitializer, half %l, i32 0
1197 ret <vscale x 4 x half> %v
; Loads a half from %p + 1 byte and inserts it into lane 0 of an all-zero <vscale x 8 x half>.
; The check expects the unscaled-offset load `ldur h0, [x0, #1]`.
1200 define <vscale x 8 x half> @loadnxv8f16_offset(ptr %p) {
1201 ; CHECK-LABEL: loadnxv8f16_offset:
1203 ; CHECK-NEXT: ldur h0, [x0, #1]
1205 %g = getelementptr inbounds i8, ptr %p, i64 1
1206 %l = load half, ptr %g
1207 %v = insertelement <vscale x 8 x half> zeroinitializer, half %l, i32 0
1208 ret <vscale x 8 x half> %v
; Loads a bfloat from %p + 1 byte and inserts it into lane 0 of an all-zero <vscale x 4 x bfloat>.
; The checks expect a predicate built with `index` + `cmpeq`, then `ldur h1, [x0, #1]` merged
; into a zeroed z0.h with a predicated `mov`.
1211 define <vscale x 4 x bfloat> @loadnxv4bf16_offset(ptr %p) {
1212 ; CHECK-LABEL: loadnxv4bf16_offset:
1214 ; CHECK-NEXT: mov w8, wzr
1215 ; CHECK-NEXT: index z0.s, #0, #1
1216 ; CHECK-NEXT: ptrue p0.s
1217 ; CHECK-NEXT: mov z1.s, w8
1218 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s
1219 ; CHECK-NEXT: mov z0.h, #0 // =0x0
1220 ; CHECK-NEXT: ldur h1, [x0, #1]
1221 ; CHECK-NEXT: mov z0.h, p0/m, h1
1223 %g = getelementptr inbounds i8, ptr %p, i64 1
1224 %l = load bfloat, ptr %g
1225 %v = insertelement <vscale x 4 x bfloat> zeroinitializer, bfloat %l, i32 0
1226 ret <vscale x 4 x bfloat> %v
; Loads a bfloat from %p + 1 byte and inserts it into lane 0 of an all-zero <vscale x 8 x bfloat>.
; The check expects the unscaled-offset load `ldur h0, [x0, #1]`.
1229 define <vscale x 8 x bfloat> @loadnxv8bf16_offset(ptr %p) {
1230 ; CHECK-LABEL: loadnxv8bf16_offset:
1232 ; CHECK-NEXT: ldur h0, [x0, #1]
1234 %g = getelementptr inbounds i8, ptr %p, i64 1
1235 %l = load bfloat, ptr %g
1236 %v = insertelement <vscale x 8 x bfloat> zeroinitializer, bfloat %l, i32 0
1237 ret <vscale x 8 x bfloat> %v
; Loads a float from %p + 1 byte and inserts it into lane 0 of an all-zero <vscale x 2 x float>.
; The checks expect a predicate built with `index` + `cmpeq`, then `ldur s1, [x0, #1]` merged
; into a zeroed z0.s with a predicated `mov`.
1240 define <vscale x 2 x float> @loadnxv2f32_offset(ptr %p) {
1241 ; CHECK-LABEL: loadnxv2f32_offset:
1243 ; CHECK-NEXT: mov x8, xzr
1244 ; CHECK-NEXT: index z0.d, #0, #1
1245 ; CHECK-NEXT: ptrue p0.d
1246 ; CHECK-NEXT: mov z1.d, x8
1247 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d
1248 ; CHECK-NEXT: mov z0.s, #0 // =0x0
1249 ; CHECK-NEXT: ldur s1, [x0, #1]
1250 ; CHECK-NEXT: mov z0.s, p0/m, s1
1252 %g = getelementptr inbounds i8, ptr %p, i64 1
1253 %l = load float, ptr %g
1254 %v = insertelement <vscale x 2 x float> zeroinitializer, float %l, i32 0
1255 ret <vscale x 2 x float> %v
; Loads a float from %p + 1 byte and inserts it into lane 0 of an all-zero <vscale x 4 x float>.
; The check expects the unscaled-offset load `ldur s0, [x0, #1]`.
1258 define <vscale x 4 x float> @loadnxv4f32_offset(ptr %p) {
1259 ; CHECK-LABEL: loadnxv4f32_offset:
1261 ; CHECK-NEXT: ldur s0, [x0, #1]
1263 %g = getelementptr inbounds i8, ptr %p, i64 1
1264 %l = load float, ptr %g
1265 %v = insertelement <vscale x 4 x float> zeroinitializer, float %l, i32 0
1266 ret <vscale x 4 x float> %v
; Loads a double from %p + 1 byte and inserts it into lane 0 of an all-zero <vscale x 2 x double>.
; The check expects the unscaled-offset load `ldur d0, [x0, #1]`.
1269 define <vscale x 2 x double> @loadnxv2f64_offset(ptr %p) {
1270 ; CHECK-LABEL: loadnxv2f64_offset:
1272 ; CHECK-NEXT: ldur d0, [x0, #1]
1274 %g = getelementptr inbounds i8, ptr %p, i64 1
1275 %l = load double, ptr %g
1276 %v = insertelement <vscale x 2 x double> zeroinitializer, double %l, i32 0
1277 ret <vscale x 2 x double> %v
; Declarations of the NEON intrinsics called by predictor_4x4_neon and predictor_4x4_neon_new.
; NOTE(review): both reference attribute group #1, whose definition is not visible here; confirm
; that it exists in the file.
1281 declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) #1
1282 declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) #1