1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+bf16,+sve | FileCheck %s
; Inserts the scalar %l into lane 0 of an all-zero <8 x i8>; the expected
; codegen is `ldr b0, [x0]`.
; NOTE(review): the definition of %l is not visible in this excerpt --
; presumably an i8 load from %p; confirm against the full file.
4 define <8 x i8> @loadv8i8(ptr %p) {
5 ; CHECK-LABEL: loadv8i8:
7 ; CHECK-NEXT: ldr b0, [x0]
10 %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
; Inserts the scalar %l into lane 0 of an all-zero <16 x i8>; the expected
; codegen is `ldr b0, [x0]`.
; NOTE(review): the definition of %l is not visible in this excerpt --
; presumably an i8 load from %p; confirm against the full file.
14 define <16 x i8> @loadv16i8(ptr %p) {
15 ; CHECK-LABEL: loadv16i8:
17 ; CHECK-NEXT: ldr b0, [x0]
20 %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
; Inserts the scalar %l into lane 0 of an all-zero <4 x i16>; the expected
; codegen is `ldr h0, [x0]`.
; NOTE(review): the definition of %l is not visible in this excerpt --
; presumably an i16 load from %p; confirm against the full file.
24 define <4 x i16> @loadv4i16(ptr %p) {
25 ; CHECK-LABEL: loadv4i16:
27 ; CHECK-NEXT: ldr h0, [x0]
30 %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
; Inserts the scalar %l into lane 0 of an all-zero <8 x i16>; the expected
; codegen is `ldr h0, [x0]`.
; NOTE(review): the definition of %l is not visible in this excerpt --
; presumably an i16 load from %p; confirm against the full file.
34 define <8 x i16> @loadv8i16(ptr %p) {
35 ; CHECK-LABEL: loadv8i16:
37 ; CHECK-NEXT: ldr h0, [x0]
40 %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
; Inserts the scalar %l into lane 0 of an all-zero <2 x i32>; the expected
; codegen is `ldr s0, [x0]`.
; NOTE(review): the definition of %l is not visible in this excerpt --
; presumably an i32 load from %p; confirm against the full file.
44 define <2 x i32> @loadv2i32(ptr %p) {
45 ; CHECK-LABEL: loadv2i32:
47 ; CHECK-NEXT: ldr s0, [x0]
50 %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
; Inserts the scalar %l into lane 0 of an all-zero <4 x i32>; the expected
; codegen is `ldr s0, [x0]`.
; NOTE(review): the definition of %l is not visible in this excerpt --
; presumably an i32 load from %p; confirm against the full file.
54 define <4 x i32> @loadv4i32(ptr %p) {
55 ; CHECK-LABEL: loadv4i32:
57 ; CHECK-NEXT: ldr s0, [x0]
60 %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
; Inserts the scalar %l into lane 0 of an all-zero <2 x i64>; the expected
; codegen is `ldr d0, [x0]`.
; NOTE(review): the definition of %l is not visible in this excerpt --
; presumably an i64 load from %p; confirm against the full file.
64 define <2 x i64> @loadv2i64(ptr %p) {
65 ; CHECK-LABEL: loadv2i64:
67 ; CHECK-NEXT: ldr d0, [x0]
70 %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
; Loads a half from %p and inserts it into lane 0 of an all-zero <4 x half>;
; the expected codegen is `ldr h0, [x0]`.
75 define <4 x half> @loadv4f16(ptr %p) {
76 ; CHECK-LABEL: loadv4f16:
78 ; CHECK-NEXT: ldr h0, [x0]
80 %l = load half, ptr %p
81 %v = insertelement <4 x half> zeroinitializer, half %l, i32 0
; Loads a half from %p and inserts it into lane 0 of an all-zero <8 x half>;
; the expected codegen is `ldr h0, [x0]`.
85 define <8 x half> @loadv8f16(ptr %p) {
86 ; CHECK-LABEL: loadv8f16:
88 ; CHECK-NEXT: ldr h0, [x0]
90 %l = load half, ptr %p
91 %v = insertelement <8 x half> zeroinitializer, half %l, i32 0
; Loads a bfloat from %p and inserts it into lane 0 of an all-zero
; <4 x bfloat>; the expected codegen is `ldr h0, [x0]`.
95 define <4 x bfloat> @loadv4bf16(ptr %p) {
96 ; CHECK-LABEL: loadv4bf16:
98 ; CHECK-NEXT: ldr h0, [x0]
100 %l = load bfloat, ptr %p
101 %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
; Loads a bfloat from %p and inserts it into lane 0 of an all-zero
; <8 x bfloat>; the expected codegen is `ldr h0, [x0]`.
105 define <8 x bfloat> @loadv8bf16(ptr %p) {
106 ; CHECK-LABEL: loadv8bf16:
108 ; CHECK-NEXT: ldr h0, [x0]
110 %l = load bfloat, ptr %p
111 %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
; Loads a float from %p and inserts it into lane 0 of an all-zero <2 x float>;
; the expected codegen is `ldr s0, [x0]`.
115 define <2 x float> @loadv2f32(ptr %p) {
116 ; CHECK-LABEL: loadv2f32:
118 ; CHECK-NEXT: ldr s0, [x0]
120 %l = load float, ptr %p
121 %v = insertelement <2 x float> zeroinitializer, float %l, i32 0
; Loads a float from %p and inserts it into lane 0 of an all-zero <4 x float>;
; the expected codegen is `ldr s0, [x0]`.
125 define <4 x float> @loadv4f32(ptr %p) {
126 ; CHECK-LABEL: loadv4f32:
128 ; CHECK-NEXT: ldr s0, [x0]
130 %l = load float, ptr %p
131 %v = insertelement <4 x float> zeroinitializer, float %l, i32 0
; Loads a double from %p and inserts it into lane 0 of an all-zero
; <2 x double>; the expected codegen is `ldr d0, [x0]`.
135 define <2 x double> @loadv2f64(ptr %p) {
136 ; CHECK-LABEL: loadv2f64:
138 ; CHECK-NEXT: ldr d0, [x0]
140 %l = load double, ptr %p
141 %v = insertelement <2 x double> zeroinitializer, double %l, i32 0
; Forms %g = %p + 1 byte and inserts the scalar %l into lane 0 of an all-zero
; <8 x i8>; the expected codegen is `ldr b0, [x0, #1]`.
; NOTE(review): the definition of %l is not visible in this excerpt --
; presumably an i8 load from %g; confirm against the full file.
148 define <8 x i8> @loadv8i8_offset(ptr %p) {
149 ; CHECK-LABEL: loadv8i8_offset:
151 ; CHECK-NEXT: ldr b0, [x0, #1]
153 %g = getelementptr inbounds i8, ptr %p, i64 1
155 %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
; Forms %g = %p + 1 byte and inserts the scalar %l into lane 0 of an all-zero
; <16 x i8>; the expected codegen is `ldr b0, [x0, #1]`.
; NOTE(review): the definition of %l is not visible in this excerpt --
; presumably an i8 load from %g; confirm against the full file.
159 define <16 x i8> @loadv16i8_offset(ptr %p) {
160 ; CHECK-LABEL: loadv16i8_offset:
162 ; CHECK-NEXT: ldr b0, [x0, #1]
164 %g = getelementptr inbounds i8, ptr %p, i64 1
166 %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
; Loads an i16 from %p + 1 byte and inserts it into lane 0 of an all-zero
; <4 x i16>; the expected codegen is the unscaled-offset form
; `ldur h0, [x0, #1]`.
170 define <4 x i16> @loadv4i16_offset(ptr %p) {
171 ; CHECK-LABEL: loadv4i16_offset:
173 ; CHECK-NEXT: ldur h0, [x0, #1]
175 %g = getelementptr inbounds i8, ptr %p, i64 1
176 %l = load i16, ptr %g
177 %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
; Loads an i16 from %p + 1 byte and inserts it into lane 0 of an all-zero
; <8 x i16>; the expected codegen is the unscaled-offset form
; `ldur h0, [x0, #1]`.
181 define <8 x i16> @loadv8i16_offset(ptr %p) {
182 ; CHECK-LABEL: loadv8i16_offset:
184 ; CHECK-NEXT: ldur h0, [x0, #1]
186 %g = getelementptr inbounds i8, ptr %p, i64 1
187 %l = load i16, ptr %g
188 %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
; Loads an i32 from %p + 1 byte and inserts it into lane 0 of an all-zero
; <2 x i32>; the expected codegen is the unscaled-offset form
; `ldur s0, [x0, #1]`.
192 define <2 x i32> @loadv2i32_offset(ptr %p) {
193 ; CHECK-LABEL: loadv2i32_offset:
195 ; CHECK-NEXT: ldur s0, [x0, #1]
197 %g = getelementptr inbounds i8, ptr %p, i64 1
198 %l = load i32, ptr %g
199 %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
; Loads an i32 from %p + 1 byte and inserts it into lane 0 of an all-zero
; <4 x i32>; the expected codegen is the unscaled-offset form
; `ldur s0, [x0, #1]`.
203 define <4 x i32> @loadv4i32_offset(ptr %p) {
204 ; CHECK-LABEL: loadv4i32_offset:
206 ; CHECK-NEXT: ldur s0, [x0, #1]
208 %g = getelementptr inbounds i8, ptr %p, i64 1
209 %l = load i32, ptr %g
210 %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
; Loads an i64 from %p + 1 byte and inserts it into lane 0 of an all-zero
; <2 x i64>; the expected codegen is the unscaled-offset form
; `ldur d0, [x0, #1]`.
214 define <2 x i64> @loadv2i64_offset(ptr %p) {
215 ; CHECK-LABEL: loadv2i64_offset:
217 ; CHECK-NEXT: ldur d0, [x0, #1]
219 %g = getelementptr inbounds i8, ptr %p, i64 1
220 %l = load i64, ptr %g
221 %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
; Loads a half from %p + 1 byte and inserts it into lane 0 of an all-zero
; <4 x half>; the expected codegen is the unscaled-offset form
; `ldur h0, [x0, #1]`.
226 define <4 x half> @loadv4f16_offset(ptr %p) {
227 ; CHECK-LABEL: loadv4f16_offset:
229 ; CHECK-NEXT: ldur h0, [x0, #1]
231 %g = getelementptr inbounds i8, ptr %p, i64 1
232 %l = load half, ptr %g
233 %v = insertelement <4 x half> zeroinitializer, half %l, i32 0
; Loads a half from %p + 1 byte and inserts it into lane 0 of an all-zero
; <8 x half>; the expected codegen is the unscaled-offset form
; `ldur h0, [x0, #1]`.
237 define <8 x half> @loadv8f16_offset(ptr %p) {
238 ; CHECK-LABEL: loadv8f16_offset:
240 ; CHECK-NEXT: ldur h0, [x0, #1]
242 %g = getelementptr inbounds i8, ptr %p, i64 1
243 %l = load half, ptr %g
244 %v = insertelement <8 x half> zeroinitializer, half %l, i32 0
; Loads a bfloat from %p + 1 byte and inserts it into lane 0 of an all-zero
; <4 x bfloat>; the expected codegen is the unscaled-offset form
; `ldur h0, [x0, #1]`.
248 define <4 x bfloat> @loadv4bf16_offset(ptr %p) {
249 ; CHECK-LABEL: loadv4bf16_offset:
251 ; CHECK-NEXT: ldur h0, [x0, #1]
253 %g = getelementptr inbounds i8, ptr %p, i64 1
254 %l = load bfloat, ptr %g
255 %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
; Loads a bfloat from %p + 1 byte and inserts it into lane 0 of an all-zero
; <8 x bfloat>; the expected codegen is the unscaled-offset form
; `ldur h0, [x0, #1]`.
259 define <8 x bfloat> @loadv8bf16_offset(ptr %p) {
260 ; CHECK-LABEL: loadv8bf16_offset:
262 ; CHECK-NEXT: ldur h0, [x0, #1]
264 %g = getelementptr inbounds i8, ptr %p, i64 1
265 %l = load bfloat, ptr %g
266 %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
; Loads a float from %p + 1 byte and inserts it into lane 0 of an all-zero
; <2 x float>; the expected codegen is the unscaled-offset form
; `ldur s0, [x0, #1]`.
270 define <2 x float> @loadv2f32_offset(ptr %p) {
271 ; CHECK-LABEL: loadv2f32_offset:
273 ; CHECK-NEXT: ldur s0, [x0, #1]
275 %g = getelementptr inbounds i8, ptr %p, i64 1
276 %l = load float, ptr %g
277 %v = insertelement <2 x float> zeroinitializer, float %l, i32 0
; Loads a float from %p + 1 byte and inserts it into lane 0 of an all-zero
; <4 x float>; the expected codegen is the unscaled-offset form
; `ldur s0, [x0, #1]`.
281 define <4 x float> @loadv4f32_offset(ptr %p) {
282 ; CHECK-LABEL: loadv4f32_offset:
284 ; CHECK-NEXT: ldur s0, [x0, #1]
286 %g = getelementptr inbounds i8, ptr %p, i64 1
287 %l = load float, ptr %g
288 %v = insertelement <4 x float> zeroinitializer, float %l, i32 0
; Loads a double from %p + 1 byte and inserts it into lane 0 of an all-zero
; <2 x double>; the expected codegen is the unscaled-offset form
; `ldur d0, [x0, #1]`.
292 define <2 x double> @loadv2f64_offset(ptr %p) {
293 ; CHECK-LABEL: loadv2f64_offset:
295 ; CHECK-NEXT: ldur d0, [x0, #1]
297 %g = getelementptr inbounds i8, ptr %p, i64 1
298 %l = load double, ptr %g
299 %v = insertelement <2 x double> zeroinitializer, double %l, i32 0
; Forms %g = %p - 1 byte and inserts the scalar %l into lane 0 of an all-zero
; <8 x i8>; the expected codegen is `ldur b0, [x0, #-1]`.
; NOTE(review): the definition of %l is not visible in this excerpt --
; presumably an i8 load from %g; confirm against the full file.
304 define <8 x i8> @loadv8i8_noffset(ptr %p) {
305 ; CHECK-LABEL: loadv8i8_noffset:
307 ; CHECK-NEXT: ldur b0, [x0, #-1]
309 %g = getelementptr inbounds i8, ptr %p, i64 -1
311 %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
; Forms %g = %p - 1 byte and inserts the scalar %l into lane 0 of an all-zero
; <16 x i8>; the expected codegen is `ldur b0, [x0, #-1]`.
; NOTE(review): the definition of %l is not visible in this excerpt --
; presumably an i8 load from %g; confirm against the full file.
315 define <16 x i8> @loadv16i8_noffset(ptr %p) {
316 ; CHECK-LABEL: loadv16i8_noffset:
318 ; CHECK-NEXT: ldur b0, [x0, #-1]
320 %g = getelementptr inbounds i8, ptr %p, i64 -1
322 %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
; Loads an i16 from %p - 1 byte and inserts it into lane 0 of an all-zero
; <4 x i16>; the expected codegen is `ldur h0, [x0, #-1]`.
326 define <4 x i16> @loadv4i16_noffset(ptr %p) {
327 ; CHECK-LABEL: loadv4i16_noffset:
329 ; CHECK-NEXT: ldur h0, [x0, #-1]
331 %g = getelementptr inbounds i8, ptr %p, i64 -1
332 %l = load i16, ptr %g
333 %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
; Loads an i16 from %p - 1 byte and inserts it into lane 0 of an all-zero
; <8 x i16>; the expected codegen is `ldur h0, [x0, #-1]`.
337 define <8 x i16> @loadv8i16_noffset(ptr %p) {
338 ; CHECK-LABEL: loadv8i16_noffset:
340 ; CHECK-NEXT: ldur h0, [x0, #-1]
342 %g = getelementptr inbounds i8, ptr %p, i64 -1
343 %l = load i16, ptr %g
344 %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
; Loads an i32 from %p - 1 byte and inserts it into lane 0 of an all-zero
; <2 x i32>; the expected codegen is `ldur s0, [x0, #-1]`.
348 define <2 x i32> @loadv2i32_noffset(ptr %p) {
349 ; CHECK-LABEL: loadv2i32_noffset:
351 ; CHECK-NEXT: ldur s0, [x0, #-1]
353 %g = getelementptr inbounds i8, ptr %p, i64 -1
354 %l = load i32, ptr %g
355 %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
; Loads an i32 from %p - 1 byte and inserts it into lane 0 of an all-zero
; <4 x i32>; the expected codegen is `ldur s0, [x0, #-1]`.
359 define <4 x i32> @loadv4i32_noffset(ptr %p) {
360 ; CHECK-LABEL: loadv4i32_noffset:
362 ; CHECK-NEXT: ldur s0, [x0, #-1]
364 %g = getelementptr inbounds i8, ptr %p, i64 -1
365 %l = load i32, ptr %g
366 %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
; Loads an i64 from %p - 1 byte and inserts it into lane 0 of an all-zero
; <2 x i64>; the expected codegen is `ldur d0, [x0, #-1]`.
370 define <2 x i64> @loadv2i64_noffset(ptr %p) {
371 ; CHECK-LABEL: loadv2i64_noffset:
373 ; CHECK-NEXT: ldur d0, [x0, #-1]
375 %g = getelementptr inbounds i8, ptr %p, i64 -1
376 %l = load i64, ptr %g
377 %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
; Loads a half from %p - 1 byte and inserts it into lane 0 of an all-zero
; <4 x half>; the expected codegen is `ldur h0, [x0, #-1]`.
382 define <4 x half> @loadv4f16_noffset(ptr %p) {
383 ; CHECK-LABEL: loadv4f16_noffset:
385 ; CHECK-NEXT: ldur h0, [x0, #-1]
387 %g = getelementptr inbounds i8, ptr %p, i64 -1
388 %l = load half, ptr %g
389 %v = insertelement <4 x half> zeroinitializer, half %l, i32 0
; Loads a half from %p - 1 byte and inserts it into lane 0 of an all-zero
; <8 x half>; the expected codegen is `ldur h0, [x0, #-1]`.
393 define <8 x half> @loadv8f16_noffset(ptr %p) {
394 ; CHECK-LABEL: loadv8f16_noffset:
396 ; CHECK-NEXT: ldur h0, [x0, #-1]
398 %g = getelementptr inbounds i8, ptr %p, i64 -1
399 %l = load half, ptr %g
400 %v = insertelement <8 x half> zeroinitializer, half %l, i32 0
; Loads a bfloat from %p - 1 byte and inserts it into lane 0 of an all-zero
; <4 x bfloat>; the expected codegen is `ldur h0, [x0, #-1]`.
404 define <4 x bfloat> @loadv4bf16_noffset(ptr %p) {
405 ; CHECK-LABEL: loadv4bf16_noffset:
407 ; CHECK-NEXT: ldur h0, [x0, #-1]
409 %g = getelementptr inbounds i8, ptr %p, i64 -1
410 %l = load bfloat, ptr %g
411 %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
; Loads a bfloat from %p - 1 byte and inserts it into lane 0 of an all-zero
; <8 x bfloat>; the expected codegen is `ldur h0, [x0, #-1]`.
415 define <8 x bfloat> @loadv8bf16_noffset(ptr %p) {
416 ; CHECK-LABEL: loadv8bf16_noffset:
418 ; CHECK-NEXT: ldur h0, [x0, #-1]
420 %g = getelementptr inbounds i8, ptr %p, i64 -1
421 %l = load bfloat, ptr %g
422 %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
; Loads a float from %p - 1 byte and inserts it into lane 0 of an all-zero
; <2 x float>; the expected codegen is `ldur s0, [x0, #-1]`.
426 define <2 x float> @loadv2f32_noffset(ptr %p) {
427 ; CHECK-LABEL: loadv2f32_noffset:
429 ; CHECK-NEXT: ldur s0, [x0, #-1]
431 %g = getelementptr inbounds i8, ptr %p, i64 -1
432 %l = load float, ptr %g
433 %v = insertelement <2 x float> zeroinitializer, float %l, i32 0
; Loads a float from %p - 1 byte and inserts it into lane 0 of an all-zero
; <4 x float>; the expected codegen is `ldur s0, [x0, #-1]`.
437 define <4 x float> @loadv4f32_noffset(ptr %p) {
438 ; CHECK-LABEL: loadv4f32_noffset:
440 ; CHECK-NEXT: ldur s0, [x0, #-1]
442 %g = getelementptr inbounds i8, ptr %p, i64 -1
443 %l = load float, ptr %g
444 %v = insertelement <4 x float> zeroinitializer, float %l, i32 0
; Loads a double from %p - 1 byte and inserts it into lane 0 of an all-zero
; <2 x double>; the expected codegen is `ldur d0, [x0, #-1]`.
448 define <2 x double> @loadv2f64_noffset(ptr %p) {
449 ; CHECK-LABEL: loadv2f64_noffset:
451 ; CHECK-NEXT: ldur d0, [x0, #-1]
453 %g = getelementptr inbounds i8, ptr %p, i64 -1
454 %l = load double, ptr %g
455 %v = insertelement <2 x double> zeroinitializer, double %l, i32 0
; Reads three i32 values from %2 at byte offsets 0, 1 and 2, places each in
; lane 0 of a <2 x i32> whose lane 1 is zero, and views them as <8 x i8>
; (called a, b, c below). Four 32-bit rows are stored at %0 + k * %1, k = 0..3:
;   row 0: low 32 bits of urhadd(a, b)
;   row 1: low 32 bits of rshrn(zext(c) + zext(a) + (zext(b) << 1), 2)
;   row 2: i8 elements 1..4 of {row 0 value, splat of the top byte of the
;          offset-2 load}
;   row 3: i8 elements 1..4 of {row 1 value, splat of the top byte of the
;          offset-2 load}
; %3 is not referenced in the visible body.
; In the expected codegen the offset-0 and offset-1 loads go straight into
; s1/s2, while the offset-2 load goes through w8 and is inserted into a zeroed
; v0 -- presumably because that value also feeds the scalar lshr; confirm.
460 define void @predictor_4x4_neon(ptr nocapture noundef writeonly %0, i64 noundef %1, ptr nocapture noundef readonly %2, ptr nocapture noundef readnone %3) {
461 ; CHECK-LABEL: predictor_4x4_neon:
463 ; CHECK-NEXT: movi v0.2d, #0000000000000000
464 ; CHECK-NEXT: ldur w8, [x2, #2]
465 ; CHECK-NEXT: ldr s1, [x2]
466 ; CHECK-NEXT: ldur s2, [x2, #1]
467 ; CHECK-NEXT: ushll v3.8h, v2.8b, #1
468 ; CHECK-NEXT: mov v0.s[0], w8
469 ; CHECK-NEXT: lsr w8, w8, #24
470 ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
471 ; CHECK-NEXT: urhadd v1.8b, v1.8b, v2.8b
472 ; CHECK-NEXT: add v0.8h, v0.8h, v3.8h
473 ; CHECK-NEXT: dup v3.8b, w8
474 ; CHECK-NEXT: str s1, [x0]
475 ; CHECK-NEXT: lsl x8, x1, #1
476 ; CHECK-NEXT: rshrn v0.8b, v0.8h, #2
477 ; CHECK-NEXT: zip1 v2.2s, v1.2s, v3.2s
478 ; CHECK-NEXT: str s0, [x0, x1]
479 ; CHECK-NEXT: zip1 v3.2s, v0.2s, v3.2s
480 ; CHECK-NEXT: ext v2.8b, v2.8b, v0.8b, #1
481 ; CHECK-NEXT: ext v1.8b, v3.8b, v0.8b, #1
482 ; CHECK-NEXT: str s2, [x0, x8]
483 ; CHECK-NEXT: add x8, x8, x1
484 ; CHECK-NEXT: str s1, [x0, x8]
; Three overlapping 4-byte loads, each widened to <8 x i8> with a zero upper half.
486 %5 = load i32, ptr %2, align 4
487 %6 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %5, i64 0
488 %7 = bitcast <2 x i32> %6 to <8 x i8>
489 %8 = getelementptr inbounds i8, ptr %2, i64 1
490 %9 = load i32, ptr %8, align 4
491 %10 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %9, i64 0
492 %11 = bitcast <2 x i32> %10 to <8 x i8>
493 %12 = getelementptr inbounds i8, ptr %2, i64 2
494 %13 = load i32, ptr %12, align 4
495 %14 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %13, i64 0
496 %15 = bitcast <2 x i32> %14 to <8 x i8>
; Splat the most significant byte of the offset-2 load across <8 x i8>.
497 %16 = lshr i32 %13, 24
498 %17 = trunc i32 %16 to i8
499 %18 = insertelement <8 x i8> undef, i8 %17, i64 0
500 %19 = shufflevector <8 x i8> %18, <8 x i8> poison, <8 x i32> zeroinitializer
; Row 0 source (%20) and row 1 source (%27 = rshrn(c + a + 2*b, 2)).
501 %20 = tail call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %7, <8 x i8> %11)
502 %21 = zext <8 x i8> %7 to <8 x i16>
503 %22 = zext <8 x i8> %11 to <8 x i16>
504 %23 = zext <8 x i8> %15 to <8 x i16>
505 %24 = shl nuw nsw <8 x i16> %22, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
506 %25 = add nuw nsw <8 x i16> %23, %21
507 %26 = add nuw nsw <8 x i16> %25, %24
508 %27 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %26, i32 2)
; Store row 0 at %0 and row 1 at %0 + %1.
509 %28 = bitcast <8 x i8> %20 to <2 x i32>
510 %29 = extractelement <2 x i32> %28, i64 0
511 store i32 %29, ptr %0, align 4
512 %30 = bitcast <8 x i8> %27 to <2 x i32>
513 %31 = getelementptr inbounds i8, ptr %0, i64 %1
514 %32 = extractelement <2 x i32> %30, i64 0
515 store i32 %32, ptr %31, align 4
; Rows 2 and 3: pair each earlier row with the splat, then take elements 1..4.
516 %33 = bitcast <8 x i8> %19 to <2 x i32>
517 %34 = shufflevector <2 x i32> %28, <2 x i32> %33, <2 x i32> <i32 0, i32 2>
518 %35 = bitcast <2 x i32> %34 to <8 x i8>
519 %36 = shufflevector <2 x i32> %30, <2 x i32> %33, <2 x i32> <i32 0, i32 2>
520 %37 = bitcast <2 x i32> %36 to <8 x i8>
521 %38 = shufflevector <8 x i8> %35, <8 x i8> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 undef, i32 undef>
522 %39 = bitcast <8 x i8> %38 to <2 x i32>
523 %40 = shl nsw i64 %1, 1
524 %41 = getelementptr inbounds i8, ptr %0, i64 %40
525 %42 = extractelement <2 x i32> %39, i64 0
526 store i32 %42, ptr %41, align 4
527 %43 = shufflevector <8 x i8> %37, <8 x i8> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 undef, i32 undef>
528 %44 = bitcast <8 x i8> %43 to <2 x i32>
529 %45 = mul nsw i64 %1, 3
530 %46 = getelementptr inbounds i8, ptr %0, i64 %45
531 %47 = extractelement <2 x i32> %44, i64 0
532 store i32 %47, ptr %46, align 4
; Reads four i32 values from %2 at byte offsets 0, 1, 2 and 3, places each in
; lane 0 of a <2 x i32> whose lane 1 is zero, and views them as <8 x i8>
; (called a, b, c, d below). Four 32-bit rows are stored at %0 + k * %1,
; k = 0..3 (sums are done on zero-extended i16 lanes):
;   row 0: urhadd(a, b)
;   row 1: rshrn((b + a) + (c + b), 2)
;   row 2: urhadd(b, c)
;   row 3: rshrn((d + c) + (c + b), 2)
; Only the low 32 bits of each result are stored. %3 is not referenced in the
; visible body. The expected codegen loads all four inputs directly into
; s0..s3 with ldr/ldur.
536 define void @predictor_4x4_neon_new(ptr nocapture noundef writeonly %0, i64 noundef %1, ptr nocapture noundef readonly %2, ptr nocapture noundef readnone %3) {
537 ; CHECK-LABEL: predictor_4x4_neon_new:
539 ; CHECK-NEXT: ldr s0, [x2]
540 ; CHECK-NEXT: ldur s1, [x2, #1]
541 ; CHECK-NEXT: lsl x8, x1, #1
542 ; CHECK-NEXT: ldur s2, [x2, #2]
543 ; CHECK-NEXT: ldur s3, [x2, #3]
544 ; CHECK-NEXT: uaddl v4.8h, v1.8b, v0.8b
545 ; CHECK-NEXT: urhadd v0.8b, v0.8b, v1.8b
546 ; CHECK-NEXT: add x9, x8, x1
547 ; CHECK-NEXT: uaddl v5.8h, v2.8b, v1.8b
548 ; CHECK-NEXT: uaddl v3.8h, v3.8b, v2.8b
549 ; CHECK-NEXT: urhadd v1.8b, v1.8b, v2.8b
550 ; CHECK-NEXT: str s0, [x0]
551 ; CHECK-NEXT: add v4.8h, v4.8h, v5.8h
552 ; CHECK-NEXT: add v3.8h, v3.8h, v5.8h
553 ; CHECK-NEXT: rshrn v4.8b, v4.8h, #2
554 ; CHECK-NEXT: rshrn v0.8b, v3.8h, #2
555 ; CHECK-NEXT: str s4, [x0, x1]
556 ; CHECK-NEXT: str s1, [x0, x8]
557 ; CHECK-NEXT: str s0, [x0, x9]
; Four overlapping 4-byte loads, each widened to <8 x i8> with a zero upper half.
559 %5 = load i32, ptr %2, align 4
560 %6 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %5, i64 0
561 %7 = bitcast <2 x i32> %6 to <8 x i8>
562 %8 = getelementptr inbounds i8, ptr %2, i64 1
563 %9 = load i32, ptr %8, align 4
564 %10 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %9, i64 0
565 %11 = bitcast <2 x i32> %10 to <8 x i8>
566 %12 = getelementptr inbounds i8, ptr %2, i64 2
567 %13 = load i32, ptr %12, align 4
568 %14 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %13, i64 0
569 %15 = bitcast <2 x i32> %14 to <8 x i8>
570 %16 = getelementptr inbounds i8, ptr %2, i64 3
571 %17 = load i32, ptr %16, align 4
572 %18 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %17, i64 0
573 %19 = bitcast <2 x i32> %18 to <8 x i8>
; Row sources: %20 (row 0), %28 (row 1), %21 (row 2), %32 (row 3).
574 %20 = tail call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %7, <8 x i8> %11)
575 %21 = tail call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %11, <8 x i8> %15)
576 %22 = zext <8 x i8> %7 to <8 x i16>
577 %23 = zext <8 x i8> %11 to <8 x i16>
578 %24 = add nuw nsw <8 x i16> %23, %22
579 %25 = zext <8 x i8> %15 to <8 x i16>
580 %26 = add nuw nsw <8 x i16> %25, %23
581 %27 = add nuw nsw <8 x i16> %24, %26
582 %28 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %27, i32 2)
583 %29 = zext <8 x i8> %19 to <8 x i16>
584 %30 = add nuw nsw <8 x i16> %29, %25
585 %31 = add nuw nsw <8 x i16> %30, %26
586 %32 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %31, i32 2)
; Store the low 32 bits of each row at %0, %0 + %1, %0 + 2*%1, %0 + 3*%1.
587 %33 = bitcast <8 x i8> %20 to <2 x i32>
588 %34 = extractelement <2 x i32> %33, i64 0
589 store i32 %34, ptr %0, align 4
590 %35 = bitcast <8 x i8> %28 to <2 x i32>
591 %36 = getelementptr inbounds i8, ptr %0, i64 %1
592 %37 = extractelement <2 x i32> %35, i64 0
593 store i32 %37, ptr %36, align 4
594 %38 = bitcast <8 x i8> %21 to <2 x i32>
595 %39 = shl nsw i64 %1, 1
596 %40 = getelementptr inbounds i8, ptr %0, i64 %39
597 %41 = extractelement <2 x i32> %38, i64 0
598 store i32 %41, ptr %40, align 4
599 %42 = bitcast <8 x i8> %32 to <2 x i32>
600 %43 = mul nsw i64 %1, 3
601 %44 = getelementptr inbounds i8, ptr %0, i64 %43
602 %45 = extractelement <2 x i32> %42, i64 0
603 store i32 %45, ptr %44, align 4
; Inserts the scalar %l into lane 0 of an all-zero <vscale x 8 x i8> and
; returns it. The expected codegen zeroes z0.h, loads the byte into w8 with
; `ldrb w8, [x0]`, and merges w8 into z0.h under a `ptrue p0.h, vl1` predicate.
; NOTE(review): the definition of %l is not visible in this excerpt --
; presumably an i8 load from %p; confirm against the full file.
608 define <vscale x 8 x i8> @loadnxv8i8(ptr %p) {
609 ; CHECK-LABEL: loadnxv8i8:
611 ; CHECK-NEXT: mov z0.h, #0 // =0x0
612 ; CHECK-NEXT: ldrb w8, [x0]
613 ; CHECK-NEXT: ptrue p0.h, vl1
614 ; CHECK-NEXT: mov z0.h, p0/m, w8
617 %v = insertelement <vscale x 8 x i8> zeroinitializer, i8 %l, i32 0
618 ret <vscale x 8 x i8> %v
; Inserts the scalar %l into lane 0 of an all-zero <vscale x 16 x i8> and
; returns it; the expected codegen is `ldr b0, [x0]`.
; NOTE(review): the definition of %l is not visible in this excerpt --
; presumably an i8 load from %p; confirm against the full file.
621 define <vscale x 16 x i8> @loadnxv16i8(ptr %p) {
622 ; CHECK-LABEL: loadnxv16i8:
624 ; CHECK-NEXT: ldr b0, [x0]
627 %v = insertelement <vscale x 16 x i8> zeroinitializer, i8 %l, i32 0
628 ret <vscale x 16 x i8> %v
; Loads an i16 from %p, inserts it into lane 0 of an all-zero
; <vscale x 4 x i16>, and returns it. The expected codegen zeroes z0.s, loads
; the halfword into w8 with `ldrh w8, [x0]`, and merges w8 into z0.s under a
; `ptrue p0.s, vl1` predicate.
631 define <vscale x 4 x i16> @loadnxv4i16(ptr %p) {
632 ; CHECK-LABEL: loadnxv4i16:
634 ; CHECK-NEXT: mov z0.s, #0 // =0x0
635 ; CHECK-NEXT: ldrh w8, [x0]
636 ; CHECK-NEXT: ptrue p0.s, vl1
637 ; CHECK-NEXT: mov z0.s, p0/m, w8
639 %l = load i16, ptr %p
640 %v = insertelement <vscale x 4 x i16> zeroinitializer, i16 %l, i32 0
641 ret <vscale x 4 x i16> %v
; Loads an i16 from %p, inserts it into lane 0 of an all-zero
; <vscale x 8 x i16>, and returns it; the expected codegen is `ldr h0, [x0]`.
644 define <vscale x 8 x i16> @loadnxv8i16(ptr %p) {
645 ; CHECK-LABEL: loadnxv8i16:
647 ; CHECK-NEXT: ldr h0, [x0]
649 %l = load i16, ptr %p
650 %v = insertelement <vscale x 8 x i16> zeroinitializer, i16 %l, i32 0
651 ret <vscale x 8 x i16> %v
; Loads an i32 from %p, inserts it into lane 0 of an all-zero
; <vscale x 2 x i32>, and returns it. The expected codegen zeroes z0.d, loads
; the word into w8 with `ldr w8, [x0]`, and merges x8 into z0.d under a
; `ptrue p0.d, vl1` predicate.
654 define <vscale x 2 x i32> @loadnxv2i32(ptr %p) {
655 ; CHECK-LABEL: loadnxv2i32:
657 ; CHECK-NEXT: mov z0.d, #0 // =0x0
658 ; CHECK-NEXT: ldr w8, [x0]
659 ; CHECK-NEXT: ptrue p0.d, vl1
660 ; CHECK-NEXT: mov z0.d, p0/m, x8
662 %l = load i32, ptr %p
663 %v = insertelement <vscale x 2 x i32> zeroinitializer, i32 %l, i32 0
664 ret <vscale x 2 x i32> %v
; Loads an i32 from %p, inserts it into lane 0 of an all-zero
; <vscale x 4 x i32>, and returns it; the expected codegen is `ldr s0, [x0]`.
667 define <vscale x 4 x i32> @loadnxv4i32(ptr %p) {
668 ; CHECK-LABEL: loadnxv4i32:
670 ; CHECK-NEXT: ldr s0, [x0]
672 %l = load i32, ptr %p
673 %v = insertelement <vscale x 4 x i32> zeroinitializer, i32 %l, i32 0
674 ret <vscale x 4 x i32> %v
; Loads an i64 from %p, inserts it into lane 0 of an all-zero
; <vscale x 2 x i64>, and returns it; the expected codegen is `ldr d0, [x0]`.
677 define <vscale x 2 x i64> @loadnxv2i64(ptr %p) {
678 ; CHECK-LABEL: loadnxv2i64:
680 ; CHECK-NEXT: ldr d0, [x0]
682 %l = load i64, ptr %p
683 %v = insertelement <vscale x 2 x i64> zeroinitializer, i64 %l, i32 0
684 ret <vscale x 2 x i64> %v
; Loads a half from %p, inserts it into lane 0 of an all-zero
; <vscale x 4 x half>, and returns it. The expected codegen builds a predicate
; by comparing an index vector against a splat of zero (index / mov / cmpeq),
; zeroes z0.h, loads the scalar with `ldr h1, [x0]`, and merges h1 into z0.h
; under that predicate.
688 define <vscale x 4 x half> @loadnxv4f16(ptr %p) {
689 ; CHECK-LABEL: loadnxv4f16:
691 ; CHECK-NEXT: mov w8, wzr
692 ; CHECK-NEXT: index z0.s, #0, #1
693 ; CHECK-NEXT: ptrue p0.s
694 ; CHECK-NEXT: mov z1.s, w8
695 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s
696 ; CHECK-NEXT: mov z0.h, #0 // =0x0
697 ; CHECK-NEXT: ldr h1, [x0]
698 ; CHECK-NEXT: mov z0.h, p0/m, h1
700 %l = load half, ptr %p
701 %v = insertelement <vscale x 4 x half> zeroinitializer, half %l, i32 0
702 ret <vscale x 4 x half> %v
; Loads a half from %p, inserts it into lane 0 of an all-zero
; <vscale x 8 x half>, and returns it; the expected codegen is `ldr h0, [x0]`.
705 define <vscale x 8 x half> @loadnxv8f16(ptr %p) {
706 ; CHECK-LABEL: loadnxv8f16:
708 ; CHECK-NEXT: ldr h0, [x0]
710 %l = load half, ptr %p
711 %v = insertelement <vscale x 8 x half> zeroinitializer, half %l, i32 0
712 ret <vscale x 8 x half> %v
; Loads a bfloat from %p, inserts it into lane 0 of an all-zero
; <vscale x 4 x bfloat>, and returns it. The expected codegen builds a
; predicate by comparing an index vector against a splat of zero
; (index / mov / cmpeq), zeroes z0.h, loads the scalar with `ldr h1, [x0]`,
; and merges h1 into z0.h under that predicate.
715 define <vscale x 4 x bfloat> @loadnxv4bf16(ptr %p) {
716 ; CHECK-LABEL: loadnxv4bf16:
718 ; CHECK-NEXT: mov w8, wzr
719 ; CHECK-NEXT: index z0.s, #0, #1
720 ; CHECK-NEXT: ptrue p0.s
721 ; CHECK-NEXT: mov z1.s, w8
722 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s
723 ; CHECK-NEXT: mov z0.h, #0 // =0x0
724 ; CHECK-NEXT: ldr h1, [x0]
725 ; CHECK-NEXT: mov z0.h, p0/m, h1
727 %l = load bfloat, ptr %p
728 %v = insertelement <vscale x 4 x bfloat> zeroinitializer, bfloat %l, i32 0
729 ret <vscale x 4 x bfloat> %v
; Loads a bfloat from %p, inserts it into lane 0 of an all-zero
; <vscale x 8 x bfloat>, and returns it; the expected codegen is
; `ldr h0, [x0]`.
732 define <vscale x 8 x bfloat> @loadnxv8bf16(ptr %p) {
733 ; CHECK-LABEL: loadnxv8bf16:
735 ; CHECK-NEXT: ldr h0, [x0]
737 %l = load bfloat, ptr %p
738 %v = insertelement <vscale x 8 x bfloat> zeroinitializer, bfloat %l, i32 0
739 ret <vscale x 8 x bfloat> %v
; Loads a float from %p, inserts it into lane 0 of an all-zero
; <vscale x 2 x float>, and returns it. The expected codegen builds a
; predicate by comparing an index vector against a splat of zero
; (index / mov / cmpeq), zeroes z0.s, loads the scalar with `ldr s1, [x0]`,
; and merges s1 into z0.s under that predicate.
742 define <vscale x 2 x float> @loadnxv2f32(ptr %p) {
743 ; CHECK-LABEL: loadnxv2f32:
745 ; CHECK-NEXT: mov x8, xzr
746 ; CHECK-NEXT: index z0.d, #0, #1
747 ; CHECK-NEXT: ptrue p0.d
748 ; CHECK-NEXT: mov z1.d, x8
749 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d
750 ; CHECK-NEXT: mov z0.s, #0 // =0x0
751 ; CHECK-NEXT: ldr s1, [x0]
752 ; CHECK-NEXT: mov z0.s, p0/m, s1
754 %l = load float, ptr %p
755 %v = insertelement <vscale x 2 x float> zeroinitializer, float %l, i32 0
756 ret <vscale x 2 x float> %v
; Loads a float from %p, inserts it into lane 0 of an all-zero
; <vscale x 4 x float>, and returns it; the expected codegen is
; `ldr s0, [x0]`.
759 define <vscale x 4 x float> @loadnxv4f32(ptr %p) {
760 ; CHECK-LABEL: loadnxv4f32:
762 ; CHECK-NEXT: ldr s0, [x0]
764 %l = load float, ptr %p
765 %v = insertelement <vscale x 4 x float> zeroinitializer, float %l, i32 0
766 ret <vscale x 4 x float> %v
; Loads a double from %p, inserts it into lane 0 of an all-zero
; <vscale x 2 x double>, and returns it; the expected codegen is
; `ldr d0, [x0]`.
769 define <vscale x 2 x double> @loadnxv2f64(ptr %p) {
770 ; CHECK-LABEL: loadnxv2f64:
772 ; CHECK-NEXT: ldr d0, [x0]
774 %l = load double, ptr %p
775 %v = insertelement <vscale x 2 x double> zeroinitializer, double %l, i32 0
776 ret <vscale x 2 x double> %v
; Forms %g = %p + 1 byte, inserts the scalar %l into lane 0 of an all-zero
; <vscale x 8 x i8>, and returns it. The expected codegen zeroes z0.h, loads
; the byte into w8 with `ldrb w8, [x0, #1]`, and merges w8 into z0.h under a
; `ptrue p0.h, vl1` predicate.
; NOTE(review): the definition of %l is not visible in this excerpt --
; presumably an i8 load from %g; confirm against the full file.
782 define <vscale x 8 x i8> @loadnxv8i8_offset(ptr %p) {
783 ; CHECK-LABEL: loadnxv8i8_offset:
785 ; CHECK-NEXT: mov z0.h, #0 // =0x0
786 ; CHECK-NEXT: ldrb w8, [x0, #1]
787 ; CHECK-NEXT: ptrue p0.h, vl1
788 ; CHECK-NEXT: mov z0.h, p0/m, w8
790 %g = getelementptr inbounds i8, ptr %p, i64 1
792 %v = insertelement <vscale x 8 x i8> zeroinitializer, i8 %l, i32 0
793 ret <vscale x 8 x i8> %v
; Forms %g = %p + 1 byte, inserts the scalar %l into lane 0 of an all-zero
; <vscale x 16 x i8>, and returns it; the expected codegen is
; `ldr b0, [x0, #1]`.
; NOTE(review): the definition of %l is not visible in this excerpt --
; presumably an i8 load from %g; confirm against the full file.
796 define <vscale x 16 x i8> @loadnxv16i8_offset(ptr %p) {
797 ; CHECK-LABEL: loadnxv16i8_offset:
799 ; CHECK-NEXT: ldr b0, [x0, #1]
801 %g = getelementptr inbounds i8, ptr %p, i64 1
803 %v = insertelement <vscale x 16 x i8> zeroinitializer, i8 %l, i32 0
804 ret <vscale x 16 x i8> %v
; Loads an i16 from %p + 1 byte, inserts it into lane 0 of an all-zero
; <vscale x 4 x i16>, and returns it. The expected codegen zeroes z0.s, loads
; the halfword into w8 with `ldurh w8, [x0, #1]`, and merges w8 into z0.s
; under a `ptrue p0.s, vl1` predicate.
807 define <vscale x 4 x i16> @loadnxv4i16_offset(ptr %p) {
808 ; CHECK-LABEL: loadnxv4i16_offset:
810 ; CHECK-NEXT: mov z0.s, #0 // =0x0
811 ; CHECK-NEXT: ldurh w8, [x0, #1]
812 ; CHECK-NEXT: ptrue p0.s, vl1
813 ; CHECK-NEXT: mov z0.s, p0/m, w8
815 %g = getelementptr inbounds i8, ptr %p, i64 1
816 %l = load i16, ptr %g
817 %v = insertelement <vscale x 4 x i16> zeroinitializer, i16 %l, i32 0
818 ret <vscale x 4 x i16> %v
; Loads an i16 from %p + 1 byte, inserts it into lane 0 of an all-zero
; <vscale x 8 x i16>, and returns it; the expected codegen is
; `ldur h0, [x0, #1]`.
821 define <vscale x 8 x i16> @loadnxv8i16_offset(ptr %p) {
822 ; CHECK-LABEL: loadnxv8i16_offset:
824 ; CHECK-NEXT: ldur h0, [x0, #1]
826 %g = getelementptr inbounds i8, ptr %p, i64 1
827 %l = load i16, ptr %g
828 %v = insertelement <vscale x 8 x i16> zeroinitializer, i16 %l, i32 0
829 ret <vscale x 8 x i16> %v
; Loads an i32 from %p + 1 byte, inserts it into lane 0 of an all-zero
; <vscale x 2 x i32>, and returns it. The expected codegen zeroes z0.d, loads
; the word into w8 with `ldur w8, [x0, #1]`, and merges x8 into z0.d under a
; `ptrue p0.d, vl1` predicate.
832 define <vscale x 2 x i32> @loadnxv2i32_offset(ptr %p) {
833 ; CHECK-LABEL: loadnxv2i32_offset:
835 ; CHECK-NEXT: mov z0.d, #0 // =0x0
836 ; CHECK-NEXT: ldur w8, [x0, #1]
837 ; CHECK-NEXT: ptrue p0.d, vl1
838 ; CHECK-NEXT: mov z0.d, p0/m, x8
840 %g = getelementptr inbounds i8, ptr %p, i64 1
841 %l = load i32, ptr %g
842 %v = insertelement <vscale x 2 x i32> zeroinitializer, i32 %l, i32 0
843 ret <vscale x 2 x i32> %v
; Loads an i32 from %p + 1 byte, inserts it into lane 0 of an all-zero
; <vscale x 4 x i32>, and returns it; the expected codegen is
; `ldur s0, [x0, #1]`.
846 define <vscale x 4 x i32> @loadnxv4i32_offset(ptr %p) {
847 ; CHECK-LABEL: loadnxv4i32_offset:
849 ; CHECK-NEXT: ldur s0, [x0, #1]
851 %g = getelementptr inbounds i8, ptr %p, i64 1
852 %l = load i32, ptr %g
853 %v = insertelement <vscale x 4 x i32> zeroinitializer, i32 %l, i32 0
854 ret <vscale x 4 x i32> %v
; Loads an i64 from %p + 1 byte, inserts it into lane 0 of an all-zero
; <vscale x 2 x i64>, and returns it; the expected codegen is
; `ldur d0, [x0, #1]`.
857 define <vscale x 2 x i64> @loadnxv2i64_offset(ptr %p) {
858 ; CHECK-LABEL: loadnxv2i64_offset:
860 ; CHECK-NEXT: ldur d0, [x0, #1]
862 %g = getelementptr inbounds i8, ptr %p, i64 1
863 %l = load i64, ptr %g
864 %v = insertelement <vscale x 2 x i64> zeroinitializer, i64 %l, i32 0
865 ret <vscale x 2 x i64> %v
; Loads a half from %p + 1 byte, inserts it into lane 0 of an all-zero
; <vscale x 4 x half>, and returns it. The expected codegen builds a predicate
; by comparing an index vector against a splat of zero (index / mov / cmpeq),
; zeroes z0.h, loads the scalar with `ldur h1, [x0, #1]`, and merges h1 into
; z0.h under that predicate.
869 define <vscale x 4 x half> @loadnxv4f16_offset(ptr %p) {
870 ; CHECK-LABEL: loadnxv4f16_offset:
872 ; CHECK-NEXT: mov w8, wzr
873 ; CHECK-NEXT: index z0.s, #0, #1
874 ; CHECK-NEXT: ptrue p0.s
875 ; CHECK-NEXT: mov z1.s, w8
876 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s
877 ; CHECK-NEXT: mov z0.h, #0 // =0x0
878 ; CHECK-NEXT: ldur h1, [x0, #1]
879 ; CHECK-NEXT: mov z0.h, p0/m, h1
881 %g = getelementptr inbounds i8, ptr %p, i64 1
882 %l = load half, ptr %g
883 %v = insertelement <vscale x 4 x half> zeroinitializer, half %l, i32 0
884 ret <vscale x 4 x half> %v
; Loads a half from %p + 1 byte, inserts it into lane 0 of an all-zero
; <vscale x 8 x half>, and returns it; the expected codegen is
; `ldur h0, [x0, #1]`.
887 define <vscale x 8 x half> @loadnxv8f16_offset(ptr %p) {
888 ; CHECK-LABEL: loadnxv8f16_offset:
890 ; CHECK-NEXT: ldur h0, [x0, #1]
892 %g = getelementptr inbounds i8, ptr %p, i64 1
893 %l = load half, ptr %g
894 %v = insertelement <vscale x 8 x half> zeroinitializer, half %l, i32 0
895 ret <vscale x 8 x half> %v
; Loads a bfloat from %p + 1 byte, inserts it into lane 0 of an all-zero
; <vscale x 4 x bfloat>, and returns it. The expected codegen builds a
; predicate by comparing an index vector against a splat of zero
; (index / mov / cmpeq), zeroes z0.h, loads the scalar with
; `ldur h1, [x0, #1]`, and merges h1 into z0.h under that predicate.
898 define <vscale x 4 x bfloat> @loadnxv4bf16_offset(ptr %p) {
899 ; CHECK-LABEL: loadnxv4bf16_offset:
901 ; CHECK-NEXT: mov w8, wzr
902 ; CHECK-NEXT: index z0.s, #0, #1
903 ; CHECK-NEXT: ptrue p0.s
904 ; CHECK-NEXT: mov z1.s, w8
905 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s
906 ; CHECK-NEXT: mov z0.h, #0 // =0x0
907 ; CHECK-NEXT: ldur h1, [x0, #1]
908 ; CHECK-NEXT: mov z0.h, p0/m, h1
910 %g = getelementptr inbounds i8, ptr %p, i64 1
911 %l = load bfloat, ptr %g
912 %v = insertelement <vscale x 4 x bfloat> zeroinitializer, bfloat %l, i32 0
913 ret <vscale x 4 x bfloat> %v
; Loads a bfloat from %p + 1 byte, inserts it into lane 0 of an all-zero
; <vscale x 8 x bfloat>, and returns it; the expected codegen is
; `ldur h0, [x0, #1]`.
916 define <vscale x 8 x bfloat> @loadnxv8bf16_offset(ptr %p) {
917 ; CHECK-LABEL: loadnxv8bf16_offset:
919 ; CHECK-NEXT: ldur h0, [x0, #1]
921 %g = getelementptr inbounds i8, ptr %p, i64 1
922 %l = load bfloat, ptr %g
923 %v = insertelement <vscale x 8 x bfloat> zeroinitializer, bfloat %l, i32 0
924 ret <vscale x 8 x bfloat> %v
; Loads a float from %p + 1 byte, inserts it into lane 0 of an all-zero
; <vscale x 2 x float>, and returns it. The expected codegen builds a
; predicate by comparing an index vector against a splat of zero
; (index / mov / cmpeq), zeroes z0.s, loads the scalar with
; `ldur s1, [x0, #1]`, and merges s1 into z0.s under that predicate.
927 define <vscale x 2 x float> @loadnxv2f32_offset(ptr %p) {
928 ; CHECK-LABEL: loadnxv2f32_offset:
930 ; CHECK-NEXT: mov x8, xzr
931 ; CHECK-NEXT: index z0.d, #0, #1
932 ; CHECK-NEXT: ptrue p0.d
933 ; CHECK-NEXT: mov z1.d, x8
934 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d
935 ; CHECK-NEXT: mov z0.s, #0 // =0x0
936 ; CHECK-NEXT: ldur s1, [x0, #1]
937 ; CHECK-NEXT: mov z0.s, p0/m, s1
939 %g = getelementptr inbounds i8, ptr %p, i64 1
940 %l = load float, ptr %g
941 %v = insertelement <vscale x 2 x float> zeroinitializer, float %l, i32 0
942 ret <vscale x 2 x float> %v
; Loads a float from %p + 1 byte, inserts it into lane 0 of an all-zero
; <vscale x 4 x float>, and returns it; the expected codegen is
; `ldur s0, [x0, #1]`.
945 define <vscale x 4 x float> @loadnxv4f32_offset(ptr %p) {
946 ; CHECK-LABEL: loadnxv4f32_offset:
948 ; CHECK-NEXT: ldur s0, [x0, #1]
950 %g = getelementptr inbounds i8, ptr %p, i64 1
951 %l = load float, ptr %g
952 %v = insertelement <vscale x 4 x float> zeroinitializer, float %l, i32 0
953 ret <vscale x 4 x float> %v
; Loads a double from %p + 1 byte, inserts it into lane 0 of an all-zero
; <vscale x 2 x double>, and returns it; the expected codegen is
; `ldur d0, [x0, #1]`.
956 define <vscale x 2 x double> @loadnxv2f64_offset(ptr %p) {
957 ; CHECK-LABEL: loadnxv2f64_offset:
959 ; CHECK-NEXT: ldur d0, [x0, #1]
961 %g = getelementptr inbounds i8, ptr %p, i64 1
962 %l = load double, ptr %g
963 %v = insertelement <vscale x 2 x double> zeroinitializer, double %l, i32 0
964 ret <vscale x 2 x double> %v
; Declarations of the two AArch64 NEON intrinsics called by the
; predictor_4x4_neon* functions: rshrn takes (<8 x i16>, i32) and returns
; <8 x i8>; urhadd takes (<8 x i8>, <8 x i8>) and returns <8 x i8>.
; Attribute group #1 is not defined within the visible part of the file.
968 declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) #1
969 declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) #1