From 71d05ac64e65283886b51004f3fd40e18478cddd Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sat, 1 Feb 2025 16:21:47 +0000
Subject: [PATCH] [TTI] getTypeBasedIntrinsicInstrCost - add basic handling for strided load/store intrinsics (#125223) (REAPPLIED)

As noted on #124499 - this is currently missing for type-only analysis and
was falling back to scalarization for fixed vectors (and failing entirely
for scalable vectors)
---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h           | 14 ++++
 .../test/Analysis/CostModel/RISCV/vp-intrinsics.ll | 88 +++++++++++-----------
 .../SLPVectorizer/RISCV/complex-loads.ll           | 62 ++++++++-------
 3 files changed, 93 insertions(+), 71 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 596db3923921..9571bd9330de 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2204,6 +2204,20 @@ public:
     return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
                                           CostKind);
   }
+  case Intrinsic::experimental_vp_strided_store: {
+    auto *Ty = cast<VectorType>(ICA.getArgTypes()[0]);
+    Align Alignment = thisT()->DL.getABITypeAlign(Ty->getElementType());
+    return thisT()->getStridedMemoryOpCost(
+        Instruction::Store, Ty, /*Ptr=*/nullptr, /*VariableMask=*/true,
+        Alignment, CostKind, ICA.getInst());
+  }
+  case Intrinsic::experimental_vp_strided_load: {
+    auto *Ty = cast<VectorType>(ICA.getReturnType());
+    Align Alignment = thisT()->DL.getABITypeAlign(Ty->getElementType());
+    return thisT()->getStridedMemoryOpCost(
+        Instruction::Load, Ty, /*Ptr=*/nullptr, /*VariableMask=*/true,
+        Alignment, CostKind, ICA.getInst());
+  }
   case Intrinsic::vector_reduce_add:
   case Intrinsic::vector_reduce_mul:
   case Intrinsic::vector_reduce_and:
diff --git a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
index 0245a0f7ee6c..7c5197b283fb 100644
--- a/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/vp-intrinsics.ll
@@ -1482,30 +1482,30 @@ define void @strided_load() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'strided_load'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %ti1_2 = call <2 x i1> @llvm.experimental.vp.strided.load.v2i1.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %ti1_4 = call <4 x i1> @llvm.experimental.vp.strided.load.v4i1.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %ti1_8 = call <8 x i1> @llvm.experimental.vp.strided.load.v8i1.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %ti1_16 = call <16 x i1> @llvm.experimental.vp.strided.load.v16i1.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %t0 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %t2 = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %t4 = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %t6 = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %t8.a = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr align 8 undef, i64 undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %t10.a = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr align 8 undef, i64 undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %t13.a = call <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i64(ptr align 8 undef, i64 undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %t15.a = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr align 8 undef, i64 undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %t8 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %t13 = call <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t17 = call <vscale x 2 x i8> @llvm.experimental.vp.strided.load.nxv2i8.p0.i64(ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t19 = call <vscale x 4 x i8> @llvm.experimental.vp.strided.load.nxv4i8.p0.i64(ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t21 = call <vscale x 8 x i8> @llvm.experimental.vp.strided.load.nxv8i8.p0.i64(ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t23 = call <vscale x 16 x i8> @llvm.experimental.vp.strided.load.nxv16i8.p0.i64(ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t25 = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t29 = call <vscale x 8 x i64> @llvm.experimental.vp.strided.load.nxv8i64.p0.i64(ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %ti1_2 = call <2 x i1> @llvm.experimental.vp.strided.load.v2i1.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %ti1_4 = call <4 x i1> @llvm.experimental.vp.strided.load.v4i1.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %ti1_8 = call <8 x i1> @llvm.experimental.vp.strided.load.v8i1.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %ti1_16 = call <16 x i1> @llvm.experimental.vp.strided.load.v16i1.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t0 = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t2 = call <4 x i8> @llvm.experimental.vp.strided.load.v4i8.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t4 = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t6 = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t8.a = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr align 8 undef, i64 undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10.a = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr align 8 undef, i64 undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t13.a = call <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i64(ptr align 8 undef, i64 undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15.a = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr align 8 undef, i64 undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t8 = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t10 = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i64(ptr undef, i64 undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t13 = call <8 x i64> @llvm.experimental.vp.strided.load.v8i64.p0.i64(ptr undef, i64 undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t15 = call <16 x i64> @llvm.experimental.vp.strided.load.v16i64.p0.i64(ptr undef, i64 undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t17 = call <vscale x 2 x i8> @llvm.experimental.vp.strided.load.nxv2i8.p0.i64(ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t19 = call <vscale x 4 x i8> @llvm.experimental.vp.strided.load.nxv4i8.p0.i64(ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t21 = call <vscale x 8 x i8> @llvm.experimental.vp.strided.load.nxv8i8.p0.i64(ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t23 = call <vscale x 16 x i8> @llvm.experimental.vp.strided.load.nxv16i8.p0.i64(ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t25 = call <vscale x 2 x i64> @llvm.experimental.vp.strided.load.nxv2i64.p0.i64(ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t27 = call <vscale x 4 x i64> @llvm.experimental.vp.strided.load.nxv4i64.p0.i64(ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %t29 = call <vscale x 8 x i64> @llvm.experimental.vp.strided.load.nxv8i64.p0.i64(ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %t31 = call <vscale x 16 x i64> @llvm.experimental.vp.strided.load.nxv16i64.p0.i64(ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %ti1_2 = call <2 x i1> @llvm.experimental.vp.strided.load.v2i1.i64(ptr undef, i64 undef, <2 x i1> undef, i32 undef)
@@ -1560,26 +1560,26 @@ define void @strided_store() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; TYPEBASED-LABEL: 'strided_store'
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> undef, ptr undef, i64 undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> undef, ptr undef, i64 undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.experimental.vp.strided.store.v8i8.p0.i64(<8 x i8> undef, ptr undef, i64 undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.experimental.vp.strided.store.v16i8.p0.i64(<16 x i8> undef, ptr undef, i64 undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.experimental.vp.strided.store.v2i64.p0.i64(<2 x i64> undef, ptr undef, i64 undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.experimental.vp.strided.store.v4i64.p0.i64(<4 x i64> undef, ptr undef, i64 undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.experimental.vp.strided.store.v8i64.p0.i64(<8 x i64> undef, ptr undef, i64 undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr undef, i64 undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.experimental.vp.strided.store.v2i64.p0.i64(<2 x i64> undef, ptr align 8 undef, i64 undef, <2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.experimental.vp.strided.store.v4i64.p0.i64(<4 x i64> undef, ptr align 8 undef, i64 undef, <4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 54 for instruction: call void @llvm.experimental.vp.strided.store.v8i64.p0.i64(<8 x i64> undef, ptr align 8 undef, i64 undef, <8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 110 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr align 8 undef, i64 undef, <16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv2i8.p0.i64(<vscale x 2 x i8> undef, ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv4i8.p0.i64(<vscale x 4 x i8> undef, ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv8i8.p0.i64(<vscale x 8 x i8> undef, ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv16i8.p0.i64(<vscale x 16 x i8> undef, ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv2i64.p0.i64(<vscale x 2 x i64> undef, ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> undef, ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv8i64.p0.i64(<vscale x 8 x i64> undef, ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef)
-; TYPEBASED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> undef, ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.experimental.vp.strided.store.v2i8.p0.i64(<2 x i8> undef, ptr undef, i64 undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.v4i8.p0.i64(<4 x i8> undef, ptr undef, i64 undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.v8i8.p0.i64(<8 x i8> undef, ptr undef, i64 undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.v16i8.p0.i64(<16 x i8> undef, ptr undef, i64 undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.experimental.vp.strided.store.v2i64.p0.i64(<2 x i64> undef, ptr undef, i64 undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.v4i64.p0.i64(<4 x i64> undef, ptr undef, i64 undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.v8i64.p0.i64(<8 x i64> undef, ptr undef, i64 undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr undef, i64 undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.experimental.vp.strided.store.v2i64.p0.i64(<2 x i64> undef, ptr align 8 undef, i64 undef, <2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.v4i64.p0.i64(<4 x i64> undef, ptr align 8 undef, i64 undef, <4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.v8i64.p0.i64(<8 x i64> undef, ptr align 8 undef, i64 undef, <8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.v16i64.p0.i64(<16 x i64> undef, ptr align 8 undef, i64 undef, <16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.nxv2i8.p0.i64(<vscale x 2 x i8> undef, ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i8.p0.i64(<vscale x 4 x i8> undef, ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.nxv8i8.p0.i64(<vscale x 8 x i8> undef, ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i8.p0.i64(<vscale x 16 x i8> undef, ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.experimental.vp.strided.store.nxv2i64.p0.i64(<vscale x 2 x i64> undef, ptr undef, i64 undef, <vscale x 2 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.experimental.vp.strided.store.nxv4i64.p0.i64(<vscale x 4 x i64> undef, ptr undef, i64 undef, <vscale x 4 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vp.strided.store.nxv8i64.p0.i64(<vscale x 8 x i64> undef, ptr undef, i64 undef, <vscale x 8 x i1> undef, i32 undef)
+; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vp.strided.store.nxv16i64.p0.i64(<vscale x 16 x i64> undef, ptr undef, i64 undef, <vscale x 16 x i1> undef, i32 undef)
 ; TYPEBASED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   call void @llvm.experimental.vp.strided.store.v2i8.i64(<2 x i8> undef, ptr undef, i64 undef, <2 x i1> undef, i32 undef)
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 257e4660c80a..11fa3337544a 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -28,13 +28,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
-; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1
 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1
-; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
 ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32>
 ; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32>
-; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP6]] to i32
 ; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1
 ; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32>
 ; CHECK-NEXT: [[TMP31:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32>
@@ -50,7 +46,6 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]]
 ; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32>
 ; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32>
-; CHECK-NEXT: [[CONV9_2:%.*]] = zext i8 [[TMP7]] to i32
 ; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32>
 ; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32>
 ; CHECK-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP51]], [[TMP57]]
@@ -64,8 +59,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT: [[TMP34:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]]
 ; CHECK-NEXT: [[TMP44:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]]
 ; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[TMP34]], i32 0
-; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP34]], i32 1
-; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP45]], [[TMP43]]
+; CHECK-NEXT: [[CONV_2:%.*]] = extractelement <2 x i32> [[TMP34]], i32 1
+; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[CONV_2]], [[TMP43]]
 ; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0
 ; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1
 ; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[TMP47]], [[TMP46]]
@@ -120,15 +115,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP78]], [[TMP80]]
 ; CHECK-NEXT: [[ADD95:%.*]] = add i32 [[ADD94]], [[ADD48_2]]
 ; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[ADD48_2]], [[ADD94]]
-; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP77]], 15
-; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
-; CHECK-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535
-; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP45]], 15
-; CHECK-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537
-; CHECK-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535
-; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]]
-; CHECK-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]]
-; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[CONV9_2]], 15
+; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP77]], 15
 ; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537
 ; CHECK-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535
 ; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[CONV_2]], 15
@@ -244,10 +231,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD95]]
 ; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB86_3]], [[SUB86]]
 ; CHECK-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB86_3]]
-; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]]
+; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I_1]], [[ADD103]]
 ; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP77]]
-; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]]
-; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP45]]
+; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51_1]], [[ADD105]]
+; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV_2]]
 ; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]]
 ; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP160]]
 ; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]]
@@ -255,21 +242,42 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]]
 ; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]]
 ; CHECK-NEXT: [[ADD105_3:%.*]] = add i32 [[ADD112]], [[XOR_I63]]
-; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[ADD55_1]], [[ADD55]]
-; CHECK-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD55_1]]
-; CHECK-NEXT: [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]]
+; CHECK-NEXT: [[TMP169:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT: [[TMP181:%.*]] = zext <2 x i8> [[TMP169]] to <2 x i32>
+; CHECK-NEXT: [[TMP152:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_2]], i32 0
+; CHECK-NEXT: [[TMP182:%.*]] = shufflevector <2 x i32> [[TMP152]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP183:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_3]], i32 0
+; CHECK-NEXT: [[TMP184:%.*]] = shufflevector <2 x i32> [[TMP183]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP191:%.*]] = sub <2 x i32> [[TMP182]], [[TMP184]]
+; CHECK-NEXT: [[TMP192:%.*]] = add <2 x i32> [[TMP182]], [[TMP184]]
+; CHECK-NEXT: [[TMP194:%.*]] = shufflevector <2 x i32> [[TMP191]], <2 x i32> [[TMP192]], <2 x i32>
+; CHECK-NEXT: [[TMP195:%.*]] = lshr <2 x i32> [[TMP181]], splat (i32 15)
+; CHECK-NEXT: [[TMP196:%.*]] = and <2 x i32> [[TMP195]], splat (i32 65537)
+; CHECK-NEXT: [[TMP198:%.*]] = mul <2 x i32> [[TMP196]], splat (i32 65535)
+; CHECK-NEXT: [[TMP202:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55]], i32 0
+; CHECK-NEXT: [[TMP203:%.*]] = shufflevector <2 x i32> [[TMP202]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP205:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_1]], i32 0
+; CHECK-NEXT: [[TMP206:%.*]] = shufflevector <2 x i32> [[TMP205]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP207:%.*]] = sub <2 x i32> [[TMP203]], [[TMP206]]
+; CHECK-NEXT: [[TMP210:%.*]] = add <2 x i32> [[TMP203]], [[TMP206]]
+; CHECK-NEXT: [[TMP168:%.*]] = shufflevector <2 x i32> [[TMP207]], <2 x i32> [[TMP210]], <2 x i32>
+; CHECK-NEXT: [[ADD94_1:%.*]] = extractelement <2 x i32> [[TMP194]], i32 1
+; CHECK-NEXT: [[ADD78_1:%.*]] = extractelement <2 x i32> [[TMP168]], i32 1
 ; CHECK-NEXT: [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]]
-; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]]
+; CHECK-NEXT: [[TMP220:%.*]] = add <2 x i32> [[TMP194]], [[TMP168]]
+; CHECK-NEXT: [[SUB102_1:%.*]] = extractelement <2 x i32> [[TMP194]], i32 0
+; CHECK-NEXT: [[SUB86_1:%.*]] = extractelement <2 x i32> [[TMP168]], i32 0
+; CHECK-NEXT: [[TMP174:%.*]] = shufflevector <2 x i32> [[TMP168]], <2 x i32> [[TMP194]], <2 x i32>
 ; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]]
-; CHECK-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]]
-; CHECK-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[CONV9_2]]
-; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]]
-; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[CONV_2]]
+; CHECK-NEXT: [[TMP175:%.*]] = add <2 x i32> [[TMP198]], [[TMP220]]
+; CHECK-NEXT: [[TMP221:%.*]] = xor <2 x i32> [[TMP175]], [[TMP181]]
 ; CHECK-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]]
 ; CHECK-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP162]]
 ; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]]
 ; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP129]]
+; CHECK-NEXT: [[XOR_I53_1:%.*]] = extractelement <2 x i32> [[TMP221]], i32 0
 ; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD105_3]]
+; CHECK-NEXT: [[XOR_I_1:%.*]] = extractelement <2 x i32> [[TMP221]], i32 1
 ; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]]
 ; CHECK-NEXT: [[ADD112_5:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]]
 ; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_5]], [[XOR_I63_1]]
-- 
2.11.4.GIT
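
Note (not part of the patch): a minimal sketch, under stated assumptions, of how the new type-based path can be exercised. It builds a type-only IntrinsicCostAttributes for an @llvm.experimental.vp.strided.load call - so there is no underlying instruction and no pointer alignment available, which is why the patch falls back to the element type's ABI alignment - and then queries TTI for the cost. The function name costStridedLoad and the choice of <vscale x 2 x i64> are illustrative only, and constructor overloads may differ slightly between LLVM versions.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

// Illustrative only: query the cost of a scalable-vector strided load from
// types alone, the situation getTypeBasedIntrinsicInstrCost now handles.
static InstructionCost costStridedLoad(const TargetTransformInfo &TTI,
                                       LLVMContext &Ctx) {
  // <vscale x 2 x i64> @llvm.experimental.vp.strided.load(ptr, i64, <vscale x 2 x i1>, i32)
  Type *RetTy = ScalableVectorType::get(Type::getInt64Ty(Ctx), 2);
  Type *ArgTys[] = {PointerType::get(Ctx, 0),                         // base pointer
                    Type::getInt64Ty(Ctx),                            // stride
                    ScalableVectorType::get(Type::getInt1Ty(Ctx), 2), // mask
                    Type::getInt32Ty(Ctx)};                           // EVL
  IntrinsicCostAttributes Attrs(Intrinsic::experimental_vp_strided_load, RetTy,
                                ArgTys);
  // With this patch the query is forwarded to getStridedMemoryOpCost instead of
  // being scalarized (fixed vectors) or reported as Invalid (scalable vectors).
  return TTI.getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_RecipThroughput);
}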