From f5056c8c16bb732a83ec12776e01915af717917b Mon Sep 17 00:00:00 2001
From: Andrew Wei
Date: Mon, 18 Oct 2021 21:11:24 +0800
Subject: [PATCH] [AArch64] Improve shuffle vector by using wider types

Try to widen the element type to get a new mask value for a better permutation
sequence, so that we can use NEON shuffle instructions, such as ZIP1/2,
UZP1/2, TRN1/2, REV, INS, etc.
For example:
  shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
is equivalent to:
  shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
Finally, we can get:
  mov v0.d[0], v1.d[1]

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D111619
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp |  83 ++++++++++
 llvm/test/CodeGen/AArch64/concat-vector.ll      |   3 +-
 llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll | 179 +++++++++++++++++++++
 .../CodeGen/AArch64/sve-fixed-length-concat.ll  |   3 +-
 4 files changed, 264 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ca9ddfa08081..f63e4cf28fb6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9577,6 +9577,86 @@ static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
   return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
 }
 
+// Return true if we can get a new shuffle mask by checking the parameter mask
+// array to test whether every two adjacent mask values are consecutive and
+// start from an even number.
+static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
+                           SmallVectorImpl<int> &NewMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts % 2 != 0)
+    return false;
+
+  NewMask.clear();
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    int M0 = M[i];
+    int M1 = M[i + 1];
+
+    // If both elements are undef, new mask is undef too.
+    if (M0 == -1 && M1 == -1) {
+      NewMask.push_back(-1);
+      continue;
+    }
+
+    if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
+      NewMask.push_back(M1 / 2);
+      continue;
+    }
+
+    if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
+      NewMask.push_back(M0 / 2);
+      continue;
+    }
+
+    NewMask.clear();
+    return false;
+  }
+
+  assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
+  return true;
+}
+
+// Try to widen the element type to get a new mask value for a better
+// permutation sequence, so that we can use NEON shuffle instructions, such as
+// ZIP1/2, UZP1/2, TRN1/2, REV, INS, etc.
+// For example:
+//  shufflevector <4 x i32> %a, <4 x i32> %b,
+//                <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+// is equivalent to:
+//  shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
+// Finally, we can get:
+//  mov v0.d[0], v1.d[1]
+static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  EVT ScalarVT = VT.getVectorElementType();
+  unsigned ElementSize = ScalarVT.getFixedSizeInBits();
+  SDValue V0 = Op.getOperand(0);
+  SDValue V1 = Op.getOperand(1);
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+  // When combining adjacent elements, like two i16's -> i32 or two i32's -> i64,
+  // we need to make sure the wider element type is legal. Thus, ElementSize
+  // should not be larger than 32 bits, and the i1 type should also be excluded.
+  if (ElementSize > 32 || ElementSize == 1)
+    return SDValue();
+
+  SmallVector<int, 8> NewMask;
+  if (isWideTypeMask(Mask, VT, NewMask)) {
+    MVT NewEltVT = VT.isFloatingPoint()
+                       ? MVT::getFloatingPointVT(ElementSize * 2)
+                       : MVT::getIntegerVT(ElementSize * 2);
+    MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
+    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
+      V0 = DAG.getBitcast(NewVT, V0);
+      V1 = DAG.getBitcast(NewVT, V1);
+      return DAG.getBitcast(VT,
+                            DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
+    }
+  }
+
+  return SDValue();
+}
+
 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                    SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -9724,6 +9804,9 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                             DstLaneV);
   }
 
+  if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
+    return NewSD;
+
   // If the shuffle is not directly supported and it has 4 elements, use
   // the PerfectShuffle-generated table to synthesize it from other shuffles.
   unsigned NumElts = VT.getVectorNumElements();
diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll
index e6bf67f50911..690fb716771a 100644
--- a/llvm/test/CodeGen/AArch64/concat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector.ll
@@ -88,8 +88,7 @@ define <8 x i32> @concat8(<4 x i32>* %A, <4 x i32>* %B) {
 define <4 x half> @concat9(<2 x half> %A, <2 x half> %B) {
 ; CHECK-LABEL: concat9:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #4
-; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4
+; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: ret
 %v4half= shufflevector <2 x half> %A, <2 x half> %B, <4 x i32>
 ret <4 x half> %v4half
diff --git a/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll b/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll
new file mode 100644
index 000000000000..89e91944bc3d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll
@@ -0,0 +1,179 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <4 x half> @shuffle1(<2 x half> %a, <2 x half> %b) {
+; CHECK-LABEL: shuffle1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: zip1 v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: ret
+entry:
+  %res = shufflevector <2 x half> %a, <2 x half> %b, <4 x i32>
+  ret <4 x half> %res
+}
+
+define <4 x half> @shuffle2(<2 x half> %a, <2 x half> %b) {
+; CHECK-LABEL: shuffle2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: zip1 v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ret
+entry:
+  %res = shufflevector <2 x half> %a, <2 x half> %b, <4 x i32>
+  ret <4 x half> %res
+}
+
+define <4 x i32> @shuffle3(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: shuffle3:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.d[0], v1.d[1]
+; CHECK-NEXT: ret
+entry:
+  %res = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
+  ret <4 x i32> %res
+}
+
+define <4 x float> @shuffle4(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: shuffle4:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.d[1], v1.d[1]
+; CHECK-NEXT: ret
+entry:
+  %res = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>
+  ret <4 x float> %res
+}
+
+define <16 x i8> @shuffle5(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shuffle5:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+entry:
+  %res = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @shuffle6(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: shuffle6:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: trn1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+entry:
+  %res = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
+  ret <16 x i8> %res
+}
+
+define <8 x i8> @shuffle7(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: shuffle7:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uzp2 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
+entry:
+  %res = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+  ret <8 x i8> %res
+}
+
+define <8 x i8> @shuffle8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: shuffle8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: trn2 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
+entry:
+  %res = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+  ret <8 x i8> %res
+}
+
+; No blocks
+define <8 x i8> @shuffle9(<8 x i8> %a) {
+; CHECK-LABEL: shuffle9:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev32 v0.4h, v0.4h
+; CHECK-NEXT: ret
+  %res = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32>
+  ret <8 x i8> %res
+}
+
+define <8 x i16> @shuffle10(<8 x i16> %a) {
+; CHECK-LABEL: shuffle10:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev64 v0.4s, v0.4s
+; CHECK-NEXT: ret
+  %res = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32>
+  ret <8 x i16> %res
+}
+
+define <4 x i16> @shuffle11(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shuffle11:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v1.s[1], v0.s[0]
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
+entry:
+  %res = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32>
+  ret <4 x i16> %res
+}
+
+define <8 x i8> @shuffle12(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: shuffle12:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: trn2 v0.4h, v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+  %res = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
+  ret <8 x i8> %res
+}
+
+define <8 x i16> @shuffle_widen_faili1(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: shuffle_widen_faili1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: rev32 v2.4h, v0.4h
+; CHECK-NEXT: rev32 v3.4h, v1.4h
+; CHECK-NEXT: ext v1.8b, v2.8b, v1.8b, #4
+; CHECK-NEXT: ext v0.8b, v3.8b, v0.8b, #4
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ret
+entry:
+  %res = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32>
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @shuffle_widen_fail2(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: shuffle_widen_fail2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uzp1 v2.4h, v0.4h, v0.4h
+; CHECK-NEXT: trn1 v3.4h, v1.4h, v1.4h
+; CHECK-NEXT: ext v1.8b, v2.8b, v1.8b, #4
+; CHECK-NEXT: ext v0.8b, v3.8b, v0.8b, #4
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: ret
+entry:
+  %res = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32>
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @shuffle_widen_fail3(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: shuffle_widen_fail3:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: adrp x8, .LCPI14_0
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_0]
+; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-NEXT: ret
+entry:
+  %res = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
+  ret <8 x i16> %res
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
index 152a7e46aa9b..becdc912d1fb 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
@@ -460,8 +460,7 @@ define void @concat_v32i64(<16 x i64>* %a, <16 x i64>* %b, <32 x i64>* %c) #0 {
 ; Don't use SVE for 64-bit vectors.
 define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) #0 {
 ; CHECK-LABEL: concat_v4f16:
-; CHECK: ext v0.8b, v0.8b, v0.8b, #4
-; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4
+; CHECK: zip1 v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: ret
 %res = shufflevector <2 x half> %op1, <2 x half> %op2, <4 x i32>
 ret <4 x half> %res
-- 
2.11.4.GIT
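
As a companion to the patch, the following is a minimal standalone C++ sketch of the mask-widening rule implemented by isWideTypeMask/tryWidenMaskForShuffle above. It is illustrative only: it uses plain std::vector instead of LLVM's ArrayRef/SmallVectorImpl, the function name widenShuffleMask is hypothetical, and the example mask is the one implied by the "mov v0.d[0], v1.d[1]" lowering shown in the commit message.

#include <cstdio>
#include <vector>

// Standalone model of the mask-widening check: each adjacent pair of lanes
// (M[i], M[i+1]) must form a consecutive (even, odd) pair, possibly with
// undef lanes (-1), for the shuffle to be expressible on elements twice as wide.
static bool widenShuffleMask(const std::vector<int> &M,
                             std::vector<int> &NewMask) {
  if (M.size() % 2 != 0)
    return false;

  NewMask.clear();
  for (size_t i = 0; i < M.size(); i += 2) {
    int M0 = M[i];
    int M1 = M[i + 1];

    if (M0 == -1 && M1 == -1) {
      NewMask.push_back(-1);        // both lanes undef -> wide lane undef
    } else if (M0 == -1 && (M1 % 2) == 1) {
      NewMask.push_back(M1 / 2);    // only the odd half is defined
    } else if (M0 != -1 && (M0 % 2) == 0 && (M1 == M0 + 1 || M1 == -1)) {
      NewMask.push_back(M0 / 2);    // even half defined, odd half matches or is undef
    } else {
      NewMask.clear();
      return false;                 // pair cannot be expressed on wider elements
    }
  }
  return true;
}

int main() {
  // The commit-message example: a <4 x i32> mask that widens to a <2 x i64>
  // mask, which the backend can then lower as "mov v0.d[0], v1.d[1]".
  std::vector<int> Mask = {6, 7, 2, 3};
  std::vector<int> Wide;
  if (widenShuffleMask(Mask, Wide))
    std::printf("widened mask: {%d, %d}\n", Wide[0], Wide[1]);
  return 0;
}

Compiling and running this sketch prints "widened mask: {3, 1}", i.e. the <2 x i64> mask from the commit-message example; masks whose pairs are not consecutive (such as the shuffle_widen_fail* tests) are rejected and fall through to the existing lowering paths.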