From d460c1de3b989cea919b9d60c21644f28f987950 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 8 Jan 2024 18:01:41 +0000 Subject: [PATCH] [DAG] SimplifyDemandedBits - don't fold sext(x) -> aext(x) if we lose an 0/-1 allsignbits mask (#77296) For targets that use 0/-1 boolean results, we want to keep this pattern through extensions/truncations as much as possible - so avoid simplifying to any_extend even if we don't demand the upper bits. Noticed in triage for https://reviews.llvm.org/D152928 --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 27 ++++++++++++++---------- llvm/test/CodeGen/AArch64/arm64-zip.ll | 2 +- llvm/test/CodeGen/AArch64/vselect-ext.ll | 14 ++++++------ llvm/test/CodeGen/SystemZ/vec-perm-14.ll | 8 +++---- llvm/test/CodeGen/X86/test-shrink-bug.ll | 4 ++-- llvm/test/CodeGen/X86/vec_setcc.ll | 2 +- 6 files changed, 30 insertions(+), 27 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index f8400e8e94df..e3e3e375d6a6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -2444,6 +2444,13 @@ bool TargetLowering::SimplifyDemandedBits( unsigned InElts = SrcVT.isFixedLengthVector() ? SrcVT.getVectorNumElements() : 1; bool IsVecInReg = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG; + APInt InDemandedElts = DemandedElts.zext(InElts); + APInt InDemandedBits = DemandedBits.trunc(InBits); + + // Since some of the sign extended bits are demanded, we know that the sign + // bit is demanded. + InDemandedBits.setBit(InBits - 1); + // If none of the top bits are demanded, convert this into an any_extend. if (DemandedBits.getActiveBits() <= InBits) { // If we only need the non-extended bits of the bottom element @@ -2452,19 +2459,17 @@ bool TargetLowering::SimplifyDemandedBits( VT.getSizeInBits() == SrcVT.getSizeInBits()) return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src)); - unsigned Opc = - IsVecInReg ? ISD::ANY_EXTEND_VECTOR_INREG : ISD::ANY_EXTEND; - if (!TLO.LegalOperations() || isOperationLegal(Opc, VT)) - return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT, Src)); + // Don't lose an all signbits 0/-1 splat on targets with 0/-1 booleans. + if (getBooleanContents(VT) != ZeroOrNegativeOneBooleanContent || + TLO.DAG.ComputeNumSignBits(Src, InDemandedElts, Depth + 1) != + InBits) { + unsigned Opc = + IsVecInReg ? ISD::ANY_EXTEND_VECTOR_INREG : ISD::ANY_EXTEND; + if (!TLO.LegalOperations() || isOperationLegal(Opc, VT)) + return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT, Src)); + } } - APInt InDemandedBits = DemandedBits.trunc(InBits); - APInt InDemandedElts = DemandedElts.zext(InElts); - - // Since some of the sign extended bits are demanded, we know that the sign - // bit is demanded. - InDemandedBits.setBit(InBits - 1); - if (SimplifyDemandedBits(Src, InDemandedBits, InDemandedElts, Known, TLO, Depth + 1)) return true; diff --git a/llvm/test/CodeGen/AArch64/arm64-zip.ll b/llvm/test/CodeGen/AArch64/arm64-zip.ll index e22b57c8af44..c6e3c3540f6e 100644 --- a/llvm/test/CodeGen/AArch64/arm64-zip.ll +++ b/llvm/test/CodeGen/AArch64/arm64-zip.ll @@ -328,7 +328,7 @@ define <4 x i32> @shuffle_zip3(<4 x i32> %arg) { ; CHECK-NEXT: zip2.4h v0, v0, v1 ; CHECK-NEXT: movi.4s v1, #1 ; CHECK-NEXT: zip1.4h v0, v0, v0 -; CHECK-NEXT: ushll.4s v0, v0, #0 +; CHECK-NEXT: sshll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 ; CHECK-NEXT: ret bb: diff --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll index b80955665c74..0b90343a40c8 100644 --- a/llvm/test/CodeGen/AArch64/vselect-ext.ll +++ b/llvm/test/CodeGen/AArch64/vselect-ext.ll @@ -219,17 +219,17 @@ define <3 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v3i16(<3 x i8> ; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: Lloh0: ; CHECK-NEXT: adrp x8, lCPI9_0@PAGE +; CHECK-NEXT: movi.2d v3, #0x0000ff000000ff ; CHECK-NEXT: Lloh1: ; CHECK-NEXT: ldr d2, [x8, lCPI9_0@PAGEOFF] ; CHECK-NEXT: mov.h v0[1], w1 ; CHECK-NEXT: mov.h v0[2], w2 -; CHECK-NEXT: fmov d1, d0 -; CHECK-NEXT: bic.4h v1, #255, lsl #8 -; CHECK-NEXT: cmhi.4h v1, v1, v2 -; CHECK-NEXT: and.8b v0, v0, v1 -; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: ushll.4s v0, v0, #0 -; CHECK-NEXT: and.16b v0, v0, v1 +; CHECK-NEXT: ushll.4s v1, v0, #0 +; CHECK-NEXT: bic.4h v0, #255, lsl #8 +; CHECK-NEXT: cmhi.4h v0, v0, v2 +; CHECK-NEXT: and.16b v1, v1, v3 +; CHECK-NEXT: sshll.4s v0, v0, #0 +; CHECK-NEXT: and.16b v0, v1, v0 ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1 %ext = zext <3 x i8> %a to <3 x i32> diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-14.ll b/llvm/test/CodeGen/SystemZ/vec-perm-14.ll index fb3ece96017b..0b392676fa3e 100644 --- a/llvm/test/CodeGen/SystemZ/vec-perm-14.ll +++ b/llvm/test/CodeGen/SystemZ/vec-perm-14.ll @@ -1,16 +1,14 @@ ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s -; -; Test that only one vperm of the vector compare is needed for both extracts. +; Test that no vperm of the vector compare is needed for the extracts. define void @fun() { ; CHECK-LABEL: fun: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: vlrepf %v0, 0(%r1) ; CHECK-NEXT: vgbm %v1, 0 -; CHECK-NEXT: larl %r1, .LCPI0_0 ; CHECK-NEXT: vceqb %v0, %v0, %v1 -; CHECK-NEXT: vl %v1, 0(%r1), 3 -; CHECK-NEXT: vperm %v0, %v0, %v0, %v1 +; CHECK-NEXT: vuphb %v0, %v0 +; CHECK-NEXT: vuphh %v0, %v0 ; CHECK-NEXT: vlgvf %r0, %v0, 0 ; CHECK-NEXT: tmll %r0, 1 ; CHECK-NEXT: je .LBB0_2 diff --git a/llvm/test/CodeGen/X86/test-shrink-bug.ll b/llvm/test/CodeGen/X86/test-shrink-bug.ll index f05459f751bc..51a00d211421 100644 --- a/llvm/test/CodeGen/X86/test-shrink-bug.ll +++ b/llvm/test/CodeGen/X86/test-shrink-bug.ll @@ -68,8 +68,8 @@ define dso_local void @fail(i16 %a, <2 x i8> %b) { ; CHECK-X64-NEXT: je .LBB1_3 ; CHECK-X64-NEXT: # %bb.1: ; CHECK-X64-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8] -; CHECK-X64-NEXT: pextrw $4, %xmm0, %eax +; CHECK-X64-NEXT: pslld $8, %xmm0 +; CHECK-X64-NEXT: pextrw $1, %xmm0, %eax ; CHECK-X64-NEXT: testb $1, %al ; CHECK-X64-NEXT: jne .LBB1_3 ; CHECK-X64-NEXT: # %bb.2: # %no diff --git a/llvm/test/CodeGen/X86/vec_setcc.ll b/llvm/test/CodeGen/X86/vec_setcc.ll index e7232a34f471..87e29261eaa4 100644 --- a/llvm/test/CodeGen/X86/vec_setcc.ll +++ b/llvm/test/CodeGen/X86/vec_setcc.ll @@ -308,9 +308,9 @@ define <3 x i1> @test_setcc_v3i1_v3i16(ptr %a) nounwind { ; SSE2-LABEL: test_setcc_v3i1_v3i16: ; SSE2: # %bb.0: ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqw %xmm0, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -- 2.11.4.GIT