From a5b2c393bfb9a072be44b756646a007da8938162 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 9 Sep 2019 16:35:49 +0000 Subject: [PATCH] [ARM] Fix loads and stores for predicate vectors These predicate vectors can usually be loaded and stored with a single instruction, a VSTR_P0. However this instruction will store the entire P0 predicate, 16 bits, zero-extended to 32 bits. Each lane of the v4i1/v8i1/v16i1 represents 4/2/1 bits. As far as I understand, when llvm says "store this v4i1", it really does need to store 4 bits (or 8, that being the size of a byte, with the bottom 4 as the interesting bits). For example a bitcast from a v8i1 to an i8 is defined as a store followed by a load, which is how the code is expanded. So this instead lowers the v4i1/v8i1 load/store through some shuffles to get the bits into the correct positions. This, as you might imagine, is not as efficient as a single instruction. But I believe it is needed for correctness. v16i1 equally should not load/store 32 bits, only storing the 16 bits of data. Stack loads/stores are still using the VSTR_P0 (as can be seen by the test not changing). This is fine as they are self-consistent; it is only "externally observable loads/stores" (from our point of view) that need to be corrected. 
Differential revision: https://reviews.llvm.org/D67085 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@371419 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMISelLowering.cpp | 65 ++ lib/Target/ARM/ARMInstrMVE.td | 18 - test/CodeGen/Thumb2/mve-masked-ldst.ll | 626 +++++++++++--- test/CodeGen/Thumb2/mve-masked-load.ll | 1310 ++++++++++++++++++++++++----- test/CodeGen/Thumb2/mve-masked-store.ll | 1180 +++++++++++++++++++------- test/CodeGen/Thumb2/mve-pred-bitcast.ll | 605 +++++++++---- test/CodeGen/Thumb2/mve-pred-loadstore.ll | 178 +++- 7 files changed, 3149 insertions(+), 833 deletions(-) rewrite test/CodeGen/Thumb2/mve-pred-bitcast.ll (68%) diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 9221e913f4c..cedd8981d57 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -378,6 +378,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); } } @@ -8783,6 +8785,65 @@ void ARMTargetLowering::ExpandDIV_Windows( Results.push_back(Upper); } +static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { + LoadSDNode *LD = cast(Op.getNode()); + EVT MemVT = LD->getMemoryVT(); + assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && + "Expected a predicate type!"); + assert(MemVT == Op.getValueType()); + assert(LD->getExtensionType() == ISD::NON_EXTLOAD && + "Expected a non-extending load"); + assert(LD->isUnindexed() && "Expected a unindexed load"); + + // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit + // predicate, with the "v4i1" bits spread out over the 16 bits loaded. 
We + // need to make sure that 8/4 bits are actually loaded into the correct + // place, which means loading the value and then shuffling the values into + // the bottom bits of the predicate. + // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect + // for BE). + + SDLoc dl(Op); + SDValue Load = DAG.getExtLoad( + ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(), + EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), + LD->getMemOperand()); + SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load); + if (MemVT != MVT::v16i1) + Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred, + DAG.getConstant(0, dl, MVT::i32)); + return DAG.getMergeValues({Pred, Load.getValue(1)}, dl); +} + +static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { + StoreSDNode *ST = cast(Op.getNode()); + EVT MemVT = ST->getMemoryVT(); + assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && + "Expected a predicate type!"); + assert(MemVT == ST->getValue().getValueType()); + assert(!ST->isTruncatingStore() && "Expected a non-extending store"); + assert(ST->isUnindexed() && "Expected a unindexed store"); + + // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits + // unset and a scalar store. 
+ SDLoc dl(Op); + SDValue Build = ST->getValue(); + if (MemVT != MVT::v16i1) { + SmallVector Ops; + for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) + Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build, + DAG.getConstant(I, dl, MVT::i32))); + for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++) + Ops.push_back(DAG.getUNDEF(MVT::i32)); + Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops); + } + SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build); + return DAG.getTruncStore( + ST->getChain(), dl, GRP, ST->getBasePtr(), + EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), + ST->getMemOperand()); +} + static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { if (isStrongerThanMonotonic(cast(Op)->getOrdering())) // Acquire/Release load/store is not legal for targets without a dmb or @@ -8982,6 +9043,10 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::UADDO: case ISD::USUBO: return LowerUnsignedALUO(Op, DAG); + case ISD::LOAD: + return LowerPredicateLoad(Op, DAG); + case ISD::STORE: + return LowerPredicateStore(Op, DAG); case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); diff --git a/lib/Target/ARM/ARMInstrMVE.td b/lib/Target/ARM/ARMInstrMVE.td index 68f6a72b280..594f928848a 100644 --- a/lib/Target/ARM/ARMInstrMVE.td +++ b/lib/Target/ARM/ARMInstrMVE.td @@ -4999,24 +4999,6 @@ let Predicates = [HasMVEInt, IsBE] in { def : MVE_vector_offset_store_typed; } -let Predicates = [HasMVEInt] in { - // Predicate loads - def : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)), - (v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>; - def : Pat<(v8i1 (load t2addrmode_imm7<2>:$addr)), - (v8i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>; - def : Pat<(v4i1 (load t2addrmode_imm7<2>:$addr)), - (v4i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>; - - // Predicate stores - def : Pat<(store (v4i1 
VCCR:$val), t2addrmode_imm7<2>:$addr), - (VSTR_P0_off VCCR:$val, t2addrmode_imm7<2>:$addr)>; - def : Pat<(store (v8i1 VCCR:$val), t2addrmode_imm7<2>:$addr), - (VSTR_P0_off VCCR:$val, t2addrmode_imm7<2>:$addr)>; - def : Pat<(store (v16i1 VCCR:$val), t2addrmode_imm7<2>:$addr), - (VSTR_P0_off VCCR:$val, t2addrmode_imm7<2>:$addr)>; -} - // Widening/Narrowing Loads/Stores diff --git a/test/CodeGen/Thumb2/mve-masked-ldst.ll b/test/CodeGen/Thumb2/mve-masked-ldst.ll index 59518c85fde..f7d6a3f3799 100644 --- a/test/CodeGen/Thumb2/mve-masked-ldst.ll +++ b/test/CodeGen/Thumb2/mve-masked-ldst.ll @@ -8,11 +8,23 @@ define void @foo_v4i32_v4i32(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i32> *%src ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: add r3, sp, #4 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vcmp.s32 gt, q0, zr ; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp, #4] +; CHECK-NEXT: vmrs r12, p0 +; CHECK-NEXT: and r1, r12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #0, #1 +; CHECK-NEXT: ubfx r1, r12, #4, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #1, #1 +; CHECK-NEXT: ubfx r1, r12, #8, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #2, #1 +; CHECK-NEXT: ubfx r1, r12, #12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #3, #1 +; CHECK-NEXT: and r1, r3, #15 ; CHECK-NEXT: lsls r3, r1, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: ldrne r3, [r2] @@ -29,9 +41,21 @@ define void @foo_v4i32_v4i32(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i32> *%src ; CHECK-NEXT: itt mi ; CHECK-NEXT: ldrmi r1, [r2, #12] ; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: mov r1, sp -; CHECK-NEXT: vstr p0, [r1] -; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: and r3, r2, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #0, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; 
CHECK-NEXT: bfi r1, r3, #1, #1 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #2, #1 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: bfi r1, r2, #3, #1 +; CHECK-NEXT: and r1, r1, #15 ; CHECK-NEXT: lsls r2, r1, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: vmovne r2, s0 @@ -64,11 +88,23 @@ define void @foo_sext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *% ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: add r3, sp, #4 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vcmp.s32 gt, q0, zr ; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp, #4] +; CHECK-NEXT: vmrs r12, p0 +; CHECK-NEXT: and r1, r12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #0, #1 +; CHECK-NEXT: ubfx r1, r12, #4, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #1, #1 +; CHECK-NEXT: ubfx r1, r12, #8, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #2, #1 +; CHECK-NEXT: ubfx r1, r12, #12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #3, #1 +; CHECK-NEXT: and r1, r3, #15 ; CHECK-NEXT: lsls r3, r1, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: ldrbne r3, [r2] @@ -85,11 +121,23 @@ define void @foo_sext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *% ; CHECK-NEXT: itt mi ; CHECK-NEXT: ldrbmi r1, [r2, #3] ; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vstr p0, [r1] ; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: and r3, r2, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #0, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #1, #1 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #2, #1 +; CHECK-NEXT: rsbs 
r2, r2, #0 +; CHECK-NEXT: bfi r1, r2, #3, #1 +; CHECK-NEXT: and r1, r1, #15 ; CHECK-NEXT: lsls r2, r1, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: vmovne r2, s0 @@ -123,11 +171,23 @@ define void @foo_sext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: add r3, sp, #4 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vcmp.s32 gt, q0, zr ; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp, #4] +; CHECK-NEXT: vmrs r12, p0 +; CHECK-NEXT: and r1, r12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #0, #1 +; CHECK-NEXT: ubfx r1, r12, #4, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #1, #1 +; CHECK-NEXT: ubfx r1, r12, #8, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #2, #1 +; CHECK-NEXT: ubfx r1, r12, #12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #3, #1 +; CHECK-NEXT: and r1, r3, #15 ; CHECK-NEXT: lsls r3, r1, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: ldrhne r3, [r2] @@ -144,10 +204,22 @@ define void @foo_sext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> ; CHECK-NEXT: itt mi ; CHECK-NEXT: ldrhmi r1, [r2, #6] ; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vstr p0, [r1] -; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: and r3, r2, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #0, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #1, #1 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #2, #1 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: bfi r1, r2, #3, #1 +; CHECK-NEXT: and r1, r1, #15 ; CHECK-NEXT: lsls r2, r1, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: vmovne r2, s0 @@ -181,12 +253,24 @@ define void @foo_zext_v4i32_v4i8(<4 x 
i32> *%dest, <4 x i32> *%mask, <4 x i8> *% ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: add r3, sp, #4 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vmov.i32 q1, #0xff ; CHECK-NEXT: vcmp.s32 gt, q0, zr ; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp, #4] +; CHECK-NEXT: vmrs r12, p0 +; CHECK-NEXT: and r1, r12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #0, #1 +; CHECK-NEXT: ubfx r1, r12, #4, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #1, #1 +; CHECK-NEXT: ubfx r1, r12, #8, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #2, #1 +; CHECK-NEXT: ubfx r1, r12, #12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #3, #1 +; CHECK-NEXT: and r1, r3, #15 ; CHECK-NEXT: lsls r3, r1, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: ldrbne r3, [r2] @@ -203,10 +287,22 @@ define void @foo_zext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *% ; CHECK-NEXT: itt mi ; CHECK-NEXT: ldrbmi r1, [r2, #3] ; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vstr p0, [r1] -; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: and r3, r2, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #0, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #1, #1 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #2, #1 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: bfi r1, r2, #3, #1 +; CHECK-NEXT: and r1, r1, #15 ; CHECK-NEXT: lsls r2, r1, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: vmovne r2, s0 @@ -240,11 +336,23 @@ define void @foo_zext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: add r3, sp, #4 +; CHECK-NEXT: movs r3, #0 ; 
CHECK-NEXT: vcmp.s32 gt, q0, zr ; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp, #4] +; CHECK-NEXT: vmrs r12, p0 +; CHECK-NEXT: and r1, r12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #0, #1 +; CHECK-NEXT: ubfx r1, r12, #4, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #1, #1 +; CHECK-NEXT: ubfx r1, r12, #8, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #2, #1 +; CHECK-NEXT: ubfx r1, r12, #12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #3, #1 +; CHECK-NEXT: and r1, r3, #15 ; CHECK-NEXT: lsls r3, r1, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: ldrhne r3, [r2] @@ -261,10 +369,22 @@ define void @foo_zext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> ; CHECK-NEXT: itt mi ; CHECK-NEXT: ldrhmi r1, [r2, #6] ; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vmovlb.u16 q0, q0 -; CHECK-NEXT: vstr p0, [r1] -; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: and r3, r2, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #0, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #1, #1 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #2, #1 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: bfi r1, r2, #3, #1 +; CHECK-NEXT: and r1, r1, #15 ; CHECK-NEXT: lsls r2, r1, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: vmovne r2, s0 @@ -298,12 +418,36 @@ define void @foo_v8i16_v8i16(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i16> *%src ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vldrh.u16 q0, [r1] -; CHECK-NEXT: add r3, sp, #8 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vcmp.s16 gt, q0, zr ; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp, #8] -; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: vmrs r12, p0 +; CHECK-NEXT: and r1, 
r12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #0, #1 +; CHECK-NEXT: ubfx r1, r12, #2, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #1, #1 +; CHECK-NEXT: ubfx r1, r12, #4, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #2, #1 +; CHECK-NEXT: ubfx r1, r12, #6, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #3, #1 +; CHECK-NEXT: ubfx r1, r12, #8, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #4, #1 +; CHECK-NEXT: ubfx r1, r12, #10, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #5, #1 +; CHECK-NEXT: ubfx r1, r12, #12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #6, #1 +; CHECK-NEXT: ubfx r1, r12, #14, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #7, #1 +; CHECK-NEXT: uxtb r1, r3 +; CHECK-NEXT: lsls r3, r3, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: ldrhne r3, [r2] ; CHECK-NEXT: vmovne.16 q0[0], r3 @@ -335,10 +479,34 @@ define void @foo_v8i16_v8i16(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i16> *%src ; CHECK-NEXT: itt mi ; CHECK-NEXT: ldrhmi r1, [r2, #14] ; CHECK-NEXT: vmovmi.16 q0[7], r1 -; CHECK-NEXT: mov r1, sp -; CHECK-NEXT: vstr p0, [r1] -; CHECK-NEXT: ldrb.w r1, [sp] -; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmrs r1, p0 +; CHECK-NEXT: and r3, r1, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #0, #1 +; CHECK-NEXT: ubfx r3, r1, #2, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #1, #1 +; CHECK-NEXT: ubfx r3, r1, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #2, #1 +; CHECK-NEXT: ubfx r3, r1, #6, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #3, #1 +; CHECK-NEXT: ubfx r3, r1, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #4, #1 +; CHECK-NEXT: ubfx r3, r1, #10, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #5, #1 +; CHECK-NEXT: ubfx r3, r1, #12, #1 +; CHECK-NEXT: ubfx r1, r1, #14, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; 
CHECK-NEXT: bfi r2, r3, #6, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r2, r1, #7, #1 +; CHECK-NEXT: uxtb r1, r2 +; CHECK-NEXT: lsls r2, r2, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: vmovne.u16 r2, q0[0] ; CHECK-NEXT: strhne r2, [r0] @@ -386,12 +554,36 @@ define void @foo_sext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *% ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vldrh.u16 q0, [r1] -; CHECK-NEXT: add r3, sp, #8 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vcmp.s16 gt, q0, zr ; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp, #8] -; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: vmrs r12, p0 +; CHECK-NEXT: and r1, r12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #0, #1 +; CHECK-NEXT: ubfx r1, r12, #2, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #1, #1 +; CHECK-NEXT: ubfx r1, r12, #4, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #2, #1 +; CHECK-NEXT: ubfx r1, r12, #6, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #3, #1 +; CHECK-NEXT: ubfx r1, r12, #8, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #4, #1 +; CHECK-NEXT: ubfx r1, r12, #10, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #5, #1 +; CHECK-NEXT: ubfx r1, r12, #12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #6, #1 +; CHECK-NEXT: ubfx r1, r12, #14, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #7, #1 +; CHECK-NEXT: uxtb r1, r3 +; CHECK-NEXT: lsls r3, r3, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: ldrbne r3, [r2] ; CHECK-NEXT: vmovne.16 q0[0], r3 @@ -423,11 +615,35 @@ define void @foo_sext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *% ; CHECK-NEXT: itt mi ; CHECK-NEXT: ldrbmi r1, [r2, #7] ; CHECK-NEXT: vmovmi.16 q0[7], r1 -; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmrs r1, p0 ; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vstr p0, [r1] -; CHECK-NEXT: ldrb.w r1, [sp] -; 
CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: and r3, r1, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #0, #1 +; CHECK-NEXT: ubfx r3, r1, #2, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #1, #1 +; CHECK-NEXT: ubfx r3, r1, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #2, #1 +; CHECK-NEXT: ubfx r3, r1, #6, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #3, #1 +; CHECK-NEXT: ubfx r3, r1, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #4, #1 +; CHECK-NEXT: ubfx r3, r1, #10, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #5, #1 +; CHECK-NEXT: ubfx r3, r1, #12, #1 +; CHECK-NEXT: ubfx r1, r1, #14, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #6, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r2, r1, #7, #1 +; CHECK-NEXT: uxtb r1, r2 +; CHECK-NEXT: lsls r2, r2, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: vmovne.u16 r2, q0[0] ; CHECK-NEXT: strhne r2, [r0] @@ -476,12 +692,36 @@ define void @foo_zext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *% ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vldrh.u16 q0, [r1] -; CHECK-NEXT: add r3, sp, #8 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vcmp.s16 gt, q0, zr ; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp, #8] -; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: vmrs r12, p0 +; CHECK-NEXT: and r1, r12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #0, #1 +; CHECK-NEXT: ubfx r1, r12, #2, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #1, #1 +; CHECK-NEXT: ubfx r1, r12, #4, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #2, #1 +; CHECK-NEXT: ubfx r1, r12, #6, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #3, #1 +; CHECK-NEXT: ubfx r1, r12, #8, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #4, #1 +; CHECK-NEXT: ubfx r1, r12, #10, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, 
r1, #5, #1 +; CHECK-NEXT: ubfx r1, r12, #12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #6, #1 +; CHECK-NEXT: ubfx r1, r12, #14, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #7, #1 +; CHECK-NEXT: uxtb r1, r3 +; CHECK-NEXT: lsls r3, r3, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: ldrbne r3, [r2] ; CHECK-NEXT: vmovne.16 q0[0], r3 @@ -513,11 +753,35 @@ define void @foo_zext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *% ; CHECK-NEXT: itt mi ; CHECK-NEXT: ldrbmi r1, [r2, #7] ; CHECK-NEXT: vmovmi.16 q0[7], r1 -; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmrs r1, p0 ; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vstr p0, [r1] -; CHECK-NEXT: ldrb.w r1, [sp] -; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: and r3, r1, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #0, #1 +; CHECK-NEXT: ubfx r3, r1, #2, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #1, #1 +; CHECK-NEXT: ubfx r3, r1, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #2, #1 +; CHECK-NEXT: ubfx r3, r1, #6, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #3, #1 +; CHECK-NEXT: ubfx r3, r1, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #4, #1 +; CHECK-NEXT: ubfx r3, r1, #10, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #5, #1 +; CHECK-NEXT: ubfx r3, r1, #12, #1 +; CHECK-NEXT: ubfx r1, r1, #14, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #6, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r2, r1, #7, #1 +; CHECK-NEXT: uxtb r1, r2 +; CHECK-NEXT: lsls r2, r2, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: vmovne.u16 r2, q0[0] ; CHECK-NEXT: strhne r2, [r0] @@ -573,13 +837,12 @@ define void @foo_v16i8_v16i8(<16 x i8> *%dest, <16 x i8> *%mask, <16 x i8> *%src ; CHECK-NEXT: bfc r4, #0, #4 ; CHECK-NEXT: mov sp, r4 ; CHECK-NEXT: vldrb.u8 q0, [r1] -; CHECK-NEXT: add r3, sp, #16 ; CHECK-NEXT: sub.w r4, r7, #8 ; CHECK-NEXT: vcmp.s8 gt, q0, zr ; CHECK-NEXT: @ 
implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrh.w r1, [sp, #16] -; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: vmrs r3, p0 +; CHECK-NEXT: uxth r1, r3 +; CHECK-NEXT: lsls r3, r3, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: ldrbne r3, [r2] ; CHECK-NEXT: vmovne.8 q0[0], r3 @@ -643,10 +906,9 @@ define void @foo_v16i8_v16i8(<16 x i8> *%dest, <16 x i8> *%mask, <16 x i8> *%src ; CHECK-NEXT: itt mi ; CHECK-NEXT: ldrbmi r1, [r2, #15] ; CHECK-NEXT: vmovmi.8 q0[15], r1 -; CHECK-NEXT: mov r1, sp -; CHECK-NEXT: vstr p0, [r1] -; CHECK-NEXT: ldrh.w r1, [sp] -; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: uxth r1, r2 +; CHECK-NEXT: lsls r2, r2, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: vmovne.u8 r2, q0[0] ; CHECK-NEXT: strbne r2, [r0] @@ -726,12 +988,36 @@ define void @foo_trunc_v8i8_v8i16(<8 x i8> *%dest, <8 x i16> *%mask, <8 x i16> * ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vldrh.u16 q0, [r1] -; CHECK-NEXT: add r3, sp, #8 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vcmp.s16 gt, q0, zr ; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp, #8] -; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: vmrs r12, p0 +; CHECK-NEXT: and r1, r12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #0, #1 +; CHECK-NEXT: ubfx r1, r12, #2, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #1, #1 +; CHECK-NEXT: ubfx r1, r12, #4, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #2, #1 +; CHECK-NEXT: ubfx r1, r12, #6, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #3, #1 +; CHECK-NEXT: ubfx r1, r12, #8, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #4, #1 +; CHECK-NEXT: ubfx r1, r12, #10, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #5, #1 +; CHECK-NEXT: ubfx r1, r12, #12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #6, #1 +; CHECK-NEXT: ubfx r1, r12, #14, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, 
#7, #1 +; CHECK-NEXT: uxtb r1, r3 +; CHECK-NEXT: lsls r3, r3, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: ldrhne r3, [r2] ; CHECK-NEXT: vmovne.16 q0[0], r3 @@ -763,10 +1049,34 @@ define void @foo_trunc_v8i8_v8i16(<8 x i8> *%dest, <8 x i16> *%mask, <8 x i16> * ; CHECK-NEXT: itt mi ; CHECK-NEXT: ldrhmi r1, [r2, #14] ; CHECK-NEXT: vmovmi.16 q0[7], r1 -; CHECK-NEXT: mov r1, sp -; CHECK-NEXT: vstr p0, [r1] -; CHECK-NEXT: ldrb.w r1, [sp] -; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmrs r1, p0 +; CHECK-NEXT: and r3, r1, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #0, #1 +; CHECK-NEXT: ubfx r3, r1, #2, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #1, #1 +; CHECK-NEXT: ubfx r3, r1, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #2, #1 +; CHECK-NEXT: ubfx r3, r1, #6, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #3, #1 +; CHECK-NEXT: ubfx r3, r1, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #4, #1 +; CHECK-NEXT: ubfx r3, r1, #10, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #5, #1 +; CHECK-NEXT: ubfx r3, r1, #12, #1 +; CHECK-NEXT: ubfx r1, r1, #14, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #6, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r2, r1, #7, #1 +; CHECK-NEXT: uxtb r1, r2 +; CHECK-NEXT: lsls r2, r2, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: vmovne.u16 r2, q0[0] ; CHECK-NEXT: strbne r2, [r0] @@ -815,11 +1125,23 @@ define void @foo_trunc_v4i8_v4i32(<4 x i8> *%dest, <4 x i32> *%mask, <4 x i32> * ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: add r3, sp, #4 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vcmp.s32 gt, q0, zr ; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp, #4] +; CHECK-NEXT: vmrs r12, p0 +; CHECK-NEXT: and r1, r12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #0, #1 +; CHECK-NEXT: ubfx r1, r12, #4, #1 +; 
CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #1, #1 +; CHECK-NEXT: ubfx r1, r12, #8, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #2, #1 +; CHECK-NEXT: ubfx r1, r12, #12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #3, #1 +; CHECK-NEXT: and r1, r3, #15 ; CHECK-NEXT: lsls r3, r1, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: ldrne r3, [r2] @@ -836,9 +1158,21 @@ define void @foo_trunc_v4i8_v4i32(<4 x i8> *%dest, <4 x i32> *%mask, <4 x i32> * ; CHECK-NEXT: itt mi ; CHECK-NEXT: ldrmi r1, [r2, #12] ; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: mov r1, sp -; CHECK-NEXT: vstr p0, [r1] -; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: and r3, r2, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #0, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #1, #1 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #2, #1 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: bfi r1, r2, #3, #1 +; CHECK-NEXT: and r1, r1, #15 ; CHECK-NEXT: lsls r2, r1, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: vmovne r2, s0 @@ -872,11 +1206,23 @@ define void @foo_trunc_v4i16_v4i32(<4 x i16> *%dest, <4 x i32> *%mask, <4 x i32> ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: add r3, sp, #4 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vcmp.s32 gt, q0, zr ; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp, #4] +; CHECK-NEXT: vmrs r12, p0 +; CHECK-NEXT: and r1, r12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #0, #1 +; CHECK-NEXT: ubfx r1, r12, #4, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #1, #1 +; CHECK-NEXT: ubfx r1, r12, #8, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #2, #1 +; CHECK-NEXT: ubfx r1, r12, #12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: 
bfi r3, r1, #3, #1 +; CHECK-NEXT: and r1, r3, #15 ; CHECK-NEXT: lsls r3, r1, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: ldrne r3, [r2] @@ -893,9 +1239,21 @@ define void @foo_trunc_v4i16_v4i32(<4 x i16> *%dest, <4 x i32> *%mask, <4 x i32> ; CHECK-NEXT: itt mi ; CHECK-NEXT: ldrmi r1, [r2, #12] ; CHECK-NEXT: vmovmi.32 q0[3], r1 -; CHECK-NEXT: mov r1, sp -; CHECK-NEXT: vstr p0, [r1] -; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: and r3, r2, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #0, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #1, #1 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #2, #1 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: bfi r1, r2, #3, #1 +; CHECK-NEXT: and r1, r1, #15 ; CHECK-NEXT: lsls r2, r1, #31 ; CHECK-NEXT: itt ne ; CHECK-NEXT: vmovne r2, s0 @@ -929,11 +1287,23 @@ define void @foo_v4f32_v4f32(<4 x float> *%dest, <4 x i32> *%mask, <4 x float> * ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: add r3, sp, #4 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vcmp.s32 gt, q0, zr ; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp, #4] +; CHECK-NEXT: vmrs r12, p0 +; CHECK-NEXT: and r1, r12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #0, #1 +; CHECK-NEXT: ubfx r1, r12, #4, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #1, #1 +; CHECK-NEXT: ubfx r1, r12, #8, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #2, #1 +; CHECK-NEXT: ubfx r1, r12, #12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #3, #1 +; CHECK-NEXT: and r1, r3, #15 ; CHECK-NEXT: lsls r3, r1, #31 ; CHECK-NEXT: it ne ; CHECK-NEXT: vldrne s0, [r2] @@ -946,9 +1316,21 @@ define void @foo_v4f32_v4f32(<4 x float> *%dest, <4 x i32> *%mask, <4 x float> * ; CHECK-NEXT: 
lsls r1, r1, #28 ; CHECK-NEXT: it mi ; CHECK-NEXT: vldrmi s3, [r2, #12] -; CHECK-NEXT: mov r1, sp -; CHECK-NEXT: vstr p0, [r1] -; CHECK-NEXT: ldrb.w r1, [sp] +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: and r3, r2, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #0, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #1, #1 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r1, r3, #2, #1 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: bfi r1, r2, #3, #1 +; CHECK-NEXT: and r1, r1, #15 ; CHECK-NEXT: lsls r2, r1, #31 ; CHECK-NEXT: it ne ; CHECK-NEXT: vstrne s0, [r0] @@ -977,12 +1359,36 @@ define void @foo_v8f16_v8f16(<8 x half> *%dest, <8 x i16> *%mask, <8 x half> *%s ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vldrh.u16 q0, [r1] -; CHECK-NEXT: add r3, sp, #8 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vcmp.s16 gt, q0, zr ; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: vstr p0, [r3] -; CHECK-NEXT: ldrb.w r1, [sp, #8] -; CHECK-NEXT: lsls r3, r1, #31 +; CHECK-NEXT: vmrs r12, p0 +; CHECK-NEXT: and r1, r12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #0, #1 +; CHECK-NEXT: ubfx r1, r12, #2, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #1, #1 +; CHECK-NEXT: ubfx r1, r12, #4, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #2, #1 +; CHECK-NEXT: ubfx r1, r12, #6, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #3, #1 +; CHECK-NEXT: ubfx r1, r12, #8, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #4, #1 +; CHECK-NEXT: ubfx r1, r12, #10, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #5, #1 +; CHECK-NEXT: ubfx r1, r12, #12, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #6, #1 +; CHECK-NEXT: ubfx r1, r12, #14, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r3, r1, #7, #1 +; CHECK-NEXT: uxtb r1, r3 +; 
CHECK-NEXT: lsls r3, r3, #31 ; CHECK-NEXT: bne .LBB13_18 ; CHECK-NEXT: @ %bb.1: @ %else ; CHECK-NEXT: lsls r3, r1, #30 @@ -1010,10 +1416,34 @@ define void @foo_v8f16_v8f16(<8 x half> *%dest, <8 x i16> *%mask, <8 x half> *%s ; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: vmov.16 q0[7], r1 ; CHECK-NEXT: .LBB13_9: @ %else20 -; CHECK-NEXT: mov r1, sp -; CHECK-NEXT: vstr p0, [r1] -; CHECK-NEXT: ldrb.w r1, [sp] -; CHECK-NEXT: lsls r2, r1, #31 +; CHECK-NEXT: vmrs r1, p0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: and r3, r1, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #0, #1 +; CHECK-NEXT: ubfx r3, r1, #2, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #1, #1 +; CHECK-NEXT: ubfx r3, r1, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #2, #1 +; CHECK-NEXT: ubfx r3, r1, #6, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #3, #1 +; CHECK-NEXT: ubfx r3, r1, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #4, #1 +; CHECK-NEXT: ubfx r3, r1, #10, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #5, #1 +; CHECK-NEXT: ubfx r3, r1, #12, #1 +; CHECK-NEXT: ubfx r1, r1, #14, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: bfi r2, r3, #6, #1 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: bfi r2, r1, #7, #1 +; CHECK-NEXT: uxtb r1, r2 +; CHECK-NEXT: lsls r2, r2, #31 ; CHECK-NEXT: bne .LBB13_25 ; CHECK-NEXT: @ %bb.10: @ %else23 ; CHECK-NEXT: lsls r2, r1, #30 @@ -1072,13 +1502,13 @@ define void @foo_v8f16_v8f16(<8 x half> *%dest, <8 x i16> *%mask, <8 x half> *%s ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vmov.16 q0[5], r3 ; CHECK-NEXT: lsls r3, r1, #25 -; CHECK-NEXT: bpl .LBB13_7 +; CHECK-NEXT: bpl.w .LBB13_7 ; CHECK-NEXT: .LBB13_24: @ %cond.load16 ; CHECK-NEXT: vldr.16 s4, [r2, #12] ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vmov.16 q0[6], r3 ; CHECK-NEXT: lsls r1, r1, #24 -; CHECK-NEXT: bmi .LBB13_8 +; CHECK-NEXT: bmi.w .LBB13_8 ; CHECK-NEXT: b .LBB13_9 ; CHECK-NEXT: .LBB13_25: @ %cond.store ; CHECK-NEXT: vstr.16 
s0, [r0] diff --git a/test/CodeGen/Thumb2/mve-masked-load.ll b/test/CodeGen/Thumb2/mve-masked-load.ll index cf04e235fdb..b1d048ecdbd 100644 --- a/test/CodeGen/Thumb2/mve-masked-load.ll +++ b/test/CodeGen/Thumb2/mve-masked-load.ll @@ -7,17 +7,29 @@ define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align4_zero(<4 x i32> *%dest, <4 ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: mov r1, sp ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r1] -; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: mov.w r12, #0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #3, #1 +; CHECK-LE-NEXT: and r1, r2, #15 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: beq .LBB0_2 ; CHECK-LE-NEXT: @ %bb.1: @ %cond.load -; CHECK-LE-NEXT: movs r2, #0 -; CHECK-LE-NEXT: ldr r3, [r0] -; CHECK-LE-NEXT: vdup.32 q0, r2 -; CHECK-LE-NEXT: vmov.32 q0[0], r3 +; CHECK-LE-NEXT: ldr r2, [r0] +; CHECK-LE-NEXT: vdup.32 q0, r12 +; CHECK-LE-NEXT: vmov.32 q0[0], r2 ; CHECK-LE-NEXT: b .LBB0_3 ; CHECK-LE-NEXT: .LBB0_2: ; CHECK-LE-NEXT: vmov.i32 q0, #0x0 @@ -42,17 +54,29 @@ define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align4_zero(<4 x i32> *%dest, <4 ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: mov.w r12, #0 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: and r3, r1, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 
+; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #3, #1 +; CHECK-BE-NEXT: and r1, r2, #15 ; CHECK-BE-NEXT: lsls r2, r1, #31 ; CHECK-BE-NEXT: beq .LBB0_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load -; CHECK-BE-NEXT: movs r2, #0 -; CHECK-BE-NEXT: ldr r3, [r0] -; CHECK-BE-NEXT: vdup.32 q1, r2 -; CHECK-BE-NEXT: vmov.32 q1[0], r3 +; CHECK-BE-NEXT: ldr r2, [r0] +; CHECK-BE-NEXT: vdup.32 q1, r12 +; CHECK-BE-NEXT: vmov.32 q1[0], r2 ; CHECK-BE-NEXT: b .LBB0_3 ; CHECK-BE-NEXT: .LBB0_2: ; CHECK-BE-NEXT: vmov.i32 q1, #0x0 @@ -84,10 +108,22 @@ define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align4_undef(<4 x i32> *%dest, <4 ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: mov r1, sp -; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmrs r2, p0 ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: and r3, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: and r1, r1, #15 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r2, [r0] @@ -112,11 +148,23 @@ define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align4_undef(<4 x i32> *%dest, <4 ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: 
movs r1, #0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q1 -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: and r3, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: and r1, r1, #15 ; CHECK-BE-NEXT: lsls r2, r1, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: ldrne r2, [r0] @@ -148,10 +196,22 @@ define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align1_undef(<4 x i32> *%dest, <4 ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: mov r1, sp -; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmrs r2, p0 ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: and r3, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: and r1, r1, #15 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r2, [r0] @@ -176,11 +236,23 @@ define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align1_undef(<4 x i32> *%dest, <4 ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: 
$q1 -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: and r3, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: and r1, r1, #15 ; CHECK-BE-NEXT: lsls r2, r1, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: ldrne r2, [r0] @@ -211,10 +283,22 @@ define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align4_other(<4 x i32> *%dest, <4 ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: mov r1, sp ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r1] -; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: and r3, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: and r1, r1, #15 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r2, [r0] @@ -239,10 +323,22 @@ define arm_aapcs_vfpcc <4 x i32> @masked_v4i32_align4_other(<4 x i32> *%dest, <4 ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: and r3, 
r2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: and r1, r1, #15 ; CHECK-BE-NEXT: lsls r2, r1, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: ldrne r2, [r0] @@ -274,11 +370,23 @@ define arm_aapcs_vfpcc i8* @masked_v4i32_preinc(i8* %x, i8* %y, <4 x i32> %a) { ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: mov r2, sp -; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r12, p0 ; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: adds r0, #4 -; CHECK-LE-NEXT: ldrb.w r2, [sp] +; CHECK-LE-NEXT: and r3, r12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: and r2, r2, #15 ; CHECK-LE-NEXT: lsls r3, r2, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r3, [r0] @@ -304,12 +412,24 @@ define arm_aapcs_vfpcc i8* @masked_v4i32_preinc(i8* %x, i8* %y, <4 x i32> %a) { ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q0 ; CHECK-BE-NEXT: adds r0, #4 -; CHECK-BE-NEXT: vstr p0, [r2] -; CHECK-BE-NEXT: ldrb.w r2, [sp] +; CHECK-BE-NEXT: vmrs r12, p0 +; CHECK-BE-NEXT: and r3, r12, #1 +; 
CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: and r2, r2, #15 ; CHECK-BE-NEXT: lsls r3, r2, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: ldrne r3, [r0] @@ -345,11 +465,23 @@ define arm_aapcs_vfpcc i8* @masked_v4i32_postinc(i8* %x, i8* %y, <4 x i32> %a) { ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: mov r2, sp -; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r12, p0 ; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: and r3, r12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 ; CHECK-LE-NEXT: add.w r12, r0, #4 -; CHECK-LE-NEXT: ldrb.w r3, [sp] +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: and r3, r2, #15 ; CHECK-LE-NEXT: lsls r2, r3, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r2, [r0] @@ -376,12 +508,24 @@ define arm_aapcs_vfpcc i8* @masked_v4i32_postinc(i8* %x, i8* %y, <4 x i32> %a) { ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q0 +; CHECK-BE-NEXT: vmrs r12, p0 +; CHECK-BE-NEXT: and r3, r12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #4, 
#1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: add.w r12, r0, #4 -; CHECK-BE-NEXT: vstr p0, [r2] -; CHECK-BE-NEXT: ldrb.w r3, [sp] +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: and r3, r2, #15 ; CHECK-BE-NEXT: lsls r2, r3, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: ldrne r2, [r0] @@ -419,17 +563,41 @@ define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align4_zero(<8 x i16> *%dest, <8 ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #8 ; CHECK-LE-NEXT: sub sp, #8 -; CHECK-LE-NEXT: mov r1, sp ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r1] -; CHECK-LE-NEXT: ldrb.w r1, [sp] -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: mov.w r12, #0 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: rsbs r2, r3, #0 +; CHECK-LE-NEXT: movs r3, #0 +; CHECK-LE-NEXT: bfi r3, r2, #0, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #2, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #3, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #4, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #5, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #12, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #6, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r3, r1, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r3 +; CHECK-LE-NEXT: lsls r2, r3, #31 ; CHECK-LE-NEXT: beq .LBB6_2 ; CHECK-LE-NEXT: @ %bb.1: @ %cond.load -; CHECK-LE-NEXT: movs r2, #0 -; 
CHECK-LE-NEXT: ldrh r3, [r0] -; CHECK-LE-NEXT: vdup.16 q0, r2 -; CHECK-LE-NEXT: vmov.16 q0[0], r3 +; CHECK-LE-NEXT: ldrh r2, [r0] +; CHECK-LE-NEXT: vdup.16 q0, r12 +; CHECK-LE-NEXT: vmov.16 q0[0], r2 ; CHECK-LE-NEXT: b .LBB6_3 ; CHECK-LE-NEXT: .LBB6_2: ; CHECK-LE-NEXT: vmov.i32 q0, #0x0 @@ -470,17 +638,41 @@ define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align4_zero(<8 x i16> *%dest, <8 ; CHECK-BE-NEXT: .pad #8 ; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vrev64.16 q1, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: mov.w r12, #0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] -; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: and r3, r1, #1 +; CHECK-BE-NEXT: rsbs r2, r3, #0 +; CHECK-BE-NEXT: movs r3, #0 +; CHECK-BE-NEXT: bfi r3, r2, #0, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #1, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #2, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #3, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #4, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #5, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r3, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r3 +; CHECK-BE-NEXT: lsls r2, r3, #31 ; CHECK-BE-NEXT: beq .LBB6_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load -; CHECK-BE-NEXT: movs r2, #0 -; CHECK-BE-NEXT: ldrh r3, [r0] -; CHECK-BE-NEXT: vdup.16 q1, r2 -; CHECK-BE-NEXT: vmov.16 q1[0], r3 +; CHECK-BE-NEXT: ldrh r2, [r0] +; CHECK-BE-NEXT: vdup.16 q1, r12 +; CHECK-BE-NEXT: vmov.16 q1[0], r2 ; CHECK-BE-NEXT: b .LBB6_3 
; CHECK-BE-NEXT: .LBB6_2: ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 @@ -529,11 +721,35 @@ define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align4_undef(<8 x i16> *%dest, <8 ; CHECK-LE-NEXT: .pad #8 ; CHECK-LE-NEXT: sub sp, #8 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: mov r1, sp -; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r1, p0 ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: ldrb.w r1, [sp] -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrhne r2, [r0] ; CHECK-LE-NEXT: vmovne.16 q0[0], r2 @@ -573,12 +789,36 @@ define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align4_undef(<8 x i16> *%dest, <8 ; CHECK-BE-NEXT: .pad #8 ; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vrev64.16 q1, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q1 -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] -; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: and r3, r1, #1 +; 
CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: ldrhne r2, [r0] ; CHECK-BE-NEXT: vmovne.16 q1[0], r2 @@ -625,11 +865,35 @@ define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align1_undef(<8 x i16> *%dest, <8 ; CHECK-LE-NEXT: .pad #8 ; CHECK-LE-NEXT: sub sp, #8 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: mov r1, sp -; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r1, p0 ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: ldrb.w r1, [sp] -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 +; 
CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrhne r2, [r0] ; CHECK-LE-NEXT: vmovne.16 q0[0], r2 @@ -669,12 +933,36 @@ define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align1_undef(<8 x i16> *%dest, <8 ; CHECK-BE-NEXT: .pad #8 ; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vrev64.16 q1, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q1 -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] -; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: and r3, r1, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: ldrhne r2, [r0] ; CHECK-BE-NEXT: vmovne.16 q1[0], r2 @@ -720,11 +1008,35 @@ define arm_aapcs_vfpcc <8 x i16> 
@masked_v8i16_align4_other(<8 x i16> *%dest, <8 ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #8 ; CHECK-LE-NEXT: sub sp, #8 -; CHECK-LE-NEXT: mov r1, sp ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r1] -; CHECK-LE-NEXT: ldrb.w r1, [sp] -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrhne r2, [r0] ; CHECK-LE-NEXT: vmovne.16 q0[0], r2 @@ -764,11 +1076,35 @@ define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align4_other(<8 x i16> *%dest, <8 ; CHECK-BE-NEXT: .pad #8 ; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vrev64.16 q1, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] -; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: and r3, r1, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: 
bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: ldrhne r2, [r0] ; CHECK-BE-NEXT: vmovne.16 q1[0], r2 @@ -817,12 +1153,36 @@ define i8* @masked_v8i16_preinc(i8* %x, i8* %y, <8 x i16> %a) { ; CHECK-LE-NEXT: vldr d1, [sp, #8] ; CHECK-LE-NEXT: adds r0, #4 ; CHECK-LE-NEXT: vmov d0, r2, r3 -; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: movs r3, #0 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: vstr p0, [r2] -; CHECK-LE-NEXT: ldrb.w r2, [sp] -; CHECK-LE-NEXT: lsls r3, r2, #31 +; CHECK-LE-NEXT: vmrs r12, p0 +; CHECK-LE-NEXT: and r2, r12, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #0, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #1, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #4, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #2, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #6, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #3, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #8, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #4, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #10, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #5, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #12, #1 +; CHECK-LE-NEXT: 
rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #6, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #14, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #7, #1 +; CHECK-LE-NEXT: uxtb r2, r3 +; CHECK-LE-NEXT: lsls r3, r3, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrhne r3, [r0] ; CHECK-LE-NEXT: vmovne.16 q0[0], r3 @@ -865,13 +1225,37 @@ define i8* @masked_v8i16_preinc(i8* %x, i8* %y, <8 x i16> %a) { ; CHECK-BE-NEXT: vldr d1, [sp, #8] ; CHECK-BE-NEXT: adds r0, #4 ; CHECK-BE-NEXT: vmov d0, r3, r2 -; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: movs r3, #0 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: @ implicit-def: $q0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r2] -; CHECK-BE-NEXT: ldrb.w r2, [sp] -; CHECK-BE-NEXT: lsls r3, r2, #31 +; CHECK-BE-NEXT: vmrs r12, p0 +; CHECK-BE-NEXT: and r2, r12, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #0, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #1, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #4, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #2, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #6, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #3, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #8, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #4, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #10, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #5, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #12, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #6, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #14, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #7, #1 +; CHECK-BE-NEXT: uxtb r2, r3 +; CHECK-BE-NEXT: lsls r3, r3, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: ldrhne r3, [r0] ; CHECK-BE-NEXT: vmovne.16 q0[0], r3 @@ -922,12 +1306,36 @@ define arm_aapcs_vfpcc i8* @masked_v8i16_postinc(i8* %x, i8* %y, <8 x i16> %a) { ; CHECK-LE-NEXT: .pad #8 ; CHECK-LE-NEXT: sub sp, #8 
; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: mov r2, sp -; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r12, p0 ; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: and r3, r12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 ; CHECK-LE-NEXT: add.w r12, r0, #4 -; CHECK-LE-NEXT: ldrb.w r3, [sp] -; CHECK-LE-NEXT: lsls r2, r3, #31 +; CHECK-LE-NEXT: bfi r2, r3, #7, #1 +; CHECK-LE-NEXT: uxtb r3, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrhne r2, [r0] ; CHECK-LE-NEXT: vmovne.16 q0[0], r2 @@ -969,13 +1377,37 @@ define arm_aapcs_vfpcc i8* @masked_v8i16_postinc(i8* %x, i8* %y, <8 x i16> %a) { ; CHECK-BE-NEXT: .pad #8 ; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vrev64.16 q1, q0 -; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q0 +; CHECK-BE-NEXT: vmrs r12, p0 +; CHECK-BE-NEXT: and r3, r12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: 
bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #6, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: add.w r12, r0, #4 -; CHECK-BE-NEXT: vstr p0, [r2] -; CHECK-BE-NEXT: ldrb.w r3, [sp] -; CHECK-BE-NEXT: lsls r2, r3, #31 +; CHECK-BE-NEXT: bfi r2, r3, #7, #1 +; CHECK-BE-NEXT: uxtb r3, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: ldrhne r2, [r0] ; CHECK-BE-NEXT: vmovne.16 q0[0], r2 @@ -1034,11 +1466,10 @@ define arm_aapcs_vfpcc <16 x i8> @masked_v16i8_align4_zero(<16 x i8> *%dest, <16 ; CHECK-LE-NEXT: mov r4, sp ; CHECK-LE-NEXT: bfc r4, #0, #4 ; CHECK-LE-NEXT: mov sp, r4 -; CHECK-LE-NEXT: mov r1, sp ; CHECK-LE-NEXT: vcmp.s8 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r1] -; CHECK-LE-NEXT: ldrh.w r1, [sp] -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: uxth r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: beq .LBB12_2 ; CHECK-LE-NEXT: @ %bb.1: @ %cond.load ; CHECK-LE-NEXT: movs r2, #0 @@ -1125,11 +1556,10 @@ define arm_aapcs_vfpcc <16 x i8> @masked_v16i8_align4_zero(<16 x i8> *%dest, <16 ; CHECK-BE-NEXT: bfc r4, #0, #4 ; CHECK-BE-NEXT: mov sp, r4 ; CHECK-BE-NEXT: vrev64.8 q1, q0 -; CHECK-BE-NEXT: mov r1, sp ; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrh.w r1, [sp] -; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: uxth r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: beq .LBB12_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load ; CHECK-BE-NEXT: movs r2, #0 @@ -1224,12 +1654,11 @@ 
define arm_aapcs_vfpcc <16 x i8> @masked_v16i8_align4_undef(<16 x i8> *%dest, <1 ; CHECK-LE-NEXT: bfc r4, #0, #4 ; CHECK-LE-NEXT: mov sp, r4 ; CHECK-LE-NEXT: vcmp.s8 gt, q0, zr -; CHECK-LE-NEXT: mov r1, sp -; CHECK-LE-NEXT: vstr p0, [r1] ; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: sub.w r4, r7, #8 -; CHECK-LE-NEXT: ldrh.w r1, [sp] -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: uxth r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrbne r2, [r0] ; CHECK-LE-NEXT: vmovne.8 q0[0], r2 @@ -1308,13 +1737,12 @@ define arm_aapcs_vfpcc <16 x i8> @masked_v16i8_align4_undef(<16 x i8> *%dest, <1 ; CHECK-BE-NEXT: bfc r4, #0, #4 ; CHECK-BE-NEXT: mov sp, r4 ; CHECK-BE-NEXT: vrev64.8 q1, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: sub.w r4, r7, #8 ; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q1 -; CHECK-BE-NEXT: sub.w r4, r7, #8 -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrh.w r1, [sp] -; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: uxth r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: ldrbne r2, [r0] ; CHECK-BE-NEXT: vmovne.8 q1[0], r2 @@ -1399,12 +1827,11 @@ define arm_aapcs_vfpcc <16 x i8> @masked_v16i8_align4_other(<16 x i8> *%dest, <1 ; CHECK-LE-NEXT: mov r4, sp ; CHECK-LE-NEXT: bfc r4, #0, #4 ; CHECK-LE-NEXT: mov sp, r4 -; CHECK-LE-NEXT: mov r1, sp ; CHECK-LE-NEXT: vcmp.s8 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r1] ; CHECK-LE-NEXT: sub.w r4, r7, #8 -; CHECK-LE-NEXT: ldrh.w r1, [sp] -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: uxth r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrbne r2, [r0] ; CHECK-LE-NEXT: vmovne.8 q0[0], r2 @@ -1483,12 +1910,11 @@ define arm_aapcs_vfpcc <16 x i8> @masked_v16i8_align4_other(<16 x i8> *%dest, <1 ; CHECK-BE-NEXT: bfc r4, #0, #4 ; CHECK-BE-NEXT: mov sp, r4 ; CHECK-BE-NEXT: vrev64.8 
q1, q0 -; CHECK-BE-NEXT: mov r1, sp -; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr ; CHECK-BE-NEXT: sub.w r4, r7, #8 -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrh.w r1, [sp] -; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: uxth r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: ldrbne r2, [r0] ; CHECK-BE-NEXT: vmovne.8 q1[0], r2 @@ -1574,13 +2000,12 @@ define arm_aapcs_vfpcc i8* @masked_v16i8_preinc(i8* %x, i8* %y, <16 x i8> %a) { ; CHECK-LE-NEXT: bfc r4, #0, #4 ; CHECK-LE-NEXT: mov sp, r4 ; CHECK-LE-NEXT: vcmp.s8 gt, q0, zr -; CHECK-LE-NEXT: mov r2, sp -; CHECK-LE-NEXT: vstr p0, [r2] ; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: adds r0, #4 -; CHECK-LE-NEXT: ldrh.w r2, [sp] +; CHECK-LE-NEXT: vmrs r3, p0 ; CHECK-LE-NEXT: sub.w r4, r7, #8 -; CHECK-LE-NEXT: lsls r3, r2, #31 +; CHECK-LE-NEXT: uxth r2, r3 +; CHECK-LE-NEXT: lsls r3, r3, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrbne r3, [r0] ; CHECK-LE-NEXT: vmovne.8 q0[0], r3 @@ -1660,14 +2085,13 @@ define arm_aapcs_vfpcc i8* @masked_v16i8_preinc(i8* %x, i8* %y, <16 x i8> %a) { ; CHECK-BE-NEXT: bfc r4, #0, #4 ; CHECK-BE-NEXT: mov sp, r4 ; CHECK-BE-NEXT: vrev64.8 q1, q0 -; CHECK-BE-NEXT: mov r2, sp -; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q0 ; CHECK-BE-NEXT: adds r0, #4 -; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr ; CHECK-BE-NEXT: sub.w r4, r7, #8 -; CHECK-BE-NEXT: ldrh.w r2, [sp] -; CHECK-BE-NEXT: lsls r3, r2, #31 +; CHECK-BE-NEXT: vmrs r3, p0 +; CHECK-BE-NEXT: uxth r2, r3 +; CHECK-BE-NEXT: lsls r3, r3, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: ldrbne r3, [r0] ; CHECK-BE-NEXT: vmovne.8 q0[0], r3 @@ -1757,13 +2181,12 @@ define arm_aapcs_vfpcc i8* @masked_v16i8_postinc(i8* %x, i8* %y, <16 x i8> %a) { ; CHECK-LE-NEXT: bfc r4, #0, #4 ; CHECK-LE-NEXT: mov sp, r4 ; CHECK-LE-NEXT: vcmp.s8 gt, q0, zr -; CHECK-LE-NEXT: mov r2, sp -; CHECK-LE-NEXT: vstr p0, 
[r2] ; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: sub.w r4, r7, #8 -; CHECK-LE-NEXT: ldrh.w r3, [sp] +; CHECK-LE-NEXT: vmrs r2, p0 ; CHECK-LE-NEXT: add.w r12, r0, #4 -; CHECK-LE-NEXT: lsls r2, r3, #31 +; CHECK-LE-NEXT: uxth r3, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrbne r2, [r0] ; CHECK-LE-NEXT: vmovne.8 q0[0], r2 @@ -1844,14 +2267,13 @@ define arm_aapcs_vfpcc i8* @masked_v16i8_postinc(i8* %x, i8* %y, <16 x i8> %a) { ; CHECK-BE-NEXT: bfc r4, #0, #4 ; CHECK-BE-NEXT: mov sp, r4 ; CHECK-BE-NEXT: vrev64.8 q1, q0 -; CHECK-BE-NEXT: mov r2, sp -; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q0 ; CHECK-BE-NEXT: sub.w r4, r7, #8 -; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr ; CHECK-BE-NEXT: add.w r12, r0, #4 -; CHECK-BE-NEXT: ldrh.w r3, [sp] -; CHECK-BE-NEXT: lsls r2, r3, #31 +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: uxth r3, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: ldrbne r2, [r0] ; CHECK-BE-NEXT: vmovne.8 q0[0], r2 @@ -1935,10 +2357,22 @@ define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align4_zero(<4 x float> *%dest, ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: mov r1, sp ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r1] -; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: and r3, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: and r1, r1, #15 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: beq .LBB17_2 ; CHECK-LE-NEXT: @ %bb.1: 
@ %cond.load @@ -1972,10 +2406,22 @@ define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align4_zero(<4 x float> *%dest, ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: and r3, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: and r1, r1, #15 ; CHECK-BE-NEXT: lsls r2, r1, #31 ; CHECK-BE-NEXT: beq .LBB17_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load @@ -2016,10 +2462,22 @@ define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align4_undef(<4 x float> *%dest ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: mov r1, sp -; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmrs r2, p0 ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: and r3, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: and r1, r1, #15 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: it ne ; CHECK-LE-NEXT: vldrne s0, [r0] @@ -2040,11 +2498,23 @@ define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align4_undef(<4 x 
float> *%dest ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q1 -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: and r3, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: and r1, r1, #15 ; CHECK-BE-NEXT: lsls r2, r1, #31 ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: vldrne s4, [r0] @@ -2072,10 +2542,22 @@ define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align1_undef(<4 x float> *%dest ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: mov r1, sp -; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmrs r2, p0 ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: and r3, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: and r1, r1, #15 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r2, [r0] @@ -2100,11 +2582,23 @@ define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align1_undef(<4 x float> *%dest ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; 
CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q1 -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: and r3, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: and r1, r1, #15 ; CHECK-BE-NEXT: lsls r2, r1, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: ldrne r2, [r0] @@ -2135,10 +2629,22 @@ define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align4_other(<4 x float> *%dest ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: mov r1, sp ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r1] -; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: and r3, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: and r1, r1, #15 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: it ne ; CHECK-LE-NEXT: vldrne s4, [r0] @@ -2160,11 +2666,23 @@ define arm_aapcs_vfpcc <4 x float> @masked_v4f32_align4_other(<4 x float> *%dest ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q2, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs 
r1, #0 ; CHECK-BE-NEXT: vcmp.s32 gt, q2, zr ; CHECK-BE-NEXT: vrev64.32 q2, q1 -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: and r3, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: and r1, r1, #15 ; CHECK-BE-NEXT: lsls r2, r1, #31 ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: vldrne s8, [r0] @@ -2192,11 +2710,23 @@ define arm_aapcs_vfpcc i8* @masked_v4f32_preinc(i8* %x, i8* %y, <4 x i32> %a) { ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: mov r2, sp -; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r12, p0 ; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: adds r0, #4 -; CHECK-LE-NEXT: ldrb.w r2, [sp] +; CHECK-LE-NEXT: and r3, r12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: and r2, r2, #15 ; CHECK-LE-NEXT: lsls r3, r2, #31 ; CHECK-LE-NEXT: it ne ; CHECK-LE-NEXT: vldrne s0, [r0] @@ -2218,12 +2748,24 @@ define arm_aapcs_vfpcc i8* @masked_v4f32_preinc(i8* %x, i8* %y, <4 x i32> %a) { ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr ; 
CHECK-BE-NEXT: @ implicit-def: $q0 ; CHECK-BE-NEXT: adds r0, #4 -; CHECK-BE-NEXT: vstr p0, [r2] -; CHECK-BE-NEXT: ldrb.w r2, [sp] +; CHECK-BE-NEXT: vmrs r12, p0 +; CHECK-BE-NEXT: and r3, r12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: and r2, r2, #15 ; CHECK-BE-NEXT: lsls r3, r2, #31 ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: vldrne s0, [r0] @@ -2255,11 +2797,23 @@ define arm_aapcs_vfpcc i8* @masked_v4f32_postinc(i8* %x, i8* %y, <4 x i32> %a) { ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: mov r2, sp -; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r12, p0 ; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: and r3, r12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 ; CHECK-LE-NEXT: add.w r12, r0, #4 -; CHECK-LE-NEXT: ldrb.w r3, [sp] +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: and r3, r2, #15 ; CHECK-LE-NEXT: lsls r2, r3, #31 ; CHECK-LE-NEXT: it ne ; CHECK-LE-NEXT: vldrne s0, [r0] @@ -2282,12 +2836,24 @@ define arm_aapcs_vfpcc i8* @masked_v4f32_postinc(i8* %x, i8* %y, <4 x i32> %a) { ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr ; 
CHECK-BE-NEXT: @ implicit-def: $q0 +; CHECK-BE-NEXT: vmrs r12, p0 +; CHECK-BE-NEXT: and r3, r12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: add.w r12, r0, #4 -; CHECK-BE-NEXT: vstr p0, [r2] -; CHECK-BE-NEXT: ldrb.w r3, [sp] +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: and r3, r2, #15 ; CHECK-BE-NEXT: lsls r2, r3, #31 ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: vldrne s0, [r0] @@ -2320,11 +2886,35 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align4_zero(<8 x half> *%dest, < ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #8 ; CHECK-LE-NEXT: sub sp, #8 -; CHECK-LE-NEXT: mov r1, sp ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r1] -; CHECK-LE-NEXT: ldrb.w r1, [sp] -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi 
r2, r1, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: beq .LBB23_2 ; CHECK-LE-NEXT: @ %bb.1: @ %cond.load ; CHECK-LE-NEXT: vldr.16 s0, .LCPI23_0 @@ -2411,11 +3001,35 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align4_zero(<8 x half> *%dest, < ; CHECK-BE-NEXT: .pad #8 ; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vrev64.16 q1, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] -; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: and r3, r1, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: beq .LBB23_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load ; CHECK-BE-NEXT: vldr.16 s0, .LCPI23_0 @@ -2509,11 +3123,35 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align4_undef(<8 x half> *%dest, ; CHECK-LE-NEXT: .pad #8 ; CHECK-LE-NEXT: sub sp, #8 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: mov r1, sp -; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r1, p0 ; CHECK-LE-NEXT: @ implicit-def: 
$q0 -; CHECK-LE-NEXT: ldrb.w r1, [sp] -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: bne .LBB24_9 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r2, r1, #30 @@ -2591,12 +3229,36 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align4_undef(<8 x half> *%dest, ; CHECK-BE-NEXT: .pad #8 ; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vrev64.16 q1, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q1 -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] -; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: and r3, r1, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; 
CHECK-BE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: bne .LBB24_10 ; CHECK-BE-NEXT: @ %bb.1: @ %else ; CHECK-BE-NEXT: lsls r2, r1, #30 @@ -2680,11 +3342,35 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(<8 x half> *%dest, ; CHECK-LE-NEXT: .pad #40 ; CHECK-LE-NEXT: sub sp, #40 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: add r1, sp, #32 -; CHECK-LE-NEXT: vstr p0, [r1] +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r1, p0 ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: ldrb.w r1, [sp, #32] -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: bne .LBB25_9 ; 
CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r2, r1, #30 @@ -2778,12 +3464,36 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(<8 x half> *%dest, ; CHECK-BE-NEXT: .pad #40 ; CHECK-BE-NEXT: sub sp, #40 ; CHECK-BE-NEXT: vrev64.16 q1, q0 -; CHECK-BE-NEXT: add r1, sp, #32 +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q1 -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp, #32] -; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: and r3, r1, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: bne .LBB25_10 ; CHECK-BE-NEXT: @ %bb.1: @ %else ; CHECK-BE-NEXT: lsls r2, r1, #30 @@ -2882,11 +3592,35 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align4_other(<8 x half> *%dest, ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #8 ; CHECK-LE-NEXT: sub sp, #8 -; CHECK-LE-NEXT: mov r1, sp ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r1] -; CHECK-LE-NEXT: ldrb.w r1, [sp] -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: 
and r3, r1, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: bne .LBB26_10 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r2, r1, #30 @@ -2966,12 +3700,36 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align4_other(<8 x half> *%dest, ; CHECK-BE-NEXT: .pad #8 ; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vrev64.16 q2, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.s16 gt, q2, zr ; CHECK-BE-NEXT: vrev64.16 q2, q1 -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] -; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: and r3, r1, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, 
r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: bne .LBB26_10 ; CHECK-BE-NEXT: @ %bb.1: @ %else ; CHECK-BE-NEXT: lsls r2, r1, #30 @@ -3057,12 +3815,36 @@ define arm_aapcs_vfpcc i8* @masked_v8f16_preinc(i8* %x, i8* %y, <8 x i16> %a) { ; CHECK-LE-NEXT: .pad #8 ; CHECK-LE-NEXT: sub sp, #8 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: mov r2, sp -; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: movs r3, #0 +; CHECK-LE-NEXT: vmrs r12, p0 ; CHECK-LE-NEXT: adds r0, #4 -; CHECK-LE-NEXT: ldrb.w r2, [sp] ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: lsls r3, r2, #31 +; CHECK-LE-NEXT: and r2, r12, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #0, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #1, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #4, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #2, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #6, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #3, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #8, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #4, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #10, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #5, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #12, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #6, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #14, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #7, #1 +; CHECK-LE-NEXT: uxtb r2, r3 +; CHECK-LE-NEXT: lsls r3, r3, #31 ; CHECK-LE-NEXT: bne .LBB27_10 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r3, r2, #30 
@@ -3140,13 +3922,37 @@ define arm_aapcs_vfpcc i8* @masked_v8f16_preinc(i8* %x, i8* %y, <8 x i16> %a) { ; CHECK-BE-NEXT: .pad #8 ; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vrev64.16 q1, q0 -; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: movs r3, #0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr ; CHECK-BE-NEXT: adds r0, #4 -; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: vmrs r12, p0 ; CHECK-BE-NEXT: @ implicit-def: $q0 -; CHECK-BE-NEXT: ldrb.w r2, [sp] -; CHECK-BE-NEXT: lsls r3, r2, #31 +; CHECK-BE-NEXT: and r2, r12, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #0, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #1, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #4, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #2, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #6, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #3, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #8, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #4, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #10, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #5, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #12, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #6, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #14, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #7, #1 +; CHECK-BE-NEXT: uxtb r2, r3 +; CHECK-BE-NEXT: lsls r3, r3, #31 ; CHECK-BE-NEXT: bne .LBB27_10 ; CHECK-BE-NEXT: @ %bb.1: @ %else ; CHECK-BE-NEXT: lsls r3, r2, #30 @@ -3234,11 +4040,35 @@ define arm_aapcs_vfpcc i8* @masked_v8f16_postinc(i8* %x, i8* %y, <8 x i16> %a) { ; CHECK-LE-NEXT: .pad #8 ; CHECK-LE-NEXT: sub sp, #8 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: mov r2, sp -; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r12, p0 ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: ldrb.w r3, [sp] -; CHECK-LE-NEXT: lsls r2, r3, #31 +; CHECK-LE-NEXT: and r3, r12, #1 +; 
CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #7, #1 +; CHECK-LE-NEXT: uxtb r3, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: bne .LBB28_12 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r2, r3, #30 @@ -3314,12 +4144,36 @@ define arm_aapcs_vfpcc i8* @masked_v8f16_postinc(i8* %x, i8* %y, <8 x i16> %a) { ; CHECK-BE-NEXT: .pad #8 ; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vrev64.16 q1, q0 -; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q0 -; CHECK-BE-NEXT: vstr p0, [r2] -; CHECK-BE-NEXT: ldrb.w r3, [sp] -; CHECK-BE-NEXT: lsls r2, r3, #31 +; CHECK-BE-NEXT: vmrs r12, p0 +; CHECK-BE-NEXT: and r3, r12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #6, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, 
#4, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #7, #1 +; CHECK-BE-NEXT: uxtb r3, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: bne .LBB28_12 ; CHECK-BE-NEXT: @ %bb.1: @ %else ; CHECK-BE-NEXT: lsls r2, r3, #30 diff --git a/test/CodeGen/Thumb2/mve-masked-store.ll b/test/CodeGen/Thumb2/mve-masked-store.ll index 3ff1dec821d..9d777dbedfd 100644 --- a/test/CodeGen/Thumb2/mve-masked-store.ll +++ b/test/CodeGen/Thumb2/mve-masked-store.ll @@ -7,10 +7,22 @@ define arm_aapcs_vfpcc void @masked_v4i32(<4 x i32> *%dest, <4 x i32> %a) { ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: mov r1, sp ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r1] -; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: and r3, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: and r1, r1, #15 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r2, s0 @@ -35,10 +47,22 @@ define arm_aapcs_vfpcc void @masked_v4i32(<4 x i32> *%dest, <4 x i32> %a) { ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] 
+; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: and r3, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: and r1, r1, #15 ; CHECK-BE-NEXT: lsls r2, r1, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: vmovne r2, s4 @@ -68,10 +92,22 @@ define arm_aapcs_vfpcc void @masked_v4i32_align1(<4 x i32> *%dest, <4 x i32> %a) ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: mov r1, sp ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r1] -; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: and r3, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: and r1, r1, #15 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r2, s0 @@ -96,10 +132,22 @@ define arm_aapcs_vfpcc void @masked_v4i32_align1(<4 x i32> *%dest, <4 x i32> %a) ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: and r3, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, 
#1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: and r1, r1, #15 ; CHECK-BE-NEXT: lsls r2, r1, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: vmovne r2, s4 @@ -132,24 +180,36 @@ define i8* @masked_v4i32_pre(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-LE-NEXT: vldr d1, [sp, #8] ; CHECK-LE-NEXT: adds r0, #4 ; CHECK-LE-NEXT: vmov d0, r2, r3 -; CHECK-LE-NEXT: add r2, sp, #4 +; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r2] -; CHECK-LE-NEXT: ldrb.w r2, [sp, #4] +; CHECK-LE-NEXT: vmrs r12, p0 +; CHECK-LE-NEXT: and r3, r12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 ; CHECK-LE-NEXT: vldrw.u32 q0, [r1] -; CHECK-LE-NEXT: lsls r1, r2, #31 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: and r1, r2, #15 +; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne -; CHECK-LE-NEXT: vmovne r1, s0 -; CHECK-LE-NEXT: strne r1, [r0] -; CHECK-LE-NEXT: lsls r1, r2, #30 +; CHECK-LE-NEXT: vmovne r2, s0 +; CHECK-LE-NEXT: strne r2, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi r1, s1 -; CHECK-LE-NEXT: strmi r1, [r0, #4] -; CHECK-LE-NEXT: lsls r1, r2, #29 +; CHECK-LE-NEXT: vmovmi r2, s1 +; CHECK-LE-NEXT: strmi r2, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #29 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi r1, s2 -; CHECK-LE-NEXT: strmi r1, [r0, #8] -; CHECK-LE-NEXT: lsls r1, r2, #28 +; 
CHECK-LE-NEXT: vmovmi r2, s2 +; CHECK-LE-NEXT: strmi r2, [r0, #8] +; CHECK-LE-NEXT: lsls r1, r1, #28 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: vmovmi r1, s3 ; CHECK-LE-NEXT: strmi r1, [r0, #12] @@ -163,25 +223,37 @@ define i8* @masked_v4i32_pre(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-BE-NEXT: vldr d1, [sp, #8] ; CHECK-BE-NEXT: adds r0, #4 ; CHECK-BE-NEXT: vmov d0, r3, r2 -; CHECK-BE-NEXT: add r2, sp, #4 +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r2] -; CHECK-BE-NEXT: ldrb.w r2, [sp, #4] +; CHECK-BE-NEXT: vmrs r12, p0 +; CHECK-BE-NEXT: and r3, r12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: vldrw.u32 q0, [r1] -; CHECK-BE-NEXT: lsls r1, r2, #31 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: and r1, r2, #15 +; CHECK-BE-NEXT: lsls r2, r1, #31 ; CHECK-BE-NEXT: itt ne -; CHECK-BE-NEXT: vmovne r1, s0 -; CHECK-BE-NEXT: strne r1, [r0] -; CHECK-BE-NEXT: lsls r1, r2, #30 +; CHECK-BE-NEXT: vmovne r2, s0 +; CHECK-BE-NEXT: strne r2, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi r1, s1 -; CHECK-BE-NEXT: strmi r1, [r0, #4] -; CHECK-BE-NEXT: lsls r1, r2, #29 +; CHECK-BE-NEXT: vmovmi r2, s1 +; CHECK-BE-NEXT: strmi r2, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #29 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi r1, s2 -; CHECK-BE-NEXT: strmi r1, [r0, #8] -; CHECK-BE-NEXT: lsls r1, r2, #28 +; CHECK-BE-NEXT: vmovmi r2, s2 +; CHECK-BE-NEXT: strmi r2, [r0, #8] +; CHECK-BE-NEXT: lsls r1, r1, #28 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi r1, s3 ; CHECK-BE-NEXT: strmi r1, [r0, #12] @@ -204,11 +276,23 @@ define i8* 
@masked_v4i32_post(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-LE-NEXT: sub sp, #8 ; CHECK-LE-NEXT: vldr d1, [sp, #8] ; CHECK-LE-NEXT: vmov d0, r2, r3 -; CHECK-LE-NEXT: add r2, sp, #4 +; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r2] -; CHECK-LE-NEXT: ldrb.w r2, [sp, #4] +; CHECK-LE-NEXT: vmrs r12, p0 +; CHECK-LE-NEXT: and r3, r12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 ; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: and r2, r2, #15 ; CHECK-LE-NEXT: lsls r1, r2, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r1, s0 @@ -236,12 +320,24 @@ define i8* @masked_v4i32_post(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vldr d1, [sp, #8] ; CHECK-BE-NEXT: vmov d0, r3, r2 -; CHECK-BE-NEXT: add r2, sp, #4 +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r2] -; CHECK-BE-NEXT: ldrb.w r2, [sp, #4] +; CHECK-BE-NEXT: vmrs r12, p0 +; CHECK-BE-NEXT: and r3, r12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: vldrw.u32 q0, [r1] +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: and r2, r2, #15 ; CHECK-BE-NEXT: lsls r1, r2, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: vmovne r1, s0 @@ -278,11 +374,35 @@ define arm_aapcs_vfpcc 
void @masked_v8i16(<8 x i16> *%dest, <8 x i16> %a) { ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #8 ; CHECK-LE-NEXT: sub sp, #8 -; CHECK-LE-NEXT: mov r1, sp ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r1] -; CHECK-LE-NEXT: ldrb.w r1, [sp] -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne.u16 r2, q0[0] ; CHECK-LE-NEXT: strhne r2, [r0] @@ -322,11 +442,35 @@ define arm_aapcs_vfpcc void @masked_v8i16(<8 x i16> *%dest, <8 x i16> %a) { ; CHECK-BE-NEXT: .pad #8 ; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vrev64.16 q1, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] -; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: and r3, r1, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: 
bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: vmovne.u16 r2, q1[0] ; CHECK-BE-NEXT: strhne r2, [r0] @@ -371,11 +515,35 @@ define arm_aapcs_vfpcc void @masked_v8i16_align1(<8 x i16> *%dest, <8 x i16> %a) ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #8 ; CHECK-LE-NEXT: sub sp, #8 -; CHECK-LE-NEXT: mov r1, sp ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r1] -; CHECK-LE-NEXT: ldrb.w r1, [sp] -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-LE-NEXT: rsbs 
r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne.u16 r2, q0[0] ; CHECK-LE-NEXT: strhne r2, [r0] @@ -415,11 +583,35 @@ define arm_aapcs_vfpcc void @masked_v8i16_align1(<8 x i16> *%dest, <8 x i16> %a) ; CHECK-BE-NEXT: .pad #8 ; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vrev64.16 q1, q0 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] -; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: and r3, r1, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: vmovne.u16 r2, q1[0] ; CHECK-BE-NEXT: strhne r2, [r0] @@ -467,40 +659,64 @@ define i8* @masked_v8i16_pre(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-LE-NEXT: vldr d1, [sp, #8] ; CHECK-LE-NEXT: adds r0, #4 ; CHECK-LE-NEXT: vmov d0, r2, r3 -; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, 
zr -; CHECK-LE-NEXT: vstr p0, [r2] -; CHECK-LE-NEXT: ldrb.w r2, [sp] +; CHECK-LE-NEXT: vmrs r12, p0 +; CHECK-LE-NEXT: and r3, r12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 ; CHECK-LE-NEXT: vldrw.u32 q0, [r1] -; CHECK-LE-NEXT: lsls r1, r2, #31 +; CHECK-LE-NEXT: bfi r2, r3, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: itt ne -; CHECK-LE-NEXT: vmovne.u16 r1, q0[0] -; CHECK-LE-NEXT: strhne r1, [r0] -; CHECK-LE-NEXT: lsls r1, r2, #30 +; CHECK-LE-NEXT: vmovne.u16 r2, q0[0] +; CHECK-LE-NEXT: strhne r2, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi.u16 r1, q0[1] -; CHECK-LE-NEXT: strhmi r1, [r0, #2] -; CHECK-LE-NEXT: lsls r1, r2, #29 +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[1] +; CHECK-LE-NEXT: strhmi r2, [r0, #2] +; CHECK-LE-NEXT: lsls r2, r1, #29 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi.u16 r1, q0[2] -; CHECK-LE-NEXT: strhmi r1, [r0, #4] -; CHECK-LE-NEXT: lsls r1, r2, #28 +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[2] +; CHECK-LE-NEXT: strhmi r2, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #28 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi.u16 r1, q0[3] -; CHECK-LE-NEXT: strhmi r1, [r0, #6] -; CHECK-LE-NEXT: lsls r1, r2, #27 +; 
CHECK-LE-NEXT: vmovmi.u16 r2, q0[3] +; CHECK-LE-NEXT: strhmi r2, [r0, #6] +; CHECK-LE-NEXT: lsls r2, r1, #27 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi.u16 r1, q0[4] -; CHECK-LE-NEXT: strhmi r1, [r0, #8] -; CHECK-LE-NEXT: lsls r1, r2, #26 +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[4] +; CHECK-LE-NEXT: strhmi r2, [r0, #8] +; CHECK-LE-NEXT: lsls r2, r1, #26 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi.u16 r1, q0[5] -; CHECK-LE-NEXT: strhmi r1, [r0, #10] -; CHECK-LE-NEXT: lsls r1, r2, #25 +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[5] +; CHECK-LE-NEXT: strhmi r2, [r0, #10] +; CHECK-LE-NEXT: lsls r2, r1, #25 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi.u16 r1, q0[6] -; CHECK-LE-NEXT: strhmi r1, [r0, #12] -; CHECK-LE-NEXT: lsls r1, r2, #24 +; CHECK-LE-NEXT: vmovmi.u16 r2, q0[6] +; CHECK-LE-NEXT: strhmi r2, [r0, #12] +; CHECK-LE-NEXT: lsls r1, r1, #24 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: vmovmi.u16 r1, q0[7] ; CHECK-LE-NEXT: strhmi r1, [r0, #14] @@ -514,41 +730,65 @@ define i8* @masked_v8i16_pre(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-BE-NEXT: vldr d1, [sp, #8] ; CHECK-BE-NEXT: adds r0, #4 ; CHECK-BE-NEXT: vmov d0, r3, r2 -; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r2] -; CHECK-BE-NEXT: ldrb.w r2, [sp] +; CHECK-BE-NEXT: vmrs r12, p0 +; CHECK-BE-NEXT: and r3, r12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #6, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; 
CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: vldrh.u16 q0, [r1] -; CHECK-BE-NEXT: lsls r1, r2, #31 +; CHECK-BE-NEXT: bfi r2, r3, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: itt ne -; CHECK-BE-NEXT: vmovne.u16 r1, q0[0] -; CHECK-BE-NEXT: strhne r1, [r0] -; CHECK-BE-NEXT: lsls r1, r2, #30 +; CHECK-BE-NEXT: vmovne.u16 r2, q0[0] +; CHECK-BE-NEXT: strhne r2, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u16 r1, q0[1] -; CHECK-BE-NEXT: strhmi r1, [r0, #2] -; CHECK-BE-NEXT: lsls r1, r2, #29 +; CHECK-BE-NEXT: vmovmi.u16 r2, q0[1] +; CHECK-BE-NEXT: strhmi r2, [r0, #2] +; CHECK-BE-NEXT: lsls r2, r1, #29 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u16 r1, q0[2] -; CHECK-BE-NEXT: strhmi r1, [r0, #4] -; CHECK-BE-NEXT: lsls r1, r2, #28 +; CHECK-BE-NEXT: vmovmi.u16 r2, q0[2] +; CHECK-BE-NEXT: strhmi r2, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u16 r1, q0[3] -; CHECK-BE-NEXT: strhmi r1, [r0, #6] -; CHECK-BE-NEXT: lsls r1, r2, #27 +; CHECK-BE-NEXT: vmovmi.u16 r2, q0[3] +; CHECK-BE-NEXT: strhmi r2, [r0, #6] +; CHECK-BE-NEXT: lsls r2, r1, #27 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u16 r1, q0[4] -; CHECK-BE-NEXT: strhmi r1, [r0, #8] -; CHECK-BE-NEXT: lsls r1, r2, #26 +; CHECK-BE-NEXT: vmovmi.u16 r2, q0[4] +; CHECK-BE-NEXT: strhmi r2, [r0, #8] +; CHECK-BE-NEXT: lsls r2, r1, #26 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u16 r1, q0[5] -; CHECK-BE-NEXT: strhmi r1, [r0, #10] -; CHECK-BE-NEXT: lsls r1, r2, #25 +; CHECK-BE-NEXT: vmovmi.u16 r2, q0[5] +; CHECK-BE-NEXT: strhmi r2, [r0, #10] +; CHECK-BE-NEXT: lsls r2, r1, #25 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u16 r1, q0[6] -; CHECK-BE-NEXT: strhmi r1, [r0, #12] -; CHECK-BE-NEXT: lsls 
r1, r2, #24 +; CHECK-BE-NEXT: vmovmi.u16 r2, q0[6] +; CHECK-BE-NEXT: strhmi r2, [r0, #12] +; CHECK-BE-NEXT: lsls r1, r1, #24 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi.u16 r1, q0[7] ; CHECK-BE-NEXT: strhmi r1, [r0, #14] @@ -571,12 +811,36 @@ define i8* @masked_v8i16_post(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-LE-NEXT: sub sp, #8 ; CHECK-LE-NEXT: vldr d1, [sp, #8] ; CHECK-LE-NEXT: vmov d0, r2, r3 -; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: movs r3, #0 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r2] -; CHECK-LE-NEXT: ldrb.w r2, [sp] +; CHECK-LE-NEXT: vmrs r12, p0 +; CHECK-LE-NEXT: and r2, r12, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #0, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #1, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #4, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #2, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #6, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #3, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #8, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #4, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #10, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #5, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #12, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #6, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #14, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 ; CHECK-LE-NEXT: vldrw.u32 q0, [r1] -; CHECK-LE-NEXT: lsls r1, r2, #31 +; CHECK-LE-NEXT: bfi r3, r2, #7, #1 +; CHECK-LE-NEXT: lsls r1, r3, #31 +; CHECK-LE-NEXT: uxtb r2, r3 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne.u16 r1, q0[0] ; CHECK-LE-NEXT: strhne r1, [r0] @@ -600,8 +864,8 @@ define i8* @masked_v8i16_post(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: vmovmi.u16 r1, q0[5] ; CHECK-LE-NEXT: strhmi r1, [r0, #10] -; CHECK-LE-NEXT: adds r1, r0, #4 ; CHECK-LE-NEXT: lsls r3, r2, #25 +; CHECK-LE-NEXT: add.w r1, r0, #4 ; 
CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: vmovmi.u16 r3, q0[6] ; CHECK-LE-NEXT: strhmi r3, [r0, #12] @@ -619,13 +883,37 @@ define i8* @masked_v8i16_post(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vldr d1, [sp, #8] ; CHECK-BE-NEXT: vmov d0, r3, r2 -; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: movs r3, #0 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r2] -; CHECK-BE-NEXT: ldrb.w r2, [sp] +; CHECK-BE-NEXT: vmrs r12, p0 +; CHECK-BE-NEXT: and r2, r12, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #0, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #1, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #4, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #2, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #6, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #3, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #8, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #4, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #10, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #5, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #12, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #6, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #14, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 ; CHECK-BE-NEXT: vldrh.u16 q0, [r1] -; CHECK-BE-NEXT: lsls r1, r2, #31 +; CHECK-BE-NEXT: bfi r3, r2, #7, #1 +; CHECK-BE-NEXT: lsls r1, r3, #31 +; CHECK-BE-NEXT: uxtb r2, r3 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: vmovne.u16 r1, q0[0] ; CHECK-BE-NEXT: strhne r1, [r0] @@ -649,8 +937,8 @@ define i8* @masked_v8i16_post(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi.u16 r1, q0[5] ; CHECK-BE-NEXT: strhmi r1, [r0, #10] -; CHECK-BE-NEXT: adds r1, r0, #4 ; CHECK-BE-NEXT: lsls r3, r2, #25 +; CHECK-BE-NEXT: add.w r1, r0, #4 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi.u16 r3, q0[6] ; CHECK-BE-NEXT: strhmi r3, [r0, #12] 
@@ -684,12 +972,11 @@ define arm_aapcs_vfpcc void @masked_v16i8(<16 x i8> *%dest, <16 x i8> %a) { ; CHECK-LE-NEXT: mov r4, sp ; CHECK-LE-NEXT: bfc r4, #0, #4 ; CHECK-LE-NEXT: mov sp, r4 -; CHECK-LE-NEXT: mov r1, sp ; CHECK-LE-NEXT: vcmp.s8 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r1] ; CHECK-LE-NEXT: sub.w r4, r7, #8 -; CHECK-LE-NEXT: ldrh.w r1, [sp] -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: uxth r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne.u8 r2, q0[0] ; CHECK-LE-NEXT: strbne r2, [r0] @@ -768,12 +1055,11 @@ define arm_aapcs_vfpcc void @masked_v16i8(<16 x i8> *%dest, <16 x i8> %a) { ; CHECK-BE-NEXT: bfc r4, #0, #4 ; CHECK-BE-NEXT: mov sp, r4 ; CHECK-BE-NEXT: vrev64.8 q1, q0 -; CHECK-BE-NEXT: mov r1, sp -; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr ; CHECK-BE-NEXT: sub.w r4, r7, #8 -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrh.w r1, [sp] -; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: uxth r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: vmovne.u8 r2, q1[0] ; CHECK-BE-NEXT: strbne r2, [r0] @@ -860,73 +1146,72 @@ define i8* @masked_v16i8_pre(i8* %y, i8* %x, <16 x i8> %a) { ; CHECK-LE-NEXT: vldr d1, [r7, #8] ; CHECK-LE-NEXT: adds r0, #4 ; CHECK-LE-NEXT: vmov d0, r2, r3 -; CHECK-LE-NEXT: mov r2, sp -; CHECK-LE-NEXT: vcmp.s8 gt, q0, zr ; CHECK-LE-NEXT: sub.w r4, r7, #8 -; CHECK-LE-NEXT: vstr p0, [r2] -; CHECK-LE-NEXT: ldrh.w r2, [sp] +; CHECK-LE-NEXT: vcmp.s8 gt, q0, zr ; CHECK-LE-NEXT: vldrw.u32 q0, [r1] -; CHECK-LE-NEXT: lsls r1, r2, #31 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: uxth r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: itt ne -; CHECK-LE-NEXT: vmovne.u8 r1, q0[0] -; CHECK-LE-NEXT: strbne r1, [r0] -; CHECK-LE-NEXT: lsls r1, r2, #30 +; CHECK-LE-NEXT: vmovne.u8 r2, q0[0] +; CHECK-LE-NEXT: strbne r2, [r0] +; CHECK-LE-NEXT: lsls r2, r1, #30 ; CHECK-LE-NEXT: 
itt mi -; CHECK-LE-NEXT: vmovmi.u8 r1, q0[1] -; CHECK-LE-NEXT: strbmi r1, [r0, #1] -; CHECK-LE-NEXT: lsls r1, r2, #29 +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[1] +; CHECK-LE-NEXT: strbmi r2, [r0, #1] +; CHECK-LE-NEXT: lsls r2, r1, #29 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi.u8 r1, q0[2] -; CHECK-LE-NEXT: strbmi r1, [r0, #2] -; CHECK-LE-NEXT: lsls r1, r2, #28 +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[2] +; CHECK-LE-NEXT: strbmi r2, [r0, #2] +; CHECK-LE-NEXT: lsls r2, r1, #28 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi.u8 r1, q0[3] -; CHECK-LE-NEXT: strbmi r1, [r0, #3] -; CHECK-LE-NEXT: lsls r1, r2, #27 +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[3] +; CHECK-LE-NEXT: strbmi r2, [r0, #3] +; CHECK-LE-NEXT: lsls r2, r1, #27 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi.u8 r1, q0[4] -; CHECK-LE-NEXT: strbmi r1, [r0, #4] -; CHECK-LE-NEXT: lsls r1, r2, #26 +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[4] +; CHECK-LE-NEXT: strbmi r2, [r0, #4] +; CHECK-LE-NEXT: lsls r2, r1, #26 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi.u8 r1, q0[5] -; CHECK-LE-NEXT: strbmi r1, [r0, #5] -; CHECK-LE-NEXT: lsls r1, r2, #25 +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[5] +; CHECK-LE-NEXT: strbmi r2, [r0, #5] +; CHECK-LE-NEXT: lsls r2, r1, #25 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi.u8 r1, q0[6] -; CHECK-LE-NEXT: strbmi r1, [r0, #6] -; CHECK-LE-NEXT: lsls r1, r2, #24 +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[6] +; CHECK-LE-NEXT: strbmi r2, [r0, #6] +; CHECK-LE-NEXT: lsls r2, r1, #24 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi.u8 r1, q0[7] -; CHECK-LE-NEXT: strbmi r1, [r0, #7] -; CHECK-LE-NEXT: lsls r1, r2, #23 +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[7] +; CHECK-LE-NEXT: strbmi r2, [r0, #7] +; CHECK-LE-NEXT: lsls r2, r1, #23 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi.u8 r1, q0[8] -; CHECK-LE-NEXT: strbmi r1, [r0, #8] -; CHECK-LE-NEXT: lsls r1, r2, #22 +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[8] +; CHECK-LE-NEXT: strbmi r2, [r0, #8] +; CHECK-LE-NEXT: lsls r2, r1, #22 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: 
vmovmi.u8 r1, q0[9] -; CHECK-LE-NEXT: strbmi r1, [r0, #9] -; CHECK-LE-NEXT: lsls r1, r2, #21 +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[9] +; CHECK-LE-NEXT: strbmi r2, [r0, #9] +; CHECK-LE-NEXT: lsls r2, r1, #21 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi.u8 r1, q0[10] -; CHECK-LE-NEXT: strbmi r1, [r0, #10] -; CHECK-LE-NEXT: lsls r1, r2, #20 +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[10] +; CHECK-LE-NEXT: strbmi r2, [r0, #10] +; CHECK-LE-NEXT: lsls r2, r1, #20 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi.u8 r1, q0[11] -; CHECK-LE-NEXT: strbmi r1, [r0, #11] -; CHECK-LE-NEXT: lsls r1, r2, #19 +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[11] +; CHECK-LE-NEXT: strbmi r2, [r0, #11] +; CHECK-LE-NEXT: lsls r2, r1, #19 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi.u8 r1, q0[12] -; CHECK-LE-NEXT: strbmi r1, [r0, #12] -; CHECK-LE-NEXT: lsls r1, r2, #18 +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[12] +; CHECK-LE-NEXT: strbmi r2, [r0, #12] +; CHECK-LE-NEXT: lsls r2, r1, #18 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi.u8 r1, q0[13] -; CHECK-LE-NEXT: strbmi r1, [r0, #13] -; CHECK-LE-NEXT: lsls r1, r2, #17 +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[13] +; CHECK-LE-NEXT: strbmi r2, [r0, #13] +; CHECK-LE-NEXT: lsls r2, r1, #17 ; CHECK-LE-NEXT: itt mi -; CHECK-LE-NEXT: vmovmi.u8 r1, q0[14] -; CHECK-LE-NEXT: strbmi r1, [r0, #14] -; CHECK-LE-NEXT: lsls r1, r2, #16 +; CHECK-LE-NEXT: vmovmi.u8 r2, q0[14] +; CHECK-LE-NEXT: strbmi r2, [r0, #14] +; CHECK-LE-NEXT: lsls r1, r1, #16 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: vmovmi.u8 r1, q0[15] ; CHECK-LE-NEXT: strbmi r1, [r0, #15] @@ -947,74 +1232,73 @@ define i8* @masked_v16i8_pre(i8* %y, i8* %x, <16 x i8> %a) { ; CHECK-BE-NEXT: vldr d1, [r7, #8] ; CHECK-BE-NEXT: adds r0, #4 ; CHECK-BE-NEXT: vmov d0, r3, r2 -; CHECK-BE-NEXT: mov r2, sp -; CHECK-BE-NEXT: vrev64.8 q1, q0 ; CHECK-BE-NEXT: sub.w r4, r7, #8 -; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r2] -; CHECK-BE-NEXT: ldrh.w r2, [sp] +; CHECK-BE-NEXT: vrev64.8 q1, q0 ; CHECK-BE-NEXT: vldrb.u8 q0, 
[r1] -; CHECK-BE-NEXT: lsls r1, r2, #31 +; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: uxth r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: itt ne -; CHECK-BE-NEXT: vmovne.u8 r1, q0[0] -; CHECK-BE-NEXT: strbne r1, [r0] -; CHECK-BE-NEXT: lsls r1, r2, #30 +; CHECK-BE-NEXT: vmovne.u8 r2, q0[0] +; CHECK-BE-NEXT: strbne r2, [r0] +; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u8 r1, q0[1] -; CHECK-BE-NEXT: strbmi r1, [r0, #1] -; CHECK-BE-NEXT: lsls r1, r2, #29 +; CHECK-BE-NEXT: vmovmi.u8 r2, q0[1] +; CHECK-BE-NEXT: strbmi r2, [r0, #1] +; CHECK-BE-NEXT: lsls r2, r1, #29 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u8 r1, q0[2] -; CHECK-BE-NEXT: strbmi r1, [r0, #2] -; CHECK-BE-NEXT: lsls r1, r2, #28 +; CHECK-BE-NEXT: vmovmi.u8 r2, q0[2] +; CHECK-BE-NEXT: strbmi r2, [r0, #2] +; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u8 r1, q0[3] -; CHECK-BE-NEXT: strbmi r1, [r0, #3] -; CHECK-BE-NEXT: lsls r1, r2, #27 +; CHECK-BE-NEXT: vmovmi.u8 r2, q0[3] +; CHECK-BE-NEXT: strbmi r2, [r0, #3] +; CHECK-BE-NEXT: lsls r2, r1, #27 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u8 r1, q0[4] -; CHECK-BE-NEXT: strbmi r1, [r0, #4] -; CHECK-BE-NEXT: lsls r1, r2, #26 +; CHECK-BE-NEXT: vmovmi.u8 r2, q0[4] +; CHECK-BE-NEXT: strbmi r2, [r0, #4] +; CHECK-BE-NEXT: lsls r2, r1, #26 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u8 r1, q0[5] -; CHECK-BE-NEXT: strbmi r1, [r0, #5] -; CHECK-BE-NEXT: lsls r1, r2, #25 +; CHECK-BE-NEXT: vmovmi.u8 r2, q0[5] +; CHECK-BE-NEXT: strbmi r2, [r0, #5] +; CHECK-BE-NEXT: lsls r2, r1, #25 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u8 r1, q0[6] -; CHECK-BE-NEXT: strbmi r1, [r0, #6] -; CHECK-BE-NEXT: lsls r1, r2, #24 +; CHECK-BE-NEXT: vmovmi.u8 r2, q0[6] +; CHECK-BE-NEXT: strbmi r2, [r0, #6] +; CHECK-BE-NEXT: lsls r2, r1, #24 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u8 r1, q0[7] -; CHECK-BE-NEXT: strbmi r1, [r0, #7] -; 
CHECK-BE-NEXT: lsls r1, r2, #23 +; CHECK-BE-NEXT: vmovmi.u8 r2, q0[7] +; CHECK-BE-NEXT: strbmi r2, [r0, #7] +; CHECK-BE-NEXT: lsls r2, r1, #23 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u8 r1, q0[8] -; CHECK-BE-NEXT: strbmi r1, [r0, #8] -; CHECK-BE-NEXT: lsls r1, r2, #22 +; CHECK-BE-NEXT: vmovmi.u8 r2, q0[8] +; CHECK-BE-NEXT: strbmi r2, [r0, #8] +; CHECK-BE-NEXT: lsls r2, r1, #22 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u8 r1, q0[9] -; CHECK-BE-NEXT: strbmi r1, [r0, #9] -; CHECK-BE-NEXT: lsls r1, r2, #21 +; CHECK-BE-NEXT: vmovmi.u8 r2, q0[9] +; CHECK-BE-NEXT: strbmi r2, [r0, #9] +; CHECK-BE-NEXT: lsls r2, r1, #21 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u8 r1, q0[10] -; CHECK-BE-NEXT: strbmi r1, [r0, #10] -; CHECK-BE-NEXT: lsls r1, r2, #20 +; CHECK-BE-NEXT: vmovmi.u8 r2, q0[10] +; CHECK-BE-NEXT: strbmi r2, [r0, #10] +; CHECK-BE-NEXT: lsls r2, r1, #20 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u8 r1, q0[11] -; CHECK-BE-NEXT: strbmi r1, [r0, #11] -; CHECK-BE-NEXT: lsls r1, r2, #19 +; CHECK-BE-NEXT: vmovmi.u8 r2, q0[11] +; CHECK-BE-NEXT: strbmi r2, [r0, #11] +; CHECK-BE-NEXT: lsls r2, r1, #19 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u8 r1, q0[12] -; CHECK-BE-NEXT: strbmi r1, [r0, #12] -; CHECK-BE-NEXT: lsls r1, r2, #18 +; CHECK-BE-NEXT: vmovmi.u8 r2, q0[12] +; CHECK-BE-NEXT: strbmi r2, [r0, #12] +; CHECK-BE-NEXT: lsls r2, r1, #18 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u8 r1, q0[13] -; CHECK-BE-NEXT: strbmi r1, [r0, #13] -; CHECK-BE-NEXT: lsls r1, r2, #17 +; CHECK-BE-NEXT: vmovmi.u8 r2, q0[13] +; CHECK-BE-NEXT: strbmi r2, [r0, #13] +; CHECK-BE-NEXT: lsls r2, r1, #17 ; CHECK-BE-NEXT: itt mi -; CHECK-BE-NEXT: vmovmi.u8 r1, q0[14] -; CHECK-BE-NEXT: strbmi r1, [r0, #14] -; CHECK-BE-NEXT: lsls r1, r2, #16 +; CHECK-BE-NEXT: vmovmi.u8 r2, q0[14] +; CHECK-BE-NEXT: strbmi r2, [r0, #14] +; CHECK-BE-NEXT: lsls r1, r1, #16 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi.u8 r1, q0[15] ; CHECK-BE-NEXT: strbmi r1, [r0, #15] @@ -1045,12 
+1329,11 @@ define i8* @masked_v16i8_post(i8* %y, i8* %x, <16 x i8> %a) { ; CHECK-LE-NEXT: vldr d1, [r7, #8] ; CHECK-LE-NEXT: sub.w r4, r7, #8 ; CHECK-LE-NEXT: vmov d0, r2, r3 -; CHECK-LE-NEXT: mov r2, sp ; CHECK-LE-NEXT: vcmp.s8 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r2] -; CHECK-LE-NEXT: ldrh.w r2, [sp] ; CHECK-LE-NEXT: vldrw.u32 q0, [r1] -; CHECK-LE-NEXT: lsls r1, r2, #31 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: uxth r2, r1 +; CHECK-LE-NEXT: lsls r1, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne.u8 r1, q0[0] ; CHECK-LE-NEXT: strbne r1, [r0] @@ -1106,8 +1389,8 @@ define i8* @masked_v16i8_post(i8* %y, i8* %x, <16 x i8> %a) { ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: vmovmi.u8 r1, q0[13] ; CHECK-LE-NEXT: strbmi r1, [r0, #13] -; CHECK-LE-NEXT: adds r1, r0, #4 ; CHECK-LE-NEXT: lsls r3, r2, #17 +; CHECK-LE-NEXT: add.w r1, r0, #4 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: vmovmi.u8 r3, q0[14] ; CHECK-LE-NEXT: strbmi r3, [r0, #14] @@ -1133,13 +1416,12 @@ define i8* @masked_v16i8_post(i8* %y, i8* %x, <16 x i8> %a) { ; CHECK-BE-NEXT: vldr d1, [r7, #8] ; CHECK-BE-NEXT: sub.w r4, r7, #8 ; CHECK-BE-NEXT: vmov d0, r3, r2 -; CHECK-BE-NEXT: mov r2, sp ; CHECK-BE-NEXT: vrev64.8 q1, q0 -; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r2] -; CHECK-BE-NEXT: ldrh.w r2, [sp] ; CHECK-BE-NEXT: vldrb.u8 q0, [r1] -; CHECK-BE-NEXT: lsls r1, r2, #31 +; CHECK-BE-NEXT: vcmp.s8 gt, q1, zr +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: uxth r2, r1 +; CHECK-BE-NEXT: lsls r1, r1, #31 ; CHECK-BE-NEXT: itt ne ; CHECK-BE-NEXT: vmovne.u8 r1, q0[0] ; CHECK-BE-NEXT: strbne r1, [r0] @@ -1195,8 +1477,8 @@ define i8* @masked_v16i8_post(i8* %y, i8* %x, <16 x i8> %a) { ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi.u8 r1, q0[13] ; CHECK-BE-NEXT: strbmi r1, [r0, #13] -; CHECK-BE-NEXT: adds r1, r0, #4 ; CHECK-BE-NEXT: lsls r3, r2, #17 +; CHECK-BE-NEXT: add.w r1, r0, #4 ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi.u8 r3, q0[14] ; CHECK-BE-NEXT: strbmi r3, [r0, #14] @@ 
-1223,10 +1505,22 @@ define arm_aapcs_vfpcc void @masked_v4f32(<4 x float> *%dest, <4 x float> %a, <4 ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: mov r1, sp ; CHECK-LE-NEXT: vcmp.i32 ne, q1, zr -; CHECK-LE-NEXT: vstr p0, [r1] -; CHECK-LE-NEXT: ldrb.w r1, [sp] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: and r3, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: and r1, r1, #15 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: it ne ; CHECK-LE-NEXT: vstrne s0, [r0] @@ -1247,11 +1541,23 @@ define arm_aapcs_vfpcc void @masked_v4f32(<4 x float> *%dest, <4 x float> %a, <4 ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.32 q2, q1 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vcmp.i32 ne, q2, zr ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: and r3, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: and r1, r1, #15 ; CHECK-BE-NEXT: lsls r2, r1, #31 ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: vstrne s4, [r0] @@ -1277,10 +1583,22 @@ define arm_aapcs_vfpcc void @masked_v4f32_align1(<4 x float> *%dest, 
<4 x float> ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #20 ; CHECK-LE-NEXT: sub sp, #20 -; CHECK-LE-NEXT: add r1, sp, #16 ; CHECK-LE-NEXT: vcmp.i32 ne, q1, zr -; CHECK-LE-NEXT: vstr p0, [r1] -; CHECK-LE-NEXT: ldrb.w r1, [sp, #16] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: and r3, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: and r1, r1, #15 ; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: ittt ne ; CHECK-LE-NEXT: vstrne s0, [sp, #12] @@ -1309,11 +1627,23 @@ define arm_aapcs_vfpcc void @masked_v4f32_align1(<4 x float> *%dest, <4 x float> ; CHECK-BE-NEXT: .pad #20 ; CHECK-BE-NEXT: sub sp, #20 ; CHECK-BE-NEXT: vrev64.32 q2, q1 -; CHECK-BE-NEXT: add r1, sp, #16 +; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: vcmp.i32 ne, q2, zr ; CHECK-BE-NEXT: vrev64.32 q1, q0 -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp, #16] +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: and r3, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #3, #1 +; CHECK-BE-NEXT: and r1, r1, #15 ; CHECK-BE-NEXT: lsls r2, r1, #31 ; CHECK-BE-NEXT: ittt ne ; CHECK-BE-NEXT: vstrne s4, [sp, #12] @@ -1350,21 +1680,33 @@ define i8* @masked_v4f32_pre(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-LE-NEXT: vldr d1, [sp, #8] ; CHECK-LE-NEXT: adds 
r0, #4 ; CHECK-LE-NEXT: vmov d0, r2, r3 -; CHECK-LE-NEXT: add r2, sp, #4 +; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r2] -; CHECK-LE-NEXT: ldrb.w r2, [sp, #4] +; CHECK-LE-NEXT: vmrs r12, p0 +; CHECK-LE-NEXT: and r3, r12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 ; CHECK-LE-NEXT: vldrw.u32 q0, [r1] -; CHECK-LE-NEXT: lsls r1, r2, #31 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: and r1, r2, #15 +; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: it ne ; CHECK-LE-NEXT: vstrne s0, [r0] -; CHECK-LE-NEXT: lsls r1, r2, #30 +; CHECK-LE-NEXT: lsls r2, r1, #30 ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vstrmi s1, [r0, #4] -; CHECK-LE-NEXT: lsls r1, r2, #29 +; CHECK-LE-NEXT: lsls r2, r1, #29 ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vstrmi s2, [r0, #8] -; CHECK-LE-NEXT: lsls r1, r2, #28 +; CHECK-LE-NEXT: lsls r1, r1, #28 ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vstrmi s3, [r0, #12] ; CHECK-LE-NEXT: add sp, #8 @@ -1377,22 +1719,34 @@ define i8* @masked_v4f32_pre(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-BE-NEXT: vldr d1, [sp, #8] ; CHECK-BE-NEXT: adds r0, #4 ; CHECK-BE-NEXT: vmov d0, r3, r2 -; CHECK-BE-NEXT: add r2, sp, #4 +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r2] -; CHECK-BE-NEXT: ldrb.w r2, [sp, #4] +; CHECK-BE-NEXT: vmrs r12, p0 +; CHECK-BE-NEXT: and r3, r12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; 
CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: vldrw.u32 q0, [r1] -; CHECK-BE-NEXT: lsls r1, r2, #31 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: and r1, r2, #15 +; CHECK-BE-NEXT: lsls r2, r1, #31 ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: vstrne s0, [r0] -; CHECK-BE-NEXT: lsls r1, r2, #30 +; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: it mi ; CHECK-BE-NEXT: vstrmi s1, [r0, #4] -; CHECK-BE-NEXT: lsls r1, r2, #29 +; CHECK-BE-NEXT: lsls r2, r1, #29 ; CHECK-BE-NEXT: it mi ; CHECK-BE-NEXT: vstrmi s2, [r0, #8] -; CHECK-BE-NEXT: lsls r1, r2, #28 +; CHECK-BE-NEXT: lsls r1, r1, #28 ; CHECK-BE-NEXT: it mi ; CHECK-BE-NEXT: vstrmi s3, [r0, #12] ; CHECK-BE-NEXT: add sp, #8 @@ -1414,11 +1768,23 @@ define i8* @masked_v4f32_post(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-LE-NEXT: sub sp, #8 ; CHECK-LE-NEXT: vldr d1, [sp, #8] ; CHECK-LE-NEXT: vmov d0, r2, r3 -; CHECK-LE-NEXT: add r2, sp, #4 +; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r2] -; CHECK-LE-NEXT: ldrb.w r2, [sp, #4] +; CHECK-LE-NEXT: vmrs r12, p0 +; CHECK-LE-NEXT: and r3, r12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 ; CHECK-LE-NEXT: vldrw.u32 q0, [r1] +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: and r2, r2, #15 ; CHECK-LE-NEXT: lsls r1, r2, #31 ; CHECK-LE-NEXT: it ne ; CHECK-LE-NEXT: vstrne s0, [r0] @@ -1442,12 +1808,24 @@ define i8* @masked_v4f32_post(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vldr d1, [sp, #8] ; CHECK-BE-NEXT: vmov d0, r3, r2 -; CHECK-BE-NEXT: add r2, sp, #4 +; CHECK-BE-NEXT: movs r2, #0 ; 
CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vcmp.s32 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r2] -; CHECK-BE-NEXT: ldrb.w r2, [sp, #4] +; CHECK-BE-NEXT: vmrs r12, p0 +; CHECK-BE-NEXT: and r3, r12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: vldrw.u32 q0, [r1] +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: and r2, r2, #15 ; CHECK-BE-NEXT: lsls r1, r2, #31 ; CHECK-BE-NEXT: it ne ; CHECK-BE-NEXT: vstrne s0, [r0] @@ -1480,11 +1858,35 @@ define arm_aapcs_vfpcc void @masked_v8f16(<8 x half> *%dest, <8 x half> %a, <8 x ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #8 ; CHECK-LE-NEXT: sub sp, #8 -; CHECK-LE-NEXT: mov r1, sp ; CHECK-LE-NEXT: vcmp.i16 ne, q1, zr -; CHECK-LE-NEXT: vstr p0, [r1] -; CHECK-LE-NEXT: ldrb.w r1, [sp] -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: 
rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: bne .LBB15_9 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r2, r1, #30 @@ -1552,12 +1954,36 @@ define arm_aapcs_vfpcc void @masked_v8f16(<8 x half> *%dest, <8 x half> %a, <8 x ; CHECK-BE-NEXT: .pad #8 ; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vrev64.16 q2, q1 -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.i16 ne, q2, zr ; CHECK-BE-NEXT: vrev64.16 q1, q0 -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] -; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: and r3, r1, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: bne .LBB15_9 ; CHECK-BE-NEXT: @ %bb.1: @ %else ; CHECK-BE-NEXT: lsls r2, r1, #30 @@ -1630,11 +2056,35 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(<8 x half> *%dest, <8 x half> % ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: .pad #40 ; CHECK-LE-NEXT: sub sp, #40 -; CHECK-LE-NEXT: add r1, sp, #32 ; CHECK-LE-NEXT: vcmp.i16 ne, q1, zr -; CHECK-LE-NEXT: vstr p0, [r1] -; 
CHECK-LE-NEXT: ldrb.w r1, [sp, #32] -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r3, r1, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r2, r1, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: bne .LBB16_9 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r2, r1, #30 @@ -1718,12 +2168,36 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(<8 x half> *%dest, <8 x half> % ; CHECK-BE-NEXT: .pad #40 ; CHECK-BE-NEXT: sub sp, #40 ; CHECK-BE-NEXT: vrev64.16 q2, q1 -; CHECK-BE-NEXT: add r1, sp, #32 +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vcmp.i16 ne, q2, zr ; CHECK-BE-NEXT: vrev64.16 q1, q0 -; CHECK-BE-NEXT: vstr p0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp, #32] -; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: and r3, r1, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #6, #1 +; 
CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: bne .LBB16_9 ; CHECK-BE-NEXT: @ %bb.1: @ %else ; CHECK-BE-NEXT: lsls r2, r1, #30 @@ -1815,12 +2289,36 @@ define i8* @masked_v8f16_pre(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-LE-NEXT: vldr d1, [sp, #8] ; CHECK-LE-NEXT: adds r0, #4 ; CHECK-LE-NEXT: vmov d0, r2, r3 -; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r2] +; CHECK-LE-NEXT: vmrs r12, p0 +; CHECK-LE-NEXT: and r3, r12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r2, r3, #6, #1 +; CHECK-LE-NEXT: ubfx r3, r12, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 ; CHECK-LE-NEXT: vldrw.u32 q0, [r1] -; CHECK-LE-NEXT: ldrb.w r1, [sp] -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: bfi r2, r3, #7, #1 +; 
CHECK-LE-NEXT: uxtb r1, r2 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: bne .LBB17_9 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r2, r1, #30 @@ -1890,13 +2388,37 @@ define i8* @masked_v8f16_pre(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-BE-NEXT: vldr d1, [sp, #8] ; CHECK-BE-NEXT: adds r0, #4 ; CHECK-BE-NEXT: vmov d0, r3, r2 -; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: movs r2, #0 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r2] +; CHECK-BE-NEXT: vmrs r12, p0 +; CHECK-BE-NEXT: and r3, r12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #2, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #1, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #6, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #12, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r2, r3, #6, #1 +; CHECK-BE-NEXT: ubfx r3, r12, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 ; CHECK-BE-NEXT: vldrh.u16 q0, [r1] -; CHECK-BE-NEXT: ldrb.w r1, [sp] -; CHECK-BE-NEXT: lsls r2, r1, #31 +; CHECK-BE-NEXT: bfi r2, r3, #7, #1 +; CHECK-BE-NEXT: uxtb r1, r2 +; CHECK-BE-NEXT: lsls r2, r2, #31 ; CHECK-BE-NEXT: bne .LBB17_9 ; CHECK-BE-NEXT: @ %bb.1: @ %else ; CHECK-BE-NEXT: lsls r2, r1, #30 @@ -1975,12 +2497,36 @@ define i8* @masked_v8f16_post(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-LE-NEXT: sub sp, #8 ; CHECK-LE-NEXT: vldr d1, [sp, #8] ; CHECK-LE-NEXT: vmov d0, r2, r3 -; CHECK-LE-NEXT: mov r2, sp +; CHECK-LE-NEXT: movs r3, #0 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: vstr p0, [r2] -; 
CHECK-LE-NEXT: ldrb.w r2, [sp] +; CHECK-LE-NEXT: vmrs r12, p0 +; CHECK-LE-NEXT: and r2, r12, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #0, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #1, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #4, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #2, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #6, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #3, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #8, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #4, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #10, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #5, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #12, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r3, r2, #6, #1 +; CHECK-LE-NEXT: ubfx r2, r12, #14, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 ; CHECK-LE-NEXT: vldrw.u32 q0, [r1] -; CHECK-LE-NEXT: lsls r1, r2, #31 +; CHECK-LE-NEXT: bfi r3, r2, #7, #1 +; CHECK-LE-NEXT: uxtb r2, r3 +; CHECK-LE-NEXT: lsls r1, r3, #31 ; CHECK-LE-NEXT: bne .LBB18_12 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r1, r2, #30 @@ -2046,13 +2592,37 @@ define i8* @masked_v8f16_post(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-BE-NEXT: sub sp, #8 ; CHECK-BE-NEXT: vldr d1, [sp, #8] ; CHECK-BE-NEXT: vmov d0, r3, r2 -; CHECK-BE-NEXT: mov r2, sp +; CHECK-BE-NEXT: movs r3, #0 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr -; CHECK-BE-NEXT: vstr p0, [r2] -; CHECK-BE-NEXT: ldrb.w r2, [sp] +; CHECK-BE-NEXT: vmrs r12, p0 +; CHECK-BE-NEXT: and r2, r12, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #0, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #1, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #4, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #2, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #6, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: 
bfi r3, r2, #3, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #8, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #4, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #10, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #5, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #12, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r3, r2, #6, #1 +; CHECK-BE-NEXT: ubfx r2, r12, #14, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 ; CHECK-BE-NEXT: vldrh.u16 q0, [r1] -; CHECK-BE-NEXT: lsls r1, r2, #31 +; CHECK-BE-NEXT: bfi r3, r2, #7, #1 +; CHECK-BE-NEXT: uxtb r2, r3 +; CHECK-BE-NEXT: lsls r1, r3, #31 ; CHECK-BE-NEXT: bne .LBB18_12 ; CHECK-BE-NEXT: @ %bb.1: @ %else ; CHECK-BE-NEXT: lsls r1, r2, #30 diff --git a/test/CodeGen/Thumb2/mve-pred-bitcast.ll b/test/CodeGen/Thumb2/mve-pred-bitcast.ll dissimilarity index 68% index a89239ea098..f8f2e0b5613 100644 --- a/test/CodeGen/Thumb2/mve-pred-bitcast.ll +++ b/test/CodeGen/Thumb2/mve-pred-bitcast.ll @@ -1,172 +1,433 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK - -define arm_aapcs_vfpcc <4 x i32> @bitcast_to_v4i1(i4 %b, <4 x i32> %a) { -; CHECK-LABEL: bitcast_to_v4i1: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: and r0, r0, #15 -; CHECK-NEXT: strb.w r0, [sp] -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: vldr p0, [r0] -; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: bx lr -entry: - %c = bitcast i4 %b to <4 x i1> - %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> zeroinitializer - ret <4 x i32> %s -} - -define arm_aapcs_vfpcc <8 x i16> @bitcast_to_v8i1(i8 %b, <8 x i16> %a) { -; CHECK-LABEL: bitcast_to_v8i1: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: strb.w r0, [sp] -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vldr p0, [r0] -; 
CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: bx lr -entry: - %c = bitcast i8 %b to <8 x i1> - %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> zeroinitializer - ret <8 x i16> %s -} - -define arm_aapcs_vfpcc <16 x i8> @bitcast_to_v16i1(i16 %b, <16 x i8> %a) { -; CHECK-LABEL: bitcast_to_v16i1: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r6, r7, lr} -; CHECK-NEXT: push {r4, r6, r7, lr} -; CHECK-NEXT: .setfp r7, sp, #8 -; CHECK-NEXT: add r7, sp, #8 -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: mov r4, sp -; CHECK-NEXT: bfc r4, #0, #4 -; CHECK-NEXT: mov sp, r4 -; CHECK-NEXT: strh.w r0, [sp] -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: sub.w r4, r7, #8 -; CHECK-NEXT: vldr p0, [r0] -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: mov sp, r4 -; CHECK-NEXT: pop {r4, r6, r7, pc} -entry: - %c = bitcast i16 %b to <16 x i1> - %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> zeroinitializer - ret <16 x i8> %s -} - -define arm_aapcs_vfpcc <2 x i64> @bitcast_to_v2i1(i2 %b, <2 x i64> %a) { -; CHECK-LABEL: bitcast_to_v2i1: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: and r0, r0, #3 -; CHECK-NEXT: sbfx r1, r0, #0, #1 -; CHECK-NEXT: sbfx r0, r0, #1, #1 -; CHECK-NEXT: vmov.32 q1[0], r1 -; CHECK-NEXT: vmov.32 q1[1], r1 -; CHECK-NEXT: vmov.32 q1[2], r0 -; CHECK-NEXT: vmov.32 q1[3], r0 -; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: bx lr -entry: - %c = bitcast i2 %b to <2 x i1> - %s = select <2 x i1> %c, <2 x i64> %a, <2 x i64> zeroinitializer - ret <2 x i64> %s -} - - -define arm_aapcs_vfpcc i4 @bitcast_from_v4i1(<4 x i32> %a) { -; CHECK-LABEL: bitcast_from_v4i1: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vstr p0, [r0] -; CHECK-NEXT: ldrb.w r0, [sp] -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: bx lr 
-entry: - %c = icmp eq <4 x i32> %a, zeroinitializer - %b = bitcast <4 x i1> %c to i4 - ret i4 %b -} - -define arm_aapcs_vfpcc i8 @bitcast_from_v8i1(<8 x i16> %a) { -; CHECK-LABEL: bitcast_from_v8i1: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vcmp.i16 eq, q0, zr -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vstr p0, [r0] -; CHECK-NEXT: ldrb.w r0, [sp] -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: bx lr -entry: - %c = icmp eq <8 x i16> %a, zeroinitializer - %b = bitcast <8 x i1> %c to i8 - ret i8 %b -} - -define arm_aapcs_vfpcc i16 @bitcast_from_v16i1(<16 x i8> %a) { -; CHECK-LABEL: bitcast_from_v16i1: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r6, r7, lr} -; CHECK-NEXT: push {r4, r6, r7, lr} -; CHECK-NEXT: .setfp r7, sp, #8 -; CHECK-NEXT: add r7, sp, #8 -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: mov r4, sp -; CHECK-NEXT: bfc r4, #0, #4 -; CHECK-NEXT: mov sp, r4 -; CHECK-NEXT: sub.w r4, r7, #8 -; CHECK-NEXT: vcmp.i8 eq, q0, zr -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vstr p0, [r0] -; CHECK-NEXT: ldrh.w r0, [sp] -; CHECK-NEXT: mov sp, r4 -; CHECK-NEXT: pop {r4, r6, r7, pc} -entry: - %c = icmp eq <16 x i8> %a, zeroinitializer - %b = bitcast <16 x i1> %c to i16 - ret i16 %b -} - -define arm_aapcs_vfpcc i2 @bitcast_from_v2i1(<2 x i64> %a) { -; CHECK-LABEL: bitcast_from_v2i1: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: cset r0, eq -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: cset r1, eq -; CHECK-NEXT: ands r1, r1, #1 -; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r1, #1 -; CHECK-NEXT: bfi r1, r0, #0, #1 -; CHECK-NEXT: and r0, r1, #3 -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: bx lr -entry: - %c = icmp eq <2 x i64> %a, zeroinitializer - %b = bitcast <2 x i1> %c to i2 - ret i2 %b -} +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE +; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE + +define arm_aapcs_vfpcc <4 x i32> @bitcast_to_v4i1(i4 %b, <4 x i32> %a) { +; CHECK-LE-LABEL: bitcast_to_v4i1: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: and r0, r0, #15 +; CHECK-LE-NEXT: vmov.i8 q1, #0x0 +; CHECK-LE-NEXT: vmov.i8 q2, #0xff +; CHECK-LE-NEXT: vmsr p0, r0 +; CHECK-LE-NEXT: vpsel q1, q2, q1 +; CHECK-LE-NEXT: vmov.u8 r0, q1[0] +; CHECK-LE-NEXT: vmov.32 q2[0], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[1] +; CHECK-LE-NEXT: vmov.32 q2[1], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[2] +; CHECK-LE-NEXT: vmov.32 q2[2], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[3] +; CHECK-LE-NEXT: vmov.32 q2[3], r0 +; CHECK-LE-NEXT: vmov.i32 q1, #0x0 +; CHECK-LE-NEXT: vcmp.i32 ne, q2, zr +; CHECK-LE-NEXT: vpsel q0, q0, q1 +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: bitcast_to_v4i1: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: and r0, r0, #15 +; CHECK-BE-NEXT: vmov.i8 q1, #0x0 +; CHECK-BE-NEXT: vmov.i8 q2, #0xff +; CHECK-BE-NEXT: vmsr p0, r0 +; CHECK-BE-NEXT: vpsel q1, q2, q1 +; CHECK-BE-NEXT: vmov.u8 r0, q1[0] +; CHECK-BE-NEXT: vmov.32 q2[0], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[1] +; CHECK-BE-NEXT: vmov.32 q2[1], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[2] +; CHECK-BE-NEXT: vmov.32 q2[2], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[3] +; CHECK-BE-NEXT: vmov.32 q2[3], r0 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vcmp.i32 ne, q2, zr +; CHECK-BE-NEXT: vmov.i32 q0, #0x0 +; CHECK-BE-NEXT: vpsel q1, q1, q0 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = bitcast i4 
%b to <4 x i1> + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> zeroinitializer + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <8 x i16> @bitcast_to_v8i1(i8 %b, <8 x i16> %a) { +; CHECK-LE-LABEL: bitcast_to_v8i1: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: uxtb r0, r0 +; CHECK-LE-NEXT: vmov.i8 q1, #0x0 +; CHECK-LE-NEXT: vmov.i8 q2, #0xff +; CHECK-LE-NEXT: vmsr p0, r0 +; CHECK-LE-NEXT: vpsel q2, q2, q1 +; CHECK-LE-NEXT: vmov.u8 r0, q2[0] +; CHECK-LE-NEXT: vmov.16 q1[0], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q2[1] +; CHECK-LE-NEXT: vmov.16 q1[1], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q2[2] +; CHECK-LE-NEXT: vmov.16 q1[2], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q2[3] +; CHECK-LE-NEXT: vmov.16 q1[3], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q2[4] +; CHECK-LE-NEXT: vmov.16 q1[4], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q2[5] +; CHECK-LE-NEXT: vmov.16 q1[5], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q2[6] +; CHECK-LE-NEXT: vmov.16 q1[6], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q2[7] +; CHECK-LE-NEXT: vmov.16 q1[7], r0 +; CHECK-LE-NEXT: vcmp.i16 ne, q1, zr +; CHECK-LE-NEXT: vmov.i32 q1, #0x0 +; CHECK-LE-NEXT: vpsel q0, q0, q1 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: bitcast_to_v8i1: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: uxtb r0, r0 +; CHECK-BE-NEXT: vmov.i8 q1, #0x0 +; CHECK-BE-NEXT: vmov.i8 q2, #0xff +; CHECK-BE-NEXT: vmsr p0, r0 +; CHECK-BE-NEXT: vpsel q2, q2, q1 +; CHECK-BE-NEXT: vmov.u8 r0, q2[0] +; CHECK-BE-NEXT: vmov.16 q1[0], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q2[1] +; CHECK-BE-NEXT: vmov.16 q1[1], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q2[2] +; CHECK-BE-NEXT: vmov.16 q1[2], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q2[3] +; CHECK-BE-NEXT: vmov.16 q1[3], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q2[4] +; CHECK-BE-NEXT: vmov.16 q1[4], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q2[5] +; CHECK-BE-NEXT: vmov.16 q1[5], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q2[6] +; CHECK-BE-NEXT: vmov.16 q1[6], 
r0 +; CHECK-BE-NEXT: vmov.u8 r0, q2[7] +; CHECK-BE-NEXT: vmov.16 q1[7], r0 +; CHECK-BE-NEXT: vcmp.i16 ne, q1, zr +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vmov.i32 q0, #0x0 +; CHECK-BE-NEXT: vrev32.16 q0, q0 +; CHECK-BE-NEXT: vpsel q1, q1, q0 +; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +entry: + %c = bitcast i8 %b to <8 x i1> + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> zeroinitializer + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <16 x i8> @bitcast_to_v16i1(i16 %b, <16 x i8> %a) { +; CHECK-LE-LABEL: bitcast_to_v16i1: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r4, r6, r7, lr} +; CHECK-LE-NEXT: push {r4, r6, r7, lr} +; CHECK-LE-NEXT: .setfp r7, sp, #8 +; CHECK-LE-NEXT: add r7, sp, #8 +; CHECK-LE-NEXT: .pad #16 +; CHECK-LE-NEXT: sub sp, #16 +; CHECK-LE-NEXT: mov r4, sp +; CHECK-LE-NEXT: bfc r4, #0, #4 +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: uxth r0, r0 +; CHECK-LE-NEXT: sub.w r4, r7, #8 +; CHECK-LE-NEXT: vmov.i32 q1, #0x0 +; CHECK-LE-NEXT: vmsr p0, r0 +; CHECK-LE-NEXT: vpsel q0, q0, q1 +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: pop {r4, r6, r7, pc} +; +; CHECK-BE-LABEL: bitcast_to_v16i1: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r4, r6, r7, lr} +; CHECK-BE-NEXT: push {r4, r6, r7, lr} +; CHECK-BE-NEXT: .setfp r7, sp, #8 +; CHECK-BE-NEXT: add r7, sp, #8 +; CHECK-BE-NEXT: .pad #16 +; CHECK-BE-NEXT: sub sp, #16 +; CHECK-BE-NEXT: mov r4, sp +; CHECK-BE-NEXT: bfc r4, #0, #4 +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: vrev64.8 q1, q0 +; CHECK-BE-NEXT: vmov.i32 q0, #0x0 +; CHECK-BE-NEXT: uxth r0, r0 +; CHECK-BE-NEXT: sub.w r4, r7, #8 +; CHECK-BE-NEXT: vrev32.8 q0, q0 +; CHECK-BE-NEXT: vmsr p0, r0 +; CHECK-BE-NEXT: vpsel q1, q1, q0 +; CHECK-BE-NEXT: vrev64.8 q0, q1 +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: pop {r4, r6, r7, pc} +entry: + %c = bitcast i16 %b to <16 x i1> + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> zeroinitializer + ret <16 x i8> %s +} + +define 
arm_aapcs_vfpcc <2 x i64> @bitcast_to_v2i1(i2 %b, <2 x i64> %a) { +; CHECK-LE-LABEL: bitcast_to_v2i1: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: and r0, r0, #3 +; CHECK-LE-NEXT: sbfx r1, r0, #0, #1 +; CHECK-LE-NEXT: sbfx r0, r0, #1, #1 +; CHECK-LE-NEXT: vmov.32 q1[0], r1 +; CHECK-LE-NEXT: vmov.32 q1[1], r1 +; CHECK-LE-NEXT: vmov.32 q1[2], r0 +; CHECK-LE-NEXT: vmov.32 q1[3], r0 +; CHECK-LE-NEXT: vand q0, q0, q1 +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: bitcast_to_v2i1: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: and r0, r0, #3 +; CHECK-BE-NEXT: sbfx r1, r0, #0, #1 +; CHECK-BE-NEXT: sbfx r0, r0, #1, #1 +; CHECK-BE-NEXT: vmov.32 q1[0], r1 +; CHECK-BE-NEXT: vmov.32 q1[1], r1 +; CHECK-BE-NEXT: vmov.32 q1[2], r0 +; CHECK-BE-NEXT: vmov.32 q1[3], r0 +; CHECK-BE-NEXT: vrev64.32 q2, q1 +; CHECK-BE-NEXT: vand q0, q0, q2 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = bitcast i2 %b to <2 x i1> + %s = select <2 x i1> %c, <2 x i64> %a, <2 x i64> zeroinitializer + ret <2 x i64> %s +} + + +define arm_aapcs_vfpcc i4 @bitcast_from_v4i1(<4 x i32> %a) { +; CHECK-LE-LABEL: bitcast_from_v4i1: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vcmp.i32 eq, q0, zr +; CHECK-LE-NEXT: movs r0, #0 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r2, r1, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r0, r2, #0, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r0, r2, #1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #12, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r0, r2, #2, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi r0, r1, #3, #1 +; CHECK-LE-NEXT: and r0, r0, #15 +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: 
bitcast_from_v4i1: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: movs r3, #0 +; CHECK-BE-NEXT: vcmp.i32 eq, q1, zr +; CHECK-BE-NEXT: vmrs r0, p0 +; CHECK-BE-NEXT: and r2, r0, #1 +; CHECK-BE-NEXT: ubfx r1, r0, #4, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r3, r2, #0, #1 +; CHECK-BE-NEXT: bfi r3, r1, #1, #1 +; CHECK-BE-NEXT: ubfx r1, r0, #8, #1 +; CHECK-BE-NEXT: ubfx r0, r0, #12, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r3, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r0, r0, #0 +; CHECK-BE-NEXT: bfi r3, r0, #3, #1 +; CHECK-BE-NEXT: and r0, r3, #15 +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp eq <4 x i32> %a, zeroinitializer + %b = bitcast <4 x i1> %c to i4 + ret i4 %b +} + +define arm_aapcs_vfpcc i8 @bitcast_from_v8i1(<8 x i16> %a) { +; CHECK-LE-LABEL: bitcast_from_v8i1: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vcmp.i16 eq, q0, zr +; CHECK-LE-NEXT: movs r0, #0 +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: and r2, r1, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r0, r2, #0, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r0, r2, #1, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r0, r2, #2, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #6, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r0, r2, #3, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r0, r2, #4, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #10, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r0, r2, #5, #1 +; CHECK-LE-NEXT: ubfx r2, r1, #12, #1 +; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r0, r2, #6, #1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: bfi 
r0, r1, #7, #1 +; CHECK-LE-NEXT: uxtb r0, r0 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: bitcast_from_v8i1: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vcmp.i16 eq, q1, zr +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: ubfx r0, r1, #2, #1 +; CHECK-BE-NEXT: rsbs r2, r0, #0 +; CHECK-BE-NEXT: and r0, r1, #1 +; CHECK-BE-NEXT: rsbs r3, r0, #0 +; CHECK-BE-NEXT: movs r0, #0 +; CHECK-BE-NEXT: bfi r0, r3, #0, #1 +; CHECK-BE-NEXT: bfi r0, r2, #1, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r0, r2, #2, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #6, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r0, r2, #3, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #8, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r0, r2, #4, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #10, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r0, r2, #5, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #12, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #14, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r0, r2, #6, #1 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r0, r1, #7, #1 +; CHECK-BE-NEXT: uxtb r0, r0 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: bx lr +entry: + %c = icmp eq <8 x i16> %a, zeroinitializer + %b = bitcast <8 x i1> %c to i8 + ret i8 %b +} + +define arm_aapcs_vfpcc i16 @bitcast_from_v16i1(<16 x i8> %a) { +; CHECK-LE-LABEL: bitcast_from_v16i1: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r4, r6, r7, lr} +; CHECK-LE-NEXT: push {r4, r6, r7, lr} +; CHECK-LE-NEXT: .setfp r7, sp, #8 +; CHECK-LE-NEXT: add r7, sp, #8 +; CHECK-LE-NEXT: .pad #16 +; CHECK-LE-NEXT: sub sp, #16 +; CHECK-LE-NEXT: mov r4, sp +; CHECK-LE-NEXT: bfc r4, #0, #4 +; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: vcmp.i8 eq, q0, zr +; CHECK-LE-NEXT: sub.w r4, r7, #8 +; CHECK-LE-NEXT: vmrs r0, p0 +; CHECK-LE-NEXT: uxth r0, r0 +; CHECK-LE-NEXT: mov 
sp, r4 +; CHECK-LE-NEXT: pop {r4, r6, r7, pc} +; +; CHECK-BE-LABEL: bitcast_from_v16i1: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r4, r6, r7, lr} +; CHECK-BE-NEXT: push {r4, r6, r7, lr} +; CHECK-BE-NEXT: .setfp r7, sp, #8 +; CHECK-BE-NEXT: add r7, sp, #8 +; CHECK-BE-NEXT: .pad #16 +; CHECK-BE-NEXT: sub sp, #16 +; CHECK-BE-NEXT: mov r4, sp +; CHECK-BE-NEXT: bfc r4, #0, #4 +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: vrev64.8 q1, q0 +; CHECK-BE-NEXT: sub.w r4, r7, #8 +; CHECK-BE-NEXT: vcmp.i8 eq, q1, zr +; CHECK-BE-NEXT: vmrs r0, p0 +; CHECK-BE-NEXT: uxth r0, r0 +; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: pop {r4, r6, r7, pc} +entry: + %c = icmp eq <16 x i8> %a, zeroinitializer + %b = bitcast <16 x i1> %c to i16 + ret i16 %b +} + +define arm_aapcs_vfpcc i2 @bitcast_from_v2i1(<2 x i64> %a) { +; CHECK-LE-LABEL: bitcast_from_v2i1: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 +; CHECK-LE-NEXT: vmov r0, s1 +; CHECK-LE-NEXT: vmov r1, s0 +; CHECK-LE-NEXT: vmov r2, s2 +; CHECK-LE-NEXT: orrs r0, r1 +; CHECK-LE-NEXT: vmov r1, s3 +; CHECK-LE-NEXT: cset r0, eq +; CHECK-LE-NEXT: orrs r1, r2 +; CHECK-LE-NEXT: cset r1, eq +; CHECK-LE-NEXT: ands r1, r1, #1 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: mvnne r1, #1 +; CHECK-LE-NEXT: bfi r1, r0, #0, #1 +; CHECK-LE-NEXT: and r0, r1, #3 +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: bitcast_from_v2i1: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vmov r0, s6 +; CHECK-BE-NEXT: vmov r1, s7 +; CHECK-BE-NEXT: vmov r2, s5 +; CHECK-BE-NEXT: orrs r0, r1 +; CHECK-BE-NEXT: vmov r1, s4 +; CHECK-BE-NEXT: cset r0, eq +; CHECK-BE-NEXT: orrs r1, r2 +; CHECK-BE-NEXT: cset r1, eq +; CHECK-BE-NEXT: ands r1, r1, #1 +; CHECK-BE-NEXT: it ne +; CHECK-BE-NEXT: mvnne r1, #1 +; CHECK-BE-NEXT: bfi r1, r0, #0, #1 +; CHECK-BE-NEXT: and r0, r1, #3 +; CHECK-BE-NEXT: add sp, #4 +; 
CHECK-BE-NEXT: bx lr +entry: + %c = icmp eq <2 x i64> %a, zeroinitializer + %b = bitcast <2 x i1> %c to i2 + ret i2 %b +} diff --git a/test/CodeGen/Thumb2/mve-pred-loadstore.ll b/test/CodeGen/Thumb2/mve-pred-loadstore.ll index 997efd09b2a..d314d0525c4 100644 --- a/test/CodeGen/Thumb2/mve-pred-loadstore.ll +++ b/test/CodeGen/Thumb2/mve-pred-loadstore.ll @@ -5,15 +5,41 @@ define arm_aapcs_vfpcc <4 x i32> @load_v4i1(<4 x i1> *%src, <4 x i32> %a) { ; CHECK-LE-LABEL: load_v4i1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vldr p0, [r0] +; CHECK-LE-NEXT: ldrb r0, [r0] +; CHECK-LE-NEXT: vmov.i8 q1, #0x0 +; CHECK-LE-NEXT: vmov.i8 q2, #0xff +; CHECK-LE-NEXT: vmsr p0, r0 +; CHECK-LE-NEXT: vpsel q1, q2, q1 +; CHECK-LE-NEXT: vmov.u8 r0, q1[0] +; CHECK-LE-NEXT: vmov.32 q2[0], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[1] +; CHECK-LE-NEXT: vmov.32 q2[1], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[2] +; CHECK-LE-NEXT: vmov.32 q2[2], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q1[3] +; CHECK-LE-NEXT: vmov.32 q2[3], r0 ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 +; CHECK-LE-NEXT: vcmp.i32 ne, q2, zr ; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: load_v4i1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: vldr p0, [r0] +; CHECK-BE-NEXT: ldrb r0, [r0] +; CHECK-BE-NEXT: vmov.i8 q1, #0x0 +; CHECK-BE-NEXT: vmov.i8 q2, #0xff +; CHECK-BE-NEXT: vmsr p0, r0 +; CHECK-BE-NEXT: vpsel q1, q2, q1 +; CHECK-BE-NEXT: vmov.u8 r0, q1[0] +; CHECK-BE-NEXT: vmov.32 q2[0], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[1] +; CHECK-BE-NEXT: vmov.32 q2[1], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[2] +; CHECK-BE-NEXT: vmov.32 q2[2], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q1[3] +; CHECK-BE-NEXT: vmov.32 q2[3], r0 ; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vcmp.i32 ne, q2, zr ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.32 q0, q1 @@ -27,16 +53,58 @@ entry: define arm_aapcs_vfpcc <8 x i16> @load_v8i1(<8 x i1> *%src, <8 x i16> %a) { ; CHECK-LE-LABEL: load_v8i1: ; CHECK-LE: @ 
%bb.0: @ %entry -; CHECK-LE-NEXT: vldr p0, [r0] +; CHECK-LE-NEXT: ldrb r0, [r0] +; CHECK-LE-NEXT: vmov.i8 q1, #0x0 +; CHECK-LE-NEXT: vmov.i8 q2, #0xff +; CHECK-LE-NEXT: vmsr p0, r0 +; CHECK-LE-NEXT: vpsel q2, q2, q1 +; CHECK-LE-NEXT: vmov.u8 r0, q2[0] +; CHECK-LE-NEXT: vmov.16 q1[0], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q2[1] +; CHECK-LE-NEXT: vmov.16 q1[1], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q2[2] +; CHECK-LE-NEXT: vmov.16 q1[2], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q2[3] +; CHECK-LE-NEXT: vmov.16 q1[3], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q2[4] +; CHECK-LE-NEXT: vmov.16 q1[4], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q2[5] +; CHECK-LE-NEXT: vmov.16 q1[5], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q2[6] +; CHECK-LE-NEXT: vmov.16 q1[6], r0 +; CHECK-LE-NEXT: vmov.u8 r0, q2[7] +; CHECK-LE-NEXT: vmov.16 q1[7], r0 +; CHECK-LE-NEXT: vcmp.i16 ne, q1, zr ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 ; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: load_v8i1: ; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: ldrb r0, [r0] +; CHECK-BE-NEXT: vmov.i8 q1, #0x0 +; CHECK-BE-NEXT: vmov.i8 q2, #0xff +; CHECK-BE-NEXT: vmsr p0, r0 +; CHECK-BE-NEXT: vpsel q2, q2, q1 +; CHECK-BE-NEXT: vmov.u8 r0, q2[0] +; CHECK-BE-NEXT: vmov.16 q1[0], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q2[1] +; CHECK-BE-NEXT: vmov.16 q1[1], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q2[2] +; CHECK-BE-NEXT: vmov.16 q1[2], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q2[3] +; CHECK-BE-NEXT: vmov.16 q1[3], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q2[4] +; CHECK-BE-NEXT: vmov.16 q1[4], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q2[5] +; CHECK-BE-NEXT: vmov.16 q1[5], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q2[6] +; CHECK-BE-NEXT: vmov.16 q1[6], r0 +; CHECK-BE-NEXT: vmov.u8 r0, q2[7] +; CHECK-BE-NEXT: vmov.16 q1[7], r0 +; CHECK-BE-NEXT: vcmp.i16 ne, q1, zr ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 -; CHECK-BE-NEXT: vldr p0, [r0] ; CHECK-BE-NEXT: vrev32.16 q0, q0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.16 q0, q1 @@ -50,17 +118,19 @@ entry: 
define arm_aapcs_vfpcc <16 x i8> @load_v16i1(<16 x i1> *%src, <16 x i8> %a) { ; CHECK-LE-LABEL: load_v16i1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vldr p0, [r0] +; CHECK-LE-NEXT: ldrh r0, [r0] ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 +; CHECK-LE-NEXT: vmsr p0, r0 ; CHECK-LE-NEXT: vpsel q0, q0, q1 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: load_v16i1: ; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: ldrh r0, [r0] ; CHECK-BE-NEXT: vrev64.8 q1, q0 ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 -; CHECK-BE-NEXT: vldr p0, [r0] ; CHECK-BE-NEXT: vrev32.8 q0, q0 +; CHECK-BE-NEXT: vmsr p0, r0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.8 q0, q1 ; CHECK-BE-NEXT: bx lr @@ -106,14 +176,44 @@ define arm_aapcs_vfpcc void @store_v4i1(<4 x i1> *%dst, <4 x i32> %a) { ; CHECK-LE-LABEL: store_v4i1: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vcmp.i32 eq, q0, zr -; CHECK-LE-NEXT: vstr p0, [r0] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: and r3, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #3, #1 +; CHECK-LE-NEXT: and r1, r1, #15 +; CHECK-LE-NEXT: strb r1, [r0] ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: store_v4i1: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vcmp.i32 eq, q1, zr -; CHECK-BE-NEXT: vstr p0, [r0] +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: and r3, r1, #1 +; CHECK-BE-NEXT: ubfx r2, r1, #4, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: rsb.w r12, r2, #0 +; CHECK-BE-NEXT: movs r2, #0 +; CHECK-BE-NEXT: bfi r2, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r1, #8, #1 +; CHECK-BE-NEXT: ubfx r1, r1, #12, #1 +; CHECK-BE-NEXT: bfi r2, r12, #1, #1 +; 
CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: rsbs r1, r1, #0 +; CHECK-BE-NEXT: bfi r2, r3, #2, #1 +; CHECK-BE-NEXT: bfi r2, r1, #3, #1 +; CHECK-BE-NEXT: and r1, r2, #15 +; CHECK-BE-NEXT: strb r1, [r0] ; CHECK-BE-NEXT: bx lr entry: %c = icmp eq <4 x i32> %a, zeroinitializer @@ -125,14 +225,66 @@ define arm_aapcs_vfpcc void @store_v8i1(<8 x i1> *%dst, <8 x i16> %a) { ; CHECK-LE-LABEL: store_v8i1: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vcmp.i16 eq, q0, zr -; CHECK-LE-NEXT: vstr p0, [r0] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: and r3, r2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #2, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #6, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #10, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #12, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #14, #1 +; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #6, #1 +; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #7, #1 +; CHECK-LE-NEXT: strb r1, [r0] ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: store_v8i1: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vcmp.i16 eq, q1, zr -; CHECK-BE-NEXT: vstr p0, [r0] +; CHECK-BE-NEXT: vmrs r2, p0 +; CHECK-BE-NEXT: ubfx r1, r2, #2, #1 +; CHECK-BE-NEXT: rsb.w r12, r1, #0 +; CHECK-BE-NEXT: and r1, r2, #1 +; CHECK-BE-NEXT: rsbs r3, r1, #0 +; CHECK-BE-NEXT: movs r1, #0 +; CHECK-BE-NEXT: bfi r1, r3, #0, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-BE-NEXT: bfi r1, r12, #1, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 
+; CHECK-BE-NEXT: bfi r1, r3, #2, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #6, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #3, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #4, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #10, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #5, #1 +; CHECK-BE-NEXT: ubfx r3, r2, #12, #1 +; CHECK-BE-NEXT: ubfx r2, r2, #14, #1 +; CHECK-BE-NEXT: rsbs r3, r3, #0 +; CHECK-BE-NEXT: bfi r1, r3, #6, #1 +; CHECK-BE-NEXT: rsbs r2, r2, #0 +; CHECK-BE-NEXT: bfi r1, r2, #7, #1 +; CHECK-BE-NEXT: strb r1, [r0] ; CHECK-BE-NEXT: bx lr entry: %c = icmp eq <8 x i16> %a, zeroinitializer @@ -144,14 +296,16 @@ define arm_aapcs_vfpcc void @store_v16i1(<16 x i1> *%dst, <16 x i8> %a) { ; CHECK-LE-LABEL: store_v16i1: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vcmp.i8 eq, q0, zr -; CHECK-LE-NEXT: vstr p0, [r0] +; CHECK-LE-NEXT: vmrs r1, p0 +; CHECK-LE-NEXT: strh r1, [r0] ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: store_v16i1: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.8 q1, q0 ; CHECK-BE-NEXT: vcmp.i8 eq, q1, zr -; CHECK-BE-NEXT: vstr p0, [r0] +; CHECK-BE-NEXT: vmrs r1, p0 +; CHECK-BE-NEXT: strh r1, [r0] ; CHECK-BE-NEXT: bx lr entry: %c = icmp eq <16 x i8> %a, zeroinitializer -- 2.11.4.GIT