From ce7ee22c5452663461ac9daae0e7c777011107dd Mon Sep 17 00:00:00 2001
From: David Green
Date: Thu, 8 Aug 2019 15:15:19 +0000
Subject: [PATCH] [ARM] MVE big endian loads/stores

This adds some missing patterns for big endian loads/stores, allowing
unaligned loads/stores to also be selected with an extra VREV, which
produces better code than aligning through the stack. It also moves
VLDR_P0 so that it is no longer LE-only, and adjusts some of the tests
to show all of that working.

Differential Revision: https://reviews.llvm.org/D65583

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@368304 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMISelLowering.cpp      |  48 ++----
 lib/Target/ARM/ARMInstrMVE.td           |  42 ++++-
 test/CodeGen/Thumb2/mve-be.ll           |  47 +-----
 test/CodeGen/Thumb2/mve-loadstore.ll    | 278 +++++++++++++++++++++++---
 test/CodeGen/Thumb2/mve-pred-spill.ll   | 246 ++++++++++++++++++----------
 test/CodeGen/Thumb2/mve-widen-narrow.ll |   3 +-
 6 files changed, 419 insertions(+), 245 deletions(-)
 rewrite test/CodeGen/Thumb2/mve-pred-spill.ll (72%)

diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index d94e91019b3..f9315607754 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -14075,45 +14075,21 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
     return true;
   }
 
-  if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 &&
-      Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 &&
-      Ty != MVT::v2f64)
-    return false;
-
-  if (Subtarget->isLittle()) {
-    // In little-endian MVE, the store instructions VSTRB.U8,
-    // VSTRH.U16 and VSTRW.U32 all store the vector register in
-    // exactly the same format, and differ only in the range of
-    // their immediate offset field and the required alignment.
-    //
-    // In particular, VSTRB.U8 can store a vector at byte alignment.
-    // So at this stage we can simply say that loads/stores of all
-    // 128-bit wide vector types are permitted at any alignment,
-    // because we know at least _one_ instruction can manage that.
-    //
-    // Later on we might find that some of those loads are better
-    // generated as VLDRW.U32 if alignment permits, to take
-    // advantage of the larger immediate range. But for the moment,
-    // all that matters is that if we don't lower the load then
-    // _some_ instruction can handle it.
+  // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
+  // VSTRW.U32 all store the vector register in exactly the same format, and
+  // differ only in the range of their immediate offset field and the required
+  // alignment. So there is always a store that can be used, regardless of
+  // actual type.
+  //
+  // For big endian, that is not the case. But we can still emit a (VSTRB.U8;
+  // VREV64.8) pair and get the same effect. This will likely be better than
+  // aligning the vector through the stack.
+  if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
+      Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
+      Ty == MVT::v2f64) {
     if (Fast)
       *Fast = true;
     return true;
-  } else {
-    // In big-endian MVE, those instructions aren't so similar
-    // after all, because they reorder the bytes of the vector
-    // differently. So this time we can only store a particular
-    // kind of vector if its alignment is at least the element
-    // type. And we can't store vectors of i64 or f64 at all
-    // without having to do some postprocessing, because there's
-    // no VSTRD.U64.
-    if (Ty == MVT::v16i8 ||
-        ((Ty == MVT::v8i16 || Ty == MVT::v8f16) && Alignment >= 2) ||
-        ((Ty == MVT::v4i32 || Ty == MVT::v4f32) && Alignment >= 4)) {
-      if (Fast)
-        *Fast = true;
-      return true;
-    }
   }
 
   return false;
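As a concrete illustration of the "same effect" claim in the comment above, consider an unaligned big-endian v4i32 load. The sketch below is illustrative only and is not part of the patch; the function is a stripped-down variant of load_4xi32_a1 from the updated mve-loadstore.ll further down.

; Memory holds bytes B0 B1 ... B15. In big endian, element 0 of a v4i32
; is the word B0B1B2B3, with B0 the most significant byte.
;
;   vldrb.u8 q0, [r0]    @ memory byte Bi lands in byte lane i, so 32-bit
;                        @ lane 0 reads back as B3B2B1B0 - wrong for BE
;   vrev32.8 q0, q0      @ reverse the bytes within each 32-bit lane, so
;                        @ lane 0 becomes B0B1B2B3 - the value a native
;                        @ big-endian load would have produced
define arm_aapcs_vfpcc <4 x i32> @load_v4i32_align1(<4 x i32>* %vp) {
entry:
  %0 = load <4 x i32>, <4 x i32>* %vp, align 1
  ret <4 x i32> %0
}

The store patterns added below are the mirror image (the VREV is emitted before the VSTRB.U8 rather than after the VLDRB.U8), and the v2i64/v2f64 cases use VREV64.8 because there is no 64-bit element load or store to pair with.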
diff --git a/lib/Target/ARM/ARMInstrMVE.td b/lib/Target/ARM/ARMInstrMVE.td
index 373b92afbd1..a1347ec5b02 100644
--- a/lib/Target/ARM/ARMInstrMVE.td
+++ b/lib/Target/ARM/ARMInstrMVE.td
@@ -4820,13 +4820,6 @@ let Predicates = [HasMVEInt, IsLE] in {
   defm : MVE_unpred_vector_load;
   defm : MVE_unpred_vector_load;
   defm : MVE_unpred_vector_load;
-
-  def : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)),
-            (v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
-  def : Pat<(v8i1 (load t2addrmode_imm7<2>:$addr)),
-            (v8i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
-  def : Pat<(v4i1 (load t2addrmode_imm7<2>:$addr)),
-            (v4i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
 }
 
 let Predicates = [HasMVEInt, IsBE] in {
@@ -4841,6 +4834,41 @@ let Predicates = [HasMVEInt, IsBE] in {
   def : MVE_unpred_vector_load_typed;
   def : MVE_unpred_vector_load_typed;
   def : MVE_unpred_vector_load_typed;
+
+  // Other unaligned loads/stores need to go through a VREV
+  def : Pat<(v2f64 (load t2addrmode_imm7<0>:$addr)),
+            (v2f64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+  def : Pat<(v2i64 (load t2addrmode_imm7<0>:$addr)),
+            (v2i64 (MVE_VREV64_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+  def : Pat<(v4i32 (load t2addrmode_imm7<0>:$addr)),
+            (v4i32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+  def : Pat<(v4f32 (load t2addrmode_imm7<0>:$addr)),
+            (v4f32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+  def : Pat<(v8i16 (load t2addrmode_imm7<0>:$addr)),
+            (v8i16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+  def : Pat<(v8f16 (load t2addrmode_imm7<0>:$addr)),
+            (v8f16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr)))>;
+  def : Pat<(store (v2f64 MQPR:$val), t2addrmode_imm7<0>:$addr),
+            (MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+  def : Pat<(store (v2i64 MQPR:$val), t2addrmode_imm7<0>:$addr),
+            (MVE_VSTRBU8 (MVE_VREV64_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+  def : Pat<(store (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr),
+            (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+  def : Pat<(store (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr),
+            (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+  def : Pat<(store (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr),
+            (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+  def : Pat<(store (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr),
+            (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr)>;
+}
+
+let Predicates = [HasMVEInt] in {
+  def : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)),
+            (v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
+  def : Pat<(v8i1 (load t2addrmode_imm7<2>:$addr)),
+            (v8i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
+  def : Pat<(v4i1 (load t2addrmode_imm7<2>:$addr)),
+            (v4i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>;
 }
diff --git a/test/CodeGen/Thumb2/mve-be.ll b/test/CodeGen/Thumb2/mve-be.ll
index f1de6e54671..7f355396a4c 100644
--- a/test/CodeGen/Thumb2/mve-be.ll
+++ b/test/CodeGen/Thumb2/mve-be.ll
@@ -29,47 +29,14 @@ define void @load_load_add_store_align1(<4 x i32> *%src1, <4 x i32> *%src2) {
 ;
 ; CHECK-BE-LABEL: load_load_add_store_align1:
 ; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .save {r4, r6, r7, lr}
-; CHECK-BE-NEXT: push {r4, r6, r7, lr}
-; CHECK-BE-NEXT: .setfp r7, sp, #8
-; CHECK-BE-NEXT: add r7, sp, #8
-; CHECK-BE-NEXT: .pad #48
-; CHECK-BE-NEXT: sub sp, #48
-; CHECK-BE-NEXT: mov r4, sp
-; CHECK-BE-NEXT: bfc r4, #0, #4
-; CHECK-BE-NEXT: mov sp, r4
-; CHECK-BE-NEXT: ldr.w r12, [r1]
-; CHECK-BE-NEXT: ldr r3, [r1, #4]
-; CHECK-BE-NEXT: ldr r2, [r1, #8]
-; CHECK-BE-NEXT: ldr r1, [r1, #12]
-; CHECK-BE-NEXT: strd r2, r1, [sp, #24]
-; CHECK-BE-NEXT: mov r1, r0
-; CHECK-BE-NEXT: strd r12, r3, [sp, #16]
-; CHECK-BE-NEXT: ldr r2, [r1, #4]!
-; CHECK-BE-NEXT: str r2, [sp, #4]
-; CHECK-BE-NEXT: ldr r2, [r0]
-; CHECK-BE-NEXT: str r2, [sp]
-; CHECK-BE-NEXT: mov r2, r1
-; CHECK-BE-NEXT: ldr r3, [r2, #4]!
-; CHECK-BE-NEXT: str r3, [sp, #8]
-; CHECK-BE-NEXT: ldr r3, [r2, #4]
-; CHECK-BE-NEXT: str r3, [sp, #12]
-; CHECK-BE-NEXT: add r3, sp, #16
-; CHECK-BE-NEXT: vldrw.u32 q0, [r3]
-; CHECK-BE-NEXT: mov r3, sp
-; CHECK-BE-NEXT: vldrw.u32 q1, [r3]
-; CHECK-BE-NEXT: add r3, sp, #32
+; CHECK-BE-NEXT: vldrb.u8 q0, [r1]
+; CHECK-BE-NEXT: vldrb.u8 q1, [r0]
+; CHECK-BE-NEXT: vrev32.8 q0, q0
+; CHECK-BE-NEXT: vrev32.8 q1, q1
 ; CHECK-BE-NEXT: vadd.i32 q0, q1, q0
-; CHECK-BE-NEXT: vstrw.32 q0, [r3]
-; CHECK-BE-NEXT: ldrd r3, r4, [sp, #40]
-; CHECK-BE-NEXT: ldrd r12, lr, [sp, #32]
-; CHECK-BE-NEXT: str r4, [r2, #4]
-; CHECK-BE-NEXT: sub.w r4, r7, #8
-; CHECK-BE-NEXT: str r3, [r2]
-; CHECK-BE-NEXT: str.w lr, [r1]
-; CHECK-BE-NEXT: str.w r12, [r0]
-; CHECK-BE-NEXT: mov sp, r4
-; CHECK-BE-NEXT: pop {r4, r6, r7, pc}
+; CHECK-BE-NEXT: vrev32.8 q0, q0
+; CHECK-BE-NEXT: vstrb.8 q0, [r0]
+; CHECK-BE-NEXT: bx lr
 entry:
   %l1 = load <4 x i32>, <4 x i32>* %src1, align 1
   %l2 = load <4 x i32>, <4 x i32>* %src2, align 1
diff --git a/test/CodeGen/Thumb2/mve-loadstore.ll b/test/CodeGen/Thumb2/mve-loadstore.ll
index f02ce15b55c..850da7aac0e 100644
--- a/test/CodeGen/Thumb2/mve-loadstore.ll
+++ b/test/CodeGen/Thumb2/mve-loadstore.ll
@@ -1,72 +1,138 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
+; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
 
 define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4(<4 x i32>* %vp) {
-; CHECK-LABEL: load_4xi32_a4:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: bx lr
+; CHECK-LE-LABEL: load_4xi32_a4:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vldrw.u32 q0, [r0]
+; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: load_4xi32_a4:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vldrw.u32 q0, [r0]
+; CHECK-BE-NEXT: vshr.u32 q1, q0, #1
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %vp, align 4
-  ret <4 x i32> %0
+  %1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %1
 }
 
 define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a2(<4 x i32>* %vp) {
-; CHECK-LABEL: load_4xi32_a2:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r0]
-; CHECK-NEXT: bx lr
+; CHECK-LE-LABEL: load_4xi32_a2:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vldrh.u16 q0, [r0]
+; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: load_4xi32_a2:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vldrb.u8 q0, [r0]
+; CHECK-BE-NEXT: vrev32.8 q0, q0
+; CHECK-BE-NEXT: vshr.u32 q1, q0, #1
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %vp, align 2
-  ret <4 x i32> %0
+  %1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %1
 }
 
 define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a1(<4 x i32>* %vp) {
-; CHECK-LABEL: load_4xi32_a1:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r0]
-; CHECK-NEXT: bx lr
+; CHECK-LE-LABEL: load_4xi32_a1:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vldrb.u8 q0, [r0]
+; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: load_4xi32_a1:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vldrb.u8 q0, [r0]
+; CHECK-BE-NEXT: vrev32.8 q0, q0
+; CHECK-BE-NEXT: vshr.u32 q1, q0, #1
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
 entry:
   %0 = load <4 x i32>, <4 x i32>* %vp, align 1
-  ret <4 x i32> %0
+  %1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %1
 }
 
 define arm_aapcs_vfpcc void @store_4xi32_a4(<4 x i32>* %vp, <4 x i32> %val) {
-; CHECK-LABEL: store_4xi32_a4:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vstrw.32 q0, [r0]
-; CHECK-NEXT: bx lr
+; CHECK-LE-LABEL: store_4xi32_a4:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
+; CHECK-LE-NEXT: vstrw.32 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_4xi32_a4:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vshr.u32 q0, q1, #1
+; CHECK-BE-NEXT: vstrw.32 q0, [r0]
+; CHECK-BE-NEXT: bx lr
 entry:
-  store <4 x i32> %val, <4 x i32>* %vp, align 4
+  %0 = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
+  store <4 x i32> %0, <4 x i32>* %vp, align 4
   ret void
 }
 
 define arm_aapcs_vfpcc void @store_4xi32_a2(<4 x i32>* %vp, <4 x i32> %val) {
-; CHECK-LABEL: store_4xi32_a2:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vstrh.16 q0, [r0]
-; CHECK-NEXT: bx lr
+; CHECK-LE-LABEL: store_4xi32_a2:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
+; CHECK-LE-NEXT: vstrh.16 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_4xi32_a2:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vshr.u32 q0, q1, #1
+; CHECK-BE-NEXT: vrev32.8 q0, q0
+; CHECK-BE-NEXT: vstrb.8 q0, [r0]
+; CHECK-BE-NEXT: bx lr
 entry:
-  store <4 x i32> %val, <4 x i32>* %vp, align 2
+  %0 = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
+  store <4 x i32> %0, <4 x i32>* %vp, align 2
   ret void
 }
 
 define arm_aapcs_vfpcc void @store_4xi32_a1(<4 x i32>* %vp, <4 x i32> %val) {
-; CHECK-LABEL: store_4xi32_a1:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vstrb.8 q0, [r0]
-; CHECK-NEXT: bx lr
+; CHECK-LE-LABEL: store_4xi32_a1:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
+; CHECK-LE-NEXT: vstrb.8 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: store_4xi32_a1:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vshr.u32 q0, q1, #1
+; CHECK-BE-NEXT: vrev32.8 q0, q0
+; CHECK-BE-NEXT: vstrb.8 q0, [r0]
+; CHECK-BE-NEXT: bx lr
 entry:
-  store <4 x i32> %val, <4 x i32>* %vp, align 1
+  %0 = lshr <4 x i32> %val, <i32 1, i32 1, i32 1, i32 1>
+  store <4 x i32> %0, <4 x i32>* %vp, align 1
   ret void
 }
 
 define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4_offset_pos(i32* %ip) {
-; CHECK-LABEL: load_4xi32_a4_offset_pos:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: add.w r0, r0, #508
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: bx lr
+; CHECK-LE-LABEL: load_4xi32_a4_offset_pos:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: add.w r0, r0, #508
+; CHECK-LE-NEXT: vldrw.u32 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: load_4xi32_a4_offset_pos:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: add.w r0, r0, #508
+; CHECK-BE-NEXT: vldrb.u8 q1, [r0]
+; CHECK-BE-NEXT: vrev64.8 q0, q1
+;
CHECK-BE-NEXT: bx lr entry: %ipoffset = getelementptr inbounds i32, i32* %ip, i32 127 %vp = bitcast i32* %ipoffset to <4 x i32>* @@ -75,11 +141,18 @@ entry: } define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4_offset_neg(i32* %ip) { -; CHECK-LABEL: load_4xi32_a4_offset_neg: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: sub.w r0, r0, #508 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: bx lr +; CHECK-LE-LABEL: load_4xi32_a4_offset_neg: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: sub.w r0, r0, #508 +; CHECK-LE-NEXT: vldrw.u32 q0, [r0] +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: load_4xi32_a4_offset_neg: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: sub.w r0, r0, #508 +; CHECK-BE-NEXT: vldrb.u8 q1, [r0] +; CHECK-BE-NEXT: vrev64.8 q0, q1 +; CHECK-BE-NEXT: bx lr entry: %ipoffset = getelementptr inbounds i32, i32* %ip, i32 -127 %vp = bitcast i32* %ipoffset to <4 x i32>* @@ -88,19 +161,34 @@ entry: } define arm_aapcs_vfpcc <4 x i32> @loadstore_4xi32_stack_off16() { -; CHECK-LABEL: loadstore_4xi32_stack_off16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #40 -; CHECK-NEXT: sub sp, #40 -; CHECK-NEXT: vmov.i32 q0, #0x1 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: movs r0, #3 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] -; CHECK-NEXT: str r0, [sp, #16] -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] -; CHECK-NEXT: add sp, #40 -; CHECK-NEXT: bx lr +; CHECK-LE-LABEL: loadstore_4xi32_stack_off16: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #40 +; CHECK-LE-NEXT: sub sp, #40 +; CHECK-LE-NEXT: vmov.i32 q0, #0x1 +; CHECK-LE-NEXT: mov r0, sp +; CHECK-LE-NEXT: vstrw.32 q0, [r0] +; CHECK-LE-NEXT: movs r0, #3 +; CHECK-LE-NEXT: vstrw.32 q0, [sp, #16] +; CHECK-LE-NEXT: str r0, [sp, #16] +; CHECK-LE-NEXT: vldrw.u32 q0, [sp, #16] +; CHECK-LE-NEXT: add sp, #40 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: loadstore_4xi32_stack_off16: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #40 +; CHECK-BE-NEXT: sub sp, #40 +; CHECK-BE-NEXT: vmov.i32 q0, #0x1 +; CHECK-BE-NEXT: mov r0, sp +; CHECK-BE-NEXT: vstrw.32 q0, [r0] +; CHECK-BE-NEXT: movs r0, #3 +; CHECK-BE-NEXT: vstrw.32 q0, [sp, #16] +; CHECK-BE-NEXT: str r0, [sp, #16] +; CHECK-BE-NEXT: vldrb.u8 q1, [sp, #16] +; CHECK-BE-NEXT: vrev64.8 q0, q1 +; CHECK-BE-NEXT: add sp, #40 +; CHECK-BE-NEXT: bx lr entry: %c = alloca [1 x [5 x [2 x i32]]], align 4 %0 = bitcast [1 x [5 x [2 x i32]]]* %c to i8* @@ -116,19 +204,34 @@ entry: } define arm_aapcs_vfpcc <8 x i16> @loadstore_8xi16_stack_off16() { -; CHECK-LABEL: loadstore_8xi16_stack_off16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #40 -; CHECK-NEXT: sub sp, #40 -; CHECK-NEXT: vmov.i16 q0, #0x1 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vstrh.16 q0, [r0] -; CHECK-NEXT: movs r0, #3 -; CHECK-NEXT: vstrh.16 q0, [sp, #16] -; CHECK-NEXT: strh.w r0, [sp, #16] -; CHECK-NEXT: vldrh.u16 q0, [sp, #16] -; CHECK-NEXT: add sp, #40 -; CHECK-NEXT: bx lr +; CHECK-LE-LABEL: loadstore_8xi16_stack_off16: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #40 +; CHECK-LE-NEXT: sub sp, #40 +; CHECK-LE-NEXT: vmov.i16 q0, #0x1 +; CHECK-LE-NEXT: mov r0, sp +; CHECK-LE-NEXT: vstrh.16 q0, [r0] +; CHECK-LE-NEXT: movs r0, #3 +; CHECK-LE-NEXT: vstrh.16 q0, [sp, #16] +; CHECK-LE-NEXT: strh.w r0, [sp, #16] +; CHECK-LE-NEXT: vldrh.u16 q0, [sp, #16] +; CHECK-LE-NEXT: add sp, #40 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: loadstore_8xi16_stack_off16: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #40 +; CHECK-BE-NEXT: sub sp, #40 +; CHECK-BE-NEXT: vmov.i16 q0, #0x1 +; CHECK-BE-NEXT: mov r0, sp +; 
CHECK-BE-NEXT: vstrh.16 q0, [r0] +; CHECK-BE-NEXT: movs r0, #3 +; CHECK-BE-NEXT: vstrh.16 q0, [sp, #16] +; CHECK-BE-NEXT: strh.w r0, [sp, #16] +; CHECK-BE-NEXT: vldrb.u8 q1, [sp, #16] +; CHECK-BE-NEXT: vrev64.8 q0, q1 +; CHECK-BE-NEXT: add sp, #40 +; CHECK-BE-NEXT: bx lr entry: %c = alloca [1 x [10 x [2 x i16]]], align 2 %0 = bitcast [1 x [10 x [2 x i16]]]* %c to i8* @@ -144,19 +247,34 @@ entry: } define arm_aapcs_vfpcc <16 x i8> @loadstore_16xi8_stack_off16() { -; CHECK-LABEL: loadstore_16xi8_stack_off16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #40 -; CHECK-NEXT: sub sp, #40 -; CHECK-NEXT: vmov.i8 q0, #0x1 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vstrb.8 q0, [r0] -; CHECK-NEXT: movs r0, #3 -; CHECK-NEXT: vstrb.8 q0, [sp, #16] -; CHECK-NEXT: strb.w r0, [sp, #16] -; CHECK-NEXT: vldrb.u8 q0, [sp, #16] -; CHECK-NEXT: add sp, #40 -; CHECK-NEXT: bx lr +; CHECK-LE-LABEL: loadstore_16xi8_stack_off16: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .pad #40 +; CHECK-LE-NEXT: sub sp, #40 +; CHECK-LE-NEXT: vmov.i8 q0, #0x1 +; CHECK-LE-NEXT: mov r0, sp +; CHECK-LE-NEXT: vstrb.8 q0, [r0] +; CHECK-LE-NEXT: movs r0, #3 +; CHECK-LE-NEXT: vstrb.8 q0, [sp, #16] +; CHECK-LE-NEXT: strb.w r0, [sp, #16] +; CHECK-LE-NEXT: vldrb.u8 q0, [sp, #16] +; CHECK-LE-NEXT: add sp, #40 +; CHECK-LE-NEXT: bx lr +; +; CHECK-BE-LABEL: loadstore_16xi8_stack_off16: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .pad #40 +; CHECK-BE-NEXT: sub sp, #40 +; CHECK-BE-NEXT: vmov.i8 q0, #0x1 +; CHECK-BE-NEXT: mov r0, sp +; CHECK-BE-NEXT: vstrb.8 q0, [r0] +; CHECK-BE-NEXT: movs r0, #3 +; CHECK-BE-NEXT: vstrb.8 q0, [sp, #16] +; CHECK-BE-NEXT: strb.w r0, [sp, #16] +; CHECK-BE-NEXT: vldrb.u8 q1, [sp, #16] +; CHECK-BE-NEXT: vrev64.8 q0, q1 +; CHECK-BE-NEXT: add sp, #40 +; CHECK-BE-NEXT: bx lr entry: %c = alloca [1 x [20 x [2 x i8]]], align 1 %0 = bitcast [1 x [20 x [2 x i8]]]* %c to i8* diff --git a/test/CodeGen/Thumb2/mve-pred-spill.ll b/test/CodeGen/Thumb2/mve-pred-spill.ll dissimilarity index 72% index d9c7f3894aa..ccbe5033514 100644 --- a/test/CodeGen/Thumb2/mve-pred-spill.ll +++ b/test/CodeGen/Thumb2/mve-pred-spill.ll @@ -1,81 +1,165 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s - -declare arm_aapcs_vfpcc <4 x i32> @ext_i32() -declare arm_aapcs_vfpcc <8 x i16> @ext_i16() -declare arm_aapcs_vfpcc <16 x i8> @ext_i8() - -define arm_aapcs_vfpcc <4 x i32> @shuffle1_v4i32(<4 x i32> %src, <4 x i32> %a) { -; CHECK-LABEL: shuffle1_v4i32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: bl ext_i32 -; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vpsel q0, q4, q0 -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r7, pc} -entry: - %c = icmp eq <4 x i32> %src, zeroinitializer - %ext = call arm_aapcs_vfpcc <4 x i32> @ext_i32() - %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %ext - ret <4 x i32> %s -} - -define arm_aapcs_vfpcc <8 x i16> @shuffle1_v8i16(<8 x i16> %src, <8 x i16> %a) { -; CHECK-LABEL: shuffle1_v8i16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: .pad #8 -; 
CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vcmp.i16 eq, q0, zr -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: bl ext_i16 -; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vpsel q0, q4, q0 -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r7, pc} -entry: - %c = icmp eq <8 x i16> %src, zeroinitializer - %ext = call arm_aapcs_vfpcc <8 x i16> @ext_i16() - %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %ext - ret <8 x i16> %s -} - -define arm_aapcs_vfpcc <16 x i8> @shuffle1_v16i8(<16 x i8> %src, <16 x i8> %a) { -; CHECK-LABEL: shuffle1_v16i8: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vcmp.i8 eq, q0, zr -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: bl ext_i8 -; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: vpsel q0, q4, q0 -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r7, pc} -entry: - %c = icmp eq <16 x i8> %src, zeroinitializer - %ext = call arm_aapcs_vfpcc <16 x i8> @ext_i8() - %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %ext - ret <16 x i8> %s -} +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE +; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE + +declare arm_aapcs_vfpcc <4 x i32> @ext_i32(<4 x i32> %c) +declare arm_aapcs_vfpcc <8 x i16> @ext_i16(<8 x i16> %c) +declare arm_aapcs_vfpcc <16 x i8> @ext_i8(<16 x i8> %c) + +define arm_aapcs_vfpcc <4 x i32> @shuffle1_v4i32(<4 x i32> %src, <4 x i32> %a) { +; CHECK-LE-LABEL: shuffle1_v4i32: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r7, lr} +; CHECK-LE-NEXT: push {r7, lr} +; CHECK-LE-NEXT: .vsave {d8, d9} +; CHECK-LE-NEXT: vpush {d8, d9} +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vcmp.i32 eq, q0, zr +; CHECK-LE-NEXT: vmov.i32 q0, #0x0 +; CHECK-LE-NEXT: vpsel q0, q1, q0 +; CHECK-LE-NEXT: vmov q4, q1 +; CHECK-LE-NEXT: vstr p0, [sp, #4] @ 4-byte Spill +; CHECK-LE-NEXT: bl ext_i32 +; CHECK-LE-NEXT: vldr p0, [sp, #4] @ 4-byte Reload +; CHECK-LE-NEXT: vpsel q0, q4, q0 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: vpop {d8, d9} +; CHECK-LE-NEXT: pop {r7, pc} +; +; CHECK-BE-LABEL: shuffle1_v4i32: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r7, lr} +; CHECK-BE-NEXT: push {r7, lr} +; CHECK-BE-NEXT: .vsave {d8, d9} +; CHECK-BE-NEXT: vpush {d8, d9} +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vrev64.32 q4, q1 +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vcmp.i32 eq, q1, zr +; CHECK-BE-NEXT: vmov.i32 q0, #0x0 +; CHECK-BE-NEXT: vpsel q1, q4, q0 +; CHECK-BE-NEXT: vstr p0, [sp, #4] @ 4-byte Spill +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: bl ext_i32 +; CHECK-BE-NEXT: vldr p0, [sp, #4] @ 4-byte Reload +; CHECK-BE-NEXT: vrev64.32 q1, q0 +; CHECK-BE-NEXT: vpsel q1, q4, q1 +; CHECK-BE-NEXT: vrev64.32 q0, q1 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: vpop {d8, d9} +; CHECK-BE-NEXT: pop {r7, pc} +entry: + %c = icmp eq <4 x i32> %src, zeroinitializer + %s1 = select <4 x i1> %c, <4 x i32> %a, <4 x i32> zeroinitializer + %ext = call arm_aapcs_vfpcc <4 
x i32> @ext_i32(<4 x i32> %s1) + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %ext + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <8 x i16> @shuffle1_v8i16(<8 x i16> %src, <8 x i16> %a) { +; CHECK-LE-LABEL: shuffle1_v8i16: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r7, lr} +; CHECK-LE-NEXT: push {r7, lr} +; CHECK-LE-NEXT: .vsave {d8, d9} +; CHECK-LE-NEXT: vpush {d8, d9} +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vcmp.i16 eq, q0, zr +; CHECK-LE-NEXT: vmov.i32 q0, #0x0 +; CHECK-LE-NEXT: vpsel q0, q1, q0 +; CHECK-LE-NEXT: vmov q4, q1 +; CHECK-LE-NEXT: vstr p0, [sp, #4] @ 4-byte Spill +; CHECK-LE-NEXT: bl ext_i16 +; CHECK-LE-NEXT: vldr p0, [sp, #4] @ 4-byte Reload +; CHECK-LE-NEXT: vpsel q0, q4, q0 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: vpop {d8, d9} +; CHECK-LE-NEXT: pop {r7, pc} +; +; CHECK-BE-LABEL: shuffle1_v8i16: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r7, lr} +; CHECK-BE-NEXT: push {r7, lr} +; CHECK-BE-NEXT: .vsave {d8, d9} +; CHECK-BE-NEXT: vpush {d8, d9} +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vrev64.16 q4, q1 +; CHECK-BE-NEXT: vmov.i32 q1, #0x0 +; CHECK-BE-NEXT: vrev64.16 q2, q0 +; CHECK-BE-NEXT: vrev32.16 q1, q1 +; CHECK-BE-NEXT: vcmp.i16 eq, q2, zr +; CHECK-BE-NEXT: vpsel q1, q4, q1 +; CHECK-BE-NEXT: vstr p0, [sp, #4] @ 4-byte Spill +; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: bl ext_i16 +; CHECK-BE-NEXT: vldr p0, [sp, #4] @ 4-byte Reload +; CHECK-BE-NEXT: vrev64.16 q1, q0 +; CHECK-BE-NEXT: vpsel q1, q4, q1 +; CHECK-BE-NEXT: vrev64.16 q0, q1 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: vpop {d8, d9} +; CHECK-BE-NEXT: pop {r7, pc} +entry: + %c = icmp eq <8 x i16> %src, zeroinitializer + %s1 = select <8 x i1> %c, <8 x i16> %a, <8 x i16> zeroinitializer + %ext = call arm_aapcs_vfpcc <8 x i16> @ext_i16(<8 x i16> %s1) + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %ext + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <16 x i8> @shuffle1_v16i8(<16 x i8> %src, <16 x i8> %a) { +; CHECK-LE-LABEL: shuffle1_v16i8: +; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r7, lr} +; CHECK-LE-NEXT: push {r7, lr} +; CHECK-LE-NEXT: .vsave {d8, d9} +; CHECK-LE-NEXT: vpush {d8, d9} +; CHECK-LE-NEXT: .pad #8 +; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: vcmp.i8 eq, q0, zr +; CHECK-LE-NEXT: vmov.i32 q0, #0x0 +; CHECK-LE-NEXT: vpsel q0, q1, q0 +; CHECK-LE-NEXT: vmov q4, q1 +; CHECK-LE-NEXT: vstr p0, [sp, #4] @ 4-byte Spill +; CHECK-LE-NEXT: bl ext_i8 +; CHECK-LE-NEXT: vldr p0, [sp, #4] @ 4-byte Reload +; CHECK-LE-NEXT: vpsel q0, q4, q0 +; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: vpop {d8, d9} +; CHECK-LE-NEXT: pop {r7, pc} +; +; CHECK-BE-LABEL: shuffle1_v16i8: +; CHECK-BE: @ %bb.0: @ %entry +; CHECK-BE-NEXT: .save {r7, lr} +; CHECK-BE-NEXT: push {r7, lr} +; CHECK-BE-NEXT: .vsave {d8, d9} +; CHECK-BE-NEXT: vpush {d8, d9} +; CHECK-BE-NEXT: .pad #8 +; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: vrev64.8 q4, q1 +; CHECK-BE-NEXT: vmov.i32 q1, #0x0 +; CHECK-BE-NEXT: vrev64.8 q2, q0 +; CHECK-BE-NEXT: vrev32.8 q1, q1 +; CHECK-BE-NEXT: vcmp.i8 eq, q2, zr +; CHECK-BE-NEXT: vpsel q1, q4, q1 +; CHECK-BE-NEXT: vstr p0, [sp, #4] @ 4-byte Spill +; CHECK-BE-NEXT: vrev64.8 q0, q1 +; CHECK-BE-NEXT: bl ext_i8 +; CHECK-BE-NEXT: vldr p0, [sp, #4] @ 4-byte Reload +; CHECK-BE-NEXT: vrev64.8 q1, q0 +; CHECK-BE-NEXT: vpsel q1, q4, q1 +; CHECK-BE-NEXT: vrev64.8 q0, q1 +; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: vpop {d8, d9} +; CHECK-BE-NEXT: pop {r7, pc} +entry: + %c = icmp eq <16 x i8> %src, 
zeroinitializer + %s1 = select <16 x i1> %c, <16 x i8> %a, <16 x i8> zeroinitializer + %ext = call arm_aapcs_vfpcc <16 x i8> @ext_i8(<16 x i8> %s1) + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %ext + ret <16 x i8> %s +} diff --git a/test/CodeGen/Thumb2/mve-widen-narrow.ll b/test/CodeGen/Thumb2/mve-widen-narrow.ll index 46752fc2ab5..7b0a64c7ba6 100644 --- a/test/CodeGen/Thumb2/mve-widen-narrow.ll +++ b/test/CodeGen/Thumb2/mve-widen-narrow.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE +; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE define void @foo_int8_int32(<4 x i8>* %dest, <4 x i32>* readonly %src, i32 %n) { ; CHECK-LABEL: foo_int8_int32: -- 2.11.4.GIT
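A note on the v2i64/v2f64 cases: the removed ARMISelLowering.cpp comment observed that vectors of i64 or f64 could not be stored without postprocessing because there is no VSTRD.U64. The new IsBE patterns supply exactly that postprocessing, pairing VLDRB.U8/VSTRB.U8 with VREV64.8. A minimal reproducer, illustrative only and not one of the tests in this patch (the function name is made up; the llc invocation is the one used by the updated tests):

; llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o -
;
; Expected to select, roughly (per the new v2i64 pattern above):
;   vldrb.u8 q0, [r0]
;   vrev64.8 q0, q0
define arm_aapcs_vfpcc <2 x i64> @load_v2i64_align1(<2 x i64>* %vp) {
entry:
  %0 = load <2 x i64>, <2 x i64>* %vp, align 1
  ret <2 x i64> %0
}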