From d2f7b2b338d5e3865b6951241792ced82296826f Mon Sep 17 00:00:00 2001
From: David Green
Date: Sun, 4 Aug 2019 10:18:15 +0000
Subject: [PATCH] [ARM] MVE big endian bitcasts

This adds big endian MVE patterns for bitcasts. In llvm a bitcast is
defined as being the same as a store of the existing type followed by a
load as the new type, so under big endian each bitcast has to become a
VREV between the two types, working in the same way that NEON handles
big endian.

This also adds some example tests for big endian, showing where code is
and isn't different. The main difference, especially from a testing
perspective, is that vectors are passed as v2f64, and so are VREVed
into and out of call arguments, and the parameters are passed in a
v2f64 format. The same happens for inline assembly where the register
class is used, so the value is VREVed to a v16i8. Some of this is
probably not correct yet, but it is (mostly) self-consistent and seems
to be consistent with how llvm treats vectors. The rest we can
hopefully fix later. More details about big endian NEON can be found at
https://llvm.org/docs/BigEndianNEON.html.

Differential Revision: https://reviews.llvm.org/D65581

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@367780 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/ARMInstrMVE.td |  45 ++++++
 test/CodeGen/Thumb2/mve-be.ll | 330 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 375 insertions(+)
 create mode 100644 test/CodeGen/Thumb2/mve-be.ll

diff --git a/lib/Target/ARM/ARMInstrMVE.td b/lib/Target/ARM/ARMInstrMVE.td
index 3e8f726bde4..aeeed7505b7 100644
--- a/lib/Target/ARM/ARMInstrMVE.td
+++ b/lib/Target/ARM/ARMInstrMVE.td
@@ -4875,3 +4875,48 @@ let Predicates = [IsLE,HasMVEInt] in {
   def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 QPR:$src)>;
   def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
 }
+
+let Predicates = [IsBE,HasMVEInt] in {
+  def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 (MVE_VREV64_32 QPR:$src))>;
+  def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 (MVE_VREV64_32 QPR:$src))>;
+  def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 (MVE_VREV64_16 QPR:$src))>;
+  def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 (MVE_VREV64_16 QPR:$src))>;
+  def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 (MVE_VREV64_8 QPR:$src))>;
+
+  def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 (MVE_VREV64_32 QPR:$src))>;
+  def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 (MVE_VREV64_32 QPR:$src))>;
+  def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (v2i64 (MVE_VREV64_16 QPR:$src))>;
+  def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 (MVE_VREV64_16 QPR:$src))>;
+  def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 (MVE_VREV64_8 QPR:$src))>;
+
+  def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 (MVE_VREV64_32 QPR:$src))>;
+  def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 (MVE_VREV64_32 QPR:$src))>;
+  def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (v4f32 (MVE_VREV32_16 QPR:$src))>;
+  def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 (MVE_VREV32_16 QPR:$src))>;
+  def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 (MVE_VREV32_8 QPR:$src))>;
+
+  def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 (MVE_VREV64_32 QPR:$src))>;
+  def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 (MVE_VREV64_32 QPR:$src))>;
+  def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (v4i32 (MVE_VREV32_16 QPR:$src))>;
+  def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 (MVE_VREV32_16 QPR:$src))>;
+  def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 (MVE_VREV32_8 QPR:$src))>;
+
+  def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 (MVE_VREV64_16 QPR:$src))>;
+  def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (v8f16 (MVE_VREV64_16 QPR:$src))>;
+  def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (v8f16 (MVE_VREV32_16 QPR:$src))>;
+  def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (v8f16 (MVE_VREV32_16 QPR:$src))>;
+  def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 (MVE_VREV16_8 QPR:$src))>;
+
+  def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 (MVE_VREV64_16 QPR:$src))>;
+  def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 (MVE_VREV64_16 QPR:$src))>;
+  def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 (MVE_VREV32_16 QPR:$src))>;
+  def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 (MVE_VREV32_16 QPR:$src))>;
+  def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 (MVE_VREV16_8 QPR:$src))>;
+
+  def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 (MVE_VREV64_8 QPR:$src))>;
+  def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 (MVE_VREV64_8 QPR:$src))>;
+  def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 (MVE_VREV32_8 QPR:$src))>;
+  def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 (MVE_VREV32_8 QPR:$src))>;
+  def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 (MVE_VREV16_8 QPR:$src))>;
+  def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 (MVE_VREV16_8 QPR:$src))>;
+}
diff --git a/test/CodeGen/Thumb2/mve-be.ll b/test/CodeGen/Thumb2/mve-be.ll
new file mode 100644
index 00000000000..f1de6e54671
--- /dev/null
+++ b/test/CodeGen/Thumb2/mve-be.ll
@@ -0,0 +1,330 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
+; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
+
+define void @load_load_add_store(<4 x i32> *%src1, <4 x i32> *%src2) {
+; CHECK-LABEL: load_load_add_store:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vadd.i32 q0, q1, q0
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+  %l1 = load <4 x i32>, <4 x i32>* %src1, align 4
+  %l2 = load <4 x i32>, <4 x i32>* %src2, align 4
+  %a = add <4 x i32> %l1, %l2
+  store <4 x i32> %a, <4 x i32>* %src1, align 4
+  ret void
+}
+
+define void @load_load_add_store_align1(<4 x i32> *%src1, <4 x i32> *%src2) {
+; CHECK-LE-LABEL: load_load_add_store_align1:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vldrb.u8 q0, [r1]
+; CHECK-LE-NEXT: vldrb.u8 q1, [r0]
+; CHECK-LE-NEXT: vadd.i32 q0, q1, q0
+; CHECK-LE-NEXT: vstrb.8 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: load_load_add_store_align1:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .save {r4, r6, r7, lr}
+; CHECK-BE-NEXT: push {r4, r6, r7, lr}
+; CHECK-BE-NEXT: .setfp r7, sp, #8
+; CHECK-BE-NEXT: add r7, sp, #8
+; CHECK-BE-NEXT: .pad #48
+; CHECK-BE-NEXT: sub sp, #48
+; CHECK-BE-NEXT: mov r4, sp
+; CHECK-BE-NEXT: bfc r4, #0, #4
+; CHECK-BE-NEXT: mov sp, r4
+; CHECK-BE-NEXT: ldr.w r12, [r1]
+; CHECK-BE-NEXT: ldr r3, [r1, #4]
+; CHECK-BE-NEXT: ldr r2, [r1, #8]
+; CHECK-BE-NEXT: ldr r1, [r1, #12]
+; CHECK-BE-NEXT: strd r2, r1, [sp, #24]
+; CHECK-BE-NEXT: mov r1, r0
+; CHECK-BE-NEXT: strd r12, r3, [sp, #16]
+; CHECK-BE-NEXT: ldr r2, [r1, #4]!
+; CHECK-BE-NEXT: str r2, [sp, #4]
+; CHECK-BE-NEXT: ldr r2, [r0]
+; CHECK-BE-NEXT: str r2, [sp]
+; CHECK-BE-NEXT: mov r2, r1
+; CHECK-BE-NEXT: ldr r3, [r2, #4]!
+; CHECK-BE-NEXT: str r3, [sp, #8]
+; CHECK-BE-NEXT: ldr r3, [r2, #4]
+; CHECK-BE-NEXT: str r3, [sp, #12]
+; CHECK-BE-NEXT: add r3, sp, #16
+; CHECK-BE-NEXT: vldrw.u32 q0, [r3]
+; CHECK-BE-NEXT: mov r3, sp
+; CHECK-BE-NEXT: vldrw.u32 q1, [r3]
+; CHECK-BE-NEXT: add r3, sp, #32
+; CHECK-BE-NEXT: vadd.i32 q0, q1, q0
+; CHECK-BE-NEXT: vstrw.32 q0, [r3]
+; CHECK-BE-NEXT: ldrd r3, r4, [sp, #40]
+; CHECK-BE-NEXT: ldrd r12, lr, [sp, #32]
+; CHECK-BE-NEXT: str r4, [r2, #4]
+; CHECK-BE-NEXT: sub.w r4, r7, #8
+; CHECK-BE-NEXT: str r3, [r2]
+; CHECK-BE-NEXT: str.w lr, [r1]
+; CHECK-BE-NEXT: str.w r12, [r0]
+; CHECK-BE-NEXT: mov sp, r4
+; CHECK-BE-NEXT: pop {r4, r6, r7, pc}
+entry:
+  %l1 = load <4 x i32>, <4 x i32>* %src1, align 1
+  %l2 = load <4 x i32>, <4 x i32>* %src2, align 1
+  %a = add <4 x i32> %l1, %l2
+  store <4 x i32> %a, <4 x i32>* %src1, align 1
+  ret void
+}
+
+define arm_aapcs_vfpcc void @load_arg_add_store(<4 x i32> *%src1, <4 x i32> %src2) {
+; CHECK-LE-LABEL: load_arg_add_store:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vldrw.u32 q1, [r0]
+; CHECK-LE-NEXT: vadd.i32 q0, q1, q0
+; CHECK-LE-NEXT: vstrw.32 q0, [r0]
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: load_arg_add_store:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vldrw.u32 q0, [r0]
+; CHECK-BE-NEXT: vadd.i32 q0, q0, q1
+; CHECK-BE-NEXT: vstrw.32 q0, [r0]
+; CHECK-BE-NEXT: bx lr
+entry:
+  %l1 = load <4 x i32>, <4 x i32>* %src1, align 4
+  %a = add <4 x i32> %l1, %src2
+  store <4 x i32> %a, <4 x i32>* %src1, align 4
+  ret void
+}
+
+define <4 x i32> @add_soft(<4 x i32> %src1, <4 x i32> %src2) {
+; CHECK-LE-LABEL: add_soft:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vmov d1, r2, r3
+; CHECK-LE-NEXT: vmov d0, r0, r1
+; CHECK-LE-NEXT: mov r0, sp
+; CHECK-LE-NEXT: vldrw.u32 q1, [r0]
+; CHECK-LE-NEXT: vadd.i32 q0, q0, q1
+; CHECK-LE-NEXT: vmov r0, r1, d0
+; CHECK-LE-NEXT: vmov r2, r3, d1
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: add_soft:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vmov d1, r3, r2
+; CHECK-BE-NEXT: vmov d0, r1, r0
+; CHECK-BE-NEXT: mov r0, sp
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vldrw.u32 q0, [r0]
+; CHECK-BE-NEXT: vadd.i32 q0, q1, q0
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vmov r1, r0, d2
+; CHECK-BE-NEXT: vmov r3, r2, d3
+; CHECK-BE-NEXT: bx lr
+entry:
+  %0 = add <4 x i32> %src1, %src2
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @add_hard(<4 x i32> %src1, <4 x i32> %src2) {
+; CHECK-LE-LABEL: add_hard:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vadd.i32 q0, q0, q1
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: add_hard:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q2, q1
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vadd.i32 q1, q1, q2
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+  %0 = add <4 x i32> %src1, %src2
+  ret <4 x i32> %0
+}
+
+define <4 x i32> @call_soft(<4 x i32> %src1, <4 x i32> %src2) {
+; CHECK-LE-LABEL: call_soft:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: .save {r7, lr}
+; CHECK-LE-NEXT: push {r7, lr}
+; CHECK-LE-NEXT: .pad #16
+; CHECK-LE-NEXT: sub sp, #16
+; CHECK-LE-NEXT: add.w r12, sp, #24
+; CHECK-LE-NEXT: vldrw.u32 q0, [r12]
+; CHECK-LE-NEXT: vstrw.32 q0, [sp]
+; CHECK-LE-NEXT: vmov d1, r2, r3
+; CHECK-LE-NEXT: vmov d0, r0, r1
+; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
+; CHECK-LE-NEXT: vmov r0, r1, d0
+; CHECK-LE-NEXT: vmov r2, r3, d1
+; CHECK-LE-NEXT: bl add_soft
+; CHECK-LE-NEXT: vmov d1, r2, r3
+; CHECK-LE-NEXT: vmov d0, r0, r1
+; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
+; CHECK-LE-NEXT: vmov r0, r1, d0
+; CHECK-LE-NEXT: vmov r2, r3, d1
+; CHECK-LE-NEXT: add sp, #16
+; CHECK-LE-NEXT: pop {r7, pc}
+;
+; CHECK-BE-LABEL: call_soft:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .save {r7, lr}
+; CHECK-BE-NEXT: push {r7, lr}
+; CHECK-BE-NEXT: .pad #16
+; CHECK-BE-NEXT: sub sp, #16
+; CHECK-BE-NEXT: add.w r12, sp, #24
+; CHECK-BE-NEXT: vldrw.u32 q0, [r12]
+; CHECK-BE-NEXT: vstrw.32 q0, [sp]
+; CHECK-BE-NEXT: vmov d1, r3, r2
+; CHECK-BE-NEXT: vmov d0, r1, r0
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vshr.u32 q0, q1, #1
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vmov r1, r0, d2
+; CHECK-BE-NEXT: vmov r3, r2, d3
+; CHECK-BE-NEXT: bl add_soft
+; CHECK-BE-NEXT: vmov d1, r3, r2
+; CHECK-BE-NEXT: vmov d0, r1, r0
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vshr.u32 q0, q1, #1
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vmov r1, r0, d2
+; CHECK-BE-NEXT: vmov r3, r2, d3
+; CHECK-BE-NEXT: add sp, #16
+; CHECK-BE-NEXT: pop {r7, pc}
+entry:
+  %0 = lshr <4 x i32> %src1, <i32 1, i32 1, i32 1, i32 1>
+  %1 = call <4 x i32> @add_soft(<4 x i32> %0, <4 x i32> %src2)
+  %2 = lshr <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @call_hard(<4 x i32> %src1, <4 x i32> %src2) {
+; CHECK-LE-LABEL: call_hard:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: .save {r7, lr}
+; CHECK-LE-NEXT: push {r7, lr}
+; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
+; CHECK-LE-NEXT: bl add_hard
+; CHECK-LE-NEXT: vshr.u32 q0, q0, #1
+; CHECK-LE-NEXT: pop {r7, pc}
+;
+; CHECK-BE-LABEL: call_hard:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: .save {r7, lr}
+; CHECK-BE-NEXT: push {r7, lr}
+; CHECK-BE-NEXT: vrev64.32 q2, q0
+; CHECK-BE-NEXT: vshr.u32 q2, q2, #1
+; CHECK-BE-NEXT: vrev64.32 q0, q2
+; CHECK-BE-NEXT: bl add_hard
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vshr.u32 q1, q1, #1
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: pop {r7, pc}
+entry:
+  %0 = lshr <4 x i32> %src1, <i32 1, i32 1, i32 1, i32 1>
+  %1 = call arm_aapcs_vfpcc <4 x i32> @add_hard(<4 x i32> %0, <4 x i32> %src2)
+  %2 = lshr <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @and_v4i32(<4 x i32> %src) {
+; CHECK-LE-LABEL: and_v4i32:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vmov.i32 q1, #0x1
+; CHECK-LE-NEXT: vand q0, q0, q1
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: and_v4i32:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.32 q1, q0
+; CHECK-BE-NEXT: vmov.i32 q0, #0x1
+; CHECK-BE-NEXT: vand q1, q1, q0
+; CHECK-BE-NEXT: vrev64.32 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+  %s1 = and <4 x i32> %src, <i32 1, i32 1, i32 1, i32 1>
+  %r = bitcast <4 x i32> %s1 to <16 x i8>
+  ret <16 x i8> %r
+}
+
+; Should be the same as and_v4i32 for LE
+define arm_aapcs_vfpcc <16 x i8> @and_v16i8_le(<4 x i32> %src) {
+; CHECK-LE-LABEL: and_v16i8_le:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vmov.i32 q1, #0x1
+; CHECK-LE-NEXT: vand q0, q0, q1
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: and_v16i8_le:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.8 q1, q0
+; CHECK-BE-NEXT: vmov.i32 q0, #0x1
+; CHECK-BE-NEXT: vrev32.8 q0, q0
+; CHECK-BE-NEXT: vand q1, q1, q0
+; CHECK-BE-NEXT: vrev64.8 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+  %0 = bitcast <4 x i32> %src to <16 x i8>
+  %r = and <16 x i8> %0, <i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0>
+  ret <16 x i8> %r
+}
+
+; Should be the same (or at least equivalent) as and_v4i32 for BE
+define arm_aapcs_vfpcc <16 x i8> @and_v16i8_be(<4 x i32> %src) {
+; CHECK-LE-LABEL: and_v16i8_be:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vmov.i32 q1, #0x1000000
+; CHECK-LE-NEXT: vand q0, q0, q1
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: and_v16i8_be:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vrev64.8 q1, q0
+; CHECK-BE-NEXT: vmov.i32 q0, #0x1000000
+; CHECK-BE-NEXT: vrev32.8 q0, q0
+; CHECK-BE-NEXT: vand q1, q1, q0
+; CHECK-BE-NEXT: vrev64.8 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+  %0 = bitcast <4 x i32> %src to <16 x i8>
+  %r = and <16 x i8> %0, <i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1>
+  ret <16 x i8> %r
+}
+
+; FIXME: This looks wrong
+define arm_aapcs_vfpcc <4 x i32> @test(i32* %data) {
+; CHECK-LE-LABEL: test:
+; CHECK-LE: @ %bb.0: @ %entry
+; CHECK-LE-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-LE-NEXT: vmov.i32 q0, #0x1
+; CHECK-LE-NEXT: vadd.i32 q1, q1, q0
+; CHECK-LE-NEXT: @APP
+; CHECK-LE-NEXT: vmullb.s32 q0, q1, q1
+; CHECK-LE-NEXT: @NO_APP
+; CHECK-LE-NEXT: bx lr
+;
+; CHECK-BE-LABEL: test:
+; CHECK-BE: @ %bb.0: @ %entry
+; CHECK-BE-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-BE-NEXT: vmov.i32 q0, #0x1
+; CHECK-BE-NEXT: vadd.i32 q0, q1, q0
+; CHECK-BE-NEXT: vrev32.8 q0, q0
+; CHECK-BE-NEXT: @APP
+; CHECK-BE-NEXT: vmullb.s32 q1, q0, q0
+; CHECK-BE-NEXT: @NO_APP
+; CHECK-BE-NEXT: vrev64.8 q0, q1
+; CHECK-BE-NEXT: bx lr
+entry:
+  %add.ptr = getelementptr inbounds i32, i32* %data, i32 8
+  %0 = bitcast i32* %add.ptr to <4 x i32>*
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  %2 = add <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  %3 = tail call <4 x i32> asm sideeffect " VMULLB.s32 $0, $1, $1", "=&w,w"(<4 x i32> %2) #2
+  ret <4 x i32> %3
+}
--
2.11.4.GIT
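
As a quick illustration of what the new IsBE patterns buy (a sketch, not part of the patch; the function name below is made up): under the test's big endian RUN line (llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve) a bitcast between vector types with different lane sizes can no longer be a free register copy, so instruction selection should pick one of the VREV patterns above, e.g. MVE_VREV32_8 (vrev32.8) for a v4i32 to v16i8 cast. The final assembly will also contain the v2f64 argument/return reversals described in the commit message, and neighbouring VREVs may be folded together by the combiner, as appears to happen in the and_v4i32 test.

; Hypothetical standalone example, not part of mve-be.ll. Built big endian,
; the bitcast is expected to select the (v16i8 (bitconvert (v4i32 ...)))
; IsBE pattern; built little endian it stays a plain register copy.
define arm_aapcs_vfpcc <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %v) {
entry:
  %b = bitcast <4 x i32> %v to <16 x i8>
  ret <16 x i8> %b
}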