1 ; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LE
2 ; RUN: llc -mtriple=armeb-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BE
; vcombine8: concatenate two <8 x i8> vectors, loaded from %A and %B, into a
; single <16 x i8>. The shufflevector mask lists elements 0-15 in order, so
; the result is %tmp1 followed by %tmp2.
; Expected code: one vldr per operand (from [r0] and [r1], accepted in either
; order), then a vmov of each loaded D register into a core-register pair.
; The little-endian run expects the pairs r0,r1 and r2,r3; the big-endian run
; expects the registers of each pair swapped and names d16/d17 literally.
4 define <16 x i8> @vcombine8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
5 ; CHECK-LABEL: vcombine8
6 ; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
7 ; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
9 ; CHECK-LE-DAG: vmov r0, r1, [[LD0]]
10 ; CHECK-LE-DAG: vmov r2, r3, [[LD1]]
12 ; CHECK-BE-DAG: vmov r1, r0, d16
13 ; CHECK-BE-DAG: vmov r3, r2, d17
14 %tmp1 = load <8 x i8>, <8 x i8>* %A
15 %tmp2 = load <8 x i8>, <8 x i8>* %B
; Identity mask over both inputs: indices 0-7 pick %tmp1, 8-15 pick %tmp2.
16 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; vcombine16: concatenate two <4 x i16> vectors, loaded from %A and %B, into a
; single <8 x i16>. The shufflevector mask lists elements 0-7 in order.
; Expected code: one vldr per operand (from [r0] and [r1], accepted in either
; order), then a vmov of each loaded D register into a core-register pair.
; The little-endian run expects the pairs r0,r1 and r2,r3; the big-endian run
; expects the registers of each pair swapped and names d16/d17 literally.
20 define <8 x i16> @vcombine16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
21 ; CHECK-LABEL: vcombine16
22 ; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
23 ; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
25 ; CHECK-LE-DAG: vmov r0, r1, [[LD0]]
26 ; CHECK-LE-DAG: vmov r2, r3, [[LD1]]
28 ; CHECK-BE-DAG: vmov r1, r0, d16
29 ; CHECK-BE-DAG: vmov r3, r2, d17
30 %tmp1 = load <4 x i16>, <4 x i16>* %A
31 %tmp2 = load <4 x i16>, <4 x i16>* %B
; Identity mask over both inputs: indices 0-3 pick %tmp1, 4-7 pick %tmp2.
32 %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; vcombine32: concatenate two <2 x i32> vectors, loaded from %A and %B, into a
; single <4 x i32>. The shufflevector mask lists elements 0-3 in order.
; Expected code: one vldr per operand (from [r0] and [r1], accepted in either
; order), then a vmov of each D register into a core-register pair. Unlike the
; loads, the moves are matched in the order written: the little-endian run
; expects the r2,r3 move before the r0,r1 move, while the big-endian run
; expects the d16 move (into r1,r0) before the d17 move (into r3,r2).
36 define <4 x i32> @vcombine32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
37 ; CHECK-LABEL: vcombine32
39 ; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
40 ; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
42 ; CHECK-LE: vmov r2, r3, [[LD1]]
43 ; CHECK-LE: vmov r0, r1, [[LD0]]
45 ; CHECK-BE: vmov r1, r0, d16
46 ; CHECK-BE: vmov r3, r2, d17
47 %tmp1 = load <2 x i32>, <2 x i32>* %A
48 %tmp2 = load <2 x i32>, <2 x i32>* %B
; Identity mask over both inputs: indices 0-1 pick %tmp1, 2-3 pick %tmp2.
49 %tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; vcombinefloat: concatenate two <2 x float> vectors, loaded from %A and %B,
; into a single <4 x float>. The shufflevector mask lists elements 0-3 in
; order.
; Expected code: one vldr per operand (from [r0] and [r1], accepted in either
; order), then a vmov of each D register into a core-register pair. The moves
; are matched in the order written: the little-endian run expects the r2,r3
; move before the r0,r1 move, while the big-endian run expects the d16 move
; (into r1,r0) before the d17 move (into r3,r2).
53 define <4 x float> @vcombinefloat(<2 x float>* %A, <2 x float>* %B) nounwind {
54 ; CHECK-LABEL: vcombinefloat
56 ; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
57 ; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
59 ; CHECK-LE: vmov r2, r3, [[LD1]]
60 ; CHECK-LE: vmov r0, r1, [[LD0]]
62 ; CHECK-BE: vmov r1, r0, d16
63 ; CHECK-BE: vmov r3, r2, d17
64 %tmp1 = load <2 x float>, <2 x float>* %A
65 %tmp2 = load <2 x float>, <2 x float>* %B
; Identity mask over both inputs: indices 0-1 pick %tmp1, 2-3 pick %tmp2.
66 %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; vcombine64: concatenate two <1 x i64> vectors, loaded from %A and %B, into a
; single <2 x i64>. The shufflevector mask is <0, 1>.
; Expected code: one vldr per operand (from [r0] and [r1], accepted in either
; order), then a vmov of each loaded D register into a core-register pair,
; with the second operand's move matched first. Both endiannesses reuse the
; captured load registers here; the big-endian run only swaps the registers
; within each pair (r3,r2 and r1,r0 instead of r2,r3 and r0,r1).
70 define <2 x i64> @vcombine64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
71 ; CHECK-LABEL: vcombine64
72 ; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
73 ; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
75 ; CHECK-LE: vmov r2, r3, [[LD1]]
76 ; CHECK-LE: vmov r0, r1, [[LD0]]
78 ; CHECK-BE: vmov r3, r2, [[LD1]]
79 ; CHECK-BE: vmov r1, r0, [[LD0]]
80 %tmp1 = load <1 x i64>, <1 x i64>* %A
81 %tmp2 = load <1 x i64>, <1 x i64>* %B
; Index 0 picks the single element of %tmp1, index 1 that of %tmp2.
82 %tmp3 = shufflevector <1 x i64> %tmp1, <1 x i64> %tmp2, <2 x i32> <i32 0, i32 1>
86 ; Check for vget_low and vget_high implemented with shufflevector. PR8411.
87 ; They should not require storing to the stack.
; vget_low16: return the low half of an <8 x i16> loaded from %A, written as a
; shufflevector that selects elements 0-3 (the second operand is undef and is
; never indexed by the mask).
; Expected code: the result is moved out of d16 into r0,r1 on little-endian,
; and into the swapped pair r1,r0 on big-endian.
89 define <4 x i16> @vget_low16(<8 x i16>* %A) nounwind {
92 ; CHECK-LE: vmov r0, r1, d16
93 ; CHECK-BE: vmov r1, r0, d16
94 %tmp1 = load <8 x i16>, <8 x i16>* %A
; Mask 0-3 keeps only the first four lanes of %tmp1.
95 %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; vget_high8: return the high half of a <16 x i8> loaded from %A, written as a
; shufflevector that selects elements 8-15 (the second operand is undef and is
; never indexed by the mask).
; Expected code on little-endian: no full 128-bit vld1.64 of d16/d17 from
; [r0]; instead only the upper eight bytes are loaded with vldr from
; [r0, #8], and the result is moved from d16 into r0,r1. The big-endian run
; only expects the final move from d16 into the swapped pair r1,r0.
99 define <8 x i8> @vget_high8(<16 x i8>* %A) nounwind {
102 ; CHECK-LE-NOT: vld1.64 {d16, d17}, [r0]
103 ; CHECK-LE: vldr d16, [r0, #8]
104 ; CHECK-LE: vmov r0, r1, d16
105 ; CHECK-BE: vmov r1, r0, d16
106 %tmp1 = load <16 x i8>, <16 x i8>* %A
; Mask 8-15 keeps only the last eight lanes of %tmp1.
107 %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
111 ; vcombine(vld1_dup(p), vld1_dup(p2))
; vcombine_vdup: build an <8 x i16> whose low four lanes all hold the i16 at
; %p and whose high four lanes all hold the i16 at %p + 1 element. Each half
; is formed by inserting the loaded scalar into lane 0 and splatting it with a
; zero shuffle mask; the halves are then concatenated with an identity mask.
; The %src argument is not referenced by any instruction visible here.
; Expected code: two vld1.16 loads into the d16[] and d17[] forms, in that
; order. The little-endian run additionally expects d16 to be moved into
; r0,r1 and d17 into r2,r3; there are no big-endian-specific expectations.
112 define <8 x i16> @vcombine_vdup(<8 x i16> %src, i16* nocapture readonly %p) {
113 ; CHECK-LABEL: vcombine_vdup:
114 ; CHECK: vld1.16 {d16[]},
115 ; CHECK: vld1.16 {d17[]},
116 ; CHECK-LE: vmov r0, r1, d16
117 ; CHECK-LE: vmov r2, r3, d17
; First half: scalar at %p placed in lane 0, then broadcast to all four lanes.
118 %a1 = load i16, i16* %p, align 2
119 %a2 = insertelement <4 x i16> undef, i16 %a1, i32 0
120 %a3 = shufflevector <4 x i16> %a2, <4 x i16> undef, <4 x i32> zeroinitializer
; Second half: same pattern for the next i16 element after %p.
121 %p2 = getelementptr inbounds i16, i16* %p, i32 1
122 %b1 = load i16, i16* %p2, align 2
123 %b2 = insertelement <4 x i16> undef, i16 %b1, i32 0
124 %b3 = shufflevector <4 x i16> %b2, <4 x i16> undef, <4 x i32> zeroinitializer
; Identity mask over both halves: indices 0-3 pick %a3, 4-7 pick %b3.
125 %shuffle = shufflevector <4 x i16> %a3, <4 x i16> %b3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
126 ret <8 x i16> %shuffle