; RUN: llc < %s -march=arm -mattr=+neon > %t
; RUN: grep {vrshl\\.s8} %t | count 2
; RUN: grep {vrshl\\.s16} %t | count 2
; RUN: grep {vrshl\\.s32} %t | count 2
; RUN: grep {vrshl\\.s64} %t | count 2
; RUN: grep {vrshl\\.u8} %t | count 2
; RUN: grep {vrshl\\.u16} %t | count 2
; RUN: grep {vrshl\\.u32} %t | count 2
; RUN: grep {vrshl\\.u64} %t | count 2
; RUN: grep {vrshr\\.s8} %t | count 2
; RUN: grep {vrshr\\.s16} %t | count 2
; RUN: grep {vrshr\\.s32} %t | count 2
; RUN: grep {vrshr\\.s64} %t | count 2
; RUN: grep {vrshr\\.u8} %t | count 2
; RUN: grep {vrshr\\.u16} %t | count 2
; RUN: grep {vrshr\\.u32} %t | count 2
; RUN: grep {vrshr\\.u64} %t | count 2
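
; Rounding shift left (vrshl): the shift amount is a second vector operand.
; Each signed/unsigned intrinsic is tested with a 64-bit and a 128-bit vector,
; so every vrshl opcode should appear exactly twice, matching the counts above.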

define <8 x i8> @vrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	%tmp1 = load <8 x i8>* %A
	%tmp2 = load <8 x i8>* %B
	%tmp3 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3
}

define <4 x i16> @vrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
	%tmp1 = load <4 x i16>* %A
	%tmp2 = load <4 x i16>* %B
	%tmp3 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
	ret <4 x i16> %tmp3
}

define <2 x i32> @vrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
	%tmp1 = load <2 x i32>* %A
	%tmp2 = load <2 x i32>* %B
	%tmp3 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
	ret <2 x i32> %tmp3
}

define <1 x i64> @vrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
	%tmp1 = load <1 x i64>* %A
	%tmp2 = load <1 x i64>* %B
	%tmp3 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
	ret <1 x i64> %tmp3
}

define <8 x i8> @vrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
	%tmp1 = load <8 x i8>* %A
	%tmp2 = load <8 x i8>* %B
	%tmp3 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3
}

define <4 x i16> @vrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
	%tmp1 = load <4 x i16>* %A
	%tmp2 = load <4 x i16>* %B
	%tmp3 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
	ret <4 x i16> %tmp3
}

define <2 x i32> @vrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
	%tmp1 = load <2 x i32>* %A
	%tmp2 = load <2 x i32>* %B
	%tmp3 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
	ret <2 x i32> %tmp3
}

define <1 x i64> @vrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
	%tmp1 = load <1 x i64>* %A
	%tmp2 = load <1 x i64>* %B
	%tmp3 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
	ret <1 x i64> %tmp3
}

define <16 x i8> @vrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
	%tmp1 = load <16 x i8>* %A
	%tmp2 = load <16 x i8>* %B
	%tmp3 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
	ret <16 x i8> %tmp3
}

define <8 x i16> @vrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
	%tmp1 = load <8 x i16>* %A
	%tmp2 = load <8 x i16>* %B
	%tmp3 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
	ret <8 x i16> %tmp3
}

define <4 x i32> @vrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
	%tmp1 = load <4 x i32>* %A
	%tmp2 = load <4 x i32>* %B
	%tmp3 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
	ret <4 x i32> %tmp3
}

define <2 x i64> @vrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
	%tmp1 = load <2 x i64>* %A
	%tmp2 = load <2 x i64>* %B
	%tmp3 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
	ret <2 x i64> %tmp3
}

define <16 x i8> @vrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
	%tmp1 = load <16 x i8>* %A
	%tmp2 = load <16 x i8>* %B
	%tmp3 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
	ret <16 x i8> %tmp3
}

define <8 x i16> @vrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
	%tmp1 = load <8 x i16>* %A
	%tmp2 = load <8 x i16>* %B
	%tmp3 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
	ret <8 x i16> %tmp3
}

define <4 x i32> @vrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
	%tmp1 = load <4 x i32>* %A
	%tmp2 = load <4 x i32>* %B
	%tmp3 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
	ret <4 x i32> %tmp3
}

define <2 x i64> @vrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
	%tmp1 = load <2 x i64>* %A
	%tmp2 = load <2 x i64>* %B
	%tmp3 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
	ret <2 x i64> %tmp3
}
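
; Rounding shift right (vrshr): there is no separate right-shift intrinsic;
; a constant splat of negative shift amounts should instead be selected as
; vrshr with the equivalent positive immediate.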

define <8 x i8> @vrshrs8(<8 x i8>* %A) nounwind {
	%tmp1 = load <8 x i8>* %A
	%tmp2 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
	ret <8 x i8> %tmp2
}

define <4 x i16> @vrshrs16(<4 x i16>* %A) nounwind {
	%tmp1 = load <4 x i16>* %A
	%tmp2 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
	ret <4 x i16> %tmp2
}

define <2 x i32> @vrshrs32(<2 x i32>* %A) nounwind {
	%tmp1 = load <2 x i32>* %A
	%tmp2 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
	ret <2 x i32> %tmp2
}

define <1 x i64> @vrshrs64(<1 x i64>* %A) nounwind {
	%tmp1 = load <1 x i64>* %A
	%tmp2 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
	ret <1 x i64> %tmp2
}

define <8 x i8> @vrshru8(<8 x i8>* %A) nounwind {
	%tmp1 = load <8 x i8>* %A
	%tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
	ret <8 x i8> %tmp2
}

define <4 x i16> @vrshru16(<4 x i16>* %A) nounwind {
	%tmp1 = load <4 x i16>* %A
	%tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
	ret <4 x i16> %tmp2
}

define <2 x i32> @vrshru32(<2 x i32>* %A) nounwind {
	%tmp1 = load <2 x i32>* %A
	%tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
	ret <2 x i32> %tmp2
}

define <1 x i64> @vrshru64(<1 x i64>* %A) nounwind {
	%tmp1 = load <1 x i64>* %A
	%tmp2 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
	ret <1 x i64> %tmp2
}

define <16 x i8> @vrshrQs8(<16 x i8>* %A) nounwind {
	%tmp1 = load <16 x i8>* %A
	%tmp2 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
	ret <16 x i8> %tmp2
}

define <8 x i16> @vrshrQs16(<8 x i16>* %A) nounwind {
	%tmp1 = load <8 x i16>* %A
	%tmp2 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
	ret <8 x i16> %tmp2
}

define <4 x i32> @vrshrQs32(<4 x i32>* %A) nounwind {
	%tmp1 = load <4 x i32>* %A
	%tmp2 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
	ret <4 x i32> %tmp2
}

define <2 x i64> @vrshrQs64(<2 x i64>* %A) nounwind {
	%tmp1 = load <2 x i64>* %A
	%tmp2 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
	ret <2 x i64> %tmp2
}

define <16 x i8> @vrshrQu8(<16 x i8>* %A) nounwind {
	%tmp1 = load <16 x i8>* %A
	%tmp2 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
	ret <16 x i8> %tmp2
}

define <8 x i16> @vrshrQu16(<8 x i16>* %A) nounwind {
	%tmp1 = load <8 x i16>* %A
	%tmp2 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
	ret <8 x i16> %tmp2
}

define <4 x i32> @vrshrQu32(<4 x i32>* %A) nounwind {
	%tmp1 = load <4 x i32>* %A
	%tmp2 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
	ret <4 x i32> %tmp2
}

define <2 x i64> @vrshrQu64(<2 x i64>* %A) nounwind {
	%tmp1 = load <2 x i64>* %A
	%tmp2 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
	ret <2 x i64> %tmp2
}
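
; Declarations of the signed and unsigned rounding-shift intrinsics used above.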

declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone

declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone

declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone

declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone