1 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
; Scalar byte swap, 32-bit: llvm.bswap.i32 should select the AArch64 REV
; instruction. NOTE(review): the CHECK body, ret and closing brace of these
; functions are not visible in this (gapped) listing.
3 define i32 @test_rev_w(i32 %a) nounwind {
5 ; CHECK-LABEL: test_rev_w:
7 %0 = tail call i32 @llvm.bswap.i32(i32 %a)
; Scalar byte swap, 64-bit: llvm.bswap.i64 on the full X register.
11 define i64 @test_rev_x(i64 %a) nounwind {
13 ; CHECK-LABEL: test_rev_x:
15 %0 = tail call i64 @llvm.bswap.i64(i64 %a)
19 ; Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 16-bits
20 ; of %a are zero. This optimizes rev + lsr 16 to rev16.
21 define i32 @test_rev_w_srl16(i16 %a) {
23 ; CHECK-LABEL: test_rev_w_srl16:
24 ; CHECK: and [[REG:w[0-9]+]], w0, #0xffff
25 ; CHECK: rev16 w0, [[REG]]
; The zext guarantees bits 31:16 are zero, which is what legitimizes the
; srl -> rotr canonicalization described above.
27 %0 = zext i16 %a to i32
28 %1 = tail call i32 @llvm.bswap.i32(i32 %0)
; NOTE(review): the lshr/ret lines of this function are not visible in this listing.
33 ; Canonicalize (srl (bswap x), 32) to (rotr (bswap x), 32) if the high 32-bits
34 ; of %a are zero. This optimizes rev + lsr 32 to rev32.
35 define i64 @test_rev_x_srl32(i32 %a) {
37 ; CHECK-LABEL: test_rev_x_srl32:
38 ; CHECK: rev32 x0, {{x[0-9]+}}
; zext from i32 zeroes bits 63:32, allowing rev + lsr 32 to fold into rev32.
40 %0 = zext i32 %a to i64
41 %1 = tail call i64 @llvm.bswap.i64(i64 %0)
; Declarations of the scalar byte-swap intrinsics exercised above.
46 declare i32 @llvm.bswap.i32(i32) nounwind readnone
47 declare i64 @llvm.bswap.i64(i64) nounwind readnone
; rev16 pattern written out as shifts, masks and ors: each byte of %X is moved
; to its swapped position within its 16-bit halfword.
49 define i32 @test_rev16_w(i32 %X) nounwind {
51 ; CHECK-LABEL: test_rev16_w:
53 %tmp1 = lshr i32 %X, 8
; No-op bitcast (i32 -> i32); kept byte-identical so the DAG the matcher sees
; matches the original test's shape.
54 %X15 = bitcast i32 %X to i32
55 %tmp4 = shl i32 %X15, 8
; Masks select one byte from each shifted value:
; 16711680 = 0x00FF0000, -16777216 = 0xFF000000, 65280 = 0x0000FF00.
56 %tmp2 = and i32 %tmp1, 16711680
57 %tmp5 = and i32 %tmp4, -16777216
58 %tmp9 = and i32 %tmp1, 255
59 %tmp13 = and i32 %tmp4, 65280
; Recombine the four bytes; the whole dag should match a single REV16.
60 %tmp6 = or i32 %tmp5, %tmp2
61 %tmp10 = or i32 %tmp6, %tmp13
62 %tmp14 = or i32 %tmp10, %tmp9
66 ; 64-bit REV16 is *not* a swap then a 16-bit rotation:
67 ; 01234567 ->(bswap) 76543210 ->(rotr) 10765432
68 ; 01234567 ->(rev16) 10325476
; Negative test: bswap.i64 followed by a 16-bit rotate must NOT be matched to
; a 64-bit REV16 (the CHECK-NOT below guards against that miscompile).
69 define i64 @test_rev16_x(i64 %a) nounwind {
71 ; CHECK-LABEL: test_rev16_x:
72 ; CHECK-NOT: rev16 x0, x0
73 %0 = tail call i64 @llvm.bswap.i64(i64 %a)
; NOTE(review): the shift/or/ret lines of this function are not visible in this listing.
; rev32 on a 64-bit value: starts from bswap.i64. NOTE(review): the combining
; shift/or, the CHECK body and the ret are not visible in this listing —
; presumably they form the rotr-by-32 that selects REV32; confirm upstream.
80 define i64 @test_rev32_x(i64 %a) nounwind {
82 ; CHECK-LABEL: test_rev32_x:
84 %0 = tail call i64 @llvm.bswap.i64(i64 %a)
; VREV64, 64-bit (D) vectors: each shuffle mask reverses the element order of
; the whole 64-bit register, which should select a vector REV64-style reverse.
; NOTE(review): per-function CHECK bodies and closing braces are not visible
; in this gapped listing.
91 define <8 x i8> @test_vrev64D8(<8 x i8>* %A) nounwind {
92 ;CHECK-LABEL: test_vrev64D8:
94 %tmp1 = load <8 x i8>, <8 x i8>* %A
95 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; Same pattern with 4 x i16 lanes.
99 define <4 x i16> @test_vrev64D16(<4 x i16>* %A) nounwind {
100 ;CHECK-LABEL: test_vrev64D16:
102 %tmp1 = load <4 x i16>, <4 x i16>* %A
103 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; Same pattern with 2 x i32 lanes.
107 define <2 x i32> @test_vrev64D32(<2 x i32>* %A) nounwind {
108 ;CHECK-LABEL: test_vrev64D32:
110 %tmp1 = load <2 x i32>, <2 x i32>* %A
111 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
; Same pattern with 2 x float lanes (checks FP vectors take the same path).
115 define <2 x float> @test_vrev64Df(<2 x float>* %A) nounwind {
116 ;CHECK-LABEL: test_vrev64Df:
118 %tmp1 = load <2 x float>, <2 x float>* %A
119 %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
120 ret <2 x float> %tmp2
; VREV64, 128-bit (Q) vectors: the masks reverse element order within each
; 64-bit half of the register independently.
; NOTE(review): CHECK bodies / closing braces are not visible in this listing.
123 define <16 x i8> @test_vrev64Q8(<16 x i8>* %A) nounwind {
124 ;CHECK-LABEL: test_vrev64Q8:
126 %tmp1 = load <16 x i8>, <16 x i8>* %A
127 %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
; 8 x i16: <3,2,1,0 | 7,6,5,4> — reversal within each doubleword.
131 define <8 x i16> @test_vrev64Q16(<8 x i16>* %A) nounwind {
132 ;CHECK-LABEL: test_vrev64Q16:
134 %tmp1 = load <8 x i16>, <8 x i16>* %A
135 %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
; 4 x i32: <1,0 | 3,2> — swap the i32 pair inside each doubleword.
139 define <4 x i32> @test_vrev64Q32(<4 x i32>* %A) nounwind {
140 ;CHECK-LABEL: test_vrev64Q32:
142 %tmp1 = load <4 x i32>, <4 x i32>* %A
143 %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
; 4 x float: same mask as the i32 case, on FP lanes.
147 define <4 x float> @test_vrev64Qf(<4 x float>* %A) nounwind {
148 ;CHECK-LABEL: test_vrev64Qf:
150 %tmp1 = load <4 x float>, <4 x float>* %A
151 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
152 ret <4 x float> %tmp2
; VREV32: the masks reverse element order within each 32-bit word.
; NOTE(review): CHECK bodies / closing braces are not visible in this listing.
155 define <8 x i8> @test_vrev32D8(<8 x i8>* %A) nounwind {
156 ;CHECK-LABEL: test_vrev32D8:
158 %tmp1 = load <8 x i8>, <8 x i8>* %A
159 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
; 4 x i16 in a D register: swap the i16 pair inside each word.
163 define <4 x i16> @test_vrev32D16(<4 x i16>* %A) nounwind {
164 ;CHECK-LABEL: test_vrev32D16:
166 %tmp1 = load <4 x i16>, <4 x i16>* %A
167 %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
; 16 x i8 in a Q register: bytes reversed within each of the four words.
171 define <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind {
172 ;CHECK-LABEL: test_vrev32Q8:
174 %tmp1 = load <16 x i8>, <16 x i8>* %A
175 %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
; 8 x i16 in a Q register: halfword pairs swapped within each word.
179 define <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind {
180 ;CHECK-LABEL: test_vrev32Q16:
182 %tmp1 = load <8 x i16>, <8 x i16>* %A
183 %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
; VREV16: byte pairs swapped within each 16-bit halfword.
; NOTE(review): CHECK bodies / closing braces are not visible in this listing.
187 define <8 x i8> @test_vrev16D8(<8 x i8>* %A) nounwind {
188 ;CHECK-LABEL: test_vrev16D8:
190 %tmp1 = load <8 x i8>, <8 x i8>* %A
191 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
; Same pattern widened to a 128-bit register.
195 define <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind {
196 ;CHECK-LABEL: test_vrev16Q8:
198 %tmp1 = load <16 x i8>, <16 x i8>* %A
199 %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
203 ; Undef shuffle indices should not prevent matching to VREV:
; Mask is <7,u,u,4,3,2,1,0> — the defined lanes are consistent with a full
; 64-bit reversal, so the matcher should still pick the REV form.
205 define <8 x i8> @test_vrev64D8_undef(<8 x i8>* %A) nounwind {
206 ;CHECK-LABEL: test_vrev64D8_undef:
208 %tmp1 = load <8 x i8>, <8 x i8>* %A
209 %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 undef, i32 undef, i32 4, i32 3, i32 2, i32 1, i32 0>
; Same idea for the within-word (vrev32) halfword swap with undef lanes.
213 define <8 x i16> @test_vrev32Q16_undef(<8 x i16>* %A) nounwind {
214 ;CHECK-LABEL: test_vrev32Q16_undef:
216 %tmp1 = load <8 x i16>, <8 x i16>* %A
217 %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 undef>
221 ; vrev <4 x i16> should use REV32 and not REV64
222 define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
223 ; CHECK-LABEL: test_vrev64:
224 ; CHECK: ldr [[DEST:q[0-9]+]],
; Load reinterpreted as 8 x i16, then elements 6 and 5 are extracted in
; reversed order — an adjacent-element swap the backend can do with a
; within-word reverse (REV32) rather than a doubleword reverse.
228 %0 = bitcast <4 x i16>* %source to <8 x i16>*
229 %tmp2 = load <8 x i16>, <8 x i16>* %0, align 4
230 %tmp3 = extractelement <8 x i16> %tmp2, i32 6
231 %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
232 %tmp9 = extractelement <8 x i16> %tmp2, i32 5
233 %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
234 store <2 x i16> %tmp11, <2 x i16>* %dst, align 4
; NOTE(review): further CHECK lines, ret and closing brace are not visible here.
238 ; Test vrev of float4
239 define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest) nounwind noinline ssp {
; Use CHECK-LABEL (with trailing colon), consistent with every other test in
; this file; a bare "CHECK: float_vrev64" does not partition FileCheck's match
; scope, so subsequent CHECKs could match output from a different function.
240 ; CHECK-LABEL: float_vrev64:
241 ; CHECK: ldr [[DEST:q[0-9]+]],
; Shuffle pulls lane 0 of a zero vector and lane 3 (index 7) of the loaded
; data — a cross-operand pick the backend must lower without scalarizing.
244 %0 = bitcast float* %source to <4 x float>*
245 %tmp2 = load <4 x float>, <4 x float>* %0, align 4
246 %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
247 %arrayidx8 = getelementptr inbounds <4 x float>, <4 x float>* %dest, i32 11
248 store <4 x float> %tmp5, <4 x float>* %arrayidx8, align 4
; NOTE(review): the store CHECK, ret and closing brace are not visible in this listing.
; Vector byte swap: bswap on each i32 lane — presumably expected to select a
; byte-reverse-within-word vector instruction (CHECK body not visible here;
; confirm upstream).
253 define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
254 ; CHECK-LABEL: test_vrev32_bswap:
258 %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
; Declaration of the vector byte-swap intrinsic used above.
262 declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone