; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=apple | FileCheck %s
; RUN: llc < %s -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* -mtriple=aarch64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=FALLBACK,GISEL
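
; These tests check that scalar bswap intrinsics and byte-reversing
; shufflevector patterns are selected to the AArch64 REV, REV16, and REV32
; instructions and their vector equivalents.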

; FALLBACK-NOT: remark{{.*}}test_rev_w
define i32 @test_rev_w(i32 %a) nounwind {
; CHECK-LABEL: test_rev_w:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev w0, w0
; CHECK-NEXT:    ret
; GISEL-LABEL: test_rev_w:
; GISEL:       // %bb.0: // %entry
; GISEL-NEXT:    rev w0, w0
; GISEL-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.bswap.i32(i32 %a)
  ret i32 %0
}

; FALLBACK-NOT: remark{{.*}}test_rev_x
define i64 @test_rev_x(i64 %a) nounwind {
; CHECK-LABEL: test_rev_x:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev x0, x0
; CHECK-NEXT:    ret
; GISEL-LABEL: test_rev_x:
; GISEL:       // %bb.0: // %entry
; GISEL-NEXT:    rev x0, x0
; GISEL-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  ret i64 %0
}

; Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 16-bits
; of %a are zero. This optimizes rev + lsr 16 to rev16.
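; For example (illustrative values, not autogenerated checks):
;   0x0000ABCD ->(bswap) 0xCDAB0000 ->(lsr #16) 0x0000CDAB,
;   which is exactly rev16 applied to 0x0000ABCD.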
define i32 @test_rev_w_srl16(i16 %a) {
; CHECK-LABEL: test_rev_w_srl16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and w8, w0, #0xffff
; CHECK-NEXT:    rev16 w0, w8
; CHECK-NEXT:    ret
entry:
  %0 = zext i16 %a to i32
  %1 = tail call i32 @llvm.bswap.i32(i32 %0)
  %2 = lshr i32 %1, 16
  ret i32 %2
}

; Canonicalize (srl (bswap x), 32) to (rotr (bswap x), 32) if the high 32-bits
; of %a are zero. This optimizes rev + lsr 32 to rev32.
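; For example: 0x00000000AABBCCDD ->(bswap) 0xDDCCBBAA00000000
;   ->(lsr #32) 0x00000000DDCCBBAA, i.e. rev32 of the zero-extended input.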
define i64 @test_rev_x_srl32(i32 %a) {
; CHECK-LABEL: test_rev_x_srl32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    rev32 x0, x8
; CHECK-NEXT:    ret
entry:
  %0 = zext i32 %a to i64
  %1 = tail call i64 @llvm.bswap.i64(i64 %0)
  %2 = lshr i64 %1, 32
  ret i64 %2
}

declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone
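
; The shift/and/or sequence below is an open-coded byte swap of each 16-bit
; halfword of %X, so it should collapse to a single rev16.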
define i32 @test_rev16_w(i32 %X) nounwind {
; CHECK-LABEL: test_rev16_w:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev16 w0, w0
; CHECK-NEXT:    ret
entry:
  %tmp1 = lshr i32 %X, 8
  %X15 = bitcast i32 %X to i32
  %tmp4 = shl i32 %X15, 8
  %tmp2 = and i32 %tmp1, 16711680
  %tmp5 = and i32 %tmp4, -16777216
  %tmp9 = and i32 %tmp1, 255
  %tmp13 = and i32 %tmp4, 65280
  %tmp6 = or i32 %tmp5, %tmp2
  %tmp10 = or i32 %tmp6, %tmp13
  %tmp14 = or i32 %tmp10, %tmp9
  ret i32 %tmp14
}

; 64-bit REV16 is *not* a swap then a 16-bit rotation:
;   01234567 ->(bswap)   76543210 ->(rotr 16) 10765432
;   01234567 ->(rev16)   10325476
define i64 @test_rev16_x(i64 %a) nounwind {
; CHECK-LABEL: test_rev16_x:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev x8, x0
; CHECK-NEXT:    ror x0, x8, #16
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  %1 = lshr i64 %0, 16
  %2 = shl i64 %0, 48
  %3 = or i64 %1, %2
  ret i64 %3
}

define i64 @test_rev32_x(i64 %a) nounwind {
; CHECK-LABEL: test_rev32_x:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev32 x0, x0
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  %1 = lshr i64 %0, 32
  %2 = shl i64 %0, 32
  %3 = or i64 %1, %2
  ret i64 %3
}
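
; Vector REV tests: each shufflevector mask below reverses the elements inside
; 64-, 32-, or 16-bit containers and should be matched to the corresponding
; rev64/rev32/rev16 with the appropriate arrangement specifier.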

define <8 x i8> @test_vrev64D8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev64D8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.8b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}

define <4 x i16> @test_vrev64D16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev64D16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.4h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i16> %tmp2
}

define <2 x i32> @test_vrev64D32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: test_vrev64D32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.2s v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i32> %tmp2
}

define <2 x float> @test_vrev64Df(<2 x float>* %A) nounwind {
; CHECK-LABEL: test_vrev64Df:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.2s v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x float> %tmp2
}

define <16 x i8> @test_vrev64Q8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev64Q8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.16b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i8> %tmp2
}

define <8 x i16> @test_vrev64Q16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev64Q16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.8h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i16> %tmp2
}

define <4 x i32> @test_vrev64Q32(<4 x i32>* %A) nounwind {
; CHECK-LABEL: test_vrev64Q32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.4s v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i32> %tmp2
}

define <4 x float> @test_vrev64Qf(<4 x float>* %A) nounwind {
; CHECK-LABEL: test_vrev64Qf:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.4s v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x float> %tmp2
}

define <8 x i8> @test_vrev32D8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev32D8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev32.8b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i8> %tmp2
}

define <4 x i16> @test_vrev32D16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev32D16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev32.4h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i16> %tmp2
}

define <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev32Q8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev32.16b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  ret <16 x i8> %tmp2
}

define <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev32Q16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev32.8h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i16> %tmp2
}

define <8 x i8> @test_vrev16D8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev16D8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev16.8b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i8> %tmp2
}

define <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev16Q8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev16.16b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
  ret <16 x i8> %tmp2
}

; Undef shuffle indices should not prevent matching to VREV:

define <8 x i8> @test_vrev64D8_undef(<8 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev64D8_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.8b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 undef, i32 undef, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}

define <8 x i16> @test_vrev32Q16_undef(<8 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev32Q16_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev32.8h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 undef>
  ret <8 x i16> %tmp2
}

; vrev <4 x i16> should use REV32 and not REV64
define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
; CHECK-LABEL: test_vrev64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    add x8, x1, #2 // =2
; CHECK-NEXT:    st1.h { v0 }[5], [x8]
; CHECK-NEXT:    st1.h { v0 }[6], [x1]
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <4 x i16>* %source to <8 x i16>*
  %tmp2 = load <8 x i16>, <8 x i16>* %0, align 4
  %tmp3 = extractelement <8 x i16> %tmp2, i32 6
  %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
  %tmp9 = extractelement <8 x i16> %tmp2, i32 5
  %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
  store <2 x i16> %tmp11, <2 x i16>* %dst, align 4
  ret void
}

; Test vrev of float4
define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest) nounwind noinline ssp {
; CHECK-LABEL: float_vrev64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    movi.2d v1, #0000000000000000
; CHECK-NEXT:    dup.4s v1, v1[0]
; CHECK-NEXT:    ext.16b v0, v0, v1, #12
; CHECK-NEXT:    rev64.4s v0, v0
; CHECK-NEXT:    str q0, [x1, #176]
; CHECK-NEXT:    ret
entry:
  %0 = bitcast float* %source to <4 x float>*
  %tmp2 = load <4 x float>, <4 x float>* %0, align 4
  %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
  %arrayidx8 = getelementptr inbounds <4 x float>, <4 x float>* %dest, i32 11
  store <4 x float> %tmp5, <4 x float>* %arrayidx8, align 4
  ret void
}
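
; A v4i32 bswap reverses the bytes within each 32-bit lane, so it should
; select rev32 with a byte arrangement.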
define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
; CHECK-LABEL: test_vrev32_bswap:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rev32.16b v0, v0
; CHECK-NEXT:    ret
  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
  ret <4 x i32> %bswap
}

declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone