; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=apple | FileCheck %s
; RUN: llc < %s -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* -mtriple=aarch64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=FALLBACK,GISEL

; FALLBACK-NOT: remark{{.*}}test_rev_w
define i32 @test_rev_w(i32 %a) nounwind {
; CHECK-LABEL: test_rev_w:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev w0, w0
; CHECK-NEXT:    ret
; GISEL-LABEL: test_rev_w:
; GISEL:       // %bb.0: // %entry
; GISEL-NEXT:    rev w0, w0
; GISEL-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.bswap.i32(i32 %a)
  ret i32 %0
}

; FALLBACK-NOT: remark{{.*}}test_rev_x
define i64 @test_rev_x(i64 %a) nounwind {
; CHECK-LABEL: test_rev_x:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev x0, x0
; CHECK-NEXT:    ret
; GISEL-LABEL: test_rev_x:
; GISEL:       // %bb.0: // %entry
; GISEL-NEXT:    rev x0, x0
; GISEL-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  ret i64 %0
}

; Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 16 bits
; of %a are zero. This optimizes rev + lsr 16 to rev16.
define i32 @test_rev_w_srl16(i16 %a) {
; CHECK-LABEL: test_rev_w_srl16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and w8, w0, #0xffff
; CHECK-NEXT:    rev16 w0, w8
; CHECK-NEXT:    ret
entry:
  %0 = zext i16 %a to i32
  %1 = tail call i32 @llvm.bswap.i32(i32 %0)
  %2 = lshr i32 %1, 16
  ret i32 %2
}

define i32 @test_rev_w_srl16_load(i16 *%a) {
; CHECK-LABEL: test_rev_w_srl16_load:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldrh w8, [x0]
; CHECK-NEXT:    rev16 w0, w8
; CHECK-NEXT:    ret
entry:
  %0 = load i16, i16 *%a
  %1 = zext i16 %0 to i32
  %2 = tail call i32 @llvm.bswap.i32(i32 %1)
  %3 = lshr i32 %2, 16
  ret i32 %3
}

define i32 @test_rev_w_srl16_add(i8 %a, i8 %b) {
; CHECK-LABEL: test_rev_w_srl16_add:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and w8, w0, #0xff
; CHECK-NEXT:    add w8, w8, w1, uxtb
; CHECK-NEXT:    rev16 w0, w8
; CHECK-NEXT:    ret
entry:
  %0 = zext i8 %a to i32
  %1 = zext i8 %b to i32
  %2 = add i32 %0, %1
  %3 = tail call i32 @llvm.bswap.i32(i32 %2)
  %4 = lshr i32 %3, 16
  ret i32 %4
}

; Canonicalize (srl (bswap x), 32) to (rotr (bswap x), 32) if the high 32 bits
; of %a are zero. This optimizes rev + lsr 32 to rev32.
define i64 @test_rev_x_srl32(i32 %a) {
; CHECK-LABEL: test_rev_x_srl32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    rev32 x0, x8
; CHECK-NEXT:    ret
entry:
  %0 = zext i32 %a to i64
  %1 = tail call i64 @llvm.bswap.i64(i64 %0)
  %2 = lshr i64 %1, 32
  ret i64 %2
}

define i64 @test_rev_x_srl32_load(i32 *%a) {
; CHECK-LABEL: test_rev_x_srl32_load:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr w8, [x0]
; CHECK-NEXT:    rev32 x0, x8
; CHECK-NEXT:    ret
entry:
  %0 = load i32, i32 *%a
  %1 = zext i32 %0 to i64
  %2 = tail call i64 @llvm.bswap.i64(i64 %1)
  %3 = lshr i64 %2, 32
  ret i64 %3
}

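; Here the known-zero high bits come from a shift pair rather than a zext: the
; masking shifts below are expected to fold into the ubfx in the CHECK lines,
; and the srl-by-32 of the bswap should still become rev32.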
define i64 @test_rev_x_srl32_shift(i64 %a) {
; CHECK-LABEL: test_rev_x_srl32_shift:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ubfx x8, x0, #2, #29
; CHECK-NEXT:    rev32 x0, x8
; CHECK-NEXT:    ret
entry:
  %0 = shl i64 %a, 33
  %1 = lshr i64 %0, 35
  %2 = tail call i64 @llvm.bswap.i64(i64 %1)
  %3 = lshr i64 %2, 32
  ret i64 %3
}

declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone

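; An open-coded 16-bit byte swap (shifts, masks and ors that exchange the two
; bytes within each halfword) should be recognized and selected as a single REV16.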
define i32 @test_rev16_w(i32 %X) nounwind {
; CHECK-LABEL: test_rev16_w:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev16 w0, w0
; CHECK-NEXT:    ret
entry:
  %tmp1 = lshr i32 %X, 8
  %X15 = bitcast i32 %X to i32
  %tmp4 = shl i32 %X15, 8
  %tmp2 = and i32 %tmp1, 16711680
  %tmp5 = and i32 %tmp4, -16777216
  %tmp9 = and i32 %tmp1, 255
  %tmp13 = and i32 %tmp4, 65280
  %tmp6 = or i32 %tmp5, %tmp2
  %tmp10 = or i32 %tmp6, %tmp13
  %tmp14 = or i32 %tmp10, %tmp9
  ret i32 %tmp14
}

; 64-bit REV16 is *not* a swap then a 16-bit rotation:
;   01234567 ->(bswap) 76543210 ->(rotr) 10765432
;   01234567 ->(rev16) 10325476
define i64 @test_rev16_x(i64 %a) nounwind {
; CHECK-LABEL: test_rev16_x:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev x8, x0
; CHECK-NEXT:    ror x0, x8, #16
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  %1 = lshr i64 %0, 16
  %2 = shl i64 %0, 48
  %3 = or i64 %1, %2
  ret i64 %3
}

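; A 64-bit bswap followed by a rotate of 32 swaps the bytes within each 32-bit
; half, which is exactly REV32, so the pair below should collapse to rev32.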
define i64 @test_rev32_x(i64 %a) nounwind {
; CHECK-LABEL: test_rev32_x:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev32 x0, x0
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  %1 = lshr i64 %0, 32
  %2 = shl i64 %0, 32
  %3 = or i64 %1, %2
  ret i64 %3
}

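; Vector reverses: shufflevector masks that reverse the element order within
; 64-bit, 32-bit or 16-bit containers should select REV64, REV32 or REV16 with
; the matching lane size.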
define <8 x i8> @test_vrev64D8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev64D8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.8b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}

define <4 x i16> @test_vrev64D16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev64D16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.4h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i16> %tmp2
}

define <2 x i32> @test_vrev64D32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: test_vrev64D32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.2s v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i32> %tmp2
}

define <2 x float> @test_vrev64Df(<2 x float>* %A) nounwind {
; CHECK-LABEL: test_vrev64Df:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.2s v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x float> %tmp2
}

define <16 x i8> @test_vrev64Q8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev64Q8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.16b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i8> %tmp2
}

define <8 x i16> @test_vrev64Q16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev64Q16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.8h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i16> %tmp2
}

define <4 x i32> @test_vrev64Q32(<4 x i32>* %A) nounwind {
; CHECK-LABEL: test_vrev64Q32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.4s v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i32> %tmp2
}

define <4 x float> @test_vrev64Qf(<4 x float>* %A) nounwind {
; CHECK-LABEL: test_vrev64Qf:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.4s v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x float> %tmp2
}

define <8 x i8> @test_vrev32D8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev32D8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev32.8b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i8> %tmp2
}

define <4 x i16> @test_vrev32D16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev32D16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev32.4h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i16> %tmp2
}

define <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev32Q8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev32.16b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  ret <16 x i8> %tmp2
}

define <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev32Q16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev32.8h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i16> %tmp2
}

define <8 x i8> @test_vrev16D8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev16D8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev16.8b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i8> %tmp2
}

define <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev16Q8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev16.16b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
  ret <16 x i8> %tmp2
}

; Undef shuffle indices should not prevent matching to VREV:

define <8 x i8> @test_vrev64D8_undef(<8 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev64D8_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.8b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 undef, i32 undef, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}

define <8 x i16> @test_vrev32Q16_undef(<8 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev32Q16_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev32.8h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 undef>
  ret <8 x i16> %tmp2
}

; vrev <4 x i16> should use REV32 and not REV64.
define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
; CHECK-LABEL: test_vrev64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    add x8, x1, #2 // =2
; CHECK-NEXT:    st1.h { v0 }[5], [x8]
; CHECK-NEXT:    st1.h { v0 }[6], [x1]
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <4 x i16>* %source to <8 x i16>*
  %tmp2 = load <8 x i16>, <8 x i16>* %0, align 4
  %tmp3 = extractelement <8 x i16> %tmp2, i32 6
  %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
  %tmp9 = extractelement <8 x i16> %tmp2, i32 5
  %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
  store <2 x i16> %tmp11, <2 x i16>* %dst, align 4
  ret void
}

; Test vrev of float4.
define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest) nounwind noinline ssp {
; CHECK-LABEL: float_vrev64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    movi.2d v1, #0000000000000000
; CHECK-NEXT:    dup.4s v1, v1[0]
; CHECK-NEXT:    ext.16b v0, v0, v1, #12
; CHECK-NEXT:    rev64.4s v0, v0
; CHECK-NEXT:    str q0, [x1, #176]
; CHECK-NEXT:    ret
entry:
  %0 = bitcast float* %source to <4 x float>*
  %tmp2 = load <4 x float>, <4 x float>* %0, align 4
  %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
  %arrayidx8 = getelementptr inbounds <4 x float>, <4 x float>* %dest, i32 11
  store <4 x float> %tmp5, <4 x float>* %arrayidx8, align 4
  ret void
}

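; A per-lane i32 bswap on a vector reverses the bytes within each 32-bit lane,
; which is what byte-granularity REV32 does, so it should select rev32.16b.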
define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
; CHECK-LABEL: test_vrev32_bswap:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rev32.16b v0, v0
; CHECK-NEXT:    ret
; GISEL-LABEL: test_vrev32_bswap:
; GISEL:       // %bb.0:
; GISEL-NEXT:    rev32.16b v0, v0
; GISEL-NEXT:    ret
  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
  ret <4 x i32> %bswap
}

declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone