; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=apple | FileCheck %s
; RUN: llc < %s -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* -mtriple=aarch64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=FALLBACK,GISEL
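
; The GlobalISel RUN line uses -global-isel-abort=2 so that selection failures
; fall back to SelectionDAG instead of erroring, and -pass-remarks-missed=gisel*
; prints a remark whenever that fallback happens. The FALLBACK-NOT lines below
; therefore assert that the functions they name were selected without falling
; back.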

; FALLBACK-NOT: remark{{.*}}test_rev_w
define i32 @test_rev_w(i32 %a) nounwind {
; CHECK-LABEL: test_rev_w:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev w0, w0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev_w:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    rev w0, w0
; FALLBACK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.bswap.i32(i32 %a)
  ret i32 %0
}

; FALLBACK-NOT: remark{{.*}}test_rev_x
define i64 @test_rev_x(i64 %a) nounwind {
; CHECK-LABEL: test_rev_x:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev x0, x0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev_x:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    rev x0, x0
; FALLBACK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  ret i64 %0
}

; Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 16 bits
; of %a are zero. This optimizes rev + lsr #16 to a single rev16.
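;
; Illustrative trace (values are for exposition, not checked by FileCheck):
; for x = 0x00001234, bswap gives 0x34120000 and lsr #16 gives 0x00003412,
; which equals rev16 of 0x00001234: bytes swap within each 16-bit halfword,
; and the known-zero high halfword stays zero.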
define i32 @test_rev_w_srl16(i16 %a) {
; CHECK-LABEL: test_rev_w_srl16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and w8, w0, #0xffff
; CHECK-NEXT:    rev16 w0, w8
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev_w_srl16:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    and w8, w0, #0xffff
; FALLBACK-NEXT:    rev w8, w8
; FALLBACK-NEXT:    lsr w0, w8, #16
; FALLBACK-NEXT:    ret
entry:
  %0 = zext i16 %a to i32
  %1 = tail call i32 @llvm.bswap.i32(i32 %0)
  %2 = lshr i32 %1, 16
  ret i32 %2
}

define i32 @test_rev_w_srl16_load(i16 *%a) {
; CHECK-LABEL: test_rev_w_srl16_load:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldrh w8, [x0]
; CHECK-NEXT:    rev16 w0, w8
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev_w_srl16_load:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    ldrh w8, [x0]
; FALLBACK-NEXT:    rev w8, w8
; FALLBACK-NEXT:    lsr w0, w8, #16
; FALLBACK-NEXT:    ret
entry:
  %0 = load i16, i16 *%a
  %1 = zext i16 %0 to i32
  %2 = tail call i32 @llvm.bswap.i32(i32 %1)
  %3 = lshr i32 %2, 16
  ret i32 %3
}

define i32 @test_rev_w_srl16_add(i8 %a, i8 %b) {
; CHECK-LABEL: test_rev_w_srl16_add:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and w8, w0, #0xff
; CHECK-NEXT:    add w8, w8, w1, uxtb
; CHECK-NEXT:    rev16 w0, w8
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev_w_srl16_add:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    and w8, w1, #0xff
; FALLBACK-NEXT:    add w8, w8, w0, uxtb
; FALLBACK-NEXT:    rev w8, w8
; FALLBACK-NEXT:    lsr w0, w8, #16
; FALLBACK-NEXT:    ret
entry:
  %0 = zext i8 %a to i32
  %1 = zext i8 %b to i32
  %2 = add i32 %0, %1
  %3 = tail call i32 @llvm.bswap.i32(i32 %2)
  %4 = lshr i32 %3, 16
  ret i32 %4
}

; Canonicalize (srl (bswap x), 32) to (rotr (bswap x), 32) if the high 32 bits
; of %a are zero. This optimizes rev + lsr #32 to a single rev32.
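;
; Illustrative trace (values are for exposition, not checked by FileCheck):
; for x = 0x0000000012345678, bswap gives 0x7856341200000000 and lsr #32
; gives 0x0000000078563412, which equals rev32 of the original value: bytes
; swap within each 32-bit word, and the known-zero high word stays zero.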
define i64 @test_rev_x_srl32(i32 %a) {
; CHECK-LABEL: test_rev_x_srl32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    rev32 x0, x8
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev_x_srl32:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    // kill: def $w0 killed $w0 def $x0
; FALLBACK-NEXT:    ubfx x8, x0, #0, #32
; FALLBACK-NEXT:    rev x8, x8
; FALLBACK-NEXT:    lsr x0, x8, #32
; FALLBACK-NEXT:    ret
entry:
  %0 = zext i32 %a to i64
  %1 = tail call i64 @llvm.bswap.i64(i64 %0)
  %2 = lshr i64 %1, 32
  ret i64 %2
}

define i64 @test_rev_x_srl32_load(i32 *%a) {
; CHECK-LABEL: test_rev_x_srl32_load:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr w8, [x0]
; CHECK-NEXT:    rev32 x0, x8
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev_x_srl32_load:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    ldr w8, [x0]
; FALLBACK-NEXT:    rev x8, x8
; FALLBACK-NEXT:    lsr x0, x8, #32
; FALLBACK-NEXT:    ret
entry:
  %0 = load i32, i32 *%a
  %1 = zext i32 %0 to i64
  %2 = tail call i64 @llvm.bswap.i64(i64 %1)
  %3 = lshr i64 %2, 32
  ret i64 %3
}

define i64 @test_rev_x_srl32_shift(i64 %a) {
; CHECK-LABEL: test_rev_x_srl32_shift:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ubfx x8, x0, #2, #29
; CHECK-NEXT:    rev32 x0, x8
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev_x_srl32_shift:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    lsl x8, x0, #33
; FALLBACK-NEXT:    lsr x8, x8, #35
; FALLBACK-NEXT:    rev x8, x8
; FALLBACK-NEXT:    lsr x0, x8, #32
; FALLBACK-NEXT:    ret
entry:
  %0 = shl i64 %a, 33
  %1 = lshr i64 %0, 35
  %2 = tail call i64 @llvm.bswap.i64(i64 %1)
  %3 = lshr i64 %2, 32
  ret i64 %3
}

declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone

define i32 @test_rev16_w(i32 %X) nounwind {
; CHECK-LABEL: test_rev16_w:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev16 w0, w0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev16_w:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    lsr w8, w0, #8
; FALLBACK-NEXT:    lsl w9, w0, #8
; FALLBACK-NEXT:    and w10, w8, #0xff0000
; FALLBACK-NEXT:    and w11, w9, #0xff000000
; FALLBACK-NEXT:    and w9, w9, #0xff00
; FALLBACK-NEXT:    orr w10, w11, w10
; FALLBACK-NEXT:    and w8, w8, #0xff
; FALLBACK-NEXT:    orr w9, w10, w9
; FALLBACK-NEXT:    orr w0, w9, w8
; FALLBACK-NEXT:    ret
entry:
  %tmp1 = lshr i32 %X, 8
  %X15 = bitcast i32 %X to i32
  %tmp4 = shl i32 %X15, 8
  %tmp2 = and i32 %tmp1, 16711680
  %tmp5 = and i32 %tmp4, -16777216
  %tmp9 = and i32 %tmp1, 255
  %tmp13 = and i32 %tmp4, 65280
  %tmp6 = or i32 %tmp5, %tmp2
  %tmp10 = or i32 %tmp6, %tmp13
  %tmp14 = or i32 %tmp10, %tmp9
  ret i32 %tmp14
}

; 64-bit REV16 is *not* a bswap followed by a 16-bit rotation:
; 01234567 ->(bswap) 76543210 ->(rotr #16) 10765432
; 01234567 ->(rev16) 10325476
; so the (rotr (bswap x), 16) pattern below must stay as rev + ror and cannot
; be matched to a single rev16.
define i64 @test_rev16_x(i64 %a) nounwind {
; CHECK-LABEL: test_rev16_x:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev x8, x0
; CHECK-NEXT:    ror x0, x8, #16
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev16_x:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    rev x8, x0
; FALLBACK-NEXT:    lsl x9, x8, #48
; FALLBACK-NEXT:    orr x0, x9, x8, lsr #16
; FALLBACK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  %1 = lshr i64 %0, 16
  %2 = shl i64 %0, 48
  %3 = or i64 %1, %2
  ret i64 %3
}

define i64 @test_rev32_x(i64 %a) nounwind {
; CHECK-LABEL: test_rev32_x:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev32 x0, x0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_rev32_x:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    rev x8, x0
; FALLBACK-NEXT:    lsl x9, x8, #32
; FALLBACK-NEXT:    orr x0, x9, x8, lsr #32
; FALLBACK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  %1 = lshr i64 %0, 32
  %2 = shl i64 %0, 32
  %3 = or i64 %1, %2
  ret i64 %3
}

define <8 x i8> @test_vrev64D8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev64D8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.8b v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64D8:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr d0, [x0]
; FALLBACK-NEXT:    rev64.8b v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}

define <4 x i16> @test_vrev64D16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev64D16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.4h v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64D16:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr d0, [x0]
; FALLBACK-NEXT:    rev64.4h v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i16> %tmp2
}

define <2 x i32> @test_vrev64D32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: test_vrev64D32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.2s v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64D32:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr d0, [x0]
; FALLBACK-NEXT:    adrp x8, .LCPI13_0
; FALLBACK-NEXT:    ldr d1, [x8, :lo12:.LCPI13_0]
; FALLBACK-NEXT:    mov.s v2[1], w8
; FALLBACK-NEXT:    mov.d v0[1], v2[0]
; FALLBACK-NEXT:    tbl.16b v0, { v0 }, v1
; FALLBACK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; FALLBACK-NEXT:    ret
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i32> %tmp2
}

define <2 x float> @test_vrev64Df(<2 x float>* %A) nounwind {
; CHECK-LABEL: test_vrev64Df:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.2s v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64Df:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr d0, [x0]
; FALLBACK-NEXT:    adrp x8, .LCPI14_0
; FALLBACK-NEXT:    ldr d1, [x8, :lo12:.LCPI14_0]
; FALLBACK-NEXT:    mov.s v2[1], w8
; FALLBACK-NEXT:    mov.d v0[1], v2[0]
; FALLBACK-NEXT:    tbl.16b v0, { v0 }, v1
; FALLBACK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; FALLBACK-NEXT:    ret
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x float> %tmp2
}

define <16 x i8> @test_vrev64Q8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev64Q8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.16b v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64Q8:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr q0, [x0]
; FALLBACK-NEXT:    rev64.16b v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i8> %tmp2
}

define <8 x i16> @test_vrev64Q16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev64Q16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.8h v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64Q16:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr q0, [x0]
; FALLBACK-NEXT:    rev64.8h v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i16> %tmp2
}

define <4 x i32> @test_vrev64Q32(<4 x i32>* %A) nounwind {
; CHECK-LABEL: test_vrev64Q32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.4s v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64Q32:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    adrp x8, .LCPI17_0
; FALLBACK-NEXT:    ldr q0, [x0]
; FALLBACK-NEXT:    ldr q2, [x8, :lo12:.LCPI17_0]
; FALLBACK-NEXT:    tbl.16b v0, { v0, v1 }, v2
; FALLBACK-NEXT:    ret
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i32> %tmp2
}

define <4 x float> @test_vrev64Qf(<4 x float>* %A) nounwind {
; CHECK-LABEL: test_vrev64Qf:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.4s v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64Qf:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    adrp x8, .LCPI18_0
; FALLBACK-NEXT:    ldr q0, [x0]
; FALLBACK-NEXT:    ldr q2, [x8, :lo12:.LCPI18_0]
; FALLBACK-NEXT:    tbl.16b v0, { v0, v1 }, v2
; FALLBACK-NEXT:    ret
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x float> %tmp2
}

define <8 x i8> @test_vrev32D8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev32D8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev32.8b v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev32D8:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr d0, [x0]
; FALLBACK-NEXT:    rev32.8b v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i8> %tmp2
}

define <4 x i16> @test_vrev32D16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev32D16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev32.4h v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev32D16:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr d0, [x0]
; FALLBACK-NEXT:    rev32.4h v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i16> %tmp2
}

define <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev32Q8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev32.16b v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev32Q8:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr q0, [x0]
; FALLBACK-NEXT:    rev32.16b v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  ret <16 x i8> %tmp2
}

define <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev32Q16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev32.8h v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev32Q16:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr q0, [x0]
; FALLBACK-NEXT:    rev32.8h v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i16> %tmp2
}

define <8 x i8> @test_vrev16D8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev16D8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev16.8b v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev16D8:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr d0, [x0]
; FALLBACK-NEXT:    rev16.8b v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i8> %tmp2
}

define <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev16Q8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev16.16b v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev16Q8:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr q0, [x0]
; FALLBACK-NEXT:    rev16.16b v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
  ret <16 x i8> %tmp2
}

; Undef shuffle indices should not prevent matching to VREV:
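; a mask such as <7, undef, undef, 4, 3, 2, 1, 0> must still be recognized as
; a rev64 of the vector, since undef lanes are free to take any value.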

define <8 x i8> @test_vrev64D8_undef(<8 x i8>* %A) nounwind {
; CHECK-LABEL: test_vrev64D8_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.8b v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64D8_undef:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr d0, [x0]
; FALLBACK-NEXT:    rev64.8b v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 undef, i32 undef, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}

define <8 x i16> @test_vrev32Q16_undef(<8 x i16>* %A) nounwind {
; CHECK-LABEL: test_vrev32Q16_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev32.8h v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev32Q16_undef:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    ldr q0, [x0]
; FALLBACK-NEXT:    rev32.8h v0, v0
; FALLBACK-NEXT:    ret
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 undef>
  ret <8 x i16> %tmp2
}

; vrev <4 x i16> should use REV32 and not REV64.
define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
; CHECK-LABEL: test_vrev64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    add x8, x1, #2 // =2
; CHECK-NEXT:    st1.h { v0 }[5], [x8]
; CHECK-NEXT:    st1.h { v0 }[6], [x1]
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev64:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    ldr q0, [x0]
; FALLBACK-NEXT:    add x8, x1, #2 // =2
; FALLBACK-NEXT:    st1.h { v0 }[5], [x8]
; FALLBACK-NEXT:    st1.h { v0 }[6], [x1]
; FALLBACK-NEXT:    ret
entry:
  %0 = bitcast <4 x i16>* %source to <8 x i16>*
  %tmp2 = load <8 x i16>, <8 x i16>* %0, align 4
  %tmp3 = extractelement <8 x i16> %tmp2, i32 6
  %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
  %tmp9 = extractelement <8 x i16> %tmp2, i32 5
  %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
  store <2 x i16> %tmp11, <2 x i16>* %dst, align 4
  ret void
}

; Test vrev of float4.
define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest) nounwind noinline ssp {
; CHECK-LABEL: float_vrev64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    movi.2d v1, #0000000000000000
; CHECK-NEXT:    dup.4s v1, v1[0]
; CHECK-NEXT:    ext.16b v0, v0, v1, #12
; CHECK-NEXT:    rev64.4s v0, v0
; CHECK-NEXT:    str q0, [x1, #176]
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: float_vrev64:
; FALLBACK:       // %bb.0: // %entry
; FALLBACK-NEXT:    fmov s0, wzr
; FALLBACK-NEXT:    mov.s v0[1], v0[0]
; FALLBACK-NEXT:    mov.s v0[2], v0[0]
; FALLBACK-NEXT:    adrp x8, .LCPI28_0
; FALLBACK-NEXT:    mov.s v0[3], v0[0]
; FALLBACK-NEXT:    ldr q1, [x0]
; FALLBACK-NEXT:    ldr q2, [x8, :lo12:.LCPI28_0]
; FALLBACK-NEXT:    tbl.16b v0, { v0, v1 }, v2
; FALLBACK-NEXT:    str q0, [x1, #176]
; FALLBACK-NEXT:    ret
entry:
  %0 = bitcast float* %source to <4 x float>*
  %tmp2 = load <4 x float>, <4 x float>* %0, align 4
  %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
  %arrayidx8 = getelementptr inbounds <4 x float>, <4 x float>* %dest, i32 11
  store <4 x float> %tmp5, <4 x float>* %arrayidx8, align 4
  ret void
}

define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
; CHECK-LABEL: test_vrev32_bswap:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rev32.16b v0, v0
; CHECK-NEXT:    ret
;
; FALLBACK-LABEL: test_vrev32_bswap:
; FALLBACK:       // %bb.0:
; FALLBACK-NEXT:    rev32.16b v0, v0
; FALLBACK-NEXT:    ret
  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
  ret <4 x i32> %bswap
}

declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone