1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
5 target triple = "aarch64-unknown-linux-gnu"
7 ; REVB pattern for shuffle v32i8 -> v16i16
; Loads a <32 x i8> vector from %a, swaps the two bytes of every adjacent
; pair (mask 1,0,3,2,...; several of the trailing lanes are undef) and stores
; the result back to %a. The expected codegen is a single predicated "revb"
; operating on .h elements between an ld1b/st1b pair.
8 define void @test_revbv16i16(ptr %a) #0 {
9 ; CHECK-LABEL: test_revbv16i16:
11 ; CHECK-NEXT: ptrue p0.b, vl32
12 ; CHECK-NEXT: ptrue p1.h
13 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
14 ; CHECK-NEXT: revb z0.h, p1/m, z0.h
15 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
17 %tmp1 = load <32 x i8>, ptr %a
18 %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14, i32 17, i32 16, i32 19, i32 18, i32 21, i32 20, i32 23, i32 22, i32 undef, i32 24, i32 27, i32 undef, i32 29, i32 28, i32 undef, i32 undef>
19 store <32 x i8> %tmp2, ptr %a
23 ; REVB pattern for shuffle v32i8 -> v8i32
; Loads a <32 x i8> vector from %a, reverses the bytes inside every group of
; four (mask 3,2,1,0,7,6,5,4,...; some lanes in the last two groups are undef)
; and stores the result back to %a. The expected codegen is a single
; predicated "revb" operating on .s elements.
24 define void @test_revbv8i32(ptr %a) #0 {
25 ; CHECK-LABEL: test_revbv8i32:
27 ; CHECK-NEXT: ptrue p0.b, vl32
28 ; CHECK-NEXT: ptrue p1.s
29 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
30 ; CHECK-NEXT: revb z0.s, p1/m, z0.s
31 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
33 %tmp1 = load <32 x i8>, ptr %a
34 %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
35 store <32 x i8> %tmp2, ptr %a
39 ; REVB pattern for shuffle v32i8 -> v4i64
; Loads a <32 x i8> vector from %a, reverses the bytes inside every group of
; eight (mask 7,6,...,0,15,14,...,8,...; some lanes in the last group are
; undef) and stores the result back to %a. The expected codegen is a single
; predicated "revb" operating on .d elements.
40 define void @test_revbv4i64(ptr %a) #0 {
41 ; CHECK-LABEL: test_revbv4i64:
43 ; CHECK-NEXT: ptrue p0.b, vl32
44 ; CHECK-NEXT: ptrue p1.d
45 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
46 ; CHECK-NEXT: revb z0.d, p1/m, z0.d
47 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
49 %tmp1 = load <32 x i8>, ptr %a
50 %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 31, i32 30, i32 29, i32 undef, i32 27, i32 undef, i32 undef, i32 undef>
51 store <32 x i8> %tmp2, ptr %a
55 ; REVH pattern for shuffle v16i16 -> v8i32
; Loads a <16 x i16> vector from %a, swaps every adjacent pair of halfwords
; (mask 1,0,3,2,...) and stores the result back to %a. The expected codegen
; is a single predicated "revh" operating on .s elements.
56 define void @test_revhv8i32(ptr %a) #0 {
57 ; CHECK-LABEL: test_revhv8i32:
59 ; CHECK-NEXT: ptrue p0.h, vl16
60 ; CHECK-NEXT: ptrue p1.s
61 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
62 ; CHECK-NEXT: revh z0.s, p1/m, z0.s
63 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
65 %tmp1 = load <16 x i16>, ptr %a
66 %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
67 store <16 x i16> %tmp2, ptr %a
71 ; REVH pattern for shuffle v16f16 -> v8f32
; Floating-point counterpart of the i16 pair-swap test: loads <16 x half>
; from %a, swaps every adjacent pair of elements (mask 1,0,3,2,...) and
; stores the result back to %a. The expected codegen is the same single
; predicated "revh" on .s elements.
72 define void @test_revhv8f32(ptr %a) #0 {
73 ; CHECK-LABEL: test_revhv8f32:
75 ; CHECK-NEXT: ptrue p0.h, vl16
76 ; CHECK-NEXT: ptrue p1.s
77 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
78 ; CHECK-NEXT: revh z0.s, p1/m, z0.s
79 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
81 %tmp1 = load <16 x half>, ptr %a
82 %tmp2 = shufflevector <16 x half> %tmp1, <16 x half> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
83 store <16 x half> %tmp2, ptr %a
87 ; REVH pattern for shuffle v16i16 -> v4i64
; Loads a <16 x i16> vector from %a, reverses the halfwords inside every
; group of four (mask 3,2,1,0,7,6,5,4,...) and stores the result back to %a.
; The expected codegen is a single predicated "revh" operating on .d elements.
88 define void @test_revhv4i64(ptr %a) #0 {
89 ; CHECK-LABEL: test_revhv4i64:
91 ; CHECK-NEXT: ptrue p0.h, vl16
92 ; CHECK-NEXT: ptrue p1.d
93 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
94 ; CHECK-NEXT: revh z0.d, p1/m, z0.d
95 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
97 %tmp1 = load <16 x i16>, ptr %a
98 %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
99 store <16 x i16> %tmp2, ptr %a
103 ; REVW pattern for shuffle v8i32 -> v4i64
; Loads an <8 x i32> vector from %a, swaps every adjacent pair of words
; (mask 1,0,3,2,5,4,7,6) and stores the result back to %a. The expected
; codegen is a single predicated "revw" operating on .d elements.
104 define void @test_revwv4i64(ptr %a) #0 {
105 ; CHECK-LABEL: test_revwv4i64:
107 ; CHECK-NEXT: ptrue p0.s, vl8
108 ; CHECK-NEXT: ptrue p1.d
109 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
110 ; CHECK-NEXT: revw z0.d, p1/m, z0.d
111 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
113 %tmp1 = load <8 x i32>, ptr %a
114 %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
115 store <8 x i32> %tmp2, ptr %a
119 ; REVW pattern for shuffle v8f32 -> v4f64
; Floating-point counterpart of the i32 pair-swap test: loads <8 x float>
; from %a, swaps every adjacent pair of elements (mask 1,0,3,2,5,4,7,6) and
; stores the result back to %a. The expected codegen is the same single
; predicated "revw" on .d elements.
120 define void @test_revwv4f64(ptr %a) #0 {
121 ; CHECK-LABEL: test_revwv4f64:
123 ; CHECK-NEXT: ptrue p0.s, vl8
124 ; CHECK-NEXT: ptrue p1.d
125 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
126 ; CHECK-NEXT: revw z0.d, p1/m, z0.d
127 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
129 %tmp1 = load <8 x float>, ptr %a
130 %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
131 store <8 x float> %tmp2, ptr %a
135 ; Don't use SVE for 128-bit vectors
; Loads a <16 x i8> vector from %a and reverses the bytes inside each group
; of eight (mask 7..0, 15..8). The expected codegen contains no SVE
; instructions: a plain "ldr q0" followed by a NEON "rev64" on v0.16b.
136 define <16 x i8> @test_revv16i8(ptr %a) #0 {
137 ; CHECK-LABEL: test_revv16i8:
139 ; CHECK-NEXT: ldr q0, [x0]
140 ; CHECK-NEXT: rev64 v0.16b, v0.16b
142 %tmp1 = load <16 x i8>, ptr %a
143 %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
147 ; REVW pattern for shuffle two v8i32 inputs with the second input available.
; Loads <8 x i32> vectors from both %a and %b, but the shuffle mask only uses
; indices 8..15, i.e. lanes of the second operand (%tmp2, loaded from %b),
; swapped in adjacent pairs. The result is stored to %a. The expected codegen
; therefore loads only from x1, applies one "revw" on .d elements and stores
; to x0.
148 define void @test_revwv8i32v8i32(ptr %a, ptr %b) #0 {
149 ; CHECK-LABEL: test_revwv8i32v8i32:
151 ; CHECK-NEXT: ptrue p0.s, vl8
152 ; CHECK-NEXT: ptrue p1.d
153 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1]
154 ; CHECK-NEXT: revw z0.d, p1/m, z0.d
155 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
157 %tmp1 = load <8 x i32>, ptr %a
158 %tmp2 = load <8 x i32>, ptr %b
159 %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> <i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
160 store <8 x i32> %tmp3, ptr %a
164 ; REVH pattern for shuffle v32i16 with 256 bits and 512 bits SVE.
; Loads a <32 x i16> vector from %a, reverses the halfwords inside every
; group of four (some lanes in the last two groups are undef) and stores the
; result back to %a. This is the only test here whose expected output differs
; between the two llc invocations at the top of the file:
;  - with a 256-bit minimum the vector is processed as two halves, each with
;    its own ld1h / "revh" on .d elements / st1h (the upper half is addressed
;    via x8 = 16 scaled by lsl #1);
;  - with a 512-bit minimum a single ld1h / revh / st1h sequence is expected.
165 define void @test_revhv32i16(ptr %a) #0 {
166 ; VBITS_GE_256-LABEL: test_revhv32i16:
167 ; VBITS_GE_256: // %bb.0:
168 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
169 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
170 ; VBITS_GE_256-NEXT: ptrue p1.d
171 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
172 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
173 ; VBITS_GE_256-NEXT: revh z0.d, p1/m, z0.d
174 ; VBITS_GE_256-NEXT: revh z1.d, p1/m, z1.d
175 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
176 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
177 ; VBITS_GE_256-NEXT: ret
179 ; VBITS_GE_512-LABEL: test_revhv32i16:
180 ; VBITS_GE_512: // %bb.0:
181 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
182 ; VBITS_GE_512-NEXT: ptrue p1.d
183 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
184 ; VBITS_GE_512-NEXT: revh z0.d, p1/m, z0.d
185 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
186 ; VBITS_GE_512-NEXT: ret
187 %tmp1 = load <32 x i16>, ptr %a
188 %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
189 store <32 x i16> %tmp2, ptr %a
193 ; Only reversing bytes / halfwords / words within wider elements is supported
; Negative test: loads <4 x i64> from %a, swaps every adjacent pair of
; doublewords (mask 1,0,3,2) and stores the result back to %a. Uses attribute
; group #1 (+sve, vscale_range(2,2)). No rev* instruction is expected;
; instead the expected codegen sets up a frame with a 32-byte-aligned stack
; slot, extracts the four lanes into general-purpose registers, writes them
; to the slot in the shuffled order with two stp instructions, and reloads
; the slot with ld1d.
194 define void @test_rev_elts_fail(ptr %a) #1 {
195 ; CHECK-LABEL: test_rev_elts_fail:
197 ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
198 ; CHECK-NEXT: sub x9, sp, #48
199 ; CHECK-NEXT: mov x29, sp
200 ; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
201 ; CHECK-NEXT: .cfi_def_cfa w29, 16
202 ; CHECK-NEXT: .cfi_offset w30, -8
203 ; CHECK-NEXT: .cfi_offset w29, -16
204 ; CHECK-NEXT: ptrue p0.d
205 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
206 ; CHECK-NEXT: mov z1.d, z0.d[2]
207 ; CHECK-NEXT: mov z2.d, z0.d[3]
208 ; CHECK-NEXT: mov x9, v0.d[1]
209 ; CHECK-NEXT: fmov x8, d1
210 ; CHECK-NEXT: fmov x10, d2
211 ; CHECK-NEXT: stp x10, x8, [sp, #16]
212 ; CHECK-NEXT: fmov x8, d0
213 ; CHECK-NEXT: stp x9, x8, [sp]
214 ; CHECK-NEXT: mov x8, sp
215 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8]
216 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
217 ; CHECK-NEXT: mov sp, x29
218 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
220 %tmp1 = load <4 x i64>, ptr %a
221 %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
222 store <4 x i64> %tmp2, ptr %a
226 ; This is the same test as above, but with sve2p1 it can use the REVD instruction to reverse
227 ; the double-words within quad-words.
; Loads <4 x i64> from %a, swaps every adjacent pair of doublewords (mask
; 1,0,3,2) and stores the result back to %a. Uses attribute group #2
; (+sve2p1); the expected codegen is a single predicated "revd" on .q
; elements.
; NOTE(review): the expected revd here is governed by p0 (from
; "ptrue p0.d, vl4"), whereas the <4 x double> variant of this test expects a
; separate all-true "ptrue p1.d" - confirm this difference matches current llc
; output (regenerate with update_llc_test_checks.py if in doubt).
228 define void @test_revdv4i64_sve2p1(ptr %a) #2 {
229 ; CHECK-LABEL: test_revdv4i64_sve2p1:
231 ; CHECK-NEXT: ptrue p0.d, vl4
232 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
233 ; CHECK-NEXT: revd z0.q, p0/m, z0.q
234 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
236 %tmp1 = load <4 x i64>, ptr %a
237 %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
238 store <4 x i64> %tmp2, ptr %a
; REVD pattern for shuffle v4f64 with sve2p1: loads <4 x double> from %a,
; swaps every adjacent pair of elements (mask 1,0,3,2) and stores the result
; back to %a. Uses attribute group #2 (+sve2p1); the expected codegen is a
; single "revd" on .q elements governed by an all-true p1.d predicate.
242 define void @test_revdv4f64_sve2p1(ptr %a) #2 {
243 ; CHECK-LABEL: test_revdv4f64_sve2p1:
245 ; CHECK-NEXT: ptrue p0.d, vl4
246 ; CHECK-NEXT: ptrue p1.d
247 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
248 ; CHECK-NEXT: revd z0.q, p1/m, z0.q
249 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
251 %tmp1 = load <4 x double>, ptr %a
252 %tmp2 = shufflevector <4 x double> %tmp1, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
253 store <4 x double> %tmp2, ptr %a
257 ; REV instruction will reverse the order of all elements in the vector.
258 ; When the vector length and the target register size are inconsistent,
259 ; the correctness of generated REV instruction for shuffle pattern cannot be guaranteed.
261 ; sve-vector-bits-min=256, sve-vector-bits-max is not set, REV inst can't be generated.
; Negative test: loads <8 x i32> from %a, reverses the order of all eight
; elements (mask 7..0) and stores the result back to %a. Uses attribute group
; #0, which has no vscale_range. No "rev" instruction is expected; the
; expected codegen extracts each lane, writes the lanes to a 32-byte-aligned
; stack slot in reversed order with stp, and reloads the slot with ld1w.
262 define void @test_revv8i32(ptr %a) #0 {
263 ; CHECK-LABEL: test_revv8i32:
265 ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
266 ; CHECK-NEXT: sub x9, sp, #48
267 ; CHECK-NEXT: mov x29, sp
268 ; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
269 ; CHECK-NEXT: .cfi_def_cfa w29, 16
270 ; CHECK-NEXT: .cfi_offset w30, -8
271 ; CHECK-NEXT: .cfi_offset w29, -16
272 ; CHECK-NEXT: ptrue p0.s, vl8
273 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
274 ; CHECK-NEXT: mov w8, v0.s[1]
275 ; CHECK-NEXT: mov w9, v0.s[2]
276 ; CHECK-NEXT: mov w10, v0.s[3]
277 ; CHECK-NEXT: fmov w11, s0
278 ; CHECK-NEXT: mov z1.s, z0.s[4]
279 ; CHECK-NEXT: mov z2.s, z0.s[5]
280 ; CHECK-NEXT: mov z3.s, z0.s[6]
281 ; CHECK-NEXT: mov z0.s, z0.s[7]
282 ; CHECK-NEXT: stp w8, w11, [sp, #24]
283 ; CHECK-NEXT: fmov w8, s1
284 ; CHECK-NEXT: stp w10, w9, [sp, #16]
285 ; CHECK-NEXT: fmov w9, s2
286 ; CHECK-NEXT: stp w9, w8, [sp, #8]
287 ; CHECK-NEXT: fmov w8, s3
288 ; CHECK-NEXT: fmov w9, s0
289 ; CHECK-NEXT: stp w9, w8, [sp]
290 ; CHECK-NEXT: mov x8, sp
291 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
292 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
293 ; CHECK-NEXT: mov sp, x29
294 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
296 %tmp1 = load <8 x i32>, ptr %a
297 %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
298 store <8 x i32> %tmp2, ptr %a
302 ; REV pattern for v32i8 shuffle with vscale_range(2,2)
; Loads <32 x i8> from %a, reverses the order of all 32 elements (mask 31..0)
; and stores the result back to %a. Uses attribute group #1 (+sve,
; vscale_range(2,2)); the expected codegen is a single unpredicated "rev" on
; .b elements, with an all-true "ptrue p0.b" governing the load and store.
303 define void @test_revv32i8_vl256(ptr %a) #1 {
304 ; CHECK-LABEL: test_revv32i8_vl256:
306 ; CHECK-NEXT: ptrue p0.b
307 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
308 ; CHECK-NEXT: rev z0.b, z0.b
309 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
311 %tmp1 = load <32 x i8>, ptr %a
312 %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
313 store <32 x i8> %tmp2, ptr %a
317 ; REV pattern for v16i16 shuffle with vscale_range(2,2)
; Loads <16 x i16> from %a, reverses the order of all 16 elements and stores
; the result back to %a. The first mask lane is undef (the remaining lanes
; are 14..0), so this also covers a full-reverse mask with an undef entry.
; Uses attribute group #1; the expected codegen is a single unpredicated
; "rev" on .h elements.
318 define void @test_revv16i16_vl256(ptr %a) #1 {
319 ; CHECK-LABEL: test_revv16i16_vl256:
321 ; CHECK-NEXT: ptrue p0.h
322 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
323 ; CHECK-NEXT: rev z0.h, z0.h
324 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
326 %tmp1 = load <16 x i16>, ptr %a
327 %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 undef, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
328 store <16 x i16> %tmp2, ptr %a
332 ; REV pattern for v8f32 shuffle with vscale_range(2,2)
; Loads <8 x float> from %a, reverses the order of all eight elements (mask
; 7..0) and stores the result back to %a. Uses attribute group #1; the
; expected codegen is a single unpredicated "rev" on .s elements.
333 define void @test_revv8f32_vl256(ptr %a) #1 {
334 ; CHECK-LABEL: test_revv8f32_vl256:
336 ; CHECK-NEXT: ptrue p0.s
337 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
338 ; CHECK-NEXT: rev z0.s, z0.s
339 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
341 %tmp1 = load <8 x float>, ptr %a
342 %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
343 store <8 x float> %tmp2, ptr %a
347 ; REV pattern for v4f64 shuffle with vscale_range(2,2)
; Loads <4 x double> from %a, reverses the order of all four elements (mask
; 3,2,1,0) and stores the result back to %a. Uses attribute group #1; the
; expected codegen is a single unpredicated "rev" on .d elements.
348 define void @test_revv4f64_vl256(ptr %a) #1 {
349 ; CHECK-LABEL: test_revv4f64_vl256:
351 ; CHECK-NEXT: ptrue p0.d
352 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
353 ; CHECK-NEXT: rev z0.d, z0.d
354 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
356 %tmp1 = load <4 x double>, ptr %a
357 %tmp2 = shufflevector <4 x double> %tmp1, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
358 store <4 x double> %tmp2, ptr %a
362 ; REV pattern for shuffle two v8i32 inputs with the second input available, vscale_range(2,2).
; Loads <8 x i32> vectors from both %a and %b, but the shuffle mask only uses
; indices 15..8, i.e. all lanes of the second operand (%tmp2, loaded from %b)
; in reversed order. The result is stored to %a. Uses attribute group #1; the
; expected codegen loads only from x1, applies one unpredicated "rev" on .s
; elements and stores to x0.
363 define void @test_revv8i32v8i32(ptr %a, ptr %b) #1 {
364 ; CHECK-LABEL: test_revv8i32v8i32:
366 ; CHECK-NEXT: ptrue p0.s
367 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1]
368 ; CHECK-NEXT: rev z0.s, z0.s
369 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
371 %tmp1 = load <8 x i32>, ptr %a
372 %tmp2 = load <8 x i32>, ptr %b
373 %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> %tmp2, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
374 store <8 x i32> %tmp3, ptr %a
378 ; Illegal REV pattern.
; Negative test: loads <16 x i16> from %a and reverses each group of eight
; halfwords separately (mask 7..0, 15..8) rather than the whole vector, then
; stores the result back to %a. Uses attribute group #1. No rev* instruction
; is expected; the expected codegen extracts every lane individually, writes
; each one with strh to a 32-byte-aligned stack slot at its shuffled position,
; and reloads the slot with ld1h.
379 define void @test_rev_fail(ptr %a) #1 {
380 ; CHECK-LABEL: test_rev_fail:
382 ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
383 ; CHECK-NEXT: sub x9, sp, #48
384 ; CHECK-NEXT: mov x29, sp
385 ; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
386 ; CHECK-NEXT: .cfi_def_cfa w29, 16
387 ; CHECK-NEXT: .cfi_offset w30, -8
388 ; CHECK-NEXT: .cfi_offset w29, -16
389 ; CHECK-NEXT: ptrue p0.h
390 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
391 ; CHECK-NEXT: mov z1.h, z0.h[8]
392 ; CHECK-NEXT: fmov w8, s0
393 ; CHECK-NEXT: mov z2.h, z0.h[9]
394 ; CHECK-NEXT: mov z3.h, z0.h[10]
395 ; CHECK-NEXT: mov z4.h, z0.h[11]
396 ; CHECK-NEXT: strh w8, [sp, #14]
397 ; CHECK-NEXT: fmov w8, s1
398 ; CHECK-NEXT: mov z1.h, z0.h[12]
399 ; CHECK-NEXT: fmov w9, s2
400 ; CHECK-NEXT: mov z2.h, z0.h[13]
401 ; CHECK-NEXT: strh w8, [sp, #30]
402 ; CHECK-NEXT: fmov w8, s3
403 ; CHECK-NEXT: mov z3.h, z0.h[14]
404 ; CHECK-NEXT: strh w9, [sp, #28]
405 ; CHECK-NEXT: fmov w9, s4
406 ; CHECK-NEXT: mov z4.h, z0.h[15]
407 ; CHECK-NEXT: fmov w10, s2
408 ; CHECK-NEXT: strh w8, [sp, #26]
409 ; CHECK-NEXT: fmov w8, s1
410 ; CHECK-NEXT: fmov w11, s3
411 ; CHECK-NEXT: strh w9, [sp, #24]
412 ; CHECK-NEXT: umov w9, v0.h[1]
413 ; CHECK-NEXT: fmov w12, s4
414 ; CHECK-NEXT: strh w10, [sp, #20]
415 ; CHECK-NEXT: umov w10, v0.h[3]
416 ; CHECK-NEXT: strh w8, [sp, #22]
417 ; CHECK-NEXT: umov w8, v0.h[2]
418 ; CHECK-NEXT: strh w11, [sp, #18]
419 ; CHECK-NEXT: umov w11, v0.h[4]
420 ; CHECK-NEXT: strh w12, [sp, #16]
421 ; CHECK-NEXT: umov w12, v0.h[5]
422 ; CHECK-NEXT: strh w9, [sp, #12]
423 ; CHECK-NEXT: umov w9, v0.h[6]
424 ; CHECK-NEXT: strh w8, [sp, #10]
425 ; CHECK-NEXT: umov w8, v0.h[7]
426 ; CHECK-NEXT: strh w10, [sp, #8]
427 ; CHECK-NEXT: strh w11, [sp, #6]
428 ; CHECK-NEXT: strh w12, [sp, #4]
429 ; CHECK-NEXT: strh w9, [sp, #2]
430 ; CHECK-NEXT: strh w8, [sp]
431 ; CHECK-NEXT: mov x8, sp
432 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
433 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
434 ; CHECK-NEXT: mov sp, x29
435 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
437 %tmp1 = load <16 x i16>, ptr %a
438 %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
439 store <16 x i16> %tmp2, ptr %a
443 ; Don't use SVE for 128-bit shuffle with two inputs
; Loads two 128-bit <8 x i16> vectors from %a and %b and shuffles them into a
; single <16 x i16> result that is stored to %c. The mask reverses the
; halfwords inside every group of four across the concatenation of the two
; inputs (indices 0..7 select from %tmp1, 8..15 from %tmp2). Uses attribute
; group #1. In the expected codegen both inputs are loaded with plain
; "ldr q", the individual lanes are written to a 32-byte-aligned stack slot
; (st1 lane stores and two "str h"), and only the final reload of that slot
; and the store to x2 use SVE ld1h / st1h.
444 define void @test_revv8i16v8i16(ptr %a, ptr %b, ptr %c) #1 {
445 ; CHECK-LABEL: test_revv8i16v8i16:
447 ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
448 ; CHECK-NEXT: sub x9, sp, #48
449 ; CHECK-NEXT: mov x29, sp
450 ; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
451 ; CHECK-NEXT: .cfi_def_cfa w29, 16
452 ; CHECK-NEXT: .cfi_offset w30, -8
453 ; CHECK-NEXT: .cfi_offset w29, -16
454 ; CHECK-NEXT: mov x8, sp
455 ; CHECK-NEXT: ldr q0, [x1]
456 ; CHECK-NEXT: ldr q1, [x0]
457 ; CHECK-NEXT: orr x9, x8, #0x1e
458 ; CHECK-NEXT: orr x10, x8, #0x1c
459 ; CHECK-NEXT: ptrue p0.h
460 ; CHECK-NEXT: st1 { v0.h }[4], [x9]
461 ; CHECK-NEXT: orr x9, x8, #0x18
462 ; CHECK-NEXT: st1 { v0.h }[7], [x9]
463 ; CHECK-NEXT: orr x9, x8, #0xe
464 ; CHECK-NEXT: st1 { v1.h }[4], [x9]
465 ; CHECK-NEXT: orr x9, x8, #0xc
466 ; CHECK-NEXT: st1 { v1.h }[5], [x9]
467 ; CHECK-NEXT: orr x9, x8, #0x8
468 ; CHECK-NEXT: st1 { v0.h }[5], [x10]
469 ; CHECK-NEXT: orr x10, x8, #0x10
470 ; CHECK-NEXT: st1 { v1.h }[7], [x9]
471 ; CHECK-NEXT: orr x9, x8, #0x4
472 ; CHECK-NEXT: st1 { v0.h }[3], [x10]
473 ; CHECK-NEXT: mov w10, #26 // =0x1a
474 ; CHECK-NEXT: st1 { v1.h }[1], [x9]
475 ; CHECK-NEXT: orr x9, x8, #0x2
476 ; CHECK-NEXT: st1 { v1.h }[2], [x9]
477 ; CHECK-NEXT: orr x9, x8, x10
478 ; CHECK-NEXT: mov w10, #20 // =0x14
479 ; CHECK-NEXT: st1 { v0.h }[6], [x9]
480 ; CHECK-NEXT: orr x9, x8, x10
481 ; CHECK-NEXT: mov w10, #18 // =0x12
482 ; CHECK-NEXT: st1 { v0.h }[1], [x9]
483 ; CHECK-NEXT: orr x9, x8, x10
484 ; CHECK-NEXT: st1 { v0.h }[2], [x9]
485 ; CHECK-NEXT: mov w9, #10 // =0xa
486 ; CHECK-NEXT: orr x9, x8, x9
487 ; CHECK-NEXT: st1 { v1.h }[3], [x8]
488 ; CHECK-NEXT: st1 { v1.h }[6], [x9]
489 ; CHECK-NEXT: str h0, [sp, #22]
490 ; CHECK-NEXT: str h1, [sp, #6]
491 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
492 ; CHECK-NEXT: st1h { z0.h }, p0, [x2]
493 ; CHECK-NEXT: mov sp, x29
494 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
496 %tmp1 = load <8 x i16>, ptr %a
497 %tmp2 = load <8 x i16>, ptr %b
498 %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
499 store <16 x i16> %tmp3, ptr %c
; Attribute groups referenced by the test functions in this file:
;   #0 - SVE enabled, no vscale_range; the vector length comes only from the
;        -aarch64-sve-vector-bits-min option on the llc command lines.
;   #1 - SVE enabled with vscale_range(2,2), i.e. vscale is fixed at 2
;        (the functions using it are the "_vl256" and fallback tests).
;   #2 - the sve2p1 feature enabled, used by the REVD tests.
503 attributes #0 = { "target-features"="+sve" }
504 attributes #1 = { "target-features"="+sve" vscale_range(2,2) }
505 attributes #2 = { "target-features"="+sve2p1" }