; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm-eabi -mattr=+neon,+fullfp16 %s -o - | FileCheck %s
; Reverse all 8 bytes within the 64-bit vector: lowers to vrev64.8.
define <8 x i8> @test_vrev64D8(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64D8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vrev64.8 d16, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}
; Reverse the four i16 lanes within the 64-bit vector: lowers to vrev64.16.
define <4 x i16> @test_vrev64D16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64D16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vrev64.16 d16, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i16> %tmp2
}
; Same as the i16 case but with half lanes (+fullfp16): still vrev64.16.
define <4 x half> @test_vrev64Df16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Df16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vrev64.16 d16, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x half>, ptr %A
  %tmp2 = shufflevector <4 x half> %tmp1, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x half> %tmp2
}
; Swap the two i32 lanes within the 64-bit vector: lowers to vrev64.32.
define <2 x i32> @test_vrev64D32(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64D32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vrev64.32 d16, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <2 x i32>, ptr %A
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i32> %tmp2
}
; Swap the two float lanes within the 64-bit vector: lowers to vrev64.32.
define <2 x float> @test_vrev64Df(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Df:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vrev64.32 d16, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <2 x float>, ptr %A
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x float> %tmp2
}
; 128-bit variant: reverse bytes within each 64-bit half (vrev64.8 on a q reg).
define <16 x i8> @test_vrev64Q8(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Q8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vrev64.8 q8, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <16 x i8>, ptr %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i8> %tmp2
}
; 128-bit variant: reverse i16 lanes within each 64-bit half (vrev64.16 q).
define <8 x i16> @test_vrev64Q16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Q16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vrev64.16 q8, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i16> %tmp2
}
; 128-bit half-lane variant: same lowering as the i16 case (vrev64.16 q).
define <8 x half> @test_vrev64Qf16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Qf16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vrev64.16 q8, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x half>, ptr %A
  %tmp2 = shufflevector <8 x half> %tmp1, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x half> %tmp2
}
; 128-bit variant: swap i32 lanes within each 64-bit half (vrev64.32 q).
define <4 x i32> @test_vrev64Q32(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Q32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vrev64.32 q8, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i32>, ptr %A
  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i32> %tmp2
}
; 128-bit variant: swap float lanes within each 64-bit half (vrev64.32 q).
define <4 x float> @test_vrev64Qf(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Qf:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vrev64.32 q8, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x float>, ptr %A
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x float> %tmp2
}
; Reverse bytes within each 32-bit word: lowers to vrev32.8.
define <8 x i8> @test_vrev32D8(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32D8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vrev32.8 d16, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i8> %tmp2
}
; Swap i16 lanes within each 32-bit word: lowers to vrev32.16.
define <4 x i16> @test_vrev32D16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32D16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vrev32.16 d16, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i16> %tmp2
}
; Half-lane variant of vrev32: same lowering as i16 (vrev32.16).
define <4 x half> @test_vrev32Df16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32Df16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vrev32.16 d16, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x half>, ptr %A
  %tmp2 = shufflevector <4 x half> %tmp1, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x half> %tmp2
}
; 128-bit variant: reverse bytes within each 32-bit word (vrev32.8 q).
define <16 x i8> @test_vrev32Q8(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32Q8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vrev32.8 q8, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <16 x i8>, ptr %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  ret <16 x i8> %tmp2
}
; 128-bit variant: swap i16 lanes within each 32-bit word (vrev32.16 q).
define <8 x i16> @test_vrev32Q16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32Q16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vrev32.16 q8, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i16> %tmp2
}
; 128-bit half-lane variant: same lowering as the i16 case (vrev32.16 q).
define <8 x half> @test_vrev32Qf16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32Qf16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vrev32.16 q8, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x half>, ptr %A
  %tmp2 = shufflevector <8 x half> %tmp1, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x half> %tmp2
}
; Swap bytes within each 16-bit halfword: lowers to vrev16.8.
define <8 x i8> @test_vrev16D8(ptr %A) nounwind {
; CHECK-LABEL: test_vrev16D8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vrev16.8 d16, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i8> %tmp2
}
; 128-bit variant: swap bytes within each 16-bit halfword (vrev16.8 q).
define <16 x i8> @test_vrev16Q8(ptr %A) nounwind {
; CHECK-LABEL: test_vrev16Q8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vrev16.8 q8, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <16 x i8>, ptr %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
  ret <16 x i8> %tmp2
}
; Undef shuffle indices should not prevent matching to VREV:
; Mask with undef elements still matches a full vrev64.8.
define <8 x i8> @test_vrev64D8_undef(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64D8_undef:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vrev64.8 d16, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 undef, i32 undef, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}
; Mask with undef elements still matches a full vrev32.16.
define <8 x i16> @test_vrev32Q16_undef(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32Q16_undef:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vrev32.16 q8, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 undef>
  ret <8 x i16> %tmp2
}
; Half-lane variant with undef mask elements: still vrev32.16.
define <8 x half> @test_vrev32Qf16_undef(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32Qf16_undef:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vrev32.16 q8, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <8 x half>, ptr %A
  %tmp2 = shufflevector <8 x half> %tmp1, <8 x half> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 undef>
  ret <8 x half> %tmp2
}
; A vcombine feeding a VREV should not obscure things. Radar 8597007.
; vrev on the result of a (bitcast-built) vcombine: each half is reversed
; independently with per-d-register vrev64.32 instructions.
define void @test_with_vcombine(ptr %v) nounwind {
; CHECK-LABEL: test_with_vcombine:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: vadd.f32 d18, d17, d17
; CHECK-NEXT: vrev64.32 d16, d16
; CHECK-NEXT: vrev64.32 d17, d18
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT: mov pc, lr
  %tmp1 = load <4 x float>, ptr %v, align 16
  %tmp2 = bitcast <4 x float> %tmp1 to <2 x double>
  %tmp3 = extractelement <2 x double> %tmp2, i32 0
  %tmp4 = bitcast double %tmp3 to <2 x float>
  %tmp5 = extractelement <2 x double> %tmp2, i32 1
  %tmp6 = bitcast double %tmp5 to <2 x float>
  %tmp7 = fadd <2 x float> %tmp6, %tmp6
  %tmp8 = shufflevector <2 x float> %tmp4, <2 x float> %tmp7, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  store <4 x float> %tmp8, ptr %v, align 16
  ret void
}
; The type <2 x i16> is legalized to <2 x i32> and needs to be trunc-stored
; to <2 x i16> when stored to memory.
; Extract two adjacent i16 elements in reversed order and store them as
; <2 x i16>; the illegal type is widened and then trunc-stored.
define void @test_vrev64(ptr nocapture %source, ptr nocapture %dst) nounwind ssp {
; CHECK-LABEL: test_vrev64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vld1.32 {d16, d17}, [r0]
; CHECK-NEXT: vmov.u16 r0, d17[2]
; CHECK-NEXT: vmov.u16 r2, d17[1]
; CHECK-NEXT: vmov.32 d16[0], r0
; CHECK-NEXT: vmov.32 d16[1], r2
; CHECK-NEXT: vuzp.16 d16, d17
; CHECK-NEXT: vst1.32 {d16[0]}, [r1:32]
; CHECK-NEXT: mov pc, lr
entry:
  %tmp2 = load <8 x i16>, ptr %source, align 4
  %tmp3 = extractelement <8 x i16> %tmp2, i32 6
  %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
  %tmp9 = extractelement <8 x i16> %tmp2, i32 5
  %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
  store <2 x i16> %tmp11, ptr %dst, align 4
  ret void
}
; Test vrev of float4
; vrev of a float4 built from a zero splat and one loaded element:
; lowered as vext + vrev64.32 on the q register.
define void @float_vrev64(ptr nocapture %source, ptr nocapture %dest) nounwind noinline ssp {
; CHECK-LABEL: float_vrev64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q8, #0x0
; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
; CHECK-NEXT: add r0, r1, #176
; CHECK-NEXT: vext.32 q8, q9, q8, #3
; CHECK-NEXT: vrev64.32 q8, q8
; CHECK-NEXT: vst1.32 {d16, d17}, [r0]
; CHECK-NEXT: mov pc, lr
entry:
  %tmp2 = load <4 x float>, ptr %source, align 4
  %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
  %arrayidx8 = getelementptr inbounds <4 x float>, ptr %dest, i32 11
  store <4 x float> %tmp5, ptr %arrayidx8, align 4
  ret void
}
; bswap of i32 lanes is lowered to vrev32.8 (byte reversal within each word).
define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
; CHECK-LABEL: test_vrev32_bswap:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d17, r2, r3
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vrev32.8 q8, q8
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
  ret <4 x i32> %bswap
}

declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone