; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false -mcpu=cyclone | FileCheck %s
; RUN: llc < %s -global-isel -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false -mcpu=cyclone | FileCheck %s

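; Check that the llvm.aarch64.neon.{s,u}addv and faddv reduction intrinsics
; lower to ADDV/ADDP/FADDP, and that a reduction whose only use is a lane
; insert stays in the vector register file.
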
define signext i8 @test_vaddv_s8(<8 x i8> %a1) {
; CHECK-LABEL: test_vaddv_s8:
; CHECK: addv.8b b[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a1)
  %0 = trunc i32 %vaddv.i to i8
  ret i8 %0
}

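; In the *_used_by_laneop tests the reduction only feeds an insertelement, so
; the value should stay in the vector register file and be inserted with INS
; (mov.b/mov.h/mov.s/mov.d) instead of bouncing through a GPR.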
define <8 x i8> @test_vaddv_s8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) {
; CHECK-LABEL: test_vaddv_s8_used_by_laneop:
; CHECK: addv.8b b[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.b v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a2)
  %1 = trunc i32 %0 to i8
  %2 = insertelement <8 x i8> %a1, i8 %1, i32 3
  ret <8 x i8> %2
}

define signext i16 @test_vaddv_s16(<4 x i16> %a1) {
; CHECK-LABEL: test_vaddv_s16:
; CHECK: addv.4h h[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a1)
  %0 = trunc i32 %vaddv.i to i16
  ret i16 %0
}

define <4 x i16> @test_vaddv_s16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) {
; CHECK-LABEL: test_vaddv_s16_used_by_laneop:
; CHECK: addv.4h h[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.h v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a2)
  %1 = trunc i32 %0 to i16
  %2 = insertelement <4 x i16> %a1, i16 %1, i32 3
  ret <4 x i16> %2
}

define i32 @test_vaddv_s32(<2 x i32> %a1) {
; CHECK-LABEL: test_vaddv_s32:
; A 2 x i32 ADDV is not supported by the ISA, so this case is lowered to ADDP instead.
; CHECK: addp.2s v[[REGNUM:[0-9]+]], v0, v0
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a1)
  ret i32 %vaddv.i
}

define <2 x i32> @test_vaddv_s32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) {
; CHECK-LABEL: test_vaddv_s32_used_by_laneop:
; CHECK: addp.2s v[[REGNUM:[0-9]+]], v1, v1
; CHECK-NEXT: mov.s v0[1], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a2)
  %1 = insertelement <2 x i32> %a1, i32 %0, i32 1
  ret <2 x i32> %1
}

define i64 @test_vaddv_s64(<2 x i64> %a1) {
; CHECK-LABEL: test_vaddv_s64:
; CHECK: addp.2d [[REGNUM:d[0-9]+]], v0
; CHECK-NEXT: fmov x0, [[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a1)
  ret i64 %vaddv.i
}

define <2 x i64> @test_vaddv_s64_used_by_laneop(<2 x i64> %a1, <2 x i64> %a2) {
; CHECK-LABEL: test_vaddv_s64_used_by_laneop:
; CHECK: addp.2d d[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.d v0[1], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a2)
  %1 = insertelement <2 x i64> %a1, i64 %0, i64 1
  ret <2 x i64> %1
}

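; The unsigned reductions return zeroext values, so a plain fmov from the s
; register is enough; no sign-extending smov is needed.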
define zeroext i8 @test_vaddv_u8(<8 x i8> %a1) {
; CHECK-LABEL: test_vaddv_u8:
; CHECK: addv.8b b[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
  %0 = trunc i32 %vaddv.i to i8
  ret i8 %0
}

define <8 x i8> @test_vaddv_u8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) {
; CHECK-LABEL: test_vaddv_u8_used_by_laneop:
; CHECK: addv.8b b[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.b v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a2)
  %1 = trunc i32 %0 to i8
  %2 = insertelement <8 x i8> %a1, i8 %1, i32 3
  ret <8 x i8> %2
}

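; The ADDV.8B/ADDV.4H result is only 8/16 bits wide and is read back through a
; zero-extending fmov, so an AND whose mask covers all of those bits (0x1ff
; here, 0x31ffff below) is redundant and should fold away.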
define i32 @test_vaddv_u8_masked(<8 x i8> %a1) {
; CHECK-LABEL: test_vaddv_u8_masked:
; CHECK: addv.8b b[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
  %0 = and i32 %vaddv.i, 511 ; 0x1ff
  ret i32 %0
}

define zeroext i16 @test_vaddv_u16(<4 x i16> %a1) {
; CHECK-LABEL: test_vaddv_u16:
; CHECK: addv.4h h[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
  %0 = trunc i32 %vaddv.i to i16
  ret i16 %0
}

define <4 x i16> @test_vaddv_u16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) {
; CHECK-LABEL: test_vaddv_u16_used_by_laneop:
; CHECK: addv.4h h[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.h v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a2)
  %1 = trunc i32 %0 to i16
  %2 = insertelement <4 x i16> %a1, i16 %1, i32 3
  ret <4 x i16> %2
}

define i32 @test_vaddv_u16_masked(<4 x i16> %a1) {
; CHECK-LABEL: test_vaddv_u16_masked:
; CHECK: addv.4h h[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
  %0 = and i32 %vaddv.i, 3276799 ; 0x31ffff
  ret i32 %0
}

define i32 @test_vaddv_u32(<2 x i32> %a1) {
; CHECK-LABEL: test_vaddv_u32:
; A 2 x i32 ADDV is not supported by the ISA, so this case is lowered to ADDP instead.
; CHECK: addp.2s v[[REGNUM:[0-9]+]], v0, v0
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a1)
  ret i32 %vaddv.i
}

define <2 x i32> @test_vaddv_u32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) {
; CHECK-LABEL: test_vaddv_u32_used_by_laneop:
; CHECK: addp.2s v[[REGNUM:[0-9]+]], v1, v1
; CHECK-NEXT: mov.s v0[1], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a2)
  %1 = insertelement <2 x i32> %a1, i32 %0, i32 1
  ret <2 x i32> %1
}

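; Floating-point reductions have no single ADDV-style instruction and lower to
; pairwise FADDP; the 4s case needs two of them.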
define float @test_vaddv_f32(<2 x float> %a1) {
; CHECK-LABEL: test_vaddv_f32:
; CHECK: faddp.2s s0, v0
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a1)
  ret float %vaddv.i
}

define float @test_vaddv_v4f32(<4 x float> %a1) {
; CHECK-LABEL: test_vaddv_v4f32:
; CHECK: faddp.4s [[REGNUM:v[0-9]+]], v0, v0
; CHECK: faddp.2s s0, [[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a1)
  ret float %vaddv.i
}

define double @test_vaddv_f64(<2 x double> %a1) {
; CHECK-LABEL: test_vaddv_f64:
; CHECK: faddp.2d d0, v0
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a1)
  ret double %vaddv.i
}

define i64 @test_vaddv_u64(<2 x i64> %a1) {
; CHECK-LABEL: test_vaddv_u64:
; CHECK: addp.2d [[REGNUM:d[0-9]+]], v0
; CHECK-NEXT: fmov x0, [[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
  ret i64 %vaddv.i
}

define <2 x i64> @test_vaddv_u64_used_by_laneop(<2 x i64> %a1, <2 x i64> %a2) {
; CHECK-LABEL: test_vaddv_u64_used_by_laneop:
; CHECK: addp.2d d[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.d v0[1], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a2)
  %1 = insertelement <2 x i64> %a1, i64 %0, i64 1
  ret <2 x i64> %1
}

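; When the reduction result is placed into a <1 x i64>, it is already in d0
; after the ADDP, so no fmov or lane insert should be emitted.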
define <1 x i64> @test_vaddv_u64_to_vec(<2 x i64> %a1) {
; CHECK-LABEL: test_vaddv_u64_to_vec:
; CHECK: addp.2d d0, v0
; CHECK-NOT: fmov
; CHECK-NOT: ins
; CHECK: ret
entry:
  %vaddv.i = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
  %vec = insertelement <1 x i64> undef, i64 %vaddv.i, i32 0
  ret <1 x i64> %vec
}

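; The vaddvq_* variants below repeat the same patterns for 128-bit source
; vectors (16b, 8h, 4s).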
define signext i8 @test_vaddvq_s8(<16 x i8> %a1) {
; CHECK-LABEL: test_vaddvq_s8:
; CHECK: addv.16b b[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a1)
  %0 = trunc i32 %vaddv.i to i8
  ret i8 %0
}

define <16 x i8> @test_vaddvq_s8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) {
; CHECK-LABEL: test_vaddvq_s8_used_by_laneop:
; CHECK: addv.16b b[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.b v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a2)
  %1 = trunc i32 %0 to i8
  %2 = insertelement <16 x i8> %a1, i8 %1, i32 3
  ret <16 x i8> %2
}

define signext i16 @test_vaddvq_s16(<8 x i16> %a1) {
; CHECK-LABEL: test_vaddvq_s16:
; CHECK: addv.8h h[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a1)
  %0 = trunc i32 %vaddv.i to i16
  ret i16 %0
}

define <8 x i16> @test_vaddvq_s16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: test_vaddvq_s16_used_by_laneop:
; CHECK: addv.8h h[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.h v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a2)
  %1 = trunc i32 %0 to i16
  %2 = insertelement <8 x i16> %a1, i16 %1, i32 3
  ret <8 x i16> %2
}

define i32 @test_vaddvq_s32(<4 x i32> %a1) {
; CHECK-LABEL: test_vaddvq_s32:
; CHECK: addv.4s [[REGNUM:s[0-9]+]], v0
; CHECK-NEXT: fmov w0, [[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a1)
  ret i32 %vaddv.i
}

define <4 x i32> @test_vaddvq_s32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: test_vaddvq_s32_used_by_laneop:
; CHECK: addv.4s s[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.s v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a2)
  %1 = insertelement <4 x i32> %a1, i32 %0, i32 3
  ret <4 x i32> %1
}

define zeroext i8 @test_vaddvq_u8(<16 x i8> %a1) {
; CHECK-LABEL: test_vaddvq_u8:
; CHECK: addv.16b b[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a1)
  %0 = trunc i32 %vaddv.i to i8
  ret i8 %0
}

define <16 x i8> @test_vaddvq_u8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) {
; CHECK-LABEL: test_vaddvq_u8_used_by_laneop:
; CHECK: addv.16b b[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.b v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a2)
  %1 = trunc i32 %0 to i8
  %2 = insertelement <16 x i8> %a1, i8 %1, i32 3
  ret <16 x i8> %2
}

define zeroext i16 @test_vaddvq_u16(<8 x i16> %a1) {
; CHECK-LABEL: test_vaddvq_u16:
; CHECK: addv.8h h[[REGNUM:[0-9]+]], v0
; CHECK-NEXT: fmov w0, s[[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> %a1)
  %0 = trunc i32 %vaddv.i to i16
  ret i16 %0
}

define <8 x i16> @test_vaddvq_u16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: test_vaddvq_u16_used_by_laneop:
; CHECK: addv.8h h[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.h v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> %a2)
  %1 = trunc i32 %0 to i16
  %2 = insertelement <8 x i16> %a1, i16 %1, i32 3
  ret <8 x i16> %2
}

define i32 @test_vaddvq_u32(<4 x i32> %a1) {
; CHECK-LABEL: test_vaddvq_u32:
; CHECK: addv.4s [[REGNUM:s[0-9]+]], v0
; CHECK-NEXT: fmov [[FMOVRES:w[0-9]+]], [[REGNUM]]
; CHECK-NEXT: ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> %a1)
  ret i32 %vaddv.i
}

define <4 x i32> @test_vaddvq_u32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: test_vaddvq_u32_used_by_laneop:
; CHECK: addv.4s s[[REGNUM:[0-9]+]], v1
; CHECK-NEXT: mov.s v0[3], v[[REGNUM]][0]
; CHECK-NEXT: ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> %a2)
  %1 = insertelement <4 x i32> %a1, i32 %0, i32 3
  ret <4 x i32> %1
}

declare i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32>)

declare i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16>)

declare i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8>)

declare i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32>)

declare i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16>)

declare i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8>)

declare i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64>)

declare i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32>)

declare i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16>)

declare i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8>)

declare i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32>)

declare i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64>)

declare i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16>)

declare i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8>)

declare float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a1)
declare float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a1)
declare double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a1)