; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s
; Unsigned add-long-across-vector reduction intrinsics used by the tests below.
declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8>) #0
declare i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8>) #0
declare i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16>) #0
declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>) #0
declare i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32>) #0
; uaddlv.8h result into lane 0 of a zero <2 x i32>, uitofp, store 8 bytes.
; The insert stays on the SIMD side (mov.s lane copy, no GPR round trip).
define void @insert_vec_v2i32_uaddlv_from_v8i16(ptr %0) {
; CHECK-LABEL: insert_vec_v2i32_uaddlv_from_v8i16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    movi.2d v1, #0000000000000000
; CHECK-NEXT:    uaddlv.8h s0, v0
; CHECK-NEXT:    mov.s v1[0], v0[0]
; CHECK-NEXT:    ucvtf.2s v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> zeroinitializer)
  %1 = insertelement <2 x i32> zeroinitializer, i32 %vaddlv, i64 0
  %2 = uitofp <2 x i32> %1 to <2 x float>
  store <2 x float> %2, ptr %0, align 8
  ret void
}
; uaddlv.8h result into lane 0 of a zero <4 x i32>, uitofp, store 16 bytes.
define void @insert_vec_v4i32_uaddlv_from_v8i16(ptr %0) {
; CHECK-LABEL: insert_vec_v4i32_uaddlv_from_v8i16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    uaddlv.8h s1, v0
; CHECK-NEXT:    mov.s v0[0], v1[0]
; CHECK-NEXT:    ucvtf.4s v0, v0
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> zeroinitializer)
  %1 = insertelement <4 x i32> zeroinitializer, i32 %vaddlv, i64 0
  %2 = uitofp <4 x i32> %1 to <4 x float>
  store <4 x float> %2, ptr %0, align 8
  ret void
}
; uaddlv.8h result into lane 0 of a zero <16 x i32>; only the low 16 bytes need
; the converted value, the remaining 48 bytes are stored as zeros.
define void @insert_vec_v16i32_uaddlv_from_v8i16(ptr %0) {
; CHECK-LABEL: insert_vec_v16i32_uaddlv_from_v8i16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    movi.2d v2, #0000000000000000
; CHECK-NEXT:    uaddlv.8h s1, v0
; CHECK-NEXT:    stp q0, q0, [x0, #32]
; CHECK-NEXT:    mov.s v2[0], v1[0]
; CHECK-NEXT:    ucvtf.4s v1, v2
; CHECK-NEXT:    stp q1, q0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> zeroinitializer)
  %1 = insertelement <16 x i32> zeroinitializer, i32 %vaddlv, i64 0
  %2 = uitofp <16 x i32> %1 to <16 x float>
  store <16 x float> %2, ptr %0, align 8
  ret void
}
; Odd-sized <23 x i32> destination: tail is stored with scalar/d-register
; zero stores (str wzr / str d0) around the q-register zero pairs.
define void @insert_vec_v23i32_uaddlv_from_v8i16(ptr %0) {
; CHECK-LABEL: insert_vec_v23i32_uaddlv_from_v8i16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    movi.2d v2, #0000000000000000
; CHECK-NEXT:    str wzr, [x0, #88]
; CHECK-NEXT:    uaddlv.8h s1, v0
; CHECK-NEXT:    stp q0, q0, [x0, #16]
; CHECK-NEXT:    stp q0, q0, [x0, #48]
; CHECK-NEXT:    str d0, [x0, #80]
; CHECK-NEXT:    mov.s v2[0], v1[0]
; CHECK-NEXT:    ucvtf.4s v1, v2
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> zeroinitializer)
  %1 = insertelement <23 x i32> zeroinitializer, i32 %vaddlv, i64 0
  %2 = uitofp <23 x i32> %1 to <23 x float>
  store <23 x float> %2, ptr %0, align 8
  ret void
}
; uaddlv.16b (h-register result) into lane 0 of a zero <2 x i32>.
define void @insert_vec_v2i32_uaddlv_from_v16i8(ptr %0) {
; CHECK-LABEL: insert_vec_v2i32_uaddlv_from_v16i8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    movi.2d v1, #0000000000000000
; CHECK-NEXT:    uaddlv.16b h0, v0
; CHECK-NEXT:    mov.s v1[0], v0[0]
; CHECK-NEXT:    ucvtf.2s v0, v1
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> zeroinitializer)
  %1 = insertelement <2 x i32> zeroinitializer, i32 %vaddlv, i64 0
  %2 = uitofp <2 x i32> %1 to <2 x float>
  store <2 x float> %2, ptr %0, align 8
  ret void
}
; uaddlv.8b (h-register result) into lane 0 of a zero <2 x i32>.
define void @insert_vec_v2i32_uaddlv_from_v8i8(ptr %0) {
; CHECK-LABEL: insert_vec_v2i32_uaddlv_from_v8i8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    uaddlv.8b h1, v0
; CHECK-NEXT:    mov.s v0[0], v1[0]
; CHECK-NEXT:    ucvtf.2s v0, v0
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> zeroinitializer)
  %1 = insertelement <2 x i32> zeroinitializer, i32 %vaddlv, i64 0
  %2 = uitofp <2 x i32> %1 to <2 x float>
  store <2 x float> %2, ptr %0, align 8
  ret void
}
; uaddlv.4h result into lane 0 of a zero <2 x i32>.
define void @insert_vec_v2i32_uaddlv_from_v4i16(ptr %0) {
; CHECK-LABEL: insert_vec_v2i32_uaddlv_from_v4i16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    uaddlv.4h s1, v0
; CHECK-NEXT:    mov.s v0[0], v1[0]
; CHECK-NEXT:    ucvtf.2s v0, v0
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> zeroinitializer)
  %1 = insertelement <2 x i32> zeroinitializer, i32 %vaddlv, i64 0
  %2 = uitofp <2 x i32> %1 to <2 x float>
  store <2 x float> %2, ptr %0, align 8
  ret void
}
; i64 uaddlv.4s result into a zero <6 x i64>, converted to <6 x float>:
; ucvtf.2d + fcvtn narrows the low pair, zero tail stored via d-register.
define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) {
; CHECK-LABEL: insert_vec_v6i64_uaddlv_from_v4i32:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    uaddlv.4s d1, v0
; CHECK-NEXT:    mov.d v0[0], v1[0]
; CHECK-NEXT:    movi.2d v1, #0000000000000000
; CHECK-NEXT:    ucvtf.2d v0, v0
; CHECK-NEXT:    str d1, [x0, #16]
; CHECK-NEXT:    fcvtn v0.2s, v0.2d
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> zeroinitializer)
  %1 = insertelement <6 x i64> zeroinitializer, i64 %vaddlv, i64 0
  %2 = uitofp <6 x i64> %1 to <6 x float>
  store <6 x float> %2, ptr %0, align 8
  ret void
}
; i64 uaddlv.4s result into a zero <2 x i64>, converted to <2 x float>.
define void @insert_vec_v2i64_uaddlv_from_v4i32(ptr %0) {
; CHECK-LABEL: insert_vec_v2i64_uaddlv_from_v4i32:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    uaddlv.4s d1, v0
; CHECK-NEXT:    mov.d v0[0], v1[0]
; CHECK-NEXT:    ucvtf.2d v0, v0
; CHECK-NEXT:    fcvtn v0.2s, v0.2d
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> zeroinitializer)
  %1 = insertelement <2 x i64> zeroinitializer, i64 %vaddlv, i64 0
  %2 = uitofp <2 x i64> %1 to <2 x float>
  store <2 x float> %2, ptr %0, align 8
  ret void
}
; <5 x i64> destination: fifth zero float is stored separately via str wzr.
define void @insert_vec_v5i64_uaddlv_from_v4i32(ptr %0) {
; CHECK-LABEL: insert_vec_v5i64_uaddlv_from_v4i32:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    str wzr, [x0, #16]
; CHECK-NEXT:    uaddlv.4s d1, v0
; CHECK-NEXT:    mov.d v0[0], v1[0]
; CHECK-NEXT:    ucvtf.2d v0, v0
; CHECK-NEXT:    fcvtn v0.2s, v0.2d
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> zeroinitializer)
  %1 = insertelement <5 x i64> zeroinitializer, i64 %vaddlv, i64 0
  %2 = uitofp <5 x i64> %1 to <5 x float>
  store <5 x float> %2, ptr %0, align 8
  ret void
}
; Truncated (i16) uaddlv result into a zero <8 x i16>; the i16 lanes are
; widened with ushll before the unsigned convert.
define void @insert_vec_v8i16_uaddlv_from_v8i16(ptr %0) {
; CHECK-LABEL: insert_vec_v8i16_uaddlv_from_v8i16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    movi.2d v1, #0000000000000000
; CHECK-NEXT:    stp xzr, xzr, [x0, #16]
; CHECK-NEXT:    uaddlv.8h s0, v0
; CHECK-NEXT:    mov.h v1[0], v0[0]
; CHECK-NEXT:    ushll.4s v1, v1, #0
; CHECK-NEXT:    ucvtf.4s v1, v1
; CHECK-NEXT:    str q1, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> zeroinitializer)
  %1 = trunc i32 %vaddlv to i16
  %2 = insertelement <8 x i16> zeroinitializer, i16 %1, i64 0
  %3 = uitofp <8 x i16> %2 to <8 x float>
  store <8 x float> %3, ptr %0, align 8
  ret void
}
; <3 x i16> destination: 12-byte store split into str d + st1 of lane 2.
define void @insert_vec_v3i16_uaddlv_from_v8i16(ptr %0) {
; CHECK-LABEL: insert_vec_v3i16_uaddlv_from_v8i16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    movi.2d v1, #0000000000000000
; CHECK-NEXT:    add x8, x0, #8
; CHECK-NEXT:    uaddlv.8h s0, v0
; CHECK-NEXT:    mov.h v1[0], v0[0]
; CHECK-NEXT:    ushll.4s v1, v1, #0
; CHECK-NEXT:    ucvtf.4s v1, v1
; CHECK-NEXT:    st1.s { v1 }[2], [x8]
; CHECK-NEXT:    str d1, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> zeroinitializer)
  %1 = trunc i32 %vaddlv to i16
  %2 = insertelement <3 x i16> zeroinitializer, i16 %1, i64 0
  %3 = uitofp <3 x i16> %2 to <3 x float>
  store <3 x float> %3, ptr %0, align 8
  ret void
}
; zext'd (i64) uaddlv.4h result into a zero <16 x i64>, converted to floats;
; only the low pair needs ucvtf.2d + fcvtn, the rest is stored as zeros.
define void @insert_vec_v16i64_uaddlv_from_v4i16(ptr %0) {
; CHECK-LABEL: insert_vec_v16i64_uaddlv_from_v4i16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    movi.2d v2, #0000000000000000
; CHECK-NEXT:    uaddlv.4h s1, v0
; CHECK-NEXT:    stp q0, q0, [x0, #32]
; CHECK-NEXT:    mov.s v2[0], v1[0]
; CHECK-NEXT:    ucvtf.2d v1, v2
; CHECK-NEXT:    fcvtn v1.2s, v1.2d
; CHECK-NEXT:    stp q1, q0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> zeroinitializer)
  %1 = zext i32 %vaddlv to i64
  %2 = insertelement <16 x i64> zeroinitializer, i64 %1, i64 0
  %3 = uitofp <16 x i64> %2 to <16 x float>
  store <16 x float> %3, ptr %0, align 8
  ret void
}
; Truncated (i8) uaddlv.8b result into a zero <16 x i8>; bic masks the lane
; back to 8 bits before widening and converting.
define void @insert_vec_v16i8_uaddlv_from_v8i8(ptr %0) {
; CHECK-LABEL: insert_vec_v16i8_uaddlv_from_v8i8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    movi.2d v2, #0000000000000000
; CHECK-NEXT:    uaddlv.8b h1, v0
; CHECK-NEXT:    stp q0, q0, [x0, #32]
; CHECK-NEXT:    mov.h v2[0], v1[0]
; CHECK-NEXT:    bic.4h v2, #255, lsl #8
; CHECK-NEXT:    ushll.4s v2, v2, #0
; CHECK-NEXT:    ucvtf.4s v2, v2
; CHECK-NEXT:    stp q2, q0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> zeroinitializer)
  %1 = trunc i32 %vaddlv to i8
  %2 = insertelement <16 x i8> zeroinitializer, i8 %1, i64 0
  %3 = uitofp <16 x i8> %2 to <16 x float>
  store <16 x float> %3, ptr %0, align 8
  ret void
}
; Same shape as the v16i8 case, but the mask is only `bic #7, lsl #8`:
; a uaddlv.8b sum fits in 11 bits, so bits 11-15 are already known zero.
define void @insert_vec_v8i8_uaddlv_from_v8i8(ptr %0) {
; CHECK-LABEL: insert_vec_v8i8_uaddlv_from_v8i8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    stp xzr, xzr, [x0, #16]
; CHECK-NEXT:    uaddlv.8b h1, v0
; CHECK-NEXT:    mov.h v0[0], v1[0]
; CHECK-NEXT:    bic.4h v0, #7, lsl #8
; CHECK-NEXT:    ushll.4s v0, v0, #0
; CHECK-NEXT:    ucvtf.4s v0, v0
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> zeroinitializer)
  %1 = trunc i32 %vaddlv to i8
  %2 = insertelement <8 x i8> zeroinitializer, i8 %1, i64 0
  %3 = uitofp <8 x i8> %2 to <8 x float>
  store <8 x float> %3, ptr %0, align 8
  ret void
}
; Truncated (i16) uaddlv.4h result into a zero <12 x i16>; zero tail stored
; with xzr pairs.
define void @insert_vec_v12i16_uaddlv_from_v4i16(ptr %0) {
; CHECK-LABEL: insert_vec_v12i16_uaddlv_from_v4i16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    stp xzr, xzr, [x0, #16]
; CHECK-NEXT:    stp xzr, xzr, [x0, #32]
; CHECK-NEXT:    uaddlv.4h s1, v0
; CHECK-NEXT:    mov.h v0[0], v1[0]
; CHECK-NEXT:    ushll.4s v0, v0, #0
; CHECK-NEXT:    ucvtf.4s v0, v0
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> zeroinitializer)
  %1 = trunc i32 %vaddlv to i16
  %2 = insertelement <12 x i16> zeroinitializer, i16 %1, i64 0
  %3 = uitofp <12 x i16> %2 to <12 x float>
  store <12 x float> %3, ptr %0, align 8
  ret void
}
; Truncated (i32) uaddlv.4s result into a zero <8 x i32>; no extra masking is
; needed since the s-lane move already takes the low 32 bits.
define void @insert_vec_v8i32_uaddlv_from_v4i32(ptr %0) {
; CHECK-LABEL: insert_vec_v8i32_uaddlv_from_v4i32:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    stp xzr, xzr, [x0, #16]
; CHECK-NEXT:    uaddlv.4s d1, v0
; CHECK-NEXT:    mov.s v0[0], v1[0]
; CHECK-NEXT:    ucvtf.4s v0, v0
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> zeroinitializer)
  %1 = trunc i64 %vaddlv to i32
  %2 = insertelement <8 x i32> zeroinitializer, i32 %1, i64 0
  %3 = uitofp <8 x i32> %2 to <8 x float>
  store <8 x float> %3, ptr %0, align 8
  ret void
}
; Truncated (i32) uaddlv.4s result into a zero <16 x i32>.
define void @insert_vec_v16i32_uaddlv_from_v4i32(ptr %0) {
; CHECK-LABEL: insert_vec_v16i32_uaddlv_from_v4i32:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    movi.2d v2, #0000000000000000
; CHECK-NEXT:    uaddlv.4s d1, v0
; CHECK-NEXT:    stp q0, q0, [x0, #32]
; CHECK-NEXT:    mov.s v2[0], v1[0]
; CHECK-NEXT:    ucvtf.4s v1, v2
; CHECK-NEXT:    stp q1, q0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> zeroinitializer)
  %1 = trunc i64 %vaddlv to i32
  %2 = insertelement <16 x i32> zeroinitializer, i32 %1, i64 0
  %3 = uitofp <16 x i32> %2 to <16 x float>
  store <16 x float> %3, ptr %0, align 8
  ret void
}
; Truncated (i16) uaddlv.4s result into a zero <4 x i16>, widened and
; converted to <4 x float>.
define void @insert_vec_v4i16_uaddlv_from_v4i32(ptr %0) {
; CHECK-LABEL: insert_vec_v4i16_uaddlv_from_v4i32:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    movi.2d v1, #0000000000000000
; CHECK-NEXT:    uaddlv.4s d0, v0
; CHECK-NEXT:    mov.h v1[0], v0[0]
; CHECK-NEXT:    ushll.4s v0, v1, #0
; CHECK-NEXT:    ucvtf.4s v0, v0
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> zeroinitializer)
  %1 = trunc i64 %vaddlv to i16
  %2 = insertelement <4 x i16> zeroinitializer, i16 %1, i64 0
  %3 = uitofp <4 x i16> %2 to <4 x float>
  store <4 x float> %3, ptr %0, align 8
  ret void
}
; Truncated (i16) uaddlv.4s result into a zero <16 x i16>.
define void @insert_vec_v16i16_uaddlv_from_v4i32(ptr %0) {
; CHECK-LABEL: insert_vec_v16i16_uaddlv_from_v4i32:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    movi.2d v1, #0000000000000000
; CHECK-NEXT:    uaddlv.4s d0, v0
; CHECK-NEXT:    mov.h v1[0], v0[0]
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    ushll.4s v1, v1, #0
; CHECK-NEXT:    stp q0, q0, [x0, #32]
; CHECK-NEXT:    ucvtf.4s v1, v1
; CHECK-NEXT:    stp q1, q0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> zeroinitializer)
  %1 = trunc i64 %vaddlv to i16
  %2 = insertelement <16 x i16> zeroinitializer, i16 %1, i64 0
  %3 = uitofp <16 x i16> %2 to <16 x float>
  store <16 x float> %3, ptr %0, align 8
  ret void
}
; Truncated (i8) uaddlv.4s result into a zero <8 x i8>; full #255 byte mask is
; required here because the 64-bit reduction result is not known to fit 8 bits.
define void @insert_vec_v8i8_uaddlv_from_v4i32(ptr %0) {
; CHECK-LABEL: insert_vec_v8i8_uaddlv_from_v4i32:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    movi.2d v1, #0000000000000000
; CHECK-NEXT:    stp xzr, xzr, [x0, #16]
; CHECK-NEXT:    uaddlv.4s d0, v0
; CHECK-NEXT:    mov.h v1[0], v0[0]
; CHECK-NEXT:    bic.4h v1, #255, lsl #8
; CHECK-NEXT:    ushll.4s v0, v1, #0
; CHECK-NEXT:    ucvtf.4s v0, v0
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> zeroinitializer)
  %1 = trunc i64 %vaddlv to i8
  %2 = insertelement <8 x i8> zeroinitializer, i8 %1, i64 0
  %3 = uitofp <8 x i8> %2 to <8 x float>
  store <8 x float> %3, ptr %0, align 8
  ret void
}
; Truncated (i8) uaddlv.4s result into a zero <16 x i8>.
define void @insert_vec_v16i8_uaddlv_from_v4i32(ptr %0) {
; CHECK-LABEL: insert_vec_v16i8_uaddlv_from_v4i32:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    movi.2d v1, #0000000000000000
; CHECK-NEXT:    uaddlv.4s d0, v0
; CHECK-NEXT:    mov.h v1[0], v0[0]
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    bic.4h v1, #255, lsl #8
; CHECK-NEXT:    stp q0, q0, [x0, #32]
; CHECK-NEXT:    ushll.4s v1, v1, #0
; CHECK-NEXT:    ucvtf.4s v1, v1
; CHECK-NEXT:    stp q1, q0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> zeroinitializer)
  %1 = trunc i64 %vaddlv to i8
  %2 = insertelement <16 x i8> zeroinitializer, i8 %1, i64 0
  %3 = uitofp <16 x i8> %2 to <16 x float>
  store <16 x float> %3, ptr %0, align 8
  ret void
}
; Non-zero insert index: the reduction result lands in lane 2 (mov.s v0[2]).
define void @insert_vec_v2i32_uaddlv_from_v8i16_nz_index(ptr %0) {
; CHECK-LABEL: insert_vec_v2i32_uaddlv_from_v8i16_nz_index:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    movi.2d v0, #0000000000000000
; CHECK-NEXT:    uaddlv.8h s1, v0
; CHECK-NEXT:    mov.s v0[2], v1[0]
; CHECK-NEXT:    ucvtf.4s v0, v0
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
entry:
  %vaddlv = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> zeroinitializer)
  %1 = insertelement <4 x i32> zeroinitializer, i32 %vaddlv, i64 2
  %2 = uitofp <4 x i32> %1 to <4 x float>
  store <4 x float> %2, ptr %0, align 8
  ret void
}
; saddlv.8b result stored straight from the FP register (str s0) at an
; i64-scaled index — no fmov to a GPR.
define void @store_saddlv_v8i8(ptr %H, <8 x i8> %sum_h, i32 %idx) {
; CHECK-LABEL: store_saddlv_v8i8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    saddlv.8b h0, v0
; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
; CHECK-NEXT:    sbfiz x8, x1, #3, #32
; CHECK-NEXT:    str s0, [x0, x8]
; CHECK-NEXT:    ret
entry:
  %vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %sum_h)
  %idxprom = sext i32 %idx to i64
  %arrayidx = getelementptr inbounds i64, ptr %H, i64 %idxprom
  store i32 %vaddlvq_s32.i, ptr %arrayidx, align 8
  ret void
}
; saddlv.16b result stored directly from s0 at an i64-scaled index.
define void @store_saddlv_v16i8(ptr %H, <16 x i8> %sum_h, i32 %idx) {
; CHECK-LABEL: store_saddlv_v16i8:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    saddlv.16b h0, v0
; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
; CHECK-NEXT:    sbfiz x8, x1, #3, #32
; CHECK-NEXT:    str s0, [x0, x8]
; CHECK-NEXT:    ret
entry:
  %vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %sum_h)
  %idxprom = sext i32 %idx to i64
  %arrayidx = getelementptr inbounds i64, ptr %H, i64 %idxprom
  store i32 %vaddlvq_s32.i, ptr %arrayidx, align 8
  ret void
}
; saddlv.4h result stored directly from s0 at an i64-scaled index.
define void @store_saddlv_v4i16(ptr %H, <4 x i16> %sum_h, i32 %idx) {
; CHECK-LABEL: store_saddlv_v4i16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    saddlv.4h s0, v0
; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
; CHECK-NEXT:    sbfiz x8, x1, #3, #32
; CHECK-NEXT:    str s0, [x0, x8]
; CHECK-NEXT:    ret
entry:
  %vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> %sum_h)
  %idxprom = sext i32 %idx to i64
  %arrayidx = getelementptr inbounds i64, ptr %H, i64 %idxprom
  store i32 %vaddlvq_s32.i, ptr %arrayidx, align 8
  ret void
}
; saddlv.8h result stored directly from s0 at an i64-scaled index.
define void @store_saddlv_v8i16(ptr %H, <8 x i16> %sum_h, i32 %idx) {
; CHECK-LABEL: store_saddlv_v8i16:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    saddlv.8h s0, v0
; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
; CHECK-NEXT:    sbfiz x8, x1, #3, #32
; CHECK-NEXT:    str s0, [x0, x8]
; CHECK-NEXT:    ret
entry:
  %vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> %sum_h)
  %idxprom = sext i32 %idx to i64
  %arrayidx = getelementptr inbounds i64, ptr %H, i64 %idxprom
  store i32 %vaddlvq_s32.i, ptr %arrayidx, align 8
  ret void
}
; i64 saddlv of <2 x i32> is lowered to saddlp.1d; result stored from d0
; using a register-offset addressing mode (sxtw #3).
define void @store_saddlv_v2i32(ptr %H, <2 x i32> %sum_h, i32 %idx) {
; CHECK-LABEL: store_saddlv_v2i32:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    saddlp.1d v0, v0
; CHECK-NEXT:    str d0, [x0, w1, sxtw #3]
; CHECK-NEXT:    ret
entry:
  %vaddlvq_s32.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> %sum_h)
  %idxprom = sext i32 %idx to i64
  %arrayidx = getelementptr inbounds i64, ptr %H, i64 %idxprom
  store i64 %vaddlvq_s32.i, ptr %arrayidx, align 8
  ret void
}
; i64 saddlv.4s result stored from d0 with a sxtw-scaled register offset.
define void @store_saddlv_v4i32(ptr %H, <4 x i32> %sum_h, i32 %idx) {
; CHECK-LABEL: store_saddlv_v4i32:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    saddlv.4s d0, v0
; CHECK-NEXT:    str d0, [x0, w1, sxtw #3]
; CHECK-NEXT:    ret
entry:
  %vaddlvq_s32.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> %sum_h)
  %idxprom = sext i32 %idx to i64
  %arrayidx = getelementptr inbounds i64, ptr %H, i64 %idxprom
  store i64 %vaddlvq_s32.i, ptr %arrayidx, align 8
  ret void
}