; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
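
; NOTE: The -aarch64-sve-vector-bits-min=2048 run reuses the VBITS_GE_512
; prefixes: the functions below without a vscale_range attribute all operate
; on 512-bit vectors, so their codegen is identical for any minimum vector
; width of 512 bits or more.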

target triple = "aarch64-unknown-linux-gnu"

;
; truncate i16 -> i8
;

define <16 x i8> @trunc_v16i16_v16i8(ptr %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v16i16_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %a = load <16 x i16>, ptr %in
  %b = trunc <16 x i16> %a to <16 x i8>
  ret <16 x i8> %b
}
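
; NOTE: Truncation to the next narrower element type is a single uzp1: it
; concatenates the even-numbered narrow elements of its two sources, so with
; both sources equal it packs the low half of each wide element into the
; bottom of the register.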

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i16_v32i8(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: trunc_v32i16_v32i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    splice z1.b, p0, z1.b, z0.b
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    add z0.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: trunc_v32i16_v32i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    add z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x i16>, ptr %in
  %b = trunc <32 x i16> %a to <32 x i8>
  %c = add <32 x i8> %b, %b
  store <32 x i8> %c, ptr %out
  ret void
}
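
; NOTE: With 256-bit registers the 512-bit source is loaded in two halves;
; each half is packed with uzp1 and the results are joined with splice, which
; takes the first 16 active byte elements of z1 (p0 is ptrue vl16 here)
; followed by elements from z0.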

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v64i16_v64i8(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v64i16_v64i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i16>, ptr %in
  %b = trunc <64 x i16> %a to <64 x i8>
  %c = add <64 x i8> %b, %b
  store <64 x i8> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v128i16_v128i8(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v128i16_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <128 x i16>, ptr %in
  %b = trunc <128 x i16> %a to <128 x i8>
  %c = add <128 x i8> %b, %b
  store <128 x i8> %c, ptr %out
  ret void
}

;
; truncate i32 -> i8
;

define <8 x i8> @trunc_v8i32_v8i8(ptr %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v8i32_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %a = load <8 x i32>, ptr %in
  %b = trunc <8 x i32> %a to <8 x i8>
  ret <8 x i8> %b
}

define <16 x i8> @trunc_v16i32_v16i8(ptr %in) #0 {
; VBITS_GE_256-LABEL: trunc_v16i32_v16i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z2.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z0.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    mov v0.d[1], v2.d[0]
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: trunc_v16i32_v16i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i32>, ptr %in
  %b = trunc <16 x i32> %a to <16 x i8>
  ret <16 x i8> %b
}
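
; NOTE: Here each packed half fits in 64 bits, so a NEON insert
; ('mov v0.d[1], v2.d[0]') is enough to combine them into the 128-bit result;
; no splice is needed.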

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i32_v32i8(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v32i32_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i32>, ptr %in
  %b = trunc <32 x i32> %a to <32 x i8>
  %c = add <32 x i8> %b, %b
  store <32 x i8> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v64i32_v64i8(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v64i32_v64i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i32>, ptr %in
  %b = trunc <64 x i32> %a to <64 x i8>
  %c = add <64 x i8> %b, %b
  store <64 x i8> %c, ptr %out
  ret void
}

;
; truncate i32 -> i16
;

define <8 x i16> @trunc_v8i32_v8i16(ptr %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v8i32_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %a = load <8 x i32>, ptr %in
  %b = trunc <8 x i32> %a to <8 x i16>
  ret <8 x i16> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v16i32_v16i16(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: trunc_v16i32_v16i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    add z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: trunc_v16i32_v16i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    add z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i32>, ptr %in
  %b = trunc <16 x i32> %a to <16 x i16>
  %c = add <16 x i16> %b, %b
  store <16 x i16> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i32_v32i16(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v32i32_v32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i32>, ptr %in
  %b = trunc <32 x i32> %a to <32 x i16>
  %c = add <32 x i16> %b, %b
  store <32 x i16> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v64i32_v64i16(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v64i32_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i32>, ptr %in
  %b = trunc <64 x i32> %a to <64 x i16>
  %c = add <64 x i16> %b, %b
  store <64 x i16> %c, ptr %out
  ret void
}

;
; truncate i64 -> i8
;

; NOTE: v4i8 is not legal so result i8 elements are held within i16 containers.
define <4 x i8> @trunc_v4i64_v4i8(ptr %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v4i64_v4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %a = load <4 x i64>, ptr %in
  %b = trunc <4 x i64> %a to <4 x i8>
  ret <4 x i8> %b
}
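
; NOTE: Since the i8 results are held in i16 containers, only two uzp1 steps
; (.s then .h) are needed above; there is no final pack to byte elements.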

define <8 x i8> @trunc_v8i64_v8i8(ptr %in) #0 {
; VBITS_GE_256-LABEL: trunc_v8i64_v8i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: trunc_v8i64_v8i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %in
  %b = trunc <8 x i64> %a to <8 x i8>
  ret <8 x i8> %b
}
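
; NOTE: A full i64 -> i8 truncation takes three uzp1 steps, halving the
; element size each time (.s, .h, then .b).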

define <16 x i8> @trunc_v16i64_v16i8(ptr %in) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v16i64_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %a = load <16 x i64>, ptr %in
  %b = trunc <16 x i64> %a to <16 x i8>
  ret <16 x i8> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i64_v32i8(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v32i64_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i64>, ptr %in
  %b = trunc <32 x i64> %a to <32 x i8>
  %c = add <32 x i8> %b, %b
  store <32 x i8> %c, ptr %out
  ret void
}

;
; truncate i64 -> i16
;

define <4 x i16> @trunc_v4i64_v4i16(ptr %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v4i64_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %a = load <4 x i64>, ptr %in
  %b = trunc <4 x i64> %a to <4 x i16>
  ret <4 x i16> %b
}

define <8 x i16> @trunc_v8i64_v8i16(ptr %in) #0 {
; VBITS_GE_256-LABEL: trunc_v8i64_v8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    uzp1 z2.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    mov v0.d[1], v2.d[0]
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: trunc_v8i64_v8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %in
  %b = trunc <8 x i64> %a to <8 x i16>
  ret <8 x i16> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v16i64_v16i16(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v16i64_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i64>, ptr %in
  %b = trunc <16 x i64> %a to <16 x i16>
  %c = add <16 x i16> %b, %b
  store <16 x i16> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i64_v32i16(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v32i64_v32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i64>, ptr %in
  %b = trunc <32 x i64> %a to <32 x i16>
  %c = add <32 x i16> %b, %b
  store <32 x i16> %c, ptr %out
  ret void
}

;
; truncate i64 -> i32
;

define <4 x i32> @trunc_v4i64_v4i32(ptr %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v4i64_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %a = load <4 x i64>, ptr %in
  %b = trunc <4 x i64> %a to <4 x i32>
  ret <4 x i32> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v8i64_v8i32(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: trunc_v8i64_v8i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    add z0.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: trunc_v8i64_v8i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    add z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %in
  %b = trunc <8 x i64> %a to <8 x i32>
  %c = add <8 x i32> %b, %b
  store <8 x i32> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v16i64_v16i32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v16i64_v16i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl16
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i64>, ptr %in
  %b = trunc <16 x i64> %a to <16 x i32>
  %c = add <16 x i32> %b, %b
  store <16 x i32> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i64_v32i32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v32i64_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i64>, ptr %in
  %b = trunc <32 x i64> %a to <32 x i32>
  %c = add <32 x i32> %b, %b
  store <32 x i32> %c, ptr %out
  ret void
}

attributes #0 = { nounwind "target-features"="+sve" }