; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; sext i1 -> i32
;

; NOTE: Covers the scenario where a SIGN_EXTEND_INREG is required, whose inreg
; type's element type is not byte based and thus cannot be lowered directly to
; an SVE instruction.
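; (The lsl/asr pair below is the usual SIGN_EXTEND_INREG expansion: shifting
; left by 31 moves the i1 into each lane's sign bit and the arithmetic shift
; right replicates it across the full 32 bits.)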
define void @sext_v8i1_v8i32(<8 x i1> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v8i1_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    lsl z0.s, z0.s, #31
; CHECK-NEXT:    asr z0.s, z0.s, #31
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <8 x i1> %a to <8 x i32>
  store <8 x i32> %b, ptr %out
  ret void
}

;
; sext i3 -> i64
;

; NOTE: Covers the scenario where a SIGN_EXTEND_INREG is required, whose inreg
; type's element type is not power-of-2 based and thus cannot be lowered
; directly to an SVE instruction.
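; (Same expansion as above, but with a shift of 61 = 64 - 3 so that bit 2,
; the sign bit of an i3, is replicated across each 64-bit lane.)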
define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v4i3_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    lsl z0.d, z0.d, #61
; CHECK-NEXT:    asr z0.d, z0.d, #61
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <4 x i3> %a to <4 x i64>
  store <4 x i64> %b, ptr %out
  ret void
}

;
; sext i8 -> i16
;

define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v16i8_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <16 x i8> %a to <16 x i16>
  store <16 x i16> %b, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the extend being combined with the load.
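; (At a 256-bit vector length the <32 x i16> result needs two registers, so
; the VBITS_GE_256 code extracts the high half with 'ext' and stores it at an
; element offset held in x8; at 512 bits one register suffices.)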
define void @sext_v32i8_v32i16(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: sext_v32i8_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    add z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    sunpklo z1.h, z0.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sext_v32i8_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    add z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x i8>, ptr %in
  %b = add <32 x i8> %a, %a
  %c = sext <32 x i8> %b to <32 x i16>
  store <32 x i16> %c, ptr %out
  ret void
}

define void @sext_v64i8_v64i16(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v64i8_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i8>, ptr %in
  %b = add <64 x i8> %a, %a
  %c = sext <64 x i8> %b to <64 x i16>
  store <64 x i16> %c, ptr %out
  ret void
}

define void @sext_v128i8_v128i16(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v128i8_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <128 x i8>, ptr %in
  %b = add <128 x i8> %a, %a
  %c = sext <128 x i8> %b to <128 x i16>
  store <128 x i16> %c, ptr %out
  ret void
}

;
; sext i8 -> i32
;

define void @sext_v8i8_v8i32(<8 x i8> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v8i8_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <8 x i8> %a to <8 x i32>
  store <8 x i32> %b, ptr %out
  ret void
}

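; (In the VBITS_GE_256 code below the 128-bit source is split with a NEON
; 'ext' and each half is widened through its own sunpklo chain; at
; VBITS_GE_512 the whole <16 x i32> fits in one register.)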
define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) #0 {
; VBITS_GE_256-LABEL: sext_v16i8_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sext_v16i8_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %b = sext <16 x i8> %a to <16 x i32>
  store <16 x i32> %b, ptr %out
  ret void
}

define void @sext_v32i8_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v32i8_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i8>, ptr %in
  %b = add <32 x i8> %a, %a
  %c = sext <32 x i8> %b to <32 x i32>
  store <32 x i32> %c, ptr %out
  ret void
}

define void @sext_v64i8_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v64i8_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i8>, ptr %in
  %b = add <64 x i8> %a, %a
  %c = sext <64 x i8> %b to <64 x i32>
  store <64 x i32> %c, ptr %out
  ret void
}

;
; sext i8 -> i64
;

; NOTE: v4i8 is an unpacked type stored within a v4i16 container. The sign
; extend is a two step process where the container is any_extend'd with the
; result feeding an inreg sign extend.
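; (The inreg sign extend shows up below as 'sxtb z0.d, p0/m, z0.d', which
; sign-extends the low byte of each doubleword lane once the unsigned unpacks
; have widened the v4i16 container.)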
define void @sext_v4i8_v4i64(<4 x i8> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v4i8_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <4 x i8> %a to <4 x i64>
  store <4 x i64> %b, ptr %out
  ret void
}

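; (For VBITS_GE_256 below, the input is first sign-extended to i16 with a
; NEON 'sshll' and split with 'ext' before each half is unpacked to
; doublewords.)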
define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) #0 {
; VBITS_GE_256-LABEL: sext_v8i8_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    sshll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sext_v8i8_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %b = sext <8 x i8> %a to <8 x i64>
  store <8 x i64> %b, ptr %out
  ret void
}

define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v16i8_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <16 x i8> %a to <16 x i64>
  store <16 x i64> %b, ptr %out
  ret void
}

define void @sext_v32i8_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v32i8_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i8>, ptr %in
  %b = add <32 x i8> %a, %a
  %c = sext <32 x i8> %b to <32 x i64>
  store <32 x i64> %c, ptr %out
  ret void
}

;
; sext i16 -> i32
;

define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v8i16_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <8 x i16> %a to <8 x i32>
  store <8 x i32> %b, ptr %out
  ret void
}

define void @sext_v16i16_v16i32(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: sext_v16i16_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    add z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sext_v16i16_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    add z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i16>, ptr %in
  %b = add <16 x i16> %a, %a
  %c = sext <16 x i16> %b to <16 x i32>
  store <16 x i32> %c, ptr %out
  ret void
}

define void @sext_v32i16_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v32i16_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i16>, ptr %in
  %b = add <32 x i16> %a, %a
  %c = sext <32 x i16> %b to <32 x i32>
  store <32 x i32> %c, ptr %out
  ret void
}

define void @sext_v64i16_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v64i16_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i16>, ptr %in
  %b = add <64 x i16> %a, %a
  %c = sext <64 x i16> %b to <64 x i32>
  store <64 x i32> %c, ptr %out
  ret void
}

;
; sext i16 -> i64
;

define void @sext_v4i16_v4i64(<4 x i16> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v4i16_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <4 x i16> %a to <4 x i64>
  store <4 x i64> %b, ptr %out
  ret void
}

define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) #0 {
; VBITS_GE_256-LABEL: sext_v8i16_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sext_v8i16_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %b = sext <8 x i16> %a to <8 x i64>
  store <8 x i64> %b, ptr %out
  ret void
}

define void @sext_v16i16_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v16i16_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i16>, ptr %in
  %b = add <16 x i16> %a, %a
  %c = sext <16 x i16> %b to <16 x i64>
  store <16 x i64> %c, ptr %out
  ret void
}

define void @sext_v32i16_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v32i16_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i16>, ptr %in
  %b = add <32 x i16> %a, %a
  %c = sext <32 x i16> %b to <32 x i64>
  store <32 x i64> %c, ptr %out
  ret void
}

;
; sext i32 -> i64
;

define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v4i32_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <4 x i32> %a to <4 x i64>
  store <4 x i64> %b, ptr %out
  ret void
}

define void @sext_v8i32_v8i64(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: sext_v8i32_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    add z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sext_v8i32_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    add z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i32>, ptr %in
  %b = add <8 x i32> %a, %a
  %c = sext <8 x i32> %b to <8 x i64>
  store <8 x i64> %c, ptr %out
  ret void
}

define void @sext_v16i32_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v16i32_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl16
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i32>, ptr %in
  %b = add <16 x i32> %a, %a
  %c = sext <16 x i32> %b to <16 x i64>
  store <16 x i64> %c, ptr %out
  ret void
}

define void @sext_v32i32_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v32i32_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i32>, ptr %in
  %b = add <32 x i32> %a, %a
  %c = sext <32 x i32> %b to <32 x i64>
  store <32 x i64> %c, ptr %out
  ret void
}

;
; zext i8 -> i16
;

define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v16i8_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <16 x i8> %a to <16 x i16>
  store <16 x i16> %b, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the extend being combined with the load.
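; (The zext tests mirror the sext ones above, with uunpklo in place of
; sunpklo: uunpklo zero-extends the low half of a vector, so no separate
; inreg step is needed for byte-based element types.)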
define void @zext_v32i8_v32i16(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: zext_v32i8_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    add z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uunpklo z1.h, z0.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: zext_v32i8_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    add z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x i8>, ptr %in
  %b = add <32 x i8> %a, %a
  %c = zext <32 x i8> %b to <32 x i16>
  store <32 x i16> %c, ptr %out
  ret void
}

define void @zext_v64i8_v64i16(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v64i8_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i8>, ptr %in
  %b = add <64 x i8> %a, %a
  %c = zext <64 x i8> %b to <64 x i16>
  store <64 x i16> %c, ptr %out
  ret void
}

define void @zext_v128i8_v128i16(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v128i8_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <128 x i8>, ptr %in
  %b = add <128 x i8> %a, %a
  %c = zext <128 x i8> %b to <128 x i16>
  store <128 x i16> %c, ptr %out
  ret void
}

;
; zext i8 -> i32
;

define void @zext_v8i8_v8i32(<8 x i8> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v8i8_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <8 x i8> %a to <8 x i32>
  store <8 x i32> %b, ptr %out
  ret void
}

define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) #0 {
; VBITS_GE_256-LABEL: zext_v16i8_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: zext_v16i8_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %b = zext <16 x i8> %a to <16 x i32>
  store <16 x i32> %b, ptr %out
  ret void
}

define void @zext_v32i8_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v32i8_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i8>, ptr %in
  %b = add <32 x i8> %a, %a
  %c = zext <32 x i8> %b to <32 x i32>
  store <32 x i32> %c, ptr %out
  ret void
}

define void @zext_v64i8_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v64i8_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i8>, ptr %in
  %b = add <64 x i8> %a, %a
  %c = zext <64 x i8> %b to <64 x i32>
  store <64 x i32> %c, ptr %out
  ret void
}

;
; zext i8 -> i64
;

; NOTE: v4i8 is an unpacked type stored within a v4i16 container. The zero
; extend is a two step process where the container is zero_extend_inreg'd with
; the result feeding a normal zero extend from halfwords to doublewords.
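; (The zero_extend_inreg step appears below as 'bic v0.4h, #255, lsl #8',
; which clears the top byte of each halfword lane before the unpacks widen
; the value to doublewords.)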
define void @zext_v4i8_v4i64(<4 x i8> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v4i8_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    bic v0.4h, #255, lsl #8
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <4 x i8> %a to <4 x i64>
  store <4 x i64> %b, ptr %out
  ret void
}

define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) #0 {
; VBITS_GE_256-LABEL: zext_v8i8_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ushll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: zext_v8i8_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %b = zext <8 x i8> %a to <8 x i64>
  store <8 x i64> %b, ptr %out
  ret void
}

define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v16i8_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <16 x i8> %a to <16 x i64>
  store <16 x i64> %b, ptr %out
  ret void
}

define void @zext_v32i8_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v32i8_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i8>, ptr %in
  %b = add <32 x i8> %a, %a
  %c = zext <32 x i8> %b to <32 x i64>
  store <32 x i64> %c, ptr %out
  ret void
}

;
; zext i16 -> i32
;

define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v8i16_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <8 x i16> %a to <8 x i32>
  store <8 x i32> %b, ptr %out
  ret void
}

define void @zext_v16i16_v16i32(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: zext_v16i16_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    add z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: zext_v16i16_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    add z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i16>, ptr %in
  %b = add <16 x i16> %a, %a
  %c = zext <16 x i16> %b to <16 x i32>
  store <16 x i32> %c, ptr %out
  ret void
}

define void @zext_v32i16_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v32i16_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i16>, ptr %in
  %b = add <32 x i16> %a, %a
  %c = zext <32 x i16> %b to <32 x i32>
  store <32 x i32> %c, ptr %out
  ret void
}

define void @zext_v64i16_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v64i16_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i16>, ptr %in
  %b = add <64 x i16> %a, %a
  %c = zext <64 x i16> %b to <64 x i32>
  store <64 x i32> %c, ptr %out
  ret void
}

;
; zext i16 -> i64
;

define void @zext_v4i16_v4i64(<4 x i16> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v4i16_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <4 x i16> %a to <4 x i64>
  store <4 x i64> %b, ptr %out
  ret void
}

define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) #0 {
; VBITS_GE_256-LABEL: zext_v8i16_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: zext_v8i16_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %b = zext <8 x i16> %a to <8 x i64>
  store <8 x i64> %b, ptr %out
  ret void
}

define void @zext_v16i16_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v16i16_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i16>, ptr %in
  %b = add <16 x i16> %a, %a
  %c = zext <16 x i16> %b to <16 x i64>
  store <16 x i64> %c, ptr %out
  ret void
}

define void @zext_v32i16_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v32i16_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i16>, ptr %in
  %b = add <32 x i16> %a, %a
  %c = zext <32 x i16> %b to <32 x i64>
  store <32 x i64> %c, ptr %out
  ret void
}

;
; zext i32 -> i64
;

define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v4i32_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <4 x i32> %a to <4 x i64>
  store <4 x i64> %b, ptr %out
  ret void
}

define void @zext_v8i32_v8i64(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: zext_v8i32_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    add z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: zext_v8i32_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    add z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i32>, ptr %in
  %b = add <8 x i32> %a, %a
  %c = zext <8 x i32> %b to <8 x i64>
  store <8 x i64> %c, ptr %out
  ret void
}

define void @zext_v16i32_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v16i32_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl16
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i32>, ptr %in
  %b = add <16 x i32> %a, %a
  %c = zext <16 x i32> %b to <16 x i64>
  store <16 x i64> %c, ptr %out
  ret void
}

define void @zext_v32i32_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v32i32_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i32>, ptr %in
  %b = add <32 x i32> %a, %a
  %c = zext <32 x i32> %b to <32 x i64>
  store <32 x i64> %c, ptr %out
  ret void
}

attributes #0 = { nounwind "target-features"="+sve" }