1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
4 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
6 target triple = "aarch64-unknown-linux-gnu"
12 ; Don't use SVE for 64-bit vectors.
13 define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) vscale_range(2,0) #0 {
14 ; CHECK-LABEL: ucvtf_v4i16_v4f16:
16 ; CHECK-NEXT: ucvtf v0.4h, v0.4h
18 %res = uitofp <4 x i16> %op1 to <4 x half>
22 ; Don't use SVE for 128-bit vectors.
23 define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
24 ; CHECK-LABEL: ucvtf_v8i16_v8f16:
26 ; CHECK-NEXT: ldr q0, [x0]
27 ; CHECK-NEXT: ucvtf v0.8h, v0.8h
28 ; CHECK-NEXT: str q0, [x1]
30 %op1 = load <8 x i16>, ptr %a
31 %res = uitofp <8 x i16> %op1 to <8 x half>
32 store <8 x half> %res, ptr %b
36 define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
37 ; CHECK-LABEL: ucvtf_v16i16_v16f16:
39 ; CHECK-NEXT: ptrue p0.h, vl16
40 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
41 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h
42 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
44 %op1 = load <16 x i16>, ptr %a
45 %res = uitofp <16 x i16> %op1 to <16 x half>
46 store <16 x half> %res, ptr %b
50 define void @ucvtf_v32i16_v32f16(ptr %a, ptr %b) #0 {
51 ; VBITS_GE_256-LABEL: ucvtf_v32i16_v32f16:
52 ; VBITS_GE_256: // %bb.0:
53 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
54 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
55 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
56 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
57 ; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.h
58 ; VBITS_GE_256-NEXT: ucvtf z1.h, p0/m, z1.h
59 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
60 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
61 ; VBITS_GE_256-NEXT: ret
63 ; VBITS_GE_512-LABEL: ucvtf_v32i16_v32f16:
64 ; VBITS_GE_512: // %bb.0:
65 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
66 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
67 ; VBITS_GE_512-NEXT: ucvtf z0.h, p0/m, z0.h
68 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
69 ; VBITS_GE_512-NEXT: ret
70 %op1 = load <32 x i16>, ptr %a
71 %res = uitofp <32 x i16> %op1 to <32 x half>
72 store <32 x half> %res, ptr %b
76 define void @ucvtf_v64i16_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
77 ; CHECK-LABEL: ucvtf_v64i16_v64f16:
79 ; CHECK-NEXT: ptrue p0.h, vl64
80 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
81 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h
82 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
84 %op1 = load <64 x i16>, ptr %a
85 %res = uitofp <64 x i16> %op1 to <64 x half>
86 store <64 x half> %res, ptr %b
90 define void @ucvtf_v128i16_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
91 ; CHECK-LABEL: ucvtf_v128i16_v128f16:
93 ; CHECK-NEXT: ptrue p0.h, vl128
94 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
95 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h
96 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
98 %op1 = load <128 x i16>, ptr %a
99 %res = uitofp <128 x i16> %op1 to <128 x half>
100 store <128 x half> %res, ptr %b
108 ; Don't use SVE for 64-bit vectors.
109 define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) vscale_range(2,0) #0 {
110 ; CHECK-LABEL: ucvtf_v2i16_v2f32:
112 ; CHECK-NEXT: movi d1, #0x00ffff0000ffff
113 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
114 ; CHECK-NEXT: ucvtf v0.2s, v0.2s
116 %res = uitofp <2 x i16> %op1 to <2 x float>
120 ; Don't use SVE for 128-bit vectors.
121 define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) vscale_range(2,0) #0 {
122 ; CHECK-LABEL: ucvtf_v4i16_v4f32:
124 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
125 ; CHECK-NEXT: ucvtf v0.4s, v0.4s
127 %res = uitofp <4 x i16> %op1 to <4 x float>
131 define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
132 ; CHECK-LABEL: ucvtf_v8i16_v8f32:
134 ; CHECK-NEXT: ldr q0, [x0]
135 ; CHECK-NEXT: ptrue p0.s, vl8
136 ; CHECK-NEXT: uunpklo z0.s, z0.h
137 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
138 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
140 %op1 = load <8 x i16>, ptr %a
141 %res = uitofp <8 x i16> %op1 to <8 x float>
142 store <8 x float> %res, ptr %b
146 define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) #0 {
147 ; VBITS_GE_256-LABEL: ucvtf_v16i16_v16f32:
148 ; VBITS_GE_256: // %bb.0:
149 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
150 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
151 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
152 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
153 ; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
154 ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
155 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
156 ; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.s
157 ; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.s
158 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
159 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
160 ; VBITS_GE_256-NEXT: ret
162 ; VBITS_GE_512-LABEL: ucvtf_v16i16_v16f32:
163 ; VBITS_GE_512: // %bb.0:
164 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
165 ; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x0]
166 ; VBITS_GE_512-NEXT: ucvtf z0.s, p0/m, z0.s
167 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
168 ; VBITS_GE_512-NEXT: ret
169 %op1 = load <16 x i16>, ptr %a
170 %res = uitofp <16 x i16> %op1 to <16 x float>
171 store <16 x float> %res, ptr %b
175 define void @ucvtf_v32i16_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
176 ; CHECK-LABEL: ucvtf_v32i16_v32f32:
178 ; CHECK-NEXT: ptrue p0.s, vl32
179 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
180 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
181 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
183 %op1 = load <32 x i16>, ptr %a
184 %res = uitofp <32 x i16> %op1 to <32 x float>
185 store <32 x float> %res, ptr %b
189 define void @ucvtf_v64i16_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
190 ; CHECK-LABEL: ucvtf_v64i16_v64f32:
192 ; CHECK-NEXT: ptrue p0.s, vl64
193 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
194 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
195 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
197 %op1 = load <64 x i16>, ptr %a
198 %res = uitofp <64 x i16> %op1 to <64 x float>
199 store <64 x float> %res, ptr %b
207 ; v1i16 is perfered to be widened to v4i16, which pushes the output into SVE types, so use SVE
208 define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) vscale_range(2,0) #0 {
209 ; CHECK-LABEL: ucvtf_v1i16_v1f64:
211 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
212 ; CHECK-NEXT: ptrue p0.d, vl4
213 ; CHECK-NEXT: uunpklo z0.s, z0.h
214 ; CHECK-NEXT: uunpklo z0.d, z0.s
215 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
216 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
218 %res = uitofp <1 x i16> %op1 to <1 x double>
219 ret <1 x double> %res
222 ; Don't use SVE for 128-bit vectors.
223 define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) vscale_range(2,0) #0 {
224 ; CHECK-LABEL: ucvtf_v2i16_v2f64:
226 ; CHECK-NEXT: movi d1, #0x00ffff0000ffff
227 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
228 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
229 ; CHECK-NEXT: ucvtf v0.2d, v0.2d
231 %res = uitofp <2 x i16> %op1 to <2 x double>
232 ret <2 x double> %res
235 define void @ucvtf_v4i16_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
236 ; CHECK-LABEL: ucvtf_v4i16_v4f64:
238 ; CHECK-NEXT: ldr d0, [x0]
239 ; CHECK-NEXT: ptrue p0.d, vl4
240 ; CHECK-NEXT: uunpklo z0.s, z0.h
241 ; CHECK-NEXT: uunpklo z0.d, z0.s
242 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
243 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
245 %op1 = load <4 x i16>, ptr %a
246 %res = uitofp <4 x i16> %op1 to <4 x double>
247 store <4 x double> %res, ptr %b
251 define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) #0 {
252 ; VBITS_GE_256-LABEL: ucvtf_v8i16_v8f64:
253 ; VBITS_GE_256: // %bb.0:
254 ; VBITS_GE_256-NEXT: ldr q0, [x0]
255 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
256 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
257 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
258 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
259 ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
260 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
261 ; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
262 ; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d
263 ; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d
264 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
265 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
266 ; VBITS_GE_256-NEXT: ret
268 ; VBITS_GE_512-LABEL: ucvtf_v8i16_v8f64:
269 ; VBITS_GE_512: // %bb.0:
270 ; VBITS_GE_512-NEXT: ldr q0, [x0]
271 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
272 ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
273 ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
274 ; VBITS_GE_512-NEXT: ucvtf z0.d, p0/m, z0.d
275 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
276 ; VBITS_GE_512-NEXT: ret
277 %op1 = load <8 x i16>, ptr %a
278 %res = uitofp <8 x i16> %op1 to <8 x double>
279 store <8 x double> %res, ptr %b
283 define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
284 ; CHECK-LABEL: ucvtf_v16i16_v16f64:
286 ; CHECK-NEXT: ptrue p0.d, vl16
287 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
288 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
289 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
291 %op1 = load <16 x i16>, ptr %a
292 %res = uitofp <16 x i16> %op1 to <16 x double>
293 store <16 x double> %res, ptr %b
297 define void @ucvtf_v32i16_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
298 ; CHECK-LABEL: ucvtf_v32i16_v32f64:
300 ; CHECK-NEXT: ptrue p0.d, vl32
301 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
302 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
303 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
305 %op1 = load <32 x i16>, ptr %a
306 %res = uitofp <32 x i16> %op1 to <32 x double>
307 store <32 x double> %res, ptr %b
315 ; Don't use SVE for 64-bit vectors.
316 define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) vscale_range(2,0) #0 {
317 ; CHECK-LABEL: ucvtf_v2i32_v2f16:
319 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
320 ; CHECK-NEXT: ucvtf v0.4s, v0.4s
321 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
323 %res = uitofp <2 x i32> %op1 to <2 x half>
327 ; Don't use SVE for 128-bit vectors.
328 define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) vscale_range(2,0) #0 {
329 ; CHECK-LABEL: ucvtf_v4i32_v4f16:
331 ; CHECK-NEXT: ucvtf v0.4s, v0.4s
332 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
334 %res = uitofp <4 x i32> %op1 to <4 x half>
338 define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) vscale_range(2,0) #0 {
339 ; CHECK-LABEL: ucvtf_v8i32_v8f16:
341 ; CHECK-NEXT: ptrue p0.s, vl8
342 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
343 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
344 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
345 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
347 %op1 = load <8 x i32>, ptr %a
348 %res = uitofp <8 x i32> %op1 to <8 x half>
352 define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) #0 {
353 ; VBITS_GE_256-LABEL: ucvtf_v16i32_v16f16:
354 ; VBITS_GE_256: // %bb.0:
355 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
356 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
357 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
358 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
359 ; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.s
360 ; VBITS_GE_256-NEXT: ucvtf z1.h, p0/m, z1.s
361 ; VBITS_GE_256-NEXT: ptrue p0.h, vl8
362 ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
363 ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
364 ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h
365 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
366 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
367 ; VBITS_GE_256-NEXT: ret
369 ; VBITS_GE_512-LABEL: ucvtf_v16i32_v16f16:
370 ; VBITS_GE_512: // %bb.0:
371 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
372 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
373 ; VBITS_GE_512-NEXT: ucvtf z0.h, p0/m, z0.s
374 ; VBITS_GE_512-NEXT: ptrue p0.h, vl16
375 ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
376 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
377 ; VBITS_GE_512-NEXT: ret
378 %op1 = load <16 x i32>, ptr %a
379 %res = uitofp <16 x i32> %op1 to <16 x half>
380 store <16 x half> %res, ptr %b
384 define void @ucvtf_v32i32_v32f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
385 ; CHECK-LABEL: ucvtf_v32i32_v32f16:
387 ; CHECK-NEXT: ptrue p0.s, vl32
388 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
389 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
390 ; CHECK-NEXT: ptrue p0.h, vl32
391 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
392 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
394 %op1 = load <32 x i32>, ptr %a
395 %res = uitofp <32 x i32> %op1 to <32 x half>
396 store <32 x half> %res, ptr %b
400 define void @ucvtf_v64i32_v64f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
401 ; CHECK-LABEL: ucvtf_v64i32_v64f16:
403 ; CHECK-NEXT: ptrue p0.s, vl64
404 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
405 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
406 ; CHECK-NEXT: ptrue p0.h, vl64
407 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
408 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
410 %op1 = load <64 x i32>, ptr %a
411 %res = uitofp <64 x i32> %op1 to <64 x half>
412 store <64 x half> %res, ptr %b
420 ; Don't use SVE for 64-bit vectors.
421 define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) vscale_range(2,0) #0 {
422 ; CHECK-LABEL: ucvtf_v2i32_v2f32:
424 ; CHECK-NEXT: ucvtf v0.2s, v0.2s
426 %res = uitofp <2 x i32> %op1 to <2 x float>
430 ; Don't use SVE for 128-bit vectors.
431 define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) vscale_range(2,0) #0 {
432 ; CHECK-LABEL: ucvtf_v4i32_v4f32:
434 ; CHECK-NEXT: ucvtf v0.4s, v0.4s
436 %res = uitofp <4 x i32> %op1 to <4 x float>
440 define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
441 ; CHECK-LABEL: ucvtf_v8i32_v8f32:
443 ; CHECK-NEXT: ptrue p0.s, vl8
444 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
445 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
446 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
448 %op1 = load <8 x i32>, ptr %a
449 %res = uitofp <8 x i32> %op1 to <8 x float>
450 store <8 x float> %res, ptr %b
454 define void @ucvtf_v16i32_v16f32(ptr %a, ptr %b) #0 {
455 ; VBITS_GE_256-LABEL: ucvtf_v16i32_v16f32:
456 ; VBITS_GE_256: // %bb.0:
457 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
458 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
459 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
460 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
461 ; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.s
462 ; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.s
463 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
464 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
465 ; VBITS_GE_256-NEXT: ret
467 ; VBITS_GE_512-LABEL: ucvtf_v16i32_v16f32:
468 ; VBITS_GE_512: // %bb.0:
469 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
470 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
471 ; VBITS_GE_512-NEXT: ucvtf z0.s, p0/m, z0.s
472 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
473 ; VBITS_GE_512-NEXT: ret
474 %op1 = load <16 x i32>, ptr %a
475 %res = uitofp <16 x i32> %op1 to <16 x float>
476 store <16 x float> %res, ptr %b
480 define void @ucvtf_v32i32_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
481 ; CHECK-LABEL: ucvtf_v32i32_v32f32:
483 ; CHECK-NEXT: ptrue p0.s, vl32
484 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
485 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
486 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
488 %op1 = load <32 x i32>, ptr %a
489 %res = uitofp <32 x i32> %op1 to <32 x float>
490 store <32 x float> %res, ptr %b
494 define void @ucvtf_v64i32_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
495 ; CHECK-LABEL: ucvtf_v64i32_v64f32:
497 ; CHECK-NEXT: ptrue p0.s, vl64
498 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
499 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
500 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
502 %op1 = load <64 x i32>, ptr %a
503 %res = uitofp <64 x i32> %op1 to <64 x float>
504 store <64 x float> %res, ptr %b
512 ; Don't use SVE for 64-bit vectors.
513 define <1 x double> @ucvtf_v1i32_v1f64(<1 x i32> %op1) vscale_range(2,0) #0 {
514 ; CHECK-LABEL: ucvtf_v1i32_v1f64:
516 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
517 ; CHECK-NEXT: ucvtf v0.2d, v0.2d
518 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
520 %res = uitofp <1 x i32> %op1 to <1 x double>
521 ret <1 x double> %res
524 ; Don't use SVE for 128-bit vectors.
525 define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) vscale_range(2,0) #0 {
526 ; CHECK-LABEL: ucvtf_v2i32_v2f64:
528 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
529 ; CHECK-NEXT: ucvtf v0.2d, v0.2d
531 %res = uitofp <2 x i32> %op1 to <2 x double>
532 ret <2 x double> %res
535 define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
536 ; CHECK-LABEL: ucvtf_v4i32_v4f64:
538 ; CHECK-NEXT: ldr q0, [x0]
539 ; CHECK-NEXT: ptrue p0.d, vl4
540 ; CHECK-NEXT: uunpklo z0.d, z0.s
541 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
542 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
544 %op1 = load <4 x i32>, ptr %a
545 %res = uitofp <4 x i32> %op1 to <4 x double>
546 store <4 x double> %res, ptr %b
550 define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) #0 {
551 ; VBITS_GE_256-LABEL: ucvtf_v8i32_v8f64:
552 ; VBITS_GE_256: // %bb.0:
553 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
554 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
555 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
556 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
557 ; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s
558 ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
559 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
560 ; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d
561 ; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d
562 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
563 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
564 ; VBITS_GE_256-NEXT: ret
566 ; VBITS_GE_512-LABEL: ucvtf_v8i32_v8f64:
567 ; VBITS_GE_512: // %bb.0:
568 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
569 ; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x0]
570 ; VBITS_GE_512-NEXT: ucvtf z0.d, p0/m, z0.d
571 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
572 ; VBITS_GE_512-NEXT: ret
573 %op1 = load <8 x i32>, ptr %a
574 %res = uitofp <8 x i32> %op1 to <8 x double>
575 store <8 x double> %res, ptr %b
579 define void @ucvtf_v16i32_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
580 ; CHECK-LABEL: ucvtf_v16i32_v16f64:
582 ; CHECK-NEXT: ptrue p0.d, vl16
583 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
584 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
585 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
587 %op1 = load <16 x i32>, ptr %a
588 %res = uitofp <16 x i32> %op1 to <16 x double>
589 store <16 x double> %res, ptr %b
593 define void @ucvtf_v32i32_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
594 ; CHECK-LABEL: ucvtf_v32i32_v32f64:
596 ; CHECK-NEXT: ptrue p0.d, vl32
597 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
598 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
599 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
601 %op1 = load <32 x i32>, ptr %a
602 %res = uitofp <32 x i32> %op1 to <32 x double>
603 store <32 x double> %res, ptr %b
611 ; Don't use SVE for 64-bit vectors.
612 define <1 x half> @ucvtf_v1i64_v1f16(<1 x i64> %op1) vscale_range(2,0) #0 {
613 ; CHECK-LABEL: ucvtf_v1i64_v1f16:
615 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
616 ; CHECK-NEXT: fmov x8, d0
617 ; CHECK-NEXT: ucvtf h0, x8
619 %res = uitofp <1 x i64> %op1 to <1 x half>
623 ; v2f16 is not legal for NEON, so use SVE
624 define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) vscale_range(2,0) #0 {
625 ; CHECK-LABEL: ucvtf_v2i64_v2f16:
627 ; CHECK-NEXT: ptrue p0.d, vl4
628 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
629 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d
630 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
631 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
632 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
634 %res = uitofp <2 x i64> %op1 to <2 x half>
638 define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) vscale_range(2,0) #0 {
639 ; CHECK-LABEL: ucvtf_v4i64_v4f16:
641 ; CHECK-NEXT: ptrue p0.d, vl4
642 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
643 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d
644 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
645 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
646 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
648 %op1 = load <4 x i64>, ptr %a
649 %res = uitofp <4 x i64> %op1 to <4 x half>
653 define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) #0 {
654 ; VBITS_GE_256-LABEL: ucvtf_v8i64_v8f16:
655 ; VBITS_GE_256: // %bb.0:
656 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
657 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
658 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
659 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
660 ; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.d
661 ; VBITS_GE_256-NEXT: ucvtf z1.h, p0/m, z1.d
662 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
663 ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
664 ; VBITS_GE_256-NEXT: uzp1 z2.h, z0.h, z0.h
665 ; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
666 ; VBITS_GE_256-NEXT: mov v0.d[1], v2.d[0]
667 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
668 ; VBITS_GE_256-NEXT: ret
670 ; VBITS_GE_512-LABEL: ucvtf_v8i64_v8f16:
671 ; VBITS_GE_512: // %bb.0:
672 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
673 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
674 ; VBITS_GE_512-NEXT: ucvtf z0.h, p0/m, z0.d
675 ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
676 ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
677 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
678 ; VBITS_GE_512-NEXT: ret
679 %op1 = load <8 x i64>, ptr %a
680 %res = uitofp <8 x i64> %op1 to <8 x half>
684 define void @ucvtf_v16i64_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
685 ; CHECK-LABEL: ucvtf_v16i64_v16f16:
687 ; CHECK-NEXT: ptrue p0.d, vl16
688 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
689 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d
690 ; CHECK-NEXT: ptrue p0.s, vl16
691 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
692 ; CHECK-NEXT: st1h { z0.s }, p0, [x1]
694 %op1 = load <16 x i64>, ptr %a
695 %res = uitofp <16 x i64> %op1 to <16 x half>
696 store <16 x half> %res, ptr %b
700 define void @ucvtf_v32i64_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
701 ; CHECK-LABEL: ucvtf_v32i64_v32f16:
703 ; CHECK-NEXT: ptrue p0.d, vl32
704 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
705 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d
706 ; CHECK-NEXT: ptrue p0.s, vl32
707 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
708 ; CHECK-NEXT: st1h { z0.s }, p0, [x1]
710 %op1 = load <32 x i64>, ptr %a
711 %res = uitofp <32 x i64> %op1 to <32 x half>
712 store <32 x half> %res, ptr %b
720 ; Don't use SVE for 64-bit vectors.
721 define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
722 ; CHECK-LABEL: ucvtf_v1i64_v1f32:
724 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
725 ; CHECK-NEXT: ucvtf v0.2d, v0.2d
726 ; CHECK-NEXT: fcvtn v0.2s, v0.2d
728 %res = uitofp <1 x i64> %op1 to <1 x float>
732 ; Don't use SVE for 128-bit vectors.
733 define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) vscale_range(2,0) #0 {
734 ; CHECK-LABEL: ucvtf_v2i64_v2f32:
736 ; CHECK-NEXT: ucvtf v0.2d, v0.2d
737 ; CHECK-NEXT: fcvtn v0.2s, v0.2d
739 %res = uitofp <2 x i64> %op1 to <2 x float>
743 define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) vscale_range(2,0) #0 {
744 ; CHECK-LABEL: ucvtf_v4i64_v4f32:
746 ; CHECK-NEXT: ptrue p0.d, vl4
747 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
748 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d
749 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
750 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
752 %op1 = load <4 x i64>, ptr %a
753 %res = uitofp <4 x i64> %op1 to <4 x float>
757 define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) #0 {
758 ; VBITS_GE_256-LABEL: ucvtf_v8i64_v8f32:
759 ; VBITS_GE_256: // %bb.0:
760 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
761 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
762 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
763 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
764 ; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.d
765 ; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.d
766 ; VBITS_GE_256-NEXT: ptrue p0.s, vl4
767 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
768 ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
769 ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
770 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
771 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
772 ; VBITS_GE_256-NEXT: ret
774 ; VBITS_GE_512-LABEL: ucvtf_v8i64_v8f32:
775 ; VBITS_GE_512: // %bb.0:
776 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
777 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
778 ; VBITS_GE_512-NEXT: ucvtf z0.s, p0/m, z0.d
779 ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
780 ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
781 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
782 ; VBITS_GE_512-NEXT: ret
783 %op1 = load <8 x i64>, ptr %a
784 %res = uitofp <8 x i64> %op1 to <8 x float>
785 store <8 x float> %res, ptr %b
789 define void @ucvtf_v16i64_v16f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
790 ; CHECK-LABEL: ucvtf_v16i64_v16f32:
792 ; CHECK-NEXT: ptrue p0.d, vl16
793 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
794 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d
795 ; CHECK-NEXT: ptrue p0.s, vl16
796 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
797 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
799 %op1 = load <16 x i64>, ptr %a
800 %res = uitofp <16 x i64> %op1 to <16 x float>
801 store <16 x float> %res, ptr %b
805 define void @ucvtf_v32i64_v32f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
806 ; CHECK-LABEL: ucvtf_v32i64_v32f32:
808 ; CHECK-NEXT: ptrue p0.d, vl32
809 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
810 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d
811 ; CHECK-NEXT: ptrue p0.s, vl32
812 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
813 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
815 %op1 = load <32 x i64>, ptr %a
816 %res = uitofp <32 x i64> %op1 to <32 x float>
817 store <32 x float> %res, ptr %b
825 ; Don't use SVE for 64-bit vectors.
826 define <1 x double> @ucvtf_v1i64_v1f64(<1 x i64> %op1) vscale_range(2,0) #0 {
827 ; CHECK-LABEL: ucvtf_v1i64_v1f64:
829 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
830 ; CHECK-NEXT: ucvtf d0, d0
832 %res = uitofp <1 x i64> %op1 to <1 x double>
833 ret <1 x double> %res
836 ; Don't use SVE for 128-bit vectors.
837 define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) vscale_range(2,0) #0 {
838 ; CHECK-LABEL: ucvtf_v2i64_v2f64:
840 ; CHECK-NEXT: ucvtf v0.2d, v0.2d
842 %res = uitofp <2 x i64> %op1 to <2 x double>
843 ret <2 x double> %res
846 define void @ucvtf_v4i64_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
847 ; CHECK-LABEL: ucvtf_v4i64_v4f64:
849 ; CHECK-NEXT: ptrue p0.d, vl4
850 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
851 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
852 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
854 %op1 = load <4 x i64>, ptr %a
855 %res = uitofp <4 x i64> %op1 to <4 x double>
856 store <4 x double> %res, ptr %b
860 define void @ucvtf_v8i64_v8f64(ptr %a, ptr %b) #0 {
861 ; VBITS_GE_256-LABEL: ucvtf_v8i64_v8f64:
862 ; VBITS_GE_256: // %bb.0:
863 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
864 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
865 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
866 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
867 ; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d
868 ; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d
869 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
870 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
871 ; VBITS_GE_256-NEXT: ret
873 ; VBITS_GE_512-LABEL: ucvtf_v8i64_v8f64:
874 ; VBITS_GE_512: // %bb.0:
875 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
876 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
877 ; VBITS_GE_512-NEXT: ucvtf z0.d, p0/m, z0.d
878 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
879 ; VBITS_GE_512-NEXT: ret
880 %op1 = load <8 x i64>, ptr %a
881 %res = uitofp <8 x i64> %op1 to <8 x double>
882 store <8 x double> %res, ptr %b
886 define void @ucvtf_v16i64_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
887 ; CHECK-LABEL: ucvtf_v16i64_v16f64:
889 ; CHECK-NEXT: ptrue p0.d, vl16
890 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
891 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
892 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
894 %op1 = load <16 x i64>, ptr %a
895 %res = uitofp <16 x i64> %op1 to <16 x double>
896 store <16 x double> %res, ptr %b
900 define void @ucvtf_v32i64_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
901 ; CHECK-LABEL: ucvtf_v32i64_v32f64:
903 ; CHECK-NEXT: ptrue p0.d, vl32
904 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
905 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
906 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
908 %op1 = load <32 x i64>, ptr %a
909 %res = uitofp <32 x i64> %op1 to <32 x double>
910 store <32 x double> %res, ptr %b
918 ; Don't use SVE for 64-bit vectors.
919 define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) vscale_range(2,0) #0 {
920 ; CHECK-LABEL: scvtf_v4i16_v4f16:
922 ; CHECK-NEXT: scvtf v0.4h, v0.4h
924 %res = sitofp <4 x i16> %op1 to <4 x half>
928 ; Don't use SVE for 128-bit vectors.
929 define void @scvtf_v8i16_v8f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
930 ; CHECK-LABEL: scvtf_v8i16_v8f16:
932 ; CHECK-NEXT: ldr q0, [x0]
933 ; CHECK-NEXT: scvtf v0.8h, v0.8h
934 ; CHECK-NEXT: str q0, [x1]
936 %op1 = load <8 x i16>, ptr %a
937 %res = sitofp <8 x i16> %op1 to <8 x half>
938 store <8 x half> %res, ptr %b
942 define void @scvtf_v16i16_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
943 ; CHECK-LABEL: scvtf_v16i16_v16f16:
945 ; CHECK-NEXT: ptrue p0.h, vl16
946 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
947 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.h
948 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
950 %op1 = load <16 x i16>, ptr %a
951 %res = sitofp <16 x i16> %op1 to <16 x half>
952 store <16 x half> %res, ptr %b
956 define void @scvtf_v32i16_v32f16(ptr %a, ptr %b) #0 {
957 ; VBITS_GE_256-LABEL: scvtf_v32i16_v32f16:
958 ; VBITS_GE_256: // %bb.0:
959 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
960 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
961 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
962 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
963 ; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.h
964 ; VBITS_GE_256-NEXT: scvtf z1.h, p0/m, z1.h
965 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
966 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
967 ; VBITS_GE_256-NEXT: ret
969 ; VBITS_GE_512-LABEL: scvtf_v32i16_v32f16:
970 ; VBITS_GE_512: // %bb.0:
971 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
972 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
973 ; VBITS_GE_512-NEXT: scvtf z0.h, p0/m, z0.h
974 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
975 ; VBITS_GE_512-NEXT: ret
976 %op1 = load <32 x i16>, ptr %a
977 %res = sitofp <32 x i16> %op1 to <32 x half>
978 store <32 x half> %res, ptr %b
982 define void @scvtf_v64i16_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
983 ; CHECK-LABEL: scvtf_v64i16_v64f16:
985 ; CHECK-NEXT: ptrue p0.h, vl64
986 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
987 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.h
988 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
990 %op1 = load <64 x i16>, ptr %a
991 %res = sitofp <64 x i16> %op1 to <64 x half>
992 store <64 x half> %res, ptr %b
996 define void @scvtf_v128i16_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
997 ; CHECK-LABEL: scvtf_v128i16_v128f16:
999 ; CHECK-NEXT: ptrue p0.h, vl128
1000 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1001 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.h
1002 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
1004 %op1 = load <128 x i16>, ptr %a
1005 %res = sitofp <128 x i16> %op1 to <128 x half>
1006 store <128 x half> %res, ptr %b
1014 ; Don't use SVE for 64-bit vectors.
1015 define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) vscale_range(2,0) #0 {
1016 ; CHECK-LABEL: scvtf_v2i16_v2f32:
1018 ; CHECK-NEXT: shl v0.2s, v0.2s, #16
1019 ; CHECK-NEXT: sshr v0.2s, v0.2s, #16
1020 ; CHECK-NEXT: scvtf v0.2s, v0.2s
1022 %res = sitofp <2 x i16> %op1 to <2 x float>
1023 ret <2 x float> %res
1026 ; Don't use SVE for 128-bit vectors.
1027 define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) vscale_range(2,0) #0 {
1028 ; CHECK-LABEL: scvtf_v4i16_v4f32:
1030 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
1031 ; CHECK-NEXT: scvtf v0.4s, v0.4s
1033 %res = sitofp <4 x i16> %op1 to <4 x float>
1034 ret <4 x float> %res
1037 define void @scvtf_v8i16_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
1038 ; CHECK-LABEL: scvtf_v8i16_v8f32:
1040 ; CHECK-NEXT: ldr q0, [x0]
1041 ; CHECK-NEXT: ptrue p0.s, vl8
1042 ; CHECK-NEXT: sunpklo z0.s, z0.h
1043 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
1044 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
1046 %op1 = load <8 x i16>, ptr %a
1047 %res = sitofp <8 x i16> %op1 to <8 x float>
1048 store <8 x float> %res, ptr %b
1052 define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) #0 {
1053 ; VBITS_GE_256-LABEL: scvtf_v16i16_v16f32:
1054 ; VBITS_GE_256: // %bb.0:
1055 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
1056 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1057 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
1058 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1059 ; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h
1060 ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
1061 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
1062 ; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.s
1063 ; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.s
1064 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
1065 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
1066 ; VBITS_GE_256-NEXT: ret
1068 ; VBITS_GE_512-LABEL: scvtf_v16i16_v16f32:
1069 ; VBITS_GE_512: // %bb.0:
1070 ; VBITS_GE_512-NEXT: ptrue p0.h, vl16
1071 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
1072 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1073 ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
1074 ; VBITS_GE_512-NEXT: scvtf z0.s, p0/m, z0.s
1075 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
1076 ; VBITS_GE_512-NEXT: ret
1077 %op1 = load <16 x i16>, ptr %a
1078 %res = sitofp <16 x i16> %op1 to <16 x float>
1079 store <16 x float> %res, ptr %b
1083 define void @scvtf_v32i16_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
1084 ; CHECK-LABEL: scvtf_v32i16_v32f32:
1086 ; CHECK-NEXT: ptrue p0.h, vl32
1087 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1088 ; CHECK-NEXT: ptrue p0.s, vl32
1089 ; CHECK-NEXT: sunpklo z0.s, z0.h
1090 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
1091 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
1093 %op1 = load <32 x i16>, ptr %a
1094 %res = sitofp <32 x i16> %op1 to <32 x float>
1095 store <32 x float> %res, ptr %b
1099 define void @scvtf_v64i16_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
1100 ; CHECK-LABEL: scvtf_v64i16_v64f32:
1102 ; CHECK-NEXT: ptrue p0.h, vl64
1103 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1104 ; CHECK-NEXT: ptrue p0.s, vl64
1105 ; CHECK-NEXT: sunpklo z0.s, z0.h
1106 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
1107 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
1109 %op1 = load <64 x i16>, ptr %a
1110 %res = sitofp <64 x i16> %op1 to <64 x float>
1111 store <64 x float> %res, ptr %b
1119 ; v1i16 is preferred to be widened to v4i16, which pushes the output into SVE types, so use SVE
1120 define <1 x double> @scvtf_v1i16_v1f64(<1 x i16> %op1) vscale_range(2,0) #0 {
1121 ; CHECK-LABEL: scvtf_v1i16_v1f64:
1123 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
1124 ; CHECK-NEXT: ptrue p0.d, vl4
1125 ; CHECK-NEXT: sunpklo z0.s, z0.h
1126 ; CHECK-NEXT: sunpklo z0.d, z0.s
1127 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1128 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1130 %res = sitofp <1 x i16> %op1 to <1 x double>
1131 ret <1 x double> %res
1134 ; Don't use SVE for 128-bit vectors.
1135 define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) vscale_range(2,0) #0 {
1136 ; CHECK-LABEL: scvtf_v2i16_v2f64:
1138 ; CHECK-NEXT: shl v0.2s, v0.2s, #16
1139 ; CHECK-NEXT: sshr v0.2s, v0.2s, #16
1140 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0
1141 ; CHECK-NEXT: scvtf v0.2d, v0.2d
1143 %res = sitofp <2 x i16> %op1 to <2 x double>
1144 ret <2 x double> %res
1147 define void @scvtf_v4i16_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
1148 ; CHECK-LABEL: scvtf_v4i16_v4f64:
1150 ; CHECK-NEXT: ldr d0, [x0]
1151 ; CHECK-NEXT: ptrue p0.d, vl4
1152 ; CHECK-NEXT: sunpklo z0.s, z0.h
1153 ; CHECK-NEXT: sunpklo z0.d, z0.s
1154 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1155 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1157 %op1 = load <4 x i16>, ptr %a
1158 %res = sitofp <4 x i16> %op1 to <4 x double>
1159 store <4 x double> %res, ptr %b
1163 define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) #0 {
1164 ; VBITS_GE_256-LABEL: scvtf_v8i16_v8f64:
1165 ; VBITS_GE_256: // %bb.0:
1166 ; VBITS_GE_256-NEXT: ldr q0, [x0]
1167 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1168 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1169 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
1170 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
1171 ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
1172 ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
1173 ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
1174 ; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d
1175 ; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d
1176 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
1177 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
1178 ; VBITS_GE_256-NEXT: ret
1180 ; VBITS_GE_512-LABEL: scvtf_v8i16_v8f64:
1181 ; VBITS_GE_512: // %bb.0:
1182 ; VBITS_GE_512-NEXT: ldr q0, [x0]
1183 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1184 ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
1185 ; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
1186 ; VBITS_GE_512-NEXT: scvtf z0.d, p0/m, z0.d
1187 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
1188 ; VBITS_GE_512-NEXT: ret
1189 %op1 = load <8 x i16>, ptr %a
1190 %res = sitofp <8 x i16> %op1 to <8 x double>
1191 store <8 x double> %res, ptr %b
1195 define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
1196 ; CHECK-LABEL: scvtf_v16i16_v16f64:
1198 ; CHECK-NEXT: ptrue p0.h, vl16
1199 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1200 ; CHECK-NEXT: ptrue p0.d, vl16
1201 ; CHECK-NEXT: sunpklo z0.s, z0.h
1202 ; CHECK-NEXT: sunpklo z0.d, z0.s
1203 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1204 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1206 %op1 = load <16 x i16>, ptr %a
1207 %res = sitofp <16 x i16> %op1 to <16 x double>
1208 store <16 x double> %res, ptr %b
1212 define void @scvtf_v32i16_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
1213 ; CHECK-LABEL: scvtf_v32i16_v32f64:
1215 ; CHECK-NEXT: ptrue p0.h, vl32
1216 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1217 ; CHECK-NEXT: ptrue p0.d, vl32
1218 ; CHECK-NEXT: sunpklo z0.s, z0.h
1219 ; CHECK-NEXT: sunpklo z0.d, z0.s
1220 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1221 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1223 %op1 = load <32 x i16>, ptr %a
1224 %res = sitofp <32 x i16> %op1 to <32 x double>
1225 store <32 x double> %res, ptr %b
1233 ; Don't use SVE for 64-bit vectors.
1234 define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) vscale_range(2,0) #0 {
1235 ; CHECK-LABEL: scvtf_v2i32_v2f16:
1237 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1238 ; CHECK-NEXT: scvtf v0.4s, v0.4s
1239 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
1241 %res = sitofp <2 x i32> %op1 to <2 x half>
1245 ; Don't use SVE for 128-bit vectors.
1246 define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) vscale_range(2,0) #0 {
1247 ; CHECK-LABEL: scvtf_v4i32_v4f16:
1249 ; CHECK-NEXT: scvtf v0.4s, v0.4s
1250 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
1252 %res = sitofp <4 x i32> %op1 to <4 x half>
1256 define <8 x half> @scvtf_v8i32_v8f16(ptr %a) vscale_range(2,0) #0 {
1257 ; CHECK-LABEL: scvtf_v8i32_v8f16:
1259 ; CHECK-NEXT: ptrue p0.s, vl8
1260 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1261 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
1262 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
1263 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
1265 %op1 = load <8 x i32>, ptr %a
1266 %res = sitofp <8 x i32> %op1 to <8 x half>
1270 define void @scvtf_v16i32_v16f16(ptr %a, ptr %b) #0 {
1271 ; VBITS_GE_256-LABEL: scvtf_v16i32_v16f16:
1272 ; VBITS_GE_256: // %bb.0:
1273 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1274 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1275 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1276 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
1277 ; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.s
1278 ; VBITS_GE_256-NEXT: scvtf z1.h, p0/m, z1.s
1279 ; VBITS_GE_256-NEXT: ptrue p0.h, vl8
1280 ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
1281 ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
1282 ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h
1283 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
1284 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
1285 ; VBITS_GE_256-NEXT: ret
1287 ; VBITS_GE_512-LABEL: scvtf_v16i32_v16f16:
1288 ; VBITS_GE_512: // %bb.0:
1289 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1290 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1291 ; VBITS_GE_512-NEXT: scvtf z0.h, p0/m, z0.s
1292 ; VBITS_GE_512-NEXT: ptrue p0.h, vl16
1293 ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
1294 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
1295 ; VBITS_GE_512-NEXT: ret
1296 %op1 = load <16 x i32>, ptr %a
1297 %res = sitofp <16 x i32> %op1 to <16 x half>
1298 store <16 x half> %res, ptr %b
1302 define void @scvtf_v32i32_v32f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
1303 ; CHECK-LABEL: scvtf_v32i32_v32f16:
1305 ; CHECK-NEXT: ptrue p0.s, vl32
1306 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1307 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
1308 ; CHECK-NEXT: ptrue p0.h, vl32
1309 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
1310 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
1312 %op1 = load <32 x i32>, ptr %a
1313 %res = sitofp <32 x i32> %op1 to <32 x half>
1314 store <32 x half> %res, ptr %b
1318 define void @scvtf_v64i32_v64f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
1319 ; CHECK-LABEL: scvtf_v64i32_v64f16:
1321 ; CHECK-NEXT: ptrue p0.s, vl64
1322 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1323 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
1324 ; CHECK-NEXT: ptrue p0.h, vl64
1325 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
1326 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
1328 %op1 = load <64 x i32>, ptr %a
1329 %res = sitofp <64 x i32> %op1 to <64 x half>
1330 store <64 x half> %res, ptr %b
1338 ; Don't use SVE for 64-bit vectors.
1339 define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) vscale_range(2,0) #0 {
1340 ; CHECK-LABEL: scvtf_v2i32_v2f32:
1342 ; CHECK-NEXT: scvtf v0.2s, v0.2s
1344 %res = sitofp <2 x i32> %op1 to <2 x float>
1345 ret <2 x float> %res
1348 ; Don't use SVE for 128-bit vectors.
1349 define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) vscale_range(2,0) #0 {
1350 ; CHECK-LABEL: scvtf_v4i32_v4f32:
1352 ; CHECK-NEXT: scvtf v0.4s, v0.4s
1354 %res = sitofp <4 x i32> %op1 to <4 x float>
1355 ret <4 x float> %res
1358 define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
1359 ; CHECK-LABEL: scvtf_v8i32_v8f32:
1361 ; CHECK-NEXT: ptrue p0.s, vl8
1362 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1363 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
1364 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
1366 %op1 = load <8 x i32>, ptr %a
1367 %res = sitofp <8 x i32> %op1 to <8 x float>
1368 store <8 x float> %res, ptr %b
1372 define void @scvtf_v16i32_v16f32(ptr %a, ptr %b) #0 {
1373 ; VBITS_GE_256-LABEL: scvtf_v16i32_v16f32:
1374 ; VBITS_GE_256: // %bb.0:
1375 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1376 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1377 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1378 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
1379 ; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.s
1380 ; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.s
1381 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
1382 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
1383 ; VBITS_GE_256-NEXT: ret
1385 ; VBITS_GE_512-LABEL: scvtf_v16i32_v16f32:
1386 ; VBITS_GE_512: // %bb.0:
1387 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1388 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1389 ; VBITS_GE_512-NEXT: scvtf z0.s, p0/m, z0.s
1390 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
1391 ; VBITS_GE_512-NEXT: ret
1392 %op1 = load <16 x i32>, ptr %a
1393 %res = sitofp <16 x i32> %op1 to <16 x float>
1394 store <16 x float> %res, ptr %b
1398 define void @scvtf_v32i32_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
1399 ; CHECK-LABEL: scvtf_v32i32_v32f32:
1401 ; CHECK-NEXT: ptrue p0.s, vl32
1402 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1403 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
1404 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
1406 %op1 = load <32 x i32>, ptr %a
1407 %res = sitofp <32 x i32> %op1 to <32 x float>
1408 store <32 x float> %res, ptr %b
1412 define void @scvtf_v64i32_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
1413 ; CHECK-LABEL: scvtf_v64i32_v64f32:
1415 ; CHECK-NEXT: ptrue p0.s, vl64
1416 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1417 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
1418 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
1420 %op1 = load <64 x i32>, ptr %a
1421 %res = sitofp <64 x i32> %op1 to <64 x float>
1422 store <64 x float> %res, ptr %b
1430 ; Don't use SVE for 64-bit vectors.
1431 define <1 x double> @scvtf_v1i32_v1f64(<1 x i32> %op1) vscale_range(2,0) #0 {
1432 ; CHECK-LABEL: scvtf_v1i32_v1f64:
1434 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0
1435 ; CHECK-NEXT: scvtf v0.2d, v0.2d
1436 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
1438 %res = sitofp <1 x i32> %op1 to <1 x double>
1439 ret <1 x double> %res
1442 ; Don't use SVE for 128-bit vectors.
1443 define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) vscale_range(2,0) #0 {
1444 ; CHECK-LABEL: scvtf_v2i32_v2f64:
1446 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0
1447 ; CHECK-NEXT: scvtf v0.2d, v0.2d
1449 %res = sitofp <2 x i32> %op1 to <2 x double>
1450 ret <2 x double> %res
1453 define void @scvtf_v4i32_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
1454 ; CHECK-LABEL: scvtf_v4i32_v4f64:
1456 ; CHECK-NEXT: ldr q0, [x0]
1457 ; CHECK-NEXT: ptrue p0.d, vl4
1458 ; CHECK-NEXT: sunpklo z0.d, z0.s
1459 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1460 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1462 %op1 = load <4 x i32>, ptr %a
1463 %res = sitofp <4 x i32> %op1 to <4 x double>
1464 store <4 x double> %res, ptr %b
1468 define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) #0 {
1469 ; VBITS_GE_256-LABEL: scvtf_v8i32_v8f64:
1470 ; VBITS_GE_256: // %bb.0:
1471 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1472 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1473 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
1474 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1475 ; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s
1476 ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
1477 ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
1478 ; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d
1479 ; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d
1480 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
1481 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
1482 ; VBITS_GE_256-NEXT: ret
1484 ; VBITS_GE_512-LABEL: scvtf_v8i32_v8f64:
1485 ; VBITS_GE_512: // %bb.0:
1486 ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
1487 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1488 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1489 ; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
1490 ; VBITS_GE_512-NEXT: scvtf z0.d, p0/m, z0.d
1491 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
1492 ; VBITS_GE_512-NEXT: ret
1493 %op1 = load <8 x i32>, ptr %a
1494 %res = sitofp <8 x i32> %op1 to <8 x double>
1495 store <8 x double> %res, ptr %b
1499 define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
1500 ; CHECK-LABEL: scvtf_v16i32_v16f64:
1502 ; CHECK-NEXT: ptrue p0.s, vl16
1503 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1504 ; CHECK-NEXT: ptrue p0.d, vl16
1505 ; CHECK-NEXT: sunpklo z0.d, z0.s
1506 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1507 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1509 %op1 = load <16 x i32>, ptr %a
1510 %res = sitofp <16 x i32> %op1 to <16 x double>
1511 store <16 x double> %res, ptr %b
1515 define void @scvtf_v32i32_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
1516 ; CHECK-LABEL: scvtf_v32i32_v32f64:
1518 ; CHECK-NEXT: ptrue p0.s, vl32
1519 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1520 ; CHECK-NEXT: ptrue p0.d, vl32
1521 ; CHECK-NEXT: sunpklo z0.d, z0.s
1522 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1523 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1525 %op1 = load <32 x i32>, ptr %a
1526 %res = sitofp <32 x i32> %op1 to <32 x double>
1527 store <32 x double> %res, ptr %b
1535 ; Don't use SVE for 64-bit vectors.
1536 define <1 x half> @scvtf_v1i64_v1f16(<1 x i64> %op1) vscale_range(2,0) #0 {
1537 ; CHECK-LABEL: scvtf_v1i64_v1f16:
1539 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1540 ; CHECK-NEXT: fmov x8, d0
1541 ; CHECK-NEXT: scvtf h0, x8
1543 %res = sitofp <1 x i64> %op1 to <1 x half>
1547 ; v2f16 is not legal for NEON, so use SVE
1548 define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) vscale_range(2,0) #0 {
1549 ; CHECK-LABEL: scvtf_v2i64_v2f16:
1551 ; CHECK-NEXT: ptrue p0.d, vl4
1552 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
1553 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.d
1554 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
1555 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
1556 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1558 %res = sitofp <2 x i64> %op1 to <2 x half>
1562 define <4 x half> @scvtf_v4i64_v4f16(ptr %a) vscale_range(2,0) #0 {
1563 ; CHECK-LABEL: scvtf_v4i64_v4f16:
1565 ; CHECK-NEXT: ptrue p0.d, vl4
1566 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1567 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.d
1568 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
1569 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
1570 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1572 %op1 = load <4 x i64>, ptr %a
1573 %res = sitofp <4 x i64> %op1 to <4 x half>
1577 define <8 x half> @scvtf_v8i64_v8f16(ptr %a) #0 {
1578 ; VBITS_GE_256-LABEL: scvtf_v8i64_v8f16:
1579 ; VBITS_GE_256: // %bb.0:
1580 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1581 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1582 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1583 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
1584 ; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.d
1585 ; VBITS_GE_256-NEXT: scvtf z1.h, p0/m, z1.d
1586 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
1587 ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
1588 ; VBITS_GE_256-NEXT: uzp1 z2.h, z0.h, z0.h
1589 ; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
1590 ; VBITS_GE_256-NEXT: mov v0.d[1], v2.d[0]
1591 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
1592 ; VBITS_GE_256-NEXT: ret
1594 ; VBITS_GE_512-LABEL: scvtf_v8i64_v8f16:
1595 ; VBITS_GE_512: // %bb.0:
1596 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1597 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1598 ; VBITS_GE_512-NEXT: scvtf z0.h, p0/m, z0.d
1599 ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
1600 ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
1601 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
1602 ; VBITS_GE_512-NEXT: ret
1603 %op1 = load <8 x i64>, ptr %a
1604 %res = sitofp <8 x i64> %op1 to <8 x half>
1608 define void @scvtf_v16i64_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
1609 ; CHECK-LABEL: scvtf_v16i64_v16f16:
1611 ; CHECK-NEXT: ptrue p0.d, vl16
1612 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1613 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.d
1614 ; CHECK-NEXT: ptrue p0.s, vl16
1615 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
1616 ; CHECK-NEXT: st1h { z0.s }, p0, [x1]
1618 %op1 = load <16 x i64>, ptr %a
1619 %res = sitofp <16 x i64> %op1 to <16 x half>
1620 store <16 x half> %res, ptr %b
1624 define void @scvtf_v32i64_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
1625 ; CHECK-LABEL: scvtf_v32i64_v32f16:
1627 ; CHECK-NEXT: ptrue p0.d, vl32
1628 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1629 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.d
1630 ; CHECK-NEXT: ptrue p0.s, vl32
1631 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
1632 ; CHECK-NEXT: st1h { z0.s }, p0, [x1]
1634 %op1 = load <32 x i64>, ptr %a
1635 %res = sitofp <32 x i64> %op1 to <32 x half>
1636 store <32 x half> %res, ptr %b
1644 ; Don't use SVE for 64-bit vectors.
1645 define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
1646 ; CHECK-LABEL: scvtf_v1i64_v1f32:
1648 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1649 ; CHECK-NEXT: scvtf v0.2d, v0.2d
1650 ; CHECK-NEXT: fcvtn v0.2s, v0.2d
1652 %res = sitofp <1 x i64> %op1 to <1 x float>
1653 ret <1 x float> %res
1656 ; Don't use SVE for 128-bit vectors.
1657 define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) vscale_range(2,0) #0 {
1658 ; CHECK-LABEL: scvtf_v2i64_v2f32:
1660 ; CHECK-NEXT: scvtf v0.2d, v0.2d
1661 ; CHECK-NEXT: fcvtn v0.2s, v0.2d
1663 %res = sitofp <2 x i64> %op1 to <2 x float>
1664 ret <2 x float> %res
1667 define <4 x float> @scvtf_v4i64_v4f32(ptr %a) vscale_range(2,0) #0 {
1668 ; CHECK-LABEL: scvtf_v4i64_v4f32:
1670 ; CHECK-NEXT: ptrue p0.d, vl4
1671 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1672 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.d
1673 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
1674 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
1676 %op1 = load <4 x i64>, ptr %a
1677 %res = sitofp <4 x i64> %op1 to <4 x float>
1678 ret <4 x float> %res
1681 define void @scvtf_v8i64_v8f32(ptr %a, ptr %b) #0 {
1682 ; VBITS_GE_256-LABEL: scvtf_v8i64_v8f32:
1683 ; VBITS_GE_256: // %bb.0:
1684 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1685 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1686 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1687 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
1688 ; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.d
1689 ; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.d
1690 ; VBITS_GE_256-NEXT: ptrue p0.s, vl4
1691 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
1692 ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
1693 ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
1694 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1695 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
1696 ; VBITS_GE_256-NEXT: ret
1698 ; VBITS_GE_512-LABEL: scvtf_v8i64_v8f32:
1699 ; VBITS_GE_512: // %bb.0:
1700 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1701 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1702 ; VBITS_GE_512-NEXT: scvtf z0.s, p0/m, z0.d
1703 ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
1704 ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
1705 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
1706 ; VBITS_GE_512-NEXT: ret
1707 %op1 = load <8 x i64>, ptr %a
1708 %res = sitofp <8 x i64> %op1 to <8 x float>
1709 store <8 x float> %res, ptr %b
1713 define void @scvtf_v16i64_v16f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
1714 ; CHECK-LABEL: scvtf_v16i64_v16f32:
1716 ; CHECK-NEXT: ptrue p0.d, vl16
1717 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1718 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.d
1719 ; CHECK-NEXT: ptrue p0.s, vl16
1720 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
1721 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
1723 %op1 = load <16 x i64>, ptr %a
1724 %res = sitofp <16 x i64> %op1 to <16 x float>
1725 store <16 x float> %res, ptr %b
1729 define void @scvtf_v32i64_v32f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
1730 ; CHECK-LABEL: scvtf_v32i64_v32f32:
1732 ; CHECK-NEXT: ptrue p0.d, vl32
1733 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1734 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.d
1735 ; CHECK-NEXT: ptrue p0.s, vl32
1736 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
1737 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
1739 %op1 = load <32 x i64>, ptr %a
1740 %res = sitofp <32 x i64> %op1 to <32 x float>
1741 store <32 x float> %res, ptr %b
1749 ; Don't use SVE for 64-bit vectors.
1750 define <1 x double> @scvtf_v1i64_v1f64(<1 x i64> %op1) vscale_range(2,0) #0 {
1751 ; CHECK-LABEL: scvtf_v1i64_v1f64:
1753 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1754 ; CHECK-NEXT: scvtf d0, d0
1756 %res = sitofp <1 x i64> %op1 to <1 x double>
1757 ret <1 x double> %res
1760 ; Don't use SVE for 128-bit vectors.
1761 define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) vscale_range(2,0) #0 {
1762 ; CHECK-LABEL: scvtf_v2i64_v2f64:
1764 ; CHECK-NEXT: scvtf v0.2d, v0.2d
1766 %res = sitofp <2 x i64> %op1 to <2 x double>
1767 ret <2 x double> %res
1770 define void @scvtf_v4i64_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
1771 ; CHECK-LABEL: scvtf_v4i64_v4f64:
1773 ; CHECK-NEXT: ptrue p0.d, vl4
1774 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1775 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1776 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1778 %op1 = load <4 x i64>, ptr %a
1779 %res = sitofp <4 x i64> %op1 to <4 x double>
1780 store <4 x double> %res, ptr %b
1784 define void @scvtf_v8i64_v8f64(ptr %a, ptr %b) #0 {
1785 ; VBITS_GE_256-LABEL: scvtf_v8i64_v8f64:
1786 ; VBITS_GE_256: // %bb.0:
1787 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1788 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1789 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1790 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
1791 ; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d
1792 ; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d
1793 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
1794 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
1795 ; VBITS_GE_256-NEXT: ret
1797 ; VBITS_GE_512-LABEL: scvtf_v8i64_v8f64:
1798 ; VBITS_GE_512: // %bb.0:
1799 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1800 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1801 ; VBITS_GE_512-NEXT: scvtf z0.d, p0/m, z0.d
1802 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
1803 ; VBITS_GE_512-NEXT: ret
1804 %op1 = load <8 x i64>, ptr %a
1805 %res = sitofp <8 x i64> %op1 to <8 x double>
1806 store <8 x double> %res, ptr %b
1810 define void @scvtf_v16i64_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
1811 ; CHECK-LABEL: scvtf_v16i64_v16f64:
1813 ; CHECK-NEXT: ptrue p0.d, vl16
1814 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1815 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1816 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1818 %op1 = load <16 x i64>, ptr %a
1819 %res = sitofp <16 x i64> %op1 to <16 x double>
1820 store <16 x double> %res, ptr %b
1824 define void @scvtf_v32i64_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
1825 ; CHECK-LABEL: scvtf_v32i64_v32f64:
1827 ; CHECK-NEXT: ptrue p0.d, vl32
1828 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1829 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1830 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1832 %op1 = load <32 x i64>, ptr %a
1833 %res = sitofp <32 x i64> %op1 to <32 x double>
1834 store <32 x double> %res, ptr %b
1838 attributes #0 = { "target-features"="+sve" }