1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
4 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
6 target triple = "aarch64-unknown-linux-gnu"
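; The three RUN lines above compile the same IR while guaranteeing different
; minimum SVE register widths: the 256-bit run is matched against the
; VBITS_GE_256 prefix, while the 512-bit and 2048-bit runs share the
; VBITS_GE_512 prefix, i.e. the 2048-bit run is expected to produce the same
; code as the 512-bit run. As the NOTE above says, the assertions are
; autogenerated; if the IR changes they should be regenerated rather than
; edited by hand. A typical invocation from an LLVM source checkout is sketched
; below (the in-tree path of this test is assumed here for illustration):
;
;   llvm/utils/update_llc_test_checks.py \
;       llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll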
12 ; Don't use SVE for 64-bit vectors.
13 define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) vscale_range(2,0) #0 {
14 ; CHECK-LABEL: ucvtf_v4i16_v4f16:
16 ; CHECK-NEXT: ucvtf v0.4h, v0.4h
18 %res = uitofp <4 x i16> %op1 to <4 x half>
22 ; Don't use SVE for 128-bit vectors.
23 define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
24 ; CHECK-LABEL: ucvtf_v8i16_v8f16:
26 ; CHECK-NEXT: ldr q0, [x0]
27 ; CHECK-NEXT: ucvtf v0.8h, v0.8h
28 ; CHECK-NEXT: str q0, [x1]
30 %op1 = load <8 x i16>, ptr %a
31 %res = uitofp <8 x i16> %op1 to <8 x half>
32 store <8 x half> %res, ptr %b
36 define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
37 ; CHECK-LABEL: ucvtf_v16i16_v16f16:
39 ; CHECK-NEXT: ptrue p0.h, vl16
40 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
41 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h
42 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
44 %op1 = load <16 x i16>, ptr %a
45 %res = uitofp <16 x i16> %op1 to <16 x half>
46 store <16 x half> %res, ptr %b
50 define void @ucvtf_v32i16_v32f16(ptr %a, ptr %b) #0 {
51 ; VBITS_GE_256-LABEL: ucvtf_v32i16_v32f16:
52 ; VBITS_GE_256: // %bb.0:
53 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
54 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
55 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
56 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
57 ; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.h
58 ; VBITS_GE_256-NEXT: ucvtf z1.h, p0/m, z1.h
59 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
60 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
61 ; VBITS_GE_256-NEXT: ret
63 ; VBITS_GE_512-LABEL: ucvtf_v32i16_v32f16:
64 ; VBITS_GE_512: // %bb.0:
65 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
66 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
67 ; VBITS_GE_512-NEXT: ucvtf z0.h, p0/m, z0.h
68 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
69 ; VBITS_GE_512-NEXT: ret
70 %op1 = load <32 x i16>, ptr %a
71 %res = uitofp <32 x i16> %op1 to <32 x half>
72 store <32 x half> %res, ptr %b
76 define void @ucvtf_v64i16_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
77 ; CHECK-LABEL: ucvtf_v64i16_v64f16:
79 ; CHECK-NEXT: ptrue p0.h, vl64
80 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
81 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h
82 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
84 %op1 = load <64 x i16>, ptr %a
85 %res = uitofp <64 x i16> %op1 to <64 x half>
86 store <64 x half> %res, ptr %b
90 define void @ucvtf_v128i16_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
91 ; CHECK-LABEL: ucvtf_v128i16_v128f16:
93 ; CHECK-NEXT: ptrue p0.h, vl128
94 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
95 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h
96 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
98 %op1 = load <128 x i16>, ptr %a
99 %res = uitofp <128 x i16> %op1 to <128 x half>
100 store <128 x half> %res, ptr %b
108 ; Don't use SVE for 64-bit vectors.
109 define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) vscale_range(2,0) #0 {
110 ; CHECK-LABEL: ucvtf_v2i16_v2f32:
112 ; CHECK-NEXT: movi d1, #0x00ffff0000ffff
113 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
114 ; CHECK-NEXT: ucvtf v0.2s, v0.2s
116 %res = uitofp <2 x i16> %op1 to <2 x float>
120 ; Don't use SVE for 128-bit vectors.
121 define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) vscale_range(2,0) #0 {
122 ; CHECK-LABEL: ucvtf_v4i16_v4f32:
124 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
125 ; CHECK-NEXT: ucvtf v0.4s, v0.4s
127 %res = uitofp <4 x i16> %op1 to <4 x float>
131 define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
132 ; CHECK-LABEL: ucvtf_v8i16_v8f32:
134 ; CHECK-NEXT: ptrue p0.s, vl8
135 ; CHECK-NEXT: ldr q0, [x0]
136 ; CHECK-NEXT: uunpklo z0.s, z0.h
137 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
138 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
140 %op1 = load <8 x i16>, ptr %a
141 %res = uitofp <8 x i16> %op1 to <8 x float>
142 store <8 x float> %res, ptr %b
146 define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) #0 {
147 ; VBITS_GE_256-LABEL: ucvtf_v16i16_v16f32:
148 ; VBITS_GE_256: // %bb.0:
149 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
150 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
151 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
152 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
153 ; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
154 ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
155 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
156 ; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.s
157 ; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.s
158 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
159 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
160 ; VBITS_GE_256-NEXT: ret
162 ; VBITS_GE_512-LABEL: ucvtf_v16i16_v16f32:
163 ; VBITS_GE_512: // %bb.0:
164 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
165 ; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x0]
166 ; VBITS_GE_512-NEXT: ucvtf z0.s, p0/m, z0.s
167 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
168 ; VBITS_GE_512-NEXT: ret
169 %op1 = load <16 x i16>, ptr %a
170 %res = uitofp <16 x i16> %op1 to <16 x float>
171 store <16 x float> %res, ptr %b
175 define void @ucvtf_v32i16_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
176 ; CHECK-LABEL: ucvtf_v32i16_v32f32:
178 ; CHECK-NEXT: ptrue p0.s, vl32
179 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
180 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
181 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
183 %op1 = load <32 x i16>, ptr %a
184 %res = uitofp <32 x i16> %op1 to <32 x float>
185 store <32 x float> %res, ptr %b
189 define void @ucvtf_v64i16_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
190 ; CHECK-LABEL: ucvtf_v64i16_v64f32:
192 ; CHECK-NEXT: ptrue p0.s, vl64
193 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
194 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
195 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
197 %op1 = load <64 x i16>, ptr %a
198 %res = uitofp <64 x i16> %op1 to <64 x float>
199 store <64 x float> %res, ptr %b
207 ; v1i16 is preferred to be widened to v4i16, which pushes the output into SVE types, so use SVE
208 define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) vscale_range(2,0) #0 {
209 ; CHECK-LABEL: ucvtf_v1i16_v1f64:
211 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
212 ; CHECK-NEXT: ptrue p0.d, vl4
213 ; CHECK-NEXT: uunpklo z0.s, z0.h
214 ; CHECK-NEXT: uunpklo z0.d, z0.s
215 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
216 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
218 %res = uitofp <1 x i16> %op1 to <1 x double>
219 ret <1 x double> %res
222 ; Don't use SVE for 128-bit vectors.
223 define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) vscale_range(2,0) #0 {
224 ; CHECK-LABEL: ucvtf_v2i16_v2f64:
226 ; CHECK-NEXT: movi d1, #0x00ffff0000ffff
227 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
228 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
229 ; CHECK-NEXT: ucvtf v0.2d, v0.2d
231 %res = uitofp <2 x i16> %op1 to <2 x double>
232 ret <2 x double> %res
235 define void @ucvtf_v4i16_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
236 ; CHECK-LABEL: ucvtf_v4i16_v4f64:
238 ; CHECK-NEXT: ldr d0, [x0]
239 ; CHECK-NEXT: ptrue p0.d, vl4
240 ; CHECK-NEXT: uunpklo z0.s, z0.h
241 ; CHECK-NEXT: uunpklo z0.d, z0.s
242 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
243 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
245 %op1 = load <4 x i16>, ptr %a
246 %res = uitofp <4 x i16> %op1 to <4 x double>
247 store <4 x double> %res, ptr %b
251 define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) #0 {
252 ; VBITS_GE_256-LABEL: ucvtf_v8i16_v8f64:
253 ; VBITS_GE_256: // %bb.0:
254 ; VBITS_GE_256-NEXT: ldr q0, [x0]
255 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
256 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
257 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
258 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
259 ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h
260 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
261 ; VBITS_GE_256-NEXT: uunpklo z1.d, z1.s
262 ; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d
263 ; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d
264 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
265 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
266 ; VBITS_GE_256-NEXT: ret
268 ; VBITS_GE_512-LABEL: ucvtf_v8i16_v8f64:
269 ; VBITS_GE_512: // %bb.0:
270 ; VBITS_GE_512-NEXT: ldr q0, [x0]
271 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
272 ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
273 ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
274 ; VBITS_GE_512-NEXT: ucvtf z0.d, p0/m, z0.d
275 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
276 ; VBITS_GE_512-NEXT: ret
277 %op1 = load <8 x i16>, ptr %a
278 %res = uitofp <8 x i16> %op1 to <8 x double>
279 store <8 x double> %res, ptr %b
283 define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
284 ; CHECK-LABEL: ucvtf_v16i16_v16f64:
286 ; CHECK-NEXT: ptrue p0.d, vl16
287 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
288 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
289 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
291 %op1 = load <16 x i16>, ptr %a
292 %res = uitofp <16 x i16> %op1 to <16 x double>
293 store <16 x double> %res, ptr %b
297 define void @ucvtf_v32i16_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
298 ; CHECK-LABEL: ucvtf_v32i16_v32f64:
300 ; CHECK-NEXT: ptrue p0.d, vl32
301 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
302 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
303 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
305 %op1 = load <32 x i16>, ptr %a
306 %res = uitofp <32 x i16> %op1 to <32 x double>
307 store <32 x double> %res, ptr %b
315 ; Don't use SVE for 64-bit vectors.
316 define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) vscale_range(2,0) #0 {
317 ; CHECK-LABEL: ucvtf_v2i32_v2f16:
319 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
320 ; CHECK-NEXT: ucvtf v0.4s, v0.4s
321 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
323 %res = uitofp <2 x i32> %op1 to <2 x half>
327 ; Don't use SVE for 128-bit vectors.
328 define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) vscale_range(2,0) #0 {
329 ; CHECK-LABEL: ucvtf_v4i32_v4f16:
331 ; CHECK-NEXT: ucvtf v0.4s, v0.4s
332 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
334 %res = uitofp <4 x i32> %op1 to <4 x half>
338 define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) vscale_range(2,0) #0 {
339 ; CHECK-LABEL: ucvtf_v8i32_v8f16:
341 ; CHECK-NEXT: ptrue p0.s, vl8
342 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
343 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
344 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
345 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
347 %op1 = load <8 x i32>, ptr %a
348 %res = uitofp <8 x i32> %op1 to <8 x half>
352 define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) #0 {
353 ; VBITS_GE_256-LABEL: ucvtf_v16i32_v16f16:
354 ; VBITS_GE_256: // %bb.0:
355 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
356 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
357 ; VBITS_GE_256-NEXT: ptrue p1.h, vl16
358 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
359 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
360 ; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.s
361 ; VBITS_GE_256-NEXT: ucvtf z1.h, p0/m, z1.s
362 ; VBITS_GE_256-NEXT: ptrue p0.h, vl8
363 ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
364 ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
365 ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h
366 ; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1]
367 ; VBITS_GE_256-NEXT: ret
369 ; VBITS_GE_512-LABEL: ucvtf_v16i32_v16f16:
370 ; VBITS_GE_512: // %bb.0:
371 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
372 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
373 ; VBITS_GE_512-NEXT: ucvtf z0.h, p0/m, z0.s
374 ; VBITS_GE_512-NEXT: ptrue p0.h, vl16
375 ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
376 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
377 ; VBITS_GE_512-NEXT: ret
378 %op1 = load <16 x i32>, ptr %a
379 %res = uitofp <16 x i32> %op1 to <16 x half>
380 store <16 x half> %res, ptr %b
384 define void @ucvtf_v32i32_v32f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
385 ; CHECK-LABEL: ucvtf_v32i32_v32f16:
387 ; CHECK-NEXT: ptrue p0.s, vl32
388 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
389 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
390 ; CHECK-NEXT: ptrue p0.h, vl32
391 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
392 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
394 %op1 = load <32 x i32>, ptr %a
395 %res = uitofp <32 x i32> %op1 to <32 x half>
396 store <32 x half> %res, ptr %b
400 define void @ucvtf_v64i32_v64f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
401 ; CHECK-LABEL: ucvtf_v64i32_v64f16:
403 ; CHECK-NEXT: ptrue p0.s, vl64
404 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
405 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s
406 ; CHECK-NEXT: ptrue p0.h, vl64
407 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
408 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
410 %op1 = load <64 x i32>, ptr %a
411 %res = uitofp <64 x i32> %op1 to <64 x half>
412 store <64 x half> %res, ptr %b
420 ; Don't use SVE for 64-bit vectors.
421 define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) vscale_range(2,0) #0 {
422 ; CHECK-LABEL: ucvtf_v2i32_v2f32:
424 ; CHECK-NEXT: ucvtf v0.2s, v0.2s
426 %res = uitofp <2 x i32> %op1 to <2 x float>
430 ; Don't use SVE for 128-bit vectors.
431 define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) vscale_range(2,0) #0 {
432 ; CHECK-LABEL: ucvtf_v4i32_v4f32:
434 ; CHECK-NEXT: ucvtf v0.4s, v0.4s
436 %res = uitofp <4 x i32> %op1 to <4 x float>
440 define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
441 ; CHECK-LABEL: ucvtf_v8i32_v8f32:
443 ; CHECK-NEXT: ptrue p0.s, vl8
444 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
445 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
446 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
448 %op1 = load <8 x i32>, ptr %a
449 %res = uitofp <8 x i32> %op1 to <8 x float>
450 store <8 x float> %res, ptr %b
454 define void @ucvtf_v16i32_v16f32(ptr %a, ptr %b) #0 {
455 ; VBITS_GE_256-LABEL: ucvtf_v16i32_v16f32:
456 ; VBITS_GE_256: // %bb.0:
457 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
458 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
459 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
460 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
461 ; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.s
462 ; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.s
463 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
464 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
465 ; VBITS_GE_256-NEXT: ret
467 ; VBITS_GE_512-LABEL: ucvtf_v16i32_v16f32:
468 ; VBITS_GE_512: // %bb.0:
469 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
470 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
471 ; VBITS_GE_512-NEXT: ucvtf z0.s, p0/m, z0.s
472 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
473 ; VBITS_GE_512-NEXT: ret
474 %op1 = load <16 x i32>, ptr %a
475 %res = uitofp <16 x i32> %op1 to <16 x float>
476 store <16 x float> %res, ptr %b
480 define void @ucvtf_v32i32_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
481 ; CHECK-LABEL: ucvtf_v32i32_v32f32:
483 ; CHECK-NEXT: ptrue p0.s, vl32
484 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
485 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
486 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
488 %op1 = load <32 x i32>, ptr %a
489 %res = uitofp <32 x i32> %op1 to <32 x float>
490 store <32 x float> %res, ptr %b
494 define void @ucvtf_v64i32_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
495 ; CHECK-LABEL: ucvtf_v64i32_v64f32:
497 ; CHECK-NEXT: ptrue p0.s, vl64
498 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
499 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s
500 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
502 %op1 = load <64 x i32>, ptr %a
503 %res = uitofp <64 x i32> %op1 to <64 x float>
504 store <64 x float> %res, ptr %b
512 ; Don't use SVE for 64-bit vectors.
513 define <1 x double> @ucvtf_v1i32_v1f64(<1 x i32> %op1) vscale_range(2,0) #0 {
514 ; CHECK-LABEL: ucvtf_v1i32_v1f64:
516 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
517 ; CHECK-NEXT: ucvtf v0.2d, v0.2d
518 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
520 %res = uitofp <1 x i32> %op1 to <1 x double>
521 ret <1 x double> %res
524 ; Don't use SVE for 128-bit vectors.
525 define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) vscale_range(2,0) #0 {
526 ; CHECK-LABEL: ucvtf_v2i32_v2f64:
528 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
529 ; CHECK-NEXT: ucvtf v0.2d, v0.2d
531 %res = uitofp <2 x i32> %op1 to <2 x double>
532 ret <2 x double> %res
535 define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
536 ; CHECK-LABEL: ucvtf_v4i32_v4f64:
538 ; CHECK-NEXT: ptrue p0.d, vl4
539 ; CHECK-NEXT: ldr q0, [x0]
540 ; CHECK-NEXT: uunpklo z0.d, z0.s
541 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
542 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
544 %op1 = load <4 x i32>, ptr %a
545 %res = uitofp <4 x i32> %op1 to <4 x double>
546 store <4 x double> %res, ptr %b
550 define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) #0 {
551 ; VBITS_GE_256-LABEL: ucvtf_v8i32_v8f64:
552 ; VBITS_GE_256: // %bb.0:
553 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
554 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
555 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
556 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
557 ; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s
558 ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
559 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s
560 ; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d
561 ; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d
562 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
563 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
564 ; VBITS_GE_256-NEXT: ret
566 ; VBITS_GE_512-LABEL: ucvtf_v8i32_v8f64:
567 ; VBITS_GE_512: // %bb.0:
568 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
569 ; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x0]
570 ; VBITS_GE_512-NEXT: ucvtf z0.d, p0/m, z0.d
571 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
572 ; VBITS_GE_512-NEXT: ret
573 %op1 = load <8 x i32>, ptr %a
574 %res = uitofp <8 x i32> %op1 to <8 x double>
575 store <8 x double> %res, ptr %b
579 define void @ucvtf_v16i32_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
580 ; CHECK-LABEL: ucvtf_v16i32_v16f64:
582 ; CHECK-NEXT: ptrue p0.d, vl16
583 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
584 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
585 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
587 %op1 = load <16 x i32>, ptr %a
588 %res = uitofp <16 x i32> %op1 to <16 x double>
589 store <16 x double> %res, ptr %b
593 define void @ucvtf_v32i32_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
594 ; CHECK-LABEL: ucvtf_v32i32_v32f64:
596 ; CHECK-NEXT: ptrue p0.d, vl32
597 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
598 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
599 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
601 %op1 = load <32 x i32>, ptr %a
602 %res = uitofp <32 x i32> %op1 to <32 x double>
603 store <32 x double> %res, ptr %b
611 ; Don't use SVE for 64-bit vectors.
612 define <1 x half> @ucvtf_v1i64_v1f16(<1 x i64> %op1) vscale_range(2,0) #0 {
613 ; CHECK-LABEL: ucvtf_v1i64_v1f16:
615 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
616 ; CHECK-NEXT: fmov x8, d0
617 ; CHECK-NEXT: ucvtf h0, x8
619 %res = uitofp <1 x i64> %op1 to <1 x half>
623 ; v2f16 is not legal for NEON, so use SVE
624 define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) vscale_range(2,0) #0 {
625 ; CHECK-LABEL: ucvtf_v2i64_v2f16:
627 ; CHECK-NEXT: ptrue p0.d, vl4
628 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
629 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d
630 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
631 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
632 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
634 %res = uitofp <2 x i64> %op1 to <2 x half>
638 define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) vscale_range(2,0) #0 {
639 ; CHECK-LABEL: ucvtf_v4i64_v4f16:
641 ; CHECK-NEXT: ptrue p0.d, vl4
642 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
643 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d
644 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
645 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
646 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
648 %op1 = load <4 x i64>, ptr %a
649 %res = uitofp <4 x i64> %op1 to <4 x half>
653 define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) #0 {
654 ; VBITS_GE_256-LABEL: ucvtf_v8i64_v8f16:
655 ; VBITS_GE_256: // %bb.0:
656 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
657 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
658 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
659 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
660 ; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.d
661 ; VBITS_GE_256-NEXT: ucvtf z1.h, p0/m, z1.d
662 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
663 ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
664 ; VBITS_GE_256-NEXT: uzp1 z2.h, z0.h, z0.h
665 ; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
666 ; VBITS_GE_256-NEXT: mov v0.d[1], v2.d[0]
667 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
668 ; VBITS_GE_256-NEXT: ret
670 ; VBITS_GE_512-LABEL: ucvtf_v8i64_v8f16:
671 ; VBITS_GE_512: // %bb.0:
672 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
673 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
674 ; VBITS_GE_512-NEXT: ucvtf z0.h, p0/m, z0.d
675 ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
676 ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
677 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
678 ; VBITS_GE_512-NEXT: ret
679 %op1 = load <8 x i64>, ptr %a
680 %res = uitofp <8 x i64> %op1 to <8 x half>
684 define void @ucvtf_v16i64_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
685 ; CHECK-LABEL: ucvtf_v16i64_v16f16:
687 ; CHECK-NEXT: ptrue p0.d, vl16
688 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
689 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d
690 ; CHECK-NEXT: ptrue p0.s, vl16
691 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
692 ; CHECK-NEXT: st1h { z0.s }, p0, [x1]
694 %op1 = load <16 x i64>, ptr %a
695 %res = uitofp <16 x i64> %op1 to <16 x half>
696 store <16 x half> %res, ptr %b
700 define void @ucvtf_v32i64_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
701 ; CHECK-LABEL: ucvtf_v32i64_v32f16:
703 ; CHECK-NEXT: ptrue p0.d, vl32
704 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
705 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d
706 ; CHECK-NEXT: ptrue p0.s, vl32
707 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
708 ; CHECK-NEXT: st1h { z0.s }, p0, [x1]
710 %op1 = load <32 x i64>, ptr %a
711 %res = uitofp <32 x i64> %op1 to <32 x half>
712 store <32 x half> %res, ptr %b
720 ; Don't use SVE for 64-bit vectors.
721 define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
722 ; CHECK-LABEL: ucvtf_v1i64_v1f32:
724 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
725 ; CHECK-NEXT: ucvtf v0.2d, v0.2d
726 ; CHECK-NEXT: fcvtn v0.2s, v0.2d
728 %res = uitofp <1 x i64> %op1 to <1 x float>
732 ; Don't use SVE for 128-bit vectors.
733 define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) vscale_range(2,0) #0 {
734 ; CHECK-LABEL: ucvtf_v2i64_v2f32:
736 ; CHECK-NEXT: ucvtf v0.2d, v0.2d
737 ; CHECK-NEXT: fcvtn v0.2s, v0.2d
739 %res = uitofp <2 x i64> %op1 to <2 x float>
743 define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) vscale_range(2,0) #0 {
744 ; CHECK-LABEL: ucvtf_v4i64_v4f32:
746 ; CHECK-NEXT: ptrue p0.d, vl4
747 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
748 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d
749 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
750 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
752 %op1 = load <4 x i64>, ptr %a
753 %res = uitofp <4 x i64> %op1 to <4 x float>
757 define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) #0 {
758 ; VBITS_GE_256-LABEL: ucvtf_v8i64_v8f32:
759 ; VBITS_GE_256: // %bb.0:
760 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
761 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
762 ; VBITS_GE_256-NEXT: ptrue p1.s, vl8
763 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
764 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
765 ; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.d
766 ; VBITS_GE_256-NEXT: ucvtf z1.s, p0/m, z1.d
767 ; VBITS_GE_256-NEXT: ptrue p0.s, vl4
768 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
769 ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
770 ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
771 ; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1]
772 ; VBITS_GE_256-NEXT: ret
774 ; VBITS_GE_512-LABEL: ucvtf_v8i64_v8f32:
775 ; VBITS_GE_512: // %bb.0:
776 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
777 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
778 ; VBITS_GE_512-NEXT: ucvtf z0.s, p0/m, z0.d
779 ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
780 ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
781 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
782 ; VBITS_GE_512-NEXT: ret
783 %op1 = load <8 x i64>, ptr %a
784 %res = uitofp <8 x i64> %op1 to <8 x float>
785 store <8 x float> %res, ptr %b
789 define void @ucvtf_v16i64_v16f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
790 ; CHECK-LABEL: ucvtf_v16i64_v16f32:
792 ; CHECK-NEXT: ptrue p0.d, vl16
793 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
794 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d
795 ; CHECK-NEXT: ptrue p0.s, vl16
796 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
797 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
799 %op1 = load <16 x i64>, ptr %a
800 %res = uitofp <16 x i64> %op1 to <16 x float>
801 store <16 x float> %res, ptr %b
805 define void @ucvtf_v32i64_v32f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
806 ; CHECK-LABEL: ucvtf_v32i64_v32f32:
808 ; CHECK-NEXT: ptrue p0.d, vl32
809 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
810 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d
811 ; CHECK-NEXT: ptrue p0.s, vl32
812 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
813 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
815 %op1 = load <32 x i64>, ptr %a
816 %res = uitofp <32 x i64> %op1 to <32 x float>
817 store <32 x float> %res, ptr %b
825 ; Don't use SVE for 64-bit vectors.
826 define <1 x double> @ucvtf_v1i64_v1f64(<1 x i64> %op1) vscale_range(2,0) #0 {
827 ; CHECK-LABEL: ucvtf_v1i64_v1f64:
829 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
830 ; CHECK-NEXT: fmov x8, d0
831 ; CHECK-NEXT: ucvtf d0, x8
833 %res = uitofp <1 x i64> %op1 to <1 x double>
834 ret <1 x double> %res
837 ; Don't use SVE for 128-bit vectors.
838 define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) vscale_range(2,0) #0 {
839 ; CHECK-LABEL: ucvtf_v2i64_v2f64:
841 ; CHECK-NEXT: ucvtf v0.2d, v0.2d
843 %res = uitofp <2 x i64> %op1 to <2 x double>
844 ret <2 x double> %res
847 define void @ucvtf_v4i64_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
848 ; CHECK-LABEL: ucvtf_v4i64_v4f64:
850 ; CHECK-NEXT: ptrue p0.d, vl4
851 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
852 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
853 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
855 %op1 = load <4 x i64>, ptr %a
856 %res = uitofp <4 x i64> %op1 to <4 x double>
857 store <4 x double> %res, ptr %b
861 define void @ucvtf_v8i64_v8f64(ptr %a, ptr %b) #0 {
862 ; VBITS_GE_256-LABEL: ucvtf_v8i64_v8f64:
863 ; VBITS_GE_256: // %bb.0:
864 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
865 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
866 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
867 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
868 ; VBITS_GE_256-NEXT: ucvtf z0.d, p0/m, z0.d
869 ; VBITS_GE_256-NEXT: ucvtf z1.d, p0/m, z1.d
870 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
871 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
872 ; VBITS_GE_256-NEXT: ret
874 ; VBITS_GE_512-LABEL: ucvtf_v8i64_v8f64:
875 ; VBITS_GE_512: // %bb.0:
876 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
877 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
878 ; VBITS_GE_512-NEXT: ucvtf z0.d, p0/m, z0.d
879 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
880 ; VBITS_GE_512-NEXT: ret
881 %op1 = load <8 x i64>, ptr %a
882 %res = uitofp <8 x i64> %op1 to <8 x double>
883 store <8 x double> %res, ptr %b
887 define void @ucvtf_v16i64_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
888 ; CHECK-LABEL: ucvtf_v16i64_v16f64:
890 ; CHECK-NEXT: ptrue p0.d, vl16
891 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
892 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
893 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
895 %op1 = load <16 x i64>, ptr %a
896 %res = uitofp <16 x i64> %op1 to <16 x double>
897 store <16 x double> %res, ptr %b
901 define void @ucvtf_v32i64_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
902 ; CHECK-LABEL: ucvtf_v32i64_v32f64:
904 ; CHECK-NEXT: ptrue p0.d, vl32
905 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
906 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
907 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
909 %op1 = load <32 x i64>, ptr %a
910 %res = uitofp <32 x i64> %op1 to <32 x double>
911 store <32 x double> %res, ptr %b
919 ; Don't use SVE for 64-bit vectors.
920 define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) vscale_range(2,0) #0 {
921 ; CHECK-LABEL: scvtf_v4i16_v4f16:
923 ; CHECK-NEXT: scvtf v0.4h, v0.4h
925 %res = sitofp <4 x i16> %op1 to <4 x half>
929 ; Don't use SVE for 128-bit vectors.
930 define void @scvtf_v8i16_v8f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
931 ; CHECK-LABEL: scvtf_v8i16_v8f16:
933 ; CHECK-NEXT: ldr q0, [x0]
934 ; CHECK-NEXT: scvtf v0.8h, v0.8h
935 ; CHECK-NEXT: str q0, [x1]
937 %op1 = load <8 x i16>, ptr %a
938 %res = sitofp <8 x i16> %op1 to <8 x half>
939 store <8 x half> %res, ptr %b
943 define void @scvtf_v16i16_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
944 ; CHECK-LABEL: scvtf_v16i16_v16f16:
946 ; CHECK-NEXT: ptrue p0.h, vl16
947 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
948 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.h
949 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
951 %op1 = load <16 x i16>, ptr %a
952 %res = sitofp <16 x i16> %op1 to <16 x half>
953 store <16 x half> %res, ptr %b
957 define void @scvtf_v32i16_v32f16(ptr %a, ptr %b) #0 {
958 ; VBITS_GE_256-LABEL: scvtf_v32i16_v32f16:
959 ; VBITS_GE_256: // %bb.0:
960 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
961 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
962 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
963 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
964 ; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.h
965 ; VBITS_GE_256-NEXT: scvtf z1.h, p0/m, z1.h
966 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
967 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1]
968 ; VBITS_GE_256-NEXT: ret
970 ; VBITS_GE_512-LABEL: scvtf_v32i16_v32f16:
971 ; VBITS_GE_512: // %bb.0:
972 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
973 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
974 ; VBITS_GE_512-NEXT: scvtf z0.h, p0/m, z0.h
975 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
976 ; VBITS_GE_512-NEXT: ret
977 %op1 = load <32 x i16>, ptr %a
978 %res = sitofp <32 x i16> %op1 to <32 x half>
979 store <32 x half> %res, ptr %b
983 define void @scvtf_v64i16_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
984 ; CHECK-LABEL: scvtf_v64i16_v64f16:
986 ; CHECK-NEXT: ptrue p0.h, vl64
987 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
988 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.h
989 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
991 %op1 = load <64 x i16>, ptr %a
992 %res = sitofp <64 x i16> %op1 to <64 x half>
993 store <64 x half> %res, ptr %b
997 define void @scvtf_v128i16_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
998 ; CHECK-LABEL: scvtf_v128i16_v128f16:
1000 ; CHECK-NEXT: ptrue p0.h, vl128
1001 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1002 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.h
1003 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
1005 %op1 = load <128 x i16>, ptr %a
1006 %res = sitofp <128 x i16> %op1 to <128 x half>
1007 store <128 x half> %res, ptr %b
1015 ; Don't use SVE for 64-bit vectors.
1016 define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) vscale_range(2,0) #0 {
1017 ; CHECK-LABEL: scvtf_v2i16_v2f32:
1019 ; CHECK-NEXT: shl v0.2s, v0.2s, #16
1020 ; CHECK-NEXT: sshr v0.2s, v0.2s, #16
1021 ; CHECK-NEXT: scvtf v0.2s, v0.2s
1023 %res = sitofp <2 x i16> %op1 to <2 x float>
1024 ret <2 x float> %res
1027 ; Don't use SVE for 128-bit vectors.
1028 define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) vscale_range(2,0) #0 {
1029 ; CHECK-LABEL: scvtf_v4i16_v4f32:
1031 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0
1032 ; CHECK-NEXT: scvtf v0.4s, v0.4s
1034 %res = sitofp <4 x i16> %op1 to <4 x float>
1035 ret <4 x float> %res
1038 define void @scvtf_v8i16_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
1039 ; CHECK-LABEL: scvtf_v8i16_v8f32:
1041 ; CHECK-NEXT: ptrue p0.s, vl8
1042 ; CHECK-NEXT: ldr q0, [x0]
1043 ; CHECK-NEXT: sunpklo z0.s, z0.h
1044 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
1045 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
1047 %op1 = load <8 x i16>, ptr %a
1048 %res = sitofp <8 x i16> %op1 to <8 x float>
1049 store <8 x float> %res, ptr %b
1053 define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) #0 {
1054 ; VBITS_GE_256-LABEL: scvtf_v16i16_v16f32:
1055 ; VBITS_GE_256: // %bb.0:
1056 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
1057 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1058 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
1059 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1060 ; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h
1061 ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
1062 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
1063 ; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.s
1064 ; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.s
1065 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
1066 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
1067 ; VBITS_GE_256-NEXT: ret
1069 ; VBITS_GE_512-LABEL: scvtf_v16i16_v16f32:
1070 ; VBITS_GE_512: // %bb.0:
1071 ; VBITS_GE_512-NEXT: ptrue p0.h, vl16
1072 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
1073 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1074 ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
1075 ; VBITS_GE_512-NEXT: scvtf z0.s, p0/m, z0.s
1076 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
1077 ; VBITS_GE_512-NEXT: ret
1078 %op1 = load <16 x i16>, ptr %a
1079 %res = sitofp <16 x i16> %op1 to <16 x float>
1080 store <16 x float> %res, ptr %b
1084 define void @scvtf_v32i16_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
1085 ; CHECK-LABEL: scvtf_v32i16_v32f32:
1087 ; CHECK-NEXT: ptrue p0.h, vl32
1088 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1089 ; CHECK-NEXT: ptrue p0.s, vl32
1090 ; CHECK-NEXT: sunpklo z0.s, z0.h
1091 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
1092 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
1094 %op1 = load <32 x i16>, ptr %a
1095 %res = sitofp <32 x i16> %op1 to <32 x float>
1096 store <32 x float> %res, ptr %b
1100 define void @scvtf_v64i16_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
1101 ; CHECK-LABEL: scvtf_v64i16_v64f32:
1103 ; CHECK-NEXT: ptrue p0.h, vl64
1104 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1105 ; CHECK-NEXT: ptrue p0.s, vl64
1106 ; CHECK-NEXT: sunpklo z0.s, z0.h
1107 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
1108 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
1110 %op1 = load <64 x i16>, ptr %a
1111 %res = sitofp <64 x i16> %op1 to <64 x float>
1112 store <64 x float> %res, ptr %b
1120 ; v1i16 is preferred to be widened to v4i16, which pushes the output into SVE types, so use SVE
1121 define <1 x double> @scvtf_v1i16_v1f64(<1 x i16> %op1) vscale_range(2,0) #0 {
1122 ; CHECK-LABEL: scvtf_v1i16_v1f64:
1124 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
1125 ; CHECK-NEXT: ptrue p0.d, vl4
1126 ; CHECK-NEXT: sunpklo z0.s, z0.h
1127 ; CHECK-NEXT: sunpklo z0.d, z0.s
1128 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1129 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1131 %res = sitofp <1 x i16> %op1 to <1 x double>
1132 ret <1 x double> %res
1135 ; Don't use SVE for 128-bit vectors.
1136 define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) vscale_range(2,0) #0 {
1137 ; CHECK-LABEL: scvtf_v2i16_v2f64:
1139 ; CHECK-NEXT: shl v0.2s, v0.2s, #16
1140 ; CHECK-NEXT: sshr v0.2s, v0.2s, #16
1141 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0
1142 ; CHECK-NEXT: scvtf v0.2d, v0.2d
1144 %res = sitofp <2 x i16> %op1 to <2 x double>
1145 ret <2 x double> %res
1148 define void @scvtf_v4i16_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
1149 ; CHECK-LABEL: scvtf_v4i16_v4f64:
1151 ; CHECK-NEXT: ldr d0, [x0]
1152 ; CHECK-NEXT: ptrue p0.d, vl4
1153 ; CHECK-NEXT: sunpklo z0.s, z0.h
1154 ; CHECK-NEXT: sunpklo z0.d, z0.s
1155 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1156 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1158 %op1 = load <4 x i16>, ptr %a
1159 %res = sitofp <4 x i16> %op1 to <4 x double>
1160 store <4 x double> %res, ptr %b
1164 define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) #0 {
1165 ; VBITS_GE_256-LABEL: scvtf_v8i16_v8f64:
1166 ; VBITS_GE_256: // %bb.0:
1167 ; VBITS_GE_256-NEXT: ldr q0, [x0]
1168 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1169 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1170 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
1171 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h
1172 ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h
1173 ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
1174 ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s
1175 ; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d
1176 ; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d
1177 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
1178 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
1179 ; VBITS_GE_256-NEXT: ret
1181 ; VBITS_GE_512-LABEL: scvtf_v8i16_v8f64:
1182 ; VBITS_GE_512: // %bb.0:
1183 ; VBITS_GE_512-NEXT: ldr q0, [x0]
1184 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1185 ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
1186 ; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
1187 ; VBITS_GE_512-NEXT: scvtf z0.d, p0/m, z0.d
1188 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
1189 ; VBITS_GE_512-NEXT: ret
1190 %op1 = load <8 x i16>, ptr %a
1191 %res = sitofp <8 x i16> %op1 to <8 x double>
1192 store <8 x double> %res, ptr %b
1196 define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
1197 ; CHECK-LABEL: scvtf_v16i16_v16f64:
1199 ; CHECK-NEXT: ptrue p0.h, vl16
1200 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1201 ; CHECK-NEXT: ptrue p0.d, vl16
1202 ; CHECK-NEXT: sunpklo z0.s, z0.h
1203 ; CHECK-NEXT: sunpklo z0.d, z0.s
1204 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1205 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1207 %op1 = load <16 x i16>, ptr %a
1208 %res = sitofp <16 x i16> %op1 to <16 x double>
1209 store <16 x double> %res, ptr %b
1213 define void @scvtf_v32i16_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
1214 ; CHECK-LABEL: scvtf_v32i16_v32f64:
1216 ; CHECK-NEXT: ptrue p0.h, vl32
1217 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1218 ; CHECK-NEXT: ptrue p0.d, vl32
1219 ; CHECK-NEXT: sunpklo z0.s, z0.h
1220 ; CHECK-NEXT: sunpklo z0.d, z0.s
1221 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1222 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1224 %op1 = load <32 x i16>, ptr %a
1225 %res = sitofp <32 x i16> %op1 to <32 x double>
1226 store <32 x double> %res, ptr %b
1234 ; Don't use SVE for 64-bit vectors.
1235 define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) vscale_range(2,0) #0 {
1236 ; CHECK-LABEL: scvtf_v2i32_v2f16:
1238 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1239 ; CHECK-NEXT: scvtf v0.4s, v0.4s
1240 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
1242 %res = sitofp <2 x i32> %op1 to <2 x half>
1246 ; Don't use SVE for 128-bit vectors.
1247 define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) vscale_range(2,0) #0 {
1248 ; CHECK-LABEL: scvtf_v4i32_v4f16:
1250 ; CHECK-NEXT: scvtf v0.4s, v0.4s
1251 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
1253 %res = sitofp <4 x i32> %op1 to <4 x half>
1257 define <8 x half> @scvtf_v8i32_v8f16(ptr %a) vscale_range(2,0) #0 {
1258 ; CHECK-LABEL: scvtf_v8i32_v8f16:
1260 ; CHECK-NEXT: ptrue p0.s, vl8
1261 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1262 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
1263 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
1264 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
1266 %op1 = load <8 x i32>, ptr %a
1267 %res = sitofp <8 x i32> %op1 to <8 x half>
1271 define void @scvtf_v16i32_v16f16(ptr %a, ptr %b) #0 {
1272 ; VBITS_GE_256-LABEL: scvtf_v16i32_v16f16:
1273 ; VBITS_GE_256: // %bb.0:
1274 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1275 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1276 ; VBITS_GE_256-NEXT: ptrue p1.h, vl16
1277 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1278 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
1279 ; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.s
1280 ; VBITS_GE_256-NEXT: scvtf z1.h, p0/m, z1.s
1281 ; VBITS_GE_256-NEXT: ptrue p0.h, vl8
1282 ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h
1283 ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h
1284 ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h
1285 ; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1]
1286 ; VBITS_GE_256-NEXT: ret
1288 ; VBITS_GE_512-LABEL: scvtf_v16i32_v16f16:
1289 ; VBITS_GE_512: // %bb.0:
1290 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1291 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1292 ; VBITS_GE_512-NEXT: scvtf z0.h, p0/m, z0.s
1293 ; VBITS_GE_512-NEXT: ptrue p0.h, vl16
1294 ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
1295 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
1296 ; VBITS_GE_512-NEXT: ret
1297 %op1 = load <16 x i32>, ptr %a
1298 %res = sitofp <16 x i32> %op1 to <16 x half>
1299 store <16 x half> %res, ptr %b
1303 define void @scvtf_v32i32_v32f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
1304 ; CHECK-LABEL: scvtf_v32i32_v32f16:
1306 ; CHECK-NEXT: ptrue p0.s, vl32
1307 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1308 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
1309 ; CHECK-NEXT: ptrue p0.h, vl32
1310 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
1311 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
1313 %op1 = load <32 x i32>, ptr %a
1314 %res = sitofp <32 x i32> %op1 to <32 x half>
1315 store <32 x half> %res, ptr %b
1319 define void @scvtf_v64i32_v64f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
1320 ; CHECK-LABEL: scvtf_v64i32_v64f16:
1322 ; CHECK-NEXT: ptrue p0.s, vl64
1323 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1324 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
1325 ; CHECK-NEXT: ptrue p0.h, vl64
1326 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
1327 ; CHECK-NEXT: st1h { z0.h }, p0, [x1]
1329 %op1 = load <64 x i32>, ptr %a
1330 %res = sitofp <64 x i32> %op1 to <64 x half>
1331 store <64 x half> %res, ptr %b
1339 ; Don't use SVE for 64-bit vectors.
1340 define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) vscale_range(2,0) #0 {
1341 ; CHECK-LABEL: scvtf_v2i32_v2f32:
1343 ; CHECK-NEXT: scvtf v0.2s, v0.2s
1345 %res = sitofp <2 x i32> %op1 to <2 x float>
1346 ret <2 x float> %res
1349 ; Don't use SVE for 128-bit vectors.
1350 define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) vscale_range(2,0) #0 {
1351 ; CHECK-LABEL: scvtf_v4i32_v4f32:
1353 ; CHECK-NEXT: scvtf v0.4s, v0.4s
1355 %res = sitofp <4 x i32> %op1 to <4 x float>
1356 ret <4 x float> %res
1359 define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
1360 ; CHECK-LABEL: scvtf_v8i32_v8f32:
1362 ; CHECK-NEXT: ptrue p0.s, vl8
1363 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1364 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
1365 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
1367 %op1 = load <8 x i32>, ptr %a
1368 %res = sitofp <8 x i32> %op1 to <8 x float>
1369 store <8 x float> %res, ptr %b
1373 define void @scvtf_v16i32_v16f32(ptr %a, ptr %b) #0 {
1374 ; VBITS_GE_256-LABEL: scvtf_v16i32_v16f32:
1375 ; VBITS_GE_256: // %bb.0:
1376 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1377 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1378 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1379 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
1380 ; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.s
1381 ; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.s
1382 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
1383 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1]
1384 ; VBITS_GE_256-NEXT: ret
1386 ; VBITS_GE_512-LABEL: scvtf_v16i32_v16f32:
1387 ; VBITS_GE_512: // %bb.0:
1388 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1389 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1390 ; VBITS_GE_512-NEXT: scvtf z0.s, p0/m, z0.s
1391 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
1392 ; VBITS_GE_512-NEXT: ret
1393 %op1 = load <16 x i32>, ptr %a
1394 %res = sitofp <16 x i32> %op1 to <16 x float>
1395 store <16 x float> %res, ptr %b
1399 define void @scvtf_v32i32_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
1400 ; CHECK-LABEL: scvtf_v32i32_v32f32:
1402 ; CHECK-NEXT: ptrue p0.s, vl32
1403 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1404 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
1405 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
1407 %op1 = load <32 x i32>, ptr %a
1408 %res = sitofp <32 x i32> %op1 to <32 x float>
1409 store <32 x float> %res, ptr %b
1413 define void @scvtf_v64i32_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
1414 ; CHECK-LABEL: scvtf_v64i32_v64f32:
1416 ; CHECK-NEXT: ptrue p0.s, vl64
1417 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1418 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
1419 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
1421 %op1 = load <64 x i32>, ptr %a
1422 %res = sitofp <64 x i32> %op1 to <64 x float>
1423 store <64 x float> %res, ptr %b
1431 ; Don't use SVE for 64-bit vectors.
1432 define <1 x double> @scvtf_v1i32_v1f64(<1 x i32> %op1) vscale_range(2,0) #0 {
1433 ; CHECK-LABEL: scvtf_v1i32_v1f64:
1435 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0
1436 ; CHECK-NEXT: scvtf v0.2d, v0.2d
1437 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
1439 %res = sitofp <1 x i32> %op1 to <1 x double>
1440 ret <1 x double> %res
1443 ; Don't use SVE for 128-bit vectors.
1444 define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) vscale_range(2,0) #0 {
1445 ; CHECK-LABEL: scvtf_v2i32_v2f64:
1447 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0
1448 ; CHECK-NEXT: scvtf v0.2d, v0.2d
1450 %res = sitofp <2 x i32> %op1 to <2 x double>
1451 ret <2 x double> %res
1454 define void @scvtf_v4i32_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
1455 ; CHECK-LABEL: scvtf_v4i32_v4f64:
1457 ; CHECK-NEXT: ptrue p0.d, vl4
1458 ; CHECK-NEXT: ldr q0, [x0]
1459 ; CHECK-NEXT: sunpklo z0.d, z0.s
1460 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1461 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1463 %op1 = load <4 x i32>, ptr %a
1464 %res = sitofp <4 x i32> %op1 to <4 x double>
1465 store <4 x double> %res, ptr %b
1469 define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) #0 {
1470 ; VBITS_GE_256-LABEL: scvtf_v8i32_v8f64:
1471 ; VBITS_GE_256: // %bb.0:
1472 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1473 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1474 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
1475 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1476 ; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s
1477 ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
1478 ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s
1479 ; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d
1480 ; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d
1481 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
1482 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
1483 ; VBITS_GE_256-NEXT: ret
1485 ; VBITS_GE_512-LABEL: scvtf_v8i32_v8f64:
1486 ; VBITS_GE_512: // %bb.0:
1487 ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
1488 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1489 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1490 ; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
1491 ; VBITS_GE_512-NEXT: scvtf z0.d, p0/m, z0.d
1492 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
1493 ; VBITS_GE_512-NEXT: ret
1494 %op1 = load <8 x i32>, ptr %a
1495 %res = sitofp <8 x i32> %op1 to <8 x double>
1496 store <8 x double> %res, ptr %b
1500 define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
1501 ; CHECK-LABEL: scvtf_v16i32_v16f64:
1503 ; CHECK-NEXT: ptrue p0.s, vl16
1504 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1505 ; CHECK-NEXT: ptrue p0.d, vl16
1506 ; CHECK-NEXT: sunpklo z0.d, z0.s
1507 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1508 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1510 %op1 = load <16 x i32>, ptr %a
1511 %res = sitofp <16 x i32> %op1 to <16 x double>
1512 store <16 x double> %res, ptr %b
1516 define void @scvtf_v32i32_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
1517 ; CHECK-LABEL: scvtf_v32i32_v32f64:
1519 ; CHECK-NEXT: ptrue p0.s, vl32
1520 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1521 ; CHECK-NEXT: ptrue p0.d, vl32
1522 ; CHECK-NEXT: sunpklo z0.d, z0.s
1523 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1524 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1526 %op1 = load <32 x i32>, ptr %a
1527 %res = sitofp <32 x i32> %op1 to <32 x double>
1528 store <32 x double> %res, ptr %b
1536 ; Don't use SVE for 64-bit vectors.
1537 define <1 x half> @scvtf_v1i64_v1f16(<1 x i64> %op1) vscale_range(2,0) #0 {
1538 ; CHECK-LABEL: scvtf_v1i64_v1f16:
1540 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1541 ; CHECK-NEXT: fmov x8, d0
1542 ; CHECK-NEXT: scvtf h0, x8
1544 %res = sitofp <1 x i64> %op1 to <1 x half>
1548 ; v2f16 is not legal for NEON, so use SVE
1549 define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) vscale_range(2,0) #0 {
1550 ; CHECK-LABEL: scvtf_v2i64_v2f16:
1552 ; CHECK-NEXT: ptrue p0.d, vl4
1553 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
1554 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.d
1555 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
1556 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
1557 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1559 %res = sitofp <2 x i64> %op1 to <2 x half>
1563 define <4 x half> @scvtf_v4i64_v4f16(ptr %a) vscale_range(2,0) #0 {
1564 ; CHECK-LABEL: scvtf_v4i64_v4f16:
1566 ; CHECK-NEXT: ptrue p0.d, vl4
1567 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1568 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.d
1569 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
1570 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
1571 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1573 %op1 = load <4 x i64>, ptr %a
1574 %res = sitofp <4 x i64> %op1 to <4 x half>
1578 define <8 x half> @scvtf_v8i64_v8f16(ptr %a) #0 {
1579 ; VBITS_GE_256-LABEL: scvtf_v8i64_v8f16:
1580 ; VBITS_GE_256: // %bb.0:
1581 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1582 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1583 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1584 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
1585 ; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.d
1586 ; VBITS_GE_256-NEXT: scvtf z1.h, p0/m, z1.d
1587 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
1588 ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
1589 ; VBITS_GE_256-NEXT: uzp1 z2.h, z0.h, z0.h
1590 ; VBITS_GE_256-NEXT: uzp1 z0.h, z1.h, z1.h
1591 ; VBITS_GE_256-NEXT: mov v0.d[1], v2.d[0]
1592 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 killed $z0
1593 ; VBITS_GE_256-NEXT: ret
1595 ; VBITS_GE_512-LABEL: scvtf_v8i64_v8f16:
1596 ; VBITS_GE_512: // %bb.0:
1597 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1598 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1599 ; VBITS_GE_512-NEXT: scvtf z0.h, p0/m, z0.d
1600 ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
1601 ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
1602 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 killed $z0
1603 ; VBITS_GE_512-NEXT: ret
1604 %op1 = load <8 x i64>, ptr %a
1605 %res = sitofp <8 x i64> %op1 to <8 x half>
1609 define void @scvtf_v16i64_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
1610 ; CHECK-LABEL: scvtf_v16i64_v16f16:
1612 ; CHECK-NEXT: ptrue p0.d, vl16
1613 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1614 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.d
1615 ; CHECK-NEXT: ptrue p0.s, vl16
1616 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
1617 ; CHECK-NEXT: st1h { z0.s }, p0, [x1]
1619 %op1 = load <16 x i64>, ptr %a
1620 %res = sitofp <16 x i64> %op1 to <16 x half>
1621 store <16 x half> %res, ptr %b
1625 define void @scvtf_v32i64_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
1626 ; CHECK-LABEL: scvtf_v32i64_v32f16:
1628 ; CHECK-NEXT: ptrue p0.d, vl32
1629 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1630 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.d
1631 ; CHECK-NEXT: ptrue p0.s, vl32
1632 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
1633 ; CHECK-NEXT: st1h { z0.s }, p0, [x1]
1635 %op1 = load <32 x i64>, ptr %a
1636 %res = sitofp <32 x i64> %op1 to <32 x half>
1637 store <32 x half> %res, ptr %b
1645 ; Don't use SVE for 64-bit vectors.
1646 define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 {
1647 ; CHECK-LABEL: scvtf_v1i64_v1f32:
1649 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1650 ; CHECK-NEXT: scvtf v0.2d, v0.2d
1651 ; CHECK-NEXT: fcvtn v0.2s, v0.2d
1653 %res = sitofp <1 x i64> %op1 to <1 x float>
1654 ret <1 x float> %res
1657 ; Don't use SVE for 128-bit vectors.
1658 define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) vscale_range(2,0) #0 {
1659 ; CHECK-LABEL: scvtf_v2i64_v2f32:
1661 ; CHECK-NEXT: scvtf v0.2d, v0.2d
1662 ; CHECK-NEXT: fcvtn v0.2s, v0.2d
1664 %res = sitofp <2 x i64> %op1 to <2 x float>
1665 ret <2 x float> %res
1668 define <4 x float> @scvtf_v4i64_v4f32(ptr %a) vscale_range(2,0) #0 {
1669 ; CHECK-LABEL: scvtf_v4i64_v4f32:
1671 ; CHECK-NEXT: ptrue p0.d, vl4
1672 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1673 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.d
1674 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
1675 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
1677 %op1 = load <4 x i64>, ptr %a
1678 %res = sitofp <4 x i64> %op1 to <4 x float>
1679 ret <4 x float> %res
1682 define void @scvtf_v8i64_v8f32(ptr %a, ptr %b) #0 {
1683 ; VBITS_GE_256-LABEL: scvtf_v8i64_v8f32:
1684 ; VBITS_GE_256: // %bb.0:
1685 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1686 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1687 ; VBITS_GE_256-NEXT: ptrue p1.s, vl8
1688 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1689 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
1690 ; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.d
1691 ; VBITS_GE_256-NEXT: scvtf z1.s, p0/m, z1.d
1692 ; VBITS_GE_256-NEXT: ptrue p0.s, vl4
1693 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s
1694 ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s
1695 ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s
1696 ; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1]
1697 ; VBITS_GE_256-NEXT: ret
1699 ; VBITS_GE_512-LABEL: scvtf_v8i64_v8f32:
1700 ; VBITS_GE_512: // %bb.0:
1701 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1702 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1703 ; VBITS_GE_512-NEXT: scvtf z0.s, p0/m, z0.d
1704 ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
1705 ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
1706 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
1707 ; VBITS_GE_512-NEXT: ret
1708 %op1 = load <8 x i64>, ptr %a
1709 %res = sitofp <8 x i64> %op1 to <8 x float>
1710 store <8 x float> %res, ptr %b
1714 define void @scvtf_v16i64_v16f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
1715 ; CHECK-LABEL: scvtf_v16i64_v16f32:
1717 ; CHECK-NEXT: ptrue p0.d, vl16
1718 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1719 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.d
1720 ; CHECK-NEXT: ptrue p0.s, vl16
1721 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
1722 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
1724 %op1 = load <16 x i64>, ptr %a
1725 %res = sitofp <16 x i64> %op1 to <16 x float>
1726 store <16 x float> %res, ptr %b
1730 define void @scvtf_v32i64_v32f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
1731 ; CHECK-LABEL: scvtf_v32i64_v32f32:
1733 ; CHECK-NEXT: ptrue p0.d, vl32
1734 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1735 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.d
1736 ; CHECK-NEXT: ptrue p0.s, vl32
1737 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
1738 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
1740 %op1 = load <32 x i64>, ptr %a
1741 %res = sitofp <32 x i64> %op1 to <32 x float>
1742 store <32 x float> %res, ptr %b
1750 ; Don't use SVE for 64-bit vectors.
1751 define <1 x double> @scvtf_v1i64_v1f64(<1 x i64> %op1) vscale_range(2,0) #0 {
1752 ; CHECK-LABEL: scvtf_v1i64_v1f64:
1754 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
1755 ; CHECK-NEXT: fmov x8, d0
1756 ; CHECK-NEXT: scvtf d0, x8
1758 %res = sitofp <1 x i64> %op1 to <1 x double>
1759 ret <1 x double> %res
1762 ; Don't use SVE for 128-bit vectors.
1763 define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) vscale_range(2,0) #0 {
1764 ; CHECK-LABEL: scvtf_v2i64_v2f64:
1766 ; CHECK-NEXT: scvtf v0.2d, v0.2d
1768 %res = sitofp <2 x i64> %op1 to <2 x double>
1769 ret <2 x double> %res
1772 define void @scvtf_v4i64_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
1773 ; CHECK-LABEL: scvtf_v4i64_v4f64:
1775 ; CHECK-NEXT: ptrue p0.d, vl4
1776 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1777 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1778 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1780 %op1 = load <4 x i64>, ptr %a
1781 %res = sitofp <4 x i64> %op1 to <4 x double>
1782 store <4 x double> %res, ptr %b
1786 define void @scvtf_v8i64_v8f64(ptr %a, ptr %b) #0 {
1787 ; VBITS_GE_256-LABEL: scvtf_v8i64_v8f64:
1788 ; VBITS_GE_256: // %bb.0:
1789 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1790 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1791 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1792 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
1793 ; VBITS_GE_256-NEXT: scvtf z0.d, p0/m, z0.d
1794 ; VBITS_GE_256-NEXT: scvtf z1.d, p0/m, z1.d
1795 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
1796 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1]
1797 ; VBITS_GE_256-NEXT: ret
1799 ; VBITS_GE_512-LABEL: scvtf_v8i64_v8f64:
1800 ; VBITS_GE_512: // %bb.0:
1801 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1802 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1803 ; VBITS_GE_512-NEXT: scvtf z0.d, p0/m, z0.d
1804 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1]
1805 ; VBITS_GE_512-NEXT: ret
1806 %op1 = load <8 x i64>, ptr %a
1807 %res = sitofp <8 x i64> %op1 to <8 x double>
1808 store <8 x double> %res, ptr %b
1812 define void @scvtf_v16i64_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
1813 ; CHECK-LABEL: scvtf_v16i64_v16f64:
1815 ; CHECK-NEXT: ptrue p0.d, vl16
1816 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1817 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1818 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1820 %op1 = load <16 x i64>, ptr %a
1821 %res = sitofp <16 x i64> %op1 to <16 x double>
1822 store <16 x double> %res, ptr %b
1826 define void @scvtf_v32i64_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
1827 ; CHECK-LABEL: scvtf_v32i64_v32f64:
1829 ; CHECK-NEXT: ptrue p0.d, vl32
1830 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1831 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
1832 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
1834 %op1 = load <32 x i64>, ptr %a
1835 %res = sitofp <32 x i64> %op1 to <32 x double>
1836 store <32 x double> %res, ptr %b
1840 attributes #0 = { "target-features"="+sve" }
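; Every test above follows the same shape: attribute group #0 (above) supplies
; the "+sve" target feature, and an optional vscale_range attribute tells the
; backend how large the SVE registers are guaranteed to be (for example,
; vscale_range(2,0) promises at least 256-bit registers with no upper bound).
; A minimal sketch of that pattern, mirroring the load/convert/store tests
; above (the function name is illustrative, not one of the tests in this file):
;
; define void @example_ucvtf(ptr %a, ptr %b) vscale_range(2,0) #0 {
;   %op1 = load <8 x i32>, ptr %a
;   %res = uitofp <8 x i32> %op1 to <8 x float>
;   store <8 x float> %res, ptr %b
;   ret void
; }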