1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=hexagon -hexagon-hvx-widen=32 < %s | FileCheck %s
4 target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
5 target triple = "hexagon"
9 define void @f16s8_0(ptr %a0, ptr %a1) #0 {
10 ; CHECK-LABEL: f16s8_0:
11 ; CHECK: .cfi_startproc
12 ; CHECK-NEXT: // %bb.0:
14 ; CHECK-NEXT: r3:2 = combine(##32768,#1)
15 ; CHECK-NEXT: r4 = #14
16 ; CHECK-NEXT: v1 = vmem(r0+#0)
19 ; CHECK-NEXT: v2.h = vsplat(r3)
21 ; CHECK-NEXT: v3.h = vasl(v0.h,r2)
22 ; CHECK-NEXT: v0.cur = vmem(r0+#1)
25 ; CHECK-NEXT: v4.h = vsplat(r4)
26 ; CHECK-NEXT: v8.h = vasl(v1.h,r2)
27 ; CHECK-NEXT: v3.h = vsub(v3.h,v2.h)
30 ; CHECK-NEXT: r5:4 = combine(#11,##32767)
31 ; CHECK-NEXT: v7 = vxor(v7,v7)
32 ; CHECK-NEXT: v8.h = vsub(v8.h,v2.h)
35 ; CHECK-NEXT: r3 = #16
36 ; CHECK-NEXT: v5.h = vasl(v1.h,r6)
37 ; CHECK-NEXT: q1 = vcmp.gt(v7.h,v0.h)
40 ; CHECK-NEXT: v6.h = vsplat(r3)
41 ; CHECK-NEXT: v27.h = vasr(v3.h,r5)
42 ; CHECK-NEXT: v5 = vor(v5,v2)
43 ; CHECK-NEXT: q0 = vcmp.gt(v7.h,v1.h)
46 ; CHECK-NEXT: v9.h = vsplat(r4)
47 ; CHECK-NEXT: v8.h = vasr(v8.h,r5)
50 ; CHECK-NEXT: v26.h = vasl(v0.h,r6)
51 ; CHECK-NEXT: v0.h = vsub(v4.h,v27.h)
52 ; CHECK-NEXT: v4.h = vsub(v4.h,v8.h)
53 ; CHECK-NEXT: v28 = vmux(q0,v2,v9)
56 ; CHECK-NEXT: v4.h = vmin(v4.h,v6.h)
57 ; CHECK-NEXT: v1 = vor(v26,v2)
58 ; CHECK-NEXT: v0.h = vmin(v0.h,v6.h)
59 ; CHECK-NEXT: v2 = vmux(q1,v2,v9)
62 ; CHECK-NEXT: q2 = vcmp.gt(v4.h,v7.h)
63 ; CHECK-NEXT: q3 = vcmp.gt(v0.h,v7.h)
66 ; CHECK-NEXT: v5.h = vlsr(v5.h,v4.h)
69 ; CHECK-NEXT: v1.h = vlsr(v1.h,v0.h)
70 ; CHECK-NEXT: v29.h = vsub(v7.h,v5.h)
73 ; CHECK-NEXT: v30.h = vsub(v7.h,v1.h)
74 ; CHECK-NEXT: v5 = vmux(q0,v29,v5)
77 ; CHECK-NEXT: v1 = vmux(q1,v30,v1)
78 ; CHECK-NEXT: v31 = vmux(q2,v5,v28)
81 ; CHECK-NEXT: v1 = vmux(q3,v1,v2)
84 ; CHECK-NEXT: v0.b = vpack(v1.h,v31.h):sat
85 ; CHECK-NEXT: jumpr r31
86 ; CHECK-NEXT: vmem(r1+#0) = v0.new
88 %v0 = load <128 x half>, ptr %a0, align 128
89 %v1 = fptosi <128 x half> %v0 to <128 x i8>
90 store <128 x i8> %v1, ptr %a1, align 128
95 define void @f16s8_1(ptr %a0, ptr %a1) #0 {
96 ; CHECK-LABEL: f16s8_1:
97 ; CHECK: .cfi_startproc
98 ; CHECK-NEXT: // %bb.0:
100 ; CHECK-NEXT: r7 = ##32768
101 ; CHECK-NEXT: r3:2 = combine(#5,#1)
102 ; CHECK-NEXT: v0 = vmem(r0+#0)
105 ; CHECK-NEXT: v2.h = vsplat(r7)
106 ; CHECK-NEXT: v3.h = vasl(v0.h,r2)
107 ; CHECK-NEXT: r6 = #14
108 ; CHECK-NEXT: r5 = #11
111 ; CHECK-NEXT: v4.h = vsplat(r6)
112 ; CHECK-NEXT: r4 = #16
113 ; CHECK-NEXT: v6.h = vasl(v0.h,r3)
114 ; CHECK-NEXT: v3.h = vsub(v3.h,v2.h)
117 ; CHECK-NEXT: v5.h = vsplat(r4)
118 ; CHECK-NEXT: r3 = #32767
119 ; CHECK-NEXT: v29 = vor(v6,v2)
120 ; CHECK-NEXT: v1 = vxor(v1,v1)
123 ; CHECK-NEXT: v30.h = vsplat(r3)
124 ; CHECK-NEXT: r2 = #64
125 ; CHECK-NEXT: v3.h = vasr(v3.h,r5)
126 ; CHECK-NEXT: q0 = vcmp.gt(v1.h,v0.h)
129 ; CHECK-NEXT: q3 = vsetq(r2)
130 ; CHECK-NEXT: v3.h = vsub(v4.h,v3.h)
131 ; CHECK-NEXT: v2 = vmux(q0,v2,v30)
134 ; CHECK-NEXT: v3.h = vmin(v3.h,v5.h)
137 ; CHECK-NEXT: q2 = vcmp.gt(v3.h,v1.h)
140 ; CHECK-NEXT: v4.h = vlsr(v29.h,v3.h)
143 ; CHECK-NEXT: v31.h = vsub(v1.h,v4.h)
146 ; CHECK-NEXT: v0 = vmux(q0,v31,v4)
149 ; CHECK-NEXT: v0 = vmux(q2,v0,v2)
152 ; CHECK-NEXT: v0.b = vpack(v0.h,v0.h):sat
155 ; CHECK-NEXT: jumpr r31
156 ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
158 %v0 = load <64 x half>, ptr %a0, align 128
159 %v1 = fptosi <64 x half> %v0 to <64 x i8>
160 store <64 x i8> %v1, ptr %a1, align 128
166 define void @f16s16_0(ptr %a0, ptr %a1) #0 {
167 ; CHECK-LABEL: f16s16_0:
168 ; CHECK: .cfi_startproc
169 ; CHECK-NEXT: // %bb.0:
171 ; CHECK-NEXT: r7 = ##32768
172 ; CHECK-NEXT: r3:2 = combine(#5,#1)
173 ; CHECK-NEXT: v0 = vmem(r0+#0)
176 ; CHECK-NEXT: v2.h = vsplat(r7)
177 ; CHECK-NEXT: v3.h = vasl(v0.h,r2)
178 ; CHECK-NEXT: r6 = #14
179 ; CHECK-NEXT: r5 = #11
182 ; CHECK-NEXT: v4.h = vsplat(r6)
183 ; CHECK-NEXT: r4 = #16
184 ; CHECK-NEXT: v6.h = vasl(v0.h,r3)
185 ; CHECK-NEXT: v3.h = vsub(v3.h,v2.h)
188 ; CHECK-NEXT: v5.h = vsplat(r4)
189 ; CHECK-NEXT: r2 = #32767
190 ; CHECK-NEXT: v29 = vor(v6,v2)
191 ; CHECK-NEXT: v1 = vxor(v1,v1)
194 ; CHECK-NEXT: v30.h = vsplat(r2)
195 ; CHECK-NEXT: v3.h = vasr(v3.h,r5)
196 ; CHECK-NEXT: q0 = vcmp.gt(v1.h,v0.h)
199 ; CHECK-NEXT: v3.h = vsub(v4.h,v3.h)
200 ; CHECK-NEXT: v2 = vmux(q0,v2,v30)
203 ; CHECK-NEXT: v3.h = vmin(v3.h,v5.h)
206 ; CHECK-NEXT: q3 = vcmp.gt(v3.h,v1.h)
209 ; CHECK-NEXT: v4.h = vlsr(v29.h,v3.h)
212 ; CHECK-NEXT: v31.h = vsub(v1.h,v4.h)
215 ; CHECK-NEXT: v0 = vmux(q0,v31,v4)
218 ; CHECK-NEXT: v0 = vmux(q3,v0,v2)
219 ; CHECK-NEXT: jumpr r31
220 ; CHECK-NEXT: vmem(r1+#0) = v0.new
222 %v0 = load <64 x half>, ptr %a0, align 128
223 %v1 = fptosi <64 x half> %v0 to <64 x i16>
224 store <64 x i16> %v1, ptr %a1, align 128
228 ; Widen input and result
229 define void @f16s16_1(ptr %a0, ptr %a1) #0 {
230 ; CHECK-LABEL: f16s16_1:
231 ; CHECK: .cfi_startproc
232 ; CHECK-NEXT: // %bb.0:
234 ; CHECK-NEXT: r7 = ##32768
235 ; CHECK-NEXT: r3:2 = combine(#5,#1)
236 ; CHECK-NEXT: v0 = vmem(r0+#0)
239 ; CHECK-NEXT: v2.h = vsplat(r7)
240 ; CHECK-NEXT: v3.h = vasl(v0.h,r2)
241 ; CHECK-NEXT: r6 = #14
242 ; CHECK-NEXT: r5 = #11
245 ; CHECK-NEXT: v4.h = vsplat(r6)
246 ; CHECK-NEXT: r4 = #16
247 ; CHECK-NEXT: v6.h = vasl(v0.h,r3)
248 ; CHECK-NEXT: v3.h = vsub(v3.h,v2.h)
251 ; CHECK-NEXT: v5.h = vsplat(r4)
252 ; CHECK-NEXT: r3 = #32767
253 ; CHECK-NEXT: v29 = vor(v6,v2)
254 ; CHECK-NEXT: v1 = vxor(v1,v1)
257 ; CHECK-NEXT: v30.h = vsplat(r3)
258 ; CHECK-NEXT: r2 = #64
259 ; CHECK-NEXT: v3.h = vasr(v3.h,r5)
260 ; CHECK-NEXT: q0 = vcmp.gt(v1.h,v0.h)
263 ; CHECK-NEXT: q3 = vsetq(r2)
264 ; CHECK-NEXT: v3.h = vsub(v4.h,v3.h)
265 ; CHECK-NEXT: v2 = vmux(q0,v2,v30)
268 ; CHECK-NEXT: v3.h = vmin(v3.h,v5.h)
271 ; CHECK-NEXT: q1 = vcmp.gt(v3.h,v1.h)
274 ; CHECK-NEXT: v4.h = vlsr(v29.h,v3.h)
277 ; CHECK-NEXT: v31.h = vsub(v1.h,v4.h)
280 ; CHECK-NEXT: v0 = vmux(q0,v31,v4)
283 ; CHECK-NEXT: v0 = vmux(q1,v0,v2)
286 ; CHECK-NEXT: jumpr r31
287 ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
289 %v0 = load <32 x half>, ptr %a0, align 128
290 %v1 = fptosi <32 x half> %v0 to <32 x i16>
291 store <32 x i16> %v1, ptr %a1, align 128
297 define void @f16s32_0(ptr %a0, ptr %a1) #0 {
298 ; CHECK-LABEL: f16s32_0:
299 ; CHECK: .cfi_startproc
300 ; CHECK-NEXT: // %bb.0:
302 ; CHECK-NEXT: r2 = #15360
303 ; CHECK-NEXT: r7 = #-4
304 ; CHECK-NEXT: r6 = #1
305 ; CHECK-NEXT: v1 = vmem(r0+#0)
308 ; CHECK-NEXT: v0.h = vsplat(r2)
309 ; CHECK-NEXT: r4 = #32
310 ; CHECK-NEXT: r5 = #8
313 ; CHECK-NEXT: v7 = vsplat(r4)
314 ; CHECK-NEXT: r2 = ##2147483647
315 ; CHECK-NEXT: v24 = vxor(v24,v24)
318 ; CHECK-NEXT: v25 = vsplat(r2)
321 ; CHECK-NEXT: v1:0.qf32 = vmpy(v1.hf,v0.hf)
324 ; CHECK-NEXT: v0.sf = v0.qf32
327 ; CHECK-NEXT: v1.sf = v1.qf32
330 ; CHECK-NEXT: r7 = ##-2147483648
331 ; CHECK-NEXT: v1:0 = vshuff(v1,v0,r7)
334 ; CHECK-NEXT: v2 = vsplat(r7)
335 ; CHECK-NEXT: q0 = vcmp.gt(v24.w,v1.w)
336 ; CHECK-NEXT: q1 = vcmp.gt(v24.w,v0.w)
339 ; CHECK-NEXT: v3.w = vasl(v0.w,r6)
340 ; CHECK-NEXT: v28 = vmux(q0,v2,v25)
343 ; CHECK-NEXT: r7:6 = combine(#30,#24)
344 ; CHECK-NEXT: v4.w = vasl(v1.w,r6)
345 ; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
348 ; CHECK-NEXT: v6 = vsplat(r7)
349 ; CHECK-NEXT: v5.w = vasl(v0.w,r5)
350 ; CHECK-NEXT: v4.w = vsub(v4.w,v2.w)
353 ; CHECK-NEXT: v3.w = vasr(v3.w,r6)
354 ; CHECK-NEXT: v5 = vor(v5,v2)
357 ; CHECK-NEXT: v4.w = vasr(v4.w,r6)
358 ; CHECK-NEXT: v3.w = vsub(v6.w,v3.w)
361 ; CHECK-NEXT: v8.w = vasl(v1.w,r5)
362 ; CHECK-NEXT: v4.w = vsub(v6.w,v4.w)
363 ; CHECK-NEXT: v3.w = vmin(v3.w,v7.w)
366 ; CHECK-NEXT: v8 = vor(v8,v2)
367 ; CHECK-NEXT: v4.w = vmin(v4.w,v7.w)
368 ; CHECK-NEXT: v2 = vmux(q1,v2,v25)
369 ; CHECK-NEXT: q3 = vcmp.gt(v3.w,v24.w)
372 ; CHECK-NEXT: v26.w = vlsr(v5.w,v3.w)
373 ; CHECK-NEXT: q2 = vcmp.gt(v4.w,v24.w)
376 ; CHECK-NEXT: v27.w = vlsr(v8.w,v4.w)
377 ; CHECK-NEXT: v29.w = vsub(v24.w,v26.w)
380 ; CHECK-NEXT: v9.w = vsub(v24.w,v27.w)
381 ; CHECK-NEXT: v1 = vmux(q1,v29,v26)
384 ; CHECK-NEXT: v30 = vmux(q0,v9,v27)
385 ; CHECK-NEXT: v31 = vmux(q3,v1,v2)
386 ; CHECK-NEXT: vmem(r1+#0) = v31.new
389 ; CHECK-NEXT: v0 = vmux(q2,v30,v28)
390 ; CHECK-NEXT: jumpr r31
391 ; CHECK-NEXT: vmem(r1+#1) = v0.new
393 %v0 = load <64 x half>, ptr %a0, align 128
394 %v1 = fptosi <64 x half> %v0 to <64 x i32>
395 store <64 x i32> %v1, ptr %a1, align 128
400 define void @f16s32_1(ptr %a0, ptr %a1) #0 {
401 ; CHECK-LABEL: f16s32_1:
402 ; CHECK: .cfi_startproc
403 ; CHECK-NEXT: // %bb.0:
405 ; CHECK-NEXT: r4 = #15360
406 ; CHECK-NEXT: r7 = #-4
407 ; CHECK-NEXT: v0 = vmem(r0+#0)
410 ; CHECK-NEXT: v1.h = vsplat(r4)
411 ; CHECK-NEXT: r2 = ##-2147483648
412 ; CHECK-NEXT: r3 = #1
415 ; CHECK-NEXT: v3 = vsplat(r2)
416 ; CHECK-NEXT: r5:4 = combine(#8,#30)
417 ; CHECK-NEXT: r6 = #24
420 ; CHECK-NEXT: v4 = vsplat(r4)
421 ; CHECK-NEXT: r2 = ##2147483647
422 ; CHECK-NEXT: r4 = #32
425 ; CHECK-NEXT: v1:0.qf32 = vmpy(v0.hf,v1.hf)
426 ; CHECK-NEXT: v2 = vxor(v2,v2)
429 ; CHECK-NEXT: v5 = vsplat(r4)
430 ; CHECK-NEXT: v30 = vsplat(r2)
433 ; CHECK-NEXT: v0.sf = v0.qf32
436 ; CHECK-NEXT: v1.sf = v1.qf32
439 ; CHECK-NEXT: v1:0 = vshuff(v1,v0,r7)
442 ; CHECK-NEXT: q0 = vcmp.gt(v2.w,v0.w)
445 ; CHECK-NEXT: v1.w = vasl(v0.w,r3)
448 ; CHECK-NEXT: v6.w = vasl(v0.w,r5)
449 ; CHECK-NEXT: v1.w = vsub(v1.w,v3.w)
452 ; CHECK-NEXT: v29 = vor(v6,v3)
453 ; CHECK-NEXT: v3 = vmux(q0,v3,v30)
456 ; CHECK-NEXT: v1.w = vasr(v1.w,r6)
459 ; CHECK-NEXT: v1.w = vsub(v4.w,v1.w)
462 ; CHECK-NEXT: v1.w = vmin(v1.w,v5.w)
465 ; CHECK-NEXT: q3 = vcmp.gt(v1.w,v2.w)
468 ; CHECK-NEXT: v4.w = vlsr(v29.w,v1.w)
471 ; CHECK-NEXT: v31.w = vsub(v2.w,v4.w)
474 ; CHECK-NEXT: v0 = vmux(q0,v31,v4)
477 ; CHECK-NEXT: v0 = vmux(q3,v0,v3)
478 ; CHECK-NEXT: jumpr r31
479 ; CHECK-NEXT: vmem(r1+#0) = v0.new
481 %v0 = load <32 x half>, ptr %a0, align 128
482 %v1 = fptosi <32 x half> %v0 to <32 x i32>
483 store <32 x i32> %v1, ptr %a1, align 128
489 define void @f32s8_0(ptr %a0, ptr %a1) #0 {
490 ; CHECK-LABEL: f32s8_0:
491 ; CHECK: .cfi_startproc
492 ; CHECK-NEXT: // %bb.0:
494 ; CHECK-NEXT: r4 = ##-2147483648
495 ; CHECK-NEXT: r3:2 = combine(#1,#8)
496 ; CHECK-NEXT: v5 = vmem(r0+#0)
499 ; CHECK-NEXT: v1 = vsplat(r4)
500 ; CHECK-NEXT: r7 = #30
501 ; CHECK-NEXT: r6 = #24
502 ; CHECK-NEXT: v2 = vmem(r0+#2)
505 ; CHECK-NEXT: v10 = vsplat(r7)
506 ; CHECK-NEXT: r5 = #32
507 ; CHECK-NEXT: v8.w = vasl(v4.w,r3)
508 ; CHECK-NEXT: v4.cur = vmem(r0+#1)
511 ; CHECK-NEXT: v7.w = vasl(v5.w,r3)
512 ; CHECK-NEXT: v12 = vxor(v12,v12)
513 ; CHECK-NEXT: v8.w = vsub(v8.w,v1.w)
514 ; CHECK-NEXT: v0 = vmem(r0+#3)
517 ; CHECK-NEXT: v13 = vsplat(r5)
518 ; CHECK-NEXT: v11.w = vasl(v0.w,r3)
519 ; CHECK-NEXT: v7.w = vsub(v7.w,v1.w)
520 ; CHECK-NEXT: q0 = vcmp.gt(v12.w,v5.w)
523 ; CHECK-NEXT: v9.w = vasl(v2.w,r3)
524 ; CHECK-NEXT: q1 = vcmp.gt(v12.w,v4.w)
525 ; CHECK-NEXT: v11.w = vsub(v11.w,v1.w)
528 ; CHECK-NEXT: r3 = ##2147483647
529 ; CHECK-NEXT: r7 = #64
530 ; CHECK-NEXT: v8.w = vasr(v8.w,r6)
533 ; CHECK-NEXT: v22 = vsplat(r3)
534 ; CHECK-NEXT: v7.w = vasr(v7.w,r6)
535 ; CHECK-NEXT: v19.w = vsub(v9.w,v1.w)
536 ; CHECK-NEXT: v8.w = vsub(v10.w,v8.w)
539 ; CHECK-NEXT: v20.w = vasl(v4.w,r2)
540 ; CHECK-NEXT: v27 = vmux(q1,v1,v22)
541 ; CHECK-NEXT: v25 = vmux(q0,v1,v22)
542 ; CHECK-NEXT: v7.w = vsub(v10.w,v7.w)
545 ; CHECK-NEXT: v6.w = vasl(v5.w,r2)
546 ; CHECK-NEXT: v8.w = vmin(v8.w,v13.w)
547 ; CHECK-NEXT: v9 = vor(v20,v1)
548 ; CHECK-NEXT: v21.w = vmin(v7.w,v13.w)
551 ; CHECK-NEXT: v5.w = vasr(v19.w,r6)
552 ; CHECK-NEXT: q3 = vcmp.gt(v8.w,v12.w)
553 ; CHECK-NEXT: v6 = vor(v6,v1)
554 ; CHECK-NEXT: q2 = vcmp.gt(v21.w,v12.w)
557 ; CHECK-NEXT: v11.w = vasr(v11.w,r6)
558 ; CHECK-NEXT: v5.w = vsub(v10.w,v5.w)
561 ; CHECK-NEXT: v3.w = vasl(v2.w,r2)
562 ; CHECK-NEXT: v10.w = vsub(v10.w,v11.w)
563 ; CHECK-NEXT: v5.w = vmin(v5.w,v13.w)
566 ; CHECK-NEXT: v23.w = vasl(v0.w,r2)
567 ; CHECK-NEXT: v3 = vor(v3,v1)
568 ; CHECK-NEXT: v10.w = vmin(v10.w,v13.w)
571 ; CHECK-NEXT: v8.w = vlsr(v9.w,v8.w)
572 ; CHECK-NEXT: v4 = vor(v23,v1)
575 ; CHECK-NEXT: v6.w = vlsr(v6.w,v21.w)
576 ; CHECK-NEXT: v26.w = vsub(v12.w,v8.w)
579 ; CHECK-NEXT: v3.w = vlsr(v3.w,v5.w)
580 ; CHECK-NEXT: v24.w = vsub(v12.w,v6.w)
581 ; CHECK-NEXT: v8 = vmux(q1,v26,v8)
584 ; CHECK-NEXT: v4.w = vlsr(v4.w,v10.w)
585 ; CHECK-NEXT: v6 = vmux(q0,v24,v6)
586 ; CHECK-NEXT: q0 = vcmp.gt(v12.w,v2.w)
587 ; CHECK-NEXT: v28.w = vsub(v12.w,v3.w)
590 ; CHECK-NEXT: v2 = vmux(q3,v8,v27)
591 ; CHECK-NEXT: v29.w = vsub(v12.w,v4.w)
592 ; CHECK-NEXT: q3 = vcmp.gt(v12.w,v0.w)
593 ; CHECK-NEXT: v6 = vmux(q2,v6,v25)
596 ; CHECK-NEXT: v30 = vmux(q0,v1,v22)
597 ; CHECK-NEXT: v3 = vmux(q0,v28,v3)
598 ; CHECK-NEXT: q2 = vcmp.gt(v5.w,v12.w)
599 ; CHECK-NEXT: v4 = vmux(q3,v29,v4)
602 ; CHECK-NEXT: v2.h = vpack(v2.w,v6.w):sat
603 ; CHECK-NEXT: v1 = vmux(q3,v1,v22)
604 ; CHECK-NEXT: q3 = vcmp.gt(v10.w,v12.w)
605 ; CHECK-NEXT: v0 = vmux(q2,v3,v30)
608 ; CHECK-NEXT: v1 = vmux(q3,v4,v1)
611 ; CHECK-NEXT: v3.h = vpack(v1.w,v0.w):sat
614 ; CHECK-NEXT: v0.h = vpack(v1.w,v0.w):sat
617 ; CHECK-NEXT: v31.b = vpack(v3.h,v2.h):sat
620 ; CHECK-NEXT: v0.b = vpack(v3.h,v0.h):sat
623 ; CHECK-NEXT: v1:0 = vshuff(v0,v31,r7)
624 ; CHECK-NEXT: jumpr r31
625 ; CHECK-NEXT: vmem(r1+#0) = v0.new
627 %v0 = load <128 x float>, ptr %a0, align 128
628 %v1 = fptosi <128 x float> %v0 to <128 x i8>
629 store <128 x i8> %v1, ptr %a1, align 128
634 define void @f32s8_1(ptr %a0, ptr %a1) #0 {
635 ; CHECK-LABEL: f32s8_1:
636 ; CHECK: .cfi_startproc
637 ; CHECK-NEXT: // %bb.0:
639 ; CHECK-NEXT: r3:2 = combine(##-2147483648,#8)
640 ; CHECK-NEXT: r4 = #1
641 ; CHECK-NEXT: v1 = vmem(r0+#0)
644 ; CHECK-NEXT: v3 = vsplat(r3)
645 ; CHECK-NEXT: r5 = #30
646 ; CHECK-NEXT: v4.w = vasl(v0.w,r4)
647 ; CHECK-NEXT: v0.cur = vmem(r0+#1)
650 ; CHECK-NEXT: v5.w = vasl(v1.w,r4)
651 ; CHECK-NEXT: v4.w = vsub(v4.w,v3.w)
652 ; CHECK-NEXT: r6 = #24
653 ; CHECK-NEXT: r4 = #32
656 ; CHECK-NEXT: v6 = vsplat(r5)
657 ; CHECK-NEXT: v7 = vsplat(r4)
658 ; CHECK-NEXT: v2.w = vasl(v1.w,r2)
659 ; CHECK-NEXT: v5.w = vsub(v5.w,v3.w)
662 ; CHECK-NEXT: v4.w = vasr(v4.w,r6)
663 ; CHECK-NEXT: v26 = vxor(v26,v26)
664 ; CHECK-NEXT: v2 = vor(v2,v3)
667 ; CHECK-NEXT: r3 = ##2147483647
668 ; CHECK-NEXT: v5.w = vasr(v5.w,r6)
669 ; CHECK-NEXT: q0 = vcmp.gt(v26.w,v1.w)
672 ; CHECK-NEXT: v27 = vsplat(r3)
673 ; CHECK-NEXT: v4.w = vsub(v6.w,v4.w)
674 ; CHECK-NEXT: q2 = vcmp.gt(v26.w,v0.w)
675 ; CHECK-NEXT: v5.w = vsub(v6.w,v5.w)
678 ; CHECK-NEXT: v8.w = vasl(v0.w,r2)
679 ; CHECK-NEXT: v4.w = vmin(v4.w,v7.w)
680 ; CHECK-NEXT: v30 = vmux(q0,v3,v27)
681 ; CHECK-NEXT: v5.w = vmin(v5.w,v7.w)
684 ; CHECK-NEXT: v25 = vor(v8,v3)
685 ; CHECK-NEXT: v1 = vmux(q2,v3,v27)
686 ; CHECK-NEXT: q3 = vcmp.gt(v4.w,v26.w)
687 ; CHECK-NEXT: q1 = vcmp.gt(v5.w,v26.w)
690 ; CHECK-NEXT: r2 = #64
691 ; CHECK-NEXT: v2.w = vlsr(v2.w,v5.w)
694 ; CHECK-NEXT: v28.w = vlsr(v25.w,v4.w)
695 ; CHECK-NEXT: v29.w = vsub(v26.w,v2.w)
698 ; CHECK-NEXT: v6.w = vsub(v26.w,v28.w)
699 ; CHECK-NEXT: v0 = vmux(q0,v29,v2)
702 ; CHECK-NEXT: v31 = vmux(q2,v6,v28)
703 ; CHECK-NEXT: v0 = vmux(q1,v0,v30)
706 ; CHECK-NEXT: q3 = vsetq(r2)
707 ; CHECK-NEXT: v1 = vmux(q3,v31,v1)
710 ; CHECK-NEXT: v2.h = vpack(v1.w,v0.w):sat
713 ; CHECK-NEXT: v0.h = vpack(v1.w,v0.w):sat
716 ; CHECK-NEXT: v0.b = vpack(v2.h,v0.h):sat
719 ; CHECK-NEXT: jumpr r31
720 ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
722 %v0 = load <64 x float>, ptr %a0, align 128
723 %v1 = fptosi <64 x float> %v0 to <64 x i8>
724 store <64 x i8> %v1, ptr %a1, align 128
729 define void @f32s8_2(ptr %a0, ptr %a1) #0 {
730 ; CHECK-LABEL: f32s8_2:
731 ; CHECK: .cfi_startproc
732 ; CHECK-NEXT: // %bb.0:
734 ; CHECK-NEXT: r7 = ##-2147483648
735 ; CHECK-NEXT: r3:2 = combine(#30,#1)
736 ; CHECK-NEXT: v0 = vmem(r0+#0)
739 ; CHECK-NEXT: v2 = vsplat(r7)
740 ; CHECK-NEXT: r5:4 = combine(#8,#24)
741 ; CHECK-NEXT: r6 = #32
742 ; CHECK-NEXT: v3.w = vasl(v0.w,r2)
745 ; CHECK-NEXT: v4 = vsplat(r3)
746 ; CHECK-NEXT: v5 = vsplat(r6)
747 ; CHECK-NEXT: v6.w = vasl(v0.w,r5)
748 ; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
751 ; CHECK-NEXT: v1 = vxor(v1,v1)
752 ; CHECK-NEXT: v29 = vor(v6,v2)
755 ; CHECK-NEXT: q3 = vsetq(r6)
756 ; CHECK-NEXT: v3.w = vasr(v3.w,r4)
757 ; CHECK-NEXT: q0 = vcmp.gt(v1.w,v0.w)
760 ; CHECK-NEXT: r4 = ##2147483647
761 ; CHECK-NEXT: v3.w = vsub(v4.w,v3.w)
764 ; CHECK-NEXT: v30 = vsplat(r4)
765 ; CHECK-NEXT: v3.w = vmin(v3.w,v5.w)
768 ; CHECK-NEXT: v2 = vmux(q0,v2,v30)
769 ; CHECK-NEXT: q2 = vcmp.gt(v3.w,v1.w)
772 ; CHECK-NEXT: v4.w = vlsr(v29.w,v3.w)
775 ; CHECK-NEXT: v31.w = vsub(v1.w,v4.w)
778 ; CHECK-NEXT: v0 = vmux(q0,v31,v4)
781 ; CHECK-NEXT: v0 = vmux(q2,v0,v2)
784 ; CHECK-NEXT: v1.h = vpack(v1.w,v0.w):sat
787 ; CHECK-NEXT: v0.h = vpack(v0.w,v0.w):sat
790 ; CHECK-NEXT: v0.b = vpack(v1.h,v0.h):sat
793 ; CHECK-NEXT: jumpr r31
794 ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
796 %v0 = load <32 x float>, ptr %a0, align 128
797 %v1 = fptosi <32 x float> %v0 to <32 x i8>
798 store <32 x i8> %v1, ptr %a1, align 128
804 define void @f32s16_0(ptr %a0, ptr %a1) #0 {
805 ; CHECK-LABEL: f32s16_0:
806 ; CHECK: .cfi_startproc
807 ; CHECK-NEXT: // %bb.0:
809 ; CHECK-NEXT: r3:2 = combine(##-2147483648,#1)
810 ; CHECK-NEXT: r4 = #30
811 ; CHECK-NEXT: v1 = vmem(r0+#0)
814 ; CHECK-NEXT: v2 = vsplat(r3)
815 ; CHECK-NEXT: r6 = #8
816 ; CHECK-NEXT: v3.w = vasl(v0.w,r2)
817 ; CHECK-NEXT: v0.cur = vmem(r0+#1)
820 ; CHECK-NEXT: v4 = vsplat(r4)
821 ; CHECK-NEXT: v8.w = vasl(v1.w,r2)
822 ; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
825 ; CHECK-NEXT: r5:4 = combine(#24,##2147483647)
826 ; CHECK-NEXT: v7 = vxor(v7,v7)
827 ; CHECK-NEXT: v8.w = vsub(v8.w,v2.w)
830 ; CHECK-NEXT: r3 = #32
831 ; CHECK-NEXT: v5.w = vasl(v1.w,r6)
832 ; CHECK-NEXT: q1 = vcmp.gt(v7.w,v0.w)
835 ; CHECK-NEXT: v6 = vsplat(r3)
836 ; CHECK-NEXT: v27.w = vasr(v3.w,r5)
837 ; CHECK-NEXT: v5 = vor(v5,v2)
838 ; CHECK-NEXT: q0 = vcmp.gt(v7.w,v1.w)
841 ; CHECK-NEXT: v9 = vsplat(r4)
842 ; CHECK-NEXT: v8.w = vasr(v8.w,r5)
845 ; CHECK-NEXT: v26.w = vasl(v0.w,r6)
846 ; CHECK-NEXT: v0.w = vsub(v4.w,v27.w)
847 ; CHECK-NEXT: v4.w = vsub(v4.w,v8.w)
848 ; CHECK-NEXT: v28 = vmux(q0,v2,v9)
851 ; CHECK-NEXT: v4.w = vmin(v4.w,v6.w)
852 ; CHECK-NEXT: v1 = vor(v26,v2)
853 ; CHECK-NEXT: v0.w = vmin(v0.w,v6.w)
854 ; CHECK-NEXT: v2 = vmux(q1,v2,v9)
857 ; CHECK-NEXT: q2 = vcmp.gt(v4.w,v7.w)
858 ; CHECK-NEXT: q3 = vcmp.gt(v0.w,v7.w)
861 ; CHECK-NEXT: v5.w = vlsr(v5.w,v4.w)
864 ; CHECK-NEXT: v1.w = vlsr(v1.w,v0.w)
865 ; CHECK-NEXT: v29.w = vsub(v7.w,v5.w)
868 ; CHECK-NEXT: v30.w = vsub(v7.w,v1.w)
869 ; CHECK-NEXT: v5 = vmux(q0,v29,v5)
872 ; CHECK-NEXT: v1 = vmux(q1,v30,v1)
873 ; CHECK-NEXT: v31 = vmux(q2,v5,v28)
876 ; CHECK-NEXT: v1 = vmux(q3,v1,v2)
879 ; CHECK-NEXT: v0.h = vpack(v1.w,v31.w):sat
880 ; CHECK-NEXT: jumpr r31
881 ; CHECK-NEXT: vmem(r1+#0) = v0.new
883 %v0 = load <64 x float>, ptr %a0, align 128
884 %v1 = fptosi <64 x float> %v0 to <64 x i16>
885 store <64 x i16> %v1, ptr %a1, align 128
890 define void @f32s16_1(ptr %a0, ptr %a1) #0 {
891 ; CHECK-LABEL: f32s16_1:
892 ; CHECK: .cfi_startproc
893 ; CHECK-NEXT: // %bb.0:
895 ; CHECK-NEXT: r7 = ##-2147483648
896 ; CHECK-NEXT: r3:2 = combine(#8,#1)
897 ; CHECK-NEXT: v0 = vmem(r0+#0)
900 ; CHECK-NEXT: v2 = vsplat(r7)
901 ; CHECK-NEXT: v3.w = vasl(v0.w,r2)
902 ; CHECK-NEXT: r6 = #30
903 ; CHECK-NEXT: r5 = #24
906 ; CHECK-NEXT: v4 = vsplat(r6)
907 ; CHECK-NEXT: r4 = #32
908 ; CHECK-NEXT: v6.w = vasl(v0.w,r3)
909 ; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
912 ; CHECK-NEXT: v5 = vsplat(r4)
913 ; CHECK-NEXT: v29 = vor(v6,v2)
914 ; CHECK-NEXT: v1 = vxor(v1,v1)
917 ; CHECK-NEXT: r3 = ##2147483647
918 ; CHECK-NEXT: r2 = #64
919 ; CHECK-NEXT: v3.w = vasr(v3.w,r5)
922 ; CHECK-NEXT: v30 = vsplat(r3)
923 ; CHECK-NEXT: q3 = vsetq(r2)
924 ; CHECK-NEXT: q0 = vcmp.gt(v1.w,v0.w)
925 ; CHECK-NEXT: v3.w = vsub(v4.w,v3.w)
928 ; CHECK-NEXT: v2 = vmux(q0,v2,v30)
929 ; CHECK-NEXT: v3.w = vmin(v3.w,v5.w)
932 ; CHECK-NEXT: q2 = vcmp.gt(v3.w,v1.w)
935 ; CHECK-NEXT: v4.w = vlsr(v29.w,v3.w)
938 ; CHECK-NEXT: v31.w = vsub(v1.w,v4.w)
941 ; CHECK-NEXT: v0 = vmux(q0,v31,v4)
944 ; CHECK-NEXT: v0 = vmux(q2,v0,v2)
947 ; CHECK-NEXT: v0.h = vpack(v0.w,v0.w):sat
950 ; CHECK-NEXT: jumpr r31
951 ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
953 %v0 = load <32 x float>, ptr %a0, align 128
954 %v1 = fptosi <32 x float> %v0 to <32 x i16>
955 store <32 x i16> %v1, ptr %a1, align 128
961 define void @f32s32_0(ptr %a0, ptr %a1) #0 {
962 ; CHECK-LABEL: f32s32_0:
963 ; CHECK: .cfi_startproc
964 ; CHECK-NEXT: // %bb.0:
966 ; CHECK-NEXT: r7 = ##-2147483648
967 ; CHECK-NEXT: r3:2 = combine(#8,#1)
968 ; CHECK-NEXT: v0 = vmem(r0+#0)
971 ; CHECK-NEXT: v2 = vsplat(r7)
972 ; CHECK-NEXT: v3.w = vasl(v0.w,r2)
973 ; CHECK-NEXT: r6 = #30
974 ; CHECK-NEXT: r5 = #24
977 ; CHECK-NEXT: v4 = vsplat(r6)
978 ; CHECK-NEXT: r4 = #32
979 ; CHECK-NEXT: v6.w = vasl(v0.w,r3)
980 ; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
983 ; CHECK-NEXT: v5 = vsplat(r4)
984 ; CHECK-NEXT: v29 = vor(v6,v2)
985 ; CHECK-NEXT: v1 = vxor(v1,v1)
988 ; CHECK-NEXT: r2 = ##2147483647
989 ; CHECK-NEXT: v3.w = vasr(v3.w,r5)
992 ; CHECK-NEXT: v30 = vsplat(r2)
993 ; CHECK-NEXT: q0 = vcmp.gt(v1.w,v0.w)
994 ; CHECK-NEXT: v3.w = vsub(v4.w,v3.w)
997 ; CHECK-NEXT: v2 = vmux(q0,v2,v30)
998 ; CHECK-NEXT: v3.w = vmin(v3.w,v5.w)
1001 ; CHECK-NEXT: q3 = vcmp.gt(v3.w,v1.w)
1004 ; CHECK-NEXT: v4.w = vlsr(v29.w,v3.w)
1007 ; CHECK-NEXT: v31.w = vsub(v1.w,v4.w)
1010 ; CHECK-NEXT: v0 = vmux(q0,v31,v4)
1013 ; CHECK-NEXT: v0 = vmux(q3,v0,v2)
1014 ; CHECK-NEXT: jumpr r31
1015 ; CHECK-NEXT: vmem(r1+#0) = v0.new
1017 %v0 = load <32 x float>, ptr %a0, align 128
1018 %v1 = fptosi <32 x float> %v0 to <32 x i32>
1019 store <32 x i32> %v1, ptr %a1, align 128
1023 ; Widen input and result
1024 define void @f32s32_1(ptr %a0, ptr %a1) #0 {
1025 ; CHECK-LABEL: f32s32_1:
1026 ; CHECK: .cfi_startproc
1027 ; CHECK-NEXT: // %bb.0:
1029 ; CHECK-NEXT: r7 = ##-2147483648
1030 ; CHECK-NEXT: r3:2 = combine(#8,#1)
1031 ; CHECK-NEXT: v0 = vmem(r0+#0)
1034 ; CHECK-NEXT: v2 = vsplat(r7)
1035 ; CHECK-NEXT: v3.w = vasl(v0.w,r2)
1036 ; CHECK-NEXT: r6 = #30
1037 ; CHECK-NEXT: r5 = #24
1040 ; CHECK-NEXT: v4 = vsplat(r6)
1041 ; CHECK-NEXT: r4 = #32
1042 ; CHECK-NEXT: v6.w = vasl(v0.w,r3)
1043 ; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
1046 ; CHECK-NEXT: v5 = vsplat(r4)
1047 ; CHECK-NEXT: v29 = vor(v6,v2)
1048 ; CHECK-NEXT: v1 = vxor(v1,v1)
1051 ; CHECK-NEXT: r3 = ##2147483647
1052 ; CHECK-NEXT: r2 = #64
1053 ; CHECK-NEXT: v3.w = vasr(v3.w,r5)
1056 ; CHECK-NEXT: v30 = vsplat(r3)
1057 ; CHECK-NEXT: q3 = vsetq(r2)
1058 ; CHECK-NEXT: q0 = vcmp.gt(v1.w,v0.w)
1059 ; CHECK-NEXT: v3.w = vsub(v4.w,v3.w)
1062 ; CHECK-NEXT: v2 = vmux(q0,v2,v30)
1063 ; CHECK-NEXT: v3.w = vmin(v3.w,v5.w)
1066 ; CHECK-NEXT: q1 = vcmp.gt(v3.w,v1.w)
1069 ; CHECK-NEXT: v4.w = vlsr(v29.w,v3.w)
1072 ; CHECK-NEXT: v31.w = vsub(v1.w,v4.w)
1075 ; CHECK-NEXT: v0 = vmux(q0,v31,v4)
1078 ; CHECK-NEXT: v0 = vmux(q1,v0,v2)
1081 ; CHECK-NEXT: jumpr r31
1082 ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
1084 %v0 = load <16 x float>, ptr %a0, align 128
1085 %v1 = fptosi <16 x float> %v0 to <16 x i32>
1086 store <16 x i32> %v1, ptr %a1, align 128
1093 define void @f16u8_0(ptr %a0, ptr %a1) #0 {
1094 ; CHECK-LABEL: f16u8_0:
1095 ; CHECK: .cfi_startproc
1096 ; CHECK-NEXT: // %bb.0:
1098 ; CHECK-NEXT: r3:2 = combine(##32768,#1)
1099 ; CHECK-NEXT: r4 = #14
1100 ; CHECK-NEXT: v0 = vmem(r0+#1)
1103 ; CHECK-NEXT: v2.h = vsplat(r3)
1104 ; CHECK-NEXT: r7:6 = combine(#11,#16)
1105 ; CHECK-NEXT: v3.h = vasl(v0.h,r2)
1106 ; CHECK-NEXT: v1 = vmem(r0+#0)
1109 ; CHECK-NEXT: v6.h = vsplat(r4)
1110 ; CHECK-NEXT: r5 = #5
1111 ; CHECK-NEXT: v4.h = vasl(v1.h,r2)
1112 ; CHECK-NEXT: v3.h = vsub(v3.h,v2.h)
1115 ; CHECK-NEXT: v7.h = vsplat(r6)
1116 ; CHECK-NEXT: v5.h = vasl(v1.h,r5)
1117 ; CHECK-NEXT: v4.h = vsub(v4.h,v2.h)
1118 ; CHECK-NEXT: v28 = vxor(v28,v28)
1121 ; CHECK-NEXT: r2 = #32767
1122 ; CHECK-NEXT: v3.h = vasr(v3.h,r7)
1123 ; CHECK-NEXT: v5 = vor(v5,v2)
1126 ; CHECK-NEXT: v29.h = vsplat(r2)
1127 ; CHECK-NEXT: v4.h = vasr(v4.h,r7)
1128 ; CHECK-NEXT: q2 = vcmp.gt(v28.h,v1.h)
1129 ; CHECK-NEXT: v3.h = vsub(v6.h,v3.h)
1132 ; CHECK-NEXT: v8.h = vasl(v0.h,r5)
1133 ; CHECK-NEXT: q3 = vcmp.gt(v28.h,v0.h)
1134 ; CHECK-NEXT: v4.h = vsub(v6.h,v4.h)
1135 ; CHECK-NEXT: v3.h = vmin(v3.h,v7.h)
1138 ; CHECK-NEXT: v4.h = vmin(v4.h,v7.h)
1139 ; CHECK-NEXT: v2 = vor(v8,v2)
1140 ; CHECK-NEXT: q1 = vcmp.gt(v28.h,v3.h)
1143 ; CHECK-NEXT: q0 = vcmp.gt(v28.h,v4.h)
1146 ; CHECK-NEXT: v5.h = vlsr(v5.h,v4.h)
1149 ; CHECK-NEXT: v2.h = vlsr(v2.h,v3.h)
1150 ; CHECK-NEXT: v30 = vmux(q0,v29,v5)
1153 ; CHECK-NEXT: v31 = vmux(q1,v29,v2)
1154 ; CHECK-NEXT: v0 = vmux(q2,v28,v30)
1157 ; CHECK-NEXT: v1 = vmux(q3,v28,v31)
1160 ; CHECK-NEXT: v0.ub = vpack(v1.h,v0.h):sat
1161 ; CHECK-NEXT: jumpr r31
1162 ; CHECK-NEXT: vmem(r1+#0) = v0.new
1164 %v0 = load <128 x half>, ptr %a0, align 128
1165 %v1 = fptoui <128 x half> %v0 to <128 x i8>
1166 store <128 x i8> %v1, ptr %a1, align 128
1171 define void @f16u8_1(ptr %a0, ptr %a1) #0 {
1172 ; CHECK-LABEL: f16u8_1:
1173 ; CHECK: .cfi_startproc
1174 ; CHECK-NEXT: // %bb.0:
1176 ; CHECK-NEXT: r7 = ##32768
1177 ; CHECK-NEXT: r3:2 = combine(#5,#1)
1178 ; CHECK-NEXT: v0 = vmem(r0+#0)
1181 ; CHECK-NEXT: v2.h = vsplat(r7)
1182 ; CHECK-NEXT: v3.h = vasl(v0.h,r2)
1183 ; CHECK-NEXT: r6 = #14
1184 ; CHECK-NEXT: r5 = #11
1187 ; CHECK-NEXT: v4.h = vsplat(r6)
1188 ; CHECK-NEXT: r4 = #16
1189 ; CHECK-NEXT: v6.h = vasl(v0.h,r3)
1190 ; CHECK-NEXT: v3.h = vsub(v3.h,v2.h)
1193 ; CHECK-NEXT: v5.h = vsplat(r4)
1194 ; CHECK-NEXT: r3 = #32767
1195 ; CHECK-NEXT: v2 = vor(v6,v2)
1196 ; CHECK-NEXT: v1 = vxor(v1,v1)
1199 ; CHECK-NEXT: v30.h = vsplat(r3)
1200 ; CHECK-NEXT: r2 = #64
1201 ; CHECK-NEXT: v3.h = vasr(v3.h,r5)
1202 ; CHECK-NEXT: q1 = vcmp.gt(v1.h,v0.h)
1205 ; CHECK-NEXT: q3 = vsetq(r2)
1206 ; CHECK-NEXT: v3.h = vsub(v4.h,v3.h)
1209 ; CHECK-NEXT: v3.h = vmin(v3.h,v5.h)
1212 ; CHECK-NEXT: q0 = vcmp.gt(v1.h,v3.h)
1215 ; CHECK-NEXT: v2.h = vlsr(v2.h,v3.h)
1218 ; CHECK-NEXT: v31 = vmux(q0,v30,v2)
1221 ; CHECK-NEXT: v0 = vmux(q1,v1,v31)
1224 ; CHECK-NEXT: v0.ub = vpack(v0.h,v0.h):sat
1227 ; CHECK-NEXT: jumpr r31
1228 ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
1230 %v0 = load <64 x half>, ptr %a0, align 128
1231 %v1 = fptoui <64 x half> %v0 to <64 x i8>
1232 store <64 x i8> %v1, ptr %a1, align 128
1238 define void @f16u16_0(ptr %a0, ptr %a1) #0 {
1239 ; CHECK-LABEL: f16u16_0:
1240 ; CHECK: .cfi_startproc
1241 ; CHECK-NEXT: // %bb.0:
1243 ; CHECK-NEXT: r7 = ##32768
1244 ; CHECK-NEXT: r3:2 = combine(#5,#1)
1245 ; CHECK-NEXT: v0 = vmem(r0+#0)
1248 ; CHECK-NEXT: v2.h = vsplat(r7)
1249 ; CHECK-NEXT: v3.h = vasl(v0.h,r2)
1250 ; CHECK-NEXT: r6 = #14
1251 ; CHECK-NEXT: r5 = #11
1254 ; CHECK-NEXT: v4.h = vsplat(r6)
1255 ; CHECK-NEXT: r4 = #16
1256 ; CHECK-NEXT: v6.h = vasl(v0.h,r3)
1257 ; CHECK-NEXT: v3.h = vsub(v3.h,v2.h)
1260 ; CHECK-NEXT: v5.h = vsplat(r4)
1261 ; CHECK-NEXT: r2 = #32767
1262 ; CHECK-NEXT: v2 = vor(v6,v2)
1263 ; CHECK-NEXT: v1 = vxor(v1,v1)
1266 ; CHECK-NEXT: v30.h = vsplat(r2)
1267 ; CHECK-NEXT: v3.h = vasr(v3.h,r5)
1268 ; CHECK-NEXT: q1 = vcmp.gt(v1.h,v0.h)
1271 ; CHECK-NEXT: v3.h = vsub(v4.h,v3.h)
1274 ; CHECK-NEXT: v3.h = vmin(v3.h,v5.h)
1277 ; CHECK-NEXT: q0 = vcmp.gt(v1.h,v3.h)
1280 ; CHECK-NEXT: v2.h = vlsr(v2.h,v3.h)
1283 ; CHECK-NEXT: v31 = vmux(q0,v30,v2)
1286 ; CHECK-NEXT: v0 = vmux(q1,v1,v31)
1287 ; CHECK-NEXT: jumpr r31
1288 ; CHECK-NEXT: vmem(r1+#0) = v0.new
1290 %v0 = load <64 x half>, ptr %a0, align 128
1291 %v1 = fptoui <64 x half> %v0 to <64 x i16>
1292 store <64 x i16> %v1, ptr %a1, align 128
1296 ; Widen input and result
1297 define void @f16u16_1(ptr %a0, ptr %a1) #0 {
1298 ; CHECK-LABEL: f16u16_1:
1299 ; CHECK: .cfi_startproc
1300 ; CHECK-NEXT: // %bb.0:
1302 ; CHECK-NEXT: r7 = ##32768
1303 ; CHECK-NEXT: r3:2 = combine(#5,#1)
1304 ; CHECK-NEXT: v0 = vmem(r0+#0)
1307 ; CHECK-NEXT: v2.h = vsplat(r7)
1308 ; CHECK-NEXT: v3.h = vasl(v0.h,r2)
1309 ; CHECK-NEXT: r6 = #14
1310 ; CHECK-NEXT: r5 = #11
1313 ; CHECK-NEXT: v4.h = vsplat(r6)
1314 ; CHECK-NEXT: r4 = #16
1315 ; CHECK-NEXT: v6.h = vasl(v0.h,r3)
1316 ; CHECK-NEXT: v3.h = vsub(v3.h,v2.h)
1319 ; CHECK-NEXT: v5.h = vsplat(r4)
1320 ; CHECK-NEXT: r3 = #32767
1321 ; CHECK-NEXT: v2 = vor(v6,v2)
1322 ; CHECK-NEXT: v1 = vxor(v1,v1)
1325 ; CHECK-NEXT: v30.h = vsplat(r3)
1326 ; CHECK-NEXT: r2 = #64
1327 ; CHECK-NEXT: v3.h = vasr(v3.h,r5)
1328 ; CHECK-NEXT: q1 = vcmp.gt(v1.h,v0.h)
1331 ; CHECK-NEXT: q3 = vsetq(r2)
1332 ; CHECK-NEXT: v3.h = vsub(v4.h,v3.h)
1335 ; CHECK-NEXT: v3.h = vmin(v3.h,v5.h)
1338 ; CHECK-NEXT: q0 = vcmp.gt(v1.h,v3.h)
1341 ; CHECK-NEXT: v2.h = vlsr(v2.h,v3.h)
1344 ; CHECK-NEXT: v31 = vmux(q0,v30,v2)
1347 ; CHECK-NEXT: v0 = vmux(q1,v1,v31)
1350 ; CHECK-NEXT: jumpr r31
1351 ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
1353 %v0 = load <32 x half>, ptr %a0, align 128
1354 %v1 = fptoui <32 x half> %v0 to <32 x i16>
1355 store <32 x i16> %v1, ptr %a1, align 128
1361 define void @f16u32_0(ptr %a0, ptr %a1) #0 {
1362 ; CHECK-LABEL: f16u32_0:
1363 ; CHECK: .cfi_startproc
1364 ; CHECK-NEXT: // %bb.0:
1366 ; CHECK-NEXT: r2 = #15360
1367 ; CHECK-NEXT: r7 = #-4
1368 ; CHECK-NEXT: v0 = vmem(r0+#0)
1371 ; CHECK-NEXT: v1.h = vsplat(r2)
1372 ; CHECK-NEXT: r4 = ##-2147483648
1373 ; CHECK-NEXT: r3:2 = combine(#30,#1)
1376 ; CHECK-NEXT: v2 = vsplat(r4)
1377 ; CHECK-NEXT: r4 = #32
1378 ; CHECK-NEXT: r6 = #24
1379 ; CHECK-NEXT: r0 = #8
1382 ; CHECK-NEXT: v6 = vsplat(r3)
1383 ; CHECK-NEXT: v26 = vxor(v26,v26)
1386 ; CHECK-NEXT: v1:0.qf32 = vmpy(v0.hf,v1.hf)
1389 ; CHECK-NEXT: v7 = vsplat(r4)
1392 ; CHECK-NEXT: v0.sf = v0.qf32
1395 ; CHECK-NEXT: v1.sf = v1.qf32
1398 ; CHECK-NEXT: v1:0 = vshuff(v1,v0,r7)
1401 ; CHECK-NEXT: q1 = vcmp.gt(v26.w,v1.w)
1402 ; CHECK-NEXT: q3 = vcmp.gt(v26.w,v0.w)
1405 ; CHECK-NEXT: v3.w = vasl(v1.w,r2)
1408 ; CHECK-NEXT: r2 = ##2147483647
1409 ; CHECK-NEXT: v4.w = vasl(v0.w,r2)
1410 ; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
1413 ; CHECK-NEXT: v27 = vsplat(r2)
1414 ; CHECK-NEXT: v5.w = vasl(v1.w,r0)
1415 ; CHECK-NEXT: v4.w = vsub(v4.w,v2.w)
1418 ; CHECK-NEXT: v3.w = vasr(v3.w,r6)
1419 ; CHECK-NEXT: v5 = vor(v5,v2)
1422 ; CHECK-NEXT: v4.w = vasr(v4.w,r6)
1423 ; CHECK-NEXT: v3.w = vsub(v6.w,v3.w)
1426 ; CHECK-NEXT: v8.w = vasl(v0.w,r0)
1427 ; CHECK-NEXT: v4.w = vsub(v6.w,v4.w)
1428 ; CHECK-NEXT: v3.w = vmin(v3.w,v7.w)
1431 ; CHECK-NEXT: v4.w = vmin(v4.w,v7.w)
1432 ; CHECK-NEXT: v2 = vor(v8,v2)
1433 ; CHECK-NEXT: q0 = vcmp.gt(v26.w,v3.w)
1436 ; CHECK-NEXT: v3.w = vlsr(v5.w,v3.w)
1437 ; CHECK-NEXT: q2 = vcmp.gt(v26.w,v4.w)
1440 ; CHECK-NEXT: v28.w = vlsr(v2.w,v4.w)
1441 ; CHECK-NEXT: v29 = vmux(q0,v27,v3)
1444 ; CHECK-NEXT: v30 = vmux(q2,v27,v28)
1445 ; CHECK-NEXT: v31 = vmux(q1,v26,v29)
1446 ; CHECK-NEXT: vmem(r1+#1) = v31.new
1449 ; CHECK-NEXT: v0 = vmux(q3,v26,v30)
1450 ; CHECK-NEXT: jumpr r31
1451 ; CHECK-NEXT: vmem(r1+#0) = v0.new
1453 %v0 = load <64 x half>, ptr %a0, align 128
1454 %v1 = fptoui <64 x half> %v0 to <64 x i32>
1455 store <64 x i32> %v1, ptr %a1, align 128
; Test fptoui <32 x half> -> <32 x i32>. The result fills one 128B HVX
; vector; the f16 input is only half a vector wide, hence the "_1"
; widened variant (see -hexagon-hvx-widen=32 in the RUN line).
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py;
; regenerate rather than hand-edit.
1460 define void @f16u32_1(ptr %a0, ptr %a1) #0 {
1461 ; CHECK-LABEL: f16u32_1:
1462 ; CHECK: .cfi_startproc
1463 ; CHECK-NEXT: // %bb.0:
1465 ; CHECK-NEXT: r4 = #15360
1466 ; CHECK-NEXT: r7 = #-4
1467 ; CHECK-NEXT: v0 = vmem(r0+#0)
1470 ; CHECK-NEXT: v1.h = vsplat(r4)
1471 ; CHECK-NEXT: r2 = ##-2147483648
1472 ; CHECK-NEXT: r3 = #1
1475 ; CHECK-NEXT: v3 = vsplat(r2)
1476 ; CHECK-NEXT: r5:4 = combine(#8,#30)
1477 ; CHECK-NEXT: r6 = #24
1480 ; CHECK-NEXT: v4 = vsplat(r4)
1481 ; CHECK-NEXT: r2 = ##2147483647
1482 ; CHECK-NEXT: r4 = #32
1485 ; CHECK-NEXT: v1:0.qf32 = vmpy(v0.hf,v1.hf)
1486 ; CHECK-NEXT: v2 = vxor(v2,v2)
1489 ; CHECK-NEXT: v5 = vsplat(r4)
1490 ; CHECK-NEXT: v30 = vsplat(r2)
1493 ; CHECK-NEXT: v0.sf = v0.qf32
1496 ; CHECK-NEXT: v1.sf = v1.qf32
1499 ; CHECK-NEXT: v1:0 = vshuff(v1,v0,r7)
1502 ; CHECK-NEXT: q1 = vcmp.gt(v2.w,v0.w)
1505 ; CHECK-NEXT: v1.w = vasl(v0.w,r3)
1508 ; CHECK-NEXT: v6.w = vasl(v0.w,r5)
1509 ; CHECK-NEXT: v1.w = vsub(v1.w,v3.w)
1512 ; CHECK-NEXT: v3 = vor(v6,v3)
1515 ; CHECK-NEXT: v1.w = vasr(v1.w,r6)
1518 ; CHECK-NEXT: v1.w = vsub(v4.w,v1.w)
1521 ; CHECK-NEXT: v1.w = vmin(v1.w,v5.w)
1524 ; CHECK-NEXT: q0 = vcmp.gt(v2.w,v1.w)
1527 ; CHECK-NEXT: v1.w = vlsr(v3.w,v1.w)
1530 ; CHECK-NEXT: v31 = vmux(q0,v30,v1)
1533 ; CHECK-NEXT: v0 = vmux(q1,v2,v31)
1534 ; CHECK-NEXT: jumpr r31
1535 ; CHECK-NEXT: vmem(r1+#0) = v0.new
1537 %v0 = load <32 x half>, ptr %a0, align 128
1538 %v1 = fptoui <32 x half> %v0 to <32 x i32>
1539 store <32 x i32> %v1, ptr %a1, align 128
; Test fptoui <128 x float> -> <128 x i8>. Four full input vectors are
; converted, packed down to bytes with saturation (vpack ...:sat), and
; shuffled into a single 128B result vector.
; NOTE(review): CHECK lines are autogenerated; regenerate, do not hand-edit.
1545 define void @f32u8_0(ptr %a0, ptr %a1) #0 {
1546 ; CHECK-LABEL: f32u8_0:
1547 ; CHECK: .cfi_startproc
1548 ; CHECK-NEXT: // %bb.0:
1550 ; CHECK-NEXT: r3:2 = combine(#8,#1)
1551 ; CHECK-NEXT: r4 = ##-2147483648
1552 ; CHECK-NEXT: v5 = vmem(r0+#0)
1555 ; CHECK-NEXT: v3 = vsplat(r4)
1556 ; CHECK-NEXT: r5 = #30
1557 ; CHECK-NEXT: r6 = #24
1558 ; CHECK-NEXT: v2 = vmem(r0+#1)
1561 ; CHECK-NEXT: v14 = vsplat(r5)
1562 ; CHECK-NEXT: r4 = #32
1563 ; CHECK-NEXT: v8.w = vasl(v5.w,r2)
1564 ; CHECK-NEXT: v0 = vmem(r0+#3)
1567 ; CHECK-NEXT: v9.w = vasl(v2.w,r2)
1568 ; CHECK-NEXT: v13 = vxor(v13,v13)
1569 ; CHECK-NEXT: v8.w = vsub(v8.w,v3.w)
1570 ; CHECK-NEXT: v1 = vmem(r0+#2)
1573 ; CHECK-NEXT: v20 = vsplat(r4)
1574 ; CHECK-NEXT: v12.w = vasl(v0.w,r2)
1575 ; CHECK-NEXT: v9.w = vsub(v9.w,v3.w)
1576 ; CHECK-NEXT: q0 = vcmp.gt(v13.w,v5.w)
1579 ; CHECK-NEXT: v11.w = vasl(v1.w,r2)
1580 ; CHECK-NEXT: q3 = vcmp.gt(v13.w,v2.w)
1581 ; CHECK-NEXT: v12.w = vsub(v12.w,v3.w)
1584 ; CHECK-NEXT: r2 = ##2147483647
1585 ; CHECK-NEXT: r7 = #64
1586 ; CHECK-NEXT: v11.w = vsub(v11.w,v3.w)
1589 ; CHECK-NEXT: v22 = vsplat(r2)
1590 ; CHECK-NEXT: v8.w = vasr(v8.w,r6)
1593 ; CHECK-NEXT: v9.w = vasr(v9.w,r6)
1594 ; CHECK-NEXT: v8.w = vsub(v14.w,v8.w)
1597 ; CHECK-NEXT: v6.w = vasl(v5.w,r3)
1598 ; CHECK-NEXT: v9.w = vsub(v14.w,v9.w)
1599 ; CHECK-NEXT: v8.w = vmin(v8.w,v20.w)
1602 ; CHECK-NEXT: v7.w = vasl(v2.w,r3)
1603 ; CHECK-NEXT: v6 = vor(v6,v3)
1604 ; CHECK-NEXT: v9.w = vmin(v9.w,v20.w)
1605 ; CHECK-NEXT: q1 = vcmp.gt(v13.w,v8.w)
1608 ; CHECK-NEXT: v19.w = vasr(v11.w,r6)
1609 ; CHECK-NEXT: v7 = vor(v7,v3)
1610 ; CHECK-NEXT: q2 = vcmp.gt(v13.w,v9.w)
1613 ; CHECK-NEXT: v12.w = vasr(v12.w,r6)
1614 ; CHECK-NEXT: v5.w = vsub(v14.w,v19.w)
1617 ; CHECK-NEXT: v4.w = vasl(v1.w,r3)
1618 ; CHECK-NEXT: v21.w = vsub(v14.w,v12.w)
1619 ; CHECK-NEXT: v5.w = vmin(v5.w,v20.w)
1622 ; CHECK-NEXT: v10.w = vasl(v0.w,r3)
1623 ; CHECK-NEXT: v4 = vor(v4,v3)
1626 ; CHECK-NEXT: v6.w = vlsr(v6.w,v8.w)
1627 ; CHECK-NEXT: v3 = vor(v10,v3)
1628 ; CHECK-NEXT: v10.w = vmin(v21.w,v20.w)
1631 ; CHECK-NEXT: v7.w = vlsr(v7.w,v9.w)
1632 ; CHECK-NEXT: v24 = vmux(q1,v22,v6)
1633 ; CHECK-NEXT: q1 = vcmp.gt(v13.w,v5.w)
1636 ; CHECK-NEXT: v23.w = vlsr(v4.w,v5.w)
1637 ; CHECK-NEXT: v25 = vmux(q2,v22,v7)
1638 ; CHECK-NEXT: q2 = vcmp.gt(v13.w,v10.w)
1639 ; CHECK-NEXT: v4 = vmux(q0,v13,v24)
1642 ; CHECK-NEXT: v3.w = vlsr(v3.w,v10.w)
1643 ; CHECK-NEXT: v26 = vmux(q3,v13,v25)
1644 ; CHECK-NEXT: v2 = vmux(q1,v22,v23)
1645 ; CHECK-NEXT: q1 = vcmp.gt(v13.w,v1.w)
1648 ; CHECK-NEXT: v27 = vmux(q2,v22,v3)
1649 ; CHECK-NEXT: q3 = vcmp.gt(v13.w,v0.w)
1650 ; CHECK-NEXT: v28 = vmux(q1,v13,v2)
1653 ; CHECK-NEXT: v29.uh = vpack(v26.w,v4.w):sat
1654 ; CHECK-NEXT: v1 = vmux(q3,v13,v27)
1657 ; CHECK-NEXT: v30.uh = vpack(v1.w,v28.w):sat
1660 ; CHECK-NEXT: v0.uh = vpack(v1.w,v28.w):sat
1663 ; CHECK-NEXT: v31.ub = vpack(v30.h,v29.h):sat
1666 ; CHECK-NEXT: v0.ub = vpack(v30.h,v0.h):sat
1669 ; CHECK-NEXT: v1:0 = vshuff(v0,v31,r7)
1670 ; CHECK-NEXT: jumpr r31
1671 ; CHECK-NEXT: vmem(r1+#0) = v0.new
1673 %v0 = load <128 x float>, ptr %a0, align 128
1674 %v1 = fptoui <128 x float> %v0 to <128 x i8>
1675 store <128 x i8> %v1, ptr %a1, align 128
; Test fptoui <64 x float> -> <64 x i8>. The 64-byte result is half an
; HVX vector, so the final store is predicated under q3 = vsetq(#64).
; NOTE(review): CHECK lines are autogenerated; regenerate, do not hand-edit.
1680 define void @f32u8_1(ptr %a0, ptr %a1) #0 {
1681 ; CHECK-LABEL: f32u8_1:
1682 ; CHECK: .cfi_startproc
1683 ; CHECK-NEXT: // %bb.0:
1685 ; CHECK-NEXT: r3:2 = combine(##-2147483648,#1)
1686 ; CHECK-NEXT: r4 = #30
1687 ; CHECK-NEXT: v0 = vmem(r0+#1)
1690 ; CHECK-NEXT: v2 = vsplat(r3)
1691 ; CHECK-NEXT: r7:6 = combine(#24,#32)
1692 ; CHECK-NEXT: v3.w = vasl(v0.w,r2)
1693 ; CHECK-NEXT: v1 = vmem(r0+#0)
1696 ; CHECK-NEXT: v6 = vsplat(r4)
1697 ; CHECK-NEXT: r5 = #8
1698 ; CHECK-NEXT: v4.w = vasl(v1.w,r2)
1699 ; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
1702 ; CHECK-NEXT: v7 = vsplat(r6)
1703 ; CHECK-NEXT: v5.w = vasl(v1.w,r5)
1704 ; CHECK-NEXT: v4.w = vsub(v4.w,v2.w)
1705 ; CHECK-NEXT: v27 = vxor(v27,v27)
1708 ; CHECK-NEXT: r3 = ##2147483647
1709 ; CHECK-NEXT: v3.w = vasr(v3.w,r7)
1710 ; CHECK-NEXT: v5 = vor(v5,v2)
1713 ; CHECK-NEXT: v28 = vsplat(r3)
1714 ; CHECK-NEXT: v4.w = vasr(v4.w,r7)
1715 ; CHECK-NEXT: q2 = vcmp.gt(v27.w,v1.w)
1716 ; CHECK-NEXT: v3.w = vsub(v6.w,v3.w)
1719 ; CHECK-NEXT: r2 = #64
1720 ; CHECK-NEXT: v8.w = vasl(v0.w,r5)
1721 ; CHECK-NEXT: q3 = vcmp.gt(v27.w,v0.w)
1722 ; CHECK-NEXT: v4.w = vsub(v6.w,v4.w)
1725 ; CHECK-NEXT: v3.w = vmin(v3.w,v7.w)
1726 ; CHECK-NEXT: v4.w = vmin(v4.w,v7.w)
1727 ; CHECK-NEXT: v2 = vor(v8,v2)
1730 ; CHECK-NEXT: q1 = vcmp.gt(v27.w,v3.w)
1731 ; CHECK-NEXT: q0 = vcmp.gt(v27.w,v4.w)
1734 ; CHECK-NEXT: v5.w = vlsr(v5.w,v4.w)
1737 ; CHECK-NEXT: v2.w = vlsr(v2.w,v3.w)
1738 ; CHECK-NEXT: v29 = vmux(q0,v28,v5)
1741 ; CHECK-NEXT: v30 = vmux(q1,v28,v2)
1742 ; CHECK-NEXT: v0 = vmux(q2,v27,v29)
1745 ; CHECK-NEXT: q3 = vsetq(r2)
1746 ; CHECK-NEXT: v1 = vmux(q3,v27,v30)
1749 ; CHECK-NEXT: v31.uh = vpack(v1.w,v0.w):sat
1752 ; CHECK-NEXT: v0.uh = vpack(v1.w,v0.w):sat
1755 ; CHECK-NEXT: v0.ub = vpack(v31.h,v0.h):sat
1758 ; CHECK-NEXT: jumpr r31
1759 ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
1761 %v0 = load <64 x float>, ptr %a0, align 128
1762 %v1 = fptoui <64 x float> %v0 to <64 x i8>
1763 store <64 x i8> %v1, ptr %a1, align 128
; Test fptoui <32 x float> -> <32 x i8>. The 32-byte result is a quarter
; vector; the store is predicated under q3 = vsetq(r6) with r6 = #32.
; NOTE(review): CHECK lines are autogenerated; regenerate, do not hand-edit.
1768 define void @f32u8_2(ptr %a0, ptr %a1) #0 {
1769 ; CHECK-LABEL: f32u8_2:
1770 ; CHECK: .cfi_startproc
1771 ; CHECK-NEXT: // %bb.0:
1773 ; CHECK-NEXT: r7 = ##-2147483648
1774 ; CHECK-NEXT: r3:2 = combine(#30,#1)
1775 ; CHECK-NEXT: v0 = vmem(r0+#0)
1778 ; CHECK-NEXT: v2 = vsplat(r7)
1779 ; CHECK-NEXT: r5:4 = combine(#8,#24)
1780 ; CHECK-NEXT: r6 = #32
1781 ; CHECK-NEXT: v3.w = vasl(v0.w,r2)
1784 ; CHECK-NEXT: v4 = vsplat(r3)
1785 ; CHECK-NEXT: v5 = vsplat(r6)
1786 ; CHECK-NEXT: v6.w = vasl(v0.w,r5)
1787 ; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
1790 ; CHECK-NEXT: v1 = vxor(v1,v1)
1791 ; CHECK-NEXT: v2 = vor(v6,v2)
1794 ; CHECK-NEXT: q3 = vsetq(r6)
1795 ; CHECK-NEXT: v3.w = vasr(v3.w,r4)
1796 ; CHECK-NEXT: q1 = vcmp.gt(v1.w,v0.w)
1799 ; CHECK-NEXT: r4 = ##2147483647
1800 ; CHECK-NEXT: v3.w = vsub(v4.w,v3.w)
1803 ; CHECK-NEXT: v30 = vsplat(r4)
1804 ; CHECK-NEXT: v3.w = vmin(v3.w,v5.w)
1807 ; CHECK-NEXT: q0 = vcmp.gt(v1.w,v3.w)
1810 ; CHECK-NEXT: v2.w = vlsr(v2.w,v3.w)
1813 ; CHECK-NEXT: v31 = vmux(q0,v30,v2)
1816 ; CHECK-NEXT: v0 = vmux(q1,v1,v31)
1819 ; CHECK-NEXT: v1.uh = vpack(v1.w,v0.w):sat
1822 ; CHECK-NEXT: v0.uh = vpack(v0.w,v0.w):sat
1825 ; CHECK-NEXT: v0.ub = vpack(v1.h,v0.h):sat
1828 ; CHECK-NEXT: jumpr r31
1829 ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
1831 %v0 = load <32 x float>, ptr %a0, align 128
1832 %v1 = fptoui <32 x float> %v0 to <32 x i8>
1833 store <32 x i8> %v1, ptr %a1, align 128
; Test fptoui <64 x float> -> <64 x i16>. Two input vectors are converted
; and packed with saturation into one full 128B result vector.
; NOTE(review): CHECK lines are autogenerated; regenerate, do not hand-edit.
1839 define void @f32u16_0(ptr %a0, ptr %a1) #0 {
1840 ; CHECK-LABEL: f32u16_0:
1841 ; CHECK: .cfi_startproc
1842 ; CHECK-NEXT: // %bb.0:
1844 ; CHECK-NEXT: r3:2 = combine(##-2147483648,#1)
1845 ; CHECK-NEXT: r4 = #30
1846 ; CHECK-NEXT: v0 = vmem(r0+#1)
1849 ; CHECK-NEXT: v2 = vsplat(r3)
1850 ; CHECK-NEXT: r7:6 = combine(#24,#32)
1851 ; CHECK-NEXT: v3.w = vasl(v0.w,r2)
1852 ; CHECK-NEXT: v1 = vmem(r0+#0)
1855 ; CHECK-NEXT: v6 = vsplat(r4)
1856 ; CHECK-NEXT: r5 = #8
1857 ; CHECK-NEXT: v4.w = vasl(v1.w,r2)
1858 ; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
1861 ; CHECK-NEXT: v7 = vsplat(r6)
1862 ; CHECK-NEXT: v5.w = vasl(v1.w,r5)
1863 ; CHECK-NEXT: v4.w = vsub(v4.w,v2.w)
1864 ; CHECK-NEXT: v28 = vxor(v28,v28)
1867 ; CHECK-NEXT: r2 = ##2147483647
1868 ; CHECK-NEXT: v3.w = vasr(v3.w,r7)
1869 ; CHECK-NEXT: v5 = vor(v5,v2)
1872 ; CHECK-NEXT: v29 = vsplat(r2)
1873 ; CHECK-NEXT: v4.w = vasr(v4.w,r7)
1874 ; CHECK-NEXT: q2 = vcmp.gt(v28.w,v1.w)
1875 ; CHECK-NEXT: v3.w = vsub(v6.w,v3.w)
1878 ; CHECK-NEXT: v8.w = vasl(v0.w,r5)
1879 ; CHECK-NEXT: q3 = vcmp.gt(v28.w,v0.w)
1880 ; CHECK-NEXT: v4.w = vsub(v6.w,v4.w)
1881 ; CHECK-NEXT: v3.w = vmin(v3.w,v7.w)
1884 ; CHECK-NEXT: v4.w = vmin(v4.w,v7.w)
1885 ; CHECK-NEXT: v2 = vor(v8,v2)
1886 ; CHECK-NEXT: q1 = vcmp.gt(v28.w,v3.w)
1889 ; CHECK-NEXT: q0 = vcmp.gt(v28.w,v4.w)
1892 ; CHECK-NEXT: v5.w = vlsr(v5.w,v4.w)
1895 ; CHECK-NEXT: v2.w = vlsr(v2.w,v3.w)
1896 ; CHECK-NEXT: v30 = vmux(q0,v29,v5)
1899 ; CHECK-NEXT: v31 = vmux(q1,v29,v2)
1900 ; CHECK-NEXT: v0 = vmux(q2,v28,v30)
1903 ; CHECK-NEXT: v1 = vmux(q3,v28,v31)
1906 ; CHECK-NEXT: v0.uh = vpack(v1.w,v0.w):sat
1907 ; CHECK-NEXT: jumpr r31
1908 ; CHECK-NEXT: vmem(r1+#0) = v0.new
1910 %v0 = load <64 x float>, ptr %a0, align 128
1911 %v1 = fptoui <64 x float> %v0 to <64 x i16>
1912 store <64 x i16> %v1, ptr %a1, align 128
; Test fptoui <32 x float> -> <32 x i16>. The 64-byte result is half a
; vector; the store is predicated under q3 = vsetq(#64).
; NOTE(review): CHECK lines are autogenerated; regenerate, do not hand-edit.
1917 define void @f32u16_1(ptr %a0, ptr %a1) #0 {
1918 ; CHECK-LABEL: f32u16_1:
1919 ; CHECK: .cfi_startproc
1920 ; CHECK-NEXT: // %bb.0:
1922 ; CHECK-NEXT: r7 = ##-2147483648
1923 ; CHECK-NEXT: r3:2 = combine(#8,#1)
1924 ; CHECK-NEXT: v0 = vmem(r0+#0)
1927 ; CHECK-NEXT: v2 = vsplat(r7)
1928 ; CHECK-NEXT: v3.w = vasl(v0.w,r2)
1929 ; CHECK-NEXT: r6 = #30
1930 ; CHECK-NEXT: r5 = #24
1933 ; CHECK-NEXT: v4 = vsplat(r6)
1934 ; CHECK-NEXT: r4 = #32
1935 ; CHECK-NEXT: v6.w = vasl(v0.w,r3)
1936 ; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
1939 ; CHECK-NEXT: v5 = vsplat(r4)
1940 ; CHECK-NEXT: v2 = vor(v6,v2)
1941 ; CHECK-NEXT: v1 = vxor(v1,v1)
1944 ; CHECK-NEXT: r3 = ##2147483647
1945 ; CHECK-NEXT: r2 = #64
1946 ; CHECK-NEXT: v3.w = vasr(v3.w,r5)
1949 ; CHECK-NEXT: v30 = vsplat(r3)
1950 ; CHECK-NEXT: q3 = vsetq(r2)
1951 ; CHECK-NEXT: q1 = vcmp.gt(v1.w,v0.w)
1952 ; CHECK-NEXT: v3.w = vsub(v4.w,v3.w)
1955 ; CHECK-NEXT: v3.w = vmin(v3.w,v5.w)
1958 ; CHECK-NEXT: q0 = vcmp.gt(v1.w,v3.w)
1961 ; CHECK-NEXT: v2.w = vlsr(v2.w,v3.w)
1964 ; CHECK-NEXT: v31 = vmux(q0,v30,v2)
1967 ; CHECK-NEXT: v0 = vmux(q1,v1,v31)
1970 ; CHECK-NEXT: v0.uh = vpack(v0.w,v0.w):sat
1973 ; CHECK-NEXT: jumpr r31
1974 ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
1976 %v0 = load <32 x float>, ptr %a0, align 128
1977 %v1 = fptoui <32 x float> %v0 to <32 x i16>
1978 store <32 x i16> %v1, ptr %a1, align 128
; Test fptoui <32 x float> -> <32 x i32>. Input and output each fill one
; 128B HVX vector exactly, so no widening or predicated store is needed.
; NOTE(review): CHECK lines are autogenerated; regenerate, do not hand-edit.
1984 define void @f32u32_0(ptr %a0, ptr %a1) #0 {
1985 ; CHECK-LABEL: f32u32_0:
1986 ; CHECK: .cfi_startproc
1987 ; CHECK-NEXT: // %bb.0:
1989 ; CHECK-NEXT: r7 = ##-2147483648
1990 ; CHECK-NEXT: r3:2 = combine(#8,#1)
1991 ; CHECK-NEXT: v0 = vmem(r0+#0)
1994 ; CHECK-NEXT: v2 = vsplat(r7)
1995 ; CHECK-NEXT: v3.w = vasl(v0.w,r2)
1996 ; CHECK-NEXT: r6 = #30
1997 ; CHECK-NEXT: r5 = #24
2000 ; CHECK-NEXT: v4 = vsplat(r6)
2001 ; CHECK-NEXT: r4 = #32
2002 ; CHECK-NEXT: v6.w = vasl(v0.w,r3)
2003 ; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
2006 ; CHECK-NEXT: v5 = vsplat(r4)
2007 ; CHECK-NEXT: v2 = vor(v6,v2)
2008 ; CHECK-NEXT: v1 = vxor(v1,v1)
2011 ; CHECK-NEXT: r2 = ##2147483647
2012 ; CHECK-NEXT: v3.w = vasr(v3.w,r5)
2015 ; CHECK-NEXT: v30 = vsplat(r2)
2016 ; CHECK-NEXT: q1 = vcmp.gt(v1.w,v0.w)
2017 ; CHECK-NEXT: v3.w = vsub(v4.w,v3.w)
2020 ; CHECK-NEXT: v3.w = vmin(v3.w,v5.w)
2023 ; CHECK-NEXT: q0 = vcmp.gt(v1.w,v3.w)
2026 ; CHECK-NEXT: v2.w = vlsr(v2.w,v3.w)
2029 ; CHECK-NEXT: v31 = vmux(q0,v30,v2)
2032 ; CHECK-NEXT: v0 = vmux(q1,v1,v31)
2033 ; CHECK-NEXT: jumpr r31
2034 ; CHECK-NEXT: vmem(r1+#0) = v0.new
2036 %v0 = load <32 x float>, ptr %a0, align 128
2037 %v1 = fptoui <32 x float> %v0 to <32 x i32>
2038 store <32 x i32> %v1, ptr %a1, align 128
; Test fptoui <16 x float> -> <16 x i32>. Both the 64-byte input and the
; 64-byte result are half a vector wide, so both get widened and the
; store is predicated under q3 = vsetq(#64).
; NOTE(review): CHECK lines are autogenerated; regenerate, do not hand-edit.
2042 ; Widen input and result
2043 define void @f32u32_1(ptr %a0, ptr %a1) #0 {
2044 ; CHECK-LABEL: f32u32_1:
2045 ; CHECK: .cfi_startproc
2046 ; CHECK-NEXT: // %bb.0:
2048 ; CHECK-NEXT: r7 = ##-2147483648
2049 ; CHECK-NEXT: r3:2 = combine(#8,#1)
2050 ; CHECK-NEXT: v0 = vmem(r0+#0)
2053 ; CHECK-NEXT: v2 = vsplat(r7)
2054 ; CHECK-NEXT: v3.w = vasl(v0.w,r2)
2055 ; CHECK-NEXT: r6 = #30
2056 ; CHECK-NEXT: r5 = #24
2059 ; CHECK-NEXT: v4 = vsplat(r6)
2060 ; CHECK-NEXT: r4 = #32
2061 ; CHECK-NEXT: v6.w = vasl(v0.w,r3)
2062 ; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
2065 ; CHECK-NEXT: v5 = vsplat(r4)
2066 ; CHECK-NEXT: v2 = vor(v6,v2)
2067 ; CHECK-NEXT: v1 = vxor(v1,v1)
2070 ; CHECK-NEXT: r3 = ##2147483647
2071 ; CHECK-NEXT: r2 = #64
2072 ; CHECK-NEXT: v3.w = vasr(v3.w,r5)
2075 ; CHECK-NEXT: v30 = vsplat(r3)
2076 ; CHECK-NEXT: q3 = vsetq(r2)
2077 ; CHECK-NEXT: q1 = vcmp.gt(v1.w,v0.w)
2078 ; CHECK-NEXT: v3.w = vsub(v4.w,v3.w)
2081 ; CHECK-NEXT: v3.w = vmin(v3.w,v5.w)
2084 ; CHECK-NEXT: q0 = vcmp.gt(v1.w,v3.w)
2087 ; CHECK-NEXT: v2.w = vlsr(v2.w,v3.w)
2090 ; CHECK-NEXT: v31 = vmux(q0,v30,v2)
2093 ; CHECK-NEXT: v0 = vmux(q1,v1,v31)
2096 ; CHECK-NEXT: jumpr r31
2097 ; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
2099 %v0 = load <16 x float>, ptr %a0, align 128
2100 %v1 = fptoui <16 x float> %v0 to <16 x i32>
2101 store <16 x i32> %v1, ptr %a1, align 128
2106 attributes #0 = { "target-features"="+v68,+hvxv68,+hvx-length128b,+hvx-qfloat" }