1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX950-SDAG %s
3 ; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX950-GISEL %s
; Declarations of the gfx950 scaled-conversion intrinsics exercised below.
; The pk.fp8.bf16 / pk.bf8.bf16 entries previously carried an "fb16" suffix
; that did not match the names used at the call sites.
5 declare <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src0, <16 x float> %src1, float %scale)
6 declare <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src0, <16 x float> %src1, float %scale)
7 declare <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half>, i32, float, i32, i1)
8 declare float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32, float, i32)
9 declare <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half>, i32, float, i32, i1)
10 declare float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32, float, i32)
11 declare <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16>, float, float, float, i1)
12 declare <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16>, float, float, float, i1)
13 declare <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32, float, i1)
14 declare <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32, float, i1)
15 declare <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16>, <2 x half>, float, i1)
16 declare <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16>, <2 x bfloat>, float, i1)
17 declare <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16>, <2 x half>, float, i1)
18 declare <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16>, <2 x bfloat>, float, i1)
19 declare <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32, float, i32)
20 declare i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32, float, float, float, i32)
21 declare <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32, float, i32)
22 declare <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32, float, i32)
23 declare <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.fp6(<6 x i32>, float)
24 declare <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.bf6(<6 x i32>, float)
25 declare <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32>, float)
26 declare <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32>, float)
27 declare <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32>, float)
28 declare <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32>, float)
29 declare <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32, float, i1)
30 declare <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32, float, i1)
31 declare <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32, float, i1)
32 declare <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32, float, i1)
33 declare i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32, <2 x half>, float, i32)
34 declare i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32, <2 x bfloat>, float, i32)
; Passes %src as both <16 x float> operands of the 2xpk16.fp6.f32 intrinsic
; with the %scale argument, then stores the <6 x i32> result to %out. The
; checks expect the scale in v16 and the store split into dwordx4 + dwordx2.
36 define amdgpu_ps void @test_scalef32_pk32_fp6_f32_vv(<16 x float> %src, float %scale, ptr addrspace(1) %out) {
37 ; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f32_vv:
38 ; GFX950-SDAG: ; %bb.0:
39 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v18
40 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v17
41 ; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[0:15], v[0:15], v16
42 ; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16
43 ; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off
44 ; GFX950-SDAG-NEXT: s_endpgm
46 ; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f32_vv:
47 ; GFX950-GISEL: ; %bb.0:
48 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v17
49 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v18
50 ; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[0:15], v[0:15], v16
51 ; GFX950-GISEL-NEXT: global_store_dwordx4 v[20:21], v[0:3], off
52 ; GFX950-GISEL-NEXT: global_store_dwordx2 v[20:21], v[4:5], off offset:16
53 ; GFX950-GISEL-NEXT: s_endpgm
54 %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src, <16 x float> %src, float %scale)
55 store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
; Same fp6 conversion, but %src is an inreg argument and the scale is the
; literal 100.0. The checks expect s[0:15] copied into v[2:17] and the scale
; materialized as 0x42c80000 (s16 in the -global-isel=0 run, v18 in the
; -global-isel=1 run).
59 define amdgpu_ps void @test_scalef32_pk32_fp6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) {
60 ; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f32_sl:
61 ; GFX950-SDAG: ; %bb.0:
62 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
63 ; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000
64 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
65 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
66 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
67 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
68 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
69 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
70 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
71 ; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[2:7], v[2:17], v[2:17], s16
72 ; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
73 ; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
74 ; GFX950-SDAG-NEXT: s_endpgm
76 ; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f32_sl:
77 ; GFX950-GISEL: ; %bb.0:
78 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
79 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
80 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
81 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
82 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
83 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
84 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
85 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
86 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000
87 ; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[2:7], v[2:17], v[2:17], v18
88 ; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
89 ; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
90 ; GFX950-GISEL-NEXT: s_endpgm
91 %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src, <16 x float> %src, float 100.0)
92 store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
; Passes %src as both <16 x float> operands of the 2xpk16.bf6.f32 intrinsic
; with the %scale argument, then stores the <6 x i32> result to %out. The
; checks expect the scale in v16 and the store split into dwordx4 + dwordx2.
96 define amdgpu_ps void @test_scalef32_pk32_bf6_f32_vv(<16 x float> %src, float %scale, ptr addrspace(1) %out) {
97 ; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f32_vv:
98 ; GFX950-SDAG: ; %bb.0:
99 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v18
100 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v17
101 ; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[0:15], v[0:15], v16
102 ; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16
103 ; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off
104 ; GFX950-SDAG-NEXT: s_endpgm
106 ; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f32_vv:
107 ; GFX950-GISEL: ; %bb.0:
108 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v17
109 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v18
110 ; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[0:15], v[0:15], v16
111 ; GFX950-GISEL-NEXT: global_store_dwordx4 v[20:21], v[0:3], off
112 ; GFX950-GISEL-NEXT: global_store_dwordx2 v[20:21], v[4:5], off offset:16
113 ; GFX950-GISEL-NEXT: s_endpgm
114 %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src, <16 x float> %src, float %scale)
115 store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
; Same bf6 conversion, but %src is an inreg argument and the scale is the
; literal 100.0. The checks expect s[0:15] copied into v[2:17] and the scale
; materialized as 0x42c80000 (s16 in the -global-isel=0 run, v18 in the
; -global-isel=1 run).
119 define amdgpu_ps void @test_scalef32_pk32_bf6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) {
120 ; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f32_sl:
121 ; GFX950-SDAG: ; %bb.0:
122 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
123 ; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000
124 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
125 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
126 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
127 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
128 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
129 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
130 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
131 ; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[2:7], v[2:17], v[2:17], s16
132 ; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
133 ; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
134 ; GFX950-SDAG-NEXT: s_endpgm
136 ; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f32_sl:
137 ; GFX950-GISEL: ; %bb.0:
138 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
139 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
140 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
141 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
142 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
143 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
144 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
145 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
146 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000
147 ; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[2:7], v[2:17], v[2:17], v18
148 ; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
149 ; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
150 ; GFX950-GISEL-NEXT: s_endpgm
151 %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src, <16 x float> %src, float 100.0)
152 store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
; cvt.scalef32.f16.fp8 with i32 0 and i1 false: the checks expect no op_sel
; and the result (written over %old in v2) copied back to v0.
156 define <2 x half> @test_cvt_scalef32_f16_fp8_byte0_dst_lo(i32 %src, float %scale, <2 x half> %old) {
157 ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte0_dst_lo:
159 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
160 ; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1
161 ; GCN-NEXT: v_mov_b32_e32 v0, v2
162 ; GCN-NEXT: s_setpc_b64 s[30:31]
163 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 0, i1 false)
; cvt.scalef32.f16.fp8 with i32 1 and i1 false: the checks expect
; op_sel:[0,1,0].
167 define <2 x half> @test_cvt_scalef32_f16_fp8_byte1_dst_lo(i32 %src, float %scale, <2 x half> %old) {
168 ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte1_dst_lo:
170 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
171 ; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[0,1,0]
172 ; GCN-NEXT: v_mov_b32_e32 v0, v2
173 ; GCN-NEXT: s_setpc_b64 s[30:31]
174 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 false)
; cvt.scalef32.f16.fp8 with i32 2 and i1 false: the checks expect
; op_sel:[1,0,0].
178 define <2 x half> @test_cvt_scalef32_f16_fp8_byte2_dst_lo(i32 %src, float %scale, <2 x half> %old) {
179 ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte2_dst_lo:
181 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
182 ; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[1,0,0]
183 ; GCN-NEXT: v_mov_b32_e32 v0, v2
184 ; GCN-NEXT: s_setpc_b64 s[30:31]
185 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 false)
; cvt.scalef32.f16.fp8 with i32 3 and i1 false: the checks expect
; op_sel:[1,1,0].
189 define <2 x half> @test_cvt_scalef32_f16_fp8_byte3_dst_lo(i32 %src, float %scale, <2 x half> %old) {
190 ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte3_dst_lo:
192 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
193 ; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[1,1,0]
194 ; GCN-NEXT: v_mov_b32_e32 v0, v2
195 ; GCN-NEXT: s_setpc_b64 s[30:31]
196 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 3, i1 false)
; cvt.scalef32.f16.fp8 with i32 0 and i1 true: the checks expect
; op_sel:[0,0,1].
200 define <2 x half> @test_cvt_scalef32_f16_fp8_byte0_dst_hi(i32 %src, float %scale, <2 x half> %old) {
201 ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte0_dst_hi:
203 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
204 ; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[0,0,1]
206 ; GCN-NEXT: v_mov_b32_e32 v0, v2
207 ; GCN-NEXT: s_setpc_b64 s[30:31]
208 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 0, i1 true)
; cvt.scalef32.f16.fp8 with i32 1 and i1 true: the checks expect
; op_sel:[0,1,1].
212 define <2 x half> @test_cvt_scalef32_f16_fp8_byte1_dst_hi(i32 %src, float %scale, <2 x half> %old) {
213 ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte1_dst_hi:
215 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
216 ; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[0,1,1]
218 ; GCN-NEXT: v_mov_b32_e32 v0, v2
219 ; GCN-NEXT: s_setpc_b64 s[30:31]
220 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 true)
; cvt.scalef32.f16.fp8 with i32 2 and i1 true: the checks expect
; op_sel:[1,0,1].
224 define <2 x half> @test_cvt_scalef32_f16_fp8_byte2_dst_hi(i32 %src, float %scale, <2 x half> %old) {
225 ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte2_dst_hi:
227 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
228 ; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[1,0,1]
230 ; GCN-NEXT: v_mov_b32_e32 v0, v2
231 ; GCN-NEXT: s_setpc_b64 s[30:31]
232 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 true)
; cvt.scalef32.f16.fp8 with i32 3 and i1 true: the checks expect
; op_sel:[1,1,1].
236 define <2 x half> @test_cvt_scalef32_f16_fp8_byte3_dst_hi(i32 %src, float %scale, <2 x half> %old) {
237 ; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte3_dst_hi:
239 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
240 ; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[1,1,1]
242 ; GCN-NEXT: v_mov_b32_e32 v0, v2
243 ; GCN-NEXT: s_setpc_b64 s[30:31]
244 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 3, i1 true)
; cvt.scalef32.f32.fp8 with selector i32 0: the checks expect no op_sel.
248 define float @test_cvt_scalef32_f32_fp8_byte0(i32 %src, float %scale) {
249 ; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte0:
251 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
252 ; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, v0, v1
253 ; GCN-NEXT: s_setpc_b64 s[30:31]
254 %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 0)
; cvt.scalef32.f32.fp8 with selector i32 1: the checks expect op_sel:[0,1,0].
258 define float @test_cvt_scalef32_f32_fp8_byte1(i32 %src, float %scale) {
259 ; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte1:
261 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
262 ; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, v0, v1 op_sel:[0,1,0]
263 ; GCN-NEXT: s_setpc_b64 s[30:31]
264 %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 1)
; cvt.scalef32.f32.fp8 with selector i32 2: the checks expect op_sel:[1,0,0].
268 define float @test_cvt_scalef32_f32_fp8_byte2(i32 %src, float %scale) {
269 ; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte2:
271 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
272 ; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, v0, v1 op_sel:[1,0,0]
273 ; GCN-NEXT: s_setpc_b64 s[30:31]
274 %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 2)
; cvt.scalef32.f32.fp8 with selector i32 3: the checks expect op_sel:[1,1,0].
278 define float @test_cvt_scalef32_f32_fp8_byte3(i32 %src, float %scale) {
279 ; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte3:
281 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
282 ; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, v0, v1 op_sel:[1,1,0]
283 ; GCN-NEXT: s_setpc_b64 s[30:31]
284 %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 3)
; cvt.scalef32.f16.bf8 with i32 0 and i1 false: the checks expect no op_sel
; and the result (written over %old in v2) copied back to v0.
288 define <2 x half> @test_cvt_scalef32_f16_bf8_byte0_dst_lo(i32 %src, float %scale, <2 x half> %old) {
289 ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte0_dst_lo:
291 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
292 ; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1
293 ; GCN-NEXT: v_mov_b32_e32 v0, v2
294 ; GCN-NEXT: s_setpc_b64 s[30:31]
295 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 0, i1 false)
; cvt.scalef32.f16.bf8 with i32 1 and i1 false: the checks expect
; op_sel:[0,1,0].
299 define <2 x half> @test_cvt_scalef32_f16_bf8_byte1_dst_lo(i32 %src, float %scale, <2 x half> %old) {
300 ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte1_dst_lo:
302 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
303 ; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[0,1,0]
304 ; GCN-NEXT: v_mov_b32_e32 v0, v2
305 ; GCN-NEXT: s_setpc_b64 s[30:31]
306 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 false)
; cvt.scalef32.f16.bf8 with i32 2 and i1 false: the checks expect
; op_sel:[1,0,0].
310 define <2 x half> @test_cvt_scalef32_f16_bf8_byte2_dst_lo(i32 %src, float %scale, <2 x half> %old) {
311 ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte2_dst_lo:
313 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
314 ; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[1,0,0]
315 ; GCN-NEXT: v_mov_b32_e32 v0, v2
316 ; GCN-NEXT: s_setpc_b64 s[30:31]
317 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 false)
; cvt.scalef32.f16.bf8 with i32 3 and i1 false: the checks expect
; op_sel:[1,1,0].
321 define <2 x half> @test_cvt_scalef32_f16_bf8_byte3_dst_lo(i32 %src, float %scale, <2 x half> %old) {
322 ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte3_dst_lo:
324 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325 ; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[1,1,0]
326 ; GCN-NEXT: v_mov_b32_e32 v0, v2
327 ; GCN-NEXT: s_setpc_b64 s[30:31]
328 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 3, i1 false)
; cvt.scalef32.f16.bf8 with i32 0 and i1 true: the checks expect
; op_sel:[0,0,1].
332 define <2 x half> @test_cvt_scalef32_f16_bf8_byte0_dst_hi(i32 %src, float %scale, <2 x half> %old) {
333 ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte0_dst_hi:
335 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
336 ; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[0,0,1]
338 ; GCN-NEXT: v_mov_b32_e32 v0, v2
339 ; GCN-NEXT: s_setpc_b64 s[30:31]
340 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 0, i1 true)
; cvt.scalef32.f16.bf8 with i32 1 and i1 true: the checks expect
; op_sel:[0,1,1].
344 define <2 x half> @test_cvt_scalef32_f16_bf8_byte1_dst_hi(i32 %src, float %scale, <2 x half> %old) {
345 ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte1_dst_hi:
347 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
348 ; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[0,1,1]
350 ; GCN-NEXT: v_mov_b32_e32 v0, v2
351 ; GCN-NEXT: s_setpc_b64 s[30:31]
352 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 true)
; cvt.scalef32.f16.bf8 with i32 2 and i1 true: the checks expect
; op_sel:[1,0,1].
356 define <2 x half> @test_cvt_scalef32_f16_bf8_byte2_dst_hi(i32 %src, float %scale, <2 x half> %old) {
357 ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte2_dst_hi:
359 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
360 ; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[1,0,1]
362 ; GCN-NEXT: v_mov_b32_e32 v0, v2
363 ; GCN-NEXT: s_setpc_b64 s[30:31]
364 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 true)
; cvt.scalef32.f16.bf8 with i32 3 and i1 true: the checks expect
; op_sel:[1,1,1].
368 define <2 x half> @test_cvt_scalef32_f16_bf8_byte3_dst_hi(i32 %src, float %scale, <2 x half> %old) {
369 ; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte3_dst_hi:
371 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
372 ; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[1,1,1]
374 ; GCN-NEXT: v_mov_b32_e32 v0, v2
375 ; GCN-NEXT: s_setpc_b64 s[30:31]
376 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 3, i1 true)
; cvt.scalef32.f32.bf8 with selector i32 0: the checks expect no op_sel.
380 define float @test_cvt_scalef32_f32_bf8_byte0(i32 %src, float %scale) {
381 ; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte0:
383 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
384 ; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, v0, v1
385 ; GCN-NEXT: s_setpc_b64 s[30:31]
386 %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 0)
; cvt.scalef32.f32.bf8 with selector i32 1: the checks expect op_sel:[0,1,0].
390 define float @test_cvt_scalef32_f32_bf8_byte1(i32 %src, float %scale) {
391 ; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte1:
393 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
394 ; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, v0, v1 op_sel:[0,1,0]
395 ; GCN-NEXT: s_setpc_b64 s[30:31]
396 %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 1)
; cvt.scalef32.f32.bf8 with selector i32 2: the checks expect op_sel:[1,0,0].
400 define float @test_cvt_scalef32_f32_bf8_byte2(i32 %src, float %scale) {
401 ; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte2:
403 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
404 ; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, v0, v1 op_sel:[1,0,0]
405 ; GCN-NEXT: s_setpc_b64 s[30:31]
406 %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 2)
; cvt.scalef32.f32.bf8 with selector i32 3: the checks expect op_sel:[1,1,0].
410 define float @test_cvt_scalef32_f32_bf8_byte3(i32 %src, float %scale) {
411 ; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte3:
413 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
414 ; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, v0, v1 op_sel:[1,1,0]
415 ; GCN-NEXT: s_setpc_b64 s[30:31]
416 %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 3)
; cvt.scalef32.pk.fp8.f32 with i1 false: the checks expect no op_sel and the
; result written in place over %old (v0).
420 define <2 x i16> @test_cvt_scalef32_pk_fp8_f32_word0(<2 x i16> %old, float %src0, float %src1, float %scale) {
421 ; GCN-LABEL: test_cvt_scalef32_pk_fp8_f32_word0:
423 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
424 ; GCN-NEXT: v_cvt_scalef32_pk_fp8_f32 v0, v1, v2, v3
425 ; GCN-NEXT: s_setpc_b64 s[30:31]
426 %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> %old, float %src0, float %src1, float %scale, i1 false)
; cvt.scalef32.pk.fp8.f32 with i1 true: the checks expect op_sel:[0,0,0,1].
430 define <2 x i16> @test_cvt_scalef32_pk_fp8_f32_word1(<2 x i16> %old, float %src0, float %src1, float %scale) {
431 ; GCN-LABEL: test_cvt_scalef32_pk_fp8_f32_word1:
433 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
434 ; GCN-NEXT: v_cvt_scalef32_pk_fp8_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
435 ; GCN-NEXT: s_setpc_b64 s[30:31]
436 %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> %old, float %src0, float %src1, float %scale, i1 true)
; cvt.scalef32.pk.bf8.f32 with i1 false: the checks expect no op_sel and the
; result written in place over %old (v0).
440 define <2 x i16> @test_cvt_scalef32_pk_bf8_f32_word0(<2 x i16> %old, float %src0, float %src1, float %scale) {
441 ; GCN-LABEL: test_cvt_scalef32_pk_bf8_f32_word0:
443 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
444 ; GCN-NEXT: v_cvt_scalef32_pk_bf8_f32 v0, v1, v2, v3
445 ; GCN-NEXT: s_setpc_b64 s[30:31]
446 %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> %old, float %src0, float %src1, float %scale, i1 false)
; cvt.scalef32.pk.bf8.f32 with i1 true: the checks expect op_sel:[0,0,0,1].
450 define <2 x i16> @test_cvt_scalef32_pk_bf8_f32_word1(<2 x i16> %old, float %src0, float %src1, float %scale) {
451 ; GCN-LABEL: test_cvt_scalef32_pk_bf8_f32_word1:
453 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
454 ; GCN-NEXT: v_cvt_scalef32_pk_bf8_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
455 ; GCN-NEXT: s_setpc_b64 s[30:31]
456 %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> %old, float %src0, float %src1, float %scale, i1 true)
; Both sources go through llvm.fabs.f32 before cvt.scalef32.pk.fp8.f32 (i1
; true). The checks expect the fabs as separate v_and_b32 instructions with
; 0x7fffffff ahead of the conversion.
460 define <2 x i16> @test_cvt_scalef32_pk_fp8_f32_word1_fabs(<2 x i16> %old, float %src0, float %src1, float %scale) {
461 ; GCN-LABEL: test_cvt_scalef32_pk_fp8_f32_word1_fabs:
463 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
464 ; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
465 ; GCN-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
466 ; GCN-NEXT: v_cvt_scalef32_pk_fp8_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
467 ; GCN-NEXT: s_setpc_b64 s[30:31]
468 %fabs.src0 = call float @llvm.fabs.f32(float %src0)
469 %fabs.src1 = call float @llvm.fabs.f32(float %src1)
470 %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> %old, float %fabs.src0, float %fabs.src1, float %scale, i1 true)
; Both sources go through llvm.fabs.f32 before cvt.scalef32.pk.bf8.f32 (i1
; true). The checks expect the fabs as separate v_and_b32 instructions with
; 0x7fffffff ahead of the conversion.
474 define <2 x i16> @test_cvt_scalef32_pk_bf8_f32_word1_fabs(<2 x i16> %old, float %src0, float %src1, float %scale) {
475 ; GCN-LABEL: test_cvt_scalef32_pk_bf8_f32_word1_fabs:
477 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
478 ; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
479 ; GCN-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
480 ; GCN-NEXT: v_cvt_scalef32_pk_bf8_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
481 ; GCN-NEXT: s_setpc_b64 s[30:31]
482 %fabs.src0 = call float @llvm.fabs.f32(float %src0)
483 %fabs.src1 = call float @llvm.fabs.f32(float %src1)
484 %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> %old, float %fabs.src0, float %fabs.src1, float %scale, i1 true)
; Both sources are negated with fneg before cvt.scalef32.pk.fp8.f32 (i1 true).
; The checks expect the negation as separate v_xor_b32 instructions with
; 0x80000000 ahead of the conversion.
488 define <2 x i16> @test_cvt_scalef32_pk_fp8_f32_word1_fneg(<2 x i16> %old, float %src0, float %src1, float %scale) {
489 ; GCN-LABEL: test_cvt_scalef32_pk_fp8_f32_word1_fneg:
491 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
492 ; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
493 ; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
494 ; GCN-NEXT: v_cvt_scalef32_pk_fp8_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
495 ; GCN-NEXT: s_setpc_b64 s[30:31]
496 %fneg.src0 = fneg float %src0
497 %fneg.src1 = fneg float %src1
498 %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> %old, float %fneg.src0, float %fneg.src1, float %scale, i1 true)
; Both sources are negated with fneg before cvt.scalef32.pk.bf8.f32 (i1 true).
; The checks expect the negation as separate v_xor_b32 instructions with
; 0x80000000 ahead of the conversion.
502 define <2 x i16> @test_cvt_scalef32_pk_bf8_f32_word1_fneg(<2 x i16> %old, float %src0, float %src1, float %scale) {
503 ; GCN-LABEL: test_cvt_scalef32_pk_bf8_f32_word1_fneg:
505 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
506 ; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
507 ; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
508 ; GCN-NEXT: v_cvt_scalef32_pk_bf8_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
509 ; GCN-NEXT: s_setpc_b64 s[30:31]
510 %fneg.src0 = fneg float %src0
511 %fneg.src1 = fneg float %src1
512 %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> %old, float %fneg.src0, float %fneg.src1, float %scale, i1 true)
; Mixed case for cvt.scalef32.pk.fp8.f32 (i1 true): %src0 goes through
; llvm.fabs.f32 and %src1 through fneg. The checks expect a v_and_b32 on v1
; and a v_xor_b32 on v2 ahead of the conversion.
516 define <2 x i16> @test_cvt_scalef32_pk_fp8_f32_word1_fabs_fneg(<2 x i16> %old, float %src0, float %src1, float %scale) {
517 ; GCN-LABEL: test_cvt_scalef32_pk_fp8_f32_word1_fabs_fneg:
519 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
520 ; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
521 ; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
522 ; GCN-NEXT: v_cvt_scalef32_pk_fp8_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
523 ; GCN-NEXT: s_setpc_b64 s[30:31]
524 %fabs.src0 = call float @llvm.fabs.f32(float %src0)
525 %fneg.src1 = fneg float %src1
526 %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> %old, float %fabs.src0, float %fneg.src1, float %scale, i1 true)
; Mixed case for cvt.scalef32.pk.bf8.f32 (i1 true): %src0 goes through
; llvm.fabs.f32 and %src1 through fneg. The checks expect a v_and_b32 on v1
; and a v_xor_b32 on v2 ahead of the conversion.
530 define <2 x i16> @test_cvt_scalef32_pk_bf8_f32_word1_fabs_fneg(<2 x i16> %old, float %src0, float %src1, float %scale) {
531 ; GCN-LABEL: test_cvt_scalef32_pk_bf8_f32_word1_fabs_fneg:
533 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
534 ; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
535 ; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
536 ; GCN-NEXT: v_cvt_scalef32_pk_bf8_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
537 ; GCN-NEXT: s_setpc_b64 s[30:31]
538 %fabs.src0 = call float @llvm.fabs.f32(float %src0)
539 %fneg.src1 = fneg float %src1
540 %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> %old, float %fabs.src0, float %fneg.src1, float %scale, i1 true)
; cvt.scalef32.pk.f32.fp8 with i1 false: the checks expect no op_sel and the
; <2 x float> result in v[0:1].
544 define <2 x float> @test_cvt_scalef32_pk_f32_fp8_word0(i32 %src, float %scale) {
545 ; GCN-LABEL: test_cvt_scalef32_pk_f32_fp8_word0:
547 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
548 ; GCN-NEXT: v_cvt_scalef32_pk_f32_fp8 v[0:1], v0, v1
549 ; GCN-NEXT: s_setpc_b64 s[30:31]
550 %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 %src, float %scale, i1 false)
; cvt.scalef32.pk.f32.fp8 with i1 true: the checks expect op_sel:[1,0,0].
554 define <2 x float> @test_cvt_scalef32_pk_f32_fp8_word1(i32 %src, float %scale) {
555 ; GCN-LABEL: test_cvt_scalef32_pk_f32_fp8_word1:
557 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
558 ; GCN-NEXT: v_cvt_scalef32_pk_f32_fp8 v[0:1], v0, v1 op_sel:[1,0,0]
559 ; GCN-NEXT: s_setpc_b64 s[30:31]
560 %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 %src, float %scale, i1 true)
; cvt.scalef32.pk.f32.bf8 with i1 false: the checks expect no op_sel and the
; <2 x float> result in v[0:1].
564 define <2 x float> @test_cvt_scalef32_pk_f32_bf8_word0(i32 %src, float %scale) {
565 ; GCN-LABEL: test_cvt_scalef32_pk_f32_bf8_word0:
567 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
568 ; GCN-NEXT: v_cvt_scalef32_pk_f32_bf8 v[0:1], v0, v1
569 ; GCN-NEXT: s_setpc_b64 s[30:31]
570 %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 %src, float %scale, i1 false)
; cvt.scalef32.pk.f32.bf8 with i1 true: the checks expect op_sel:[1,0,0].
574 define <2 x float> @test_cvt_scalef32_pk_f32_bf8_word1(i32 %src, float %scale) {
575 ; GCN-LABEL: test_cvt_scalef32_pk_f32_bf8_word1:
577 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
578 ; GCN-NEXT: v_cvt_scalef32_pk_f32_bf8 v[0:1], v0, v1 op_sel:[1,0,0]
579 ; GCN-NEXT: s_setpc_b64 s[30:31]
580 %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 %src, float %scale, i1 true)
; cvt.scalef32.pk.fp8.f16 with i1 false: the checks expect no op_sel and the
; result written in place over %old (v0).
584 define <2 x i16> @test_cvt_scalef32_pk_fp8_f16_word0(<2 x i16> %old, <2 x half> %src, float %scale) {
585 ; GCN-LABEL: test_cvt_scalef32_pk_fp8_f16_word0:
587 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
588 ; GCN-NEXT: v_cvt_scalef32_pk_fp8_f16 v0, v1, v2
589 ; GCN-NEXT: s_setpc_b64 s[30:31]
590 %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> %old, <2 x half> %src, float %scale, i1 false)
; cvt.scalef32.pk.fp8.f16 with i1 true: the checks expect op_sel:[0,0,1].
594 define <2 x i16> @test_cvt_scalef32_pk_fp8_f16_word1(<2 x i16> %old, <2 x half> %src, float %scale) {
595 ; GCN-LABEL: test_cvt_scalef32_pk_fp8_f16_word1:
597 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
598 ; GCN-NEXT: v_cvt_scalef32_pk_fp8_f16 v0, v1, v2 op_sel:[0,0,1]
599 ; GCN-NEXT: s_setpc_b64 s[30:31]
600 %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> %old, <2 x half> %src, float %scale, i1 true)
; cvt.scalef32.pk.fp8.bf16 with i1 false: the checks expect no op_sel and the
; result written in place over %old (v0).
604 define <2 x i16> @test_cvt_scalef32_pk_fp8_bf16_word0(<2 x i16> %old, <2 x bfloat> %src, float %scale) {
605 ; GCN-LABEL: test_cvt_scalef32_pk_fp8_bf16_word0:
607 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
608 ; GCN-NEXT: v_cvt_scalef32_pk_fp8_bf16 v0, v1, v2
609 ; GCN-NEXT: s_setpc_b64 s[30:31]
610 %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> %old, <2 x bfloat> %src, float %scale, i1 false)
; cvt.scalef32.pk.fp8.bf16 with i1 true: the checks expect op_sel:[0,0,1].
614 define <2 x i16> @test_cvt_scalef32_pk_fp8_bf16_word1(<2 x i16> %old, <2 x bfloat> %src, float %scale) {
615 ; GCN-LABEL: test_cvt_scalef32_pk_fp8_bf16_word1:
617 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
618 ; GCN-NEXT: v_cvt_scalef32_pk_fp8_bf16 v0, v1, v2 op_sel:[0,0,1]
619 ; GCN-NEXT: s_setpc_b64 s[30:31]
620 %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> %old, <2 x bfloat> %src, float %scale, i1 true)
; cvt.scalef32.pk.bf8.f16 with i1 false: the checks expect no op_sel and the
; result written in place over %old (v0).
624 define <2 x i16> @test_cvt_scalef32_pk_bf8_f16_word0(<2 x i16> %old, <2 x half> %src, float %scale) {
625 ; GCN-LABEL: test_cvt_scalef32_pk_bf8_f16_word0:
627 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
628 ; GCN-NEXT: v_cvt_scalef32_pk_bf8_f16 v0, v1, v2
629 ; GCN-NEXT: s_setpc_b64 s[30:31]
630 %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> %old, <2 x half> %src, float %scale, i1 false)
; cvt.scalef32.pk.bf8.f16 with i1 true: the checks expect op_sel:[0,0,1].
634 define <2 x i16> @test_cvt_scalef32_pk_bf8_f16_word1(<2 x i16> %old, <2 x half> %src, float %scale) {
635 ; GCN-LABEL: test_cvt_scalef32_pk_bf8_f16_word1:
637 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
638 ; GCN-NEXT: v_cvt_scalef32_pk_bf8_f16 v0, v1, v2 op_sel:[0,0,1]
639 ; GCN-NEXT: s_setpc_b64 s[30:31]
640 %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> %old, <2 x half> %src, float %scale, i1 true)
; cvt.scalef32.pk.bf8.bf16 with i1 false: the checks expect no op_sel and the
; result written in place over %old (v0).
644 define <2 x i16> @test_cvt_scalef32_pk_bf8_bf16_word0(<2 x i16> %old, <2 x bfloat> %src, float %scale) {
645 ; GCN-LABEL: test_cvt_scalef32_pk_bf8_bf16_word0:
647 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
648 ; GCN-NEXT: v_cvt_scalef32_pk_bf8_bf16 v0, v1, v2
649 ; GCN-NEXT: s_setpc_b64 s[30:31]
650 %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> %old, <2 x bfloat> %src, float %scale, i1 false)
; cvt.scalef32.pk.bf8.bf16 with i1 true: the checks expect op_sel:[0,0,1].
654 define <2 x i16> @test_cvt_scalef32_pk_bf8_bf16_word1(<2 x i16> %old, <2 x bfloat> %src, float %scale) {
655 ; GCN-LABEL: test_cvt_scalef32_pk_bf8_bf16_word1:
657 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
658 ; GCN-NEXT: v_cvt_scalef32_pk_bf8_bf16 v0, v1, v2 op_sel:[0,0,1]
659 ; GCN-NEXT: s_setpc_b64 s[30:31]
660 %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> %old, <2 x bfloat> %src, float %scale, i1 true)
; cvt.scalef32.pk.f32.fp4 with selector i32 0: the checks expect no op_sel and
; the <2 x float> result in v[0:1].
664 define <2 x float> @test_cvt_scale_f32_fp4_byte0(i32 %src, float %scale) {
665 ; GCN-LABEL: test_cvt_scale_f32_fp4_byte0:
667 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
668 ; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], v0, v1
669 ; GCN-NEXT: s_setpc_b64 s[30:31]
670 %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 0)
; cvt.scalef32.pk.f32.fp4 with selector i32 1: the checks expect
; op_sel:[0,1,0].
674 define <2 x float> @test_cvt_scale_f32_fp4_byte1(i32 %src, float %scale) {
675 ; GCN-LABEL: test_cvt_scale_f32_fp4_byte1:
677 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
678 ; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], v0, v1 op_sel:[0,1,0]
679 ; GCN-NEXT: s_setpc_b64 s[30:31]
680 %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 1)
; cvt.scalef32.pk.f32.fp4 with selector i32 2: the checks expect
; op_sel:[1,0,0].
684 define <2 x float> @test_cvt_scale_f32_fp4_byte2(i32 %src, float %scale) {
685 ; GCN-LABEL: test_cvt_scale_f32_fp4_byte2:
687 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
688 ; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], v0, v1 op_sel:[1,0,0]
689 ; GCN-NEXT: s_setpc_b64 s[30:31]
690 %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 2)
; cvt.scalef32.pk.f32.fp4 with selector i32 3: the checks expect
; op_sel:[1,1,0].
694 define <2 x float> @test_cvt_scale_f32_fp4_byte3(i32 %src, float %scale) {
695 ; GCN-LABEL: test_cvt_scale_f32_fp4_byte3:
697 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
698 ; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], v0, v1 op_sel:[1,1,0]
699 ; GCN-NEXT: s_setpc_b64 s[30:31]
700 %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 3)
; Calls llvm.amdgcn.cvt.scalef32.pk.fp4.f32 with %old, two float sources, a
; scale, and the trailing i32 selector 0. The checks expect a single
; v_cvt_scalef32_pk_fp4_f32 writing v0 with no op_sel modifier printed.
704 define i32 @test_cvt_scale_fp4_f32_byte0(i32 %old, float %src0, float %src1, float %scale) {
705 ; GCN-LABEL: test_cvt_scale_fp4_f32_byte0:
707 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
708 ; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3
709 ; GCN-NEXT: s_setpc_b64 s[30:31]
710 %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 0)
; Calls llvm.amdgcn.cvt.scalef32.pk.fp4.f32 with the trailing i32 selector 1.
; The checks expect the selector to be encoded as op_sel:[0,0,1,0].
714 define i32 @test_cvt_scale_fp4_f32_byte1(i32 %old, float %src0, float %src1, float %scale) {
715 ; GCN-LABEL: test_cvt_scale_fp4_f32_byte1:
717 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
718 ; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,1,0]
719 ; GCN-NEXT: s_setpc_b64 s[30:31]
720 %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 1)
; Calls llvm.amdgcn.cvt.scalef32.pk.fp4.f32 with the trailing i32 selector 2.
; The checks expect the selector to be encoded as op_sel:[0,0,0,1].
724 define i32 @test_cvt_scale_fp4_f32_byte2(i32 %old, float %src0, float %src1, float %scale) {
725 ; GCN-LABEL: test_cvt_scale_fp4_f32_byte2:
727 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
728 ; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
729 ; GCN-NEXT: s_setpc_b64 s[30:31]
730 %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 2)
; Calls llvm.amdgcn.cvt.scalef32.pk.fp4.f32 with the trailing i32 selector 3.
; The checks expect the selector to be encoded as op_sel:[0,0,1,1].
734 define i32 @test_cvt_scale_fp4_f32_byte3(i32 %old, float %src0, float %src1, float %scale) {
735 ; GCN-LABEL: test_cvt_scale_fp4_f32_byte3:
737 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
738 ; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,1,1]
739 ; GCN-NEXT: s_setpc_b64 s[30:31]
740 %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 3)
; Feeds llvm.fabs.f32 of both float sources into
; llvm.amdgcn.cvt.scalef32.pk.fp4.f32 (selector 3). The checks expect each fabs
; to appear as a separate v_and_b32 with 0x7fffffff ahead of the convert, whose
; operands are printed without source modifiers.
744 define i32 @test_cvt_scale_fp4_f32_byte3_fabs(i32 %old, float %src0, float %src1, float %scale) {
745 ; GCN-LABEL: test_cvt_scale_fp4_f32_byte3_fabs:
747 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
748 ; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
749 ; GCN-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2
750 ; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,1,1]
751 ; GCN-NEXT: s_setpc_b64 s[30:31]
752 %fabs.src0 = call float @llvm.fabs.f32(float %src0)
753 %fabs.src1 = call float @llvm.fabs.f32(float %src1)
754 %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %fabs.src0, float %fabs.src1, float %scale, i32 3)
; Feeds fneg of both float sources into llvm.amdgcn.cvt.scalef32.pk.fp4.f32
; (selector 3). The checks expect each fneg to appear as a separate v_xor_b32
; with 0x80000000 ahead of the convert, whose operands are printed without
; source modifiers.
758 define i32 @test_cvt_scale_fp4_f32_byte3_fneg(i32 %old, float %src0, float %src1, float %scale) {
759 ; GCN-LABEL: test_cvt_scale_fp4_f32_byte3_fneg:
761 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
762 ; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
763 ; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
764 ; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,1,1]
765 ; GCN-NEXT: s_setpc_b64 s[30:31]
766 %fneg.src0 = fneg float %src0
767 %fneg.src1 = fneg float %src1
768 %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %fneg.src0, float %fneg.src1, float %scale, i32 3)
; Mixed-modifier variant: %src0 goes through llvm.fabs.f32 and %src1 through
; fneg before llvm.amdgcn.cvt.scalef32.pk.fp4.f32 (selector 3). The checks
; expect a v_and_b32 (0x7fffffff) on v1 and a v_xor_b32 (0x80000000) on v2 as
; separate instructions ahead of the convert.
772 define i32 @test_cvt_scale_fp4_f32_byte3_fabs_fneg(i32 %old, float %src0, float %src1, float %scale) {
773 ; GCN-LABEL: test_cvt_scale_fp4_f32_byte3_fabs_fneg:
775 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
776 ; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
777 ; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
778 ; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,1,1]
779 ; GCN-NEXT: s_setpc_b64 s[30:31]
780 %fabs.src0 = call float @llvm.fabs.f32(float %src0)
781 %fneg.src1 = fneg float %src1
782 %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %fabs.src0, float %fneg.src1, float %scale, i32 3)
; Calls llvm.amdgcn.cvt.scalef32.pk.f16.fp4 with the trailing i32 selector 0.
; The checks expect v_cvt_scalef32_pk_f16_fp4 with no op_sel modifier printed.
786 define <2 x half> @test_cvt_scale_f16_fp4_byte0(i32 %src, float %scale) {
787 ; GCN-LABEL: test_cvt_scale_f16_fp4_byte0:
789 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
790 ; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, v0, v1
791 ; GCN-NEXT: s_setpc_b64 s[30:31]
792 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 0)
; Calls llvm.amdgcn.cvt.scalef32.pk.f16.fp4 with the trailing i32 selector 1.
; The checks expect the selector to be encoded as op_sel:[0,1,0].
796 define <2 x half> @test_cvt_scale_f16_fp4_byte1(i32 %src, float %scale) {
797 ; GCN-LABEL: test_cvt_scale_f16_fp4_byte1:
799 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
800 ; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, v0, v1 op_sel:[0,1,0]
801 ; GCN-NEXT: s_setpc_b64 s[30:31]
802 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 1)
; Calls llvm.amdgcn.cvt.scalef32.pk.f16.fp4 with the trailing i32 selector 2.
; The checks expect the selector to be encoded as op_sel:[1,0,0].
806 define <2 x half> @test_cvt_scale_f16_fp4_byte2(i32 %src, float %scale) {
807 ; GCN-LABEL: test_cvt_scale_f16_fp4_byte2:
809 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
810 ; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, v0, v1 op_sel:[1,0,0]
811 ; GCN-NEXT: s_setpc_b64 s[30:31]
812 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 2)
; Calls llvm.amdgcn.cvt.scalef32.pk.f16.fp4 with the trailing i32 selector 3.
; The checks expect the selector to be encoded as op_sel:[1,1,0].
816 define <2 x half> @test_cvt_scale_f16_fp4_byte3(i32 %src, float %scale) {
817 ; GCN-LABEL: test_cvt_scale_f16_fp4_byte3:
819 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
820 ; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, v0, v1 op_sel:[1,1,0]
821 ; GCN-NEXT: s_setpc_b64 s[30:31]
822 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 3)
; Calls llvm.amdgcn.cvt.scalef32.pk.bf16.fp4 with the trailing i32 selector 0
; and returns the <2 x bfloat> result. The checks expect
; v_cvt_scalef32_pk_bf16_fp4 with no op_sel modifier printed.
826 define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte0(i32 %src, float %scale) {
827 ; GCN-LABEL: test_cvt_scale_bf16_fp4_byte0:
829 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
830 ; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, v0, v1
831 ; GCN-NEXT: s_setpc_b64 s[30:31]
832 %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 0)
833 ret <2 x bfloat> %ret
; Calls llvm.amdgcn.cvt.scalef32.pk.bf16.fp4 with the trailing i32 selector 1
; and returns the result. The checks expect op_sel:[0,1,0] on the convert.
836 define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte1(i32 %src, float %scale) {
837 ; GCN-LABEL: test_cvt_scale_bf16_fp4_byte1:
839 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
840 ; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, v0, v1 op_sel:[0,1,0]
841 ; GCN-NEXT: s_setpc_b64 s[30:31]
842 %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 1)
843 ret <2 x bfloat> %ret
; Calls llvm.amdgcn.cvt.scalef32.pk.bf16.fp4 with the trailing i32 selector 2
; and returns the result. The checks expect op_sel:[1,0,0] on the convert.
846 define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte2(i32 %src, float %scale) {
847 ; GCN-LABEL: test_cvt_scale_bf16_fp4_byte2:
849 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
850 ; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, v0, v1 op_sel:[1,0,0]
851 ; GCN-NEXT: s_setpc_b64 s[30:31]
852 %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 2)
853 ret <2 x bfloat> %ret
; Calls llvm.amdgcn.cvt.scalef32.pk.bf16.fp4 with the trailing i32 selector 3
; and returns the result. The checks expect op_sel:[1,1,0] on the convert.
856 define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte3(i32 %src, float %scale) {
857 ; GCN-LABEL: test_cvt_scale_bf16_fp4_byte3:
859 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
860 ; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, v0, v1 op_sel:[1,1,0]
861 ; GCN-NEXT: s_setpc_b64 s[30:31]
862 %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 3)
863 ret <2 x bfloat> %ret
; Calls llvm.amdgcn.cvt.scalef32.pk32.f32.fp6 on a <6 x i32> source and a float
; scale, returning <32 x float>. The result occupies v[0:31], so both check
; sets first copy the incoming v0-v6 up to v32-v38. The SelectionDAG checks list
; the copies from v6 down to v0, the GlobalISel checks from v0 up to v6. Both
; then expect the same v_cvt_scalef32_pk32_f32_fp6.
866 define <32 x float> @test_cvt_scale_pk32_f32_fp6(<6 x i32> %src, float %scale) {
867 ; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_fp6:
868 ; GFX950-SDAG: ; %bb.0:
869 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
870 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, v6
871 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, v5
872 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, v4
873 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, v3
874 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, v2
875 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v33, v1
876 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0
877 ; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[32:37], v38
878 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
880 ; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_fp6:
881 ; GFX950-GISEL: ; %bb.0:
882 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
883 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0
884 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v33, v1
885 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v34, v2
886 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v35, v3
887 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v36, v4
888 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v37, v5
889 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v38, v6
890 ; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[32:37], v38
891 ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
892 %ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.fp6(<6 x i32> %src, float %scale)
893 ret <32 x float> %ret
; Calls llvm.amdgcn.cvt.scalef32.pk32.f32.bf6 on a <6 x i32> source and a float
; scale, returning <32 x float>. Both check sets copy v0-v6 up to v32-v38
; (SelectionDAG listed from v6 down, GlobalISel from v0 up) and then expect
; v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38.
896 define <32 x float> @test_cvt_scale_pk32_f32_bf6(<6 x i32> %src, float %scale) {
897 ; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_bf6:
898 ; GFX950-SDAG: ; %bb.0:
899 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
900 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, v6
901 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, v5
902 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, v4
903 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, v3
904 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, v2
905 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v33, v1
906 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0
907 ; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38
908 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
910 ; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_bf6:
911 ; GFX950-GISEL: ; %bb.0:
912 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
913 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0
914 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v33, v1
915 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v34, v2
916 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v35, v3
917 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v36, v4
918 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v37, v5
919 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v38, v6
920 ; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38
921 ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
922 %ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.bf6(<6 x i32> %src, float %scale)
923 ret <32 x float> %ret
; Calls llvm.amdgcn.cvt.scalef32.pk32.f16.fp6 with both the <6 x i32> source
; and the scale passed as ordinary (VGPR) arguments. The result occupies
; v[0:15], so both check sets copy v0-v6 up to v16-v22 (SelectionDAG listed
; from v6 down, GlobalISel from v0 up) before the convert.
926 define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_vv(<6 x i32> %src, float %scale) {
927 ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
928 ; GFX950-SDAG: ; %bb.0:
929 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
930 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
931 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
932 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
933 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
934 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
935 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
936 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
937 ; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
938 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
940 ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
941 ; GFX950-GISEL: ; %bb.0:
942 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
943 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0
944 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, v1
945 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, v2
946 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v3
947 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v4
948 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v5
949 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, v6
950 ; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
951 ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
952 %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float %scale)
; Calls llvm.amdgcn.cvt.scalef32.pk32.f16.fp6 with an `inreg` <6 x i32> source
; and the constant scale 100.0 (0x42c80000 as an f32 bit pattern). Both check
; sets copy the source from s0-s3/s16/s17 into v[16:21]. The SelectionDAG checks
; keep the scale in an SGPR (s_mov_b32 s0) and pass s0 to the convert. The
; GlobalISel checks materialize it in v22 and use 64-bit v_mov_b64 copies for
; the source.
956 define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl(<6 x i32> inreg %src) {
957 ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl:
958 ; GFX950-SDAG: ; %bb.0:
959 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
960 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
961 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
962 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
963 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
964 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16
965 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17
966 ; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
967 ; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], s0
968 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
970 ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl:
971 ; GFX950-GISEL: ; %bb.0:
972 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
973 ; GFX950-GISEL-NEXT: s_mov_b32 s4, s16
974 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s17
975 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[4:5]
976 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
977 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
978 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0x42c80000
979 ; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
980 ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
981 %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float 100.0)
; Calls llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6 with VGPR source and scale,
; returning <32 x bfloat>. The expected output sits under the shared GCN prefix,
; so both RUN lines must produce it: v0-v6 copied to v16-v22 (listed from v6
; down), then v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], v22.
985 define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_vv(<6 x i32> %src, float %scale) {
986 ; GCN-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
988 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
989 ; GCN-NEXT: v_mov_b32_e32 v22, v6
990 ; GCN-NEXT: v_mov_b32_e32 v21, v5
991 ; GCN-NEXT: v_mov_b32_e32 v20, v4
992 ; GCN-NEXT: v_mov_b32_e32 v19, v3
993 ; GCN-NEXT: v_mov_b32_e32 v18, v2
994 ; GCN-NEXT: v_mov_b32_e32 v17, v1
995 ; GCN-NEXT: v_mov_b32_e32 v16, v0
996 ; GCN-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], v22
997 ; GCN-NEXT: s_setpc_b64 s[30:31]
998 %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> %src, float %scale)
999 ret <32 x bfloat> %ret
; Calls llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6 with an `inreg` source and the
; constant scale 100.0 (0x42c80000). The shared GCN checks expect the source to
; be copied from s0-s3/s16/s17 into v[16:21] and the scale to stay in an SGPR
; (s0) for both RUN lines.
; NOTE(review): the f16 `_sl` tests have separate GlobalISel checks. The
; identical output here is presumably the `-global-isel-abort=2` fallback for
; bfloat; confirm.
1002 define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) {
1003 ; GCN-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
1005 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1006 ; GCN-NEXT: v_mov_b32_e32 v16, s0
1007 ; GCN-NEXT: v_mov_b32_e32 v17, s1
1008 ; GCN-NEXT: v_mov_b32_e32 v18, s2
1009 ; GCN-NEXT: v_mov_b32_e32 v19, s3
1010 ; GCN-NEXT: v_mov_b32_e32 v20, s16
1011 ; GCN-NEXT: v_mov_b32_e32 v21, s17
1012 ; GCN-NEXT: s_mov_b32 s0, 0x42c80000
1013 ; GCN-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], s0
1014 ; GCN-NEXT: s_setpc_b64 s[30:31]
1015 %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> %src, float 100.0)
1016 ret <32 x bfloat> %ret
; Calls llvm.amdgcn.cvt.scalef32.pk32.f16.bf6 with VGPR source and scale,
; returning <32 x half>. Both check sets copy v0-v6 up to v16-v22
; (SelectionDAG listed from v6 down, GlobalISel from v0 up) and then expect
; v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22.
1019 define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_vv(<6 x i32> %src, float %scale) {
1020 ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
1021 ; GFX950-SDAG: ; %bb.0:
1022 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1023 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6
1024 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5
1025 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4
1026 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3
1027 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2
1028 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1
1029 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
1030 ; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
1031 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
1033 ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
1034 ; GFX950-GISEL: ; %bb.0:
1035 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1036 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0
1037 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, v1
1038 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, v2
1039 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v3
1040 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v4
1041 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v5
1042 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, v6
1043 ; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
1044 ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
1045 %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float %scale)
1046 ret <32 x half> %ret
; Calls llvm.amdgcn.cvt.scalef32.pk32.f16.bf6 with an `inreg` <6 x i32> source
; and the constant scale 100.0 (0x42c80000). Both check sets copy the source
; from s0-s3/s16/s17 into v[16:21]. The SelectionDAG checks pass the scale to
; the convert in SGPR s0. The GlobalISel checks materialize it in v22 and use
; 64-bit v_mov_b64 copies for the source.
1049 define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl(<6 x i32> inreg %src) {
1050 ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl:
1051 ; GFX950-SDAG: ; %bb.0:
1052 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1053 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
1054 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
1055 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
1056 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
1057 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16
1058 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17
1059 ; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
1060 ; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], s0
1061 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
1063 ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl:
1064 ; GFX950-GISEL: ; %bb.0:
1065 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1066 ; GFX950-GISEL-NEXT: s_mov_b32 s4, s16
1067 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s17
1068 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[4:5]
1069 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
1070 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
1071 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0x42c80000
1072 ; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
1073 ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
1074 %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float 100.0)
1075 ret <32 x half> %ret
; Calls llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6 with VGPR source and scale,
; returning <32 x bfloat>. The shared GCN checks expect v0-v6 copied to v16-v22
; (listed from v6 down), then v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21],
; v22.
1078 define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_vv(<6 x i32> %src, float %scale) {
1079 ; GCN-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
1081 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1082 ; GCN-NEXT: v_mov_b32_e32 v22, v6
1083 ; GCN-NEXT: v_mov_b32_e32 v21, v5
1084 ; GCN-NEXT: v_mov_b32_e32 v20, v4
1085 ; GCN-NEXT: v_mov_b32_e32 v19, v3
1086 ; GCN-NEXT: v_mov_b32_e32 v18, v2
1087 ; GCN-NEXT: v_mov_b32_e32 v17, v1
1088 ; GCN-NEXT: v_mov_b32_e32 v16, v0
1089 ; GCN-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], v22
1090 ; GCN-NEXT: s_setpc_b64 s[30:31]
1091 %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> %src, float %scale)
1092 ret <32 x bfloat> %ret
; Calls llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6 with an `inreg` source and the
; constant scale 100.0 (0x42c80000). The shared GCN checks expect the source to
; be copied from s0-s3/s16/s17 into v[16:21] and the scale to be passed to the
; convert in SGPR s0 for both RUN lines.
; NOTE(review): identical output for both selectors is presumably the
; `-global-isel-abort=2` fallback for bfloat; confirm.
1095 define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_sl(<6 x i32> inreg %src) {
1096 ; GCN-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:
1098 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1099 ; GCN-NEXT: v_mov_b32_e32 v16, s0
1100 ; GCN-NEXT: v_mov_b32_e32 v17, s1
1101 ; GCN-NEXT: v_mov_b32_e32 v18, s2
1102 ; GCN-NEXT: v_mov_b32_e32 v19, s3
1103 ; GCN-NEXT: v_mov_b32_e32 v20, s16
1104 ; GCN-NEXT: v_mov_b32_e32 v21, s17
1105 ; GCN-NEXT: s_mov_b32 s0, 0x42c80000
1106 ; GCN-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], s0
1107 ; GCN-NEXT: s_setpc_b64 s[30:31]
1108 %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> %src, float 100.0)
1109 ret <32 x bfloat> %ret
; Calls llvm.amdgcn.cvt.scalef32.pk.f16.fp8 with the trailing i1 operand false.
; The checks expect v_cvt_scalef32_pk_f16_fp8 with no op_sel modifier printed.
1112 define <2 x half> @test_cvt_scalef32_pk_f16_fp8_word0(i32 %src, float %scale) {
1113 ; GCN-LABEL: test_cvt_scalef32_pk_f16_fp8_word0:
1115 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1116 ; GCN-NEXT: v_cvt_scalef32_pk_f16_fp8 v0, v0, v1
1117 ; GCN-NEXT: s_setpc_b64 s[30:31]
1118 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 %src, float %scale, i1 false)
; Calls llvm.amdgcn.cvt.scalef32.pk.f16.fp8 with the trailing i1 operand true.
; The checks expect it to be encoded as op_sel:[1,0,0].
1122 define <2 x half> @test_cvt_scalef32_pk_f16_fp8_word1(i32 %src, float %scale) {
1123 ; GCN-LABEL: test_cvt_scalef32_pk_f16_fp8_word1:
1125 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1126 ; GCN-NEXT: v_cvt_scalef32_pk_f16_fp8 v0, v0, v1 op_sel:[1,0,0]
1127 ; GCN-NEXT: s_setpc_b64 s[30:31]
1128 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 %src, float %scale, i1 true)
; Calls llvm.amdgcn.cvt.scalef32.pk.f16.bf8 with the trailing i1 operand false.
; The checks expect v_cvt_scalef32_pk_f16_bf8 with no op_sel modifier printed.
1132 define <2 x half> @test_cvt_scalef32_pk_f16_bf8_word0(i32 %src, float %scale) {
1133 ; GCN-LABEL: test_cvt_scalef32_pk_f16_bf8_word0:
1135 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1136 ; GCN-NEXT: v_cvt_scalef32_pk_f16_bf8 v0, v0, v1
1137 ; GCN-NEXT: s_setpc_b64 s[30:31]
1138 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 %src, float %scale, i1 false)
; Calls llvm.amdgcn.cvt.scalef32.pk.f16.bf8 with the trailing i1 operand true.
; The checks expect it to be encoded as op_sel:[1,0,0].
1142 define <2 x half> @test_cvt_scalef32_pk_f16_bf8_word1(i32 %src, float %scale) {
1143 ; GCN-LABEL: test_cvt_scalef32_pk_f16_bf8_word1:
1145 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1146 ; GCN-NEXT: v_cvt_scalef32_pk_f16_bf8 v0, v0, v1 op_sel:[1,0,0]
1147 ; GCN-NEXT: s_setpc_b64 s[30:31]
1148 %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 %src, float %scale, i1 true)
; Calls llvm.amdgcn.cvt.scalef32.pk.bf16.fp8 with the trailing i1 operand false
; and returns the <2 x bfloat> result. The checks expect
; v_cvt_scalef32_pk_bf16_fp8 with no op_sel modifier printed.
1152 define <2 x bfloat> @test_cvt_scalef32_pk_bf16_fp8_word0(i32 %src, float %scale) {
1153 ; GCN-LABEL: test_cvt_scalef32_pk_bf16_fp8_word0:
1155 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1156 ; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp8 v0, v0, v1
1157 ; GCN-NEXT: s_setpc_b64 s[30:31]
1158 %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 %src, float %scale, i1 false)
1159 ret <2 x bfloat> %ret
; Calls llvm.amdgcn.cvt.scalef32.pk.bf16.fp8 with the trailing i1 operand true
; and returns the result. The checks expect op_sel:[1,0,0] on the convert.
1162 define <2 x bfloat> @test_cvt_scalef32_pk_bf16_fp8_word1(i32 %src, float %scale) {
1163 ; GCN-LABEL: test_cvt_scalef32_pk_bf16_fp8_word1:
1165 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1166 ; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp8 v0, v0, v1 op_sel:[1,0,0]
1167 ; GCN-NEXT: s_setpc_b64 s[30:31]
1168 %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 %src, float %scale, i1 true)
1169 ret <2 x bfloat> %ret
; Calls llvm.amdgcn.cvt.scalef32.pk.bf16.bf8 with the trailing i1 operand false
; and returns the <2 x bfloat> result. The checks expect
; v_cvt_scalef32_pk_bf16_bf8 with no op_sel modifier printed.
1172 define <2 x bfloat> @test_cvt_scalef32_pk_bf16_bf8_word0(i32 %src, float %scale) {
1173 ; GCN-LABEL: test_cvt_scalef32_pk_bf16_bf8_word0:
1175 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1176 ; GCN-NEXT: v_cvt_scalef32_pk_bf16_bf8 v0, v0, v1
1177 ; GCN-NEXT: s_setpc_b64 s[30:31]
1178 %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 %src, float %scale, i1 false)
1179 ret <2 x bfloat> %ret
; Calls llvm.amdgcn.cvt.scalef32.pk.bf16.bf8 with the trailing i1 operand true
; and returns the result. The checks expect op_sel:[1,0,0] on the convert.
1182 define <2 x bfloat> @test_cvt_scalef32_pk_bf16_bf8_word1(i32 %src, float %scale) {
1183 ; GCN-LABEL: test_cvt_scalef32_pk_bf16_bf8_word1:
1185 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1186 ; GCN-NEXT: v_cvt_scalef32_pk_bf16_bf8 v0, v0, v1 op_sel:[1,0,0]
1187 ; GCN-NEXT: s_setpc_b64 s[30:31]
1188 %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 %src, float %scale, i1 true)
1189 ret <2 x bfloat> %ret
; Calls llvm.amdgcn.cvt.scalef32.pk.fp4.f16 with the trailing i32 selector 0.
; %old is the third function argument (v2), so the checks expect the convert to
; write v2 and a v_mov_b32 to copy the result into v0. No op_sel modifier is
; printed.
1192 define i32 @test_cvt_scalef32_fp4_f16_byte0(<2 x half> %src0, float %scale, i32 %old) {
1193 ; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte0:
1195 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1196 ; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1
1197 ; GCN-NEXT: v_mov_b32_e32 v0, v2
1198 ; GCN-NEXT: s_setpc_b64 s[30:31]
1199 %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 0)
; Calls llvm.amdgcn.cvt.scalef32.pk.fp4.f16 with the trailing i32 selector 1.
; The checks expect the convert to write v2 (where %old arrives) with
; op_sel:[0,0,1,0], followed by a copy of the result into v0.
1203 define i32 @test_cvt_scalef32_fp4_f16_byte1(<2 x half> %src0, float %scale, i32 %old) {
1204 ; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte1:
1206 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1207 ; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,1,0]
1209 ; GCN-NEXT: v_mov_b32_e32 v0, v2
1210 ; GCN-NEXT: s_setpc_b64 s[30:31]
1211 %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 1)
; Calls llvm.amdgcn.cvt.scalef32.pk.fp4.f16 with the trailing i32 selector 2.
; The checks expect the convert to write v2 (where %old arrives) with
; op_sel:[0,0,0,1], followed by a copy of the result into v0.
1215 define i32 @test_cvt_scalef32_fp4_f16_byte2(<2 x half> %src0, float %scale, i32 %old) {
1216 ; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte2:
1218 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1219 ; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,0,1]
1221 ; GCN-NEXT: v_mov_b32_e32 v0, v2
1222 ; GCN-NEXT: s_setpc_b64 s[30:31]
1223 %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 2)
; Calls llvm.amdgcn.cvt.scalef32.pk.fp4.f16 with the trailing i32 selector 3.
; The checks expect the convert to write v2 (where %old arrives) with
; op_sel:[0,0,1,1], followed by a copy of the result into v0.
1227 define i32 @test_cvt_scalef32_fp4_f16_byte3(<2 x half> %src0, float %scale, i32 %old) {
1228 ; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte3:
1230 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1231 ; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,1,1]
1233 ; GCN-NEXT: v_mov_b32_e32 v0, v2
1234 ; GCN-NEXT: s_setpc_b64 s[30:31]
1235 %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 3)
; Calls llvm.amdgcn.cvt.scalef32.pk.fp4.bf16 with the trailing i32 selector 0.
; %old is the third function argument (v2), so the checks expect the convert to
; write v2 and a v_mov_b32 to copy the result into v0. No op_sel modifier is
; printed.
1239 define i32 @test_cvt_scalef32_fp4_bf16_byte0(<2 x bfloat> %src0, float %scale, i32 %old) {
1240 ; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte0:
1242 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1243 ; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1
1244 ; GCN-NEXT: v_mov_b32_e32 v0, v2
1245 ; GCN-NEXT: s_setpc_b64 s[30:31]
1246 %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 0)
; Calls llvm.amdgcn.cvt.scalef32.pk.fp4.bf16 with the trailing i32 selector 1.
; The checks expect the convert to write v2 (where %old arrives) with
; op_sel:[0,0,1,0], followed by a copy of the result into v0.
1250 define i32 @test_cvt_scalef32_fp4_bf16_byte1(<2 x bfloat> %src0, float %scale, i32 %old) {
1251 ; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte1:
1253 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1254 ; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,1,0]
1256 ; GCN-NEXT: v_mov_b32_e32 v0, v2
1257 ; GCN-NEXT: s_setpc_b64 s[30:31]
1258 %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 1)
; Calls llvm.amdgcn.cvt.scalef32.pk.fp4.bf16 with the trailing i32 selector 2.
; The checks expect the convert to write v2 (where %old arrives) with
; op_sel:[0,0,0,1], followed by a copy of the result into v0.
1262 define i32 @test_cvt_scalef32_fp4_bf16_byte2(<2 x bfloat> %src0, float %scale, i32 %old) {
1263 ; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte2:
1265 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1266 ; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,0,1]
1268 ; GCN-NEXT: v_mov_b32_e32 v0, v2
1269 ; GCN-NEXT: s_setpc_b64 s[30:31]
1270 %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 2)
1274 define i32 @test_cvt_scalef32_fp4_bf16_byte3(<2 x bfloat> %src0, float %scale, i32 %old) {
1275 ; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte3:
1277 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1278 ; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,1,1]
1280 ; GCN-NEXT: v_mov_b32_e32 v0, v2
1281 ; GCN-NEXT: s_setpc_b64 s[30:31]
1282 %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 3)