; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11
; Single-use case: r = a * b + 10.0 on half values.
; The literal 10.0 appears as 0x4900 (half) or 0x41200000 (float) in the
; expected output. Per the assertions below:
;   - tahiti converts to f32 and uses v_madak_f32, then converts back;
;   - fiji uses v_madak_f16 directly;
;   - gfx1100 emits a separate v_mul_f16 and v_add_f16.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @madak_f16(
; SI-LABEL: madak_f16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s6
; SI-NEXT: s_mov_b32 s13, s7
; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_madak_f32 v0, v0, v1, 0x41200000
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: madak_f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s6
; VI-NEXT: s_mov_b32 s13, s7
; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_madak_f16 v0, v0, v1, 0x4900
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: madak_f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: s_mov_b32 s10, -1
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-NEXT: s_mov_b32 s14, s10
; GFX11-NEXT: s_mov_b32 s15, s11
; GFX11-NEXT: s_mov_b32 s2, s10
; GFX11-NEXT: s_mov_b32 s3, s11
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0
; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0
; GFX11-NEXT: s_mov_b32 s8, s4
; GFX11-NEXT: s_mov_b32 s9, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0
; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) #0 {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %b.val = load half, ptr addrspace(1) %b

  %t.val = fmul half %a.val, %b.val
  %r.val = fadd half %t.val, 10.0

  store half %r.val, ptr addrspace(1) %r
  ret void
}
; Two-use case: r0 = a * b + 10.0 and r1 = a * c + 10.0, so the literal
; addend is shared by two multiply-adds. The loads are volatile, which is
; why the expected output waits after each load and keeps them in a, b, c
; order. Per the assertions below:
;   - tahiti and fiji use v_madak for the first result, and for the second
;     materialize the literal in a VGPR and accumulate into it with v_mac;
;   - gfx1100 emits two v_mul_f16 / v_add_f16 pairs.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @madak_f16_use_2(
; SI-LABEL: madak_f16_use_2:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s16, s8
; SI-NEXT: s_mov_b32 s17, s9
; SI-NEXT: s_mov_b32 s19, s3
; SI-NEXT: s_mov_b32 s8, s10
; SI-NEXT: s_mov_b32 s9, s11
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v3, 0x41200000
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_madak_f32 v1, v0, v1, 0x41200000
; SI-NEXT: v_mac_f32_e32 v3, v0, v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v3
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: buffer_store_short v1, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: madak_f16_use_2:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s18, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s16, s8
; VI-NEXT: s_mov_b32 s17, s9
; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_mov_b32 s8, s10
; VI-NEXT: s_mov_b32 s9, s11
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, 0x4900
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s8, s6
; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: v_madak_f16 v1, v0, v1, 0x4900
; VI-NEXT: v_mac_f16_e32 v3, v0, v2
; VI-NEXT: buffer_store_short v1, off, s[0:3], 0
; VI-NEXT: buffer_store_short v3, off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: madak_f16_use_2:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
; GFX11-NEXT: s_mov_b32 s14, -1
; GFX11-NEXT: s_mov_b32 s15, 0x31016000
; GFX11-NEXT: s_mov_b32 s18, s14
; GFX11-NEXT: s_mov_b32 s19, s15
; GFX11-NEXT: s_mov_b32 s22, s14
; GFX11-NEXT: s_mov_b32 s23, s15
; GFX11-NEXT: s_mov_b32 s2, s14
; GFX11-NEXT: s_mov_b32 s3, s15
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s16, s8
; GFX11-NEXT: s_mov_b32 s17, s9
; GFX11-NEXT: s_mov_b32 s20, s10
; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[0:3], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s4
; GFX11-NEXT: s_mov_b32 s13, s5
; GFX11-NEXT: s_mov_b32 s0, s6
; GFX11-NEXT: s_mov_b32 s1, s7
; GFX11-NEXT: v_mul_f16_e32 v1, v0, v1
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f16_e32 v1, 0x4900, v1
; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0
; GFX11-NEXT: buffer_store_b16 v1, off, s[12:15], 0
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
    ptr addrspace(1) %r0,
    ptr addrspace(1) %r1,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b,
    ptr addrspace(1) %c) #0 {
entry:
  %a.val = load volatile half, ptr addrspace(1) %a
  %b.val = load volatile half, ptr addrspace(1) %b
  %c.val = load volatile half, ptr addrspace(1) %c

  %t0.val = fmul half %a.val, %b.val
  %t1.val = fmul half %a.val, %c.val
  %r0.val = fadd half %t0.val, 10.0
  %r1.val = fadd half %t1.val, 10.0

  store half %r0.val, ptr addrspace(1) %r0
  store half %r1.val, ptr addrspace(1) %r1
  ret void
}
; Attribute group #0, referenced by the kernel definitions in this file.
; The "denormal-fp-math" value is an "output,input" pair of modes; both are
; set to preserve-sign here.
attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" }