1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX10 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX11 %s
7 ; FIXME: GFX9 should be producing v_mad_u16 instead of v_mad_legacy_u16.
; Kernel test for the i16 multiply-add pattern on values loaded from memory.
; Each work-item uses its workitem.id.x as an i16 element index into %a, %b
; and %c, performs three volatile loads, computes (a * b) + c in i16, and
; stores the result through %r (no GEP is applied to %r).
; The assertions below expect v_mad_u16 for gfx803, gfx1010 and gfx1100, and
; v_mad_legacy_u16 for gfx900.
; NOTE(review): only the last parameter (%c) and no terminator are visible in
; this excerpt - confirm the rest of the signature against the full file.
9 define amdgpu_kernel void @mad_u16(
10 ; GFX8-LABEL: mad_u16:
11 ; GFX8: ; %bb.0: ; %entry
12 ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
13 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0
14 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
16 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v4
17 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
18 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
19 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v4
20 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
21 ; GFX8-NEXT: v_mov_b32_e32 v5, s7
22 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v4
23 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
24 ; GFX8-NEXT: flat_load_ushort v6, v[0:1] glc
25 ; GFX8-NEXT: s_waitcnt vmcnt(0)
26 ; GFX8-NEXT: flat_load_ushort v2, v[2:3] glc
27 ; GFX8-NEXT: s_waitcnt vmcnt(0)
28 ; GFX8-NEXT: flat_load_ushort v3, v[4:5] glc
29 ; GFX8-NEXT: s_waitcnt vmcnt(0)
30 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
31 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
32 ; GFX8-NEXT: v_mad_u16 v2, v6, v2, v3
33 ; GFX8-NEXT: flat_store_short v[0:1], v2
36 ; GFX9-LABEL: mad_u16:
37 ; GFX9: ; %bb.0: ; %entry
38 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
39 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
40 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
41 ; GFX9-NEXT: global_load_ushort v1, v0, s[10:11] glc
42 ; GFX9-NEXT: s_waitcnt vmcnt(0)
43 ; GFX9-NEXT: global_load_ushort v2, v0, s[12:13] glc
44 ; GFX9-NEXT: s_waitcnt vmcnt(0)
45 ; GFX9-NEXT: global_load_ushort v3, v0, s[14:15] glc
46 ; GFX9-NEXT: s_waitcnt vmcnt(0)
47 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
48 ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
49 ; GFX9-NEXT: global_store_short v0, v1, s[8:9]
52 ; GFX10-LABEL: mad_u16:
53 ; GFX10: ; %bb.0: ; %entry
54 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
55 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
56 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
57 ; GFX10-NEXT: global_load_ushort v1, v0, s[10:11] glc dlc
58 ; GFX10-NEXT: s_waitcnt vmcnt(0)
59 ; GFX10-NEXT: global_load_ushort v2, v0, s[12:13] glc dlc
60 ; GFX10-NEXT: s_waitcnt vmcnt(0)
61 ; GFX10-NEXT: global_load_ushort v3, v0, s[14:15] glc dlc
62 ; GFX10-NEXT: s_waitcnt vmcnt(0)
63 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
64 ; GFX10-NEXT: v_mad_u16 v1, v1, v2, v3
65 ; GFX10-NEXT: global_store_short v0, v1, s[8:9]
66 ; GFX10-NEXT: s_endpgm
68 ; GFX11-LABEL: mad_u16:
69 ; GFX11: ; %bb.0: ; %entry
70 ; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
71 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
72 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
73 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
74 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
75 ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
76 ; GFX11-NEXT: s_waitcnt vmcnt(0)
77 ; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
78 ; GFX11-NEXT: s_waitcnt vmcnt(0)
79 ; GFX11-NEXT: global_load_u16 v0, v0, s[6:7] glc dlc
80 ; GFX11-NEXT: s_waitcnt vmcnt(0)
81 ; GFX11-NEXT: v_mad_u16 v0, v1, v2, v0
82 ; GFX11-NEXT: global_store_b16 v3, v0, s[0:1]
83 ; GFX11-NEXT: s_endpgm
87 ptr addrspace(1) %c) {
; Per-work-item index used for all three element addresses below.
89 %tid = call i32 @llvm.amdgcn.workitem.id.x()
90 %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a, i32 %tid
91 %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b, i32 %tid
92 %c.gep = getelementptr inbounds i16, ptr addrspace(1) %c, i32 %tid
; The loads are volatile; the expected output waits on vmcnt after each one.
94 %a.val = load volatile i16, ptr addrspace(1) %a.gep
95 %b.val = load volatile i16, ptr addrspace(1) %b.gep
96 %c.val = load volatile i16, ptr addrspace(1) %c.gep
; mul followed by add is the pattern under test.
98 %m.val = mul i16 %a.val, %b.val
99 %r.val = add i16 %m.val, %c.val
101 store i16 %r.val, ptr addrspace(1) %r
; Register-argument variant: i16 mul of %arg0 and %arg1 followed by an i16 add
; of %arg2, declared to return i16.
; The assertions expect a single 16-bit mad on v0, v1, v2 for every target:
; v_mad_legacy_u16 on gfx900 and v_mad_u16 on gfx803, gfx1010 and gfx1100.
; NOTE(review): the ret is not visible in this excerpt - presumably %add is
; returned; confirm against the full file.
105 define i16 @v_mad_u16(i16 %arg0, i16 %arg1, i16 %arg2) {
106 ; GFX8-LABEL: v_mad_u16:
108 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109 ; GFX8-NEXT: v_mad_u16 v0, v0, v1, v2
110 ; GFX8-NEXT: s_setpc_b64 s[30:31]
112 ; GFX9-LABEL: v_mad_u16:
114 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
115 ; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v1, v2
116 ; GFX9-NEXT: s_setpc_b64 s[30:31]
118 ; GFX10-LABEL: v_mad_u16:
120 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121 ; GFX10-NEXT: v_mad_u16 v0, v0, v1, v2
122 ; GFX10-NEXT: s_setpc_b64 s[30:31]
124 ; GFX11-LABEL: v_mad_u16:
126 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
127 ; GFX11-NEXT: v_mad_u16 v0, v0, v1, v2
128 ; GFX11-NEXT: s_setpc_b64 s[30:31]
129 %mul = mul i16 %arg0, %arg1
130 %add = add i16 %mul, %arg2
; Same i16 mul + add pattern, with the i16 sum zero-extended to i32.
; Expected output differs by target: for gfx803 and gfx900 the mad is followed
; directly by the return, while for gfx1010 and gfx1100 the mad result is
; additionally masked with 0xffff (v_and_b32) before returning.
; NOTE(review): presumably the mask is omitted where the upper 16 bits of the
; mad result are already known to be zero - confirm against the backend.
134 define i32 @v_mad_u16_zext(i16 %arg0, i16 %arg1, i16 %arg2) {
135 ; GFX8-LABEL: v_mad_u16_zext:
137 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
138 ; GFX8-NEXT: v_mad_u16 v0, v0, v1, v2
139 ; GFX8-NEXT: s_setpc_b64 s[30:31]
141 ; GFX9-LABEL: v_mad_u16_zext:
143 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
144 ; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v1, v2
145 ; GFX9-NEXT: s_setpc_b64 s[30:31]
147 ; GFX10-LABEL: v_mad_u16_zext:
149 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
150 ; GFX10-NEXT: v_mad_u16 v0, v0, v1, v2
151 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
152 ; GFX10-NEXT: s_setpc_b64 s[30:31]
154 ; GFX11-LABEL: v_mad_u16_zext:
156 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
157 ; GFX11-NEXT: v_mad_u16 v0, v0, v1, v2
158 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
159 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
160 ; GFX11-NEXT: s_setpc_b64 s[30:31]
161 %mul = mul i16 %arg0, %arg1
162 %add = add i16 %mul, %arg2
163 %zext = zext i16 %add to i32
; Same i16 mul + add pattern, with the i16 sum zero-extended to i64.
; Every target is expected to write 0 to v1 after the mad. For gfx1010 and
; gfx1100 the low half in v0 is also masked with 0xffff; on gfx1100 the zero
; move and the mask are paired in one v_dual_mov_b32 / v_dual_and_b32.
167 define i64 @v_mad_u16_zext64(i16 %arg0, i16 %arg1, i16 %arg2) {
168 ; GFX8-LABEL: v_mad_u16_zext64:
170 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
171 ; GFX8-NEXT: v_mad_u16 v0, v0, v1, v2
172 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
173 ; GFX8-NEXT: s_setpc_b64 s[30:31]
175 ; GFX9-LABEL: v_mad_u16_zext64:
177 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
178 ; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v1, v2
179 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
180 ; GFX9-NEXT: s_setpc_b64 s[30:31]
182 ; GFX10-LABEL: v_mad_u16_zext64:
184 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
185 ; GFX10-NEXT: v_mad_u16 v0, v0, v1, v2
186 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
187 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
188 ; GFX10-NEXT: s_setpc_b64 s[30:31]
190 ; GFX11-LABEL: v_mad_u16_zext64:
192 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
193 ; GFX11-NEXT: v_mad_u16 v0, v0, v1, v2
194 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
195 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v0
196 ; GFX11-NEXT: s_setpc_b64 s[30:31]
197 %mul = mul i16 %arg0, %arg1
198 %add = add i16 %mul, %arg2
199 %zext = zext i16 %add to i64
; amdgpu_ps variant with inreg arguments: i16 mul of %arg0 and %arg1 followed
; by an i16 add of %arg2, declared to return i16.
; All targets are expected to emit s_mul_i32 then s_add_i32 on s0, s1, s2
; rather than a vector mad; gfx1100 places an s_delay_alu between the two.
203 define amdgpu_ps i16 @s_mad_u16(i16 inreg %arg0, i16 inreg %arg1, i16 inreg %arg2) {
204 ; GFX8-LABEL: s_mad_u16:
206 ; GFX8-NEXT: s_mul_i32 s0, s0, s1
207 ; GFX8-NEXT: s_add_i32 s0, s0, s2
208 ; GFX8-NEXT: ; return to shader part epilog
210 ; GFX9-LABEL: s_mad_u16:
212 ; GFX9-NEXT: s_mul_i32 s0, s0, s1
213 ; GFX9-NEXT: s_add_i32 s0, s0, s2
214 ; GFX9-NEXT: ; return to shader part epilog
216 ; GFX10-LABEL: s_mad_u16:
218 ; GFX10-NEXT: s_mul_i32 s0, s0, s1
219 ; GFX10-NEXT: s_add_i32 s0, s0, s2
220 ; GFX10-NEXT: ; return to shader part epilog
222 ; GFX11-LABEL: s_mad_u16:
224 ; GFX11-NEXT: s_mul_i32 s0, s0, s1
225 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
226 ; GFX11-NEXT: s_add_i32 s0, s0, s2
227 ; GFX11-NEXT: ; return to shader part epilog
228 %mul = mul i16 %arg0, %arg1
229 %add = add i16 %mul, %arg2
; amdgpu_ps variant of the i16 mul + add pattern with the sum zero-extended
; to i32. All targets are expected to emit s_mul_i32, s_add_i32 and then an
; s_and_b32 with 0xffff; gfx1100 additionally carries an s_delay_alu.
233 define amdgpu_ps i32 @s_mad_u16_zext(i16 inreg %arg0, i16 inreg %arg1, i16 inreg %arg2) {
234 ; GFX8-LABEL: s_mad_u16_zext:
236 ; GFX8-NEXT: s_mul_i32 s0, s0, s1
237 ; GFX8-NEXT: s_add_i32 s0, s0, s2
238 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
239 ; GFX8-NEXT: ; return to shader part epilog
241 ; GFX9-LABEL: s_mad_u16_zext:
243 ; GFX9-NEXT: s_mul_i32 s0, s0, s1
244 ; GFX9-NEXT: s_add_i32 s0, s0, s2
245 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
246 ; GFX9-NEXT: ; return to shader part epilog
248 ; GFX10-LABEL: s_mad_u16_zext:
250 ; GFX10-NEXT: s_mul_i32 s0, s0, s1
251 ; GFX10-NEXT: s_add_i32 s0, s0, s2
252 ; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
253 ; GFX10-NEXT: ; return to shader part epilog
255 ; GFX11-LABEL: s_mad_u16_zext:
257 ; GFX11-NEXT: s_mul_i32 s0, s0, s1
258 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
259 ; GFX11-NEXT: s_add_i32 s0, s0, s2
260 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
261 ; GFX11-NEXT: ; return to shader part epilog
262 %mul = mul i16 %arg0, %arg1
263 %add = add i16 %mul, %arg2
264 %zext = zext i16 %add to i32
; amdgpu_ps variant of the i16 mul + add pattern with the sum zero-extended
; to i64. All targets are expected to emit s_mul_i32, s_add_i32, an s_and_b32
; with 0xffff on s0, and s_mov_b32 s1, 0 for the upper half. The position of
; the s_mov_b32 differs: last for gfx803 and gfx900, directly after the
; multiply for gfx1010 and gfx1100.
268 define amdgpu_ps i64 @s_mad_u16_zext64(i16 inreg %arg0, i16 inreg %arg1, i16 inreg %arg2) {
269 ; GFX8-LABEL: s_mad_u16_zext64:
271 ; GFX8-NEXT: s_mul_i32 s0, s0, s1
272 ; GFX8-NEXT: s_add_i32 s0, s0, s2
273 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
274 ; GFX8-NEXT: s_mov_b32 s1, 0
275 ; GFX8-NEXT: ; return to shader part epilog
277 ; GFX9-LABEL: s_mad_u16_zext64:
279 ; GFX9-NEXT: s_mul_i32 s0, s0, s1
280 ; GFX9-NEXT: s_add_i32 s0, s0, s2
281 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
282 ; GFX9-NEXT: s_mov_b32 s1, 0
283 ; GFX9-NEXT: ; return to shader part epilog
285 ; GFX10-LABEL: s_mad_u16_zext64:
287 ; GFX10-NEXT: s_mul_i32 s0, s0, s1
288 ; GFX10-NEXT: s_mov_b32 s1, 0
289 ; GFX10-NEXT: s_add_i32 s0, s0, s2
290 ; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
291 ; GFX10-NEXT: ; return to shader part epilog
293 ; GFX11-LABEL: s_mad_u16_zext64:
295 ; GFX11-NEXT: s_mul_i32 s0, s0, s1
296 ; GFX11-NEXT: s_mov_b32 s1, 0
297 ; GFX11-NEXT: s_add_i32 s0, s0, s2
298 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
299 ; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
300 ; GFX11-NEXT: ; return to shader part epilog
301 %mul = mul i16 %arg0, %arg1
302 %add = add i16 %mul, %arg2
303 %zext = zext i16 %add to i64
; amdgpu_ps variant of the i16 mul + add pattern with the sum sign-extended
; to i32. All targets are expected to emit s_mul_i32, s_add_i32 and then
; s_sext_i32_i16 on s0; gfx1100 additionally carries an s_delay_alu.
307 define amdgpu_ps i32 @s_mad_u16_sext(i16 inreg %arg0, i16 inreg %arg1, i16 inreg %arg2) {
308 ; GFX8-LABEL: s_mad_u16_sext:
310 ; GFX8-NEXT: s_mul_i32 s0, s0, s1
311 ; GFX8-NEXT: s_add_i32 s0, s0, s2
312 ; GFX8-NEXT: s_sext_i32_i16 s0, s0
313 ; GFX8-NEXT: ; return to shader part epilog
315 ; GFX9-LABEL: s_mad_u16_sext:
317 ; GFX9-NEXT: s_mul_i32 s0, s0, s1
318 ; GFX9-NEXT: s_add_i32 s0, s0, s2
319 ; GFX9-NEXT: s_sext_i32_i16 s0, s0
320 ; GFX9-NEXT: ; return to shader part epilog
322 ; GFX10-LABEL: s_mad_u16_sext:
324 ; GFX10-NEXT: s_mul_i32 s0, s0, s1
325 ; GFX10-NEXT: s_add_i32 s0, s0, s2
326 ; GFX10-NEXT: s_sext_i32_i16 s0, s0
327 ; GFX10-NEXT: ; return to shader part epilog
329 ; GFX11-LABEL: s_mad_u16_sext:
331 ; GFX11-NEXT: s_mul_i32 s0, s0, s1
332 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
333 ; GFX11-NEXT: s_add_i32 s0, s0, s2
334 ; GFX11-NEXT: s_sext_i32_i16 s0, s0
335 ; GFX11-NEXT: ; return to shader part epilog
336 %mul = mul i16 %arg0, %arg1
337 %add = add i16 %mul, %arg2
338 %sext = sext i16 %add to i32
; Intrinsic declaration; called by @mad_u16 to obtain the i32 index %tid used
; in its getelementptr instructions.
342 declare i32 @llvm.amdgcn.workitem.id.x()
343 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: