1 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8PLUS,MAD %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8PLUS,MAD %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,GFX10-MAD %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,FMA,GFX10PLUS-FMA %s
6 ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8PLUS,FMA,GFX940-FMA %s
7 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,GFX11-MAD %s
8 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,FMA,GFX10PLUS-FMA %s
; Intrinsics called by the test kernels in this file.
; The i32 result is used as the element index into the in/out pointers.
10 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; Called by the no_madak_src0/src1_modifier_f32 tests to put a fabs on one
; multiplicand.
11 declare float @llvm.fabs.f32(float) nounwind readnone
; Basic case. Two loaded values are multiplied and the literal 10.0
; (0x41200000) is added. The tahiti/tonga/gfx900 runs and the gfx1010 run
; without -fp-contract=fast expect v_madak_f32; the -fp-contract=fast runs
; expect v_fmaak_f32.
13 ; GCN-LABEL: {{^}}madak_f32:
14 ; GFX6: buffer_load_dword [[VA:v[0-9]+]]
15 ; GFX6: buffer_load_dword [[VB:v[0-9]+]]
16 ; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
17 ; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
18 ; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
19 ; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
20 ; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]]
21 ; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]]
22 ; MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
23 ; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
24 ; FMA: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
25 define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 {
; Index every pointer by the workitem id.
26 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
27 %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
28 %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
29 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
31 %a = load float, ptr addrspace(1) %in.a.gep, align 4
32 %b = load float, ptr addrspace(1) %in.b.gep, align 4
; a * b + 10.0 -- the fmul/fadd pair the checks expect to be combined.
34 %mul = fmul float %a, %b
35 %madak = fadd float %mul, 10.0
36 store float %madak, ptr addrspace(1) %out.gep, align 4
40 ; Make sure this is only folded with one use. This is a code size
41 ; optimization and if we fold the immediate multiple times, we'll undo
44 ; GCN-LABEL: {{^}}madak_2_use_f32:
45 ; GFX9: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
46 ; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
47 ; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
48 ; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
49 ; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]],
50 ; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]],
51 ; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VC:v[0-9]+]],
52 ; GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
53 ; GFX8-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
54 ; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
55 ; GFX10-MAD-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
56 ; FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
57 ; MAD-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
58 ; GFX10PLUS-FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VC]], 0x41200000
59 ; GFX940-FMA-DAG: v_fmac_f32_e32 [[VK]], [[VA]], [[VC]]
; %a is multiplied with both %b and %c and 10.0 is added to each product, so
; the constant 0x41200000 has two uses in this kernel.
61 define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
62 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; Three consecutive floats starting at %in[%tid].
64 %in.gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
65 %in.gep.1 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 1
66 %in.gep.2 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 2
68 %out.gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
; NOTE(review): %out.gep.1 is derived from %in.gep.0 rather than from %out, so
; the second store below writes through the input pointer. This looks like a
; copy-paste slip, but the checks above pin the current codegen -- confirm
; before changing it.
69 %out.gep.1 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 1
; Volatile accesses: each load and store must be emitted as written.
71 %a = load volatile float, ptr addrspace(1) %in.gep.0, align 4
72 %b = load volatile float, ptr addrspace(1) %in.gep.1, align 4
73 %c = load volatile float, ptr addrspace(1) %in.gep.2, align 4
75 %mul0 = fmul float %a, %b
76 %mul1 = fmul float %a, %c
77 %madak0 = fadd float %mul0, 10.0
78 %madak1 = fadd float %mul1, 10.0
80 store volatile float %madak0, ptr addrspace(1) %out.gep.0, align 4
81 store volatile float %madak1, ptr addrspace(1) %out.gep.1, align 4
; One multiplicand is the constant 4.0 and the addend is the literal 10.0.
; The checks expect 4.0 to appear directly as the first source operand of
; v_madak_f32 / v_fmaak_f32.
85 ; GCN-LABEL: {{^}}madak_m_inline_imm_f32:
86 ; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]]
87 ; MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
88 ; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
89 ; FMA: v_fmaak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
90 define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a) #0 {
91 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
92 %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
93 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
95 %a = load float, ptr addrspace(1) %in.a.gep, align 4
; 4.0 * a + 10.0
97 %mul = fmul float 4.0, %a
98 %madak = fadd float %mul, 10.0
99 store float %madak, ptr addrspace(1) %out.gep, align 4
103 ; Make sure nothing weird happens with a value that is also allowed as
104 ; an inline immediate.
; Here the addend is 4.0, so the checks expect plain v_mad_f32 / v_fma_f32
; with 4.0 as the third source instead of the madak/fmaak literal forms.
106 ; GCN-LABEL: {{^}}madak_inline_imm_f32:
107 ; GFX6: buffer_load_dword [[VA:v[0-9]+]]
108 ; GFX6: buffer_load_dword [[VB:v[0-9]+]]
109 ; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
110 ; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
111 ; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
112 ; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
113 ; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]]
114 ; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]]
115 ; MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
116 ; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
117 ; FMA: v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
118 define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 {
119 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
120 %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
121 %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
122 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
124 %a = load float, ptr addrspace(1) %in.a.gep, align 4
125 %b = load float, ptr addrspace(1) %in.b.gep, align 4
; a * b + 4.0
127 %mul = fmul float %a, %b
128 %madak = fadd float %mul, 4.0
129 store float %madak, ptr addrspace(1) %out.gep, align 4
133 ; We can't use an SGPR when forming madak on the tahiti/tonga/gfx900 runs.
; Those runs expect the constant in a VGPR feeding v_mac_f32, and the gfx940
; run likewise expects v_fmac_f32. The gfx1010 run without -fp-contract=fast
; does expect v_madak_f32 with the SGPR operand, the gfx1010/gfx1100
; -fp-contract=fast runs expect v_fmaak_f32, and gfx1100 without
; -fp-contract=fast expects a separate v_mul_f32 + v_add_f32.
; NOTE(review): the v_madak_f32 exclusion below uses the common GCN prefix,
; while v_s_madak_f32 restricts the same exclusion to GFX6_8_9. The gfx1010
; expectation presumably still passes because the exclusion only covers
; output before the next positive match -- confirm which scope is intended.
134 ; GCN-LABEL: {{^}}s_v_madak_f32:
135 ; GCN-DAG: s_load_{{dword|b32}} [[SB:s[0-9]+]]
136 ; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
137 ; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}}{{(_addtid)?}} [[VA:v[0-9]+]]
138 ; GCN-NOT: v_madak_f32
139 ; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
140 ; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
141 ; GFX10PLUS-FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
142 ; GFX940-FMA: v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]]
143 ; GFX11-MAD: v_mul_f32_e32 [[VMUL:v[0-9]+]], [[SB]], [[VA]]
144 ; GFX11-MAD: v_add_f32_e32 {{v[0-9]+}}, 0x41200000, [[VMUL]]
145 define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, float %b) #0 {
146 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
147 %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
148 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
; %a is loaded per workitem; %b is the float kernel argument.
150 %a = load float, ptr addrspace(1) %in.a.gep, align 4
152 %mul = fmul float %a, %b
153 %madak = fadd float %mul, 10.0
154 store float %madak, ptr addrspace(1) %out.gep, align 4
; Mirror of s_v_madak_f32 with the operand roles swapped. Here %a is the float
; kernel argument and %b is loaded per workitem. The expected instructions
; are the same as in s_v_madak_f32; only the v_madak_f32 exclusion is limited
; to the GFX6_8_9 runs.
158 ; GCN-LABEL: {{^}}v_s_madak_f32:
159 ; GCN-DAG: s_load_{{dword|b32}} [[SB:s[0-9]+]]
160 ; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
161 ; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}}{{(_addtid)?}} [[VA:v[0-9]+]]
162 ; GFX6_8_9-NOT: v_madak_f32
163 ; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
164 ; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
165 ; GFX10PLUS-FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
166 ; GFX940-FMA: v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]]
167 ; GFX11-MAD: v_mul_f32_e32 [[VMUL:v[0-9]+]], [[SB]], [[VA]]
168 ; GFX11-MAD: v_add_f32_e32 {{v[0-9]+}}, 0x41200000, [[VMUL]]
169 define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a, ptr addrspace(1) noalias %in.b) #0 {
170 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
171 %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
172 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
174 %b = load float, ptr addrspace(1) %in.b.gep, align 4
; a * b + 10.0 with the scalar argument as the first fmul operand.
176 %mul = fmul float %a, %b
177 %madak = fadd float %mul, 10.0
178 store float %madak, ptr addrspace(1) %out.gep, align 4
; Both multiplicands are float kernel arguments. No run may emit v_madak_f32.
; The expected forms are v_mac_f32 / v_fmac_f32, or on gfx1100 without
; -fp-contract=fast a v_mul_f32 followed by a dual-issue mov/add.
182 ; GCN-LABEL: {{^}}s_s_madak_f32:
183 ; GCN-NOT: v_madak_f32
184 ; GFX8_9: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
185 ; GFX10-MAD: v_mac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
186 ; GFX10PLUS-FMA: v_fmac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
187 ; GFX940-FMA: v_fmac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
188 ; GFX11-MAD: v_mul_f32_e64 [[VMUL:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
189 ; GFX11-MAD: v_dual_mov_b32 {{v[0-9]+}}, 0 :: v_dual_add_f32 {{v[0-9]+}}, 0x41200000, [[VMUL]]
190 define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float %b) #0 {
; a * b + 10.0, stored straight to %out (no workitem indexing here).
191 %mul = fmul float %a, %b
192 %madak = fadd float %mul, 10.0
193 store float %madak, ptr addrspace(1) %out, align 4
; The first multiplicand carries a fabs. The checks expect the |...| source
; modifier on v_mad_f32 / v_fma_f32 (or on v_mul_f32 for gfx1100 without
; -fp-contract=fast); none of the expected instructions is a madak/fmaak form.
197 ; GCN-LABEL: {{^}}no_madak_src0_modifier_f32:
198 ; GFX6: buffer_load_dword [[VA:v[0-9]+]]
199 ; GFX6: buffer_load_dword [[VB:v[0-9]+]]
200 ; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]]
201 ; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]]
202 ; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
203 ; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
204 ; GFX10PLUS-FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
205 ; GFX940-FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{s[0-9]+}}
206 ; GFX11-MAD: v_mul_f32_e64 [[VMUL:v[0-9]+]], |{{v[0-9]+}}|, {{v[0-9]+}}
207 ; GFX11-MAD: v_add_f32_e32 {{v[0-9]+}}, 0x41200000, [[VMUL]]
209 define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 {
210 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
211 %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
212 %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
213 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
215 %a = load float, ptr addrspace(1) %in.a.gep, align 4
216 %b = load float, ptr addrspace(1) %in.b.gep, align 4
; |a| * b + 10.0 -- the fabs is on the first fmul operand.
218 %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone
220 %mul = fmul float %a.fabs, %b
221 %madak = fadd float %mul, 10.0
222 store float %madak, ptr addrspace(1) %out.gep, align 4
; Same as no_madak_src0_modifier_f32, but the fabs is on the second
; multiplicand, so the |...| modifier is expected on the second source.
226 ; GCN-LABEL: {{^}}no_madak_src1_modifier_f32:
227 ; GFX6: buffer_load_dword [[VA:v[0-9]+]]
228 ; GFX6: buffer_load_dword [[VB:v[0-9]+]]
229 ; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]]
230 ; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]]
231 ; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
232 ; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
233 ; GFX10PLUS-FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
234 ; GFX940-FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{s[0-9]+}}
235 ; GFX11-MAD: v_mul_f32_e64 [[VMUL:v[0-9]+]], {{v[0-9]+}}, |{{v[0-9]+}}|
236 ; GFX11-MAD: v_add_f32_e32 {{v[0-9]+}}, 0x41200000, [[VMUL]]
238 define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 {
239 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
240 %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
241 %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
242 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
244 %a = load float, ptr addrspace(1) %in.a.gep, align 4
245 %b = load float, ptr addrspace(1) %in.b.gep, align 4
; a * |b| + 10.0 -- the fabs is on the second fmul operand.
247 %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone
249 %mul = fmul float %a, %b.fabs
250 %madak = fadd float %mul, 10.0
251 store float %madak, ptr addrspace(1) %out.gep, align 4
255 ; SIFoldOperands should not fold the SGPR copy into the instruction before GFX10
256 ; because the implicit immediate already uses the constant bus.
257 ; On GFX10+ we can use two scalar operands.
; The kernel computes (%sgpr0 * 0.5 + 42.0) * %vgpr; 42.0 is the literal
; 0x42280000 that the checks below look for.
258 ; GCN-LABEL: {{^}}madak_constant_bus_violation:
259 ; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[VGPR:v[0-9]+]]
260 ; GCN: s_load_{{dword|b32}} [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
261 ; MAD: v_mov_b32_e32 [[MADAK:v[0-9]+]], 0x42280000
262 ; MAD: v_mac_f32_e64 [[MADAK]], [[SGPR0]], 0.5
263 ; GFX10-MAD: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
264 ; GFX10PLUS-FMA: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
265 ; GFX10-MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
266 ; GFX10PLUS-FMA: v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
267 ; GFX940-FMA: v_fmac_f32_e64 [[MADAK:v[0-9]+]], [[SGPR0]], 0.5
268 ; GFX11-MAD: v_mul_f32_e64 [[VMUL:v[0-9]+]], [[SGPR0]], 0.5
269 ; GFX11-MAD: v_add_f32_e32 [[MADAK:v[0-9]+]], 0x42280000, [[VMUL]]
270 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
271 ; GFX6: buffer_store_dword [[MUL]]
272 ; GFX8PLUS: {{flat|global}}_store_{{dword|b32}} v[{{[0-9:]+}}], [[MUL]]
273 define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 {
; Branch on %arg1 == 0 to %bb3 or %bb4 (the block labels themselves are not
; visible in this excerpt).
275 %tmp = icmp eq i32 %arg1, 0
276 br i1 %tmp, label %bb3, label %bb4
279 store volatile float 0.0, ptr addrspace(1) undef
; The loaded value is the multiplier the final v_mul_f32 check names VGPR.
283 %vgpr = load volatile float, ptr addrspace(1) undef
284 %tmp0 = fmul float %sgpr0, 0.5
285 %tmp1 = fadd float %tmp0, 42.0
286 %tmp2 = fmul float %tmp1, %vgpr
287 store volatile float %tmp2, ptr addrspace(1) undef, align 4
; Attribute group shared by every kernel in this file. It marks them nounwind
; and sets "denormal-fp-math-f32" to preserve-sign,preserve-sign.
291 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }