1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s
2 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s
4 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
5 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s
7 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GFX10-FLUSH,GFX10 %s
8 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GFX10-FLUSH,GFX10 %s
9 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s
10 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s
12 declare i32 @llvm.amdgcn.workitem.id.x() #1
13 declare half @llvm.fmuladd.f16(half, half, half) #1
14 declare half @llvm.fabs.f16(half) #1
16 ; GCN-LABEL: {{^}}fmuladd_f16:
17 ; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
19 ; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
21 ; GFX10-FLUSH: v_mul_f16_e32
22 ; GFX10-FLUSH: v_add_f16_e32
23 ; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                                       half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %r3 = tail call half @llvm.fmuladd.f16(half %r0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %out
  ret void
}
35 ; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16
36 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
37 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
38 ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
39 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
41 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
42 ; GFX10-DENORM: v_fmac_f16_e32 [[R2:v[0-9]+]], 2.0, [[R1]]
44 ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
45 ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
47 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
48 ; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
49 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
65 ; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16
66 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
67 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
68 ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
69 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
71 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
72 ; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
74 ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
75 ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
77 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
78 ; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
79 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
95 ; GCN-LABEL: {{^}}fadd_a_a_b_f16:
96 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
97 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
98 ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
99 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
101 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
102 ; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
104 ; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
105 ; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
107 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
109 ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
110 ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
111 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
112 ; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
113 ; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
                                          half addrspace(1)* %in1,
                                          half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %add.0, %r1
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}
132 ; GCN-LABEL: {{^}}fadd_b_a_a_f16:
133 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
134 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
135 ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
136 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
138 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
139 ; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
141 ; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
142 ; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
144 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
146 ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
147 ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
148 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
149 ; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
150 ; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
                                          half addrspace(1)* %in1,
                                          half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %r1, %add.0
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}
169 ; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16
170 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
171 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
172 ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
173 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
174 ; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
175 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
176 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
177 ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
178 ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
179 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
180 ; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
195 ; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16
196 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
197 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
198 ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
199 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
201 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
202 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
204 ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
205 ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
206 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
208 ; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
209 ; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  ; fneg spelled as fsub from -0.0; the two negations should cancel.
  %r1.fneg = fsub half -0.000000e+00, %r1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
226 ; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16
227 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
228 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
229 ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
230 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
232 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
233 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
235 ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
236 ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
237 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
239 ; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
240 ; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  ; fneg spelled as fsub from -0.0; folds into the multiply constant.
  %r1.fneg = fsub half -0.000000e+00, %r1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
257 ; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16
258 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
259 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
260 ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
261 ; GCN-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
262 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
263 ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
264 ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
265 ; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  ; fneg of the addend, spelled as fsub from -0.0.
  %r2.fneg = fsub half -0.000000e+00, %r2

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
282 ; GCN-LABEL: {{^}}mad_sub_f16:
283 ; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
284 ; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
285 ; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
287 ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
289 ; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
291 ; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
292 ; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
294 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
296 ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
297 ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
298 ; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
317 ; GCN-LABEL: {{^}}mad_sub_inv_f16:
318 ; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
319 ; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
320 ; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
321 ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
323 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
324 ; GFX10-DENORM-CONTRACT: v_fmac_f16_e64 [[REGC]], -[[REGA]], [[REGB]]
326 ; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
327 ; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
329 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
331 ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
332 ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
333 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
334 ; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %c, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
353 ; GCN-LABEL: {{^}}mad_sub_fabs_f16:
354 ; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
355 ; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
356 ; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
357 ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
359 ; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
361 ; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
362 ; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
364 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
366 ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
367 ; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
368 ; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c.abs
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
388 ; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16:
389 ; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
390 ; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
391 ; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
393 ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
395 ; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
397 ; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
398 ; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
400 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
402 ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
403 ; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
404 ; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %c.abs, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
424 ; GCN-LABEL: {{^}}neg_neg_mad_f16:
425 ; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
426 ; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
427 ; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
429 ; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
430 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
432 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
433 ; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
435 ; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
436 ; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
437 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
439 ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
440 ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
441 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
442 ; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  ; (-a) * (-b) + c: the two fnegs should cancel into a plain mad/fma.
  %nega = fsub half -0.000000e+00, %a
  %negb = fsub half -0.000000e+00, %b
  %mul = fmul half %nega, %negb
  %sub = fadd half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
463 ; GCN-LABEL: {{^}}mad_fabs_sub_f16:
464 ; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
465 ; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
466 ; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
468 ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
470 ; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
472 ; GCN-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
473 ; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
475 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
477 ; GFX10-FLUSH: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
478 ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
479 ; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %b.abs = call half @llvm.fabs.f16(half %b) #0
  %mul = fmul half %a, %b.abs
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
499 ; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16:
500 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
501 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
502 ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
503 ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
505 ; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
506 ; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
508 ; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
509 ; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
511 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
513 ; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
514 ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
515 ; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
516 ; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
517 ; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %r2, %add

  store half %r3, half addrspace(1)* %gep.out
  ret void
}
534 ; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16:
535 ; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
536 ; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
538 ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
540 ; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
542 ; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
543 ; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
545 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
547 ; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
548 ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
549 ; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %add, %r2

  store half %r3, half addrspace(1)* %gep.out
  ret void
}
566 attributes #0 = { nounwind }
567 attributes #1 = { nounwind readnone }