; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s

; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s
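
; Check f16 fmuladd selection on VI. With f16 denormals flushed the
; multiply-add may select to v_mac_f16/v_mad_f16; with denormals enabled it
; must use v_fma_f16, and a separate fmul+fadd pair is only contracted into
; an fma under -fp-contract=fast (the GCN-STRICT runs keep it unfused).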

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fmuladd.f16(half, half, half) #1
declare half @llvm.fabs.f16(half) #1

; GCN-LABEL: {{^}}fmuladd_f16:
; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                                       half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %r3 = tail call half @llvm.fmuladd.f16(half %r0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %out
  ret void
}
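
; fmuladd(2.0, a, b): the constant can be encoded as an inline immediate.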
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
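
; Same as above with the multiply operands commuted: fmuladd(a, 2.0, b).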
; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
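
; a + a + b can fold to a * 2.0 + b; with denormals enabled the fma is only
; formed under -fp-contract=fast, otherwise two v_add_f16 remain.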
; GCN-LABEL: {{^}}fadd_a_a_b_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
                                          half addrspace(1)* %in1,
                                          half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %add.0, %r1
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}
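
; Same fold with the outer add commuted: b + (a + a).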
; GCN-LABEL: {{^}}fadd_b_a_a_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
                                          half addrspace(1)* %in1,
                                          half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %r1, %add.0
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}
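
; fmuladd(-2.0, a, b): a negative inline immediate.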
; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-DENORM: v_fma_f16 [[R2:v[0-9]+]], [[R1]], -2.0, [[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
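
; fmuladd(-2.0, -a, b): the two negations cancel, giving 2.0 * a + b.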
; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r1.fneg = fsub half -0.000000e+00, %r1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
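
; fmuladd(2.0, -a, b): the fneg folds into the immediate as -2.0 * a + b.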
; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r1.fneg = fsub half -0.000000e+00, %r1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
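
; fmuladd(2.0, a, -b): the fneg on the addend folds as a source modifier.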
; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r2.fneg = fsub half -0.000000e+00, %r2

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
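
; a * b - c contracts to a multiply-add with a negated addend.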
; GCN-LABEL: {{^}}mad_sub_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
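
; c - a * b negates the product instead of the addend.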
; GCN-LABEL: {{^}}mad_sub_inv_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %c, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
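
; a * b - |c|: fabs and fneg both fold as source modifiers on the addend.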
; GCN-LABEL: {{^}}mad_sub_fabs_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c.abs
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
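
; |c| - a * b: selected as -a * b + |c|.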
; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %c.abs, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
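
; (-a) * (-b) + c: the negations cancel, leaving a plain multiply-add.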
; GCN-LABEL: {{^}}neg_neg_mad_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]

; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %nega = fsub half -0.000000e+00, %a
  %negb = fsub half -0.000000e+00, %b
  %mul = fmul half %nega, %negb
  %sub = fadd half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
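
; a * |b| - c: the fabs folds as a source modifier on a multiplicand.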
; GCN-LABEL: {{^}}mad_fabs_sub_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; VI-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %b.abs = call half @llvm.fabs.f16(half %b) #0
  %mul = fmul half %a, %b.abs
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
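
; c - (a + a) folds to a * -2.0 + c; the strict denormal run keeps the
; separate add and sub.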
; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %r2, %add

  store half %r3, half addrspace(1)* %gep.out
  ret void
}
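
; (a + a) - c folds to a * 2.0 - c when contraction is allowed.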
; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %add, %r2

  store half %r3, half addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }