; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI-FLUSH,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI-FLUSH,VI %s

; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-STRICT,VI-DENORM,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s

; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s
; Intrinsic declarations used throughout this test.
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fmuladd.f16(half, half, half) #1
declare half @llvm.fabs.f16(half) #1
; GCN-LABEL: {{^}}fmuladd_f16:
; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX10-FLUSH: v_mul_f16_e32
; GFX10-FLUSH: v_add_f16_e32
; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                                       half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %r3 = tail call half @llvm.fmuladd.f16(half %r0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}fmul_fadd_f16:
; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; VI-DENORM-CONTRACT: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX10-FLUSH: v_mul_f16_e32
; GFX10-FLUSH: v_add_f16_e32
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmul_fadd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                                         half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %mul = fmul half %r0, %r1
  %add = fadd half %mul, %r2
  store half %add, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}fmul_fadd_contract_f16:
; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX10-FLUSH: v_mul_f16_e32
; GFX10-FLUSH: v_add_f16_e32
; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmul_fadd_contract_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                                                  half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %mul = fmul contract half %r0, %r1
  %add = fadd contract half %mul, %r2
  store half %add, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM: v_fmac_f16_e32 [[R2:v[0-9]+]], 2.0, [[R1]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
; GCN-LABEL: {{^}}fadd_a_a_b_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]

; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]]
define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
                                          half addrspace(1)* %in1,
                                          half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %add.0, %r1
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}
; GCN-LABEL: {{^}}fadd_b_a_a_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]

; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]]
define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
                                          half addrspace(1)* %in1,
                                          half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %r1, %add.0
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}
; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]

; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r1.fneg = fneg half %r1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]

; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r1.fneg = fneg half %r1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r2.fneg = fneg half %r2

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
; GCN-LABEL: {{^}}mad_sub_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
; GCN-LABEL: {{^}}mad_sub_inv_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e64 [[REGC]], -[[REGA]], [[REGB]]

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[REGC]]
define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %c, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
; GCN-LABEL: {{^}}mad_sub_fabs_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c.abs
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %c.abs, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
; GCN-LABEL: {{^}}neg_neg_mad_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[REGC]], [[REGA]], [[REGB]]

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[REGC]]
define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  ; (-a) * (-b) + c folds to a * b + c; the negations must be defined
  ; before being used by %mul below.
  %nega = fneg half %a
  %negb = fneg half %b
  %mul = fmul half %nega, %negb
  %sub = fadd half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
; GCN-LABEL: {{^}}mad_fabs_sub_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %b.abs = call half @llvm.fabs.f16(half %b) #0
  %mul = fmul half %a, %b.abs
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]

; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %r2, %add

  store half %r3, half addrspace(1)* %gep.out
  ret void
}
; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %add, %r2

  store half %r3, half addrspace(1)* %gep.out
  ret void
}
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }