; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s

; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s

; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GFX10-FLUSH,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GFX10-FLUSH,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s
; Intrinsics used by the kernels below.
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fmuladd.f16(half, half, half) #1
declare half @llvm.fabs.f16(half) #1
; Plain fmuladd intrinsic on three loaded f16 values.
; GCN-LABEL: {{^}}fmuladd_f16:
; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX10-FLUSH: v_mul_f16_e32
; GFX10-FLUSH: v_add_f16_e32
; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                                       half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %r3 = tail call half @llvm.fmuladd.f16(half %r0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %out
  ret void
}
; fmuladd with inline-immediate 2.0 as the first multiplicand.
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM: v_fmac_f16_e32 [[R2:v[0-9]+]], 2.0, [[R1]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
; fmuladd with inline-immediate 2.0 as the second multiplicand (commuted form).
; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
; (a + a) + b may be contracted to fma only with -fp-contract=fast;
; strict denormal mode must keep the two separate adds.
; GCN-LABEL: {{^}}fadd_a_a_b_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]

; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
                                          half addrspace(1)* %in1,
                                          half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %add.0, %r1
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}
; b + (a + a) — same contraction rules as fadd_a_a_b_f16, commuted outer add.
; GCN-LABEL: {{^}}fadd_b_a_a_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]

; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
                                          half addrspace(1)* %in1,
                                          half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %r1, %add.0
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}
; fmuladd with inline-immediate -2.0 multiplicand.
; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
; -2.0 * (-a) + b — the two negations cancel into a positive 2.0 immediate.
; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r1.fneg = fsub half -0.000000e+00, %r1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
; 2.0 * (-a) + b — the fneg folds into the immediate, giving -2.0 * a + b.
; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r1.fneg = fsub half -0.000000e+00, %r1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
; 2.0 * a + (-b) — the fneg on the addend folds into the mad/fma source modifier.
; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; GCN-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r2.fneg = fsub half -0.000000e+00, %r2

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
; a * b - c — contracted to mad/fma with a negated addend when allowed.
; GCN-LABEL: {{^}}mad_sub_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
; c - a * b — contracted with a negated multiplicand when allowed.
; GCN-LABEL: {{^}}mad_sub_inv_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e64 [[REGC]], -[[REGA]], [[REGB]]

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %c, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
; a * b - |c| — fabs on the subtrahend folds into a -| | source modifier.
; GCN-LABEL: {{^}}mad_sub_fabs_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c.abs
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
; |c| - a * b — fabs folds into the addend, fneg into the multiplicand.
; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %c.abs, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
; (-a) * (-b) + c — the two fnegs cancel, leaving a plain mad/fma.
; GCN-LABEL: {{^}}neg_neg_mad_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[REGC]], [[REGA]], [[REGB]]

; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %nega = fsub half -0.000000e+00, %a
  %negb = fsub half -0.000000e+00, %b
  %mul = fmul half %nega, %negb
  %sub = fadd half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
; a * |b| - c — fabs on a multiplicand folds into a | | source modifier.
; GCN-LABEL: {{^}}mad_fabs_sub_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; GCN-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %b.abs = call half @llvm.fabs.f16(half %b) #0
  %mul = fmul half %a, %b.abs
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
; c - (a + a) — contractible to c + (-2.0 * a) only under -fp-contract=fast.
; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]

; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %r2, %add

  store half %r3, half addrspace(1)* %gep.out
  ret void
}
; (a + a) - c — contractible to 2.0 * a + (-c) only under -fp-contract=fast.
; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16:
; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %add, %r2

  store half %r3, half addrspace(1)* %gep.out
  ret void
}
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }