1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
2 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
4 ; IEEE bit enabled for compute kernel, so omod shouldn't be used.
; Attribute group #4 keeps signed zeros significant; the add and the multiply
; by 0.5 are expected to stay two separate instructions.
5 ; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_signed_zeros:
6 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
7 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
8 ; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
9 define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
10 %tid = call i32 @llvm.amdgcn.workitem.id.x()
11 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
12 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
13 %a = load float, float addrspace(1)* %gep0
14 %add = fadd float %a, 1.0
15 %div2 = fmul float %add, 0.5
16 store float %div2, float addrspace(1)* %out.gep
20 ; IEEE bit enabled for compute kernel, so omod shouldn't be used even though nsz is allowed.
; Same body as the signed-zeros kernel variant, but with attribute group #0
; (no-signed-zeros-fp-math = true); a separate multiply is still expected.
21 ; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_nsz:
22 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
23 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
24 ; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
25 define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
26 %tid = call i32 @llvm.amdgcn.workitem.id.x()
27 %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
28 %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
29 %a = load float, float addrspace(1)* %gep0
30 %add = fadd float %a, 1.0
31 %div2 = fmul float %add, 0.5
32 store float %div2, float addrspace(1)* %out.gep
36 ; Only allow without IEEE bit if signed zeros are not significant.
; This shader uses attribute group #4 (signed zeros significant), so the
; multiply by 0.5 is expected to stay a separate instruction.
37 ; GCN-LABEL: {{^}}v_omod_div2_f32_signed_zeros:
38 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
39 ; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
40 define amdgpu_ps void @v_omod_div2_f32_signed_zeros(float %a) #4 {
41 %add = fadd float %a, 1.0
42 %div2 = fmul float %add, 0.5
43 store float %div2, float addrspace(1)* undef
; Shader with attribute group #0 (nsz): the multiply by 0.5 is expected to
; fold into the add as the div:2 output modifier.
47 ; GCN-LABEL: {{^}}v_omod_div2_f32:
48 ; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 div:2{{$}}
49 define amdgpu_ps void @v_omod_div2_f32(float %a) #0 {
50 %add = fadd float %a, 1.0
51 %div2 = fmul float %add, 0.5
52 store float %div2, float addrspace(1)* undef
; The multiply by 2.0 is expected to fold into the add as mul:2.
; Note: the result is named %div2 even though it is a multiply by 2.0.
56 ; GCN-LABEL: {{^}}v_omod_mul2_f32:
57 ; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:2{{$}}
58 define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 {
59 %add = fadd float %a, 1.0
60 %div2 = fmul float %add, 2.0
61 store float %div2, float addrspace(1)* undef
; The multiply by 4.0 is expected to fold into the add as mul:4.
; Note: the result is named %div2 even though it is a multiply by 4.0.
65 ; GCN-LABEL: {{^}}v_omod_mul4_f32:
66 ; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:4{{$}}
67 define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 {
68 %add = fadd float %a, 1.0
69 %div2 = fmul float %add, 4.0
70 store float %div2, float addrspace(1)* undef
; %add has a second use (the volatile store below), so the multiply by 4.0
; is expected to stay a separate instruction instead of becoming mul:4.
74 ; GCN-LABEL: {{^}}v_omod_mul4_multi_use_f32:
75 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
76 ; GCN: v_mul_f32_e32 v{{[0-9]+}}, 4.0, [[ADD]]{{$}}
77 define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {
78 %add = fadd float %a, 1.0
79 %div2 = fmul float %add, 4.0
80 store float %div2, float addrspace(1)* undef
81 store volatile float %add, float addrspace(1)* undef
; The only extra use of %add is the llvm.dbg.value call; the multiply by 4.0
; is still expected to fold into the add as mul:4.
85 ; GCN-LABEL: {{^}}v_omod_mul4_dbg_use_f32:
86 ; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:4{{$}}
87 define amdgpu_ps void @v_omod_mul4_dbg_use_f32(float %a) #0 {
88 %add = fadd float %a, 1.0
89 call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
90 %div2 = fmul float %add, 4.0
91 store float %div2, float addrspace(1)* undef
95 ; Clamp is applied after omod, so folding both into one instruction is OK.
; The maxnum(0.0) / minnum(1.0) pair after the multiply is what the expected
; output shows as the clamp modifier, alongside div:2.
96 ; GCN-LABEL: {{^}}v_clamp_omod_div2_f32:
97 ; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 clamp div:2{{$}}
98 define amdgpu_ps void @v_clamp_omod_div2_f32(float %a) #0 {
99 %add = fadd float %a, 1.0
100 %div2 = fmul float %add, 0.5
102 %max = call float @llvm.maxnum.f32(float %div2, float 0.0)
103 %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
104 store float %clamp, float addrspace(1)* undef
108 ; Cannot fold omod into clamp
; Here the clamp pattern comes first and the multiply by 0.5 follows it, so
; the add is expected to carry only clamp and the multiply stays separate.
109 ; GCN-LABEL: {{^}}v_omod_div2_clamp_f32:
110 ; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], v0, 1.0 clamp{{$}}
111 ; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
112 define amdgpu_ps void @v_omod_div2_clamp_f32(float %a) #0 {
113 %add = fadd float %a, 1.0
114 %max = call float @llvm.maxnum.f32(float %add, float 0.0)
115 %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
116 %div2 = fmul float %clamp, 0.5
117 store float %div2, float addrspace(1)* undef
; The multiply consumes fabs(%add) rather than %add itself; a separate
; multiply with the |...| source modifier is expected, not div:2 on the add.
121 ; GCN-LABEL: {{^}}v_omod_div2_abs_src_f32:
122 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
123 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ADD]]|, 0.5{{$}}
124 define amdgpu_ps void @v_omod_div2_abs_src_f32(float %a) #0 {
125 %add = fadd float %a, 1.0
126 %abs.add = call float @llvm.fabs.f32(float %add)
127 %div2 = fmul float %abs.add, 0.5
128 store float %div2, float addrspace(1)* undef
; %a + %a followed by the clamp pattern: a single add with the clamp modifier
; and no output multiplier is expected.
132 ; GCN-LABEL: {{^}}v_omod_add_self_clamp_f32:
133 ; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, v0 clamp{{$}}
134 define amdgpu_ps void @v_omod_add_self_clamp_f32(float %a) #0 {
135 %add = fadd float %a, %a
136 %max = call float @llvm.maxnum.f32(float %add, float 0.0)
137 %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
138 store float %clamp, float addrspace(1)* undef
; The clamped value is added to itself: a v_max_f32 carrying clamp followed by
; a plain add of the clamped register (no mul:2) is expected.
142 ; GCN-LABEL: {{^}}v_omod_add_clamp_self_f32:
143 ; GCN: v_max_f32_e64 [[CLAMP:v[0-9]+]], v0, v0 clamp{{$}}
144 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[CLAMP]], [[CLAMP]]{{$}}
145 define amdgpu_ps void @v_omod_add_clamp_self_f32(float %a) #0 {
146 %max = call float @llvm.maxnum.f32(float %a, float 0.0)
147 %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
148 %add = fadd float %clamp, %clamp
149 store float %add, float addrspace(1)* undef
; fabs(%x) + fabs(%x): expected to remain an add with |...| on both sources
; rather than becoming mul:2 on the first add.
153 ; GCN-LABEL: {{^}}v_omod_add_abs_self_f32:
154 ; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0
155 ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, |[[X]]|{{$}}
156 define amdgpu_ps void @v_omod_add_abs_self_f32(float %a) #0 {
157 %x = fadd float %a, 1.0
158 %abs.x = call float @llvm.fabs.f32(float %x)
159 %add = fadd float %abs.x, %abs.x
160 store float %add, float addrspace(1)* undef
; fabs(%x) + %x: the two operands differ by the source modifier, so a second
; add with |...| on the first source only is expected.
164 ; GCN-LABEL: {{^}}v_omod_add_abs_x_x_f32:
166 ; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0
167 ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, [[X]]{{$}}
168 define amdgpu_ps void @v_omod_add_abs_x_x_f32(float %a) #0 {
169 %x = fadd float %a, 1.0
170 %abs.x = call float @llvm.fabs.f32(float %x)
171 %add = fadd float %abs.x, %x
172 store float %add, float addrspace(1)* undef
; %x + fabs(%x): same as the fabs(%x) + %x case with the operands swapped; a
; second add with |...| on the second source only is expected.
176 ; GCN-LABEL: {{^}}v_omod_add_x_abs_x_f32:
177 ; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0
178 ; GCN: v_add_f32_e64 v{{[0-9]+}}, [[X]], |[[X]]|{{$}}
179 define amdgpu_ps void @v_omod_add_x_abs_x_f32(float %a) #0 {
180 %x = fadd float %a, 1.0
181 %abs.x = call float @llvm.fabs.f32(float %x)
182 %add = fadd float %x, %abs.x
183 store float %add, float addrspace(1)* undef
187 ; Don't fold omod into another omod.
; Only the first multiply by 0.5 is expected to fold (div:2 on the add); the
; second one is expected to stay a separate multiply.
188 ; GCN-LABEL: {{^}}v_omod_div2_omod_div2_f32:
189 ; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], v0, 1.0 div:2{{$}}
190 ; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
191 define amdgpu_ps void @v_omod_div2_omod_div2_f32(float %a) #0 {
192 %add = fadd float %a, 1.0
193 %div2.0 = fmul float %add, 0.5
194 %div2.1 = fmul float %div2.0, 0.5
195 store float %div2.1, float addrspace(1)* undef
199 ; Don't fold omod if denorms enabled
; Attribute group #2 adds +fp32-denormals to the target features.
200 ; GCN-LABEL: {{^}}v_omod_div2_f32_denormals:
201 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
202 ; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
203 define amdgpu_ps void @v_omod_div2_f32_denormals(float %a) #2 {
204 %add = fadd float %a, 1.0
205 %div2 = fmul float %add, 0.5
206 store float %div2, float addrspace(1)* undef
210 ; Don't fold omod if denorms enabled for add form.
; The doubling is written as %add + %add (not a multiply by 2.0); with
; attribute group #2 it is expected to stay a second add.
211 ; GCN-LABEL: {{^}}v_omod_mul2_f32_denormals:
212 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
213 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[ADD]], [[ADD]]{{$}}
214 define amdgpu_ps void @v_omod_mul2_f32_denormals(float %a) #2 {
215 %add = fadd float %a, 1.0
216 %mul2 = fadd float %add, %add
217 store float %mul2, float addrspace(1)* undef
221 ; Don't fold omod if denorms enabled
; Half-precision variant. Only the -mcpu=fiji run has instruction checks here;
; the default-target run checks the label alone.
222 ; GCN-LABEL: {{^}}v_omod_div2_f16_denormals:
223 ; VI: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
224 ; VI: v_mul_f16_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
225 define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 {
226 %add = fadd half %a, 1.0
227 %div2 = fmul half %add, 0.5
228 store half %div2, half addrspace(1)* undef
232 ; Don't fold omod if denorms enabled for add form.
; Half-precision variant with the doubling written as %add + %add. Only the
; -mcpu=fiji run has instruction checks here.
233 ; GCN-LABEL: {{^}}v_omod_mul2_f16_denormals:
234 ; VI: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
235 ; VI: v_add_f16_e32 v{{[0-9]+}}, [[ADD]], [[ADD]]{{$}}
236 define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 {
237 %add = fadd half %a, 1.0
238 %mul2 = fadd half %add, %add
239 store half %mul2, half addrspace(1)* undef
; Attribute group #3 turns off fp64/fp16 denormals, so the multiply by 0.5 is
; expected to fold into the half-precision add as div:2 (fiji run only).
243 ; GCN-LABEL: {{^}}v_omod_div2_f16_no_denormals:
245 ; VI: v_add_f16_e64 [[ADD:v[0-9]+]], v0, 1.0 div:2{{$}}
246 define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 {
247 %add = fadd half %a, 1.0
248 %div2 = fmul half %add, 0.5
249 store half %div2, half addrspace(1)* undef
; (%a * %a + %b) * 2.0: the multiply by 2.0 is expected to fold as mul:2 on a
; three-source v_mad_f32. Every register operand is matched with [0-9]+ so the
; check does not depend on single-digit register numbers.
253 ; GCN-LABEL: {{^}}v_omod_mac_to_mad:
254 ; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} mul:2{{$}}
255 define amdgpu_ps void @v_omod_mac_to_mad(float %b, float %a) #0 {
256 %mul = fmul float %a, %a
257 %add = fadd float %mul, %b
258 %mad = fmul float %add, 2.0
259 %res = fmul float %mad, %b
260 store float %res, float addrspace(1)* undef
264 declare i32 @llvm.amdgcn.workitem.id.x() #1
265 declare float @llvm.fabs.f32(float) #1
266 declare float @llvm.floor.f32(float) #1
267 declare float @llvm.minnum.f32(float, float) #1
268 declare float @llvm.maxnum.f32(float, float) #1
269 declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
270 declare double @llvm.fabs.f64(double) #1
271 declare double @llvm.minnum.f64(double, double) #1
272 declare double @llvm.maxnum.f64(double, double) #1
273 declare half @llvm.fabs.f16(half) #1
274 declare half @llvm.minnum.f16(half, half) #1
275 declare half @llvm.maxnum.f16(half, half) #1
276 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
; Attribute groups used by the tests in this file:
;   #0 - signed zeros may be ignored (nsz).
;   #1 - nounwind readnone, used on the intrinsic declarations.
;   #2 - nsz plus +fp32-denormals.
;   #3 - nsz plus -fp64-fp16-denormals.
;   #4 - signed zeros are significant.
278 attributes #0 = { nounwind "no-signed-zeros-fp-math"="true" }
279 attributes #1 = { nounwind readnone }
280 attributes #2 = { nounwind "target-features"="+fp32-denormals" "no-signed-zeros-fp-math"="true" }
281 attributes #3 = { nounwind "target-features"="-fp64-fp16-denormals" "no-signed-zeros-fp-math"="true" }
282 attributes #4 = { nounwind "no-signed-zeros-fp-math"="false" }
285 !llvm.module.flags = !{!2, !3}
287 !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
288 !1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
289 !2 = !{i32 2, !"Dwarf Version", i32 4}
290 !3 = !{i32 2, !"Debug Info Version", i32 3}
291 !4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)
292 !5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
293 !6 = !DISubroutineType(types: !7)
295 !8 = !DIBasicType(name: "float", size: 32, align: 32)
297 !10 = !DILocation(line: 1, column: 42, scope: !5)