1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
3 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefixes=VI %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
6 ; IEEE bit enabled for compute kernel, so omod shouldn't be used (the 0.5 multiply stays a separate v_mul_f32).
7 define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 {
8 ; SI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
10 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
11 ; SI-NEXT: s_mov_b32 s7, 0xf000
12 ; SI-NEXT: s_mov_b32 s6, 0
13 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
14 ; SI-NEXT: v_mov_b32_e32 v1, 0
15 ; SI-NEXT: s_waitcnt lgkmcnt(0)
16 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
17 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
18 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
19 ; SI-NEXT: s_waitcnt vmcnt(0)
20 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2
21 ; SI-NEXT: v_mul_f32_e32 v2, 0.5, v2
22 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
25 ; VI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
27 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
28 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
29 ; VI-NEXT: s_waitcnt lgkmcnt(0)
30 ; VI-NEXT: v_mov_b32_e32 v1, s3
31 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
32 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
33 ; VI-NEXT: flat_load_dword v3, v[0:1]
34 ; VI-NEXT: v_mov_b32_e32 v1, s1
35 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
36 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
37 ; VI-NEXT: s_waitcnt vmcnt(0)
38 ; VI-NEXT: v_add_f32_e32 v2, 1.0, v3
39 ; VI-NEXT: v_mul_f32_e32 v2, 0.5, v2
40 ; VI-NEXT: flat_store_dword v[0:1], v2
43 ; GFX11-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
45 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
46 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
47 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
48 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
49 ; GFX11-NEXT: s_waitcnt vmcnt(0)
50 ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
51 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
52 ; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1
53 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
54 ; GFX11-NEXT: s_nop 0
55 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
56 ; GFX11-NEXT: s_endpgm
57 %tid = call i32 @llvm.amdgcn.workitem.id.x()
58 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
59 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
60 %a = load float, ptr addrspace(1) %gep0
61 %add = fadd float %a, 1.0
62 %div2 = fmul float %add, 0.5
63 store float %div2, ptr addrspace(1) %out.gep
67 ; IEEE bit enabled for compute kernel, so omod shouldn't be used (f64 case: the 0.5 multiply stays a separate v_mul_f64).
68 define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 {
69 ; SI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
71 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
72 ; SI-NEXT: s_mov_b32 s7, 0xf000
73 ; SI-NEXT: s_mov_b32 s6, 0
74 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
75 ; SI-NEXT: v_mov_b32_e32 v1, 0
76 ; SI-NEXT: s_waitcnt lgkmcnt(0)
77 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
78 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
79 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
80 ; SI-NEXT: s_waitcnt vmcnt(0)
81 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
82 ; SI-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
83 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
86 ; VI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
88 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
89 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
90 ; VI-NEXT: s_waitcnt lgkmcnt(0)
91 ; VI-NEXT: v_mov_b32_e32 v1, s3
92 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
93 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
94 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
95 ; VI-NEXT: v_mov_b32_e32 v3, s1
96 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
97 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
98 ; VI-NEXT: s_waitcnt vmcnt(0)
99 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
100 ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
101 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
104 ; GFX11-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
106 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
107 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
108 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
109 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
110 ; GFX11-NEXT: s_waitcnt vmcnt(0)
111 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
112 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
113 ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
114 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
115 ; GFX11-NEXT: s_nop 0
116 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
117 ; GFX11-NEXT: s_endpgm
118 %tid = call i32 @llvm.amdgcn.workitem.id.x()
119 %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
120 %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
121 %a = load double, ptr addrspace(1) %gep0
122 %add = fadd double %a, 1.0
123 %div2 = fmul double %add, 0.5
124 store double %div2, ptr addrspace(1) %out.gep
128 ; IEEE bit enabled for compute kernel, so omod shouldn't be used even though nsz is allowed.
129 define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
130 ; SI-LABEL: v_omod_div2_f32_enable_ieee_nsz:
132 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
133 ; SI-NEXT: s_mov_b32 s7, 0xf000
134 ; SI-NEXT: s_mov_b32 s6, 0
135 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
136 ; SI-NEXT: v_mov_b32_e32 v1, 0
137 ; SI-NEXT: s_waitcnt lgkmcnt(0)
138 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
139 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
140 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
141 ; SI-NEXT: s_waitcnt vmcnt(0)
142 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2
143 ; SI-NEXT: v_mul_f32_e32 v2, 0.5, v2
144 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
147 ; VI-LABEL: v_omod_div2_f32_enable_ieee_nsz:
149 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
150 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
151 ; VI-NEXT: s_waitcnt lgkmcnt(0)
152 ; VI-NEXT: v_mov_b32_e32 v1, s3
153 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
154 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
155 ; VI-NEXT: flat_load_dword v3, v[0:1]
156 ; VI-NEXT: v_mov_b32_e32 v1, s1
157 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
158 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
159 ; VI-NEXT: s_waitcnt vmcnt(0)
160 ; VI-NEXT: v_add_f32_e32 v2, 1.0, v3
161 ; VI-NEXT: v_mul_f32_e32 v2, 0.5, v2
162 ; VI-NEXT: flat_store_dword v[0:1], v2
165 ; GFX11-LABEL: v_omod_div2_f32_enable_ieee_nsz:
167 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
168 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
169 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
170 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
171 ; GFX11-NEXT: s_waitcnt vmcnt(0)
172 ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
173 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
174 ; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1
175 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
176 ; GFX11-NEXT: s_nop 0
177 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
178 ; GFX11-NEXT: s_endpgm
179 %tid = call i32 @llvm.amdgcn.workitem.id.x()
180 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
181 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
182 %a = load float, ptr addrspace(1) %gep0
183 %add = fadd float %a, 1.0
184 %div2 = fmul float %add, 0.5
185 store float %div2, ptr addrspace(1) %out.gep
189 ; IEEE bit enabled for compute kernel, so omod shouldn't be used even though nsz is allowed (f64 case).
190 define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #5 {
191 ; SI-LABEL: v_omod_div2_f64_enable_ieee_nsz:
193 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
194 ; SI-NEXT: s_mov_b32 s7, 0xf000
195 ; SI-NEXT: s_mov_b32 s6, 0
196 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
197 ; SI-NEXT: v_mov_b32_e32 v1, 0
198 ; SI-NEXT: s_waitcnt lgkmcnt(0)
199 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
200 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
201 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
202 ; SI-NEXT: s_waitcnt vmcnt(0)
203 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
204 ; SI-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
205 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
208 ; VI-LABEL: v_omod_div2_f64_enable_ieee_nsz:
210 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
211 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
212 ; VI-NEXT: s_waitcnt lgkmcnt(0)
213 ; VI-NEXT: v_mov_b32_e32 v1, s3
214 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
215 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
216 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
217 ; VI-NEXT: v_mov_b32_e32 v3, s1
218 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
219 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
220 ; VI-NEXT: s_waitcnt vmcnt(0)
221 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
222 ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
223 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
226 ; GFX11-LABEL: v_omod_div2_f64_enable_ieee_nsz:
228 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
229 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
230 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
231 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
232 ; GFX11-NEXT: s_waitcnt vmcnt(0)
233 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
234 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
235 ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
236 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
237 ; GFX11-NEXT: s_nop 0
238 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
239 ; GFX11-NEXT: s_endpgm
240 %tid = call i32 @llvm.amdgcn.workitem.id.x()
241 %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
242 %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
243 %a = load double, ptr addrspace(1) %gep0
244 %add = fadd double %a, 1.0
245 %div2 = fmul double %add, 0.5
246 store double %div2, ptr addrspace(1) %out.gep
250 ; Without the IEEE bit, omod is only allowed if signed zeros are not significant. Presumably they are significant here (attribute #4 is defined outside this view), so the 0.5 multiply is expected to stay separate.
251 define amdgpu_ps void @v_omod_div2_f32_signed_zeros(float %a) #4 {
252 ; SI-LABEL: v_omod_div2_f32_signed_zeros:
254 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
255 ; SI-NEXT: v_mul_f32_e32 v0, 0.5, v0
256 ; SI-NEXT: s_mov_b32 s3, 0xf000
257 ; SI-NEXT: s_mov_b32 s2, -1
258 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
261 ; VI-LABEL: v_omod_div2_f32_signed_zeros:
263 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
264 ; VI-NEXT: v_mul_f32_e32 v0, 0.5, v0
265 ; VI-NEXT: flat_store_dword v[0:1], v0
268 ; GFX11-LABEL: v_omod_div2_f32_signed_zeros:
270 ; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
271 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
272 ; GFX11-NEXT: v_mul_f32_e32 v0, 0.5, v0
273 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
274 ; GFX11-NEXT: s_nop 0
275 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
276 ; GFX11-NEXT: s_endpgm
277 %add = fadd float %a, 1.0
278 %div2 = fmul float %add, 0.5
279 store float %div2, ptr addrspace(1) undef
283 ; Without the IEEE bit, omod is only allowed if signed zeros are not significant. Presumably they are significant here (attribute #4 is defined outside this view), so the 0.5 multiply is expected to stay a separate v_mul_f64.
284 define amdgpu_ps void @v_omod_div2_f64_signed_zeros(double %a) #4 {
285 ; SI-LABEL: v_omod_div2_f64_signed_zeros:
287 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
288 ; SI-NEXT: s_mov_b32 s3, 0xf000
289 ; SI-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
290 ; SI-NEXT: s_mov_b32 s2, -1
291 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
294 ; VI-LABEL: v_omod_div2_f64_signed_zeros:
296 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
297 ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
298 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
301 ; GFX11-LABEL: v_omod_div2_f64_signed_zeros:
303 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
304 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
305 ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
306 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
307 ; GFX11-NEXT: s_nop 0
308 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
309 ; GFX11-NEXT: s_endpgm
310 %add = fadd double %a, 1.0
311 %div2 = fmul double %add, 0.5
312 store double %div2, ptr addrspace(1) undef
; fadd followed by fmul by 0.5: expected to fold into one v_add_f32 with the div:2 output modifier.
316 define amdgpu_ps void @v_omod_div2_f32(float %a) #0 {
317 ; SI-LABEL: v_omod_div2_f32:
319 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2
320 ; SI-NEXT: s_mov_b32 s3, 0xf000
321 ; SI-NEXT: s_mov_b32 s2, -1
322 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
325 ; VI-LABEL: v_omod_div2_f32:
327 ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2
328 ; VI-NEXT: flat_store_dword v[0:1], v0
331 ; GFX11-LABEL: v_omod_div2_f32:
333 ; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2
334 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
335 ; GFX11-NEXT: s_nop 0
336 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
337 ; GFX11-NEXT: s_endpgm
338 %add = fadd float %a, 1.0
339 %div2 = fmul float %add, 0.5
340 store float %div2, ptr addrspace(1) undef
; f64 variant with nsz on both instructions: expected to fold into one v_add_f64 with the div:2 output modifier.
344 define amdgpu_ps void @v_omod_div2_f64(double %a) #5 {
345 ; SI-LABEL: v_omod_div2_f64:
347 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 div:2
348 ; SI-NEXT: s_mov_b32 s3, 0xf000
349 ; SI-NEXT: s_mov_b32 s2, -1
350 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
353 ; VI-LABEL: v_omod_div2_f64:
355 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 div:2
356 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
359 ; GFX11-LABEL: v_omod_div2_f64:
361 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 div:2
362 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
363 ; GFX11-NEXT: s_nop 0
364 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
365 ; GFX11-NEXT: s_endpgm
366 %add = fadd nsz double %a, 1.0
367 %div2 = fmul nsz double %add, 0.5
368 store double %div2, ptr addrspace(1) undef
; fadd followed by fmul by 2.0: expected to fold into one v_add_f32 with the mul:2 output modifier.
372 define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 {
373 ; SI-LABEL: v_omod_mul2_f32:
375 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:2
376 ; SI-NEXT: s_mov_b32 s3, 0xf000
377 ; SI-NEXT: s_mov_b32 s2, -1
378 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
381 ; VI-LABEL: v_omod_mul2_f32:
383 ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:2
384 ; VI-NEXT: flat_store_dword v[0:1], v0
387 ; GFX11-LABEL: v_omod_mul2_f32:
389 ; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:2
390 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
391 ; GFX11-NEXT: s_nop 0
392 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
393 ; GFX11-NEXT: s_endpgm
394 %add = fadd float %a, 1.0
395 %div2 = fmul float %add, 2.0
396 store float %div2, ptr addrspace(1) undef
; fmed3 intrinsic result multiplied by 2.0: expected to fold into v_med3_f32 with the mul:2 output modifier.
400 define amdgpu_ps void @v_omod_mul2_med3(float %x, float %y, float %z) #0 {
401 ; SI-LABEL: v_omod_mul2_med3:
403 ; SI-NEXT: v_med3_f32 v0, v0, v1, v2 mul:2
404 ; SI-NEXT: s_mov_b32 s3, 0xf000
405 ; SI-NEXT: s_mov_b32 s2, -1
406 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
409 ; VI-LABEL: v_omod_mul2_med3:
411 ; VI-NEXT: v_med3_f32 v0, v0, v1, v2 mul:2
412 ; VI-NEXT: flat_store_dword v[0:1], v0
415 ; GFX11-LABEL: v_omod_mul2_med3:
417 ; GFX11-NEXT: v_med3_f32 v0, v0, v1, v2 mul:2
418 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
419 ; GFX11-NEXT: s_nop 0
420 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
421 ; GFX11-NEXT: s_endpgm
422 %fmed3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float %z)
423 %div2 = fmul float %fmed3, 2.0
424 store float %div2, ptr addrspace(1) undef
; f64 variant with nsz on both instructions: expected to fold into one v_add_f64 with the mul:2 output modifier.
428 define amdgpu_ps void @v_omod_mul2_f64(double %a) #5 {
429 ; SI-LABEL: v_omod_mul2_f64:
431 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:2
432 ; SI-NEXT: s_mov_b32 s3, 0xf000
433 ; SI-NEXT: s_mov_b32 s2, -1
434 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
437 ; VI-LABEL: v_omod_mul2_f64:
439 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:2
440 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
443 ; GFX11-LABEL: v_omod_mul2_f64:
445 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:2
446 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
447 ; GFX11-NEXT: s_nop 0
448 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
449 ; GFX11-NEXT: s_endpgm
450 %add = fadd nsz double %a, 1.0
451 %div2 = fmul nsz double %add, 2.0
452 store double %div2, ptr addrspace(1) undef
; fadd followed by fmul by 4.0: expected to fold into one v_add_f32 with the mul:4 output modifier.
456 define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 {
457 ; SI-LABEL: v_omod_mul4_f32:
459 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4
460 ; SI-NEXT: s_mov_b32 s3, 0xf000
461 ; SI-NEXT: s_mov_b32 s2, -1
462 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
465 ; VI-LABEL: v_omod_mul4_f32:
467 ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4
468 ; VI-NEXT: flat_store_dword v[0:1], v0
471 ; GFX11-LABEL: v_omod_mul4_f32:
473 ; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4
474 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
475 ; GFX11-NEXT: s_nop 0
476 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
477 ; GFX11-NEXT: s_endpgm
478 %add = fadd float %a, 1.0
479 %div2 = fmul float %add, 4.0
480 store float %div2, ptr addrspace(1) undef
; f64 variant with nsz on both instructions: expected to fold into one v_add_f64 with the mul:4 output modifier.
484 define amdgpu_ps void @v_omod_mul4_f64(double %a) #5 {
485 ; SI-LABEL: v_omod_mul4_f64:
487 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:4
488 ; SI-NEXT: s_mov_b32 s3, 0xf000
489 ; SI-NEXT: s_mov_b32 s2, -1
490 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
493 ; VI-LABEL: v_omod_mul4_f64:
495 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:4
496 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
499 ; GFX11-LABEL: v_omod_mul4_f64:
501 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:4
502 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
503 ; GFX11-NEXT: s_nop 0
504 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
505 ; GFX11-NEXT: s_endpgm
506 %add = fadd nsz double %a, 1.0
507 %div2 = fmul nsz double %add, 4.0
508 store double %div2, ptr addrspace(1) undef
; The fadd result has a second use (a volatile store), so the multiply by 4.0 is expected to stay a separate v_mul_f32.
512 define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {
513 ; SI-LABEL: v_omod_mul4_multi_use_f32:
515 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
516 ; SI-NEXT: v_mul_f32_e32 v1, 4.0, v0
517 ; SI-NEXT: s_mov_b32 s3, 0xf000
518 ; SI-NEXT: s_mov_b32 s2, -1
519 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
520 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
521 ; SI-NEXT: s_waitcnt vmcnt(0)
524 ; VI-LABEL: v_omod_mul4_multi_use_f32:
526 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
527 ; VI-NEXT: v_mul_f32_e32 v1, 4.0, v0
528 ; VI-NEXT: flat_store_dword v[0:1], v1
529 ; VI-NEXT: flat_store_dword v[0:1], v0
530 ; VI-NEXT: s_waitcnt vmcnt(0)
533 ; GFX11-LABEL: v_omod_mul4_multi_use_f32:
535 ; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
536 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
537 ; GFX11-NEXT: v_mul_f32_e32 v1, 4.0, v0
538 ; GFX11-NEXT: s_clause 0x1
539 ; GFX11-NEXT: global_store_b32 v[0:1], v1, off
540 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
541 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
542 ; GFX11-NEXT: s_nop 0
543 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
544 ; GFX11-NEXT: s_endpgm
545 %add = fadd float %a, 1.0
546 %div2 = fmul float %add, 4.0
547 store float %div2, ptr addrspace(1) undef
548 store volatile float %add, ptr addrspace(1) undef
; A llvm.dbg.value use of the fadd result is expected not to block the mul:4 fold.
552 define amdgpu_ps void @v_omod_mul4_dbg_use_f32(float %a) #0 {
553 ; SI-LABEL: v_omod_mul4_dbg_use_f32:
555 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4
556 ; SI-NEXT: s_mov_b32 s3, 0xf000
557 ; SI-NEXT: s_mov_b32 s2, -1
558 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
561 ; VI-LABEL: v_omod_mul4_dbg_use_f32:
563 ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4
564 ; VI-NEXT: flat_store_dword v[0:1], v0
567 ; GFX11-LABEL: v_omod_mul4_dbg_use_f32:
569 ; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4
570 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
571 ; GFX11-NEXT: s_nop 0
572 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
573 ; GFX11-NEXT: s_endpgm
574 %add = fadd float %a, 1.0
575 call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
576 %div2 = fmul float %add, 4.0
577 store float %div2, ptr addrspace(1) undef
581 ; Clamp is applied after omod, so folding both into one instruction is OK (single v_add_f32 with clamp and div:2).
582 define amdgpu_ps void @v_clamp_omod_div2_f32(float %a) #0 {
583 ; SI-LABEL: v_clamp_omod_div2_f32:
585 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp div:2
586 ; SI-NEXT: s_mov_b32 s3, 0xf000
587 ; SI-NEXT: s_mov_b32 s2, -1
588 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
591 ; VI-LABEL: v_clamp_omod_div2_f32:
593 ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp div:2
594 ; VI-NEXT: flat_store_dword v[0:1], v0
597 ; GFX11-LABEL: v_clamp_omod_div2_f32:
599 ; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp div:2
600 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
601 ; GFX11-NEXT: s_nop 0
602 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
603 ; GFX11-NEXT: s_endpgm
604 %add = fadd float %a, 1.0
605 %div2 = fmul float %add, 0.5
607 %max = call float @llvm.maxnum.f32(float %div2, float 0.0)
608 %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
609 store float %clamp, ptr addrspace(1) undef
613 ; Cannot fold omod into clamp: the clamp folds into the add, and the 0.5 multiply that follows stays a separate v_mul_f32.
614 define amdgpu_ps void @v_omod_div2_clamp_f32(float %a) #0 {
615 ; SI-LABEL: v_omod_div2_clamp_f32:
617 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp
618 ; SI-NEXT: v_mul_f32_e32 v0, 0.5, v0
619 ; SI-NEXT: s_mov_b32 s3, 0xf000
620 ; SI-NEXT: s_mov_b32 s2, -1
621 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
624 ; VI-LABEL: v_omod_div2_clamp_f32:
626 ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp
627 ; VI-NEXT: v_mul_f32_e32 v0, 0.5, v0
628 ; VI-NEXT: flat_store_dword v[0:1], v0
631 ; GFX11-LABEL: v_omod_div2_clamp_f32:
633 ; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp
634 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
635 ; GFX11-NEXT: v_mul_f32_e32 v0, 0.5, v0
636 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
637 ; GFX11-NEXT: s_nop 0
638 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
639 ; GFX11-NEXT: s_endpgm
640 %add = fadd float %a, 1.0
641 %max = call float @llvm.maxnum.f32(float %add, float 0.0)
642 %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
643 %div2 = fmul float %clamp, 0.5
644 store float %div2, ptr addrspace(1) undef
; fabs sits between the fadd and the fmul, so no omod fold is expected; the fabs becomes a source modifier on v_mul_f32.
648 define amdgpu_ps void @v_omod_div2_abs_src_f32(float %a) #0 {
649 ; SI-LABEL: v_omod_div2_abs_src_f32:
651 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
652 ; SI-NEXT: v_mul_f32_e64 v0, |v0|, 0.5
653 ; SI-NEXT: s_mov_b32 s3, 0xf000
654 ; SI-NEXT: s_mov_b32 s2, -1
655 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
658 ; VI-LABEL: v_omod_div2_abs_src_f32:
660 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
661 ; VI-NEXT: v_mul_f32_e64 v0, |v0|, 0.5
662 ; VI-NEXT: flat_store_dword v[0:1], v0
665 ; GFX11-LABEL: v_omod_div2_abs_src_f32:
667 ; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
668 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
669 ; GFX11-NEXT: v_mul_f32_e64 v0, |v0|, 0.5
670 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
671 ; GFX11-NEXT: s_nop 0
672 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
673 ; GFX11-NEXT: s_endpgm
674 %add = fadd float %a, 1.0
675 %abs.add = call float @llvm.fabs.f32(float %add)
676 %div2 = fmul float %abs.add, 0.5
677 store float %div2, ptr addrspace(1) undef
; a + a followed by a clamp: expected to become a single v_add_f32 with the clamp modifier and no omod.
681 define amdgpu_ps void @v_omod_add_self_clamp_f32(float %a) #0 {
682 ; SI-LABEL: v_omod_add_self_clamp_f32:
684 ; SI-NEXT: v_add_f32_e64 v0, v0, v0 clamp
685 ; SI-NEXT: s_mov_b32 s3, 0xf000
686 ; SI-NEXT: s_mov_b32 s2, -1
687 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
690 ; VI-LABEL: v_omod_add_self_clamp_f32:
692 ; VI-NEXT: v_add_f32_e64 v0, v0, v0 clamp
693 ; VI-NEXT: flat_store_dword v[0:1], v0
696 ; GFX11-LABEL: v_omod_add_self_clamp_f32:
698 ; GFX11-NEXT: v_add_f32_e64 v0, v0, v0 clamp
699 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
700 ; GFX11-NEXT: s_nop 0
701 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
702 ; GFX11-NEXT: s_endpgm
703 %add = fadd float %a, %a
704 %max = call float @llvm.maxnum.f32(float %add, float 0.0)
705 %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
706 store float %clamp, ptr addrspace(1) undef
; clamp(a) added to itself: expected to stay a v_max_f32 with clamp followed by a plain v_add_f32 (no mul:2 fold).
710 define amdgpu_ps void @v_omod_add_clamp_self_f32(float %a) #0 {
711 ; SI-LABEL: v_omod_add_clamp_self_f32:
713 ; SI-NEXT: v_max_f32_e64 v0, v0, v0 clamp
714 ; SI-NEXT: v_add_f32_e32 v0, v0, v0
715 ; SI-NEXT: s_mov_b32 s3, 0xf000
716 ; SI-NEXT: s_mov_b32 s2, -1
717 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
720 ; VI-LABEL: v_omod_add_clamp_self_f32:
722 ; VI-NEXT: v_max_f32_e64 v0, v0, v0 clamp
723 ; VI-NEXT: v_add_f32_e32 v0, v0, v0
724 ; VI-NEXT: flat_store_dword v[0:1], v0
727 ; GFX11-LABEL: v_omod_add_clamp_self_f32:
729 ; GFX11-NEXT: v_max_f32_e64 v0, v0, v0 clamp
730 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
731 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v0
732 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
733 ; GFX11-NEXT: s_nop 0
734 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
735 ; GFX11-NEXT: s_endpgm
736 %max = call float @llvm.maxnum.f32(float %a, float 0.0)
737 %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
738 %add = fadd float %clamp, %clamp
739 store float %add, ptr addrspace(1) undef
; |x| + |x| with x = a + 1.0: expected to stay a v_add_f32 with abs source modifiers rather than a mul:2 fold.
743 define amdgpu_ps void @v_omod_add_abs_self_f32(float %a) #0 {
744 ; SI-LABEL: v_omod_add_abs_self_f32:
746 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
747 ; SI-NEXT: v_add_f32_e64 v0, |v0|, |v0|
748 ; SI-NEXT: s_mov_b32 s3, 0xf000
749 ; SI-NEXT: s_mov_b32 s2, -1
750 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
753 ; VI-LABEL: v_omod_add_abs_self_f32:
755 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
756 ; VI-NEXT: v_add_f32_e64 v0, |v0|, |v0|
757 ; VI-NEXT: flat_store_dword v[0:1], v0
760 ; GFX11-LABEL: v_omod_add_abs_self_f32:
762 ; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
763 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
764 ; GFX11-NEXT: v_add_f32_e64 v0, |v0|, |v0|
765 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
766 ; GFX11-NEXT: s_nop 0
767 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
768 ; GFX11-NEXT: s_endpgm
769 %x = fadd float %a, 1.0
770 %abs.x = call float @llvm.fabs.f32(float %x)
771 %add = fadd float %abs.x, %abs.x
772 store float %add, ptr addrspace(1) undef
; |x| + x with x = a + 1.0: the operands differ, so it is expected to stay a v_add_f32 with one abs source modifier.
776 define amdgpu_ps void @v_omod_add_abs_x_x_f32(float %a) #0 {
777 ; SI-LABEL: v_omod_add_abs_x_x_f32:
779 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
780 ; SI-NEXT: v_add_f32_e64 v0, |v0|, v0
781 ; SI-NEXT: s_mov_b32 s3, 0xf000
782 ; SI-NEXT: s_mov_b32 s2, -1
783 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
786 ; VI-LABEL: v_omod_add_abs_x_x_f32:
788 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
789 ; VI-NEXT: v_add_f32_e64 v0, |v0|, v0
790 ; VI-NEXT: flat_store_dword v[0:1], v0
793 ; GFX11-LABEL: v_omod_add_abs_x_x_f32:
795 ; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
796 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
797 ; GFX11-NEXT: v_add_f32_e64 v0, |v0|, v0
798 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
799 ; GFX11-NEXT: s_nop 0
800 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
801 ; GFX11-NEXT: s_endpgm
802 %x = fadd float %a, 1.0
803 %abs.x = call float @llvm.fabs.f32(float %x)
804 %add = fadd float %abs.x, %x
805 store float %add, ptr addrspace(1) undef
; x + |x| with x = a + 1.0 (operands swapped relative to the |x| + x case): expected to stay a v_add_f32 with one abs source modifier.
809 define amdgpu_ps void @v_omod_add_x_abs_x_f32(float %a) #0 {
810 ; SI-LABEL: v_omod_add_x_abs_x_f32:
812 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
813 ; SI-NEXT: v_add_f32_e64 v0, v0, |v0|
814 ; SI-NEXT: s_mov_b32 s3, 0xf000
815 ; SI-NEXT: s_mov_b32 s2, -1
816 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
819 ; VI-LABEL: v_omod_add_x_abs_x_f32:
821 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
822 ; VI-NEXT: v_add_f32_e64 v0, v0, |v0|
823 ; VI-NEXT: flat_store_dword v[0:1], v0
826 ; GFX11-LABEL: v_omod_add_x_abs_x_f32:
828 ; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
829 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
830 ; GFX11-NEXT: v_add_f32_e64 v0, v0, |v0|
831 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
832 ; GFX11-NEXT: s_nop 0
833 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
834 ; GFX11-NEXT: s_endpgm
835 %x = fadd float %a, 1.0
836 %abs.x = call float @llvm.fabs.f32(float %x)
837 %add = fadd float %x, %abs.x
838 store float %add, ptr addrspace(1) undef
842 ; Don't fold a second omod into an instruction that already has an omod: only the first 0.5 multiply becomes div:2.
843 define amdgpu_ps void @v_omod_div2_omod_div2_f32(float %a) #0 {
844 ; SI-LABEL: v_omod_div2_omod_div2_f32:
846 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2
847 ; SI-NEXT: v_mul_f32_e32 v0, 0.5, v0
848 ; SI-NEXT: s_mov_b32 s3, 0xf000
849 ; SI-NEXT: s_mov_b32 s2, -1
850 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
853 ; VI-LABEL: v_omod_div2_omod_div2_f32:
855 ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2
856 ; VI-NEXT: v_mul_f32_e32 v0, 0.5, v0
857 ; VI-NEXT: flat_store_dword v[0:1], v0
860 ; GFX11-LABEL: v_omod_div2_omod_div2_f32:
862 ; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2
863 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
864 ; GFX11-NEXT: v_mul_f32_e32 v0, 0.5, v0
865 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
866 ; GFX11-NEXT: s_nop 0
867 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
868 ; GFX11-NEXT: s_endpgm
869 %add = fadd float %a, 1.0
870 %div2.0 = fmul float %add, 0.5
871 %div2.1 = fmul float %div2.0, 0.5
872 store float %div2.1, ptr addrspace(1) undef
876 ; Don't fold omod if denorms are enabled (f32 case: the add and the 0.5 multiply stay separate).
877 define amdgpu_ps void @v_omod_div2_f32_denormals(float %a) #2 {
878 ; SI-LABEL: v_omod_div2_f32_denormals:
880 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
881 ; SI-NEXT: v_mul_f32_e32 v0, 0.5, v0
882 ; SI-NEXT: s_mov_b32 s3, 0xf000
883 ; SI-NEXT: s_mov_b32 s2, -1
884 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
887 ; VI-LABEL: v_omod_div2_f32_denormals:
889 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
890 ; VI-NEXT: v_mul_f32_e32 v0, 0.5, v0
891 ; VI-NEXT: flat_store_dword v[0:1], v0
894 ; GFX11-LABEL: v_omod_div2_f32_denormals:
896 ; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
897 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
898 ; GFX11-NEXT: v_mul_f32_e32 v0, 0.5, v0
899 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
900 ; GFX11-NEXT: s_nop 0
901 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
902 ; GFX11-NEXT: s_endpgm
903 %add = fadd float %a, 1.0
904 %div2 = fmul float %add, 0.5
905 store float %div2, ptr addrspace(1) undef
909 ; Don't fold omod if denorms are enabled (f64 case: the add and the 0.5 multiply stay separate).
910 define amdgpu_ps void @v_omod_div2_f64_denormals(double %a) #6 {
911 ; SI-LABEL: v_omod_div2_f64_denormals:
913 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
914 ; SI-NEXT: s_mov_b32 s3, 0xf000
915 ; SI-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
916 ; SI-NEXT: s_mov_b32 s2, -1
917 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
920 ; VI-LABEL: v_omod_div2_f64_denormals:
922 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
923 ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
924 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
927 ; GFX11-LABEL: v_omod_div2_f64_denormals:
929 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
930 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
931 ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
932 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
933 ; GFX11-NEXT: s_nop 0
934 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
935 ; GFX11-NEXT: s_endpgm
936 %add = fadd double %a, 1.0
937 %div2 = fmul double %add, 0.5
938 store double %div2, ptr addrspace(1) undef
942 ; Don't fold omod if denorms are enabled, for the add form (x + x) of multiply-by-2.
943 define amdgpu_ps void @v_omod_mul2_f32_denormals(float %a) #2 {
944 ; SI-LABEL: v_omod_mul2_f32_denormals:
946 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
947 ; SI-NEXT: v_add_f32_e32 v0, v0, v0
948 ; SI-NEXT: s_mov_b32 s3, 0xf000
949 ; SI-NEXT: s_mov_b32 s2, -1
950 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
953 ; VI-LABEL: v_omod_mul2_f32_denormals:
955 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
956 ; VI-NEXT: v_add_f32_e32 v0, v0, v0
957 ; VI-NEXT: flat_store_dword v[0:1], v0
960 ; GFX11-LABEL: v_omod_mul2_f32_denormals:
962 ; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
963 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
964 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v0
965 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
966 ; GFX11-NEXT: s_nop 0
967 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
968 ; GFX11-NEXT: s_endpgm
969 %add = fadd float %a, 1.0
970 %mul2 = fadd float %add, %add
971 store float %mul2, ptr addrspace(1) undef
975 ; Don't fold omod if denorms are enabled, for the add form (x + x) of multiply-by-2 (f64 case).
976 define amdgpu_ps void @v_omod_mul2_f64_denormals(double %a) #2 {
977 ; SI-LABEL: v_omod_mul2_f64_denormals:
979 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
980 ; SI-NEXT: s_mov_b32 s3, 0xf000
981 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[0:1]
982 ; SI-NEXT: s_mov_b32 s2, -1
983 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
986 ; VI-LABEL: v_omod_mul2_f64_denormals:
988 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
989 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], v[0:1]
990 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
993 ; GFX11-LABEL: v_omod_mul2_f64_denormals:
995 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
996 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
997 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[0:1]
998 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
999 ; GFX11-NEXT: s_nop 0
1000 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1001 ; GFX11-NEXT: s_endpgm
1002 %add = fadd double %a, 1.0
1003 %mul2 = fadd double %add, %add
1004 store double %mul2, ptr addrspace(1) undef
1008 ; Don't fold omod if denorms enabled.
; Attribute group #0 sets only the f32 denormal mode to preserve-sign.
; VI and GFX11 are expected to keep a separate v_add_f16 and v_mul_f16 by 0.5.
; The SI checks instead perform the arithmetic in f32 (v_cvt_f32_f16 /
; v_cvt_f16_f32 around it), and there div:2 is folded onto the v_add_f32.
1009 define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 {
1010 ; SI-LABEL: v_omod_div2_f16_denormals:
1012 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1013 ; SI-NEXT: s_mov_b32 s3, 0xf000
1014 ; SI-NEXT: s_mov_b32 s2, -1
1015 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1016 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2
1017 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1018 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1021 ; VI-LABEL: v_omod_div2_f16_denormals:
1023 ; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
1024 ; VI-NEXT: v_mul_f16_e32 v0, 0.5, v0
1025 ; VI-NEXT: flat_store_short v[0:1], v0
1028 ; GFX11-LABEL: v_omod_div2_f16_denormals:
1030 ; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0
1031 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1032 ; GFX11-NEXT: v_mul_f16_e32 v0, 0.5, v0
1033 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off
1034 ; GFX11-NEXT: s_nop 0
1035 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1036 ; GFX11-NEXT: s_endpgm
1037 %add = fadd half %a, 1.0
1038 %div2 = fmul half %add, 0.5
1039 store half %div2, ptr addrspace(1) undef
1043 ; Don't fold omod if denorms enabled for add form.
; f16 variant of the "fadd %add, %add" pattern under attribute group #0, which
; sets only the f32 denormal mode to preserve-sign.
; VI and GFX11 are expected to emit two separate v_add_f16 instructions.
; The SI checks instead perform the arithmetic in f32 (v_cvt_f32_f16 /
; v_cvt_f16_f32 around it), and there mul:2 is folded onto the v_add_f32.
1044 define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 {
1045 ; SI-LABEL: v_omod_mul2_f16_denormals:
1047 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1048 ; SI-NEXT: s_mov_b32 s3, 0xf000
1049 ; SI-NEXT: s_mov_b32 s2, -1
1050 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1051 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:2
1052 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1053 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1056 ; VI-LABEL: v_omod_mul2_f16_denormals:
1058 ; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
1059 ; VI-NEXT: v_add_f16_e32 v0, v0, v0
1060 ; VI-NEXT: flat_store_short v[0:1], v0
1063 ; GFX11-LABEL: v_omod_mul2_f16_denormals:
1065 ; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0
1066 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1067 ; GFX11-NEXT: v_add_f16_e32 v0, v0, v0
1068 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off
1069 ; GFX11-NEXT: s_nop 0
1070 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1071 ; GFX11-NEXT: s_endpgm
1072 %add = fadd half %a, 1.0
1073 %mul2 = fadd half %add, %add
1074 store half %mul2, ptr addrspace(1) undef
; Counterpart of v_omod_div2_f16_denormals using attribute group #3, whose
; "denormal-fp-math" (no -f32 suffix) is preserve-sign. Here the fold is
; expected: VI and GFX11 emit a single v_add_f16_e64 carrying div:2, and the
; SI checks carry div:2 on the v_add_f32 between the f16/f32 conversions.
1078 define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 {
1079 ; SI-LABEL: v_omod_div2_f16_no_denormals:
1081 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1082 ; SI-NEXT: s_mov_b32 s3, 0xf000
1083 ; SI-NEXT: s_mov_b32 s2, -1
1084 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1085 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2
1086 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1087 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1090 ; VI-LABEL: v_omod_div2_f16_no_denormals:
1092 ; VI-NEXT: v_add_f16_e64 v0, v0, 1.0 div:2
1093 ; VI-NEXT: flat_store_short v[0:1], v0
1096 ; GFX11-LABEL: v_omod_div2_f16_no_denormals:
1098 ; GFX11-NEXT: v_add_f16_e64 v0, v0, 1.0 div:2
1099 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off
1100 ; GFX11-NEXT: s_nop 0
1101 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1102 ; GFX11-NEXT: s_endpgm
1103 %add = fadd half %a, 1.0
1104 %div2 = fmul half %add, 0.5
1105 store half %div2, ptr addrspace(1) undef
; Computes ((%a * %a + %b) * 2.0) * %b, so %b is still needed after the
; multiply-add. SI and VI are expected to emit v_mad_f32 carrying mul:2;
; GFX11 is expected to emit a separate v_mul_f32 and then a v_add_f32_e64
; carrying mul:2.
; NOTE(review): the name suggests this guards the case where a MAC would
; otherwise be selected and has to become a MAD to carry omod - confirm.
1109 define amdgpu_ps void @v_omod_mac_to_mad(float %b, float %a) #0 {
1110 ; SI-LABEL: v_omod_mac_to_mad:
1112 ; SI-NEXT: v_mad_f32 v1, v1, v1, v0 mul:2
1113 ; SI-NEXT: v_mul_f32_e32 v0, v1, v0
1114 ; SI-NEXT: s_mov_b32 s3, 0xf000
1115 ; SI-NEXT: s_mov_b32 s2, -1
1116 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1119 ; VI-LABEL: v_omod_mac_to_mad:
1121 ; VI-NEXT: v_mad_f32 v1, v1, v1, v0 mul:2
1122 ; VI-NEXT: v_mul_f32_e32 v0, v1, v0
1123 ; VI-NEXT: flat_store_dword v[0:1], v0
1126 ; GFX11-LABEL: v_omod_mac_to_mad:
1128 ; GFX11-NEXT: v_mul_f32_e32 v1, v1, v1
1129 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1130 ; GFX11-NEXT: v_add_f32_e64 v1, v1, v0 mul:2
1131 ; GFX11-NEXT: v_mul_f32_e32 v0, v1, v0
1132 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
1133 ; GFX11-NEXT: s_nop 0
1134 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1135 ; GFX11-NEXT: s_endpgm
1136 %mul = fmul float %a, %a
1137 %add = fadd float %mul, %b
1138 %mad = fmul float %add, 2.0
1139 %res = fmul float %mad, %b
1140 store float %res, ptr addrspace(1) undef
; Intrinsic declarations. Every declaration below carries attribute group #1
; (nounwind readnone).
1144 declare i32 @llvm.amdgcn.workitem.id.x() #1
1145 declare float @llvm.fabs.f32(float) #1
1146 declare float @llvm.floor.f32(float) #1
1147 declare float @llvm.minnum.f32(float, float) #1
1148 declare float @llvm.maxnum.f32(float, float) #1
1149 declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
1150 declare double @llvm.fabs.f64(double) #1
1151 declare double @llvm.minnum.f64(double, double) #1
1152 declare double @llvm.maxnum.f64(double, double) #1
1153 declare half @llvm.fabs.f16(half) #1
1154 declare half @llvm.minnum.f16(half, half) #1
1155 declare half @llvm.maxnum.f16(half, half) #1
1156 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
; Attribute groups. Legend (every group is also nounwind):
;   #0 - f32 denormal mode preserve-sign; no-signed-zeros-fp-math enabled.
;   #1 - readnone; attached to the intrinsic declarations.
;   #2 - f32 denormal mode ieee; no-signed-zeros-fp-math enabled.
;   #3 - "denormal-fp-math" (no -f32 suffix) preserve-sign;
;        no-signed-zeros-fp-math enabled.
;   #4 - no-signed-zeros-fp-math explicitly "false"; no denormal mode given.
;   #5 - "denormal-fp-math" preserve-sign; no-signed-zeros attribute absent.
;   #6 - "denormal-fp-math" ieee; no-signed-zeros-fp-math enabled.
1158 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" }
1159 attributes #1 = { nounwind readnone }
1160 attributes #2 = { nounwind "denormal-fp-math-f32"="ieee,ieee" "no-signed-zeros-fp-math"="true" }
1161 attributes #3 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" }
1162 attributes #4 = { nounwind "no-signed-zeros-fp-math"="false" }
1163 attributes #5 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
1164 attributes #6 = { nounwind "denormal-fp-math"="ieee,ieee" "no-signed-zeros-fp-math"="true" }
; Debug-info metadata: one compile unit (emissionKind NoDebug) for /tmp/foo.cl,
; module flags for the DWARF and debug-info versions, a subprogram "foo", a
; local variable "add" (arg 1), a 32-bit "float" basic type, an empty
; DIExpression and one source location.
; NOTE(review): presumably consumed by an llvm.dbg.value test elsewhere in the
; file - confirm.
1166 !llvm.dbg.cu = !{!0}
1167 !llvm.module.flags = !{!2, !3}
1169 !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
1170 !1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
1171 !2 = !{i32 2, !"Dwarf Version", i32 4}
1172 !3 = !{i32 2, !"Debug Info Version", i32 3}
1173 !4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)
1174 !5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
1175 !6 = !DISubroutineType(types: !7)
1177 !8 = !DIBasicType(name: "float", size: 32, align: 32)
1178 !9 = !DIExpression()
1179 !10 = !DILocation(line: 1, column: 42, scope: !5)