1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefixes=VI %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11PLUS,GFX11 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11PLUS,GFX12 %s
6 ; The IEEE bit is enabled for compute kernels, so omod should not be used.
8 define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 {
9 ; SI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
11 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
12 ; SI-NEXT: s_mov_b32 s7, 0xf000
13 ; SI-NEXT: s_mov_b32 s6, 0
14 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
15 ; SI-NEXT: v_mov_b32_e32 v1, 0
16 ; SI-NEXT: s_waitcnt lgkmcnt(0)
17 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
18 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
19 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
20 ; SI-NEXT: s_waitcnt vmcnt(0)
21 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2
22 ; SI-NEXT: v_mul_f32_e32 v2, 0.5, v2
23 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
26 ; VI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
28 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
29 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
30 ; VI-NEXT: s_waitcnt lgkmcnt(0)
31 ; VI-NEXT: v_mov_b32_e32 v1, s3
32 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
33 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
34 ; VI-NEXT: flat_load_dword v3, v[0:1]
35 ; VI-NEXT: v_mov_b32_e32 v1, s1
36 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
37 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
38 ; VI-NEXT: s_waitcnt vmcnt(0)
39 ; VI-NEXT: v_add_f32_e32 v2, 1.0, v3
40 ; VI-NEXT: v_mul_f32_e32 v2, 0.5, v2
41 ; VI-NEXT: flat_store_dword v[0:1], v2
44 ; GFX11-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
46 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
47 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
48 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
49 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
50 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
51 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
52 ; GFX11-NEXT: s_waitcnt vmcnt(0)
53 ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
54 ; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1
55 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
57 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
58 ; GFX11-NEXT: s_endpgm
60 ; GFX12-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
62 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
63 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
64 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
65 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
66 ; GFX12-NEXT: s_wait_kmcnt 0x0
67 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
68 ; GFX12-NEXT: s_wait_loadcnt 0x0
69 ; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1
70 ; GFX12-NEXT: v_mul_f32_e32 v1, 0.5, v1
71 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
73 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
74 ; GFX12-NEXT: s_endpgm
75 %tid = call i32 @llvm.amdgcn.workitem.id.x()
76 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
77 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
78 %a = load float, ptr addrspace(1) %gep0
79 %add = fadd float %a, 1.0
80 %div2 = fmul float %add, 0.5
81 store float %div2, ptr addrspace(1) %out.gep
85 ; The IEEE bit is enabled for compute kernels, so omod should not be used.
86 define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 {
87 ; SI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
89 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
90 ; SI-NEXT: s_mov_b32 s7, 0xf000
91 ; SI-NEXT: s_mov_b32 s6, 0
92 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
93 ; SI-NEXT: v_mov_b32_e32 v1, 0
94 ; SI-NEXT: s_waitcnt lgkmcnt(0)
95 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
96 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
97 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
98 ; SI-NEXT: s_waitcnt vmcnt(0)
99 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
100 ; SI-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
101 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
104 ; VI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
106 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
107 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
108 ; VI-NEXT: s_waitcnt lgkmcnt(0)
109 ; VI-NEXT: v_mov_b32_e32 v1, s3
110 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
111 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
112 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
113 ; VI-NEXT: v_mov_b32_e32 v3, s1
114 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
115 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
116 ; VI-NEXT: s_waitcnt vmcnt(0)
117 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
118 ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
119 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
122 ; GFX11-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
124 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
125 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
126 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
127 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
128 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
129 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
130 ; GFX11-NEXT: s_waitcnt vmcnt(0)
131 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
132 ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
133 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
134 ; GFX11-NEXT: s_nop 0
135 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
136 ; GFX11-NEXT: s_endpgm
138 ; GFX12-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
140 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
141 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
142 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
143 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
144 ; GFX12-NEXT: s_wait_kmcnt 0x0
145 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
146 ; GFX12-NEXT: s_wait_loadcnt 0x0
147 ; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1]
148 ; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1]
149 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
150 ; GFX12-NEXT: s_nop 0
151 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
152 ; GFX12-NEXT: s_endpgm
153 %tid = call i32 @llvm.amdgcn.workitem.id.x()
154 %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
155 %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
156 %a = load double, ptr addrspace(1) %gep0
157 %add = fadd double %a, 1.0
158 %div2 = fmul double %add, 0.5
159 store double %div2, ptr addrspace(1) %out.gep
163 ; The IEEE bit is enabled for compute kernels, so omod should not be used even though nsz is allowed.
164 define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
165 ; SI-LABEL: v_omod_div2_f32_enable_ieee_nsz:
167 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
168 ; SI-NEXT: s_mov_b32 s7, 0xf000
169 ; SI-NEXT: s_mov_b32 s6, 0
170 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
171 ; SI-NEXT: v_mov_b32_e32 v1, 0
172 ; SI-NEXT: s_waitcnt lgkmcnt(0)
173 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
174 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
175 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
176 ; SI-NEXT: s_waitcnt vmcnt(0)
177 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2
178 ; SI-NEXT: v_mul_f32_e32 v2, 0.5, v2
179 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
182 ; VI-LABEL: v_omod_div2_f32_enable_ieee_nsz:
184 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
185 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
186 ; VI-NEXT: s_waitcnt lgkmcnt(0)
187 ; VI-NEXT: v_mov_b32_e32 v1, s3
188 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
189 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
190 ; VI-NEXT: flat_load_dword v3, v[0:1]
191 ; VI-NEXT: v_mov_b32_e32 v1, s1
192 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
193 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
194 ; VI-NEXT: s_waitcnt vmcnt(0)
195 ; VI-NEXT: v_add_f32_e32 v2, 1.0, v3
196 ; VI-NEXT: v_mul_f32_e32 v2, 0.5, v2
197 ; VI-NEXT: flat_store_dword v[0:1], v2
200 ; GFX11-LABEL: v_omod_div2_f32_enable_ieee_nsz:
202 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
203 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
204 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
205 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
206 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
207 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
208 ; GFX11-NEXT: s_waitcnt vmcnt(0)
209 ; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
210 ; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1
211 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
212 ; GFX11-NEXT: s_nop 0
213 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
214 ; GFX11-NEXT: s_endpgm
216 ; GFX12-LABEL: v_omod_div2_f32_enable_ieee_nsz:
218 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
219 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
220 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
221 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
222 ; GFX12-NEXT: s_wait_kmcnt 0x0
223 ; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
224 ; GFX12-NEXT: s_wait_loadcnt 0x0
225 ; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1
226 ; GFX12-NEXT: v_mul_f32_e32 v1, 0.5, v1
227 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
228 ; GFX12-NEXT: s_nop 0
229 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
230 ; GFX12-NEXT: s_endpgm
231 %tid = call i32 @llvm.amdgcn.workitem.id.x()
232 %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
233 %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
234 %a = load float, ptr addrspace(1) %gep0
235 %add = fadd float %a, 1.0
236 %div2 = fmul float %add, 0.5
237 store float %div2, ptr addrspace(1) %out.gep
241 ; The IEEE bit is enabled for compute kernels, so omod should not be used even though nsz is allowed.
242 define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #5 {
243 ; SI-LABEL: v_omod_div2_f64_enable_ieee_nsz:
245 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
246 ; SI-NEXT: s_mov_b32 s7, 0xf000
247 ; SI-NEXT: s_mov_b32 s6, 0
248 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
249 ; SI-NEXT: v_mov_b32_e32 v1, 0
250 ; SI-NEXT: s_waitcnt lgkmcnt(0)
251 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
252 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
253 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
254 ; SI-NEXT: s_waitcnt vmcnt(0)
255 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
256 ; SI-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5
257 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
260 ; VI-LABEL: v_omod_div2_f64_enable_ieee_nsz:
262 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
263 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
264 ; VI-NEXT: s_waitcnt lgkmcnt(0)
265 ; VI-NEXT: v_mov_b32_e32 v1, s3
266 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
267 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
268 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
269 ; VI-NEXT: v_mov_b32_e32 v3, s1
270 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
271 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
272 ; VI-NEXT: s_waitcnt vmcnt(0)
273 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
274 ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
275 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
278 ; GFX11-LABEL: v_omod_div2_f64_enable_ieee_nsz:
280 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
281 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
282 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
283 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
284 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
285 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
286 ; GFX11-NEXT: s_waitcnt vmcnt(0)
287 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
288 ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
289 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
290 ; GFX11-NEXT: s_nop 0
291 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
292 ; GFX11-NEXT: s_endpgm
294 ; GFX12-LABEL: v_omod_div2_f64_enable_ieee_nsz:
296 ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
297 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
298 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
299 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
300 ; GFX12-NEXT: s_wait_kmcnt 0x0
301 ; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
302 ; GFX12-NEXT: s_wait_loadcnt 0x0
303 ; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1]
304 ; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1]
305 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
306 ; GFX12-NEXT: s_nop 0
307 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
308 ; GFX12-NEXT: s_endpgm
309 %tid = call i32 @llvm.amdgcn.workitem.id.x()
310 %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
311 %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
312 %a = load double, ptr addrspace(1) %gep0
313 %add = fadd double %a, 1.0
314 %div2 = fmul double %add, 0.5
315 store double %div2, ptr addrspace(1) %out.gep
319 ; Without the IEEE bit, omod is only allowed if signed zeros are not significant; they are significant here, so it should not be used.
320 define amdgpu_ps void @v_omod_div2_f32_signed_zeros(float %a) #4 {
321 ; SI-LABEL: v_omod_div2_f32_signed_zeros:
323 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
324 ; SI-NEXT: v_mul_f32_e32 v0, 0.5, v0
325 ; SI-NEXT: s_mov_b32 s3, 0xf000
326 ; SI-NEXT: s_mov_b32 s2, -1
327 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
330 ; VI-LABEL: v_omod_div2_f32_signed_zeros:
332 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
333 ; VI-NEXT: v_mul_f32_e32 v0, 0.5, v0
334 ; VI-NEXT: flat_store_dword v[0:1], v0
337 ; GFX11PLUS-LABEL: v_omod_div2_f32_signed_zeros:
338 ; GFX11PLUS: ; %bb.0:
339 ; GFX11PLUS-NEXT: v_add_f32_e32 v0, 1.0, v0
340 ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1)
341 ; GFX11PLUS-NEXT: v_mul_f32_e32 v0, 0.5, v0
342 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off
343 ; GFX11PLUS-NEXT: s_nop 0
344 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
345 ; GFX11PLUS-NEXT: s_endpgm
346 %add = fadd float %a, 1.0
347 %div2 = fmul float %add, 0.5
348 store float %div2, ptr addrspace(1) undef
352 ; Without the IEEE bit, omod is only allowed if signed zeros are not significant; they are significant here, so it should not be used.
353 define amdgpu_ps void @v_omod_div2_f64_signed_zeros(double %a) #4 {
354 ; SI-LABEL: v_omod_div2_f64_signed_zeros:
356 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
357 ; SI-NEXT: s_mov_b32 s3, 0xf000
358 ; SI-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
359 ; SI-NEXT: s_mov_b32 s2, -1
360 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
363 ; VI-LABEL: v_omod_div2_f64_signed_zeros:
365 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
366 ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
367 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
370 ; GFX11-LABEL: v_omod_div2_f64_signed_zeros:
372 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
373 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
374 ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
375 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
376 ; GFX11-NEXT: s_nop 0
377 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
378 ; GFX11-NEXT: s_endpgm
380 ; GFX12-LABEL: v_omod_div2_f64_signed_zeros:
382 ; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1]
383 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
384 ; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1]
385 ; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
386 ; GFX12-NEXT: s_nop 0
387 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
388 ; GFX12-NEXT: s_endpgm
389 %add = fadd double %a, 1.0
390 %div2 = fmul double %add, 0.5
391 store double %div2, ptr addrspace(1) undef
395 define amdgpu_ps void @v_omod_div2_f32(float %a) #0 {
396 ; SI-LABEL: v_omod_div2_f32:
398 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2
399 ; SI-NEXT: s_mov_b32 s3, 0xf000
400 ; SI-NEXT: s_mov_b32 s2, -1
401 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
404 ; VI-LABEL: v_omod_div2_f32:
406 ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2
407 ; VI-NEXT: flat_store_dword v[0:1], v0
410 ; GFX11PLUS-LABEL: v_omod_div2_f32:
411 ; GFX11PLUS: ; %bb.0:
412 ; GFX11PLUS-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2
413 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off
414 ; GFX11PLUS-NEXT: s_nop 0
415 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
416 ; GFX11PLUS-NEXT: s_endpgm
417 %add = fadd float %a, 1.0
418 %div2 = fmul float %add, 0.5
419 store float %div2, ptr addrspace(1) undef
423 define amdgpu_ps void @v_omod_div2_f64(double %a) #5 {
424 ; SI-LABEL: v_omod_div2_f64:
426 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 div:2
427 ; SI-NEXT: s_mov_b32 s3, 0xf000
428 ; SI-NEXT: s_mov_b32 s2, -1
429 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
432 ; VI-LABEL: v_omod_div2_f64:
434 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 div:2
435 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
438 ; GFX11-LABEL: v_omod_div2_f64:
440 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 div:2
441 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
442 ; GFX11-NEXT: s_nop 0
443 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
444 ; GFX11-NEXT: s_endpgm
446 ; GFX12-LABEL: v_omod_div2_f64:
448 ; GFX12-NEXT: v_add_f64_e64 v[0:1], v[0:1], 1.0 div:2
449 ; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
450 ; GFX12-NEXT: s_nop 0
451 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
452 ; GFX12-NEXT: s_endpgm
453 %add = fadd nsz double %a, 1.0
454 %div2 = fmul nsz double %add, 0.5
455 store double %div2, ptr addrspace(1) undef
459 define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 {
460 ; SI-LABEL: v_omod_mul2_f32:
462 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:2
463 ; SI-NEXT: s_mov_b32 s3, 0xf000
464 ; SI-NEXT: s_mov_b32 s2, -1
465 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
468 ; VI-LABEL: v_omod_mul2_f32:
470 ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:2
471 ; VI-NEXT: flat_store_dword v[0:1], v0
474 ; GFX11PLUS-LABEL: v_omod_mul2_f32:
475 ; GFX11PLUS: ; %bb.0:
476 ; GFX11PLUS-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:2
477 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off
478 ; GFX11PLUS-NEXT: s_nop 0
479 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
480 ; GFX11PLUS-NEXT: s_endpgm
481 %add = fadd float %a, 1.0
482 %div2 = fmul float %add, 2.0
483 store float %div2, ptr addrspace(1) undef
487 define amdgpu_ps void @v_omod_mul2_med3(float %x, float %y, float %z) #0 {
488 ; SI-LABEL: v_omod_mul2_med3:
490 ; SI-NEXT: v_med3_f32 v0, v0, v1, v2 mul:2
491 ; SI-NEXT: s_mov_b32 s3, 0xf000
492 ; SI-NEXT: s_mov_b32 s2, -1
493 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
496 ; VI-LABEL: v_omod_mul2_med3:
498 ; VI-NEXT: v_med3_f32 v0, v0, v1, v2 mul:2
499 ; VI-NEXT: flat_store_dword v[0:1], v0
502 ; GFX11-LABEL: v_omod_mul2_med3:
504 ; GFX11-NEXT: v_med3_f32 v0, v0, v1, v2 mul:2
505 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
506 ; GFX11-NEXT: s_nop 0
507 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
508 ; GFX11-NEXT: s_endpgm
510 ; GFX12-LABEL: v_omod_mul2_med3:
512 ; GFX12-NEXT: v_med3_num_f32 v0, v0, v1, v2 mul:2
513 ; GFX12-NEXT: global_store_b32 v[0:1], v0, off
514 ; GFX12-NEXT: s_nop 0
515 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
516 ; GFX12-NEXT: s_endpgm
517 %fmed3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float %z)
518 %div2 = fmul float %fmed3, 2.0
519 store float %div2, ptr addrspace(1) undef
523 define amdgpu_ps void @v_omod_mul2_f64(double %a) #5 {
524 ; SI-LABEL: v_omod_mul2_f64:
526 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:2
527 ; SI-NEXT: s_mov_b32 s3, 0xf000
528 ; SI-NEXT: s_mov_b32 s2, -1
529 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
532 ; VI-LABEL: v_omod_mul2_f64:
534 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:2
535 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
538 ; GFX11-LABEL: v_omod_mul2_f64:
540 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:2
541 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
542 ; GFX11-NEXT: s_nop 0
543 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
544 ; GFX11-NEXT: s_endpgm
546 ; GFX12-LABEL: v_omod_mul2_f64:
548 ; GFX12-NEXT: v_add_f64_e64 v[0:1], v[0:1], 1.0 mul:2
549 ; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
550 ; GFX12-NEXT: s_nop 0
551 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
552 ; GFX12-NEXT: s_endpgm
553 %add = fadd nsz double %a, 1.0
554 %div2 = fmul nsz double %add, 2.0
555 store double %div2, ptr addrspace(1) undef
559 define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 {
560 ; SI-LABEL: v_omod_mul4_f32:
562 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4
563 ; SI-NEXT: s_mov_b32 s3, 0xf000
564 ; SI-NEXT: s_mov_b32 s2, -1
565 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
568 ; VI-LABEL: v_omod_mul4_f32:
570 ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4
571 ; VI-NEXT: flat_store_dword v[0:1], v0
574 ; GFX11PLUS-LABEL: v_omod_mul4_f32:
575 ; GFX11PLUS: ; %bb.0:
576 ; GFX11PLUS-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4
577 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off
578 ; GFX11PLUS-NEXT: s_nop 0
579 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
580 ; GFX11PLUS-NEXT: s_endpgm
581 %add = fadd float %a, 1.0
582 %div2 = fmul float %add, 4.0
583 store float %div2, ptr addrspace(1) undef
587 define amdgpu_ps void @v_omod_mul4_f64(double %a) #5 {
588 ; SI-LABEL: v_omod_mul4_f64:
590 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:4
591 ; SI-NEXT: s_mov_b32 s3, 0xf000
592 ; SI-NEXT: s_mov_b32 s2, -1
593 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
596 ; VI-LABEL: v_omod_mul4_f64:
598 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:4
599 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
602 ; GFX11-LABEL: v_omod_mul4_f64:
604 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:4
605 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
606 ; GFX11-NEXT: s_nop 0
607 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
608 ; GFX11-NEXT: s_endpgm
610 ; GFX12-LABEL: v_omod_mul4_f64:
612 ; GFX12-NEXT: v_add_f64_e64 v[0:1], v[0:1], 1.0 mul:4
613 ; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
614 ; GFX12-NEXT: s_nop 0
615 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
616 ; GFX12-NEXT: s_endpgm
617 %add = fadd nsz double %a, 1.0
618 %div2 = fmul nsz double %add, 4.0
619 store double %div2, ptr addrspace(1) undef
623 define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {
624 ; SI-LABEL: v_omod_mul4_multi_use_f32:
626 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
627 ; SI-NEXT: v_mul_f32_e32 v1, 4.0, v0
628 ; SI-NEXT: s_mov_b32 s3, 0xf000
629 ; SI-NEXT: s_mov_b32 s2, -1
630 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
631 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
632 ; SI-NEXT: s_waitcnt vmcnt(0)
635 ; VI-LABEL: v_omod_mul4_multi_use_f32:
637 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
638 ; VI-NEXT: v_mul_f32_e32 v1, 4.0, v0
639 ; VI-NEXT: flat_store_dword v[0:1], v1
640 ; VI-NEXT: flat_store_dword v[0:1], v0
641 ; VI-NEXT: s_waitcnt vmcnt(0)
644 ; GFX11-LABEL: v_omod_mul4_multi_use_f32:
646 ; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
647 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
648 ; GFX11-NEXT: v_mul_f32_e32 v1, 4.0, v0
649 ; GFX11-NEXT: s_clause 0x1
650 ; GFX11-NEXT: global_store_b32 v[0:1], v1, off
651 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
652 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
653 ; GFX11-NEXT: s_nop 0
654 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
655 ; GFX11-NEXT: s_endpgm
657 ; GFX12-LABEL: v_omod_mul4_multi_use_f32:
659 ; GFX12-NEXT: v_add_f32_e32 v0, 1.0, v0
660 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
661 ; GFX12-NEXT: v_mul_f32_e32 v1, 4.0, v0
662 ; GFX12-NEXT: global_store_b32 v[0:1], v1, off
663 ; GFX12-NEXT: s_wait_storecnt 0x0
664 ; GFX12-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
665 ; GFX12-NEXT: s_wait_storecnt 0x0
666 ; GFX12-NEXT: s_nop 0
667 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
668 ; GFX12-NEXT: s_endpgm
669 %add = fadd float %a, 1.0
670 %div2 = fmul float %add, 4.0
671 store float %div2, ptr addrspace(1) undef
672 store volatile float %add, ptr addrspace(1) undef
676 define amdgpu_ps void @v_omod_mul4_dbg_use_f32(float %a) #0 {
677 ; SI-LABEL: v_omod_mul4_dbg_use_f32:
679 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4
680 ; SI-NEXT: s_mov_b32 s3, 0xf000
681 ; SI-NEXT: s_mov_b32 s2, -1
682 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
685 ; VI-LABEL: v_omod_mul4_dbg_use_f32:
687 ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4
688 ; VI-NEXT: flat_store_dword v[0:1], v0
691 ; GFX11PLUS-LABEL: v_omod_mul4_dbg_use_f32:
692 ; GFX11PLUS: ; %bb.0:
693 ; GFX11PLUS-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4
694 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off
695 ; GFX11PLUS-NEXT: s_nop 0
696 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
697 ; GFX11PLUS-NEXT: s_endpgm
698 %add = fadd float %a, 1.0
699 call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
700 %div2 = fmul float %add, 4.0
701 store float %div2, ptr addrspace(1) undef
705 ; Clamp is applied after omod, so folding both into a single instruction is OK.
706 define amdgpu_ps void @v_clamp_omod_div2_f32(float %a) #0 {
707 ; SI-LABEL: v_clamp_omod_div2_f32:
709 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp div:2
710 ; SI-NEXT: s_mov_b32 s3, 0xf000
711 ; SI-NEXT: s_mov_b32 s2, -1
712 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
715 ; VI-LABEL: v_clamp_omod_div2_f32:
717 ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp div:2
718 ; VI-NEXT: flat_store_dword v[0:1], v0
721 ; GFX11PLUS-LABEL: v_clamp_omod_div2_f32:
722 ; GFX11PLUS: ; %bb.0:
723 ; GFX11PLUS-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp div:2
724 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off
725 ; GFX11PLUS-NEXT: s_nop 0
726 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
727 ; GFX11PLUS-NEXT: s_endpgm
728 %add = fadd float %a, 1.0
729 %div2 = fmul float %add, 0.5
731 %max = call float @llvm.maxnum.f32(float %div2, float 0.0)
732 %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
733 store float %clamp, ptr addrspace(1) undef
737 ; Cannot fold omod into an instruction that clamps when the multiply follows the clamp, since clamp is applied after omod.
738 define amdgpu_ps void @v_omod_div2_clamp_f32(float %a) #0 {
739 ; SI-LABEL: v_omod_div2_clamp_f32:
741 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp
742 ; SI-NEXT: v_mul_f32_e32 v0, 0.5, v0
743 ; SI-NEXT: s_mov_b32 s3, 0xf000
744 ; SI-NEXT: s_mov_b32 s2, -1
745 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
748 ; VI-LABEL: v_omod_div2_clamp_f32:
750 ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp
751 ; VI-NEXT: v_mul_f32_e32 v0, 0.5, v0
752 ; VI-NEXT: flat_store_dword v[0:1], v0
755 ; GFX11PLUS-LABEL: v_omod_div2_clamp_f32:
756 ; GFX11PLUS: ; %bb.0:
757 ; GFX11PLUS-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp
758 ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1)
759 ; GFX11PLUS-NEXT: v_mul_f32_e32 v0, 0.5, v0
760 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off
761 ; GFX11PLUS-NEXT: s_nop 0
762 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
763 ; GFX11PLUS-NEXT: s_endpgm
764 %add = fadd float %a, 1.0
765 %max = call float @llvm.maxnum.f32(float %add, float 0.0)
766 %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
767 %div2 = fmul float %clamp, 0.5
768 store float %div2, ptr addrspace(1) undef
772 define amdgpu_ps void @v_omod_div2_abs_src_f32(float %a) #0 {
773 ; SI-LABEL: v_omod_div2_abs_src_f32:
775 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
776 ; SI-NEXT: v_mul_f32_e64 v0, |v0|, 0.5
777 ; SI-NEXT: s_mov_b32 s3, 0xf000
778 ; SI-NEXT: s_mov_b32 s2, -1
779 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
782 ; VI-LABEL: v_omod_div2_abs_src_f32:
784 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
785 ; VI-NEXT: v_mul_f32_e64 v0, |v0|, 0.5
786 ; VI-NEXT: flat_store_dword v[0:1], v0
789 ; GFX11PLUS-LABEL: v_omod_div2_abs_src_f32:
790 ; GFX11PLUS: ; %bb.0:
791 ; GFX11PLUS-NEXT: v_add_f32_e32 v0, 1.0, v0
792 ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1)
793 ; GFX11PLUS-NEXT: v_mul_f32_e64 v0, |v0|, 0.5
794 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off
795 ; GFX11PLUS-NEXT: s_nop 0
796 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
797 ; GFX11PLUS-NEXT: s_endpgm
798 %add = fadd float %a, 1.0
799 %abs.add = call float @llvm.fabs.f32(float %add)
800 %div2 = fmul float %abs.add, 0.5
801 store float %div2, ptr addrspace(1) undef
805 define amdgpu_ps void @v_omod_add_self_clamp_f32(float %a) #0 {
806 ; SI-LABEL: v_omod_add_self_clamp_f32:
808 ; SI-NEXT: v_add_f32_e64 v0, v0, v0 clamp
809 ; SI-NEXT: s_mov_b32 s3, 0xf000
810 ; SI-NEXT: s_mov_b32 s2, -1
811 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
814 ; VI-LABEL: v_omod_add_self_clamp_f32:
816 ; VI-NEXT: v_add_f32_e64 v0, v0, v0 clamp
817 ; VI-NEXT: flat_store_dword v[0:1], v0
820 ; GFX11PLUS-LABEL: v_omod_add_self_clamp_f32:
821 ; GFX11PLUS: ; %bb.0:
822 ; GFX11PLUS-NEXT: v_add_f32_e64 v0, v0, v0 clamp
823 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off
824 ; GFX11PLUS-NEXT: s_nop 0
825 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
826 ; GFX11PLUS-NEXT: s_endpgm
827 %add = fadd float %a, %a
828 %max = call float @llvm.maxnum.f32(float %add, float 0.0)
829 %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
830 store float %clamp, ptr addrspace(1) undef
834 define amdgpu_ps void @v_omod_add_clamp_self_f32(float %a) #0 {
835 ; SI-LABEL: v_omod_add_clamp_self_f32:
837 ; SI-NEXT: v_max_f32_e64 v0, v0, v0 clamp
838 ; SI-NEXT: v_add_f32_e32 v0, v0, v0
839 ; SI-NEXT: s_mov_b32 s3, 0xf000
840 ; SI-NEXT: s_mov_b32 s2, -1
841 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
844 ; VI-LABEL: v_omod_add_clamp_self_f32:
846 ; VI-NEXT: v_max_f32_e64 v0, v0, v0 clamp
847 ; VI-NEXT: v_add_f32_e32 v0, v0, v0
848 ; VI-NEXT: flat_store_dword v[0:1], v0
851 ; GFX11-LABEL: v_omod_add_clamp_self_f32:
853 ; GFX11-NEXT: v_max_f32_e64 v0, v0, v0 clamp
854 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
855 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v0
856 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
857 ; GFX11-NEXT: s_nop 0
858 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
859 ; GFX11-NEXT: s_endpgm
861 ; GFX12-LABEL: v_omod_add_clamp_self_f32:
863 ; GFX12-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp
864 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
865 ; GFX12-NEXT: v_add_f32_e32 v0, v0, v0
866 ; GFX12-NEXT: global_store_b32 v[0:1], v0, off
867 ; GFX12-NEXT: s_nop 0
868 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
869 ; GFX12-NEXT: s_endpgm
870 %max = call float @llvm.maxnum.f32(float %a, float 0.0)
871 %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
872 %add = fadd float %clamp, %clamp
873 store float %add, ptr addrspace(1) undef
877 define amdgpu_ps void @v_omod_add_abs_self_f32(float %a) #0 {
878 ; SI-LABEL: v_omod_add_abs_self_f32:
880 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
881 ; SI-NEXT: v_add_f32_e64 v0, |v0|, |v0|
882 ; SI-NEXT: s_mov_b32 s3, 0xf000
883 ; SI-NEXT: s_mov_b32 s2, -1
884 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
887 ; VI-LABEL: v_omod_add_abs_self_f32:
889 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
890 ; VI-NEXT: v_add_f32_e64 v0, |v0|, |v0|
891 ; VI-NEXT: flat_store_dword v[0:1], v0
894 ; GFX11PLUS-LABEL: v_omod_add_abs_self_f32:
895 ; GFX11PLUS: ; %bb.0:
896 ; GFX11PLUS-NEXT: v_add_f32_e32 v0, 1.0, v0
897 ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1)
898 ; GFX11PLUS-NEXT: v_add_f32_e64 v0, |v0|, |v0|
899 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off
900 ; GFX11PLUS-NEXT: s_nop 0
901 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
902 ; GFX11PLUS-NEXT: s_endpgm
903 %x = fadd float %a, 1.0
904 %abs.x = call float @llvm.fabs.f32(float %x)
905 %add = fadd float %abs.x, %abs.x
906 store float %add, ptr addrspace(1) undef
; fabs(x) + x, where x = a + 1.0. The operands differ only by the abs
; modifier; the expected output keeps a separate v_add_f32 with |v0| on the
; first source operand and no output modifier on the preceding add.
910 define amdgpu_ps void @v_omod_add_abs_x_x_f32(float %a) #0 {
911 ; SI-LABEL: v_omod_add_abs_x_x_f32:
913 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
914 ; SI-NEXT: v_add_f32_e64 v0, |v0|, v0
915 ; SI-NEXT: s_mov_b32 s3, 0xf000
916 ; SI-NEXT: s_mov_b32 s2, -1
917 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
920 ; VI-LABEL: v_omod_add_abs_x_x_f32:
922 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
923 ; VI-NEXT: v_add_f32_e64 v0, |v0|, v0
924 ; VI-NEXT: flat_store_dword v[0:1], v0
927 ; GFX11PLUS-LABEL: v_omod_add_abs_x_x_f32:
928 ; GFX11PLUS: ; %bb.0:
929 ; GFX11PLUS-NEXT: v_add_f32_e32 v0, 1.0, v0
930 ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1)
931 ; GFX11PLUS-NEXT: v_add_f32_e64 v0, |v0|, v0
932 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off
933 ; GFX11PLUS-NEXT: s_nop 0
934 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
935 ; GFX11PLUS-NEXT: s_endpgm
936 %x = fadd float %a, 1.0
937 %abs.x = call float @llvm.fabs.f32(float %x)
938 %add = fadd float %abs.x, %x
939 store float %add, ptr addrspace(1) undef
; x + fabs(x), where x = a + 1.0: the commuted form of
; v_omod_add_abs_x_x_f32. The expected output keeps a separate v_add_f32 with
; |v0| on the second source operand and no output modifier on the first add.
943 define amdgpu_ps void @v_omod_add_x_abs_x_f32(float %a) #0 {
944 ; SI-LABEL: v_omod_add_x_abs_x_f32:
946 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
947 ; SI-NEXT: v_add_f32_e64 v0, v0, |v0|
948 ; SI-NEXT: s_mov_b32 s3, 0xf000
949 ; SI-NEXT: s_mov_b32 s2, -1
950 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
953 ; VI-LABEL: v_omod_add_x_abs_x_f32:
955 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
956 ; VI-NEXT: v_add_f32_e64 v0, v0, |v0|
957 ; VI-NEXT: flat_store_dword v[0:1], v0
960 ; GFX11PLUS-LABEL: v_omod_add_x_abs_x_f32:
961 ; GFX11PLUS: ; %bb.0:
962 ; GFX11PLUS-NEXT: v_add_f32_e32 v0, 1.0, v0
963 ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1)
964 ; GFX11PLUS-NEXT: v_add_f32_e64 v0, v0, |v0|
965 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off
966 ; GFX11PLUS-NEXT: s_nop 0
967 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
968 ; GFX11PLUS-NEXT: s_endpgm
969 %x = fadd float %a, 1.0
970 %abs.x = call float @llvm.fabs.f32(float %x)
971 %add = fadd float %x, %abs.x
972 store float %add, ptr addrspace(1) undef
976 ; Don't fold a second omod into an instruction that already has an omod.
; Two consecutive multiplies by 0.5: only the first becomes div:2 on the add;
; the second is expected to remain a separate v_mul_f32.
977 define amdgpu_ps void @v_omod_div2_omod_div2_f32(float %a) #0 {
978 ; SI-LABEL: v_omod_div2_omod_div2_f32:
980 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2
981 ; SI-NEXT: v_mul_f32_e32 v0, 0.5, v0
982 ; SI-NEXT: s_mov_b32 s3, 0xf000
983 ; SI-NEXT: s_mov_b32 s2, -1
984 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
987 ; VI-LABEL: v_omod_div2_omod_div2_f32:
989 ; VI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2
990 ; VI-NEXT: v_mul_f32_e32 v0, 0.5, v0
991 ; VI-NEXT: flat_store_dword v[0:1], v0
994 ; GFX11PLUS-LABEL: v_omod_div2_omod_div2_f32:
995 ; GFX11PLUS: ; %bb.0:
996 ; GFX11PLUS-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2
997 ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1)
998 ; GFX11PLUS-NEXT: v_mul_f32_e32 v0, 0.5, v0
999 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off
1000 ; GFX11PLUS-NEXT: s_nop 0
1001 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1002 ; GFX11PLUS-NEXT: s_endpgm
1003 %add = fadd float %a, 1.0
1004 %div2.0 = fmul float %add, 0.5
1005 %div2.1 = fmul float %div2.0, 0.5
1006 store float %div2.1, ptr addrspace(1) undef
1010 ; Don't fold omod if f32 denormals are enabled.
; Attribute group #2 sets "denormal-fp-math-f32" to "ieee,ieee"; the multiply
; by 0.5 is expected to stay a separate v_mul_f32 instead of becoming div:2.
1011 define amdgpu_ps void @v_omod_div2_f32_denormals(float %a) #2 {
1012 ; SI-LABEL: v_omod_div2_f32_denormals:
1014 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
1015 ; SI-NEXT: v_mul_f32_e32 v0, 0.5, v0
1016 ; SI-NEXT: s_mov_b32 s3, 0xf000
1017 ; SI-NEXT: s_mov_b32 s2, -1
1018 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1021 ; VI-LABEL: v_omod_div2_f32_denormals:
1023 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
1024 ; VI-NEXT: v_mul_f32_e32 v0, 0.5, v0
1025 ; VI-NEXT: flat_store_dword v[0:1], v0
1028 ; GFX11PLUS-LABEL: v_omod_div2_f32_denormals:
1029 ; GFX11PLUS: ; %bb.0:
1030 ; GFX11PLUS-NEXT: v_add_f32_e32 v0, 1.0, v0
1031 ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1)
1032 ; GFX11PLUS-NEXT: v_mul_f32_e32 v0, 0.5, v0
1033 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off
1034 ; GFX11PLUS-NEXT: s_nop 0
1035 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1036 ; GFX11PLUS-NEXT: s_endpgm
1037 %add = fadd float %a, 1.0
1038 %div2 = fmul float %add, 0.5
1039 store float %div2, ptr addrspace(1) undef
1043 ; Don't fold omod if f64 denormals are enabled.
; Attribute group #6 sets "denormal-fp-math" to "ieee,ieee"; the multiply by
; 0.5 is expected to stay a separate v_mul_f64. GFX11 and GFX12 use separate
; check prefixes here because the expected instruction forms differ
; (v_add_f64/v_mul_f64 vs. v_add_f64_e32/v_mul_f64_e32).
1044 define amdgpu_ps void @v_omod_div2_f64_denormals(double %a) #6 {
1045 ; SI-LABEL: v_omod_div2_f64_denormals:
1047 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
1048 ; SI-NEXT: s_mov_b32 s3, 0xf000
1049 ; SI-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
1050 ; SI-NEXT: s_mov_b32 s2, -1
1051 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1054 ; VI-LABEL: v_omod_div2_f64_denormals:
1056 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
1057 ; VI-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
1058 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
1061 ; GFX11-LABEL: v_omod_div2_f64_denormals:
1063 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
1064 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1065 ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
1066 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
1067 ; GFX11-NEXT: s_nop 0
1068 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1069 ; GFX11-NEXT: s_endpgm
1071 ; GFX12-LABEL: v_omod_div2_f64_denormals:
1073 ; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1]
1074 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1075 ; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1]
1076 ; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
1077 ; GFX12-NEXT: s_nop 0
1078 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1079 ; GFX12-NEXT: s_endpgm
1080 %add = fadd double %a, 1.0
1081 %div2 = fmul double %add, 0.5
1082 store double %div2, ptr addrspace(1) undef
1086 ; Don't fold omod if f32 denormals are enabled (add form of multiply-by-2).
; The doubling is written as x + x rather than x * 2.0. With attribute group
; #2 ("denormal-fp-math-f32"="ieee,ieee") it is expected to stay a separate
; v_add_f32 v0, v0, v0 instead of becoming mul:2 on the first add.
1087 define amdgpu_ps void @v_omod_mul2_f32_denormals(float %a) #2 {
1088 ; SI-LABEL: v_omod_mul2_f32_denormals:
1090 ; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
1091 ; SI-NEXT: v_add_f32_e32 v0, v0, v0
1092 ; SI-NEXT: s_mov_b32 s3, 0xf000
1093 ; SI-NEXT: s_mov_b32 s2, -1
1094 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1097 ; VI-LABEL: v_omod_mul2_f32_denormals:
1099 ; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
1100 ; VI-NEXT: v_add_f32_e32 v0, v0, v0
1101 ; VI-NEXT: flat_store_dword v[0:1], v0
1104 ; GFX11PLUS-LABEL: v_omod_mul2_f32_denormals:
1105 ; GFX11PLUS: ; %bb.0:
1106 ; GFX11PLUS-NEXT: v_add_f32_e32 v0, 1.0, v0
1107 ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1)
1108 ; GFX11PLUS-NEXT: v_add_f32_e32 v0, v0, v0
1109 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off
1110 ; GFX11PLUS-NEXT: s_nop 0
1111 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1112 ; GFX11PLUS-NEXT: s_endpgm
1113 %add = fadd float %a, 1.0
1114 %mul2 = fadd float %add, %add
1115 store float %mul2, ptr addrspace(1) undef
1119 ; Don't fold omod if f64 denormals are enabled (add form of multiply-by-2).
; The doubling is written as x + x; it is expected to stay a separate
; v_add_f64 instead of becoming mul:2 on the first add.
; NOTE(review): this function uses attribute group #2, which only sets
; "denormal-fp-math-f32"; presumably the test relies on the default f64
; denormal mode. The sibling v_omod_div2_f64_denormals uses #6 instead; confirm
; whether #6 was intended here as well.
1120 define amdgpu_ps void @v_omod_mul2_f64_denormals(double %a) #2 {
1121 ; SI-LABEL: v_omod_mul2_f64_denormals:
1123 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
1124 ; SI-NEXT: s_mov_b32 s3, 0xf000
1125 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[0:1]
1126 ; SI-NEXT: s_mov_b32 s2, -1
1127 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1130 ; VI-LABEL: v_omod_mul2_f64_denormals:
1132 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
1133 ; VI-NEXT: v_add_f64 v[0:1], v[0:1], v[0:1]
1134 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
1137 ; GFX11-LABEL: v_omod_mul2_f64_denormals:
1139 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
1140 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1141 ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[0:1]
1142 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
1143 ; GFX11-NEXT: s_nop 0
1144 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1145 ; GFX11-NEXT: s_endpgm
1147 ; GFX12-LABEL: v_omod_mul2_f64_denormals:
1149 ; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1]
1150 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1151 ; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[0:1]
1152 ; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
1153 ; GFX12-NEXT: s_nop 0
1154 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1155 ; GFX12-NEXT: s_endpgm
1156 %add = fadd double %a, 1.0
1157 %mul2 = fadd double %add, %add
1158 store double %mul2, ptr addrspace(1) undef
1162 ; Don't fold omod if f16 denormals are enabled.
; Attribute group #0 only sets the f32 denormal mode ("denormal-fp-math-f32")
; to preserve-sign. VI and GFX11+ are expected to keep a separate v_mul_f16.
; The SI checks instead show the operation done in f32 (v_cvt_f32_f16 /
; v_cvt_f16_f32 around the add), and there div:2 is folded into v_add_f32.
1163 define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 {
1164 ; SI-LABEL: v_omod_div2_f16_denormals:
1166 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1167 ; SI-NEXT: s_mov_b32 s3, 0xf000
1168 ; SI-NEXT: s_mov_b32 s2, -1
1169 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1170 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2
1171 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1172 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1175 ; VI-LABEL: v_omod_div2_f16_denormals:
1177 ; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
1178 ; VI-NEXT: v_mul_f16_e32 v0, 0.5, v0
1179 ; VI-NEXT: flat_store_short v[0:1], v0
1182 ; GFX11PLUS-LABEL: v_omod_div2_f16_denormals:
1183 ; GFX11PLUS: ; %bb.0:
1184 ; GFX11PLUS-NEXT: v_add_f16_e32 v0, 1.0, v0
1185 ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1)
1186 ; GFX11PLUS-NEXT: v_mul_f16_e32 v0, 0.5, v0
1187 ; GFX11PLUS-NEXT: global_store_b16 v[0:1], v0, off
1188 ; GFX11PLUS-NEXT: s_nop 0
1189 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1190 ; GFX11PLUS-NEXT: s_endpgm
1191 %add = fadd half %a, 1.0
1192 %div2 = fmul half %add, 0.5
1193 store half %div2, ptr addrspace(1) undef
1197 ; Don't fold omod if f16 denormals are enabled (add form of multiply-by-2).
; The doubling is written as x + x. Attribute group #0 only sets the f32
; denormal mode to preserve-sign. VI and GFX11+ are expected to keep a
; separate v_add_f16 v0, v0, v0. The SI checks instead show the operation done
; in f32 between conversions, and there mul:2 is folded into v_add_f32.
1198 define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 {
1199 ; SI-LABEL: v_omod_mul2_f16_denormals:
1201 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1202 ; SI-NEXT: s_mov_b32 s3, 0xf000
1203 ; SI-NEXT: s_mov_b32 s2, -1
1204 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1205 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:2
1206 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1207 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1210 ; VI-LABEL: v_omod_mul2_f16_denormals:
1212 ; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
1213 ; VI-NEXT: v_add_f16_e32 v0, v0, v0
1214 ; VI-NEXT: flat_store_short v[0:1], v0
1217 ; GFX11PLUS-LABEL: v_omod_mul2_f16_denormals:
1218 ; GFX11PLUS: ; %bb.0:
1219 ; GFX11PLUS-NEXT: v_add_f16_e32 v0, 1.0, v0
1220 ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1)
1221 ; GFX11PLUS-NEXT: v_add_f16_e32 v0, v0, v0
1222 ; GFX11PLUS-NEXT: global_store_b16 v[0:1], v0, off
1223 ; GFX11PLUS-NEXT: s_nop 0
1224 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1225 ; GFX11PLUS-NEXT: s_endpgm
1226 %add = fadd half %a, 1.0
1227 %mul2 = fadd half %add, %add
1228 store half %mul2, ptr addrspace(1) undef
; Positive f16 case: attribute group #3 sets "denormal-fp-math" to
; preserve-sign, and the multiply by 0.5 is expected to fold as div:2 into
; v_add_f16 on VI and GFX11+. On SI the checks show the add done in f32
; between conversions, with div:2 on v_add_f32.
1232 define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 {
1233 ; SI-LABEL: v_omod_div2_f16_no_denormals:
1235 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1236 ; SI-NEXT: s_mov_b32 s3, 0xf000
1237 ; SI-NEXT: s_mov_b32 s2, -1
1238 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1239 ; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2
1240 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1241 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1244 ; VI-LABEL: v_omod_div2_f16_no_denormals:
1246 ; VI-NEXT: v_add_f16_e64 v0, v0, 1.0 div:2
1247 ; VI-NEXT: flat_store_short v[0:1], v0
1250 ; GFX11PLUS-LABEL: v_omod_div2_f16_no_denormals:
1251 ; GFX11PLUS: ; %bb.0:
1252 ; GFX11PLUS-NEXT: v_add_f16_e64 v0, v0, 1.0 div:2
1253 ; GFX11PLUS-NEXT: global_store_b16 v[0:1], v0, off
1254 ; GFX11PLUS-NEXT: s_nop 0
1255 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1256 ; GFX11PLUS-NEXT: s_endpgm
1257 %add = fadd half %a, 1.0
1258 %div2 = fmul half %add, 0.5
1259 store half %div2, ptr addrspace(1) undef
; Computes ((a * a + b) * 2.0) * b. The multiply by 2.0 is expected to fold as
; a mul:2 output modifier: SI and VI expect a single v_mad_f32 ... mul:2, while
; GFX11+ expects a separate v_mul_f32 followed by v_add_f32_e64 ... mul:2.
; NOTE(review): judging by the name, this presumably exercises the case where
; a mac-form instruction has to be emitted as mad to carry the omod; confirm
; against the folding pass.
1263 define amdgpu_ps void @v_omod_mac_to_mad(float %b, float %a) #0 {
1264 ; SI-LABEL: v_omod_mac_to_mad:
1266 ; SI-NEXT: v_mad_f32 v1, v1, v1, v0 mul:2
1267 ; SI-NEXT: v_mul_f32_e32 v0, v1, v0
1268 ; SI-NEXT: s_mov_b32 s3, 0xf000
1269 ; SI-NEXT: s_mov_b32 s2, -1
1270 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1273 ; VI-LABEL: v_omod_mac_to_mad:
1275 ; VI-NEXT: v_mad_f32 v1, v1, v1, v0 mul:2
1276 ; VI-NEXT: v_mul_f32_e32 v0, v1, v0
1277 ; VI-NEXT: flat_store_dword v[0:1], v0
1280 ; GFX11PLUS-LABEL: v_omod_mac_to_mad:
1281 ; GFX11PLUS: ; %bb.0:
1282 ; GFX11PLUS-NEXT: v_mul_f32_e32 v1, v1, v1
1283 ; GFX11PLUS-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1284 ; GFX11PLUS-NEXT: v_add_f32_e64 v1, v1, v0 mul:2
1285 ; GFX11PLUS-NEXT: v_mul_f32_e32 v0, v1, v0
1286 ; GFX11PLUS-NEXT: global_store_b32 v[0:1], v0, off
1287 ; GFX11PLUS-NEXT: s_nop 0
1288 ; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1289 ; GFX11PLUS-NEXT: s_endpgm
1290 %mul = fmul float %a, %a
1291 %add = fadd float %mul, %b
1292 %mad = fmul float %add, 2.0
1293 %res = fmul float %mad, %b
1294 store float %res, ptr addrspace(1) undef
; Intrinsic declarations. All of them carry attribute group #1
; (nounwind readnone).
1298 declare i32 @llvm.amdgcn.workitem.id.x() #1
1299 declare float @llvm.fabs.f32(float) #1
1300 declare float @llvm.floor.f32(float) #1
1301 declare float @llvm.minnum.f32(float, float) #1
1302 declare float @llvm.maxnum.f32(float, float) #1
1303 declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
1304 declare double @llvm.fabs.f64(double) #1
1305 declare double @llvm.minnum.f64(double, double) #1
1306 declare double @llvm.maxnum.f64(double, double) #1
1307 declare half @llvm.fabs.f16(half) #1
1308 declare half @llvm.minnum.f16(half, half) #1
1309 declare half @llvm.maxnum.f16(half, half) #1
; NOTE(review): this is the four-operand llvm.dbg.value signature with an i64
; second operand; presumably kept deliberately and handled by auto-upgrade,
; confirm before changing.
1310 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
; Attribute groups selecting the FP environment for each test function.
; #0: f32 denormal mode preserve-sign; signed zeros may be ignored.
1312 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" }
; #1: used on the intrinsic declarations.
1313 attributes #1 = { nounwind readnone }
; #2: f32 denormal mode ieee; signed zeros may be ignored.
1314 attributes #2 = { nounwind "denormal-fp-math-f32"="ieee,ieee" "no-signed-zeros-fp-math"="true" }
; #3: denormal mode preserve-sign via "denormal-fp-math" (not the f32-specific
; attribute); signed zeros may be ignored.
1315 attributes #3 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" }
; #4: signed zeros must be honored; no denormal mode specified.
1316 attributes #4 = { nounwind "no-signed-zeros-fp-math"="false" }
; #5: denormal mode preserve-sign; no "no-signed-zeros-fp-math" attribute.
1317 attributes #5 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
; #6: denormal mode ieee via "denormal-fp-math"; signed zeros may be ignored.
1318 attributes #6 = { nounwind "denormal-fp-math"="ieee,ieee" "no-signed-zeros-fp-math"="true" }
; Debug-info metadata: named module-level nodes first, then the compile unit,
; file, module flags, local variable, subprogram, subroutine type, basic type,
; empty expression and source location.
1320 !llvm.dbg.cu = !{!0}
1321 !llvm.module.flags = !{!2, !3}
1323 !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
1324 !1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
1325 !2 = !{i32 2, !"Dwarf Version", i32 4}
1326 !3 = !{i32 2, !"Debug Info Version", i32 3}
1327 !4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)
1328 !5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
1329 !6 = !DISubroutineType(types: !7)
1331 !8 = !DIBasicType(name: "float", size: 32, align: 32)
1332 !9 = !DIExpression()
1333 !10 = !DILocation(line: 1, column: 42, scope: !5)