llvm/test/CodeGen/AMDGPU/omod.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
   3 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefixes=VI %s
   4 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
   5
   6 ; IEEE bit enabled for compute kernel, so omod shouldn't be used.
   7 define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 {
   8 ; SI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
   9 ; SI:       ; %bb.0:
  10 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
  11 ; SI-NEXT:    s_mov_b32 s7, 0xf000
  12 ; SI-NEXT:    s_mov_b32 s6, 0
  13 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
  14 ; SI-NEXT:    v_mov_b32_e32 v1, 0
  15 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
  16 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
  17 ; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
  18 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
  19 ; SI-NEXT:    s_waitcnt vmcnt(0)
  20 ; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
  21 ; SI-NEXT:    v_mul_f32_e32 v2, 0.5, v2
  22 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
  23 ; SI-NEXT:    s_endpgm
  24 ;
  25 ; VI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
  26 ; VI:       ; %bb.0:
  27 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
  28 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
  29 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
  30 ; VI-NEXT:    v_mov_b32_e32 v1, s3
  31 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
  32 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
  33 ; VI-NEXT:    flat_load_dword v3, v[0:1]
  34 ; VI-NEXT:    v_mov_b32_e32 v1, s1
  35 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
  36 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
  37 ; VI-NEXT:    s_waitcnt vmcnt(0)
  38 ; VI-NEXT:    v_add_f32_e32 v2, 1.0, v3
  39 ; VI-NEXT:    v_mul_f32_e32 v2, 0.5, v2
  40 ; VI-NEXT:    flat_store_dword v[0:1], v2
  41 ; VI-NEXT:    s_endpgm
  42 ;
  43 ; GFX11-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
  44 ; GFX11:       ; %bb.0:
  45 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
  46 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
  47 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
  48 ; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
  49 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
  50 ; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
  51 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
  52 ; GFX11-NEXT:    v_mul_f32_e32 v1, 0.5, v1
  53 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
  54 ; GFX11-NEXT:    s_nop 0
  55 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
  56 ; GFX11-NEXT:    s_endpgm
       ; Loads the float at index workitem.id.x of %aptr, computes (a + 1.0) * 0.5
       ; and stores it at the same index of %out. The checks for all three targets
       ; expect a separate v_add_f32 and v_mul_f32, i.e. no div:2 output modifier.
       ; NOTE(review): attribute group #4 is defined outside this excerpt.
  57   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  58   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
  59   %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
  60   %a = load float, ptr addrspace(1) %gep0
  61   %add = fadd float %a, 1.0
  62   %div2 = fmul float %add, 0.5
  63   store float %div2, ptr addrspace(1) %out.gep
  64   ret void
  65 }
  66
  67 ; IEEE bit enabled for compute kernel, so omod shouldn't be used.
  68 define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 {
  69 ; SI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
  70 ; SI:       ; %bb.0:
  71 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
  72 ; SI-NEXT:    s_mov_b32 s7, 0xf000
  73 ; SI-NEXT:    s_mov_b32 s6, 0
  74 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
  75 ; SI-NEXT:    v_mov_b32_e32 v1, 0
  76 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
  77 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
  78 ; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
  79 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
  80 ; SI-NEXT:    s_waitcnt vmcnt(0)
  81 ; SI-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
  82 ; SI-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
  83 ; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
  84 ; SI-NEXT:    s_endpgm
  85 ;
  86 ; VI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
  87 ; VI:       ; %bb.0:
  88 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
  89 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
  90 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
  91 ; VI-NEXT:    v_mov_b32_e32 v1, s3
  92 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
  93 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
  94 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
  95 ; VI-NEXT:    v_mov_b32_e32 v3, s1
  96 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
  97 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
  98 ; VI-NEXT:    s_waitcnt vmcnt(0)
  99 ; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 100 ; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
 101 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 102 ; VI-NEXT:    s_endpgm
 103 ;
 104 ; GFX11-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
 105 ; GFX11:       ; %bb.0:
 106 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 107 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 108 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 109 ; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
 110 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 111 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 112 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 113 ; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
 114 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 115 ; GFX11-NEXT:    s_nop 0
 116 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 117 ; GFX11-NEXT:    s_endpgm
       ; Double-precision variant: loads the double at index workitem.id.x of
       ; %aptr, computes (a + 1.0) * 0.5 and stores it at the same index of %out.
       ; The checks for all three targets expect a separate v_add_f64 and
       ; v_mul_f64, i.e. no div:2 output modifier.
       ; NOTE(review): attribute group #4 is defined outside this excerpt.
 118   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 119   %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
 120   %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
 121   %a = load double, ptr addrspace(1) %gep0
 122   %add = fadd double %a, 1.0
 123   %div2 = fmul double %add, 0.5
 124   store double %div2, ptr addrspace(1) %out.gep
 125   ret void
 126 }
 127
 128 ; IEEE bit enabled for compute kernel, so omod shouldn't be used even though nsz is allowed.
 129 define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
 130 ; SI-LABEL: v_omod_div2_f32_enable_ieee_nsz:
 131 ; SI:       ; %bb.0:
 132 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 133 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 134 ; SI-NEXT:    s_mov_b32 s6, 0
 135 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 136 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 137 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 138 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 139 ; SI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 140 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 141 ; SI-NEXT:    s_waitcnt vmcnt(0)
 142 ; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
 143 ; SI-NEXT:    v_mul_f32_e32 v2, 0.5, v2
 144 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 145 ; SI-NEXT:    s_endpgm
 146 ;
 147 ; VI-LABEL: v_omod_div2_f32_enable_ieee_nsz:
 148 ; VI:       ; %bb.0:
 149 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 150 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 151 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 152 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 153 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 154 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 155 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 156 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 157 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 158 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 159 ; VI-NEXT:    s_waitcnt vmcnt(0)
 160 ; VI-NEXT:    v_add_f32_e32 v2, 1.0, v3
 161 ; VI-NEXT:    v_mul_f32_e32 v2, 0.5, v2
 162 ; VI-NEXT:    flat_store_dword v[0:1], v2
 163 ; VI-NEXT:    s_endpgm
 164 ;
 165 ; GFX11-LABEL: v_omod_div2_f32_enable_ieee_nsz:
 166 ; GFX11:       ; %bb.0:
 167 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 168 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 169 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 170 ; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
 171 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 172 ; GFX11-NEXT:    v_add_f32_e32 v1, 1.0, v1
 173 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 174 ; GFX11-NEXT:    v_mul_f32_e32 v1, 0.5, v1
 175 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 176 ; GFX11-NEXT:    s_nop 0
 177 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 178 ; GFX11-NEXT:    s_endpgm
       ; Same (a + 1.0) * 0.5 float kernel as the signed-zeros variant, but built
       ; with attribute group #0. The checks for all three targets still expect a
       ; separate v_add_f32 and v_mul_f32, i.e. no div:2 output modifier.
       ; NOTE(review): attribute group #0 is defined outside this excerpt.
 179   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 180   %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid
 181   %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
 182   %a = load float, ptr addrspace(1) %gep0
 183   %add = fadd float %a, 1.0
 184   %div2 = fmul float %add, 0.5
 185   store float %div2, ptr addrspace(1) %out.gep
 186   ret void
 187 }
 188
 189 ; IEEE bit enabled for compute kernel, so omod shouldn't be used even though nsz is allowed.
 190 define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #5 {
 191 ; SI-LABEL: v_omod_div2_f64_enable_ieee_nsz:
 192 ; SI:       ; %bb.0:
 193 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 194 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 195 ; SI-NEXT:    s_mov_b32 s6, 0
 196 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 197 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 198 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 199 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 200 ; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
 201 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 202 ; SI-NEXT:    s_waitcnt vmcnt(0)
 203 ; SI-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
 204 ; SI-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
 205 ; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
 206 ; SI-NEXT:    s_endpgm
 207 ;
 208 ; VI-LABEL: v_omod_div2_f64_enable_ieee_nsz:
 209 ; VI:       ; %bb.0:
 210 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 211 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 212 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 213 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 214 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 215 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 216 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 217 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 218 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 219 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 220 ; VI-NEXT:    s_waitcnt vmcnt(0)
 221 ; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 222 ; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
 223 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 224 ; VI-NEXT:    s_endpgm
 225 ;
 226 ; GFX11-LABEL: v_omod_div2_f64_enable_ieee_nsz:
 227 ; GFX11:       ; %bb.0:
 228 ; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
 229 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 230 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 231 ; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
 232 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 233 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 234 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 235 ; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
 236 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 237 ; GFX11-NEXT:    s_nop 0
 238 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 239 ; GFX11-NEXT:    s_endpgm
       ; Double-precision (a + 1.0) * 0.5 kernel built with attribute group #5.
       ; The checks for all three targets expect a separate v_add_f64 and
       ; v_mul_f64, i.e. no div:2 output modifier.
       ; NOTE(review): attribute group #5 is defined outside this excerpt.
 240   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 241   %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid
 242   %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
 243   %a = load double, ptr addrspace(1) %gep0
 244   %add = fadd double %a, 1.0
 245   %div2 = fmul double %add, 0.5
 246   store double %div2, ptr addrspace(1) %out.gep
 247   ret void
 248 }
 249
 250 ; Signed zeros are significant here, so omod must not be used even without the IEEE bit.
 251 define amdgpu_ps void @v_omod_div2_f32_signed_zeros(float %a) #4 {
 252 ; SI-LABEL: v_omod_div2_f32_signed_zeros:
 253 ; SI:       ; %bb.0:
 254 ; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
 255 ; SI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
 256 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 257 ; SI-NEXT:    s_mov_b32 s2, -1
 258 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 259 ; SI-NEXT:    s_endpgm
 260 ;
 261 ; VI-LABEL: v_omod_div2_f32_signed_zeros:
 262 ; VI:       ; %bb.0:
 263 ; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
 264 ; VI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
 265 ; VI-NEXT:    flat_store_dword v[0:1], v0
 266 ; VI-NEXT:    s_endpgm
 267 ;
 268 ; GFX11-LABEL: v_omod_div2_f32_signed_zeros:
 269 ; GFX11:       ; %bb.0:
 270 ; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
 271 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 272 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0.5, v0
 273 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 274 ; GFX11-NEXT:    s_nop 0
 275 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 276 ; GFX11-NEXT:    s_endpgm
       ; Pixel-shader variant of (a + 1.0) * 0.5; the result is stored through an
       ; undef pointer only to keep it live. The checks for all three targets
       ; expect a separate v_add_f32 and v_mul_f32, i.e. no div:2 output modifier.
       ; NOTE(review): attribute group #4 is defined outside this excerpt.
 277   %add = fadd float %a, 1.0
 278   %div2 = fmul float %add, 0.5
 279   store float %div2, ptr addrspace(1) undef
 280   ret void
 281 }
 282
 283 ; Signed zeros are significant here, so omod must not be used even without the IEEE bit.
 284 define amdgpu_ps void @v_omod_div2_f64_signed_zeros(double %a) #4 {
 285 ; SI-LABEL: v_omod_div2_f64_signed_zeros:
 286 ; SI:       ; %bb.0:
 287 ; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 288 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 289 ; SI-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
 290 ; SI-NEXT:    s_mov_b32 s2, -1
 291 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 292 ; SI-NEXT:    s_endpgm
 293 ;
 294 ; VI-LABEL: v_omod_div2_f64_signed_zeros:
 295 ; VI:       ; %bb.0:
 296 ; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 297 ; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
 298 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
 299 ; VI-NEXT:    s_endpgm
 300 ;
 301 ; GFX11-LABEL: v_omod_div2_f64_signed_zeros:
 302 ; GFX11:       ; %bb.0:
 303 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 304 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 305 ; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
 306 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
 307 ; GFX11-NEXT:    s_nop 0
 308 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 309 ; GFX11-NEXT:    s_endpgm
       ; Double-precision pixel-shader variant of (a + 1.0) * 0.5; the result is
       ; stored through an undef pointer only to keep it live. The checks for all
       ; three targets expect a separate v_add_f64 and v_mul_f64 (no div:2).
       ; NOTE(review): attribute group #4 is defined outside this excerpt.
 310   %add = fadd double %a, 1.0
 311   %div2 = fmul double %add, 0.5
 312   store double %div2, ptr addrspace(1) undef
 313   ret void
 314 }
 315
 316 define amdgpu_ps void @v_omod_div2_f32(float %a) #0 {
 317 ; SI-LABEL: v_omod_div2_f32:
 318 ; SI:       ; %bb.0:
 319 ; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
 320 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 321 ; SI-NEXT:    s_mov_b32 s2, -1
 322 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 323 ; SI-NEXT:    s_endpgm
 324 ;
 325 ; VI-LABEL: v_omod_div2_f32:
 326 ; VI:       ; %bb.0:
 327 ; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
 328 ; VI-NEXT:    flat_store_dword v[0:1], v0
 329 ; VI-NEXT:    s_endpgm
 330 ;
 331 ; GFX11-LABEL: v_omod_div2_f32:
 332 ; GFX11:       ; %bb.0:
 333 ; GFX11-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
 334 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 335 ; GFX11-NEXT:    s_nop 0
 336 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 337 ; GFX11-NEXT:    s_endpgm
       ; Basic positive case: (a + 1.0) * 0.5 in a pixel shader. The checks for
       ; all three targets expect a single v_add_f32_e64 carrying the div:2
       ; output modifier and no separate multiply.
       ; NOTE(review): attribute group #0 is defined outside this excerpt.
 338   %add = fadd float %a, 1.0
 339   %div2 = fmul float %add, 0.5
 340   store float %div2, ptr addrspace(1) undef
 341   ret void
 342 }
 343
 344 define amdgpu_ps void @v_omod_div2_f64(double %a) #5 {
 345 ; SI-LABEL: v_omod_div2_f64:
 346 ; SI:       ; %bb.0:
 347 ; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 div:2
 348 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 349 ; SI-NEXT:    s_mov_b32 s2, -1
 350 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 351 ; SI-NEXT:    s_endpgm
 352 ;
 353 ; VI-LABEL: v_omod_div2_f64:
 354 ; VI:       ; %bb.0:
 355 ; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 div:2
 356 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
 357 ; VI-NEXT:    s_endpgm
 358 ;
 359 ; GFX11-LABEL: v_omod_div2_f64:
 360 ; GFX11:       ; %bb.0:
 361 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 div:2
 362 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
 363 ; GFX11-NEXT:    s_nop 0
 364 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 365 ; GFX11-NEXT:    s_endpgm
       ; Double-precision positive case: (a + 1.0) * 0.5 with the nsz flag on both
       ; the fadd and the fmul. The checks for all three targets expect a single
       ; v_add_f64 carrying the div:2 output modifier.
       ; NOTE(review): attribute group #5 is defined outside this excerpt.
 366   %add = fadd nsz double %a, 1.0
 367   %div2 = fmul nsz double %add, 0.5
 368   store double %div2, ptr addrspace(1) undef
 369   ret void
 370 }
 371
 372 define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 {
 373 ; SI-LABEL: v_omod_mul2_f32:
 374 ; SI:       ; %bb.0:
 375 ; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:2
 376 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 377 ; SI-NEXT:    s_mov_b32 s2, -1
 378 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 379 ; SI-NEXT:    s_endpgm
 380 ;
 381 ; VI-LABEL: v_omod_mul2_f32:
 382 ; VI:       ; %bb.0:
 383 ; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:2
 384 ; VI-NEXT:    flat_store_dword v[0:1], v0
 385 ; VI-NEXT:    s_endpgm
 386 ;
 387 ; GFX11-LABEL: v_omod_mul2_f32:
 388 ; GFX11:       ; %bb.0:
 389 ; GFX11-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:2
 390 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 391 ; GFX11-NEXT:    s_nop 0
 392 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 393 ; GFX11-NEXT:    s_endpgm
       ; (a + 1.0) * 2.0 in a pixel shader. The checks for all three targets
       ; expect a single v_add_f32_e64 carrying the mul:2 output modifier.
       ; The product is named %mul2 to match the operation; value names do not
       ; affect the generated code.
 394   %add = fadd float %a, 1.0
 395   %mul2 = fmul float %add, 2.0
 396   store float %mul2, ptr addrspace(1) undef
 397   ret void
 398 }
 399
 400 define amdgpu_ps void @v_omod_mul2_med3(float %x, float %y, float %z) #0 {
 401 ; SI-LABEL: v_omod_mul2_med3:
 402 ; SI:       ; %bb.0:
 403 ; SI-NEXT:    v_med3_f32 v0, v0, v1, v2 mul:2
 404 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 405 ; SI-NEXT:    s_mov_b32 s2, -1
 406 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 407 ; SI-NEXT:    s_endpgm
 408 ;
 409 ; VI-LABEL: v_omod_mul2_med3:
 410 ; VI:       ; %bb.0:
 411 ; VI-NEXT:    v_med3_f32 v0, v0, v1, v2 mul:2
 412 ; VI-NEXT:    flat_store_dword v[0:1], v0
 413 ; VI-NEXT:    s_endpgm
 414 ;
 415 ; GFX11-LABEL: v_omod_mul2_med3:
 416 ; GFX11:       ; %bb.0:
 417 ; GFX11-NEXT:    v_med3_f32 v0, v0, v1, v2 mul:2
 418 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 419 ; GFX11-NEXT:    s_nop 0
 420 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 421 ; GFX11-NEXT:    s_endpgm
       ; fmed3(x, y, z) * 2.0 in a pixel shader. The checks for all three targets
       ; expect a single v_med3_f32 carrying the mul:2 output modifier.
       ; The store uses the opaque 'ptr addrspace(1)' spelling like the rest of
       ; the file, and the product is named %mul2 to match the operation.
 422   %fmed3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float %z)
 423   %mul2 = fmul float %fmed3, 2.0
 424   store float %mul2, ptr addrspace(1) undef
 425   ret void
 426 }
 427
 428 define amdgpu_ps void @v_omod_mul2_f64(double %a) #5 {
 429 ; SI-LABEL: v_omod_mul2_f64:
 430 ; SI:       ; %bb.0:
 431 ; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 mul:2
 432 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 433 ; SI-NEXT:    s_mov_b32 s2, -1
 434 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 435 ; SI-NEXT:    s_endpgm
 436 ;
 437 ; VI-LABEL: v_omod_mul2_f64:
 438 ; VI:       ; %bb.0:
 439 ; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 mul:2
 440 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
 441 ; VI-NEXT:    s_endpgm
 442 ;
 443 ; GFX11-LABEL: v_omod_mul2_f64:
 444 ; GFX11:       ; %bb.0:
 445 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 mul:2
 446 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
 447 ; GFX11-NEXT:    s_nop 0
 448 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 449 ; GFX11-NEXT:    s_endpgm
       ; Double-precision (a + 1.0) * 2.0 with the nsz flag on both operations.
       ; The checks for all three targets expect a single v_add_f64 carrying the
       ; mul:2 output modifier. The product is named %mul2 to match the
       ; operation; value names do not affect the generated code.
 450   %add = fadd nsz double %a, 1.0
 451   %mul2 = fmul nsz double %add, 2.0
 452   store double %mul2, ptr addrspace(1) undef
 453   ret void
 454 }
 455
 456 define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 {
 457 ; SI-LABEL: v_omod_mul4_f32:
 458 ; SI:       ; %bb.0:
 459 ; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:4
 460 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 461 ; SI-NEXT:    s_mov_b32 s2, -1
 462 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 463 ; SI-NEXT:    s_endpgm
 464 ;
 465 ; VI-LABEL: v_omod_mul4_f32:
 466 ; VI:       ; %bb.0:
 467 ; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:4
 468 ; VI-NEXT:    flat_store_dword v[0:1], v0
 469 ; VI-NEXT:    s_endpgm
 470 ;
 471 ; GFX11-LABEL: v_omod_mul4_f32:
 472 ; GFX11:       ; %bb.0:
 473 ; GFX11-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:4
 474 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 475 ; GFX11-NEXT:    s_nop 0
 476 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 477 ; GFX11-NEXT:    s_endpgm
       ; (a + 1.0) * 4.0 in a pixel shader. The checks for all three targets
       ; expect a single v_add_f32_e64 carrying the mul:4 output modifier.
       ; The product is named %mul4 to match the operation; value names do not
       ; affect the generated code.
 478   %add = fadd float %a, 1.0
 479   %mul4 = fmul float %add, 4.0
 480   store float %mul4, ptr addrspace(1) undef
 481   ret void
 482 }
 483
 484 define amdgpu_ps void @v_omod_mul4_f64(double %a) #5 {
 485 ; SI-LABEL: v_omod_mul4_f64:
 486 ; SI:       ; %bb.0:
 487 ; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 mul:4
 488 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 489 ; SI-NEXT:    s_mov_b32 s2, -1
 490 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 491 ; SI-NEXT:    s_endpgm
 492 ;
 493 ; VI-LABEL: v_omod_mul4_f64:
 494 ; VI:       ; %bb.0:
 495 ; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 mul:4
 496 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
 497 ; VI-NEXT:    s_endpgm
 498 ;
 499 ; GFX11-LABEL: v_omod_mul4_f64:
 500 ; GFX11:       ; %bb.0:
 501 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0 mul:4
 502 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
 503 ; GFX11-NEXT:    s_nop 0
 504 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 505 ; GFX11-NEXT:    s_endpgm
       ; Double-precision (a + 1.0) * 4.0 with the nsz flag on both operations.
       ; The checks for all three targets expect a single v_add_f64 carrying the
       ; mul:4 output modifier. The product is named %mul4 to match the
       ; operation; value names do not affect the generated code.
 506   %add = fadd nsz double %a, 1.0
 507   %mul4 = fmul nsz double %add, 4.0
 508   store double %mul4, ptr addrspace(1) undef
 509   ret void
 510 }
 511
 512 define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {
 513 ; SI-LABEL: v_omod_mul4_multi_use_f32:
 514 ; SI:       ; %bb.0:
 515 ; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
 516 ; SI-NEXT:    v_mul_f32_e32 v1, 4.0, v0
 517 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 518 ; SI-NEXT:    s_mov_b32 s2, -1
 519 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 520 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 521 ; SI-NEXT:    s_waitcnt vmcnt(0)
 522 ; SI-NEXT:    s_endpgm
 523 ;
 524 ; VI-LABEL: v_omod_mul4_multi_use_f32:
 525 ; VI:       ; %bb.0:
 526 ; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
 527 ; VI-NEXT:    v_mul_f32_e32 v1, 4.0, v0
 528 ; VI-NEXT:    flat_store_dword v[0:1], v1
 529 ; VI-NEXT:    flat_store_dword v[0:1], v0
 530 ; VI-NEXT:    s_waitcnt vmcnt(0)
 531 ; VI-NEXT:    s_endpgm
 532 ;
 533 ; GFX11-LABEL: v_omod_mul4_multi_use_f32:
 534 ; GFX11:       ; %bb.0:
 535 ; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
 536 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 537 ; GFX11-NEXT:    v_mul_f32_e32 v1, 4.0, v0
 538 ; GFX11-NEXT:    s_clause 0x1
 539 ; GFX11-NEXT:    global_store_b32 v[0:1], v1, off
 540 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off dlc
 541 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 542 ; GFX11-NEXT:    s_nop 0
 543 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 544 ; GFX11-NEXT:    s_endpgm
       ; Negative case: %add feeds both the multiply by 4.0 and a volatile store,
       ; so the un-scaled sum must stay available. The checks for all three
       ; targets expect a separate v_add_f32 and v_mul_f32 plus both stores.
       ; The product is named %mul4 to match the operation; value names do not
       ; affect the generated code.
 545   %add = fadd float %a, 1.0
 546   %mul4 = fmul float %add, 4.0
 547   store float %mul4, ptr addrspace(1) undef
 548   store volatile float %add, ptr addrspace(1) undef
 549   ret void
 550 }
 551
 552 define amdgpu_ps void @v_omod_mul4_dbg_use_f32(float %a) #0 {
 553 ; SI-LABEL: v_omod_mul4_dbg_use_f32:
 554 ; SI:       ; %bb.0:
 555 ; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:4
 556 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 557 ; SI-NEXT:    s_mov_b32 s2, -1
 558 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 559 ; SI-NEXT:    s_endpgm
 560 ;
 561 ; VI-LABEL: v_omod_mul4_dbg_use_f32:
 562 ; VI:       ; %bb.0:
 563 ; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:4
 564 ; VI-NEXT:    flat_store_dword v[0:1], v0
 565 ; VI-NEXT:    s_endpgm
 566 ;
 567 ; GFX11-LABEL: v_omod_mul4_dbg_use_f32:
 568 ; GFX11:       ; %bb.0:
 569 ; GFX11-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:4
 570 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 571 ; GFX11-NEXT:    s_nop 0
 572 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 573 ; GFX11-NEXT:    s_endpgm
       ; Besides the multiply, the only other user of %add is the llvm.dbg.value
       ; call. The checks for all three targets still expect a single
       ; v_add_f32_e64 carrying the mul:4 output modifier, i.e. the debug use
       ; does not block the fold. The product is named %mul4 to match the
       ; operation; value names do not affect the generated code.
 574   %add = fadd float %a, 1.0
 575   call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
 576   %mul4 = fmul float %add, 4.0
 577   store float %mul4, ptr addrspace(1) undef
 578   ret void
 579 }
 580
 581 ; Clamp is applied after omod, folding both into instruction is OK.
 582 define amdgpu_ps void @v_clamp_omod_div2_f32(float %a) #0 {
 583 ; SI-LABEL: v_clamp_omod_div2_f32:
 584 ; SI:       ; %bb.0:
 585 ; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp div:2
 586 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 587 ; SI-NEXT:    s_mov_b32 s2, -1
 588 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 589 ; SI-NEXT:    s_endpgm
 590 ;
 591 ; VI-LABEL: v_clamp_omod_div2_f32:
 592 ; VI:       ; %bb.0:
 593 ; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp div:2
 594 ; VI-NEXT:    flat_store_dword v[0:1], v0
 595 ; VI-NEXT:    s_endpgm
 596 ;
 597 ; GFX11-LABEL: v_clamp_omod_div2_f32:
 598 ; GFX11:       ; %bb.0:
 599 ; GFX11-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp div:2
 600 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 601 ; GFX11-NEXT:    s_nop 0
 602 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 603 ; GFX11-NEXT:    s_endpgm
       ; Computes min(max((a + 1.0) * 0.5, 0.0), 1.0): the halving happens first
       ; and the [0, 1] clamp is applied to its result. The checks for all three
       ; targets expect one v_add_f32_e64 carrying both clamp and div:2.
 604   %add = fadd float %a, 1.0
 605   %div2 = fmul float %add, 0.5
 606
 607   %max = call float @llvm.maxnum.f32(float %div2, float 0.0)
 608   %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
 609   store float %clamp, ptr addrspace(1) undef
 610   ret void
 611 }
 612
 613 ; Cannot fold omod into clamp
 614 define amdgpu_ps void @v_omod_div2_clamp_f32(float %a) #0 {
 615 ; SI-LABEL: v_omod_div2_clamp_f32:
 616 ; SI:       ; %bb.0:
 617 ; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp
 618 ; SI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
 619 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 620 ; SI-NEXT:    s_mov_b32 s2, -1
 621 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 622 ; SI-NEXT:    s_endpgm
 623 ;
 624 ; VI-LABEL: v_omod_div2_clamp_f32:
 625 ; VI:       ; %bb.0:
 626 ; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp
 627 ; VI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
 628 ; VI-NEXT:    flat_store_dword v[0:1], v0
 629 ; VI-NEXT:    s_endpgm
 630 ;
 631 ; GFX11-LABEL: v_omod_div2_clamp_f32:
 632 ; GFX11:       ; %bb.0:
 633 ; GFX11-NEXT:    v_add_f32_e64 v0, v0, 1.0 clamp
 634 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 635 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0.5, v0
 636 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 637 ; GFX11-NEXT:    s_nop 0
 638 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 639 ; GFX11-NEXT:    s_endpgm
       ; Computes min(max(a + 1.0, 0.0), 1.0) * 0.5: here the [0, 1] clamp comes
       ; first and the halving is applied to the clamped value. The checks for all
       ; three targets expect the clamp folded into the add and the multiply by
       ; 0.5 kept as a separate v_mul_f32.
 640   %add = fadd float %a, 1.0
 641   %max = call float @llvm.maxnum.f32(float %add, float 0.0)
 642   %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
 643   %div2 = fmul float %clamp, 0.5
 644   store float %div2, ptr addrspace(1) undef
 645   ret void
 646 }
 647
 648 define amdgpu_ps void @v_omod_div2_abs_src_f32(float %a) #0 {
 649 ; SI-LABEL: v_omod_div2_abs_src_f32:
 650 ; SI:       ; %bb.0:
 651 ; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
 652 ; SI-NEXT:    v_mul_f32_e64 v0, |v0|, 0.5
 653 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 654 ; SI-NEXT:    s_mov_b32 s2, -1
 655 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 656 ; SI-NEXT:    s_endpgm
 657 ;
 658 ; VI-LABEL: v_omod_div2_abs_src_f32:
 659 ; VI:       ; %bb.0:
 660 ; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
 661 ; VI-NEXT:    v_mul_f32_e64 v0, |v0|, 0.5
 662 ; VI-NEXT:    flat_store_dword v[0:1], v0
 663 ; VI-NEXT:    s_endpgm
 664 ;
 665 ; GFX11-LABEL: v_omod_div2_abs_src_f32:
 666 ; GFX11:       ; %bb.0:
 667 ; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
 668 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 669 ; GFX11-NEXT:    v_mul_f32_e64 v0, |v0|, 0.5
 670 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 671 ; GFX11-NEXT:    s_nop 0
 672 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 673 ; GFX11-NEXT:    s_endpgm
       ; Computes fabs(a + 1.0) * 0.5: the multiply consumes the sum through a
       ; fabs. The checks for all three targets expect the fabs folded into the
       ; multiply as a source modifier (|v0|) and no div:2 on the add.
 674   %add = fadd float %a, 1.0
 675   %abs.add = call float @llvm.fabs.f32(float %add)
 676   %div2 = fmul float %abs.add, 0.5
 677   store float %div2, ptr addrspace(1) undef
 678   ret void
 679 }
 680
 681 define amdgpu_ps void @v_omod_add_self_clamp_f32(float %a) #0 {
 682 ; SI-LABEL: v_omod_add_self_clamp_f32:
 683 ; SI:       ; %bb.0:
 684 ; SI-NEXT:    v_add_f32_e64 v0, v0, v0 clamp
 685 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 686 ; SI-NEXT:    s_mov_b32 s2, -1
 687 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 688 ; SI-NEXT:    s_endpgm
 689 ;
 690 ; VI-LABEL: v_omod_add_self_clamp_f32:
 691 ; VI:       ; %bb.0:
 692 ; VI-NEXT:    v_add_f32_e64 v0, v0, v0 clamp
 693 ; VI-NEXT:    flat_store_dword v[0:1], v0
 694 ; VI-NEXT:    s_endpgm
 695 ;
 696 ; GFX11-LABEL: v_omod_add_self_clamp_f32:
 697 ; GFX11:       ; %bb.0:
 698 ; GFX11-NEXT:    v_add_f32_e64 v0, v0, v0 clamp
 699 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 700 ; GFX11-NEXT:    s_nop 0
 701 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 702 ; GFX11-NEXT:    s_endpgm
       ; Computes min(max(a + a, 0.0), 1.0): the value is doubled by adding it to
       ; itself and the result is clamped to [0, 1]. The checks for all three
       ; targets expect one v_add_f32_e64 of v0 with itself carrying clamp.
 703   %add = fadd float %a, %a
 704   %max = call float @llvm.maxnum.f32(float %add, float 0.0)
 705   %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
 706   store float %clamp, ptr addrspace(1) undef
 707   ret void
 708 }
 709
 710 define amdgpu_ps void @v_omod_add_clamp_self_f32(float %a) #0 {
 711 ; SI-LABEL: v_omod_add_clamp_self_f32:
 712 ; SI:       ; %bb.0:
 713 ; SI-NEXT:    v_max_f32_e64 v0, v0, v0 clamp
 714 ; SI-NEXT:    v_add_f32_e32 v0, v0, v0
 715 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 716 ; SI-NEXT:    s_mov_b32 s2, -1
 717 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 718 ; SI-NEXT:    s_endpgm
 719 ;
 720 ; VI-LABEL: v_omod_add_clamp_self_f32:
 721 ; VI:       ; %bb.0:
 722 ; VI-NEXT:    v_max_f32_e64 v0, v0, v0 clamp
 723 ; VI-NEXT:    v_add_f32_e32 v0, v0, v0
 724 ; VI-NEXT:    flat_store_dword v[0:1], v0
 725 ; VI-NEXT:    s_endpgm
 726 ;
 727 ; GFX11-LABEL: v_omod_add_clamp_self_f32:
 728 ; GFX11:       ; %bb.0:
 729 ; GFX11-NEXT:    v_max_f32_e64 v0, v0, v0 clamp
 730 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 731 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v0
 732 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 733 ; GFX11-NEXT:    s_nop 0
 734 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 735 ; GFX11-NEXT:    s_endpgm
       ; Clamps %a to [0, 1] first and then adds the clamped value to itself.
       ; The checks for all three targets expect the clamp emitted as
       ; v_max_f32_e64 with the clamp bit, followed by a plain v_add_f32 of the
       ; result with itself (no mul:2 modifier).
 736   %max = call float @llvm.maxnum.f32(float %a, float 0.0)
 737   %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
 738   %add = fadd float %clamp, %clamp
 739   store float %add, ptr addrspace(1) undef
 740   ret void
 741 }
 742
 743 define amdgpu_ps void @v_omod_add_abs_self_f32(float %a) #0 {
 744 ; SI-LABEL: v_omod_add_abs_self_f32:
 745 ; SI:       ; %bb.0:
 746 ; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
 747 ; SI-NEXT:    v_add_f32_e64 v0, |v0|, |v0|
 748 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 749 ; SI-NEXT:    s_mov_b32 s2, -1
 750 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 751 ; SI-NEXT:    s_endpgm
 752 ;
 753 ; VI-LABEL: v_omod_add_abs_self_f32:
 754 ; VI:       ; %bb.0:
 755 ; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
 756 ; VI-NEXT:    v_add_f32_e64 v0, |v0|, |v0|
 757 ; VI-NEXT:    flat_store_dword v[0:1], v0
 758 ; VI-NEXT:    s_endpgm
 759 ;
 760 ; GFX11-LABEL: v_omod_add_abs_self_f32:
 761 ; GFX11:       ; %bb.0:
 762 ; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
 763 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 764 ; GFX11-NEXT:    v_add_f32_e64 v0, |v0|, |v0|
 765 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 766 ; GFX11-NEXT:    s_nop 0
 767 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 768 ; GFX11-NEXT:    s_endpgm
       ; Computes fabs(x) + fabs(x) with x = a + 1.0. The checks for all three
       ; targets expect the first add left unmodified and a second v_add_f32_e64
       ; with the |v0| source modifier on both operands (no mul:2 modifier).
 769   %x = fadd float %a, 1.0
 770   %abs.x = call float @llvm.fabs.f32(float %x)
 771   %add = fadd float %abs.x, %abs.x
 772   store float %add, ptr addrspace(1) undef
 773   ret void
 774 }
 775
 776 define amdgpu_ps void @v_omod_add_abs_x_x_f32(float %a) #0 {
 777 ; SI-LABEL: v_omod_add_abs_x_x_f32:
 778 ; SI:       ; %bb.0:
 779 ; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
 780 ; SI-NEXT:    v_add_f32_e64 v0, |v0|, v0
 781 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 782 ; SI-NEXT:    s_mov_b32 s2, -1
 783 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 784 ; SI-NEXT:    s_endpgm
 785 ;
 786 ; VI-LABEL: v_omod_add_abs_x_x_f32:
 787 ; VI:       ; %bb.0:
 788 ; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
 789 ; VI-NEXT:    v_add_f32_e64 v0, |v0|, v0
 790 ; VI-NEXT:    flat_store_dword v[0:1], v0
 791 ; VI-NEXT:    s_endpgm
 792 ;
 793 ; GFX11-LABEL: v_omod_add_abs_x_x_f32:
 794 ; GFX11:       ; %bb.0:
 795 ; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
 796 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 797 ; GFX11-NEXT:    v_add_f32_e64 v0, |v0|, v0
 798 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 799 ; GFX11-NEXT:    s_nop 0
 800 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 801 ; GFX11-NEXT:    s_endpgm
       ; Computes fabs(x) + x with x = a + 1.0, so the two addends share a
       ; register but differ in source modifier. The checks for all three targets
       ; expect the first add left unmodified and a second v_add_f32_e64 with
       ; |v0| as the first operand only (no mul:2 modifier).
 802   %x = fadd float %a, 1.0
 803   %abs.x = call float @llvm.fabs.f32(float %x)
 804   %add = fadd float %abs.x, %x
 805   store float %add, ptr addrspace(1) undef
 806   ret void
 807 }
 808 ; x + fabs(x): commuted form of the fabs(x) + x test; the operands still differ, so the checks expect a plain add with no omod.
 809 define amdgpu_ps void @v_omod_add_x_abs_x_f32(float %a) #0 {
 810 ; SI-LABEL: v_omod_add_x_abs_x_f32:
 811 ; SI:       ; %bb.0:
 812 ; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
 813 ; SI-NEXT:    v_add_f32_e64 v0, v0, |v0|
 814 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 815 ; SI-NEXT:    s_mov_b32 s2, -1
 816 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 817 ; SI-NEXT:    s_endpgm
 818 ;
 819 ; VI-LABEL: v_omod_add_x_abs_x_f32:
 820 ; VI:       ; %bb.0:
 821 ; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
 822 ; VI-NEXT:    v_add_f32_e64 v0, v0, |v0|
 823 ; VI-NEXT:    flat_store_dword v[0:1], v0
 824 ; VI-NEXT:    s_endpgm
 825 ;
 826 ; GFX11-LABEL: v_omod_add_x_abs_x_f32:
 827 ; GFX11:       ; %bb.0:
 828 ; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
 829 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 830 ; GFX11-NEXT:    v_add_f32_e64 v0, v0, |v0|
 831 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 832 ; GFX11-NEXT:    s_nop 0
 833 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 834 ; GFX11-NEXT:    s_endpgm
 835   %x = fadd float %a, 1.0
 836   %abs.x = call float @llvm.fabs.f32(float %x)
 837   %add = fadd float %x, %abs.x
 838   store float %add, ptr addrspace(1) undef
 839   ret void
 840 }
 841
 842 ; Don't fold a second omod into an instruction that already has one: only the first fmul by 0.5 becomes div:2, the second stays a v_mul.
 843 define amdgpu_ps void @v_omod_div2_omod_div2_f32(float %a) #0 {
 844 ; SI-LABEL: v_omod_div2_omod_div2_f32:
 845 ; SI:       ; %bb.0:
 846 ; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
 847 ; SI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
 848 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 849 ; SI-NEXT:    s_mov_b32 s2, -1
 850 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 851 ; SI-NEXT:    s_endpgm
 852 ;
 853 ; VI-LABEL: v_omod_div2_omod_div2_f32:
 854 ; VI:       ; %bb.0:
 855 ; VI-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
 856 ; VI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
 857 ; VI-NEXT:    flat_store_dword v[0:1], v0
 858 ; VI-NEXT:    s_endpgm
 859 ;
 860 ; GFX11-LABEL: v_omod_div2_omod_div2_f32:
 861 ; GFX11:       ; %bb.0:
 862 ; GFX11-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
 863 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 864 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0.5, v0
 865 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 866 ; GFX11-NEXT:    s_nop 0
 867 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 868 ; GFX11-NEXT:    s_endpgm
 869   %add = fadd float %a, 1.0
 870   %div2.0 = fmul float %add, 0.5
 871   %div2.1 = fmul float %div2.0, 0.5
 872   store float %div2.1, ptr addrspace(1) undef
 873   ret void
 874 }
 875
 876 ; Don't fold omod if f32 denorms are enabled (#2 sets "denormal-fp-math-f32" to ieee): the fmul by 0.5 stays a separate v_mul.
 877 define amdgpu_ps void @v_omod_div2_f32_denormals(float %a) #2 {
 878 ; SI-LABEL: v_omod_div2_f32_denormals:
 879 ; SI:       ; %bb.0:
 880 ; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
 881 ; SI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
 882 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 883 ; SI-NEXT:    s_mov_b32 s2, -1
 884 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 885 ; SI-NEXT:    s_endpgm
 886 ;
 887 ; VI-LABEL: v_omod_div2_f32_denormals:
 888 ; VI:       ; %bb.0:
 889 ; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
 890 ; VI-NEXT:    v_mul_f32_e32 v0, 0.5, v0
 891 ; VI-NEXT:    flat_store_dword v[0:1], v0
 892 ; VI-NEXT:    s_endpgm
 893 ;
 894 ; GFX11-LABEL: v_omod_div2_f32_denormals:
 895 ; GFX11:       ; %bb.0:
 896 ; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
 897 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 898 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0.5, v0
 899 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 900 ; GFX11-NEXT:    s_nop 0
 901 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 902 ; GFX11-NEXT:    s_endpgm
 903   %add = fadd float %a, 1.0
 904   %div2 = fmul float %add, 0.5
 905   store float %div2, ptr addrspace(1) undef
 906   ret void
 907 }
 908
 909 ; Don't fold omod if f64 denorms are enabled (#6 sets "denormal-fp-math" to ieee): the fmul by 0.5 stays a separate v_mul_f64.
 910 define amdgpu_ps void @v_omod_div2_f64_denormals(double %a) #6 {
 911 ; SI-LABEL: v_omod_div2_f64_denormals:
 912 ; SI:       ; %bb.0:
 913 ; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 914 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 915 ; SI-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
 916 ; SI-NEXT:    s_mov_b32 s2, -1
 917 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 918 ; SI-NEXT:    s_endpgm
 919 ;
 920 ; VI-LABEL: v_omod_div2_f64_denormals:
 921 ; VI:       ; %bb.0:
 922 ; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 923 ; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
 924 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
 925 ; VI-NEXT:    s_endpgm
 926 ;
 927 ; GFX11-LABEL: v_omod_div2_f64_denormals:
 928 ; GFX11:       ; %bb.0:
 929 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 930 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 931 ; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
 932 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
 933 ; GFX11-NEXT:    s_nop 0
 934 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 935 ; GFX11-NEXT:    s_endpgm
 936   %add = fadd double %a, 1.0
 937   %div2 = fmul double %add, 0.5
 938   store double %div2, ptr addrspace(1) undef
 939   ret void
 940 }
 941
 942 ; Don't fold omod if f32 denorms are enabled, for the add form: x + x stays a separate v_add instead of becoming mul:2.
 943 define amdgpu_ps void @v_omod_mul2_f32_denormals(float %a) #2 {
 944 ; SI-LABEL: v_omod_mul2_f32_denormals:
 945 ; SI:       ; %bb.0:
 946 ; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
 947 ; SI-NEXT:    v_add_f32_e32 v0, v0, v0
 948 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 949 ; SI-NEXT:    s_mov_b32 s2, -1
 950 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 951 ; SI-NEXT:    s_endpgm
 952 ;
 953 ; VI-LABEL: v_omod_mul2_f32_denormals:
 954 ; VI:       ; %bb.0:
 955 ; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
 956 ; VI-NEXT:    v_add_f32_e32 v0, v0, v0
 957 ; VI-NEXT:    flat_store_dword v[0:1], v0
 958 ; VI-NEXT:    s_endpgm
 959 ;
 960 ; GFX11-LABEL: v_omod_mul2_f32_denormals:
 961 ; GFX11:       ; %bb.0:
 962 ; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
 963 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 964 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v0
 965 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 966 ; GFX11-NEXT:    s_nop 0
 967 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 968 ; GFX11-NEXT:    s_endpgm
 969   %add = fadd float %a, 1.0
 970   %mul2 = fadd float %add, %add
 971   store float %mul2, ptr addrspace(1) undef
 972   ret void
 973 }
 974
 975 ; Don't fold omod if f64 denorms are enabled, for the add (x + x) form; #6 states the f64 denormal mode explicitly, as in the div2 f64 test.
 976 define amdgpu_ps void @v_omod_mul2_f64_denormals(double %a) #6 {
 977 ; SI-LABEL: v_omod_mul2_f64_denormals:
 978 ; SI:       ; %bb.0:
 979 ; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 980 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 981 ; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[0:1]
 982 ; SI-NEXT:    s_mov_b32 s2, -1
 983 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 984 ; SI-NEXT:    s_endpgm
 985 ;
 986 ; VI-LABEL: v_omod_mul2_f64_denormals:
 987 ; VI:       ; %bb.0:
 988 ; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 989 ; VI-NEXT:    v_add_f64 v[0:1], v[0:1], v[0:1]
 990 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
 991 ; VI-NEXT:    s_endpgm
 992 ;
 993 ; GFX11-LABEL: v_omod_mul2_f64_denormals:
 994 ; GFX11:       ; %bb.0:
 995 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 996 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 997 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[0:1]
 998 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
 999 ; GFX11-NEXT:    s_nop 0
1000 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1001 ; GFX11-NEXT:    s_endpgm
1002   %add = fadd double %a, 1.0
1003   %mul2 = fadd double %add, %add
1004   store double %mul2, ptr addrspace(1) undef
1005   ret void
1006 }
1007
1008 ; Don't fold omod for f16 here: #0 only sets the f32 denormal mode to preserve-sign, so VI/GFX11 keep a separate f16 mul; the SI checks convert to f32 and do fold div:2.
1009 define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 {
1010 ; SI-LABEL: v_omod_div2_f16_denormals:
1011 ; SI:       ; %bb.0:
1012 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1013 ; SI-NEXT:    s_mov_b32 s3, 0xf000
1014 ; SI-NEXT:    s_mov_b32 s2, -1
1015 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1016 ; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
1017 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1018 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1019 ; SI-NEXT:    s_endpgm
1020 ;
1021 ; VI-LABEL: v_omod_div2_f16_denormals:
1022 ; VI:       ; %bb.0:
1023 ; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
1024 ; VI-NEXT:    v_mul_f16_e32 v0, 0.5, v0
1025 ; VI-NEXT:    flat_store_short v[0:1], v0
1026 ; VI-NEXT:    s_endpgm
1027 ;
1028 ; GFX11-LABEL: v_omod_div2_f16_denormals:
1029 ; GFX11:       ; %bb.0:
1030 ; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
1031 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1032 ; GFX11-NEXT:    v_mul_f16_e32 v0, 0.5, v0
1033 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
1034 ; GFX11-NEXT:    s_nop 0
1035 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1036 ; GFX11-NEXT:    s_endpgm
1037   %add = fadd half %a, 1.0
1038   %div2 = fmul half %add, 0.5
1039   store half %div2, ptr addrspace(1) undef
1040   ret void
1041 }
1042
1043 ; Don't fold omod for the f16 add (x + x) form: #0 only sets the f32 denormal mode, so VI/GFX11 keep a separate f16 add; the SI checks convert to f32 and do fold mul:2.
1044 define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 {
1045 ; SI-LABEL: v_omod_mul2_f16_denormals:
1046 ; SI:       ; %bb.0:
1047 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1048 ; SI-NEXT:    s_mov_b32 s3, 0xf000
1049 ; SI-NEXT:    s_mov_b32 s2, -1
1050 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1051 ; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 mul:2
1052 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1053 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1054 ; SI-NEXT:    s_endpgm
1055 ;
1056 ; VI-LABEL: v_omod_mul2_f16_denormals:
1057 ; VI:       ; %bb.0:
1058 ; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
1059 ; VI-NEXT:    v_add_f16_e32 v0, v0, v0
1060 ; VI-NEXT:    flat_store_short v[0:1], v0
1061 ; VI-NEXT:    s_endpgm
1062 ;
1063 ; GFX11-LABEL: v_omod_mul2_f16_denormals:
1064 ; GFX11:       ; %bb.0:
1065 ; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
1066 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1067 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v0
1068 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
1069 ; GFX11-NEXT:    s_nop 0
1070 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1071 ; GFX11-NEXT:    s_endpgm
1072   %add = fadd half %a, 1.0
1073   %mul2 = fadd half %add, %add
1074   store half %mul2, ptr addrspace(1) undef
1075   ret void
1076 }
1077 ; With #3 ("denormal-fp-math" = preserve-sign) the f16 fmul by 0.5 folds into the add as div:2 on VI and GFX11; the SI checks do the same in f32.
1078 define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 {
1079 ; SI-LABEL: v_omod_div2_f16_no_denormals:
1080 ; SI:       ; %bb.0:
1081 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1082 ; SI-NEXT:    s_mov_b32 s3, 0xf000
1083 ; SI-NEXT:    s_mov_b32 s2, -1
1084 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1085 ; SI-NEXT:    v_add_f32_e64 v0, v0, 1.0 div:2
1086 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
1087 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1088 ; SI-NEXT:    s_endpgm
1089 ;
1090 ; VI-LABEL: v_omod_div2_f16_no_denormals:
1091 ; VI:       ; %bb.0:
1092 ; VI-NEXT:    v_add_f16_e64 v0, v0, 1.0 div:2
1093 ; VI-NEXT:    flat_store_short v[0:1], v0
1094 ; VI-NEXT:    s_endpgm
1095 ;
1096 ; GFX11-LABEL: v_omod_div2_f16_no_denormals:
1097 ; GFX11:       ; %bb.0:
1098 ; GFX11-NEXT:    v_add_f16_e64 v0, v0, 1.0 div:2
1099 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
1100 ; GFX11-NEXT:    s_nop 0
1101 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1102 ; GFX11-NEXT:    s_endpgm
1103   %add = fadd half %a, 1.0
1104   %div2 = fmul half %add, 0.5
1105   store half %div2, ptr addrspace(1) undef
1106   ret void
1107 }
1108 ; ((a * a) + b) * 2.0, then * b: SI/VI checks expect the mul:2 on a v_mad_f32; GFX11 checks expect a v_mul followed by v_add_f32_e64 with mul:2.
1109 define amdgpu_ps void @v_omod_mac_to_mad(float %b, float %a) #0 {
1110 ; SI-LABEL: v_omod_mac_to_mad:
1111 ; SI:       ; %bb.0:
1112 ; SI-NEXT:    v_mad_f32 v1, v1, v1, v0 mul:2
1113 ; SI-NEXT:    v_mul_f32_e32 v0, v1, v0
1114 ; SI-NEXT:    s_mov_b32 s3, 0xf000
1115 ; SI-NEXT:    s_mov_b32 s2, -1
1116 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1117 ; SI-NEXT:    s_endpgm
1118 ;
1119 ; VI-LABEL: v_omod_mac_to_mad:
1120 ; VI:       ; %bb.0:
1121 ; VI-NEXT:    v_mad_f32 v1, v1, v1, v0 mul:2
1122 ; VI-NEXT:    v_mul_f32_e32 v0, v1, v0
1123 ; VI-NEXT:    flat_store_dword v[0:1], v0
1124 ; VI-NEXT:    s_endpgm
1125 ;
1126 ; GFX11-LABEL: v_omod_mac_to_mad:
1127 ; GFX11:       ; %bb.0:
1128 ; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v1
1129 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1130 ; GFX11-NEXT:    v_add_f32_e64 v1, v1, v0 mul:2
1131 ; GFX11-NEXT:    v_mul_f32_e32 v0, v1, v0
1132 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
1133 ; GFX11-NEXT:    s_nop 0
1134 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1135 ; GFX11-NEXT:    s_endpgm
1136   %mul = fmul float %a, %a
1137   %add = fadd float %mul, %b
1138   %mad = fmul float %add, 2.0
1139   %res = fmul float %mad, %b
1140   store float %res, ptr addrspace(1) undef
1141   ret void
1142 }
1143 ; Intrinsic declarations; every one carries attribute group #1.
1144 declare i32 @llvm.amdgcn.workitem.id.x() #1
1145 declare float @llvm.fabs.f32(float) #1
1146 declare float @llvm.floor.f32(float) #1
1147 declare float @llvm.minnum.f32(float, float) #1
1148 declare float @llvm.maxnum.f32(float, float) #1
1149 declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
1150 declare double @llvm.fabs.f64(double) #1
1151 declare double @llvm.minnum.f64(double, double) #1
1152 declare double @llvm.maxnum.f64(double, double) #1
1153 declare half @llvm.fabs.f16(half) #1
1154 declare half @llvm.minnum.f16(half, half) #1
1155 declare half @llvm.maxnum.f16(half, half) #1
1156 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
1157
1158 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" } ; f32 denormal mode preserve-sign only; no signed zeros
1159 attributes #1 = { nounwind readnone } ; used by the intrinsic declarations
1160 attributes #2 = { nounwind "denormal-fp-math-f32"="ieee,ieee" "no-signed-zeros-fp-math"="true" } ; f32 denormal mode ieee only; no signed zeros
1161 attributes #3 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" } ; general denormal mode preserve-sign; no signed zeros
1162 attributes #4 = { nounwind "no-signed-zeros-fp-math"="false" } ; signed zeros explicitly honored; no denormal mode given
1163 attributes #5 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } ; general denormal mode preserve-sign; no signed-zeros attribute
1164 attributes #6 = { nounwind "denormal-fp-math"="ieee,ieee" "no-signed-zeros-fp-math"="true" } ; general denormal mode ieee; no signed zeros
1165 ; Debug-info metadata: compile unit, module flags, and variable/location nodes (presumably for a use of the llvm.dbg.value declared above - TODO confirm).
1166 !llvm.dbg.cu = !{!0}
1167 !llvm.module.flags = !{!2, !3}
1168
1169 !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
1170 !1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
1171 !2 = !{i32 2, !"Dwarf Version", i32 4}
1172 !3 = !{i32 2, !"Debug Info Version", i32 3}
1173 !4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)
1174 !5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
1175 !6 = !DISubroutineType(types: !7)
1176 !7 = !{null, !8}
1177 !8 = !DIBasicType(name: "float", size: 32, align: 32)
1178 !9 = !DIExpression()
1179 !10 = !DILocation(line: 1, column: 42, scope: !5)