llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CHECK %s
   3
   4 ; Check that WQM is not triggered by the softwqm intrinsic alone: the sum of two buffer loads is passed through llvm.amdgcn.softwqm.f32 and returned.
   5 ; The expected output contains no s_wqm_b64, so exec is never switched to whole quad mode.
   6 define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) {
   7 ; CHECK-LABEL: test1:
   8 ; CHECK:       ; %bb.0: ; %main_body
   9 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
  10 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
  11 ; CHECK-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
  12 ; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
  13 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
  14 ; CHECK-NEXT:    v_add_f32_e32 v0, v0, v1
  15 ; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
  16 ; CHECK-NEXT:    ; return to shader part epilog
  17 main_body:
  18   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  19   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  20   %out = fadd float %src0, %src1
  21   %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out) ; the only WQM-related call in this function
  22   ret float %out.0
  23 }
  24
  25 ; Check that the softwqm intrinsic works correctly for integers: the float sum is bitcast to i32, passed through llvm.amdgcn.softwqm.i32, and bitcast back.
  26 ; The expected output is the same instruction sequence as for test1 and contains no s_wqm_b64.
  27 define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) {
  28 ; CHECK-LABEL: test2:
  29 ; CHECK:       ; %bb.0: ; %main_body
  30 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
  31 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
  32 ; CHECK-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
  33 ; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
  34 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
  35 ; CHECK-NEXT:    v_add_f32_e32 v0, v0, v1
  36 ; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
  37 ; CHECK-NEXT:    ; return to shader part epilog
  38 main_body:
  39   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  40   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  41   %out = fadd float %src0, %src1
  42   %out.0 = bitcast float %out to i32
  43   %out.1 = call i32 @llvm.amdgcn.softwqm.i32(i32 %out.0) ; i32 overload of the intrinsic under test
  44   %out.2 = bitcast i32 %out.1 to float
  45   ret float %out.2
  46 }
  47
  48 ; Make sure the transition from Exact to softwqm does not trigger WQM: a buffer store precedes the softwqm call and there is no llvm.amdgcn.wqm call (unlike test_softwqm2).
  49 ; The expected output contains no s_wqm_b64.
  50 define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) {
  51 ; CHECK-LABEL: test_softwqm1:
  52 ; CHECK:       ; %bb.0: ; %main_body
  53 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
  54 ; CHECK-NEXT:    v_mov_b32_e32 v2, s1
  55 ; CHECK-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 idxen
  56 ; CHECK-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
  57 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
  58 ; CHECK-NEXT:    v_add_f32_e32 v1, v1, v2
  59 ; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
  60 ; CHECK-NEXT:    v_add_f32_e32 v0, v1, v1
  61 ; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
  62 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
  63 ; CHECK-NEXT:    ; return to shader part epilog
  64 main_body:
  65   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  66   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  67   %temp = fadd float %src0, %src1
  68   call void @llvm.amdgcn.struct.buffer.store.f32(float %temp, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) ; %temp is stored directly, without a wqm call
  69   %out = fadd float %temp, %temp
  70   %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
  71   ret float %out.0
  72 }
  73
  74 ; Make sure the transition from WQM to Exact to softwqm does trigger WQM: %temp goes through llvm.amdgcn.wqm.f32 before the buffer store, and softwqm is applied afterwards.
  75 ; The expected output enters WQM with s_wqm_b64 at entry, masks exec with the saved copy in s[2:3] before the store, then repeats s_wqm_b64 / s_and_b64 around the final v_mov.
  76 define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) {
  77 ; CHECK-LABEL: test_softwqm2:
  78 ; CHECK:       ; %bb.0: ; %main_body
  79 ; CHECK-NEXT:    s_mov_b64 s[2:3], exec
  80 ; CHECK-NEXT:    s_wqm_b64 exec, exec
  81 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
  82 ; CHECK-NEXT:    v_mov_b32_e32 v2, s1
  83 ; CHECK-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 idxen
  84 ; CHECK-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
  85 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
  86 ; CHECK-NEXT:    v_add_f32_e32 v1, v1, v2
  87 ; CHECK-NEXT:    v_mov_b32_e32 v2, v1
  88 ; CHECK-NEXT:    v_add_f32_e32 v1, v1, v1
  89 ; CHECK-NEXT:    s_and_b64 exec, exec, s[2:3]
  90 ; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
  91 ; CHECK-NEXT:    s_wqm_b64 exec, exec
  92 ; CHECK-NEXT:    v_mov_b32_e32 v0, v1
  93 ; CHECK-NEXT:    s_and_b64 exec, exec, s[2:3]
  94 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
  95 ; CHECK-NEXT:    ; return to shader part epilog
  96 main_body:
  97   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  98   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  99   %temp = fadd float %src0, %src1
 100   %temp.0 = call float @llvm.amdgcn.wqm.f32(float %temp) ; the wqm call that distinguishes this test from test_softwqm1
 101   call void @llvm.amdgcn.struct.buffer.store.f32(float %temp.0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 102   %out = fadd float %temp, %temp
 103   %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
 104   ret float %out.0
 105 }
 106
 107 ; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
 108 ; Make sure the transition from Exact to STRICT_WWM then softwqm does not trigger WQM.
 109 ; This variant calls the deprecated llvm.amdgcn.wwm.f32; its expected output is the same instruction sequence as for test_strict_wwm1 and contains no s_wqm_b64.
 110 define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
 111 ; CHECK-LABEL: test_wwm1:
 112 ; CHECK:       ; %bb.0: ; %main_body
 113 ; CHECK-NEXT:    s_or_saveexec_b64 s[2:3], -1
 114 ; CHECK-NEXT:    v_mov_b32_e32 v1, s0
 115 ; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
 116 ; CHECK-NEXT:    s_mov_b64 exec, s[2:3]
 117 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 118 ; CHECK-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
 119 ; CHECK-NEXT:    s_or_saveexec_b64 s[2:3], -1
 120 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
 121 ; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 122 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 123 ; CHECK-NEXT:    v_add_f32_e32 v1, v2, v1
 124 ; CHECK-NEXT:    s_mov_b64 exec, s[2:3]
 125 ; CHECK-NEXT:    v_mov_b32_e32 v0, v1
 126 ; CHECK-NEXT:    v_add_f32_e32 v0, v0, v0
 127 ; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
 128 ; CHECK-NEXT:    ; return to shader part epilog
 129 main_body:
 130   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 131   call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 132   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 133   %temp = fadd float %src0, %src1
 134   %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp) ; deprecated spelling, kept here on purpose per the NOTE above
 135   %out = fadd float %temp.0, %temp.0
 136   %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
 137   ret float %out.0
 138 }
 139
 140 ; Make sure the transition from Exact to STRICT_WWM then softwqm does not trigger WQM.
 141 ; The expected output wraps both loads and the add feeding llvm.amdgcn.strict.wwm.f32 in s_or_saveexec_b64 ..., -1 / s_mov_b64 exec regions and contains no s_wqm_b64.
 142 define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
 143 ; CHECK-LABEL: test_strict_wwm1:
 144 ; CHECK:       ; %bb.0: ; %main_body
 145 ; CHECK-NEXT:    s_or_saveexec_b64 s[2:3], -1
 146 ; CHECK-NEXT:    v_mov_b32_e32 v1, s0
 147 ; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
 148 ; CHECK-NEXT:    s_mov_b64 exec, s[2:3]
 149 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 150 ; CHECK-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
 151 ; CHECK-NEXT:    s_or_saveexec_b64 s[2:3], -1
 152 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
 153 ; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 154 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 155 ; CHECK-NEXT:    v_add_f32_e32 v1, v2, v1
 156 ; CHECK-NEXT:    s_mov_b64 exec, s[2:3]
 157 ; CHECK-NEXT:    v_mov_b32_e32 v0, v1
 158 ; CHECK-NEXT:    v_add_f32_e32 v0, v0, v0
 159 ; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
 160 ; CHECK-NEXT:    ; return to shader part epilog
 161 main_body:
 162   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 163   call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 164   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 165   %temp = fadd float %src0, %src1
 166   %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
 167   %out = fadd float %temp.0, %temp.0
 168   %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out) ; consumes the strict.wwm result via %out
 169   ret float %out.0
 170 }
 171
 172
 173 ; Check that softwqm on one side of a branch does not trigger WQM for the shader.
 174 ; %IF adds two buffer loads and applies softwqm; %ELSE only performs a buffer store. The expected output contains no s_wqm_b64.
 175 define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
 176 ; CHECK-LABEL: test_control_flow_0:
 177 ; CHECK:       ; %bb.0: ; %main_body
 178 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 179 ; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 180 ; CHECK-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
 181 ; CHECK-NEXT:    s_cbranch_execz .LBB6_2
 182 ; CHECK-NEXT:  ; %bb.1: ; %ELSE
 183 ; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
 184 ; CHECK-NEXT:  .LBB6_2: ; %Flow
 185 ; CHECK-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
 186 ; CHECK-NEXT:    s_cbranch_execz .LBB6_4
 187 ; CHECK-NEXT:  ; %bb.3: ; %IF
 188 ; CHECK-NEXT:    v_mov_b32_e32 v0, s12
 189 ; CHECK-NEXT:    v_mov_b32_e32 v1, s13
 190 ; CHECK-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
 191 ; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 192 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 193 ; CHECK-NEXT:    v_add_f32_e32 v2, v0, v1
 194 ; CHECK-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec
 195 ; CHECK-NEXT:  .LBB6_4: ; %END
 196 ; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
 197 ; CHECK-NEXT:    v_mov_b32_e32 v0, v2
 198 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 199 ; CHECK-NEXT:    ; return to shader part epilog
 200 main_body:
 201   %cmp = icmp eq i32 %z, 0
 202   br i1 %cmp, label %IF, label %ELSE
 203
 204 IF:
 205   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 206   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 207   %out = fadd float %src0, %src1
 208   %data.if = call float @llvm.amdgcn.softwqm.f32(float %out) ; softwqm appears only on this side of the branch
 209   br label %END
 210
 211 ELSE:
 212   call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
 213   br label %END
 214
 215 END:
 216   %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
 217   ret float %r
 218 }
 219
 220 ; Check that softwqm on one side of a branch is treated as WQM in a WQM shader.
 221 ; %ELSE contains two image samples and the expected output starts with s_wqm_b64; exec is masked with the saved copy in s[14:15] only inside %ELSE (before the second sample and the store) and in %END, so the %IF code runs without leaving WQM.
 222 define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
 223 ; CHECK-LABEL: test_control_flow_1:
 224 ; CHECK:       ; %bb.0: ; %main_body
 225 ; CHECK-NEXT:    s_mov_b64 s[14:15], exec
 226 ; CHECK-NEXT:    s_wqm_b64 exec, exec
 227 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 228 ; CHECK-NEXT:    s_and_saveexec_b64 s[16:17], vcc
 229 ; CHECK-NEXT:    s_xor_b64 s[16:17], exec, s[16:17]
 230 ; CHECK-NEXT:    s_cbranch_execz .LBB7_2
 231 ; CHECK-NEXT:  ; %bb.1: ; %ELSE
 232 ; CHECK-NEXT:    image_sample v1, v0, s[0:7], s[8:11] dmask:0x1
 233 ; CHECK-NEXT:    s_and_saveexec_b64 s[18:19], s[14:15]
 234 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 235 ; CHECK-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
 236 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 237 ; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
 238 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
 239 ; CHECK-NEXT:  .LBB7_2: ; %Flow
 240 ; CHECK-NEXT:    s_andn2_saveexec_b64 s[0:1], s[16:17]
 241 ; CHECK-NEXT:    s_cbranch_execz .LBB7_4
 242 ; CHECK-NEXT:  ; %bb.3: ; %IF
 243 ; CHECK-NEXT:    v_mov_b32_e32 v0, s12
 244 ; CHECK-NEXT:    v_mov_b32_e32 v1, s13
 245 ; CHECK-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
 246 ; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
 247 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 248 ; CHECK-NEXT:    v_add_f32_e32 v2, v0, v1
 249 ; CHECK-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec
 250 ; CHECK-NEXT:  .LBB7_4: ; %END
 251 ; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
 252 ; CHECK-NEXT:    s_and_b64 exec, exec, s[14:15]
 253 ; CHECK-NEXT:    v_mov_b32_e32 v0, v2
 254 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 255 ; CHECK-NEXT:    ; return to shader part epilog
 256 main_body:
 257   %cmp = icmp eq i32 %z, 0
 258   br i1 %cmp, label %IF, label %ELSE
 259
 260 IF:
 261   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 262   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 263   %out = fadd float %src0, %src1
 264   %data.if = call float @llvm.amdgcn.softwqm.f32(float %out) ; softwqm appears only on this side of the branch
 265   br label %END
 266
 267 ELSE:
 268   %c.bc = bitcast i32 %c to float
 269   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
 270   %tex0 = extractelement <4 x float> %tex, i32 0
 271   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
 272   %data.sample = extractelement <4 x float> %dtex, i32 0
 273
 274   call void @llvm.amdgcn.struct.buffer.store.f32(float %data.sample, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
 275   br label %END
 276
 277 END:
 278   %r = phi float [ %data.if, %IF ], [ %data, %ELSE ] ; the %ELSE edge forwards the %data argument, not %data.sample
 279   ret float %r
 280 }
 281
 282 declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
 283 declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2 ; not called by any function visible in this file
 284 declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) #3
 285 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
 286 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3 ; not called by any function visible in this file
 287 declare float @llvm.amdgcn.wqm.f32(float) #3
 288 declare float @llvm.amdgcn.softwqm.f32(float) #3
 289 declare i32 @llvm.amdgcn.softwqm.i32(i32) #3
 290 declare float @llvm.amdgcn.strict.wwm.f32(float) #3
 291 declare float @llvm.amdgcn.wwm.f32(float) #3
 292 declare void @llvm.amdgcn.wqm.demote(i1) #1 ; not called by any function visible in this file
 293
 294 attributes #1 = { nounwind } ; NOTE(review): the image.sample calls in test_control_flow_1 reference #0, but no #0 group is defined in the visible lines - confirm this is intended
 295 attributes #2 = { nounwind readonly } ; NOTE(review): this group is attached to the buffer store declarations above - confirm the readonly marking is intended
 296 attributes #3 = { nounwind readnone } ; NOTE(review): this group is attached to the buffer load and image sample declarations above - confirm the readnone marking is intended