llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll

   1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CHECK %s
   2
   3 ; Check that WQM is not triggered by the softwqm intrinsic alone.
   4 ; The function body contains no other WQM-related intrinsic, so no s_wqm_b64 of exec may appear ahead of the first buffer load.
   5 ;CHECK-LABEL: {{^}}test1:
   6 ;CHECK-NOT: s_wqm_b64 exec, exec
   7 ;CHECK: buffer_load_dword
   8 ;CHECK: buffer_load_dword
   9 ;CHECK: v_add_f32_e32
  10 define amdgpu_ps float @test1(i32 inreg %idx0, i32 inreg %idx1) {
  11 main_body:
  12   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  13   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  14   %out = fadd float %src0, %src1
  15   %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out) ; the sum of the two loads is the only value marked softwqm
  16   ret float %out.0
  17 }
  18
  19 ; Check that the softwqm intrinsic works correctly for integers.
  20 ; The float sum is bitcast to i32, passed through the i32 overload of softwqm, and bitcast back; the expected output matches test1.
  21 ;CHECK-LABEL: {{^}}test2:
  22 ;CHECK-NOT: s_wqm_b64 exec, exec
  23 ;CHECK: buffer_load_dword
  24 ;CHECK: buffer_load_dword
  25 ;CHECK: v_add_f32_e32
  26 define amdgpu_ps float @test2(i32 inreg %idx0, i32 inreg %idx1) {
  27 main_body:
  28   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  29   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  30   %out = fadd float %src0, %src1
  31   %out.0 = bitcast float %out to i32 ; reinterpret the sum so the i32 overload can be exercised
  32   %out.1 = call i32 @llvm.amdgcn.softwqm.i32(i32 %out.0)
  33   %out.2 = bitcast i32 %out.1 to float ; back to float for the amdgpu_ps return value
  34   ret float %out.2
  35 }
  36
  37 ; Make sure the transition from WQM to Exact to softwqm does not trigger WQM.
  38 ; s_wqm_b64 of exec must not appear before the loads, nor between the store and the add that follows it.
  39 ;CHECK-LABEL: {{^}}test_softwqm1:
  40 ;CHECK-NOT: s_wqm_b64 exec, exec
  41 ;CHECK: buffer_load_dword
  42 ;CHECK: buffer_load_dword
  43 ;CHECK: buffer_store_dword
  44 ;CHECK-NOT: s_wqm_b64 exec, exec
  45 ;CHECK: v_add_f32_e32
  46 define amdgpu_ps float @test_softwqm1(i32 inreg %idx0, i32 inreg %idx1) {
  47 main_body:
  48   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  49   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  50   %temp = fadd float %src0, %src1
  51   call void @llvm.amdgcn.struct.buffer.store.f32(float %temp, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) ; store of the unwrapped sum
  52   %out = fadd float %temp, %temp
  53   %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out) ; only softwqm is used here, there is no llvm.amdgcn.wqm call
  54   ret float %out.0
  55 }
  56
  57 ; Make sure the transition from WQM to Exact to softwqm does trigger WQM.
  58 ; The stored value goes through llvm.amdgcn.wqm: exec is saved, s_wqm_b64 precedes the loads, and the saved exec is and-ed back in before the store.
  59 ;CHECK-LABEL: {{^}}test_softwqm2:
  60 ;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
  61 ;CHECK: s_wqm_b64 exec, exec
  62 ;CHECK: buffer_load_dword
  63 ;CHECK: buffer_load_dword
  64 ;CHECK: v_add_f32_e32
  65 ;CHECK: v_add_f32_e32
  66 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
  67 ;CHECK: buffer_store_dword
  68 define amdgpu_ps float @test_softwqm2(i32 inreg %idx0, i32 inreg %idx1) {
  69 main_body:
  70   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  71   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  72   %temp = fadd float %src0, %src1
  73   %temp.0 = call float @llvm.amdgcn.wqm.f32(float %temp) ; the wqm call is the difference from test_softwqm1
  74   call void @llvm.amdgcn.struct.buffer.store.f32(float %temp.0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  75   %out = fadd float %temp, %temp
  76   %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
  77   ret float %out.0
  78 }
  79
  80 ; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
  81 ; Make sure the transition from Exact to STRICT_WWM then softwqm does not trigger WQM.
  82 ; Same IR and expectations as test_strict_wwm1, but routed through the deprecated llvm.amdgcn.wwm intrinsic.
  83 ;CHECK-LABEL: {{^}}test_wwm1:
  84 ;CHECK: s_or_saveexec_b64 [[ORIG0:s\[[0-9]+:[0-9]+\]]], -1
  85 ;CHECK: buffer_load_dword
  86 ;CHECK: s_mov_b64 exec, [[ORIG0]]
  87 ;CHECK: buffer_store_dword
  88 ;CHECK: s_or_saveexec_b64 [[ORIG1:s\[[0-9]+:[0-9]+\]]], -1
  89 ;CHECK: buffer_load_dword
  90 ;CHECK: v_add_f32_e32
  91 ;CHECK: s_mov_b64 exec, [[ORIG1]]
  92 ;CHECK-NOT: s_wqm_b64
  93 define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
  94 main_body:
  95   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  96   call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) ; store sits between the two loads
  97   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  98   %temp = fadd float %src0, %src1
  99   %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp) ; deprecated spelling, kept on purpose per the note above
 100   %out = fadd float %temp.0, %temp.0
 101   %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
 102   ret float %out.0
 103 }
 104
 105 ; Make sure the transition from Exact to STRICT_WWM then softwqm does not trigger WQM.
 106 ; Two s_or_saveexec_b64 ..., -1 regions are expected, each closed by restoring the saved exec, with no s_wqm_b64 after the second restore.
 107 ;CHECK-LABEL: {{^}}test_strict_wwm1:
 108 ;CHECK: s_or_saveexec_b64 [[ORIG0:s\[[0-9]+:[0-9]+\]]], -1
 109 ;CHECK: buffer_load_dword
 110 ;CHECK: s_mov_b64 exec, [[ORIG0]]
 111 ;CHECK: buffer_store_dword
 112 ;CHECK: s_or_saveexec_b64 [[ORIG1:s\[[0-9]+:[0-9]+\]]], -1
 113 ;CHECK: buffer_load_dword
 114 ;CHECK: v_add_f32_e32
 115 ;CHECK: s_mov_b64 exec, [[ORIG1]]
 116 ;CHECK-NOT: s_wqm_b64
 117 define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
 118 main_body:
 119   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 120   call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) ; store sits between the two loads
 121   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 122   %temp = fadd float %src0, %src1
 123   %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
 124   %out = fadd float %temp.0, %temp.0
 125   %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out) ; softwqm consumes a value derived from the strict.wwm result
 126   ret float %out.0
 127 }
 128
 129
 130 ; Check that softwqm on one side of a branch does not trigger WQM for the shader.
 131 ; Only the IF block uses softwqm; the ELSE block just stores %data. No s_wqm_b64 of exec may appear between main_body and the ELSE block.
 132 ;CHECK-LABEL: {{^}}test_control_flow_0:
 133 ;CHECK-NEXT: ; %main_body
 134 ;CHECK-NOT: s_wqm_b64 exec, exec
 135 ;CHECK: %ELSE
 136 ;CHECK: store
 137 ;CHECK: %IF
 138 ;CHECK: buffer_load
 139 ;CHECK: buffer_load
 140 define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
 141 main_body:
 142   %cmp = icmp eq i32 %z, 0
 143   br i1 %cmp, label %IF, label %ELSE
 144
 145 IF:
 146   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 147   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 148   %out = fadd float %src0, %src1
 149   %data.if = call float @llvm.amdgcn.softwqm.f32(float %out)
 150   br label %END
 151
 152 ELSE:
 153   call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
 154   br label %END
 155
 156 END:
 157   %r = phi float [ %data.if, %IF ], [ %data, %ELSE ] ; softwqm result from IF, the untouched argument from ELSE
 158   ret float %r
 159 }
 160
 161 ; Check that softwqm on one side of a branch is treated as WQM in a WQM shader.
 162 ; Unlike test_control_flow_0, main_body issues two image samples; s_wqm_b64 is expected at entry, the ELSE store runs under the original exec, and the IF block must not narrow exec before its loads.
 163 ;CHECK-LABEL: {{^}}test_control_flow_1:
 164 ;CHECK-NEXT: ; %main_body
 165 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 166 ;CHECK-NEXT: s_wqm_b64 exec, exec
 167 ;CHECK: %ELSE
 168 ;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
 169 ;CHECK: store
 170 ;CHECK: s_mov_b64 exec, [[SAVED]]
 171 ;CHECK: %IF
 172 ;CHECK-NOT: s_and_saveexec_b64
 173 ;CHECK-NOT: s_and_b64 exec
 174 ;CHECK: buffer_load
 175 ;CHECK: buffer_load
 176 define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 inreg %idx0, i32 inreg %idx1, i32 %c, i32 %z, float %data) {
 177 main_body:
 178   %c.bc = bitcast i32 %c to float
 179   %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
 180   %tex0 = extractelement <4 x float> %tex, i32 0
 181   %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
 182   %data.sample = extractelement <4 x float> %dtex, i32 0 ; second sample is addressed by the first sample's result
 183
 184   %cmp = icmp eq i32 %z, 0
 185   br i1 %cmp, label %IF, label %ELSE
 186
 187 IF:
 188   %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
 189   %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
 190   %out = fadd float %src0, %src1
 191   %data.if = call float @llvm.amdgcn.softwqm.f32(float %out)
 192   br label %END
 193
 194 ELSE:
 195   call void @llvm.amdgcn.struct.buffer.store.f32(float %data.sample, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0) ; stores the sampled value, not %data
 196   br label %END
 197
 198 END:
 199   %r = phi float [ %data.if, %IF ], [ %data, %ELSE ] ; the ELSE edge still forwards the %data argument
 200   ret float %r
 201 }
 202
 203 declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
 204 declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
 205 declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) #3
 206 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
 207 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
 208 declare void @llvm.amdgcn.kill(i1) #1
 209 declare float @llvm.amdgcn.wqm.f32(float) #3
 210 declare float @llvm.amdgcn.softwqm.f32(float) #3
 211 declare i32 @llvm.amdgcn.softwqm.i32(i32) #3
 212 declare float @llvm.amdgcn.strict.wwm.f32(float) #3
 213 declare float @llvm.amdgcn.wwm.f32(float) #3
 214
 215 attributes #1 = { nounwind }
 216 attributes #2 = { nounwind readonly }
 217 attributes #3 = { nounwind readnone }