; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI %s
; LDS (addrspace(3)) scratch arrays addressed by the fixed-@lds test kernels below.
@lds = addrspace(3) global [512 x float] undef, align 4
@lds.f64 = addrspace(3) global [512 x double] undef, align 8
; Two f32 loads at element offsets 0 and 64 (one st64 stride apart) should
; merge into a single ds_read2st64_b32 with offset0:0 offset1:1.
; SI-LABEL: @simple_read2st64_f32_0_1
; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
; SI: s_waitcnt lgkmcnt(0)
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 64
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}
; Element offsets 64 and 128 are st64 multiples 1 and 2 -> offset0:1 offset1:2.
; SI-LABEL: @simple_read2st64_f32_1_2
; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
; SI: s_waitcnt lgkmcnt(0)
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x.1 = add nsw i32 %x.i, 128
  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}
; 16320 = 255 * 64 elements, the largest second offset expressible in the
; 8-bit offset1 field for b32 st64 -> offset0:1 offset1:255.
; SI-LABEL: @simple_read2st64_f32_max_offset
; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255
; SI: s_waitcnt lgkmcnt(0)
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]]
; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x.1 = add nsw i32 %x.i, 16320
  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}
; 16384 elements (byte offset 0x10000) exceeds the encodable st64 range, so
; no read2st64 is formed; the second address is materialized with a v_add.
; SI-LABEL: @simple_read2st64_f32_over_max_offset
; SI-NOT: ds_read2st64_b32
; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
; SI-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
; SI-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x.1 = add nsw i32 %x.i, 16384
  %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}
; Element distance 63 is not a multiple of 64, so read2st64 must not form.
; SI-LABEL: @odd_invalid_read2st64_f32_0
; SI-NOT: ds_read2st64_b32
define amdgpu_kernel void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 63
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}
; Element distance 127 - 64 = 63 is not a multiple of 64 -> no read2st64.
; SI-LABEL: @odd_invalid_read2st64_f32_1
; SI-NOT: ds_read2st64_b32
define amdgpu_kernel void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
  %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  %add.x.1 = add nsw i32 %x.i, 127
  %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1
  %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  %sum = fadd float %val0, %val1
  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  store float %sum, float addrspace(1)* %out.gep, align 4
  ret void
}
; f64 variant of the 0/1 case: aligned b64 loads 64 elements apart merge
; into one ds_read2st64_b64.
; SI-LABEL: @simple_read2st64_f64_0_1
; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1
; SI: s_waitcnt lgkmcnt(0)
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 64
  %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 8
  ret void
}
; f64 variant of the 1/2 case -> ds_read2st64_b64 offset0:1 offset1:2.
; SI-LABEL: @simple_read2st64_f64_1_2
; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
; SI: s_waitcnt lgkmcnt(0)
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x.1 = add nsw i32 %x.i, 128
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 8
  ret void
}
; align 4 on f64 loads forces them to be split into b32 pairs, so we get two
; ds_read2_b32 instead of a single ds_read2st64_b64.
; SI-LABEL: @misaligned_read2st64_f64
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
define amdgpu_kernel void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
  %val0 = load double, double addrspace(3)* %arrayidx0, align 4
  %add.x = add nsw i32 %x.i, 64
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
  %val1 = load double, double addrspace(3)* %arrayidx1, align 4
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 4
  ret void
}
; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff
; (8128 = 127 * 64 elements is the largest encodable second offset for b64).
; SI-LABEL: @simple_read2st64_f64_max_offset
; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127
; SI: s_waitcnt lgkmcnt(0)
; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 256
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x.1 = add nsw i32 %x.i, 8128
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 8
  ret void
}
; 8192 doubles (byte offset 0x10000) is out of range for st64 encoding, so the
; loads stay separate and the large offset is added explicitly.
; SI-LABEL: @simple_read2st64_f64_over_max_offset
; SI-NOT: ds_read2st64_b64
; SI-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x.1 = add nsw i32 %x.i, 8192
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 8
  ret void
}
; 8129 - 64 = 8065 elements is not a multiple of 64 -> no read2st64.
; SI-LABEL: @invalid_read2st64_f64_odd_offset
; SI-NOT: ds_read2st64_b64
define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %add.x.0 = add nsw i32 %x.i, 64
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x.1 = add nsw i32 %x.i, 8129
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.1
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 8
  ret void
}
; The stride of 8 elements is 8 * 8 bytes. We need to make sure the
; stride in elements, not bytes, is a multiple of 64: this case must use
; plain ds_read2_b64, not the st64 form.
; SI-LABEL: @byte_size_only_divisible_64_read2_f64
; SI-NOT: ds_read2st_b64
; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
define amdgpu_kernel void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
  %val0 = load double, double addrspace(3)* %arrayidx0, align 8
  %add.x = add nsw i32 %x.i, 8
  %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
  %val1 = load double, double addrspace(3)* %arrayidx1, align 8
  %sum = fadd double %val0, %val1
  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
  store double %sum, double addrspace(1)* %out.gep, align 4
  ret void
}
; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.x() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }