llvm/test/CodeGen/AMDGPU/ds_read2.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
   3 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
   4 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s
   5
   6 ; FIXME: We don't get cases where the address was an SGPR because we
   7 ; get a copy to the address register for each one.
   8
   9 @lds = addrspace(3) global [512 x float] undef, align 4 ; 512 floats in addrspace(3), 4-byte aligned; indexed by the f32 tests below
  10 @lds.f64 = addrspace(3) global [512 x double] undef, align 8 ; 512 doubles in addrspace(3), 8-byte aligned
  11
  12 define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
  13 ; CI-LABEL: simple_read2_f32:
  14 ; CI:       ; %bb.0:
  15 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
  16 ; CI-NEXT:    s_mov_b32 m0, -1
  17 ; CI-NEXT:    ds_read2_b32 v[1:2], v0 offset1:8
  18 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
  19 ; CI-NEXT:    s_mov_b32 s3, 0xf000
  20 ; CI-NEXT:    s_mov_b32 s2, 0
  21 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
  22 ; CI-NEXT:    v_add_f32_e32 v2, v1, v2
  23 ; CI-NEXT:    v_mov_b32_e32 v1, 0
  24 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
  25 ; CI-NEXT:    s_endpgm
  26 ;
  27 ; GFX9-LABEL: simple_read2_f32:
  28 ; GFX9:       ; %bb.0:
  29 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
  30 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:8
  31 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
  32 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
  33 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
  34 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
  35 ; GFX9-NEXT:    s_endpgm
       ; Basic case: two 4-byte-aligned float loads from @lds, 8 elements apart,
       ; are summed and stored to %out. The checks above expect the two loads to be
       ; emitted as a single ds_read2_b32 with offset1:8.
  36   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  37   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  38   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  39   %add.x = add nsw i32 %x.i, 8 ; second element is 8 floats past the first
  40   %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  41   %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  42   %sum = fadd float %val0, %val1
  43   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  44   store float %sum, float addrspace(1)* %out.gep, align 4
  45   ret void
  46 }
  47
  48 define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
  49 ; CI-LABEL: simple_read2_f32_max_offset:
  50 ; CI:       ; %bb.0:
  51 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
  52 ; CI-NEXT:    s_mov_b32 m0, -1
  53 ; CI-NEXT:    ds_read2_b32 v[1:2], v0 offset1:255
  54 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
  55 ; CI-NEXT:    s_mov_b32 s3, 0xf000
  56 ; CI-NEXT:    s_mov_b32 s2, 0
  57 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
  58 ; CI-NEXT:    v_add_f32_e32 v2, v1, v2
  59 ; CI-NEXT:    v_mov_b32_e32 v1, 0
  60 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
  61 ; CI-NEXT:    s_endpgm
  62 ;
  63 ; GFX9-LABEL: simple_read2_f32_max_offset:
  64 ; GFX9:       ; %bb.0:
  65 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
  66 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:255
  67 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
  68 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
  69 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
  70 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
  71 ; GFX9-NEXT:    s_endpgm
       ; Same shape as simple_read2_f32, but the second load is 255 elements past
       ; the first. The checks above expect the pair to still be merged into one
       ; ds_read2_b32, with offset1:255.
  72   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  73   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
  74   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
  75   %add.x = add nsw i32 %x.i, 255 ; element distance of 255 between the two loads
  76   %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
  77   %val1 = load float, float addrspace(3)* %arrayidx1, align 4
  78   %sum = fadd float %val0, %val1
  79   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
  80   store float %sum, float addrspace(1)* %out.gep, align 4
  81   ret void
  82 }
  83
  84 define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
  85 ; CI-LABEL: simple_read2_f32_too_far:
  86 ; CI:       ; %bb.0:
  87 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
  88 ; CI-NEXT:    s_mov_b32 m0, -1
  89 ; CI-NEXT:    ds_read_b32 v1, v0
  90 ; CI-NEXT:    ds_read_b32 v2, v0 offset:1028
  91 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
  92 ; CI-NEXT:    s_mov_b32 s3, 0xf000
  93 ; CI-NEXT:    s_mov_b32 s2, 0
  94 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
  95 ; CI-NEXT:    v_add_f32_e32 v2, v1, v2
  96 ; CI-NEXT:    v_mov_b32_e32 v1, 0
  97 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
  98 ; CI-NEXT:    s_endpgm
  99 ;
 100 ; GFX9-LABEL: simple_read2_f32_too_far:
 101 ; GFX9:       ; %bb.0:
 102 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 103 ; GFX9-NEXT:    ds_read_b32 v1, v0
 104 ; GFX9-NEXT:    ds_read_b32 v2, v0 offset:1028
 105 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 106 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 107 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
 108 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 109 ; GFX9-NEXT:    s_endpgm
       ; Negative case: the second load is 257 elements past the first. The checks
       ; above expect the loads to stay as two separate ds_read_b32 instructions,
       ; the second with a byte offset of 1028 (257 * 4).
 110   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 111   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
 112   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
 113   %add.x = add nsw i32 %x.i, 257 ; element distance of 257 between the two loads
 114   %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
 115   %val1 = load float, float addrspace(3)* %arrayidx1, align 4
 116   %sum = fadd float %val0, %val1
 117   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
 118   store float %sum, float addrspace(1)* %out.gep, align 4
 119   ret void
 120 }
 121
 122 define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
 123 ; CI-LABEL: simple_read2_f32_x2:
 124 ; CI:       ; %bb.0:
 125 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 126 ; CI-NEXT:    s_mov_b32 m0, -1
 127 ; CI-NEXT:    ds_read2_b32 v[1:2], v0 offset1:8
 128 ; CI-NEXT:    ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
 129 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 130 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 131 ; CI-NEXT:    s_mov_b32 s2, 0
 132 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 133 ; CI-NEXT:    v_add_f32_e32 v1, v1, v2
 134 ; CI-NEXT:    v_add_f32_e32 v2, v3, v4
 135 ; CI-NEXT:    v_add_f32_e32 v2, v1, v2
 136 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 137 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 138 ; CI-NEXT:    s_endpgm
 139 ;
 140 ; GFX9-LABEL: simple_read2_f32_x2:
 141 ; GFX9:       ; %bb.0:
 142 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
 143 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v4 offset1:8
 144 ; GFX9-NEXT:    ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
 145 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 146 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 147 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 148 ; GFX9-NEXT:    v_add_f32_e32 v1, v2, v3
 149 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 150 ; GFX9-NEXT:    global_store_dword v4, v0, s[0:1]
 151 ; GFX9-NEXT:    s_endpgm
       ; Four float loads from @lds at element offsets +0, +8, +11 and +27 from the
       ; workitem id. The checks above expect them to be paired into two
       ; ds_read2_b32 instructions: (offset1:8) and (offset0:11 offset1:27).
 152   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 153   %idx.0 = add nsw i32 %tid.x, 0
 154   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
 155   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
 156
 157   %idx.1 = add nsw i32 %tid.x, 8
 158   %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
 159   %val1 = load float, float addrspace(3)* %arrayidx1, align 4
 160   %sum.0 = fadd float %val0, %val1 ; sum of the first pair (+0, +8)
 161
 162   %idx.2 = add nsw i32 %tid.x, 11
 163   %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
 164   %val2 = load float, float addrspace(3)* %arrayidx2, align 4
 165
 166   %idx.3 = add nsw i32 %tid.x, 27
 167   %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
 168   %val3 = load float, float addrspace(3)* %arrayidx3, align 4
 169   %sum.1 = fadd float %val2, %val3 ; sum of the second pair (+11, +27)
 170
 171   %sum = fadd float %sum.0, %sum.1
 172   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
 173   store float %sum, float addrspace(1)* %out.gep, align 4
 174   ret void
 175 }
 176
 177 ; Make sure there is an instruction between the two sets of reads.
 178 define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
 179 ; CI-LABEL: simple_read2_f32_x2_barrier:
 180 ; CI:       ; %bb.0:
 181 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 182 ; CI-NEXT:    s_mov_b32 m0, -1
 183 ; CI-NEXT:    ds_read2_b32 v[1:2], v0 offset1:8
 184 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 185 ; CI-NEXT:    s_barrier
 186 ; CI-NEXT:    ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
 187 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 188 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 189 ; CI-NEXT:    v_add_f32_e32 v1, v1, v2
 190 ; CI-NEXT:    s_mov_b32 s2, 0
 191 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 192 ; CI-NEXT:    v_add_f32_e32 v2, v3, v4
 193 ; CI-NEXT:    v_add_f32_e32 v2, v1, v2
 194 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 195 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 196 ; CI-NEXT:    s_endpgm
 197 ;
 198 ; GFX9-LABEL: simple_read2_f32_x2_barrier:
 199 ; GFX9:       ; %bb.0:
 200 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
 201 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v4 offset1:8
 202 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 203 ; GFX9-NEXT:    s_barrier
 204 ; GFX9-NEXT:    ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
 205 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 206 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 207 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 208 ; GFX9-NEXT:    v_add_f32_e32 v1, v2, v3
 209 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 210 ; GFX9-NEXT:    global_store_dword v4, v0, s[0:1]
 211 ; GFX9-NEXT:    s_endpgm
       ; Same four loads as simple_read2_f32_x2 (+0, +8, +11, +27), but with a call
       ; to @llvm.amdgcn.s.barrier between the first and second pair. The checks
       ; above expect each pair to be merged into its own ds_read2_b32, with the
       ; s_barrier emitted between the two merged reads.
 212   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 213   %idx.0 = add nsw i32 %tid.x, 0
 214   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
 215   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
 216
 217   %idx.1 = add nsw i32 %tid.x, 8
 218   %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
 219   %val1 = load float, float addrspace(3)* %arrayidx1, align 4
 220   %sum.0 = fadd float %val0, %val1
 221
 222   call void @llvm.amdgcn.s.barrier() #2 ; separates the first pair of loads from the second
 223
 224   %idx.2 = add nsw i32 %tid.x, 11
 225   %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
 226   %val2 = load float, float addrspace(3)* %arrayidx2, align 4
 227
 228   %idx.3 = add nsw i32 %tid.x, 27
 229   %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
 230   %val3 = load float, float addrspace(3)* %arrayidx3, align 4
 231   %sum.1 = fadd float %val2, %val3
 232
 233   %sum = fadd float %sum.0, %sum.1
 234   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0
 235   store float %sum, float addrspace(1)* %out.gep, align 4
 236   ret void
 237 }
 238
 239 ; For some reason, adding something to the base address for the first element
 240 ; resulted in only the inner pair being folded. The checks below expect both pairs to be merged, so this note may be stale.
 241 define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
 242 ; CI-LABEL: simple_read2_f32_x2_nonzero_base:
 243 ; CI:       ; %bb.0:
 244 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 245 ; CI-NEXT:    s_mov_b32 m0, -1
 246 ; CI-NEXT:    ds_read2_b32 v[1:2], v0 offset0:2 offset1:8
 247 ; CI-NEXT:    ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
 248 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 249 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 250 ; CI-NEXT:    s_mov_b32 s2, 0
 251 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 252 ; CI-NEXT:    v_add_f32_e32 v1, v1, v2
 253 ; CI-NEXT:    v_add_f32_e32 v2, v3, v4
 254 ; CI-NEXT:    v_add_f32_e32 v2, v1, v2
 255 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 256 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:8
 257 ; CI-NEXT:    s_endpgm
 258 ;
 259 ; GFX9-LABEL: simple_read2_f32_x2_nonzero_base:
 260 ; GFX9:       ; %bb.0:
 261 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
 262 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v4 offset0:2 offset1:8
 263 ; GFX9-NEXT:    ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
 264 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 265 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 266 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 267 ; GFX9-NEXT:    v_add_f32_e32 v1, v2, v3
 268 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 269 ; GFX9-NEXT:    global_store_dword v4, v0, s[0:1] offset:8
 270 ; GFX9-NEXT:    s_endpgm
       ; Variant of simple_read2_f32_x2 where the first load is at +2 instead of +0,
       ; so no load uses the unmodified workitem id as its index. The checks above
       ; expect two ds_read2_b32 instructions: (offset0:2 offset1:8) and
       ; (offset0:11 offset1:27).
 271   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 272   %idx.0 = add nsw i32 %tid.x, 2 ; nonzero offset on the first element
 273   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
 274   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
 275
 276   %idx.1 = add nsw i32 %tid.x, 8
 277   %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
 278   %val1 = load float, float addrspace(3)* %arrayidx1, align 4
 279   %sum.0 = fadd float %val0, %val1
 280
 281   %idx.2 = add nsw i32 %tid.x, 11
 282   %arrayidx2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
 283   %val2 = load float, float addrspace(3)* %arrayidx2, align 4
 284
 285   %idx.3 = add nsw i32 %tid.x, 27
 286   %arrayidx3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
 287   %val3 = load float, float addrspace(3)* %arrayidx3, align 4
 288   %sum.1 = fadd float %val2, %val3
 289
 290   %sum = fadd float %sum.0, %sum.1
 291   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %idx.0 ; indexes %out with %tid.x + 2, matching the offset:8 on the expected store
 292   store float %sum, float addrspace(1)* %out.gep, align 4
 293   ret void
 294 }
 295
 296 ; Be careful with vectors of pointers. We don't know if the 2 pointers
 297 ; in the vector really share the same base, so this is not safe to
 298 ; merge.
 299 ; The base pointers come from different subregisters of the same super
 300 ; register, so we can't safely merge this.
 301 define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
 302 ; CI-LABEL: read2_ptr_is_subreg_arg_f32:
 303 ; CI:       ; %bb.0:
 304 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 305 ; CI-NEXT:    s_mov_b32 m0, -1
 306 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 307 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 308 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 309 ; CI-NEXT:    v_mov_b32_e32 v2, s3
 310 ; CI-NEXT:    ds_read_b32 v1, v1 offset:32
 311 ; CI-NEXT:    ds_read_b32 v2, v2
 312 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 313 ; CI-NEXT:    s_mov_b32 s2, 0
 314 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 315 ; CI-NEXT:    v_add_f32_e32 v2, v1, v2
 316 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 317 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 318 ; CI-NEXT:    s_endpgm
 319 ;
 320 ; GFX9-LABEL: read2_ptr_is_subreg_arg_f32:
 321 ; GFX9:       ; %bb.0:
 322 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 323 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 324 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 325 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 326 ; GFX9-NEXT:    v_mov_b32_e32 v2, s3
 327 ; GFX9-NEXT:    ds_read_b32 v1, v1 offset:32
 328 ; GFX9-NEXT:    ds_read_b32 v2, v2
 329 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 330 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
 331 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 332 ; GFX9-NEXT:    s_endpgm
       ; Negative case: the two load addresses are the two lanes of a vector GEP on
       ; the <2 x pointer> kernel argument %lds.ptr. The checks above expect two
       ; separate ds_read_b32 instructions, each with its own base register
       ; (copied from s2 and s3), rather than a merged ds_read2_b32.
 333   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 334   %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
 335   %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 ; NOTE(review): writes lane 0 again, replacing %x.i and leaving lane 1 undef - lane 1 may have been intended; confirm before changing, since the checks above match the IR as written
 336   %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
 337   %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
 338   %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
 339   %val0 = load float, float addrspace(3)* %gep.0, align 4
 340   %val1 = load float, float addrspace(3)* %gep.1, align 4
 341   %add.x = add nsw i32 %x.i, 8 ; NOTE(review): %add.x is not used anywhere in this function
 342   %sum = fadd float %val0, %val1
 343   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
 344   store float %sum, float addrspace(1)* %out.gep, align 4
 345   ret void
 346 }
 347
 348 ; Apply a constant scalar offset after the pointer vector extract.  We
 349 ; are rejecting merges that have the same, constant 0 offset, so make
 350 ; sure we are really rejecting it because of the different
 351 ; subregisters.
 352 define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
 353 ; CI-LABEL: read2_ptr_is_subreg_arg_offset_f32:
 354 ; CI:       ; %bb.0:
 355 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 356 ; CI-NEXT:    s_mov_b32 m0, -1
 357 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 358 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 359 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 360 ; CI-NEXT:    v_mov_b32_e32 v2, s3
 361 ; CI-NEXT:    ds_read_b32 v1, v1 offset:32
 362 ; CI-NEXT:    ds_read_b32 v2, v2 offset:32
 363 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 364 ; CI-NEXT:    s_mov_b32 s2, 0
 365 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 366 ; CI-NEXT:    v_add_f32_e32 v2, v1, v2
 367 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 368 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 369 ; CI-NEXT:    s_endpgm
 370 ;
 371 ; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32:
 372 ; GFX9:       ; %bb.0:
 373 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
 374 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 375 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 376 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 377 ; GFX9-NEXT:    v_mov_b32_e32 v2, s3
 378 ; GFX9-NEXT:    ds_read_b32 v1, v1 offset:32
 379 ; GFX9-NEXT:    ds_read_b32 v2, v2 offset:32
 380 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 381 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
 382 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 383 ; GFX9-NEXT:    s_endpgm
       ; Like read2_ptr_is_subreg_arg_f32, but the lane-1 pointer gets an extra
       ; scalar GEP of 8 floats after extraction. The checks above expect two
       ; separate ds_read_b32 instructions that both carry offset:32 yet use
       ; different base registers (copied from s2 and s3).
 384   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 385   %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
 386   %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 ; NOTE(review): writes lane 0 again, replacing %x.i and leaving lane 1 undef - lane 1 may have been intended; confirm before changing, since the checks above match the IR as written
 387   %gep = getelementptr inbounds float, <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
 388   %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
 389   %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
 390
 391   ; Apply an additional offset after the vector that will be more obviously folded.
 392   %gep.1.offset = getelementptr float, float addrspace(3)* %gep.1, i32 8
 393
 394   %val0 = load float, float addrspace(3)* %gep.0, align 4
 395   %val1 = load float, float addrspace(3)* %gep.1.offset, align 4
 396   %add.x = add nsw i32 %x.i, 8 ; NOTE(review): %add.x is not used anywhere in this function
 397   %sum = fadd float %val0, %val1
 398   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
 399   store float %sum, float addrspace(1)* %out.gep, align 4
 400   ret void
 401 }
 402
 403 define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
 404 ; CI-LABEL: read2_ptr_is_subreg_f32:
 405 ; CI:       ; %bb.0:
 406 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 407 ; CI-NEXT:    s_mov_b32 m0, -1
 408 ; CI-NEXT:    ds_read2_b32 v[1:2], v0 offset1:8
 409 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 410 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 411 ; CI-NEXT:    s_mov_b32 s2, 0
 412 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 413 ; CI-NEXT:    v_add_f32_e32 v2, v1, v2
 414 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 415 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 416 ; CI-NEXT:    s_endpgm
 417 ;
 418 ; GFX9-LABEL: read2_ptr_is_subreg_f32:
 419 ; GFX9:       ; %bb.0:
 420 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 421 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:8
 422 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 423 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 424 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 425 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 426 ; GFX9-NEXT:    s_endpgm
       ; Vector-of-pointers case where both lanes are built from the same global
       ; (@lds) and the per-lane indices are %x.i + 0 and %x.i + 8. Unlike the
       ; kernel-argument variants, the checks above expect the two loads to be
       ; merged into a single ds_read2_b32 with offset1:8.
 427   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 428   %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
 429   %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1
 430   %x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
 431   %x.i.v.1 = insertelement <2 x i32> %x.i.v.0, i32 %x.i, i32 1
 432   %idx = add <2 x i32> %x.i.v.1, <i32 0, i32 8> ; lane 0 indexes %x.i, lane 1 indexes %x.i + 8
 433   %gep = getelementptr inbounds [512 x float], <2 x [512 x float] addrspace(3)*> %ptr.1, <2 x i32> <i32 0, i32 0>, <2 x i32> %idx
 434   %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
 435   %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
 436   %val0 = load float, float addrspace(3)* %gep.0, align 4
 437   %val1 = load float, float addrspace(3)* %gep.1, align 4
 438   %add.x = add nsw i32 %x.i, 8 ; NOTE(review): %add.x is not used anywhere in this function
 439   %sum = fadd float %val0, %val1
 440   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
 441   store float %sum, float addrspace(1)* %out.gep, align 4
 442   ret void
 443 }
 444
 445 define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
 446 ; CI-LABEL: simple_read2_f32_volatile_0:
 447 ; CI:       ; %bb.0:
 448 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 449 ; CI-NEXT:    s_mov_b32 m0, -1
 450 ; CI-NEXT:    ds_read_b32 v1, v0
 451 ; CI-NEXT:    ds_read_b32 v2, v0 offset:32
 452 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 453 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 454 ; CI-NEXT:    s_mov_b32 s2, 0
 455 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 456 ; CI-NEXT:    v_add_f32_e32 v2, v1, v2
 457 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 458 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 459 ; CI-NEXT:    s_endpgm
 460 ;
 461 ; GFX9-LABEL: simple_read2_f32_volatile_0:
 462 ; GFX9:       ; %bb.0:
 463 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 464 ; GFX9-NEXT:    ds_read_b32 v1, v0
 465 ; GFX9-NEXT:    ds_read_b32 v2, v0 offset:32
 466 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 467 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 468 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
 469 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 470 ; GFX9-NEXT:    s_endpgm
       ; Same addresses as simple_read2_f32 (+0 and +8), but the FIRST load is
       ; volatile. The checks above expect the loads to remain two separate
       ; ds_read_b32 instructions instead of being merged.
 471   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 472   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
 473   %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4
 474   %add.x = add nsw i32 %x.i, 8
 475   %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
 476   %val1 = load float, float addrspace(3)* %arrayidx1, align 4
 477   %sum = fadd float %val0, %val1
 478   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
 479   store float %sum, float addrspace(1)* %out.gep, align 4
 480   ret void
 481 }
 482
 483 define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
 484 ; CI-LABEL: simple_read2_f32_volatile_1:
 485 ; CI:       ; %bb.0:
 486 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 487 ; CI-NEXT:    s_mov_b32 m0, -1
 488 ; CI-NEXT:    ds_read_b32 v1, v0
 489 ; CI-NEXT:    ds_read_b32 v2, v0 offset:32
 490 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 491 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 492 ; CI-NEXT:    s_mov_b32 s2, 0
 493 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 494 ; CI-NEXT:    v_add_f32_e32 v2, v1, v2
 495 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 496 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 497 ; CI-NEXT:    s_endpgm
 498 ;
 499 ; GFX9-LABEL: simple_read2_f32_volatile_1:
 500 ; GFX9:       ; %bb.0:
 501 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 502 ; GFX9-NEXT:    ds_read_b32 v1, v0
 503 ; GFX9-NEXT:    ds_read_b32 v2, v0 offset:32
 504 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 505 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 506 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
 507 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 508 ; GFX9-NEXT:    s_endpgm
       ; Same addresses as simple_read2_f32 (+0 and +8), but the SECOND load is
       ; volatile. The checks above expect the loads to remain two separate
       ; ds_read_b32 instructions instead of being merged.
 509   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 510   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
 511   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
 512   %add.x = add nsw i32 %x.i, 8
 513   %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
 514   %val1 = load volatile float, float addrspace(3)* %arrayidx1, align 4
 515   %sum = fadd float %val0, %val1
 516   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
 517   store float %sum, float addrspace(1)* %out.gep, align 4
 518   ret void
 519 }
 520
 521 ; Can't fold since the loads are not correctly aligned, unless unaligned access mode is enabled.
 522 define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
 523 ; CI-LABEL: unaligned_read2_f32:
 524 ; CI:       ; %bb.0:
 525 ; CI-NEXT:    s_load_dword s2, s[0:1], 0x2
 526 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 527 ; CI-NEXT:    s_mov_b32 m0, -1
 528 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 529 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 530 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 531 ; CI-NEXT:    v_add_i32_e32 v1, vcc, s2, v0
 532 ; CI-NEXT:    ds_read_u8 v2, v1 offset:34
 533 ; CI-NEXT:    ds_read_u8 v3, v1 offset:32
 534 ; CI-NEXT:    ds_read_u8 v4, v1 offset:3
 535 ; CI-NEXT:    ds_read_u8 v5, v1 offset:2
 536 ; CI-NEXT:    ds_read_u8 v6, v1 offset:1
 537 ; CI-NEXT:    ds_read_u8 v7, v1
 538 ; CI-NEXT:    ds_read_u8 v8, v1 offset:33
 539 ; CI-NEXT:    ds_read_u8 v1, v1 offset:35
 540 ; CI-NEXT:    s_waitcnt lgkmcnt(5)
 541 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 542 ; CI-NEXT:    s_waitcnt lgkmcnt(3)
 543 ; CI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 544 ; CI-NEXT:    v_or_b32_e32 v4, v4, v5
 545 ; CI-NEXT:    s_waitcnt lgkmcnt(1)
 546 ; CI-NEXT:    v_lshlrev_b32_e32 v5, 8, v8
 547 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 548 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 549 ; CI-NEXT:    v_or_b32_e32 v1, v1, v2
 550 ; CI-NEXT:    v_or_b32_e32 v6, v6, v7
 551 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 552 ; CI-NEXT:    v_or_b32_e32 v3, v5, v3
 553 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 554 ; CI-NEXT:    v_or_b32_e32 v4, v4, v6
 555 ; CI-NEXT:    v_or_b32_e32 v1, v1, v3
 556 ; CI-NEXT:    v_add_f32_e32 v2, v4, v1
 557 ; CI-NEXT:    s_mov_b32 s2, 0
 558 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 559 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 560 ; CI-NEXT:    s_endpgm
 561 ;
 562 ; GFX9-ALIGNED-LABEL: unaligned_read2_f32:
 563 ; GFX9-ALIGNED:       ; %bb.0:
 564 ; GFX9-ALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x8
 565 ; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 566 ; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 567 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 568 ; GFX9-ALIGNED-NEXT:    v_add_u32_e32 v1, s4, v0
 569 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v2, v1
 570 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v3, v1 offset:1
 571 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v4, v1 offset:2
 572 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v5, v1 offset:3
 573 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v6, v1 offset:32
 574 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v7, v1 offset:33
 575 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v8, v1 offset:34
 576 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v1, v1 offset:35
 577 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(6)
 578 ; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v2, v3, 8, v2
 579 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(4)
 580 ; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
 581 ; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 582 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(2)
 583 ; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v3, v7, 8, v6
 584 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 585 ; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v1, v1, 8, v8
 586 ; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
 587 ; GFX9-ALIGNED-NEXT:    v_add_f32_e32 v1, v2, v1
 588 ; GFX9-ALIGNED-NEXT:    global_store_dword v0, v1, s[2:3]
 589 ; GFX9-ALIGNED-NEXT:    s_endpgm
 590 ;
 591 ; GFX9-UNALIGNED-LABEL: unaligned_read2_f32:
 592 ; GFX9-UNALIGNED:       ; %bb.0:
 593 ; GFX9-UNALIGNED-NEXT:    s_load_dword s2, s[0:1], 0x8
 594 ; GFX9-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 595 ; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 596 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 597 ; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v0, s2, v2
 598 ; GFX9-UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:8
 599 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 600 ; GFX9-UNALIGNED-NEXT:    v_add_f32_e32 v0, v0, v1
 601 ; GFX9-UNALIGNED-NEXT:    global_store_dword v2, v0, s[0:1]
 602 ; GFX9-UNALIGNED-NEXT:    s_endpgm
       ; Two float loads 8 elements apart through the %lds pointer argument, both
       ; marked align 1. The checks above expect:
       ;  - bonaire, and gfx900 with -unaligned-access-mode: each float is read as
       ;    four ds_read_u8 bytes and reassembled with shifts/ors (no merge);
       ;  - gfx900 with +unaligned-access-mode: one ds_read2_b32 with offset1:8.
 603   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 604   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
 605   %val0 = load float, float addrspace(3)* %arrayidx0, align 1 ; align 1 is below the 4-byte size of the float being loaded
 606   %add.x = add nsw i32 %x.i, 8
 607   %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x
 608   %val1 = load float, float addrspace(3)* %arrayidx1, align 1 ; align 1 here as well
 609   %sum = fadd float %val0, %val1
 610   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
 611   store float %sum, float addrspace(1)* %out.gep, align 4
 612   ret void
 613 }
 614
 ; Two float loads at byte offsets 5 and 9 from an LDS base indexed by the
 ; workitem id, both marked align 1. CI and GFX9-ALIGNED are expected to build
 ; each value from ds_read_u8 bytes; GFX9-UNALIGNED is expected to use a single
 ; ds_read_b64 with offset:5.
 615 define amdgpu_kernel void @unaligned_offset_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
 616 ; CI-LABEL: unaligned_offset_read2_f32:
 617 ; CI:       ; %bb.0:
 618 ; CI-NEXT:    s_load_dword s2, s[0:1], 0x2
 619 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 620 ; CI-NEXT:    s_mov_b32 m0, -1
 621 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 622 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 623 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 624 ; CI-NEXT:    v_add_i32_e32 v1, vcc, s2, v0
 625 ; CI-NEXT:    ds_read_u8 v2, v1 offset:11
 626 ; CI-NEXT:    ds_read_u8 v3, v1 offset:9
 627 ; CI-NEXT:    ds_read_u8 v4, v1 offset:8
 628 ; CI-NEXT:    ds_read_u8 v5, v1 offset:7
 629 ; CI-NEXT:    ds_read_u8 v6, v1 offset:6
 630 ; CI-NEXT:    ds_read_u8 v7, v1 offset:5
 631 ; CI-NEXT:    ds_read_u8 v8, v1 offset:10
 632 ; CI-NEXT:    ds_read_u8 v1, v1 offset:12
 633 ; CI-NEXT:    s_waitcnt lgkmcnt(5)
 634 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 635 ; CI-NEXT:    s_waitcnt lgkmcnt(3)
 636 ; CI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 637 ; CI-NEXT:    v_or_b32_e32 v4, v4, v5
 638 ; CI-NEXT:    s_waitcnt lgkmcnt(1)
 639 ; CI-NEXT:    v_lshlrev_b32_e32 v5, 8, v8
 640 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 641 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 642 ; CI-NEXT:    v_or_b32_e32 v1, v1, v2
 643 ; CI-NEXT:    v_or_b32_e32 v6, v6, v7
 644 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 645 ; CI-NEXT:    v_or_b32_e32 v3, v5, v3
 646 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 647 ; CI-NEXT:    v_or_b32_e32 v4, v4, v6
 648 ; CI-NEXT:    v_or_b32_e32 v1, v1, v3
 649 ; CI-NEXT:    v_add_f32_e32 v2, v4, v1
 650 ; CI-NEXT:    s_mov_b32 s2, 0
 651 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 652 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 653 ; CI-NEXT:    s_endpgm
 654 ;
 655 ; GFX9-ALIGNED-LABEL: unaligned_offset_read2_f32:
 656 ; GFX9-ALIGNED:       ; %bb.0:
 657 ; GFX9-ALIGNED-NEXT:    s_load_dword s4, s[0:1], 0x8
 658 ; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 659 ; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 660 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 661 ; GFX9-ALIGNED-NEXT:    v_add_u32_e32 v1, s4, v0
 662 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v2, v1 offset:5
 663 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v3, v1 offset:6
 664 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v4, v1 offset:7
 665 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v5, v1 offset:8
 666 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v6, v1 offset:9
 667 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v7, v1 offset:10
 668 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v8, v1 offset:11
 669 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v1, v1 offset:12
 670 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(6)
 671 ; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v2, v3, 8, v2
 672 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(4)
 673 ; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
 674 ; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 675 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(2)
 676 ; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v3, v7, 8, v6
 677 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 678 ; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v1, v1, 8, v8
 679 ; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
 680 ; GFX9-ALIGNED-NEXT:    v_add_f32_e32 v1, v2, v1
 681 ; GFX9-ALIGNED-NEXT:    global_store_dword v0, v1, s[2:3]
 682 ; GFX9-ALIGNED-NEXT:    s_endpgm
 683 ;
 684 ; GFX9-UNALIGNED-LABEL: unaligned_offset_read2_f32:
 685 ; GFX9-UNALIGNED:       ; %bb.0:
 686 ; GFX9-UNALIGNED-NEXT:    s_load_dword s2, s[0:1], 0x8
 687 ; GFX9-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 688 ; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 689 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 690 ; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v0, s2, v2
 691 ; GFX9-UNALIGNED-NEXT:    ds_read_b64 v[0:1], v0 offset:5
 692 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 693 ; GFX9-UNALIGNED-NEXT:    v_add_f32_e32 v0, v0, v1
 694 ; GFX9-UNALIGNED-NEXT:    global_store_dword v2, v0, s[0:1]
 695 ; GFX9-UNALIGNED-NEXT:    s_endpgm
 696   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 697   %base = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
       ; View the base as i8 so the two float loads can sit at byte offsets 5 and 9.
 698   %base.i8 = bitcast float addrspace(3)* %base to i8 addrspace(3)*
 699   %addr0.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 5
 700   %addr0 = bitcast i8 addrspace(3)* %addr0.i8 to float addrspace(3)*
 701   %val0 = load float, float addrspace(3)* %addr0, align 1
 702   %addr1.i8 = getelementptr inbounds i8, i8 addrspace(3)* %base.i8, i32 9
 703   %addr1 = bitcast i8 addrspace(3)* %addr1.i8 to float addrspace(3)*
 704   %val1 = load float, float addrspace(3)* %addr1, align 1
 705   %sum = fadd float %val0, %val1
 706   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
 707   store float %sum, float addrspace(1)* %out.gep, align 4
 708   ret void
 709 }
 710
 ; Loads floats at elements x and x+8 of %lds with only 2-byte alignment
 ; (align 2). CI and GFX9-ALIGNED are expected to build each value from two
 ; ds_read_u16; GFX9-UNALIGNED is expected to merge both loads into a single
 ; ds_read2_b32 with offset1:8.
 711 define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
 712 ; CI-LABEL: misaligned_2_simple_read2_f32:
 713 ; CI:       ; %bb.0:
 714 ; CI-NEXT:    s_load_dword s2, s[0:1], 0x2
 715 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 716 ; CI-NEXT:    s_mov_b32 m0, -1
 717 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 718 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 719 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 720 ; CI-NEXT:    v_add_i32_e32 v1, vcc, s2, v0
 721 ; CI-NEXT:    ds_read_u16 v2, v1 offset:32
 722 ; CI-NEXT:    ds_read_u16 v3, v1 offset:2
 723 ; CI-NEXT:    ds_read_u16 v4, v1
 724 ; CI-NEXT:    ds_read_u16 v1, v1 offset:34
 725 ; CI-NEXT:    s_mov_b32 s2, 0
 726 ; CI-NEXT:    s_waitcnt lgkmcnt(2)
 727 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 728 ; CI-NEXT:    s_waitcnt lgkmcnt(1)
 729 ; CI-NEXT:    v_or_b32_e32 v3, v3, v4
 730 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 731 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 732 ; CI-NEXT:    v_or_b32_e32 v1, v1, v2
 733 ; CI-NEXT:    v_add_f32_e32 v2, v3, v1
 734 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 735 ; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 736 ; CI-NEXT:    s_endpgm
 737 ;
 738 ; GFX9-ALIGNED-LABEL: misaligned_2_simple_read2_f32:
 739 ; GFX9-ALIGNED:       ; %bb.0:
 740 ; GFX9-ALIGNED-NEXT:    s_load_dword s2, s[0:1], 0x8
 741 ; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 742 ; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 743 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 744 ; GFX9-ALIGNED-NEXT:    v_add_u32_e32 v1, s2, v0
 745 ; GFX9-ALIGNED-NEXT:    ds_read_u16 v2, v1
 746 ; GFX9-ALIGNED-NEXT:    ds_read_u16 v3, v1 offset:2
 747 ; GFX9-ALIGNED-NEXT:    ds_read_u16 v4, v1 offset:32
 748 ; GFX9-ALIGNED-NEXT:    ds_read_u16 v1, v1 offset:34
 749 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(2)
 750 ; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
 751 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 752 ; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
 753 ; GFX9-ALIGNED-NEXT:    v_add_f32_e32 v1, v2, v1
 754 ; GFX9-ALIGNED-NEXT:    global_store_dword v0, v1, s[0:1]
 755 ; GFX9-ALIGNED-NEXT:    s_endpgm
 756 ;
 757 ; GFX9-UNALIGNED-LABEL: misaligned_2_simple_read2_f32:
 758 ; GFX9-UNALIGNED:       ; %bb.0:
 759 ; GFX9-UNALIGNED-NEXT:    s_load_dword s2, s[0:1], 0x8
 760 ; GFX9-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 761 ; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 762 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 763 ; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v0, s2, v2
 764 ; GFX9-UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:8
 765 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 766 ; GFX9-UNALIGNED-NEXT:    v_add_f32_e32 v0, v0, v1
 767 ; GFX9-UNALIGNED-NEXT:    global_store_dword v2, v0, s[0:1]
 768 ; GFX9-UNALIGNED-NEXT:    s_endpgm
 769   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 770   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
 771   %val0 = load float, float addrspace(3)* %arrayidx0, align 2
 772   %add.x = add nsw i32 %x.i, 8
 773   %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x
 774   %val1 = load float, float addrspace(3)* %arrayidx1, align 2
 775   %sum = fadd float %val0, %val1
 776   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i
 777   store float %sum, float addrspace(1)* %out.gep, align 4
 778   ret void
 779 }
 780
 ; Baseline f64 case: 8-byte-aligned loads of @lds.f64 elements x and x+8 are
 ; expected to merge into a single ds_read2_b64 with offset1:8 on CI and GFX9.
 781 define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {
 782 ; CI-LABEL: simple_read2_f64:
 783 ; CI:       ; %bb.0:
 784 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 785 ; CI-NEXT:    s_mov_b32 m0, -1
 786 ; CI-NEXT:    ds_read2_b64 v[0:3], v4 offset1:8
 787 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 788 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 789 ; CI-NEXT:    s_mov_b32 s2, 0
 790 ; CI-NEXT:    v_mov_b32_e32 v5, 0
 791 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 792 ; CI-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
 793 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
 794 ; CI-NEXT:    s_endpgm
 795 ;
 796 ; GFX9-LABEL: simple_read2_f64:
 797 ; GFX9:       ; %bb.0:
 798 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 799 ; GFX9-NEXT:    ds_read2_b64 v[0:3], v4 offset1:8
 800 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 801 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 802 ; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
 803 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 804 ; GFX9-NEXT:    s_endpgm
 805   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 806   %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
 807   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
 808   %add.x = add nsw i32 %x.i, 8
 809   %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
 810   %val1 = load double, double addrspace(3)* %arrayidx1, align 8
 811   %sum = fadd double %val0, %val1
 812   %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
 813   store double %sum, double addrspace(1)* %out.gep, align 8
 814   ret void
 815 }
 816
 ; Like simple_read2_f64, but the second element is 255 doubles away. The
 ; checks expect the pair to still merge, encoded as ds_read2_b64 offset1:255.
 ; NOTE(review): the name implies 255 is the largest encodable element offset;
 ; confirm against the ISA documentation.
 817 define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
 818 ; CI-LABEL: simple_read2_f64_max_offset:
 819 ; CI:       ; %bb.0:
 820 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 821 ; CI-NEXT:    s_mov_b32 m0, -1
 822 ; CI-NEXT:    ds_read2_b64 v[0:3], v4 offset1:255
 823 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 824 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 825 ; CI-NEXT:    s_mov_b32 s2, 0
 826 ; CI-NEXT:    v_mov_b32_e32 v5, 0
 827 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 828 ; CI-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
 829 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
 830 ; CI-NEXT:    s_endpgm
 831 ;
 832 ; GFX9-LABEL: simple_read2_f64_max_offset:
 833 ; GFX9:       ; %bb.0:
 834 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 835 ; GFX9-NEXT:    ds_read2_b64 v[0:3], v4 offset1:255
 836 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 837 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 838 ; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
 839 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 840 ; GFX9-NEXT:    s_endpgm
 841   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 842   %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
 843   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
 844   %add.x = add nsw i32 %x.i, 255
 845   %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
 846   %val1 = load double, double addrspace(3)* %arrayidx1, align 8
 847   %sum = fadd double %val0, %val1
 848   %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
 849   store double %sum, double addrspace(1)* %out.gep, align 8
 850   ret void
 851 }
 852
 ; Negative case: the second element is 257 doubles away. The checks expect no
 ; ds_read2_b64 here, only two separate ds_read_b64 instructions, the second
 ; one carrying the byte offset 2056.
 853 define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
 854 ; CI-LABEL: simple_read2_f64_too_far:
 855 ; CI:       ; %bb.0:
 856 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 857 ; CI-NEXT:    s_mov_b32 m0, -1
 858 ; CI-NEXT:    ds_read_b64 v[1:2], v0
 859 ; CI-NEXT:    ds_read_b64 v[3:4], v0 offset:2056
 860 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 861 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 862 ; CI-NEXT:    s_mov_b32 s2, 0
 863 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 864 ; CI-NEXT:    v_add_f64 v[2:3], v[1:2], v[3:4]
 865 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 866 ; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
 867 ; CI-NEXT:    s_endpgm
 868 ;
 869 ; GFX9-LABEL: simple_read2_f64_too_far:
 870 ; GFX9:       ; %bb.0:
 871 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 872 ; GFX9-NEXT:    ds_read_b64 v[0:1], v4
 873 ; GFX9-NEXT:    ds_read_b64 v[2:3], v4 offset:2056
 874 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 875 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 876 ; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
 877 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 878 ; GFX9-NEXT:    s_endpgm
 879   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 880   %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
 881   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
       ; 257 elements * 8 bytes = 2056, the offset: on the second ds_read_b64 above.
 882   %add.x = add nsw i32 %x.i, 257
 883   %arrayidx1 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
 884   %val1 = load double, double addrspace(3)* %arrayidx1, align 8
 885   %sum = fadd double %val0, %val1
 886   %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
 887   store double %sum, double addrspace(1)* %out.gep, align 8
 888   ret void
 889 }
 890
 891 ; Alignment only 4
 ; Doubles at elements x and x+7 of %lds are loaded with align 4. The checks
 ; expect each double to be fetched as a dword pair: ds_read2_b32 offset1:1 for
 ; element x, and offset0:14 offset1:15 (7 * 8 bytes = 14 dwords) for x+7.
 892 define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
 893 ; CI-LABEL: misaligned_read2_f64:
 894 ; CI:       ; %bb.0:
 895 ; CI-NEXT:    s_load_dword s2, s[0:1], 0x2
 896 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 897 ; CI-NEXT:    s_mov_b32 m0, -1
 898 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 899 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 900 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 901 ; CI-NEXT:    v_add_i32_e32 v3, vcc, s2, v0
 902 ; CI-NEXT:    ds_read2_b32 v[1:2], v3 offset1:1
 903 ; CI-NEXT:    ds_read2_b32 v[3:4], v3 offset0:14 offset1:15
 904 ; CI-NEXT:    s_mov_b32 s2, 0
 905 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 906 ; CI-NEXT:    v_add_f64 v[2:3], v[1:2], v[3:4]
 907 ; CI-NEXT:    v_mov_b32_e32 v1, 0
 908 ; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
 909 ; CI-NEXT:    s_endpgm
 910 ;
 911 ; GFX9-LABEL: misaligned_read2_f64:
 912 ; GFX9:       ; %bb.0:
 913 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x8
 914 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 915 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 916 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 917 ; GFX9-NEXT:    v_add_u32_e32 v2, s2, v4
 918 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
 919 ; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:14 offset1:15
 920 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 921 ; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
 922 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 923 ; GFX9-NEXT:    s_endpgm
 924   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 925   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
 926   %val0 = load double, double addrspace(3)* %arrayidx0, align 4
 927   %add.x = add nsw i32 %x.i, 7
 928   %arrayidx1 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x
 929   %val1 = load double, double addrspace(3)* %arrayidx1, align 4
 930   %sum = fadd double %val0, %val1
 931   %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i32 %x.i
 932   store double %sum, double addrspace(1)* %out.gep, align 4
 933   ret void
 934 }
 935
 ; Four-element i32 array in addrspace(3), read at constant indices by the
 ; load_constant_adjacent_offsets and load_constant_disjoint_offsets tests.
 936 @foo = addrspace(3) global [4 x i32] undef, align 4
 937
 ; Constant-address loads of @foo[0] and @foo[1] (adjacent dwords, align 4).
 ; The checks expect both targets to fetch them with a single ds_read_b64
 ; through a zeroed address register.
 938 define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
 939 ; CI-LABEL: load_constant_adjacent_offsets:
 940 ; CI:       ; %bb.0:
 941 ; CI-NEXT:    v_mov_b32_e32 v0, 0
 942 ; CI-NEXT:    s_mov_b32 m0, -1
 943 ; CI-NEXT:    ds_read_b64 v[0:1], v0
 944 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 945 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 946 ; CI-NEXT:    s_mov_b32 s2, -1
 947 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 948 ; CI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 949 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 950 ; CI-NEXT:    s_endpgm
 951 ;
 952 ; GFX9-LABEL: load_constant_adjacent_offsets:
 953 ; GFX9:       ; %bb.0:
 954 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 955 ; GFX9-NEXT:    ds_read_b64 v[0:1], v2
 956 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 957 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 958 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 959 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 960 ; GFX9-NEXT:    s_endpgm
 961   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
 962   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
 963   %sum = add i32 %val0, %val1
 964   store i32 %sum, i32 addrspace(1)* %out, align 4
 965   ret void
 966 }
 967
 ; Constant-address loads of @foo[0] and @foo[2], i.e. with a one-dword gap
 ; between them. The checks expect the two loads to merge into a single
 ; ds_read2_b32 with offset1:2 on both targets.
 968 define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
 969 ; CI-LABEL: load_constant_disjoint_offsets:
 970 ; CI:       ; %bb.0:
 971 ; CI-NEXT:    v_mov_b32_e32 v0, 0
 972 ; CI-NEXT:    s_mov_b32 m0, -1
 973 ; CI-NEXT:    ds_read2_b32 v[0:1], v0 offset1:2
 974 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 975 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 976 ; CI-NEXT:    s_mov_b32 s2, -1
 977 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 978 ; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 979 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 980 ; CI-NEXT:    s_endpgm
 981 ;
 982 ; GFX9-LABEL: load_constant_disjoint_offsets:
 983 ; GFX9:       ; %bb.0:
 984 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 985 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:2
 986 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 987 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 988 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 989 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 990 ; GFX9-NEXT:    s_endpgm
 991   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
 992   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
 993   %sum = add i32 %val0, %val1
 994   store i32 %sum, i32 addrspace(1)* %out, align 4
 995   ret void
 996 }
 997
 ; Four i64 elements in addrspace(3); note the array is declared with only
 ; 4-byte alignment, which is what load_misaligned64_constant_offsets relies on.
 998 @bar = addrspace(3) global [4 x i64] undef, align 4
 999
; Loads the adjacent i64 elements @bar[0] and @bar[1], each marked align 4,
; and stores their sum. The checks expect a single ds_read_b128 covering both
; values on CI and GFX9.
1000 define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
1001 ; CI-LABEL: load_misaligned64_constant_offsets:
1002 ; CI:       ; %bb.0:
1003 ; CI-NEXT:    v_mov_b32_e32 v0, 0
1004 ; CI-NEXT:    s_mov_b32 m0, -1
1005 ; CI-NEXT:    ds_read_b128 v[0:3], v0
1006 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
1007 ; CI-NEXT:    s_mov_b32 s3, 0xf000
1008 ; CI-NEXT:    s_mov_b32 s2, -1
1009 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
1010 ; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1011 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
1012 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1013 ; CI-NEXT:    s_endpgm
1014 ;
1015 ; GFX9-LABEL: load_misaligned64_constant_offsets:
1016 ; GFX9:       ; %bb.0:
1017 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1018 ; GFX9-NEXT:    ds_read_b128 v[0:3], v4
1019 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
1020 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1021 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
1022 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
1023 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
1024 ; GFX9-NEXT:    s_endpgm
1025   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
1026   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
1027   %sum = add i64 %val0, %val1
1028   store i64 %sum, i64 addrspace(1)* %out, align 8
1029   ret void
1030 }
1031
; 4096 x i64 array in addrspace(3), align 4; read at element indices 2048 and
; 4095 by load_misaligned64_constant_large_offsets.
1032 @bar.large = addrspace(3) global [4096 x i64] undef, align 4
1033
; Loads i64 elements 2048 and 4095 of @bar.large (byte offsets 16384 and
; 32760), each marked align 4. The checks expect two separate ds_read_b64
; instructions carrying those byte offsets rather than one merged read.
1034 define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
1035 ; CI-LABEL: load_misaligned64_constant_large_offsets:
1036 ; CI:       ; %bb.0:
1037 ; CI-NEXT:    v_mov_b32_e32 v2, 0
1038 ; CI-NEXT:    s_mov_b32 m0, -1
1039 ; CI-NEXT:    ds_read_b64 v[0:1], v2 offset:16384
1040 ; CI-NEXT:    ds_read_b64 v[2:3], v2 offset:32760
1041 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
1042 ; CI-NEXT:    s_mov_b32 s3, 0xf000
1043 ; CI-NEXT:    s_mov_b32 s2, -1
1044 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
1045 ; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1046 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
1047 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1048 ; CI-NEXT:    s_endpgm
1049 ;
1050 ; GFX9-LABEL: load_misaligned64_constant_large_offsets:
1051 ; GFX9:       ; %bb.0:
1052 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1053 ; GFX9-NEXT:    ds_read_b64 v[0:1], v4 offset:16384
1054 ; GFX9-NEXT:    ds_read_b64 v[2:3], v4 offset:32760
1055 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
1056 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1057 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
1058 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
1059 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
1060 ; GFX9-NEXT:    s_endpgm
1061   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
1062   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
1063   %sum = add i64 %val0, %val1
1064   store i64 %sum, i64 addrspace(1)* %out, align 8
1065   ret void
1066 }
1067
; Internal addrspace(3) float arrays (264 and 776 elements, align 4) read by
; sgemm_inner_loop_read2_sequence.
1068 @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
1069 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
1070
; Sums ten floats: @sgemm.lA at workgroup-id-x + {0, 1, 16, 17} and @sgemm.lB
; at workitem-id-y + {0, 1, 32, 33, 64, 65}, then stores the result to %C.
; The checks expect the adjacent elements to pair up, giving five ds_read2_b32
; instructions in total on both CI and GFX9.
1071 define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
1072 ; CI-LABEL: sgemm_inner_loop_read2_sequence:
1073 ; CI:       ; %bb.0:
1074 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
1075 ; CI-NEXT:    s_lshl_b32 s0, s2, 2
1076 ; CI-NEXT:    s_add_i32 s1, s0, 0xc20
1077 ; CI-NEXT:    s_addk_i32 s0, 0xc60
1078 ; CI-NEXT:    v_mov_b32_e32 v0, s1
1079 ; CI-NEXT:    v_mov_b32_e32 v2, s0
1080 ; CI-NEXT:    v_lshlrev_b32_e32 v8, 2, v1
1081 ; CI-NEXT:    s_mov_b32 m0, -1
1082 ; CI-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
1083 ; CI-NEXT:    ds_read2_b32 v[2:3], v2 offset1:1
1084 ; CI-NEXT:    ds_read2_b32 v[4:5], v8 offset1:1
1085 ; CI-NEXT:    ds_read2_b32 v[6:7], v8 offset0:32 offset1:33
1086 ; CI-NEXT:    ds_read2_b32 v[8:9], v8 offset0:64 offset1:65
1087 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
1088 ; CI-NEXT:    v_add_f32_e32 v0, v0, v1
1089 ; CI-NEXT:    v_add_f32_e32 v0, v0, v2
1090 ; CI-NEXT:    v_add_f32_e32 v0, v0, v3
1091 ; CI-NEXT:    v_add_f32_e32 v0, v0, v4
1092 ; CI-NEXT:    v_add_f32_e32 v0, v0, v5
1093 ; CI-NEXT:    v_add_f32_e32 v0, v0, v6
1094 ; CI-NEXT:    v_add_f32_e32 v0, v0, v7
1095 ; CI-NEXT:    v_add_f32_e32 v0, v0, v8
1096 ; CI-NEXT:    s_mov_b32 s7, 0xf000
1097 ; CI-NEXT:    s_mov_b32 s6, -1
1098 ; CI-NEXT:    v_add_f32_e32 v0, v0, v9
1099 ; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1100 ; CI-NEXT:    s_endpgm
1101 ;
1102 ; GFX9-LABEL: sgemm_inner_loop_read2_sequence:
1103 ; GFX9:       ; %bb.0:
1104 ; GFX9-NEXT:    s_lshl_b32 s2, s2, 2
1105 ; GFX9-NEXT:    s_add_i32 s3, s2, 0xc20
1106 ; GFX9-NEXT:    s_addk_i32 s2, 0xc60
1107 ; GFX9-NEXT:    v_mov_b32_e32 v0, s3
1108 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1109 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 2, v1
1110 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
1111 ; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset1:1
1112 ; GFX9-NEXT:    ds_read2_b32 v[4:5], v8 offset1:1
1113 ; GFX9-NEXT:    ds_read2_b32 v[6:7], v8 offset0:32 offset1:33
1114 ; GFX9-NEXT:    ds_read2_b32 v[8:9], v8 offset0:64 offset1:65
1115 ; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
1116 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
1117 ; GFX9-NEXT:    s_waitcnt lgkmcnt(3)
1118 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
1119 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v3
1120 ; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
1121 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v4
1122 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
1123 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v5
1124 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1125 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v6
1126 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v7
1127 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v8
1128 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0
1129 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v9
1130 ; GFX9-NEXT:    global_store_dword v10, v0, s[0:1]
1131 ; GFX9-NEXT:    s_endpgm
       ; %x.i is the workgroup id (not a workitem id), so the @sgemm.lA addresses
       ; are formed with scalar instructions in the checks (s_lshl_b32, s_add_i32).
1132   %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
1133   %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
1134   %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
1135   %tmp16 = load float, float addrspace(3)* %arrayidx44, align 4
1136   %add47 = add nsw i32 %x.i, 1
1137   %arrayidx48 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
1138   %tmp17 = load float, float addrspace(3)* %arrayidx48, align 4
1139   %add51 = add nsw i32 %x.i, 16
1140   %arrayidx52 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
1141   %tmp18 = load float, float addrspace(3)* %arrayidx52, align 4
1142   %add55 = add nsw i32 %x.i, 17
1143   %arrayidx56 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
1144   %tmp19 = load float, float addrspace(3)* %arrayidx56, align 4
1145   %arrayidx60 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
1146   %tmp20 = load float, float addrspace(3)* %arrayidx60, align 4
1147   %add63 = add nsw i32 %y.i, 1
1148   %arrayidx64 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
1149   %tmp21 = load float, float addrspace(3)* %arrayidx64, align 4
1150   %add67 = add nsw i32 %y.i, 32
1151   %arrayidx68 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
1152   %tmp22 = load float, float addrspace(3)* %arrayidx68, align 4
1153   %add71 = add nsw i32 %y.i, 33
1154   %arrayidx72 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
1155   %tmp23 = load float, float addrspace(3)* %arrayidx72, align 4
1156   %add75 = add nsw i32 %y.i, 64
1157   %arrayidx76 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
1158   %tmp24 = load float, float addrspace(3)* %arrayidx76, align 4
1159   %add79 = add nsw i32 %y.i, 65
1160   %arrayidx80 = getelementptr inbounds [776 x float], [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
1161   %tmp25 = load float, float addrspace(3)* %arrayidx80, align 4
1162   %sum.0 = fadd float %tmp16, %tmp17
1163   %sum.1 = fadd float %sum.0, %tmp18
1164   %sum.2 = fadd float %sum.1, %tmp19
1165   %sum.3 = fadd float %sum.2, %tmp20
1166   %sum.4 = fadd float %sum.3, %tmp21
1167   %sum.5 = fadd float %sum.4, %tmp22
1168   %sum.6 = fadd float %sum.5, %tmp23
1169   %sum.7 = fadd float %sum.6, %tmp24
1170   %sum.8 = fadd float %sum.7, %tmp25
1171   store float %sum.8, float addrspace(1)* %C, align 4
1172   ret void
1173 }
1174
; A single <2 x i32> load from %in with align 4, copied to %out. The checks
; expect it to be lowered to ds_read2_b32 offset1:1 on both CI and GFX9.
1175 define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 {
1176 ; CI-LABEL: misaligned_read2_v2i32:
1177 ; CI:       ; %bb.0:
1178 ; CI-NEXT:    s_load_dword s2, s[0:1], 0x2
1179 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
1180 ; CI-NEXT:    s_mov_b32 m0, -1
1181 ; CI-NEXT:    s_mov_b32 s3, 0xf000
1182 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
1183 ; CI-NEXT:    v_mov_b32_e32 v0, s2
1184 ; CI-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
1185 ; CI-NEXT:    s_mov_b32 s2, -1
1186 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
1187 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1188 ; CI-NEXT:    s_endpgm
1189 ;
1190 ; GFX9-LABEL: misaligned_read2_v2i32:
1191 ; GFX9:       ; %bb.0:
1192 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x8
1193 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1194 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1195 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1196 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1197 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
1198 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1199 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
1200 ; GFX9-NEXT:    s_endpgm
1201   %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
1202   store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8
1203   ret void
1204 }
1205
; A single i64 load from %in with align 4, copied to %out. The checks expect
; it to be lowered to ds_read2_b32 offset1:1 on both CI and GFX9.
1206 define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 {
1207 ; CI-LABEL: misaligned_read2_i64:
1208 ; CI:       ; %bb.0:
1209 ; CI-NEXT:    s_load_dword s2, s[0:1], 0x2
1210 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
1211 ; CI-NEXT:    s_mov_b32 m0, -1
1212 ; CI-NEXT:    s_mov_b32 s3, 0xf000
1213 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
1214 ; CI-NEXT:    v_mov_b32_e32 v0, s2
1215 ; CI-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
1216 ; CI-NEXT:    s_mov_b32 s2, -1
1217 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
1218 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1219 ; CI-NEXT:    s_endpgm
1220 ;
1221 ; GFX9-LABEL: misaligned_read2_i64:
1222 ; GFX9:       ; %bb.0:
1223 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x8
1224 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
1225 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1226 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1227 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1228 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
1229 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1230 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
1231 ; GFX9-NEXT:    s_endpgm
1232   %load = load i64, i64 addrspace(3)* %in, align 4
1233   store i64 %load, i64 addrspace(1)* %out, align 8
1234   ret void
1235 }
1236
; Eight float loads from four different addrspace(3) base pointers, interleaved
; in program order with the fmul/fadd/fsub chain. The checks expect the two
; loads that share a base to be combined despite the interleaving:
; ds_read2_b32 offset1:1 for %arg1 and %arg3 (row %tmp6, columns 0 and 1) and
; ds_read2_b32 offset1:4 for %arg2 and %arg4 (rows 0 and 1, column %tmp5).
1237 define amdgpu_kernel void @ds_read_diff_base_interleaving(
1238 ; CI-LABEL: ds_read_diff_base_interleaving:
1239 ; CI:       ; %bb.0: ; %bb
1240 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2
1241 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
1242 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1243 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1244 ; CI-NEXT:    s_mov_b32 m0, -1
1245 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
1246 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v1
1247 ; CI-NEXT:    v_add_i32_e32 v3, vcc, s5, v0
1248 ; CI-NEXT:    v_add_i32_e32 v4, vcc, s6, v1
1249 ; CI-NEXT:    v_add_i32_e32 v6, vcc, s7, v0
1250 ; CI-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
1251 ; CI-NEXT:    ds_read2_b32 v[2:3], v3 offset1:4
1252 ; CI-NEXT:    ds_read2_b32 v[4:5], v4 offset1:1
1253 ; CI-NEXT:    ds_read2_b32 v[6:7], v6 offset1:4
1254 ; CI-NEXT:    s_mov_b32 s3, 0xf000
1255 ; CI-NEXT:    s_mov_b32 s2, -1
1256 ; CI-NEXT:    s_waitcnt lgkmcnt(2)
1257 ; CI-NEXT:    v_mul_f32_e32 v0, v0, v2
1258 ; CI-NEXT:    v_add_f32_e32 v0, 2.0, v0
1259 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
1260 ; CI-NEXT:    v_mul_f32_e32 v2, v4, v6
1261 ; CI-NEXT:    v_sub_f32_e32 v0, v0, v2
1262 ; CI-NEXT:    v_mul_f32_e32 v1, v1, v3
1263 ; CI-NEXT:    v_sub_f32_e32 v0, v0, v1
1264 ; CI-NEXT:    v_mul_f32_e32 v1, v5, v7
1265 ; CI-NEXT:    v_sub_f32_e32 v0, v0, v1
1266 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:40
1267 ; CI-NEXT:    s_endpgm
1268 ;
1269 ; GFX9-LABEL: ds_read_diff_base_interleaving:
1270 ; GFX9:       ; %bb.0: ; %bb
1271 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
1272 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1273 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1274 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
1275 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
1276 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1277 ; GFX9-NEXT:    v_add_u32_e32 v2, s4, v1
1278 ; GFX9-NEXT:    v_add_u32_e32 v3, s5, v0
1279 ; GFX9-NEXT:    v_add_u32_e32 v4, s6, v1
1280 ; GFX9-NEXT:    v_add_u32_e32 v6, s7, v0
1281 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
1282 ; GFX9-NEXT:    ds_read2_b32 v[2:3], v3 offset1:4
1283 ; GFX9-NEXT:    ds_read2_b32 v[4:5], v4 offset1:1
1284 ; GFX9-NEXT:    ds_read2_b32 v[6:7], v6 offset1:4
1285 ; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
1286 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
1287 ; GFX9-NEXT:    v_add_f32_e32 v0, 2.0, v0
1288 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1289 ; GFX9-NEXT:    v_mul_f32_e32 v2, v4, v6
1290 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
1291 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
1292 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
1293 ; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v7
1294 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
1295 ; GFX9-NEXT:    global_store_dword v8, v0, s[0:1] offset:40
1296 ; GFX9-NEXT:    s_endpgm
1297   float addrspace(1)* nocapture %arg,
1298   [4 x [4 x float]] addrspace(3)* %arg1,
1299   [4 x [4 x float]] addrspace(3)* %arg2,
1300   [4 x [4 x float]] addrspace(3)* %arg3,
1301   [4 x [4 x float]] addrspace(3)* %arg4) #1 {
1302 bb:
       ; The result is stored to %arg[10]; 10 floats = 40 bytes, matching the
       ; offset:40 on the store in the checks.
1303   %tmp = getelementptr float, float addrspace(1)* %arg, i64 10
1304   %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.x() #2
1305   %tmp6 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
1306   %tmp7 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg1, i32 0, i32 %tmp6, i32 0
1307   %tmp8 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg2, i32 0, i32 0, i32 %tmp5
1308   %tmp9 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg3, i32 0, i32 %tmp6, i32 0
1309   %tmp10 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg4, i32 0, i32 0, i32 %tmp5
1310   %tmp11 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg1, i32 0, i32 %tmp6, i32 1
1311   %tmp12 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg2, i32 0, i32 1, i32 %tmp5
1312   %tmp13 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg3, i32 0, i32 %tmp6, i32 1
1313   %tmp14 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg4, i32 0, i32 1, i32 %tmp5
1314   %tmp15 = load float, float addrspace(3)* %tmp7
1315   %tmp16 = load float, float addrspace(3)* %tmp8
1316   %tmp17 = fmul float %tmp15, %tmp16
1317   %tmp18 = fadd float 2.000000e+00, %tmp17
1318   %tmp19 = load float, float addrspace(3)* %tmp9
1319   %tmp20 = load float, float addrspace(3)* %tmp10
1320   %tmp21 = fmul float %tmp19, %tmp20
1321   %tmp22 = fsub float %tmp18, %tmp21
1322   %tmp23 = load float, float addrspace(3)* %tmp11
1323   %tmp24 = load float, float addrspace(3)* %tmp12
1324   %tmp25 = fmul float %tmp23, %tmp24
1325   %tmp26 = fsub float %tmp22, %tmp25
1326   %tmp27 = load float, float addrspace(3)* %tmp13
1327   %tmp28 = load float, float addrspace(3)* %tmp14
1328   %tmp29 = fmul float %tmp27, %tmp28
1329   %tmp30 = fsub float %tmp26, %tmp29
1330   store float %tmp30, float addrspace(1)* %tmp
1331   ret void
1332 }
1333
; Two i32 LDS loads from adjacent elements (%arg[x] and %arg[x + 1]) with a
; call to @void_func_void between them; their sum is stored to %out.
; The checks for both CI and GFX9 expect one ds_read_b32 before the
; s_swappc_b64 and a second ds_read_b32 (offset:4) after it, i.e. the two
; loads are not combined into a single ds_read2_b32 across the call.
1334 define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspace(3)* %arg) {
1335 ; CI-LABEL: ds_read_call_read:
1336 ; CI:       ; %bb.0:
1337 ; CI-NEXT:    s_getpc_b64 s[40:41]
1338 ; CI-NEXT:    s_mov_b32 s40, s0
1339 ; CI-NEXT:    s_load_dwordx4 s[40:43], s[40:41], 0x0
1340 ; CI-NEXT:    s_mov_b32 s14, s10
1341 ; CI-NEXT:    s_mov_b32 s12, s8
1342 ; CI-NEXT:    s_mov_b32 s13, s9
1343 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
1344 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
1345 ; CI-NEXT:    s_add_u32 s40, s40, s11
1346 ; CI-NEXT:    s_mov_b64 s[10:11], s[6:7]
1347 ; CI-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x0
1348 ; CI-NEXT:    s_load_dword s6, s[4:5], 0x2
1349 ; CI-NEXT:    s_addc_u32 s41, s41, 0
1350 ; CI-NEXT:    s_add_u32 s8, s4, 12
1351 ; CI-NEXT:    s_addc_u32 s9, s5, 0
1352 ; CI-NEXT:    s_getpc_b64 s[4:5]
1353 ; CI-NEXT:    s_add_u32 s4, s4, void_func_void@gotpcrel32@lo+4
1354 ; CI-NEXT:    s_addc_u32 s5, s5, void_func_void@gotpcrel32@hi+12
1355 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
1356 ; CI-NEXT:    v_add_i32_e32 v40, vcc, s6, v3
1357 ; CI-NEXT:    s_mov_b32 m0, -1
1358 ; CI-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
1359 ; CI-NEXT:    ds_read_b32 v41, v40
1360 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
1361 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
1362 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
1363 ; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
1364 ; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
1365 ; CI-NEXT:    s_mov_b64 s[0:1], s[40:41]
1366 ; CI-NEXT:    v_or_b32_e32 v31, v0, v2
1367 ; CI-NEXT:    s_mov_b64 s[2:3], s[42:43]
1368 ; CI-NEXT:    s_mov_b32 s32, 0
1369 ; CI-NEXT:    s_mov_b32 s39, 0xf000
1370 ; CI-NEXT:    s_mov_b32 s38, -1
1371 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
1372 ; CI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
1373 ; CI-NEXT:    ds_read_b32 v0, v40 offset:4
1374 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
1375 ; CI-NEXT:    v_add_i32_e32 v0, vcc, v41, v0
1376 ; CI-NEXT:    buffer_store_dword v0, off, s[36:39], 0
1377 ; CI-NEXT:    s_endpgm
1378 ;
1379 ; GFX9-LABEL: ds_read_call_read:
1380 ; GFX9:       ; %bb.0:
1381 ; GFX9-NEXT:    s_getpc_b64 s[36:37]
1382 ; GFX9-NEXT:    s_mov_b32 s36, s0
1383 ; GFX9-NEXT:    s_load_dwordx4 s[36:39], s[36:37], 0x0
1384 ; GFX9-NEXT:    s_mov_b32 s14, s10
1385 ; GFX9-NEXT:    s_mov_b32 s12, s8
1386 ; GFX9-NEXT:    s_mov_b32 s13, s9
1387 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
1388 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1389 ; GFX9-NEXT:    s_add_u32 s36, s36, s11
1390 ; GFX9-NEXT:    s_addc_u32 s37, s37, 0
1391 ; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
1392 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x8
1393 ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
1394 ; GFX9-NEXT:    s_add_u32 s8, s4, 12
1395 ; GFX9-NEXT:    s_addc_u32 s9, s5, 0
1396 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
1397 ; GFX9-NEXT:    s_add_u32 s4, s4, void_func_void@gotpcrel32@lo+4
1398 ; GFX9-NEXT:    s_addc_u32 s5, s5, void_func_void@gotpcrel32@hi+12
1399 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1400 ; GFX9-NEXT:    v_lshl_add_u32 v41, v0, 2, s6
1401 ; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
1402 ; GFX9-NEXT:    ds_read_b32 v42, v41
1403 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
1404 ; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
1405 ; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
1406 ; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
1407 ; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
1408 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
1409 ; GFX9-NEXT:    s_mov_b32 s32, 0
1410 ; GFX9-NEXT:    v_mov_b32_e32 v40, 0
1411 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1412 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
1413 ; GFX9-NEXT:    ds_read_b32 v0, v41 offset:4
1414 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1415 ; GFX9-NEXT:    v_add_u32_e32 v0, v42, v0
1416 ; GFX9-NEXT:    global_store_dword v40, v0, s[34:35]
1417 ; GFX9-NEXT:    s_endpgm
1418   %x = call i32 @llvm.amdgcn.workitem.id.x()
1419   %arrayidx0 = getelementptr i32, i32 addrspace(3)* %arg, i32 %x
1420   %arrayidx1 = getelementptr i32, i32 addrspace(3)* %arrayidx0, i32 1
1421   %v0 = load i32, i32 addrspace(3)* %arrayidx0, align 4
  ; The call separates the two otherwise-adjacent LDS loads.
1422   call void @void_func_void()
1423   %v1 = load i32, i32 addrspace(3)* %arrayidx1, align 4
1424   %r = add i32 %v0, %v1
1425   store i32 %r, i32 addrspace(1)* %out, align 4
1426   ret void
1427 }
1428
; Two float LDS loads 4 elements (16 bytes) apart with a call to
; llvm.amdgcn.interp.mov (last operand %prims) between them. Returns
; <%v0, %v1 + interp>.
; Expected output differs by target:
;  - CI: two separate ds_read_b32, each preceded by "s_mov_b32 m0, -1", with
;    "s_mov_b32 m0, s0" in front of the v_interp_mov_f32 between them.
;  - GFX9: a single ds_read2_b32 offset1:4 issued before m0 is written for the
;    v_interp_mov_f32.
1429 define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, float addrspace(3)* %inptr) {
1430 ; CI-LABEL: ds_read_interp_read:
1431 ; CI:       ; %bb.0:
1432 ; CI-NEXT:    s_mov_b32 m0, -1
1433 ; CI-NEXT:    ds_read_b32 v2, v0
1434 ; CI-NEXT:    s_mov_b32 m0, s0
1435 ; CI-NEXT:    v_interp_mov_f32 v1, p10, attr0.x
1436 ; CI-NEXT:    s_mov_b32 m0, -1
1437 ; CI-NEXT:    ds_read_b32 v0, v0 offset:16
1438 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
1439 ; CI-NEXT:    v_add_f32_e32 v1, v0, v1
1440 ; CI-NEXT:    v_mov_b32_e32 v0, v2
1441 ; CI-NEXT:    ; return to shader part epilog
1442 ;
1443 ; GFX9-LABEL: ds_read_interp_read:
1444 ; GFX9:       ; %bb.0:
1445 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:4
1446 ; GFX9-NEXT:    s_mov_b32 m0, s0
1447 ; GFX9-NEXT:    s_nop 0
1448 ; GFX9-NEXT:    v_interp_mov_f32_e32 v2, p10, attr0.x
1449 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1450 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
1451 ; GFX9-NEXT:    ; return to shader part epilog
1452   %v0 = load float, float addrspace(3)* %inptr, align 4
1453   %intrp = call float @llvm.amdgcn.interp.mov(i32 0, i32 0, i32 0, i32 %prims)
1454   %ptr1 = getelementptr float, float addrspace(3)* %inptr, i32 4
1455   %v1 = load float, float addrspace(3)* %ptr1, align 4
1456   %v1b = fadd float %v1, %intrp
1457   %r0 = insertelement <2 x float> undef, float %v0, i32 0
1458   %r1 = insertelement <2 x float> %r0, float %v1b, i32 1
1459   ret <2 x float> %r1
1460 }
1461
; LDS (addrspace(3)) array of <2 x i32> declared with only byte alignment
; (align 1); read by @read2_v2i32_align1_odd_offset.
1462 @v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1
1463
; Loads a <2 x i32> with "align 1" from byte offset 65 into @v2i32_align1 and
; stores it to %out.
; Expected output:
;  - CI and GFX9-ALIGNED (-unaligned-access-mode): the value is assembled from
;    eight ds_read_u8 (offsets 65..72) combined with shifts and ors.
;  - GFX9-UNALIGNED (+unaligned-access-mode): a single ds_read_b64 offset:65.
1464 define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)* %out) {
1465 ; CI-LABEL: read2_v2i32_align1_odd_offset:
1466 ; CI:       ; %bb.0: ; %entry
1467 ; CI-NEXT:    v_mov_b32_e32 v0, 0
1468 ; CI-NEXT:    s_mov_b32 m0, -1
1469 ; CI-NEXT:    ds_read_u8 v1, v0 offset:70
1470 ; CI-NEXT:    ds_read_u8 v2, v0 offset:72
1471 ; CI-NEXT:    ds_read_u8 v3, v0 offset:71
1472 ; CI-NEXT:    ds_read_u8 v4, v0 offset:69
1473 ; CI-NEXT:    ds_read_u8 v5, v0 offset:68
1474 ; CI-NEXT:    s_waitcnt lgkmcnt(4)
1475 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1476 ; CI-NEXT:    s_waitcnt lgkmcnt(3)
1477 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1478 ; CI-NEXT:    s_waitcnt lgkmcnt(2)
1479 ; CI-NEXT:    v_or_b32_e32 v2, v2, v3
1480 ; CI-NEXT:    s_waitcnt lgkmcnt(1)
1481 ; CI-NEXT:    v_or_b32_e32 v1, v1, v4
1482 ; CI-NEXT:    ds_read_u8 v4, v0 offset:67
1483 ; CI-NEXT:    ds_read_u8 v6, v0 offset:66
1484 ; CI-NEXT:    ds_read_u8 v0, v0 offset:65
1485 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1486 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
1487 ; CI-NEXT:    v_or_b32_e32 v1, v2, v1
1488 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
1489 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
1490 ; CI-NEXT:    v_or_b32_e32 v0, v2, v0
1491 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
1492 ; CI-NEXT:    v_or_b32_e32 v2, v2, v4
1493 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1494 ; CI-NEXT:    s_mov_b32 s3, 0xf000
1495 ; CI-NEXT:    s_mov_b32 s2, -1
1496 ; CI-NEXT:    v_or_b32_e32 v0, v2, v0
1497 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1498 ; CI-NEXT:    s_endpgm
1499 ;
1500 ; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset:
1501 ; GFX9-ALIGNED:       ; %bb.0: ; %entry
1502 ; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v2, 0
1503 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v0, v2 offset:70
1504 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v3, v2 offset:65
1505 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v4, v2 offset:66
1506 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v5, v2 offset:67
1507 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v6, v2 offset:68
1508 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v1, v2 offset:69
1509 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v7, v2 offset:72
1510 ; GFX9-ALIGNED-NEXT:    ds_read_u8 v8, v2 offset:71
1511 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(7)
1512 ; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1513 ; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
1514 ; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
1515 ; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
1516 ; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 8, v7
1517 ; GFX9-ALIGNED-NEXT:    v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1518 ; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v1, v1, v0
1519 ; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 8, v4
1520 ; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v3
1521 ; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 8, v6
1522 ; GFX9-ALIGNED-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1523 ; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v0, v3, v0
1524 ; GFX9-ALIGNED-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1525 ; GFX9-ALIGNED-NEXT:    s_endpgm
1526 ;
1527 ; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset:
1528 ; GFX9-UNALIGNED:       ; %bb.0: ; %entry
1529 ; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0
1530 ; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
1531 ; GFX9-UNALIGNED-NEXT:    ds_read_b64 v[0:1], v2 offset:65
1532 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
1533 ; GFX9-UNALIGNED-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1534 ; GFX9-UNALIGNED-NEXT:    s_endpgm
1535 entry:
  ; The address is a constant expression: @v2i32_align1 viewed as i8*, plus 65
  ; bytes, cast back to a <2 x i32> pointer.
1536   %load = load <2 x i32>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1
1537   store <2 x i32> %load, <2 x i32> addrspace(1)* %out
1538   ret void
1539 }
1540
; External function called by @ds_read_call_read; declared with attribute
; group #3 (nounwind noinline).
1541 declare void @void_func_void() #3
1542
; Workgroup / workitem id intrinsics, all declared with attribute group #1
; (nounwind readnone speculatable).
1543 declare i32 @llvm.amdgcn.workgroup.id.x() #1
1544 declare i32 @llvm.amdgcn.workgroup.id.y() #1
1545 declare i32 @llvm.amdgcn.workitem.id.x() #1
1546 declare i32 @llvm.amdgcn.workitem.id.y() #1
1547
; Interpolation intrinsic called by @ds_read_interp_read; attributes are
; spelled inline here rather than through an attribute group.
1548 declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) nounwind readnone
1549
; Barrier intrinsic, declared with attribute group #2 (convergent nounwind).
1550 declare void @llvm.amdgcn.s.barrier() #2
1551
; Attribute groups referenced by the definitions and declarations in this file.
; #1 is the group on the workgroup/workitem id intrinsic declarations, #2 on
; @llvm.amdgcn.s.barrier, and #3 on @void_func_void.
1552 attributes #0 = { nounwind }
1553 attributes #1 = { nounwind readnone speculatable }
1554 attributes #2 = { convergent nounwind }
1555 attributes #3 = { nounwind noinline }