test/CodeGen/AMDGPU/merge-stores.ll

   1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
   2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
   3
   4 ; This test is mostly to test DAG store merging, so disable the vectorizer.
   5 ; Run on subtargets with different unaligned load restrictions.
   6
   7 ; TODO: Vector element tests
   8 ; TODO: Non-zero base offset for load and store combinations
   9 ; TODO: Same base addrspacecasted
  10
  11
  12 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
     ; Two constant i8 stores to adjacent global (addrspace(1)) bytes. The store
     ; to the base pointer is marked "align 2", and the checks expect the pair to
     ; be emitted as a buffer_store_short.
  13 ; GCN: buffer_store_short
  14 ; GCN: s_endpgm
  15 define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
  16   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  17
  18   store i8 123, i8 addrspace(1)* %out.gep.1
       ; -56 is 0xc8, the low byte of the out-of-range literal 456 used before;
       ; spelling it in range avoids relying on implicit truncation.
  19   store i8 -56, i8 addrspace(1)* %out, align 2
  20   ret void
  21 }
  22
  23 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
     ; Same two adjacent constant i8 global stores, but with no explicit
     ; alignment on either store. The checks expect two separate
     ; buffer_store_byte instructions.
  24 ; GCN: buffer_store_byte
  25 ; GCN: buffer_store_byte
  26 ; GCN: s_endpgm
  27 define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
  28   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  29
  30   store i8 123, i8 addrspace(1)* %out.gep.1
       ; -56 is 0xc8, the low byte of the out-of-range literal 456 used before;
       ; spelling it in range avoids relying on implicit truncation.
  31   store i8 -56, i8 addrspace(1)* %out
  32   ret void
  33 }
  34
  35 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
     ; Two constant i16 stores to adjacent global elements; the store to the
     ; base pointer is marked "align 4". The check expects a buffer_store_dword.
  36 ; GCN: buffer_store_dword v
  37 define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
  38   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  39
  40   store i16 123, i16 addrspace(1)* %out.gep.1
  41   store i16 456, i16 addrspace(1)* %out, align 4
  42   ret void
  43 }
  44
  45 ; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
     ; Variant of the i16 pair test where both stored constants are zero; the
     ; base store is "align 4" and the check expects a buffer_store_dword.
  46 ; GCN: buffer_store_dword v
  47 define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
  48   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  49
  50   store i16 0, i16 addrspace(1)* %out.gep.1
  51   store i16 0, i16 addrspace(1)* %out, align 4
  52   ret void
  53 }
  54
  55 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
     ; Two adjacent constant i16 global stores with no explicit alignment.
     ; The checks expect two separate buffer_store_short instructions.
  56 ; GCN: buffer_store_short
  57 ; GCN: buffer_store_short
  58 ; GCN: s_endpgm
  59 define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
  60   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  61
  62   store i16 123, i16 addrspace(1)* %out.gep.1
  63   store i16 456, i16 addrspace(1)* %out
  64   ret void
  65 }
  66
  67 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
     ; Two constant i32 stores to adjacent global elements, element 1 first.
     ; The checks expect one buffer_store_dwordx2 whose register pair holds
     ; 0x1c8 (456, element 0) in the low half and 0x7b (123, element 1) in the
     ; high half.
  68 ; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
  69 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
  70 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
  71 define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
  72   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  73
  74   store i32 123, i32 addrspace(1)* %out.gep.1
  75   store i32 456, i32 addrspace(1)* %out
  76   ret void
  77 }
  78
  79 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
     ; Mixed-type pair: a float constant is stored to element 1 through a
     ; bitcast pointer and an i32 constant to element 0. The check expects a
     ; buffer_store_dwordx2.
  80 ; GCN: buffer_store_dwordx2
  81 define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
  82   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  83   %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
  84   store float 1.0, float addrspace(1)* %out.gep.1.bc
  85   store i32 456, i32 addrspace(1)* %out
  86   ret void
  87 }
  88
  89 ; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
     ; Mixed-type pair with the types swapped: an i32 constant is stored to
     ; element 1 through a bitcast pointer and a float constant to element 0.
     ; The checks expect one buffer_store_dwordx2 with 4.0 in the low register
     ; and 0x7b (123) in the high register.
  90 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
  91 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
  92 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
  93 define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
  94   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  95   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  96   store i32 123, i32 addrspace(1)* %out.gep.1.bc
  97   store float 4.0, float addrspace(1)* %out
  98   ret void
  99 }
 100
 101 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
     ; Four constant i32 stores to consecutive global elements, with element 0
     ; stored last. The checks expect one buffer_store_dwordx4 whose register
     ; range starts at the register holding 0x4d2 (1234, element 0) and ends at
     ; the register holding 0x14d (333, element 3).
 102 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
 103 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
 104 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
 105 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
 106 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
 107 define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
 108   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 109   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 110   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 111
 112   store i32 123, i32 addrspace(1)* %out.gep.1
 113   store i32 456, i32 addrspace(1)* %out.gep.2
 114   store i32 333, i32 addrspace(1)* %out.gep.3
 115   store i32 1234, i32 addrspace(1)* %out
 116   ret void
 117 }
 118
 119 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
     ; Four constant float stores to consecutive global elements, issued in
     ; ascending address order (element 0 first). The check expects a
     ; buffer_store_dwordx4.
 120 ; GCN: buffer_store_dwordx4
 121 define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
 122   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 123   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 124   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 125
 126   store float 8.0, float addrspace(1)* %out
 127   store float 1.0, float addrspace(1)* %out.gep.1
 128   store float 2.0, float addrspace(1)* %out.gep.2
 129   store float 4.0, float addrspace(1)* %out.gep.3
 130   ret void
 131 }
 132
 133 ; The store to element 0 (%out) is issued last, so the stores are not in address order.
 134 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
     ; Four constant float stores to consecutive global elements where elements
     ; 1..3 are written first and element 0 (%out) is written last. The check
     ; expects a buffer_store_dwordx4.
 135 ; GCN: buffer_store_dwordx4
 136 define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
 137   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 138   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 139   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 140
 141   store float 1.0, float addrspace(1)* %out.gep.1
 142   store float 2.0, float addrspace(1)* %out.gep.2
 143   store float 4.0, float addrspace(1)* %out.gep.3
 144   store float 8.0, float addrspace(1)* %out
 145   ret void
 146 }
 147
 148 ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
     ; Four constant stores to consecutive global elements with alternating
     ; types: elements 1 and 3 are written as i32 through bitcast pointers,
     ; elements 2 and 0 as float. The check expects a buffer_store_dwordx4.
 149 ; GCN-AA: buffer_store_dwordx4 v
 150 ; GCN: s_endpgm
 151 define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
 152   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 153   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 154   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 155
 156   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
 157   %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*
 158
 159   store i32 11, i32 addrspace(1)* %out.gep.1.bc
 160   store float 2.0, float addrspace(1)* %out.gep.2
 161   store i32 17, i32 addrspace(1)* %out.gep.3.bc
 162   store float 8.0, float addrspace(1)* %out
 163   ret void
 164 }
 165
 166 ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
     ; Three constant i32 stores to consecutive global elements, element 0
     ; stored last. The checks expect exactly one buffer_store_dwordx2 plus one
     ; single-dword store, in either order, and no further dword-family store
     ; before s_endpgm. The single-dword pattern includes the trailing " v" so
     ; that it cannot be satisfied by a second buffer_store_dwordx2.
 167 ; SI-DAG: buffer_store_dwordx2
 168 ; SI-DAG: buffer_store_dword v
 169 ; SI-NOT: buffer_store_dword
 170 ; GCN: s_endpgm
 171 define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
 172   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 173   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 174
 175   store i32 123, i32 addrspace(1)* %out.gep.1
 176   store i32 456, i32 addrspace(1)* %out.gep.2
 177   store i32 1234, i32 addrspace(1)* %out
 178   ret void
 179 }
 180
 181 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
     ; Two constant i64 stores to adjacent global elements, element 1 first.
     ; The check expects a buffer_store_dwordx4.
 182 ; GCN: buffer_store_dwordx4
 183 define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
 184   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
 185
 186   store i64 123, i64 addrspace(1)* %out.gep.1
 187   store i64 456, i64 addrspace(1)* %out
 188   ret void
 189 }
 190
 191 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
     ; Four constant i64 stores to consecutive global elements, element 0
     ; stored last. The checks expect two buffer_store_dwordx4 instructions.
 192 ; GCN: buffer_store_dwordx4
 193 ; GCN: buffer_store_dwordx4
 194 define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
 195   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
 196   %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
 197   %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
 198
 199   store i64 123, i64 addrspace(1)* %out.gep.1
 200   store i64 456, i64 addrspace(1)* %out.gep.2
 201   store i64 333, i64 addrspace(1)* %out.gep.3
 202   store i64 1234, i64 addrspace(1)* %out
 203   ret void
 204 }
 205
 206 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
     ; Copies two adjacent i32 elements from %in to the same positions in %out.
     ; The checks expect one buffer_load_dwordx2 and one buffer_store_dwordx2
     ; that stores the very register pair that was loaded.
 207 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 208 ; GCN: buffer_store_dwordx2 [[LOAD]]
 209 define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 210   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 211   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 212
 213   %lo = load i32, i32 addrspace(1)* %in
 214   %hi = load i32, i32 addrspace(1)* %in.gep.1
 215
 216   store i32 %lo, i32 addrspace(1)* %out
 217   store i32 %hi, i32 addrspace(1)* %out.gep.1
 218   ret void
 219 }
 220
 221 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
     ; Copies i32 elements 2 and 3 of %in to elements 2 and 3 of %out, so both
     ; accesses start at a non-zero base. The checks expect a dwordx2 load and
     ; a dwordx2 store of the same register pair, each with an immediate
     ; "offset:8" (element 2 * 4 bytes).
 222 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 223 ; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 224 define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 225   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 226   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 227
 228   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 229   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 230   %lo = load i32, i32 addrspace(1)* %in.gep.0
 231   %hi = load i32, i32 addrspace(1)* %in.gep.1
 232
 233   store i32 %lo, i32 addrspace(1)* %out.gep.0
 234   store i32 %hi, i32 addrspace(1)* %out.gep.1
 235   ret void
 236 }
 237
 238 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
     ; Loads two adjacent i32 elements and stores them swapped (%hi to element
     ; 0, %lo to element 1). The checks expect a dwordx2 load and a dwordx2
     ; store, but do not require the same register pair.
 239 ; GCN: buffer_load_dwordx2 v
 240 ; GCN: buffer_store_dwordx2 v
 241 define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 242   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 243   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 244
 245   %lo = load i32, i32 addrspace(1)* %in
 246   %hi = load i32, i32 addrspace(1)* %in.gep.1
 247
 248   store i32 %hi, i32 addrspace(1)* %out
 249   store i32 %lo, i32 addrspace(1)* %out.gep.1
 250   ret void
 251 }
 252
 253 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
     ; Copies four consecutive i32 elements from %in to the same positions in
     ; %out. The checks expect one buffer_load_dwordx4 and one
     ; buffer_store_dwordx4 of the same register range.
 254 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 255 ; GCN: buffer_store_dwordx4 [[LOAD]]
 256 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 257   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 258   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 259   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 260   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 261   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 262   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 263
 264   %x = load i32, i32 addrspace(1)* %in
 265   %y = load i32, i32 addrspace(1)* %in.gep.1
 266   %z = load i32, i32 addrspace(1)* %in.gep.2
 267   %w = load i32, i32 addrspace(1)* %in.gep.3
 268
 269   store i32 %x, i32 addrspace(1)* %out
 270   store i32 %y, i32 addrspace(1)* %out.gep.1
 271   store i32 %z, i32 addrspace(1)* %out.gep.2
 272   store i32 %w, i32 addrspace(1)* %out.gep.3
 273   ret void
 274 }
 275
 276 ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
     ; Copies three consecutive i32 elements from %in to %out. The checks
     ; expect the loads as a dwordx2 plus a single dword (either order), an
     ; s_waitcnt, and then the stores as a single dword plus a dwordx2 (either
     ; order).
 277 ; SI-DAG: buffer_load_dwordx2
 278 ; SI-DAG: buffer_load_dword v
 279 ; GCN: s_waitcnt
 280 ; SI-DAG: buffer_store_dword v
 281 ; SI-DAG: buffer_store_dwordx2 v
 282 ; GCN: s_endpgm
 283 define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 284   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 285   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 286   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 287   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 288
 289   %x = load i32, i32 addrspace(1)* %in
 290   %y = load i32, i32 addrspace(1)* %in.gep.1
 291   %z = load i32, i32 addrspace(1)* %in.gep.2
 292
 293   store i32 %x, i32 addrspace(1)* %out
 294   store i32 %y, i32 addrspace(1)* %out.gep.1
 295   store i32 %z, i32 addrspace(1)* %out.gep.2
 296   ret void
 297 }
 298
 299 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
     ; Float version of the four-element copy: four consecutive floats are
     ; loaded from %in and stored to the same positions in %out. The checks
     ; expect one buffer_load_dwordx4 and one buffer_store_dwordx4 of the same
     ; register range.
 300 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 301 ; GCN: buffer_store_dwordx4 [[LOAD]]
 302 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 303   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 304   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 305   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 306   %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
 307   %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
 308   %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
 309
 310   %x = load float, float addrspace(1)* %in
 311   %y = load float, float addrspace(1)* %in.gep.1
 312   %z = load float, float addrspace(1)* %in.gep.2
 313   %w = load float, float addrspace(1)* %in.gep.3
 314
 315   store float %x, float addrspace(1)* %out
 316   store float %y, float addrspace(1)* %out.gep.1
 317   store float %z, float addrspace(1)* %out.gep.2
 318   store float %w, float addrspace(1)* %out.gep.3
 319   ret void
 320 }
 321
 322 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
     ; Copies i32 elements 11..14 of %in to elements 7..10 of %out, so source
     ; and destination use different non-zero bases. The checks expect a
     ; dwordx4 load at "offset:44" (11 * 4) and a dwordx4 store of the same
     ; register range at "offset:28" (7 * 4).
 323 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
 324 ; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
 325 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 326   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
 327   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
 328   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
 329   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
 330   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
 331   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
 332   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
 333   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
 334
 335   %x = load i32, i32 addrspace(1)* %in.gep.0
 336   %y = load i32, i32 addrspace(1)* %in.gep.1
 337   %z = load i32, i32 addrspace(1)* %in.gep.2
 338   %w = load i32, i32 addrspace(1)* %in.gep.3
 339
 340   store i32 %x, i32 addrspace(1)* %out.gep.0
 341   store i32 %y, i32 addrspace(1)* %out.gep.1
 342   store i32 %z, i32 addrspace(1)* %out.gep.2
 343   store i32 %w, i32 addrspace(1)* %out.gep.3
 344   ret void
 345 }
 346
 347 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
     ; Loads four consecutive i32 elements, executes a workgroup barrier, then
     ; stores each value to its original element index but issues the stores in
     ; descending address order. The checks expect a dwordx4 load, the
     ; s_barrier, and a dwordx4 store of the same register range.
 348 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 349 ; GCN: s_barrier
 350 ; GCN: buffer_store_dwordx4 [[LOAD]]
 351 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 352   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 353   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 354   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 355   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 356   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 357   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 358
 359   %x = load i32, i32 addrspace(1)* %in
 360   %y = load i32, i32 addrspace(1)* %in.gep.1
 361   %z = load i32, i32 addrspace(1)* %in.gep.2
 362   %w = load i32, i32 addrspace(1)* %in.gep.3
 363
 364   ; Make sure the barrier doesn't stop this
 365   tail call void @llvm.amdgcn.s.barrier() #1
 366
 367   store i32 %w, i32 addrspace(1)* %out.gep.3
 368   store i32 %z, i32 addrspace(1)* %out.gep.2
 369   store i32 %y, i32 addrspace(1)* %out.gep.1
 370   store i32 %x, i32 addrspace(1)* %out
 371
 372   ret void
 373 }
 374
 375 ; TODO: Re-packing of loaded register required. Maybe an IR pass
 376 ; should catch this?
 377
 378 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
     ; Loads four consecutive i32 elements, executes a workgroup barrier, then
     ; stores them in reversed element order (%w to element 0 ... %x to element
     ; 3). The checks expect a dwordx4 load, the s_barrier, and a dwordx4
     ; store, without requiring the same register range.
 379 ; GCN: buffer_load_dwordx4 v
 380 ; GCN: s_barrier
 381 ; GCN: buffer_store_dwordx4 v
 382 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 383   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 384   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 385   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 386   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 387   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 388   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 389
 390   %x = load i32, i32 addrspace(1)* %in
 391   %y = load i32, i32 addrspace(1)* %in.gep.1
 392   %z = load i32, i32 addrspace(1)* %in.gep.2
 393   %w = load i32, i32 addrspace(1)* %in.gep.3
 394
 395   ; Make sure the barrier doesn't stop this
 396   tail call void @llvm.amdgcn.s.barrier() #1
 397
 398   store i32 %w, i32 addrspace(1)* %out
 399   store i32 %z, i32 addrspace(1)* %out.gep.1
 400   store i32 %y, i32 addrspace(1)* %out.gep.2
 401   store i32 %x, i32 addrspace(1)* %out.gep.3
 402
 403   ret void
 404 }
 405
 406 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
     ; Copies four consecutive bytes from %in to %out. The first load and the
     ; first store are marked "align 4". The checks expect a single
     ; buffer_load_dword and a buffer_store_dword of the same register.
 407 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
 408 ; GCN: buffer_store_dword [[LOAD]]
 409 ; GCN: s_endpgm
 410 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
 411   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
 412   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
 413   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
 414   %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
 415   %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
 416   %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
 417
 418   %x = load i8, i8 addrspace(1)* %in, align 4
 419   %y = load i8, i8 addrspace(1)* %in.gep.1
 420   %z = load i8, i8 addrspace(1)* %in.gep.2
 421   %w = load i8, i8 addrspace(1)* %in.gep.3
 422
 423   store i8 %x, i8 addrspace(1)* %out, align 4
 424   store i8 %y, i8 addrspace(1)* %out.gep.1
 425   store i8 %z, i8 addrspace(1)* %out.gep.2
 426   store i8 %w, i8 addrspace(1)* %out.gep.3
 427   ret void
 428 }
 429
 430 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
     ; Same four-byte copy but with no explicit alignment on any access. The
     ; checks expect four buffer_load_ubyte followed by four buffer_store_byte
     ; instructions.
 431 ; GCN: buffer_load_ubyte
 432 ; GCN: buffer_load_ubyte
 433 ; GCN: buffer_load_ubyte
 434 ; GCN: buffer_load_ubyte
 435 ; GCN: buffer_store_byte
 436 ; GCN: buffer_store_byte
 437 ; GCN: buffer_store_byte
 438 ; GCN: buffer_store_byte
 439 ; GCN: s_endpgm
 440 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
 441   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
 442   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
 443   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
 444   %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
 445   %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
 446   %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
 447
 448   %x = load i8, i8 addrspace(1)* %in
 449   %y = load i8, i8 addrspace(1)* %in.gep.1
 450   %z = load i8, i8 addrspace(1)* %in.gep.2
 451   %w = load i8, i8 addrspace(1)* %in.gep.3
 452
 453   store i8 %x, i8 addrspace(1)* %out
 454   store i8 %y, i8 addrspace(1)* %out.gep.1
 455   store i8 %z, i8 addrspace(1)* %out.gep.2
 456   store i8 %w, i8 addrspace(1)* %out.gep.3
 457   ret void
 458 }
 459
 460 ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
     ; Loads a <4 x i32> vector from %in, extracts its four elements and stores
     ; them as scalars to four consecutive i32 elements of %out. The checks
     ; expect a dwordx4 load and a dwordx4 store of the same register range.
 461 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 462 ; GCN: buffer_store_dwordx4 [[LOAD]]
 463 ; GCN: s_endpgm
 464 define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
 465   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 466   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 467   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 468   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
 469
 470   %x = extractelement <4 x i32> %vec, i32 0
 471   %y = extractelement <4 x i32> %vec, i32 1
 472   %z = extractelement <4 x i32> %vec, i32 2
 473   %w = extractelement <4 x i32> %vec, i32 3
 474
 475   store i32 %x, i32 addrspace(1)* %out
 476   store i32 %y, i32 addrspace(1)* %out.gep.1
 477   store i32 %z, i32 addrspace(1)* %out.gep.2
 478   store i32 %w, i32 addrspace(1)* %out.gep.3
 479   ret void
 480 }
 481
 482 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
     ; Local-memory (addrspace(3)) version of the i8 pair test: two constant i8
     ; stores to adjacent bytes, the base store marked "align 2". The checks
     ; expect a ds_write_b16.
 483 ; GCN: ds_write_b16
 484 ; GCN: s_endpgm
 485 define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
 486   %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
 487
 488   store i8 123, i8 addrspace(3)* %out.gep.1
       ; -56 is 0xc8, the low byte of the out-of-range literal 456 used before;
       ; spelling it in range avoids relying on implicit truncation.
 489   store i8 -56, i8 addrspace(3)* %out, align 2
 490   ret void
 491 }
 492
 493 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
     ; Two constant i32 stores to adjacent local (addrspace(3)) elements,
     ; element 1 first. The checks expect one ds_write2_b32 with 0x1c8 (456,
     ; element 0) as the first data operand, 0x7b (123, element 1) as the
     ; second, and "offset1:1".
 494 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
 495 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
 496 ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
 497 define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
 498   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 499
 500   store i32 123, i32 addrspace(3)* %out.gep.1
 501   store i32 456, i32 addrspace(3)* %out
 502   ret void
 503 }
 504
 505 ; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
     ; Four constant i32 stores to consecutive local (addrspace(3)) elements,
     ; element 0 stored last. The checks expect two ds_write2_b32 instructions
     ; in either order: one writing 0x1c8 (456) and 0x14d (333) at
     ; "offset0:2 offset1:3", and one writing 0x4d2 (1234) and 0x7b (123) at
     ; "offset1:1".
 506 ; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8
 507 ; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d
 508 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3
 509
 510 ; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2
 511 ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b
 512 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1
 513
 514 ; GCN: s_endpgm
 515 define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
 516   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 517   %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
 518   %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
 519
 520   store i32 123, i32 addrspace(3)* %out.gep.1
 521   store i32 456, i32 addrspace(3)* %out.gep.2
 522   store i32 333, i32 addrspace(3)* %out.gep.3
 523   store i32 1234, i32 addrspace(3)* %out
 524   ret void
 525 }
 526
 527 ; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
     ; Five constant i32 stores (all "align 4") to consecutive global elements
     ; in ascending order. The checks expect a buffer_store_dwordx4 covering
     ; the registers from the one holding 9 (element 0) to the one holding -12
     ; (element 3), followed by a v_mov of 11 and a single buffer_store_dword
     ; of that register for element 4.
 528 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
 529 ; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
 530 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
 531 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
 532 ; GCN: buffer_store_dword v[[HI]]
 533 define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
 534   store i32 9, i32 addrspace(1)* %out, align 4
 535   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 536   store i32 12, i32 addrspace(1)* %idx1, align 4
 537   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 538   store i32 16, i32 addrspace(1)* %idx2, align 4
 539   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 540   store i32 -12, i32 addrspace(1)* %idx3, align 4
 541   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 542   store i32 11, i32 addrspace(1)* %idx4, align 4
 543   ret void
 544 }
 545
 546 ; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
     ; Six constant i32 stores (all "align 4") to consecutive global elements
     ; in ascending order. The checks expect a buffer_store_dwordx4 followed by
     ; a buffer_store_dwordx2.
 547 ; GCN: buffer_store_dwordx4
 548 ; GCN: buffer_store_dwordx2
 549 define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
 550   store i32 13, i32 addrspace(1)* %out, align 4
 551   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 552   store i32 15, i32 addrspace(1)* %idx1, align 4
 553   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 554   store i32 62, i32 addrspace(1)* %idx2, align 4
 555   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 556   store i32 63, i32 addrspace(1)* %idx3, align 4
 557   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 558   store i32 11, i32 addrspace(1)* %idx4, align 4
 559   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 560   store i32 123, i32 addrspace(1)* %idx5, align 4
 561   ret void
 562 }
 563
 564 ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
     ; Seven constant i32 stores (all "align 4") to consecutive global elements
     ; in ascending order. The checks expect a buffer_store_dwordx4, then a
     ; buffer_store_dwordx2, then a single buffer_store_dword.
 565 ; GCN: buffer_store_dwordx4
 566 ; GCN: buffer_store_dwordx2
 567 ; GCN: buffer_store_dword v
 568 define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
 569   store i32 34, i32 addrspace(1)* %out, align 4
 570   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 571   store i32 999, i32 addrspace(1)* %idx1, align 4
 572   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 573   store i32 65, i32 addrspace(1)* %idx2, align 4
 574   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 575   store i32 33, i32 addrspace(1)* %idx3, align 4
 576   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 577   store i32 98, i32 addrspace(1)* %idx4, align 4
 578   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 579   store i32 91, i32 addrspace(1)* %idx5, align 4
 580   %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
 581   store i32 212, i32 addrspace(1)* %idx6, align 4
 582   ret void
 583 }
 584
 585 ; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
     ; Eight constant i32 stores (all "align 4") to consecutive global elements
     ; in ascending order. The checks expect two buffer_store_dwordx4
     ; instructions.
 586 ; GCN: buffer_store_dwordx4
 587 ; GCN: buffer_store_dwordx4
 588 ; GCN: s_endpgm
 589 define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
 590   store i32 34, i32 addrspace(1)* %out, align 4
 591   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 592   store i32 999, i32 addrspace(1)* %idx1, align 4
 593   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 594   store i32 65, i32 addrspace(1)* %idx2, align 4
 595   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 596   store i32 33, i32 addrspace(1)* %idx3, align 4
 597   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 598   store i32 98, i32 addrspace(1)* %idx4, align 4
 599   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 600   store i32 91, i32 addrspace(1)* %idx5, align 4
 601   %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
 602   store i32 212, i32 addrspace(1)* %idx6, align 4
 603   %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
 604   store i32 999, i32 addrspace(1)* %idx7, align 4
 605   ret void
 606 }
 607
 608 ; This requires handling of scalar_to_vector for v2i64 to avoid
 609 ; scratch usage.
 610 ; FIXME: Should do single load and store
 611
 612 ; GCN-LABEL: {{^}}copy_v3i32_align4:
     ; Copies a <3 x i32> vector between two noalias global pointers; the load
     ; is marked "align 4". The checks expect no scratch resource setup, the
     ; load split into a dwordx2 at offset 0 plus a dword at "offset:8", no
     ; "offen" addressing, the store split the same way, and a reported
     ; ScratchSize of 0.
 613 ; GCN-NOT: SCRATCH_RSRC_DWORD
 614 ; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 615 ; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 616 ; GCN-NOT: offen
 617 ; GCN: s_waitcnt vmcnt
 618 ; GCN-NOT: offen
 619 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 620 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 621
 622 ; GCN: ScratchSize: 0{{$}}
 623 define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
 624   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
 625   store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
 626   ret void
 627 }
 628
 629 ; GCN-LABEL: {{^}}copy_v3i64_align4:
     ; Copies a <3 x i64> vector between two noalias global pointers; the load
     ; is marked "align 4". The checks expect no scratch resource setup, the
     ; load split into a dwordx4 at offset 0 plus a dwordx2 at "offset:16", no
     ; "offen" addressing, the store split the same way, and a reported
     ; ScratchSize of 0.
 630 ; GCN-NOT: SCRATCH_RSRC_DWORD
 631 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 632 ; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
 633 ; GCN-NOT: offen
 634 ; GCN: s_waitcnt vmcnt
 635 ; GCN-NOT: offen
 636 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 637 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
 638 ; GCN: ScratchSize: 0{{$}}
 639 define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
 640   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
 641   store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
 642   ret void
 643 }
 644
 645 ; GCN-LABEL: {{^}}copy_v3f32_align4:
     ; Loads a <3 x float> vector ("align 4"), adds the constant vector
     ; <1.0, 2.0, 4.0> and stores the result through a noalias global pointer.
     ; The checks expect no scratch resource setup, the load split into a
     ; dwordx2 at offset 0 plus a dword at "offset:8", no "offen" addressing,
     ; the store split the same way, and a reported ScratchSize of 0.
 646 ; GCN-NOT: SCRATCH_RSRC_DWORD
 647 ; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 648 ; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 649 ; GCN-NOT: offen
 650 ; GCN: s_waitcnt vmcnt
 651 ; GCN-NOT: offen
 652 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 653 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 654 ; GCN: ScratchSize: 0{{$}}
 655 define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
 656   %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
 657   %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
 658   store <3 x float> %fadd, <3 x float> addrspace(1)* %out
 659   ret void
 660 }
 661
 662 ; GCN-LABEL: {{^}}copy_v3f64_align4:
     ; Loads a <3 x double> vector ("align 4"), adds the constant vector
     ; <1.0, 2.0, 4.0> and stores the result through a noalias global pointer.
     ; The checks expect no scratch resource setup, the load split into a
     ; dwordx4 at offset 0 plus a dwordx2 at "offset:16", no "offen"
     ; addressing, the store split the same way, and a reported ScratchSize
     ; of 0.
 663 ; GCN-NOT: SCRATCH_RSRC_DWORD
 664 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 665 ; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
 666 ; GCN-NOT: offen
 667 ; GCN: s_waitcnt vmcnt
 668 ; GCN-NOT: offen
 669 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 670 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
 671 ; GCN: ScratchSize: 0{{$}}
 672 define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
 673   %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
 674   %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
 675   store <3 x double> %fadd, <3 x double> addrspace(1)* %out
 676   ret void
 677 }
 678
 679 declare void @llvm.amdgcn.s.barrier() #1
 680
 681 attributes #0 = { nounwind }
 682 attributes #1 = { convergent nounwind }