test/CodeGen/AMDGPU/merge-stores.ll

   1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
   2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=GCN-AA %s
   3
   4 ; This test is mostly to test DAG store merging, so disable the vectorizer.
   5 ; Run on devices with different unaligned load restrictions.
   6
   7 ; TODO: Vector element tests
   8 ; TODO: Non-zero base offset for load and store combinations
   9 ; TODO: Same base addrspacecasted
  10
  11
  12 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
  13 ; GCN: buffer_store_short
  14 ; GCN: s_endpgm
  15 define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
       ; Writes two constant bytes to %out[1] and %out[0]. The store to %out
       ; carries align 2, and the checks above expect one buffer_store_short.
  16   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  17
  18   store i8 123, i8 addrspace(1)* %out.gep.1
       ; NOTE(review): 456 does not fit in i8; presumably the IR parser wraps
       ; it to 200 -- confirm this is intentional.
  19   store i8 456, i8 addrspace(1)* %out, align 2
  20   ret void
  21 }
  22
  23 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
  24 ; GCN: buffer_store_byte
  25 ; GCN: buffer_store_byte
  26 ; GCN: s_endpgm
  27 define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
       ; Same two byte stores as the align-2 variant, but with no explicit
       ; alignment on either store. The checks above expect two separate
       ; buffer_store_byte instructions, i.e. no merge.
  28   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  29
  30   store i8 123, i8 addrspace(1)* %out.gep.1
       ; NOTE(review): 456 does not fit in i8; presumably the IR parser wraps
       ; it to 200 -- confirm this is intentional.
  31   store i8 456, i8 addrspace(1)* %out
  32   ret void
  33 }
  34
  35 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
  36 ; GCN: buffer_store_dword v
  37 define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
       ; Writes two constant i16 values to %out[1] and %out[0]. The store to
       ; %out carries align 4, and the check above expects one
       ; buffer_store_dword.
  38   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  39
  40   store i16 123, i16 addrspace(1)* %out.gep.1
  41   store i16 456, i16 addrspace(1)* %out, align 4
  42   ret void
  43 }
  44
  45 ; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
  46 ; GCN: buffer_store_dword v
  47 define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
       ; Variant of the two-i16 test where both stored constants are zero.
       ; The store to %out carries align 4, and the check above expects one
       ; buffer_store_dword.
  48   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  49
  50   store i16 0, i16 addrspace(1)* %out.gep.1
  51   store i16 0, i16 addrspace(1)* %out, align 4
  52   ret void
  53 }
  54
  55 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
  56 ; GCN: buffer_store_short
  57 ; GCN: buffer_store_short
  58 ; GCN: s_endpgm
  59 define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
       ; Two constant i16 stores with no explicit alignment on either store.
       ; The checks above expect two separate buffer_store_short
       ; instructions, i.e. no merge.
  60   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  61
  62   store i16 123, i16 addrspace(1)* %out.gep.1
  63   store i16 456, i16 addrspace(1)* %out
  64   ret void
  65 }
  66
  67 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
  68 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
  69 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
  70 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
  71 define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
       ; Writes 123 (0x7b) to %out[1] first, then 456 (0x1c8) to %out[0].
       ; The checks above expect one buffer_store_dwordx2 whose low register
       ; holds 0x1c8 and whose high register holds 0x7b.
  72   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  73
  74   store i32 123, i32 addrspace(1)* %out.gep.1
  75   store i32 456, i32 addrspace(1)* %out
  76   ret void
  77 }
  78
  79 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
  80 ; GCN: buffer_store_dwordx2
  81 define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
       ; Mixed-type pair: a float constant goes to element 1 through a
       ; bitcast pointer, and an i32 constant goes to element 0. The check
       ; above expects one buffer_store_dwordx2.
  82   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  83   %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
  84   store float 1.0, float addrspace(1)* %out.gep.1.bc
  85   store i32 456, i32 addrspace(1)* %out
  86   ret void
  87 }
  88
  89 ; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
  90 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
  91 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
  92 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
  93 define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
       ; Mirror of the i32_f32 test: i32 123 (0x7b) goes to element 1 through
       ; a bitcast pointer, and float 4.0 goes to element 0. The checks above
       ; expect one buffer_store_dwordx2 with 4.0 low and 0x7b high.
  94   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  95   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  96   store i32 123, i32 addrspace(1)* %out.gep.1.bc
  97   store float 4.0, float addrspace(1)* %out
  98   ret void
  99 }
 100
 101 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
 102 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
 103 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
 104 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
 105 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
 106 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
 107 define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
       ; Four constant i32 stores covering elements 0..3; element 0 is
       ; written last. The checks above expect one buffer_store_dwordx4 whose
       ; first register holds 0x4d2 (1234) and last register 0x14d (333).
 108   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 109   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 110   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 111
 112   store i32 123, i32 addrspace(1)* %out.gep.1
 113   store i32 456, i32 addrspace(1)* %out.gep.2
 114   store i32 333, i32 addrspace(1)* %out.gep.3
 115   store i32 1234, i32 addrspace(1)* %out
 116   ret void
 117 }
 118
 119 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
 120 ; GCN: buffer_store_dwordx4
 121 define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
       ; Four constant float stores issued in ascending address order
       ; (elements 0, 1, 2, 3). The check above expects one
       ; buffer_store_dwordx4.
 122   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 123   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 124   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 125
 126   store float 8.0, float addrspace(1)* %out
 127   store float 1.0, float addrspace(1)* %out.gep.1
 128   store float 2.0, float addrspace(1)* %out.gep.2
 129   store float 4.0, float addrspace(1)* %out.gep.3
 130   ret void
 131 }
 132
 133 ; First store is out of order.
 134 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
 135 ; GCN: buffer_store_dwordx4
 136 define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
       ; Same four float constants as the _order variant, but the store to
       ; element 0 is issued last (after elements 1, 2, 3). The check above
       ; still expects one buffer_store_dwordx4.
 137   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 138   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 139   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 140
 141   store float 1.0, float addrspace(1)* %out.gep.1
 142   store float 2.0, float addrspace(1)* %out.gep.2
 143   store float 4.0, float addrspace(1)* %out.gep.3
 144   store float 8.0, float addrspace(1)* %out
 145   ret void
 146 }
 147
 148 ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
 149 ; GCN-AA: buffer_store_dwordx4 v
 150 ; GCN: s_endpgm
 151 define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
       ; Four adjacent 32-bit constant stores with alternating types:
       ; elements 1 and 3 are written as i32 through bitcast pointers,
       ; elements 0 and 2 as float. The check above expects one
       ; buffer_store_dwordx4.
 152   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 153   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 154   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 155
 156   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
 157   %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*
 158
 159   store i32 11, i32 addrspace(1)* %out.gep.1.bc
 160   store float 2.0, float addrspace(1)* %out.gep.2
 161   store i32 17, i32 addrspace(1)* %out.gep.3.bc
 162   store float 8.0, float addrspace(1)* %out
 163   ret void
 164 }
 165
 166 ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
 167 ; SI-DAG: buffer_store_dwordx2
 168 ; SI-DAG: buffer_store_dword v
 169 ; CI-DAG: buffer_store_dwordx3
 170 ; GCN-NOT: buffer_store_dword
 171 ; GCN: s_endpgm
 172 define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
       ; Three constant i32 stores covering elements 0..2; element 0 is
       ; written last. Per the checks above, the SI run expects a dwordx2
       ; store plus a dword store, and the CI run expects one dwordx3 store.
 173   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 174   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 175
 176   store i32 123, i32 addrspace(1)* %out.gep.1
 177   store i32 456, i32 addrspace(1)* %out.gep.2
 178   store i32 1234, i32 addrspace(1)* %out
 179   ret void
 180 }
 181
 182 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
 183 ; GCN: buffer_store_dwordx4
 184 define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
       ; Two constant i64 stores to elements 1 and 0 (16 bytes in total).
       ; The check above expects one buffer_store_dwordx4.
 185   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
 186
 187   store i64 123, i64 addrspace(1)* %out.gep.1
 188   store i64 456, i64 addrspace(1)* %out
 189   ret void
 190 }
 191
 192 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
 193 ; GCN: buffer_store_dwordx4
 194 ; GCN: buffer_store_dwordx4
 195 define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
       ; Four constant i64 stores covering elements 0..3 (32 bytes in total);
       ; element 0 is written last. The checks above expect two
       ; buffer_store_dwordx4 instructions.
 196   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
 197   %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
 198   %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
 199
 200   store i64 123, i64 addrspace(1)* %out.gep.1
 201   store i64 456, i64 addrspace(1)* %out.gep.2
 202   store i64 333, i64 addrspace(1)* %out.gep.3
 203   store i64 1234, i64 addrspace(1)* %out
 204   ret void
 205 }
 206
 207 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
 208 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 209 ; GCN: buffer_store_dwordx2 [[LOAD]]
 210 define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
       ; Copies two adjacent i32 values from %in[0..1] to %out[0..1] in the
       ; same order. The checks above expect one buffer_load_dwordx2 whose
       ; result register pair feeds one buffer_store_dwordx2.
 211   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 212   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 213
 214   %lo = load i32, i32 addrspace(1)* %in
 215   %hi = load i32, i32 addrspace(1)* %in.gep.1
 216
 217   store i32 %lo, i32 addrspace(1)* %out
 218   store i32 %hi, i32 addrspace(1)* %out.gep.1
 219   ret void
 220 }
 221
 222 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
 223 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 224 ; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 225 define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
       ; Two-element copy where both source and destination start at i32
       ; index 2, i.e. byte offset 8. The checks above expect the merged
       ; dwordx2 load and store to both use offset:8.
 226   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 227   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 228
 229   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 230   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 231   %lo = load i32, i32 addrspace(1)* %in.gep.0
 232   %hi = load i32, i32 addrspace(1)* %in.gep.1
 233
 234   store i32 %lo, i32 addrspace(1)* %out.gep.0
 235   store i32 %hi, i32 addrspace(1)* %out.gep.1
 236   ret void
 237 }
 238
 239 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
 240 ; GCN: buffer_load_dwordx2 v
 241 ; GCN: buffer_store_dwordx2 v
 242 define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
       ; Loads %in[0] and %in[1] and stores them swapped: %in[1] goes to
       ; %out[0] and %in[0] to %out[1]. The checks above expect a dwordx2
       ; load and a dwordx2 store, but do not tie the store operand to the
       ; loaded register pair.
 243   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 244   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 245
 246   %lo = load i32, i32 addrspace(1)* %in
 247   %hi = load i32, i32 addrspace(1)* %in.gep.1
 248
 249   store i32 %hi, i32 addrspace(1)* %out
 250   store i32 %lo, i32 addrspace(1)* %out.gep.1
 251   ret void
 252 }
 253
 254 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
 255 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 256 ; GCN: buffer_store_dwordx4 [[LOAD]]
 257 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
       ; Copies four adjacent i32 values from %in[0..3] to %out[0..3] in the
       ; same order. The checks above expect one buffer_load_dwordx4 whose
       ; result registers feed one buffer_store_dwordx4.
 258   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 259   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 260   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 261   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 262   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 263   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 264
 265   %x = load i32, i32 addrspace(1)* %in
 266   %y = load i32, i32 addrspace(1)* %in.gep.1
 267   %z = load i32, i32 addrspace(1)* %in.gep.2
 268   %w = load i32, i32 addrspace(1)* %in.gep.3
 269
 270   store i32 %x, i32 addrspace(1)* %out
 271   store i32 %y, i32 addrspace(1)* %out.gep.1
 272   store i32 %z, i32 addrspace(1)* %out.gep.2
 273   store i32 %w, i32 addrspace(1)* %out.gep.3
 274   ret void
 275 }
 276
 277 ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
 278 ; SI-DAG: buffer_load_dwordx4
 279 ; CI-DAG: buffer_load_dwordx3
 280 ; GCN: s_waitcnt
 281 ; SI-DAG: buffer_store_dwordx2
 282 ; SI-DAG: buffer_store_dword v
 283 ; CI-DAG: buffer_store_dwordx3
 284 ; GCN: s_endpgm
 285 define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
       ; Copies three adjacent i32 values from %in[0..2] to %out[0..2].
       ; Per the checks above, the SI run expects a dwordx4 load followed by
       ; dwordx2 + dword stores; the CI run expects a dwordx3 load and a
       ; dwordx3 store.
 286   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 287   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 288   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 289   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 290
 291   %x = load i32, i32 addrspace(1)* %in
 292   %y = load i32, i32 addrspace(1)* %in.gep.1
 293   %z = load i32, i32 addrspace(1)* %in.gep.2
 294
 295   store i32 %x, i32 addrspace(1)* %out
 296   store i32 %y, i32 addrspace(1)* %out.gep.1
 297   store i32 %z, i32 addrspace(1)* %out.gep.2
 298   ret void
 299 }
 300
 301 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
 302 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 303 ; GCN: buffer_store_dwordx4 [[LOAD]]
 304 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
       ; Float counterpart of the four-element i32 copy: loads %in[0..3] and
       ; stores them to %out[0..3] in the same order. The checks above expect
       ; one buffer_load_dwordx4 feeding one buffer_store_dwordx4.
 305   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 306   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 307   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 308   %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
 309   %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
 310   %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
 311
 312   %x = load float, float addrspace(1)* %in
 313   %y = load float, float addrspace(1)* %in.gep.1
 314   %z = load float, float addrspace(1)* %in.gep.2
 315   %w = load float, float addrspace(1)* %in.gep.3
 316
 317   store float %x, float addrspace(1)* %out
 318   store float %y, float addrspace(1)* %out.gep.1
 319   store float %z, float addrspace(1)* %out.gep.2
 320   store float %w, float addrspace(1)* %out.gep.3
 321   ret void
 322 }
 323
 324 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
 325 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
 326 ; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
 327 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
       ; Four-element copy with different non-zero bases: the source starts
       ; at i32 index 11 (byte offset 44) and the destination at index 7
       ; (byte offset 28). The checks above expect offset:44 on the merged
       ; load and offset:28 on the merged store.
 328   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
 329   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
 330   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
 331   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
 332   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
 333   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
 334   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
 335   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
 336
 337   %x = load i32, i32 addrspace(1)* %in.gep.0
 338   %y = load i32, i32 addrspace(1)* %in.gep.1
 339   %z = load i32, i32 addrspace(1)* %in.gep.2
 340   %w = load i32, i32 addrspace(1)* %in.gep.3
 341
 342   store i32 %x, i32 addrspace(1)* %out.gep.0
 343   store i32 %y, i32 addrspace(1)* %out.gep.1
 344   store i32 %z, i32 addrspace(1)* %out.gep.2
 345   store i32 %w, i32 addrspace(1)* %out.gep.3
 346   ret void
 347 }
 348
 349 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
 350 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 351 ; GCN: s_barrier
 352 ; GCN: buffer_store_dwordx4 [[LOAD]]
 353 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
       ; Loads %in[0..3], calls the barrier intrinsic, then stores each value
       ; to its matching %out index, issuing the stores in descending address
       ; order (3, 2, 1, 0). The checks above expect a dwordx4 load, then
       ; s_barrier, then a dwordx4 store of the loaded registers.
 354   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 355   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 356   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 357   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 358   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 359   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 360
 361   %x = load i32, i32 addrspace(1)* %in
 362   %y = load i32, i32 addrspace(1)* %in.gep.1
 363   %z = load i32, i32 addrspace(1)* %in.gep.2
 364   %w = load i32, i32 addrspace(1)* %in.gep.3
 365
 366   ; Make sure the barrier doesn't stop this
 367   tail call void @llvm.amdgcn.s.barrier() #1
 368
 369   store i32 %w, i32 addrspace(1)* %out.gep.3
 370   store i32 %z, i32 addrspace(1)* %out.gep.2
 371   store i32 %y, i32 addrspace(1)* %out.gep.1
 372   store i32 %x, i32 addrspace(1)* %out
 373
 374   ret void
 375 }
 376
 377 ; TODO: Re-packing of loaded register required. Maybe an IR pass
 378 ; should catch this?
 379
 380 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
 381 ; GCN: buffer_load_dwordx4 v
 382 ; GCN: s_barrier
 383 ; GCN: buffer_store_dwordx4 v
 384 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
       ; Loads %in[0..3], calls the barrier intrinsic, then stores the values
       ; in reversed element order: %in[3] goes to %out[0], down to %in[0]
       ; going to %out[3]. The checks above expect a dwordx4 load and a
       ; dwordx4 store around s_barrier, but do not tie the stored registers
       ; to the loaded ones.
 385   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 386   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 387   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 388   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 389   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 390   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 391
 392   %x = load i32, i32 addrspace(1)* %in
 393   %y = load i32, i32 addrspace(1)* %in.gep.1
 394   %z = load i32, i32 addrspace(1)* %in.gep.2
 395   %w = load i32, i32 addrspace(1)* %in.gep.3
 396
 397   ; Make sure the barrier doesn't stop this
 398   tail call void @llvm.amdgcn.s.barrier() #1
 399
 400   store i32 %w, i32 addrspace(1)* %out
 401   store i32 %z, i32 addrspace(1)* %out.gep.1
 402   store i32 %y, i32 addrspace(1)* %out.gep.2
 403   store i32 %x, i32 addrspace(1)* %out.gep.3
 404
 405   ret void
 406 }
 407
 408 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
 409 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
 410 ; GCN: buffer_store_dword [[LOAD]]
 411 ; GCN: s_endpgm
 412 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
       ; Copies four adjacent bytes from %in[0..3] to %out[0..3]. Only the
       ; first load and the first store carry align 4. The checks above
       ; expect one buffer_load_dword feeding one buffer_store_dword.
 413   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
 414   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
 415   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
 416   %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
 417   %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
 418   %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
 419
 420   %x = load i8, i8 addrspace(1)* %in, align 4
 421   %y = load i8, i8 addrspace(1)* %in.gep.1
 422   %z = load i8, i8 addrspace(1)* %in.gep.2
 423   %w = load i8, i8 addrspace(1)* %in.gep.3
 424
 425   store i8 %x, i8 addrspace(1)* %out, align 4
 426   store i8 %y, i8 addrspace(1)* %out.gep.1
 427   store i8 %z, i8 addrspace(1)* %out.gep.2
 428   store i8 %w, i8 addrspace(1)* %out.gep.3
 429   ret void
 430 }
 431
 432 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
 433 ; GCN: buffer_load_ubyte
 434 ; GCN: buffer_load_ubyte
 435 ; GCN: buffer_load_ubyte
 436 ; GCN: buffer_load_ubyte
 437 ; GCN: buffer_store_byte
 438 ; GCN: buffer_store_byte
 439 ; GCN: buffer_store_byte
 440 ; GCN: buffer_store_byte
 441 ; GCN: s_endpgm
 442 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
       ; Same four-byte copy as the align-4 variant, but no load or store has
       ; an explicit alignment. The checks above expect four
       ; buffer_load_ubyte and four buffer_store_byte instructions, i.e. no
       ; merge.
 443   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
 444   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
 445   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
 446   %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
 447   %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
 448   %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
 449
 450   %x = load i8, i8 addrspace(1)* %in
 451   %y = load i8, i8 addrspace(1)* %in.gep.1
 452   %z = load i8, i8 addrspace(1)* %in.gep.2
 453   %w = load i8, i8 addrspace(1)* %in.gep.3
 454
 455   store i8 %x, i8 addrspace(1)* %out
 456   store i8 %y, i8 addrspace(1)* %out.gep.1
 457   store i8 %z, i8 addrspace(1)* %out.gep.2
 458   store i8 %w, i8 addrspace(1)* %out.gep.3
 459   ret void
 460 }
 461
 462 ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
 463 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 464 ; GCN: buffer_store_dwordx4 [[LOAD]]
 465 ; GCN: s_endpgm
 466 define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
       ; Loads one <4 x i32> vector, extracts its four elements, and stores
       ; each as a scalar i32 to %out[0..3] in element order. The checks
       ; above expect one buffer_load_dwordx4 feeding one
       ; buffer_store_dwordx4.
 467   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 468   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 469   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 470   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
 471
 472   %x = extractelement <4 x i32> %vec, i32 0
 473   %y = extractelement <4 x i32> %vec, i32 1
 474   %z = extractelement <4 x i32> %vec, i32 2
 475   %w = extractelement <4 x i32> %vec, i32 3
 476
 477   store i32 %x, i32 addrspace(1)* %out
 478   store i32 %y, i32 addrspace(1)* %out.gep.1
 479   store i32 %z, i32 addrspace(1)* %out.gep.2
 480   store i32 %w, i32 addrspace(1)* %out.gep.3
 481   ret void
 482 }
 483
 484 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
 485 ; GCN: ds_write_b16
 486 ; GCN: s_endpgm
 487 define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
       ; addrspace(3) counterpart of the two-byte constant test: the store to
       ; %out carries align 2, and the checks above expect one ds_write_b16.
 488   %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
 489
 490   store i8 123, i8 addrspace(3)* %out.gep.1
       ; NOTE(review): 456 does not fit in i8; presumably the IR parser wraps
       ; it to 200 -- confirm this is intentional.
 491   store i8 456, i8 addrspace(3)* %out, align 2
 492   ret void
 493 }
 494
 495 ; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
 496 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
 497 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
 498 ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
 499 define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
       ; Writes 123 (0x7b) to %out[1] and 456 (0x1c8) to %out[0] in
       ; addrspace(3). The checks above expect one ds_write2_b32 with
       ; offset1:1 whose first data register holds 0x1c8 and second 0x7b.
 500   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 501
 502   store i32 123, i32 addrspace(3)* %out.gep.1
 503   store i32 456, i32 addrspace(3)* %out
 504   ret void
 505 }
 506
 507 ; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
 508 ; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8
 509 ; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d
 510 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3
 511
 512 ; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2
 513 ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b
 514 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1
 515
 516 ; GCN: s_endpgm
 517 define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
       ; Four constant i32 stores covering elements 0..3 in addrspace(3);
       ; element 0 is written last. The checks above expect two ds_write2_b32
       ; instructions: one for elements 2 and 3 (0x1c8, 0x14d) and one for
       ; elements 0 and 1 (0x4d2, 0x7b).
 518   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 519   %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
 520   %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
 521
 522   store i32 123, i32 addrspace(3)* %out.gep.1
 523   store i32 456, i32 addrspace(3)* %out.gep.2
 524   store i32 333, i32 addrspace(3)* %out.gep.3
 525   store i32 1234, i32 addrspace(3)* %out
 526   ret void
 527 }
 528
 529 ; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
 530 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
 531 ; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
 532 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
 533 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
 534 ; GCN: buffer_store_dword v[[HI]]
 535 define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
       ; Five constant i32 stores to elements 0..4 in ascending order, each
       ; with align 4. This kernel carries no attribute group. The checks
       ; above expect a buffer_store_dwordx4 spanning the registers holding 9
       ; and -12, then a separate buffer_store_dword of 11.
 536   store i32 9, i32 addrspace(1)* %out, align 4
 537   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 538   store i32 12, i32 addrspace(1)* %idx1, align 4
 539   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 540   store i32 16, i32 addrspace(1)* %idx2, align 4
 541   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 542   store i32 -12, i32 addrspace(1)* %idx3, align 4
 543   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 544   store i32 11, i32 addrspace(1)* %idx4, align 4
 545   ret void
 546 }
 547
 548 ; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
 549 ; GCN: buffer_store_dwordx4
 550 ; GCN: buffer_store_dwordx2
 551 define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
       ; Six constant i32 stores to elements 0..5 in ascending order, each
       ; with align 4. This kernel carries no attribute group. The checks
       ; above expect a buffer_store_dwordx4 followed by a
       ; buffer_store_dwordx2.
 552   store i32 13, i32 addrspace(1)* %out, align 4
 553   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 554   store i32 15, i32 addrspace(1)* %idx1, align 4
 555   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 556   store i32 62, i32 addrspace(1)* %idx2, align 4
 557   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 558   store i32 63, i32 addrspace(1)* %idx3, align 4
 559   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 560   store i32 11, i32 addrspace(1)* %idx4, align 4
 561   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 562   store i32 123, i32 addrspace(1)* %idx5, align 4
 563   ret void
 564 }
 565
 566 ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
 567 ; GCN: buffer_store_dwordx4
 568 ; SI-DAG: buffer_store_dwordx2
 569 ; SI-DAG: buffer_store_dword v
 570 ; CI: buffer_store_dwordx3
 571 define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
       ; Seven constant i32 stores to elements 0..6 in ascending order, each
       ; with align 4. This kernel carries no attribute group. The checks
       ; above expect a buffer_store_dwordx4, plus dwordx2 + dword stores on
       ; the SI run or one dwordx3 store on the CI run.
 572   store i32 34, i32 addrspace(1)* %out, align 4
 573   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 574   store i32 999, i32 addrspace(1)* %idx1, align 4
 575   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 576   store i32 65, i32 addrspace(1)* %idx2, align 4
 577   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 578   store i32 33, i32 addrspace(1)* %idx3, align 4
 579   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 580   store i32 98, i32 addrspace(1)* %idx4, align 4
 581   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 582   store i32 91, i32 addrspace(1)* %idx5, align 4
 583   %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
 584   store i32 212, i32 addrspace(1)* %idx6, align 4
 585   ret void
 586 }
 587
 588 ; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
 589 ; GCN: buffer_store_dwordx4
 590 ; GCN: buffer_store_dwordx4
 591 ; GCN: s_endpgm
 592 define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
       ; Eight constant i32 stores to elements 0..7 in ascending order, each
       ; with align 4. This kernel carries no attribute group. The checks
       ; above expect two buffer_store_dwordx4 instructions.
 593   store i32 34, i32 addrspace(1)* %out, align 4
 594   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 595   store i32 999, i32 addrspace(1)* %idx1, align 4
 596   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 597   store i32 65, i32 addrspace(1)* %idx2, align 4
 598   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 599   store i32 33, i32 addrspace(1)* %idx3, align 4
 600   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 601   store i32 98, i32 addrspace(1)* %idx4, align 4
 602   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 603   store i32 91, i32 addrspace(1)* %idx5, align 4
 604   %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
 605   store i32 212, i32 addrspace(1)* %idx6, align 4
 606   %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
 607   store i32 999, i32 addrspace(1)* %idx7, align 4
 608   ret void
 609 }
 610
 611 ; This requires handling of scalar_to_vector for v2i64 to avoid
 612 ; scratch usage.
 613 ; FIXME: Should do single load and store
 614
 615 ; GCN-LABEL: {{^}}copy_v3i32_align4:
 616 ; GCN-NOT: SCRATCH_RSRC_DWORD
 617 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 618 ; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 619 ; GCN-NOT: offen
 620 ; GCN: s_waitcnt vmcnt
 621 ; GCN-NOT: offen
 622 ; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 623 ; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 624 ; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 625
 626 ; GCN: ScratchSize: 0{{$}}
 627 define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
       ; Copies one <3 x i32> vector from %in to %out; both pointers are
       ; noalias. The load is only align 4, and the store has no explicit
       ; alignment. The checks above require no scratch usage and no offen
       ; addressing.
 628   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
 629   store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
 630   ret void
 631 }
 632
 633 ; GCN-LABEL: {{^}}copy_v3i64_align4:
 634 ; GCN-NOT: SCRATCH_RSRC_DWORD
 635 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 636 ; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
 637 ; GCN-NOT: offen
 638 ; GCN: s_waitcnt vmcnt
 639 ; GCN-NOT: offen
 640 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 641 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
 642 ; GCN: ScratchSize: 0{{$}}
 643 define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
       ; Copies one <3 x i64> vector from %in to %out; both pointers are
       ; noalias. The load is only align 4, and the store has no explicit
       ; alignment. The checks above expect a dwordx4 + dwordx2 (offset:16)
       ; pair for both the load and the store, with no scratch usage.
 644   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
 645   store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
 646   ret void
 647 }
 648
 649 ; GCN-LABEL: {{^}}copy_v3f32_align4:
 650 ; GCN-NOT: SCRATCH_RSRC_DWORD
 651 ; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 652 ; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 653 ; GCN-NOT: offen
 654 ; GCN: s_waitcnt vmcnt
 655 ; GCN-NOT: offen
 656 ; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 657 ; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 658 ; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 659 ; GCN: ScratchSize: 0{{$}}
 660 define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
       ; Loads one <3 x float> vector from %in with align 4, adds the
       ; constant vector <1.0, 2.0, 4.0>, and stores the sum to %out.
       ; Despite the name, this is not a plain copy. The checks above require
       ; no scratch usage and no offen addressing.
 661   %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
 662   %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
 663   store <3 x float> %fadd, <3 x float> addrspace(1)* %out
 664   ret void
 665 }
 666
 667 ; GCN-LABEL: {{^}}copy_v3f64_align4:
 668 ; GCN-NOT: SCRATCH_RSRC_DWORD
 669 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 670 ; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
 671 ; GCN-NOT: offen
 672 ; GCN: s_waitcnt vmcnt
 673 ; GCN-NOT: offen
 674 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 675 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
 676 ; GCN: ScratchSize: 0{{$}}
 677 define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
       ; Loads one <3 x double> vector from %in with align 4, adds the
       ; constant vector <1.0, 2.0, 4.0>, and stores the sum to %out.
       ; Despite the name, this is not a plain copy. The checks above expect
       ; a dwordx4 + dwordx2 (offset:16) pair for both the load and the
       ; store, with no scratch usage.
 678   %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
 679   %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
 680   store <3 x double> %fadd, <3 x double> addrspace(1)* %out
 681   ret void
 682 }
 683
 684 declare void @llvm.amdgcn.s.barrier() #1 ; barrier intrinsic called by the inverse_i32 and four-load shuffle_i32 tests
 685
 686 attributes #0 = { nounwind } ; attached to every kernel except the 5/6/7/8-constant tests
 687 attributes #1 = { convergent nounwind } ; attached to the barrier declaration and its call sites