test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll

   1 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s
   2 ; Copy of test/CodeGen/AMDGPU/merge-stores.ll with some additions
   3
     ; Per the data layout string below, addrspace(1) pointers are 64 bits wide
     ; (p1:64:64) and addrspace(3) pointers are 32 bits wide (p3:32:32).
   4 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
   5
   6 ; TODO: Vector element tests
   7 ; TODO: Non-zero base offset for load and store combinations
   8 ; TODO: Same base addrspacecasted
   9
  10
  11 ; CHECK-LABEL: @merge_global_store_2_constants_i8(
  12 ; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(1)* %{{[0-9]+}}, align 2
     ; Two constant i8 stores to addrspace(1): %out[1] first, then %out[0].
     ; Only the %out[0] store has an explicit alignment (2). One <2 x i8> store
     ; with align 2 is expected; the i8 literal 456 appears there as -56.
  13 define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
  14   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  15
  16   store i8 123, i8 addrspace(1)* %out.gep.1
  17   store i8 456, i8 addrspace(1)* %out, align 2
  18   ret void
  19 }
  20
  21 ; CHECK-LABEL: @merge_global_store_2_constants_i8_natural_align
  22 ; CHECK: store <2 x i8>
     ; Two constant i8 stores to %out[1] then %out[0], neither with an explicit
     ; alignment. A <2 x i8> store is expected.
  23 define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
  24   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  25
  26   store i8 123, i8 addrspace(1)* %out.gep.1
  27   store i8 456, i8 addrspace(1)* %out
  28   ret void
  29 }
  30
  31 ; CHECK-LABEL: @merge_global_store_2_constants_i16
  32 ; CHECK: store <2 x i16> <i16 456, i16 123>, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4
     ; Two constant i16 stores to %out[1] then %out[0]; the %out[0] store is
     ; marked align 4. One <2 x i16> store with align 4 is expected, with the
     ; elements in address order (456 for %out[0], 123 for %out[1]).
  33 define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
  34   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  35
  36   store i16 123, i16 addrspace(1)* %out.gep.1
  37   store i16 456, i16 addrspace(1)* %out, align 4
  38   ret void
  39 }
  40
  41 ; CHECK-LABEL: @merge_global_store_2_constants_0_i16
  42 ; CHECK: store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4
     ; Two i16 stores of the constant 0 to %out[1] then %out[0] (align 4 on the
     ; latter). The expected merged value is printed as zeroinitializer.
  43 define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
  44   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  45
  46   store i16 0, i16 addrspace(1)* %out.gep.1
  47   store i16 0, i16 addrspace(1)* %out, align 4
  48   ret void
  49 }
  50
  51 ; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align
  52 ; CHECK: store <2 x i16>
     ; Two constant i16 stores to %out[1] then %out[0] with no explicit
     ; alignment on either. A <2 x i16> store is expected.
  53 define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
  54   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  55
  56   store i16 123, i16 addrspace(1)* %out.gep.1
  57   store i16 456, i16 addrspace(1)* %out
  58   ret void
  59 }
  60
  61 ; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align
  62 ; CHECK: store <2 x half>
     ; Two constant half stores to %out[1] then %out[0] with no explicit
     ; alignment on either. A <2 x half> store is expected.
  63 define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 {
  64   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
  65
  66   store half 2.0, half addrspace(1)* %out.gep.1
  67   store half 1.0, half addrspace(1)* %out
  68   ret void
  69 }
  70
  71 ; CHECK-LABEL: @merge_global_store_2_constants_i32
  72 ; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
     ; Two constant i32 stores to %out[1] then %out[0], no explicit alignment.
     ; One <2 x i32> store with align 4 is expected, elements in address order.
  73 define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
  74   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  75
  76   store i32 123, i32 addrspace(1)* %out.gep.1
  77   store i32 456, i32 addrspace(1)* %out
  78   ret void
  79 }
  80
  81 ; CHECK-LABEL: @merge_global_store_2_constants_i32_f32
  82 ; CHECK: store <2 x i32> <i32 456, i32 1065353216>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
     ; Mixed element types: float 1.0 is stored through a bitcast of %out[1],
     ; then i32 456 to %out[0]. The expected merged store is a <2 x i32> in
     ; which the float constant appears as the integer 1065353216.
  83 define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
  84   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  85   %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
  86   store float 1.0, float addrspace(1)* %out.gep.1.bc
  87   store i32 456, i32 addrspace(1)* %out
  88   ret void
  89 }
  90
  91 ; CHECK-LABEL: @merge_global_store_2_constants_f32_i32
  92 ; CHECK: store <2 x i32> <i32 1082130432, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
     ; Mixed element types: i32 123 is stored through a bitcast of %out[1],
     ; then float 4.0 to %out[0]. As in the other mixed i32/float tests in this
     ; file, the merged store is expected in integer form, so 4.0 appears as
     ; its bit pattern 1082130432 (0x40800000).
  93 define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
  94   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  95   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  96   store i32 123, i32 addrspace(1)* %out.gep.1.bc
  97   store float 4.0, float addrspace(1)* %out
  98   ret void
  99 }
 100
 101 ; CHECK-LABEL: @merge_global_store_4_constants_i32
 102 ; CHECK: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
     ; Four constant i32 stores to %out[1], %out[2], %out[3] and finally
     ; %out[0]. One <4 x i32> store is expected, elements in address order.
 103 define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
 104   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 105   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 106   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 107
 108   store i32 123, i32 addrspace(1)* %out.gep.1
 109   store i32 456, i32 addrspace(1)* %out.gep.2
 110   store i32 333, i32 addrspace(1)* %out.gep.3
 111   store i32 1234, i32 addrspace(1)* %out
 112   ret void
 113 }
 114
 115 ; CHECK-LABEL: @merge_global_store_4_constants_f32_order
 116 ; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}
     ; Four constant float stores issued in increasing address order
     ; (%out[0] through %out[3]). One <4 x float> store is expected.
 117 define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
 118   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 119   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 120   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 121
 122   store float 8.0, float addrspace(1)* %out
 123   store float 1.0, float addrspace(1)* %out.gep.1
 124   store float 2.0, float addrspace(1)* %out.gep.2
 125   store float 4.0, float addrspace(1)* %out.gep.3
 126   ret void
 127 }
 128
 129 ; First store is out of order.
 130 ; CHECK-LABEL: @merge_global_store_4_constants_f32
 131 ; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}, align 4
     ; The store to %out[0] is issued last, after %out[1..3]. The expected
     ; <4 x float> store still lists the elements in address order.
 132 define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
 133   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 134   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 135   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 136
 137   store float 1.0, float addrspace(1)* %out.gep.1
 138   store float 2.0, float addrspace(1)* %out.gep.2
 139   store float 4.0, float addrspace(1)* %out.gep.3
 140   store float 8.0, float addrspace(1)* %out
 141   ret void
 142 }
 143
 144 ; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32
 145 ; CHECK: store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
     ; Alternating element types: float stores to %out[0] and %out[2], i32
     ; stores through bitcasts of %out[1] and %out[3]. The expected result is a
     ; single <4 x i32> store in which the float constants 8.0 and 2.0 appear
     ; as the integers 1090519040 and 1073741824.
 146 define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
 147   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 148   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 149   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 150
 151   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
 152   %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*
 153
 154   store i32 11, i32 addrspace(1)* %out.gep.1.bc
 155   store float 2.0, float addrspace(1)* %out.gep.2
 156   store i32 17, i32 addrspace(1)* %out.gep.3.bc
 157   store float 8.0, float addrspace(1)* %out
 158   ret void
 159 }
 160
 161 ; CHECK-LABEL: @merge_global_store_3_constants_i32
 162 ; CHECK: store <3 x i32> <i32 1234, i32 123, i32 456>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4
     ; Three constant i32 stores to %out[1], %out[2], then %out[0]. A single
     ; three-element vector store (<3 x i32>) is expected.
 163 define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
 164   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 165   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 166
 167   store i32 123, i32 addrspace(1)* %out.gep.1
 168   store i32 456, i32 addrspace(1)* %out.gep.2
 169   store i32 1234, i32 addrspace(1)* %out
 170   ret void
 171 }
 172
 173 ; CHECK-LABEL: @merge_global_store_2_constants_i64
 174 ; CHECK: store <2 x i64> <i64 456, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
     ; Two constant i64 stores to %out[1] then %out[0], no explicit alignment.
     ; One <2 x i64> store with align 8 is expected.
 175 define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
 176   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
 177
 178   store i64 123, i64 addrspace(1)* %out.gep.1
 179   store i64 456, i64 addrspace(1)* %out
 180   ret void
 181 }
 182
 183 ; CHECK-LABEL: @merge_global_store_4_constants_i64
 184 ; CHECK: store <2 x i64> <i64 456, i64 333>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
 185 ; CHECK: store <2 x i64> <i64 1234, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
     ; Four constant i64 stores to %out[1], %out[2], %out[3], then %out[0].
     ; Two <2 x i64> stores are expected rather than one four-element store:
     ; first the values for %out[2..3], then the values for %out[0..1].
 186 define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
 187   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
 188   %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
 189   %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
 190
 191   store i64 123, i64 addrspace(1)* %out.gep.1
 192   store i64 456, i64 addrspace(1)* %out.gep.2
 193   store i64 333, i64 addrspace(1)* %out.gep.3
 194   store i64 1234, i64 addrspace(1)* %out
 195   ret void
 196 }
 197
 198 ; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32
 199 ; CHECK: [[LOAD:%[^ ]+]] = load <2 x i32>
 200 ; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 0
 201 ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 1
 202 ; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT0]], i32 0
 203 ; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT1]], i32 1
 204 ; CHECK: store <2 x i32> [[INSERT1]]
     ; Copies %in[0..1] to %out[0..1] with scalar i32 loads and stores. The
     ; expected output is one <2 x i32> load, per-lane extracts, inserts that
     ; rebuild the vector in the same lane order, and one <2 x i32> store.
 205 define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 206   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 207   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 208
 209   %lo = load i32, i32 addrspace(1)* %in
 210   %hi = load i32, i32 addrspace(1)* %in.gep.1
 211
 212   store i32 %lo, i32 addrspace(1)* %out
 213   store i32 %hi, i32 addrspace(1)* %out.gep.1
 214   ret void
 215 }
 216
 217 ; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32_nonzero_base
 218 ; CHECK: extractelement
 219 ; CHECK: extractelement
 220 ; CHECK: insertelement
 221 ; CHECK: insertelement
 222 ; CHECK: store <2 x i32>
     ; Copies %in[2..3] to %out[2..3], so both the load and the store chains
     ; start at a non-zero element offset from their base pointers. Two
     ; extracts, two inserts and a <2 x i32> store are expected.
 223 define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 224   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 225   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 226
 227   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 228   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 229   %lo = load i32, i32 addrspace(1)* %in.gep.0
 230   %hi = load i32, i32 addrspace(1)* %in.gep.1
 231
 232   store i32 %lo, i32 addrspace(1)* %out.gep.0
 233   store i32 %hi, i32 addrspace(1)* %out.gep.1
 234   ret void
 235 }
 236
 237 ; CHECK-LABEL: @merge_global_store_2_adjacent_loads_shuffle_i32
 238 ; CHECK: [[LOAD:%[^ ]+]] = load <2 x i32>
 239 ; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 0
 240 ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 1
 241 ; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT1]], i32 0
 242 ; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT0]], i32 1
 243 ; CHECK: store <2 x i32> [[INSERT1]]
     ; Loads %in[0..1] and stores them swapped: %hi goes to %out[0] and %lo to
     ; %out[1]. The expected inserts therefore put lane 1 of the loaded vector
     ; into lane 0 of the stored vector and lane 0 into lane 1.
 244 define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 245   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 246   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 247
 248   %lo = load i32, i32 addrspace(1)* %in
 249   %hi = load i32, i32 addrspace(1)* %in.gep.1
 250
 251   store i32 %hi, i32 addrspace(1)* %out
 252   store i32 %lo, i32 addrspace(1)* %out.gep.1
 253   ret void
 254 }
 255
 256 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32
 257 ; CHECK: load <4 x i32>
 258 ; CHECK: store <4 x i32>
     ; Copies %in[0..3] to %out[0..3] with scalar i32 loads and stores in
     ; increasing address order. One <4 x i32> load and one <4 x i32> store are
     ; expected.
 259 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 260   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 261   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 262   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 263   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 264   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 265   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 266
 267   %x = load i32, i32 addrspace(1)* %in
 268   %y = load i32, i32 addrspace(1)* %in.gep.1
 269   %z = load i32, i32 addrspace(1)* %in.gep.2
 270   %w = load i32, i32 addrspace(1)* %in.gep.3
 271
 272   store i32 %x, i32 addrspace(1)* %out
 273   store i32 %y, i32 addrspace(1)* %out.gep.1
 274   store i32 %z, i32 addrspace(1)* %out.gep.2
 275   store i32 %w, i32 addrspace(1)* %out.gep.3
 276   ret void
 277 }
 278
 279 ; CHECK-LABEL: @merge_global_store_3_adjacent_loads_i32
 280 ; CHECK: load <3 x i32>
 281 ; CHECK: store <3 x i32>
     ; Copies %in[0..2] to %out[0..2] with scalar i32 loads and stores. A
     ; three-element vector load and store (<3 x i32>) are expected.
 282 define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 283   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 284   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 285   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 286   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 287
 288   %x = load i32, i32 addrspace(1)* %in
 289   %y = load i32, i32 addrspace(1)* %in.gep.1
 290   %z = load i32, i32 addrspace(1)* %in.gep.2
 291
 292   store i32 %x, i32 addrspace(1)* %out
 293   store i32 %y, i32 addrspace(1)* %out.gep.1
 294   store i32 %z, i32 addrspace(1)* %out.gep.2
 295   ret void
 296 }
 297
 298 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_f32
 299 ; CHECK: load <4 x float>
 300 ; CHECK: store <4 x float>
     ; Copies %in[0..3] to %out[0..3] with scalar float loads and stores. One
     ; <4 x float> load and one <4 x float> store are expected.
 301 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 302   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 303   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 304   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 305   %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
 306   %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
 307   %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
 308
 309   %x = load float, float addrspace(1)* %in
 310   %y = load float, float addrspace(1)* %in.gep.1
 311   %z = load float, float addrspace(1)* %in.gep.2
 312   %w = load float, float addrspace(1)* %in.gep.3
 313
 314   store float %x, float addrspace(1)* %out
 315   store float %y, float addrspace(1)* %out.gep.1
 316   store float %z, float addrspace(1)* %out.gep.2
 317   store float %w, float addrspace(1)* %out.gep.3
 318   ret void
 319 }
 320
 321 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32_nonzero_base
 322 ; CHECK: load <4 x i32>
 323 ; CHECK: store <4 x i32>
     ; Copies %in[11..14] to %out[7..10]; the load and store chains start at
     ; different non-zero element offsets. One <4 x i32> load and one
     ; <4 x i32> store are expected.
 324 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 325   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
 326   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
 327   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
 328   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
 329   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
 330   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
 331   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
 332   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
 333
 334   %x = load i32, i32 addrspace(1)* %in.gep.0
 335   %y = load i32, i32 addrspace(1)* %in.gep.1
 336   %z = load i32, i32 addrspace(1)* %in.gep.2
 337   %w = load i32, i32 addrspace(1)* %in.gep.3
 338
 339   store i32 %x, i32 addrspace(1)* %out.gep.0
 340   store i32 %y, i32 addrspace(1)* %out.gep.1
 341   store i32 %z, i32 addrspace(1)* %out.gep.2
 342   store i32 %w, i32 addrspace(1)* %out.gep.3
 343   ret void
 344 }
 345
 346 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_inverse_i32
 347 ; CHECK: load <4 x i32>
 348 ; CHECK: store <4 x i32>
     ; Loads %in[0..3] in increasing address order, calls the barrier
     ; intrinsic, then stores each value to the matching %out element in
     ; decreasing address order (%out[3] first). A <4 x i32> load and a
     ; <4 x i32> store are still expected.
 349 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 350   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 351   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 352   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 353   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 354   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 355   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 356
 357   %x = load i32, i32 addrspace(1)* %in
 358   %y = load i32, i32 addrspace(1)* %in.gep.1
 359   %z = load i32, i32 addrspace(1)* %in.gep.2
 360   %w = load i32, i32 addrspace(1)* %in.gep.3
 361
 362   ; Make sure the barrier doesn't stop this
 363   tail call void @llvm.amdgcn.s.barrier() #1
 364
 365   store i32 %w, i32 addrspace(1)* %out.gep.3
 366   store i32 %z, i32 addrspace(1)* %out.gep.2
 367   store i32 %y, i32 addrspace(1)* %out.gep.1
 368   store i32 %x, i32 addrspace(1)* %out
 369
 370   ret void
 371 }
 372
 373 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_shuffle_i32
 374 ; CHECK: load <4 x i32>
 375 ; CHECK: store <4 x i32>
     ; Loads %in[0..3], calls the barrier intrinsic, then stores the values
     ; reversed: %w (from %in[3]) goes to %out[0] and %x (from %in[0]) to
     ; %out[3]. A <4 x i32> load and a <4 x i32> store are expected.
 376 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 377   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 378   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 379   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 380   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 381   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 382   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 383
 384   %x = load i32, i32 addrspace(1)* %in
 385   %y = load i32, i32 addrspace(1)* %in.gep.1
 386   %z = load i32, i32 addrspace(1)* %in.gep.2
 387   %w = load i32, i32 addrspace(1)* %in.gep.3
 388
 389   ; Make sure the barrier doesn't stop this
 390   tail call void @llvm.amdgcn.s.barrier() #1
 391
 392   store i32 %w, i32 addrspace(1)* %out
 393   store i32 %z, i32 addrspace(1)* %out.gep.1
 394   store i32 %y, i32 addrspace(1)* %out.gep.2
 395   store i32 %x, i32 addrspace(1)* %out.gep.3
 396
 397   ret void
 398 }
 399
 400 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8
 401 ; CHECK: load <4 x i8>
 402 ; CHECK: extractelement <4 x i8>
 403 ; CHECK: extractelement <4 x i8>
 404 ; CHECK: extractelement <4 x i8>
 405 ; CHECK: extractelement <4 x i8>
 406 ; CHECK: insertelement <4 x i8>
 407 ; CHECK: insertelement <4 x i8>
 408 ; CHECK: insertelement <4 x i8>
 409 ; CHECK: insertelement <4 x i8>
 410 ; CHECK: store <4 x i8>
     ; Copies four adjacent i8 values from %in to %out. Only the first load
     ; and the first store carry an explicit alignment (4). Expected: one
     ; <4 x i8> load, four extracts, four inserts and one <4 x i8> store.
 411 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
 412   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
 413   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
 414   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
 415   %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
 416   %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
 417   %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
 418
 419   %x = load i8, i8 addrspace(1)* %in, align 4
 420   %y = load i8, i8 addrspace(1)* %in.gep.1
 421   %z = load i8, i8 addrspace(1)* %in.gep.2
 422   %w = load i8, i8 addrspace(1)* %in.gep.3
 423
 424   store i8 %x, i8 addrspace(1)* %out, align 4
 425   store i8 %y, i8 addrspace(1)* %out.gep.1
 426   store i8 %z, i8 addrspace(1)* %out.gep.2
 427   store i8 %w, i8 addrspace(1)* %out.gep.3
 428   ret void
 429 }
 430
 431 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8_natural_align
 432 ; CHECK: load <4 x i8>
 433 ; CHECK: store <4 x i8>
     ; Copies four adjacent i8 values from %in to %out with no explicit
     ; alignment on any access. A <4 x i8> load and a <4 x i8> store are
     ; expected.
 434 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
 435   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
 436   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
 437   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
 438   %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
 439   %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
 440   %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
 441
 442   %x = load i8, i8 addrspace(1)* %in
 443   %y = load i8, i8 addrspace(1)* %in.gep.1
 444   %z = load i8, i8 addrspace(1)* %in.gep.2
 445   %w = load i8, i8 addrspace(1)* %in.gep.3
 446
 447   store i8 %x, i8 addrspace(1)* %out
 448   store i8 %y, i8 addrspace(1)* %out.gep.1
 449   store i8 %z, i8 addrspace(1)* %out.gep.2
 450   store i8 %w, i8 addrspace(1)* %out.gep.3
 451   ret void
 452 }
 453
 454 ; CHECK-LABEL: @merge_global_store_4_vector_elts_loads_v4i32
 455 ; CHECK: load <4 x i32>
 456 ; CHECK: store <4 x i32>
     ; The input already loads a whole <4 x i32> from %in; its four lanes are
     ; extracted and stored as scalars to %out[0..3]. A <4 x i32> store is
     ; expected alongside the vector load.
 457 define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
 458   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 459   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 460   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 461   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
 462
 463   %x = extractelement <4 x i32> %vec, i32 0
 464   %y = extractelement <4 x i32> %vec, i32 1
 465   %z = extractelement <4 x i32> %vec, i32 2
 466   %w = extractelement <4 x i32> %vec, i32 3
 467
 468   store i32 %x, i32 addrspace(1)* %out
 469   store i32 %y, i32 addrspace(1)* %out.gep.1
 470   store i32 %z, i32 addrspace(1)* %out.gep.2
 471   store i32 %w, i32 addrspace(1)* %out.gep.3
 472   ret void
 473 }
 474
 475 ; CHECK-LABEL: @merge_local_store_2_constants_i8
 476 ; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(3)* %{{[0-9]+}}, align 2
     ; Two constant i8 stores to addrspace(3): %out[1] first, then %out[0]
     ; with align 2. One <2 x i8> store with align 2 is expected; the i8
     ; literal 456 appears there as -56.
 477 define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
 478   %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
 479
 480   store i8 123, i8 addrspace(3)* %out.gep.1
 481   store i8 456, i8 addrspace(3)* %out, align 2
 482   ret void
 483 }
 484
 485 ; CHECK-LABEL: @merge_local_store_2_constants_i32
 486 ; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4
     ; Two constant i32 stores to addrspace(3), %out[1] then %out[0], with no
     ; explicit alignment. One <2 x i32> store with align 4 is expected.
 487 define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
 488   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 489
 490   store i32 123, i32 addrspace(3)* %out.gep.1
 491   store i32 456, i32 addrspace(3)* %out
 492   ret void
 493 }
 494
 495 ; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2
 496 ; CHECK: store i32
 497 ; CHECK: store i32
     ; Both i32 stores to addrspace(3) are marked align 2. The expected output
     ; still contains two scalar i32 stores, i.e. no vector store is formed.
 498 define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 {
 499   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 500
 501   store i32 123, i32 addrspace(3)* %out.gep.1, align 2
 502   store i32 456, i32 addrspace(3)* %out, align 2
 503   ret void
 504 }
 505
 506 ; CHECK-LABEL: @merge_local_store_4_constants_i32
 507 ; CHECK: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(3)*
     ; Four constant i32 stores to addrspace(3): %out[1], %out[2], %out[3],
     ; then %out[0]. One <4 x i32> store is expected, elements in address
     ; order.
 508 define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
 509   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 510   %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
 511   %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
 512
 513   store i32 123, i32 addrspace(3)* %out.gep.1
 514   store i32 456, i32 addrspace(3)* %out.gep.2
 515   store i32 333, i32 addrspace(3)* %out.gep.3
 516   store i32 1234, i32 addrspace(3)* %out
 517   ret void
 518 }
 519
 520 ; CHECK-LABEL: @merge_global_store_5_constants_i32
 521 ; CHECK: store <4 x i32> <i32 9, i32 12, i32 16, i32 -12>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 522 ; CHECK: store i32
     ; Five adjacent constant i32 stores (%out[0..4]), each align 4, with the
     ; address computations interleaved between the stores. Expected: one
     ; <4 x i32> store for the first four values plus one remaining scalar
     ; i32 store.
 523 define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
 524   store i32 9, i32 addrspace(1)* %out, align 4
 525   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 526   store i32 12, i32 addrspace(1)* %idx1, align 4
 527   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 528   store i32 16, i32 addrspace(1)* %idx2, align 4
 529   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 530   store i32 -12, i32 addrspace(1)* %idx3, align 4
 531   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 532   store i32 11, i32 addrspace(1)* %idx4, align 4
 533   ret void
 534 }
 535
 536 ; CHECK-LABEL: @merge_global_store_6_constants_i32
 537 ; CHECK: store <4 x i32> <i32 13, i32 15, i32 62, i32 63>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 538 ; CHECK: store <2 x i32> <i32 11, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
     ; Six adjacent constant i32 stores (%out[0..5]), each align 4. Expected:
     ; a <4 x i32> store for the first four values followed by a <2 x i32>
     ; store for the last two.
 539 define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
 540   store i32 13, i32 addrspace(1)* %out, align 4
 541   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 542   store i32 15, i32 addrspace(1)* %idx1, align 4
 543   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 544   store i32 62, i32 addrspace(1)* %idx2, align 4
 545   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 546   store i32 63, i32 addrspace(1)* %idx3, align 4
 547   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 548   store i32 11, i32 addrspace(1)* %idx4, align 4
 549   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 550   store i32 123, i32 addrspace(1)* %idx5, align 4
 551   ret void
 552 }
 553
 554 ; CHECK-LABEL: @merge_global_store_7_constants_i32
 555 ; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 556 ; CHECK: store <3 x i32> <i32 98, i32 91, i32 212>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4
     ; Seven adjacent constant i32 stores (%out[0..6]), each align 4.
     ; Expected: a <4 x i32> store for the first four values followed by a
     ; <3 x i32> store for the remaining three.
 557 define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
 558   store i32 34, i32 addrspace(1)* %out, align 4
 559   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 560   store i32 999, i32 addrspace(1)* %idx1, align 4
 561   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 562   store i32 65, i32 addrspace(1)* %idx2, align 4
 563   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 564   store i32 33, i32 addrspace(1)* %idx3, align 4
 565   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 566   store i32 98, i32 addrspace(1)* %idx4, align 4
 567   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 568   store i32 91, i32 addrspace(1)* %idx5, align 4
 569   %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
 570   store i32 212, i32 addrspace(1)* %idx6, align 4
 571   ret void
 572 }
 573
 574 ; CHECK-LABEL: @merge_global_store_8_constants_i32
 575 ; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 576 ; CHECK: store <4 x i32> <i32 98, i32 91, i32 212, i32 999>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
     ; Eight adjacent constant i32 stores (%out[0..7]), each align 4.
     ; Expected: two <4 x i32> stores, one per group of four values.
 577 define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
 578   store i32 34, i32 addrspace(1)* %out, align 4
 579   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 580   store i32 999, i32 addrspace(1)* %idx1, align 4
 581   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 582   store i32 65, i32 addrspace(1)* %idx2, align 4
 583   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 584   store i32 33, i32 addrspace(1)* %idx3, align 4
 585   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 586   store i32 98, i32 addrspace(1)* %idx4, align 4
 587   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 588   store i32 91, i32 addrspace(1)* %idx5, align 4
 589   %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
 590   store i32 212, i32 addrspace(1)* %idx6, align 4
 591   %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
 592   store i32 999, i32 addrspace(1)* %idx7, align 4
 593   ret void
 594 }
 595
 596 ; CHECK-LABEL: @copy_v3i32_align4
 597 ; CHECK: %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
 598 ; CHECK: store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
     ; Copies one <3 x i32> from %in (loaded with align 4) to %out; both
     ; pointers are noalias. The expected output keeps the load and store as
     ; written, under the same value name %vec.
 599 define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
 600   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
 601   store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
 602   ret void
 603 }
 604
 605 ; CHECK-LABEL: @copy_v3i64_align4
 606 ; CHECK: %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
 607 ; CHECK: store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
     ; Copies one <3 x i64> from %in (loaded with align 4) to %out; both
     ; pointers are noalias. The expected output keeps the load and store as
     ; written.
 608 define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
 609   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
 610   store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
 611   ret void
 612 }
 613
 614 ; CHECK-LABEL: @copy_v3f32_align4
 615 ; CHECK: %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
 616 ; CHECK: store <3 x float>
     ; Loads a <3 x float> from %in with align 4, adds the constant vector
     ; <1.0, 2.0, 4.0>, and stores the sum to %out. The expected output keeps
     ; the vector load as written and still has a <3 x float> store.
 617 define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
 618   %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
 619   %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
 620   store <3 x float> %fadd, <3 x float> addrspace(1)* %out
 621   ret void
 622 }
 623
 624 ; CHECK-LABEL: @copy_v3f64_align4
 625 ; CHECK: %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
 626 ; CHECK: store <3 x double> %fadd, <3 x double> addrspace(1)* %out
     ; Loads a <3 x double> from %in with align 4, adds the constant vector
     ; <1.0, 2.0, 4.0>, and stores the sum to %out. The expected output keeps
     ; both the load and the store of %fadd as written.
 627 define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
 628   %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
 629   %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
 630   store <3 x double> %fadd, <3 x double> addrspace(1)* %out
 631   ret void
 632 }
 633
 634 ; Verify that we no longer hit asserts for this test case. No change expected.
 635 ; CHECK-LABEL: @copy_vec_of_ptrs
 636 ; CHECK: %in.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %in, i32 1
 637 ; CHECK: %vec1 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in.gep.1
 638 ; CHECK: %vec2 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in, align 4
 639 ; CHECK: %out.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %out, i32 1
 640 ; CHECK: store <2 x i16*> %vec1, <2 x i16*> addrspace(1)* %out.gep.1
 641 ; CHECK: store <2 x i16*> %vec2, <2 x i16*> addrspace(1)* %out, align 4
     ; Copies two adjacent <2 x i16*> values (vectors of pointers) from %in to
     ; %out, accessing element 1 before element 0; only the element-0 load
     ; and store carry align 4. The expected output repeats every instruction
     ; of the input unchanged.
 642 define amdgpu_kernel void @copy_vec_of_ptrs(<2 x i16*> addrspace(1)* %out,
 643                                             <2 x i16*> addrspace(1)* %in ) #0 {
 644   %in.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %in, i32 1
 645   %vec1 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in.gep.1
 646   %vec2 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in, align 4
 647
 648   %out.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %out, i32 1
 649   store <2 x i16*> %vec1, <2 x i16*> addrspace(1)* %out.gep.1
 650   store <2 x i16*> %vec2, <2 x i16*> addrspace(1)* %out, align 4
 651   ret void
 652 }
 653
 654 declare void @llvm.amdgcn.s.barrier() #1
 655
     ; Attribute groups: #0 is attached to most kernel definitions above; #1 is
     ; attached to the barrier declaration above and to its call sites.
 656 attributes #0 = { nounwind }
 657 attributes #1 = { convergent nounwind }