llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll

   1 ; RUN: opt -mtriple=amdgcn-amd-amdhsa --mcpu=hawaii -load-store-vectorizer -S -o - %s | FileCheck %s
   2 ; Copy of test/CodeGen/AMDGPU/merge-stores.ll with some additions
   3
   4 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
   5
   6 ; TODO: Vector element tests
   7 ; TODO: Non-zero base offset for load and store combinations
   8 ; TODO: Same base addrspacecasted
   9
  10
  11 ; CHECK-LABEL: @merge_global_store_2_constants_i8(
  12 ; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(1)* %{{[0-9]+}}, align 2
  13 define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
  14   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  15
  16   store i8 123, i8 addrspace(1)* %out.gep.1
  17   store i8 456, i8 addrspace(1)* %out, align 2
  18   ret void
  19 }
  20
  21 ; CHECK-LABEL: @merge_global_store_2_constants_i8_natural_align
  22 ; CHECK: store <2 x i8>
  23 define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
  24   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
  25
  26   store i8 123, i8 addrspace(1)* %out.gep.1
  27   store i8 456, i8 addrspace(1)* %out
  28   ret void
  29 }
  30
  31 ; CHECK-LABEL: @merge_global_store_2_constants_i16
  32 ; CHECK: store <2 x i16> <i16 456, i16 123>, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4
  33 define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
  34   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  35
  36   store i16 123, i16 addrspace(1)* %out.gep.1
  37   store i16 456, i16 addrspace(1)* %out, align 4
  38   ret void
  39 }
  40
  41 ; CHECK-LABEL: @merge_global_store_2_constants_0_i16
  42 ; CHECK: store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4
  43 define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
  44   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  45
  46   store i16 0, i16 addrspace(1)* %out.gep.1
  47   store i16 0, i16 addrspace(1)* %out, align 4
  48   ret void
  49 }
  50
  51 ; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align
  52 ; CHECK: store i16
  53 ; CHECK: store i16
  54 define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
  55   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  56
  57   store i16 123, i16 addrspace(1)* %out.gep.1
  58   store i16 456, i16 addrspace(1)* %out
  59   ret void
  60 }
  61
  62 ; CHECK-LABEL: @merge_global_store_2_constants_i16_align_1
  63 ; CHECK: store <2 x i16>
  64 define amdgpu_kernel void @merge_global_store_2_constants_i16_align_1(i16 addrspace(1)* %out) #0 {
  65   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
  66
  67   store i16 123, i16 addrspace(1)* %out.gep.1, align 1
  68   store i16 456, i16 addrspace(1)* %out, align 1
  69   ret void
  70 }
  71
  72 ; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align
  73 ; CHECK: store half
  74 ; CHECK: store half
  75 define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 {
  76   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
  77
  78   store half 2.0, half addrspace(1)* %out.gep.1
  79   store half 1.0, half addrspace(1)* %out
  80   ret void
  81 }
  82
  83 ; CHECK-LABEL: @merge_global_store_2_constants_half_align_1
  84 ; CHECK: store <2 x half>
  85 define amdgpu_kernel void @merge_global_store_2_constants_half_align_1(half addrspace(1)* %out) #0 {
  86   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
  87
  88   store half 2.0, half addrspace(1)* %out.gep.1, align 1
  89   store half 1.0, half addrspace(1)* %out, align 1
  90   ret void
  91 }
  92
  93 ; CHECK-LABEL: @merge_global_store_2_constants_i32
  94 ; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
  95 define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
  96   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  97
  98   store i32 123, i32 addrspace(1)* %out.gep.1
  99   store i32 456, i32 addrspace(1)* %out
 100   ret void
 101 }
 102
 103 ; CHECK-LABEL: @merge_global_store_2_constants_i32_f32
 104 ; CHECK: store <2 x i32> <i32 456, i32 1065353216>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 105 define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
 106   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 107   %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
 108   store float 1.0, float addrspace(1)* %out.gep.1.bc
 109   store i32 456, i32 addrspace(1)* %out
 110   ret void
 111 }
 112
 113 ; CHECK-LABEL: @merge_global_store_2_constants_f32_i32
 114 ; CHECK: store <2 x i32> <i32 1082130432, i32 123>, <2 x i32> addrspace(1)*
 115 define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
 116   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 117   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
 118   store i32 123, i32 addrspace(1)* %out.gep.1.bc
 119   store float 4.0, float addrspace(1)* %out
 120   ret void
 121 }
 122
 123 ; CHECK-LABEL: @merge_global_store_4_constants_i32
 124 ; CHECK: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 125 define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
 126   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 127   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 128   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 129
 130   store i32 123, i32 addrspace(1)* %out.gep.1
 131   store i32 456, i32 addrspace(1)* %out.gep.2
 132   store i32 333, i32 addrspace(1)* %out.gep.3
 133   store i32 1234, i32 addrspace(1)* %out
 134   ret void
 135 }
 136
 137 ; CHECK-LABEL: @merge_global_store_4_constants_f32_order
 138 ; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}
 139 define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
 140   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 141   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 142   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 143
 144   store float 8.0, float addrspace(1)* %out
 145   store float 1.0, float addrspace(1)* %out.gep.1
 146   store float 2.0, float addrspace(1)* %out.gep.2
 147   store float 4.0, float addrspace(1)* %out.gep.3
 148   ret void
 149 }
 150
 151 ; First store is out of order.
 152 ; CHECK-LABEL: @merge_global_store_4_constants_f32
 153 ; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}, align 4
 154 define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
 155   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 156   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 157   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 158
 159   store float 1.0, float addrspace(1)* %out.gep.1
 160   store float 2.0, float addrspace(1)* %out.gep.2
 161   store float 4.0, float addrspace(1)* %out.gep.3
 162   store float 8.0, float addrspace(1)* %out
 163   ret void
 164 }
 165
 166 ; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32
 167 ; CHECK: store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 168 define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
 169   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 170   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 171   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 172
 173   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
 174   %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*
 175
 176   store i32 11, i32 addrspace(1)* %out.gep.1.bc
 177   store float 2.0, float addrspace(1)* %out.gep.2
 178   store i32 17, i32 addrspace(1)* %out.gep.3.bc
 179   store float 8.0, float addrspace(1)* %out
 180   ret void
 181 }
 182
 183 ; CHECK-LABEL: @merge_global_store_3_constants_i32
 184 ; CHECK: store <3 x i32> <i32 1234, i32 123, i32 456>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 185 define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
 186   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 187   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 188
 189   store i32 123, i32 addrspace(1)* %out.gep.1
 190   store i32 456, i32 addrspace(1)* %out.gep.2
 191   store i32 1234, i32 addrspace(1)* %out
 192   ret void
 193 }
 194
 195 ; CHECK-LABEL: @merge_global_store_2_constants_i64
 196 ; CHECK: store <2 x i64> <i64 456, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
 197 define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
 198   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
 199
 200   store i64 123, i64 addrspace(1)* %out.gep.1
 201   store i64 456, i64 addrspace(1)* %out
 202   ret void
 203 }
 204
 205 ; CHECK-LABEL: @merge_global_store_4_constants_i64
 206 ; CHECK: store <2 x i64> <i64 456, i64 333>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
 207 ; CHECK: store <2 x i64> <i64 1234, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
 208 define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
 209   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
 210   %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
 211   %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
 212
 213   store i64 123, i64 addrspace(1)* %out.gep.1
 214   store i64 456, i64 addrspace(1)* %out.gep.2
 215   store i64 333, i64 addrspace(1)* %out.gep.3
 216   store i64 1234, i64 addrspace(1)* %out
 217   ret void
 218 }
 219
 220 ; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32
 221 ; CHECK: [[LOAD:%[^ ]+]] = load <2 x i32>
 222 ; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 0
 223 ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 1
 224 ; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT0]], i32 0
 225 ; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT1]], i32 1
 226 ; CHECK: store <2 x i32> [[INSERT1]]
 227 define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 228   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 229   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 230
 231   %lo = load i32, i32 addrspace(1)* %in
 232   %hi = load i32, i32 addrspace(1)* %in.gep.1
 233
 234   store i32 %lo, i32 addrspace(1)* %out
 235   store i32 %hi, i32 addrspace(1)* %out.gep.1
 236   ret void
 237 }
 238
 239 ; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32_nonzero_base
 240 ; CHECK: extractelement
 241 ; CHECK: extractelement
 242 ; CHECK: insertelement
 243 ; CHECK: insertelement
 244 ; CHECK: store <2 x i32>
 245 define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 246   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 247   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 248
 249   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 250   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 251   %lo = load i32, i32 addrspace(1)* %in.gep.0
 252   %hi = load i32, i32 addrspace(1)* %in.gep.1
 253
 254   store i32 %lo, i32 addrspace(1)* %out.gep.0
 255   store i32 %hi, i32 addrspace(1)* %out.gep.1
 256   ret void
 257 }
 258
 259 ; CHECK-LABEL: @merge_global_store_2_adjacent_loads_shuffle_i32
 260 ; CHECK: [[LOAD:%[^ ]+]] = load <2 x i32>
 261 ; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 0
 262 ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 1
 263 ; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT1]], i32 0
 264 ; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT0]], i32 1
 265 ; CHECK: store <2 x i32> [[INSERT1]]
 266 define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 267   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 268   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 269
 270   %lo = load i32, i32 addrspace(1)* %in
 271   %hi = load i32, i32 addrspace(1)* %in.gep.1
 272
 273   store i32 %hi, i32 addrspace(1)* %out
 274   store i32 %lo, i32 addrspace(1)* %out.gep.1
 275   ret void
 276 }
 277
 278 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32
 279 ; CHECK: load <4 x i32>
 280 ; CHECK: store <4 x i32>
 281 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 282   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 283   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 284   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 285   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 286   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 287   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 288
 289   %x = load i32, i32 addrspace(1)* %in
 290   %y = load i32, i32 addrspace(1)* %in.gep.1
 291   %z = load i32, i32 addrspace(1)* %in.gep.2
 292   %w = load i32, i32 addrspace(1)* %in.gep.3
 293
 294   store i32 %x, i32 addrspace(1)* %out
 295   store i32 %y, i32 addrspace(1)* %out.gep.1
 296   store i32 %z, i32 addrspace(1)* %out.gep.2
 297   store i32 %w, i32 addrspace(1)* %out.gep.3
 298   ret void
 299 }
 300
 301 ; CHECK-LABEL: @merge_global_store_3_adjacent_loads_i32
 302 ; CHECK: load <3 x i32>
 303 ; CHECK: store <3 x i32>
 304 define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 305   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 306   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 307   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 308   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 309
 310   %x = load i32, i32 addrspace(1)* %in
 311   %y = load i32, i32 addrspace(1)* %in.gep.1
 312   %z = load i32, i32 addrspace(1)* %in.gep.2
 313
 314   store i32 %x, i32 addrspace(1)* %out
 315   store i32 %y, i32 addrspace(1)* %out.gep.1
 316   store i32 %z, i32 addrspace(1)* %out.gep.2
 317   ret void
 318 }
 319
 320 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_f32
 321 ; CHECK: load <4 x float>
 322 ; CHECK: store <4 x float>
 323 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 324   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
 325   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
 326   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
 327   %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
 328   %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
 329   %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
 330
 331   %x = load float, float addrspace(1)* %in
 332   %y = load float, float addrspace(1)* %in.gep.1
 333   %z = load float, float addrspace(1)* %in.gep.2
 334   %w = load float, float addrspace(1)* %in.gep.3
 335
 336   store float %x, float addrspace(1)* %out
 337   store float %y, float addrspace(1)* %out.gep.1
 338   store float %z, float addrspace(1)* %out.gep.2
 339   store float %w, float addrspace(1)* %out.gep.3
 340   ret void
 341 }
 342
 343 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32_nonzero_base
 344 ; CHECK: load <4 x i32>
 345 ; CHECK: store <4 x i32>
 346 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 347   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
 348   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
 349   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
 350   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
 351   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
 352   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
 353   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
 354   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
 355
 356   %x = load i32, i32 addrspace(1)* %in.gep.0
 357   %y = load i32, i32 addrspace(1)* %in.gep.1
 358   %z = load i32, i32 addrspace(1)* %in.gep.2
 359   %w = load i32, i32 addrspace(1)* %in.gep.3
 360
 361   store i32 %x, i32 addrspace(1)* %out.gep.0
 362   store i32 %y, i32 addrspace(1)* %out.gep.1
 363   store i32 %z, i32 addrspace(1)* %out.gep.2
 364   store i32 %w, i32 addrspace(1)* %out.gep.3
 365   ret void
 366 }
 367
 368 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_inverse_i32
 369 ; CHECK: load <4 x i32>
 370 ; CHECK: store <4 x i32>
 371 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 372   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 373   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 374   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 375   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 376   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 377   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 378
 379   %x = load i32, i32 addrspace(1)* %in
 380   %y = load i32, i32 addrspace(1)* %in.gep.1
 381   %z = load i32, i32 addrspace(1)* %in.gep.2
 382   %w = load i32, i32 addrspace(1)* %in.gep.3
 383
 384   ; Make sure the barrier doesn't stop this
 385   tail call void @llvm.amdgcn.s.barrier() #1
 386
 387   store i32 %w, i32 addrspace(1)* %out.gep.3
 388   store i32 %z, i32 addrspace(1)* %out.gep.2
 389   store i32 %y, i32 addrspace(1)* %out.gep.1
 390   store i32 %x, i32 addrspace(1)* %out
 391
 392   ret void
 393 }
 394
 395 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_shuffle_i32
 396 ; CHECK: load <4 x i32>
 397 ; CHECK: store <4 x i32>
 398 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 399   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 400   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 401   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 402   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 403   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
 404   %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 405
 406   %x = load i32, i32 addrspace(1)* %in
 407   %y = load i32, i32 addrspace(1)* %in.gep.1
 408   %z = load i32, i32 addrspace(1)* %in.gep.2
 409   %w = load i32, i32 addrspace(1)* %in.gep.3
 410
 411   ; Make sure the barrier doesn't stop this
 412   tail call void @llvm.amdgcn.s.barrier() #1
 413
 414   store i32 %w, i32 addrspace(1)* %out
 415   store i32 %z, i32 addrspace(1)* %out.gep.1
 416   store i32 %y, i32 addrspace(1)* %out.gep.2
 417   store i32 %x, i32 addrspace(1)* %out.gep.3
 418
 419   ret void
 420 }
 421
 422 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8
 423 ; CHECK: load <4 x i8>
 424 ; CHECK: extractelement <4 x i8>
 425 ; CHECK: extractelement <4 x i8>
 426 ; CHECK: extractelement <4 x i8>
 427 ; CHECK: extractelement <4 x i8>
 428 ; CHECK: insertelement <4 x i8>
 429 ; CHECK: insertelement <4 x i8>
 430 ; CHECK: insertelement <4 x i8>
 431 ; CHECK: insertelement <4 x i8>
 432 ; CHECK: store <4 x i8>
 433 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
 434   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
 435   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
 436   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
 437   %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
 438   %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
 439   %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
 440
 441   %x = load i8, i8 addrspace(1)* %in, align 4
 442   %y = load i8, i8 addrspace(1)* %in.gep.1
 443   %z = load i8, i8 addrspace(1)* %in.gep.2
 444   %w = load i8, i8 addrspace(1)* %in.gep.3
 445
 446   store i8 %x, i8 addrspace(1)* %out, align 4
 447   store i8 %y, i8 addrspace(1)* %out.gep.1
 448   store i8 %z, i8 addrspace(1)* %out.gep.2
 449   store i8 %w, i8 addrspace(1)* %out.gep.3
 450   ret void
 451 }
 452
 453 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8_natural_align
 454 ; CHECK: load <4 x i8>
 455 ; CHECK: store <4 x i8>
 456 define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
 457   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
 458   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
 459   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
 460   %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
 461   %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
 462   %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
 463
 464   %x = load i8, i8 addrspace(1)* %in
 465   %y = load i8, i8 addrspace(1)* %in.gep.1
 466   %z = load i8, i8 addrspace(1)* %in.gep.2
 467   %w = load i8, i8 addrspace(1)* %in.gep.3
 468
 469   store i8 %x, i8 addrspace(1)* %out
 470   store i8 %y, i8 addrspace(1)* %out.gep.1
 471   store i8 %z, i8 addrspace(1)* %out.gep.2
 472   store i8 %w, i8 addrspace(1)* %out.gep.3
 473   ret void
 474 }
 475
 476 ; CHECK-LABEL: @merge_global_store_4_vector_elts_loads_v4i32
 477 ; CHECK: load <4 x i32>
 478 ; CHECK: store <4 x i32>
 479 define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
 480   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 481   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 482   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
 483   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
 484
 485   %x = extractelement <4 x i32> %vec, i32 0
 486   %y = extractelement <4 x i32> %vec, i32 1
 487   %z = extractelement <4 x i32> %vec, i32 2
 488   %w = extractelement <4 x i32> %vec, i32 3
 489
 490   store i32 %x, i32 addrspace(1)* %out
 491   store i32 %y, i32 addrspace(1)* %out.gep.1
 492   store i32 %z, i32 addrspace(1)* %out.gep.2
 493   store i32 %w, i32 addrspace(1)* %out.gep.3
 494   ret void
 495 }
 496
 497 ; CHECK-LABEL: @merge_local_store_2_constants_i8
 498 ; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(3)* %{{[0-9]+}}, align 2
 499 define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
 500   %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
 501
 502   store i8 123, i8 addrspace(3)* %out.gep.1
 503   store i8 456, i8 addrspace(3)* %out, align 2
 504   ret void
 505 }
 506
 507 ; CHECK-LABEL: @merge_local_store_2_constants_i32
 508 ; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4
 509 define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
 510   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 511
 512   store i32 123, i32 addrspace(3)* %out.gep.1
 513   store i32 456, i32 addrspace(3)* %out
 514   ret void
 515 }
 516
 517 ; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2
 518 ; CHECK: store i32
 519 ; CHECK: store i32
 520 define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 {
 521   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 522
 523   store i32 123, i32 addrspace(3)* %out.gep.1, align 2
 524   store i32 456, i32 addrspace(3)* %out, align 2
 525   ret void
 526 }
 527
 528 ; CHECK-LABEL: @merge_local_store_4_constants_i32
 529 ; CHECK: store <2 x i32> <i32 456, i32 333>, <2 x i32> addrspace(3)* %1, align 4
 530 ; CHECK: store <2 x i32> <i32 1234, i32 123>, <2 x i32> addrspace(3)* %2, align 4
 531 define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
 532   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 533   %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
 534   %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
 535
 536   store i32 123, i32 addrspace(3)* %out.gep.1
 537   store i32 456, i32 addrspace(3)* %out.gep.2
 538   store i32 333, i32 addrspace(3)* %out.gep.3
 539   store i32 1234, i32 addrspace(3)* %out
 540   ret void
 541 }
 542
 543 ; CHECK-LABEL: @merge_global_store_5_constants_i32
 544 ; CHECK: store <4 x i32> <i32 9, i32 12, i32 16, i32 -12>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 545 ; CHECK: store i32
 546 define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
 547   store i32 9, i32 addrspace(1)* %out, align 4
 548   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 549   store i32 12, i32 addrspace(1)* %idx1, align 4
 550   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 551   store i32 16, i32 addrspace(1)* %idx2, align 4
 552   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 553   store i32 -12, i32 addrspace(1)* %idx3, align 4
 554   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 555   store i32 11, i32 addrspace(1)* %idx4, align 4
 556   ret void
 557 }
 558
 559 ; CHECK-LABEL: @merge_global_store_6_constants_i32
 560 ; CHECK: store <4 x i32> <i32 13, i32 15, i32 62, i32 63>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 561 ; CHECK: store <2 x i32> <i32 11, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 562 define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
 563   store i32 13, i32 addrspace(1)* %out, align 4
 564   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 565   store i32 15, i32 addrspace(1)* %idx1, align 4
 566   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 567   store i32 62, i32 addrspace(1)* %idx2, align 4
 568   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 569   store i32 63, i32 addrspace(1)* %idx3, align 4
 570   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 571   store i32 11, i32 addrspace(1)* %idx4, align 4
 572   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 573   store i32 123, i32 addrspace(1)* %idx5, align 4
 574   ret void
 575 }
 576
 577 ; CHECK-LABEL: @merge_global_store_7_constants_i32
 578 ; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 579 ; CHECK: store <3 x i32> <i32 98, i32 91, i32 212>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 580 define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
 581   store i32 34, i32 addrspace(1)* %out, align 4
 582   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 583   store i32 999, i32 addrspace(1)* %idx1, align 4
 584   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 585   store i32 65, i32 addrspace(1)* %idx2, align 4
 586   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 587   store i32 33, i32 addrspace(1)* %idx3, align 4
 588   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 589   store i32 98, i32 addrspace(1)* %idx4, align 4
 590   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 591   store i32 91, i32 addrspace(1)* %idx5, align 4
 592   %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
 593   store i32 212, i32 addrspace(1)* %idx6, align 4
 594   ret void
 595 }
 596
 597 ; CHECK-LABEL: @merge_global_store_8_constants_i32
 598 ; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 599 ; CHECK: store <4 x i32> <i32 98, i32 91, i32 212, i32 999>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 600 define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
 601   store i32 34, i32 addrspace(1)* %out, align 4
 602   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
 603   store i32 999, i32 addrspace(1)* %idx1, align 4
 604   %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
 605   store i32 65, i32 addrspace(1)* %idx2, align 4
 606   %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
 607   store i32 33, i32 addrspace(1)* %idx3, align 4
 608   %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
 609   store i32 98, i32 addrspace(1)* %idx4, align 4
 610   %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
 611   store i32 91, i32 addrspace(1)* %idx5, align 4
 612   %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
 613   store i32 212, i32 addrspace(1)* %idx6, align 4
 614   %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
 615   store i32 999, i32 addrspace(1)* %idx7, align 4
 616   ret void
 617 }
 618
 619 ; CHECK-LABEL: @copy_v3i32_align4
 620 ; CHECK: %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
 621 ; CHECK: store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
 622 define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
 623   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
 624   store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
 625   ret void
 626 }
 627
 628 ; CHECK-LABEL: @copy_v3i64_align4
 629 ; CHECK: %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
 630 ; CHECK: store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
 631 define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
 632   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
 633   store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
 634   ret void
 635 }
 636
 637 ; CHECK-LABEL: @copy_v3f32_align4
 638 ; CHECK: %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
 639 ; CHECK: store <3 x float>
 640 define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
 641   %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
 642   %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
 643   store <3 x float> %fadd, <3 x float> addrspace(1)* %out
 644   ret void
 645 }
 646
 647 ; CHECK-LABEL: @copy_v3f64_align4
 648 ; CHECK: %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
 649 ; CHECK: store <3 x double> %fadd, <3 x double> addrspace(1)* %out
 650 define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
 651   %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
 652   %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
 653   store <3 x double> %fadd, <3 x double> addrspace(1)* %out
 654   ret void
 655 }
 656
 657 ; Verify that we no longer hit asserts for this test case. No change expected.
 658 ; CHECK-LABEL: @copy_vec_of_ptrs
 659 ; CHECK: %in.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %in, i32 1
 660 ; CHECK: %vec1 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in.gep.1
 661 ; CHECK: %vec2 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in, align 4
 662 ; CHECK: %out.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %out, i32 1
 663 ; CHECK: store <2 x i16*> %vec1, <2 x i16*> addrspace(1)* %out.gep.1
 664 ; CHECK: store <2 x i16*> %vec2, <2 x i16*> addrspace(1)* %out, align 4
 665 define amdgpu_kernel void @copy_vec_of_ptrs(<2 x i16*> addrspace(1)* %out,
 666                                             <2 x i16*> addrspace(1)* %in ) #0 {
 667   %in.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %in, i32 1
 668   %vec1 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in.gep.1
 669   %vec2 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in, align 4
 670
 671   %out.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %out, i32 1
 672   store <2 x i16*> %vec1, <2 x i16*> addrspace(1)* %out.gep.1
 673   store <2 x i16*> %vec2, <2 x i16*> addrspace(1)* %out, align 4
 674   ret void
 675 }
 676
 677 declare void @llvm.amdgcn.s.barrier() #1
 678
 679 attributes #0 = { nounwind }
 680 attributes #1 = { convergent nounwind }