llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll

   1 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
   2 ; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
   3 ; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
   4 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
   5 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s
   6 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
   7 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
   8 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s
   9
  10 ; GCN-LABEL: test_local_misaligned_v2:
  11 ; GCN-DAG: ds_{{read2|load_2addr}}_b32
  12 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
  13 define amdgpu_kernel void @test_local_misaligned_v2(ptr addrspace(3) %arg) {
  14 bb:  ; <2 x i32> addrspace(3) load/store at align 4; every run expects 2-address b32 DS ops
  15   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()  ; per-workitem index
  16   %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid  ; &arg[lid], i32 stride
  17   %load = load <2 x i32>, ptr addrspace(3) %gep, align 4  ; 8-byte access, only 4-byte aligned
  18   %v1 = extractelement <2 x i32> %load, i32 0
  19   %v2 = extractelement <2 x i32> %load, i32 1
  20   %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0  ; poison base is fine: both lanes get overwritten
  21   %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1  ; result is the loaded vector with lanes swapped
  22   store <2 x i32> %v4, ptr addrspace(3) %gep, align 4  ; written back to the same address
  23   ret void
  24 }
  25
  26 ; GCN-LABEL: test_local_misaligned_v4:
  27 ; GCN-DAG: ds_{{read2|load_2addr}}_b32
  28 ; GCN-DAG: ds_{{read2|load_2addr}}_b32
  29 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
  30 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
  31 define amdgpu_kernel void @test_local_misaligned_v4(ptr addrspace(3) %arg) {
  32 bb:  ; <4 x i32> addrspace(3) load/store at align 4; every run expects two 2-address b32 reads and writes
  33   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()  ; per-workitem index
  34   %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid  ; &arg[lid], i32 stride
  35   %load = load <4 x i32>, ptr addrspace(3) %gep, align 4  ; 16-byte access, only 4-byte aligned
  36   %v1 = extractelement <4 x i32> %load, i32 0
  37   %v2 = extractelement <4 x i32> %load, i32 1
  38   %v3 = extractelement <4 x i32> %load, i32 2
  39   %v4 = extractelement <4 x i32> %load, i32 3
  40   %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0  ; poison base is fine: all four lanes get overwritten
  41   %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  42   %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  43   %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3  ; result is the loaded vector in reverse lane order
  44   store <4 x i32> %v8, ptr addrspace(3) %gep, align 4  ; written back to the same address
  45   ret void
  46 }
  47
  48 ; GCN-LABEL: test_local_misaligned_v3:
  49 ; GCN-DAG: ds_{{read2|load_2addr}}_b32
  50 ; GCN-DAG: ds_{{read|load}}_b32
  51 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
  52 ; GCN-DAG: ds_{{write|store}}_b32
  53 define amdgpu_kernel void @test_local_misaligned_v3(ptr addrspace(3) %arg) {
  54 bb:  ; <3 x i32> addrspace(3) load/store at align 4; every run expects a 2-address b32 op plus a single b32 op each way
  55   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()  ; per-workitem index
  56   %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid  ; &arg[lid], i32 stride
  57   %load = load <3 x i32>, ptr addrspace(3) %gep, align 4  ; 12-byte access, only 4-byte aligned
  58   %v1 = extractelement <3 x i32> %load, i32 0
  59   %v2 = extractelement <3 x i32> %load, i32 1
  60   %v3 = extractelement <3 x i32> %load, i32 2
  61   %v5 = insertelement <3 x i32> poison, i32 %v3, i32 0  ; poison base is fine: all three lanes get overwritten
  62   %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  63   %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2  ; result lanes are (elt2, elt0, elt1) of the load
  64   store <3 x i32> %v7, ptr addrspace(3) %gep, align 4  ; written back to the same address
  65   ret void
  66 }
  67
  68 ; GCN-LABEL: test_flat_misaligned_v2:
  69 ; VECT-DAG:  flat_load_{{dwordx2|b64}} v
  70 ; VECT-DAG:  flat_store_{{dwordx2|b64}} v
  71 ; SPLIT-DAG: flat_load_{{dword|b32}} v
  72 ; SPLIT-DAG: flat_load_{{dword|b32}} v
  73 ; SPLIT-DAG: flat_store_{{dword|b32}} v
  74 ; SPLIT-DAG: flat_store_{{dword|b32}} v
  75 define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) {
  76 bb:  ; <2 x i32> flat load/store at align 4; VECT runs expect one 64-bit op each way, SPLIT runs expect two 32-bit ops
  77   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()  ; per-workitem index
  78   %gep = getelementptr inbounds i32, ptr %arg, i32 %lid  ; &arg[lid], i32 stride
  79   %load = load <2 x i32>, ptr %gep, align 4  ; 8-byte access, only 4-byte aligned
  80   %v1 = extractelement <2 x i32> %load, i32 0
  81   %v2 = extractelement <2 x i32> %load, i32 1
  82   %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0  ; poison base is fine: both lanes get overwritten
  83   %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1  ; result is the loaded vector with lanes swapped
  84   store <2 x i32> %v4, ptr %gep, align 4  ; written back to the same address
  85   ret void
  86 }
  87
  88 ; GCN-LABEL: test_flat_misaligned_v4:
  89 ; VECT-DAG:  flat_load_{{dwordx4|b128}} v
  90 ; VECT-DAG:  flat_store_{{dwordx4|b128}} v
  91 ; SPLIT-DAG: flat_load_{{dword|b32}} v
  92 ; SPLIT-DAG: flat_load_{{dword|b32}} v
  93 ; SPLIT-DAG: flat_load_{{dword|b32}} v
  94 ; SPLIT-DAG: flat_load_{{dword|b32}} v
  95 ; SPLIT-DAG: flat_store_{{dword|b32}} v
  96 ; SPLIT-DAG: flat_store_{{dword|b32}} v
  97 ; SPLIT-DAG: flat_store_{{dword|b32}} v
  98 ; SPLIT-DAG: flat_store_{{dword|b32}} v
  99 define amdgpu_kernel void @test_flat_misaligned_v4(ptr %arg) {
 100 bb:  ; <4 x i32> flat load/store at align 4; VECT runs expect one 128-bit op each way, SPLIT runs expect four 32-bit ops
 101   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()  ; per-workitem index
 102   %gep = getelementptr inbounds i32, ptr %arg, i32 %lid  ; &arg[lid], i32 stride
 103   %load = load <4 x i32>, ptr %gep, align 4  ; 16-byte access, only 4-byte aligned
 104   %v1 = extractelement <4 x i32> %load, i32 0
 105   %v2 = extractelement <4 x i32> %load, i32 1
 106   %v3 = extractelement <4 x i32> %load, i32 2
 107   %v4 = extractelement <4 x i32> %load, i32 3
 108   %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0  ; poison base is fine: all four lanes get overwritten
 109   %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
 110   %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
 111   %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3  ; result is the loaded vector in reverse lane order
 112   store <4 x i32> %v8, ptr %gep, align 4  ; written back to the same address
 113   ret void
 114 }
 115
 116 ; GCN-LABEL: test_flat_misaligned_v3:
 117 ; VECT-DAG:  flat_load_{{dwordx3|b96}} v
 118 ; VECT-DAG:  flat_store_{{dwordx3|b96}} v
 119 ; SPLIT-DAG: flat_load_{{dword|b32}} v
 120 ; SPLIT-DAG: flat_load_{{dword|b32}} v
 121 ; SPLIT-DAG: flat_load_{{dword|b32}} v
 122 ; SPLIT-DAG: flat_store_{{dword|b32}} v
 123 ; SPLIT-DAG: flat_store_{{dword|b32}} v
 124 ; SPLIT-DAG: flat_store_{{dword|b32}} v
 125 define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) {
 126 bb:  ; <3 x i32> flat load/store at align 4; VECT runs expect one 96-bit op each way, SPLIT runs expect three 32-bit ops
 127   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()  ; per-workitem index
 128   %gep = getelementptr inbounds i32, ptr %arg, i32 %lid  ; &arg[lid], i32 stride
 129   %load = load <3 x i32>, ptr %gep, align 4  ; 12-byte access, only 4-byte aligned
 130   %v1 = extractelement <3 x i32> %load, i32 0
 131   %v2 = extractelement <3 x i32> %load, i32 1
 132   %v3 = extractelement <3 x i32> %load, i32 2
 133   %v5 = insertelement <3 x i32> poison, i32 %v3, i32 0  ; poison base is fine: all three lanes get overwritten
 134   %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
 135   %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2  ; result lanes are (elt2, elt0, elt1) of the load
 136   store <3 x i32> %v7, ptr %gep, align 4  ; written back to the same address
 137   ret void
 138 }
 139
 140 ; GCN-LABEL: test_local_aligned_v2:
 141 ; GCN-DAG: ds_{{read|load}}_b64
 142 ; GCN-DAG: ds_{{write|store}}_b64
 143 define amdgpu_kernel void @test_local_aligned_v2(ptr addrspace(3) %arg) {
 144 bb:  ; <2 x i32> addrspace(3) load/store at align 8; every run expects single 64-bit DS ops
 145   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()  ; per-workitem index
 146   %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid  ; &arg[lid], i32 stride
 147   %load = load <2 x i32>, ptr addrspace(3) %gep, align 8  ; alignment equals the 8-byte access size
 148   %v1 = extractelement <2 x i32> %load, i32 0
 149   %v2 = extractelement <2 x i32> %load, i32 1
 150   %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0  ; poison base is fine: both lanes get overwritten
 151   %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1  ; result is the loaded vector with lanes swapped
 152   store <2 x i32> %v4, ptr addrspace(3) %gep, align 8  ; written back to the same address
 153   ret void
 154 }
 155
 156 ; GCN-LABEL: test_local_aligned_v3:
 157 ; GCN-DAG: ds_{{read|load}}_b96
 158 ; GCN-DAG: ds_{{write|store}}_b96
 159 define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) {
 160 bb:  ; <3 x i32> addrspace(3) load/store at align 16; every run expects single 96-bit DS ops
 161   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()  ; per-workitem index
 162   %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid  ; &arg[lid], i32 stride
 163   %load = load <3 x i32>, ptr addrspace(3) %gep, align 16  ; 12-byte access declared 16-byte aligned
 164   %v1 = extractelement <3 x i32> %load, i32 0
 165   %v2 = extractelement <3 x i32> %load, i32 1
 166   %v3 = extractelement <3 x i32> %load, i32 2
 167   %v5 = insertelement <3 x i32> poison, i32 %v3, i32 0  ; poison base is fine: all three lanes get overwritten
 168   %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
 169   %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2  ; result lanes are (elt2, elt0, elt1) of the load
 170   store <3 x i32> %v7, ptr addrspace(3) %gep, align 16  ; written back to the same address
 171   ret void
 172 }
 173
 174 ; GCN-LABEL: test_flat_aligned_v2:
 175 ; GCN-DAG: flat_load_{{dwordx2|b64}} v
 176 ; GCN-DAG: flat_store_{{dwordx2|b64}} v
 177 define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) {
 178 bb:  ; <2 x i32> flat load/store at align 8; every run expects single 64-bit flat ops
 179   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()  ; per-workitem index
 180   %gep = getelementptr inbounds i32, ptr %arg, i32 %lid  ; &arg[lid], i32 stride
 181   %load = load <2 x i32>, ptr %gep, align 8  ; alignment equals the 8-byte access size
 182   %v1 = extractelement <2 x i32> %load, i32 0
 183   %v2 = extractelement <2 x i32> %load, i32 1
 184   %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0  ; poison base is fine: both lanes get overwritten
 185   %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1  ; result is the loaded vector with lanes swapped
 186   store <2 x i32> %v4, ptr %gep, align 8  ; written back to the same address
 187   ret void
 188 }
 189
 190 ; GCN-LABEL: test_flat_aligned_v4:
 191 ; GCN-DAG: flat_load_{{dwordx4|b128}} v
 192 ; GCN-DAG: flat_store_{{dwordx4|b128}} v
 193 define amdgpu_kernel void @test_flat_aligned_v4(ptr %arg) {
 194 bb:  ; <4 x i32> flat load/store at align 16; every run expects single 128-bit flat ops
 195   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()  ; per-workitem index
 196   %gep = getelementptr inbounds i32, ptr %arg, i32 %lid  ; &arg[lid], i32 stride
 197   %load = load <4 x i32>, ptr %gep, align 16  ; alignment equals the 16-byte access size
 198   %v1 = extractelement <4 x i32> %load, i32 0
 199   %v2 = extractelement <4 x i32> %load, i32 1
 200   %v3 = extractelement <4 x i32> %load, i32 2
 201   %v4 = extractelement <4 x i32> %load, i32 3
 202   %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0  ; poison base is fine: all four lanes get overwritten
 203   %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
 204   %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
 205   %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3  ; result is the loaded vector in reverse lane order
 206   store <4 x i32> %v8, ptr %gep, align 16  ; written back to the same address
 207   ret void
 208 }
 209
 210 ; GCN-LABEL: test_local_v4_aligned8:
 211 ; ALIGNED-DAG: ds_{{read2|load_2addr}}_b64
 212 ; ALIGNED-DAG: ds_{{write2|store_2addr}}_b64
 213 ; UNALIGNED-DAG: ds_{{read2|load_2addr}}_b64
 214 ; UNALIGNED-DAG: ds_{{write2|store_2addr}}_b64
 215 define amdgpu_kernel void @test_local_v4_aligned8(ptr addrspace(3) %arg) {
 216 bb:  ; <4 x i32> addrspace(3) load/store at align 8; both alignment-mode prefixes expect the same 2-address b64 DS ops
 217   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()  ; per-workitem index
 218   %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid  ; &arg[lid], i32 stride
 219   %load = load <4 x i32>, ptr addrspace(3) %gep, align 8  ; 16-byte access, only 8-byte aligned
 220   %v1 = extractelement <4 x i32> %load, i32 0
 221   %v2 = extractelement <4 x i32> %load, i32 1
 222   %v3 = extractelement <4 x i32> %load, i32 2
 223   %v4 = extractelement <4 x i32> %load, i32 3
 224   %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0  ; poison base is fine: all four lanes get overwritten
 225   %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
 226   %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
 227   %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3  ; result is the loaded vector in reverse lane order
 228   store <4 x i32> %v8, ptr addrspace(3) %gep, align 8  ; written back to the same address
 229   ret void
 230 }
 231
 232 ; GCN-LABEL: test_flat_v4_aligned8:
 233 ; VECT-DAG:  flat_load_{{dwordx4|b128}} v
 234 ; VECT-DAG:  flat_store_{{dwordx4|b128}} v
 235 ; SPLIT-DAG: flat_load_{{dwordx2|b64}} v
 236 ; SPLIT-DAG: flat_load_{{dwordx2|b64}} v
 237 ; SPLIT-DAG: flat_store_{{dwordx2|b64}} v
 238 ; SPLIT-DAG: flat_store_{{dwordx2|b64}} v
 239 define amdgpu_kernel void @test_flat_v4_aligned8(ptr %arg) {
 240 bb:  ; <4 x i32> flat load/store at align 8; VECT runs expect one 128-bit op each way, SPLIT runs expect two 64-bit ops
 241   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()  ; per-workitem index
 242   %gep = getelementptr inbounds i32, ptr %arg, i32 %lid  ; &arg[lid], i32 stride
 243   %load = load <4 x i32>, ptr %gep, align 8  ; 16-byte access, only 8-byte aligned
 244   %v1 = extractelement <4 x i32> %load, i32 0
 245   %v2 = extractelement <4 x i32> %load, i32 1
 246   %v3 = extractelement <4 x i32> %load, i32 2
 247   %v4 = extractelement <4 x i32> %load, i32 3
 248   %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0  ; poison base is fine: all four lanes get overwritten
 249   %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
 250   %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
 251   %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3  ; result is the loaded vector in reverse lane order
 252   store <4 x i32> %v8, ptr %gep, align 8  ; written back to the same address
 253   ret void
 254 }
 255
 256 declare i32 @llvm.amdgcn.workitem.id.x()  ; intrinsic each kernel in this test calls to obtain its %lid index