test/CodeGen/AMDGPU/lds-misaligned-bug.ll

   1 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
   2 ; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
   3 ; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
   4 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s
   5 ; gfx1010, gfx1011 and gfx1012 share the SPLIT checks; only the +cumode invocation uses the VECT checks.
   6 ; GCN-LABEL: test_local_misaligned_v2:
   7 ; GCN-DAG: ds_read2_b32
   8 ; GCN-DAG: ds_write2_b32
   9 define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) {
  10 entry:
  11   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() ; workitem id (x) picks the i32 slot
  12   %elt.ptr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  13   %vec.ptr = bitcast i32 addrspace(3)* %elt.ptr to <2 x i32> addrspace(3)*
  14   %vec = load <2 x i32>, <2 x i32> addrspace(3)* %vec.ptr, align 4 ; 8-byte vector, 4-byte aligned
  15   %e0 = extractelement <2 x i32> %vec, i32 0
  16   %e1 = extractelement <2 x i32> %vec, i32 1
  17   %r0 = insertelement <2 x i32> undef, i32 %e1, i32 0
  18   %swapped = insertelement <2 x i32> %r0, i32 %e0, i32 1 ; stored value is <e1, e0>
  19   store <2 x i32> %swapped, <2 x i32> addrspace(3)* %vec.ptr, align 4
  20   ret void
  21 }
  22
  23 ; GCN-LABEL: test_local_misaligned_v4:
  24 ; GCN-DAG: ds_read2_b32
  25 ; GCN-DAG: ds_read2_b32
  26 ; GCN-DAG: ds_write2_b32
  27 ; GCN-DAG: ds_write2_b32
  28 define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
  29 entry:
  30   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() ; workitem id (x) picks the i32 slot
  31   %elt.ptr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  32   %vec.ptr = bitcast i32 addrspace(3)* %elt.ptr to <4 x i32> addrspace(3)*
  33   %vec = load <4 x i32>, <4 x i32> addrspace(3)* %vec.ptr, align 4 ; 16-byte vector, 4-byte aligned
  34   %e0 = extractelement <4 x i32> %vec, i32 0
  35   %e1 = extractelement <4 x i32> %vec, i32 1
  36   %e2 = extractelement <4 x i32> %vec, i32 2
  37   %e3 = extractelement <4 x i32> %vec, i32 3
  38   %r0 = insertelement <4 x i32> undef, i32 %e3, i32 0
  39   %r1 = insertelement <4 x i32> %r0, i32 %e2, i32 1
  40   %r2 = insertelement <4 x i32> %r1, i32 %e1, i32 2
  41   %reversed = insertelement <4 x i32> %r2, i32 %e0, i32 3 ; stored value is <e3, e2, e1, e0>
  42   store <4 x i32> %reversed, <4 x i32> addrspace(3)* %vec.ptr, align 4
  43   ret void
  44 }
  45
  46 ; GCN-LABEL: test_local_misaligned_v3:
  47 ; GCN-DAG: ds_read2_b32
  48 ; GCN-DAG: ds_read_b32
  49 ; GCN-DAG: ds_write2_b32
  50 ; GCN-DAG: ds_write_b32
  51 define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
  52 entry:
  53   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() ; workitem id (x) picks the i32 slot
  54   %elt.ptr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
  55   %vec.ptr = bitcast i32 addrspace(3)* %elt.ptr to <3 x i32> addrspace(3)*
  56   %vec = load <3 x i32>, <3 x i32> addrspace(3)* %vec.ptr, align 4 ; 12-byte vector, 4-byte aligned
  57   %e0 = extractelement <3 x i32> %vec, i32 0
  58   %e1 = extractelement <3 x i32> %vec, i32 1
  59   %e2 = extractelement <3 x i32> %vec, i32 2
  60   %r0 = insertelement <3 x i32> undef, i32 %e2, i32 0
  61   %r1 = insertelement <3 x i32> %r0, i32 %e0, i32 1
  62   %rotated = insertelement <3 x i32> %r1, i32 %e1, i32 2 ; stored value is <e2, e0, e1>
  63   store <3 x i32> %rotated, <3 x i32> addrspace(3)* %vec.ptr, align 4
  64   ret void
  65 }
  66
  67 ; GCN-LABEL: test_flat_misaligned_v2:
  68 ; VECT-DAG:  flat_load_dwordx2 v
  69 ; VECT-DAG:  flat_store_dwordx2 v
  70 ; SPLIT-DAG: flat_load_dword v
  71 ; SPLIT-DAG: flat_load_dword v
  72 ; SPLIT-DAG: flat_store_dword v
  73 ; SPLIT-DAG: flat_store_dword v
  74 define amdgpu_kernel void @test_flat_misaligned_v2(i32* %arg) {
  75 entry:
  76   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() ; workitem id (x) picks the i32 slot
  77   %elt.ptr = getelementptr inbounds i32, i32* %arg, i32 %tid
  78   %vec.ptr = bitcast i32* %elt.ptr to <2 x i32>*
  79   %vec = load <2 x i32>, <2 x i32>* %vec.ptr, align 4 ; 8-byte vector, 4-byte aligned
  80   %e0 = extractelement <2 x i32> %vec, i32 0
  81   %e1 = extractelement <2 x i32> %vec, i32 1
  82   %r0 = insertelement <2 x i32> undef, i32 %e1, i32 0
  83   %swapped = insertelement <2 x i32> %r0, i32 %e0, i32 1 ; stored value is <e1, e0>
  84   store <2 x i32> %swapped, <2 x i32>* %vec.ptr, align 4
  85   ret void
  86 }
  87
  88 ; GCN-LABEL: test_flat_misaligned_v4:
  89 ; VECT-DAG:  flat_load_dwordx4 v
  90 ; VECT-DAG:  flat_store_dwordx4 v
  91 ; SPLIT-DAG: flat_load_dword v
  92 ; SPLIT-DAG: flat_load_dword v
  93 ; SPLIT-DAG: flat_load_dword v
  94 ; SPLIT-DAG: flat_load_dword v
  95 ; SPLIT-DAG: flat_store_dword v
  96 ; SPLIT-DAG: flat_store_dword v
  97 ; SPLIT-DAG: flat_store_dword v
  98 ; SPLIT-DAG: flat_store_dword v
  99 define amdgpu_kernel void @test_flat_misaligned_v4(i32* %arg) {
 100 entry:
 101   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() ; workitem id (x) picks the i32 slot
 102   %elt.ptr = getelementptr inbounds i32, i32* %arg, i32 %tid
 103   %vec.ptr = bitcast i32* %elt.ptr to <4 x i32>*
 104   %vec = load <4 x i32>, <4 x i32>* %vec.ptr, align 4 ; 16-byte vector, 4-byte aligned
 105   %e0 = extractelement <4 x i32> %vec, i32 0
 106   %e1 = extractelement <4 x i32> %vec, i32 1
 107   %e2 = extractelement <4 x i32> %vec, i32 2
 108   %e3 = extractelement <4 x i32> %vec, i32 3
 109   %r0 = insertelement <4 x i32> undef, i32 %e3, i32 0
 110   %r1 = insertelement <4 x i32> %r0, i32 %e2, i32 1
 111   %r2 = insertelement <4 x i32> %r1, i32 %e1, i32 2
 112   %reversed = insertelement <4 x i32> %r2, i32 %e0, i32 3 ; stored value is <e3, e2, e1, e0>
 113   store <4 x i32> %reversed, <4 x i32>* %vec.ptr, align 4
 114   ret void
 115 }
 116
 117 ; TODO: Reinstate the test below once v3i32/v3f32 is reinstated.
 118
 119 ; GCN-LABEL: test_flat_misaligned_v3:
 120 ; xVECT-DAG:  flat_load_dwordx3 v
 121 ; xVECT-DAG:  flat_store_dwordx3 v
 122 ; xSPLIT-DAG: flat_load_dword v
 123 ; xSPLIT-DAG: flat_load_dword v
 124 ; xSPLIT-DAG: flat_load_dword v
 125 ; xSPLIT-DAG: flat_store_dword v
 126 ; xSPLIT-DAG: flat_store_dword v
 127 ; xSPLIT-DAG: flat_store_dword v
 128 define amdgpu_kernel void @test_flat_misaligned_v3(i32* %arg) {
 129 entry:
 130   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() ; workitem id (x) picks the i32 slot
 131   %elt.ptr = getelementptr inbounds i32, i32* %arg, i32 %tid
 132   %vec.ptr = bitcast i32* %elt.ptr to <3 x i32>*
 133   %vec = load <3 x i32>, <3 x i32>* %vec.ptr, align 4 ; 12-byte vector, 4-byte aligned
 134   %e0 = extractelement <3 x i32> %vec, i32 0
 135   %e1 = extractelement <3 x i32> %vec, i32 1
 136   %e2 = extractelement <3 x i32> %vec, i32 2
 137   %r0 = insertelement <3 x i32> undef, i32 %e2, i32 0
 138   %r1 = insertelement <3 x i32> %r0, i32 %e0, i32 1
 139   %rotated = insertelement <3 x i32> %r1, i32 %e1, i32 2 ; stored value is <e2, e0, e1>
 140   store <3 x i32> %rotated, <3 x i32>* %vec.ptr, align 4
 141   ret void
 142 }
 143
 144 ; GCN-LABEL: test_local_aligned_v2:
 145 ; GCN-DAG: ds_read_b64
 146 ; GCN-DAG: ds_write_b64
 147 define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) {
 148 entry:
 149   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() ; workitem id (x) picks the i32 slot
 150   %elt.ptr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
 151   %vec.ptr = bitcast i32 addrspace(3)* %elt.ptr to <2 x i32> addrspace(3)*
 152   %vec = load <2 x i32>, <2 x i32> addrspace(3)* %vec.ptr, align 8 ; 8-byte vector, 8-byte aligned
 153   %e0 = extractelement <2 x i32> %vec, i32 0
 154   %e1 = extractelement <2 x i32> %vec, i32 1
 155   %r0 = insertelement <2 x i32> undef, i32 %e1, i32 0
 156   %swapped = insertelement <2 x i32> %r0, i32 %e0, i32 1 ; stored value is <e1, e0>
 157   store <2 x i32> %swapped, <2 x i32> addrspace(3)* %vec.ptr, align 8
 158   ret void
 159 }
 160
 161 ; GCN-LABEL: test_local_aligned_v3:
 162 ; GCN-DAG: ds_read_b64
 163 ; GCN-DAG: ds_read_b32
 164 ; GCN-DAG: ds_write_b64
 165 ; GCN-DAG: ds_write_b32
 166 define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) {
 167 entry:
 168   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() ; workitem id (x) picks the i32 slot
 169   %elt.ptr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
 170   %vec.ptr = bitcast i32 addrspace(3)* %elt.ptr to <3 x i32> addrspace(3)*
 171   %vec = load <3 x i32>, <3 x i32> addrspace(3)* %vec.ptr, align 16 ; 12-byte vector, 16-byte aligned
 172   %e0 = extractelement <3 x i32> %vec, i32 0
 173   %e1 = extractelement <3 x i32> %vec, i32 1
 174   %e2 = extractelement <3 x i32> %vec, i32 2
 175   %r0 = insertelement <3 x i32> undef, i32 %e2, i32 0
 176   %r1 = insertelement <3 x i32> %r0, i32 %e0, i32 1
 177   %rotated = insertelement <3 x i32> %r1, i32 %e1, i32 2 ; stored value is <e2, e0, e1>
 178   store <3 x i32> %rotated, <3 x i32> addrspace(3)* %vec.ptr, align 16
 179   ret void
 180 }
 181
 182 ; GCN-LABEL: test_flat_aligned_v2:
 183 ; GCN-DAG: flat_load_dwordx2 v
 184 ; GCN-DAG: flat_store_dwordx2 v
 185 define amdgpu_kernel void @test_flat_aligned_v2(i32* %arg) {
 186 entry:
 187   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() ; workitem id (x) picks the i32 slot
 188   %elt.ptr = getelementptr inbounds i32, i32* %arg, i32 %tid
 189   %vec.ptr = bitcast i32* %elt.ptr to <2 x i32>*
 190   %vec = load <2 x i32>, <2 x i32>* %vec.ptr, align 8 ; 8-byte vector, 8-byte aligned
 191   %e0 = extractelement <2 x i32> %vec, i32 0
 192   %e1 = extractelement <2 x i32> %vec, i32 1
 193   %r0 = insertelement <2 x i32> undef, i32 %e1, i32 0
 194   %swapped = insertelement <2 x i32> %r0, i32 %e0, i32 1 ; stored value is <e1, e0>
 195   store <2 x i32> %swapped, <2 x i32>* %vec.ptr, align 8
 196   ret void
 197 }
 198
 199 ; GCN-LABEL: test_flat_aligned_v4:
 200 ; GCN-DAG: flat_load_dwordx4 v
 201 ; GCN-DAG: flat_store_dwordx4 v
 202 define amdgpu_kernel void @test_flat_aligned_v4(i32* %arg) {
 203 entry:
 204   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() ; workitem id (x) picks the i32 slot
 205   %elt.ptr = getelementptr inbounds i32, i32* %arg, i32 %tid
 206   %vec.ptr = bitcast i32* %elt.ptr to <4 x i32>*
 207   %vec = load <4 x i32>, <4 x i32>* %vec.ptr, align 16 ; 16-byte vector, 16-byte aligned
 208   %e0 = extractelement <4 x i32> %vec, i32 0
 209   %e1 = extractelement <4 x i32> %vec, i32 1
 210   %e2 = extractelement <4 x i32> %vec, i32 2
 211   %e3 = extractelement <4 x i32> %vec, i32 3
 212   %r0 = insertelement <4 x i32> undef, i32 %e3, i32 0
 213   %r1 = insertelement <4 x i32> %r0, i32 %e2, i32 1
 214   %r2 = insertelement <4 x i32> %r1, i32 %e1, i32 2
 215   %reversed = insertelement <4 x i32> %r2, i32 %e0, i32 3 ; stored value is <e3, e2, e1, e0>
 216   store <4 x i32> %reversed, <4 x i32>* %vec.ptr, align 16
 217   ret void
 218 }
 219
 220 ; GCN-LABEL: test_local_v4_aligned8:
 221 ; GCN-DAG: ds_read2_b64
 222 ; GCN-DAG: ds_write2_b64
 223 define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
 224 entry:
 225   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() ; workitem id (x) picks the i32 slot
 226   %elt.ptr = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %tid
 227   %vec.ptr = bitcast i32 addrspace(3)* %elt.ptr to <4 x i32> addrspace(3)*
 228   %vec = load <4 x i32>, <4 x i32> addrspace(3)* %vec.ptr, align 8 ; 16-byte vector, 8-byte aligned
 229   %e0 = extractelement <4 x i32> %vec, i32 0
 230   %e1 = extractelement <4 x i32> %vec, i32 1
 231   %e2 = extractelement <4 x i32> %vec, i32 2
 232   %e3 = extractelement <4 x i32> %vec, i32 3
 233   %r0 = insertelement <4 x i32> undef, i32 %e3, i32 0
 234   %r1 = insertelement <4 x i32> %r0, i32 %e2, i32 1
 235   %r2 = insertelement <4 x i32> %r1, i32 %e1, i32 2
 236   %reversed = insertelement <4 x i32> %r2, i32 %e0, i32 3 ; stored value is <e3, e2, e1, e0>
 237   store <4 x i32> %reversed, <4 x i32> addrspace(3)* %vec.ptr, align 8
 238   ret void
 239 }
 240
 241 ; GCN-LABEL: test_flat_v4_aligned8:
 242 ; VECT-DAG:  flat_load_dwordx4 v
 243 ; VECT-DAG:  flat_store_dwordx4 v
 244 ; SPLIT-DAG: flat_load_dwordx2 v
 245 ; SPLIT-DAG: flat_load_dwordx2 v
 246 ; SPLIT-DAG: flat_store_dwordx2 v
 247 ; SPLIT-DAG: flat_store_dwordx2 v
 248 define amdgpu_kernel void @test_flat_v4_aligned8(i32* %arg) {
 249 entry:
 250   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() ; workitem id (x) picks the i32 slot
 251   %elt.ptr = getelementptr inbounds i32, i32* %arg, i32 %tid
 252   %vec.ptr = bitcast i32* %elt.ptr to <4 x i32>*
 253   %vec = load <4 x i32>, <4 x i32>* %vec.ptr, align 8 ; 16-byte vector, 8-byte aligned
 254   %e0 = extractelement <4 x i32> %vec, i32 0
 255   %e1 = extractelement <4 x i32> %vec, i32 1
 256   %e2 = extractelement <4 x i32> %vec, i32 2
 257   %e3 = extractelement <4 x i32> %vec, i32 3
 258   %r0 = insertelement <4 x i32> undef, i32 %e3, i32 0
 259   %r1 = insertelement <4 x i32> %r0, i32 %e2, i32 1
 260   %r2 = insertelement <4 x i32> %r1, i32 %e1, i32 2
 261   %reversed = insertelement <4 x i32> %r2, i32 %e0, i32 3 ; stored value is <e3, e2, e1, e0>
 262   store <4 x i32> %reversed, <4 x i32>* %vec.ptr, align 8
 263   ret void
 264 }
 265
 266 declare i32 @llvm.amdgcn.workitem.id.x() ; intrinsic called first in each kernel above; its i32 result indexes %arg