llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll

   1 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
   2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
   3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
   4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
   5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s
   6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
   7 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
   8 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode -early-live-intervals < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
   9 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s
  10
  11 ; GCN-LABEL: test_local_misaligned_v2:
  12 ; GCN-DAG: ds_{{read2|load_2addr}}_b32
  13 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
  14 define amdgpu_kernel void @test_local_misaligned_v2(ptr addrspace(3) %arg) {
  15 bb:
  16   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  17   %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
  18   %load = load <2 x i32>, ptr addrspace(3) %gep, align 4
  19   %v1 = extractelement <2 x i32> %load, i32 0
  20   %v2 = extractelement <2 x i32> %load, i32 1
  21   %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  22   %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  23   store <2 x i32> %v4, ptr addrspace(3) %gep, align 4
  24   ret void
  25 }
  26
  27 ; GCN-LABEL: test_local_misaligned_v4:
  28 ; GCN-DAG: ds_{{read2|load_2addr}}_b32
  29 ; GCN-DAG: ds_{{read2|load_2addr}}_b32
  30 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
  31 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
  32 define amdgpu_kernel void @test_local_misaligned_v4(ptr addrspace(3) %arg) {
  33 bb:
  34   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  35   %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
  36   %load = load <4 x i32>, ptr addrspace(3) %gep, align 4
  37   %v1 = extractelement <4 x i32> %load, i32 0
  38   %v2 = extractelement <4 x i32> %load, i32 1
  39   %v3 = extractelement <4 x i32> %load, i32 2
  40   %v4 = extractelement <4 x i32> %load, i32 3
  41   %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  42   %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  43   %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  44   %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  45   store <4 x i32> %v8, ptr addrspace(3) %gep, align 4
  46   ret void
  47 }
  48
  49 ; GCN-LABEL: test_local_misaligned_v3:
  50 ; GCN-DAG: ds_{{read2|load_2addr}}_b32
  51 ; GCN-DAG: ds_{{read|load}}_b32
  52 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
  53 ; GCN-DAG: ds_{{write|store}}_b32
  54 define amdgpu_kernel void @test_local_misaligned_v3(ptr addrspace(3) %arg) {
  55 bb:
  56   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  57   %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
  58   %load = load <3 x i32>, ptr addrspace(3) %gep, align 4
  59   %v1 = extractelement <3 x i32> %load, i32 0
  60   %v2 = extractelement <3 x i32> %load, i32 1
  61   %v3 = extractelement <3 x i32> %load, i32 2
  62   %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
  63   %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  64   %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  65   store <3 x i32> %v7, ptr addrspace(3) %gep, align 4
  66   ret void
  67 }
  68
  69 ; GCN-LABEL: test_flat_misaligned_v2:
  70 ; VECT-DAG:  flat_load_{{dwordx2|b64}} v
  71 ; VECT-DAG:  flat_store_{{dwordx2|b64}} v
  72 ; SPLIT-DAG: flat_load_{{dword|b32}} v
  73 ; SPLIT-DAG: flat_load_{{dword|b32}} v
  74 ; SPLIT-DAG: flat_store_{{dword|b32}} v
  75 ; SPLIT-DAG: flat_store_{{dword|b32}} v
  76 define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) {
  77 bb:
  78   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  79   %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
  80   %load = load <2 x i32>, ptr %gep, align 4
  81   %v1 = extractelement <2 x i32> %load, i32 0
  82   %v2 = extractelement <2 x i32> %load, i32 1
  83   %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  84   %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  85   store <2 x i32> %v4, ptr %gep, align 4
  86   ret void
  87 }
  88
  89 ; GCN-LABEL: test_flat_misaligned_v4:
  90 ; VECT-DAG:  flat_load_{{dwordx4|b128}} v
  91 ; VECT-DAG:  flat_store_{{dwordx4|b128}} v
  92 ; SPLIT-DAG: flat_load_{{dword|b32}} v
  93 ; SPLIT-DAG: flat_load_{{dword|b32}} v
  94 ; SPLIT-DAG: flat_load_{{dword|b32}} v
  95 ; SPLIT-DAG: flat_load_{{dword|b32}} v
  96 ; SPLIT-DAG: flat_store_{{dword|b32}} v
  97 ; SPLIT-DAG: flat_store_{{dword|b32}} v
  98 ; SPLIT-DAG: flat_store_{{dword|b32}} v
  99 ; SPLIT-DAG: flat_store_{{dword|b32}} v
 100 define amdgpu_kernel void @test_flat_misaligned_v4(ptr %arg) { ; default-addrspace <4 x i32> load/store at align 4; VECT check set expects one 128-bit flat op each way, SPLIT check set four 32-bit ops
 101 bb:
 102   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
 103   %gep = getelementptr inbounds i32, ptr %arg, i32 %lid ; i32 element index = result of workitem.id.x
 104   %load = load <4 x i32>, ptr %gep, align 4 ; 16-byte vector, only 4-byte alignment
 105   %v1 = extractelement <4 x i32> %load, i32 0
 106   %v2 = extractelement <4 x i32> %load, i32 1
 107   %v3 = extractelement <4 x i32> %load, i32 2
 108   %v4 = extractelement <4 x i32> %load, i32 3
 109   %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0 ; poison base is fine: all four lanes are overwritten
 110   %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
 111   %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
 112   %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3 ; %v8 = <elt3, elt2, elt1, elt0>, i.e. lanes reversed
 113   store <4 x i32> %v8, ptr %gep, align 4 ; stored back to the same address with the same alignment
 114   ret void
 115 }
 116
 117 ; GCN-LABEL: test_flat_misaligned_v3:
 118 ; VECT-DAG:  flat_load_{{dwordx3|b96}} v
 119 ; VECT-DAG:  flat_store_{{dwordx3|b96}} v
 120 ; SPLIT-DAG: flat_load_{{dword|b32}} v
 121 ; SPLIT-DAG: flat_load_{{dword|b32}} v
 122 ; SPLIT-DAG: flat_load_{{dword|b32}} v
 123 ; SPLIT-DAG: flat_store_{{dword|b32}} v
 124 ; SPLIT-DAG: flat_store_{{dword|b32}} v
 125 ; SPLIT-DAG: flat_store_{{dword|b32}} v
 126 define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) { ; default-addrspace <3 x i32> load/store at align 4; VECT check set expects one 96-bit flat op each way, SPLIT check set three 32-bit ops
 127 bb:
 128   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
 129   %gep = getelementptr inbounds i32, ptr %arg, i32 %lid ; i32 element index = result of workitem.id.x
 130   %load = load <3 x i32>, ptr %gep, align 4 ; 12-byte vector, only 4-byte alignment
 131   %v1 = extractelement <3 x i32> %load, i32 0
 132   %v2 = extractelement <3 x i32> %load, i32 1
 133   %v3 = extractelement <3 x i32> %load, i32 2
 134   %v5 = insertelement <3 x i32> poison, i32 %v3, i32 0 ; poison base is fine: all three lanes are overwritten
 135   %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
 136   %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2 ; %v7 = <elt2, elt0, elt1>
 137   store <3 x i32> %v7, ptr %gep, align 4 ; stored back to the same address with the same alignment
 138   ret void
 139 }
 140
 141 ; GCN-LABEL: test_local_aligned_v2:
 142 ; GCN-DAG: ds_{{read|load}}_b64
 143 ; GCN-DAG: ds_{{write|store}}_b64
 144 define amdgpu_kernel void @test_local_aligned_v2(ptr addrspace(3) %arg) { ; addrspace(3) <2 x i32> load/store at align 8; checks above expect a single b64 DS op each way on every run line
 145 bb:
 146   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
 147   %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid ; i32 element index = result of workitem.id.x
 148   %load = load <2 x i32>, ptr addrspace(3) %gep, align 8 ; alignment equals the 8-byte vector size
 149   %v1 = extractelement <2 x i32> %load, i32 0
 150   %v2 = extractelement <2 x i32> %load, i32 1
 151   %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0 ; poison base is fine: both lanes are overwritten
 152   %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1 ; %v4 = <elt1, elt0>, i.e. the two lanes swapped
 153   store <2 x i32> %v4, ptr addrspace(3) %gep, align 8 ; stored back to the same address with the same alignment
 154   ret void
 155 }
 156
 157 ; GCN-LABEL: test_local_aligned_v3:
 158 ; GCN-DAG: ds_{{read|load}}_b96
 159 ; GCN-DAG: ds_{{write|store}}_b96
 160 define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) { ; addrspace(3) <3 x i32> load/store at align 16; checks above expect a single b96 DS op each way on every run line
 161 bb:
 162   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
 163   %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid ; i32 element index = result of workitem.id.x
 164   %load = load <3 x i32>, ptr addrspace(3) %gep, align 16 ; 12-byte vector with 16-byte alignment
 165   %v1 = extractelement <3 x i32> %load, i32 0
 166   %v2 = extractelement <3 x i32> %load, i32 1
 167   %v3 = extractelement <3 x i32> %load, i32 2
 168   %v5 = insertelement <3 x i32> poison, i32 %v3, i32 0 ; poison base is fine: all three lanes are overwritten
 169   %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
 170   %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2 ; %v7 = <elt2, elt0, elt1>
 171   store <3 x i32> %v7, ptr addrspace(3) %gep, align 16 ; stored back to the same address with the same alignment
 172   ret void
 173 }
 174
 175 ; GCN-LABEL: test_flat_aligned_v2:
 176 ; GCN-DAG: flat_load_{{dwordx2|b64}} v
 177 ; GCN-DAG: flat_store_{{dwordx2|b64}} v
 178 define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) { ; default-addrspace <2 x i32> load/store at align 8; checks above expect one 64-bit flat op each way on every run line
 179 bb:
 180   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
 181   %gep = getelementptr inbounds i32, ptr %arg, i32 %lid ; i32 element index = result of workitem.id.x
 182   %load = load <2 x i32>, ptr %gep, align 8 ; alignment equals the 8-byte vector size
 183   %v1 = extractelement <2 x i32> %load, i32 0
 184   %v2 = extractelement <2 x i32> %load, i32 1
 185   %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0 ; poison base is fine: both lanes are overwritten
 186   %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1 ; %v4 = <elt1, elt0>, i.e. the two lanes swapped
 187   store <2 x i32> %v4, ptr %gep, align 8 ; stored back to the same address with the same alignment
 188   ret void
 189 }
 190
 191 ; GCN-LABEL: test_flat_aligned_v4:
 192 ; GCN-DAG: flat_load_{{dwordx4|b128}} v
 193 ; GCN-DAG: flat_store_{{dwordx4|b128}} v
 194 define amdgpu_kernel void @test_flat_aligned_v4(ptr %arg) { ; default-addrspace <4 x i32> load/store at align 16; checks above expect one 128-bit flat op each way on every run line
 195 bb:
 196   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
 197   %gep = getelementptr inbounds i32, ptr %arg, i32 %lid ; i32 element index = result of workitem.id.x
 198   %load = load <4 x i32>, ptr %gep, align 16 ; alignment equals the 16-byte vector size
 199   %v1 = extractelement <4 x i32> %load, i32 0
 200   %v2 = extractelement <4 x i32> %load, i32 1
 201   %v3 = extractelement <4 x i32> %load, i32 2
 202   %v4 = extractelement <4 x i32> %load, i32 3
 203   %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0 ; poison base is fine: all four lanes are overwritten
 204   %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
 205   %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
 206   %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3 ; %v8 = <elt3, elt2, elt1, elt0>, i.e. lanes reversed
 207   store <4 x i32> %v8, ptr %gep, align 16 ; stored back to the same address with the same alignment
 208   ret void
 209 }
 210
 211 ; GCN-LABEL: test_local_v4_aligned8:
 212 ; ALIGNED-DAG: ds_{{read2|load_2addr}}_b64
 213 ; ALIGNED-DAG: ds_{{write2|store_2addr}}_b64
 214 ; UNALIGNED-DAG: ds_{{read2|load_2addr}}_b64
 215 ; UNALIGNED-DAG: ds_{{write2|store_2addr}}_b64
 216 define amdgpu_kernel void @test_local_v4_aligned8(ptr addrspace(3) %arg) { ; addrspace(3) <4 x i32> load/store at align 8; both the ALIGNED and UNALIGNED check sets above expect 2-address b64 DS ops
 217 bb: ; NOTE(review): those two prefixes are used only by this test in the visible file; folding them into the common prefix would leave them unused unless the lit command lines change too
 218   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
 219   %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid ; i32 element index = result of workitem.id.x
 220   %load = load <4 x i32>, ptr addrspace(3) %gep, align 8 ; 16-byte vector, only 8-byte alignment
 221   %v1 = extractelement <4 x i32> %load, i32 0
 222   %v2 = extractelement <4 x i32> %load, i32 1
 223   %v3 = extractelement <4 x i32> %load, i32 2
 224   %v4 = extractelement <4 x i32> %load, i32 3
 225   %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0 ; poison base is fine: all four lanes are overwritten
 226   %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
 227   %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
 228   %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3 ; %v8 = <elt3, elt2, elt1, elt0>, i.e. lanes reversed
 229   store <4 x i32> %v8, ptr addrspace(3) %gep, align 8 ; stored back to the same address with the same alignment
 230   ret void
 231 }
 232
 233 ; GCN-LABEL: test_flat_v4_aligned8:
 234 ; VECT-DAG:  flat_load_{{dwordx4|b128}} v
 235 ; VECT-DAG:  flat_store_{{dwordx4|b128}} v
 236 ; SPLIT-DAG: flat_load_{{dwordx2|b64}} v
 237 ; SPLIT-DAG: flat_load_{{dwordx2|b64}} v
 238 ; SPLIT-DAG: flat_store_{{dwordx2|b64}} v
 239 ; SPLIT-DAG: flat_store_{{dwordx2|b64}} v
 240 define amdgpu_kernel void @test_flat_v4_aligned8(ptr %arg) { ; default-addrspace <4 x i32> load/store at align 8; VECT check set expects one 128-bit flat op each way, SPLIT check set two 64-bit ops
 241 bb:
 242   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
 243   %gep = getelementptr inbounds i32, ptr %arg, i32 %lid ; i32 element index = result of workitem.id.x
 244   %load = load <4 x i32>, ptr %gep, align 8 ; 16-byte vector, only 8-byte alignment
 245   %v1 = extractelement <4 x i32> %load, i32 0
 246   %v2 = extractelement <4 x i32> %load, i32 1
 247   %v3 = extractelement <4 x i32> %load, i32 2
 248   %v4 = extractelement <4 x i32> %load, i32 3
 249   %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0 ; poison base is fine: all four lanes are overwritten
 250   %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
 251   %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
 252   %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3 ; %v8 = <elt3, elt2, elt1, elt0>, i.e. lanes reversed
 253   store <4 x i32> %v8, ptr %gep, align 8 ; stored back to the same address with the same alignment
 254   ret void
 255 }
 256
 257 declare i32 @llvm.amdgcn.workitem.id.x() ; intrinsic called by each kernel shown above; its i32 result is used as the GEP element index