llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll

   1 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
   2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
   3 ; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
   4
   5 ; GCN-LABEL: {{^}}float4_alloca_store4:
   6 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4
   7
   8 ; GCN-NOT: buffer_
   9 ; GCN: v_cndmask_b32
  10 ; GCN: v_cndmask_b32
  11 ; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,
  12 ; GCN: store_dword v{{.+}}, [[RES]]
  13
  14 ; OPT:  %0 = extractelement <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, i32 %sel2
  15 ; OPT:  store float %0, ptr addrspace(1) %out, align 4
  16
  17 define amdgpu_kernel void @float4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) { ; Writes a constant <4 x float> to an addrspace(5) alloca, reloads one dynamically indexed element and stores it to %out; %dummy_lds is not referenced in the body.
  18 entry:
  19   %alloca = alloca <4 x float>, align 16, addrspace(5) ; vector-typed stack slot under test
  20   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
  21   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
  22   %c1 = icmp uge i32 %x, 3
  23   %c2 = icmp uge i32 %y, 3
  24   %sel1 = select i1 %c1, i32 1, i32 2
  25   %sel2 = select i1 %c2, i32 0, i32 %sel1 ; element index: always 0, 1 or 2, chosen from the two workitem-id results
  26   %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 ; address of element %sel2 inside the alloca
  27   store <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, ptr addrspace(5) %alloca, align 4 ; whole-vector write of constants
  28   %load = load float, ptr addrspace(5) %gep, align 4 ; scalar read of the selected element
  29   store float %load, ptr addrspace(1) %out, align 4
  30   ret void
  31 }
  32
  33 ; GCN-LABEL: {{^}}float4_alloca_load4:
  34 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_load4
  35
  36 ; GCN-NOT: v_movrel
  37 ; GCN-NOT: buffer_
  38 ; GCN-NOT: v_cmp_
  39 ; GCN-NOT: v_cndmask_
  40 ; GCN:     v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
  41 ; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
  42 ; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
  43 ; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
  44 ; GCN:     store_dwordx4 v{{.+}},
  45
  46 ; OPT: %0 = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel2
  47 ; OPT: store <4 x float> %0, ptr addrspace(1) %out, align 4
  48
  49 define amdgpu_kernel void @float4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) { ; Writes 1.0 to one dynamically indexed element of an otherwise uninitialized <4 x float> alloca, then reloads the whole vector and stores it to %out; %dummy_lds is not referenced in the body.
  50 entry:
  51   %alloca = alloca <4 x float>, align 16, addrspace(5) ; vector-typed stack slot under test
  52   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
  53   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
  54   %c1 = icmp uge i32 %x, 3
  55   %c2 = icmp uge i32 %y, 3
  56   %sel1 = select i1 %c1, i32 1, i32 2
  57   %sel2 = select i1 %c2, i32 0, i32 %sel1 ; element index: always 0, 1 or 2, chosen from the two workitem-id results
  58   %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 ; address of element %sel2 inside the alloca
  59   store float 1.0, ptr addrspace(5) %gep, align 4 ; scalar write of the selected element only
  60   %load = load <4 x float>, ptr addrspace(5) %alloca, align 4 ; whole-vector read; the other three elements were never written
  61   store <4 x float> %load, ptr addrspace(1) %out, align 4
  62   ret void
  63 }
  64
  65 ; GCN-LABEL: {{^}}half4_alloca_store4:
  66 ; OPT-LABEL: define amdgpu_kernel void @half4_alloca_store4
  67
  68 ; GCN-NOT: buffer_
  69 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200
  70 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
  71 ; GCN:     v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
  72
  73 ; OPT: %0 = extractelement <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, i32 %sel2
  74 ; OPT: store half %0, ptr addrspace(1) %out, align 2
  75
  76 define amdgpu_kernel void @half4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) { ; Writes a constant <4 x half> to an addrspace(5) alloca, reloads one dynamically indexed element and stores it to %out; %dummy_lds is not referenced in the body.
  77 entry:
  78   %alloca = alloca <4 x half>, align 16, addrspace(5) ; vector-typed stack slot under test
  79   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
  80   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
  81   %c1 = icmp uge i32 %x, 3
  82   %c2 = icmp uge i32 %y, 3
  83   %sel1 = select i1 %c1, i32 1, i32 2
  84   %sel2 = select i1 %c2, i32 0, i32 %sel1 ; element index: always 0, 1 or 2, chosen from the two workitem-id results
  85   %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 ; address of element %sel2 inside the alloca
  86   store <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, ptr addrspace(5) %alloca, align 2 ; whole-vector write of constants
  87   %load = load half, ptr addrspace(5) %gep, align 2 ; scalar read of the selected element
  88   store half %load, ptr addrspace(1) %out, align 2
  89   ret void
  90 }
  91
  92 ; GCN-LABEL: {{^}}half4_alloca_load4:
  93 ; OPT-LABEL: define amdgpu_kernel void @half4_alloca_load4
  94
  95 ; GCN-NOT: buffer_
  96 ; GCN:     s_mov_b64 s[{{[0-9:]+}}], 0xffff
  97
  98 ; OPT: %0 = insertelement <4 x half> undef, half 0xH3C00, i32 %sel2
  99 ; OPT: store <4 x half> %0, ptr addrspace(1) %out, align 2
 100
 101 define amdgpu_kernel void @half4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) { ; Writes 1.0 to one dynamically indexed element of an otherwise uninitialized <4 x half> alloca, then reloads the whole vector and stores it to %out; %dummy_lds is not referenced in the body.
 102 entry:
 103   %alloca = alloca <4 x half>, align 16, addrspace(5) ; vector-typed stack slot under test
 104   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
 105   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
 106   %c1 = icmp uge i32 %x, 3
 107   %c2 = icmp uge i32 %y, 3
 108   %sel1 = select i1 %c1, i32 1, i32 2
 109   %sel2 = select i1 %c2, i32 0, i32 %sel1 ; element index: always 0, 1 or 2, chosen from the two workitem-id results
 110   %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 ; address of element %sel2; elements are 2 bytes apart
 111   store half 1.0, ptr addrspace(5) %gep, align 2 ; align 2, not 4: with index 1 the address is only 2-byte aligned, and overstating alignment is undefined behavior
 112   %load = load <4 x half>, ptr addrspace(5) %alloca, align 2 ; whole-vector read; the other three elements were never written
 113   store <4 x half> %load, ptr addrspace(1) %out, align 2
 114   ret void
 115 }
 116
 117 ; GCN-LABEL: {{^}}short4_alloca_store4:
 118 ; OPT-LABEL: define amdgpu_kernel void @short4_alloca_store4
 119
 120 ; GCN-NOT: buffer_
 121 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x40003
 122 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
 123 ; GCN:     v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
 124
 125 ; OPT: %0 = extractelement <4 x i16> <i16 1, i16 2, i16 3, i16 4>, i32 %sel2
 126 ; OPT: store i16 %0, ptr addrspace(1) %out, align 2
 127
 128 define amdgpu_kernel void @short4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) { ; Writes a constant <4 x i16> to an addrspace(5) alloca, reloads one dynamically indexed element and stores it to %out; %dummy_lds is not referenced in the body.
 129 entry:
 130   %alloca = alloca <4 x i16>, align 16, addrspace(5) ; vector-typed stack slot under test
 131   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
 132   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
 133   %c1 = icmp uge i32 %x, 3
 134   %c2 = icmp uge i32 %y, 3
 135   %sel1 = select i1 %c1, i32 1, i32 2
 136   %sel2 = select i1 %c2, i32 0, i32 %sel1 ; element index: always 0, 1 or 2, chosen from the two workitem-id results
 137   %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 ; address of element %sel2 inside the alloca
 138   store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, ptr addrspace(5) %alloca, align 2 ; whole-vector write of constants
 139   %load = load i16, ptr addrspace(5) %gep, align 2 ; scalar read of the selected element
 140   store i16 %load, ptr addrspace(1) %out, align 2
 141   ret void
 142 }
 143
 144 ; GCN-LABEL: {{^}}short4_alloca_load4:
 145 ; OPT-LABEL: define amdgpu_kernel void @short4_alloca_load4
 146
 147 ; GCN-NOT: buffer_
 148 ; GCN:     s_mov_b64 s[{{[0-9:]+}}], 0xffff
 149
 150 ; OPT: %0 = insertelement <4 x i16> undef, i16 1, i32 %sel2
 151 ; OPT: store <4 x i16> %0, ptr addrspace(1) %out, align 2
 152
 153 define amdgpu_kernel void @short4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) { ; Writes 1 to one dynamically indexed element of an otherwise uninitialized <4 x i16> alloca, then reloads the whole vector and stores it to %out; %dummy_lds is not referenced in the body.
 154 entry:
 155   %alloca = alloca <4 x i16>, align 16, addrspace(5) ; vector-typed stack slot under test
 156   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
 157   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
 158   %c1 = icmp uge i32 %x, 3
 159   %c2 = icmp uge i32 %y, 3
 160   %sel1 = select i1 %c1, i32 1, i32 2
 161   %sel2 = select i1 %c2, i32 0, i32 %sel1 ; element index: always 0, 1 or 2, chosen from the two workitem-id results
 162   %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 ; address of element %sel2; elements are 2 bytes apart
 163   store i16 1, ptr addrspace(5) %gep, align 2 ; align 2, not 4: with index 1 the address is only 2-byte aligned, and overstating alignment is undefined behavior
 164   %load = load <4 x i16>, ptr addrspace(5) %alloca, align 2 ; whole-vector read; the other three elements were never written
 165   store <4 x i16> %load, ptr addrspace(1) %out, align 2
 166   ret void
 167 }
 168
 169 ; GCN-LABEL: {{^}}ptr_alloca_bitcast:
 170 ; OPT-LABEL: define i64 @ptr_alloca_bitcast
 171
 172 ; GCN-NOT: buffer_
 173 ; GCN: v_mov_b32_e32 v1, 0
 174
 175 ; OPT: ret i64 undef
 176
 177 define i64 @ptr_alloca_bitcast() { ; Returns an i64 loaded from a <2 x i32> alloca that is never written, so the load type differs from the allocated vector type.
 178 entry:
 179   %private_iptr = alloca <2 x i32>, align 8, addrspace(5) ; 8-byte vector slot; no store to it exists in this function
 180   %tmp1 = load i64, ptr addrspace(5) %private_iptr, align 8 ; reads all 8 bytes as a single i64
 181   ret i64 %tmp1
 182 }
 183
 184 declare i32 @llvm.amdgcn.workitem.id.x() ; intrinsic called by every kernel above to produce %x
 185 declare i32 @llvm.amdgcn.workitem.id.y() ; intrinsic called by every kernel above to produce %y