llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll

   1 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
   2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
   3 ; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
   4
   5 ; GCN-LABEL: {{^}}float4_alloca_store4:
   6 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4
   7
   8 ; GCN-NOT: buffer_
   9 ; GCN: v_cndmask_b32
  10 ; GCN: v_cndmask_b32
  11 ; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,
  12 ; GCN: store_dword v{{.+}}, [[RES]]
  13
  14 ; OPT:  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
  15 ; OPT:  store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, <4 x float> addrspace(5)* %alloca, align 4
  16 ; OPT:  %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca
  17 ; OPT:  %1 = extractelement <4 x float> %0, i32 %sel2
  18 ; OPT:  store float %1, float addrspace(1)* %out, align 4
  19
  20 define amdgpu_kernel void @float4_alloca_store4(float addrspace(1)* %out, float addrspace(3)* %dummy_lds) { ; Writes a constant <4 x float> to an addrspace(5) alloca, reads back one dynamically indexed element and stores it to %out. %dummy_lds is not referenced in the body.
  21 entry:
  22   %alloca = alloca <4 x float>, align 16, addrspace(5)
  23   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
  24   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
  25   %c1 = icmp uge i32 %x, 3
  26   %c2 = icmp uge i32 %y, 3
  27   %sel1 = select i1 %c1, i32 1, i32 2
  28   %sel2 = select i1 %c2, i32 0, i32 %sel1 ; element index: 0 if %y >= 3 (unsigned), else 1 if %x >= 3, else 2
  29   %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2 ; pointer to element %sel2 of the vector alloca
  30   store <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x float> addrspace(5)* %alloca, align 4 ; whole-vector write
  31   %load = load float, float addrspace(5)* %gep, align 4 ; scalar read through the element pointer; the opt-run checks before this function expect it rewritten as a vector load plus extractelement
  32   store float %load, float addrspace(1)* %out, align 4
  33   ret void
  34 }
  35
  36 ; GCN-LABEL: {{^}}float4_alloca_load4:
  37 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_load4
  38
  39 ; GCN-NOT: v_movrel
  40 ; GCN-NOT: buffer_
  41 ; GCN-NOT: v_cmp_
  42 ; GCN-NOT: v_cndmask_
  43 ; GCN:     v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
  44 ; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
  45 ; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
  46 ; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
  47 ; GCN:     store_dwordx4 v{{.+}},
  48
  49 ; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
  50 ; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca
  51 ; OPT: %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 %sel2
  52 ; OPT: store <4 x float> %1, <4 x float> addrspace(5)* %alloca
  53 ; OPT: %load = load <4 x float>, <4 x float> addrspace(5)* %alloca, align 4
  54 ; OPT:  store <4 x float> %load, <4 x float> addrspace(1)* %out, align 4
  55
  56 define amdgpu_kernel void @float4_alloca_load4(<4 x float> addrspace(1)* %out, float addrspace(3)* %dummy_lds) { ; Writes 1.0 into one dynamically indexed element of a <4 x float> addrspace(5) alloca, then loads the whole vector and stores it to %out. %dummy_lds is not referenced in the body.
  57 entry:
  58   %alloca = alloca <4 x float>, align 16, addrspace(5)
  59   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
  60   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
  61   %c1 = icmp uge i32 %x, 3
  62   %c2 = icmp uge i32 %y, 3
  63   %sel1 = select i1 %c1, i32 1, i32 2
  64   %sel2 = select i1 %c2, i32 0, i32 %sel1 ; element index: 0 if %y >= 3 (unsigned), else 1 if %x >= 3, else 2
  65   %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2 ; pointer to element %sel2 of the vector alloca
  66   store float 1.0, float addrspace(5)* %gep, align 4 ; only store to the alloca in this function; the opt-run checks before this function expect a vector load, insertelement and vector store in its place
  67   %load = load <4 x float>, <4 x float> addrspace(5)* %alloca, align 4 ; whole-vector read
  68   store <4 x float> %load, <4 x float> addrspace(1)* %out, align 4
  69   ret void
  70 }
  71
  72 ; GCN-LABEL: {{^}}half4_alloca_store4:
  73 ; OPT-LABEL: define amdgpu_kernel void @half4_alloca_store4
  74
  75 ; GCN-NOT: buffer_
  76 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200
  77 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
  78 ; GCN:     v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]]
  79
  80 ; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
  81 ; OPT: store <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, <4 x half> addrspace(5)* %alloca, align 2
  82 ; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca
  83 ; OPT: %1 = extractelement <4 x half> %0, i32 %sel2
  84 ; OPT: store half %1, half addrspace(1)* %out, align 2
  85
  86 define amdgpu_kernel void @half4_alloca_store4(half addrspace(1)* %out, half addrspace(3)* %dummy_lds) { ; Writes a constant <4 x half> to an addrspace(5) alloca, reads back one dynamically indexed element and stores it to %out. %dummy_lds is not referenced in the body.
  87 entry:
  88   %alloca = alloca <4 x half>, align 16, addrspace(5)
  89   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
  90   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
  91   %c1 = icmp uge i32 %x, 3
  92   %c2 = icmp uge i32 %y, 3
  93   %sel1 = select i1 %c1, i32 1, i32 2
  94   %sel2 = select i1 %c2, i32 0, i32 %sel1 ; element index: 0 if %y >= 3 (unsigned), else 1 if %x >= 3, else 2
  95   %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2 ; pointer to element %sel2 of the vector alloca
  96   store <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, <4 x half> addrspace(5)* %alloca, align 2 ; whole-vector write
  97   %load = load half, half addrspace(5)* %gep, align 2 ; scalar read through the element pointer; the opt-run checks before this function expect it rewritten as a vector load plus extractelement
  98   store half %load, half addrspace(1)* %out, align 2
  99   ret void
 100 }
 101
 102 ; GCN-LABEL: {{^}}half4_alloca_load4:
 103 ; OPT-LABEL: define amdgpu_kernel void @half4_alloca_load4
 104
 105 ; GCN-NOT: buffer_
 106 ; GCN:     s_mov_b64 s[{{[0-9:]+}}], 0xffff
 107
 108 ; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
 109 ; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca
 110 ; OPT: %1 = insertelement <4 x half> %0, half 0xH3C00, i32 %sel2
 111 ; OPT: store <4 x half> %1, <4 x half> addrspace(5)* %alloca
 112 ; OPT: %load = load <4 x half>, <4 x half> addrspace(5)* %alloca, align 2
 113 ; OPT: store <4 x half> %load, <4 x half> addrspace(1)* %out, align 2
 114
 115 define amdgpu_kernel void @half4_alloca_load4(<4 x half> addrspace(1)* %out, half addrspace(3)* %dummy_lds) { ; Writes 1.0 into one dynamically indexed element of a <4 x half> addrspace(5) alloca, then loads the whole vector and stores it to %out. %dummy_lds is not referenced in the body.
 116 entry:
 117   %alloca = alloca <4 x half>, align 16, addrspace(5)
 118   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
 119   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
 120   %c1 = icmp uge i32 %x, 3
 121   %c2 = icmp uge i32 %y, 3
 122   %sel1 = select i1 %c1, i32 1, i32 2
 123   %sel2 = select i1 %c2, i32 0, i32 %sel1 ; element index: 0 if %y >= 3 (unsigned), else 1 if %x >= 3, else 2
 124   %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2 ; pointer to element %sel2 of the vector alloca
 125   store half 1.0, half addrspace(5)* %gep, align 2 ; align 2, not 4: element 1 sits at byte offset 2 from the 16-byte-aligned alloca, so only 2-byte alignment holds for every index
 126   %load = load <4 x half>, <4 x half> addrspace(5)* %alloca, align 2 ; whole-vector read
 127   store <4 x half> %load, <4 x half> addrspace(1)* %out, align 2
 128   ret void
 129 }
 130
 131 ; GCN-LABEL: {{^}}short4_alloca_store4:
 132 ; OPT-LABEL: define amdgpu_kernel void @short4_alloca_store4
 133
 134 ; GCN-NOT: buffer_
 135 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x40003
 136 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
 137 ; GCN:     v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]]
 138
 139 ; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
 140 ; OPT: store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> addrspace(5)* %alloca, align 2
 141 ; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca
 142 ; OPT: %1 = extractelement <4 x i16> %0, i32 %sel2
 143 ; OPT: store i16 %1, i16 addrspace(1)* %out, align 2
 144
 145 define amdgpu_kernel void @short4_alloca_store4(i16 addrspace(1)* %out, i16 addrspace(3)* %dummy_lds) { ; Writes a constant <4 x i16> to an addrspace(5) alloca, reads back one dynamically indexed element and stores it to %out. %dummy_lds is not referenced in the body.
 146 entry:
 147   %alloca = alloca <4 x i16>, align 16, addrspace(5)
 148   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
 149   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
 150   %c1 = icmp uge i32 %x, 3
 151   %c2 = icmp uge i32 %y, 3
 152   %sel1 = select i1 %c1, i32 1, i32 2
 153   %sel2 = select i1 %c2, i32 0, i32 %sel1 ; element index: 0 if %y >= 3 (unsigned), else 1 if %x >= 3, else 2
 154   %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2 ; pointer to element %sel2 of the vector alloca
 155   store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> addrspace(5)* %alloca, align 2 ; whole-vector write
 156   %load = load i16, i16 addrspace(5)* %gep, align 2 ; scalar read through the element pointer; the opt-run checks before this function expect it rewritten as a vector load plus extractelement
 157   store i16 %load, i16 addrspace(1)* %out, align 2
 158   ret void
 159 }
 160
 161 ; GCN-LABEL: {{^}}short4_alloca_load4:
 162 ; OPT-LABEL: define amdgpu_kernel void @short4_alloca_load4
 163
 164 ; GCN-NOT: buffer_
 165 ; GCN:     s_mov_b64 s[{{[0-9:]+}}], 0xffff
 166
 167 ; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
 168 ; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca
 169 ; OPT: %1 = insertelement <4 x i16> %0, i16 1, i32 %sel2
 170 ; OPT: store <4 x i16> %1, <4 x i16> addrspace(5)* %alloca
 171 ; OPT: %load = load <4 x i16>, <4 x i16> addrspace(5)* %alloca, align 2
 172 ; OPT: store <4 x i16> %load, <4 x i16> addrspace(1)* %out, align 2
 173
 174 define amdgpu_kernel void @short4_alloca_load4(<4 x i16> addrspace(1)* %out, i16 addrspace(3)* %dummy_lds) { ; Writes 1 into one dynamically indexed element of a <4 x i16> addrspace(5) alloca, then loads the whole vector and stores it to %out. %dummy_lds is not referenced in the body.
 175 entry:
 176   %alloca = alloca <4 x i16>, align 16, addrspace(5)
 177   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
 178   %y = tail call i32 @llvm.amdgcn.workitem.id.y()
 179   %c1 = icmp uge i32 %x, 3
 180   %c2 = icmp uge i32 %y, 3
 181   %sel1 = select i1 %c1, i32 1, i32 2
 182   %sel2 = select i1 %c2, i32 0, i32 %sel1 ; element index: 0 if %y >= 3 (unsigned), else 1 if %x >= 3, else 2
 183   %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2 ; pointer to element %sel2 of the vector alloca
 184   store i16 1, i16 addrspace(5)* %gep, align 2 ; align 2, not 4: element 1 sits at byte offset 2 from the 16-byte-aligned alloca, so only 2-byte alignment holds for every index
 185   %load = load <4 x i16>, <4 x i16> addrspace(5)* %alloca, align 2 ; whole-vector read
 186   store <4 x i16> %load, <4 x i16> addrspace(1)* %out, align 2
 187   ret void
 188 }
 189
 190 ; GCN-LABEL: {{^}}ptr_alloca_bitcast:
 191 ; OPT-LABEL: define i64 @ptr_alloca_bitcast
 192
 193 ; GCN-NOT: buffer_
 194 ; GCN: v_mov_b32_e32 v1, 0
 195
 196 ; OPT: %private_iptr = alloca <2 x i32>, align 8, addrspace(5)
 197 ; OPT: %cast = bitcast <2 x i32> addrspace(5)* %private_iptr to i64 addrspace(5)*
 198 ; OPT: %tmp1 = load i64, i64 addrspace(5)* %cast, align 8
 199
 200 define i64 @ptr_alloca_bitcast() { ; Non-kernel function that returns an i64 loaded through a bitcast of a <2 x i32> addrspace(5) alloca; nothing is stored to the alloca first.
 201 entry:
 202   %private_iptr = alloca <2 x i32>, align 8, addrspace(5)
 203   %cast = bitcast <2 x i32> addrspace(5)* %private_iptr to i64 addrspace(5)* ; reinterpret the vector pointer as an i64 pointer
 204   %tmp1 = load i64, i64 addrspace(5)* %cast, align 8 ; the opt-run checks before this function expect the alloca, bitcast and load to appear unchanged
 205   ret i64 %tmp1
 206 }
 207
 208 declare i32 @llvm.amdgcn.workitem.id.x() ; intrinsic called by the kernels in this file to produce %x
 209 declare i32 @llvm.amdgcn.workitem.id.y() ; intrinsic called by the kernels in this file to produce %y