1 ; RUN: llc -mtriple=amdgcn-- -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
2 ; RUN: llc -mtriple=amdgcn-- -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
3 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
4 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
5 ; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood < %s | FileCheck --check-prefixes=EG,FUNC %s
6 ; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
7 target datalayout = "A5"
; Test case: read a constant-initialized private array at a dynamic index.
; The kernel fills a [4 x i32] alloca in addrspace(5) with 0, 1, 2, 3 and loads
; the element selected by %index. The opt-pipeline checks below expect the
; alloca to be gone, replaced by an extractelement from the constant vector.
9 ; OPT-LABEL: @vector_read(
10 ; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
11 ; OPT: store i32 %0, ptr addrspace(1) %out, align 4
13 ; FUNC-LABEL: {{^}}vector_read:
19 define amdgpu_kernel void @vector_read(ptr addrspace(1) %out, i32 %index) {
21 %tmp = alloca [4 x i32], addrspace(5)
; %y, %z, %w address elements 1, 2, 3; %tmp itself serves as element 0.
22 %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
23 %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
24 %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
25 store i32 0, ptr addrspace(5) %tmp
26 store i32 1, ptr addrspace(5) %y
27 store i32 2, ptr addrspace(5) %z
28 store i32 3, ptr addrspace(5) %w
; Dynamic-index read: this is the access the checks expect as an extractelement.
29 %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
30 %tmp2 = load i32, ptr addrspace(5) %tmp1
31 store i32 %tmp2, ptr addrspace(1) %out
; Test case: dynamic-index write followed by a dynamic-index read.
; The kernel zeroes a [4 x i32] alloca in addrspace(5), stores 1 at %w_index,
; then loads the element at %r_index. The opt-pipeline checks below expect an
; insertelement into a zero vector followed by an extractelement.
35 ; OPT-LABEL: @vector_write(
36 ; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
37 ; OPT: %1 = extractelement <4 x i32> %0, i32 %r_index
38 ; OPT: store i32 %1, ptr addrspace(1) %out, align 4
40 ; FUNC-LABEL: {{^}}vector_write:
47 define amdgpu_kernel void @vector_write(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) {
49 %tmp = alloca [4 x i32], addrspace(5)
50 %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
51 %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
52 %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
; Zero all four elements so the expected base vector is zeroinitializer.
53 store i32 0, ptr addrspace(5) %tmp
54 store i32 0, ptr addrspace(5) %y
55 store i32 0, ptr addrspace(5) %z
56 store i32 0, ptr addrspace(5) %w
; Dynamic-index write of the constant 1.
57 %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %w_index
58 store i32 1, ptr addrspace(5) %tmp1
; Dynamic-index read of the (possibly just written) element.
59 %tmp2 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %r_index
60 %tmp3 = load i32, ptr addrspace(5) %tmp2
61 store i32 %tmp3, ptr addrspace(1) %out
65 ; This test should be optimized to:
66 ; store i32 0, ptr addrspace(1) %out
; All four elements are zeroed and element 1 is read back through a second,
; constant-index GEP, so the loaded value is the constant 0.
; NOTE(review): no bitcast remains in the visible IR; the name presumably dates
; from an earlier form of this test - confirm before renaming.
68 ; OPT-LABEL: @bitcast_gep(
; The store is an instruction inside the function, not a function header, so it
; is matched with a plain check that stays between the surrounding labels.
69 ; OPT: store i32 0, ptr addrspace(1) %out, align 4
71 ; FUNC-LABEL: {{^}}bitcast_gep:
73 define amdgpu_kernel void @bitcast_gep(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) {
75 %tmp = alloca [4 x i32], addrspace(5)
76 %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
77 %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
78 %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
79 store i32 0, ptr addrspace(5) %tmp
80 store i32 0, ptr addrspace(5) %y
81 store i32 0, ptr addrspace(5) %z
82 store i32 0, ptr addrspace(5) %w
; Same address as %y, recomputed; %w_index and %r_index are not used here.
83 %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
84 %tmp4 = load i32, ptr addrspace(5) %tmp1
85 store i32 %tmp4, ptr addrspace(1) %out
; Test case: one element of the i32 array is written with a float store.
; Element 0 receives float 1.0 through the alloca pointer while elements 1-3
; receive i32 values; the element at %index is then loaded as i32. The expected
; constant vector carries 1065353216 (0x3F800000, the bit pattern of float 1.0)
; in lane 0.
89 ; OPT-LABEL: @vector_read_bitcast_gep(
90 ; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
91 ; OPT: store i32 %0, ptr addrspace(1) %out, align 4
92 define amdgpu_kernel void @vector_read_bitcast_gep(ptr addrspace(1) %out, i32 %index) {
94 %tmp = alloca [4 x i32], addrspace(5)
95 %y = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
96 %z = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
97 %w = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
; Float store into an i32 slot: the type mismatch is the point of this case.
98 store float 1.0, ptr addrspace(5) %tmp
99 store i32 1, ptr addrspace(5) %y
100 store i32 2, ptr addrspace(5) %z
101 store i32 3, ptr addrspace(5) %w
102 %tmp1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
103 %tmp2 = load i32, ptr addrspace(5) %tmp1
104 store i32 %tmp2, ptr addrspace(1) %out
; Test case: the alloca is typed [4 x i32] but every access treats it as floats.
; All GEPs use [4 x float] as their source element type and all stores and the
; load are float. The opt-pipeline checks below expect a <4 x float> constant
; vector (0.0, 1.0, 2.0, 4.0) indexed by %index and a float store to %out.
108 ; OPT-LABEL: @vector_read_bitcast_alloca(
109 ; OPT: %0 = extractelement <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, i32 %index
110 ; OPT: store float %0, ptr addrspace(1) %out, align 4
111 define amdgpu_kernel void @vector_read_bitcast_alloca(ptr addrspace(1) %out, i32 %index) {
; Allocated as i32 elements; accessed below only through float-typed GEPs.
113 %tmp = alloca [4 x i32], addrspace(5)
114 %y = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 1
115 %z = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 2
116 %w = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 3
117 store float 0.0, ptr addrspace(5) %tmp
118 store float 1.0, ptr addrspace(5) %y
119 store float 2.0, ptr addrspace(5) %z
; Note the last element is 4.0, not 3.0; the expected vector above matches.
120 store float 4.0, ptr addrspace(5) %w
121 %tmp1 = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 %index
122 %tmp2 = load float, ptr addrspace(5) %tmp1
123 store float %tmp2, ptr addrspace(1) %out
127 ; The pointer arguments in local address space should not affect promotion to vector.
; The body repeats the constant-initialized dynamic-index read: store 0, 1, 2, 3
; into a [4 x i32] alloca and load the element at %index. The only addition is
; the extra addrspace(3) pointer argument %stopper, which the body never uses.
; The expected output is an extractelement from the constant vector.
129 ; OPT-LABEL: @vector_read_with_local_arg(
130 ; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
131 ; OPT: store i32 %0, ptr addrspace(1) %out, align 4
132 define amdgpu_kernel void @vector_read_with_local_arg(ptr addrspace(3) %stopper, ptr addrspace(1) %out, i32 %index) {
134 %tmp = alloca [4 x i32], addrspace(5)
135 %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
136 %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
137 %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
138 store i32 0, ptr addrspace(5) %tmp
139 store i32 1, ptr addrspace(5) %y
140 store i32 2, ptr addrspace(5) %z
141 store i32 3, ptr addrspace(5) %w
; Dynamic-index read that the checks expect as an extractelement.
142 %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
143 %tmp2 = load i32, ptr addrspace(5) %tmp1
144 store i32 %tmp2, ptr addrspace(1) %out