1 ; RUN: llc -amdgpu-load-store-vectorizer=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; Declarations of the intrinsics exercised by the tests in this file: the two
; permlane intrinsics (four i32 operands followed by two i1 flags) and the
; workitem id queries used to produce per-lane (VGPR) values.
; NOTE(review): the permlane declarations reference attribute group #1; no
; visible line references group #0 - confirm whether #0 was intended here.
3 declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #1
4 declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #1
5 declare i32 @llvm.amdgcn.workitem.id.x()
6 declare i32 @llvm.amdgcn.workitem.id.y()
; permlane16 whose third and fourth operands are the kernel arguments %src1
; and %src2, with both i1 flags clear. The checks expect SGPRs in those two
; operand positions and no v_readfirstlane_b32 ahead of the instruction.
8 ; GCN-LABEL: {{^}}v_permlane16_b32_vss:
9 ; GFX10-NOT: v_readfirstlane_b32
10 ; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
11 define amdgpu_kernel void @v_permlane16_b32_vss(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
12 %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 0)
13 store i32 %v, i32 addrspace(1)* %out
; permlane16 whose third and fourth operands are the constants 1 and 2. The
; checks expect both values to appear directly as immediates in the
; instruction, with no v_readfirstlane_b32 ahead of it.
17 ; GCN-LABEL: {{^}}v_permlane16_b32_vii:
18 ; GFX10-NOT: v_readfirstlane_b32
19 ; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2{{$}}
20 define amdgpu_kernel void @v_permlane16_b32_vii(i32 addrspace(1)* %out, i32 %src0) #1 {
21 %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 1, i32 2, i1 0, i1 0)
22 store i32 %v, i32 addrspace(1)* %out
; permlane16 whose third and fourth operands are the constants 4660 (0x1234)
; and 49617 (0xc1d1). The checks currently expect each constant to be moved
; into an SGPR first (s_movk_i32 / s_mov_b32, in either order) and those
; SGPRs to be used as the operands; the FIXME below records that using both
; values directly as literals would be allowed.
26 ; GCN-LABEL: {{^}}v_permlane16_b32_vll:
27 ; FIXME-GFX10: It is allowed to have both immediates as literals
28 ; GFX10-DAG: s_movk_i32 [[SRC1:s[0-9]+]], 0x1234
29 ; GFX10-DAG: s_mov_b32 [[SRC2:s[0-9]+]], 0xc1d1
30 ; GFX10-NOT: v_readfirstlane_b32
31 ; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}}
32 define amdgpu_kernel void @v_permlane16_b32_vll(i32 addrspace(1)* %out, i32 %src0) #1 {
33 %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 0, i1 0)
34 store i32 %v, i32 addrspace(1)* %out
; permlane16 whose third and fourth operands are the workitem ids (x and y).
; The checks expect v0 and v1 to be copied into SGPRs with
; v_readfirstlane_b32 (in either order) and those SGPRs to feed the third and
; fourth operands of the instruction.
38 ; GCN-LABEL: {{^}}v_permlane16_b32_vvv:
39 ; GFX10-DAG: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0
40 ; GFX10-DAG: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1
41 ; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}}
42 define amdgpu_kernel void @v_permlane16_b32_vvv(i32 addrspace(1)* %out, i32 %src0) #1 {
43 %tidx = call i32 @llvm.amdgcn.workitem.id.x()
44 %tidy = call i32 @llvm.amdgcn.workitem.id.y()
45 %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 0, i1 0)
46 store i32 %v, i32 addrspace(1)* %out
; permlane16 with workitem id x as the third operand and the kernel argument
; %src2 as the fourth. The checks expect a single v_readfirstlane_b32 (from
; v0) before the instruction; its SGPR result is the third operand and the
; fourth operand is some other SGPR.
50 ; GCN-LABEL: {{^}}v_permlane16_b32_vvs:
51 ; GFX10-NOT: v_readfirstlane_b32
52 ; GFX10: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0
53 ; GFX10-NOT: v_readfirstlane_b32
54 ; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], s{{[0-9]+}}{{$}}
55 define amdgpu_kernel void @v_permlane16_b32_vvs(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #1 {
56 %tidx = call i32 @llvm.amdgcn.workitem.id.x()
57 %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 0, i1 0)
58 store i32 %v, i32 addrspace(1)* %out
; permlane16 with the kernel argument %src1 as the third operand and workitem
; id y as the fourth. The checks expect a single v_readfirstlane_b32 (from
; v1) before the instruction; its SGPR result is the fourth operand and the
; third operand is some other SGPR.
62 ; GCN-LABEL: {{^}}v_permlane16_b32_vsv:
63 ; GFX10-NOT: v_readfirstlane_b32
64 ; GFX10: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1
65 ; GFX10-NOT: v_readfirstlane_b32
66 ; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, [[SRC2]]{{$}}
67 define amdgpu_kernel void @v_permlane16_b32_vsv(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
68 %tidy = call i32 @llvm.amdgcn.workitem.id.y()
69 %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 0, i1 0)
70 store i32 %v, i32 addrspace(1)* %out
; Same operand shape as the plain SGPR/SGPR permlane16 case, but the fifth
; intrinsic operand is i1 1 and the sixth is i1 0. The checks expect the
; instruction to carry op_sel:[1,0] and no v_readfirstlane_b32 before it.
; (Presumably the "fi" flag, going by the function name - confirm.)
74 ; GCN-LABEL: {{^}}v_permlane16_b32_vss_fi:
75 ; GFX10-NOT: v_readfirstlane_b32
76 ; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}}
77 define amdgpu_kernel void @v_permlane16_b32_vss_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
78 %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 0)
79 store i32 %v, i32 addrspace(1)* %out
; Same operand shape as the plain SGPR/SGPR permlane16 case, but the fifth
; intrinsic operand is i1 0 and the sixth is i1 1. The checks expect the
; instruction to carry op_sel:[0,1] and no v_readfirstlane_b32 before it.
; (Presumably the "bc" flag, going by the function name - confirm.)
83 ; GCN-LABEL: {{^}}v_permlane16_b32_vss_bc:
84 ; GFX10-NOT: v_readfirstlane_b32
85 ; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}}
86 define amdgpu_kernel void @v_permlane16_b32_vss_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
87 %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 1)
88 store i32 %v, i32 addrspace(1)* %out
; Same operand shape as the plain SGPR/SGPR permlane16 case, with both
; trailing i1 operands set to 1. The checks expect the instruction to carry
; op_sel:[1,1] and no v_readfirstlane_b32 before it.
92 ; GCN-LABEL: {{^}}v_permlane16_b32_vss_fi_bc:
93 ; GFX10-NOT: v_readfirstlane_b32
94 ; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}}
95 define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
96 %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 1)
97 store i32 %v, i32 addrspace(1)* %out
; permlanex16 whose third and fourth operands are the kernel arguments %src1
; and %src2, with both i1 flags clear. The checks expect SGPRs in those two
; operand positions and no v_readfirstlane_b32 ahead of the instruction.
101 ; GCN-LABEL: {{^}}v_permlanex16_b32_vss:
102 ; GFX10-NOT: v_readfirstlane_b32
103 ; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
104 define amdgpu_kernel void @v_permlanex16_b32_vss(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
105 %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 0)
106 store i32 %v, i32 addrspace(1)* %out
; permlanex16 whose third and fourth operands are the constants 1 and 2. The
; checks expect both values to appear directly as immediates in the
; instruction, with no v_readfirstlane_b32 ahead of it.
110 ; GCN-LABEL: {{^}}v_permlanex16_b32_vii:
111 ; GFX10-NOT: v_readfirstlane_b32
112 ; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2{{$}}
113 define amdgpu_kernel void @v_permlanex16_b32_vii(i32 addrspace(1)* %out, i32 %src0) #1 {
114 %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 1, i32 2, i1 0, i1 0)
115 store i32 %v, i32 addrspace(1)* %out
; permlanex16 whose third and fourth operands are the constants 4660 (0x1234)
; and 49617 (0xc1d1). The checks currently expect each constant to be moved
; into an SGPR first (s_movk_i32 / s_mov_b32, in either order) and those
; SGPRs to be used as the operands; the FIXME below records that using both
; values directly as literals would be allowed.
119 ; GCN-LABEL: {{^}}v_permlanex16_b32_vll:
120 ; FIXME-GFX10: It is allowed to have both immediates as literals
121 ; GFX10-DAG: s_movk_i32 [[SRC1:s[0-9]+]], 0x1234
122 ; GFX10-DAG: s_mov_b32 [[SRC2:s[0-9]+]], 0xc1d1
123 ; GFX10-NOT: v_readfirstlane_b32
124 ; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}}
125 define amdgpu_kernel void @v_permlanex16_b32_vll(i32 addrspace(1)* %out, i32 %src0) #1 {
126 %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 0, i1 0)
127 store i32 %v, i32 addrspace(1)* %out
; permlanex16 whose third and fourth operands are the workitem ids (x and y).
; The checks expect v0 and v1 to be copied into SGPRs with
; v_readfirstlane_b32 (in either order) and those SGPRs to feed the third and
; fourth operands of the instruction.
131 ; GCN-LABEL: {{^}}v_permlanex16_b32_vvv:
132 ; GFX10-DAG: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0
133 ; GFX10-DAG: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1
134 ; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}}
135 define amdgpu_kernel void @v_permlanex16_b32_vvv(i32 addrspace(1)* %out, i32 %src0) #1 {
136 %tidx = call i32 @llvm.amdgcn.workitem.id.x()
137 %tidy = call i32 @llvm.amdgcn.workitem.id.y()
138 %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 0, i1 0)
139 store i32 %v, i32 addrspace(1)* %out
; permlanex16 with workitem id x as the third operand and the kernel argument
; %src2 as the fourth. The checks expect a single v_readfirstlane_b32 (from
; v0) before the instruction; its SGPR result is the third operand and the
; fourth operand is some other SGPR.
143 ; GCN-LABEL: {{^}}v_permlanex16_b32_vvs:
144 ; GFX10-NOT: v_readfirstlane_b32
145 ; GFX10: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0
146 ; GFX10-NOT: v_readfirstlane_b32
147 ; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], s{{[0-9]+}}{{$}}
148 define amdgpu_kernel void @v_permlanex16_b32_vvs(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #1 {
149 %tidx = call i32 @llvm.amdgcn.workitem.id.x()
150 %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 0, i1 0)
151 store i32 %v, i32 addrspace(1)* %out
; permlanex16 with the kernel argument %src1 as the third operand and
; workitem id y as the fourth. The checks expect a single v_readfirstlane_b32
; (from v1) before the instruction; its SGPR result is the fourth operand and
; the third operand is some other SGPR.
155 ; GCN-LABEL: {{^}}v_permlanex16_b32_vsv:
156 ; GFX10-NOT: v_readfirstlane_b32
157 ; GFX10: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1
158 ; GFX10-NOT: v_readfirstlane_b32
159 ; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, [[SRC2]]{{$}}
160 define amdgpu_kernel void @v_permlanex16_b32_vsv(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
161 %tidy = call i32 @llvm.amdgcn.workitem.id.y()
162 %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 0, i1 0)
163 store i32 %v, i32 addrspace(1)* %out
; Same operand shape as the plain SGPR/SGPR permlanex16 case, but the fifth
; intrinsic operand is i1 1 and the sixth is i1 0. The checks expect the
; instruction to carry op_sel:[1,0] and no v_readfirstlane_b32 before it.
; (Presumably the "fi" flag, going by the function name - confirm.)
167 ; GCN-LABEL: {{^}}v_permlanex16_b32_vss_fi:
168 ; GFX10-NOT: v_readfirstlane_b32
169 ; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}}
170 define amdgpu_kernel void @v_permlanex16_b32_vss_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
171 %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 0)
172 store i32 %v, i32 addrspace(1)* %out
; Same operand shape as the plain SGPR/SGPR permlanex16 case, but the fifth
; intrinsic operand is i1 0 and the sixth is i1 1. The checks expect the
; instruction to carry op_sel:[0,1] and no v_readfirstlane_b32 before it.
; (Presumably the "bc" flag, going by the function name - confirm.)
176 ; GCN-LABEL: {{^}}v_permlanex16_b32_vss_bc:
177 ; GFX10-NOT: v_readfirstlane_b32
178 ; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}}
179 define amdgpu_kernel void @v_permlanex16_b32_vss_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
180 %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 1)
181 store i32 %v, i32 addrspace(1)* %out
; Same operand shape as the plain SGPR/SGPR permlanex16 case, with both
; trailing i1 operands set to 1. The checks expect the instruction to carry
; op_sel:[1,1] and no v_readfirstlane_b32 before it.
185 ; GCN-LABEL: {{^}}v_permlanex16_b32_vss_fi_bc:
186 ; GFX10-NOT: v_readfirstlane_b32
187 ; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}}
188 define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
189 %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 1)
190 store i32 %v, i32 addrspace(1)* %out
; permlane16 with workitem id x passed as both the first and the second
; intrinsic operand. The check expects v0 as both the destination and the
; VGPR source of the instruction, with SGPRs for the remaining two operands.
194 ; GCN-LABEL: {{^}}v_permlane16_b32_tid_tid:
195 ; GFX10: v_permlane16_b32 v0, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
196 define amdgpu_kernel void @v_permlane16_b32_tid_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
197 %tidx = call i32 @llvm.amdgcn.workitem.id.x()
198 %v = call i32 @llvm.amdgcn.permlane16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
199 store i32 %v, i32 addrspace(1)* %out
; permlane16 with undef as the first intrinsic operand and workitem id x as
; the second. The check expects v0 as the VGPR source and accepts any VGPR as
; the destination.
203 ; GCN-LABEL: {{^}}v_permlane16_b32_undef_tid:
204 ; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
205 define amdgpu_kernel void @v_permlane16_b32_undef_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
206 %tidx = call i32 @llvm.amdgcn.workitem.id.x()
207 %v = call i32 @llvm.amdgcn.permlane16(i32 undef, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
208 store i32 %v, i32 addrspace(1)* %out
; permlane16 with the constant 12345 (0x3039) as the first intrinsic operand
; and workitem id x as the second. The checks expect the constant to be moved
; into a VGPR and that same VGPR to be the destination register of the
; instruction, with v0 as the VGPR source.
212 ; GCN-LABEL: {{^}}v_permlane16_b32_i_tid:
213 ; GFX10: v_mov_b32_e32 [[OLD:v[0-9]+]], 0x3039
214 ; GFX10: v_permlane16_b32 [[OLD]], v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
215 define amdgpu_kernel void @v_permlane16_b32_i_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
216 %tidx = call i32 @llvm.amdgcn.workitem.id.x()
217 %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
218 store i32 %v, i32 addrspace(1)* %out
; permlane16 with the constant 12345 as the first intrinsic operand, workitem
; id x as the second, and the fifth operand set to i1 1. The visible check
; pins v0 as the VGPR source and op_sel:[1,0]; any VGPR is accepted as the
; destination.
222 ; GCN-LABEL: {{^}}v_permlane16_b32_i_tid_fi:
224 ; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}}
225 define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
226 %tidx = call i32 @llvm.amdgcn.workitem.id.x()
227 %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 0)
228 store i32 %v, i32 addrspace(1)* %out
; permlane16 with the constant 12345 as the first intrinsic operand, workitem
; id x as the second, and the sixth operand set to i1 1. The visible check
; pins v0 as the VGPR source and op_sel:[0,1]; any VGPR is accepted as the
; destination.
232 ; GCN-LABEL: {{^}}v_permlane16_b32_i_tid_bc:
234 ; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}}
235 define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
236 %tidx = call i32 @llvm.amdgcn.workitem.id.x()
237 %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 1)
238 store i32 %v, i32 addrspace(1)* %out
; permlane16 with the constant 12345 as the first intrinsic operand, workitem
; id x as the second, and both trailing i1 operands set to 1. The visible
; check pins v0 as the VGPR source and op_sel:[1,1]; any VGPR is accepted as
; the destination.
242 ; GCN-LABEL: {{^}}v_permlane16_b32_i_tid_fi_bc:
244 ; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}}
245 define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
246 %tidx = call i32 @llvm.amdgcn.workitem.id.x()
247 %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 1)
248 store i32 %v, i32 addrspace(1)* %out
; permlanex16 with workitem id x passed as both the first and the second
; intrinsic operand. The check expects v0 as both the destination and the
; VGPR source of the instruction, with SGPRs for the remaining two operands.
252 ; GCN-LABEL: {{^}}v_permlanex16_b32_tid_tid:
253 ; GFX10: v_permlanex16_b32 v0, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
254 define amdgpu_kernel void @v_permlanex16_b32_tid_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
255 %tidx = call i32 @llvm.amdgcn.workitem.id.x()
256 %v = call i32 @llvm.amdgcn.permlanex16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
257 store i32 %v, i32 addrspace(1)* %out
; permlanex16 with undef as the first intrinsic operand and workitem id x as
; the second. The check expects v0 as the VGPR source and accepts any VGPR as
; the destination.
261 ; GCN-LABEL: {{^}}v_permlanex16_b32_undef_tid:
262 ; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
263 define amdgpu_kernel void @v_permlanex16_b32_undef_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
264 %tidx = call i32 @llvm.amdgcn.workitem.id.x()
265 %v = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
266 store i32 %v, i32 addrspace(1)* %out
; permlanex16 with the constant 12345 (0x3039) as the first intrinsic operand
; and workitem id x as the second. The checks expect the constant to be moved
; into a VGPR and that same VGPR to be the destination register of the
; instruction, with v0 as the VGPR source.
270 ; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid:
271 ; GFX10: v_mov_b32_e32 [[OLD:v[0-9]+]], 0x3039
272 ; GFX10: v_permlanex16_b32 [[OLD]], v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
273 define amdgpu_kernel void @v_permlanex16_b32_i_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
274 %tidx = call i32 @llvm.amdgcn.workitem.id.x()
275 %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
276 store i32 %v, i32 addrspace(1)* %out
; permlanex16 with the constant 12345 as the first intrinsic operand,
; workitem id x as the second, and the fifth operand set to i1 1. The visible
; check pins v0 as the VGPR source and op_sel:[1,0]; any VGPR is accepted as
; the destination.
280 ; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid_fi:
282 ; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}}
283 define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
284 %tidx = call i32 @llvm.amdgcn.workitem.id.x()
285 %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 0)
286 store i32 %v, i32 addrspace(1)* %out
; permlanex16 with the constant 12345 as the first intrinsic operand,
; workitem id x as the second, and the sixth operand set to i1 1. The visible
; check pins v0 as the VGPR source and op_sel:[0,1]; any VGPR is accepted as
; the destination.
290 ; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid_bc:
292 ; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}}
293 define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
294 %tidx = call i32 @llvm.amdgcn.workitem.id.x()
295 %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 1)
296 store i32 %v, i32 addrspace(1)* %out
; permlanex16 with the constant 12345 as the first intrinsic operand,
; workitem id x as the second, and both trailing i1 operands set to 1. The
; visible check pins v0 as the VGPR source and op_sel:[1,1]; any VGPR is
; accepted as the destination.
300 ; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid_fi_bc:
302 ; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}}
303 define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
304 %tidx = call i32 @llvm.amdgcn.workitem.id.x()
305 %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 1)
306 store i32 %v, i32 addrspace(1)* %out
; Attribute groups. Group #1 is the one attached to every kernel definition
; and to the two permlane intrinsic declarations in this file.
; NOTE(review): nothing visible in this file references group #0 - confirm
; whether the permlane declarations were meant to use it instead of #1.
310 attributes #0 = { nounwind readnone convergent }
311 attributes #1 = { nounwind }