1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -o - %s | FileCheck -check-prefix=GFX7 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
6 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefix=GFX10 %s
; s_bswap_i32: @llvm.bswap.i32 applied to an i32 that an amdgpu_ps function
; receives as an inreg argument.
; Expected code per check prefix:
;   - GFX7 checks: two v_alignbit_b32 (by 8 and by 24) merged by v_bfi_b32
;     with the mask 0xff00ff held in s0.
;   - GFX8 and GFX9 checks: the input is copied to v0, the selector 0x10203 is
;     materialized in s0, then a single v_perm_b32 is used.
;   - GFX10 checks: a single v_perm_b32 taking s0 and the literal 0x10203.
; Every variant moves the result back into s0 with v_readfirstlane_b32.
8 define amdgpu_ps i32 @s_bswap_i32(i32 inreg %src) {
9 ; GFX7-LABEL: s_bswap_i32:
11 ; GFX7-NEXT: v_alignbit_b32 v0, s0, s0, 8
12 ; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 24
13 ; GFX7-NEXT: s_mov_b32 s0, 0xff00ff
14 ; GFX7-NEXT: v_bfi_b32 v0, s0, v1, v0
15 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
16 ; GFX7-NEXT: ; return to shader part epilog
18 ; GFX8-LABEL: s_bswap_i32:
20 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
21 ; GFX8-NEXT: s_mov_b32 s0, 0x10203
22 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s0
23 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
24 ; GFX8-NEXT: ; return to shader part epilog
26 ; GFX9-LABEL: s_bswap_i32:
28 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
29 ; GFX9-NEXT: s_mov_b32 s0, 0x10203
30 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0
31 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
32 ; GFX9-NEXT: ; return to shader part epilog
34 ; GFX10-LABEL: s_bswap_i32:
36 ; GFX10-NEXT: v_perm_b32 v0, 0, s0, 0x10203
37 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
38 ; GFX10-NEXT: ; return to shader part epilog
39 %bswap = call i32 @llvm.bswap.i32(i32 %src)
; v_bswap_i32: @llvm.bswap.i32 in a function with no explicit calling
; convention; the checks show the value in v0 on entry and on exit.
; Expected code per check prefix:
;   - GFX7 checks: two v_alignbit_b32 (by 8 and by 24) merged by v_bfi_b32
;     with the mask 0xff00ff in s4.
;   - GFX8 and GFX9 checks: selector 0x10203 in s4, then one v_perm_b32.
;   - GFX10 checks: one v_perm_b32 with the literal 0x10203.
; Each sequence is bracketed by s_waitcnt and s_setpc_b64 s[30:31].
43 define i32 @v_bswap_i32(i32 %src) {
44 ; GFX7-LABEL: v_bswap_i32:
46 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
47 ; GFX7-NEXT: v_alignbit_b32 v1, v0, v0, 8
48 ; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24
49 ; GFX7-NEXT: s_mov_b32 s4, 0xff00ff
50 ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1
51 ; GFX7-NEXT: s_setpc_b64 s[30:31]
53 ; GFX8-LABEL: v_bswap_i32:
55 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56 ; GFX8-NEXT: s_mov_b32 s4, 0x10203
57 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4
58 ; GFX8-NEXT: s_setpc_b64 s[30:31]
60 ; GFX9-LABEL: v_bswap_i32:
62 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
63 ; GFX9-NEXT: s_mov_b32 s4, 0x10203
64 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4
65 ; GFX9-NEXT: s_setpc_b64 s[30:31]
67 ; GFX10-LABEL: v_bswap_i32:
69 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
70 ; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0x10203
71 ; GFX10-NEXT: s_setpc_b64 s[30:31]
72 %bswap = call i32 @llvm.bswap.i32(i32 %src)
; s_bswap_v2i32: @llvm.bswap.v2i32 on a <2 x i32> inreg argument of an
; amdgpu_ps function.
; The checks show the two elements in s0 and s1, each swapped on its own with
; the same per-prefix sequence (GFX7 checks use v_alignbit_b32 + v_bfi_b32,
; the other prefixes use v_perm_b32 with 0x10203). The mask/selector constant
; is materialized once and shared by both elements where it lives in a
; register. Results return to s0 and s1 through v_readfirstlane_b32.
76 define amdgpu_ps <2 x i32> @s_bswap_v2i32(<2 x i32> inreg %src) {
77 ; GFX7-LABEL: s_bswap_v2i32:
79 ; GFX7-NEXT: v_alignbit_b32 v0, s0, s0, 8
80 ; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 24
81 ; GFX7-NEXT: s_mov_b32 s0, 0xff00ff
82 ; GFX7-NEXT: v_bfi_b32 v0, s0, v1, v0
83 ; GFX7-NEXT: v_alignbit_b32 v1, s1, s1, 8
84 ; GFX7-NEXT: v_alignbit_b32 v2, s1, s1, 24
85 ; GFX7-NEXT: v_bfi_b32 v1, s0, v2, v1
86 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
87 ; GFX7-NEXT: v_readfirstlane_b32 s1, v1
88 ; GFX7-NEXT: ; return to shader part epilog
90 ; GFX8-LABEL: s_bswap_v2i32:
92 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
93 ; GFX8-NEXT: s_mov_b32 s0, 0x10203
94 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
95 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s0
96 ; GFX8-NEXT: v_perm_b32 v1, 0, v1, s0
97 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
98 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
99 ; GFX8-NEXT: ; return to shader part epilog
101 ; GFX9-LABEL: s_bswap_v2i32:
103 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
104 ; GFX9-NEXT: s_mov_b32 s0, 0x10203
105 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
106 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0
107 ; GFX9-NEXT: v_perm_b32 v1, 0, v1, s0
108 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
109 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
110 ; GFX9-NEXT: ; return to shader part epilog
112 ; GFX10-LABEL: s_bswap_v2i32:
114 ; GFX10-NEXT: v_perm_b32 v0, 0, s0, 0x10203
115 ; GFX10-NEXT: v_perm_b32 v1, 0, s1, 0x10203
116 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
117 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
118 ; GFX10-NEXT: ; return to shader part epilog
119 %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src)
; v_bswap_v2i32: @llvm.bswap.v2i32 in a function with no explicit calling
; convention; the checks show the two elements in v0 and v1.
; Each element is swapped in place with the same per-prefix sequence
; (GFX7 checks use v_alignbit_b32 + v_bfi_b32 with mask 0xff00ff, the other
; prefixes use v_perm_b32 with 0x10203); v2 is the only temporary, and only
; in the GFX7 checks.
123 define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) {
124 ; GFX7-LABEL: v_bswap_v2i32:
126 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
127 ; GFX7-NEXT: v_alignbit_b32 v2, v0, v0, 8
128 ; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24
129 ; GFX7-NEXT: s_mov_b32 s4, 0xff00ff
130 ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v2
131 ; GFX7-NEXT: v_alignbit_b32 v2, v1, v1, 8
132 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v1, 24
133 ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v2
134 ; GFX7-NEXT: s_setpc_b64 s[30:31]
136 ; GFX8-LABEL: v_bswap_v2i32:
138 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
139 ; GFX8-NEXT: s_mov_b32 s4, 0x10203
140 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4
141 ; GFX8-NEXT: v_perm_b32 v1, 0, v1, s4
142 ; GFX8-NEXT: s_setpc_b64 s[30:31]
144 ; GFX9-LABEL: v_bswap_v2i32:
146 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147 ; GFX9-NEXT: s_mov_b32 s4, 0x10203
148 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4
149 ; GFX9-NEXT: v_perm_b32 v1, 0, v1, s4
150 ; GFX9-NEXT: s_setpc_b64 s[30:31]
152 ; GFX10-LABEL: v_bswap_v2i32:
154 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
155 ; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0x10203
156 ; GFX10-NEXT: v_perm_b32 v1, 0, v1, 0x10203
157 ; GFX10-NEXT: s_setpc_b64 s[30:31]
158 %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src)
; s_bswap_i64: @llvm.bswap.i64 on an i64 inreg argument of an amdgpu_ps
; function.
; The checks show the 64-bit swap done as two 32-bit swaps whose results are
; exchanged: the value written back to s0 is computed from the incoming s1,
; and the value written back to s1 is computed from the incoming s0.
; The per-half sequence matches the 32-bit case (GFX7 checks use
; v_alignbit_b32 + v_bfi_b32, the other prefixes use v_perm_b32 with 0x10203).
162 define amdgpu_ps i64 @s_bswap_i64(i64 inreg %src) {
163 ; GFX7-LABEL: s_bswap_i64:
165 ; GFX7-NEXT: v_alignbit_b32 v0, s1, s1, 8
166 ; GFX7-NEXT: v_alignbit_b32 v1, s1, s1, 24
167 ; GFX7-NEXT: s_mov_b32 s1, 0xff00ff
168 ; GFX7-NEXT: v_bfi_b32 v0, s1, v1, v0
169 ; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 8
170 ; GFX7-NEXT: v_alignbit_b32 v2, s0, s0, 24
171 ; GFX7-NEXT: v_bfi_b32 v1, s1, v2, v1
172 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
173 ; GFX7-NEXT: v_readfirstlane_b32 s1, v1
174 ; GFX7-NEXT: ; return to shader part epilog
176 ; GFX8-LABEL: s_bswap_i64:
178 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
179 ; GFX8-NEXT: s_mov_b32 s1, 0x10203
180 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
181 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s1
182 ; GFX8-NEXT: v_perm_b32 v1, 0, v1, s1
183 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
184 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
185 ; GFX8-NEXT: ; return to shader part epilog
187 ; GFX9-LABEL: s_bswap_i64:
189 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
190 ; GFX9-NEXT: s_mov_b32 s1, 0x10203
191 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
192 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s1
193 ; GFX9-NEXT: v_perm_b32 v1, 0, v1, s1
194 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
195 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
196 ; GFX9-NEXT: ; return to shader part epilog
198 ; GFX10-LABEL: s_bswap_i64:
200 ; GFX10-NEXT: v_perm_b32 v0, 0, s1, 0x10203
201 ; GFX10-NEXT: v_perm_b32 v1, 0, s0, 0x10203
202 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
203 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
204 ; GFX10-NEXT: ; return to shader part epilog
205 %bswap = call i64 @llvm.bswap.i64(i64 %src)
; v_bswap_i64: @llvm.bswap.i64 in a function with no explicit calling
; convention; the checks show the value in v0/v1.
; The 64-bit swap is two 32-bit swaps with the halves exchanged: the swapped
; v1 is built in the temporary v2, the swapped v0 is written to v1, and a
; final v_mov_b32 copies v2 into v0.
; GFX7 checks use v_alignbit_b32 + v_bfi_b32 (mask 0xff00ff); the other
; prefixes use v_perm_b32 with 0x10203.
209 define i64 @v_bswap_i64(i64 %src) {
210 ; GFX7-LABEL: v_bswap_i64:
212 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
213 ; GFX7-NEXT: v_alignbit_b32 v2, v1, v1, 8
214 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v1, 24
215 ; GFX7-NEXT: s_mov_b32 s4, 0xff00ff
216 ; GFX7-NEXT: v_bfi_b32 v2, s4, v1, v2
217 ; GFX7-NEXT: v_alignbit_b32 v1, v0, v0, 8
218 ; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24
219 ; GFX7-NEXT: v_bfi_b32 v1, s4, v0, v1
220 ; GFX7-NEXT: v_mov_b32_e32 v0, v2
221 ; GFX7-NEXT: s_setpc_b64 s[30:31]
223 ; GFX8-LABEL: v_bswap_i64:
225 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
226 ; GFX8-NEXT: s_mov_b32 s4, 0x10203
227 ; GFX8-NEXT: v_perm_b32 v2, 0, v1, s4
228 ; GFX8-NEXT: v_perm_b32 v1, 0, v0, s4
229 ; GFX8-NEXT: v_mov_b32_e32 v0, v2
230 ; GFX8-NEXT: s_setpc_b64 s[30:31]
232 ; GFX9-LABEL: v_bswap_i64:
234 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
235 ; GFX9-NEXT: s_mov_b32 s4, 0x10203
236 ; GFX9-NEXT: v_perm_b32 v2, 0, v1, s4
237 ; GFX9-NEXT: v_perm_b32 v1, 0, v0, s4
238 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
239 ; GFX9-NEXT: s_setpc_b64 s[30:31]
241 ; GFX10-LABEL: v_bswap_i64:
243 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
244 ; GFX10-NEXT: v_perm_b32 v2, 0, v1, 0x10203
245 ; GFX10-NEXT: v_perm_b32 v1, 0, v0, 0x10203
246 ; GFX10-NEXT: v_mov_b32_e32 v0, v2
247 ; GFX10-NEXT: s_setpc_b64 s[30:31]
248 %bswap = call i64 @llvm.bswap.i64(i64 %src)
; s_bswap_v2i64: @llvm.bswap.v2i64 on a <2 x i64> inreg argument of an
; amdgpu_ps function.
; The checks treat the input as four 32-bit pieces in s0..s3. Within each
; 64-bit element the two halves are swapped individually and exchanged:
; result s0 comes from input s1, s1 from s0, s2 from s3, and s3 from s2.
; GFX7 checks use v_alignbit_b32 + v_bfi_b32 with the mask 0xff00ff kept in
; s1; the other prefixes use v_perm_b32 with 0x10203. All four results are
; moved back with v_readfirstlane_b32.
252 define amdgpu_ps <2 x i64> @s_bswap_v2i64(<2 x i64> inreg %src) {
253 ; GFX7-LABEL: s_bswap_v2i64:
255 ; GFX7-NEXT: v_alignbit_b32 v0, s1, s1, 8
256 ; GFX7-NEXT: v_alignbit_b32 v1, s1, s1, 24
257 ; GFX7-NEXT: s_mov_b32 s1, 0xff00ff
258 ; GFX7-NEXT: v_bfi_b32 v0, s1, v1, v0
259 ; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 8
260 ; GFX7-NEXT: v_alignbit_b32 v2, s0, s0, 24
261 ; GFX7-NEXT: v_bfi_b32 v1, s1, v2, v1
262 ; GFX7-NEXT: v_alignbit_b32 v2, s3, s3, 8
263 ; GFX7-NEXT: v_alignbit_b32 v3, s3, s3, 24
264 ; GFX7-NEXT: v_bfi_b32 v2, s1, v3, v2
265 ; GFX7-NEXT: v_alignbit_b32 v3, s2, s2, 8
266 ; GFX7-NEXT: v_alignbit_b32 v4, s2, s2, 24
267 ; GFX7-NEXT: v_bfi_b32 v3, s1, v4, v3
268 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
269 ; GFX7-NEXT: v_readfirstlane_b32 s1, v1
270 ; GFX7-NEXT: v_readfirstlane_b32 s2, v2
271 ; GFX7-NEXT: v_readfirstlane_b32 s3, v3
272 ; GFX7-NEXT: ; return to shader part epilog
274 ; GFX8-LABEL: s_bswap_v2i64:
276 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
277 ; GFX8-NEXT: s_mov_b32 s1, 0x10203
278 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
279 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
280 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
281 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s1
282 ; GFX8-NEXT: v_perm_b32 v1, 0, v1, s1
283 ; GFX8-NEXT: v_perm_b32 v2, 0, v2, s1
284 ; GFX8-NEXT: v_perm_b32 v3, 0, v3, s1
285 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
286 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
287 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
288 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
289 ; GFX8-NEXT: ; return to shader part epilog
291 ; GFX9-LABEL: s_bswap_v2i64:
293 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
294 ; GFX9-NEXT: s_mov_b32 s1, 0x10203
295 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
296 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
297 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
298 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s1
299 ; GFX9-NEXT: v_perm_b32 v1, 0, v1, s1
300 ; GFX9-NEXT: v_perm_b32 v2, 0, v2, s1
301 ; GFX9-NEXT: v_perm_b32 v3, 0, v3, s1
302 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
303 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
304 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
305 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
306 ; GFX9-NEXT: ; return to shader part epilog
308 ; GFX10-LABEL: s_bswap_v2i64:
310 ; GFX10-NEXT: v_perm_b32 v0, 0, s1, 0x10203
311 ; GFX10-NEXT: v_perm_b32 v1, 0, s0, 0x10203
312 ; GFX10-NEXT: v_perm_b32 v2, 0, s3, 0x10203
313 ; GFX10-NEXT: v_perm_b32 v3, 0, s2, 0x10203
314 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
315 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
316 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2
317 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3
318 ; GFX10-NEXT: ; return to shader part epilog
319 %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src)
; v_bswap_v2i64: @llvm.bswap.v2i64 in a function with no explicit calling
; convention; the checks show the input and result in v0..v3.
; Within each 64-bit element the two 32-bit halves are swapped individually
; and exchanged. The swapped odd registers (v1, v3) are built in the
; temporaries v4 and v5 and copied into v0 and v2 at the end, while the
; swapped even registers (v0, v2) are written to v1 and v3.
; GFX7 checks use v_alignbit_b32 + v_bfi_b32 (mask 0xff00ff); the other
; prefixes use v_perm_b32 with 0x10203.
323 define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) {
324 ; GFX7-LABEL: v_bswap_v2i64:
326 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
327 ; GFX7-NEXT: v_alignbit_b32 v4, v1, v1, 8
328 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v1, 24
329 ; GFX7-NEXT: s_mov_b32 s4, 0xff00ff
330 ; GFX7-NEXT: v_bfi_b32 v4, s4, v1, v4
331 ; GFX7-NEXT: v_alignbit_b32 v1, v0, v0, 8
332 ; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24
333 ; GFX7-NEXT: v_bfi_b32 v1, s4, v0, v1
334 ; GFX7-NEXT: v_alignbit_b32 v0, v3, v3, 8
335 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v3, 24
336 ; GFX7-NEXT: v_bfi_b32 v5, s4, v3, v0
337 ; GFX7-NEXT: v_alignbit_b32 v0, v2, v2, 8
338 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v2, 24
339 ; GFX7-NEXT: v_bfi_b32 v3, s4, v2, v0
340 ; GFX7-NEXT: v_mov_b32_e32 v0, v4
341 ; GFX7-NEXT: v_mov_b32_e32 v2, v5
342 ; GFX7-NEXT: s_setpc_b64 s[30:31]
344 ; GFX8-LABEL: v_bswap_v2i64:
346 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
347 ; GFX8-NEXT: s_mov_b32 s4, 0x10203
348 ; GFX8-NEXT: v_perm_b32 v4, 0, v1, s4
349 ; GFX8-NEXT: v_perm_b32 v5, 0, v3, s4
350 ; GFX8-NEXT: v_perm_b32 v1, 0, v0, s4
351 ; GFX8-NEXT: v_perm_b32 v3, 0, v2, s4
352 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
353 ; GFX8-NEXT: v_mov_b32_e32 v2, v5
354 ; GFX8-NEXT: s_setpc_b64 s[30:31]
356 ; GFX9-LABEL: v_bswap_v2i64:
358 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
359 ; GFX9-NEXT: s_mov_b32 s4, 0x10203
360 ; GFX9-NEXT: v_perm_b32 v4, 0, v1, s4
361 ; GFX9-NEXT: v_perm_b32 v5, 0, v3, s4
362 ; GFX9-NEXT: v_perm_b32 v1, 0, v0, s4
363 ; GFX9-NEXT: v_perm_b32 v3, 0, v2, s4
364 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
365 ; GFX9-NEXT: v_mov_b32_e32 v2, v5
366 ; GFX9-NEXT: s_setpc_b64 s[30:31]
368 ; GFX10-LABEL: v_bswap_v2i64:
370 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
371 ; GFX10-NEXT: v_perm_b32 v4, 0, v1, 0x10203
372 ; GFX10-NEXT: v_perm_b32 v5, 0, v3, 0x10203
373 ; GFX10-NEXT: v_perm_b32 v1, 0, v0, 0x10203
374 ; GFX10-NEXT: v_perm_b32 v3, 0, v2, 0x10203
375 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
376 ; GFX10-NEXT: v_mov_b32_e32 v2, v5
377 ; GFX10-NEXT: s_setpc_b64 s[30:31]
378 %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src)
; s_bswap_i16: @llvm.bswap.i16 on an i16 inreg argument of an amdgpu_ps
; function.
; Expected code per check prefix:
;   - GFX7 checks: scalar instructions only (s_lshl_b32 by 8, s_bfe_u32 with
;     0x80008, s_or_b32); no vector instruction and no v_readfirstlane_b32.
;   - GFX8 and GFX9 checks: copy to v0, selector 0xc0c0001 in s0, one
;     v_perm_b32, then v_readfirstlane_b32 back into s0.
;   - GFX10 checks: one v_perm_b32 with the literal 0xc0c0001, then
;     v_readfirstlane_b32.
382 define amdgpu_ps i16 @s_bswap_i16(i16 inreg %src) {
383 ; GFX7-LABEL: s_bswap_i16:
385 ; GFX7-NEXT: s_lshl_b32 s1, s0, 8
386 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80008
387 ; GFX7-NEXT: s_or_b32 s0, s0, s1
388 ; GFX7-NEXT: ; return to shader part epilog
390 ; GFX8-LABEL: s_bswap_i16:
392 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
393 ; GFX8-NEXT: s_mov_b32 s0, 0xc0c0001
394 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s0
395 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
396 ; GFX8-NEXT: ; return to shader part epilog
398 ; GFX9-LABEL: s_bswap_i16:
400 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
401 ; GFX9-NEXT: s_mov_b32 s0, 0xc0c0001
402 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0
403 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
404 ; GFX9-NEXT: ; return to shader part epilog
406 ; GFX10-LABEL: s_bswap_i16:
408 ; GFX10-NEXT: v_perm_b32 v0, 0, s0, 0xc0c0001
409 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
410 ; GFX10-NEXT: ; return to shader part epilog
411 %bswap = call i16 @llvm.bswap.i16(i16 %src)
; v_bswap_i16: @llvm.bswap.i16 in a function with no explicit calling
; convention; the checks show the value in v0.
; Expected code per check prefix:
;   - GFX7 checks: v_lshlrev_b32 by 8, v_bfe_u32 (offset 8, width 8), then
;     v_or_b32 to combine the two parts.
;   - GFX8 and GFX9 checks: selector 0xc0c0001 in s4, then one v_perm_b32.
;   - GFX10 checks: one v_perm_b32 with the literal 0xc0c0001.
415 define i16 @v_bswap_i16(i16 %src) {
416 ; GFX7-LABEL: v_bswap_i16:
418 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
419 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
420 ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
421 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
422 ; GFX7-NEXT: s_setpc_b64 s[30:31]
424 ; GFX8-LABEL: v_bswap_i16:
426 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
427 ; GFX8-NEXT: s_mov_b32 s4, 0xc0c0001
428 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4
429 ; GFX8-NEXT: s_setpc_b64 s[30:31]
431 ; GFX9-LABEL: v_bswap_i16:
433 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
434 ; GFX9-NEXT: s_mov_b32 s4, 0xc0c0001
435 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4
436 ; GFX9-NEXT: s_setpc_b64 s[30:31]
438 ; GFX10-LABEL: v_bswap_i16:
440 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
441 ; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001
442 ; GFX10-NEXT: s_setpc_b64 s[30:31]
443 %bswap = call i16 @llvm.bswap.i16(i16 %src)
; s_bswap_v2i16: @llvm.bswap.v2i16 on a <2 x i16> inreg argument of an
; amdgpu_ps function whose return type is i32; the swapped vector is bitcast
; to i32.
; Expected code per check prefix:
;   - GFX7 checks: the two elements are read from s0 and s1, each swapped
;     with s_lshl_b32 / s_bfe_u32 / s_or_b32, masked with 0xffff, and packed
;     into s0 (second element shifted left by 16 and OR-ed in).
;   - GFX8 and GFX9 checks: the input is a single register (s0); one
;     v_perm_b32 with selector 0x2030001, then v_readfirstlane_b32.
;   - GFX10 checks: one v_perm_b32 with the literal 0x2030001, then
;     v_readfirstlane_b32.
447 define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) {
448 ; GFX7-LABEL: s_bswap_v2i16:
450 ; GFX7-NEXT: s_lshl_b32 s2, s0, 8
451 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80008
452 ; GFX7-NEXT: s_or_b32 s0, s0, s2
453 ; GFX7-NEXT: s_lshl_b32 s2, s1, 8
454 ; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80008
455 ; GFX7-NEXT: s_or_b32 s1, s1, s2
456 ; GFX7-NEXT: s_and_b32 s1, 0xffff, s1
457 ; GFX7-NEXT: s_and_b32 s0, 0xffff, s0
458 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16
459 ; GFX7-NEXT: s_or_b32 s0, s0, s1
460 ; GFX7-NEXT: ; return to shader part epilog
462 ; GFX8-LABEL: s_bswap_v2i16:
464 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
465 ; GFX8-NEXT: s_mov_b32 s0, 0x2030001
466 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s0
467 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
468 ; GFX8-NEXT: ; return to shader part epilog
470 ; GFX9-LABEL: s_bswap_v2i16:
472 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
473 ; GFX9-NEXT: s_mov_b32 s0, 0x2030001
474 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0
475 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
476 ; GFX9-NEXT: ; return to shader part epilog
478 ; GFX10-LABEL: s_bswap_v2i16:
480 ; GFX10-NEXT: v_perm_b32 v0, 0, s0, 0x2030001
481 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
482 ; GFX10-NEXT: ; return to shader part epilog
483 %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src)
484 %cast = bitcast <2 x i16> %bswap to i32
; v_bswap_i16_zext_to_i32: @llvm.bswap.i16 followed by zext of the result to
; i32.
; Expected code per check prefix:
;   - GFX7 checks: the shift / bit-field-extract / or swap sequence followed
;     by an explicit v_and_b32 with 0xffff for the zero extension.
;   - GFX8, GFX9 and GFX10 checks: only the v_perm_b32 with 0xc0c0001; no
;     separate instruction is expected for the extension.
; NOTE(review): presumably the 0xc0c0001 selector already zero-fills the upper
; 16 bits, which would explain the missing mask — confirm against the
; V_PERM_B32 description in the ISA guide.
488 define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
489 ; GFX7-LABEL: v_bswap_i16_zext_to_i32:
491 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
492 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
493 ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
494 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
495 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
496 ; GFX7-NEXT: s_setpc_b64 s[30:31]
498 ; GFX8-LABEL: v_bswap_i16_zext_to_i32:
500 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
501 ; GFX8-NEXT: s_mov_b32 s4, 0xc0c0001
502 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4
503 ; GFX8-NEXT: s_setpc_b64 s[30:31]
505 ; GFX9-LABEL: v_bswap_i16_zext_to_i32:
507 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
508 ; GFX9-NEXT: s_mov_b32 s4, 0xc0c0001
509 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4
510 ; GFX9-NEXT: s_setpc_b64 s[30:31]
512 ; GFX10-LABEL: v_bswap_i16_zext_to_i32:
514 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
515 ; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001
516 ; GFX10-NEXT: s_setpc_b64 s[30:31]
517 %bswap = call i16 @llvm.bswap.i16(i16 %src)
518 %zext = zext i16 %bswap to i32
; v_bswap_i16_sext_to_i32: @llvm.bswap.i16 followed by sext of the result to
; i32.
; Every check prefix expects its usual 16-bit swap sequence (shift /
; bit-field-extract / or in the GFX7 checks, v_perm_b32 with 0xc0c0001
; elsewhere) followed by v_bfe_i32 v0, v0, 0, 16 for the sign extension.
; Note: the extended value below is named %zext although it is produced by
; sext; the name is left as is because it may be used by lines not shown here.
522 define i32 @v_bswap_i16_sext_to_i32(i16 %src) {
523 ; GFX7-LABEL: v_bswap_i16_sext_to_i32:
525 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
526 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
527 ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
528 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
529 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
530 ; GFX7-NEXT: s_setpc_b64 s[30:31]
532 ; GFX8-LABEL: v_bswap_i16_sext_to_i32:
534 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
535 ; GFX8-NEXT: s_mov_b32 s4, 0xc0c0001
536 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4
537 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 16
538 ; GFX8-NEXT: s_setpc_b64 s[30:31]
540 ; GFX9-LABEL: v_bswap_i16_sext_to_i32:
542 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
543 ; GFX9-NEXT: s_mov_b32 s4, 0xc0c0001
544 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4
545 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
546 ; GFX9-NEXT: s_setpc_b64 s[30:31]
548 ; GFX10-LABEL: v_bswap_i16_sext_to_i32:
550 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
551 ; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001
552 ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
553 ; GFX10-NEXT: s_setpc_b64 s[30:31]
554 %bswap = call i16 @llvm.bswap.i16(i16 %src)
555 %zext = sext i16 %bswap to i32
; v_bswap_v2i16: @llvm.bswap.v2i16 in a function with no explicit calling
; convention.
; Expected code per check prefix:
;   - GFX7 checks: the two elements are handled in separate registers (v0 and
;     v1), each swapped with v_lshlrev_b32 / v_bfe_u32 / v_or_b32 using v2 as
;     the temporary.
;   - GFX8 and GFX9 checks: a single v_perm_b32 on v0 with the selector
;     0x2030001 held in s4.
;   - GFX10 checks: a single v_perm_b32 on v0 with the literal 0x2030001.
559 define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
560 ; GFX7-LABEL: v_bswap_v2i16:
562 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
563 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v0
564 ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
565 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
566 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v1
567 ; GFX7-NEXT: v_bfe_u32 v1, v1, 8, 8
568 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
569 ; GFX7-NEXT: s_setpc_b64 s[30:31]
571 ; GFX8-LABEL: v_bswap_v2i16:
573 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
574 ; GFX8-NEXT: s_mov_b32 s4, 0x2030001
575 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4
576 ; GFX8-NEXT: s_setpc_b64 s[30:31]
578 ; GFX9-LABEL: v_bswap_v2i16:
580 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
581 ; GFX9-NEXT: s_mov_b32 s4, 0x2030001
582 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4
583 ; GFX9-NEXT: s_setpc_b64 s[30:31]
585 ; GFX10-LABEL: v_bswap_v2i16:
587 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
588 ; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0x2030001
589 ; GFX10-NEXT: s_setpc_b64 s[30:31]
590 %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src)
; v_bswap_v3i16: @llvm.bswap.v3i16 in a function with no explicit calling
; convention (odd element count).
; Expected code per check prefix:
;   - GFX7 checks: three separate registers (v0, v1, v2), each swapped with
;     v_lshlrev_b32 / v_bfe_u32 / v_or_b32 using v3 as the temporary.
;   - GFX8 and GFX9 checks: two v_perm_b32 (on v0 and v1) with the selector
;     0x2030001 held in s4.
;   - GFX10 checks: two v_perm_b32 (on v0 and v1) with the literal 0x2030001.
594 define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
595 ; GFX7-LABEL: v_bswap_v3i16:
597 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
598 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v0
599 ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
600 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3
601 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v1
602 ; GFX7-NEXT: v_bfe_u32 v1, v1, 8, 8
603 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
604 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v2
605 ; GFX7-NEXT: v_bfe_u32 v2, v2, 8, 8
606 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
607 ; GFX7-NEXT: s_setpc_b64 s[30:31]
609 ; GFX8-LABEL: v_bswap_v3i16:
611 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
612 ; GFX8-NEXT: s_mov_b32 s4, 0x2030001
613 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4
614 ; GFX8-NEXT: v_perm_b32 v1, 0, v1, s4
615 ; GFX8-NEXT: s_setpc_b64 s[30:31]
617 ; GFX9-LABEL: v_bswap_v3i16:
619 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
620 ; GFX9-NEXT: s_mov_b32 s4, 0x2030001
621 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4
622 ; GFX9-NEXT: v_perm_b32 v1, 0, v1, s4
623 ; GFX9-NEXT: s_setpc_b64 s[30:31]
625 ; GFX10-LABEL: v_bswap_v3i16:
627 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
628 ; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0x2030001
629 ; GFX10-NEXT: v_perm_b32 v1, 0, v1, 0x2030001
630 ; GFX10-NEXT: s_setpc_b64 s[30:31]
631 %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %src)
; v_bswap_i48: byte swap of a non-power-of-two width. The i64 argument is
; truncated to i48, passed to @llvm.bswap.i48, and the result is zero-extended
; back to i64.
; Every check prefix expects both 32-bit halves to be swapped (into v1 and
; v2, using the usual per-prefix sequence) followed by a 64-bit logical right
; shift by 16 of the pair v[1:2] into v[0:1]. The GFX7 checks spell the shift
; v_lshr_b64; the other prefixes spell it v_lshrrev_b64.
635 define i64 @v_bswap_i48(i64 %src) {
636 ; GFX7-LABEL: v_bswap_i48:
638 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
639 ; GFX7-NEXT: v_alignbit_b32 v2, v1, v1, 8
640 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v1, 24
641 ; GFX7-NEXT: s_mov_b32 s4, 0xff00ff
642 ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v2
643 ; GFX7-NEXT: v_alignbit_b32 v2, v0, v0, 8
644 ; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24
645 ; GFX7-NEXT: v_bfi_b32 v2, s4, v0, v2
646 ; GFX7-NEXT: v_lshr_b64 v[0:1], v[1:2], 16
647 ; GFX7-NEXT: s_setpc_b64 s[30:31]
649 ; GFX8-LABEL: v_bswap_i48:
651 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
652 ; GFX8-NEXT: s_mov_b32 s4, 0x10203
653 ; GFX8-NEXT: v_perm_b32 v1, 0, v1, s4
654 ; GFX8-NEXT: v_perm_b32 v2, 0, v0, s4
655 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[1:2]
656 ; GFX8-NEXT: s_setpc_b64 s[30:31]
658 ; GFX9-LABEL: v_bswap_i48:
660 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
661 ; GFX9-NEXT: s_mov_b32 s4, 0x10203
662 ; GFX9-NEXT: v_perm_b32 v1, 0, v1, s4
663 ; GFX9-NEXT: v_perm_b32 v2, 0, v0, s4
664 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[1:2]
665 ; GFX9-NEXT: s_setpc_b64 s[30:31]
667 ; GFX10-LABEL: v_bswap_i48:
669 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
670 ; GFX10-NEXT: v_perm_b32 v1, 0, v1, 0x10203
671 ; GFX10-NEXT: v_perm_b32 v2, 0, v0, 0x10203
672 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], 16, v[1:2]
673 ; GFX10-NEXT: s_setpc_b64 s[30:31]
674 %trunc = trunc i64 %src to i48
675 %bswap = call i48 @llvm.bswap.i48(i48 %trunc)
676 %zext = zext i48 %bswap to i64
; Declarations of the @llvm.bswap.* intrinsic overloads called by the test
; functions in this file; each one carries attribute group #1.
680 declare i16 @llvm.bswap.i16(i16) #1
681 declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) #1
682 declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>) #1
683 declare i32 @llvm.bswap.i32(i32) #1
684 declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) #1
685 declare i64 @llvm.bswap.i64(i64) #1
686 declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) #1
687 declare i48 @llvm.bswap.i48(i48) #1
; Attribute groups. #1 is the group used by the declarations above.
; NOTE(review): #0 is not referenced by any define or declare line visible in
; this file — confirm it is unused before removing it.
689 attributes #0 = { convergent nounwind readnone }
690 attributes #1 = { nounwind readnone speculatable willreturn }