1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -o - %s | FileCheck -check-prefix=GFX7 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
5 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
; Scalar case: llvm.bswap.i32 applied to an `inreg` i32 argument of an
; amdgpu_ps function. The expected code below reads the input from s0 and
; moves the result back into s0 with v_readfirstlane_b32 on every target.
7 define amdgpu_ps i32 @s_bswap_i32(i32 inreg %src) {
8 ; GFX7-LABEL: s_bswap_i32:
10 ; GFX7-NEXT: v_alignbit_b32 v0, s0, s0, 8
11 ; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 24
12 ; GFX7-NEXT: s_mov_b32 s0, 0xff00ff
13 ; GFX7-NEXT: v_bfi_b32 v0, s0, v1, v0
14 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
15 ; GFX7-NEXT: ; return to shader part epilog
17 ; GFX8-LABEL: s_bswap_i32:
19 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
20 ; GFX8-NEXT: s_mov_b32 s0, 0x10203
21 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s0
22 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
23 ; GFX8-NEXT: ; return to shader part epilog
25 ; GFX9-LABEL: s_bswap_i32:
27 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
28 ; GFX9-NEXT: s_mov_b32 s0, 0x10203
29 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0
30 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
31 ; GFX9-NEXT: ; return to shader part epilog
33 ; GFX10-LABEL: s_bswap_i32:
35 ; GFX10-NEXT: v_perm_b32 v0, 0, s0, 0x10203
36 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
37 ; GFX10-NEXT: ; return to shader part epilog
38 %bswap = call i32 @llvm.bswap.i32(i32 %src)
; Vector-register case: llvm.bswap.i32 on a plain (non-inreg) i32 argument of
; a default-calling-convention function. The expected code operates on v0 in
; place and ends with s_setpc_b64 s[30:31].
42 define i32 @v_bswap_i32(i32 %src) {
43 ; GFX7-LABEL: v_bswap_i32:
45 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
46 ; GFX7-NEXT: v_alignbit_b32 v1, v0, v0, 8
47 ; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24
48 ; GFX7-NEXT: s_mov_b32 s4, 0xff00ff
49 ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1
50 ; GFX7-NEXT: s_setpc_b64 s[30:31]
52 ; GFX8-LABEL: v_bswap_i32:
54 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
55 ; GFX8-NEXT: s_mov_b32 s4, 0x10203
56 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4
57 ; GFX8-NEXT: s_setpc_b64 s[30:31]
59 ; GFX9-LABEL: v_bswap_i32:
61 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
62 ; GFX9-NEXT: s_mov_b32 s4, 0x10203
63 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4
64 ; GFX9-NEXT: s_setpc_b64 s[30:31]
66 ; GFX10-LABEL: v_bswap_i32:
68 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
69 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
70 ; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0x10203
71 ; GFX10-NEXT: s_setpc_b64 s[30:31]
72 %bswap = call i32 @llvm.bswap.i32(i32 %src)
; Scalar case: llvm.bswap.v2i32 on an `inreg` <2 x i32> argument of an
; amdgpu_ps function. The expected code swaps s0 and s1 independently and
; shares one mask constant between the two elements.
76 define amdgpu_ps <2 x i32> @s_bswap_v2i32(<2 x i32> inreg %src) {
77 ; GFX7-LABEL: s_bswap_v2i32:
79 ; GFX7-NEXT: v_alignbit_b32 v0, s0, s0, 8
80 ; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 24
81 ; GFX7-NEXT: s_mov_b32 s0, 0xff00ff
82 ; GFX7-NEXT: v_bfi_b32 v0, s0, v1, v0
83 ; GFX7-NEXT: v_alignbit_b32 v1, s1, s1, 8
84 ; GFX7-NEXT: v_alignbit_b32 v2, s1, s1, 24
85 ; GFX7-NEXT: v_bfi_b32 v1, s0, v2, v1
86 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
87 ; GFX7-NEXT: v_readfirstlane_b32 s1, v1
88 ; GFX7-NEXT: ; return to shader part epilog
90 ; GFX8-LABEL: s_bswap_v2i32:
92 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
93 ; GFX8-NEXT: s_mov_b32 s0, 0x10203
94 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
95 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s0
96 ; GFX8-NEXT: v_perm_b32 v1, 0, v1, s0
97 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
98 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
99 ; GFX8-NEXT: ; return to shader part epilog
101 ; GFX9-LABEL: s_bswap_v2i32:
103 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
104 ; GFX9-NEXT: s_mov_b32 s0, 0x10203
105 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
106 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0
107 ; GFX9-NEXT: v_perm_b32 v1, 0, v1, s0
108 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
109 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
110 ; GFX9-NEXT: ; return to shader part epilog
112 ; GFX10-LABEL: s_bswap_v2i32:
114 ; GFX10-NEXT: s_mov_b32 s2, 0x10203
115 ; GFX10-NEXT: v_perm_b32 v0, 0, s0, s2
116 ; GFX10-NEXT: v_perm_b32 v1, 0, s1, s2
117 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
118 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
119 ; GFX10-NEXT: ; return to shader part epilog
120 %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src)
; Vector-register case: llvm.bswap.v2i32 on a <2 x i32> argument of a
; default-calling-convention function. The expected code swaps v0 and v1
; element-wise, each in place.
124 define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) {
125 ; GFX7-LABEL: v_bswap_v2i32:
127 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
128 ; GFX7-NEXT: v_alignbit_b32 v2, v0, v0, 8
129 ; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24
130 ; GFX7-NEXT: s_mov_b32 s4, 0xff00ff
131 ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v2
132 ; GFX7-NEXT: v_alignbit_b32 v2, v1, v1, 8
133 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v1, 24
134 ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v2
135 ; GFX7-NEXT: s_setpc_b64 s[30:31]
137 ; GFX8-LABEL: v_bswap_v2i32:
139 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140 ; GFX8-NEXT: s_mov_b32 s4, 0x10203
141 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4
142 ; GFX8-NEXT: v_perm_b32 v1, 0, v1, s4
143 ; GFX8-NEXT: s_setpc_b64 s[30:31]
145 ; GFX9-LABEL: v_bswap_v2i32:
147 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
148 ; GFX9-NEXT: s_mov_b32 s4, 0x10203
149 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4
150 ; GFX9-NEXT: v_perm_b32 v1, 0, v1, s4
151 ; GFX9-NEXT: s_setpc_b64 s[30:31]
153 ; GFX10-LABEL: v_bswap_v2i32:
155 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
156 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
157 ; GFX10-NEXT: s_mov_b32 s4, 0x10203
158 ; GFX10-NEXT: v_perm_b32 v0, 0, v0, s4
159 ; GFX10-NEXT: v_perm_b32 v1, 0, v1, s4
160 ; GFX10-NEXT: s_setpc_b64 s[30:31]
161 %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src)
; Scalar case: llvm.bswap.i64 on an `inreg` i64 argument of an amdgpu_ps
; function. In the expected code the swapped s1 ends up in s0 and the swapped
; s0 ends up in s1, i.e. the two 32-bit halves also trade places.
165 define amdgpu_ps i64 @s_bswap_i64(i64 inreg %src) {
166 ; GFX7-LABEL: s_bswap_i64:
168 ; GFX7-NEXT: v_alignbit_b32 v0, s1, s1, 8
169 ; GFX7-NEXT: v_alignbit_b32 v1, s1, s1, 24
170 ; GFX7-NEXT: s_mov_b32 s1, 0xff00ff
171 ; GFX7-NEXT: v_bfi_b32 v0, s1, v1, v0
172 ; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 8
173 ; GFX7-NEXT: v_alignbit_b32 v2, s0, s0, 24
174 ; GFX7-NEXT: v_bfi_b32 v1, s1, v2, v1
175 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
176 ; GFX7-NEXT: v_readfirstlane_b32 s1, v1
177 ; GFX7-NEXT: ; return to shader part epilog
179 ; GFX8-LABEL: s_bswap_i64:
181 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
182 ; GFX8-NEXT: s_mov_b32 s1, 0x10203
183 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
184 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s1
185 ; GFX8-NEXT: v_perm_b32 v1, 0, v1, s1
186 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
187 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
188 ; GFX8-NEXT: ; return to shader part epilog
190 ; GFX9-LABEL: s_bswap_i64:
192 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
193 ; GFX9-NEXT: s_mov_b32 s1, 0x10203
194 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
195 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s1
196 ; GFX9-NEXT: v_perm_b32 v1, 0, v1, s1
197 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
198 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
199 ; GFX9-NEXT: ; return to shader part epilog
201 ; GFX10-LABEL: s_bswap_i64:
203 ; GFX10-NEXT: s_mov_b32 s2, 0x10203
204 ; GFX10-NEXT: v_perm_b32 v0, 0, s1, s2
205 ; GFX10-NEXT: v_perm_b32 v1, 0, s0, s2
206 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
207 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
208 ; GFX10-NEXT: ; return to shader part epilog
209 %bswap = call i64 @llvm.bswap.i64(i64 %src)
; Vector-register case: llvm.bswap.i64 on an i64 argument of a
; default-calling-convention function. The expected code swaps v1 into a
; temporary (v2), swaps v0 into v1, then copies the temporary into v0.
213 define i64 @v_bswap_i64(i64 %src) {
214 ; GFX7-LABEL: v_bswap_i64:
216 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
217 ; GFX7-NEXT: v_alignbit_b32 v2, v1, v1, 8
218 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v1, 24
219 ; GFX7-NEXT: s_mov_b32 s4, 0xff00ff
220 ; GFX7-NEXT: v_bfi_b32 v2, s4, v1, v2
221 ; GFX7-NEXT: v_alignbit_b32 v1, v0, v0, 8
222 ; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24
223 ; GFX7-NEXT: v_bfi_b32 v1, s4, v0, v1
224 ; GFX7-NEXT: v_mov_b32_e32 v0, v2
225 ; GFX7-NEXT: s_setpc_b64 s[30:31]
227 ; GFX8-LABEL: v_bswap_i64:
229 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
230 ; GFX8-NEXT: s_mov_b32 s4, 0x10203
231 ; GFX8-NEXT: v_perm_b32 v2, 0, v1, s4
232 ; GFX8-NEXT: v_perm_b32 v1, 0, v0, s4
233 ; GFX8-NEXT: v_mov_b32_e32 v0, v2
234 ; GFX8-NEXT: s_setpc_b64 s[30:31]
236 ; GFX9-LABEL: v_bswap_i64:
238 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
239 ; GFX9-NEXT: s_mov_b32 s4, 0x10203
240 ; GFX9-NEXT: v_perm_b32 v2, 0, v1, s4
241 ; GFX9-NEXT: v_perm_b32 v1, 0, v0, s4
242 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
243 ; GFX9-NEXT: s_setpc_b64 s[30:31]
245 ; GFX10-LABEL: v_bswap_i64:
247 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
248 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
249 ; GFX10-NEXT: s_mov_b32 s4, 0x10203
250 ; GFX10-NEXT: v_perm_b32 v2, 0, v1, s4
251 ; GFX10-NEXT: v_perm_b32 v1, 0, v0, s4
252 ; GFX10-NEXT: v_mov_b32_e32 v0, v2
253 ; GFX10-NEXT: s_setpc_b64 s[30:31]
254 %bswap = call i64 @llvm.bswap.i64(i64 %src)
; Scalar case: llvm.bswap.v2i64 on an `inreg` <2 x i64> argument of an
; amdgpu_ps function. The expected code handles each i64 element as a pair of
; 32-bit halves: s1/s0 produce the new s0/s1 and s3/s2 produce the new s2/s3.
258 define amdgpu_ps <2 x i64> @s_bswap_v2i64(<2 x i64> inreg %src) {
259 ; GFX7-LABEL: s_bswap_v2i64:
261 ; GFX7-NEXT: v_alignbit_b32 v0, s1, s1, 8
262 ; GFX7-NEXT: v_alignbit_b32 v1, s1, s1, 24
263 ; GFX7-NEXT: s_mov_b32 s1, 0xff00ff
264 ; GFX7-NEXT: v_bfi_b32 v0, s1, v1, v0
265 ; GFX7-NEXT: v_alignbit_b32 v1, s0, s0, 8
266 ; GFX7-NEXT: v_alignbit_b32 v2, s0, s0, 24
267 ; GFX7-NEXT: v_bfi_b32 v1, s1, v2, v1
268 ; GFX7-NEXT: v_alignbit_b32 v2, s3, s3, 8
269 ; GFX7-NEXT: v_alignbit_b32 v3, s3, s3, 24
270 ; GFX7-NEXT: v_bfi_b32 v2, s1, v3, v2
271 ; GFX7-NEXT: v_alignbit_b32 v3, s2, s2, 8
272 ; GFX7-NEXT: v_alignbit_b32 v4, s2, s2, 24
273 ; GFX7-NEXT: v_bfi_b32 v3, s1, v4, v3
274 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
275 ; GFX7-NEXT: v_readfirstlane_b32 s1, v1
276 ; GFX7-NEXT: v_readfirstlane_b32 s2, v2
277 ; GFX7-NEXT: v_readfirstlane_b32 s3, v3
278 ; GFX7-NEXT: ; return to shader part epilog
280 ; GFX8-LABEL: s_bswap_v2i64:
282 ; GFX8-NEXT: v_mov_b32_e32 v0, s1
283 ; GFX8-NEXT: s_mov_b32 s1, 0x10203
284 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
285 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
286 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
287 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s1
288 ; GFX8-NEXT: v_perm_b32 v1, 0, v1, s1
289 ; GFX8-NEXT: v_perm_b32 v2, 0, v2, s1
290 ; GFX8-NEXT: v_perm_b32 v3, 0, v3, s1
291 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
292 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
293 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
294 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
295 ; GFX8-NEXT: ; return to shader part epilog
297 ; GFX9-LABEL: s_bswap_v2i64:
299 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
300 ; GFX9-NEXT: s_mov_b32 s1, 0x10203
301 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
302 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
303 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
304 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s1
305 ; GFX9-NEXT: v_perm_b32 v1, 0, v1, s1
306 ; GFX9-NEXT: v_perm_b32 v2, 0, v2, s1
307 ; GFX9-NEXT: v_perm_b32 v3, 0, v3, s1
308 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
309 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
310 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
311 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
312 ; GFX9-NEXT: ; return to shader part epilog
314 ; GFX10-LABEL: s_bswap_v2i64:
316 ; GFX10-NEXT: s_mov_b32 s4, 0x10203
317 ; GFX10-NEXT: v_perm_b32 v0, 0, s1, s4
318 ; GFX10-NEXT: v_perm_b32 v1, 0, s0, s4
319 ; GFX10-NEXT: v_perm_b32 v2, 0, s3, s4
320 ; GFX10-NEXT: v_perm_b32 v3, 0, s2, s4
321 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
322 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
323 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2
324 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3
325 ; GFX10-NEXT: ; return to shader part epilog
326 %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src)
; Vector-register case: llvm.bswap.v2i64 on a <2 x i64> argument of a
; default-calling-convention function. The expected code builds the new
; v0/v2 in temporaries (v4/v5) and copies them into place at the end.
330 define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) {
331 ; GFX7-LABEL: v_bswap_v2i64:
333 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
334 ; GFX7-NEXT: v_alignbit_b32 v4, v1, v1, 8
335 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v1, 24
336 ; GFX7-NEXT: s_mov_b32 s4, 0xff00ff
337 ; GFX7-NEXT: v_bfi_b32 v4, s4, v1, v4
338 ; GFX7-NEXT: v_alignbit_b32 v1, v0, v0, 8
339 ; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24
340 ; GFX7-NEXT: v_bfi_b32 v1, s4, v0, v1
341 ; GFX7-NEXT: v_alignbit_b32 v0, v3, v3, 8
342 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v3, 24
343 ; GFX7-NEXT: v_bfi_b32 v5, s4, v3, v0
344 ; GFX7-NEXT: v_alignbit_b32 v0, v2, v2, 8
345 ; GFX7-NEXT: v_alignbit_b32 v2, v2, v2, 24
346 ; GFX7-NEXT: v_bfi_b32 v3, s4, v2, v0
347 ; GFX7-NEXT: v_mov_b32_e32 v0, v4
348 ; GFX7-NEXT: v_mov_b32_e32 v2, v5
349 ; GFX7-NEXT: s_setpc_b64 s[30:31]
351 ; GFX8-LABEL: v_bswap_v2i64:
353 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
354 ; GFX8-NEXT: s_mov_b32 s4, 0x10203
355 ; GFX8-NEXT: v_perm_b32 v4, 0, v1, s4
356 ; GFX8-NEXT: v_perm_b32 v5, 0, v3, s4
357 ; GFX8-NEXT: v_perm_b32 v1, 0, v0, s4
358 ; GFX8-NEXT: v_perm_b32 v3, 0, v2, s4
359 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
360 ; GFX8-NEXT: v_mov_b32_e32 v2, v5
361 ; GFX8-NEXT: s_setpc_b64 s[30:31]
363 ; GFX9-LABEL: v_bswap_v2i64:
365 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
366 ; GFX9-NEXT: s_mov_b32 s4, 0x10203
367 ; GFX9-NEXT: v_perm_b32 v4, 0, v1, s4
368 ; GFX9-NEXT: v_perm_b32 v5, 0, v3, s4
369 ; GFX9-NEXT: v_perm_b32 v1, 0, v0, s4
370 ; GFX9-NEXT: v_perm_b32 v3, 0, v2, s4
371 ; GFX9-NEXT: v_mov_b32_e32 v0, v4
372 ; GFX9-NEXT: v_mov_b32_e32 v2, v5
373 ; GFX9-NEXT: s_setpc_b64 s[30:31]
375 ; GFX10-LABEL: v_bswap_v2i64:
377 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
378 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
379 ; GFX10-NEXT: s_mov_b32 s4, 0x10203
380 ; GFX10-NEXT: v_perm_b32 v4, 0, v1, s4
381 ; GFX10-NEXT: v_perm_b32 v5, 0, v3, s4
382 ; GFX10-NEXT: v_perm_b32 v1, 0, v0, s4
383 ; GFX10-NEXT: v_perm_b32 v3, 0, v2, s4
384 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
385 ; GFX10-NEXT: v_mov_b32_e32 v2, v5
386 ; GFX10-NEXT: s_setpc_b64 s[30:31]
387 %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src)
; Scalar case: llvm.bswap.i16 on an `inreg` i16 argument of an amdgpu_ps
; function. For hawaii the expected code stays entirely in scalar
; instructions (s_lshl/s_bfe/s_or); the other targets go through v_perm_b32
; and read the result back with v_readfirstlane_b32.
391 define amdgpu_ps i16 @s_bswap_i16(i16 inreg %src) {
392 ; GFX7-LABEL: s_bswap_i16:
394 ; GFX7-NEXT: s_lshl_b32 s1, s0, 8
395 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80008
396 ; GFX7-NEXT: s_or_b32 s0, s0, s1
397 ; GFX7-NEXT: ; return to shader part epilog
399 ; GFX8-LABEL: s_bswap_i16:
401 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
402 ; GFX8-NEXT: s_mov_b32 s0, 0xc0c0001
403 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s0
404 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
405 ; GFX8-NEXT: ; return to shader part epilog
407 ; GFX9-LABEL: s_bswap_i16:
409 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
410 ; GFX9-NEXT: s_mov_b32 s0, 0xc0c0001
411 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0
412 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
413 ; GFX9-NEXT: ; return to shader part epilog
415 ; GFX10-LABEL: s_bswap_i16:
417 ; GFX10-NEXT: v_perm_b32 v0, 0, s0, 0xc0c0001
418 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
419 ; GFX10-NEXT: ; return to shader part epilog
420 %bswap = call i16 @llvm.bswap.i16(i16 %src)
; Vector-register case: llvm.bswap.i16 on an i16 argument of a
; default-calling-convention function. Hawaii is expected to use a
; shift/bitfield-extract/or sequence; the other targets a single v_perm_b32.
424 define i16 @v_bswap_i16(i16 %src) {
425 ; GFX7-LABEL: v_bswap_i16:
427 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
428 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
429 ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
430 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
431 ; GFX7-NEXT: s_setpc_b64 s[30:31]
433 ; GFX8-LABEL: v_bswap_i16:
435 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
436 ; GFX8-NEXT: s_mov_b32 s4, 0xc0c0001
437 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4
438 ; GFX8-NEXT: s_setpc_b64 s[30:31]
440 ; GFX9-LABEL: v_bswap_i16:
442 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
443 ; GFX9-NEXT: s_mov_b32 s4, 0xc0c0001
444 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4
445 ; GFX9-NEXT: s_setpc_b64 s[30:31]
447 ; GFX10-LABEL: v_bswap_i16:
449 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
450 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
451 ; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001
452 ; GFX10-NEXT: s_setpc_b64 s[30:31]
453 %bswap = call i16 @llvm.bswap.i16(i16 %src)
; Scalar case: llvm.bswap.v2i16 on an `inreg` <2 x i16> argument of an
; amdgpu_ps function, with the result bitcast to i32. For hawaii the expected
; code reads the two elements from s0 and s1 and packs them into s0; on the
; other targets both elements are handled by one v_perm_b32 on s0.
457 define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) {
458 ; GFX7-LABEL: s_bswap_v2i16:
460 ; GFX7-NEXT: s_mov_b32 s3, 0x80008
461 ; GFX7-NEXT: s_lshl_b32 s2, s0, 8
462 ; GFX7-NEXT: s_bfe_u32 s0, s0, s3
463 ; GFX7-NEXT: s_or_b32 s0, s0, s2
464 ; GFX7-NEXT: s_lshl_b32 s2, s1, 8
465 ; GFX7-NEXT: s_bfe_u32 s1, s1, s3
466 ; GFX7-NEXT: s_or_b32 s1, s1, s2
467 ; GFX7-NEXT: s_bfe_u32 s1, s1, 0x100000
468 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x100000
469 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16
470 ; GFX7-NEXT: s_or_b32 s0, s0, s1
471 ; GFX7-NEXT: ; return to shader part epilog
473 ; GFX8-LABEL: s_bswap_v2i16:
475 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
476 ; GFX8-NEXT: s_mov_b32 s0, 0x2030001
477 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s0
478 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
479 ; GFX8-NEXT: ; return to shader part epilog
481 ; GFX9-LABEL: s_bswap_v2i16:
483 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
484 ; GFX9-NEXT: s_mov_b32 s0, 0x2030001
485 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s0
486 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
487 ; GFX9-NEXT: ; return to shader part epilog
489 ; GFX10-LABEL: s_bswap_v2i16:
491 ; GFX10-NEXT: v_perm_b32 v0, 0, s0, 0x2030001
492 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
493 ; GFX10-NEXT: ; return to shader part epilog
494 %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src)
495 %cast = bitcast <2 x i16> %bswap to i32
; llvm.bswap.i16 followed by a zero-extension of the result to i32.
; Hawaii is expected to add a v_bfe_u32 for the extension; for the other
; targets the expected code has no extra instruction after v_perm_b32
; (same sequence as the plain i16 case).
499 define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
500 ; GFX7-LABEL: v_bswap_i16_zext_to_i32:
502 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
503 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
504 ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
505 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
506 ; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16
507 ; GFX7-NEXT: s_setpc_b64 s[30:31]
509 ; GFX8-LABEL: v_bswap_i16_zext_to_i32:
511 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
512 ; GFX8-NEXT: s_mov_b32 s4, 0xc0c0001
513 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4
514 ; GFX8-NEXT: s_setpc_b64 s[30:31]
516 ; GFX9-LABEL: v_bswap_i16_zext_to_i32:
518 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
519 ; GFX9-NEXT: s_mov_b32 s4, 0xc0c0001
520 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4
521 ; GFX9-NEXT: s_setpc_b64 s[30:31]
523 ; GFX10-LABEL: v_bswap_i16_zext_to_i32:
525 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
526 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
527 ; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001
528 ; GFX10-NEXT: s_setpc_b64 s[30:31]
529 %bswap = call i16 @llvm.bswap.i16(i16 %src)
530 %zext = zext i16 %bswap to i32
; llvm.bswap.i16 followed by a sign-extension of the result to i32.
; Every target is expected to emit a trailing v_bfe_i32 v0, v0, 0, 16 for
; the extension.
; NOTE(review): the sign-extended value below is named %zext although it is
; produced by `sext`; the name looks copied from the zext test. If it is
; renamed (e.g. to %sext), every use of it must be renamed in the same change.
534 define i32 @v_bswap_i16_sext_to_i32(i16 %src) {
535 ; GFX7-LABEL: v_bswap_i16_sext_to_i32:
537 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
538 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
539 ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
540 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
541 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
542 ; GFX7-NEXT: s_setpc_b64 s[30:31]
544 ; GFX8-LABEL: v_bswap_i16_sext_to_i32:
546 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
547 ; GFX8-NEXT: s_mov_b32 s4, 0xc0c0001
548 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4
549 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 16
550 ; GFX8-NEXT: s_setpc_b64 s[30:31]
552 ; GFX9-LABEL: v_bswap_i16_sext_to_i32:
554 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
555 ; GFX9-NEXT: s_mov_b32 s4, 0xc0c0001
556 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4
557 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
558 ; GFX9-NEXT: s_setpc_b64 s[30:31]
560 ; GFX10-LABEL: v_bswap_i16_sext_to_i32:
562 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
563 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
564 ; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0xc0c0001
565 ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
566 ; GFX10-NEXT: s_setpc_b64 s[30:31]
567 %bswap = call i16 @llvm.bswap.i16(i16 %src)
568 %zext = sext i16 %bswap to i32
; Vector-register case: llvm.bswap.v2i16 on a <2 x i16> argument of a
; default-calling-convention function. For hawaii the expected code swaps
; v0 and v1 separately; the other targets use a single v_perm_b32 on v0.
572 define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
573 ; GFX7-LABEL: v_bswap_v2i16:
575 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
576 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v0
577 ; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
578 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
579 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v1
580 ; GFX7-NEXT: v_bfe_u32 v1, v1, 8, 8
581 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
582 ; GFX7-NEXT: s_setpc_b64 s[30:31]
584 ; GFX8-LABEL: v_bswap_v2i16:
586 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
587 ; GFX8-NEXT: s_mov_b32 s4, 0x2030001
588 ; GFX8-NEXT: v_perm_b32 v0, 0, v0, s4
589 ; GFX8-NEXT: s_setpc_b64 s[30:31]
591 ; GFX9-LABEL: v_bswap_v2i16:
593 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
594 ; GFX9-NEXT: s_mov_b32 s4, 0x2030001
595 ; GFX9-NEXT: v_perm_b32 v0, 0, v0, s4
596 ; GFX9-NEXT: s_setpc_b64 s[30:31]
598 ; GFX10-LABEL: v_bswap_v2i16:
600 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
601 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
602 ; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0x2030001
603 ; GFX10-NEXT: s_setpc_b64 s[30:31]
604 %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src)
609 ; define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
610 ; %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %src)
611 ; ret <3 x i16> %bswap
; Non-power-of-two width: the i64 argument is truncated to i48, byte-swapped
; with llvm.bswap.i48, and zero-extended back to i64. The expected code swaps
; both 32-bit halves as for a 64-bit swap and then shifts the 64-bit result
; right by 16.
614 define i64 @v_bswap_i48(i64 %src) {
615 ; GFX7-LABEL: v_bswap_i48:
617 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
618 ; GFX7-NEXT: v_alignbit_b32 v2, v1, v1, 8
619 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v1, 24
620 ; GFX7-NEXT: s_mov_b32 s4, 0xff00ff
621 ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v2
622 ; GFX7-NEXT: v_alignbit_b32 v2, v0, v0, 8
623 ; GFX7-NEXT: v_alignbit_b32 v0, v0, v0, 24
624 ; GFX7-NEXT: v_bfi_b32 v2, s4, v0, v2
625 ; GFX7-NEXT: v_lshr_b64 v[0:1], v[1:2], 16
626 ; GFX7-NEXT: s_setpc_b64 s[30:31]
628 ; GFX8-LABEL: v_bswap_i48:
630 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
631 ; GFX8-NEXT: s_mov_b32 s4, 0x10203
632 ; GFX8-NEXT: v_perm_b32 v1, 0, v1, s4
633 ; GFX8-NEXT: v_perm_b32 v2, 0, v0, s4
634 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[1:2]
635 ; GFX8-NEXT: s_setpc_b64 s[30:31]
637 ; GFX9-LABEL: v_bswap_i48:
639 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
640 ; GFX9-NEXT: s_mov_b32 s4, 0x10203
641 ; GFX9-NEXT: v_perm_b32 v1, 0, v1, s4
642 ; GFX9-NEXT: v_perm_b32 v2, 0, v0, s4
643 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[1:2]
644 ; GFX9-NEXT: s_setpc_b64 s[30:31]
646 ; GFX10-LABEL: v_bswap_i48:
648 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
649 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
650 ; GFX10-NEXT: s_mov_b32 s4, 0x10203
651 ; GFX10-NEXT: v_perm_b32 v1, 0, v1, s4
652 ; GFX10-NEXT: v_perm_b32 v2, 0, v0, s4
653 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], 16, v[1:2]
654 ; GFX10-NEXT: s_setpc_b64 s[30:31]
655 %trunc = trunc i64 %src to i48
656 %bswap = call i48 @llvm.bswap.i48(i48 %trunc)
657 %zext = zext i48 %bswap to i64
; Declarations of the llvm.bswap intrinsic overloads used by this test; all of
; them carry attribute group #1.
; NOTE(review): the v3i16 overload is only referenced by the commented-out
; v_bswap_v3i16 test in the lines visible here.
661 declare i16 @llvm.bswap.i16(i16) #1
662 declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) #1
663 declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>) #1
664 declare i32 @llvm.bswap.i32(i32) #1
665 declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) #1
666 declare i64 @llvm.bswap.i64(i64) #1
667 declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) #1
668 declare i48 @llvm.bswap.i48(i48) #1
; Attribute groups. #1 is attached to the intrinsic declarations.
; NOTE(review): no visible line in this section references #0 - confirm it is
; still needed before removing it.
670 attributes #0 = { convergent nounwind readnone }
671 attributes #1 = { nounwind readnone speculatable willreturn }