1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
5 ; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8-GISEL %s
6 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10-GISEL %s
8 ; BFI_INT definition pattern from the ISA docs: (y & x) | (z & ~x)
; Kernel variant of the BFI definition pattern. The arguments are fetched with
; scalar loads, and every run below expects a scalar
; s_andn2_b32 / s_and_b32 / s_or_b32 sequence rather than v_bfi_b32.
11 define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) {
12 ; GFX7-LABEL: s_bfi_def_i32:
13 ; GFX7: ; %bb.0: ; %entry
14 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
15 ; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd
16 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
17 ; GFX7-NEXT: s_mov_b32 s2, -1
18 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
19 ; GFX7-NEXT: s_mov_b32 s0, s4
20 ; GFX7-NEXT: s_mov_b32 s1, s5
21 ; GFX7-NEXT: s_andn2_b32 s4, s8, s6
22 ; GFX7-NEXT: s_and_b32 s5, s7, s6
23 ; GFX7-NEXT: s_or_b32 s4, s4, s5
24 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
25 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
28 ; GFX8-LABEL: s_bfi_def_i32:
29 ; GFX8: ; %bb.0: ; %entry
30 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
31 ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34
32 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
33 ; GFX8-NEXT: s_and_b32 s1, s7, s6
34 ; GFX8-NEXT: s_andn2_b32 s0, s0, s6
35 ; GFX8-NEXT: s_or_b32 s0, s0, s1
36 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
37 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
38 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
39 ; GFX8-NEXT: flat_store_dword v[0:1], v2
42 ; GFX10-LABEL: s_bfi_def_i32:
43 ; GFX10: ; %bb.0: ; %entry
44 ; GFX10-NEXT: s_clause 0x1
45 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
46 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
47 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
48 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
49 ; GFX10-NEXT: s_and_b32 s1, s7, s6
50 ; GFX10-NEXT: s_andn2_b32 s0, s0, s6
51 ; GFX10-NEXT: s_or_b32 s0, s0, s1
52 ; GFX10-NEXT: v_mov_b32_e32 v1, s0
53 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
54 ; GFX10-NEXT: s_endpgm
56 ; GFX8-GISEL-LABEL: s_bfi_def_i32:
57 ; GFX8-GISEL: ; %bb.0: ; %entry
58 ; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
59 ; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34
60 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
61 ; GFX8-GISEL-NEXT: s_and_b32 s1, s7, s6
62 ; GFX8-GISEL-NEXT: s_andn2_b32 s0, s0, s6
63 ; GFX8-GISEL-NEXT: s_or_b32 s0, s0, s1
64 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4
65 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
66 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s5
67 ; GFX8-GISEL-NEXT: flat_store_dword v[0:1], v2
68 ; GFX8-GISEL-NEXT: s_endpgm
70 ; GFX10-GISEL-LABEL: s_bfi_def_i32:
71 ; GFX10-GISEL: ; %bb.0: ; %entry
72 ; GFX10-GISEL-NEXT: s_clause 0x1
73 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
74 ; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34
75 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
76 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
77 ; GFX10-GISEL-NEXT: s_and_b32 s1, s7, s6
78 ; GFX10-GISEL-NEXT: s_andn2_b32 s0, s0, s6
79 ; GFX10-GISEL-NEXT: s_or_b32 s0, s0, s1
80 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
81 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
82 ; GFX10-GISEL-NEXT: s_endpgm
88 store i32 %3, ptr addrspace(1) %out
; Callable-function variant of the BFI definition pattern. Every run below
; expects the whole pattern to fold into a single v_bfi_b32 v0, v0, v1, v2.
92 define i32 @v_bfi_def_i32(i32 %x, i32 %y, i32 %z) {
93 ; GFX7-LABEL: v_bfi_def_i32:
94 ; GFX7: ; %bb.0: ; %entry
95 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
96 ; GFX7-NEXT: v_bfi_b32 v0, v0, v1, v2
97 ; GFX7-NEXT: s_setpc_b64 s[30:31]
99 ; GFX8-LABEL: v_bfi_def_i32:
100 ; GFX8: ; %bb.0: ; %entry
101 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
102 ; GFX8-NEXT: v_bfi_b32 v0, v0, v1, v2
103 ; GFX8-NEXT: s_setpc_b64 s[30:31]
105 ; GFX10-LABEL: v_bfi_def_i32:
106 ; GFX10: ; %bb.0: ; %entry
107 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
108 ; GFX10-NEXT: v_bfi_b32 v0, v0, v1, v2
109 ; GFX10-NEXT: s_setpc_b64 s[30:31]
111 ; GFX8-GISEL-LABEL: v_bfi_def_i32:
112 ; GFX8-GISEL: ; %bb.0: ; %entry
113 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v1, v2
115 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
117 ; GFX10-GISEL-LABEL: v_bfi_def_i32:
118 ; GFX10-GISEL: ; %bb.0: ; %entry
119 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
120 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, v1, v2
121 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
130 ; SHA-256 Ch function: z ^ (x & (y ^ z))
; Kernel variant of the SHA-256 Ch pattern. The arguments are fetched with
; scalar loads, and every run below expects a scalar
; s_xor_b32 / s_and_b32 / s_xor_b32 sequence rather than v_bfi_b32.
132 define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) {
133 ; GFX7-LABEL: s_bfi_sha256_ch:
134 ; GFX7: ; %bb.0: ; %entry
135 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
136 ; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd
137 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
138 ; GFX7-NEXT: s_mov_b32 s2, -1
139 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
140 ; GFX7-NEXT: s_mov_b32 s0, s4
141 ; GFX7-NEXT: s_xor_b32 s4, s7, s8
142 ; GFX7-NEXT: s_and_b32 s4, s6, s4
143 ; GFX7-NEXT: s_xor_b32 s4, s8, s4
144 ; GFX7-NEXT: s_mov_b32 s1, s5
145 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
146 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
147 ; GFX7-NEXT: s_endpgm
149 ; GFX8-LABEL: s_bfi_sha256_ch:
150 ; GFX8: ; %bb.0: ; %entry
151 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
152 ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34
153 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
154 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
155 ; GFX8-NEXT: s_xor_b32 s1, s7, s0
156 ; GFX8-NEXT: s_and_b32 s1, s6, s1
157 ; GFX8-NEXT: s_xor_b32 s0, s0, s1
158 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
159 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
160 ; GFX8-NEXT: flat_store_dword v[0:1], v2
161 ; GFX8-NEXT: s_endpgm
163 ; GFX10-LABEL: s_bfi_sha256_ch:
164 ; GFX10: ; %bb.0: ; %entry
165 ; GFX10-NEXT: s_clause 0x1
166 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
167 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
168 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
169 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
170 ; GFX10-NEXT: s_xor_b32 s1, s7, s0
171 ; GFX10-NEXT: s_and_b32 s1, s6, s1
172 ; GFX10-NEXT: s_xor_b32 s0, s0, s1
173 ; GFX10-NEXT: v_mov_b32_e32 v1, s0
174 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
175 ; GFX10-NEXT: s_endpgm
177 ; GFX8-GISEL-LABEL: s_bfi_sha256_ch:
178 ; GFX8-GISEL: ; %bb.0: ; %entry
179 ; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
180 ; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34
181 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
182 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4
183 ; GFX8-GISEL-NEXT: s_xor_b32 s1, s7, s0
184 ; GFX8-GISEL-NEXT: s_and_b32 s1, s6, s1
185 ; GFX8-GISEL-NEXT: s_xor_b32 s0, s0, s1
186 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
187 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s5
188 ; GFX8-GISEL-NEXT: flat_store_dword v[0:1], v2
189 ; GFX8-GISEL-NEXT: s_endpgm
191 ; GFX10-GISEL-LABEL: s_bfi_sha256_ch:
192 ; GFX10-GISEL: ; %bb.0: ; %entry
193 ; GFX10-GISEL-NEXT: s_clause 0x1
194 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
195 ; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34
196 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
197 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
198 ; GFX10-GISEL-NEXT: s_xor_b32 s1, s7, s0
199 ; GFX10-GISEL-NEXT: s_and_b32 s1, s6, s1
200 ; GFX10-GISEL-NEXT: s_xor_b32 s0, s0, s1
201 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
202 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
203 ; GFX10-GISEL-NEXT: s_endpgm
208 store i32 %2, ptr addrspace(1) %out
; Callable-function variant of the SHA-256 Ch pattern. Every run below expects
; the whole pattern to fold into a single v_bfi_b32 v0, v0, v1, v2.
212 define i32 @v_bfi_sha256_ch(i32 %x, i32 %y, i32 %z) {
213 ; GFX7-LABEL: v_bfi_sha256_ch:
214 ; GFX7: ; %bb.0: ; %entry
215 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
216 ; GFX7-NEXT: v_bfi_b32 v0, v0, v1, v2
217 ; GFX7-NEXT: s_setpc_b64 s[30:31]
219 ; GFX8-LABEL: v_bfi_sha256_ch:
220 ; GFX8: ; %bb.0: ; %entry
221 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
222 ; GFX8-NEXT: v_bfi_b32 v0, v0, v1, v2
223 ; GFX8-NEXT: s_setpc_b64 s[30:31]
225 ; GFX10-LABEL: v_bfi_sha256_ch:
226 ; GFX10: ; %bb.0: ; %entry
227 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
228 ; GFX10-NEXT: v_bfi_b32 v0, v0, v1, v2
229 ; GFX10-NEXT: s_setpc_b64 s[30:31]
231 ; GFX8-GISEL-LABEL: v_bfi_sha256_ch:
232 ; GFX8-GISEL: ; %bb.0: ; %entry
233 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
234 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v1, v2
235 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
237 ; GFX10-GISEL-LABEL: v_bfi_sha256_ch:
238 ; GFX10-GISEL: ; %bb.0: ; %entry
239 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
240 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, v1, v2
241 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
; SHA-256 Ch pattern, z ^ (x & (y ^ z)), with %x as a plain argument and %y, %z
; marked inreg. The tahiti and tonga runs expect one SGPR to be copied into a
; VGPR before v_bfi_b32; the gfx1031 runs expect both SGPRs used directly.
; NOTE(review): presumably a constant-bus operand limit on the older targets
; forces the copy -- confirm against the ISA docs.
249 define amdgpu_ps float @v_s_s_bfi_sha256_ch(i32 %x, i32 inreg %y, i32 inreg %z) {
250 ; GFX7-LABEL: v_s_s_bfi_sha256_ch:
251 ; GFX7: ; %bb.0: ; %entry
252 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
253 ; GFX7-NEXT: v_bfi_b32 v0, v0, v1, s1
254 ; GFX7-NEXT: ; return to shader part epilog
256 ; GFX8-LABEL: v_s_s_bfi_sha256_ch:
257 ; GFX8: ; %bb.0: ; %entry
258 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
259 ; GFX8-NEXT: v_bfi_b32 v0, v0, v1, s1
260 ; GFX8-NEXT: ; return to shader part epilog
262 ; GFX10-LABEL: v_s_s_bfi_sha256_ch:
263 ; GFX10: ; %bb.0: ; %entry
264 ; GFX10-NEXT: v_bfi_b32 v0, v0, s0, s1
265 ; GFX10-NEXT: ; return to shader part epilog
267 ; GFX8-GISEL-LABEL: v_s_s_bfi_sha256_ch:
268 ; GFX8-GISEL: ; %bb.0: ; %entry
269 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s0
270 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v1, s1
271 ; GFX8-GISEL-NEXT: ; return to shader part epilog
273 ; GFX10-GISEL-LABEL: v_s_s_bfi_sha256_ch:
274 ; GFX10-GISEL: ; %bb.0: ; %entry
275 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, s0, s1
276 ; GFX10-GISEL-NEXT: ; return to shader part epilog
278 %xor0 = xor i32 %y, %z
279 %and = and i32 %x, %xor0
280 %xor1 = xor i32 %z, %and
281 %cast = bitcast i32 %xor1 to float
; SHA-256 Ch pattern, z ^ (x & (y ^ z)), with %x and %z marked inreg and %y a
; plain argument. The tahiti and tonga runs expect one SGPR to be copied into a
; VGPR before v_bfi_b32 (the tonga GlobalISel run copies s1 where the others
; copy s0); the gfx1031 runs expect both SGPRs used directly.
285 define amdgpu_ps float @s_v_s_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 inreg %z) {
286 ; GFX7-LABEL: s_v_s_bfi_sha256_ch:
287 ; GFX7: ; %bb.0: ; %entry
288 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
289 ; GFX7-NEXT: v_bfi_b32 v0, v1, v0, s1
290 ; GFX7-NEXT: ; return to shader part epilog
292 ; GFX8-LABEL: s_v_s_bfi_sha256_ch:
293 ; GFX8: ; %bb.0: ; %entry
294 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
295 ; GFX8-NEXT: v_bfi_b32 v0, v1, v0, s1
296 ; GFX8-NEXT: ; return to shader part epilog
298 ; GFX10-LABEL: s_v_s_bfi_sha256_ch:
299 ; GFX10: ; %bb.0: ; %entry
300 ; GFX10-NEXT: v_bfi_b32 v0, s0, v0, s1
301 ; GFX10-NEXT: ; return to shader part epilog
303 ; GFX8-GISEL-LABEL: s_v_s_bfi_sha256_ch:
304 ; GFX8-GISEL: ; %bb.0: ; %entry
305 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
306 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, s0, v0, v1
307 ; GFX8-GISEL-NEXT: ; return to shader part epilog
309 ; GFX10-GISEL-LABEL: s_v_s_bfi_sha256_ch:
310 ; GFX10-GISEL: ; %bb.0: ; %entry
311 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, v0, s1
312 ; GFX10-GISEL-NEXT: ; return to shader part epilog
314 %xor0 = xor i32 %y, %z
315 %and = and i32 %x, %xor0
316 %xor1 = xor i32 %z, %and
317 %cast = bitcast i32 %xor1 to float
; SHA-256 Ch pattern, z ^ (x & (y ^ z)), with %x and %y marked inreg and %z a
; plain argument. The tahiti and tonga runs expect s0 to be copied into a VGPR
; before v_bfi_b32; the gfx1031 runs expect both SGPRs used directly.
321 define amdgpu_ps float @s_s_v_bfi_sha256_ch(i32 inreg %x, i32 inreg %y, i32 %z) {
322 ; GFX7-LABEL: s_s_v_bfi_sha256_ch:
323 ; GFX7: ; %bb.0: ; %entry
324 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
325 ; GFX7-NEXT: v_bfi_b32 v0, v1, s1, v0
326 ; GFX7-NEXT: ; return to shader part epilog
328 ; GFX8-LABEL: s_s_v_bfi_sha256_ch:
329 ; GFX8: ; %bb.0: ; %entry
330 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
331 ; GFX8-NEXT: v_bfi_b32 v0, v1, s1, v0
332 ; GFX8-NEXT: ; return to shader part epilog
334 ; GFX10-LABEL: s_s_v_bfi_sha256_ch:
335 ; GFX10: ; %bb.0: ; %entry
336 ; GFX10-NEXT: v_bfi_b32 v0, s0, s1, v0
337 ; GFX10-NEXT: ; return to shader part epilog
339 ; GFX8-GISEL-LABEL: s_s_v_bfi_sha256_ch:
340 ; GFX8-GISEL: ; %bb.0: ; %entry
341 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s0
342 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v1, s1, v0
343 ; GFX8-GISEL-NEXT: ; return to shader part epilog
345 ; GFX10-GISEL-LABEL: s_s_v_bfi_sha256_ch:
346 ; GFX10-GISEL: ; %bb.0: ; %entry
347 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, s1, v0
348 ; GFX10-GISEL-NEXT: ; return to shader part epilog
350 %xor0 = xor i32 %y, %z
351 %and = and i32 %x, %xor0
352 %xor1 = xor i32 %z, %and
353 %cast = bitcast i32 %xor1 to float
; SHA-256 Ch pattern, z ^ (x & (y ^ z)), with only %x marked inreg. With a
; single SGPR operand every run expects v_bfi_b32 v0, s0, v0, v1 with no copy.
357 define amdgpu_ps float @s_v_v_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 %z) {
358 ; GFX7-LABEL: s_v_v_bfi_sha256_ch:
359 ; GFX7: ; %bb.0: ; %entry
360 ; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1
361 ; GFX7-NEXT: ; return to shader part epilog
363 ; GFX8-LABEL: s_v_v_bfi_sha256_ch:
364 ; GFX8: ; %bb.0: ; %entry
365 ; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1
366 ; GFX8-NEXT: ; return to shader part epilog
368 ; GFX10-LABEL: s_v_v_bfi_sha256_ch:
369 ; GFX10: ; %bb.0: ; %entry
370 ; GFX10-NEXT: v_bfi_b32 v0, s0, v0, v1
371 ; GFX10-NEXT: ; return to shader part epilog
373 ; GFX8-GISEL-LABEL: s_v_v_bfi_sha256_ch:
374 ; GFX8-GISEL: ; %bb.0: ; %entry
375 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, s0, v0, v1
376 ; GFX8-GISEL-NEXT: ; return to shader part epilog
378 ; GFX10-GISEL-LABEL: s_v_v_bfi_sha256_ch:
379 ; GFX10-GISEL: ; %bb.0: ; %entry
380 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, v0, v1
381 ; GFX10-GISEL-NEXT: ; return to shader part epilog
383 %xor0 = xor i32 %y, %z
384 %and = and i32 %x, %xor0
385 %xor1 = xor i32 %z, %and
386 %cast = bitcast i32 %xor1 to float
; SHA-256 Ch pattern, z ^ (x & (y ^ z)), with only %y marked inreg. With a
; single SGPR operand every run expects v_bfi_b32 v0, v0, s0, v1 with no copy.
390 define amdgpu_ps float @v_s_v_bfi_sha256_ch(i32 %x, i32 inreg %y, i32 %z) {
391 ; GFX7-LABEL: v_s_v_bfi_sha256_ch:
392 ; GFX7: ; %bb.0: ; %entry
393 ; GFX7-NEXT: v_bfi_b32 v0, v0, s0, v1
394 ; GFX7-NEXT: ; return to shader part epilog
396 ; GFX8-LABEL: v_s_v_bfi_sha256_ch:
397 ; GFX8: ; %bb.0: ; %entry
398 ; GFX8-NEXT: v_bfi_b32 v0, v0, s0, v1
399 ; GFX8-NEXT: ; return to shader part epilog
401 ; GFX10-LABEL: v_s_v_bfi_sha256_ch:
402 ; GFX10: ; %bb.0: ; %entry
403 ; GFX10-NEXT: v_bfi_b32 v0, v0, s0, v1
404 ; GFX10-NEXT: ; return to shader part epilog
406 ; GFX8-GISEL-LABEL: v_s_v_bfi_sha256_ch:
407 ; GFX8-GISEL: ; %bb.0: ; %entry
408 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, s0, v1
409 ; GFX8-GISEL-NEXT: ; return to shader part epilog
411 ; GFX10-GISEL-LABEL: v_s_v_bfi_sha256_ch:
412 ; GFX10-GISEL: ; %bb.0: ; %entry
413 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, s0, v1
414 ; GFX10-GISEL-NEXT: ; return to shader part epilog
416 %xor0 = xor i32 %y, %z
417 %and = and i32 %x, %xor0
418 %xor1 = xor i32 %z, %and
419 %cast = bitcast i32 %xor1 to float
; SHA-256 Ch pattern, z ^ (x & (y ^ z)), with only %z marked inreg. With a
; single SGPR operand every run expects v_bfi_b32 v0, v0, v1, s0 with no copy.
423 define amdgpu_ps float @v_v_s_bfi_sha256_ch(i32 %x, i32 %y, i32 inreg %z) {
424 ; GFX7-LABEL: v_v_s_bfi_sha256_ch:
425 ; GFX7: ; %bb.0: ; %entry
426 ; GFX7-NEXT: v_bfi_b32 v0, v0, v1, s0
427 ; GFX7-NEXT: ; return to shader part epilog
429 ; GFX8-LABEL: v_v_s_bfi_sha256_ch:
430 ; GFX8: ; %bb.0: ; %entry
431 ; GFX8-NEXT: v_bfi_b32 v0, v0, v1, s0
432 ; GFX8-NEXT: ; return to shader part epilog
434 ; GFX10-LABEL: v_v_s_bfi_sha256_ch:
435 ; GFX10: ; %bb.0: ; %entry
436 ; GFX10-NEXT: v_bfi_b32 v0, v0, v1, s0
437 ; GFX10-NEXT: ; return to shader part epilog
439 ; GFX8-GISEL-LABEL: v_v_s_bfi_sha256_ch:
440 ; GFX8-GISEL: ; %bb.0: ; %entry
441 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v1, s0
442 ; GFX8-GISEL-NEXT: ; return to shader part epilog
444 ; GFX10-GISEL-LABEL: v_v_s_bfi_sha256_ch:
445 ; GFX10-GISEL: ; %bb.0: ; %entry
446 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, v1, s0
447 ; GFX10-GISEL-NEXT: ; return to shader part epilog
449 %xor0 = xor i32 %y, %z
450 %and = and i32 %x, %xor0
451 %xor1 = xor i32 %z, %and
452 %cast = bitcast i32 %xor1 to float
456 ; SHA-256 Ma function
457 ; ((x & z) | (y & (x | z)))
; Kernel variant of the SHA-256 Ma pattern. The arguments are fetched with
; scalar loads, and every run below expects a scalar sequence of two s_and_b32
; and two s_or_b32 instructions rather than v_bfi_b32.
458 define amdgpu_kernel void @s_bfi_sha256_ma(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) {
459 ; GFX7-LABEL: s_bfi_sha256_ma:
460 ; GFX7: ; %bb.0: ; %entry
461 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
462 ; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd
463 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
464 ; GFX7-NEXT: s_mov_b32 s2, -1
465 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
466 ; GFX7-NEXT: s_mov_b32 s1, s5
467 ; GFX7-NEXT: s_or_b32 s5, s6, s8
468 ; GFX7-NEXT: s_mov_b32 s0, s4
469 ; GFX7-NEXT: s_and_b32 s4, s6, s8
470 ; GFX7-NEXT: s_and_b32 s5, s7, s5
471 ; GFX7-NEXT: s_or_b32 s4, s4, s5
472 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
473 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
474 ; GFX7-NEXT: s_endpgm
476 ; GFX8-LABEL: s_bfi_sha256_ma:
477 ; GFX8: ; %bb.0: ; %entry
478 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
479 ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34
480 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
481 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
482 ; GFX8-NEXT: s_and_b32 s1, s6, s0
483 ; GFX8-NEXT: s_or_b32 s0, s6, s0
484 ; GFX8-NEXT: s_and_b32 s0, s7, s0
485 ; GFX8-NEXT: s_or_b32 s0, s1, s0
486 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
487 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
488 ; GFX8-NEXT: flat_store_dword v[0:1], v2
489 ; GFX8-NEXT: s_endpgm
491 ; GFX10-LABEL: s_bfi_sha256_ma:
492 ; GFX10: ; %bb.0: ; %entry
493 ; GFX10-NEXT: s_clause 0x1
494 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
495 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
496 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
497 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
498 ; GFX10-NEXT: s_or_b32 s1, s6, s0
499 ; GFX10-NEXT: s_and_b32 s0, s6, s0
500 ; GFX10-NEXT: s_and_b32 s1, s7, s1
501 ; GFX10-NEXT: s_or_b32 s0, s0, s1
502 ; GFX10-NEXT: v_mov_b32_e32 v1, s0
503 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
504 ; GFX10-NEXT: s_endpgm
506 ; GFX8-GISEL-LABEL: s_bfi_sha256_ma:
507 ; GFX8-GISEL: ; %bb.0: ; %entry
508 ; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
509 ; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34
510 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
511 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4
512 ; GFX8-GISEL-NEXT: s_and_b32 s1, s6, s0
513 ; GFX8-GISEL-NEXT: s_or_b32 s0, s6, s0
514 ; GFX8-GISEL-NEXT: s_and_b32 s0, s7, s0
515 ; GFX8-GISEL-NEXT: s_or_b32 s0, s1, s0
516 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
517 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s5
518 ; GFX8-GISEL-NEXT: flat_store_dword v[0:1], v2
519 ; GFX8-GISEL-NEXT: s_endpgm
521 ; GFX10-GISEL-LABEL: s_bfi_sha256_ma:
522 ; GFX10-GISEL: ; %bb.0: ; %entry
523 ; GFX10-GISEL-NEXT: s_clause 0x1
524 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
525 ; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34
526 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
527 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
528 ; GFX10-GISEL-NEXT: s_or_b32 s1, s6, s0
529 ; GFX10-GISEL-NEXT: s_and_b32 s0, s6, s0
530 ; GFX10-GISEL-NEXT: s_and_b32 s1, s7, s1
531 ; GFX10-GISEL-NEXT: s_or_b32 s0, s0, s1
532 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
533 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
534 ; GFX10-GISEL-NEXT: s_endpgm
540 store i32 %3, ptr addrspace(1) %out
; Callable-function variant of the SHA-256 Ma pattern. Every run below expects
; a two-instruction sequence: v_xor_b32 of the first two arguments, then
; v_bfi_b32 using that xor result as its first operand.
544 define i32 @v_bfi_sha256_ma(i32 %x, i32 %y, i32 %z) {
545 ; GFX7-LABEL: v_bfi_sha256_ma:
546 ; GFX7: ; %bb.0: ; %entry
547 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
548 ; GFX7-NEXT: v_xor_b32_e32 v0, v0, v1
549 ; GFX7-NEXT: v_bfi_b32 v0, v0, v2, v1
550 ; GFX7-NEXT: s_setpc_b64 s[30:31]
552 ; GFX8-LABEL: v_bfi_sha256_ma:
553 ; GFX8: ; %bb.0: ; %entry
554 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
555 ; GFX8-NEXT: v_xor_b32_e32 v0, v0, v1
556 ; GFX8-NEXT: v_bfi_b32 v0, v0, v2, v1
557 ; GFX8-NEXT: s_setpc_b64 s[30:31]
559 ; GFX10-LABEL: v_bfi_sha256_ma:
560 ; GFX10: ; %bb.0: ; %entry
561 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
562 ; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1
563 ; GFX10-NEXT: v_bfi_b32 v0, v0, v2, v1
564 ; GFX10-NEXT: s_setpc_b64 s[30:31]
566 ; GFX8-GISEL-LABEL: v_bfi_sha256_ma:
567 ; GFX8-GISEL: ; %bb.0: ; %entry
568 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
569 ; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
570 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v2, v1
571 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
573 ; GFX10-GISEL-LABEL: v_bfi_sha256_ma:
574 ; GFX10-GISEL: ; %bb.0: ; %entry
575 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
576 ; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
577 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, v2, v1
578 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
; Bitselect written as ((a ^ mask) & b) ^ mask on <2 x i32>. Every run below
; expects two v_bfi_b32 instructions, one per vector element.
587 define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> %mask) {
588 ; GFX7-LABEL: v_bitselect_v2i32_pat1:
590 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
591 ; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4
592 ; GFX7-NEXT: v_bfi_b32 v1, v3, v1, v5
593 ; GFX7-NEXT: s_setpc_b64 s[30:31]
595 ; GFX8-LABEL: v_bitselect_v2i32_pat1:
597 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
598 ; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4
599 ; GFX8-NEXT: v_bfi_b32 v1, v3, v1, v5
600 ; GFX8-NEXT: s_setpc_b64 s[30:31]
602 ; GFX10-LABEL: v_bitselect_v2i32_pat1:
604 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
605 ; GFX10-NEXT: v_bfi_b32 v0, v2, v0, v4
606 ; GFX10-NEXT: v_bfi_b32 v1, v3, v1, v5
607 ; GFX10-NEXT: s_setpc_b64 s[30:31]
609 ; GFX8-GISEL-LABEL: v_bitselect_v2i32_pat1:
610 ; GFX8-GISEL: ; %bb.0:
611 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
612 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v4
613 ; GFX8-GISEL-NEXT: v_bfi_b32 v1, v3, v1, v5
614 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
616 ; GFX10-GISEL-LABEL: v_bitselect_v2i32_pat1:
617 ; GFX10-GISEL: ; %bb.0:
618 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
619 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v4
620 ; GFX10-GISEL-NEXT: v_bfi_b32 v1, v3, v1, v5
621 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
622 %xor.0 = xor <2 x i32> %a, %mask
623 %and = and <2 x i32> %xor.0, %b
624 %bitselect = xor <2 x i32> %and, %mask
625 ret <2 x i32> %bitselect
; Bitselect written as (a & b) | (~a & mask) on i64. Every run below expects
; two v_bfi_b32 instructions, one per 32-bit half. The tahiti and tonga
; SelectionDAG checks list the v1 half first; the remaining runs list v0 first.
628 define i64 @v_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
629 ; GFX7-LABEL: v_bitselect_i64_pat_0:
631 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
632 ; GFX7-NEXT: v_bfi_b32 v1, v1, v3, v5
633 ; GFX7-NEXT: v_bfi_b32 v0, v0, v2, v4
634 ; GFX7-NEXT: s_setpc_b64 s[30:31]
636 ; GFX8-LABEL: v_bitselect_i64_pat_0:
638 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
639 ; GFX8-NEXT: v_bfi_b32 v1, v1, v3, v5
640 ; GFX8-NEXT: v_bfi_b32 v0, v0, v2, v4
641 ; GFX8-NEXT: s_setpc_b64 s[30:31]
643 ; GFX10-LABEL: v_bitselect_i64_pat_0:
645 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
646 ; GFX10-NEXT: v_bfi_b32 v0, v0, v2, v4
647 ; GFX10-NEXT: v_bfi_b32 v1, v1, v3, v5
648 ; GFX10-NEXT: s_setpc_b64 s[30:31]
650 ; GFX8-GISEL-LABEL: v_bitselect_i64_pat_0:
651 ; GFX8-GISEL: ; %bb.0:
652 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
653 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v2, v4
654 ; GFX8-GISEL-NEXT: v_bfi_b32 v1, v1, v3, v5
655 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
657 ; GFX10-GISEL-LABEL: v_bitselect_i64_pat_0:
658 ; GFX10-GISEL: ; %bb.0:
659 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
660 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, v2, v4
661 ; GFX10-GISEL-NEXT: v_bfi_b32 v1, v1, v3, v5
662 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
663 %and0 = and i64 %a, %b
664 %not.a = xor i64 %a, -1
665 %and1 = and i64 %not.a, %mask
666 %bitselect = or i64 %and0, %and1
; i64 bitselect (a & b) | (~a & mask) with %a a plain argument and %b, %mask
; marked inreg. Every run expects one v_bfi_b32 per 32-bit half. The tahiti and
; tonga runs first copy one SGPR per half into v2; the gfx1031 runs use both
; SGPRs directly.
670 define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_0(i64 %a, i64 inreg %b, i64 inreg %mask) {
671 ; GFX7-LABEL: v_s_s_bitselect_i64_pat_0:
673 ; GFX7-NEXT: v_mov_b32_e32 v2, s3
674 ; GFX7-NEXT: v_bfi_b32 v1, v1, s1, v2
675 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
676 ; GFX7-NEXT: v_bfi_b32 v0, v0, s0, v2
677 ; GFX7-NEXT: ; return to shader part epilog
679 ; GFX8-LABEL: v_s_s_bitselect_i64_pat_0:
681 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
682 ; GFX8-NEXT: v_bfi_b32 v1, v1, s1, v2
683 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
684 ; GFX8-NEXT: v_bfi_b32 v0, v0, s0, v2
685 ; GFX8-NEXT: ; return to shader part epilog
687 ; GFX10-LABEL: v_s_s_bitselect_i64_pat_0:
689 ; GFX10-NEXT: v_bfi_b32 v0, v0, s0, s2
690 ; GFX10-NEXT: v_bfi_b32 v1, v1, s1, s3
691 ; GFX10-NEXT: ; return to shader part epilog
693 ; GFX8-GISEL-LABEL: v_s_s_bitselect_i64_pat_0:
694 ; GFX8-GISEL: ; %bb.0:
695 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
696 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v2, s2
697 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s1
698 ; GFX8-GISEL-NEXT: v_bfi_b32 v1, v1, v2, s3
699 ; GFX8-GISEL-NEXT: ; return to shader part epilog
701 ; GFX10-GISEL-LABEL: v_s_s_bitselect_i64_pat_0:
702 ; GFX10-GISEL: ; %bb.0:
703 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, s0, s2
704 ; GFX10-GISEL-NEXT: v_bfi_b32 v1, v1, s1, s3
705 ; GFX10-GISEL-NEXT: ; return to shader part epilog
706 %and0 = and i64 %a, %b
707 %not.a = xor i64 %a, -1
708 %and1 = and i64 %not.a, %mask
709 %bitselect = or i64 %and0, %and1
710 %cast = bitcast i64 %bitselect to <2 x float>
711 ret <2 x float> %cast
; i64 bitselect (a & b) | (~a & mask) with %a and %mask marked inreg and %b a
; plain argument. The SelectionDAG runs expect one v_bfi_b32 per 32-bit half.
; The GlobalISel runs currently expect no v_bfi_b32: s_andn2_b64 plus separate
; v_and/v_or on tonga, or v_and_or_b32 on gfx1031.
714 define amdgpu_ps <2 x float> @s_v_s_bitselect_i64_pat_0(i64 inreg %a, i64 %b, i64 inreg %mask) {
715 ; GFX7-LABEL: s_v_s_bitselect_i64_pat_0:
717 ; GFX7-NEXT: v_mov_b32_e32 v2, s3
718 ; GFX7-NEXT: v_bfi_b32 v1, s1, v1, v2
719 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
720 ; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v2
721 ; GFX7-NEXT: ; return to shader part epilog
723 ; GFX8-LABEL: s_v_s_bitselect_i64_pat_0:
725 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
726 ; GFX8-NEXT: v_bfi_b32 v1, s1, v1, v2
727 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
728 ; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v2
729 ; GFX8-NEXT: ; return to shader part epilog
731 ; GFX10-LABEL: s_v_s_bitselect_i64_pat_0:
733 ; GFX10-NEXT: v_bfi_b32 v0, s0, v0, s2
734 ; GFX10-NEXT: v_bfi_b32 v1, s1, v1, s3
735 ; GFX10-NEXT: ; return to shader part epilog
737 ; GFX8-GISEL-LABEL: s_v_s_bitselect_i64_pat_0:
738 ; GFX8-GISEL: ; %bb.0:
739 ; GFX8-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
740 ; GFX8-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
741 ; GFX8-GISEL-NEXT: s_andn2_b64 s[0:1], s[2:3], s[0:1]
742 ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, s0, v0
743 ; GFX8-GISEL-NEXT: v_or_b32_e32 v1, s1, v1
744 ; GFX8-GISEL-NEXT: ; return to shader part epilog
746 ; GFX10-GISEL-LABEL: s_v_s_bitselect_i64_pat_0:
747 ; GFX10-GISEL: ; %bb.0:
748 ; GFX10-GISEL-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
749 ; GFX10-GISEL-NEXT: v_and_or_b32 v0, s0, v0, s2
750 ; GFX10-GISEL-NEXT: v_and_or_b32 v1, s1, v1, s3
751 ; GFX10-GISEL-NEXT: ; return to shader part epilog
752 %and0 = and i64 %a, %b
753 %not.a = xor i64 %a, -1
754 %and1 = and i64 %not.a, %mask
755 %bitselect = or i64 %and0, %and1
756 %cast = bitcast i64 %bitselect to <2 x float>
757 ret <2 x float> %cast
; i64 bitselect (a & b) | (~a & mask) with %a and %b marked inreg and %mask a
; plain argument. The SelectionDAG runs expect one v_bfi_b32 per 32-bit half.
; The GlobalISel runs currently expect no v_bfi_b32: s_and_b64 and s_not_b64
; followed by v_and/v_or on tonga, or v_and_or_b32 on gfx1031.
760 define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_0(i64 inreg %a, i64 inreg %b, i64 %mask) {
761 ; GFX7-LABEL: s_s_v_bitselect_i64_pat_0:
763 ; GFX7-NEXT: v_mov_b32_e32 v2, s3
764 ; GFX7-NEXT: v_bfi_b32 v1, s1, v2, v1
765 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
766 ; GFX7-NEXT: v_bfi_b32 v0, s0, v2, v0
767 ; GFX7-NEXT: ; return to shader part epilog
769 ; GFX8-LABEL: s_s_v_bitselect_i64_pat_0:
771 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
772 ; GFX8-NEXT: v_bfi_b32 v1, s1, v2, v1
773 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
774 ; GFX8-NEXT: v_bfi_b32 v0, s0, v2, v0
775 ; GFX8-NEXT: ; return to shader part epilog
777 ; GFX10-LABEL: s_s_v_bitselect_i64_pat_0:
779 ; GFX10-NEXT: v_bfi_b32 v0, s0, s2, v0
780 ; GFX10-NEXT: v_bfi_b32 v1, s1, s3, v1
781 ; GFX10-NEXT: ; return to shader part epilog
783 ; GFX8-GISEL-LABEL: s_s_v_bitselect_i64_pat_0:
784 ; GFX8-GISEL: ; %bb.0:
785 ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3]
786 ; GFX8-GISEL-NEXT: s_not_b64 s[0:1], s[0:1]
787 ; GFX8-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
788 ; GFX8-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
789 ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, s2, v0
790 ; GFX8-GISEL-NEXT: v_or_b32_e32 v1, s3, v1
791 ; GFX8-GISEL-NEXT: ; return to shader part epilog
793 ; GFX10-GISEL-LABEL: s_s_v_bitselect_i64_pat_0:
794 ; GFX10-GISEL: ; %bb.0:
795 ; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3]
796 ; GFX10-GISEL-NEXT: s_not_b64 s[0:1], s[0:1]
797 ; GFX10-GISEL-NEXT: v_and_or_b32 v0, s0, v0, s2
798 ; GFX10-GISEL-NEXT: v_and_or_b32 v1, s1, v1, s3
799 ; GFX10-GISEL-NEXT: ; return to shader part epilog
800 %and0 = and i64 %a, %b
801 %not.a = xor i64 %a, -1
802 %and1 = and i64 %not.a, %mask
803 %bitselect = or i64 %and0, %and1
804 %cast = bitcast i64 %bitselect to <2 x float>
805 ret <2 x float> %cast
; i64 bitselect (a & b) | (~a & mask) with only %mask marked inreg. Every run
; expects one v_bfi_b32 per 32-bit half, taking the SGPR directly as the last
; operand with no copy.
808 define amdgpu_ps <2 x float> @v_v_s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 inreg %mask) {
809 ; GFX7-LABEL: v_v_s_bitselect_i64_pat_0:
811 ; GFX7-NEXT: v_bfi_b32 v1, v1, v3, s1
812 ; GFX7-NEXT: v_bfi_b32 v0, v0, v2, s0
813 ; GFX7-NEXT: ; return to shader part epilog
815 ; GFX8-LABEL: v_v_s_bitselect_i64_pat_0:
817 ; GFX8-NEXT: v_bfi_b32 v1, v1, v3, s1
818 ; GFX8-NEXT: v_bfi_b32 v0, v0, v2, s0
819 ; GFX8-NEXT: ; return to shader part epilog
821 ; GFX10-LABEL: v_v_s_bitselect_i64_pat_0:
823 ; GFX10-NEXT: v_bfi_b32 v0, v0, v2, s0
824 ; GFX10-NEXT: v_bfi_b32 v1, v1, v3, s1
825 ; GFX10-NEXT: ; return to shader part epilog
827 ; GFX8-GISEL-LABEL: v_v_s_bitselect_i64_pat_0:
828 ; GFX8-GISEL: ; %bb.0:
829 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v2, s0
830 ; GFX8-GISEL-NEXT: v_bfi_b32 v1, v1, v3, s1
831 ; GFX8-GISEL-NEXT: ; return to shader part epilog
833 ; GFX10-GISEL-LABEL: v_v_s_bitselect_i64_pat_0:
834 ; GFX10-GISEL: ; %bb.0:
835 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, v2, s0
836 ; GFX10-GISEL-NEXT: v_bfi_b32 v1, v1, v3, s1
837 ; GFX10-GISEL-NEXT: ; return to shader part epilog
838 %and0 = and i64 %a, %b
839 %not.a = xor i64 %a, -1
840 %and1 = and i64 %not.a, %mask
841 %bitselect = or i64 %and0, %and1
842 %cast = bitcast i64 %bitselect to <2 x float>
843 ret <2 x float> %cast
; i64 bitselect (a & b) | (~a & mask) with only %b marked inreg. Every run
; expects one v_bfi_b32 per 32-bit half, taking the SGPR directly as the middle
; operand with no copy.
846 define amdgpu_ps <2 x float> @v_s_v_bitselect_i64_pat_0(i64 %a, i64 inreg %b, i64 %mask) {
847 ; GFX7-LABEL: v_s_v_bitselect_i64_pat_0:
849 ; GFX7-NEXT: v_bfi_b32 v1, v1, s1, v3
850 ; GFX7-NEXT: v_bfi_b32 v0, v0, s0, v2
851 ; GFX7-NEXT: ; return to shader part epilog
853 ; GFX8-LABEL: v_s_v_bitselect_i64_pat_0:
855 ; GFX8-NEXT: v_bfi_b32 v1, v1, s1, v3
856 ; GFX8-NEXT: v_bfi_b32 v0, v0, s0, v2
857 ; GFX8-NEXT: ; return to shader part epilog
859 ; GFX10-LABEL: v_s_v_bitselect_i64_pat_0:
861 ; GFX10-NEXT: v_bfi_b32 v0, v0, s0, v2
862 ; GFX10-NEXT: v_bfi_b32 v1, v1, s1, v3
863 ; GFX10-NEXT: ; return to shader part epilog
865 ; GFX8-GISEL-LABEL: v_s_v_bitselect_i64_pat_0:
866 ; GFX8-GISEL: ; %bb.0:
867 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, s0, v2
868 ; GFX8-GISEL-NEXT: v_bfi_b32 v1, v1, s1, v3
869 ; GFX8-GISEL-NEXT: ; return to shader part epilog
871 ; GFX10-GISEL-LABEL: v_s_v_bitselect_i64_pat_0:
872 ; GFX10-GISEL: ; %bb.0:
873 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, s0, v2
874 ; GFX10-GISEL-NEXT: v_bfi_b32 v1, v1, s1, v3
875 ; GFX10-GISEL-NEXT: ; return to shader part epilog
876 %and0 = and i64 %a, %b
877 %not.a = xor i64 %a, -1
878 %and1 = and i64 %not.a, %mask
879 %bitselect = or i64 %and0, %and1
880 %cast = bitcast i64 %bitselect to <2 x float>
881 ret <2 x float> %cast
; i64 bitselect (a & b) | (~a & mask) with only %a marked inreg. The
; SelectionDAG runs expect one v_bfi_b32 per 32-bit half. The GlobalISel runs
; currently expect no v_bfi_b32: s_not_b64 with v_and/v_or on tonga, or with
; v_and and v_and_or_b32 on gfx1031.
884 define amdgpu_ps <2 x float> @s_v_v_bitselect_i64_pat_0(i64 inreg %a, i64 %b, i64 %mask) {
885 ; GFX7-LABEL: s_v_v_bitselect_i64_pat_0:
887 ; GFX7-NEXT: v_bfi_b32 v1, s1, v1, v3
888 ; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v2
889 ; GFX7-NEXT: ; return to shader part epilog
891 ; GFX8-LABEL: s_v_v_bitselect_i64_pat_0:
893 ; GFX8-NEXT: v_bfi_b32 v1, s1, v1, v3
894 ; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v2
895 ; GFX8-NEXT: ; return to shader part epilog
897 ; GFX10-LABEL: s_v_v_bitselect_i64_pat_0:
899 ; GFX10-NEXT: v_bfi_b32 v0, s0, v0, v2
900 ; GFX10-NEXT: v_bfi_b32 v1, s1, v1, v3
901 ; GFX10-NEXT: ; return to shader part epilog
903 ; GFX8-GISEL-LABEL: s_v_v_bitselect_i64_pat_0:
904 ; GFX8-GISEL: ; %bb.0:
905 ; GFX8-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
906 ; GFX8-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
907 ; GFX8-GISEL-NEXT: s_not_b64 s[0:1], s[0:1]
908 ; GFX8-GISEL-NEXT: v_and_b32_e32 v2, s0, v2
909 ; GFX8-GISEL-NEXT: v_and_b32_e32 v3, s1, v3
910 ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v0, v2
911 ; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
912 ; GFX8-GISEL-NEXT: ; return to shader part epilog
914 ; GFX10-GISEL-LABEL: s_v_v_bitselect_i64_pat_0:
915 ; GFX10-GISEL: ; %bb.0:
916 ; GFX10-GISEL-NEXT: s_not_b64 s[2:3], s[0:1]
917 ; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2
918 ; GFX10-GISEL-NEXT: v_and_b32_e32 v3, s3, v3
919 ; GFX10-GISEL-NEXT: v_and_or_b32 v0, s0, v0, v2
920 ; GFX10-GISEL-NEXT: v_and_or_b32 v1, s1, v1, v3
921 ; GFX10-GISEL-NEXT: ; return to shader part epilog
922 %and0 = and i64 %a, %b
923 %not.a = xor i64 %a, -1
924 %and1 = and i64 %not.a, %mask
925 %bitselect = or i64 %and0, %and1
926 %cast = bitcast i64 %bitselect to <2 x float>
927 ret <2 x float> %cast
; 64-bit bitselect, pattern 1, with no operand marked inreg.
; The IR computes ((%a ^ %mask) & %b) ^ %mask.
; Every run expects the same two v_bfi_b32 instructions, one per 32-bit half
; (operands v2 / v0 / v4 and v3 / v1 / v5); only the order of the two
; instructions differs between runs.
930 define i64 @v_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
931 ; GFX7-LABEL: v_bitselect_i64_pat_1:
933 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
934 ; GFX7-NEXT: v_bfi_b32 v1, v3, v1, v5
935 ; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4
936 ; GFX7-NEXT: s_setpc_b64 s[30:31]
938 ; GFX8-LABEL: v_bitselect_i64_pat_1:
940 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
941 ; GFX8-NEXT: v_bfi_b32 v1, v3, v1, v5
942 ; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4
943 ; GFX8-NEXT: s_setpc_b64 s[30:31]
945 ; GFX10-LABEL: v_bitselect_i64_pat_1:
947 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
948 ; GFX10-NEXT: v_bfi_b32 v0, v2, v0, v4
949 ; GFX10-NEXT: v_bfi_b32 v1, v3, v1, v5
950 ; GFX10-NEXT: s_setpc_b64 s[30:31]
952 ; GFX8-GISEL-LABEL: v_bitselect_i64_pat_1:
953 ; GFX8-GISEL: ; %bb.0:
954 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
955 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v4
956 ; GFX8-GISEL-NEXT: v_bfi_b32 v1, v3, v1, v5
957 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
959 ; GFX10-GISEL-LABEL: v_bitselect_i64_pat_1:
960 ; GFX10-GISEL: ; %bb.0:
961 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
962 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v4
963 ; GFX10-GISEL-NEXT: v_bfi_b32 v1, v3, v1, v5
964 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
965 %xor.0 = xor i64 %a, %mask
966 %and = and i64 %xor.0, %b
967 %bitselect = xor i64 %and, %mask
; 64-bit bitselect, pattern 1, with %a passed without inreg and %b / %mask
; passed inreg. The IR computes ((%a ^ %mask) & %b) ^ %mask and returns the
; result bitcast to <2 x float>.
; All runs expect one v_bfi_b32 per 32-bit half. The tahiti and tonga runs
; (with and without -global-isel) expect a v_mov_b32 that copies one SGPR
; operand into v2 before each v_bfi_b32; both gfx1031 runs expect the two SGPR
; operands to be used by v_bfi_b32 directly.
971 define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_1(i64 %a, i64 inreg %b, i64 inreg %mask) {
972 ; GFX7-LABEL: v_s_s_bitselect_i64_pat_1:
974 ; GFX7-NEXT: v_mov_b32_e32 v2, s3
975 ; GFX7-NEXT: v_bfi_b32 v1, s1, v1, v2
976 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
977 ; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v2
978 ; GFX7-NEXT: ; return to shader part epilog
980 ; GFX8-LABEL: v_s_s_bitselect_i64_pat_1:
982 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
983 ; GFX8-NEXT: v_bfi_b32 v1, s1, v1, v2
984 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
985 ; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v2
986 ; GFX8-NEXT: ; return to shader part epilog
988 ; GFX10-LABEL: v_s_s_bitselect_i64_pat_1:
990 ; GFX10-NEXT: v_bfi_b32 v0, s0, v0, s2
991 ; GFX10-NEXT: v_bfi_b32 v1, s1, v1, s3
992 ; GFX10-NEXT: ; return to shader part epilog
994 ; GFX8-GISEL-LABEL: v_s_s_bitselect_i64_pat_1:
995 ; GFX8-GISEL: ; %bb.0:
996 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
997 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v2, v0, s2
998 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s1
999 ; GFX8-GISEL-NEXT: v_bfi_b32 v1, v2, v1, s3
1000 ; GFX8-GISEL-NEXT: ; return to shader part epilog
1002 ; GFX10-GISEL-LABEL: v_s_s_bitselect_i64_pat_1:
1003 ; GFX10-GISEL: ; %bb.0:
1004 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, v0, s2
1005 ; GFX10-GISEL-NEXT: v_bfi_b32 v1, s1, v1, s3
1006 ; GFX10-GISEL-NEXT: ; return to shader part epilog
1007 %xor.0 = xor i64 %a, %mask
1008 %and = and i64 %xor.0, %b
1009 %bitselect = xor i64 %and, %mask
1010 %cast = bitcast i64 %bitselect to <2 x float>
1011 ret <2 x float> %cast
; 64-bit bitselect, pattern 1, with %a / %b passed inreg and %mask passed
; without inreg. The IR computes ((%a ^ %mask) & %b) ^ %mask and returns the
; result bitcast to <2 x float>.
; All runs expect one v_bfi_b32 per 32-bit half. The tahiti and tonga runs
; (with and without -global-isel) expect a v_mov_b32 that copies one SGPR
; operand into v2 before each v_bfi_b32; both gfx1031 runs expect the two SGPR
; operands to be used by v_bfi_b32 directly.
1014 define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_1(i64 inreg %a, i64 inreg %b, i64 %mask) {
1015 ; GFX7-LABEL: s_s_v_bitselect_i64_pat_1:
1017 ; GFX7-NEXT: v_mov_b32_e32 v2, s1
1018 ; GFX7-NEXT: v_bfi_b32 v1, s3, v2, v1
1019 ; GFX7-NEXT: v_mov_b32_e32 v2, s0
1020 ; GFX7-NEXT: v_bfi_b32 v0, s2, v2, v0
1021 ; GFX7-NEXT: ; return to shader part epilog
1023 ; GFX8-LABEL: s_s_v_bitselect_i64_pat_1:
1025 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
1026 ; GFX8-NEXT: v_bfi_b32 v1, s3, v2, v1
1027 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
1028 ; GFX8-NEXT: v_bfi_b32 v0, s2, v2, v0
1029 ; GFX8-NEXT: ; return to shader part epilog
1031 ; GFX10-LABEL: s_s_v_bitselect_i64_pat_1:
1033 ; GFX10-NEXT: v_bfi_b32 v0, s2, s0, v0
1034 ; GFX10-NEXT: v_bfi_b32 v1, s3, s1, v1
1035 ; GFX10-NEXT: ; return to shader part epilog
1037 ; GFX8-GISEL-LABEL: s_s_v_bitselect_i64_pat_1:
1038 ; GFX8-GISEL: ; %bb.0:
1039 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s2
1040 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v2, s0, v0
1041 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s3
1042 ; GFX8-GISEL-NEXT: v_bfi_b32 v1, v2, s1, v1
1043 ; GFX8-GISEL-NEXT: ; return to shader part epilog
1045 ; GFX10-GISEL-LABEL: s_s_v_bitselect_i64_pat_1:
1046 ; GFX10-GISEL: ; %bb.0:
1047 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, s2, s0, v0
1048 ; GFX10-GISEL-NEXT: v_bfi_b32 v1, s3, s1, v1
1049 ; GFX10-GISEL-NEXT: ; return to shader part epilog
1050 %xor.0 = xor i64 %a, %mask
1051 %and = and i64 %xor.0, %b
1052 %bitselect = xor i64 %and, %mask
1053 %cast = bitcast i64 %bitselect to <2 x float>
1054 ret <2 x float> %cast
; 64-bit bitselect, pattern 1, with %a / %mask passed inreg and %b passed
; without inreg. The IR computes ((%a ^ %mask) & %b) ^ %mask and returns the
; result bitcast to <2 x float>.
; The runs without -global-isel expect one v_bfi_b32 per 32-bit half (tahiti
; and tonga first copy the %mask half into v2 with v_mov_b32). The
; -global-isel runs expect no v_bfi_b32 here; they expect a scalar s_xor_b64
; followed by per-half v_and_b32 and v_xor_b32.
1057 define amdgpu_ps <2 x float> @s_v_s_bitselect_i64_pat_1(i64 inreg %a, i64 %b, i64 inreg %mask) {
1058 ; GFX7-LABEL: s_v_s_bitselect_i64_pat_1:
1060 ; GFX7-NEXT: v_mov_b32_e32 v2, s3
1061 ; GFX7-NEXT: v_bfi_b32 v1, v1, s1, v2
1062 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
1063 ; GFX7-NEXT: v_bfi_b32 v0, v0, s0, v2
1064 ; GFX7-NEXT: ; return to shader part epilog
1066 ; GFX8-LABEL: s_v_s_bitselect_i64_pat_1:
1068 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
1069 ; GFX8-NEXT: v_bfi_b32 v1, v1, s1, v2
1070 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
1071 ; GFX8-NEXT: v_bfi_b32 v0, v0, s0, v2
1072 ; GFX8-NEXT: ; return to shader part epilog
1074 ; GFX10-LABEL: s_v_s_bitselect_i64_pat_1:
1076 ; GFX10-NEXT: v_bfi_b32 v0, v0, s0, s2
1077 ; GFX10-NEXT: v_bfi_b32 v1, v1, s1, s3
1078 ; GFX10-NEXT: ; return to shader part epilog
1080 ; GFX8-GISEL-LABEL: s_v_s_bitselect_i64_pat_1:
1081 ; GFX8-GISEL: ; %bb.0:
1082 ; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
1083 ; GFX8-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
1084 ; GFX8-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
1085 ; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, s2, v0
1086 ; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, s3, v1
1087 ; GFX8-GISEL-NEXT: ; return to shader part epilog
1089 ; GFX10-GISEL-LABEL: s_v_s_bitselect_i64_pat_1:
1090 ; GFX10-GISEL: ; %bb.0:
1091 ; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
1092 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
1093 ; GFX10-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
1094 ; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, s2, v0
1095 ; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, s3, v1
1096 ; GFX10-GISEL-NEXT: ; return to shader part epilog
1097 %xor.0 = xor i64 %a, %mask
1098 %and = and i64 %xor.0, %b
1099 %bitselect = xor i64 %and, %mask
1100 %cast = bitcast i64 %bitselect to <2 x float>
1101 ret <2 x float> %cast
; 64-bit bitselect, pattern 2, with no operand marked inreg.
; The IR computes ((%a ^ %mask) & %b) ^ %mask.
; Every run expects the same two v_bfi_b32 instructions, one per 32-bit half;
; only the order of the two instructions differs between runs.
; NOTE(review) the three IR instructions below are identical to those of
; v_bitselect_i64_pat_1, so this function does not appear to exercise a
; distinct pattern - confirm whether a different expression was intended.
1104 define i64 @v_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
1105 ; GFX7-LABEL: v_bitselect_i64_pat_2:
1107 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1108 ; GFX7-NEXT: v_bfi_b32 v1, v3, v1, v5
1109 ; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4
1110 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1112 ; GFX8-LABEL: v_bitselect_i64_pat_2:
1114 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1115 ; GFX8-NEXT: v_bfi_b32 v1, v3, v1, v5
1116 ; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4
1117 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1119 ; GFX10-LABEL: v_bitselect_i64_pat_2:
1121 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1122 ; GFX10-NEXT: v_bfi_b32 v0, v2, v0, v4
1123 ; GFX10-NEXT: v_bfi_b32 v1, v3, v1, v5
1124 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1126 ; GFX8-GISEL-LABEL: v_bitselect_i64_pat_2:
1127 ; GFX8-GISEL: ; %bb.0:
1128 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1129 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v4
1130 ; GFX8-GISEL-NEXT: v_bfi_b32 v1, v3, v1, v5
1131 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
1133 ; GFX10-GISEL-LABEL: v_bitselect_i64_pat_2:
1134 ; GFX10-GISEL: ; %bb.0:
1135 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1136 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v4
1137 ; GFX10-GISEL-NEXT: v_bfi_b32 v1, v3, v1, v5
1138 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
1139 %xor.0 = xor i64 %a, %mask
1140 %and = and i64 %xor.0, %b
1141 %bitselect = xor i64 %and, %mask
; 64-bit form of the expression (%x & %z) | (%y & (%x | %z)), which is the
; bitwise majority of %x, %y and %z. No operand is marked inreg.
; Every run expects, for each 32-bit half, a v_xor_b32 of the %x and %y halves
; followed by a v_bfi_b32 that uses the xor result as its first source; only
; the instruction order differs between runs.
1145 define i64 @v_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
1146 ; GFX7-LABEL: v_bfi_sha256_ma_i64:
1147 ; GFX7: ; %bb.0: ; %entry
1148 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1149 ; GFX7-NEXT: v_xor_b32_e32 v1, v1, v3
1150 ; GFX7-NEXT: v_xor_b32_e32 v0, v0, v2
1151 ; GFX7-NEXT: v_bfi_b32 v1, v1, v5, v3
1152 ; GFX7-NEXT: v_bfi_b32 v0, v0, v4, v2
1153 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1155 ; GFX8-LABEL: v_bfi_sha256_ma_i64:
1156 ; GFX8: ; %bb.0: ; %entry
1157 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1158 ; GFX8-NEXT: v_xor_b32_e32 v1, v1, v3
1159 ; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2
1160 ; GFX8-NEXT: v_bfi_b32 v1, v1, v5, v3
1161 ; GFX8-NEXT: v_bfi_b32 v0, v0, v4, v2
1162 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1164 ; GFX10-LABEL: v_bfi_sha256_ma_i64:
1165 ; GFX10: ; %bb.0: ; %entry
1166 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1167 ; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2
1168 ; GFX10-NEXT: v_xor_b32_e32 v1, v1, v3
1169 ; GFX10-NEXT: v_bfi_b32 v0, v0, v4, v2
1170 ; GFX10-NEXT: v_bfi_b32 v1, v1, v5, v3
1171 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1173 ; GFX8-GISEL-LABEL: v_bfi_sha256_ma_i64:
1174 ; GFX8-GISEL: ; %bb.0: ; %entry
1175 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1176 ; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
1177 ; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
1178 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v4, v2
1179 ; GFX8-GISEL-NEXT: v_bfi_b32 v1, v1, v5, v3
1180 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
1182 ; GFX10-GISEL-LABEL: v_bfi_sha256_ma_i64:
1183 ; GFX10-GISEL: ; %bb.0: ; %entry
1184 ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1185 ; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
1186 ; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
1187 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, v4, v2
1188 ; GFX10-GISEL-NEXT: v_bfi_b32 v1, v1, v5, v3
1189 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
1191 %and0 = and i64 %x, %z
1192 %or0 = or i64 %x, %z
1193 %and1 = and i64 %y, %or0
1194 %or1 = or i64 %and0, %and1
; Bitwise majority (%x & %z) | (%y & (%x | %z)) on i64, with %x passed without
; inreg and %y / %z passed inreg. The result is returned bitcast to
; <2 x float>.
; All runs expect a v_xor_b32 plus a v_bfi_b32 per 32-bit half. The tahiti and
; tonga runs (with and without -global-isel) expect a v_mov_b32 that copies
; one SGPR operand into v2 for each half; both gfx1031 runs expect v_bfi_b32
; to take its two SGPR operands directly.
1198 define amdgpu_ps <2 x float> @v_s_s_bfi_sha256_ma_i64(i64 %x, i64 inreg %y, i64 inreg %z) {
1199 ; GFX7-LABEL: v_s_s_bfi_sha256_ma_i64:
1200 ; GFX7: ; %bb.0: ; %entry
1201 ; GFX7-NEXT: v_xor_b32_e32 v1, s1, v1
1202 ; GFX7-NEXT: v_mov_b32_e32 v2, s1
1203 ; GFX7-NEXT: v_bfi_b32 v1, v1, s3, v2
1204 ; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0
1205 ; GFX7-NEXT: v_mov_b32_e32 v2, s0
1206 ; GFX7-NEXT: v_bfi_b32 v0, v0, s2, v2
1207 ; GFX7-NEXT: ; return to shader part epilog
1209 ; GFX8-LABEL: v_s_s_bfi_sha256_ma_i64:
1210 ; GFX8: ; %bb.0: ; %entry
1211 ; GFX8-NEXT: v_xor_b32_e32 v1, s1, v1
1212 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
1213 ; GFX8-NEXT: v_bfi_b32 v1, v1, s3, v2
1214 ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0
1215 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
1216 ; GFX8-NEXT: v_bfi_b32 v0, v0, s2, v2
1217 ; GFX8-NEXT: ; return to shader part epilog
1219 ; GFX10-LABEL: v_s_s_bfi_sha256_ma_i64:
1220 ; GFX10: ; %bb.0: ; %entry
1221 ; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0
1222 ; GFX10-NEXT: v_xor_b32_e32 v1, s1, v1
1223 ; GFX10-NEXT: v_bfi_b32 v0, v0, s2, s0
1224 ; GFX10-NEXT: v_bfi_b32 v1, v1, s3, s1
1225 ; GFX10-NEXT: ; return to shader part epilog
1227 ; GFX8-GISEL-LABEL: v_s_s_bfi_sha256_ma_i64:
1228 ; GFX8-GISEL: ; %bb.0: ; %entry
1229 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s2
1230 ; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, s0, v0
1231 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v2, s0
1232 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s3
1233 ; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, s1, v1
1234 ; GFX8-GISEL-NEXT: v_bfi_b32 v1, v1, v2, s1
1235 ; GFX8-GISEL-NEXT: ; return to shader part epilog
1237 ; GFX10-GISEL-LABEL: v_s_s_bfi_sha256_ma_i64:
1238 ; GFX10-GISEL: ; %bb.0: ; %entry
1239 ; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, s0, v0
1240 ; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, s1, v1
1241 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, s2, s0
1242 ; GFX10-GISEL-NEXT: v_bfi_b32 v1, v1, s3, s1
1243 ; GFX10-GISEL-NEXT: ; return to shader part epilog
1245 %and0 = and i64 %x, %z
1246 %or0 = or i64 %x, %z
1247 %and1 = and i64 %y, %or0
1248 %or1 = or i64 %and0, %and1
1249 %cast = bitcast i64 %or1 to <2 x float>
1250 ret <2 x float> %cast
; Bitwise majority (%x & %z) | (%y & (%x | %z)) on i64, with %x / %z passed
; inreg and %y passed without inreg. The result is returned bitcast to
; <2 x float>.
; The runs without -global-isel expect a v_xor_b32 plus a v_bfi_b32 per 32-bit
; half. The -global-isel runs expect no v_bfi_b32 here; they expect scalar
; s_and_b64 and s_or_b64 followed by per-half vector and / or (fused into
; v_and_or_b32 on gfx1031).
1253 define amdgpu_ps <2 x float> @s_v_s_bfi_sha256_ma_i64(i64 inreg %x, i64 %y, i64 inreg %z) {
1254 ; GFX7-LABEL: s_v_s_bfi_sha256_ma_i64:
1255 ; GFX7: ; %bb.0: ; %entry
1256 ; GFX7-NEXT: v_xor_b32_e32 v2, s1, v1
1257 ; GFX7-NEXT: v_bfi_b32 v1, v2, s3, v1
1258 ; GFX7-NEXT: v_xor_b32_e32 v2, s0, v0
1259 ; GFX7-NEXT: v_bfi_b32 v0, v2, s2, v0
1260 ; GFX7-NEXT: ; return to shader part epilog
1262 ; GFX8-LABEL: s_v_s_bfi_sha256_ma_i64:
1263 ; GFX8: ; %bb.0: ; %entry
1264 ; GFX8-NEXT: v_xor_b32_e32 v2, s1, v1
1265 ; GFX8-NEXT: v_bfi_b32 v1, v2, s3, v1
1266 ; GFX8-NEXT: v_xor_b32_e32 v2, s0, v0
1267 ; GFX8-NEXT: v_bfi_b32 v0, v2, s2, v0
1268 ; GFX8-NEXT: ; return to shader part epilog
1270 ; GFX10-LABEL: s_v_s_bfi_sha256_ma_i64:
1271 ; GFX10: ; %bb.0: ; %entry
1272 ; GFX10-NEXT: v_xor_b32_e32 v2, s0, v0
1273 ; GFX10-NEXT: v_xor_b32_e32 v3, s1, v1
1274 ; GFX10-NEXT: v_bfi_b32 v0, v2, s2, v0
1275 ; GFX10-NEXT: v_bfi_b32 v1, v3, s3, v1
1276 ; GFX10-NEXT: ; return to shader part epilog
1278 ; GFX8-GISEL-LABEL: s_v_s_bfi_sha256_ma_i64:
1279 ; GFX8-GISEL: ; %bb.0: ; %entry
1280 ; GFX8-GISEL-NEXT: s_and_b64 s[4:5], s[0:1], s[2:3]
1281 ; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
1282 ; GFX8-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
1283 ; GFX8-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
1284 ; GFX8-GISEL-NEXT: v_or_b32_e32 v0, s4, v0
1285 ; GFX8-GISEL-NEXT: v_or_b32_e32 v1, s5, v1
1286 ; GFX8-GISEL-NEXT: ; return to shader part epilog
1288 ; GFX10-GISEL-LABEL: s_v_s_bfi_sha256_ma_i64:
1289 ; GFX10-GISEL: ; %bb.0: ; %entry
1290 ; GFX10-GISEL-NEXT: s_and_b64 s[4:5], s[0:1], s[2:3]
1291 ; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
1292 ; GFX10-GISEL-NEXT: v_and_or_b32 v0, v0, s0, s4
1293 ; GFX10-GISEL-NEXT: v_and_or_b32 v1, v1, s1, s5
1294 ; GFX10-GISEL-NEXT: ; return to shader part epilog
1296 %and0 = and i64 %x, %z
1297 %or0 = or i64 %x, %z
1298 %and1 = and i64 %y, %or0
1299 %or1 = or i64 %and0, %and1
1300 %cast = bitcast i64 %or1 to <2 x float>
1301 ret <2 x float> %cast
; Bitwise majority (%x & %z) | (%y & (%x | %z)) on i64, with %x / %y passed
; inreg and %z passed without inreg. The result is returned bitcast to
; <2 x float>.
; All runs expect a v_xor_b32 plus a v_bfi_b32 per 32-bit half. The tahiti and
; tonga runs (with and without -global-isel) expect a v_mov_b32 that copies
; one SGPR operand into v2 before each xor; both gfx1031 runs expect
; v_xor_b32_e64 to take both SGPR operands directly.
1304 define amdgpu_ps <2 x float> @s_s_v_bfi_sha256_ma_i64(i64 inreg %x, i64 inreg %y, i64 %z) {
1305 ; GFX7-LABEL: s_s_v_bfi_sha256_ma_i64:
1306 ; GFX7: ; %bb.0: ; %entry
1307 ; GFX7-NEXT: v_mov_b32_e32 v2, s3
1308 ; GFX7-NEXT: v_xor_b32_e32 v2, s1, v2
1309 ; GFX7-NEXT: v_bfi_b32 v1, v2, v1, s3
1310 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
1311 ; GFX7-NEXT: v_xor_b32_e32 v2, s0, v2
1312 ; GFX7-NEXT: v_bfi_b32 v0, v2, v0, s2
1313 ; GFX7-NEXT: ; return to shader part epilog
1315 ; GFX8-LABEL: s_s_v_bfi_sha256_ma_i64:
1316 ; GFX8: ; %bb.0: ; %entry
1317 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
1318 ; GFX8-NEXT: v_xor_b32_e32 v2, s1, v2
1319 ; GFX8-NEXT: v_bfi_b32 v1, v2, v1, s3
1320 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
1321 ; GFX8-NEXT: v_xor_b32_e32 v2, s0, v2
1322 ; GFX8-NEXT: v_bfi_b32 v0, v2, v0, s2
1323 ; GFX8-NEXT: ; return to shader part epilog
1325 ; GFX10-LABEL: s_s_v_bfi_sha256_ma_i64:
1326 ; GFX10: ; %bb.0: ; %entry
1327 ; GFX10-NEXT: v_xor_b32_e64 v2, s0, s2
1328 ; GFX10-NEXT: v_xor_b32_e64 v3, s1, s3
1329 ; GFX10-NEXT: v_bfi_b32 v0, v2, v0, s2
1330 ; GFX10-NEXT: v_bfi_b32 v1, v3, v1, s3
1331 ; GFX10-NEXT: ; return to shader part epilog
1333 ; GFX8-GISEL-LABEL: s_s_v_bfi_sha256_ma_i64:
1334 ; GFX8-GISEL: ; %bb.0: ; %entry
1335 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
1336 ; GFX8-GISEL-NEXT: v_xor_b32_e32 v2, s2, v2
1337 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v2, v0, s2
1338 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s1
1339 ; GFX8-GISEL-NEXT: v_xor_b32_e32 v2, s3, v2
1340 ; GFX8-GISEL-NEXT: v_bfi_b32 v1, v2, v1, s3
1341 ; GFX8-GISEL-NEXT: ; return to shader part epilog
1343 ; GFX10-GISEL-LABEL: s_s_v_bfi_sha256_ma_i64:
1344 ; GFX10-GISEL: ; %bb.0: ; %entry
1345 ; GFX10-GISEL-NEXT: v_xor_b32_e64 v2, s0, s2
1346 ; GFX10-GISEL-NEXT: v_xor_b32_e64 v3, s1, s3
1347 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, v2, v0, s2
1348 ; GFX10-GISEL-NEXT: v_bfi_b32 v1, v3, v1, s3
1349 ; GFX10-GISEL-NEXT: ; return to shader part epilog
1351 %and0 = and i64 %x, %z
1352 %or0 = or i64 %x, %z
1353 %and1 = and i64 %y, %or0
1354 %or1 = or i64 %and0, %and1
1355 %cast = bitcast i64 %or1 to <2 x float>
1356 ret <2 x float> %cast
; Bitwise majority (%x & %z) | (%y & (%x | %z)) on i64, with %y passed inreg
; and %x / %z passed without inreg. The result is returned bitcast to
; <2 x float>.
; All five runs expect the same instructions - a v_xor_b32 of each %x half
; with s0 / s1, then a v_bfi_b32 per half that reads the same SGPR again as
; its last operand; only the instruction order differs on tahiti.
1359 define amdgpu_ps <2 x float> @v_s_v_bfi_sha256_ma_i64(i64 %x, i64 inreg %y, i64 %z) {
1360 ; GFX7-LABEL: v_s_v_bfi_sha256_ma_i64:
1361 ; GFX7: ; %bb.0: ; %entry
1362 ; GFX7-NEXT: v_xor_b32_e32 v1, s1, v1
1363 ; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0
1364 ; GFX7-NEXT: v_bfi_b32 v1, v1, v3, s1
1365 ; GFX7-NEXT: v_bfi_b32 v0, v0, v2, s0
1366 ; GFX7-NEXT: ; return to shader part epilog
1368 ; GFX8-LABEL: v_s_v_bfi_sha256_ma_i64:
1369 ; GFX8: ; %bb.0: ; %entry
1370 ; GFX8-NEXT: v_xor_b32_e32 v1, s1, v1
1371 ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0
1372 ; GFX8-NEXT: v_bfi_b32 v1, v1, v3, s1
1373 ; GFX8-NEXT: v_bfi_b32 v0, v0, v2, s0
1374 ; GFX8-NEXT: ; return to shader part epilog
1376 ; GFX10-LABEL: v_s_v_bfi_sha256_ma_i64:
1377 ; GFX10: ; %bb.0: ; %entry
1378 ; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0
1379 ; GFX10-NEXT: v_xor_b32_e32 v1, s1, v1
1380 ; GFX10-NEXT: v_bfi_b32 v0, v0, v2, s0
1381 ; GFX10-NEXT: v_bfi_b32 v1, v1, v3, s1
1382 ; GFX10-NEXT: ; return to shader part epilog
1384 ; GFX8-GISEL-LABEL: v_s_v_bfi_sha256_ma_i64:
1385 ; GFX8-GISEL: ; %bb.0: ; %entry
1386 ; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, s0, v0
1387 ; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, s1, v1
1388 ; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v2, s0
1389 ; GFX8-GISEL-NEXT: v_bfi_b32 v1, v1, v3, s1
1390 ; GFX8-GISEL-NEXT: ; return to shader part epilog
1392 ; GFX10-GISEL-LABEL: v_s_v_bfi_sha256_ma_i64:
1393 ; GFX10-GISEL: ; %bb.0: ; %entry
1394 ; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, s0, v0
1395 ; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, s1, v1
1396 ; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, v2, s0
1397 ; GFX10-GISEL-NEXT: v_bfi_b32 v1, v1, v3, s1
1398 ; GFX10-GISEL-NEXT: ; return to shader part epilog
1400 %and0 = and i64 %x, %z
1401 %or0 = or i64 %x, %z
1402 %and1 = and i64 %y, %or0
1403 %or1 = or i64 %and0, %and1
1404 %cast = bitcast i64 %or1 to <2 x float>
1405 ret <2 x float> %cast
; Kernel form of 64-bit bitselect pattern 0. The IR computes
; (%a & %b) | (~%a & %mask), adds 10 to the result and stores the sum through
; an undef addrspace(1) pointer.
; Every run expects the select to stay on the scalar unit (s_and_b64,
; s_andn2_b64, s_or_b64) followed by s_add_u32 / s_addc_u32, with no v_bfi_b32.
; NOTE(review) the add is presumably there to give the bitselect a scalar
; user, as the value name %scalar.use suggests - confirm.
1408 define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
1409 ; GFX7-LABEL: s_bitselect_i64_pat_0:
1411 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1412 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1413 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1414 ; GFX7-NEXT: s_mov_b32 s2, -1
1415 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1416 ; GFX7-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
1417 ; GFX7-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
1418 ; GFX7-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1]
1419 ; GFX7-NEXT: s_add_u32 s0, s0, 10
1420 ; GFX7-NEXT: s_addc_u32 s1, s1, 0
1421 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
1422 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
1423 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1424 ; GFX7-NEXT: s_endpgm
1426 ; GFX8-LABEL: s_bitselect_i64_pat_0:
1428 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1429 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1430 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1431 ; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
1432 ; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
1433 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
1434 ; GFX8-NEXT: s_add_u32 s0, s0, 10
1435 ; GFX8-NEXT: s_addc_u32 s1, s1, 0
1436 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1437 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1438 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
1439 ; GFX8-NEXT: s_endpgm
1441 ; GFX10-LABEL: s_bitselect_i64_pat_0:
1443 ; GFX10-NEXT: s_clause 0x1
1444 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1445 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1446 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1447 ; GFX10-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
1448 ; GFX10-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
1449 ; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
1450 ; GFX10-NEXT: s_add_u32 s0, s0, 10
1451 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1452 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1453 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1454 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
1455 ; GFX10-NEXT: s_endpgm
1457 ; GFX8-GISEL-LABEL: s_bitselect_i64_pat_0:
1458 ; GFX8-GISEL: ; %bb.0:
1459 ; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1460 ; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1461 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1462 ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
1463 ; GFX8-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
1464 ; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
1465 ; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
1466 ; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
1467 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
1468 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
1469 ; GFX8-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
1470 ; GFX8-GISEL-NEXT: s_endpgm
1472 ; GFX10-GISEL-LABEL: s_bitselect_i64_pat_0:
1473 ; GFX10-GISEL: ; %bb.0:
1474 ; GFX10-GISEL-NEXT: s_clause 0x1
1475 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1476 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1477 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1478 ; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
1479 ; GFX10-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
1480 ; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
1481 ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
1482 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
1483 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
1484 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
1485 ; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
1486 ; GFX10-GISEL-NEXT: s_endpgm
1487 %and0 = and i64 %a, %b
1488 %not.a = xor i64 %a, -1
1489 %and1 = and i64 %not.a, %mask
1490 %bitselect = or i64 %and0, %and1
1491 %scalar.use = add i64 %bitselect, 10
1492 store i64 %scalar.use, ptr addrspace(1) undef
; Kernel form of 64-bit bitselect pattern 1. The IR computes
; ((%a ^ %mask) & %b) ^ %mask, adds 10 to the result and stores the sum
; through an undef addrspace(1) pointer.
; Every run expects the select to stay on the scalar unit (s_xor_b64,
; s_and_b64, s_xor_b64) followed by s_add_u32 / s_addc_u32, with no v_bfi_b32.
; NOTE(review) the add is presumably there to give the bitselect a scalar
; user, as the value name %scalar.use suggests - confirm.
1496 define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
1497 ; GFX7-LABEL: s_bitselect_i64_pat_1:
1499 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1500 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1501 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1502 ; GFX7-NEXT: s_mov_b32 s2, -1
1503 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1504 ; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1]
1505 ; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
1506 ; GFX7-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
1507 ; GFX7-NEXT: s_add_u32 s0, s0, 10
1508 ; GFX7-NEXT: s_addc_u32 s1, s1, 0
1509 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
1510 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
1511 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1512 ; GFX7-NEXT: s_endpgm
1514 ; GFX8-LABEL: s_bitselect_i64_pat_1:
1516 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1517 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1518 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1519 ; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
1520 ; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
1521 ; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
1522 ; GFX8-NEXT: s_add_u32 s0, s0, 10
1523 ; GFX8-NEXT: s_addc_u32 s1, s1, 0
1524 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1525 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1526 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
1527 ; GFX8-NEXT: s_endpgm
1529 ; GFX10-LABEL: s_bitselect_i64_pat_1:
1531 ; GFX10-NEXT: s_clause 0x1
1532 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1533 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1535 ; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
1536 ; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
1537 ; GFX10-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
1538 ; GFX10-NEXT: s_add_u32 s0, s0, 10
1539 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1540 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1541 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1542 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
1543 ; GFX10-NEXT: s_endpgm
1545 ; GFX8-GISEL-LABEL: s_bitselect_i64_pat_1:
1546 ; GFX8-GISEL: ; %bb.0:
1547 ; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1548 ; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1549 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1550 ; GFX8-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
1551 ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
1552 ; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
1553 ; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
1554 ; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
1555 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
1556 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
1557 ; GFX8-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
1558 ; GFX8-GISEL-NEXT: s_endpgm
1560 ; GFX10-GISEL-LABEL: s_bitselect_i64_pat_1:
1561 ; GFX10-GISEL: ; %bb.0:
1562 ; GFX10-GISEL-NEXT: s_clause 0x1
1563 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1564 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1565 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1566 ; GFX10-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
1567 ; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
1568 ; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
1569 ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
1570 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
1571 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
1572 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
1573 ; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
1574 ; GFX10-GISEL-NEXT: s_endpgm
1575 %xor.0 = xor i64 %a, %mask
1576 %and = and i64 %xor.0, %b
1577 %bitselect = xor i64 %and, %mask
1579 %scalar.use = add i64 %bitselect, 10
1580 store i64 %scalar.use, ptr addrspace(1) undef
; Kernel form of 64-bit bitselect pattern 2. The IR computes
; ((%a ^ %mask) & %b) ^ %mask, adds 10 to the result and stores the sum
; through an undef addrspace(1) pointer.
; Every run expects the select to stay on the scalar unit (s_xor_b64,
; s_and_b64, s_xor_b64) followed by s_add_u32 / s_addc_u32, with no v_bfi_b32.
; NOTE(review) the IR instructions below are identical to those of
; s_bitselect_i64_pat_1, so this function does not appear to exercise a
; distinct pattern - confirm whether a different expression was intended.
1584 define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
1585 ; GFX7-LABEL: s_bitselect_i64_pat_2:
1587 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1588 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1589 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1590 ; GFX7-NEXT: s_mov_b32 s2, -1
1591 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1592 ; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1]
1593 ; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
1594 ; GFX7-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
1595 ; GFX7-NEXT: s_add_u32 s0, s0, 10
1596 ; GFX7-NEXT: s_addc_u32 s1, s1, 0
1597 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
1598 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
1599 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1600 ; GFX7-NEXT: s_endpgm
1602 ; GFX8-LABEL: s_bitselect_i64_pat_2:
1604 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1605 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1606 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1607 ; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
1608 ; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
1609 ; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
1610 ; GFX8-NEXT: s_add_u32 s0, s0, 10
1611 ; GFX8-NEXT: s_addc_u32 s1, s1, 0
1612 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1613 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1614 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
1615 ; GFX8-NEXT: s_endpgm
1617 ; GFX10-LABEL: s_bitselect_i64_pat_2:
1619 ; GFX10-NEXT: s_clause 0x1
1620 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1621 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1622 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1623 ; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
1624 ; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
1625 ; GFX10-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
1626 ; GFX10-NEXT: s_add_u32 s0, s0, 10
1627 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1628 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1629 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1630 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
1631 ; GFX10-NEXT: s_endpgm
1633 ; GFX8-GISEL-LABEL: s_bitselect_i64_pat_2:
1634 ; GFX8-GISEL: ; %bb.0:
1635 ; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1636 ; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1637 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1638 ; GFX8-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
1639 ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
1640 ; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
1641 ; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
1642 ; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
1643 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
1644 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
1645 ; GFX8-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
1646 ; GFX8-GISEL-NEXT: s_endpgm
1648 ; GFX10-GISEL-LABEL: s_bitselect_i64_pat_2:
1649 ; GFX10-GISEL: ; %bb.0:
1650 ; GFX10-GISEL-NEXT: s_clause 0x1
1651 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1652 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1653 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1654 ; GFX10-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
1655 ; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
1656 ; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
1657 ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
1658 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
1659 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
1660 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
1661 ; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
1662 ; GFX10-GISEL-NEXT: s_endpgm
1663 %xor.0 = xor i64 %a, %mask
1664 %and = and i64 %xor.0, %b
1665 %bitselect = xor i64 %and, %mask
1667 %scalar.use = add i64 %bitselect, 10
1668 store i64 %scalar.use, ptr addrspace(1) undef
1672 define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
1673 ; GFX7-LABEL: s_bfi_sha256_ma_i64:
1674 ; GFX7: ; %bb.0: ; %entry
1675 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1676 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1677 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1678 ; GFX7-NEXT: s_mov_b32 s2, -1
1679 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1680 ; GFX7-NEXT: s_and_b64 s[8:9], s[4:5], s[0:1]
1681 ; GFX7-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
1682 ; GFX7-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
1683 ; GFX7-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
1684 ; GFX7-NEXT: s_add_u32 s0, s0, 10
1685 ; GFX7-NEXT: s_addc_u32 s1, s1, 0
1686 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
1687 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
1688 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1689 ; GFX7-NEXT: s_endpgm
1691 ; GFX8-LABEL: s_bfi_sha256_ma_i64:
1692 ; GFX8: ; %bb.0: ; %entry
1693 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1694 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1695 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1696 ; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1]
1697 ; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
1698 ; GFX8-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
1699 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
1700 ; GFX8-NEXT: s_add_u32 s0, s0, 10
1701 ; GFX8-NEXT: s_addc_u32 s1, s1, 0
1702 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1703 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1704 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
1705 ; GFX8-NEXT: s_endpgm
1707 ; GFX10-LABEL: s_bfi_sha256_ma_i64:
1708 ; GFX10: ; %bb.0: ; %entry
1709 ; GFX10-NEXT: s_clause 0x1
1710 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1711 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1712 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1713 ; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1]
1714 ; GFX10-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1]
1715 ; GFX10-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
1716 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
1717 ; GFX10-NEXT: s_add_u32 s0, s0, 10
1718 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1719 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1720 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1721 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
1722 ; GFX10-NEXT: s_endpgm
1724 ; GFX8-GISEL-LABEL: s_bfi_sha256_ma_i64:
1725 ; GFX8-GISEL: ; %bb.0: ; %entry
1726 ; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1727 ; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1728 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1729 ; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1]
1730 ; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
1731 ; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
1732 ; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
1733 ; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
1734 ; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
1735 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
1736 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
1737 ; GFX8-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
1738 ; GFX8-GISEL-NEXT: s_endpgm
1740 ; GFX10-GISEL-LABEL: s_bfi_sha256_ma_i64:
1741 ; GFX10-GISEL: ; %bb.0: ; %entry
1742 ; GFX10-GISEL-NEXT: s_clause 0x1
1743 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1744 ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1745 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1746 ; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1]
1747 ; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1]
1748 ; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
1749 ; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
1750 ; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
1751 ; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
1752 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
1753 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
1754 ; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
1755 ; GFX10-GISEL-NEXT: s_endpgm
1757 %and0 = and i64 %x, %z
1758 %or0 = or i64 %x, %z
1759 %and1 = and i64 %y, %or0
1760 %or1 = or i64 %and0, %and1
1762 %scalar.use = add i64 %or1, 10
1763 store i64 %scalar.use, ptr addrspace(1) undef